# Imports these test snippets assume; module paths follow the AllenNLP 0.8/0.9-era
# layout and may need adjusting for other versions.
from collections import Counter
from multiprocessing import Process, Queue
from queue import Empty
from typing import Tuple

import numpy as np
import pytest

from allennlp.common.testing import AllenNlpTestCase
from allennlp.common.util import ensure_list
from allennlp.data.dataset_readers import MultiprocessDatasetReader, SequenceTaggingDatasetReader
from allennlp.data.dataset_readers.multiprocess_dataset_reader import QIterable
from allennlp.data.iterators import BasicIterator, MultiprocessIterator
from allennlp.data.vocabulary import Vocabulary
from allennlp.tests.data.iterators.basic_iterator_test import IteratorTest


class TestMultiprocessIterator(IteratorTest):
    def setUp(self):
        super().setUp()
        self.base_reader = SequenceTaggingDatasetReader(lazy=True)
        base_file_path = AllenNlpTestCase.FIXTURES_ROOT / "data" / "sequence_tagging.tsv"

        # Make 100 copies of the data
        raw_data = open(base_file_path).read()
        for i in range(100):
            file_path = self.TEST_DIR / f"sequence_tagging_{i}.tsv"
            with open(file_path, "w") as f:
                f.write(raw_data)

        self.glob = str(self.TEST_DIR / "sequence_tagging_*.tsv")

        # For some of the tests we need a vocab; we'll just use the base_reader for that.
        self.vocab = Vocabulary.from_instances(self.base_reader.read(str(base_file_path)))

    def test_yield_one_epoch_iterates_over_the_data_once(self):
        for test_instances in (self.instances, self.lazy_instances):
            base_iterator = BasicIterator(batch_size=2, max_instances_in_memory=1024)
            iterator = MultiprocessIterator(base_iterator, num_workers=4)
            iterator.index_with(self.vocab)
            batches = list(iterator(test_instances, num_epochs=1))
            # We just want to get the single-token array for the text field in the instance.
            instances = [
                tuple(instance.detach().cpu().numpy())
                for batch in batches
                for instance in batch["text"]["tokens"]["tokens"]
            ]
            assert len(instances) == 5

    def test_multiprocess_iterate_partial_does_not_hang(self):
        for test_instances in (self.instances, self.lazy_instances):
            base_iterator = BasicIterator(batch_size=2, max_instances_in_memory=1024)
            iterator = MultiprocessIterator(base_iterator, num_workers=4)
            iterator.index_with(self.vocab)
            generator = iterator(test_instances, num_epochs=1)
            # We only iterate through 3 of the 5 instances, causing the
            # processes generating the tensors to remain active.
            for _ in range(3):
                next(generator)
        # The real test here is that we exit normally and don't hang due to
        # the still-active processes.

    def test_multiprocess_reader_with_multiprocess_iterator(self):
        # use SequenceTaggingDatasetReader as the base reader
        reader = MultiprocessDatasetReader(base_reader=self.base_reader, num_workers=2)
        base_iterator = BasicIterator(batch_size=32, max_instances_in_memory=1024)
        iterator = MultiprocessIterator(base_iterator, num_workers=2)
        iterator.index_with(self.vocab)

        instances = reader.read(self.glob)
        tensor_dicts = iterator(instances, num_epochs=1)

        sizes = [len(tensor_dict["tags"]) for tensor_dict in tensor_dicts]
        assert sum(sizes) == 400
class TestMultiprocessIterator(IteratorTest):
    def setUp(self):
        super().setUp()
        self.base_reader = SequenceTaggingDatasetReader(lazy=True)
        base_file_path = AllenNlpTestCase.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv'

        # Make 100 copies of the data
        raw_data = open(base_file_path).read()
        for i in range(100):
            file_path = self.TEST_DIR / f'sequence_tagging_{i}.tsv'
            with open(file_path, 'w') as f:
                f.write(raw_data)

        self.glob = str(self.TEST_DIR / 'sequence_tagging_*.tsv')

        # For some of the tests we need a vocab, we'll just use the base_reader for that.
        self.vocab = Vocabulary.from_instances(self.base_reader.read(str(base_file_path)))

    def test_yield_one_epoch_iterates_over_the_data_once(self):
        for test_instances in (self.instances, self.lazy_instances):
            base_iterator = BasicIterator(batch_size=2, max_instances_in_memory=1024)
            iterator = MultiprocessIterator(base_iterator, num_workers=4)
            iterator.index_with(self.vocab)
            batches = list(iterator(test_instances, num_epochs=1))
            # We just want to get the single-token array for the text field in the instance.
            instances = [
                tuple(instance.detach().cpu().numpy())
                for batch in batches
                for instance in batch['text']['tokens']
            ]
            assert len(instances) == 5

    def test_multiprocess_reader_with_multiprocess_iterator(self):
        # use SequenceTaggingDatasetReader as the base reader
        reader = MultiprocessDatasetReader(base_reader=self.base_reader, num_workers=2)
        base_iterator = BasicIterator(batch_size=32, max_instances_in_memory=1024)
        iterator = MultiprocessIterator(base_iterator, num_workers=2)
        iterator.index_with(self.vocab)

        instances = reader.read(self.glob)
        tensor_dicts = iterator(instances, num_epochs=1)

        sizes = [len(tensor_dict['tags']) for tensor_dict in tensor_dicts]
        assert sum(sizes) == 400
def test_default_format(self):
    reader = SequenceTaggingDatasetReader()
    dataset = reader.read('tests/fixtures/data/sequence_tagging.tsv')

    assert len(dataset.instances) == 4
    fields = dataset.instances[0].fields
    assert fields["tokens"].tokens == ["cats", "are", "animals", "."]
    assert fields["tags"].labels == ["N", "V", "N", "N"]
    fields = dataset.instances[1].fields
    assert fields["tokens"].tokens == ["dogs", "are", "animals", "."]
    assert fields["tags"].labels == ["N", "V", "N", "N"]
    fields = dataset.instances[2].fields
    assert fields["tokens"].tokens == ["snakes", "are", "animals", "."]
    assert fields["tags"].labels == ["N", "V", "N", "N"]
    fields = dataset.instances[3].fields
    assert fields["tokens"].tokens == ["birds", "are", "animals", "."]
    assert fields["tags"].labels == ["N", "V", "N", "N"]
def test_brown_corpus_format(self):
    reader = SequenceTaggingDatasetReader(word_tag_delimiter='/')
    dataset = reader.read('tests/fixtures/data/brown_corpus.txt')

    assert len(dataset.instances) == 4
    fields = dataset.instances[0].fields
    assert fields["tokens"].tokens == ["cats", "are", "animals", "."]
    assert fields["tags"].labels == ["N", "V", "N", "N"]
    fields = dataset.instances[1].fields
    assert fields["tokens"].tokens == ["dogs", "are", "animals", "."]
    assert fields["tags"].labels == ["N", "V", "N", "N"]
    fields = dataset.instances[2].fields
    assert fields["tokens"].tokens == ["snakes", "are", "animals", "."]
    assert fields["tags"].labels == ["N", "V", "N", "N"]
    fields = dataset.instances[3].fields
    assert fields["tokens"].tokens == ["birds", "are", "animals", "."]
    assert fields["tags"].labels == ["N", "V", "N", "N"]
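# For reference, a sketch of the fixture layout the assertions above imply (this is
# not the checked-in fixture file itself): one sentence per line, whitespace- or
# tab-separated tokens, each token joined to its tag by word_tag_delimiter, which
# is "###" by default in the AllenNLP reader and "/" for the Brown-corpus fixture.
def write_fixture_sketch(path: str, word_tag_delimiter: str = "###") -> None:
    # Four sentences matching the tokens/tags asserted in the tests above.
    rows = [
        ("cats", "dogs", "snakes", "birds")[i] + f"{word_tag_delimiter}N"
        f"\tare{word_tag_delimiter}V"
        f"\tanimals{word_tag_delimiter}N"
        f"\t.{word_tag_delimiter}N"
        for i in range(4)
    ]
    with open(path, "w") as f:
        f.write("\n".join(rows) + "\n")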
# `lazy` is supplied by pytest parametrization; the decorator restoring that is
# assumed here.
@pytest.mark.parametrize('lazy', (True, False))
def test_default_format(self, lazy):
    reader = SequenceTaggingDatasetReader(lazy=lazy)
    instances = reader.read('tests/fixtures/data/sequence_tagging.tsv')
    instances = ensure_list(instances)

    assert len(instances) == 4
    fields = instances[0].fields
    assert [t.text for t in fields["tokens"].tokens] == ["cats", "are", "animals", "."]
    assert fields["tags"].labels == ["N", "V", "N", "N"]
    fields = instances[1].fields
    assert [t.text for t in fields["tokens"].tokens] == ["dogs", "are", "animals", "."]
    assert fields["tags"].labels == ["N", "V", "N", "N"]
    fields = instances[2].fields
    assert [t.text for t in fields["tokens"].tokens] == ["snakes", "are", "animals", "."]
    assert fields["tags"].labels == ["N", "V", "N", "N"]
    fields = instances[3].fields
    assert [t.text for t in fields["tokens"].tokens] == ["birds", "are", "animals", "."]
    assert fields["tags"].labels == ["N", "V", "N", "N"]
def test_brown_corpus_format(self):
    reader = SequenceTaggingDatasetReader(word_tag_delimiter='/')
    instances = reader.read('tests/fixtures/data/brown_corpus.txt')
    instances = ensure_list(instances)

    assert len(instances) == 4
    fields = instances[0].fields
    assert [t.text for t in fields["tokens"].tokens] == ["cats", "are", "animals", "."]
    assert fields["tags"].labels == ["N", "V", "N", "N"]
    fields = instances[1].fields
    assert [t.text for t in fields["tokens"].tokens] == ["dogs", "are", "animals", "."]
    assert fields["tags"].labels == ["N", "V", "N", "N"]
    fields = instances[2].fields
    assert [t.text for t in fields["tokens"].tokens] == ["snakes", "are", "animals", "."]
    assert fields["tags"].labels == ["N", "V", "N", "N"]
    fields = instances[3].fields
    assert [t.text for t in fields["tokens"].tokens] == ["birds", "are", "animals", "."]
    assert fields["tags"].labels == ["N", "V", "N", "N"]
def test_read_from_file(self):
    reader = SequenceTaggingDatasetReader()
    dataset = reader.read(self.TRAIN_FILE)

    assert len(dataset.instances) == 4
    fields = dataset.instances[0].fields
    assert fields["tokens"].tokens == ["cats", "are", "animals", "."]
    assert fields["tags"].labels == ["N", "V", "N", "N"]
    fields = dataset.instances[1].fields
    assert fields["tokens"].tokens == ["dogs", "are", "animals", "."]
    assert fields["tags"].labels == ["N", "V", "N", "N"]
    fields = dataset.instances[2].fields
    assert fields["tokens"].tokens == ["snakes", "are", "animals", "."]
    assert fields["tags"].labels == ["N", "V", "N", "N"]
    fields = dataset.instances[3].fields
    assert fields["tokens"].tokens == ["birds", "are", "animals", "."]
    assert fields["tags"].labels == ["N", "V", "N", "N"]
def test_brown_corpus_format(self):
    reader = SequenceTaggingDatasetReader(word_tag_delimiter='/')
    instances = reader.read(AllenNlpTestCase.FIXTURES_ROOT / 'data' / 'brown_corpus.txt')
    instances = ensure_list(instances)

    assert len(instances) == 4
    fields = instances[0].fields
    assert [t.text for t in fields["tokens"].tokens] == ["cats", "are", "animals", "."]
    assert fields["tags"].labels == ["N", "V", "N", "N"]
    fields = instances[1].fields
    assert [t.text for t in fields["tokens"].tokens] == ["dogs", "are", "animals", "."]
    assert fields["tags"].labels == ["N", "V", "N", "N"]
    fields = instances[2].fields
    assert [t.text for t in fields["tokens"].tokens] == ["snakes", "are", "animals", "."]
    assert fields["tags"].labels == ["N", "V", "N", "N"]
    fields = instances[3].fields
    assert [t.text for t in fields["tokens"].tokens] == ["birds", "are", "animals", "."]
    assert fields["tags"].labels == ["N", "V", "N", "N"]
# As above, `lazy` is supplied by pytest parametrization.
@pytest.mark.parametrize('lazy', (True, False))
def test_default_format(self, lazy):
    reader = SequenceTaggingDatasetReader(lazy=lazy)
    instances = reader.read(AllenNlpTestCase.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv')
    instances = ensure_list(instances)

    assert len(instances) == 4
    fields = instances[0].fields
    assert [t.text for t in fields["tokens"].tokens] == ["cats", "are", "animals", "."]
    assert fields["tags"].labels == ["N", "V", "N", "N"]
    fields = instances[1].fields
    assert [t.text for t in fields["tokens"].tokens] == ["dogs", "are", "animals", "."]
    assert fields["tags"].labels == ["N", "V", "N", "N"]
    fields = instances[2].fields
    assert [t.text for t in fields["tokens"].tokens] == ["snakes", "are", "animals", "."]
    assert fields["tags"].labels == ["N", "V", "N", "N"]
    fields = instances[3].fields
    assert [t.text for t in fields["tokens"].tokens] == ["birds", "are", "animals", "."]
    assert fields["tags"].labels == ["N", "V", "N", "N"]
def test_default_format(self):
    reader = SequenceTaggingDatasetReader(max_instances=4)
    instances = list(reader.read(AllenNlpTestCase.FIXTURES_ROOT / "data" / "sequence_tagging.tsv"))

    assert len(instances) == 4
    fields = instances[0].fields
    assert [t.text for t in fields["tokens"].tokens] == ["cats", "are", "animals", "."]
    assert fields["tags"].labels == ["N", "V", "N", "N"]
    fields = instances[1].fields
    assert [t.text for t in fields["tokens"].tokens] == ["dogs", "are", "animals", "."]
    assert fields["tags"].labels == ["N", "V", "N", "N"]
    fields = instances[2].fields
    assert [t.text for t in fields["tokens"].tokens] == ["snakes", "are", "animals", "."]
    assert fields["tags"].labels == ["N", "V", "N", "N"]
    fields = instances[3].fields
    assert [t.text for t in fields["tokens"].tokens] == ["birds", "are", "animals", "."]
    assert fields["tags"].labels == ["N", "V", "N", "N"]
def test_brown_corpus_format(self):
    reader = SequenceTaggingDatasetReader(word_tag_delimiter="/")
    instances = list(reader.read(AllenNlpTestCase.FIXTURES_ROOT / "data" / "brown_corpus.txt"))

    assert len(instances) == 4
    fields = instances[0].fields
    assert [t.text for t in fields["tokens"].tokens] == ["cats", "are", "animals", "."]
    assert fields["tags"].labels == ["N", "V", "N", "N"]
    fields = instances[1].fields
    assert [t.text for t in fields["tokens"].tokens] == ["dogs", "are", "animals", "."]
    assert fields["tags"].labels == ["N", "V", "N", "N"]
    fields = instances[2].fields
    assert [t.text for t in fields["tokens"].tokens] == ["snakes", "are", "animals", "."]
    assert fields["tags"].labels == ["N", "V", "N", "N"]
    fields = instances[3].fields
    assert [t.text for t in fields["tokens"].tokens] == ["birds", "are", "animals", "."]
    assert fields["tags"].labels == ["N", "V", "N", "N"]
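# The multiprocess-reader tests below rely on a `fingerprint` helper that is not
# shown in these snippets. A minimal sketch consistent with the asserted tuples
# (token texts followed by tag labels; the exact original may differ):
def fingerprint(instance) -> Tuple[str, ...]:
    """Return a hashable summary of a sequence-tagging instance."""
    text_tuple = tuple(t.text for t in instance.fields["tokens"].tokens)
    labels_tuple = tuple(instance.fields["tags"].labels)
    return text_tuple + labels_tuple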
class TestMultiprocessDatasetReader(AllenNlpTestCase):
    def setUp(self) -> None:
        super().setUp()

        # use SequenceTaggingDatasetReader as the base reader
        self.base_reader = SequenceTaggingDatasetReader(lazy=True)
        base_file_path = AllenNlpTestCase.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv'

        # Make 100 copies of the data
        raw_data = open(base_file_path).read()
        for i in range(100):
            file_path = self.TEST_DIR / f'identical_{i}.tsv'
            with open(file_path, 'w') as f:
                f.write(raw_data)

        self.all_distinct_path = str(self.TEST_DIR / 'all_distinct.tsv')
        with open(self.all_distinct_path, 'w') as all_distinct:
            for i in range(100):
                file_path = self.TEST_DIR / f'distinct_{i}.tsv'
                line = f"This###DT\tis###VBZ\tsentence###NN\t{i}###CD\t.###.\n"
                with open(file_path, 'w') as f:
                    f.write(line)
                all_distinct.write(line)

        self.identical_files_glob = str(self.TEST_DIR / 'identical_*.tsv')
        self.distinct_files_glob = str(self.TEST_DIR / 'distinct_*.tsv')

        # For some of the tests we need a vocab, we'll just use the base_reader for that.
        self.vocab = Vocabulary.from_instances(self.base_reader.read(str(base_file_path)))

    def test_multiprocess_read(self):
        reader = MultiprocessDatasetReader(base_reader=self.base_reader, num_workers=4)

        all_instances = []
        for instance in reader.read(self.identical_files_glob):
            all_instances.append(instance)

        # 100 files * 4 sentences / file
        assert len(all_instances) == 100 * 4

        counts = Counter(fingerprint(instance) for instance in all_instances)

        # should have the exact same data 100 times
        assert len(counts) == 4
        assert counts[("cats", "are", "animals", ".", "N", "V", "N", "N")] == 100
        assert counts[("dogs", "are", "animals", ".", "N", "V", "N", "N")] == 100
        assert counts[("snakes", "are", "animals", ".", "N", "V", "N", "N")] == 100
        assert counts[("birds", "are", "animals", ".", "N", "V", "N", "N")] == 100

    def test_multiprocess_read_in_subprocess_is_deterministic(self):
        reader = MultiprocessDatasetReader(base_reader=self.base_reader, num_workers=1)
        q = Queue()

        def read():
            for instance in reader.read(self.distinct_files_glob):
                q.put(fingerprint(instance))

        # Ensure deterministic shuffling.
        np.random.seed(0)
        p = Process(target=read)
        p.start()
        p.join()

        # Convert queue to list.
        actual_fingerprints = []
        while not q.empty():
            actual_fingerprints.append(q.get(block=False))

        assert len(actual_fingerprints) == 100

        expected_fingerprints = []
        for instance in self.base_reader.read(self.all_distinct_path):
            expected_fingerprints.append(fingerprint(instance))

        np.random.seed(0)
        expected_fingerprints.sort()
        # This should be shuffled into exactly the same order as actual_fingerprints.
        np.random.shuffle(expected_fingerprints)

        assert actual_fingerprints == expected_fingerprints

    def test_multiple_epochs(self):
        reader = MultiprocessDatasetReader(base_reader=self.base_reader,
                                           num_workers=2,
                                           epochs_per_read=3)

        all_instances = []
        for instance in reader.read(self.identical_files_glob):
            all_instances.append(instance)

        # 100 files * 4 sentences per file * 3 epochs
        assert len(all_instances) == 100 * 4 * 3

        counts = Counter(fingerprint(instance) for instance in all_instances)

        # should have the exact same data 100 * 3 times
        assert len(counts) == 4
        assert counts[("cats", "are", "animals", ".", "N", "V", "N", "N")] == 300
        assert counts[("dogs", "are", "animals", ".", "N", "V", "N", "N")] == 300
        assert counts[("snakes", "are", "animals", ".", "N", "V", "N", "N")] == 300
        assert counts[("birds", "are", "animals", ".", "N", "V", "N", "N")] == 300

    def test_with_iterator(self):
        reader = MultiprocessDatasetReader(base_reader=self.base_reader, num_workers=2)
        instances = reader.read(self.identical_files_glob)

        iterator = BasicIterator(batch_size=32)
        iterator.index_with(self.vocab)

        batches = [batch for batch in iterator(instances, num_epochs=1)]

        # 400 instances / batch_size 32 = 12 full batches + 1 batch of 16
        sizes = sorted([len(batch['tags']) for batch in batches])
        assert sizes == [16] + 12 * [32]
class TestMultiprocessDatasetReader(AllenNlpTestCase):
    def setUp(self) -> None:
        super().setUp()

        # use SequenceTaggingDatasetReader as the base reader
        self.base_reader = SequenceTaggingDatasetReader(lazy=True)
        base_file_path = AllenNlpTestCase.FIXTURES_ROOT / "data" / "sequence_tagging.tsv"

        # Make 100 copies of the data
        raw_data = open(base_file_path).read()
        for i in range(100):
            file_path = self.TEST_DIR / f"identical_{i}.tsv"
            with open(file_path, "w") as f:
                f.write(raw_data)

        self.all_distinct_path = str(self.TEST_DIR / "all_distinct.tsv")
        with open(self.all_distinct_path, "w") as all_distinct:
            for i in range(100):
                file_path = self.TEST_DIR / f"distinct_{i}.tsv"
                line = f"This###DT\tis###VBZ\tsentence###NN\t{i}###CD\t.###.\n"
                with open(file_path, "w") as f:
                    f.write(line)
                all_distinct.write(line)

        self.identical_files_glob = str(self.TEST_DIR / "identical_*.tsv")
        self.distinct_files_glob = str(self.TEST_DIR / "distinct_*.tsv")

        # For some of the tests we need a vocab, we'll just use the base_reader for that.
        self.vocab = Vocabulary.from_instances(self.base_reader.read(str(base_file_path)))

    def test_multiprocess_read(self):
        reader = MultiprocessDatasetReader(base_reader=self.base_reader, num_workers=4)

        all_instances = []
        for instance in reader.read(self.identical_files_glob):
            all_instances.append(instance)

        # 100 files * 4 sentences / file
        assert len(all_instances) == 100 * 4

        counts = Counter(fingerprint(instance) for instance in all_instances)

        # should have the exact same data 100 times
        assert len(counts) == 4
        assert counts[("cats", "are", "animals", ".", "N", "V", "N", "N")] == 100
        assert counts[("dogs", "are", "animals", ".", "N", "V", "N", "N")] == 100
        assert counts[("snakes", "are", "animals", ".", "N", "V", "N", "N")] == 100
        assert counts[("birds", "are", "animals", ".", "N", "V", "N", "N")] == 100

    def test_multiprocess_read_partial_does_not_hang(self):
        # Use a small queue size such that the processes generating the data will block.
        reader = MultiprocessDatasetReader(
            base_reader=self.base_reader, num_workers=4, output_queue_size=10
        )

        all_instances = []

        # Half of 100 files * 4 sentences / file
        i = 0
        for instance in reader.read(self.identical_files_glob):
            # Stop early such that the processes generating the data remain
            # active (given the small queue size).
            if i == 200:
                break
            i += 1
            all_instances.append(instance)

        # This should be trivially true. The real test here is that we exit
        # normally and don't hang due to the still active processes.
        assert len(all_instances) == 200

    def test_multiprocess_read_with_qiterable(self):
        reader = MultiprocessDatasetReader(base_reader=self.base_reader, num_workers=4)

        all_instances = []
        qiterable = reader.read(self.identical_files_glob)
        assert isinstance(qiterable, QIterable)

        # Essentially QIterable.__iter__. Broken out here as we intend it to be
        # a public interface.
        qiterable.start()
        while qiterable.num_active_workers.value > 0 or qiterable.num_inflight_items.value > 0:
            while True:
                try:
                    all_instances.append(qiterable.output_queue.get(block=False, timeout=1.0))
                    with qiterable.num_inflight_items.get_lock():
                        qiterable.num_inflight_items.value -= 1
                except Empty:
                    break
        qiterable.join()

        # 100 files * 4 sentences / file
        assert len(all_instances) == 100 * 4

        counts = Counter(fingerprint(instance) for instance in all_instances)

        # should have the exact same data 100 times
        assert len(counts) == 4
        assert counts[("cats", "are", "animals", ".", "N", "V", "N", "N")] == 100
        assert counts[("dogs", "are", "animals", ".", "N", "V", "N", "N")] == 100
        assert counts[("snakes", "are", "animals", ".", "N", "V", "N", "N")] == 100
        assert counts[("birds", "are", "animals", ".", "N", "V", "N", "N")] == 100

    def test_multiprocess_read_in_subprocess_is_deterministic(self):
        reader = MultiprocessDatasetReader(base_reader=self.base_reader, num_workers=1)
        q = Queue()

        def read():
            for instance in reader.read(self.distinct_files_glob):
                q.put(fingerprint(instance))

        # Ensure deterministic shuffling.
        np.random.seed(0)
        p = Process(target=read)
        p.start()
        p.join()

        # Convert queue to list.
        actual_fingerprints = []
        while not q.empty():
            actual_fingerprints.append(q.get(block=False))

        assert len(actual_fingerprints) == 100

        expected_fingerprints = []
        for instance in self.base_reader.read(self.all_distinct_path):
            expected_fingerprints.append(fingerprint(instance))

        np.random.seed(0)
        expected_fingerprints.sort()
        # This should be shuffled into exactly the same order as actual_fingerprints.
        np.random.shuffle(expected_fingerprints)

        assert actual_fingerprints == expected_fingerprints

    def test_multiple_epochs(self):
        reader = MultiprocessDatasetReader(
            base_reader=self.base_reader, num_workers=2, epochs_per_read=3
        )

        all_instances = []
        for instance in reader.read(self.identical_files_glob):
            all_instances.append(instance)

        # 100 files * 4 sentences per file * 3 epochs
        assert len(all_instances) == 100 * 4 * 3

        counts = Counter(fingerprint(instance) for instance in all_instances)

        # should have the exact same data 100 * 3 times
        assert len(counts) == 4
        assert counts[("cats", "are", "animals", ".", "N", "V", "N", "N")] == 300
        assert counts[("dogs", "are", "animals", ".", "N", "V", "N", "N")] == 300
        assert counts[("snakes", "are", "animals", ".", "N", "V", "N", "N")] == 300
        assert counts[("birds", "are", "animals", ".", "N", "V", "N", "N")] == 300

    def test_with_iterator(self):
        reader = MultiprocessDatasetReader(base_reader=self.base_reader, num_workers=2)
        instances = reader.read(self.identical_files_glob)

        iterator = BasicIterator(batch_size=32)
        iterator.index_with(self.vocab)

        batches = [batch for batch in iterator(instances, num_epochs=1)]

        # 400 instances / batch_size 32 = 12 full batches + 1 batch of 16
        sizes = sorted([len(batch["tags"]) for batch in batches])
        assert sizes == [16] + 12 * [32]
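# A note on the manual drain loop in test_multiprocess_read_with_qiterable: in
# ordinary use none of that bookkeeping is needed, because QIterable implements
# __iter__ and manages start()/join() itself. A minimal sketch of the equivalent
# consumption (`read_all_instances` is a hypothetical helper, not AllenNLP API):
def read_all_instances(reader, glob):
    # Plain iteration drives the worker lifecycle internally.
    return list(reader.read(glob))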
class TestMultiprocessDatasetReader(AllenNlpTestCase):
    def setUp(self) -> None:
        super().setUp()

        # use SequenceTaggingDatasetReader as the base reader
        self.base_reader = SequenceTaggingDatasetReader(lazy=True)
        base_file_path = AllenNlpTestCase.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv'

        # Make 100 copies of the data
        raw_data = open(base_file_path).read()
        for i in range(100):
            file_path = self.TEST_DIR / f'sequence_tagging_{i}.tsv'
            with open(file_path, 'w') as f:
                f.write(raw_data)

        self.glob = str(self.TEST_DIR / 'sequence_tagging_*.tsv')

        # For some of the tests we need a vocab, we'll just use the base_reader for that.
        self.vocab = Vocabulary.from_instances(self.base_reader.read(str(base_file_path)))

    def test_multiprocess_read(self):
        reader = MultiprocessDatasetReader(base_reader=self.base_reader, num_workers=4)

        all_instances = []
        for instance in reader.read(self.glob):
            all_instances.append(instance)

        # 100 files * 4 sentences / file
        assert len(all_instances) == 100 * 4

        counts = Counter(fingerprint(instance) for instance in all_instances)

        # should have the exact same data 100 times
        assert len(counts) == 4
        assert counts[("cats", "are", "animals", ".", "N", "V", "N", "N")] == 100
        assert counts[("dogs", "are", "animals", ".", "N", "V", "N", "N")] == 100
        assert counts[("snakes", "are", "animals", ".", "N", "V", "N", "N")] == 100
        assert counts[("birds", "are", "animals", ".", "N", "V", "N", "N")] == 100

    def test_multiple_epochs(self):
        reader = MultiprocessDatasetReader(base_reader=self.base_reader,
                                           num_workers=2,
                                           epochs_per_read=3)

        all_instances = []
        for instance in reader.read(self.glob):
            all_instances.append(instance)

        # 100 files * 4 sentences per file * 3 epochs
        assert len(all_instances) == 100 * 4 * 3

        counts = Counter(fingerprint(instance) for instance in all_instances)

        # should have the exact same data 100 * 3 times
        assert len(counts) == 4
        assert counts[("cats", "are", "animals", ".", "N", "V", "N", "N")] == 300
        assert counts[("dogs", "are", "animals", ".", "N", "V", "N", "N")] == 300
        assert counts[("snakes", "are", "animals", ".", "N", "V", "N", "N")] == 300
        assert counts[("birds", "are", "animals", ".", "N", "V", "N", "N")] == 300

    def test_with_iterator(self):
        reader = MultiprocessDatasetReader(base_reader=self.base_reader, num_workers=2)
        instances = reader.read(self.glob)

        iterator = BasicIterator(batch_size=32)
        iterator.index_with(self.vocab)

        batches = [batch for batch in iterator(instances, num_epochs=1)]

        # 400 instances / batch_size 32 = 12 full batches + 1 batch of 16
        sizes = sorted([len(batch['tags']) for batch in batches])
        assert sizes == [16] + 12 * [32]