def test_multiprocess_read_with_qiterable(self):
    reader = MultiprocessDatasetReader(base_reader=self.base_reader, num_workers=4)

    all_instances = []
    qiterable = reader.read(self.identical_files_glob)
    assert isinstance(qiterable, QIterable)

    # Essentially QIterable.__iter__. Broken out here as we intend it to be
    # a public interface.
    qiterable.start()
    while qiterable.num_active_workers.value > 0 or qiterable.num_inflight_items.value > 0:
        while True:
            try:
                all_instances.append(qiterable.output_queue.get(block=False, timeout=1.0))
                with qiterable.num_inflight_items.get_lock():
                    qiterable.num_inflight_items.value -= 1
            except Empty:
                break
    qiterable.join()

    # 100 files * 4 sentences / file
    assert len(all_instances) == 100 * 4

    counts = Counter(fingerprint(instance) for instance in all_instances)

    # should have the exact same data 100 times
    assert len(counts) == 4
    assert counts[("cats", "are", "animals", ".", "N", "V", "N", "N")] == 100
    assert counts[("dogs", "are", "animals", ".", "N", "V", "N", "N")] == 100
    assert counts[("snakes", "are", "animals", ".", "N", "V", "N", "N")] == 100
    assert counts[("birds", "are", "animals", ".", "N", "V", "N", "N")] == 100
def test_multiple_epochs(self):
    reader = MultiprocessDatasetReader(base_reader=self.base_reader,
                                       num_workers=2,
                                       epochs_per_read=3)

    all_instances = []
    for instance in reader.read(self.glob):
        all_instances.append(instance)

    # 100 files * 4 sentences per file * 3 epochs
    assert len(all_instances) == 100 * 4 * 3

    counts = Counter(fingerprint(instance) for instance in all_instances)

    # should have the exact same data 100 * 3 times
    assert len(counts) == 4
    assert counts[("cats", "are", "animals", ".", "N", "V", "N", "N")] == 300
    assert counts[("dogs", "are", "animals", ".", "N", "V", "N", "N")] == 300
    assert counts[("snakes", "are", "animals", ".", "N", "V", "N", "N")] == 300
    assert counts[("birds", "are", "animals", ".", "N", "V", "N", "N")] == 300
def test_with_iterator(self):
    reader = MultiprocessDatasetReader(base_reader=self.base_reader, num_workers=2)
    instances = reader.read(self.identical_files_glob)

    iterator = BasicIterator(batch_size=32)
    iterator.index_with(self.vocab)

    batches = [batch for batch in iterator(instances, num_epochs=1)]

    # 400 instances / batch_size 32 = 12 full batches + 1 batch of 16
    sizes = sorted([len(batch['tags']) for batch in batches])
    assert sizes == [16] + 12 * [32]
def test_multiprocess_reader_with_multiprocess_iterator(self):
    # use SequenceTaggingDatasetReader as the base reader
    reader = MultiprocessDatasetReader(base_reader=self.base_reader, num_workers=2)
    base_iterator = BasicIterator(batch_size=32, max_instances_in_memory=1024)

    iterator = MultiprocessIterator(base_iterator, num_workers=2)
    iterator.index_with(self.vocab)

    instances = reader.read(self.glob)
    tensor_dicts = iterator(instances, num_epochs=1)

    sizes = [len(tensor_dict['tags']) for tensor_dict in tensor_dicts]
    assert sum(sizes) == 400
def test_multiprocess_read(self):
    reader = MultiprocessDatasetReader(base_reader=self.base_reader, num_workers=4)

    all_instances = []
    for instance in reader.read(self.identical_files_glob):
        all_instances.append(instance)

    # 100 files * 4 sentences / file
    assert len(all_instances) == 100 * 4

    counts = Counter(fingerprint(instance) for instance in all_instances)

    # should have the exact same data 100 times
    assert len(counts) == 4
    assert counts[("cats", "are", "animals", ".", "N", "V", "N", "N")] == 100
    assert counts[("dogs", "are", "animals", ".", "N", "V", "N", "N")] == 100
    assert counts[("snakes", "are", "animals", ".", "N", "V", "N", "N")] == 100
    assert counts[("birds", "are", "animals", ".", "N", "V", "N", "N")] == 100
def test_multiprocess_read_partial_does_not_hang(self):
    # Use a small queue size such that the processes generating the data will block.
    reader = MultiprocessDatasetReader(base_reader=self.base_reader,
                                       num_workers=4,
                                       output_queue_size=10)

    all_instances = []

    # Half of 100 files * 4 sentences / file
    i = 0
    for instance in reader.read(self.identical_files_glob):
        # Stop early such that the processes generating the data remain
        # active (given the small queue size).
        if i == 200:
            break
        i += 1
        all_instances.append(instance)

    # This should be trivially true. The real test here is that we exit
    # normally and don't hang due to the still active processes.
    assert len(all_instances) == 200
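# The assertions above rely on a module-level `fingerprint` helper that is not shown in
# this excerpt. Below is a minimal sketch of what such a helper could look like, assuming
# the instances come from SequenceTaggingDatasetReader and therefore carry a "tokens"
# TextField and a "tags" SequenceLabelField; adjust the field names if the base reader
# differs. The imports are included so the sketch is self-contained.
from typing import Tuple

from allennlp.data.instance import Instance


def fingerprint(instance: Instance) -> Tuple[str, ...]:
    """
    Return a hashable summary of an instance (its token texts followed by its tag
    labels) so that identical instances can be tallied with a Counter.
    """
    text_tuple = tuple(token.text for token in instance.fields["tokens"].tokens)
    labels_tuple = tuple(instance.fields["tags"].labels)
    return text_tuple + labels_tuple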