Example #1
    def test_multiprocess_read_with_qiterable(self):
        reader = MultiprocessDatasetReader(base_reader=self.base_reader, num_workers=4)

        all_instances = []
        qiterable = reader.read(self.identical_files_glob)
        assert isinstance(qiterable, QIterable)

        # Essentially QIterable.__iter__. Broken out here as we intend it to be
        # a public interface.
        qiterable.start()
        while qiterable.num_active_workers.value > 0 or qiterable.num_inflight_items.value > 0:
            while True:
                try:
                    all_instances.append(qiterable.output_queue.get(block=False, timeout=1.0))
                    with qiterable.num_inflight_items.get_lock():
                        qiterable.num_inflight_items.value -= 1
                except Empty:
                    break
        qiterable.join()

        # 100 files * 4 sentences / file
        assert len(all_instances) == 100 * 4

        counts = Counter(fingerprint(instance) for instance in all_instances)

        # should have the exact same data 100 times
        assert len(counts) == 4
        assert counts[("cats", "are", "animals", ".", "N", "V", "N", "N")] == 100
        assert counts[("dogs", "are", "animals", ".", "N", "V", "N", "N")] == 100
        assert counts[("snakes", "are", "animals", ".", "N", "V", "N", "N")] == 100
        assert counts[("birds", "are", "animals", ".", "N", "V", "N", "N")] == 100
Example #2
    def test_multiple_epochs(self):
        reader = MultiprocessDatasetReader(base_reader=self.base_reader,
                                           num_workers=2,
                                           epochs_per_read=3)

        all_instances = []

        for instance in reader.read(self.glob):
            all_instances.append(instance)

        # 100 files * 4 sentences per file * 3 epochs
        assert len(all_instances) == 100 * 4 * 3

        counts = Counter(fingerprint(instance) for instance in all_instances)

        # should have the exact same data 100 * 3 times
        assert len(counts) == 4
        assert counts[("cats", "are", "animals", ".", "N", "V", "N",
                       "N")] == 300
        assert counts[("dogs", "are", "animals", ".", "N", "V", "N",
                       "N")] == 300
        assert counts[("snakes", "are", "animals", ".", "N", "V", "N",
                       "N")] == 300
        assert counts[("birds", "are", "animals", ".", "N", "V", "N",
                       "N")] == 300
Example #3
    def test_with_iterator(self):
        reader = MultiprocessDatasetReader(base_reader=self.base_reader, num_workers=2)
        instances = reader.read(self.identical_files_glob)

        iterator = BasicIterator(batch_size=32)
        iterator.index_with(self.vocab)

        batches = [batch for batch in iterator(instances, num_epochs=1)]

        # 400 instances / batch_size 32 = 12 full batches + 1 batch of 16
        sizes = sorted([len(batch['tags']) for batch in batches])
        assert sizes == [16] + 12 * [32]
Example #4
    def test_with_iterator(self):
        reader = MultiprocessDatasetReader(base_reader=self.base_reader, num_workers=2)
        instances = reader.read(self.glob)

        iterator = BasicIterator(batch_size=32)
        iterator.index_with(self.vocab)

        batches = [batch for batch in iterator(instances, num_epochs=1)]

        # 400 instances / batch_size 32 = 12 full batches + 1 batch of 16
        sizes = sorted([len(batch['tags']) for batch in batches])
        assert sizes == [16] + 12 * [32]
Example #5
    def test_multiprocess_reader_with_multiprocess_iterator(self):
        # use SequenceTaggingDatasetReader as the base reader
        reader = MultiprocessDatasetReader(base_reader=self.base_reader, num_workers=2)
        base_iterator = BasicIterator(batch_size=32, max_instances_in_memory=1024)

        iterator = MultiprocessIterator(base_iterator, num_workers=2)
        iterator.index_with(self.vocab)

        instances = reader.read(self.glob)

        tensor_dicts = iterator(instances, num_epochs=1)
        sizes = [len(tensor_dict['tags']) for tensor_dict in tensor_dicts]
        assert sum(sizes) == 400
Example #6
    def test_multiprocess_reader_with_multiprocess_iterator(self):
        # use SequenceTaggingDatasetReader as the base reader
        reader = MultiprocessDatasetReader(base_reader=self.base_reader, num_workers=2)
        base_iterator = BasicIterator(batch_size=32, max_instances_in_memory=1024)

        iterator = MultiprocessIterator(base_iterator, num_workers=2)
        iterator.index_with(self.vocab)

        instances = reader.read(self.glob)

        tensor_dicts = iterator(instances, num_epochs=1)
        sizes = [len(tensor_dict["tags"]) for tensor_dict in tensor_dicts]
        assert sum(sizes) == 400
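The iterator examples call iterator.index_with(self.vocab) without showing where the vocabulary comes from. One plausible sketch, assuming it is built in a single pass over the underlying (non-multiprocess) reader's output; the path is a placeholder:

from allennlp.data import Vocabulary
from allennlp.data.dataset_readers import SequenceTaggingDatasetReader

# Build the vocabulary once from the base reader so that the token and tag
# namespaces exist before the iterator indexes instances into tensors.
base_reader = SequenceTaggingDatasetReader(lazy=True)
vocab = Vocabulary.from_instances(base_reader.read("fixtures/sequence_tagging.tsv"))  # hypothetical path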
Example #7
    def test_multiprocess_read(self):
        reader = MultiprocessDatasetReader(base_reader=self.base_reader, num_workers=4)

        all_instances = []

        for instance in reader.read(self.identical_files_glob):
            all_instances.append(instance)

        # 100 files * 4 sentences / file
        assert len(all_instances) == 100 * 4

        counts = Counter(fingerprint(instance) for instance in all_instances)

        # should have the exact same data 100 times
        assert len(counts) == 4
        assert counts[("cats", "are", "animals", ".", "N", "V", "N", "N")] == 100
        assert counts[("dogs", "are", "animals", ".", "N", "V", "N", "N")] == 100
        assert counts[("snakes", "are", "animals", ".", "N", "V", "N", "N")] == 100
        assert counts[("birds", "are", "animals", ".", "N", "V", "N", "N")] == 100
Example #8
    def test_multiprocess_read(self):
        reader = MultiprocessDatasetReader(base_reader=self.base_reader, num_workers=4)

        all_instances = []

        for instance in reader.read(self.glob):
            all_instances.append(instance)

        # 100 files * 4 sentences / file
        assert len(all_instances) == 100 * 4

        counts = Counter(fingerprint(instance) for instance in all_instances)

        # should have the exact same data 100 times
        assert len(counts) == 4
        assert counts[("cats", "are", "animals", ".", "N", "V", "N", "N")] == 100
        assert counts[("dogs", "are", "animals", ".", "N", "V", "N", "N")] == 100
        assert counts[("snakes", "are", "animals", ".", "N", "V", "N", "N")] == 100
        assert counts[("birds", "are", "animals", ".", "N", "V", "N", "N")] == 100
Example #9
    def test_multiprocess_read_partial_does_not_hang(self):
        # Use a small queue size such that the processes generating the data will block.
        reader = MultiprocessDatasetReader(base_reader=self.base_reader, num_workers=4, output_queue_size=10)

        all_instances = []

        # Half of 100 files * 4 sentences / file
        i = 0
        for instance in reader.read(self.identical_files_glob):
            # Stop early such that the processes generating the data remain
            # active (given the small queue size).
            if i == 200:
                break
            i += 1
            all_instances.append(instance)

        # This should be trivially true. The real test here is that we exit
        # normally and don't hang due to the still active processes.
        assert len(all_instances) == 200
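The real assertion in this last example is behavioral: the read returns instead of hanging while worker processes are still active. One way to make a regression visible in CI rather than blocking the suite is to bound the test's runtime. A sketch assuming the third-party pytest-timeout plugin is installed and a hypothetical reader/glob fixture pair:

import itertools

import pytest


@pytest.mark.timeout(60)  # fail instead of hanging forever; requires pytest-timeout
def test_partial_read_finishes(reader, identical_files_glob):  # hypothetical fixtures
    # islice stops after 200 instances while the worker processes are still
    # producing, mirroring the manual counter-based early exit above.
    partial = list(itertools.islice(reader.read(identical_files_glob), 200))
    assert len(partial) == 200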