예제 #1
0
    def test_multiprocess_read_with_qiterable(self):
        reader = MultiprocessDatasetReader(base_reader=self.base_reader, num_workers=4)

        all_instances = []
        qiterable = reader.read(self.identical_files_glob)
        assert isinstance(qiterable, QIterable)

        # Essentially QIterable.__iter__. Broken out here as we intend it to be
        # a public interface.
        qiterable.start()
        while qiterable.num_active_workers.value > 0 or qiterable.num_inflight_items.value > 0:
            while True:
                try:
                    all_instances.append(qiterable.output_queue.get(block=False, timeout=1.0))
                    with qiterable.num_inflight_items.get_lock():
                        qiterable.num_inflight_items.value -= 1
                except Empty:
                    break
        qiterable.join()

        # 100 files * 4 sentences / file
        assert len(all_instances) == 100 * 4

        counts = Counter(fingerprint(instance) for instance in all_instances)

        # should have the exact same data 100 times
        assert len(counts) == 4
        assert counts[("cats", "are", "animals", ".", "N", "V", "N", "N")] == 100
        assert counts[("dogs", "are", "animals", ".", "N", "V", "N", "N")] == 100
        assert counts[("snakes", "are", "animals", ".", "N", "V", "N", "N")] == 100
        assert counts[("birds", "are", "animals", ".", "N", "V", "N", "N")] == 100
예제 #2
0
    def test_multiple_epochs(self):
        reader = MultiprocessDatasetReader(base_reader=self.base_reader,
                                           num_workers=2,
                                           epochs_per_read=3)

        all_instances = []

        for instance in reader.read(self.glob):
            all_instances.append(instance)

        # 100 files * 4 sentences per file * 3 epochs
        assert len(all_instances) == 100 * 4 * 3

        counts = Counter(fingerprint(instance) for instance in all_instances)

        # should have the exact same data 100 * 3 times
        assert len(counts) == 4
        assert counts[("cats", "are", "animals", ".", "N", "V", "N",
                       "N")] == 300
        assert counts[("dogs", "are", "animals", ".", "N", "V", "N",
                       "N")] == 300
        assert counts[("snakes", "are", "animals", ".", "N", "V", "N",
                       "N")] == 300
        assert counts[("birds", "are", "animals", ".", "N", "V", "N",
                       "N")] == 300
예제 #3
0
    def test_with_iterator(self):
        reader = MultiprocessDatasetReader(base_reader=self.base_reader, num_workers=2)
        instances = reader.read(self.identical_files_glob)

        iterator = BasicIterator(batch_size=32)
        iterator.index_with(self.vocab)

        batches = [batch for batch in iterator(instances, num_epochs=1)]

        # 400 instances / batch_size 32 = 12 full batches + 1 batch of 16
        sizes = sorted([len(batch['tags']) for batch in batches])
        assert sizes == [16] + 12 * [32]
    def test_with_iterator(self):
        reader = MultiprocessDatasetReader(base_reader=self.base_reader, num_workers=2)
        instances = reader.read(self.glob)

        iterator = BasicIterator(batch_size=32)
        iterator.index_with(self.vocab)

        batches = [batch for batch in iterator(instances, num_epochs=1)]

        # 400 instances / batch_size 32 = 12 full batches + 1 batch of 16
        sizes = sorted([len(batch['tags']) for batch in batches])
        assert sizes == [16] + 12 * [32]
    def test_multiprocess_reader_with_multiprocess_iterator(self):
        # use SequenceTaggingDatasetReader as the base reader
        reader = MultiprocessDatasetReader(base_reader=self.base_reader, num_workers=2)
        base_iterator = BasicIterator(batch_size=32, max_instances_in_memory=1024)

        iterator = MultiprocessIterator(base_iterator, num_workers=2)
        iterator.index_with(self.vocab)

        instances = reader.read(self.glob)

        tensor_dicts = iterator(instances, num_epochs=1)
        sizes = [len(tensor_dict['tags']) for tensor_dict in tensor_dicts]
        assert sum(sizes) == 400
    def test_multiprocess_reader_with_multiprocess_iterator(self):
        # use SequenceTaggingDatasetReader as the base reader
        reader = MultiprocessDatasetReader(base_reader=self.base_reader, num_workers=2)
        base_iterator = BasicIterator(batch_size=32, max_instances_in_memory=1024)

        iterator = MultiprocessIterator(base_iterator, num_workers=2)
        iterator.index_with(self.vocab)

        instances = reader.read(self.glob)

        tensor_dicts = iterator(instances, num_epochs=1)
        sizes = [len(tensor_dict["tags"]) for tensor_dict in tensor_dicts]
        assert sum(sizes) == 400
예제 #7
0
    def test_multiprocess_read_in_subprocess_is_deterministic(self):
        reader = MultiprocessDatasetReader(base_reader=self.base_reader, num_workers=1)
        q = Queue()
        def read():
            for instance in reader.read(self.distinct_files_glob):
                q.put(fingerprint(instance))

        # Ensure deterministic shuffling.
        np.random.seed(0)
        p = Process(target=read)
        p.start()
        p.join()

        # Convert queue to list.
        actual_fingerprints = []
        while not q.empty():
            actual_fingerprints.append(q.get(block=False))

        assert len(actual_fingerprints) == 100

        expected_fingerprints = []
        for instance in self.base_reader.read(self.all_distinct_path):
            expected_fingerprints.append(fingerprint(instance))

        np.random.seed(0)
        expected_fingerprints.sort()
        # This should be shuffled into exactly the same order as actual_fingerprints.
        np.random.shuffle(expected_fingerprints)

        assert actual_fingerprints == expected_fingerprints
예제 #8
0
    def test_multiprocess_read(self):
        reader = MultiprocessDatasetReader(base_reader=self.base_reader, num_workers=4)

        all_instances = []

        for instance in reader.read(self.identical_files_glob):
            all_instances.append(instance)

        # 100 files * 4 sentences / file
        assert len(all_instances) == 100 * 4

        counts = Counter(fingerprint(instance) for instance in all_instances)

        # should have the exact same data 100 times
        assert len(counts) == 4
        assert counts[("cats", "are", "animals", ".", "N", "V", "N", "N")] == 100
        assert counts[("dogs", "are", "animals", ".", "N", "V", "N", "N")] == 100
        assert counts[("snakes", "are", "animals", ".", "N", "V", "N", "N")] == 100
        assert counts[("birds", "are", "animals", ".", "N", "V", "N", "N")] == 100
    def test_multiprocess_read(self):
        reader = MultiprocessDatasetReader(base_reader=self.base_reader, num_workers=4)

        all_instances = []

        for instance in reader.read(self.glob):
            all_instances.append(instance)

        # 100 files * 4 sentences / file
        assert len(all_instances) == 100 * 4

        counts = Counter(fingerprint(instance) for instance in all_instances)

        # should have the exact same data 100 times
        assert len(counts) == 4
        assert counts[("cats", "are", "animals", ".", "N", "V", "N", "N")] == 100
        assert counts[("dogs", "are", "animals", ".", "N", "V", "N", "N")] == 100
        assert counts[("snakes", "are", "animals", ".", "N", "V", "N", "N")] == 100
        assert counts[("birds", "are", "animals", ".", "N", "V", "N", "N")] == 100
예제 #10
0
    def test_multiprocess_read_partial_does_not_hang(self):
        # Use a small queue size such that the processes generating the data will block.
        reader = MultiprocessDatasetReader(base_reader=self.base_reader, num_workers=4, output_queue_size=10)

        all_instances = []

        # Half of 100 files * 4 sentences / file
        i = 0
        for instance in reader.read(self.identical_files_glob):
            # Stop early such that the processes generating the data remain
            # active (given the small queue size).
            if i == 200:
                break
            i += 1
            all_instances.append(instance)

        # This should be trivially true. The real test here is that we exit
        # normally and don't hang due to the still active processes.
        assert len(all_instances) == 200