class TestMultiprocessIterator(IteratorTest):
    def setUp(self):
        super().setUp()

        self.base_reader = SequenceTaggingDatasetReader(lazy=True)
        base_file_path = AllenNlpTestCase.FIXTURES_ROOT / "data" / "sequence_tagging.tsv"

        # Make 100 copies of the data
        raw_data = open(base_file_path).read()
        for i in range(100):
            file_path = self.TEST_DIR / f"sequence_tagging_{i}.tsv"
            with open(file_path, "w") as f:
                f.write(raw_data)

        self.glob = str(self.TEST_DIR / "sequence_tagging_*.tsv")

        # For some of the tests we need a vocab, we'll just use the base_reader for that.
        self.vocab = Vocabulary.from_instances(self.base_reader.read(str(base_file_path)))

    def test_yield_one_epoch_iterates_over_the_data_once(self):
        for test_instances in (self.instances, self.lazy_instances):
            base_iterator = BasicIterator(batch_size=2, max_instances_in_memory=1024)
            iterator = MultiprocessIterator(base_iterator, num_workers=4)
            iterator.index_with(self.vocab)
            batches = list(iterator(test_instances, num_epochs=1))
            # We just want to get the single-token array for the text field in the instance.
            instances = [
                tuple(instance.detach().cpu().numpy())
                for batch in batches
                for instance in batch["text"]["tokens"]["tokens"]
            ]
            assert len(instances) == 5

    def test_multiprocess_iterate_partial_does_not_hang(self):
        for test_instances in (self.instances, self.lazy_instances):
            base_iterator = BasicIterator(batch_size=2, max_instances_in_memory=1024)
            iterator = MultiprocessIterator(base_iterator, num_workers=4)
            iterator.index_with(self.vocab)
            generator = iterator(test_instances, num_epochs=1)
            # We only iterate through 3 of the 5 instances causing the
            # processes generating the tensors to remain active.
            for _ in range(3):
                next(generator)
            # The real test here is that we exit normally and don't hang due to
            # the still active processes.

    def test_multiprocess_reader_with_multiprocess_iterator(self):
        # use SequenceTaggingDatasetReader as the base reader
        reader = MultiprocessDatasetReader(base_reader=self.base_reader, num_workers=2)
        base_iterator = BasicIterator(batch_size=32, max_instances_in_memory=1024)

        iterator = MultiprocessIterator(base_iterator, num_workers=2)
        iterator.index_with(self.vocab)

        instances = reader.read(self.glob)

        tensor_dicts = iterator(instances, num_epochs=1)
        sizes = [len(tensor_dict["tags"]) for tensor_dict in tensor_dicts]
        assert sum(sizes) == 400
Example #2
class TestMultiprocessIterator(IteratorTest):
    def setUp(self):
        super().setUp()

        self.base_reader = SequenceTaggingDatasetReader(lazy=True)
        base_file_path = AllenNlpTestCase.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv'

        # Make 100 copies of the data
        raw_data = open(base_file_path).read()
        for i in range(100):
            file_path = self.TEST_DIR / f'sequence_tagging_{i}.tsv'
            with open(file_path, 'w') as f:
                f.write(raw_data)

        self.glob = str(self.TEST_DIR / 'sequence_tagging_*.tsv')

        # For some of the tests we need a vocab, we'll just use the base_reader for that.
        self.vocab = Vocabulary.from_instances(
            self.base_reader.read(str(base_file_path)))

    def test_yield_one_epoch_iterates_over_the_data_once(self):
        for test_instances in (self.instances, self.lazy_instances):
            base_iterator = BasicIterator(batch_size=2,
                                          max_instances_in_memory=1024)
            iterator = MultiprocessIterator(base_iterator, num_workers=4)
            iterator.index_with(self.vocab)
            batches = list(iterator(test_instances, num_epochs=1))
            # We just want to get the single-token array for the text field in the instance.
            instances = [
                tuple(instance.detach().cpu().numpy()) for batch in batches
                for instance in batch['text']["tokens"]
            ]
            assert len(instances) == 5

    def test_multiprocess_reader_with_multiprocess_iterator(self):
        # use SequenceTaggingDatasetReader as the base reader
        reader = MultiprocessDatasetReader(base_reader=self.base_reader,
                                           num_workers=2)
        base_iterator = BasicIterator(batch_size=32,
                                      max_instances_in_memory=1024)

        iterator = MultiprocessIterator(base_iterator, num_workers=2)
        iterator.index_with(self.vocab)

        instances = reader.read(self.glob)

        tensor_dicts = iterator(instances, num_epochs=1)
        sizes = [len(tensor_dict['tags']) for tensor_dict in tensor_dicts]
        assert sum(sizes) == 400
Example #3
    def test_default_format(self):
        reader = SequenceTaggingDatasetReader()
        dataset = reader.read('tests/fixtures/data/sequence_tagging.tsv')

        assert len(dataset.instances) == 4
        fields = dataset.instances[0].fields
        assert fields["tokens"].tokens == ["cats", "are", "animals", "."]
        assert fields["tags"].labels == ["N", "V", "N", "N"]
        fields = dataset.instances[1].fields
        assert fields["tokens"].tokens == ["dogs", "are", "animals", "."]
        assert fields["tags"].labels == ["N", "V", "N", "N"]
        fields = dataset.instances[2].fields
        assert fields["tokens"].tokens == ["snakes", "are", "animals", "."]
        assert fields["tags"].labels == ["N", "V", "N", "N"]
        fields = dataset.instances[3].fields
        assert fields["tokens"].tokens == ["birds", "are", "animals", "."]
        assert fields["tags"].labels == ["N", "V", "N", "N"]
Example #4
    def test_brown_corpus_format(self):
        reader = SequenceTaggingDatasetReader(word_tag_delimiter='/')
        dataset = reader.read('tests/fixtures/data/brown_corpus.txt')

        assert len(dataset.instances) == 4
        fields = dataset.instances[0].fields
        assert fields["tokens"].tokens == ["cats", "are", "animals", "."]
        assert fields["tags"].labels == ["N", "V", "N", "N"]
        fields = dataset.instances[1].fields
        assert fields["tokens"].tokens == ["dogs", "are", "animals", "."]
        assert fields["tags"].labels == ["N", "V", "N", "N"]
        fields = dataset.instances[2].fields
        assert fields["tokens"].tokens == ["snakes", "are", "animals", "."]
        assert fields["tags"].labels == ["N", "V", "N", "N"]
        fields = dataset.instances[3].fields
        assert fields["tokens"].tokens == ["birds", "are", "animals", "."]
        assert fields["tags"].labels == ["N", "V", "N", "N"]
class TestMultiprocessIterator(IteratorTest):
    def setUp(self):
        super().setUp()

        self.base_reader = SequenceTaggingDatasetReader(lazy=True)
        base_file_path = AllenNlpTestCase.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv'

        # Make 100 copies of the data
        raw_data = open(base_file_path).read()
        for i in range(100):
            file_path = self.TEST_DIR / f'sequence_tagging_{i}.tsv'
            with open(file_path, 'w') as f:
                f.write(raw_data)

        self.glob = str(self.TEST_DIR / 'sequence_tagging_*.tsv')

        # For some of the tests we need a vocab, we'll just use the base_reader for that.
        self.vocab = Vocabulary.from_instances(self.base_reader.read(str(base_file_path)))

    def test_yield_one_epoch_iterates_over_the_data_once(self):
        for test_instances in (self.instances, self.lazy_instances):
            base_iterator = BasicIterator(batch_size=2, max_instances_in_memory=1024)
            iterator = MultiprocessIterator(base_iterator, num_workers=4)
            iterator.index_with(self.vocab)
            batches = list(iterator(test_instances, num_epochs=1))
            # We just want to get the single-token array for the text field in the instance.
            instances = [tuple(instance.detach().cpu().numpy())
                         for batch in batches
                         for instance in batch['text']["tokens"]]
            assert len(instances) == 5

    def test_multiprocess_reader_with_multiprocess_iterator(self):
        # use SequenceTaggingDatasetReader as the base reader
        reader = MultiprocessDatasetReader(base_reader=self.base_reader, num_workers=2)
        base_iterator = BasicIterator(batch_size=32, max_instances_in_memory=1024)

        iterator = MultiprocessIterator(base_iterator, num_workers=2)
        iterator.index_with(self.vocab)

        instances = reader.read(self.glob)

        tensor_dicts = iterator(instances, num_epochs=1)
        sizes = [len(tensor_dict['tags']) for tensor_dict in tensor_dicts]
        assert sum(sizes) == 400
    @pytest.mark.parametrize("lazy", (True, False))
    def test_default_format(self, lazy):
        reader = SequenceTaggingDatasetReader(lazy=lazy)
        instances = reader.read('tests/fixtures/data/sequence_tagging.tsv')
        instances = ensure_list(instances)

        assert len(instances) == 4
        fields = instances[0].fields
        assert [t.text for t in fields["tokens"].tokens] == ["cats", "are", "animals", "."]
        assert fields["tags"].labels == ["N", "V", "N", "N"]
        fields = instances[1].fields
        assert [t.text for t in fields["tokens"].tokens] == ["dogs", "are", "animals", "."]
        assert fields["tags"].labels == ["N", "V", "N", "N"]
        fields = instances[2].fields
        assert [t.text for t in fields["tokens"].tokens] == ["snakes", "are", "animals", "."]
        assert fields["tags"].labels == ["N", "V", "N", "N"]
        fields = instances[3].fields
        assert [t.text for t in fields["tokens"].tokens] == ["birds", "are", "animals", "."]
        assert fields["tags"].labels == ["N", "V", "N", "N"]
    def test_brown_corpus_format(self):
        reader = SequenceTaggingDatasetReader(word_tag_delimiter='/')
        instances = reader.read('tests/fixtures/data/brown_corpus.txt')
        instances = ensure_list(instances)

        assert len(instances) == 4
        fields = instances[0].fields
        assert [t.text for t in fields["tokens"].tokens] == ["cats", "are", "animals", "."]
        assert fields["tags"].labels == ["N", "V", "N", "N"]
        fields = instances[1].fields
        assert [t.text for t in fields["tokens"].tokens] == ["dogs", "are", "animals", "."]
        assert fields["tags"].labels == ["N", "V", "N", "N"]
        fields = instances[2].fields
        assert [t.text for t in fields["tokens"].tokens] == ["snakes", "are", "animals", "."]
        assert fields["tags"].labels == ["N", "V", "N", "N"]
        fields = instances[3].fields
        assert [t.text for t in fields["tokens"].tokens] == ["birds", "are", "animals", "."]
        assert fields["tags"].labels == ["N", "V", "N", "N"]
    def test_read_from_file(self):
        reader = SequenceTaggingDatasetReader()
        dataset = reader.read(self.TRAIN_FILE)

        assert len(dataset.instances) == 4
        fields = dataset.instances[0].fields()
        assert fields["tokens"].tokens() == ["cats", "are", "animals", "."]
        assert fields["tags"].tags() == ["N", "V", "N", "N"]
        fields = dataset.instances[1].fields()
        assert fields["tokens"].tokens() == ["dogs", "are", "animals", "."]
        assert fields["tags"].tags() == ["N", "V", "N", "N"]
        fields = dataset.instances[2].fields()
        assert fields["tokens"].tokens() == ["snakes", "are", "animals", "."]
        assert fields["tags"].tags() == ["N", "V", "N", "N"]
        fields = dataset.instances[3].fields()
        assert fields["tokens"].tokens() == ["birds", "are", "animals", "."]
        assert fields["tags"].tags() == ["N", "V", "N", "N"]
Example #9
    def test_brown_corpus_format(self):
        reader = SequenceTaggingDatasetReader(word_tag_delimiter=u'/')
        instances = reader.read(AllenNlpTestCase.FIXTURES_ROOT / u'data' / u'brown_corpus.txt')
        instances = ensure_list(instances)

        assert len(instances) == 4
        fields = instances[0].fields
        assert [t.text for t in fields[u"tokens"].tokens] == [u"cats", u"are", u"animals", u"."]
        assert fields[u"tags"].labels == [u"N", u"V", u"N", u"N"]
        fields = instances[1].fields
        assert [t.text for t in fields[u"tokens"].tokens] == [u"dogs", u"are", u"animals", u"."]
        assert fields[u"tags"].labels == [u"N", u"V", u"N", u"N"]
        fields = instances[2].fields
        assert [t.text for t in fields[u"tokens"].tokens] == [u"snakes", u"are", u"animals", u"."]
        assert fields[u"tags"].labels == [u"N", u"V", u"N", u"N"]
        fields = instances[3].fields
        assert [t.text for t in fields[u"tokens"].tokens] == [u"birds", u"are", u"animals", u"."]
        assert fields[u"tags"].labels == [u"N", u"V", u"N", u"N"]
Example #10
    @pytest.mark.parametrize("lazy", (True, False))
    def test_default_format(self, lazy):
        reader = SequenceTaggingDatasetReader(lazy=lazy)
        instances = reader.read(AllenNlpTestCase.FIXTURES_ROOT / u'data' / u'sequence_tagging.tsv')
        instances = ensure_list(instances)

        assert len(instances) == 4
        fields = instances[0].fields
        assert [t.text for t in fields[u"tokens"].tokens] == [u"cats", u"are", u"animals", u"."]
        assert fields[u"tags"].labels == [u"N", u"V", u"N", u"N"]
        fields = instances[1].fields
        assert [t.text for t in fields[u"tokens"].tokens] == [u"dogs", u"are", u"animals", u"."]
        assert fields[u"tags"].labels == [u"N", u"V", u"N", u"N"]
        fields = instances[2].fields
        assert [t.text for t in fields[u"tokens"].tokens] == [u"snakes", u"are", u"animals", u"."]
        assert fields[u"tags"].labels == [u"N", u"V", u"N", u"N"]
        fields = instances[3].fields
        assert [t.text for t in fields[u"tokens"].tokens] == [u"birds", u"are", u"animals", u"."]
        assert fields[u"tags"].labels == [u"N", u"V", u"N", u"N"]
Example #11
    def test_default_format(self):
        reader = SequenceTaggingDatasetReader(max_instances=4)
        instances = list(
            reader.read(AllenNlpTestCase.FIXTURES_ROOT / "data" /
                        "sequence_tagging.tsv"))

        assert len(instances) == 4
        fields = instances[0].fields
        assert [t.text for t in fields["tokens"].tokens
                ] == ["cats", "are", "animals", "."]
        assert fields["tags"].labels == ["N", "V", "N", "N"]
        fields = instances[1].fields
        assert [t.text for t in fields["tokens"].tokens
                ] == ["dogs", "are", "animals", "."]
        assert fields["tags"].labels == ["N", "V", "N", "N"]
        fields = instances[2].fields
        assert [t.text for t in fields["tokens"].tokens
                ] == ["snakes", "are", "animals", "."]
        assert fields["tags"].labels == ["N", "V", "N", "N"]
        fields = instances[3].fields
        assert [t.text for t in fields["tokens"].tokens
                ] == ["birds", "are", "animals", "."]
        assert fields["tags"].labels == ["N", "V", "N", "N"]
Example #12
    def test_brown_corpus_format(self):
        reader = SequenceTaggingDatasetReader(word_tag_delimiter="/")
        instances = list(
            reader.read(AllenNlpTestCase.FIXTURES_ROOT / "data" /
                        "brown_corpus.txt"))

        assert len(instances) == 4
        fields = instances[0].fields
        assert [t.text for t in fields["tokens"].tokens
                ] == ["cats", "are", "animals", "."]
        assert fields["tags"].labels == ["N", "V", "N", "N"]
        fields = instances[1].fields
        assert [t.text for t in fields["tokens"].tokens
                ] == ["dogs", "are", "animals", "."]
        assert fields["tags"].labels == ["N", "V", "N", "N"]
        fields = instances[2].fields
        assert [t.text for t in fields["tokens"].tokens
                ] == ["snakes", "are", "animals", "."]
        assert fields["tags"].labels == ["N", "V", "N", "N"]
        fields = instances[3].fields
        assert [t.text for t in fields["tokens"].tokens
                ] == ["birds", "are", "animals", "."]
        assert fields["tags"].labels == ["N", "V", "N", "N"]
Example #13
class TestMultiprocessDatasetReader(AllenNlpTestCase):
    def setUp(self) -> None:
        super().setUp()

        # use SequenceTaggingDatasetReader as the base reader
        self.base_reader = SequenceTaggingDatasetReader(lazy=True)
        base_file_path = AllenNlpTestCase.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv'

        # Make 100 copies of the data
        raw_data = open(base_file_path).read()
        for i in range(100):
            file_path = self.TEST_DIR / f'identical_{i}.tsv'
            with open(file_path, 'w') as f:
                f.write(raw_data)

        self.all_distinct_path = str(self.TEST_DIR / 'all_distinct.tsv')
        with open(self.all_distinct_path, 'w') as all_distinct:
            for i in range(100):
                file_path = self.TEST_DIR / f'distinct_{i}.tsv'
                line = f"This###DT\tis###VBZ\tsentence###NN\t{i}###CD\t.###.\n"
                with open(file_path, 'w') as f:
                    f.write(line)
                all_distinct.write(line)

        self.identical_files_glob = str(self.TEST_DIR / 'identical_*.tsv')
        self.distinct_files_glob = str(self.TEST_DIR / 'distinct_*.tsv')

        # For some of the tests we need a vocab, we'll just use the base_reader for that.
        self.vocab = Vocabulary.from_instances(self.base_reader.read(str(base_file_path)))

    def test_multiprocess_read(self):
        reader = MultiprocessDatasetReader(base_reader=self.base_reader, num_workers=4)

        all_instances = []

        for instance in reader.read(self.identical_files_glob):
            all_instances.append(instance)

        # 100 files * 4 sentences / file
        assert len(all_instances) == 100 * 4

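        # `fingerprint` is a helper defined elsewhere in this test module; it is assumed to
        # reduce an instance to a hashable tuple of its token texts and tags, so identical
        # sentences produce identical keys in the Counter below.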
        counts = Counter(fingerprint(instance) for instance in all_instances)

        # should have the exact same data 100 times
        assert len(counts) == 4
        assert counts[("cats", "are", "animals", ".", "N", "V", "N", "N")] == 100
        assert counts[("dogs", "are", "animals", ".", "N", "V", "N", "N")] == 100
        assert counts[("snakes", "are", "animals", ".", "N", "V", "N", "N")] == 100
        assert counts[("birds", "are", "animals", ".", "N", "V", "N", "N")] == 100

    def test_multiprocess_read_in_subprocess_is_deterministic(self):
        reader = MultiprocessDatasetReader(base_reader=self.base_reader, num_workers=1)
        q = Queue()
        def read():
            for instance in reader.read(self.distinct_files_glob):
                q.put(fingerprint(instance))

        # Ensure deterministic shuffling.
        np.random.seed(0)
        p = Process(target=read)
        p.start()
        p.join()

        # Convert queue to list.
        actual_fingerprints = []
        while not q.empty():
            actual_fingerprints.append(q.get(block=False))

        assert len(actual_fingerprints) == 100

        expected_fingerprints = []
        for instance in self.base_reader.read(self.all_distinct_path):
            expected_fingerprints.append(fingerprint(instance))

        np.random.seed(0)
        expected_fingerprints.sort()
        # This should be shuffled into exactly the same order as actual_fingerprints.
        np.random.shuffle(expected_fingerprints)

        assert actual_fingerprints == expected_fingerprints

    def test_multiple_epochs(self):
        reader = MultiprocessDatasetReader(base_reader=self.base_reader,
                                           num_workers=2,
                                           epochs_per_read=3)

        all_instances = []

        for instance in reader.read(self.identical_files_glob):
            all_instances.append(instance)

        # 100 files * 4 sentences per file * 3 epochs
        assert len(all_instances) == 100 * 4 * 3

        counts = Counter(fingerprint(instance) for instance in all_instances)

        # should have the exact same data 100 * 3 times
        assert len(counts) == 4
        assert counts[("cats", "are", "animals", ".", "N", "V", "N", "N")] == 300
        assert counts[("dogs", "are", "animals", ".", "N", "V", "N", "N")] == 300
        assert counts[("snakes", "are", "animals", ".", "N", "V", "N", "N")] == 300
        assert counts[("birds", "are", "animals", ".", "N", "V", "N", "N")] == 300

    def test_with_iterator(self):
        reader = MultiprocessDatasetReader(base_reader=self.base_reader, num_workers=2)
        instances = reader.read(self.identical_files_glob)

        iterator = BasicIterator(batch_size=32)
        iterator.index_with(self.vocab)

        batches = [batch for batch in iterator(instances, num_epochs=1)]

        # 400 instances / batch_size 32 = 12 full batches + 1 batch of 16
        sizes = sorted([len(batch['tags']) for batch in batches])
        assert sizes == [16] + 12 * [32]
Example #14
class TestMultiprocessDatasetReader(AllenNlpTestCase):
    def setUp(self) -> None:
        super().setUp()

        # use SequenceTaggingDatasetReader as the base reader
        self.base_reader = SequenceTaggingDatasetReader(lazy=True)
        base_file_path = AllenNlpTestCase.FIXTURES_ROOT / "data" / "sequence_tagging.tsv"

        # Make 100 copies of the data
        raw_data = open(base_file_path).read()
        for i in range(100):
            file_path = self.TEST_DIR / f"identical_{i}.tsv"
            with open(file_path, "w") as f:
                f.write(raw_data)

        self.all_distinct_path = str(self.TEST_DIR / "all_distinct.tsv")
        with open(self.all_distinct_path, "w") as all_distinct:
            for i in range(100):
                file_path = self.TEST_DIR / f"distinct_{i}.tsv"
                line = f"This###DT\tis###VBZ\tsentence###NN\t{i}###CD\t.###.\n"
                with open(file_path, "w") as f:
                    f.write(line)
                all_distinct.write(line)

        self.identical_files_glob = str(self.TEST_DIR / "identical_*.tsv")
        self.distinct_files_glob = str(self.TEST_DIR / "distinct_*.tsv")

        # For some of the tests we need a vocab, we'll just use the base_reader for that.
        self.vocab = Vocabulary.from_instances(self.base_reader.read(str(base_file_path)))

    def test_multiprocess_read(self):
        reader = MultiprocessDatasetReader(base_reader=self.base_reader, num_workers=4)

        all_instances = []

        for instance in reader.read(self.identical_files_glob):
            all_instances.append(instance)

        # 100 files * 4 sentences / file
        assert len(all_instances) == 100 * 4

        counts = Counter(fingerprint(instance) for instance in all_instances)

        # should have the exact same data 100 times
        assert len(counts) == 4
        assert counts[("cats", "are", "animals", ".", "N", "V", "N", "N")] == 100
        assert counts[("dogs", "are", "animals", ".", "N", "V", "N", "N")] == 100
        assert counts[("snakes", "are", "animals", ".", "N", "V", "N", "N")] == 100
        assert counts[("birds", "are", "animals", ".", "N", "V", "N", "N")] == 100

    def test_multiprocess_read_partial_does_not_hang(self):
        # Use a small queue size such that the processes generating the data will block.
        reader = MultiprocessDatasetReader(
            base_reader=self.base_reader, num_workers=4, output_queue_size=10
        )

        all_instances = []

        # Half of 100 files * 4 sentences / file
        i = 0
        for instance in reader.read(self.identical_files_glob):
            # Stop early such that the processes generating the data remain
            # active (given the small queue size).
            if i == 200:
                break
            i += 1
            all_instances.append(instance)

        # This should be trivially true. The real test here is that we exit
        # normally and don't hang due to the still active processes.
        assert len(all_instances) == 200

    def test_multiprocess_read_with_qiterable(self):
        reader = MultiprocessDatasetReader(base_reader=self.base_reader, num_workers=4)

        all_instances = []
        qiterable = reader.read(self.identical_files_glob)
        assert isinstance(qiterable, QIterable)

        # Essentially QIterable.__iter__. Broken out here as we intend it to be
        # a public interface.
        qiterable.start()
        while qiterable.num_active_workers.value > 0 or qiterable.num_inflight_items.value > 0:
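            # Drain whatever is currently in the output queue, decrementing the in-flight
            # counter for each instance consumed, then re-check the worker/in-flight counts.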
            while True:
                try:
                    all_instances.append(qiterable.output_queue.get(block=False, timeout=1.0))
                    with qiterable.num_inflight_items.get_lock():
                        qiterable.num_inflight_items.value -= 1
                except Empty:
                    break
        qiterable.join()

        # 100 files * 4 sentences / file
        assert len(all_instances) == 100 * 4

        counts = Counter(fingerprint(instance) for instance in all_instances)

        # should have the exact same data 100 times
        assert len(counts) == 4
        assert counts[("cats", "are", "animals", ".", "N", "V", "N", "N")] == 100
        assert counts[("dogs", "are", "animals", ".", "N", "V", "N", "N")] == 100
        assert counts[("snakes", "are", "animals", ".", "N", "V", "N", "N")] == 100
        assert counts[("birds", "are", "animals", ".", "N", "V", "N", "N")] == 100

    def test_multiprocess_read_in_subprocess_is_deterministic(self):
        reader = MultiprocessDatasetReader(base_reader=self.base_reader, num_workers=1)
        q = Queue()

        def read():
            for instance in reader.read(self.distinct_files_glob):
                q.put(fingerprint(instance))

        # Ensure deterministic shuffling.
        np.random.seed(0)
        p = Process(target=read)
        p.start()
        p.join()

        # Convert queue to list.
        actual_fingerprints = []
        while not q.empty():
            actual_fingerprints.append(q.get(block=False))

        assert len(actual_fingerprints) == 100

        expected_fingerprints = []
        for instance in self.base_reader.read(self.all_distinct_path):
            expected_fingerprints.append(fingerprint(instance))

        np.random.seed(0)
        expected_fingerprints.sort()
        # This should be shuffled into exactly the same order as actual_fingerprints.
        np.random.shuffle(expected_fingerprints)

        assert actual_fingerprints == expected_fingerprints

    def test_multiple_epochs(self):
        reader = MultiprocessDatasetReader(
            base_reader=self.base_reader, num_workers=2, epochs_per_read=3
        )

        all_instances = []

        for instance in reader.read(self.identical_files_glob):
            all_instances.append(instance)

        # 100 files * 4 sentences per file * 3 epochs
        assert len(all_instances) == 100 * 4 * 3

        counts = Counter(fingerprint(instance) for instance in all_instances)

        # should have the exact same data 100 * 3 times
        assert len(counts) == 4
        assert counts[("cats", "are", "animals", ".", "N", "V", "N", "N")] == 300
        assert counts[("dogs", "are", "animals", ".", "N", "V", "N", "N")] == 300
        assert counts[("snakes", "are", "animals", ".", "N", "V", "N", "N")] == 300
        assert counts[("birds", "are", "animals", ".", "N", "V", "N", "N")] == 300

    def test_with_iterator(self):
        reader = MultiprocessDatasetReader(base_reader=self.base_reader, num_workers=2)
        instances = reader.read(self.identical_files_glob)

        iterator = BasicIterator(batch_size=32)
        iterator.index_with(self.vocab)

        batches = [batch for batch in iterator(instances, num_epochs=1)]

        # 400 instances / batch_size 32 = 12 full batches + 1 batch of 16
        sizes = sorted([len(batch["tags"]) for batch in batches])
        assert sizes == [16] + 12 * [32]
Example #15
class TestMultiprocessDatasetReader(AllenNlpTestCase):
    def setUp(self) -> None:
        super().setUp()

        # use SequenceTaggingDatasetReader as the base reader
        self.base_reader = SequenceTaggingDatasetReader(lazy=True)
        base_file_path = AllenNlpTestCase.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv'

        # Make 100 copies of the data
        raw_data = open(base_file_path).read()
        for i in range(100):
            file_path = self.TEST_DIR / f'sequence_tagging_{i}.tsv'
            with open(file_path, 'w') as f:
                f.write(raw_data)

        self.glob = str(self.TEST_DIR / 'sequence_tagging_*.tsv')

        # For some of the tests we need a vocab, we'll just use the base_reader for that.
        self.vocab = Vocabulary.from_instances(
            self.base_reader.read(str(base_file_path)))

    def test_multiprocess_read(self):
        reader = MultiprocessDatasetReader(base_reader=self.base_reader,
                                           num_workers=4)

        all_instances = []

        for instance in reader.read(self.glob):
            all_instances.append(instance)

        # 100 files * 4 sentences / file
        assert len(all_instances) == 100 * 4

        counts = Counter(fingerprint(instance) for instance in all_instances)

        # should have the exact same data 100 times
        assert len(counts) == 4
        assert counts[("cats", "are", "animals", ".", "N", "V", "N",
                       "N")] == 100
        assert counts[("dogs", "are", "animals", ".", "N", "V", "N",
                       "N")] == 100
        assert counts[("snakes", "are", "animals", ".", "N", "V", "N",
                       "N")] == 100
        assert counts[("birds", "are", "animals", ".", "N", "V", "N",
                       "N")] == 100

    def test_multiple_epochs(self):
        reader = MultiprocessDatasetReader(base_reader=self.base_reader,
                                           num_workers=2,
                                           epochs_per_read=3)

        all_instances = []

        for instance in reader.read(self.glob):
            all_instances.append(instance)

        # 100 files * 4 sentences per file * 3 epochs
        assert len(all_instances) == 100 * 4 * 3

        counts = Counter(fingerprint(instance) for instance in all_instances)

        # should have the exact same data 100 * 3 times
        assert len(counts) == 4
        assert counts[("cats", "are", "animals", ".", "N", "V", "N",
                       "N")] == 300
        assert counts[("dogs", "are", "animals", ".", "N", "V", "N",
                       "N")] == 300
        assert counts[("snakes", "are", "animals", ".", "N", "V", "N",
                       "N")] == 300
        assert counts[("birds", "are", "animals", ".", "N", "V", "N",
                       "N")] == 300

    def test_with_iterator(self):
        reader = MultiprocessDatasetReader(base_reader=self.base_reader,
                                           num_workers=2)
        instances = reader.read(self.glob)

        iterator = BasicIterator(batch_size=32)
        iterator.index_with(self.vocab)

        batches = [batch for batch in iterator(instances, num_epochs=1)]

        # 400 instances / batch_size 32 = 12 full batches + 1 batch of 16
        sizes = sorted([len(batch['tags']) for batch in batches])
        assert sizes == [16] + 12 * [32]
class TestMultiprocessDatasetReader(AllenNlpTestCase):
    def setUp(self) -> None:
        super().setUp()

        # use SequenceTaggingDatasetReader as the base reader
        self.base_reader = SequenceTaggingDatasetReader(lazy=True)
        base_file_path = AllenNlpTestCase.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv'

        # Make 100 copies of the data
        raw_data = open(base_file_path).read()
        for i in range(100):
            file_path = self.TEST_DIR / f'sequence_tagging_{i}.tsv'
            with open(file_path, 'w') as f:
                f.write(raw_data)

        self.glob = str(self.TEST_DIR / 'sequence_tagging_*.tsv')

        # For some of the tests we need a vocab, we'll just use the base_reader for that.
        self.vocab = Vocabulary.from_instances(self.base_reader.read(str(base_file_path)))

    def test_multiprocess_read(self):
        reader = MultiprocessDatasetReader(base_reader=self.base_reader, num_workers=4)

        all_instances = []

        for instance in reader.read(self.glob):
            all_instances.append(instance)

        # 100 files * 4 sentences / file
        assert len(all_instances) == 100 * 4

        counts = Counter(fingerprint(instance) for instance in all_instances)

        # should have the exact same data 100 times
        assert len(counts) == 4
        assert counts[("cats", "are", "animals", ".", "N", "V", "N", "N")] == 100
        assert counts[("dogs", "are", "animals", ".", "N", "V", "N", "N")] == 100
        assert counts[("snakes", "are", "animals", ".", "N", "V", "N", "N")] == 100
        assert counts[("birds", "are", "animals", ".", "N", "V", "N", "N")] == 100

    def test_multiple_epochs(self):
        reader = MultiprocessDatasetReader(base_reader=self.base_reader,
                                           num_workers=2,
                                           epochs_per_read=3)

        all_instances = []

        for instance in reader.read(self.glob):
            all_instances.append(instance)

        # 100 files * 4 sentences per file * 3 epochs
        assert len(all_instances) == 100 * 4 * 3

        counts = Counter(fingerprint(instance) for instance in all_instances)

        # should have the exact same data 100 * 3 times
        assert len(counts) == 4
        assert counts[("cats", "are", "animals", ".", "N", "V", "N", "N")] == 300
        assert counts[("dogs", "are", "animals", ".", "N", "V", "N", "N")] == 300
        assert counts[("snakes", "are", "animals", ".", "N", "V", "N", "N")] == 300
        assert counts[("birds", "are", "animals", ".", "N", "V", "N", "N")] == 300

    def test_with_iterator(self):
        reader = MultiprocessDatasetReader(base_reader=self.base_reader, num_workers=2)
        instances = reader.read(self.glob)

        iterator = BasicIterator(batch_size=32)
        iterator.index_with(self.vocab)

        batches = [batch for batch in iterator(instances, num_epochs=1)]

        # 400 instances / batch_size 32 = 12 full batches + 1 batch of 16
        sizes = sorted([len(batch['tags']) for batch in batches])
        assert sizes == [16] + 12 * [32]