    def test_set_skip_indexing_true(self, lazy: bool, label_name: str):
        reader = TextSentimentReader(lazy=lazy,
                                     skip_label_indexing=True,
                                     label_name=label_name)
        ag_path = Path(DATA_DIR, "integer_labels_original.jsonl").resolve()
        if label_name == 'text_sentiment':
            ag_path = Path(DATA_DIR, "integer_labels.jsonl").resolve()
        instances = reader.read(ag_path)
        instances = ensure_list(instances)

        instance1 = {
            "tokens": ["This", "text", "has", "label", "0"],
            "label": 0
        }
        instance2 = {
            "tokens": ["This", "text", "has", "label", "1"],
            "label": 1
        }

        assert len(instances) == 2
        fields = instances[0].fields
        assert [t.text for t in fields["tokens"].tokens] == instance1["tokens"]
        assert fields["label"].label == instance1["label"]
        fields = instances[1].fields
        assert [t.text for t in fields["tokens"].tokens] == instance2["tokens"]
        assert fields["label"].label == instance2["label"]

        ag_path = Path(DATA_DIR, "imdb_corpus_original.jsonl").resolve()
        if label_name == 'text_sentiment':
            ag_path = Path(DATA_DIR, "imdb_corpus.jsonl").resolve()
        with pytest.raises(ValueError) as exec_info:
            ensure_list(reader.read(ag_path))
        assert str(
            exec_info.value
        ) == "Labels must be integers if skip_label_indexing is True."
    def test_spans_work_correctly(self):
        reader = SpanAeDatasetReader(max_span_width=1)
        instances = reader.read('tests/fixtures/parallel_copy.tsv')
        instances = ensure_list(instances)
        assert len(instances) == 3

        fields = instances[0].fields
        assert type(fields["source_spans"]) == ListField
        assert type(fields["source_spans"].field_list[0]) == SpanField

        assert len(fields["source_spans"].field_list) == len(
            fields["source_tokens"].tokens)

        reader = SpanAeDatasetReader(max_span_width=2)
        instances = reader.read('tests/fixtures/parallel_copy.tsv')
        instances = ensure_list(instances)
        fields = instances[1].fields
        assert len(fields["source_spans"].field_list
                   ) == len(fields["source_tokens"].tokens) * 2 - 1

        reader = SpanAeDatasetReader(max_span_width=3)
        instances = reader.read('tests/fixtures/parallel_copy.tsv')
        instances = ensure_list(instances)
        fields = instances[1].fields
        assert len(fields["source_spans"].field_list
                   ) == len(fields["source_tokens"].tokens) * 3 - 3
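
    # Illustrative note, not part of the original test: with max_span_width w over
    # n source tokens, each start index i contributes min(w, n - i) spans, so the
    # total is n*w - w*(w-1)//2 once n >= w, i.e. n for w=1, 2n - 1 for w=2, and
    # 3n - 3 for w=3, which matches the counts asserted above.
    @staticmethod
    def _expected_span_count(n, w):
        return sum(min(w, n - i) for i in range(n))
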
    def test_set_skip_indexing_true(self, lazy):
        reader = MultiLabelTextClassificationJsonReader(
            lazy=lazy, skip_label_indexing=True, num_labels=3)
        integer_label_path = Path(
            "tests/fixtures") / "data" / "integer_labels.jsonl"
        instances = reader.read(integer_label_path)
        instances = ensure_list(instances)

        instance1 = {
            "tokens": ["This", "text", "has", "labels", "0", "2"],
            "labels": [0, 2]
        }
        instance2 = {
            "tokens": ["This", "text", "has", "labels", "0", "1"],
            "labels": [0, 1]
        }

        assert len(instances) == 2
        fields = instances[0].fields
        assert [t.text for t in fields["tokens"].tokens] == instance1["tokens"]
        assert fields["labels"].labels == instance1["labels"]
        fields = instances[1].fields
        assert [t.text for t in fields["tokens"].tokens] == instance2["tokens"]
        assert fields["labels"].labels == instance2["labels"]

        with pytest.raises(ValueError) as exec_info:
            string_label_path = Path(
                "tests/fixtures") / "data" / "reuters-21578" / "train.jsonl"
            ensure_list(reader.read(string_label_path))
        assert str(
            exec_info.value
        ) == "Labels must be integers if skip_label_indexing is True."
    def test_max_sentences(self):
        conll_reader = ConllCorefReader(max_span_width=self.span_width)
        instances = ensure_list(
            conll_reader.read(str(FIXTURES_ROOT / "coref" / "coref.gold_conll"))
        )

        limited_conll_reader = ConllCorefReader(max_span_width=self.span_width, max_sentences=2)
        limited_instances = ensure_list(
            limited_conll_reader.read(str(FIXTURES_ROOT / "coref" / "coref.gold_conll"))
        )

        assert len(limited_instances) == len(instances) == 4

        tokens_of = lambda instance: instance.fields["text"].tokens
        text_of = lambda tokens: [token.text for token in tokens]
        docs = [tokens_of(instance) for instance in instances]
        limited_docs = [tokens_of(instance) for instance in limited_instances]

        # Short ones; not truncated
        assert limited_docs[1] == docs[1]
        assert limited_docs[3] == docs[3]

        # Truncation happened
        assert len(limited_docs[0]) < len(docs[0])
        assert len(limited_docs[2]) < len(docs[2])
        assert "Disney" in text_of(docs[0]) and "Disney" not in text_of(limited_docs[0])
        assert "tourism" in text_of(docs[2]) and "tourism" not in text_of(limited_docs[2])

        # Truncated tokens are the prefixes
        assert limited_docs[0] == docs[0][: len(limited_docs[0])]
        assert limited_docs[2] == docs[2][: len(limited_docs[2])]
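
    # Hedged sketch, an assumption rather than the reader's actual code: the
    # max_sentences behaviour checked above amounts to keeping only the first
    # max_sentences sentences of a document, so the truncated token sequence is a
    # prefix of the full one.
    @staticmethod
    def _truncate_document(sentences, max_sentences=None):
        kept = sentences if max_sentences is None else sentences[:max_sentences]
        return [token for sentence in kept for token in sentence]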
Example #5
    def test_entity_mask(self):
        # Check 'mask' mode has expected behavior
        reader = get_reader()
        reader.entity_masking = 'mask'
        instances = ensure_list(
            reader.read('tests/fixtures/tacred/LDC2018T24.json'))

        tokens_0 = [x.text for x in instances[0]['tokens']]
        subj_tokens_0 = tokens_0[14]
        self.assertEqual(subj_tokens_0, '[MASK]')

        tokens_0 = [x.text for x in instances[0]['tokens']]
        obj_tokens_0 = tokens_0[17]
        self.assertEqual(obj_tokens_0, '[MASK]')

        # Check 'type/role' mode has expected behavior
        reader.entity_masking = 'type/role'
        instances = ensure_list(
            reader.read('tests/fixtures/tacred/LDC2018T24.json'))

        tokens_0 = [x.text for x in instances[0]['tokens']]
        subj_tokens_0 = tokens_0[14]
        self.assertEqual(subj_tokens_0, '[s-person]')

        tokens_0 = [x.text for x in instances[0]['tokens']]
        obj_tokens_0 = tokens_0[17]
        self.assertEqual(obj_tokens_0, '[o-title]')
    def test_set_skip_indexing_true(self, lazy):
        reader = TextClassificationJsonReader(lazy=lazy, skip_label_indexing=True)
        ag_path = (
            AllenNlpTestCase.FIXTURES_ROOT
            / "data"
            / "text_classification_json"
            / "integer_labels.jsonl"
        )
        instances = reader.read(ag_path)
        instances = ensure_list(instances)

        instance1 = {"tokens": ["This", "text", "has", "label", "0"], "label": 0}
        instance2 = {"tokens": ["This", "text", "has", "label", "1"], "label": 1}

        assert len(instances) == 2
        fields = instances[0].fields
        assert [t.text for t in fields["tokens"].tokens] == instance1["tokens"]
        assert fields["label"].label == instance1["label"]
        fields = instances[1].fields
        assert [t.text for t in fields["tokens"].tokens] == instance2["tokens"]
        assert fields["label"].label == instance2["label"]

        with pytest.raises(ValueError) as exec_info:
            ag_path = (
                AllenNlpTestCase.FIXTURES_ROOT
                / "data"
                / "text_classification_json"
                / "imdb_corpus.jsonl"
            )
            ensure_list(reader.read(ag_path))
        assert str(exec_info.value) == "Labels must be integers if skip_label_indexing is True."
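
    # Hedged illustration, not the verbatim fixture: integer_labels.jsonl is
    # assumed to hold one JSON object per line with an integer "label" field, so a
    # file written like this would satisfy the assertions above:
    #
    #     import json
    #     with open("integer_labels.jsonl", "w") as fixture:
    #         for label in (0, 1):
    #             row = {"text": f"This text has label {label}", "label": label}
    #             fixture.write(json.dumps(row) + "\n")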
Example #7
    def test_read_from_file(self):
        reader = AclarcDatasetReader()
        instances = ensure_list(
            reader.read('tests/fixtures/aclarc-train.jsonl'))
        instance1 = {
            "citation_text": ['Typical', 'examples', 'are', 'Bulgarian']
        }
        assert len(instances) == 10
        fields = instances[0].fields
        assert isinstance(instances, list)
        assert [t.text for t in fields['citation_text'].tokens
                ][:4] == instance1['citation_text']

        reader = AclSectionTitleDatasetReader()
        instances = ensure_list(
            reader.read('tests/fixtures/aclarc-section-title.jsonl'))
        instance1 = {
            "section_name": 'related work',
            "citation_text": ['With', 'C99']
        }
        assert len(instances) == 10
        fields = instances[1].fields
        assert isinstance(instances, list)
        assert [t.text for t in fields['citation_text'].tokens
                ][:2] == instance1['citation_text']
        assert fields['section_label'].label == instance1['section_name']

        reader = AclCiteWorthinessDatasetReader()
        instances = ensure_list(
            reader.read('tests/fixtures/aclarc-cite-worthiness.jsonl'))
        instance1 = {"is_citation": 'False'}
        fields = instances[1].fields
        assert isinstance(instances, list)
        assert fields['is_citation'].label == instance1['is_citation']
Example #8
    def test_length_limit_works(self):
        # We're making sure the length of the text is correct if a length limit is provided.
        reader = SquadReader(passage_length_limit=30,
                             question_length_limit=10,
                             skip_invalid_examples=True)
        instances = ensure_list(
            reader.read(FIXTURES_ROOT / "rc" / "squad.json"))
        assert len(instances[0].fields["question"].tokens) == 10
        assert len(instances[0].fields["passage"].tokens) == 30
        # invalid examples where all the answers exceed the passage length should be skipped.
        assert len(instances) == 3

        # Length limit still works if we do not skip the invalid examples
        reader = SquadReader(passage_length_limit=30,
                             question_length_limit=10,
                             skip_invalid_examples=False)
        instances = ensure_list(
            reader.read(FIXTURES_ROOT / "rc" / "squad.json"))
        assert len(instances[0].fields["question"].tokens) == 10
        assert len(instances[0].fields["passage"].tokens) == 30
        # invalid examples should not be skipped.
        assert len(instances) == 5

        # Make sure the answer texts do not change, so that the evaluation will not be affected
        reader_unlimited = SquadReader(passage_length_limit=30,
                                       question_length_limit=10,
                                       skip_invalid_examples=False)
        instances_unlimited = ensure_list(
            reader_unlimited.read(FIXTURES_ROOT / "rc" / "squad.json"))
        for instance_x, instance_y in zip(instances, instances_unlimited):
            print(instance_x.fields["metadata"]["answer_texts"])
            assert set(instance_x.fields["metadata"]["answer_texts"]) == set(
                instance_y.fields["metadata"]["answer_texts"])
Example #9
    def test_read_from_file(self, lazy):
        reader = SquadReader(lazy=lazy)
        instances = ensure_list(reader.read(AllenNlpTestCase.FIXTURES_ROOT / 'data' / 'squad.json'))
        assert len(instances) == 5

        assert [t.text for t in instances[0].fields["question"].tokens[:3]] == ["To", "whom", "did"]
        assert [t.text for t in instances[0].fields["passage"].tokens[:3]] == ["Architecturally", ",", "the"]
        assert [t.text for t in instances[0].fields["passage"].tokens[-3:]] == ["of", "Mary", "."]
        assert instances[0].fields["span_start"].sequence_index == 102
        assert instances[0].fields["span_end"].sequence_index == 104

        assert [t.text for t in instances[1].fields["question"].tokens[:3]] == ["What", "sits", "on"]
        assert [t.text for t in instances[1].fields["passage"].tokens[:3]] == ["Architecturally", ",", "the"]
        assert [t.text for t in instances[1].fields["passage"].tokens[-3:]] == ["of", "Mary", "."]
        assert instances[1].fields["span_start"].sequence_index == 17
        assert instances[1].fields["span_end"].sequence_index == 23

        # We're checking this case because I changed the answer text to only have a partial
        # annotation for the last token, which happens occasionally in the training data.  We're
        # making sure we get a reasonable output in that case here.
        assert ([t.text for t in instances[3].fields["question"].tokens[:3]] ==
                ["Which", "individual", "worked"])
        assert [t.text for t in instances[3].fields["passage"].tokens[:3]] == ["In", "1882", ","]
        assert [t.text for t in instances[3].fields["passage"].tokens[-3:]] == ["Nuclear", "Astrophysics", "."]
        span_start = instances[3].fields["span_start"].sequence_index
        span_end = instances[3].fields["span_end"].sequence_index
        answer_tokens = instances[3].fields["passage"].tokens[span_start:(span_end + 1)]
        expected_answer_tokens = ["Father", "Julius", "Nieuwland"]
        assert [t.text for t in answer_tokens] == expected_answer_tokens
Example #10
    def setUp(self):
        super(TestCopyNetReader, self).setUp()
        params = Params.from_file(self.FIXTURES_ROOT / "encoder_decoder" / "copynet_seq2seq" / "experiment.json")
        self.reader = DatasetReader.from_params(params["dataset_reader"])
        instances = self.reader.read(self.FIXTURES_ROOT / "data" / "copynet" / "copyover.tsv")
        self.instances = ensure_list(instances)
        self.vocab = Vocabulary.from_params(params=params["vocabulary"], instances=instances)
Example #11
    def test_read(self, lazy):
        params = Params({'lazy': lazy, 'num_context_answers': 2,})
        reader = QuACReader.from_params(params)
        instances = reader.read(str(AllenNlpTestCase.FIXTURES_ROOT / 'data' / 'quac_sample.json'))
        instances = ensure_list(instances)

        assert instances[0].fields["question"].sequence_length() == 6
        assert instances[0].fields["yesno_list"].sequence_length() == 6
        assert [t.text for t in instances[0].fields["question"].field_list[0].tokens[:3]] == ["What", "was", "the"]

        assert len(instances) == 2
        passage_length = len(instances[0].fields["passage"].tokens)

        assert [t.text for t in instances[0].fields["passage"].tokens[:3]] == ["DJ", "Kool", "Herc"]
        assert [x.label for x in instances[0].fields["yesno_list"].field_list] == ["x", "x", "y", "x", "x", "x"]
        assert [x.label for x in instances[0].fields["followup_list"].field_list] == ["y", "m", "m", "n", "m", "y"]
        assert instances[0].fields["p1_answer_marker"].field_list[0].labels == ["O"] * passage_length

        # Check the previous answer marking here
        prev_1_list = ["O"] * passage_length
        prev_2_list = ["O"] * passage_length
        q0_span_start = instances[0].fields['span_start'].field_list[0].sequence_index
        q0_span_end = instances[0].fields['span_end'].field_list[0].sequence_index
        prev_1_list[q0_span_start] = "<{0:d}_{1:s}>".format(1, "start")
        prev_1_list[q0_span_end] = "<{0:d}_{1:s}>".format(1, "end")
        prev_2_list[q0_span_start] = "<{0:d}_{1:s}>".format(2, "start")
        prev_2_list[q0_span_end] = "<{0:d}_{1:s}>".format(2, "end")
        for passage_index in range(q0_span_start + 1, q0_span_end):
            prev_1_list[passage_index] = "<{0:d}_{1:s}>".format(1, "in")
            prev_2_list[passage_index] = "<{0:d}_{1:s}>".format(2, "in")

        assert instances[0].fields["p1_answer_marker"].field_list[1].labels == prev_1_list
        assert instances[0].fields["p2_answer_marker"].field_list[2].labels == prev_2_list
    def test_read_from_file(self):
        reader = SemanticDependenciesDatasetReader()
        instances = reader.read(AllenNlpTestCase.FIXTURES_ROOT / 'data' / 'dm.sdp')
        instances = ensure_list(instances)

        instance = instances[0]
        arcs = instance.fields["arc_tags"]
        tokens = [x.text for x in instance.fields["tokens"].tokens]
        assert tokens == ['Pierre', 'Vinken', ',', '61', 'years', 'old', ',',
                          'will', 'join', 'the', 'board', 'as', 'a',
                          'nonexecutive', 'director', 'Nov.', '29', '.']
        assert arcs.indices == [(1, 0), (1, 5), (1, 8), (4, 3), (5, 4),
                                (8, 11), (8, 16), (10, 8), (10, 9),
                                (14, 11), (14, 12), (14, 13), (16, 15)]
        assert arcs.labels == ['compound', 'ARG1', 'ARG1', 'ARG1', 'measure',
                               'ARG1', 'loc', 'ARG2', 'BV', 'ARG2', 'BV', 'ARG1', 'of']

        instance = instances[1]
        arcs = instance.fields["arc_tags"]
        tokens = [x.text for x in instance.fields["tokens"].tokens]
        assert tokens == ['Mr.', 'Vinken', 'is', 'chairman', 'of', 'Elsevier',
                          'N.V.', ',', 'the', 'Dutch', 'publishing', 'group', '.']
        assert arcs.indices == [(1, 0), (1, 2), (3, 2), (3, 4), (5, 4), (5, 6),
                                (5, 11), (11, 8), (11, 9), (11, 10)]
        assert arcs.labels == ['compound', 'ARG1', 'ARG2', 'ARG1', 'ARG2', 'compound',
                               'appos', 'BV', 'ARG1', 'compound']
Example #13
    def test_read_from_file(self, lazy):
        reader = MRPCReader(tokenizer=WordTokenizer(word_splitter=JustSpacesWordSplitter()),
                            token_indexers={"bert":
                                            PretrainedBertIndexer(pretrained_model=self.BERT_VOCAB_PATH)},
                            lazy=lazy,
                            skip_label_indexing=False,
                            mode='merge')
        instances = reader.read(
            str(self.FIXTURES_ROOT / 'mrpc_dev.tsv'))
        instances = ensure_list(instances)

        instance1 = {"tokens": "He said the foodservice pie business doesn 't fit the company 's long-term growth strategy .".split() + ["[SEP]"] +
                     "\" The foodservice pie business does not fit our long-term growth strategy .".split(),
                     "label": '1'}

        instance2 = {"tokens": "Magnarelli said Racicot hated the Iraqi regime and looked forward to using his long years of training in the war .".split() + ["[SEP]"] +
                     "His wife said he was \" 100 percent behind George Bush \" and looked forward to using his years of training in the war .".split(),
                     "label": '0'}

        instance3 = {"tokens": "The dollar was at 116.92 yen against the yen , flat on the session , and at 1.2891 against the Swiss franc , also flat .".split() + ["[SEP]"] +
                     "The dollar was at 116.78 yen JPY = , virtually flat on the session , and at 1.2871 against the Swiss franc CHF = , down 0.1 percent .".split(),
                     "label": '0'}

        for instance, expected_instance in zip(instances, [instance1, instance2, instance3]):
            fields = instance.fields
            assert [
                t.text for t in fields["tokens"].tokens] == expected_instance["tokens"]
            assert fields["label"].label == expected_instance["label"]
    def test_read_from_file(self, lazy):
        reader = SnliReader(lazy=lazy)
        instances = reader.read('tests/fixtures/data/snli.jsonl')
        instances = ensure_list(instances)

        instance1 = {"premise": ["A", "person", "on", "a", "horse", "jumps", "over", "a", "broken",
                                 "down", "airplane", "."],
                     "hypothesis": ["A", "person", "is", "training", "his", "horse", "for", "a",
                                    "competition", "."],
                     "label": "neutral"}

        instance2 = {"premise": ["A", "person", "on", "a", "horse", "jumps", "over", "a", "broken",
                                 "down", "airplane", "."],
                     "hypothesis": ["A", "person", "is", "at", "a", "diner", ",", "ordering", "an",
                                    "omelette", "."],
                     "label": "contradiction"}
        instance3 = {"premise": ["A", "person", "on", "a", "horse", "jumps", "over", "a", "broken",
                                 "down", "airplane", "."],
                     "hypothesis": ["A", "person", "is", "outdoors", ",", "on", "a", "horse", "."],
                     "label": "entailment"}

        assert len(instances) == 3
        fields = instances[0].fields
        assert [t.text for t in fields["premise"].tokens] == instance1["premise"]
        assert [t.text for t in fields["hypothesis"].tokens] == instance1["hypothesis"]
        assert fields["label"].label == instance1["label"]
        fields = instances[1].fields
        assert [t.text for t in fields["premise"].tokens] == instance2["premise"]
        assert [t.text for t in fields["hypothesis"].tokens] == instance2["hypothesis"]
        assert fields["label"].label == instance2["label"]
        fields = instances[2].fields
        assert [t.text for t in fields["premise"].tokens] == instance3["premise"]
        assert [t.text for t in fields["hypothesis"].tokens] == instance3["hypothesis"]
        assert fields["label"].label == instance3["label"]
    def test_srl_reader_can_filter_by_domain(self):

        conll_reader = SrlReader(domain_identifier="subdomain2")
        instances = conll_reader.read('tests/fixtures/conll_2012/')
        instances = ensure_list(instances)
        # If we'd included the folder, we'd have 9 instances.
        assert len(instances) == 2
Example #16
    def test_read_samples(self):
        """Tests parsing the samples file"""
        reader = DeftJsonlReader(subtasks=[1, 2, 3],
                                 read_spacy_pos_tags=False,
                                 read_spacy_dep_rels=False)
        instances = ensure_list(
            reader._read('tests/fixtures/jsonl_format_samples.jsonl'))
        assert len(instances) == 5

        expected_fields = [
            "metadata", "tokens", "sentence_labels", "tags",
            "relation_root_idxs", "relations"
        ]
        for instance in instances:
            assert list(instance.fields.keys()) == expected_fields

        expected_tokens = [
            "3616", ".", "Some", "of", "these", "are", "binocular", "cues",
            ",", "which", "means", "that", "they", "rely", "on", "the", "use",
            "of", "both", "eyes", ".", "One", "example", "of", "a",
            "binocular", "depth", "cue", "is", "binocular", "disparity", ",",
            "the", "slightly", "different", "view", "of", "the", "world",
            "that", "each", "of", "our", "eyes", "receives", ".", "To",
            "experience", "this", "slightly", "different", "view", ",", "do",
            "this", "simple", "exercise", ":", "extend", "your", "arm",
            "fully", "and", "extend", "one", "of", "your", "fingers", "and",
            "focus", "on", "that", "finger", "."
        ]
        metadata_field = instances[0].fields.get('metadata')
        assert metadata_field['words'] == expected_tokens
        instance_tokens = [t.text for t in instances[0].fields.get("tokens")]
        assert instance_tokens == expected_tokens
Example #17
    def test_ultra_fine_reader(self):
        reader = get_reader("entity")
        instances = ensure_list(
            reader.read('tests/fixtures/evaluation/ultra_fine/train.json'))

        # Check number of instances is correct
        self.assertEqual(len(instances), 2)

        # Check that first instance's tokens are correct
        tokens_0 = [x.text for x in instances[0]['tokens']]
        segments_0 = list(instances[0]['segment_ids'].array)
        actual = list(zip(tokens_0, segments_0))
        expected = [('[CLS]', 0), ('the', 0), ('british', 0),
                    ('information', 0), ('commissioner', 0), ("'s", 0),
                    ('office', 0), ('invites', 0), ('[unused0]', 0), ('to', 0),
                    ('locate', 0), ('its', 0), ('add', 0), ('##ress', 0),
                    ('using', 0), ('google', 0), ('[UNK]', 0), ('.', 0),
                    ('[SEP]', 0), ('web', 1), ('users', 1), ('[SEP]', 1)]
        self.assertListEqual(actual, expected)

        iterator = DataIterator.from_params(Params({"type": "basic"}))
        iterator.index_with(Vocabulary())

        for batch in iterator(instances, num_epochs=1, shuffle=False):
            break

        expected_labels = [[0, 0, 0, 0, 0, 0, 1, 0, 0],
                           [1, 0, 0, 0, 0, 0, 0, 0, 0]]
        self.assertEqual(batch['label_ids'].numpy().tolist(), expected_labels)
    def _memory_sized_lists(self, instances: Iterable[Instance]) -> Iterable[List[Instance]]:
        """
        Breaks the dataset into "memory-sized" lists of instances,
        which it yields up one at a time until it gets through a full epoch.

        For example, if the dataset is already an in-memory list, and each epoch
        represents one pass through the dataset, it just yields back the dataset.
        Whereas if the dataset is lazily read from disk and we've specified to
        load 1000 instances at a time, then it yields lists of 1000 instances each.
        """
        lazy = is_lazy(instances)

        # Get an iterator over the next epoch worth of instances.
        iterator = self._take_instances(instances, self._instances_per_epoch)

        # We have four different cases to deal with:

        # With lazy instances and no guidance about how many to load into memory,
        # we just load ``batch_size`` instances at a time:
        if lazy and self._max_instances_in_memory is None:
            yield from lazy_groups_of(iterator, self._batch_size)
        # If we specified max instances in memory, lazy or not, we just
        # load ``max_instances_in_memory`` instances at a time:
        elif self._max_instances_in_memory is not None:
            yield from lazy_groups_of(iterator, self._max_instances_in_memory)
        # If we have non-lazy instances, and we want all instances each epoch,
        # then we just yield back the list of instances:
        elif self._instances_per_epoch is None:
            yield ensure_list(instances)
        # In the final case we have non-lazy instances, we want a specific number
        # of instances each epoch, and we didn't specify how many instances to load
        # into memory. So we convert the whole iterator to a list:
        else:
            yield list(iterator)
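
    # Rough standalone sketch, an illustration rather than AllenNLP's
    # lazy_groups_of: consume an iterator and yield lists of at most group_size
    # instances, so a lazily-read dataset never has to be materialized all at once.
    @staticmethod
    def _groups_of(iterator, group_size):
        from itertools import islice
        iterator = iter(iterator)
        while True:
            group = list(islice(iterator, group_size))
            if not group:
                return
            yield group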
Example #19
    def test_read_from_file(self, lazy, coding_scheme):
        conll_reader = Conll2003DatasetReader(lazy=lazy,
                                              coding_scheme=coding_scheme)
        instances = conll_reader.read(
            str(AllenNlpTestCase.FIXTURES_ROOT / 'data' / 'conll2003.txt'))
        instances = ensure_list(instances)

        if coding_scheme == 'IOB1':
            expected_labels = ['I-ORG', 'O', 'I-PER', 'O', 'O', 'I-LOC', 'O']
        else:
            expected_labels = ['U-ORG', 'O', 'U-PER', 'O', 'O', 'U-LOC', 'O']

        fields = instances[0].fields
        tokens = [t.text for t in fields['tokens'].tokens]
        assert tokens == [
            'U.N.', 'official', 'Ekeus', 'heads', 'for', 'Baghdad', '.'
        ]
        assert fields["tags"].labels == expected_labels

        fields = instances[1].fields
        tokens = [t.text for t in fields['tokens'].tokens]
        assert tokens == [
            'AI2', 'engineer', 'Joel', 'lives', 'in', 'Seattle', '.'
        ]
        assert fields["tags"].labels == expected_labels
Example #20
    def setUp(self):
        super().setUp()
        params = Params.from_file(self.FIXTURES_ROOT / "encoder_decoder" / "copynet_seq2seq" / "experiment.json")
        self.reader = DatasetReader.from_params(params["dataset_reader"])
        instances = self.reader.read(self.FIXTURES_ROOT / "data" / "copynet" / "copyover.tsv")
        self.instances = ensure_list(instances)
        self.vocab = Vocabulary.from_params(params=params["vocabulary"], instances=instances)
Example #21
    def test_length_limit_works(self):
        max_query_length = 10
        stride = 20

        reader = TransformerSquadReader(
            length_limit=100,
            max_query_length=max_query_length,
            stride=stride,
            skip_invalid_examples=False,
        )
        instances = ensure_list(
            reader.read(FIXTURES_ROOT / "rc" / "squad.json"))

        assert len(instances) == 12
        # The sequence is "<s> question </s> </s> context".
        assert instances[0].fields["context_span"].span_start == len(
            reader._tokenizer.sequence_pair_start_tokens
        ) + max_query_length + len(reader._tokenizer.sequence_pair_mid_tokens)

        instance_0_text = [
            t.text for t in instances[0].fields["question_with_context"].tokens
        ]
        instance_1_text = [
            t.text for t in instances[1].fields["question_with_context"].tokens
        ]
        assert instance_0_text[:max_query_length +
                               2] == instance_1_text[:max_query_length + 2]
        assert instance_0_text[max_query_length +
                               3] != instance_1_text[max_query_length + 3]
        assert instance_0_text[-1] == "[SEP]"
        assert instance_0_text[-2] == "##rot"
        assert (
            instance_1_text[instances[1].fields["context_span"].span_start +
                            stride - 1] == "##rot")
Example #22
    def test_read_from_file_ag_news_corpus(self, lazy):
        reader = TextClassificationJsonReader(lazy=lazy)
        ag_path = AllenNlpTestCase.FIXTURES_ROOT / "data" / "text_classification_json" / "ag_news_corpus.jsonl"
        instances = reader.read(ag_path)
        instances = ensure_list(instances)

        instance1 = {"tokens": ['Memphis', 'Rout', 'Still', 'Stings', 'for', 'No', '.', '14',
                                'Louisville', ';', 'Coach', 'Petrino', 'Vows', 'to', 'Have',
                                'Team', 'Better', 'Prepared', '.', 'NASHVILLE', ',', 'Tenn.',
                                'Nov', '3', ',', '2004', '-', 'Louisville', '#', '39;s', '30-point',
                                'loss', 'at', 'home', 'to', 'Memphis', 'last', 'season', 'is', 'still',
                                'a', 'painful', 'memory', 'for', 'the', 'Cardinals', '.'],
                     "label": "2"}
        instance2 = {"tokens": ['AP', '-', 'Eli', 'Manning', 'has', 'replaced', 'Kurt', 'Warner',
                                'as', 'the', 'New', 'York', 'Giants', "'", 'starting',
                                'quarterback', '.'],
                     "label": "2"}
        instance3 = {"tokens": ['A', 'conference', 'dedicated', 'to', 'online', 'journalism',
                                'explores', 'the', 'effect', 'blogs', 'have', 'on', 'news',
                                'reporting', '.', 'Some', 'say', 'they', 'draw', 'attention',
                                'to', 'under', '-', 'reported', 'stories', '.', 'Others',
                                'struggle', 'to', 'establish', 'the', 'credibility',
                                'enjoyed', 'by', 'professionals', '.'],
                     "label": "4"}

        assert len(instances) == 3
        fields = instances[0].fields
        assert [t.text for t in fields["tokens"].tokens] == instance1["tokens"]
        assert fields["label"].label == instance1["label"]
        fields = instances[1].fields
        assert [t.text for t in fields["tokens"].tokens] == instance2["tokens"]
        assert fields["label"].label == instance2["label"]
        fields = instances[2].fields
        assert [t.text for t in fields["tokens"].tokens] == instance3["tokens"]
        assert fields["label"].label == instance3["label"]
    def test_read(self, lazy):
        params = Params({
                'base_tarball_path': 'tests/fixtures/data/triviaqa-sample.tgz',
                'lazy': lazy
                })
        reader = TriviaQaReader.from_params(params)
        instances = reader.read('web-train.json')
        instances = ensure_list(instances)
        assert len(instances) == 3

        assert [t.text for t in instances[0].fields["question"].tokens[:3]] == ["Which", "American", "-"]
        assert [t.text for t in instances[0].fields["passage"].tokens[:3]] == ["The", "Nobel", "Prize"]
        url = "http://www.nobelprize.org/nobel_prizes/literature/laureates/1930/"
        assert [t.text for t in instances[0].fields["passage"].tokens[-3:]] == ["<", url, ">"]
        assert instances[0].fields["span_start"].sequence_index == 12
        assert instances[0].fields["span_end"].sequence_index == 13

        assert [t.text for t in instances[1].fields["question"].tokens[:3]] == ["Which", "American", "-"]
        assert [t.text for t in instances[1].fields["passage"].tokens[:3]] == ["Why", "Do", "n’t"]
        assert [t.text for t in instances[1].fields["passage"].tokens[-3:]] == ["adults", ",", "and"]
        assert instances[1].fields["span_start"].sequence_index == 38
        assert instances[1].fields["span_end"].sequence_index == 39

        assert [t.text for t in instances[2].fields["question"].tokens[:3]] == ["Where", "in", "England"]
        assert [t.text for t in instances[2].fields["passage"].tokens[:3]] == ["Judi", "Dench", "-"]
        assert [t.text for t in instances[2].fields["passage"].tokens[-3:]] == [")", "(", "special"]
        assert instances[2].fields["span_start"].sequence_index == 16
        assert instances[2].fields["span_end"].sequence_index == 16
Example #24
    def test_read_from_file(self, lazy):
        reader = QuoraParaphraseDatasetReader(lazy=lazy)
        instances = reader.read(AllenNlpTestCase.FIXTURES_ROOT / "data" /
                                "quora_paraphrase.tsv")
        instances = ensure_list(instances)

        instance1 = {
            "premise": "What should I do to avoid sleeping in class ?".split(),
            "hypothesis": "How do I not sleep in a boring class ?".split(),
            "label": "1",
        }

        instance2 = {
            "premise":
            "Do women support each other more than men do ?".split(),
            "hypothesis": "Do women need more compliments than men ?".split(),
            "label": "0",
        }

        instance3 = {
            "premise": "How can one root android devices ?".split(),
            "hypothesis": "How do I root an Android device ?".split(),
            "label": "1",
        }

        assert len(instances) == 3

        for instance, expected_instance in zip(
                instances, [instance1, instance2, instance3]):
            fields = instance.fields
            assert [t.text for t in fields["premise"].tokens
                    ] == expected_instance["premise"]
            assert [t.text for t in fields["hypothesis"].tokens
                    ] == expected_instance["hypothesis"]
            assert fields["label"].label == expected_instance["label"]
Example #25
    def test_srl_reader_can_filter_by_domain(self):

        conll_reader = SrlReader(domain_identifier="subdomain2")
        instances = conll_reader.read(AllenNlpTestCase.FIXTURES_ROOT / 'conll_2012')
        instances = ensure_list(instances)
        # If we'd included the folder, we'd have 9 instances.
        assert len(instances) == 2
Example #26
    def test_unpruned_adjacency_matrix(self):
        MAX_LEN = 100
        reader = TacredDatasetReader(max_len=MAX_LEN,
                                     masking_mode="NER",
                                     dep_pruning=-1)
        instances = ensure_list(reader.read("tests/fixtures/tacred.json"))

        expected_edges = [(0, 0), (0, 3), (1, 1), (1, 3), (2, 2), (2, 3),
                          (3, 0), (3, 1), (3, 2), (3, 3), (3, 11), (4, 4),
                          (4, 11), (5, 5), (5, 9), (6, 6), (6, 9), (7, 7),
                          (7, 9), (8, 8), (8, 9), (9, 5), (9, 6), (9, 7),
                          (9, 8), (9, 9), (9, 11), (10, 10), (10, 11), (11, 3),
                          (11, 4), (11, 9), (11, 10), (11, 11), (11, 12),
                          (11, 13), (11, 14), (11, 25), (12, 11), (12, 12),
                          (13, 11), (13, 13), (14, 11), (14, 14), (14, 16),
                          (15, 15), (15, 16), (16, 14), (16, 15), (16, 16),
                          (16, 19), (17, 17), (17, 19), (18, 18), (18, 19),
                          (19, 16), (19, 17), (19, 18), (19, 19), (19, 21),
                          (20, 20), (20, 21), (21, 19), (21, 20), (21, 21),
                          (21, 24), (22, 22), (22, 24), (23, 23), (23, 24),
                          (24, 21), (24, 22), (24, 23), (24, 24), (25, 11),
                          (25, 25)]
        adjacency = instances[0].fields["adjacency"]

        assert sorted(adjacency.indices) == expected_edges
Example #27
    def test_boolq_dataset_reader_default_setting(self):
        reader = BoolQDatasetReader()
        instances = reader.read(self.boolq_path)
        instances = ensure_list(instances)

        assert len(instances) == 5

        fields = instances[0].fields
        assert [t.text for t in fields["tokens"].tokens][:5] == [
            "Persian",
            "language",
            "--",
            "Persian",
            "(/ˈpɜːrʒən,",
        ]
        assert fields["label"].label == 1

        fields = instances[1].fields
        assert [t.text for t in fields["tokens"].tokens][:5] == [
            "Epsom",
            "railway",
            "station",
            "--",
            "Epsom",
        ]
        assert fields["label"].label == 0
Example #28
    def _memory_sized_lists(
            self, instances: Iterable[Instance]) -> Iterable[List[Instance]]:
        """
        Breaks the dataset into "memory-sized" lists of instances,
        which it yields up one at a time until it gets through a full epoch.

        For example, if the dataset is already an in-memory list, and each epoch
        represents one pass through the dataset, it just yields back the dataset.
        Whereas if the dataset is lazily read from disk and we've specified to
        load 1000 instances at a time, then it yields lists of 1000 instances each.
        """
        lazy = is_lazy(instances)

        # Get an iterator over the next epoch worth of instances.
        iterator = self._take_instances(instances, self._instances_per_epoch)

        # We have four different cases to deal with:

        # With lazy instances and no guidance about how many to load into memory,
        # we just load ``batch_size`` instances at a time:
        if lazy and self._max_instances_in_memory is None:
            yield from lazy_groups_of(iterator, self._batch_size)
        # If we specified max instances in memory, lazy or not, we just
        # load ``max_instances_in_memory`` instances at a time:
        elif self._max_instances_in_memory is not None:
            yield from lazy_groups_of(iterator, self._max_instances_in_memory)
        # If we have non-lazy instances, and we want all instances each epoch,
        # then we just yield back the list of instances:
        elif self._instances_per_epoch is None:
            yield ensure_list(instances)
        # In the final case we have non-lazy instances, we want a specific number
        # of instances each epoch, and we didn't specify how many instances to load
        # into memory. So we convert the whole iterator to a list:
        else:
            yield list(iterator)
Example #29
    def test_kg_probe_reader(self):
        reader = get_reader()
        instances = ensure_list(reader.read('tests/fixtures/kg_probe/file1.txt'))

        # Check instances are correct length
        self.assertEqual(len(instances), 2)

        # Check masking is performed properly
        expected_tokens_0 = ['[CLS]', '[MASK]', '[MASK]', '[UNK]', 'quick',
                             '##est', '.', '[SEP]']
        tokens_0 = [x.text for x in instances[0]['tokens'].tokens]
        self.assertListEqual(expected_tokens_0, tokens_0)

        expected_mask_indicator_0 = np.array([0,1,1,0,0,0,0,0], dtype=np.uint8)
        mask_indicator_0 = instances[0]['mask_indicator'].array
        assert np.allclose(expected_mask_indicator_0, mask_indicator_0)

        expected_tokens_1 = ['[CLS]', 'the', 'brown', 'fox', 'jumped', 'over',
                             'the', '[MASK]', '[MASK]', '[MASK]', '[MASK]',
                             '.', '[SEP]']
        tokens_1 = [x.text for x in instances[1]['tokens'].tokens]
        self.assertListEqual(expected_tokens_1, tokens_1)

        expected_mask_indicator_1 = np.array([0,0,0,0,0,0,0,1,1,1,1,0,0], dtype=np.uint8)
        mask_indicator_1 = instances[1]['mask_indicator'].array
        assert np.allclose(expected_mask_indicator_1, mask_indicator_1)
Example #30
    def test_read_from_file(self):

        reader = CcgBankDatasetReader(feature_labels=['modified_pos', 'original_pos', 'predicate_arg'])
        instances = ensure_list(reader.read(self.FIXTURES_ROOT / 'data' / 'ccgbank.txt'))

        assert len(instances) == 2

        instance = instances[0]
        fields = instance.fields
        tokens = [token.text for token in fields['tokens'].tokens]
        assert tokens == ['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board',
                          'as', 'a', 'nonexecutive', 'director', 'Nov.', '29', '.']

        ccg_categories = fields['tags'].labels
        assert ccg_categories == ['N/N', 'N', ',', 'N/N', 'N', '(S[adj]\\NP)\\NP', ',', '(S[dcl]\\NP)/(S[b]\\NP)',
                                  '(S[b]\\NP)/NP', 'NP[nb]/N', 'N', '((S\\NP)\\(S\\NP))/NP', 'NP[nb]/N', 'N/N',
                                  'N', '((S\\NP)\\(S\\NP))/N[num]', 'N[num]', '.']

        original_pos_tags = fields['original_pos_tags'].labels
        assert original_pos_tags == ['NNP', 'NNP', ',', 'CD', 'NNS', 'JJ', ',', 'MD', 'VB', 'DT', 'NN',
                                     'IN', 'DT', 'JJ', 'NN', 'NNP', 'CD', '.']

        modified_pos_tags = fields['modified_pos_tags'].labels
        assert modified_pos_tags == ['NNP', 'NNP', ',', 'CD', 'NNS', 'JJ', ',', 'MD', 'VB', 'DT', 'NN',
                                     'IN', 'DT', 'JJ', 'NN', 'NNP', 'CD', '.']

        predicate_arg_categories = fields['predicate_arg_tags'].labels
        assert predicate_arg_categories == ['N_73/N_73', 'N', ',', 'N_93/N_93', 'N', '(S[adj]\\NP_83)\\NP_84',
                                            ',', '(S[dcl]\\NP_10)/(S[b]_11\\NP_10:B)_11', '(S[b]\\NP)/NP',
                                            'NP[nb]_29/N_29', 'N', '((S_1\\NP_2)_1\\(S_1\\NP_2)_1)/NP',
                                            'NP[nb]_48/N_48', 'N_43/N_43', 'N',
                                            '((S_61\\NP_56)_61\\(S_61\\NP_56)_61)/N[num]_62', 'N[num]', '.']
    def test_read_from_file(self, lazy):
        reader = SnliReader(lazy=lazy)
        instances = reader.read(AllenNlpTestCase.FIXTURES_ROOT / u'data' / u'snli.jsonl')
        instances = ensure_list(instances)

        instance1 = {u"premise": [u"A", u"person", u"on", u"a", u"horse", u"jumps", u"over", u"a", u"broken",
                                 u"down", u"airplane", u"."],
                     u"hypothesis": [u"A", u"person", u"is", u"training", u"his", u"horse", u"for", u"a",
                                    u"competition", u"."],
                     u"label": u"neutral"}

        instance2 = {u"premise": [u"A", u"person", u"on", u"a", u"horse", u"jumps", u"over", u"a", u"broken",
                                 u"down", u"airplane", u"."],
                     u"hypothesis": [u"A", u"person", u"is", u"at", u"a", u"diner", u",", u"ordering", u"an",
                                    u"omelette", u"."],
                     u"label": u"contradiction"}
        instance3 = {u"premise": [u"A", u"person", u"on", u"a", u"horse", u"jumps", u"over", u"a", u"broken",
                                 u"down", u"airplane", u"."],
                     u"hypothesis": [u"A", u"person", u"is", u"outdoors", u",", u"on", u"a", u"horse", u"."],
                     u"label": u"entailment"}

        assert len(instances) == 3
        fields = instances[0].fields
        assert [t.text for t in fields[u"premise"].tokens] == instance1[u"premise"]
        assert [t.text for t in fields[u"hypothesis"].tokens] == instance1[u"hypothesis"]
        assert fields[u"label"].label == instance1[u"label"]
        fields = instances[1].fields
        assert [t.text for t in fields[u"premise"].tokens] == instance2[u"premise"]
        assert [t.text for t in fields[u"hypothesis"].tokens] == instance2[u"hypothesis"]
        assert fields[u"label"].label == instance2[u"label"]
        fields = instances[2].fields
        assert [t.text for t in fields[u"premise"].tokens] == instance3[u"premise"]
        assert [t.text for t in fields[u"hypothesis"].tokens] == instance3[u"hypothesis"]
        assert fields[u"label"].label == instance3[u"label"]
    def test_read_from_file_ag_news_corpus_and_truncates_properly(self, lazy):
        reader = TextClassificationJsonReader(lazy=lazy, max_sequence_length=5)
        ag_path = (
            AllenNlpTestCase.FIXTURES_ROOT
            / "data"
            / "text_classification_json"
            / "ag_news_corpus.jsonl"
        )
        instances = reader.read(ag_path)
        instances = ensure_list(instances)

        instance1 = {"tokens": ["Memphis", "Rout", "Still", "Stings", "for"], "label": "2"}
        instance2 = {"tokens": ["AP", "-", "Eli", "Manning", "has"], "label": "2"}
        instance3 = {"tokens": ["A", "conference", "dedicated", "to", "online"], "label": "4"}

        assert len(instances) == 3
        fields = instances[0].fields
        assert [t.text for t in fields["tokens"].tokens] == instance1["tokens"]
        assert fields["label"].label == instance1["label"]
        fields = instances[1].fields
        assert [t.text for t in fields["tokens"].tokens] == instance2["tokens"]
        assert fields["label"].label == instance2["label"]
        fields = instances[2].fields
        assert [t.text for t in fields["tokens"].tokens] == instance3["tokens"]
        assert fields["label"].label == instance3["label"]
    def test_read_from_file(self, lazy):
        reader = LanguageModelingReader(tokens_per_instance=3, lazy=lazy)

        instances = ensure_list(
            reader.read('tests/fixtures/data/language_modeling.txt'))
        # The last potential instance is left out, which is ok, because we don't have an end token
        # in here, anyway.
        assert len(instances) == 5

        assert [t.text for t in instances[0].fields["input_tokens"].tokens
                ] == ["This", "is", "a"]
        assert [t.text for t in instances[0].fields["output_tokens"].tokens
                ] == ["is", "a", "sentence"]

        assert [t.text for t in instances[1].fields["input_tokens"].tokens
                ] == ["sentence", "for", "language"]
        assert [t.text for t in instances[1].fields["output_tokens"].tokens
                ] == ["for", "language", "modelling"]

        assert [t.text for t in instances[2].fields["input_tokens"].tokens
                ] == ["modelling", ".", "Here"]
        assert [t.text for t in instances[2].fields["output_tokens"].tokens
                ] == [".", "Here", "'s"]

        assert [t.text for t in instances[3].fields["input_tokens"].tokens
                ] == ["'s", "another", "one"]
        assert [t.text for t in instances[3].fields["output_tokens"].tokens
                ] == ["another", "one", "for"]

        assert [t.text for t in instances[4].fields["input_tokens"].tokens
                ] == ["for", "extra", "language"]
        assert [t.text for t in instances[4].fields["output_tokens"].tokens
                ] == ["extra", "language", "modelling"]
Example #34
    def test_ner_reader_can_filter_by_domain(self):
        conll_reader = OntonotesNamedEntityRecognition(
            domain_identifier="subdomain2")
        instances = conll_reader.read(FIXTURES_ROOT / "structured_prediction" /
                                      "srl" / "conll_2012")
        instances = ensure_list(instances)
        assert len(instances) == 1
    def test_read_from_file(self):

        # Read in the data
        reader = SegmentedMHDDatasetReader()
        instances = ensure_list(
            reader.read('tests/fixtures/test_segments.jsonl'))

        # Define our expectations
        instance0 = {
            'session_id':
            '1337',
            'utterances':
            [['First', 'utterance'], ['Second', 'utterance'],
             ['Third', 'utterance', 'with', 'a', 'different', 'label']],
            'speakers': ['speaker0', 'speaker1', 'speaker0'],
            'labels': ['label0', 'label1'],
            'durations': [2, 1]
        }

        assert len(instances) == 2  # Ensure the data has the correct number of elements

        # Check first instance matches
        fields = instances[0].fields
        utterances = [[x.text for x in utterance.tokens]
                      for utterance in fields['utterances'].field_list]

        assert utterances == instance0['utterances']
        speakers = [x.text for x in fields['speakers'].tokens]
        assert speakers == instance0['speakers']
        assert fields['labels'].labels == instance0['labels']
        print(fields['durations'])
        assert fields['durations'].int_list == instance0['durations']
    def test_read_from_file(self, lazy):
        reader = LanguageModelingReader(tokens_per_instance=3, lazy=lazy)

        instances = ensure_list(
            reader.read(AllenNlpTestCase.FIXTURES_ROOT / u'data' /
                        u'language_modeling.txt'))
        # The last potential instance is left out, which is ok, because we don't have an end token
        # in here, anyway.
        assert len(instances) == 5

        assert [t.text for t in instances[0].fields[u"input_tokens"].tokens
                ] == [u"This", u"is", u"a"]
        assert [t.text for t in instances[0].fields[u"output_tokens"].tokens
                ] == [u"is", u"a", u"sentence"]

        assert [t.text for t in instances[1].fields[u"input_tokens"].tokens
                ] == [u"sentence", u"for", u"language"]
        assert [t.text for t in instances[1].fields[u"output_tokens"].tokens
                ] == [u"for", u"language", u"modelling"]

        assert [t.text for t in instances[2].fields[u"input_tokens"].tokens
                ] == [u"modelling", u".", u"Here"]
        assert [t.text for t in instances[2].fields[u"output_tokens"].tokens
                ] == [u".", u"Here", u"'s"]

        assert [t.text for t in instances[3].fields[u"input_tokens"].tokens
                ] == [u"'s", u"another", u"one"]
        assert [t.text for t in instances[3].fields[u"output_tokens"].tokens
                ] == [u"another", u"one", u"for"]

        assert [t.text for t in instances[4].fields[u"input_tokens"].tokens
                ] == [u"for", u"extra", u"language"]
        assert [t.text for t in instances[4].fields[u"output_tokens"].tokens
                ] == [u"extra", u"language", u"modelling"]
Example #37
    def test_default_format(self, lazy):
        reader = Seq2SeqDatasetReader(lazy=lazy)
        instances = reader.read('tests/fixtures/data/seq2seq_copy.tsv')
        instances = ensure_list(instances)

        assert len(instances) == 3
        fields = instances[0].fields
        assert [t.text for t in fields["source_tokens"].tokens
                ] == ["@@START@@", "this", "is", "a", "sentence", "@@END@@"]
        assert [t.text for t in fields["target_tokens"].tokens
                ] == ["@@START@@", "this", "is", "a", "sentence", "@@END@@"]
        fields = instances[1].fields
        assert [t.text for t in fields["source_tokens"].tokens
                ] == ["@@START@@", "this", "is", "another", "@@END@@"]
        assert [t.text for t in fields["target_tokens"].tokens
                ] == ["@@START@@", "this", "is", "another", "@@END@@"]
        fields = instances[2].fields
        assert [t.text for t in fields["source_tokens"].tokens] == [
            "@@START@@", "all", "these", "sentences", "should", "get",
            "copied", "@@END@@"
        ]
        assert [t.text for t in fields["target_tokens"].tokens] == [
            "@@START@@", "all", "these", "sentences", "should", "get",
            "copied", "@@END@@"
        ]
    def test_read_from_file(self, lazy, coding_scheme):
        conll_reader = Conll2003DatasetReader(lazy=lazy,
                                              coding_scheme=coding_scheme)
        instances = conll_reader.read(
            str(AllenNlpTestCase.FIXTURES_ROOT / "data" / "conll2003.txt"))
        instances = ensure_list(instances)

        if coding_scheme == "IOB1":
            expected_labels = ["I-ORG", "O", "I-PER", "O", "O", "I-LOC", "O"]
        else:
            expected_labels = ["U-ORG", "O", "U-PER", "O", "O", "U-LOC", "O"]

        fields = instances[0].fields
        tokens = [t.text for t in fields["tokens"].tokens]
        assert tokens == [
            "U.N.", "official", "Ekeus", "heads", "for", "Baghdad", "."
        ]
        assert fields["tags"].labels == expected_labels

        fields = instances[1].fields
        tokens = [t.text for t in fields["tokens"].tokens]
        assert tokens == [
            "AI2", "engineer", "Joel", "lives", "in", "Seattle", "."
        ]
        assert fields["tags"].labels == expected_labels
Example #39
    def test_read_from_file(self, lazy):
        conll_reader = ConllCorefReader(max_span_width=self.span_width, lazy=lazy)
        instances = ensure_list(conll_reader.read(str(AllenNlpTestCase.FIXTURES_ROOT /
                                                      'coref' / 'coref.gold_conll')))

        assert len(instances) == 2

        fields = instances[0].fields
        text = [x.text for x in fields["text"].tokens]

        assert text == ['In', 'the', 'summer', 'of', '2005', ',', 'a', 'picture', 'that',
                        'people', 'have', 'long', 'been', 'looking', 'forward', 'to',
                        'started', 'emerging', 'with', 'frequency', 'in', 'various', 'major',
                        'Hong', 'Kong', 'media', '.', 'With', 'their', 'unique', 'charm', ',',
                        'these', 'well', '-', 'known', 'cartoon', 'images', 'once', 'again',
                        'caused', 'Hong', 'Kong', 'to', 'be', 'a', 'focus', 'of', 'worldwide',
                        'attention', '.', 'The', 'world', "'s", 'fifth', 'Disney', 'park',
                        'will', 'soon', 'open', 'to', 'the', 'public', 'here', '.']

        spans = fields["spans"].field_list
        span_starts, span_ends = zip(*[(field.span_start, field.span_end) for field in spans])

        candidate_mentions = self.check_candidate_mentions_are_well_defined(span_starts, span_ends, text)

        gold_span_labels = fields["span_labels"]
        gold_indices_with_ids = [(i, x) for i, x in enumerate(gold_span_labels.labels) if x != -1]
        gold_mentions_with_ids: List[Tuple[List[str], int]] = [(candidate_mentions[i], x)
                                                               for i, x in gold_indices_with_ids]

        assert (["Hong", "Kong"], 0) in gold_mentions_with_ids
        gold_mentions_with_ids.remove((["Hong", "Kong"], 0))
        assert (["Hong", "Kong"], 0) in gold_mentions_with_ids
        assert (["their"], 1) in gold_mentions_with_ids
        # This is a span which exceeds our max_span_width, so it should not be considered.
        assert not (["these", "well", "known", "cartoon", "images"], 1) in gold_mentions_with_ids

        fields = instances[1].fields
        text = [x.text for x in fields["text"].tokens]
        assert text == ['The', 'area', 'of', 'Hong', 'Kong', 'is', 'only', 'one', 'thousand', '-', 'plus',
                        'square', 'kilometers', '.', 'The', 'population', 'is', 'dense', '.', 'Natural',
                        'resources', 'are', 'relatively', 'scarce', '.', 'However', ',', 'the', 'clever',
                        'Hong', 'Kong', 'people', 'will', 'utilize', 'all', 'resources', 'they', 'have',
                        'created', 'for', 'developing', 'the', 'Hong', 'Kong', 'tourism', 'industry', '.']

        spans = fields["spans"].field_list
        span_starts, span_ends = zip(*[(field.span_start, field.span_end) for field in spans])

        candidate_mentions = self.check_candidate_mentions_are_well_defined(span_starts, span_ends, text)

        gold_span_labels = fields["span_labels"]
        gold_indices_with_ids = [(i, x) for i, x in enumerate(gold_span_labels.labels) if x != -1]
        gold_mentions_with_ids: List[Tuple[List[str], int]] = [(candidate_mentions[i], x)
                                                               for i, x in gold_indices_with_ids]

        assert (["Hong", "Kong"], 0) in gold_mentions_with_ids
        gold_mentions_with_ids.remove((["Hong", "Kong"], 0))
        assert (["Hong", "Kong"], 0) in gold_mentions_with_ids
        assert (["they"], 1) in gold_mentions_with_ids
        assert (['the', 'clever', 'Hong', 'Kong', 'people'], 1) in gold_mentions_with_ids
Example #40
    def __init__(self, instances: Iterable[Instance]) -> None:
        """
        A Batch just takes an iterable of instances in its constructor and hangs onto them
        in a list.
        """
        super().__init__()

        self.instances: List[Instance] = ensure_list(instances)
        self._check_types()
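
A minimal, self-contained sketch (not part of the original listing) of the round trip described in the docstring above; the instance contents are made up for illustration, and the Batch import path assumes an allennlp 0.x-style layout:

from allennlp.data import Instance, Token, Vocabulary
from allennlp.data.dataset import Batch
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer

indexers = {"tokens": SingleIdTokenIndexer()}
instances = (Instance({"tokens": TextField([Token(word)], indexers)})
             for word in ["cats", "dogs"])

batch = Batch(instances)
assert isinstance(batch.instances, list)  # the generator was materialized via ensure_list

vocab = Vocabulary.from_instances(batch.instances)
batch.index_instances(vocab)
tensors = batch.as_tensor_dict(batch.get_padding_lengths())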
    def get_num_batches(self, instances: Iterable[Instance]) -> int:
        if is_lazy(instances) and self._instances_per_epoch is None:
            # Unable to compute num batches, so just return 1.
            return 1
        elif self._instances_per_epoch is not None:
            return math.ceil(self._instances_per_epoch / self._batch_size)
        else:
            # Not lazy, so can compute the list length.
            return math.ceil(len(ensure_list(instances)) / self._batch_size)
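
For reference, a sketch (an assumption about behaviour, not the library source) of what both get_num_batches variants rely on from ensure_list: a list passes through untouched, while any other iterable is materialized so that len() is safe to call.

from typing import Iterable, List


def ensure_list_sketch(iterable: Iterable) -> List:
    # Hypothetical stand-in for allennlp.common.util.ensure_list.
    return iterable if isinstance(iterable, list) else list(iterable)


assert ensure_list_sketch(i for i in range(3)) == [0, 1, 2]
already_a_list = [1, 2, 3]
assert ensure_list_sketch(already_a_list) is already_a_list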
Example #42
    def test_source_add_start_token(self):
        reader = Seq2SeqDatasetReader(source_add_start_token=False)
        instances = reader.read(str(AllenNlpTestCase.FIXTURES_ROOT / 'data' / 'seq2seq_copy.tsv'))
        instances = ensure_list(instances)

        assert len(instances) == 3
        fields = instances[0].fields
        assert [t.text for t in fields["source_tokens"].tokens] == ["this", "is", "a", "sentence", "@end@"]
        assert [t.text for t in fields["target_tokens"].tokens] == ["@start@", "this", "is",
                                                                    "a", "sentence", "@end@"]
Example #43
    def test_source_add_start_token(self):
        reader = Seq2SeqDatasetReader(source_add_start_token=False)
        instances = reader.read('tests/fixtures/data/seq2seq_copy.tsv')
        instances = ensure_list(instances)

        assert len(instances) == 3
        fields = instances[0].fields
        assert [t.text for t in fields["source_tokens"].tokens] == ["this", "is", "a", "sentence", "@@END@@"]
        assert [t.text for t in fields["target_tokens"].tokens] == ["@@START@@", "this", "is",
                                                                    "a", "sentence", "@@END@@"]
Example #44
    def test_vocab_from_instances_namespaces(self):
        reader = CcgBankDatasetReader(feature_labels=['modified_pos', 'original_pos', 'predicate_arg'])
        instances = ensure_list(reader.read(self.FIXTURES_ROOT / 'data' / 'ccgbank.txt'))
        # check that we didn't clobber the labels namespace
        vocab = Vocabulary.from_instances(instances)
        self.assertSetEqual(
                set(vocab._token_to_index.keys()),  # pylint: disable=protected-access
                {'tokens', 'labels', 'modified_pos_tags', 'original_pos_tags',
                 'predicate_arg_tags'}
        )
Example #45
    def test_read_from_file(self, lazy):
        reader = QangarooReader(lazy=lazy)
        instances = ensure_list(reader.read(AllenNlpTestCase.FIXTURES_ROOT / 'data' / 'qangaroo.json'))
        assert len(instances) == 2

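        # answer_index is expected to point at the answer's position in the
        # candidate list; here "german empire" sits at index 4.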
        assert [t.text for t in instances[0].fields['candidates'][3]] == ['german', 'confederation']
        assert [t.text for t in instances[0].fields['query']] == ['country', 'sms', 'braunschweig']
        assert [t.text for t in instances[0].fields['supports'][0][:3]] == ['The', 'North', 'German']
        assert [t.text for t in instances[0].fields['answer']] == ['german', 'empire']
        assert instances[0].fields['answer_index'].sequence_index == 4
    def test_non_lazy(self):
        reader = LazyDatasetReader(self.instances, lazy=False)
        assert reader.num_reads == 0

        instances = reader.read('path/to/file')

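        # With lazy=False the instances are materialized once up front, so
        # iterating over them repeatedly must not trigger additional reads.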
        for _ in range(10):
            _instances = (i for i in instances)
            assert ensure_list(_instances) == self.instances

        assert reader.num_reads == 1
Example #47
    def get_num_batches(self, instances: Iterable[Instance]) -> int:
        """
        Returns the number of batches that ``instances`` will be split into; if you want to track
        progress through the batches with the generator produced by ``__call__``, this could be
        useful.
        """
        if is_lazy(instances) and self._instances_per_epoch is None:
            # Unable to compute num batches, so just return 1.
            return 1
        elif self._instances_per_epoch is not None:
            return math.ceil(self._instances_per_epoch / self._batch_size)
        else:
            # Not lazy, so can compute the list length.
            return math.ceil(len(ensure_list(instances)) / self._batch_size)
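
A quick worked check of the ceiling division used above (hypothetical sizes: ten instances per epoch, batch size four):

import math

# Ten instances in batches of four -> 4 + 4 + 2, i.e. three batches.
assert math.ceil(10 / 4) == 3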
    def test_2_class(self):
        reader = StanfordSentimentTreeBankDatasetReader(granularity="2-class")
        instances = reader.read(self.sst_path)
        instances = ensure_list(instances)

        instance1 = {"tokens": ["The", "actors", "are", "fantastic", "."],
                     "label": "1"}
        instance2 = {"tokens": ["It", "was", "terrible", "."],
                     "label": "0"}

        assert len(instances) == 2
        fields = instances[0].fields
        assert [t.text for t in fields["tokens"].tokens] == instance1["tokens"]
        assert fields["label"].label == instance1["label"]
        fields = instances[1].fields
        assert [t.text for t in fields["tokens"].tokens] == instance2["tokens"]
        assert fields["label"].label == instance2["label"]
    def test_brown_corpus_format(self):
        reader = SequenceTaggingDatasetReader(word_tag_delimiter='/')
        instances = reader.read('tests/fixtures/data/brown_corpus.txt')
        instances = ensure_list(instances)

        assert len(instances) == 4
        fields = instances[0].fields
        assert [t.text for t in fields["tokens"].tokens] == ["cats", "are", "animals", "."]
        assert fields["tags"].labels == ["N", "V", "N", "N"]
        fields = instances[1].fields
        assert [t.text for t in fields["tokens"].tokens] == ["dogs", "are", "animals", "."]
        assert fields["tags"].labels == ["N", "V", "N", "N"]
        fields = instances[2].fields
        assert [t.text for t in fields["tokens"].tokens] == ["snakes", "are", "animals", "."]
        assert fields["tags"].labels == ["N", "V", "N", "N"]
        fields = instances[3].fields
        assert [t.text for t in fields["tokens"].tokens] == ["birds", "are", "animals", "."]
        assert fields["tags"].labels == ["N", "V", "N", "N"]
    def test_default_format(self, lazy):
        reader = SequenceTaggingDatasetReader(lazy=lazy)
        instances = reader.read('tests/fixtures/data/sequence_tagging.tsv')
        instances = ensure_list(instances)

        assert len(instances) == 4
        fields = instances[0].fields
        assert [t.text for t in fields["tokens"].tokens] == ["cats", "are", "animals", "."]
        assert fields["tags"].labels == ["N", "V", "N", "N"]
        fields = instances[1].fields
        assert [t.text for t in fields["tokens"].tokens] == ["dogs", "are", "animals", "."]
        assert fields["tags"].labels == ["N", "V", "N", "N"]
        fields = instances[2].fields
        assert [t.text for t in fields["tokens"].tokens] == ["snakes", "are", "animals", "."]
        assert fields["tags"].labels == ["N", "V", "N", "N"]
        fields = instances[3].fields
        assert [t.text for t in fields["tokens"].tokens] == ["birds", "are", "animals", "."]
        assert fields["tags"].labels == ["N", "V", "N", "N"]
    def test_read_from_file(self, lazy, coding_scheme):
        conll_reader = Conll2003DatasetReader(lazy=lazy, coding_scheme=coding_scheme)
        instances = conll_reader.read(str(AllenNlpTestCase.FIXTURES_ROOT / 'data' / 'conll2003.txt'))
        instances = ensure_list(instances)

        if coding_scheme == 'IOB1':
            expected_labels = ['I-ORG', 'O', 'I-PER', 'O', 'O', 'I-LOC', 'O']
        else:
            expected_labels = ['U-ORG', 'O', 'U-PER', 'O', 'O', 'U-LOC', 'O']

        fields = instances[0].fields
        tokens = [t.text for t in fields['tokens'].tokens]
        assert tokens == ['U.N.', 'official', 'Ekeus', 'heads', 'for', 'Baghdad', '.']
        assert fields["tags"].labels == expected_labels

        fields = instances[1].fields
        tokens = [t.text for t in fields['tokens'].tokens]
        assert tokens == ['AI2', 'engineer', 'Joel', 'lives', 'in', 'Seattle', '.']
        assert fields["tags"].labels == expected_labels
    def test_read_from_file(self, lazy):
        conll_reader = SrlReader(lazy=lazy)
        instances = conll_reader.read(AllenNlpTestCase.FIXTURES_ROOT / 'conll_2012' / 'subdomain')
        instances = ensure_list(instances)

        fields = instances[0].fields
        tokens = [t.text for t in fields['tokens'].tokens]
        assert tokens == ["Mali", "government", "officials", "say", "the", "woman", "'s",
                          "confession", "was", "forced", "."]
        assert fields["verb_indicator"].labels[3] == 1
        assert fields["tags"].labels == ['B-ARG0', 'I-ARG0', 'I-ARG0', 'B-V', 'B-ARG1',
                                         'I-ARG1', 'I-ARG1', 'I-ARG1', 'I-ARG1', 'I-ARG1', 'O']

        fields = instances[1].fields
        tokens = [t.text for t in fields['tokens'].tokens]
        assert tokens == ["Mali", "government", "officials", "say", "the", "woman", "'s",
                          "confession", "was", "forced", "."]
        assert fields["verb_indicator"].labels[8] == 1
        assert fields["tags"].labels == ['O', 'O', 'O', 'O', 'B-ARG1', 'I-ARG1',
                                         'I-ARG1', 'I-ARG1', 'B-V', 'B-ARG2', 'O']

        fields = instances[2].fields
        tokens = [t.text for t in fields['tokens'].tokens]
        assert tokens == ['The', 'prosecution', 'rested', 'its', 'case', 'last', 'month', 'after',
                          'four', 'months', 'of', 'hearings', '.']
        assert fields["verb_indicator"].labels[2] == 1
        assert fields["tags"].labels == ['B-ARG0', 'I-ARG0', 'B-V', 'B-ARG1', 'I-ARG1', 'B-ARGM-TMP',
                                         'I-ARGM-TMP', 'B-ARGM-TMP', 'I-ARGM-TMP', 'I-ARGM-TMP',
                                         'I-ARGM-TMP', 'I-ARGM-TMP', 'O']

        fields = instances[3].fields
        tokens = [t.text for t in fields['tokens'].tokens]
        assert tokens == ['The', 'prosecution', 'rested', 'its', 'case', 'last', 'month', 'after',
                          'four', 'months', 'of', 'hearings', '.']
        assert fields["verb_indicator"].labels[11] == 1
        assert fields["tags"].labels == ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-V', 'O']

        # Tests a sentence with no verbal predicates.
        fields = instances[4].fields
        tokens = [t.text for t in fields['tokens'].tokens]
        assert tokens == ["Denise", "Dillon", "Headline", "News", "."]
        assert fields["verb_indicator"].labels == [0, 0, 0, 0, 0]
        assert fields["tags"].labels == ['O', 'O', 'O', 'O', 'O']
Example #53
    def test_default_format(self, lazy):
        reader = Seq2SeqDatasetReader(lazy=lazy)
        instances = reader.read('tests/fixtures/data/seq2seq_copy.tsv')
        instances = ensure_list(instances)

        assert len(instances) == 3
        fields = instances[0].fields
        assert [t.text for t in fields["source_tokens"].tokens] == ["@@START@@", "this", "is",
                                                                    "a", "sentence", "@@END@@"]
        assert [t.text for t in fields["target_tokens"].tokens] == ["@@START@@", "this", "is",
                                                                    "a", "sentence", "@@END@@"]
        fields = instances[1].fields
        assert [t.text for t in fields["source_tokens"].tokens] == ["@@START@@", "this", "is",
                                                                    "another", "@@END@@"]
        assert [t.text for t in fields["target_tokens"].tokens] == ["@@START@@", "this", "is",
                                                                    "another", "@@END@@"]
        fields = instances[2].fields
        assert [t.text for t in fields["source_tokens"].tokens] == ["@@START@@", "all", "these", "sentences",
                                                                    "should", "get", "copied", "@@END@@"]
        assert [t.text for t in fields["target_tokens"].tokens] == ["@@START@@", "all", "these", "sentences",
                                                                    "should", "get", "copied", "@@END@@"]
Example #54
    def test_default_format(self, lazy):
        reader = Seq2SeqDatasetReader(lazy=lazy)
        instances = reader.read(str(AllenNlpTestCase.FIXTURES_ROOT / 'data' / 'seq2seq_copy.tsv'))
        instances = ensure_list(instances)

        assert len(instances) == 3
        fields = instances[0].fields
        assert [t.text for t in fields["source_tokens"].tokens] == ["@start@", "this", "is",
                                                                    "a", "sentence", "@end@"]
        assert [t.text for t in fields["target_tokens"].tokens] == ["@start@", "this", "is",
                                                                    "a", "sentence", "@end@"]
        fields = instances[1].fields
        assert [t.text for t in fields["source_tokens"].tokens] == ["@start@", "this", "is",
                                                                    "another", "@end@"]
        assert [t.text for t in fields["target_tokens"].tokens] == ["@start@", "this", "is",
                                                                    "another", "@end@"]
        fields = instances[2].fields
        assert [t.text for t in fields["source_tokens"].tokens] == ["@start@", "all", "these", "sentences",
                                                                    "should", "get", "copied", "@end@"]
        assert [t.text for t in fields["target_tokens"].tokens] == ["@start@", "all", "these", "sentences",
                                                                    "should", "get", "copied", "@end@"]
    def test_read_from_file(self, lazy):
        reader = StanfordSentimentTreeBankDatasetReader(lazy=lazy)
        instances = reader.read(self.sst_path)
        instances = ensure_list(instances)

        instance1 = {"tokens": ["The", "actors", "are", "fantastic", "."],
                     "label": "4"}
        instance2 = {"tokens": ["It", "was", "terrible", "."],
                     "label": "0"}
        instance3 = {"tokens": ["Chomp", "chomp", "!"],
                     "label": "2"}

        assert len(instances) == 3
        fields = instances[0].fields
        assert [t.text for t in fields["tokens"].tokens] == instance1["tokens"]
        assert fields["label"].label == instance1["label"]
        fields = instances[1].fields
        assert [t.text for t in fields["tokens"].tokens] == instance2["tokens"]
        assert fields["label"].label == instance2["label"]
        fields = instances[2].fields
        assert [t.text for t in fields["tokens"].tokens] == instance3["tokens"]
        assert fields["label"].label == instance3["label"]
    def test_use_subtrees(self):
        reader = StanfordSentimentTreeBankDatasetReader(use_subtrees=True)
        instances = reader.read(self.sst_path)
        instances = ensure_list(instances)

        instance1 = {"tokens": ["The", "actors", "are", "fantastic", "."],
                     "label": "4"}
        instance2 = {"tokens": ["The", "actors"],
                     "label": "2"}
        instance3 = {"tokens": ["The"],
                     "label": "2"}

        assert len(instances) == 21
        fields = instances[0].fields
        assert [t.text for t in fields["tokens"].tokens] == instance1["tokens"]
        assert fields["label"].label == instance1["label"]
        fields = instances[1].fields
        assert [t.text for t in fields["tokens"].tokens] == instance2["tokens"]
        assert fields["label"].label == instance2["label"]
        fields = instances[2].fields
        assert [t.text for t in fields["tokens"].tokens] == instance3["tokens"]
        assert fields["label"].label == instance3["label"]
Example #57
    def test_read_from_file(self, lazy):
        conll_reader = OntonotesNamedEntityRecognition(lazy=lazy)
        instances = conll_reader.read(AllenNlpTestCase.FIXTURES_ROOT / 'conll_2012' / 'subdomain')
        instances = ensure_list(instances)

        fields = instances[0].fields
        tokens = [t.text for t in fields['tokens'].tokens]
        assert tokens == ["Mali", "government", "officials", "say", "the", "woman", "'s",
                          "confession", "was", "forced", "."]
        assert fields["tags"].labels == ['B-GPE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

        fields = instances[1].fields
        tokens = [t.text for t in fields['tokens'].tokens]
        assert tokens == ['The', 'prosecution', 'rested', 'its', 'case', 'last', 'month', 'after',
                          'four', 'months', 'of', 'hearings', '.']
        assert fields["tags"].labels == ['O', 'O', 'O', 'O', 'O', 'B-DATE', 'I-DATE', 'O',
                                         'B-DATE', 'I-DATE', 'O', 'O', 'O']

        fields = instances[2].fields
        tokens = [t.text for t in fields['tokens'].tokens]
        assert tokens == ["Denise", "Dillon", "Headline", "News", "."]
        assert fields["tags"].labels == ['B-PERSON', 'I-PERSON', 'B-WORK_OF_ART', 'I-WORK_OF_ART', 'O']
    def test_read_from_file(self, lazy):
        conll_reader = WinobiasReader(max_span_width=self.span_width, lazy=lazy)
        instances = ensure_list(conll_reader.read(str(AllenNlpTestCase.FIXTURES_ROOT /
                                                      'coref' / 'winobias.sample')))

        assert len(instances) == 2

        fields = instances[0].fields
        text = [x.text for x in fields["text"].tokens]
        assert text == ['The', 'designer', 'argued', 'with', 'the', 'developer',
                        'and', 'slapped', 'her', 'in', 'the', 'face', '.']

        spans = fields["spans"].field_list
        span_starts, span_ends = zip(*[(field.span_start, field.span_end) for field in spans])

        candidate_mentions = self.check_candidate_mentions_are_well_defined(span_starts, span_ends, text)

        gold_span_labels = fields["span_labels"]
        gold_indices_with_ids = [(i, x) for i, x in enumerate(gold_span_labels.labels) if x != -1]
        gold_mentions_with_ids: List[Tuple[List[str], int]] = [(candidate_mentions[i], x)
                                                               for i, x in gold_indices_with_ids]
        assert gold_mentions_with_ids == [(['the', 'developer'], 0), (['her'], 0)]

        fields = instances[1].fields
        text = [x.text for x in fields["text"].tokens]
        assert text == ['The', 'salesperson', 'sold', 'some', 'books', 'to', 'the',
                        'librarian', 'because', 'she', 'was', 'trying', 'to', 'sell', 'them', '.']

        spans = fields["spans"].field_list
        span_starts, span_ends = zip(*[(field.span_start, field.span_end) for field in spans])
        candidate_mentions = self.check_candidate_mentions_are_well_defined(span_starts, span_ends, text)

        gold_span_labels = fields["span_labels"]
        gold_indices_with_ids = [(i, x) for i, x in enumerate(gold_span_labels.labels) if x != -1]
        gold_mentions_with_ids: List[Tuple[List[str], int]] = [(candidate_mentions[i], x)
                                                               for i, x in gold_indices_with_ids]
        assert gold_mentions_with_ids == [(['The', 'salesperson'], 0),
                                          (['some', 'books'], 1),
                                          (['she'], 0), (['them'], 1)]