def test_set_skip_indexing_true(self, lazy: bool, label_name: str):
    """Check ``skip_label_indexing=True`` on ``TextSentimentReader``.

    Integer labels must come back unchanged, and reading the imdb fixture
    must raise ``ValueError`` with the documented message. The fixture
    variant is chosen from ``label_name``.
    """
    default_name = label_name == 'text_sentiment'
    reader = TextSentimentReader(lazy=lazy,
                                 skip_label_indexing=True,
                                 label_name=label_name)

    integer_file = "integer_labels.jsonl" if default_name else "integer_labels_original.jsonl"
    integer_path = Path(DATA_DIR, integer_file).resolve()
    loaded = ensure_list(reader.read(integer_path))

    expected = [
        (["This", "text", "has", "label", "0"], 0),
        (["This", "text", "has", "label", "1"], 1),
    ]
    assert len(loaded) == 2
    for instance, (tokens, label) in zip(loaded, expected):
        fields = instance.fields
        assert [t.text for t in fields["tokens"].tokens] == tokens
        assert fields["label"].label == label

    imdb_file = "imdb_corpus.jsonl" if default_name else "imdb_corpus_original.jsonl"
    imdb_path = Path(DATA_DIR, imdb_file).resolve()
    with pytest.raises(ValueError) as exec_info:
        ensure_list(reader.read(imdb_path))
    # The message check must stay outside the ``with`` block: nothing after
    # the raising call would run inside it.
    assert str(exec_info.value) == "Labels must be integers if skip_label_indexing is True."
def test_spans_work_correctly(self):
    """Check the number and type of enumerated spans for widths 1, 2 and 3."""
    fixture = 'tests/fixtures/parallel_copy.tsv'

    # Width 1: exactly one SpanField per source token.
    width_one = ensure_list(SpanAeDatasetReader(max_span_width=1).read(fixture))
    assert len(width_one) == 3
    first_fields = width_one[0].fields
    assert type(first_fields["source_spans"]) is ListField
    assert type(first_fields["source_spans"].field_list[0]) is SpanField
    assert len(first_fields["source_spans"].field_list) == len(
        first_fields["source_tokens"].tokens)

    # Wider spans: expected count is len(tokens) * width - offset.
    for width, offset in ((2, 1), (3, 3)):
        wide_reader = SpanAeDatasetReader(max_span_width=width)
        wide_instances = ensure_list(wide_reader.read(fixture))
        second_fields = wide_instances[1].fields
        token_count = len(second_fields["source_tokens"].tokens)
        assert len(second_fields["source_spans"].field_list) == token_count * width - offset
def test_set_skip_indexing_true(self, lazy):
    """Check ``skip_label_indexing=True`` on the multi-label JSON reader.

    Integer label lists must come back unchanged; reading the reuters
    fixture must raise ``ValueError`` with the documented message.
    """
    data_dir = Path("tests/fixtures") / "data"
    reader = MultiLabelTextClassificationJsonReader(
        lazy=lazy, skip_label_indexing=True, num_labels=3)

    loaded = ensure_list(reader.read(data_dir / "integer_labels.jsonl"))
    expected = [
        (["This", "text", "has", "labels", "0", "2"], [0, 2]),
        (["This", "text", "has", "labels", "0", "1"], [0, 1]),
    ]
    assert len(loaded) == 2
    for instance, (tokens, labels) in zip(loaded, expected):
        fields = instance.fields
        assert [t.text for t in fields["tokens"].tokens] == tokens
        assert fields["labels"].labels == labels

    string_label_path = data_dir / "reuters-21578" / "train.jsonl"
    with pytest.raises(ValueError) as exec_info:
        ensure_list(reader.read(string_label_path))
    assert str(exec_info.value) == "Labels must be integers if skip_label_indexing is True."
def test_max_sentences(self):
    """Check that ``max_sentences=2`` truncates long documents to a prefix."""

    def texts(tokens):
        # Plain strings make the membership checks below readable.
        return [token.text for token in tokens]

    coref_path = str(FIXTURES_ROOT / "coref" / "coref.gold_conll")

    full_reader = ConllCorefReader(max_span_width=self.span_width)
    full_instances = ensure_list(full_reader.read(coref_path))
    capped_reader = ConllCorefReader(max_span_width=self.span_width, max_sentences=2)
    capped_instances = ensure_list(capped_reader.read(coref_path))

    assert len(capped_instances) == len(full_instances) == 4

    docs = [instance.fields["text"].tokens for instance in full_instances]
    limited_docs = [instance.fields["text"].tokens for instance in capped_instances]

    # Short ones; not truncated
    for index in (1, 3):
        assert limited_docs[index] == docs[index]

    # Truncation happened
    for index in (0, 2):
        assert len(limited_docs[index]) < len(docs[index])
    for index, dropped_word in ((0, "Disney"), (2, "tourism")):
        assert dropped_word in texts(docs[index]) and dropped_word not in texts(limited_docs[index])

    # Truncated tokens are the prefixes
    for index in (0, 2):
        assert limited_docs[index] == docs[index][: len(limited_docs[index])]
def test_entity_mask(self):
    """Check the tokens written at positions 14 and 17 for each masking mode."""
    reader = get_reader()
    fixture = 'tests/fixtures/tacred/LDC2018T24.json'

    # (entity_masking mode, expected token at index 14, expected token at index 17)
    cases = (
        ('mask', '[MASK]', '[MASK]'),
        ('type/role', '[s-person]', '[o-title]'),
    )
    for mode, expected_subject, expected_object in cases:
        reader.entity_masking = mode
        instances = ensure_list(reader.read(fixture))
        first_tokens = [x.text for x in instances[0]['tokens']]
        self.assertEqual(first_tokens[14], expected_subject)
        self.assertEqual(first_tokens[17], expected_object)
def test_set_skip_indexing_true(self, lazy):
    """Check ``skip_label_indexing=True`` on ``TextClassificationJsonReader``.

    Integer labels must come back unchanged; reading the imdb fixture must
    raise ``ValueError`` with the documented message.
    """
    fixture_dir = AllenNlpTestCase.FIXTURES_ROOT / "data" / "text_classification_json"
    reader = TextClassificationJsonReader(lazy=lazy, skip_label_indexing=True)

    loaded = ensure_list(reader.read(fixture_dir / "integer_labels.jsonl"))
    expected = [
        (["This", "text", "has", "label", "0"], 0),
        (["This", "text", "has", "label", "1"], 1),
    ]
    assert len(loaded) == 2
    for instance, (tokens, label) in zip(loaded, expected):
        fields = instance.fields
        assert [t.text for t in fields["tokens"].tokens] == tokens
        assert fields["label"].label == label

    imdb_path = fixture_dir / "imdb_corpus.jsonl"
    with pytest.raises(ValueError) as exec_info:
        ensure_list(reader.read(imdb_path))
    assert str(exec_info.value) == "Labels must be integers if skip_label_indexing is True."
def test_read_from_file(self):
    """Read one fixture with each of the three ACL-ARC readers and spot-check it."""
    # Citation-intent reader: first instance starts with the expected tokens.
    intent_instances = ensure_list(
        AclarcDatasetReader().read('tests/fixtures/aclarc-train.jsonl'))
    assert len(intent_instances) == 10
    fields = intent_instances[0].fields
    assert isinstance(intent_instances, list)
    leading = [t.text for t in fields['citation_text'].tokens][:4]
    assert leading == ['Typical', 'examples', 'are', 'Bulgarian']

    # Section-title reader: second instance carries tokens and a section label.
    section_instances = ensure_list(
        AclSectionTitleDatasetReader().read('tests/fixtures/aclarc-section-title.jsonl'))
    assert len(section_instances) == 10
    fields = section_instances[1].fields
    assert isinstance(section_instances, list)
    leading = [t.text for t in fields['citation_text'].tokens][:2]
    assert leading == ['With', 'C99']
    assert fields['section_label'].label == 'related work'

    # Cite-worthiness reader: the label is the string 'False', not a bool.
    worthiness_instances = ensure_list(
        AclCiteWorthinessDatasetReader().read('tests/fixtures/aclarc-cite-worthiness.jsonl'))
    fields = worthiness_instances[1].fields
    assert isinstance(worthiness_instances, list)
    assert fields['is_citation'].label == 'False'
def test_length_limit_works(self):
    """Check passage/question truncation in ``SquadReader``.

    Covers three things: truncation with invalid examples skipped,
    truncation with invalid examples kept, and that truncation leaves the
    ``answer_texts`` metadata identical to what an unlimited reader yields.
    """
    squad_path = FIXTURES_ROOT / "rc" / "squad.json"

    # We're making sure the length of the text is correct if length limit is provided.
    reader = SquadReader(passage_length_limit=30,
                         question_length_limit=10,
                         skip_invalid_examples=True)
    instances = ensure_list(reader.read(squad_path))
    assert len(instances[0].fields["question"].tokens) == 10
    assert len(instances[0].fields["passage"].tokens) == 30
    # invalid examples where all the answers exceed the passage length should be skipped.
    assert len(instances) == 3

    # Length limit still works if we do not skip the invalid examples
    reader = SquadReader(passage_length_limit=30,
                         question_length_limit=10,
                         skip_invalid_examples=False)
    instances = ensure_list(reader.read(squad_path))
    assert len(instances[0].fields["question"].tokens) == 10
    assert len(instances[0].fields["passage"].tokens) == 30
    # invalid examples should not be skipped.
    assert len(instances) == 5

    # Make sure the answer texts do not change, so that the evaluation will not be affected.
    # The reference reader must NOT carry the length limits: comparing two identically
    # configured readers would pass no matter what truncation does to the metadata.
    # NOTE(review): assumes omitting both limits means "no truncation" -- confirm against
    # the SquadReader signature.
    reader_unlimited = SquadReader(skip_invalid_examples=False)
    instances_unlimited = ensure_list(reader_unlimited.read(squad_path))
    # zip() would silently ignore a trailing mismatch, so pin the lengths first.
    assert len(instances_unlimited) == len(instances)
    for instance_x, instance_y in zip(instances, instances_unlimited):
        assert set(instance_x.fields["metadata"]["answer_texts"]) == set(
            instance_y.fields["metadata"]["answer_texts"])
def test_read_from_file(self, lazy):
    """Read the SQuAD fixture and spot-check questions, passages and answer spans."""

    def texts(tokens):
        return [t.text for t in tokens]

    reader = SquadReader(lazy=lazy)
    squad_path = AllenNlpTestCase.FIXTURES_ROOT / 'data' / 'squad.json'
    instances = ensure_list(reader.read(squad_path))
    assert len(instances) == 5

    first = instances[0].fields
    assert texts(first["question"].tokens[:3]) == ["To", "whom", "did"]
    assert texts(first["passage"].tokens[:3]) == ["Architecturally", ",", "the"]
    assert texts(first["passage"].tokens[-3:]) == ["of", "Mary", "."]
    assert first["span_start"].sequence_index == 102
    assert first["span_end"].sequence_index == 104

    second = instances[1].fields
    assert texts(second["question"].tokens[:3]) == ["What", "sits", "on"]
    assert texts(second["passage"].tokens[:3]) == ["Architecturally", ",", "the"]
    assert texts(second["passage"].tokens[-3:]) == ["of", "Mary", "."]
    assert second["span_start"].sequence_index == 17
    assert second["span_end"].sequence_index == 23

    # We're checking this case because I changed the answer text to only have a partial
    # annotation for the last token, which happens occasionally in the training data. We're
    # making sure we get a reasonable output in that case here.
    fourth = instances[3].fields
    assert texts(fourth["question"].tokens[:3]) == ["Which", "individual", "worked"]
    assert texts(fourth["passage"].tokens[:3]) == ["In", "1882", ","]
    assert texts(fourth["passage"].tokens[-3:]) == ["Nuclear", "Astrophysics", "."]
    span_start = fourth["span_start"].sequence_index
    span_end = fourth["span_end"].sequence_index
    # span_end is inclusive, hence the +1 in the slice.
    answer_tokens = fourth["passage"].tokens[span_start:(span_end + 1)]
    assert texts(answer_tokens) == ["Father", "Julius", "Nieuwland"]
def setUp(self):
    """Build the reader, instances and vocabulary shared by the tests.

    Everything is driven by the CopyNet ``experiment.json`` fixture: the
    dataset reader and the vocabulary are both constructed from its params.
    """
    # Zero-argument super() avoids hard-coding the class name, so the call
    # keeps working if the test class is renamed.
    super().setUp()
    params = Params.from_file(self.FIXTURES_ROOT / "encoder_decoder" /
                              "copynet_seq2seq" / "experiment.json")
    self.reader = DatasetReader.from_params(params["dataset_reader"])
    instances = self.reader.read(self.FIXTURES_ROOT / "data" / "copynet" /
                                 "copyover.tsv")
    # Keep a concrete list for the tests; the vocabulary is built from the
    # reader's own return value, exactly as before.
    self.instances = ensure_list(instances)
    self.vocab = Vocabulary.from_params(params=params["vocabulary"],
                                        instances=instances)
def test_read(self, lazy):
    """Read the QuAC sample and check questions, labels and previous-answer markers."""
    reader = QuACReader.from_params(Params({'lazy': lazy, 'num_context_answers': 2}))
    sample_path = str(AllenNlpTestCase.FIXTURES_ROOT / 'data' / 'quac_sample.json')
    instances = ensure_list(reader.read(sample_path))

    dialog = instances[0].fields
    assert dialog["question"].sequence_length() == 6
    assert dialog["yesno_list"].sequence_length() == 6
    assert [t.text for t in dialog["question"].field_list[0].tokens[:3]] == ["What", "was", "the"]
    assert len(instances) == 2

    passage_length = len(dialog["passage"].tokens)
    assert [t.text for t in dialog["passage"].tokens[:3]] == ["DJ", "Kool", "Herc"]
    assert [x.label for x in dialog["yesno_list"].field_list] == ["x", "x", "y", "x", "x", "x"]
    assert [x.label for x in dialog["followup_list"].field_list] == ["y", "m", "m", "n", "m", "y"]
    # The first question has no previous answer, so every marker is "O".
    assert dialog["p1_answer_marker"].field_list[0].labels == ["O"] * passage_length

    # Check the previous answer marking here
    q0_span_start = dialog['span_start'].field_list[0].sequence_index
    q0_span_end = dialog['span_end'].field_list[0].sequence_index

    def markers_for(distance):
        # Expected tags when the first answer lies ``distance`` turns back.
        tags = ["O"] * passage_length
        tags[q0_span_start] = "<{0:d}_{1:s}>".format(distance, "start")
        tags[q0_span_end] = "<{0:d}_{1:s}>".format(distance, "end")
        for passage_index in range(q0_span_start + 1, q0_span_end):
            tags[passage_index] = "<{0:d}_{1:s}>".format(distance, "in")
        return tags

    assert dialog["p1_answer_marker"].field_list[1].labels == markers_for(1)
    assert dialog["p2_answer_marker"].field_list[2].labels == markers_for(2)
def test_read_from_file(self):
    """Read the ``dm.sdp`` fixture and check tokens, arc indices and arc labels."""
    reader = SemanticDependenciesDatasetReader()
    instances = ensure_list(reader.read(AllenNlpTestCase.FIXTURES_ROOT / 'data' / 'dm.sdp'))

    # One (tokens, arc indices, arc labels) triple per instance, in file order.
    expectations = [
        (
            ['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join',
             'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov.', '29', '.'],
            [(1, 0), (1, 5), (1, 8), (4, 3), (5, 4), (8, 11), (8, 16), (10, 8),
             (10, 9), (14, 11), (14, 12), (14, 13), (16, 15)],
            ['compound', 'ARG1', 'ARG1', 'ARG1', 'measure', 'ARG1', 'loc', 'ARG2',
             'BV', 'ARG2', 'BV', 'ARG1', 'of'],
        ),
        (
            ['Mr.', 'Vinken', 'is', 'chairman', 'of', 'Elsevier', 'N.V.', ',', 'the',
             'Dutch', 'publishing', 'group', '.'],
            [(1, 0), (1, 2), (3, 2), (3, 4), (5, 4), (5, 6), (5, 11), (11, 8),
             (11, 9), (11, 10)],
            ['compound', 'ARG1', 'ARG2', 'ARG1', 'ARG2', 'compound', 'appos', 'BV',
             'ARG1', 'compound'],
        ),
    ]
    for position, (expected_tokens, expected_indices, expected_labels) in enumerate(expectations):
        instance = instances[position]
        arcs = instance.fields["arc_tags"]
        tokens = [x.text for x in instance.fields["tokens"].tokens]
        assert tokens == expected_tokens
        assert arcs.indices == expected_indices
        assert arcs.labels == expected_labels
def test_read_from_file(self, lazy):
    """Read the MRPC dev fixture in ``merge`` mode and compare tokens and labels."""

    def merged(first, second, label):
        # In this mode the expected tokens are both sentences joined by "[SEP]".
        return {"tokens": first.split() + ["[SEP]"] + second.split(), "label": label}

    tokenizer = WordTokenizer(word_splitter=JustSpacesWordSplitter())
    indexers = {"bert": PretrainedBertIndexer(pretrained_model=self.BERT_VOCAB_PATH)}
    reader = MRPCReader(tokenizer=tokenizer,
                        token_indexers=indexers,
                        lazy=lazy,
                        skip_label_indexing=False,
                        mode='merge')
    instances = ensure_list(reader.read(str(self.FIXTURES_ROOT / 'mrpc_dev.tsv')))

    expected_instances = [
        merged(
            "He said the foodservice pie business doesn 't fit the company 's long-term growth strategy .",
            "\" The foodservice pie business does not fit our long-term growth strategy .",
            '1'),
        merged(
            "Magnarelli said Racicot hated the Iraqi regime and looked forward to using his long years of training in the war .",
            "His wife said he was \" 100 percent behind George Bush \" and looked forward to using his years of training in the war .",
            '0'),
        merged(
            "The dollar was at 116.92 yen against the yen , flat on the session , and at 1.2891 against the Swiss franc , also flat .",
            "The dollar was at 116.78 yen JPY = , virtually flat on the session , and at 1.2871 against the Swiss franc CHF = , down 0.1 percent .",
            '0'),
    ]

    for instance, expected_instance in zip(instances, expected_instances):
        fields = instance.fields
        assert [t.text for t in fields["tokens"].tokens] == expected_instance["tokens"]
        assert fields["label"].label == expected_instance["label"]
def test_read_from_file(self, lazy):
    """Read the SNLI fixture and compare premise, hypothesis and label per instance."""
    reader = SnliReader(lazy=lazy)
    instances = ensure_list(reader.read('tests/fixtures/data/snli.jsonl'))

    # All three fixture rows share the same premise.
    premise = ["A", "person", "on", "a", "horse", "jumps", "over", "a", "broken",
               "down", "airplane", "."]
    expected_rows = [
        (["A", "person", "is", "training", "his", "horse", "for", "a", "competition", "."],
         "neutral"),
        (["A", "person", "is", "at", "a", "diner", ",", "ordering", "an", "omelette", "."],
         "contradiction"),
        (["A", "person", "is", "outdoors", ",", "on", "a", "horse", "."],
         "entailment"),
    ]

    assert len(instances) == 3
    for instance, (hypothesis, label) in zip(instances, expected_rows):
        fields = instance.fields
        assert [t.text for t in fields["premise"].tokens] == premise
        assert [t.text for t in fields["hypothesis"].tokens] == hypothesis
        assert fields["label"].label == label
def test_srl_reader_can_filter_by_domain(self):
    """Check that ``domain_identifier`` restricts which instances are read."""
    filtered_reader = SrlReader(domain_identifier="subdomain2")
    filtered = ensure_list(filtered_reader.read('tests/fixtures/conll_2012/'))
    # If we'd included the folder, we'd have 9 instances.
    assert len(filtered) == 2
def test_read_samples():
    """Parse the samples file and check field names and first-instance tokens."""
    reader = DeftJsonlReader(subtasks=[1, 2, 3],
                             read_spacy_pos_tags=False,
                             read_spacy_dep_rels=False)
    # ``_read`` is called directly here rather than the public ``read``.
    instances = ensure_list(reader._read('tests/fixtures/jsonl_format_samples.jsonl'))
    assert len(instances) == 5

    expected_fields = [
        "metadata", "tokens", "sentence_labels", "tags", "relation_root_idxs", "relations",
    ]
    for instance in instances:
        # Field order matters: the comparison is against an ordered list.
        assert list(instance.fields.keys()) == expected_fields

    expected_tokens = [
        "3616", ".",
        "Some", "of", "these", "are", "binocular", "cues", ",", "which", "means",
        "that", "they", "rely", "on", "the", "use", "of", "both", "eyes", ".",
        "One", "example", "of", "a", "binocular", "depth", "cue", "is", "binocular",
        "disparity", ",", "the", "slightly", "different", "view", "of", "the",
        "world", "that", "each", "of", "our", "eyes", "receives", ".",
        "To", "experience", "this", "slightly", "different", "view", ",", "do",
        "this", "simple", "exercise", ":", "extend", "your", "arm", "fully", "and",
        "extend", "one", "of", "your", "fingers", "and", "focus", "on", "that",
        "finger", ".",
    ]
    first_fields = instances[0].fields
    assert first_fields.get('metadata')['words'] == expected_tokens
    assert [t.text for t in first_fields.get("tokens")] == expected_tokens
def test_ultra_fine_reader(self):
    """Check instance count, token/segment pairs and batched label ids."""
    reader = get_reader("entity")
    instances = ensure_list(
        reader.read('tests/fixtures/evaluation/ultra_fine/train.json'))

    # Check number of instances is correct
    self.assertEqual(len(instances), 2)

    # Check that first instance's tokens are correct
    first = instances[0]
    actual = list(zip([x.text for x in first['tokens']],
                      list(first['segment_ids'].array)))
    sentence_tokens = ['[CLS]', 'the', 'british', 'information', 'commissioner', "'s",
                       'office', 'invites', '[unused0]', 'to', 'locate', 'its', 'add',
                       '##ress', 'using', 'google', '[UNK]', '.', '[SEP]']
    mention_tokens = ['web', 'users', '[SEP]']
    # Sentence tokens are expected in segment 0, the trailing tokens in segment 1.
    expected = ([(token, 0) for token in sentence_tokens] +
                [(token, 1) for token in mention_tokens])
    self.assertListEqual(actual, expected)

    iterator = DataIterator.from_params(Params({"type": "basic"}))
    iterator.index_with(Vocabulary())
    # Only the first batch is needed; stop iterating as soon as we have it.
    for batch in iterator(instances, num_epochs=1, shuffle=False):
        break

    expected_labels = [[0, 0, 0, 0, 0, 0, 1, 0, 0],
                       [1, 0, 0, 0, 0, 0, 0, 0, 0]]
    self.assertEqual(batch['label_ids'].numpy().tolist(), expected_labels)
def _memory_sized_lists(self, instances: Iterable[Instance]) -> Iterable[List[Instance]]:
    """
    Yields one epoch of ``instances`` as a sequence of "memory-sized" lists.

    How the epoch is chunked depends on whether ``instances`` is lazy and on
    ``self._max_instances_in_memory`` / ``self._instances_per_epoch``:

    * ``max_instances_in_memory`` set (lazy or not): groups of that size.
    * lazy, no ``max_instances_in_memory``: groups of ``self._batch_size``.
    * not lazy, no ``instances_per_epoch``: the whole dataset as one list.
    * not lazy, ``instances_per_epoch`` set: one list holding the epoch's instances.
    """
    lazy = is_lazy(instances)

    # Get an iterator over the next epoch worth of instances.
    epoch_iterator = self._take_instances(instances, self._instances_per_epoch)

    memory_cap = self._max_instances_in_memory
    if memory_cap is not None:
        # An explicit cap wins regardless of laziness.
        yield from lazy_groups_of(epoch_iterator, memory_cap)
    elif lazy:
        # No guidance on memory: fall back to one batch at a time.
        yield from lazy_groups_of(epoch_iterator, self._batch_size)
    elif self._instances_per_epoch is None:
        # In-memory data and a full pass per epoch: hand back the dataset itself.
        yield ensure_list(instances)
    else:
        # In-memory data with a fixed number of instances per epoch and no
        # memory cap: materialise the epoch's iterator as a single list.
        yield list(epoch_iterator)
def test_read_from_file(self, lazy, coding_scheme):
    """Read the CoNLL-2003 fixture and check tokens and tags under each coding scheme."""
    conll_reader = Conll2003DatasetReader(lazy=lazy, coding_scheme=coding_scheme)
    fixture = str(AllenNlpTestCase.FIXTURES_ROOT / 'data' / 'conll2003.txt')
    instances = ensure_list(conll_reader.read(fixture))

    # Any scheme other than 'IOB1' is expected to yield the U- tags.
    expected_labels = (
        ['I-ORG', 'O', 'I-PER', 'O', 'O', 'I-LOC', 'O']
        if coding_scheme == 'IOB1'
        else ['U-ORG', 'O', 'U-PER', 'O', 'O', 'U-LOC', 'O']
    )

    # Both sentences share the same tag sequence.
    expected_sentences = [
        ['U.N.', 'official', 'Ekeus', 'heads', 'for', 'Baghdad', '.'],
        ['AI2', 'engineer', 'Joel', 'lives', 'in', 'Seattle', '.'],
    ]
    for position, expected_tokens in enumerate(expected_sentences):
        fields = instances[position].fields
        tokens = [t.text for t in fields['tokens'].tokens]
        assert tokens == expected_tokens
        assert fields["tags"].labels == expected_labels
def setUp(self):
    """Create the reader, instance list and vocabulary from the CopyNet experiment fixture."""
    super().setUp()
    fixtures = self.FIXTURES_ROOT
    config_path = fixtures / "encoder_decoder" / "copynet_seq2seq" / "experiment.json"
    data_path = fixtures / "data" / "copynet" / "copyover.tsv"

    params = Params.from_file(config_path)
    self.reader = DatasetReader.from_params(params["dataset_reader"])

    raw_instances = self.reader.read(data_path)
    self.instances = ensure_list(raw_instances)
    # The vocabulary is built from what ``read`` returned, not from the list copy.
    self.vocab = Vocabulary.from_params(params=params["vocabulary"],
                                        instances=raw_instances)
def test_length_limit_works(self):
    """Check windowing of long contexts in ``TransformerSquadReader``."""
    max_query_length = 10
    stride = 20

    reader = TransformerSquadReader(
        length_limit=100,
        max_query_length=max_query_length,
        stride=stride,
        skip_invalid_examples=False,
    )
    instances = ensure_list(reader.read(FIXTURES_ROOT / "rc" / "squad.json"))
    assert len(instances) == 12

    # The sequence is "<s> question </s> </s> context".
    tokenizer = reader._tokenizer
    expected_context_start = (len(tokenizer.sequence_pair_start_tokens)
                              + max_query_length
                              + len(tokenizer.sequence_pair_mid_tokens))
    assert instances[0].fields["context_span"].span_start == expected_context_start

    first_text, second_text = (
        [t.text for t in instance.fields["question_with_context"].tokens]
        for instance in instances[:2]
    )
    # The first two instances agree on the leading tokens and differ shortly after.
    assert first_text[:max_query_length + 2] == second_text[:max_query_length + 2]
    assert first_text[max_query_length + 3] != second_text[max_query_length + 3]

    assert first_text[-1] == "[SEP]"
    assert first_text[-2] == "##rot"
    # The same word piece shows up again ``stride`` tokens into the second context.
    second_context_start = instances[1].fields["context_span"].span_start
    assert second_text[second_context_start + stride - 1] == "##rot"
def test_read_from_file_ag_news_corpus(self, lazy):
    """Read the AG News fixture and compare tokens and labels for all three rows."""
    reader = TextClassificationJsonReader(lazy=lazy)
    ag_path = (AllenNlpTestCase.FIXTURES_ROOT / "data" / "text_classification_json"
               / "ag_news_corpus.jsonl")
    instances = ensure_list(reader.read(ag_path))

    # (tokens, label) per fixture row; labels are strings here.
    expected_rows = [
        (['Memphis', 'Rout', 'Still', 'Stings', 'for', 'No', '.', '14',
          'Louisville', ';', 'Coach', 'Petrino', 'Vows', 'to', 'Have', 'Team',
          'Better', 'Prepared', '.', 'NASHVILLE', ',', 'Tenn.', 'Nov', '3', ',',
          '2004', '-', 'Louisville', '#', '39;s', '30-point', 'loss', 'at', 'home',
          'to', 'Memphis', 'last', 'season', 'is', 'still', 'a', 'painful',
          'memory', 'for', 'the', 'Cardinals', '.'],
         "2"),
        (['AP', '-', 'Eli', 'Manning', 'has', 'replaced', 'Kurt', 'Warner', 'as',
          'the', 'New', 'York', 'Giants', "'", 'starting', 'quarterback', '.'],
         "2"),
        (['A', 'conference', 'dedicated', 'to', 'online', 'journalism',
          'explores', 'the', 'effect', 'blogs', 'have', 'on', 'news', 'reporting',
          '.', 'Some', 'say', 'they', 'draw', 'attention', 'to', 'under', '-',
          'reported', 'stories', '.', 'Others', 'struggle', 'to', 'establish',
          'the', 'credibility', 'enjoyed', 'by', 'professionals', '.'],
         "4"),
    ]

    assert len(instances) == 3
    for instance, (tokens, label) in zip(instances, expected_rows):
        fields = instance.fields
        assert [t.text for t in fields["tokens"].tokens] == tokens
        assert fields["label"].label == label
def test_read(self, lazy):
    """Read the TriviaQA sample tarball and spot-check each instance."""

    def texts(tokens):
        return [t.text for t in tokens]

    reader = TriviaQaReader.from_params(Params({
        'base_tarball_path': 'tests/fixtures/data/triviaqa-sample.tgz',
        'lazy': lazy,
    }))
    instances = ensure_list(reader.read('web-train.json'))
    assert len(instances) == 3

    url = "http://www.nobelprize.org/nobel_prizes/literature/laureates/1930/"
    # (question start, passage start, passage end, span_start, span_end)
    expectations = [
        (["Which", "American", "-"], ["The", "Nobel", "Prize"], ["<", url, ">"], 12, 13),
        (["Which", "American", "-"], ["Why", "Do", "n’t"], ["adults", ",", "and"], 38, 39),
        (["Where", "in", "England"], ["Judi", "Dench", "-"], [")", "(", "special"], 16, 16),
    ]
    for instance, (question, head, tail, start, end) in zip(instances, expectations):
        fields = instance.fields
        assert texts(fields["question"].tokens[:3]) == question
        assert texts(fields["passage"].tokens[:3]) == head
        assert texts(fields["passage"].tokens[-3:]) == tail
        assert fields["span_start"].sequence_index == start
        assert fields["span_end"].sequence_index == end
def test_read_from_file(self, lazy):
    """Read the Quora paraphrase fixture and compare every field of each row."""
    reader = QuoraParaphraseDatasetReader(lazy=lazy)
    fixture = AllenNlpTestCase.FIXTURES_ROOT / "data" / "quora_paraphrase.tsv"
    instances = ensure_list(reader.read(fixture))

    # (premise, hypothesis, label); the sentences are pre-tokenised on spaces.
    raw_rows = [
        ("What should I do to avoid sleeping in class ?",
         "How do I not sleep in a boring class ?",
         "1"),
        ("Do women support each other more than men do ?",
         "Do women need more compliments than men ?",
         "0"),
        ("How can one root android devices ?",
         "How do I root an Android device ?",
         "1"),
    ]

    assert len(instances) == 3
    for instance, (premise, hypothesis, label) in zip(instances, raw_rows):
        fields = instance.fields
        assert [t.text for t in fields["premise"].tokens] == premise.split()
        assert [t.text for t in fields["hypothesis"].tokens] == hypothesis.split()
        assert fields["label"].label == label
def test_srl_reader_can_filter_by_domain(self):
    """Check that ``domain_identifier`` restricts which instances are read."""
    conll_dir = AllenNlpTestCase.FIXTURES_ROOT / 'conll_2012'
    filtered_reader = SrlReader(domain_identifier="subdomain2")
    filtered = ensure_list(filtered_reader.read(conll_dir))
    # If we'd included the folder, we'd have 9 instances.
    assert len(filtered) == 2
def test_unpruned_adjacency_matrix(self):
    """With ``dep_pruning=-1`` the first instance must carry the expected edge set."""
    length_cap = 100
    reader = TacredDatasetReader(max_len=length_cap,
                                 masking_mode="NER",
                                 dep_pruning=-1)
    instances = ensure_list(reader.read("tests/fixtures/tacred.json"))

    # Sorted (row, column) pairs, one line per row index.
    expected_edges = [
        (0, 0), (0, 3),
        (1, 1), (1, 3),
        (2, 2), (2, 3),
        (3, 0), (3, 1), (3, 2), (3, 3), (3, 11),
        (4, 4), (4, 11),
        (5, 5), (5, 9),
        (6, 6), (6, 9),
        (7, 7), (7, 9),
        (8, 8), (8, 9),
        (9, 5), (9, 6), (9, 7), (9, 8), (9, 9), (9, 11),
        (10, 10), (10, 11),
        (11, 3), (11, 4), (11, 9), (11, 10), (11, 11), (11, 12), (11, 13), (11, 14), (11, 25),
        (12, 11), (12, 12),
        (13, 11), (13, 13),
        (14, 11), (14, 14), (14, 16),
        (15, 15), (15, 16),
        (16, 14), (16, 15), (16, 16), (16, 19),
        (17, 17), (17, 19),
        (18, 18), (18, 19),
        (19, 16), (19, 17), (19, 18), (19, 19), (19, 21),
        (20, 20), (20, 21),
        (21, 19), (21, 20), (21, 21), (21, 24),
        (22, 22), (22, 24),
        (23, 23), (23, 24),
        (24, 21), (24, 22), (24, 23), (24, 24),
        (25, 11), (25, 25),
    ]
    adjacency_field = instances[0].fields["adjacency"]
    # Sort before comparing so the check does not depend on the reader's edge order.
    assert sorted(adjacency_field.indices) == expected_edges
def test_boolq_dataset_reader_default_setting(self):
    """Read BoolQ with default settings and check the first two instances."""
    instances = ensure_list(BoolQDatasetReader().read(self.boolq_path))
    assert len(instances) == 5

    # (first five tokens, label) for the first two instances.
    expectations = [
        (["Persian", "language", "--", "Persian", "(/ˈpɜːrʒən,"], 1),
        (["Epsom", "railway", "station", "--", "Epsom"], 0),
    ]
    for position, (leading_tokens, label) in enumerate(expectations):
        fields = instances[position].fields
        assert [t.text for t in fields["tokens"].tokens][:5] == leading_tokens
        assert fields["label"].label == label
def _memory_sized_lists(
        self, instances: Iterable[Instance]) -> Iterable[List[Instance]]:
    """
    Splits one epoch of the dataset into "memory-sized" lists of instances and
    yields them one after another.

    For an in-memory dataset where an epoch is one full pass, this yields the
    dataset back once. With an explicit ``max_instances_in_memory`` it yields
    groups of that size; for lazy data without that setting it yields groups of
    ``batch_size`` instead.
    """
    lazy = is_lazy(instances)

    # Get an iterator over the next epoch worth of instances.
    iterator = self._take_instances(instances, self._instances_per_epoch)

    # Case 1: lazy instances and no guidance about how many to load into
    # memory -- load ``batch_size`` instances at a time.
    if lazy and self._max_instances_in_memory is None:
        yield from lazy_groups_of(iterator, self._batch_size)
        return

    # Case 2: max instances in memory was specified, lazy or not -- load
    # ``max_instances_in_memory`` instances at a time.
    if self._max_instances_in_memory is not None:
        yield from lazy_groups_of(iterator, self._max_instances_in_memory)
        return

    # Case 3: non-lazy instances and all of them wanted each epoch -- yield
    # back the list of instances.
    if self._instances_per_epoch is None:
        yield ensure_list(instances)
        return

    # Case 4: non-lazy instances, a specific number of instances per epoch,
    # and no limit on how many instances to load into memory -- convert the
    # whole iterator to a list.
    yield list(iterator)
def test_kg_probe_reader(self):
    """Check instance count, masked tokens and mask indicators for the probe fixture."""
    reader = get_reader()
    instances = ensure_list(reader.read('tests/fixtures/kg_probe/file1.txt'))

    # Check instances are correct length
    self.assertEqual(len(instances), 2)

    # Check masking is performed properly: the indicator is 1 exactly where
    # the expected token is '[MASK]'.
    expectations = [
        (['[CLS]', '[MASK]', '[MASK]', '[UNK]', 'quick', '##est', '.', '[SEP]'],
         [0, 1, 1, 0, 0, 0, 0, 0]),
        (['[CLS]', 'the', 'brown', 'fox', 'jumped', 'over', 'the', '[MASK]',
          '[MASK]', '[MASK]', '[MASK]', '.', '[SEP]'],
         [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0]),
    ]
    for position, (expected_tokens, indicator_bits) in enumerate(expectations):
        instance = instances[position]
        actual_tokens = [x.text for x in instance['tokens'].tokens]
        self.assertListEqual(expected_tokens, actual_tokens)
        expected_indicator = np.array(indicator_bits, dtype=np.uint8)
        actual_indicator = instance['mask_indicator'].array
        assert np.allclose(expected_indicator, actual_indicator)
def test_read_from_file(self):
    """Read the CCGbank fixture and check tokens plus every label sequence of the first instance."""
    reader = CcgBankDatasetReader(feature_labels=['modified_pos', 'original_pos', 'predicate_arg'])
    instances = ensure_list(reader.read(self.FIXTURES_ROOT / 'data' / 'ccgbank.txt'))
    assert len(instances) == 2

    fields = instances[0].fields

    words = [token.text for token in fields['tokens'].tokens]
    assert words == [
        'Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join',
        'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov.', '29', '.',
    ]

    # The original and modified POS tags are expected to be the same sequence.
    pos_tags = [
        'NNP', 'NNP', ',', 'CD', 'NNS', 'JJ', ',', 'MD', 'VB',
        'DT', 'NN', 'IN', 'DT', 'JJ', 'NN', 'NNP', 'CD', '.',
    ]
    expected_labels = [
        ('tags', [
            'N/N', 'N', ',', 'N/N', 'N', '(S[adj]\\NP)\\NP', ',',
            '(S[dcl]\\NP)/(S[b]\\NP)', '(S[b]\\NP)/NP', 'NP[nb]/N', 'N',
            '((S\\NP)\\(S\\NP))/NP', 'NP[nb]/N', 'N/N', 'N',
            '((S\\NP)\\(S\\NP))/N[num]', 'N[num]', '.',
        ]),
        ('original_pos_tags', pos_tags),
        ('modified_pos_tags', pos_tags),
        ('predicate_arg_tags', [
            'N_73/N_73', 'N', ',', 'N_93/N_93', 'N', '(S[adj]\\NP_83)\\NP_84', ',',
            '(S[dcl]\\NP_10)/(S[b]_11\\NP_10:B)_11', '(S[b]\\NP)/NP',
            'NP[nb]_29/N_29', 'N', '((S_1\\NP_2)_1\\(S_1\\NP_2)_1)/NP',
            'NP[nb]_48/N_48', 'N_43/N_43', 'N',
            '((S_61\\NP_56)_61\\(S_61\\NP_56)_61)/N[num]_62', 'N[num]', '.',
        ]),
    ]
    for field_name, wanted in expected_labels:
        assert fields[field_name].labels == wanted
def test_read_from_file(self, lazy):
    """Read the SNLI fixture and check premise, hypothesis and label of all three instances."""
    reader = SnliReader(lazy=lazy)
    instances = ensure_list(reader.read(AllenNlpTestCase.FIXTURES_ROOT / u'data' / u'snli.jsonl'))

    # All three expected instances share the same premise.
    premise = [u"A", u"person", u"on", u"a", u"horse", u"jumps", u"over",
               u"a", u"broken", u"down", u"airplane", u"."]
    expected = [
        {u"premise": premise,
         u"hypothesis": [u"A", u"person", u"is", u"training", u"his", u"horse",
                         u"for", u"a", u"competition", u"."],
         u"label": u"neutral"},
        {u"premise": premise,
         u"hypothesis": [u"A", u"person", u"is", u"at", u"a", u"diner", u",",
                         u"ordering", u"an", u"omelette", u"."],
         u"label": u"contradiction"},
        {u"premise": premise,
         u"hypothesis": [u"A", u"person", u"is", u"outdoors", u",", u"on",
                         u"a", u"horse", u"."],
         u"label": u"entailment"},
    ]

    assert len(instances) == 3
    for instance, wanted in zip(instances, expected):
        fields = instance.fields
        assert [t.text for t in fields[u"premise"].tokens] == wanted[u"premise"]
        assert [t.text for t in fields[u"hypothesis"].tokens] == wanted[u"hypothesis"]
        assert fields[u"label"].label == wanted[u"label"]
def test_read_from_file_ag_news_corpus_and_truncates_properly(self, lazy):
    """Read the AG news fixture with ``max_sequence_length=5`` and check the five-token instances."""
    reader = TextClassificationJsonReader(lazy=lazy, max_sequence_length=5)
    corpus_file = (
        AllenNlpTestCase.FIXTURES_ROOT
        / "data"
        / "text_classification_json"
        / "ag_news_corpus.jsonl"
    )
    instances = ensure_list(reader.read(corpus_file))

    # (expected tokens, expected label) for each instance, in file order.
    expected = [
        (["Memphis", "Rout", "Still", "Stings", "for"], "2"),
        (["AP", "-", "Eli", "Manning", "has"], "2"),
        (["A", "conference", "dedicated", "to", "online"], "4"),
    ]

    assert len(instances) == 3
    for instance, (wanted_tokens, wanted_label) in zip(instances, expected):
        fields = instance.fields
        assert [t.text for t in fields["tokens"].tokens] == wanted_tokens
        assert fields["label"].label == wanted_label
def test_read_from_file(self, lazy):
    """Read the language modeling fixture and check the input/output tokens of each instance."""
    reader = LanguageModelingReader(tokens_per_instance=3, lazy=lazy)
    instances = ensure_list(reader.read('tests/fixtures/data/language_modeling.txt'))

    # (input tokens, output tokens) per instance.
    expected = [
        (["This", "is", "a"], ["is", "a", "sentence"]),
        (["sentence", "for", "language"], ["for", "language", "modelling"]),
        (["modelling", ".", "Here"], [".", "Here", "'s"]),
        (["'s", "another", "one"], ["another", "one", "for"]),
        (["for", "extra", "language"], ["extra", "language", "modelling"]),
    ]

    # The last potential instance is left out, which is ok, because we don't
    # have an end token in here, anyway.
    assert len(instances) == 5

    for instance, (wanted_input, wanted_output) in zip(instances, expected):
        assert [t.text for t in instance.fields["input_tokens"].tokens] == wanted_input
        assert [t.text for t in instance.fields["output_tokens"].tokens] == wanted_output
def test_ner_reader_can_filter_by_domain(self):
    """With ``domain_identifier="subdomain2"`` the reader yields exactly one instance."""
    reader = OntonotesNamedEntityRecognition(domain_identifier="subdomain2")
    fixture_dir = FIXTURES_ROOT / "structured_prediction" / "srl" / "conll_2012"
    filtered = ensure_list(reader.read(fixture_dir))
    assert len(filtered) == 1
def test_read_from_file(self):
    """Read the segmented fixture and check every field of the first instance."""
    # Read in the data
    reader = SegmentedMHDDatasetReader()
    instances = ensure_list(reader.read('tests/fixtures/test_segments.jsonl'))

    # Define our expectations
    instance0 = {
        'session_id': '1337',
        'utterances': [['First', 'utterance'],
                       ['Second', 'utterance'],
                       ['Third', 'utterance', 'with', 'a', 'different', 'label']],
        'speakers': ['speaker0', 'speaker1', 'speaker0'],
        'labels': ['label0', 'label1'],
        'durations': [2, 1]
    }

    # Ensure data has correct number of elements
    assert len(instances) == 2

    # Check first instance matches
    fields = instances[0].fields

    utterances = [[x.text for x in utterance.tokens]
                  for utterance in fields['utterances'].field_list]
    assert utterances == instance0['utterances']

    speakers = [x.text for x in fields['speakers'].tokens]
    assert speakers == instance0['speakers']

    assert fields['labels'].labels == instance0['labels']
    # The stray debugging ``print`` of the durations field was dropped; the
    # assertion below already reports the value when it fails.
    assert fields['durations'].int_list == instance0['durations']
def test_read_from_file(self, lazy):
    """Read the language modeling fixture and check the input/output tokens of each instance."""
    reader = LanguageModelingReader(tokens_per_instance=3, lazy=lazy)
    fixture = AllenNlpTestCase.FIXTURES_ROOT / u'data' / u'language_modeling.txt'
    instances = ensure_list(reader.read(fixture))

    # (input tokens, output tokens) per instance.
    expected = [
        ([u"This", u"is", u"a"], [u"is", u"a", u"sentence"]),
        ([u"sentence", u"for", u"language"], [u"for", u"language", u"modelling"]),
        ([u"modelling", u".", u"Here"], [u".", u"Here", u"'s"]),
        ([u"'s", u"another", u"one"], [u"another", u"one", u"for"]),
        ([u"for", u"extra", u"language"], [u"extra", u"language", u"modelling"]),
    ]

    # The last potential instance is left out, which is ok, because we don't
    # have an end token in here, anyway.
    assert len(instances) == 5

    for instance, (wanted_input, wanted_output) in zip(instances, expected):
        assert [t.text for t in instance.fields[u"input_tokens"].tokens] == wanted_input
        assert [t.text for t in instance.fields[u"output_tokens"].tokens] == wanted_output
def test_default_format(self, lazy):
    """Read the seq2seq copy fixture; source and target of each instance must be the same wrapped sentence."""
    reader = Seq2SeqDatasetReader(lazy=lazy)
    instances = ensure_list(reader.read('tests/fixtures/data/seq2seq_copy.tsv'))
    assert len(instances) == 3

    # The fixture copies the source to the target, so one expectation per
    # instance covers both fields.
    expected_sequences = [
        ["@@START@@", "this", "is", "a", "sentence", "@@END@@"],
        ["@@START@@", "this", "is", "another", "@@END@@"],
        ["@@START@@", "all", "these", "sentences", "should", "get", "copied", "@@END@@"],
    ]
    for instance, wanted in zip(instances, expected_sequences):
        fields = instance.fields
        assert [t.text for t in fields["source_tokens"].tokens] == wanted
        assert [t.text for t in fields["target_tokens"].tokens] == wanted
def test_read_from_file(self, lazy, coding_scheme):
    """Read the CoNLL-2003 fixture and check tokens and tags under the given coding scheme."""
    conll_reader = Conll2003DatasetReader(lazy=lazy, coding_scheme=coding_scheme)
    fixture = str(AllenNlpTestCase.FIXTURES_ROOT / "data" / "conll2003.txt")
    instances = ensure_list(conll_reader.read(fixture))

    # Both sentences are expected to carry the same tag sequence.
    expected_labels = (
        ["I-ORG", "O", "I-PER", "O", "O", "I-LOC", "O"]
        if coding_scheme == "IOB1"
        else ["U-ORG", "O", "U-PER", "O", "O", "U-LOC", "O"]
    )
    expected_tokens = [
        ["U.N.", "official", "Ekeus", "heads", "for", "Baghdad", "."],
        ["AI2", "engineer", "Joel", "lives", "in", "Seattle", "."],
    ]

    # Index explicitly so that a missing instance raises instead of being skipped.
    for position, wanted_tokens in enumerate(expected_tokens):
        fields = instances[position].fields
        tokens = [t.text for t in fields["tokens"].tokens]
        assert tokens == wanted_tokens
        assert fields["tags"].labels == expected_labels
def test_read_from_file(self, lazy):
    """Read the coref fixture and check the text and gold mention clusters of both documents."""
    conll_reader = ConllCorefReader(max_span_width=self.span_width, lazy=lazy)
    fixture = str(AllenNlpTestCase.FIXTURES_ROOT / 'coref' / 'coref.gold_conll')
    instances = ensure_list(conll_reader.read(fixture))
    assert len(instances) == 2

    def labelled_mentions(fields, text):
        """Pair every candidate span that has a label other than -1 with that label."""
        boundaries = [(field.span_start, field.span_end) for field in fields["spans"].field_list]
        span_starts, span_ends = zip(*boundaries)
        candidate_mentions = self.check_candidate_mentions_are_well_defined(span_starts, span_ends, text)
        span_labels = fields["span_labels"]
        mentions: List[Tuple[List[str], int]] = [
            (candidate_mentions[index], cluster_id)
            for index, cluster_id in enumerate(span_labels.labels)
            if cluster_id != -1
        ]
        return mentions

    # First document.
    fields = instances[0].fields
    text = [x.text for x in fields["text"].tokens]
    assert text == [
        'In', 'the', 'summer', 'of', '2005', ',', 'a', 'picture', 'that', 'people',
        'have', 'long', 'been', 'looking', 'forward', 'to', 'started', 'emerging',
        'with', 'frequency', 'in', 'various', 'major', 'Hong', 'Kong', 'media', '.',
        'With', 'their', 'unique', 'charm', ',', 'these', 'well', '-', 'known',
        'cartoon', 'images', 'once', 'again', 'caused', 'Hong', 'Kong', 'to', 'be',
        'a', 'focus', 'of', 'worldwide', 'attention', '.', 'The', 'world', "'s",
        'fifth', 'Disney', 'park', 'will', 'soon', 'open', 'to', 'the', 'public',
        'here', '.',
    ]
    gold_mentions_with_ids = labelled_mentions(fields, text)

    # Removing one occurrence and asserting membership again checks that the
    # mention appears at least twice.
    hong_kong = (["Hong", "Kong"], 0)
    assert hong_kong in gold_mentions_with_ids
    gold_mentions_with_ids.remove(hong_kong)
    assert hong_kong in gold_mentions_with_ids
    assert (["their"], 1) in gold_mentions_with_ids
    # This is a span which exceeds our max_span_width, so it should not be considered.
    assert (["these", "well", "known", "cartoon", "images"], 1) not in gold_mentions_with_ids

    # Second document.
    fields = instances[1].fields
    text = [x.text for x in fields["text"].tokens]
    assert text == [
        'The', 'area', 'of', 'Hong', 'Kong', 'is', 'only', 'one', 'thousand', '-',
        'plus', 'square', 'kilometers', '.', 'The', 'population', 'is', 'dense', '.',
        'Natural', 'resources', 'are', 'relatively', 'scarce', '.', 'However', ',',
        'the', 'clever', 'Hong', 'Kong', 'people', 'will', 'utilize', 'all',
        'resources', 'they', 'have', 'created', 'for', 'developing', 'the', 'Hong',
        'Kong', 'tourism', 'industry', '.',
    ]
    gold_mentions_with_ids = labelled_mentions(fields, text)

    assert hong_kong in gold_mentions_with_ids
    gold_mentions_with_ids.remove(hong_kong)
    assert hong_kong in gold_mentions_with_ids
    assert (["they"], 1) in gold_mentions_with_ids
    assert (['the', 'clever', 'Hong', 'Kong', 'people'], 1) in gold_mentions_with_ids
def __init__(self, instances: Iterable[Instance]) -> None:
    """
    Store the given instances as a list and run the type check on them.

    The iterable is converted with ``ensure_list`` and kept on
    ``self.instances``; ``self._check_types()`` is then called.
    """
    super().__init__()

    materialized = ensure_list(instances)
    self.instances: List[Instance] = materialized

    self._check_types()
def get_num_batches(self, instances: Iterable[Instance]) -> int:
    """
    Return how many batches ``instances`` will be split into.

    Uses ``self._instances_per_epoch`` when it is set; otherwise the length
    of the dataset when it is not lazy. A lazy dataset with no
    ``instances_per_epoch`` cannot be counted, so 1 is returned.
    """
    lazy = is_lazy(instances)
    per_epoch = self._instances_per_epoch

    if per_epoch is not None:
        return math.ceil(per_epoch / self._batch_size)
    if lazy:
        # Unable to compute num batches, so just return 1.
        return 1
    # Not lazy, so can compute the list length.
    total = len(ensure_list(instances))
    return math.ceil(total / self._batch_size)
def test_source_add_start_token(self):
    """With ``source_add_start_token=False`` only the target sequence carries the start token."""
    reader = Seq2SeqDatasetReader(source_add_start_token=False)
    fixture = str(AllenNlpTestCase.FIXTURES_ROOT / 'data' / 'seq2seq_copy.tsv')
    instances = ensure_list(reader.read(fixture))
    assert len(instances) == 3

    first = instances[0].fields
    wanted_source = ["this", "is", "a", "sentence", "@end@"]
    wanted_target = ["@start@"] + wanted_source
    assert [t.text for t in first["source_tokens"].tokens] == wanted_source
    assert [t.text for t in first["target_tokens"].tokens] == wanted_target
def test_source_add_start_token(self):
    """With ``source_add_start_token=False`` only the target sequence carries the start token."""
    reader = Seq2SeqDatasetReader(source_add_start_token=False)
    instances = ensure_list(reader.read('tests/fixtures/data/seq2seq_copy.tsv'))
    assert len(instances) == 3

    first = instances[0].fields
    wanted_source = ["this", "is", "a", "sentence", "@@END@@"]
    wanted_target = ["@@START@@"] + wanted_source
    assert [t.text for t in first["source_tokens"].tokens] == wanted_source
    assert [t.text for t in first["target_tokens"].tokens] == wanted_target
def test_vocab_from_instances_namespaces(self):
    """Build a vocabulary from the CCGbank instances and check the set of namespaces it contains."""
    reader = CcgBankDatasetReader(feature_labels=['modified_pos', 'original_pos', 'predicate_arg'])
    fixture = self.FIXTURES_ROOT / 'data' / 'ccgbank.txt'
    instances = ensure_list(reader.read(fixture))

    # check that we didn't clobber the labels namespace
    vocab = Vocabulary.from_instances(instances)
    namespaces = set(vocab._token_to_index.keys())  # pylint: disable=protected-access
    self.assertSetEqual(
        namespaces,
        {'tokens', 'labels', 'modified_pos_tags', 'original_pos_tags', 'predicate_arg_tags'},
    )
def test_read_from_file(self, lazy):
    """Read the Qangaroo fixture and spot-check the fields of the first instance."""
    reader = QangarooReader(lazy=lazy)
    fixture = AllenNlpTestCase.FIXTURES_ROOT / 'data' / 'qangaroo.json'
    instances = ensure_list(reader.read(fixture))
    assert len(instances) == 2

    def texts(tokens):
        """Collect the ``text`` attribute of every item in ``tokens``."""
        return [t.text for t in tokens]

    assert texts(instances[0].fields['candidates'][3]) == ['german', 'confederation']
    assert texts(instances[0].fields['query']) == ['country', 'sms', 'braunschweig']
    assert texts(instances[0].fields['supports'][0][:3]) == ['The', 'North', 'German']
    assert texts(instances[0].fields['answer']) == ['german', 'empire']
    assert instances[0].fields['answer_index'].sequence_index == 4
def test_non_lazy(self):
    """A non-lazy reader reads once; iterating the result repeatedly does not trigger more reads."""
    reader = LazyDatasetReader(self.instances, lazy=False)
    assert reader.num_reads == 0

    instances = reader.read('path/to/file')

    passes = 10
    for _ in range(passes):
        # Wrap the result in a fresh generator on every pass.
        one_pass = (instance for instance in instances)
        assert ensure_list(one_pass) == self.instances
        assert reader.num_reads == 1
def get_num_batches(self, instances: Iterable[Instance]) -> int:
    """
    Report how many batches ``instances`` will be divided into, which is
    handy for tracking progress through the generator produced by
    ``__call__``.

    When ``instances`` is lazy and no ``instances_per_epoch`` was given,
    the count cannot be computed and 1 is returned.
    """
    lazy = is_lazy(instances)
    epoch_size = self._instances_per_epoch

    if epoch_size is None and lazy:
        # Unable to compute num batches, so just return 1.
        return 1

    if epoch_size is None:
        # Not lazy, so can compute the list length.
        epoch_size = len(ensure_list(instances))

    return math.ceil(epoch_size / self._batch_size)
def test_2_class(self):
    """With ``granularity="2-class"`` the reader yields two instances with labels "1" and "0"."""
    reader = StanfordSentimentTreeBankDatasetReader(granularity="2-class")
    instances = ensure_list(reader.read(self.sst_path))

    # (expected tokens, expected label) per instance.
    expected = [
        (["The", "actors", "are", "fantastic", "."], "1"),
        (["It", "was", "terrible", "."], "0"),
    ]

    assert len(instances) == 2
    for instance, (wanted_tokens, wanted_label) in zip(instances, expected):
        fields = instance.fields
        assert [t.text for t in fields["tokens"].tokens] == wanted_tokens
        assert fields["label"].label == wanted_label
def test_brown_corpus_format(self):
    """Read the brown corpus fixture using ``/`` as the word/tag delimiter and check all four sentences."""
    reader = SequenceTaggingDatasetReader(word_tag_delimiter='/')
    instances = ensure_list(reader.read('tests/fixtures/data/brown_corpus.txt'))
    assert len(instances) == 4

    # The sentences differ only in their first word and share one tag sequence.
    wanted_tags = ["N", "V", "N", "N"]
    subjects = ["cats", "dogs", "snakes", "birds"]
    for instance, subject in zip(instances, subjects):
        fields = instance.fields
        assert [t.text for t in fields["tokens"].tokens] == [subject, "are", "animals", "."]
        assert fields["tags"].labels == wanted_tags
def test_default_format(self, lazy):
    """Read the sequence tagging fixture with default settings and check all four sentences."""
    reader = SequenceTaggingDatasetReader(lazy=lazy)
    instances = ensure_list(reader.read('tests/fixtures/data/sequence_tagging.tsv'))
    assert len(instances) == 4

    # The sentences differ only in their first word and share one tag sequence.
    wanted_tags = ["N", "V", "N", "N"]
    subjects = ["cats", "dogs", "snakes", "birds"]
    for instance, subject in zip(instances, subjects):
        fields = instance.fields
        assert [t.text for t in fields["tokens"].tokens] == [subject, "are", "animals", "."]
        assert fields["tags"].labels == wanted_tags
def test_read_from_file(self, lazy, coding_scheme):
    """Read the CoNLL-2003 fixture and check tokens and tags under the given coding scheme."""
    conll_reader = Conll2003DatasetReader(lazy=lazy, coding_scheme=coding_scheme)
    fixture = str(AllenNlpTestCase.FIXTURES_ROOT / 'data' / 'conll2003.txt')
    instances = ensure_list(conll_reader.read(fixture))

    # Both sentences are expected to carry the same tag sequence.
    labels_by_scheme = {
        True: ['I-ORG', 'O', 'I-PER', 'O', 'O', 'I-LOC', 'O'],
        False: ['U-ORG', 'O', 'U-PER', 'O', 'O', 'U-LOC', 'O'],
    }
    expected_labels = labels_by_scheme[coding_scheme == 'IOB1']

    expected_tokens = [
        ['U.N.', 'official', 'Ekeus', 'heads', 'for', 'Baghdad', '.'],
        ['AI2', 'engineer', 'Joel', 'lives', 'in', 'Seattle', '.'],
    ]

    # Index explicitly so that a missing instance raises instead of being skipped.
    for position, wanted_tokens in enumerate(expected_tokens):
        fields = instances[position].fields
        tokens = [t.text for t in fields['tokens'].tokens]
        assert tokens == wanted_tokens
        assert fields["tags"].labels == expected_labels
def test_read_from_file(self, lazy):
    """Read the SRL fixture and check tokens, verb indicator and tags of the first five instances."""
    conll_reader = SrlReader(lazy=lazy)
    fixture_dir = AllenNlpTestCase.FIXTURES_ROOT / 'conll_2012' / 'subdomain'
    instances = ensure_list(conll_reader.read(fixture_dir))

    mali_sentence = ["Mali", "government", "officials", "say", "the", "woman", "'s",
                     "confession", "was", "forced", "."]
    prosecution_sentence = ['The', 'prosecution', 'rested', 'its', 'case', 'last', 'month',
                            'after', 'four', 'months', 'of', 'hearings', '.']

    # (expected tokens, index flagged in verb_indicator, expected tags); each
    # sentence appears twice, flagged at a different index.
    expected = [
        (mali_sentence, 3,
         ['B-ARG0', 'I-ARG0', 'I-ARG0', 'B-V', 'B-ARG1', 'I-ARG1',
          'I-ARG1', 'I-ARG1', 'I-ARG1', 'I-ARG1', 'O']),
        (mali_sentence, 8,
         ['O', 'O', 'O', 'O', 'B-ARG1', 'I-ARG1', 'I-ARG1', 'I-ARG1',
          'B-V', 'B-ARG2', 'O']),
        (prosecution_sentence, 2,
         ['B-ARG0', 'I-ARG0', 'B-V', 'B-ARG1', 'I-ARG1', 'B-ARGM-TMP',
          'I-ARGM-TMP', 'B-ARGM-TMP', 'I-ARGM-TMP', 'I-ARGM-TMP',
          'I-ARGM-TMP', 'I-ARGM-TMP', 'O']),
        (prosecution_sentence, 11,
         ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-V', 'O']),
    ]

    # Index explicitly so that a missing instance raises instead of being skipped.
    for position, (wanted_tokens, verb_index, wanted_tags) in enumerate(expected):
        fields = instances[position].fields
        tokens = [t.text for t in fields['tokens'].tokens]
        assert tokens == wanted_tokens
        assert fields["verb_indicator"].labels[verb_index] == 1
        assert fields["tags"].labels == wanted_tags

    # Tests a sentence with no verbal predicates.
    fields = instances[4].fields
    tokens = [t.text for t in fields['tokens'].tokens]
    assert tokens == ["Denise", "Dillon", "Headline", "News", "."]
    assert fields["verb_indicator"].labels == [0, 0, 0, 0, 0]
    assert fields["tags"].labels == ['O', 'O', 'O', 'O', 'O']
def test_default_format(self, lazy):
    """Read the seq2seq copy fixture; source and target of each instance must be the same wrapped sentence."""
    reader = Seq2SeqDatasetReader(lazy=lazy)
    instances = ensure_list(reader.read('tests/fixtures/data/seq2seq_copy.tsv'))
    assert len(instances) == 3

    sentences = [
        ["this", "is", "a", "sentence"],
        ["this", "is", "another"],
        ["all", "these", "sentences", "should", "get", "copied"],
    ]
    for instance, words in zip(instances, sentences):
        # Source and target are both expected wrapped in the start/end markers.
        wanted = ["@@START@@"] + words + ["@@END@@"]
        fields = instance.fields
        assert [t.text for t in fields["source_tokens"].tokens] == wanted
        assert [t.text for t in fields["target_tokens"].tokens] == wanted
def test_default_format(self, lazy):
    """Read the seq2seq copy fixture; source and target of each instance must be the same wrapped sentence."""
    reader = Seq2SeqDatasetReader(lazy=lazy)
    fixture = str(AllenNlpTestCase.FIXTURES_ROOT / 'data' / 'seq2seq_copy.tsv')
    instances = ensure_list(reader.read(fixture))
    assert len(instances) == 3

    sentences = [
        ["this", "is", "a", "sentence"],
        ["this", "is", "another"],
        ["all", "these", "sentences", "should", "get", "copied"],
    ]
    for instance, words in zip(instances, sentences):
        # Source and target are both expected wrapped in the start/end markers.
        wanted = ["@start@"] + words + ["@end@"]
        fields = instance.fields
        assert [t.text for t in fields["source_tokens"].tokens] == wanted
        assert [t.text for t in fields["target_tokens"].tokens] == wanted
def test_read_from_file(self, lazy):
    """Read the SST fixture with default settings and check tokens and label of all three instances."""
    reader = StanfordSentimentTreeBankDatasetReader(lazy=lazy)
    instances = ensure_list(reader.read(self.sst_path))

    # (expected tokens, expected label) per instance.
    expected = [
        (["The", "actors", "are", "fantastic", "."], "4"),
        (["It", "was", "terrible", "."], "0"),
        (["Chomp", "chomp", "!"], "2"),
    ]

    assert len(instances) == 3
    for instance, (wanted_tokens, wanted_label) in zip(instances, expected):
        fields = instance.fields
        assert [t.text for t in fields["tokens"].tokens] == wanted_tokens
        assert fields["label"].label == wanted_label
def test_use_subtrees(self):
    """With ``use_subtrees=True`` the reader yields 21 instances; check the first three."""
    reader = StanfordSentimentTreeBankDatasetReader(use_subtrees=True)
    instances = ensure_list(reader.read(self.sst_path))

    # (expected tokens, expected label) for the first three instances only.
    leading_expected = [
        (["The", "actors", "are", "fantastic", "."], "4"),
        (["The", "actors"], "2"),
        (["The"], "2"),
    ]

    assert len(instances) == 21
    for instance, (wanted_tokens, wanted_label) in zip(instances, leading_expected):
        fields = instance.fields
        assert [t.text for t in fields["tokens"].tokens] == wanted_tokens
        assert fields["label"].label == wanted_label
def test_read_from_file(self, lazy):
    """Read the Ontonotes fixture and check tokens and NER tags of the first three instances."""
    conll_reader = OntonotesNamedEntityRecognition(lazy=lazy)
    fixture_dir = AllenNlpTestCase.FIXTURES_ROOT / 'conll_2012' / 'subdomain'
    instances = ensure_list(conll_reader.read(fixture_dir))

    # (expected tokens, expected tags) per instance.
    expected = [
        (["Mali", "government", "officials", "say", "the", "woman", "'s",
          "confession", "was", "forced", "."],
         ['B-GPE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']),
        (['The', 'prosecution', 'rested', 'its', 'case', 'last', 'month',
          'after', 'four', 'months', 'of', 'hearings', '.'],
         ['O', 'O', 'O', 'O', 'O', 'B-DATE', 'I-DATE', 'O', 'B-DATE',
          'I-DATE', 'O', 'O', 'O']),
        (["Denise", "Dillon", "Headline", "News", "."],
         ['B-PERSON', 'I-PERSON', 'B-WORK_OF_ART', 'I-WORK_OF_ART', 'O']),
    ]

    # Index explicitly so that a missing instance raises instead of being skipped.
    for position, (wanted_tokens, wanted_tags) in enumerate(expected):
        fields = instances[position].fields
        tokens = [t.text for t in fields['tokens'].tokens]
        assert tokens == wanted_tokens
        assert fields["tags"].labels == wanted_tags
def test_read_from_file(self, lazy):
    """Read the Winobias sample and check the text and gold mention clusters of both sentences."""
    conll_reader = WinobiasReader(max_span_width=self.span_width, lazy=lazy)
    fixture = str(AllenNlpTestCase.FIXTURES_ROOT / 'coref' / 'winobias.sample')
    instances = ensure_list(conll_reader.read(fixture))
    assert len(instances) == 2

    # (expected text, expected gold mentions with cluster ids) per instance.
    expected = [
        (['The', 'designer', 'argued', 'with', 'the', 'developer', 'and',
          'slapped', 'her', 'in', 'the', 'face', '.'],
         [(['the', 'developer'], 0), (['her'], 0)]),
        (['The', 'salesperson', 'sold', 'some', 'books', 'to', 'the', 'librarian',
          'because', 'she', 'was', 'trying', 'to', 'sell', 'them', '.'],
         [(['The', 'salesperson'], 0), (['some', 'books'], 1), (['she'], 0), (['them'], 1)]),
    ]

    for instance, (wanted_text, wanted_mentions) in zip(instances, expected):
        fields = instance.fields
        text = [x.text for x in fields["text"].tokens]
        assert text == wanted_text

        boundaries = [(field.span_start, field.span_end) for field in fields["spans"].field_list]
        span_starts, span_ends = zip(*boundaries)
        candidate_mentions = self.check_candidate_mentions_are_well_defined(span_starts, span_ends, text)

        # Keep only the candidate spans whose label is not -1.
        gold_span_labels = fields["span_labels"]
        gold_mentions_with_ids: List[Tuple[List[str], int]] = [
            (candidate_mentions[index], cluster_id)
            for index, cluster_id in enumerate(gold_span_labels.labels)
            if cluster_id != -1
        ]
        assert gold_mentions_with_ids == wanted_mentions