def setUp(self):
    """Create the vocabulary, token indexers and fields shared by the tests."""
    self.vocab = Vocabulary()
    # Tokens are registered namespace by namespace, in a fixed order, because
    # the expected index arrays in the tests depend on insertion order.
    tokens_by_namespace = (
        ("words", ["this", "is", "a", "sentence"]),
        ("characters", ["s", "e", "n", "t", "c"]),
        ("labels", ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k']),
    )
    for namespace, tokens in tokens_by_namespace:
        for token in tokens:
            self.vocab.add_token_to_namespace(token, namespace)

    self.word_indexer = {"words": SingleIdTokenIndexer("words")}
    self.words_and_characters_indexers = {
        "words": SingleIdTokenIndexer("words"),
        "characters": TokenCharactersIndexer("characters"),
    }

    def make_text_field(words):
        # Every text field in this fixture starts out with the word-only indexer.
        return TextField([Token(word) for word in words], self.word_indexer)

    self.field1 = make_text_field(["this", "is", "a", "sentence"])
    self.field2 = make_text_field(["this", "is", "a", "different", "sentence"])
    self.field3 = make_text_field(["this", "is", "another", "sentence"])
    self.empty_text_field = self.field1.empty_field()

    self.index_field = IndexField(1, self.field1)
    self.empty_index_field = self.index_field.empty_field()

    self.sequence_label_field = SequenceLabelField([1, 1, 0, 1], self.field1)
    self.empty_sequence_label_field = self.sequence_label_field.empty_field()

    super(TestListField, self).setUp()
class TestListField(AllenNlpTestCase):
    """Tests for padding, tensor conversion and sequence behaviour of ``ListField``."""

    def setUp(self):
        """Create the vocabulary, token indexers and fields shared by the tests."""
        self.vocab = Vocabulary()
        # Tokens are registered namespace by namespace, in a fixed order,
        # because the expected index arrays below depend on insertion order.
        tokens_by_namespace = (
            ("words", ["this", "is", "a", "sentence"]),
            ("characters", ["s", "e", "n", "t", "c"]),
            ("labels", ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k']),
        )
        for namespace, tokens in tokens_by_namespace:
            for token in tokens:
                self.vocab.add_token_to_namespace(token, namespace)

        self.word_indexer = {"words": SingleIdTokenIndexer("words")}
        self.words_and_characters_indexers = {
            "words": SingleIdTokenIndexer("words"),
            "characters": TokenCharactersIndexer("characters"),
        }

        def make_text_field(words):
            # Every text field starts out with the word-only indexer.
            return TextField([Token(word) for word in words], self.word_indexer)

        self.field1 = make_text_field(["this", "is", "a", "sentence"])
        self.field2 = make_text_field(["this", "is", "a", "different", "sentence"])
        self.field3 = make_text_field(["this", "is", "another", "sentence"])
        self.empty_text_field = self.field1.empty_field()

        self.index_field = IndexField(1, self.field1)
        self.empty_index_field = self.index_field.empty_field()

        self.sequence_label_field = SequenceLabelField([1, 1, 0, 1], self.field1)
        self.empty_sequence_label_field = self.sequence_label_field.empty_field()

        super(TestListField, self).setUp()

    def test_get_padding_lengths(self):
        """Padding lengths of a list of three text fields."""
        text_fields = ListField([self.field1, self.field2, self.field3])
        text_fields.index(self.vocab)
        expected_lengths = {"num_fields": 3, "list_words_length": 5, "list_num_tokens": 5}
        lengths = text_fields.get_padding_lengths()
        assert lengths == expected_lengths

    def test_list_field_can_handle_empty_text_fields(self):
        """An empty text field becomes an all-zero row."""
        text_fields = ListField([self.field1, self.field2, self.empty_text_field])
        text_fields.index(self.vocab)
        tensors = text_fields.as_tensor(text_fields.get_padding_lengths())
        word_ids = tensors["words"].detach().cpu().numpy()
        expected = numpy.array([[2, 3, 4, 5, 0],
                                [2, 3, 4, 1, 5],
                                [0, 0, 0, 0, 0]])
        numpy.testing.assert_array_equal(word_ids, expected)

    def test_list_field_can_handle_empty_index_fields(self):
        """An empty index field becomes ``-1``."""
        index_fields = ListField([self.index_field, self.index_field, self.empty_index_field])
        index_fields.index(self.vocab)
        result = index_fields.as_tensor(index_fields.get_padding_lengths())
        expected = numpy.array([[1], [1], [-1]])
        numpy.testing.assert_array_equal(result.detach().cpu().numpy(), expected)

    def test_list_field_can_handle_empty_sequence_label_fields(self):
        """An empty sequence label field becomes an all-zero row."""
        label_fields = ListField([self.sequence_label_field,
                                  self.sequence_label_field,
                                  self.empty_sequence_label_field])
        label_fields.index(self.vocab)
        result = label_fields.as_tensor(label_fields.get_padding_lengths())
        expected = numpy.array([[1, 1, 0, 1],
                                [1, 1, 0, 1],
                                [0, 0, 0, 0]])
        numpy.testing.assert_array_equal(result.detach().cpu().numpy(), expected)

    def test_all_fields_padded_to_max_length(self):
        """Every row is padded to the length of the longest text field."""
        text_fields = ListField([self.field1, self.field2, self.field3])
        text_fields.index(self.vocab)
        tensors = text_fields.as_tensor(text_fields.get_padding_lengths())
        expected_rows = [
            [2, 3, 4, 5, 0],
            [2, 3, 4, 1, 5],
            [2, 3, 1, 5, 0],
        ]
        for row_index, expected_row in enumerate(expected_rows):
            actual_row = tensors["words"][row_index].detach().cpu().numpy()
            numpy.testing.assert_array_almost_equal(actual_row, numpy.array(expected_row))

    def test_nested_list_fields_are_padded_correctly(self):
        """Nested list fields pad missing labels (and empty lists) with ``-1``."""
        five_labels = ListField([LabelField(label) for label in ['a', 'b', 'c', 'd', 'e']])
        six_labels = ListField([LabelField(label) for label in ['f', 'g', 'h', 'i', 'j', 'k']])
        nested = ListField([five_labels.empty_field(), five_labels, six_labels])
        nested.index(self.vocab)
        padding_lengths = nested.get_padding_lengths()
        assert padding_lengths == {'num_fields': 3, 'list_num_fields': 6}
        result = nested.as_tensor(padding_lengths).detach().cpu().numpy()
        expected = [[-1, -1, -1, -1, -1, -1],
                    [0, 1, 2, 3, 4, -1],
                    [5, 6, 7, 8, 9, 10]]
        numpy.testing.assert_almost_equal(result, expected)

    def test_fields_can_pad_to_greater_than_max_length(self):
        """Requesting larger padding lengths than needed adds zero padding."""
        text_fields = ListField([self.field1, self.field2, self.field3])
        text_fields.index(self.vocab)
        padding_lengths = text_fields.get_padding_lengths()
        # Ask for more tokens per field and more fields than the data contains.
        padding_lengths["list_words_length"] = 7
        padding_lengths["num_fields"] = 5
        tensors = text_fields.as_tensor(padding_lengths)
        expected_rows = [
            [2, 3, 4, 5, 0, 0, 0],
            [2, 3, 4, 1, 5, 0, 0],
            [2, 3, 1, 5, 0, 0, 0],
            [0, 0, 0, 0, 0, 0, 0],
            [0, 0, 0, 0, 0, 0, 0],
        ]
        for row_index, expected_row in enumerate(expected_rows):
            actual_row = tensors["words"][row_index].detach().cpu().numpy()
            numpy.testing.assert_array_almost_equal(actual_row, numpy.array(expected_row))

    def test_as_tensor_can_handle_multiple_token_indexers(self):
        """Word and character indexers each produce their own padded tensor."""
        # pylint: disable=protected-access
        for text_field in (self.field1, self.field2, self.field3):
            text_field._token_indexers = self.words_and_characters_indexers

        text_fields = ListField([self.field1, self.field2, self.field3])
        text_fields.index(self.vocab)
        tensors = text_fields.as_tensor(text_fields.get_padding_lengths())
        words = tensors["words"].detach().cpu().numpy()
        characters = tensors["characters"].detach().cpu().numpy()

        numpy.testing.assert_array_almost_equal(words, numpy.array([[2, 3, 4, 5, 0],
                                                                    [2, 3, 4, 1, 5],
                                                                    [2, 3, 1, 5, 0]]))

        # Expected character ids per token, padded to nine characters.
        chars_this = [5, 1, 1, 2, 0, 0, 0, 0, 0]
        chars_is = [1, 2, 0, 0, 0, 0, 0, 0, 0]
        chars_a = [1, 0, 0, 0, 0, 0, 0, 0, 0]
        chars_different = [1, 1, 1, 1, 3, 1, 3, 4, 5]
        chars_another = [1, 4, 1, 5, 1, 3, 1, 0, 0]
        chars_sentence = [2, 3, 4, 5, 3, 4, 6, 3, 0]
        chars_padding = [0, 0, 0, 0, 0, 0, 0, 0, 0]

        expected_characters = [
            [chars_this, chars_is, chars_a, chars_sentence, chars_padding],
            [chars_this, chars_is, chars_a, chars_different, chars_sentence],
            [chars_this, chars_is, chars_another, chars_sentence, chars_padding],
        ]
        for field_index, expected in enumerate(expected_characters):
            numpy.testing.assert_array_almost_equal(characters[field_index],
                                                    numpy.array(expected))

    def test_as_tensor_can_handle_multiple_token_indexers_and_empty_fields(self):
        """An empty field yields all-zero word and character tensors."""
        # pylint: disable=protected-access
        for text_field in (self.field1, self.field2, self.field3):
            text_field._token_indexers = self.words_and_characters_indexers

        text_fields = ListField([self.field1.empty_field(), self.field1, self.field2])
        text_fields.index(self.vocab)
        tensors = text_fields.as_tensor(text_fields.get_padding_lengths())
        words = tensors["words"].detach().cpu().numpy()
        characters = tensors["characters"].detach().cpu().numpy()

        numpy.testing.assert_array_almost_equal(words, numpy.array([[0, 0, 0, 0, 0],
                                                                    [2, 3, 4, 5, 0],
                                                                    [2, 3, 4, 1, 5]]))

        numpy.testing.assert_array_almost_equal(characters[0], numpy.zeros([5, 9]))

        # Expected character ids per token, padded to nine characters.
        chars_this = [5, 1, 1, 2, 0, 0, 0, 0, 0]
        chars_is = [1, 2, 0, 0, 0, 0, 0, 0, 0]
        chars_a = [1, 0, 0, 0, 0, 0, 0, 0, 0]
        chars_different = [1, 1, 1, 1, 3, 1, 3, 4, 5]
        chars_sentence = [2, 3, 4, 5, 3, 4, 6, 3, 0]
        chars_padding = [0, 0, 0, 0, 0, 0, 0, 0, 0]

        numpy.testing.assert_array_almost_equal(
            characters[1],
            numpy.array([chars_this, chars_is, chars_a, chars_sentence, chars_padding]))
        numpy.testing.assert_array_almost_equal(
            characters[2],
            numpy.array([chars_this, chars_is, chars_a, chars_different, chars_sentence]))

    def test_printing_doesnt_crash(self):
        """Printing a ``ListField`` must not raise."""
        print(ListField([self.field1, self.field2]))

    def test_sequence_methods(self):
        """``len``, indexing and iteration work on a ``ListField``."""
        members = [self.field1, self.field2, self.field3]
        list_field = ListField([self.field1, self.field2, self.field3])
        assert len(list_field) == 3
        assert list_field[1] == self.field2
        assert [member for member in list_field] == members
def test_as_tensor_converts_field_correctly(self):
    """An ``IndexField`` built with index 4 converts to the one-element array ``[4]``."""
    index_field = IndexField(4, self.text)
    # ``.detach()`` instead of ``.data``: it matches the other field tests in
    # this file and is the accessor PyTorch recommends for dropping autograd
    # history before converting to numpy.
    tensor = index_field.as_tensor(index_field.get_padding_lengths()).detach().cpu().numpy()
    numpy.testing.assert_array_equal(tensor, numpy.array([4]))
def test_index_field_empty_field_works(self):
    """The empty counterpart of an ``IndexField`` has ``-1`` as its sequence index."""
    empty_index = IndexField(4, self.text).empty_field()
    assert empty_index.sequence_index == -1
def _read(self, file_path: str):
    """Read the dataset at ``file_path`` and lazily yield training instances.

    Each record of the dataset JSON is paired (via ``zip``) with the entry at
    the same position in the span file at ``self._span_file_path``. For every
    pair this generator yields:

    * one instance whose answer span is predicted by the extraction model
      loaded from ``self._extraction_model_path``, followed by
    * one instance for every candidate in the span-file entry whose
      ``score`` is greater than 0.5.

    Every instance carries ``question``, ``passage``, ``span_start``,
    ``span_end``, ``answer`` and ``metadata`` fields.

    Parameters
    ----------
    file_path : ``str``
        Location of the dataset JSON; passed through ``cached_path`` before
        it is opened.

    Raises
    ------
    AssertionError
        If the predicted span start lies after the predicted span end.
    """
    # Function-scope import so this method does not depend on the module
    # header providing the name.
    from allennlp.models.archival import load_archive

    def make_instance(question_field, passage_tokens, span_start, span_end,
                      answer_tokens, question_text, answer_text):
        # Every instance gets its own ``fields`` dict. Re-using one dict for
        # several instances lets later assignments overwrite the fields of
        # instances that were already yielded.
        passage_field = TextField(passage_tokens, self._token_indexers)
        evidence = self.get_evidence(passage_tokens, int(span_start), int(span_end))
        fields = {
            'question': question_field,
            'passage': passage_field,
            'span_start': IndexField(span_start, passage_field),
            'span_end': IndexField(span_end, passage_field),
            'answer': TextField(answer_tokens, self._token_indexers),
            'metadata': MetadataField({
                'evidence': evidence,
                'question_text': question_text,
                'answer_text': answer_text
            }),
        }
        return Instance(fields)

    file_path = cached_path(file_path)
    logger.info("Reading file at %s", file_path)
    with open(file_path) as dataset_file:
        dataset = json.load(dataset_file)
    # ``with`` guarantees the span file is closed once it has been parsed.
    with open(self._span_file_path) as span_file:
        best_spans = json.load(span_file)

    # NOTE(review): these two lines were commented out and ``model`` was set
    # to ``None`` although both ``archive`` and ``model`` are dereferenced
    # below; restored so the method can run. Confirm this is the intended
    # extraction model.
    archive = load_archive(self._extraction_model_path)
    model = archive.model
    p1_dataset_reader = DatasetReader.from_params(archive.config["dataset_reader"])
    # The extraction model is fed fields indexed with its own reader's indexers.
    p1_token_indexers = p1_dataset_reader._token_indexers  # pylint: disable=protected-access

    logger.info("Reading the dataset")
    for data, best_span in zip(dataset, best_spans):
        answer = data['answers'][0]
        question = data['query']
        passages = [passage_json['passage_text'] for passage_json in data['passages']]
        tokenized_passages_list = [
            self._tokenizer.tokenize(util.normalize_text(passage)) for passage in passages
        ]
        # Running totals of passage lengths: entry ``i`` is the offset, in
        # the concatenated passage, at which passage ``i + 1`` starts.
        cumulative_passages_length = np.cumsum(
            [len(tokens) for tokens in tokenized_passages_list])

        normalized_answer = util.normalize_text(answer) if answer is not None else None
        normalized_question = util.normalize_text(question)
        # A missing answer yields just the start/end symbols instead of
        # handing ``None`` to the tokenizer.
        if normalized_answer is not None:
            tokenized_answer = self._tokenizer.tokenize(normalized_answer)
        else:
            tokenized_answer = []
        tokenized_question = self._tokenizer.tokenize(normalized_question)
        tokenized_answer.insert(0, Token(START_SYMBOL))
        tokenized_answer.append(Token(END_SYMBOL))

        question_field = TextField(tokenized_question, self._token_indexers)
        # All passages flattened into one token sequence.
        tokenized_passage = [
            token for sublist in tokenized_passages_list for token in sublist
        ]

        # Ask the extraction model for a span over the concatenated passage.
        p1_instance = Instance({
            'question': TextField(tokenized_question, p1_token_indexers),
            'passage': TextField(tokenized_passage, p1_token_indexers)
        })
        outputs = model.forward_on_instance(p1_instance, -1)
        start_idx = outputs['span_start_idx']
        end_idx = outputs['span_end_idx']

        # Find the first passage whose cumulative end lies beyond the
        # predicted start, then shift both indices by the preceding total.
        passage_position = 0
        for passage_position, boundary in enumerate(cumulative_passages_length):
            if start_idx < boundary:
                break
        if passage_position != 0:
            offset = cumulative_passages_length[passage_position - 1]
            start_idx = start_idx - offset
            end_idx = end_idx - offset
        assert start_idx <= end_idx, "Span prediction does not make sense!!!"

        # Instance built from the predicted span.
        # NOTE(review): the indices were just shifted by the preceding
        # passages' lengths, yet they are attached to the concatenated
        # passage here (as in the original) — confirm whether
        # ``tokenized_passages_list[passage_position]`` was intended.
        yield make_instance(question_field, tokenized_passage,
                            int(start_idx), int(end_idx),
                            tokenized_answer, normalized_question, normalized_answer)

        # Instances built from the span-file candidates.
        for item in best_span:
            if item['score'] > 0.5:
                # Evidence comes from this candidate's own start/end, not
                # from the predicted span computed above.
                yield make_instance(question_field,
                                    tokenized_passages_list[item['passage']],
                                    item['start'], item['end'],
                                    tokenized_answer, normalized_question,
                                    normalized_answer)