Beispiel #1
0
    def text_to_instance(self,
                         tokens: List[str],
                         ner_tags: List[str],
                         # entity_labels: List[str],
                         entity_spans: List[Tuple[int, int]],
                         trigger_labels: Optional[List[str]] = None,
                         arg_role_labels: Optional[List[List[str]]] = None
                         ) -> Instance:
        """Assemble an ``Instance`` from a tokenized sentence and its annotations.

        Args:
            tokens: The words of the sentence; each is wrapped in a ``Token``.
            ner_tags: One tag per token, stored under the ``entity_tags``
                namespace.
            entity_spans: ``(start, end)`` pairs. ``end - 1`` is passed as the
                span end, so ``end`` is presumably exclusive (confirm against
                the caller).
            trigger_labels: Optional per-token trigger labels. When omitted,
                no ``triggers`` field is added.
            arg_role_labels: Optional list of role labels for every token.
                When omitted, ``arg_roles`` is an empty list field.

        Returns:
            An ``Instance`` with the fields ``metadata``, ``tokens``,
            ``entity_tags``, ``entity_spans``, ``arg_roles`` and, if trigger
            labels were given, ``triggers``.
        """
        sentence_field = TextField([Token(word) for word in tokens],
                                   token_indexers=self._token_indexers)

        # An empty ListField can only be derived from a populated one, so
        # placeholder fields are built first and ``empty_field()`` is called
        # on them when needed.
        # see: https://github.com/allenai/allennlp/issues/1391
        placeholder_role = LabelField(label='a', label_namespace='arg_role_labels')
        roles_template = ListField([ListField([placeholder_role])])
        spans_template = ListField([SpanField(0, 0, sentence_field)])

        if not entity_spans:
            spans_field = spans_template.empty_field()
        else:
            span_fields = []
            for bounds in entity_spans:
                span_fields.append(SpanField(span_start=bounds[0],
                                             span_end=bounds[1] - 1,
                                             sequence_field=sentence_field))
            spans_field = ListField(span_fields)

        tags_field = SequenceLabelField(labels=ner_tags,
                                        sequence_field=sentence_field,
                                        label_namespace='entity_tags')

        fields: Dict[str, Field] = {}
        fields['metadata'] = MetadataField({"words": tokens})
        fields['tokens'] = sentence_field
        fields['entity_tags'] = tags_field
        fields['entity_spans'] = spans_field

        # Trigger labels are only present for annotated data.
        if trigger_labels is not None:
            fields['triggers'] = SequenceLabelField(labels=trigger_labels,
                                                    sequence_field=sentence_field,
                                                    label_namespace='trigger_labels')

        # Argument roles: one inner list of labels per token, or an empty
        # field when no role annotation was supplied.
        if arg_role_labels is None:
            fields['arg_roles'] = roles_template.empty_field()
        else:
            per_token_fields = []
            for roles_for_token in arg_role_labels:
                role_fields = [
                    LabelField(label=role, label_namespace='arg_role_labels')
                    for role in roles_for_token
                ]
                per_token_fields.append(ListField(role_fields))
            fields['arg_roles'] = ListField(per_token_fields)

        return Instance(fields)
Beispiel #2
0
 def test_nested_list_fields_are_padded_correctly(self):
     """A list of label lists is padded with -1 up to the longest inner list."""
     five_labels = ListField([LabelField(c) for c in ["a", "b", "c", "d", "e"]])
     six_labels = ListField([LabelField(c) for c in ["f", "g", "h", "i", "j", "k"]])
     # The first entry is an empty list field, so its row is padding only.
     outer = ListField([five_labels.empty_field(), five_labels, six_labels])
     outer.index(self.vocab)

     lengths = outer.get_padding_lengths()
     assert lengths == {"num_fields": 3, "list_num_fields": 6}

     expected = [
         [-1, -1, -1, -1, -1, -1],
         [0, 1, 2, 3, 4, -1],
         [5, 6, 7, 8, 9, 10],
     ]
     actual = outer.as_tensor(lengths).detach().cpu().numpy()
     numpy.testing.assert_almost_equal(actual, expected)
Beispiel #3
0
 def test_nested_list_fields_are_padded_correctly(self):
     """Check padding lengths and the padded tensor of a nested ``ListField``."""
     inner_short = ListField([LabelField(c) for c in ['a', 'b', 'c', 'd', 'e']])
     inner_long = ListField([LabelField(c) for c in ['f', 'g', 'h', 'i', 'j', 'k']])
     members = [inner_short.empty_field(), inner_short, inner_long]
     nested = ListField(members)
     nested.index(self.vocab)

     pad_lengths = nested.get_padding_lengths()
     assert pad_lengths == {'num_fields': 3, 'list_num_fields': 6}

     # Row 0 comes from the empty field; row 1 is one label short of row 2.
     padded = nested.as_tensor(pad_lengths)
     as_numpy = padded.detach().cpu().numpy()
     numpy.testing.assert_almost_equal(as_numpy, [[-1, -1, -1, -1, -1, -1],
                                                  [0, 1, 2, 3, 4, -1],
                                                  [5, 6, 7, 8, 9, 10]])
 def test_nested_list_fields_are_padded_correctly(self):
     """Nested label lists pad to the longest inner list.

     This variant expects every label to occupy its own length-1 innermost
     axis in the resulting tensor.
     """
     first_inner = ListField([LabelField(c) for c in ['a', 'b', 'c', 'd', 'e']])
     second_inner = ListField([LabelField(c) for c in ['f', 'g', 'h', 'i', 'j', 'k']])
     combined = ListField([first_inner.empty_field(), first_inner, second_inner])
     combined.index(self.vocab)

     lengths = combined.get_padding_lengths()
     assert lengths == {'num_fields': 3, 'list_num_fields': 6}

     expected_rows = [
         [[-1], [-1], [-1], [-1], [-1], [-1]],
         [[0], [1], [2], [3], [4], [-1]],
         [[5], [6], [7], [8], [9], [10]],
     ]
     result = combined.as_tensor(lengths).data.cpu().numpy()
     numpy.testing.assert_almost_equal(result, expected_rows)
Beispiel #5
0
 def test_nested_list_fields_are_padded_correctly(self):
     """Check ``as_array`` output for a ``ListField`` of label lists."""
     labels_a_to_e = ListField([LabelField(c) for c in ['a', 'b', 'c', 'd', 'e']])
     labels_f_to_k = ListField([LabelField(c) for c in ['f', 'g', 'h', 'i', 'j', 'k']])
     # An empty field is placed first so a padding-only row is exercised.
     wrapper = ListField([labels_a_to_e.empty_field(), labels_a_to_e, labels_f_to_k])
     wrapper.index(self.vocab)

     lengths = wrapper.get_padding_lengths()
     assert lengths == {'num_fields': 3, 'list_num_fields': 6}

     expected = [
         [[-1], [-1], [-1], [-1], [-1], [-1]],
         [[0], [1], [2], [3], [4], [-1]],
         [[5], [6], [7], [8], [9], [10]],
     ]
     numpy.testing.assert_almost_equal(wrapper.as_array(lengths), expected)
 def test_nested_list_fields_are_padded_correctly(self):
     """Padding of nested list fields: shorter rows are filled with -1."""
     shorter = ListField([LabelField(c) for c in [u'a', u'b', u'c', u'd', u'e']])
     longer = ListField([LabelField(c) for c in [u'f', u'g', u'h', u'i', u'j', u'k']])
     field_under_test = ListField([shorter.empty_field(), shorter, longer])
     field_under_test.index(self.vocab)

     observed_lengths = field_under_test.get_padding_lengths()
     assert observed_lengths == {u'num_fields': 3, u'list_num_fields': 6}

     wanted = [
         [-1, -1, -1, -1, -1, -1],
         [0, 1, 2, 3, 4, -1],
         [5, 6, 7, 8, 9, 10],
     ]
     got = field_under_test.as_tensor(observed_lengths).detach().cpu().numpy()
     numpy.testing.assert_almost_equal(got, wanted)
    def text_to_instance(self, # pylint: disable=arguments-differ
                         premises: List[str],
                         hypotheses: List[str],
                         answer_indices: List[int] = None,
                         relevant_sentence_idxs: List[int] = None) -> Instance:
        """Turn premises, hypotheses and optional labels into an ``Instance``.

        Args:
            premises: Premise sentences. Each one is tokenized and truncated to
                its last ``self._premise_max_tokens`` tokens.
            hypotheses: Candidate hypotheses, tokenized and truncated to their
                last ``self._hypothesis_max_tokens`` tokens.
            answer_indices: Optional indices into ``hypotheses`` to mark with 1
                in ``answer_correctness_mask``.
            relevant_sentence_idxs: Optional indices into ``premises`` to mark
                with 1 in ``relevance_presence_mask``.

        Returns:
            An ``Instance`` with ``premises``, ``hypotheses`` and ``paragraph``
            fields, plus the mask fields for whichever labels were supplied.
        """
        fields = {}
        # NOTE: the negative slice keeps the trailing tokens; a limit of 0
        # would keep everything, because ``[-0:]`` is the full slice.
        premises_tokens = [self._tokenizer.tokenize(premise)[-self._premise_max_tokens:]
                           for premise in premises]
        hypotheses_tokens = [self._tokenizer.tokenize(hypothesis)[-self._hypothesis_max_tokens:]
                             for hypothesis in hypotheses]
        if premises:
            premises_text_fields = [TextField(premise_tokens, self._token_indexers)
                                    for premise_tokens in premises_tokens]
            premises_field = ListField(premises_text_fields)
        else:
            # An empty ListField has to be derived from a populated stub.
            empty_stub = ListField([TextField([Token('dummy')], self._token_indexers)])
            premises_field = empty_stub.empty_field()
        fields['premises'] = premises_field

        hypotheses_text_fields = [TextField(hypothesis_tokens, self._token_indexers)
                                  for hypothesis_tokens in hypotheses_tokens]
        hypotheses_field = ListField(hypotheses_text_fields)
        fields['hypotheses'] = hypotheses_field

        # If sentence relevance is available
        if relevant_sentence_idxs is not None:
            relevance_presence_mask = np.zeros(len(premises))
            for idx in relevant_sentence_idxs:
                relevance_presence_mask[idx] = 1
            # The mask is a fresh local array, so no defensive copy is needed.
            fields['relevance_presence_mask'] = ArrayField(relevance_presence_mask)

        # If answer_indices labels are available
        if answer_indices is not None:
            answer_correctness_mask = np.zeros(len(hypotheses))
            for answer_index in answer_indices:
                answer_correctness_mask[answer_index] = 1
            # ``np.long`` was deprecated in NumPy 1.20 and removed in 1.24;
            # ``np.int64`` is the explicit 64-bit integer dtype.
            fields['answer_correctness_mask'] = ArrayField(answer_correctness_mask,
                                                           padding_value=-1,
                                                           dtype=np.int64)

        # The paragraph is the concatenation of all truncated premises.
        paragraph_tokens = [token for premise_tokens in premises_tokens for token in premise_tokens]
        paragraph_text_field = TextField(paragraph_tokens, self._token_indexers)
        fields['paragraph'] = paragraph_text_field
        return Instance(fields)
Beispiel #8
0
    def text_to_instance(
            self,  # pylint: disable=arguments-differ
            premises: List[str],
            hypotheses: List[str],
            answer_index: int = None,
            relevant_sentence_idxs: List[int] = None) -> Instance:
        """Turn premises, hypotheses and optional labels into an ``Instance``.

        Args:
            premises: Premise sentences. Each one is tokenized and truncated to
                its last ``self._premise_max_tokens`` tokens.
            hypotheses: Candidate hypotheses, tokenized and truncated to their
                last ``self._hypothesis_max_tokens`` tokens.
            answer_index: Optional label stored in the ``answer_index`` field.
                It is not range-checked against ``hypotheses``.
            relevant_sentence_idxs: Optional indices into ``premises`` to mark
                with 1 in ``relevance_presence_mask``.

        Returns:
            An ``Instance`` with ``premises``, ``hypotheses`` and ``paragraph``
            fields plus any label fields, or ``None`` (despite the annotation)
            when the premises contain no tokens at all.
        """
        fields = {}
        # NOTE: the negative slice keeps the trailing tokens; a limit of 0
        # would keep everything, because ``[-0:]`` is the full slice.
        premises_tokens = [
            self._tokenizer.tokenize(premise)[-self._premise_max_tokens:]
            for premise in premises
        ]
        hypotheses_tokens = [
            self._tokenizer.tokenize(hypothesis)[-self._hypothesis_max_tokens:]
            for hypothesis in hypotheses
        ]
        if premises:
            premises_text_fields = [
                TextField(premise_tokens, self._token_indexers)
                for premise_tokens in premises_tokens
            ]
            premises_field = ListField(premises_text_fields)
        else:
            # An empty ListField has to be derived from a populated stub.
            empty_stub = ListField(
                [TextField([Token('dummy')], self._token_indexers)])
            premises_field = empty_stub.empty_field()
        fields['premises'] = premises_field

        hypotheses_text_fields = [
            TextField(hypothesis_tokens, self._token_indexers)
            for hypothesis_tokens in hypotheses_tokens
        ]
        hypotheses_field = ListField(hypotheses_text_fields)
        fields['hypotheses'] = hypotheses_field

        # If sentence relevance is available
        if relevant_sentence_idxs is not None:
            relevance_presence_mask = np.zeros(len(premises))
            for idx in relevant_sentence_idxs:
                relevance_presence_mask[idx] = 1
            # The mask is a fresh local array, so no defensive copy is needed.
            fields['relevance_presence_mask'] = ArrayField(
                relevance_presence_mask)

        # If entailment labels are available
        if answer_index is not None:
            # ``np.long`` was deprecated in NumPy 1.20 and removed in 1.24;
            # ``np.int64`` is the explicit 64-bit integer dtype.
            fields['answer_index'] = ArrayField(np.array(answer_index),
                                                padding_value=-1,
                                                dtype=np.int64)

        paragraph_tokens = [
            token for premise_tokens in premises_tokens
            for token in premise_tokens
        ]
        # Examples with no premise tokens are signalled to the caller as None.
        if not paragraph_tokens:
            return None
        paragraph_text_field = TextField(paragraph_tokens,
                                         self._token_indexers)

        fields['paragraph'] = paragraph_text_field
        return Instance(fields)
Beispiel #9
0
    def text_to_instance(self, example: Dict) -> Instance:
        """Convert one annotated example into an ``Instance``.

        Args:
            example: A mapping read for the keys ``words``,
                ``golden-entity-mentions`` (each entity has ``start``, ``end``
                and ``entity-type``) and ``golden-event-mentions`` (each event
                has ``trigger`` with ``start``/``end``, ``event_type`` and
                ``arguments`` with ``start``, ``end``, ``entity-type`` and
                ``role``).

        Returns:
            An ``Instance`` with the fields ``metadata``, ``tokens``,
            ``entity_labels``, ``entity_spans``, ``triggers`` and
            ``arg_roles``.

        Raises:
            ValueError: If an event argument does not correspond to any entity
                mention of the example.
        """
        words = example['words']
        text_field = TextField([Token(t) for t in words],
                               token_indexers=self._token_indexers)

        # These are required by allennlp for empty list fields
        # see: https://github.com/allenai/allennlp/issues/1391
        dummy_arg_roles_field = ListField([
            ListField(
                [LabelField(label='a', label_namespace='arg_role_labels')])
        ])
        dummy_entity_labels_field = ListField(
            [LabelField(label='a', label_namespace='entity_labels')])
        dummy_span_list_field = ListField([SpanField(0, 0, text_field)])

        # Extract entities
        entities = example['golden-entity-mentions']
        num_entities = len(entities)
        if entities:
            entity_labels_field = ListField([
                LabelField(label=entity['entity-type'],
                           label_namespace='entity_labels')
                for entity in entities
            ])
            # ``end - 1`` is passed as the span end, so the annotated end is
            # presumably exclusive (confirm against the data format).
            entity_spans_field = ListField([
                SpanField(span_start=entity['start'],
                          span_end=entity['end'] - 1,
                          sequence_field=text_field)
                for entity in entities
            ])
        else:
            entity_labels_field = dummy_entity_labels_field.empty_field()
            entity_spans_field = dummy_span_list_field.empty_field()

        triggers = [NEGATIVE_TRIGGER_LABEL] * len(words)
        events = example['golden-event-mentions']

        # Every entity is a potential negative example for event arguments,
        # so each token starts with one negative label per entity.
        if entities:
            arg_roles = [[NEGATIVE_ARGUMENT_LABEL] * num_entities
                         for _ in words]
        else:
            arg_roles = None

        for event in events:
            trigger = event['trigger']
            trigger_start = trigger['start']
            trigger_end = trigger['end']
            for idx in range(trigger_start, trigger_end):
                # Encode triggers with IOB2 encoding scheme
                prefix = 'B-' if idx == trigger_start else 'I-'
                triggers[idx] = prefix + event['event_type']

            if not arg_roles:
                continue

            for argument in event['arguments']:
                # A default is supplied so that a missing match surfaces as a
                # clear error instead of a bare StopIteration, which a calling
                # generator would turn into an opaque RuntimeError.
                entity_idx = next(
                    (idx for idx, entity in enumerate(entities)
                     if entity['start'] == argument['start']
                     and entity['end'] == argument['end']
                     and entity['entity-type'] == argument['entity-type']),
                    None)
                if entity_idx is None:
                    raise ValueError(
                        'Event argument {!r} does not match any entity '
                        'mention of the example'.format(argument))
                for trigger_idx in range(trigger_start, trigger_end):
                    arg_roles[trigger_idx][entity_idx] = argument['role']

        if arg_roles:
            arg_roles_field = ListField([
                ListField([
                    LabelField(label=label, label_namespace='arg_role_labels')
                    for label in token_role_labels
                ]) for token_role_labels in arg_roles
            ])
        else:
            arg_roles_field = dummy_arg_roles_field.empty_field()

        fields = {
            'metadata':
            MetadataField({"words": example['words']}),
            'tokens':
            text_field,
            'entity_labels':
            entity_labels_field,
            'entity_spans':
            entity_spans_field,
            'triggers':
            SequenceLabelField(labels=triggers,
                               sequence_field=text_field,
                               label_namespace='trigger_labels'),
            'arg_roles':
            arg_roles_field,
        }
        return Instance(fields)