def text_to_instance(self,
                     tokens: List[str],
                     ner_tags: List[str],
                     entity_spans: List[Tuple[int, int]],
                     trigger_labels: Optional[List[str]] = None,
                     arg_role_labels: Optional[List[List[str]]] = None) -> Instance:
    """Build an ``Instance`` from pre-tokenized text with NER tags, entity
    spans, and (optionally) per-token trigger labels and per-token/per-entity
    argument-role labels."""
    text_field = TextField([Token(t) for t in tokens],
                           token_indexers=self._token_indexers)

    # AllenNLP needs a prototype field to pad empty ListFields against.
    # see: https://github.com/allenai/allennlp/issues/1391
    dummy_arg_roles_field = ListField([
        ListField([LabelField(label='a', label_namespace='arg_role_labels')])
    ])
    dummy_span_list_field = ListField([SpanField(0, 0, text_field)])

    if entity_spans:
        # Spans arrive end-exclusive; SpanField is end-inclusive, hence the -1.
        entity_spans_field = ListField([
            SpanField(span_start=start, span_end=end - 1,
                      sequence_field=text_field)
            for start, end in entity_spans
        ])
    else:
        entity_spans_field = dummy_span_list_field.empty_field()

    entity_tags_field = SequenceLabelField(labels=ner_tags,
                                           sequence_field=text_field,
                                           label_namespace='entity_tags')

    fields: Dict[str, Field] = {
        'metadata': MetadataField({"words": tokens}),
        'tokens': text_field,
        'entity_tags': entity_tags_field,
        'entity_spans': entity_spans_field,
    }

    # Trigger supervision is optional (absent at inference time).
    if trigger_labels is not None:
        fields['triggers'] = SequenceLabelField(
            labels=trigger_labels,
            sequence_field=text_field,
            label_namespace='trigger_labels')

    # Argument-role supervision: one label list per token, one entry per entity.
    if arg_role_labels is not None:
        fields['arg_roles'] = ListField([
            ListField([LabelField(label=role,
                                  label_namespace='arg_role_labels')
                       for role in token_roles])
            for token_roles in arg_role_labels
        ])
    else:
        fields['arg_roles'] = dummy_arg_roles_field.empty_field()

    return Instance(fields)
def test_nested_list_fields_are_padded_correctly(self):
    """Nested ListFields pad both levels; missing entries are filled with -1."""
    five_labels = ListField([LabelField(ch) for ch in "abcde"])
    six_labels = ListField([LabelField(ch) for ch in "fghijk"])
    outer = ListField([five_labels.empty_field(), five_labels, six_labels])
    outer.index(self.vocab)

    lengths = outer.get_padding_lengths()
    assert lengths == {"num_fields": 3, "list_num_fields": 6}

    tensor = outer.as_tensor(lengths).detach().cpu().numpy()
    numpy.testing.assert_almost_equal(
        tensor,
        [[-1, -1, -1, -1, -1, -1],
         [0, 1, 2, 3, 4, -1],
         [5, 6, 7, 8, 9, 10]],
    )
def test_nested_list_fields_are_padded_correctly(self):
    """Check two-level padding of a ListField of ListFields of labels."""
    inner_a = ListField([LabelField(letter) for letter in ['a', 'b', 'c', 'd', 'e']])
    inner_b = ListField([LabelField(letter) for letter in ['f', 'g', 'h', 'i', 'j', 'k']])
    nested = ListField([inner_a.empty_field(), inner_a, inner_b])
    nested.index(self.vocab)

    padding = nested.get_padding_lengths()
    assert padding == {'num_fields': 3, 'list_num_fields': 6}

    result = nested.as_tensor(padding).detach().cpu().numpy()
    expected = [
        [-1, -1, -1, -1, -1, -1],
        [0, 1, 2, 3, 4, -1],
        [5, 6, 7, 8, 9, 10],
    ]
    numpy.testing.assert_almost_equal(result, expected)
def test_nested_list_fields_are_padded_correctly(self):
    """Padding an outer ListField pads each inner label list to the longest one."""
    short_list = ListField([LabelField(s) for s in ['a', 'b', 'c', 'd', 'e']])
    long_list = ListField([LabelField(s) for s in ['f', 'g', 'h', 'i', 'j', 'k']])
    combined = ListField([short_list.empty_field(), short_list, long_list])
    combined.index(self.vocab)

    padding_lengths = combined.get_padding_lengths()
    assert padding_lengths == {'num_fields': 3, 'list_num_fields': 6}

    # Older tensor API: labels come back with a trailing singleton dimension.
    actual = combined.as_tensor(padding_lengths).data.cpu().numpy()
    expected = [
        [[-1], [-1], [-1], [-1], [-1], [-1]],
        [[0], [1], [2], [3], [4], [-1]],
        [[5], [6], [7], [8], [9], [10]],
    ]
    numpy.testing.assert_almost_equal(actual, expected)
def test_nested_list_fields_are_padded_correctly(self):
    """Verify as_array pads nested label lists with -1 at both levels."""
    first = ListField([LabelField(x) for x in ['a', 'b', 'c', 'd', 'e']])
    second = ListField([LabelField(x) for x in ['f', 'g', 'h', 'i', 'j', 'k']])
    wrapper = ListField([first.empty_field(), first, second])
    wrapper.index(self.vocab)

    lengths = wrapper.get_padding_lengths()
    assert lengths == {'num_fields': 3, 'list_num_fields': 6}

    # Oldest API variant: fields produce numpy arrays directly.
    padded = wrapper.as_array(lengths)
    numpy.testing.assert_almost_equal(
        padded,
        [[[-1], [-1], [-1], [-1], [-1], [-1]],
         [[0], [1], [2], [3], [4], [-1]],
         [[5], [6], [7], [8], [9], [10]]],
    )
def test_nested_list_fields_are_padded_correctly(self):
    """Nested ListFields pad to the longest inner list, filling with -1."""
    # u-prefixed literals kept for Python 2 compatibility with this file.
    smaller = ListField([LabelField(c) for c in [u'a', u'b', u'c', u'd', u'e']])
    larger = ListField([LabelField(c) for c in [u'f', u'g', u'h', u'i', u'j', u'k']])
    combined = ListField([smaller.empty_field(), smaller, larger])
    combined.index(self.vocab)

    lengths = combined.get_padding_lengths()
    assert lengths == {u'num_fields': 3, u'list_num_fields': 6}

    values = combined.as_tensor(lengths).detach().cpu().numpy()
    numpy.testing.assert_almost_equal(
        values,
        [[-1, -1, -1, -1, -1, -1],
         [0, 1, 2, 3, 4, -1],
         [5, 6, 7, 8, 9, 10]],
    )
def text_to_instance(self,  # pylint: disable=arguments-differ
                     premises: List[str],
                     hypotheses: List[str],
                     answer_indices: List[int] = None,
                     relevant_sentence_idxs: List[int] = None) -> Instance:
    """Convert a multi-premise / multi-hypothesis example into an ``Instance``.

    Parameters
    ----------
    premises : paragraph sentences (may be empty).
    hypotheses : candidate hypothesis sentences.
    answer_indices : indices of the correct hypotheses, when labeled.
    relevant_sentence_idxs : indices of relevant premises, when labeled.
    """
    fields = {}
    # Truncate each sentence, keeping the *last* max_tokens tokens.
    premises_tokens = [
        self._tokenizer.tokenize(premise)[-self._premise_max_tokens:]
        for premise in premises
    ]
    hypotheses_tokens = [
        self._tokenizer.tokenize(hypothesis)[-self._hypothesis_max_tokens:]
        for hypothesis in hypotheses
    ]

    if premises:
        premises_field = ListField([
            TextField(premise_tokens, self._token_indexers)
            for premise_tokens in premises_tokens
        ])
    else:
        # AllenNLP needs a prototype field to build an empty ListField.
        empty_stub = ListField([TextField([Token('dummy')], self._token_indexers)])
        premises_field = empty_stub.empty_field()
    fields['premises'] = premises_field

    fields['hypotheses'] = ListField([
        TextField(hypothesis_tokens, self._token_indexers)
        for hypothesis_tokens in hypotheses_tokens
    ])

    # If sentence relevance is available
    if relevant_sentence_idxs is not None:
        relevance_presence_mask = np.zeros(len(premises))
        for idx in relevant_sentence_idxs:
            relevance_presence_mask[idx] = 1
        fields['relevance_presence_mask'] = ArrayField(relevance_presence_mask)

    # If answer_indices labels are available
    if answer_indices is not None:
        answer_correctness_mask = np.zeros(len(hypotheses))
        for answer_index in answer_indices:
            answer_correctness_mask[answer_index] = 1
        # np.long was removed in NumPy 1.24; np.int64 preserves the intended
        # integer dtype for the label tensor.
        fields['answer_correctness_mask'] = ArrayField(answer_correctness_mask,
                                                       padding_value=-1,
                                                       dtype=np.int64)

    # Flatten all (truncated) premise tokens into a single paragraph field.
    paragraph_tokens = [
        token for premise_tokens in premises_tokens for token in premise_tokens
    ]
    fields['paragraph'] = TextField(paragraph_tokens, self._token_indexers)
    return Instance(fields)
def text_to_instance(self,  # pylint: disable=arguments-differ
                     premises: List[str],
                     hypotheses: List[str],
                     answer_index: int = None,
                     relevant_sentence_idxs: List[int] = None) -> Instance:
    """Convert a multiple-choice entailment example into an ``Instance``.

    Parameters
    ----------
    premises : paragraph sentences (may be empty).
    hypotheses : candidate answer hypotheses.
    answer_index : index of the correct hypothesis, when labeled.
    relevant_sentence_idxs : indices of relevant premises, when labeled.

    Returns
    -------
    An ``Instance``, or ``None`` when truncation leaves no paragraph tokens
    (callers must skip such examples).
    """
    fields = {}
    # Truncate each sentence, keeping the *last* max_tokens tokens.
    premises_tokens = [
        self._tokenizer.tokenize(premise)[-self._premise_max_tokens:]
        for premise in premises
    ]
    hypotheses_tokens = [
        self._tokenizer.tokenize(hypothesis)[-self._hypothesis_max_tokens:]
        for hypothesis in hypotheses
    ]

    if premises:
        premises_field = ListField([
            TextField(premise_tokens, self._token_indexers)
            for premise_tokens in premises_tokens
        ])
    else:
        # AllenNLP needs a prototype field to build an empty ListField.
        empty_stub = ListField([TextField([Token('dummy')], self._token_indexers)])
        premises_field = empty_stub.empty_field()
    fields['premises'] = premises_field

    fields['hypotheses'] = ListField([
        TextField(hypothesis_tokens, self._token_indexers)
        for hypothesis_tokens in hypotheses_tokens
    ])

    # If sentence relevance is available
    if relevant_sentence_idxs is not None:
        relevance_presence_mask = np.zeros(len(premises))
        for idx in relevant_sentence_idxs:
            relevance_presence_mask[idx] = 1
        fields['relevance_presence_mask'] = ArrayField(relevance_presence_mask)

    # If entailment labels are available
    if answer_index is not None:
        # np.long was removed in NumPy 1.24; np.int64 preserves the intended
        # integer dtype for the label tensor.
        fields['answer_index'] = ArrayField(np.array(answer_index),
                                            padding_value=-1,
                                            dtype=np.int64)

    # Flatten all (truncated) premise tokens into a single paragraph field.
    paragraph_tokens = [
        token for premise_tokens in premises_tokens for token in premise_tokens
    ]
    if not paragraph_tokens:
        # Nothing left after truncation; signal the caller to skip this example.
        return None
    fields['paragraph'] = TextField(paragraph_tokens, self._token_indexers)
    return Instance(fields)
def text_to_instance(self, example: Dict) -> Instance:
    """Build an ``Instance`` from an ACE-style event example containing
    ``words``, ``golden-entity-mentions`` and ``golden-event-mentions``."""
    words = example['words']
    text_field = TextField([Token(t) for t in words],
                           token_indexers=self._token_indexers)

    # AllenNLP needs prototype fields to pad empty ListFields against.
    # see: https://github.com/allenai/allennlp/issues/1391
    dummy_arg_roles_field = ListField([
        ListField([LabelField(label='a', label_namespace='arg_role_labels')])
    ])
    dummy_entity_labels_field = ListField(
        [LabelField(label='a', label_namespace='entity_labels')])
    dummy_span_list_field = ListField([SpanField(0, 0, text_field)])

    # Extract gold entity labels and spans (SpanField is end-inclusive).
    entities = example['golden-entity-mentions']
    entity_spans = [
        SpanField(span_start=ent['start'],
                  span_end=ent['end'] - 1,
                  sequence_field=text_field)
        for ent in entities
    ]
    if entities:
        entity_labels_field = ListField([
            LabelField(label=ent['entity-type'], label_namespace='entity_labels')
            for ent in entities
        ])
        entity_spans_field = ListField(entity_spans)
    else:
        entity_labels_field = dummy_entity_labels_field.empty_field()
        entity_spans_field = dummy_span_list_field.empty_field()

    # Every token starts as a non-trigger; events overwrite below.
    triggers = [NEGATIVE_TRIGGER_LABEL] * len(words)
    events = example['golden-event-mentions']

    # arg_roles[token][entity] starts as the negative label for every pair,
    # so each entity acts as a potential negative example for each trigger.
    arg_roles = None
    if entity_spans:
        arg_roles = [[NEGATIVE_ARGUMENT_LABEL] * len(entity_spans)
                     for _ in range(len(words))]

    for event in events:
        event_type = event['event_type']
        span = event['trigger']
        start, end = span['start'], span['end']
        # IOB2: first trigger token gets B-, the rest get I-.
        for token_idx in range(start, end):
            prefix = 'B-' if token_idx == start else 'I-'
            triggers[token_idx] = prefix + event_type

        if arg_roles:
            for argument in event['arguments']:
                # Match the argument back to its entity mention by exact
                # span and type; raises StopIteration if no entity matches.
                entity_idx = next(
                    i for i, ent in enumerate(entities)
                    if ent['start'] == argument['start']
                    and ent['end'] == argument['end']
                    and ent['entity-type'] == argument['entity-type'])
                for token_idx in range(start, end):
                    arg_roles[token_idx][entity_idx] = argument['role']

    if arg_roles:
        arg_roles_field = ListField([
            ListField([LabelField(label=role,
                                  label_namespace='arg_role_labels')
                       for role in token_roles])
            for token_roles in arg_roles
        ])
    else:
        arg_roles_field = dummy_arg_roles_field.empty_field()

    fields = {
        'metadata': MetadataField({"words": example['words']}),
        'tokens': text_field,
        'entity_labels': entity_labels_field,
        'entity_spans': entity_spans_field,
        'triggers': SequenceLabelField(labels=triggers,
                                       sequence_field=text_field,
                                       label_namespace='trigger_labels'),
        'arg_roles': arg_roles_field,
    }
    return Instance(fields)