def test_count_vocab_items_correctly_indexes_tags(self):
    tags = ["B", "I", "O", "O", "O"]
    sequence_label_field = SequenceLabelField(tags, self.text, label_namespace="labels")

    counter = defaultdict(lambda: defaultdict(int))
    sequence_label_field.count_vocab_items(counter)

    assert counter["labels"]["B"] == 1
    assert counter["labels"]["I"] == 1
    assert counter["labels"]["O"] == 3
    assert set(counter.keys()) == {"labels"}
def test_as_tensor_produces_integer_targets(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace("B", namespace='*labels')
    vocab.add_token_to_namespace("I", namespace='*labels')
    vocab.add_token_to_namespace("O", namespace='*labels')

    tags = ["B", "I", "O", "O", "O"]
    sequence_label_field = SequenceLabelField(tags, self.text, label_namespace="*labels")
    sequence_label_field.index(vocab)

    padding_lengths = sequence_label_field.get_padding_lengths()
    tensor = sequence_label_field.as_tensor(padding_lengths).detach().cpu().numpy()
    numpy.testing.assert_array_almost_equal(tensor, numpy.array([0, 1, 2, 2, 2]))
def test_index_converts_field_correctly(self):
    vocab = Vocabulary()
    b_index = vocab.add_token_to_namespace("B", namespace='*labels')
    i_index = vocab.add_token_to_namespace("I", namespace='*labels')
    o_index = vocab.add_token_to_namespace("O", namespace='*labels')

    tags = ["B", "I", "O", "O", "O"]
    sequence_label_field = SequenceLabelField(tags, self.text, label_namespace="*labels")
    sequence_label_field.index(vocab)

    # pylint: disable=protected-access
    assert sequence_label_field._indexed_labels == [b_index, i_index, o_index, o_index, o_index]
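# A minimal end-to-end sketch of the SequenceLabelField lifecycle exercised by the
# three tests above (count_vocab_items -> index -> as_tensor). Illustrative only,
# assuming the same AllenNLP 0.x-era API as the tests; the token strings and the
# "words" namespace here are made up.
from collections import defaultdict

from allennlp.data import Token, Vocabulary
from allennlp.data.fields import SequenceLabelField, TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer

text = TextField([Token(t) for t in ["the", "big", "dog", "ran", "home"]],
                 {"words": SingleIdTokenIndexer("words")})
field = SequenceLabelField(["B", "I", "O", "O", "O"], text, label_namespace="labels")

# 1. Accumulate label counts into a nested {namespace: {label: count}} counter.
counter = defaultdict(lambda: defaultdict(int))
field.count_vocab_items(counter)

# 2. Build a vocabulary from the counter and convert the labels to integer ids.
vocab = Vocabulary(counter)
field.index(vocab)

# 3. Produce a padded integer tensor with one label id per token.
tensor = field.as_tensor(field.get_padding_lengths())
print(tensor.detach().cpu().numpy())  # one integer id per token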
class TestListField(AllenNlpTestCase):
    def setUp(self):
        self.vocab = Vocabulary()
        self.vocab.add_token_to_namespace("this", "words")
        self.vocab.add_token_to_namespace("is", "words")
        self.vocab.add_token_to_namespace("a", "words")
        self.vocab.add_token_to_namespace("sentence", 'words')
        self.vocab.add_token_to_namespace("s", 'characters')
        self.vocab.add_token_to_namespace("e", 'characters')
        self.vocab.add_token_to_namespace("n", 'characters')
        self.vocab.add_token_to_namespace("t", 'characters')
        self.vocab.add_token_to_namespace("c", 'characters')
        for label in ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k']:
            self.vocab.add_token_to_namespace(label, 'labels')

        self.word_indexer = {"words": SingleIdTokenIndexer("words")}
        self.words_and_characters_indexers = {"words": SingleIdTokenIndexer("words"),
                                              "characters": TokenCharactersIndexer("characters")}
        self.field1 = TextField([Token(t) for t in ["this", "is", "a", "sentence"]],
                                self.word_indexer)
        self.field2 = TextField([Token(t) for t in ["this", "is", "a", "different", "sentence"]],
                                self.word_indexer)
        self.field3 = TextField([Token(t) for t in ["this", "is", "another", "sentence"]],
                                self.word_indexer)
        self.empty_text_field = self.field1.empty_field()
        self.index_field = IndexField(1, self.field1)
        self.empty_index_field = self.index_field.empty_field()
        self.sequence_label_field = SequenceLabelField([1, 1, 0, 1], self.field1)
        self.empty_sequence_label_field = self.sequence_label_field.empty_field()
        super(TestListField, self).setUp()

    def test_get_padding_lengths(self):
        list_field = ListField([self.field1, self.field2, self.field3])
        list_field.index(self.vocab)
        lengths = list_field.get_padding_lengths()
        assert lengths == {"num_fields": 3, "list_words_length": 5, "list_num_tokens": 5}

    def test_list_field_can_handle_empty_text_fields(self):
        list_field = ListField([self.field1, self.field2, self.empty_text_field])
        list_field.index(self.vocab)
        tensor_dict = list_field.as_tensor(list_field.get_padding_lengths())
        numpy.testing.assert_array_equal(tensor_dict["words"].detach().cpu().numpy(),
                                         numpy.array([[2, 3, 4, 5, 0],
                                                      [2, 3, 4, 1, 5],
                                                      [0, 0, 0, 0, 0]]))

    def test_list_field_can_handle_empty_index_fields(self):
        list_field = ListField([self.index_field, self.index_field, self.empty_index_field])
        list_field.index(self.vocab)
        tensor = list_field.as_tensor(list_field.get_padding_lengths())
        numpy.testing.assert_array_equal(tensor.detach().cpu().numpy(),
                                         numpy.array([[1], [1], [-1]]))

    def test_list_field_can_handle_empty_sequence_label_fields(self):
        list_field = ListField([self.sequence_label_field,
                                self.sequence_label_field,
                                self.empty_sequence_label_field])
        list_field.index(self.vocab)
        tensor = list_field.as_tensor(list_field.get_padding_lengths())
        numpy.testing.assert_array_equal(tensor.detach().cpu().numpy(),
                                         numpy.array([[1, 1, 0, 1],
                                                      [1, 1, 0, 1],
                                                      [0, 0, 0, 0]]))

    def test_all_fields_padded_to_max_length(self):
        list_field = ListField([self.field1, self.field2, self.field3])
        list_field.index(self.vocab)
        tensor_dict = list_field.as_tensor(list_field.get_padding_lengths())
        numpy.testing.assert_array_almost_equal(tensor_dict["words"][0].detach().cpu().numpy(),
                                                numpy.array([2, 3, 4, 5, 0]))
        numpy.testing.assert_array_almost_equal(tensor_dict["words"][1].detach().cpu().numpy(),
                                                numpy.array([2, 3, 4, 1, 5]))
        numpy.testing.assert_array_almost_equal(tensor_dict["words"][2].detach().cpu().numpy(),
                                                numpy.array([2, 3, 1, 5, 0]))

    def test_nested_list_fields_are_padded_correctly(self):
        nested_field1 = ListField([LabelField(c) for c in ['a', 'b', 'c', 'd', 'e']])
        nested_field2 = ListField([LabelField(c) for c in ['f', 'g', 'h', 'i', 'j', 'k']])
        list_field = ListField([nested_field1.empty_field(), nested_field1, nested_field2])
        list_field.index(self.vocab)
        padding_lengths = list_field.get_padding_lengths()
        assert padding_lengths == {'num_fields': 3, 'list_num_fields': 6}
        tensor = list_field.as_tensor(padding_lengths).detach().cpu().numpy()
        numpy.testing.assert_almost_equal(tensor, [[-1, -1, -1, -1, -1, -1],
                                                   [0, 1, 2, 3, 4, -1],
                                                   [5, 6, 7, 8, 9, 10]])

    def test_fields_can_pad_to_greater_than_max_length(self):
        list_field = ListField([self.field1, self.field2, self.field3])
        list_field.index(self.vocab)
        padding_lengths = list_field.get_padding_lengths()
        padding_lengths["list_words_length"] = 7
        padding_lengths["num_fields"] = 5
        tensor_dict = list_field.as_tensor(padding_lengths)
        numpy.testing.assert_array_almost_equal(tensor_dict["words"][0].detach().cpu().numpy(),
                                                numpy.array([2, 3, 4, 5, 0, 0, 0]))
        numpy.testing.assert_array_almost_equal(tensor_dict["words"][1].detach().cpu().numpy(),
                                                numpy.array([2, 3, 4, 1, 5, 0, 0]))
        numpy.testing.assert_array_almost_equal(tensor_dict["words"][2].detach().cpu().numpy(),
                                                numpy.array([2, 3, 1, 5, 0, 0, 0]))
        numpy.testing.assert_array_almost_equal(tensor_dict["words"][3].detach().cpu().numpy(),
                                                numpy.array([0, 0, 0, 0, 0, 0, 0]))
        numpy.testing.assert_array_almost_equal(tensor_dict["words"][4].detach().cpu().numpy(),
                                                numpy.array([0, 0, 0, 0, 0, 0, 0]))

    def test_as_tensor_can_handle_multiple_token_indexers(self):
        # pylint: disable=protected-access
        self.field1._token_indexers = self.words_and_characters_indexers
        self.field2._token_indexers = self.words_and_characters_indexers
        self.field3._token_indexers = self.words_and_characters_indexers

        list_field = ListField([self.field1, self.field2, self.field3])
        list_field.index(self.vocab)
        padding_lengths = list_field.get_padding_lengths()
        tensor_dict = list_field.as_tensor(padding_lengths)
        words = tensor_dict["words"].detach().cpu().numpy()
        characters = tensor_dict["characters"].detach().cpu().numpy()
        numpy.testing.assert_array_almost_equal(words, numpy.array([[2, 3, 4, 5, 0],
                                                                    [2, 3, 4, 1, 5],
                                                                    [2, 3, 1, 5, 0]]))
        numpy.testing.assert_array_almost_equal(characters[0],
                                                numpy.array([[5, 1, 1, 2, 0, 0, 0, 0, 0],
                                                             [1, 2, 0, 0, 0, 0, 0, 0, 0],
                                                             [1, 0, 0, 0, 0, 0, 0, 0, 0],
                                                             [2, 3, 4, 5, 3, 4, 6, 3, 0],
                                                             [0, 0, 0, 0, 0, 0, 0, 0, 0]]))
        numpy.testing.assert_array_almost_equal(characters[1],
                                                numpy.array([[5, 1, 1, 2, 0, 0, 0, 0, 0],
                                                             [1, 2, 0, 0, 0, 0, 0, 0, 0],
                                                             [1, 0, 0, 0, 0, 0, 0, 0, 0],
                                                             [1, 1, 1, 1, 3, 1, 3, 4, 5],
                                                             [2, 3, 4, 5, 3, 4, 6, 3, 0]]))
        numpy.testing.assert_array_almost_equal(characters[2],
                                                numpy.array([[5, 1, 1, 2, 0, 0, 0, 0, 0],
                                                             [1, 2, 0, 0, 0, 0, 0, 0, 0],
                                                             [1, 4, 1, 5, 1, 3, 1, 0, 0],
                                                             [2, 3, 4, 5, 3, 4, 6, 3, 0],
                                                             [0, 0, 0, 0, 0, 0, 0, 0, 0]]))

    def test_as_tensor_can_handle_multiple_token_indexers_and_empty_fields(self):
        # pylint: disable=protected-access
        self.field1._token_indexers = self.words_and_characters_indexers
        self.field2._token_indexers = self.words_and_characters_indexers
        self.field3._token_indexers = self.words_and_characters_indexers

        list_field = ListField([self.field1.empty_field(), self.field1, self.field2])
        list_field.index(self.vocab)
        padding_lengths = list_field.get_padding_lengths()
        tensor_dict = list_field.as_tensor(padding_lengths)
        words = tensor_dict["words"].detach().cpu().numpy()
        characters = tensor_dict["characters"].detach().cpu().numpy()
        numpy.testing.assert_array_almost_equal(words, numpy.array([[0, 0, 0, 0, 0],
                                                                    [2, 3, 4, 5, 0],
                                                                    [2, 3, 4, 1, 5]]))
        numpy.testing.assert_array_almost_equal(characters[0], numpy.zeros([5, 9]))
        numpy.testing.assert_array_almost_equal(characters[1],
                                                numpy.array([[5, 1, 1, 2, 0, 0, 0, 0, 0],
                                                             [1, 2, 0, 0, 0, 0, 0, 0, 0],
                                                             [1, 0, 0, 0, 0, 0, 0, 0, 0],
                                                             [2, 3, 4, 5, 3, 4, 6, 3, 0],
                                                             [0, 0, 0, 0, 0, 0, 0, 0, 0]]))
        numpy.testing.assert_array_almost_equal(characters[2],
                                                numpy.array([[5, 1, 1, 2, 0, 0, 0, 0, 0],
                                                             [1, 2, 0, 0, 0, 0, 0, 0, 0],
                                                             [1, 0, 0, 0, 0, 0, 0, 0, 0],
                                                             [1, 1, 1, 1, 3, 1, 3, 4, 5],
                                                             [2, 3, 4, 5, 3, 4, 6, 3, 0]]))

    def test_printing_doesnt_crash(self):
        list_field = ListField([self.field1, self.field2])
        print(list_field)

    def test_sequence_methods(self):
        list_field = ListField([self.field1, self.field2, self.field3])
        assert len(list_field) == 3
        assert list_field[1] == self.field2
        assert [f for f in list_field] == [self.field1, self.field2, self.field3]
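# A short sketch of the padding-length naming convention the tests above rely on:
# a ListField reports its own "num_fields" and re-exports each child's keys with a
# "list_" prefix, sized by the longest child. Illustrative only, assuming the same
# 0.x-era AllenNLP API as the tests; the tokens and namespace are made up.
from allennlp.data import Token, Vocabulary
from allennlp.data.fields import ListField, TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer

vocab = Vocabulary()
for word in ["a", "b", "c"]:
    vocab.add_token_to_namespace(word, "words")

indexers = {"words": SingleIdTokenIndexer("words")}
short = TextField([Token(t) for t in ["a", "b"]], indexers)
longer = TextField([Token(t) for t in ["a", "b", "c"]], indexers)

list_field = ListField([short, longer])
list_field.index(vocab)
print(list_field.get_padding_lengths())
# e.g. {'num_fields': 2, 'list_words_length': 3, 'list_num_tokens': 3}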
def text_to_instance(self,  # type: ignore
                     tokens: List[str],
                     upos_tags: List[str] = None,
                     lemmas: List[str] = None,
                     streusle_lextags: List[str] = None) -> Instance:
    """
    We take `pre-tokenized` input here, because we don't have a tokenizer in this class.

    Parameters
    ----------
    tokens : ``List[str]``, required.
        The tokens in a given sentence.
    upos_tags : ``List[str]``, optional, (default = None).
        The UPOS tags for the tokens in a given sentence. If None, or if
        self._use_predicted_upos is True, we use StanfordNLP to predict them
        (ignoring any provided here).
    lemmas : ``List[str]``, optional, (default = None).
        The lemmas for the tokens in a given sentence. If None, or if
        self._use_predicted_lemmas is True, we use StanfordNLP to predict them
        (ignoring any provided here).
    streusle_lextags : ``List[str]``, optional, (default = None).
        The STREUSLE lextags associated with each token.

    Returns
    -------
    An ``Instance`` containing the following fields:
        tokens : ``TextField``
            The tokens in the sentence.
        tags : ``SequenceLabelField``
            The tags corresponding to the ``tag_label`` constructor argument.
    """
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}

    roberta_inputs = convert_tokens_to_roberta_inputs(
        tokens=tokens,
        tokenizer=self.tokenizer,
        max_seq_length=self.max_seq_length)
    metadata = {
        "tokens": tokens,
        "token_indices_to_wordpiece_indices": roberta_inputs["token_indices_to_wordpiece_indices"]
    }
    fields["token_indices_to_wordpiece_indices"] = SequentialArrayField(
        np.array(roberta_inputs["token_indices_to_wordpiece_indices"], dtype="int64"),
        "int64",
        padding_value=-1)
    fields["input_ids"] = SequentialArrayField(
        np.array(roberta_inputs["input_ids"], dtype="int64"), "int64")
    fields["input_mask"] = SequentialArrayField(
        np.array(roberta_inputs["input_mask"], dtype="int64"), "int64")

    if self._use_predicted_upos or upos_tags is None:
        if self._upos_predictor is None:
            # Initialize the UPOS predictor.
            self._upos_predictor = stanfordnlp.Pipeline(
                processors="tokenize,pos", tokenize_pretokenized=True)
        doc = self._upos_predictor([tokens])
        upos_tags = [word.upos for sent in doc.sentences for word in sent.words]
        # Check that the number of UPOS tags equals the number of tokens.
        assert len(upos_tags) == len(tokens)
    metadata["upos_tags"] = upos_tags

    if self._use_predicted_lemmas or lemmas is None:
        if self._lemma_predictor is None:
            # Initialize the lemma predictor.
            self._lemma_predictor = stanfordnlp.Pipeline(
                processors="tokenize,lemma", tokenize_pretokenized=True)
        doc = self._lemma_predictor([tokens])
        lemmas = [word.lemma for sent in doc.sentences for word in sent.words]
        # Check that the number of lemmas equals the number of tokens.
        assert len(lemmas) == len(tokens)
    metadata["lemmas"] = lemmas

    fields["metadata"] = MetadataField(metadata)

    # Add the tag labels to the instance.
    if streusle_lextags is not None:
        fields['tags'] = SequenceLabelField(
            streusle_lextags,
            fields["token_indices_to_wordpiece_indices"],
            self.label_namespace)
    return Instance(fields)
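# The reader above lazily builds stanfordnlp pipelines over pre-tokenized input.
# A minimal standalone sketch of that prediction step, assuming the stanfordnlp
# package with its models already downloaded; the example sentence is made up.
import stanfordnlp

pipeline = stanfordnlp.Pipeline(processors="tokenize,pos", tokenize_pretokenized=True)
doc = pipeline([["This", "is", "a", "sentence"]])  # one pre-tokenized sentence
upos_tags = [word.upos for sent in doc.sentences for word in sent.words]
assert len(upos_tags) == 4  # one tag per input token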
def text_to_instance(self, example: Dict) -> Instance:
    words = example['words']
    text_field = TextField([Token(t) for t in words],
                           token_indexers=self._token_indexers)

    # These are required by allennlp for empty list fields
    # see: https://github.com/allenai/allennlp/issues/1391
    dummy_arg_roles_field = ListField([
        ListField([LabelField(label='a', label_namespace='arg_role_labels')])
    ])
    dummy_entity_labels_field = ListField(
        [LabelField(label='a', label_namespace='entity_labels')])
    dummy_span_list_field = ListField([SpanField(0, 0, text_field)])

    # Extract entities
    entity_labels = []
    entity_spans = []
    entities = example['golden-entity-mentions']
    if len(entities) > 0:
        for entity in entities:
            entity_labels.append(LabelField(label=entity['entity-type'],
                                            label_namespace='entity_labels'))
            entity_spans.append(SpanField(span_start=entity['start'],
                                          span_end=entity['end'] - 1,
                                          sequence_field=text_field))
        entity_labels_field = ListField(entity_labels)
        entity_spans_field = ListField(entity_spans)
    else:
        entity_labels_field = dummy_entity_labels_field.empty_field()
        entity_spans_field = dummy_span_list_field.empty_field()

    triggers = [NEGATIVE_TRIGGER_LABEL] * len(words)
    events = example['golden-event-mentions']
    if len(entity_spans) > 0:
        arg_roles = [[NEGATIVE_ARGUMENT_LABEL for _ in range(len(entity_spans))]
                     for _ in range(len(words))]
    else:
        arg_roles = None

    for event in events:
        trigger = event['trigger']
        trigger_start = trigger['start']
        trigger_end = trigger['end']
        for idx in range(trigger_start, trigger_end):
            label = event['event_type']
            # Encode triggers with the IOB2 encoding scheme
            if idx == trigger['start']:
                triggers[idx] = 'B-' + label
            else:
                triggers[idx] = 'I-' + label

        if arg_roles:
            # Every entity is a potential negative example for event arguments
            for argument in event['arguments']:
                entity_idx = next(
                    idx for idx, entity in enumerate(entities)
                    if entity['start'] == argument['start']
                    and entity['end'] == argument['end']
                    and entity['entity-type'] == argument['entity-type'])
                for trigger_idx in range(trigger_start, trigger_end):
                    arg_roles[trigger_idx][entity_idx] = argument['role']

    if arg_roles:
        arg_roles_field = ListField([
            ListField([LabelField(label=label, label_namespace='arg_role_labels')
                       for label in token_role_labels])
            for token_role_labels in arg_roles
        ])
    else:
        arg_roles_field = dummy_arg_roles_field.empty_field()

    fields = {
        'metadata': MetadataField({"words": example['words']}),
        'tokens': text_field,
        'entity_labels': entity_labels_field,
        'entity_spans': entity_spans_field,
        'triggers': SequenceLabelField(labels=triggers,
                                       sequence_field=text_field,
                                       label_namespace='trigger_labels'),
        'arg_roles': arg_roles_field,
    }
    return Instance(fields)
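# The dummy fields above exist only so that .empty_field() can produce a correctly
# typed empty ListField when an example has no entities: AllenNLP cannot infer the
# element type of an empty list (see the linked issue #1391). A condensed sketch of
# the pattern, with a made-up namespace:
from allennlp.data.fields import LabelField, ListField

dummy_labels = ListField([LabelField(label='a', label_namespace='entity_labels')])
empty_labels = dummy_labels.empty_field()  # batches/pads like a real entity_labels list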
def featurize(
    self,
    text: Union[str, List[str]],
    entities: Optional[List[dict]] = None,
    tags: Optional[Union[List[str], List[int]]] = None,
) -> Instance:
    """
    Parameters
    ----------
    text
        Either a simple str, or a list of str that is treated as a list of
        pretokenized tokens.
    entities
        A list of span labels. Span labels are dictionaries that contain:

        - 'start': int, char index of the start of the span
        - 'end': int, char index of the end of the span (exclusive)
        - 'label': str, label of the span

        They are used with the `spacy.gold.biluo_tags_from_offsets` method.
    tags
        A list of tags in the BIOUL or BIO format.
    """
    if isinstance(text, str):
        doc = self.backbone.tokenizer.nlp(text)
        tokens = [spacy_to_allennlp_token(token) for token in doc]
        tags = (tags_from_offsets(doc, entities, self._label_encoding)
                if entities is not None else [])
        # Discard misaligned examples for now.
        if "-" in tags:
            raise FeaturizeError(
                f"Could not align spans with tokens for following example: '{text}' {entities}"
            )
    # text is already pre-tokenized
    else:
        tokens = [Token(t) for t in text]

    instance = self.backbone.featurizer(tokens,
                                        to_field="text",
                                        tokenize=False,
                                        aggregate=True)

    if self.training:
        try:
            instance.add_field(
                "tags",
                SequenceLabelField(
                    tags,
                    sequence_field=cast(TextField, instance["text"]),
                    label_namespace=vocabulary.LABELS_NAMESPACE,
                ),
            )
        except Exception as exception:
            raise FeaturizeError(
                f"Could not create SequenceLabelField for {(tokens, tags)}"
            ) from exception

    instance.add_field("raw_text", MetadataField(text))
    return instance
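# featurize() delegates span-to-tag alignment to spaCy's offset utilities; spans
# that cannot be aligned to token boundaries come back as "-" and are rejected.
# A minimal sketch of the underlying call, assuming spaCy 2.x (where the helper
# lives in spacy.gold); the text and character offsets are made up.
import spacy
from spacy.gold import biluo_tags_from_offsets

nlp = spacy.blank("en")
doc = nlp("AllenNLP lives in Seattle")
entities = [(18, 25, "LOC")]  # (start_char, end_char_exclusive, label)
tags = biluo_tags_from_offsets(doc, entities)
print(tags)  # ['O', 'O', 'O', 'U-LOC']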
def text_to_instance(self,  # type: ignore
                     question: str,
                     logical_forms: List[str] = None,
                     additional_metadata: Dict[str, Any] = None,
                     world_extractions: Dict[str, Union[str, List[str]]] = None,
                     entity_literals: Dict[str, Union[str, List[str]]] = None,
                     tokenized_question: List[Token] = None,
                     debug_counter: int = None,
                     qr_spec_override: List[Dict[str, int]] = None,
                     dynamic_entities_override: Dict[str, str] = None) -> Instance:
    # pylint: disable=arguments-differ
    tokenized_question = tokenized_question or self._tokenizer.tokenize(question.lower())
    additional_metadata = additional_metadata or dict()
    additional_metadata['question_tokens'] = [token.text for token in tokenized_question]
    if world_extractions is not None:
        additional_metadata['world_extractions'] = world_extractions
    question_field = TextField(tokenized_question, self._question_token_indexers)

    if qr_spec_override is not None or dynamic_entities_override is not None:
        # Dynamically specify theory and/or entities
        dynamic_entities = dynamic_entities_override or self._dynamic_entities
        neighbors: Dict[str, List[str]] = {key: [] for key in dynamic_entities.keys()}
        knowledge_graph = KnowledgeGraph(entities=set(dynamic_entities.keys()),
                                         neighbors=neighbors,
                                         entity_text=dynamic_entities)
        world = QuarelWorld(knowledge_graph,
                            self._lf_syntax,
                            qr_coeff_sets=qr_spec_override)
    else:
        knowledge_graph = self._knowledge_graph
        world = self._world

    table_field = KnowledgeGraphField(knowledge_graph,
                                      tokenized_question,
                                      self._entity_token_indexers,
                                      tokenizer=self._tokenizer)

    if self._tagger_only:
        fields: Dict[str, Field] = {'tokens': question_field}
        if entity_literals is not None:
            entity_tags = self._get_entity_tags(self._all_entities,
                                                table_field,
                                                entity_literals,
                                                tokenized_question)
            if debug_counter > 0:
                logger.info(f'raw entity tags = {entity_tags}')
            entity_tags_bio = self._convert_tags_bio(entity_tags)
            fields['tags'] = SequenceLabelField(entity_tags_bio, question_field)
            additional_metadata['tags_gold'] = entity_tags_bio
        additional_metadata['words'] = [x.text for x in tokenized_question]
        fields['metadata'] = MetadataField(additional_metadata)
        return Instance(fields)

    world_field = MetadataField(world)

    production_rule_fields: List[Field] = []
    for production_rule in world.all_possible_actions():
        _, rule_right_side = production_rule.split(' -> ')
        is_global_rule = not world.is_table_entity(rule_right_side)
        field = ProductionRuleField(production_rule, is_global_rule)
        production_rule_fields.append(field)
    action_field = ListField(production_rule_fields)

    fields = {'question': question_field,
              'table': table_field,
              'world': world_field,
              'actions': action_field}

    if self._denotation_only:
        denotation_field = LabelField(additional_metadata['answer_index'], skip_indexing=True)
        fields['denotation_target'] = denotation_field

    if self._entity_bits_mode is not None and world_extractions is not None:
        entity_bits = self._get_entity_tags(['world1', 'world2'],
                                            table_field,
                                            world_extractions,
                                            tokenized_question)
        if self._entity_bits_mode == "simple":
            entity_bits_v = [[[0, 0], [1, 0], [0, 1]][tag] for tag in entity_bits]
        elif self._entity_bits_mode == "simple_collapsed":
            entity_bits_v = [[[0], [1], [1]][tag] for tag in entity_bits]
        elif self._entity_bits_mode == "simple3":
            entity_bits_v = [[[1, 0, 0], [0, 1, 0], [0, 0, 1]][tag] for tag in entity_bits]
        entity_bits_field = ArrayField(np.array(entity_bits_v))
        fields['entity_bits'] = entity_bits_field

    if logical_forms:
        action_map = {action.rule: i
                      for i, action in enumerate(action_field.field_list)}  # type: ignore
        action_sequence_fields: List[Field] = []
        for logical_form in logical_forms:
            expression = world.parse_logical_form(logical_form)
            action_sequence = world.get_action_sequence(expression)
            try:
                index_fields: List[Field] = []
                for production_rule in action_sequence:
                    index_fields.append(IndexField(action_map[production_rule], action_field))
                action_sequence_fields.append(ListField(index_fields))
            except KeyError as error:
                logger.info(f'Missing production rule: {error.args}, skipping logical form')
                logger.info(f'Question was: {question}')
                logger.info(f'Logical form was: {logical_form}')
                continue
        fields['target_action_sequences'] = ListField(action_sequence_fields)

    fields['metadata'] = MetadataField(additional_metadata or {})
    return Instance(fields)
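# The target supervision above is encoded as indices into the 'actions' ListField:
# each production rule of a parsed action sequence becomes an IndexField pointing
# at that rule's position in action_field. A condensed sketch of the mapping step,
# with made-up grammar rules:
action_rules = ["S -> A", "A -> B", "B -> 'x'"]              # from all_possible_actions()
action_map = {rule: i for i, rule in enumerate(action_rules)}
action_sequence = ["S -> A", "A -> B", "B -> 'x'"]           # from get_action_sequence()
indices = [action_map[rule] for rule in action_sequence]
print(indices)  # [0, 1, 2] -- one IndexField each in the real reader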
def text_to_instance(self,  # type: ignore
                     sentence: List[Token],
                     gold_clusters: Optional[List[List[Tuple[int, int]]]] = None) -> Instance:
    # pylint: disable=arguments-differ
    """
    Parameters
    ----------
    sentence : ``List[Token]``, required.
        The already tokenised sentence to analyse.
    gold_clusters : ``Optional[List[List[Tuple[int, int]]]]``, optional (default = None)
        A list of all clusters in the sentence, represented as word spans. Each cluster
        contains some number of spans, which can be nested and overlap, but will never
        exactly match between clusters.

    Returns
    -------
    An ``Instance`` containing the following ``Fields``:
        text : ``TextField``
            The text of the full sentence.
        spans : ``ListField[SpanField]``
            A ListField containing the spans represented as ``SpanFields``
            with respect to the sentence text.
        span_labels : ``SequenceLabelField``, optional
            The id of the cluster which each possible span belongs to, or -1 if it does
            not belong to a cluster. As these labels have variable length (it depends on
            how many spans we are considering), we represent this as a
            ``SequenceLabelField`` with respect to the ``spans`` ``ListField``.
    """
    metadata: Dict[str, Any] = {"original_text": sentence}
    if gold_clusters is not None:
        metadata["clusters"] = gold_clusters

    text_field = TextField(sentence, self._token_indexers)

    cluster_dict = {}
    if gold_clusters is not None:
        for cluster_id, cluster in enumerate(gold_clusters):
            for mention in cluster:
                cluster_dict[tuple(mention)] = cluster_id

    spans: List[Field] = []
    span_labels: Optional[List[int]] = [] if gold_clusters is not None else None

    for start, end in enumerate_spans(sentence, max_span_width=self._max_span_width):
        if span_labels is not None:
            if (start, end) in cluster_dict:
                span_labels.append(cluster_dict[(start, end)])
            else:
                span_labels.append(-1)
        spans.append(SpanField(start, end, text_field))

    span_field = ListField(spans)
    metadata_field = MetadataField(metadata)

    fields: Dict[str, Field] = {"text": text_field,
                                "spans": span_field,
                                "metadata": metadata_field}
    if span_labels is not None:
        fields["span_labels"] = SequenceLabelField(span_labels, span_field)

    return Instance(fields)
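# The candidate spans above come from enumerate_spans, which yields every
# (start, end) token-index pair (inclusive) up to max_span_width. A minimal
# sketch, assuming the AllenNLP 0.x location of the helper; the tokens are made up.
from allennlp.data import Token
from allennlp.data.dataset_readers.dataset_utils import enumerate_spans

tokens = [Token(t) for t in ["the", "quick", "fox"]]
print(enumerate_spans(tokens, max_span_width=2))
# -> [(0, 0), (0, 1), (1, 1), (1, 2), (2, 2)]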
def text_to_instance(self, text: str,
                     targets: Optional[List[str]] = None,
                     target_sentiments: Optional[List[Union[str, int]]] = None,
                     spans: Optional[List[List[int]]] = None,
                     categories: Optional[List[str]] = None,
                     category_sentiments: Optional[List[Union[str, int]]] = None,
                     **kwargs) -> Instance:
    '''
    The original text and text tokens, as well as the targets and target
    tokens, are stored in the MetadataField.

    :NOTE: At least targets and/or categories must be present.
    :NOTE: The left and right contexts returned in the instance are a List of
           a List of tokens, one list per target.

    :param text: The text that contains the target(s) and/or categories.
    :param targets: The targets that are within the text.
    :param target_sentiments: The sentiment of the targets. To be used when
                              training the classifier.
    :param spans: The spans that represent the character offsets for each of
                  the targets given in the targets list.
    :param categories: The categories that are within the text.
    :param category_sentiments: The sentiment of the categories.
    :returns: An Instance object with all of the above encoded for a
              PyTorch model.
    :raises ValueError: If targets and categories are both None.
    :raises ValueError: If `self._target_sequences` is True and the passed
                        `spans` argument is None.
    :raises ValueError: If `self._left_right_contexts` is True and the passed
                        `spans` argument is None.
    '''
    if targets is None and categories is None:
        raise ValueError('Either targets or categories must be given if you '
                         'want to predict the sentiment of a target '
                         'or a category')

    instance_fields: Dict[str, Field] = {}

    # Metadata field
    metadata_dict = {}

    if targets is not None:
        # Need to change this so that it takes into account the case where
        # the positions are True but not the target sequences.
        if self._target_sequences or self._position_embeddings or self._position_weights:
            if spans is None:
                raise ValueError('To create target sequences requires `spans`')
            spans = [Span(span[0], span[1]) for span in spans]
            target_text_object = TargetText(text=text, spans=spans,
                                            targets=targets, text_id='anything')
            target_text_object.force_targets()
            text = target_text_object['text']
            allen_tokens = self._tokenizer.tokenize(text)
            tokens = [x.text for x in allen_tokens]
            target_text_object['tokenized_text'] = tokens
            target_text_object.sequence_labels(per_target=True)
            target_sequences = target_text_object['sequence_labels']
            # Need to add the target sequences to the instances
            in_label = {'B', 'I'}
            number_targets = len(targets)
            all_target_tokens: List[List[Token]] = [[] for _ in range(number_targets)]
            target_sequence_fields = []
            target_indicators: List[List[int]] = []
            for target_index in range(number_targets):
                one_values = []
                target_ones = [0] * len(allen_tokens)
                for token_index, token in enumerate(allen_tokens):
                    target_sequence_value = target_sequences[target_index][token_index]
                    in_target = 1 if target_sequence_value in in_label else 0
                    if in_target:
                        all_target_tokens[target_index].append(allen_tokens[token_index])
                        one_value_list = [0] * len(allen_tokens)
                        one_value_list[token_index] = 1
                        one_values.append(one_value_list)
                        target_ones[token_index] = 1
                one_values = np.array(one_values)
                target_sequence_fields.append(ArrayField(one_values, dtype=np.int32))
                target_indicators.append(target_ones)
            if self._position_embeddings:
                target_distances = self._target_indicators_to_distances(
                    target_indicators, max_distance=self._max_position_distance,
                    as_string=True)
                target_text_distances = []
                for target_distance in target_distances:
                    token_distances = [Token(distance) for distance in target_distance]
                    token_distances = TextField(token_distances, self._position_indexers)
                    target_text_distances.append(token_distances)
                instance_fields['position_embeddings'] = ListField(target_text_distances)
            if self._position_weights:
                target_distances = self._target_indicators_to_distances(
                    target_indicators, max_distance=self._max_position_distance,
                    as_string=False)
                target_distances = np.array(target_distances)
                instance_fields['position_weights'] = ArrayField(target_distances,
                                                                 dtype=np.int32)
            if self._target_sequences:
                instance_fields['target_sequences'] = ListField(target_sequence_fields)
            instance_fields['tokens'] = TextField(allen_tokens, self._token_indexers)
            metadata_dict['text words'] = tokens
            metadata_dict['text'] = text
            # Update the target variable, as the targets could have changed
            # due to the force_targets function.
            targets = target_text_object['targets']
        else:
            all_target_tokens = [self._tokenizer.tokenize(target)
                                 for target in targets]
        target_fields = [TextField(target_tokens, self._token_indexers)
                         for target_tokens in all_target_tokens]
        target_fields = ListField(target_fields)
        instance_fields['targets'] = target_fields
        # Add the targets and the tokenised targets to the metadata
        metadata_dict['targets'] = [target for target in targets]
        metadata_dict['target words'] = [[x.text for x in target_tokens]
                                         for target_tokens in all_target_tokens]

        # Target sentiment if it exists
        if target_sentiments is not None:
            target_sentiments_field = SequenceLabelField(
                target_sentiments, target_fields,
                label_namespace='target-sentiment-labels')
            instance_fields['target_sentiments'] = target_sentiments_field

    if categories is not None and self._use_categories:
        category_fields = TextField([Token(category) for category in categories],
                                    self._token_indexers)
        instance_fields['categories'] = category_fields
        # Category sentiment if it exists
        if category_sentiments is not None:
            category_sentiments_field = SequenceLabelField(
                category_sentiments, category_fields,
                label_namespace='category-sentiment-labels')
            instance_fields['category_sentiments'] = category_sentiments_field
        # Add the categories to the metadata
        metadata_dict['categories'] = [category for category in categories]

    if 'tokens' not in instance_fields:
        tokens = self._tokenizer.tokenize(text)
        instance_fields['tokens'] = TextField(tokens, self._token_indexers)
        metadata_dict['text'] = text
        metadata_dict['text words'] = [x.text for x in tokens]

    # If required, process the left and right contexts
    left_contexts = None
    right_contexts = None
    if self._left_right_contexts:
        if spans is None:
            raise ValueError('To create left, right, target contexts requires'
                             ' the `spans` of the targets which is None')
        spans = [Span(span[0], span[1]) for span in spans]
        target_text_object = TargetText(text=text, spans=spans,
                                        targets=targets, text_id='anything')
        # Left, right, and target contexts for each target in the text
        left_right_targets = target_text_object.left_right_target_contexts(
            incl_target=self._incl_target)
        left_contexts: List[str] = []
        right_contexts: List[str] = []
        for left_right_target in left_right_targets:
            left, right, _ = left_right_target
            left_contexts.append(left)
            if self._reverse_right_context:
                right_tokens = self._tokenizer.tokenize(right)
                reversed_right_tokens = []
                for token in reversed(right_tokens):
                    reversed_right_tokens.append(token.text)
                right = ' '.join(reversed_right_tokens)
            right_contexts.append(right)

    if left_contexts is not None:
        left_field = self._add_context_field(left_contexts)
        instance_fields["left_contexts"] = left_field
    if right_contexts is not None:
        right_field = self._add_context_field(right_contexts)
        instance_fields["right_contexts"] = right_field

    instance_fields["metadata"] = MetadataField(metadata_dict)
    return Instance(instance_fields)
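# Condensed sketch of the per-target indicator encoding built above: tokens labelled
# B/I become rows of a one-hot matrix (one row per target token), plus a flat 0/1
# indicator over the whole token sequence. The sequence labels here are made up.
import numpy as np

sequence_labels = ["O", "B", "I", "O"]  # one target covering tokens 1-2
in_label = {"B", "I"}
one_values = []
target_ones = [0] * len(sequence_labels)
for token_index, label in enumerate(sequence_labels):
    if label in in_label:
        row = [0] * len(sequence_labels)
        row[token_index] = 1
        one_values.append(row)
        target_ones[token_index] = 1
print(np.array(one_values))  # [[0 1 0 0], [0 0 1 0]]
print(target_ones)           # [0, 1, 1, 0]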
def make_reading_comprehension_instance_quac(
    question_list_tokens: List[List[Token]],
    passage_tokens: List[Token],
    token_indexers: Dict[str, TokenIndexer],
    passage_text: str,
    token_span_lists: List[List[Tuple[int, int]]] = None,
    yesno_list: List[int] = None,
    followup_list: List[int] = None,
    additional_metadata: Dict[str, Any] = None,
    num_context_answers: int = 0,
) -> Instance:
    """
    Converts a question, a passage, and an optional answer (or answers) to an ``Instance``
    for use in a reading comprehension model.

    Creates an ``Instance`` with at least these fields: ``question`` and ``passage``, both
    ``TextFields``; and ``metadata``, a ``MetadataField``. Additionally, if both
    ``answer_texts`` and ``char_span_starts`` are given, the ``Instance`` has ``span_start``
    and ``span_end`` fields, which are both ``IndexFields``.

    Parameters
    ----------
    question_list_tokens : ``List[List[Token]]``
        An already-tokenized list of questions. Each dialog may have multiple questions.
    passage_tokens : ``List[Token]``
        An already-tokenized passage that contains the answer to the given question.
    token_indexers : ``Dict[str, TokenIndexer]``
        Determines how the question and passage ``TextFields`` will be converted into
        tensors that get input to a model. See :class:`TokenIndexer`.
    passage_text : ``str``
        The original passage text. We need this so that we can recover the actual span
        from the original passage that the model predicts as the answer to the question.
        This is used in official evaluation scripts.
    token_span_lists : ``List[List[Tuple[int, int]]]``, optional
        Indices into ``passage_tokens`` to use as the answer to the question for training.
        This is a list of lists, first because there are multiple questions per dialog,
        and second because there might be several possible correct answer spans in the
        passage. Currently, we just select the last span in this list (i.e., QuAC has
        multiple annotations on the dev set; this will select the last span, which was
        given by the original annotator).
    yesno_list : ``List[int]``
        List of the affirmation bit for each question-answer pair.
    followup_list : ``List[int]``
        List of the continuation bit for each question-answer pair.
    num_context_answers : ``int``, optional
        How many previous answers to encode into the passage.
    additional_metadata : ``Dict[str, Any]``, optional
        The constructed ``metadata`` field will by default contain ``original_passage``,
        ``token_offsets``, ``question_tokens``, ``passage_tokens``, and ``answer_texts``
        keys. If you want any other metadata to be associated with each instance, you can
        pass that in here. This dictionary will get added to the ``metadata`` dictionary
        we already construct.
    """
    additional_metadata = additional_metadata or {}
    fields: Dict[str, Field] = {}
    passage_offsets = [(token.idx, token.idx + len(token.text)) for token in passage_tokens]

    # This is separate so we can reference it later with a known type.
    passage_field = TextField(passage_tokens, token_indexers)
    fields["passage"] = passage_field
    fields["question"] = ListField(
        [TextField(q_tokens, token_indexers) for q_tokens in question_list_tokens])
    metadata = {
        "original_passage": passage_text,
        "token_offsets": passage_offsets,
        "question_tokens": [[token.text for token in question_tokens]
                            for question_tokens in question_list_tokens],
        "passage_tokens": [token.text for token in passage_tokens],
    }
    p1_answer_marker_list: List[Field] = []
    p2_answer_marker_list: List[Field] = []
    p3_answer_marker_list: List[Field] = []

    def get_tag(i, i_name):
        # Generate a tag to mark a previous answer span in the passage.
        return "<{0:d}_{1:s}>".format(i, i_name)

    def mark_tag(span_start, span_end, passage_tags, prev_answer_distance):
        try:
            assert span_start >= 0
            assert span_end >= 0
        except:  # noqa
            raise ValueError(
                "Previous {0:d}th answer span should have been updated!".format(
                    prev_answer_distance))
        # Modify "tags" to mark the previous answer span.
        if span_start == span_end:
            passage_tags[prev_answer_distance][span_start] = get_tag(prev_answer_distance, "")
        else:
            passage_tags[prev_answer_distance][span_start] = get_tag(prev_answer_distance,
                                                                     "start")
            passage_tags[prev_answer_distance][span_end] = get_tag(prev_answer_distance, "end")
            for passage_index in range(span_start + 1, span_end):
                passage_tags[prev_answer_distance][passage_index] = get_tag(
                    prev_answer_distance, "in")

    if token_span_lists:
        span_start_list: List[Field] = []
        span_end_list: List[Field] = []
        p1_span_start, p1_span_end, p2_span_start = -1, -1, -1
        p2_span_end, p3_span_start, p3_span_end = -1, -1, -1
        # Loop over each question's answer spans.
        for question_index, answer_span_lists in enumerate(token_span_lists):
            span_start, span_end = answer_span_lists[-1]  # The last one is the original answer
            span_start_list.append(IndexField(span_start, passage_field))
            span_end_list.append(IndexField(span_end, passage_field))
            prev_answer_marker_lists = [
                ["O"] * len(passage_tokens),
                ["O"] * len(passage_tokens),
                ["O"] * len(passage_tokens),
                ["O"] * len(passage_tokens),
            ]
            if question_index > 0 and num_context_answers > 0:
                mark_tag(p1_span_start, p1_span_end, prev_answer_marker_lists, 1)
            if question_index > 1 and num_context_answers > 1:
                mark_tag(p2_span_start, p2_span_end, prev_answer_marker_lists, 2)
            if question_index > 2 and num_context_answers > 2:
                mark_tag(p3_span_start, p3_span_end, prev_answer_marker_lists, 3)
            p3_span_start = p2_span_start
            p3_span_end = p2_span_end
            p2_span_start = p1_span_start
            p2_span_end = p1_span_end
            p1_span_start = span_start
            p1_span_end = span_end
            if num_context_answers > 2:
                p3_answer_marker_list.append(
                    SequenceLabelField(prev_answer_marker_lists[3],
                                       passage_field,
                                       label_namespace="answer_tags"))
            if num_context_answers > 1:
                p2_answer_marker_list.append(
                    SequenceLabelField(prev_answer_marker_lists[2],
                                       passage_field,
                                       label_namespace="answer_tags"))
            if num_context_answers > 0:
                p1_answer_marker_list.append(
                    SequenceLabelField(prev_answer_marker_lists[1],
                                       passage_field,
                                       label_namespace="answer_tags"))
        fields["span_start"] = ListField(span_start_list)
        fields["span_end"] = ListField(span_end_list)
        if num_context_answers > 0:
            fields["p1_answer_marker"] = ListField(p1_answer_marker_list)
            if num_context_answers > 1:
                fields["p2_answer_marker"] = ListField(p2_answer_marker_list)
                if num_context_answers > 2:
                    fields["p3_answer_marker"] = ListField(p3_answer_marker_list)
        fields["yesno_list"] = ListField(
            [LabelField(yesno, label_namespace="yesno_labels") for yesno in yesno_list])
        fields["followup_list"] = ListField(
            [LabelField(followup, label_namespace="followup_labels")
             for followup in followup_list])

    metadata.update(additional_metadata)
    fields["metadata"] = MetadataField(metadata)
    return Instance(fields)
def text_to_instance(self,  # type: ignore
                     sentence_tokens: List[str],
                     predicates: List[int],
                     predicate_index: int,
                     constits: List[List[str]] = None,
                     parents: List[List[str]] = None) -> Instance:
    """
    We take `pre-tokenized` input here, along with a verb label. The verb label
    should be a one-hot binary vector, the same length as the tokens, indicating
    the position of the verb to find arguments for.
    """
    # pylint: disable=arguments-differ
    text_field = TextField([Token(t) for t in sentence_tokens],
                           token_indexers=self._token_indexers)
    verb_field = SequenceLabelField(predicates, text_field)
    predicate_field = IndexField(predicate_index, text_field)

    # Span-based output fields.
    span_starts: List[Field] = []
    span_ends: List[Field] = []
    span_mask: List[int] = [1 for _ in range(len(sentence_tokens) * self.max_span_width)]
    span_labels: Optional[List[str]] = [] if constits is not None else None
    parent_labels: Optional[List[str]] = [] if parents is not None else None

    for j in range(len(sentence_tokens)):
        for diff in range(self.max_span_width):
            width = diff
            if j - diff < 0:
                # This is an invalid span.
                span_mask[j * self.max_span_width + diff] = 0
                width = j
            span_starts.append(IndexField(j - width, text_field))
            span_ends.append(IndexField(j, text_field))
            if constits is not None:
                label = constits[j][diff]
                span_labels.append(label)
            if parents is not None:
                parent_labels.append(parents[j][diff])

    start_fields = ListField(span_starts)
    end_fields = ListField(span_ends)
    span_mask_fields = SequenceLabelField(span_mask, start_fields)

    fields: Dict[str, Field] = {"tokens": text_field,
                                "targets": verb_field,
                                "span_starts": start_fields,
                                "span_ends": end_fields,
                                "span_mask": span_mask_fields,
                                "target_index": predicate_field}
    if constits:
        fields['tags'] = SequenceLabelField(span_labels,
                                            start_fields,
                                            label_namespace=self.label_namespace)
        fields['parent_tags'] = SequenceLabelField(parent_labels,
                                                   start_fields,
                                                   label_namespace=self.parent_label_namespace)
    return Instance(fields)
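# Illustration of the span layout built above: slots are laid out row-major over
# (token_index j, width offset diff), and spans that would start before token 0 are
# masked out with their start clamped to 0. The sizes here are made up.
max_span_width = 3
num_tokens = 4
for j in range(num_tokens):
    for diff in range(max_span_width):
        valid = j - diff >= 0
        start = j - diff if valid else 0
        slot = j * max_span_width + diff
        print(f"slot {slot}: span ({start}, {j}), mask={int(valid)}")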