def text_to_instance(
        self,  # type: ignore
        tokens: List[str],
        lemmas: List[str] = None,
        pos_tags: List[str] = None,
        arc_indices: List[Tuple[int, int]] = None,
        arc_tags: List[str] = None,
        gold_actions: List[str] = None,
        root_id: List[int] = None,
        meta_info: List[str] = None,
        concept_label: List[int] = None,
        tokens_range: List[Tuple[int, int]] = None,
        gold_mrps: List[str] = None) -> Instance:
    # pylint: disable=arguments-differ
    """
    Build an ``Instance`` from a pre-tokenized sentence and whichever optional
    annotations are supplied.

    Parameters
    ----------
    tokens : ``List[str]``, required.
        The words of the sentence; always indexed as the ``tokens`` field and
        copied into the metadata.
    lemmas : ``List[str]``, optional.
        Indexed as ``lemmas`` only when the reader has lemma indexers.
    pos_tags : ``List[str]``, optional.
        Stored as a ``SequenceLabelField`` over ``tokens`` (namespace ``pos``).
    arc_indices, arc_tags : optional.
        Used only when *both* are given: both go into the metadata and
        ``arc_tags`` is additionally indexed as a ``TextField``.
    gold_actions : ``List[str]``, optional.
        Stored in the metadata and indexed as a ``TextField``.
    root_id, meta_info, gold_mrps : optional.
        Only the first element of each list is kept, in the metadata.
    concept_label : optional.
        Stored in the metadata and indexed as a ``TextField``.
    tokens_range : optional.
        Stored in the metadata unchanged.

    Returns
    -------
    An ``Instance`` holding the fields above plus a ``metadata`` field.
    """
    def as_text_field(items, indexers):
        # Wrap raw items in ``Token`` objects and index them.
        return TextField([Token(item) for item in items], indexers)

    sentence_field = as_text_field(tokens, self._token_indexers)
    fields: Dict[str, Field] = {"tokens": sentence_field}
    metadata = {"tokens": tokens}

    if not (lemmas is None or self._lemma_indexers is None):
        fields["lemmas"] = as_text_field(lemmas, self._lemma_indexers)

    if pos_tags is not None:
        fields["pos_tags"] = SequenceLabelField(
            pos_tags, sentence_field, label_namespace="pos")

    # Arcs are only usable when both the indices and their labels are known.
    if not (arc_indices is None or arc_tags is None):
        metadata["arc_indices"] = arc_indices
        metadata["arc_tags"] = arc_tags
        fields["arc_tags"] = as_text_field(arc_tags, self._arc_tag_indexers)

    if gold_actions is not None:
        metadata["gold_actions"] = gold_actions
        fields["gold_actions"] = as_text_field(gold_actions,
                                               self._action_indexers)

    # These arrive wrapped in a list; only the first entry is kept.
    if meta_info is not None:
        metadata["meta_info"] = meta_info[0]
    if gold_mrps is not None:
        metadata["gold_mrps"] = gold_mrps[0]

    if tokens_range is not None:
        metadata["tokens_range"] = tokens_range

    if concept_label is not None:
        metadata["concept_label"] = concept_label
        fields["concept_label"] = as_text_field(
            concept_label, self._concept_label_indexers)

    if root_id is not None:
        metadata["root_id"] = root_id[0]

    fields["metadata"] = MetadataField(metadata)
    return Instance(fields)
def text_to_instance(self,  # type: ignore
                     tokens,
                     pos_tags=None,
                     gold_tree=None):
    u"""
    We take `pre-tokenized` input here, because we don't have a tokenizer in this class.

    Parameters
    ----------
    tokens : ``List[str]``, required.
        The tokens in a given sentence.
    pos_tags ``List[str]``, optional, (default = None).
        The POS tags for the words in the sentence.
    gold_tree : ``Tree``, optional (default = None).
        The gold parse tree to create span labels from.

    Returns
    -------
    An ``Instance`` containing the following fields:
        tokens : ``TextField``
            The tokens in the sentence.
        pos_tags : ``SequenceLabelField``
            The POS tags of the words in the sentence.
            Only returned if ``use_pos_tags`` is ``True``
        spans : ``ListField[SpanField]``
            A ListField containing all possible subspans of the
            sentence.
        span_labels : ``SequenceLabelField``, optional.
            The constituency tags for each of the possible spans, with
            respect to a gold parse tree. If a span is not contained
            within the tree, a span will have a ``NO-LABEL`` label.
        metadata : ``MetadataField``
            A dict with the ``tokens``, plus ``gold_tree`` when a gold
            tree was given and ``pos_tags`` when ``use_pos_tags`` is set.

    Raises
    ------
    ConfigurationError
        If the reader was configured with ``use_pos_tags`` but no
        ``pos_tags`` were passed.
    """
    # pylint: disable=arguments-differ
    text_field = TextField([Token(x) for x in tokens],
                           token_indexers=self._token_indexers)
    fields = {u"tokens": text_field}

    if self._use_pos_tags and pos_tags is not None:
        pos_tag_field = SequenceLabelField(pos_tags, text_field,
                                           label_namespace=u"pos")
        fields[u"pos_tags"] = pos_tag_field
    elif self._use_pos_tags:
        raise ConfigurationError(
            u"use_pos_tags was set to True but no gold pos"
            u" tags were passed to the dataset reader.")

    spans = []
    gold_labels = []

    if gold_tree is not None:
        # Filled in place by the helper: (start, end) -> label.
        gold_spans = {}
        self._get_gold_spans(gold_tree, 0, gold_spans)
    else:
        gold_spans = None

    for start, end in enumerate_spans(tokens):
        spans.append(SpanField(start, end, text_field))
        if gold_spans is not None:
            # Direct dict lookup; the number of candidate spans is already
            # quadratic in sentence length, so a linear scan of the keys
            # per span would be needlessly expensive.
            gold_labels.append(gold_spans.get((start, end), u"NO-LABEL"))

    metadata = {u"tokens": tokens}
    if gold_tree:
        metadata[u"gold_tree"] = gold_tree
    if self._use_pos_tags:
        metadata[u"pos_tags"] = pos_tags
    fields[u"metadata"] = MetadataField(metadata)

    span_list_field = ListField(spans)
    fields[u"spans"] = span_list_field
    if gold_tree is not None:
        fields[u"span_labels"] = SequenceLabelField(gold_labels,
                                                    span_list_field)
    return Instance(fields)
def predictions_to_labeled_instances(
        self, instance: Instance,
        outputs: Dict[str, numpy.ndarray]) -> List[Instance]:
    """
    This function currently only handles BIOUL tags.

    Imagine an NER model predicts three named entities (each one with potentially
    multiple tokens). For each individual entity, we create a new Instance that has
    the label set to only that entity and the rest of the tokens are labeled as outside.
    We then return a list of those Instances.

    For example:

        Mary  went to Seattle to visit Microsoft Research
        U-Per  O    O   U-Loc  O   O     B-Org     L-Org

    We create three instances.

        Mary  went to Seattle to visit Microsoft Research
        U-Per  O    O    O     O   O       O         O

        Mary  went to Seattle to visit Microsoft Research
         O     O    O   U-Loc  O   O       O         O

        Mary  went to Seattle to visit Microsoft Research
         O     O    O    O     O   O     B-Org     L-Org

    A ``B-`` tag that is never closed by an ``L-`` tag (a malformed prediction)
    is treated as a span running to the end of the sentence instead of
    raising ``IndexError``.

    Returns the new instances in reverse order of entity appearance; an empty
    list when no ``U-`` or ``B-`` tag was predicted.
    """
    predicted_tags = outputs["tags"]
    num_tags = len(predicted_tags)
    predicted_spans = []

    i = 0
    while i < num_tags:
        tag = predicted_tags[i]
        # if its a U, add it to the list
        if tag[0] == "U":
            current_tags = [
                t if idx == i else "O"
                for idx, t in enumerate(predicted_tags)
            ]
            predicted_spans.append(current_tags)
        # if its a B, keep going until you hit an L.
        elif tag[0] == "B":
            begin_idx = i
            # Bounded scan: stop at the last token if no L is ever found.
            while tag[0] != "L" and i + 1 < num_tags:
                i += 1
                tag = predicted_tags[i]
            end_idx = i
            current_tags = [
                t if begin_idx <= idx <= end_idx else "O"
                for idx, t in enumerate(predicted_tags)
            ]
            predicted_spans.append(current_tags)
        i += 1

    # Creates a new instance for each contiguous tag
    instances = []
    for labels in predicted_spans:
        new_instance = deepcopy(instance)
        text_field: TextField = instance["tokens"]  # type: ignore
        new_instance.add_field("tags",
                               SequenceLabelField(labels, text_field),
                               self._model.vocab)
        instances.append(new_instance)
    # NER tags are in the opposite order as desired for the interpret UI
    instances.reverse()

    return instances
def predictions_to_labeled_instances(
        self, instance: Instance,
        outputs: Dict[str, numpy.ndarray]) -> List[Instance]:
    """
    Copy ``instance`` and attach the model's own prediction to the copy as
    label fields.

    Two output layouts are recognised:

    * ``outputs["best_span"]`` (BiDAF-style): adds ``span_start`` and
      ``span_end`` index fields over the passage.
    * ``outputs["answer"]`` (NAQANet-style): depending on ``answer_type``
      adds one of ``answer_as_counts``, ``answer_as_passage_spans``,
      ``answer_as_add_sub_expressions`` or ``answer_as_question_spans``.

    Any other output leaves the copy without additional fields.

    Returns a single-element list containing the labeled copy.
    """
    def locate_token_span(token_offsets, char_span):
        # Convert character span indices into word span indices.  Every
        # offset is inspected, so the last matching token wins; an
        # unmatched boundary stays ``None``.
        first_token = None
        last_token = None
        for position, offset in enumerate(token_offsets):
            if offset[0] == char_span[0]:
                first_token = position
            if offset[1] == char_span[1]:
                last_token = position
        return first_token, last_token

    labeled = deepcopy(instance)

    # For BiDAF
    if "best_span" in outputs:
        start_label = outputs["best_span"][0]
        end_label = outputs["best_span"][1]
        passage = labeled["passage"]
        labeled.add_field("span_start", IndexField(int(start_label), passage))
        labeled.add_field("span_end", IndexField(int(end_label), passage))
        return [labeled]

    if "answer" not in outputs:
        return [labeled]

    # For NAQANet model: one label field per answer type.
    answer = outputs["answer"]
    answer_type = answer["answer_type"]

    if answer_type == "count":
        # When the problem is a counting problem
        count_label = LabelField(int(answer["count"]), skip_indexing=True)
        labeled.add_field("answer_as_counts", ListField([count_label]))

    elif answer_type == "passage_span":
        # When the answer is in the passage
        # TODO(mattg): Currently we only handle one predicted span.
        char_span = answer["spans"][0]
        passage_offsets = labeled["metadata"].metadata["passage_token_offsets"]
        token_start, token_end = locate_token_span(passage_offsets, char_span)
        passage = labeled["passage"]
        labeled.add_field(
            "answer_as_passage_spans",
            ListField([SpanField(token_start, token_end, passage)]))

    elif answer_type == "arithmetic":
        # When the answer is an arithmetic calculation
        predicted_numbers = answer["numbers"]
        numbers_field = instance["number_indices"]
        # Each number in the passage gets a sign.  Negative signs map to the
        # class label 2; for 0 and 1 the sign is the label.
        sign_labels = [
            2 if number["sign"] == -1 else number["sign"]
            for number in predicted_numbers
        ]
        # The dataset reader adds a dummy number to handle passages with no
        # numbers; it has a label of 0 (not included).
        sign_labels.append(0)
        labeled.add_field(
            "answer_as_add_sub_expressions",
            ListField([SequenceLabelField(sign_labels, numbers_field)]))

    elif answer_type == "question_span":
        # When the answer is in the question
        char_span = answer["spans"][0]
        question_offsets = labeled["metadata"].metadata["question_token_offsets"]
        token_start, token_end = locate_token_span(question_offsets, char_span)
        question = labeled["question"]
        labeled.add_field(
            "answer_as_question_spans",
            ListField([SpanField(token_start, token_end, question)]))

    return [labeled]
def text_to_instance(
        self,  # type: ignore
        tokens: List[str],
        ccg_categories: List[str] = None,
        original_pos_tags: List[str] = None,
        modified_pos_tags: List[str] = None,
        predicate_arg_categories: List[str] = None,
) -> Instance:
    """
    Turn a pre-tokenized sentence and its per-token annotations into an
    `Instance` (this class has no tokenizer of its own).

    # Parameters

    tokens : `List[str]`, required.
        The tokens in a given sentence.
    ccg_categories : `List[str]`, optional, (default = `None`).
        The CCG categories for the words in the sentence. (e.g. N/N)
    original_pos_tags : `List[str]`, optional, (default = `None`).
        The tag assigned to the word in the Penn Treebank.
    modified_pos_tags : `List[str]`, optional, (default = `None`).
        The POS tag might have changed during the translation to CCG.
    predicate_arg_categories : `List[str]`, optional, (default = `None`).
        Encodes the word-word dependencies in the underlying predicate-
        argument structure.

    # Returns

    An `Instance` containing the following fields:
        tokens : `TextField`
            The tokens in the sentence.
        tags : `SequenceLabelField`
            The tags selected by `self.tag_label`; only present when the
            matching annotation list was passed.
        ccg_tags / original_pos_tags / modified_pos_tags / predicate_arg_tags :
        `SequenceLabelField`
            One per entry of `self.feature_labels`.

    # Raises

    ConfigurationError
        If a feature listed in `self.feature_labels` has no annotation
        list passed in.
    """
    text_field = TextField([Token(x) for x in tokens],
                           token_indexers=self._token_indexers)
    fields: Dict[str, Field] = {"tokens": text_field}

    # (label name, annotation values, feature field / namespace, error text)
    annotations = (
        ("ccg", ccg_categories, "ccg_tags",
         "Dataset reader was specified to use CCG categories as "
         "features. Pass them to text_to_instance."),
        ("original_pos", original_pos_tags, "original_pos_tags",
         "Dataset reader was specified to use original POS tags as "
         "features. Pass them to text_to_instance."),
        ("modified_pos", modified_pos_tags, "modified_pos_tags",
         "Dataset reader was specified to use modified POS tags as "
         " features. Pass them to text_to_instance."),
        ("predicate_arg", predicate_arg_categories, "predicate_arg_tags",
         "Dataset reader was specified to use predicate arg tags as "
         " features. Pass them to text_to_instance."),
    )

    # Add "feature labels" to instance
    for label_name, values, field_name, missing_message in annotations:
        if label_name not in self.feature_labels:
            continue
        if values is None:
            raise ConfigurationError(missing_message)
        # The field name doubles as the label namespace.
        fields[field_name] = SequenceLabelField(values, text_field, field_name)

    # Add "tag label" to instance
    for label_name, values, _, _ in annotations:
        if self.tag_label == label_name and values is not None:
            fields["tags"] = SequenceLabelField(values, text_field,
                                                self.label_namespace)
            break

    return Instance(fields)
def text_to_instance(
        self,  # type: ignore
        tokens: List[str],
        pos_tags: List[str] = None,
        gold_tree: Tree = None,
) -> Instance:
    """
    Build a span-classification ``Instance`` from `pre-tokenized` input
    (this class has no tokenizer of its own).

    Parameters
    ----------
    tokens : ``List[str]``, required.
        The tokens in a given sentence.
    pos_tags : ``List[str]``, optional, (default = None).
        The POS tags for the words in the sentence.
    gold_tree : ``Tree``, optional (default = None).
        The gold parse tree to create span labels from.

    Returns
    -------
    An ``Instance`` containing the following fields:
        tokens : ``TextField``
            The tokens in the sentence (after parenthesis conversion, when
            that option is enabled).
        pos_tags : ``SequenceLabelField``
            The POS tags of the words in the sentence.
            Only returned if ``use_pos_tags`` is ``True``
        spans : ``ListField[SpanField]``
            A ListField containing all possible subspans of the sentence.
        span_labels : ``SequenceLabelField``, optional.
            The constituency tags for each of the possible spans, with
            respect to a gold parse tree. Spans that are not in the tree
            are labeled ``NO-LABEL``.
        metadata : ``MetadataField``
            The tokens, plus the gold tree and POS tags when applicable.

    Raises
    ------
    ConfigurationError
        If ``use_pos_tags`` is enabled but no ``pos_tags`` were passed.
    """
    if self._convert_parentheses:
        tokens = [PTB_PARENTHESES.get(token, token) for token in tokens]

    sentence_field = TextField([Token(word) for word in tokens],
                               token_indexers=self._token_indexers)
    fields: Dict[str, Field] = {"tokens": sentence_field}

    pos_namespace = self._label_namespace_prefix + self._pos_label_namespace
    if self._use_pos_tags:
        if pos_tags is None:
            raise ConfigurationError(
                "use_pos_tags was set to True but no gold pos"
                " tags were passed to the dataset reader.")
        fields["pos_tags"] = SequenceLabelField(pos_tags, sentence_field,
                                                label_namespace=pos_namespace)

    has_gold_tree = gold_tree is not None
    labeled_spans: Dict[Tuple[int, int], str] = {}
    if has_gold_tree:
        # The helper fills ``labeled_spans`` in place.
        self._get_gold_spans(gold_tree, 0, labeled_spans)

    span_fields: List[Field] = []
    gold_labels = []
    for start, end in enumerate_spans(tokens):
        span_fields.append(SpanField(start, end, sentence_field))
        if has_gold_tree:
            gold_labels.append(labeled_spans.get((start, end), "NO-LABEL"))

    metadata = {"tokens": tokens}
    if gold_tree:
        metadata["gold_tree"] = gold_tree
    if self._use_pos_tags:
        metadata["pos_tags"] = pos_tags
    fields["metadata"] = MetadataField(metadata)

    span_list_field: ListField = ListField(span_fields)
    fields["spans"] = span_list_field
    if has_gold_tree:
        fields["span_labels"] = SequenceLabelField(
            gold_labels,
            span_list_field,
            label_namespace=self._label_namespace_prefix + "labels",
        )
    return Instance(fields)
def make_reading_comprehension_instance_quac(
        question_list_tokens: List[List[Token]],
        passage_tokens: List[Token],
        token_indexers: Dict[str, TokenIndexer],
        passage_text: str,
        token_span_lists: List[List[Tuple[int, int]]] = None,
        yesno_list: List[int] = None,
        followup_list: List[int] = None,
        additional_metadata: Dict[str, Any] = None,
        num_context_answers: int = 0) -> Instance:
    """
    Converts the questions of one dialog, a passage, and optional answers to an
    ``Instance`` for use in a reading comprehension model.

    The ``Instance`` always has ``passage`` (a ``TextField``), ``question`` (a
    ``ListField`` of ``TextFields``) and ``metadata`` (a ``MetadataField``).
    When ``token_span_lists`` is given and non-empty it additionally has:

    * ``span_start`` / ``span_end``: ``ListField`` of ``IndexFields``, one per question;
    * ``answer``: the answer tokens of every question;
    * ``dialog``: for every question, all earlier questions and answers concatenated
      (empty for the first question);
    * ``previous_answer_appended``: every question followed by the answer to the
      question before it (the first question is left as is);
    * ``p1_answer_marker`` .. ``p3_answer_marker``: passage tag sequences marking the
      1st/2nd/3rd previous answer, as many as ``num_context_answers`` asks for;
    * ``yesno_list`` and ``followup_list``: ``ListField`` of ``LabelFields``.

    Parameters
    ----------
    question_list_tokens : ``List[List[Token]]``
        An already-tokenized list of questions. Each dialog has multiple questions.
    passage_tokens : ``List[Token]``
        An already-tokenized passage that contains the answer to the given question.
        The last two passage tokens are also reused as separator tokens when
        questions and answers are concatenated.
    token_indexers : ``Dict[str, TokenIndexer]``
        Determines how the question and passage ``TextFields`` will be converted
        into tensors that get input to a model.  See :class:`TokenIndexer`.
    passage_text : ``str``
        The original passage text; stored in the metadata as ``original_passage``.
    token_span_lists : ``List[List[Tuple[int, int]]]``, optional
        Indices into ``passage_tokens`` to use as the answer to each question.
        This is a list of lists, first because there are multiple questions per
        dialog, and second because there might be several correct answer spans.
        Only the last span of each inner list is used.  Both ends of a span are
        inclusive.
    yesno_list : ``List[int]``
        List of the affirmation bit for each question answer pair.  Must be given
        whenever ``token_span_lists`` is.
    followup_list : ``List[int]``
        List of the continuation bit for each question answer pair.  Must be given
        whenever ``token_span_lists`` is.
    additional_metadata : ``Dict[str, Any]``, optional
        The constructed ``metadata`` contains ``original_passage``,
        ``token_offsets``, ``question_tokens`` and ``passage_tokens``.  Anything
        passed here is merged into that dictionary.
    num_context_answers : ``int``, optional
        How many previous answers to encode into the passage.

    Raises
    ------
    ValueError
        If a previous answer span that should be marked has a negative index.
    """
    additional_metadata = additional_metadata or {}
    fields: Dict[str, Field] = {}
    passage_offsets = [(token.idx, token.idx + len(token.text))
                       for token in passage_tokens]
    # This is separate so we can reference it later with a known type.
    passage_field = TextField(passage_tokens, token_indexers)
    fields['passage'] = passage_field
    fields['question'] = ListField([
        TextField(q_tokens, token_indexers)
        for q_tokens in question_list_tokens
    ])
    metadata = {
        'original_passage': passage_text,
        'token_offsets': passage_offsets,
        'question_tokens': [[token.text for token in question_tokens]
                            for question_tokens in question_list_tokens],
        'passage_tokens': [token.text for token in passage_tokens],
    }
    p1_answer_marker_list: List[Field] = []
    p2_answer_marker_list: List[Field] = []
    p3_answer_marker_list: List[Field] = []

    def get_tag(i, i_name):
        # Generate a tag to mark previous answer span in the passage.
        return "<{0:d}_{1:s}>".format(i, i_name)

    def mark_tag(span_start, span_end, passage_tags, prev_answer_distance):
        # Explicit check instead of ``assert`` so it survives ``python -O``.
        if span_start < 0 or span_end < 0:
            raise ValueError(
                "Previous {0:d}th answer span should have been updated!".
                format(prev_answer_distance))
        # Modify "tags" to mark previous answer span.
        tags = passage_tags[prev_answer_distance]
        if span_start == span_end:
            tags[span_start] = get_tag(prev_answer_distance, "")
        else:
            tags[span_start] = get_tag(prev_answer_distance, "start")
            tags[span_end] = get_tag(prev_answer_distance, "end")
            for passage_index in range(span_start + 1, span_end):
                tags[passage_index] = get_tag(prev_answer_distance, "in")

    if token_span_lists:
        span_start_list: List[Field] = []
        span_end_list: List[Field] = []
        answer_list_tokens = []
        p1_span_start, p1_span_end, p2_span_start = -1, -1, -1
        p2_span_end, p3_span_start, p3_span_end = -1, -1, -1
        # Looping each <<answers>>.
        for question_index, answer_span_lists in enumerate(token_span_lists):
            # Last one is the original answer
            span_start, span_end = answer_span_lists[-1]
            span_start_list.append(IndexField(span_start, passage_field))
            span_end_list.append(IndexField(span_end, passage_field))
            # ``span_end`` is inclusive (see ``mark_tag``), so the slice must
            # extend one past it or the last answer token is dropped.
            answer_list_tokens.append(passage_tokens[span_start:span_end + 1])

            # Index 0 is unused; index k holds the tags for the k-th previous answer.
            prev_answer_marker_lists = [["O"] * len(passage_tokens)
                                        for _ in range(4)]
            if question_index > 0 and num_context_answers > 0:
                mark_tag(p1_span_start, p1_span_end,
                         prev_answer_marker_lists, 1)
                if question_index > 1 and num_context_answers > 1:
                    mark_tag(p2_span_start, p2_span_end,
                             prev_answer_marker_lists, 2)
                    if question_index > 2 and num_context_answers > 2:
                        mark_tag(p3_span_start, p3_span_end,
                                 prev_answer_marker_lists, 3)

            # Shift the answer history by one question.
            p3_span_start, p3_span_end = p2_span_start, p2_span_end
            p2_span_start, p2_span_end = p1_span_start, p1_span_end
            p1_span_start, p1_span_end = span_start, span_end

            if num_context_answers > 2:
                p3_answer_marker_list.append(
                    SequenceLabelField(prev_answer_marker_lists[3],
                                       passage_field,
                                       label_namespace="answer_tags"))
            if num_context_answers > 1:
                p2_answer_marker_list.append(
                    SequenceLabelField(prev_answer_marker_lists[2],
                                       passage_field,
                                       label_namespace="answer_tags"))
            if num_context_answers > 0:
                p1_answer_marker_list.append(
                    SequenceLabelField(prev_answer_marker_lists[1],
                                       passage_field,
                                       label_namespace="answer_tags"))

        fields['span_start'] = ListField(span_start_list)
        fields['span_end'] = ListField(span_end_list)

        # Creating entire dialog field: the first question has no history,
        # every later one sees all earlier (question, answer) pairs.
        dialog_list_tokens = [[]]
        dialog_history = []
        for ques_tokens, ans_tokens in list(
                zip(question_list_tokens, answer_list_tokens))[:-1]:
            dialog_history = dialog_history + [
                passage_tokens[-2]
            ] + ques_tokens + [passage_tokens[-1]] + ans_tokens
            dialog_list_tokens.append(dialog_history)
        fields['dialog'] = ListField([
            TextField(d_tokens, token_indexers)
            for d_tokens in dialog_list_tokens
        ])
        fields['answer'] = ListField([
            TextField(a_tokens, token_indexers)
            for a_tokens in answer_list_tokens
        ])

        questions_answer_appended_list = []
        prev_ans_tokens = None
        for question_index, (q_tok, a_tok) in enumerate(
                zip(question_list_tokens, answer_list_tokens)):
            # Decide by position, not by comparing tokens: a later question
            # that repeats the first one must still get its previous answer.
            if question_index == 0:
                ques_ans_tokens = q_tok
            else:
                ques_ans_tokens = q_tok + [passage_tokens[-1]
                                           ] + prev_ans_tokens
            questions_answer_appended_list.append(ques_ans_tokens)
            prev_ans_tokens = a_tok
        fields['previous_answer_appended'] = ListField([
            TextField(tok, token_indexers)
            for tok in questions_answer_appended_list
        ])

        if num_context_answers > 0:
            fields['p1_answer_marker'] = ListField(p1_answer_marker_list)
            if num_context_answers > 1:
                fields['p2_answer_marker'] = ListField(p2_answer_marker_list)
                if num_context_answers > 2:
                    fields['p3_answer_marker'] = ListField(
                        p3_answer_marker_list)
        fields['yesno_list'] = ListField([
            LabelField(yesno, label_namespace="yesno_labels")
            for yesno in yesno_list
        ])
        fields['followup_list'] = ListField([
            LabelField(followup, label_namespace="followup_labels")
            for followup in followup_list
        ])

    metadata.update(additional_metadata)
    fields['metadata'] = MetadataField(metadata)
    return Instance(fields)
def text_to_instance(
        self,  # type: ignore
        sentence: List[Token],
        gold_clusters: Optional[List[List[Tuple[int, int]]]] = None) -> Instance:
    # pylint: disable=arguments-differ
    """
    Build an ``Instance`` holding every candidate span of ``sentence`` and,
    when gold clusters are given, the cluster id of each span.

    Parameters
    ----------
    sentence : ``List[Token]``, required.
        The already tokenised sentence to analyse.
    gold_clusters : ``Optional[List[List[Tuple[int, int]]]]``, optional (default = None)
        A list of all clusters in the sentence, represented as word spans.
        If the same span occurs in more than one cluster, the later cluster
        wins.

    Returns
    -------
    An ``Instance`` containing the following ``Fields``:
        text : ``TextField``
            The text of the full sentence.
        spans : ``ListField[SpanField]``
            All spans up to the reader's maximum span width, as
            ``SpanFields`` with respect to the sentence text.
        span_labels : ``SequenceLabelField``, optional
            The id of the cluster each span belongs to, or -1 if it belongs
            to none. A ``SequenceLabelField`` over the ``spans``
            ``ListField``; only present when ``gold_clusters`` is given.
        metadata : ``MetadataField``
            ``original_text`` and, when given, ``clusters``.
    """
    has_gold = gold_clusters is not None

    metadata: Dict[str, Any] = {"original_text": sentence}
    if has_gold:
        metadata["clusters"] = gold_clusters

    text_field = TextField(sentence, self._token_indexers)

    # Map every gold mention to the id of its cluster.
    mention_to_cluster = {}
    if has_gold:
        mention_to_cluster = {
            tuple(mention): cluster_id
            for cluster_id, cluster in enumerate(gold_clusters)
            for mention in cluster
        }

    candidate_spans: List[Field] = []
    cluster_ids: Optional[List[int]] = [] if has_gold else None
    for start, end in enumerate_spans(sentence,
                                      max_span_width=self._max_span_width):
        if cluster_ids is not None:
            cluster_ids.append(mention_to_cluster.get((start, end), -1))
        candidate_spans.append(SpanField(start, end, text_field))

    span_field = ListField(candidate_spans)
    fields: Dict[str, Field] = {
        "text": text_field,
        "spans": span_field,
        "metadata": MetadataField(metadata),
    }
    if cluster_ids is not None:
        fields["span_labels"] = SequenceLabelField(cluster_ids, span_field)

    return Instance(fields)
def test_printing_doesnt_crash(self):
    """Printing a ``SequenceLabelField`` built over ``self.text`` must not raise."""
    field = SequenceLabelField(["B", "I", "O", "O", "O"],
                               self.text,
                               label_namespace="labels")
    print(field)
def text_to_instance(self,  # type: ignore
                     tokens: List[Token],
                     ner_tags: List[str] = None) -> Instance:
    """
    We take `pre-tokenized` input here, because we don't have a tokenizer in this class.

    Parameters
    ----------
    tokens : ``List[Token]``, required.
        The tokens of the sentence.
    ner_tags : ``List[str]``, optional (default = None).
        One NER tag per token. When omitted (e.g. at prediction time) only the
        label-free fields are produced.

    Returns
    -------
    An ``Instance`` containing the following fields:
        tokens : ``TextField``
            The tokens in the sentence.
        metadata : ``MetadataField``
            ``words`` (the token texts) and ``gold_spans`` (a dict from
            ``(start, end)`` to entity type, without the ``O`` spans; empty
            when no tags were given).
        spans : ``ListField[SpanField]``
            All spans up to the reader's maximum span width.
        span_labels : ``SequenceLabelField``, only with ``ner_tags``
            The entity type of every span in ``spans``, ``O`` if it is not a
            gold span.
        gold_spans / gold_span_labels : only with ``ner_tags``
            Just the spans extracted from ``ner_tags`` and their types,
            e.g. (0,0,O), (1,1,O), (2,3,PER), (4,4,O) for 'I am Donald Trump .'
        tags : ``SequenceLabelField``, only with ``ner_tags``
            Token-level tags in the configured coding scheme, added when
            ``self.tag_label`` is ``'ner'``.

    Raises
    ------
    AssertionError
        If ``ner_tags`` is given but its length differs from ``tokens``.
    """
    sequence = TextField(tokens, self._token_indexers)
    instance_fields: Dict[str, Field] = {'tokens': sequence}
    has_gold = ner_tags is not None

    if self.coding_scheme == "BIOUL" and has_gold:
        coded_ner = to_bioul(ner_tags, encoding=self._original_coding_scheme)
    else:
        # the default IOB1 (or no tags at all)
        coded_ner = ner_tags

    if has_gold:
        assert len(ner_tags) == len(tokens), "sentence:%s but ner_tags:%s" % (
            str(tokens), str(ner_tags))
        # ner_gold_spans: Dict[tuple(startid, endid), str(entity_type)]
        ner_gold_spans = _extract_spans(ner_tags)
        span_source = ner_tags
    else:
        # No gold annotation: still enumerate the candidate spans so the
        # instance can be used for prediction.
        ner_gold_spans = {}
        span_source = tokens

    spans: List[Field] = []
    span_labels: List[str] = []
    for start, end in enumerate_spans(span_source,
                                      offset=0,
                                      max_span_width=self._max_span_width):
        span_labels.append(ner_gold_spans.get((start, end), 'O'))
        spans.append(SpanField(start, end, sequence))

    gold_spans: List[Field] = []
    gold_span_labels: List[str] = []
    _dict_gold_spans = {}
    for ky, val in ner_gold_spans.items():
        gold_span_labels.append(val)
        gold_spans.append(SpanField(ky[0], ky[1], sequence))
        # The metadata only keeps real entities.
        if val != 'O':
            _dict_gold_spans[ky] = val

    instance_fields["metadata"] = MetadataField({
        "words": [x.text for x in tokens],
        "gold_spans": _dict_gold_spans,
    })

    # contains all possible spans (and, with gold tags, their labels)
    span_field = ListField(spans)
    instance_fields['spans'] = span_field

    if has_gold:
        instance_fields['span_labels'] = SequenceLabelField(
            span_labels, span_field, "span_tags")
        # only contain gold_spans and their tags
        gold_span_field = ListField(gold_spans)
        instance_fields['gold_spans'] = gold_span_field
        instance_fields['gold_span_labels'] = SequenceLabelField(
            gold_span_labels, gold_span_field, "span_tags")

    # Add "tag label" to instance
    if self.tag_label == 'ner' and coded_ner is not None:
        instance_fields['tags'] = SequenceLabelField(coded_ner, sequence,
                                                     'token_tags')
    return Instance(instance_fields)
def make_marginal_drop_instance(
        question_tokens: List[Token],
        passage_tokens: List[Token],
        number_tokens: List[Token],
        number_indices: List[int],
        token_indexers: Dict[str, TokenIndexer],
        passage_text: str,
        answer_info: Dict[str, Any] = None,
        additional_metadata: Dict[str, Any] = None,
) -> Instance:
    """
    Assemble an ``Instance`` from a tokenized question and passage, the
    numbers found in the passage and, optionally, the gold answer in each of
    its supported forms.

    Always produces ``passage``, ``question``, ``number_indices`` and
    ``metadata``.  When ``answer_info`` is non-empty it also produces
    ``answer_as_passage_spans``, ``answer_as_question_spans``,
    ``answer_as_add_sub_expressions`` and ``answer_as_counts``; each of those
    lists gets a single placeholder entry when ``answer_info`` supplies none,
    so the ``ListField`` is never empty.

    ``additional_metadata`` is merged into the metadata dictionary last.
    """
    def char_offsets(tokens):
        # (start, end) character offsets of every token.
        return [(token.idx, token.idx + len(token.text)) for token in tokens]

    additional_metadata = additional_metadata or {}
    passage_offsets = char_offsets(passage_tokens)
    question_offsets = char_offsets(question_tokens)

    # Kept in named variables so later fields can be defined relative to them.
    passage_field = TextField(passage_tokens, token_indexers)
    question_field = TextField(question_tokens, token_indexers)
    fields: Dict[str, Field] = {
        "passage": passage_field,
        "question": question_field,
        "number_indices": ListField(
            [IndexField(index, passage_field) for index in number_indices]),
    }

    # Not stored on the instance: it only serves as the sequence that the
    # add/sub sign labels below are attached to.
    numbers_in_passage_field = TextField(number_tokens, token_indexers)

    metadata = {
        "original_passage": passage_text,
        "passage_token_offsets": passage_offsets,
        "question_token_offsets": question_offsets,
        "question_tokens": [token.text for token in question_tokens],
        "passage_tokens": [token.text for token in passage_tokens],
        "number_tokens": [token.text for token in number_tokens],
        "number_indices": number_indices,
    }

    if answer_info:
        metadata["answer_texts"] = answer_info["answer_texts"]

        passage_spans: List[Field] = [
            SpanField(span[0], span[1], passage_field)
            for span in answer_info["answer_passage_spans"]
        ]
        fields["answer_as_passage_spans"] = ListField(
            passage_spans or [SpanField(-1, -1, passage_field)])

        question_spans: List[Field] = [
            SpanField(span[0], span[1], question_field)
            for span in answer_info["answer_question_spans"]
        ]
        fields["answer_as_question_spans"] = ListField(
            question_spans or [SpanField(-1, -1, question_field)])

        sign_sequences: List[Field] = [
            SequenceLabelField(signs, numbers_in_passage_field)
            for signs in answer_info["signs_for_add_sub_expressions"]
        ]
        if not sign_sequences:
            # Placeholder expression: every number gets sign label 0.
            sign_sequences = [
                SequenceLabelField([0] * len(number_tokens),
                                   numbers_in_passage_field)
            ]
        fields["answer_as_add_sub_expressions"] = ListField(sign_sequences)

        counts: List[Field] = [
            LabelField(count_label, skip_indexing=True)
            for count_label in answer_info["counts"]
        ]
        fields["answer_as_counts"] = ListField(
            counts or [LabelField(-1, skip_indexing=True)])

    metadata.update(additional_metadata)
    fields["metadata"] = MetadataField(metadata)
    return Instance(fields)
def make_reading_comprehension_instance_quac(
    question_list_tokens: List[List[Token]],
    passage_tokens: List[Token],
    token_indexers: Dict[str, TokenIndexer],
    passage_text: str,
    token_span_lists: List[List[Tuple[int, int]]] = None,
    yesno_list: List[int] = None,
    followup_list: List[int] = None,
    additional_metadata: Dict[str, Any] = None,
    num_context_answers: int = 0,
) -> Instance:
    """
    Converts a dialog (a list of questions), a passage, and optional answers to an
    ``Instance`` for use in a reading comprehension model.

    Creates an ``Instance`` with at least these fields: ``question`` (a ``ListField`` of
    ``TextFields``, one per question), ``passage`` (a ``TextField``) and ``metadata``
    (a ``MetadataField``). If ``token_span_lists`` is given, the ``Instance`` also has
    ``span_start`` and ``span_end`` (``ListFields`` of ``IndexFields``), ``yesno_list``,
    ``followup_list`` and, depending on ``num_context_answers``, up to three
    ``p{1,2,3}_answer_marker`` fields.

    # Parameters

    question_list_tokens : `List[List[Token]]`
        An already-tokenized list of questions. Each dialog has multiple questions.
    passage_tokens : `List[Token]`
        An already-tokenized passage that contains the answer to the given question.
    token_indexers : `Dict[str, TokenIndexer]`
        Determines how the question and passage `TextFields` will be converted
        into tensors that get input to a model. See :class:`TokenIndexer`.
    passage_text : `str`
        The original passage text. It is stored in the metadata under
        ``original_passage`` so the predicted span can be recovered from it.
    token_span_lists : `List[List[Tuple[int, int]]]`, optional
        Indices into `passage_tokens` to use as the answer to each question. This is a
        list of lists: one entry per question in the dialog, each holding the possible
        answer spans. Only the last span of each inner list is used.
    yesno_list : `List[int]`
        Affirmation label for each question/answer pair. Must be provided whenever
        ``token_span_lists`` is non-empty.
    followup_list : `List[int]`
        Continuation label for each question/answer pair. Must be provided whenever
        ``token_span_lists`` is non-empty.
    additional_metadata : `Dict[str, Any]`, optional
        The constructed `metadata` field contains ``original_passage``,
        ``token_offsets``, ``question_tokens`` and ``passage_tokens`` keys. Anything
        passed here is added to (and may override) that dictionary.
    num_context_answers : `int`, optional
        How many previous answers (at most 3 are used) to mark in the passage.

    # Raises

    ValueError
        If a previous answer span that should be marked has a negative start or end.
    """
    additional_metadata = additional_metadata or {}
    fields: Dict[str, Field] = {}
    passage_offsets = [(token.idx, token.idx + len(token.text))
                       for token in passage_tokens]

    # This is separate so we can reference it later with a known type.
    passage_field = TextField(passage_tokens, token_indexers)
    fields["passage"] = passage_field
    fields["question"] = ListField([
        TextField(q_tokens, token_indexers)
        for q_tokens in question_list_tokens
    ])
    metadata = {
        "original_passage": passage_text,
        "token_offsets": passage_offsets,
        "question_tokens": [[token.text for token in question_tokens]
                            for question_tokens in question_list_tokens],
        "passage_tokens": [token.text for token in passage_tokens],
    }
    p1_answer_marker_list: List[Field] = []
    p2_answer_marker_list: List[Field] = []
    p3_answer_marker_list: List[Field] = []

    def get_tag(i, i_name):
        # Generate a tag to mark previous answer span in the passage.
        return "<{0:d}_{1:s}>".format(i, i_name)

    def mark_tag(span_start, span_end, passage_tags, prev_answer_distance):
        # Explicit check instead of `assert`: asserts are stripped under `python -O`,
        # which would let a negative index silently tag tokens from the end.
        if span_start < 0 or span_end < 0:
            raise ValueError(
                "Previous {0:d}th answer span should have been updated!".
                format(prev_answer_distance))
        # Modify "tags" to mark previous answer span.
        tags = passage_tags[prev_answer_distance]
        if span_start == span_end:
            tags[span_start] = get_tag(prev_answer_distance, "")
        else:
            tags[span_start] = get_tag(prev_answer_distance, "start")
            tags[span_end] = get_tag(prev_answer_distance, "end")
            for passage_index in range(span_start + 1, span_end):
                tags[passage_index] = get_tag(prev_answer_distance, "in")

    if token_span_lists:
        span_start_list: List[Field] = []
        span_end_list: List[Field] = []
        # Answer spans of the previous one, two and three questions; -1 = not seen yet.
        p1_span_start, p1_span_end, p2_span_start = -1, -1, -1
        p2_span_end, p3_span_start, p3_span_end = -1, -1, -1
        # Looping each <<answers>>.
        for question_index, answer_span_lists in enumerate(token_span_lists):
            span_start, span_end = answer_span_lists[-1]  # Last one is the original answer
            span_start_list.append(IndexField(span_start, passage_field))
            span_end_list.append(IndexField(span_end, passage_field))

            # Indexed by "distance" to the previous answer (1..3); slot 0 is unused.
            prev_answer_marker_lists = [
                ["O"] * len(passage_tokens) for _ in range(4)
            ]
            if question_index > 0 and num_context_answers > 0:
                mark_tag(p1_span_start, p1_span_end, prev_answer_marker_lists, 1)
                if question_index > 1 and num_context_answers > 1:
                    mark_tag(p2_span_start, p2_span_end, prev_answer_marker_lists, 2)
                    if question_index > 2 and num_context_answers > 2:
                        mark_tag(p3_span_start, p3_span_end, prev_answer_marker_lists, 3)

            # Shift the history window by one question.
            p3_span_start, p3_span_end = p2_span_start, p2_span_end
            p2_span_start, p2_span_end = p1_span_start, p1_span_end
            p1_span_start, p1_span_end = span_start, span_end

            if num_context_answers > 2:
                p3_answer_marker_list.append(
                    SequenceLabelField(prev_answer_marker_lists[3],
                                       passage_field,
                                       label_namespace="answer_tags"))
            if num_context_answers > 1:
                p2_answer_marker_list.append(
                    SequenceLabelField(prev_answer_marker_lists[2],
                                       passage_field,
                                       label_namespace="answer_tags"))
            if num_context_answers > 0:
                p1_answer_marker_list.append(
                    SequenceLabelField(prev_answer_marker_lists[1],
                                       passage_field,
                                       label_namespace="answer_tags"))

        fields["span_start"] = ListField(span_start_list)
        fields["span_end"] = ListField(span_end_list)
        if num_context_answers > 0:
            fields["p1_answer_marker"] = ListField(p1_answer_marker_list)
            if num_context_answers > 1:
                fields["p2_answer_marker"] = ListField(p2_answer_marker_list)
                if num_context_answers > 2:
                    fields["p3_answer_marker"] = ListField(p3_answer_marker_list)
        fields["yesno_list"] = ListField([
            LabelField(yesno, label_namespace="yesno_labels")
            for yesno in yesno_list
        ])
        fields["followup_list"] = ListField([
            LabelField(followup, label_namespace="followup_labels")
            for followup in followup_list
        ])

    metadata.update(additional_metadata)
    fields["metadata"] = MetadataField(metadata)
    return Instance(fields)
def text_to_instance(
        self,  # type: ignore
        sentences: List[List[str]],
        gold_clusters: Optional[List[List[Tuple[int, int]]]] = None) -> Instance:
    # pylint: disable=arguments-differ
    """
    Build a coreference ``Instance`` from a tokenised document.

    The words are normalised, joined with spaces and re-tokenised with
    ``self.token_indexer.wordpiece_tokenizer``. Documents longer than 512 wordpieces
    are split into one 512-piece window followed by consecutive 100-piece windows and
    wrapped in a ``ListField``.

    Parameters
    ----------
    sentences : ``List[List[str]]``, required.
        A list of lists representing the tokenised words and sentences in the document.
    gold_clusters : ``Optional[List[List[Tuple[int, int]]]]``, optional (default = None)
        A list of all clusters in the document, represented as word spans. They are
        passed through ``self.align_clusters_to_tokens`` before use.

    Returns
    -------
    An ``Instance`` containing the following ``Fields``:
        text : ``TextField`` or ``ListField[TextField]``
            The wordpiece text of the full document (a list of windows when the
            document exceeds 512 wordpieces).
        spans : ``ListField[SpanField]``
            The enumerated spans, each a ``SpanField`` defined against ``text``.
        span_labels : ``SequenceLabelField``, optional
            The id of the cluster which each span belongs to, or -1 if it does not
            belong to a cluster. Only present when aligned gold clusters exist. It is
            a ``SequenceLabelField`` with respect to the ``spans`` ``ListField``.
        metadata : ``MetadataField``
            ``original_text`` (the wordpieces) and, if available, ``clusters``.
    """
    first_window = 512  # size of the leading window
    window_stride = 100  # size of every following window

    normalized_words = [
        self._normalize_word(word) for sentence in sentences
        for word in sentence
    ]
    # align clusters
    gold_clusters = self.align_clusters_to_tokens(normalized_words, gold_clusters)

    flattened_sentences = self.token_indexer.wordpiece_tokenizer(
        " ".join(normalized_words))

    metadata: Dict[str, Any] = {"original_text": flattened_sentences}
    if gold_clusters is not None:
        metadata["clusters"] = gold_clusters

    if len(flattened_sentences) > first_window:
        windows = [
            TextField(
                [Token(word) for word in flattened_sentences[:first_window]],
                self._token_indexers)
        ]
        # Consecutive, non-overlapping windows over the remainder; the last one may be short.
        for window_start in range(first_window, len(flattened_sentences), window_stride):
            window = flattened_sentences[window_start:window_start + window_stride]
            windows.append(
                TextField([Token(word) for word in window], self._token_indexers))
        text_field = ListField(windows)
    else:
        text_field = TextField([Token(word) for word in flattened_sentences],
                               self._token_indexers)

    cluster_dict = {}
    if gold_clusters is not None:
        for cluster_id, cluster in enumerate(gold_clusters):
            for mention in cluster:
                cluster_dict[tuple(mention)] = cluster_id

    spans: List[Field] = []
    span_labels: Optional[List[int]] = [] if gold_clusters is not None else None

    # NOTE(review): spans are enumerated over the original word-level `sentences`
    # while `text_field` holds wordpieces (or a list of windows) - confirm that the
    # span indices are meant to be interpreted against this field.
    sentence_offset = 0
    for sentence in sentences:
        for start, end in enumerate_spans(sentence,
                                          offset=sentence_offset,
                                          max_span_width=self._max_span_width):
            if span_labels is not None:
                span_labels.append(cluster_dict.get((start, end), -1))
            spans.append(SpanField(start, end, text_field))
        sentence_offset += len(sentence)

    span_field = ListField(spans)
    fields: Dict[str, Field] = {
        "text": text_field,
        "spans": span_field,
        "metadata": MetadataField(metadata),
    }
    if span_labels is not None:
        fields["span_labels"] = SequenceLabelField(span_labels, span_field)

    return Instance(fields)
def text_to_instance(
        self,  # type: ignore
        tokens: List[str],
        verb_label: List[int],
        tags: List[str] = None,
        pos_tags: List[str] = None,
        gold_tree: Tree = None) -> Instance:
    """
    Build an ``Instance`` from `pre-tokenized` input; no tokenizer is applied here.

    Parameters
    ----------
    tokens : ``List[str]``, required.
        The tokens in a given sentence. They are passed to ``TextField`` as-is.
    verb_label: ``List[int]``, required
        One label per token, attached to the token field as ``verb_indicator``.
    tags: ``List[str]``, optional (default = None).
        SRL tags; the ``tags`` field is only added when this is non-empty.
    pos_tags ``List[str]``, optional (default = None).
        The pos tags for the words in the sentence. When missing or empty, every
        token gets the placeholder tag ``X``.
    gold_tree : ``Tree``, optional (default = None).
        The gold parse tree to create span labels from.

    Returns
    -------
    An ``Instance`` containing the following fields:
        tokens : ``TextField``
        verb_indicator : ``SequenceLabelField``
        tags : ``SequenceLabelField``, only if ``tags`` was given.
        pos_tags : ``SequenceLabelField``
        spans : ``ListField[SpanField]``
            All spans produced by ``enumerate_spans(tokens)``.
        span_labels : ``SequenceLabelField``
            The gold label of each span, or ``NO-LABEL`` when the span is not in the
            gold tree (or when no gold tree was given).
        metadata : ``MetadataField``
            Booleans ``pos_tags`` and ``span_labels`` recording whether real POS tags
            and a gold tree were supplied.
    """
    # pylint: disable=arguments-differ
    text_field = TextField(tokens, token_indexers=self._token_indexers)
    fields: Dict[str, Field] = {
        'tokens': text_field,
        'verb_indicator': SequenceLabelField(verb_label, text_field),
    }
    if tags:
        fields['tags'] = SequenceLabelField(tags, text_field)

    # Fall back to a dummy tag per token so the field always exists.
    has_pos_tags = bool(pos_tags)
    if not has_pos_tags:
        pos_tags = ['X'] * len(tokens)
    fields['pos_tags'] = SequenceLabelField(pos_tags, text_field, "pos_tags")
    metadata: Dict[str, Any] = {'pos_tags': has_pos_tags}

    # Gold constituents keyed by (start, end); entries whose label contains "-POS"
    # are dropped. Empty when no tree is available, so every span becomes NO-LABEL.
    has_gold_tree = gold_tree is not None
    gold_spans: Dict[Tuple[int, int], str] = {}
    if has_gold_tree:
        gold_spans_with_pos_tags: Dict[Tuple[int, int], str] = {}
        self._get_gold_spans(gold_tree, 0, gold_spans_with_pos_tags)
        for span, label in gold_spans_with_pos_tags.items():
            if "-POS" not in label:
                gold_spans[span] = label

    spans: List[Field] = []
    gold_labels = []
    for start, end in enumerate_spans(tokens):
        spans.append(SpanField(start, end, text_field))
        gold_labels.append(gold_spans.get((start, end), "NO-LABEL"))

    span_list_field: ListField = ListField(spans)
    fields['spans'] = span_list_field
    fields['span_labels'] = SequenceLabelField(gold_labels, span_list_field,
                                               "constituent_labels")
    metadata['span_labels'] = has_gold_tree

    fields['metadata'] = MetadataField(metadata)
    return Instance(fields)
def text_to_instance(
        self,  # type: ignore
        tokens: List[str],
        lemmas: List[str] = None,
        upos_tags: List[str] = None,
        xpos_tags: List[str] = None,
        feats: List[str] = None,
        dependencies: List[Tuple[str, int]] = None,
        deps: List[List[Tuple[str, int]]] = None,
        ids: List[str] = None,
        misc: List[str] = None,
        multiword_ids: List[str] = None,
        multiword_forms: List[str] = None,
        conllu_metadata: List[str] = None,
        contains_elided_token: bool = False,
) -> Instance:
    """
    Build an ``Instance`` for enhanced dependency parsing from one CoNLL-U sentence.

    # Parameters

    tokens : ``List[str]``, required.
        The tokens in the sentence to be encoded.
    lemmas, upos_tags, xpos_tags : ``List[str]``, optional (default = None)
        Per-token annotations; each becomes a ``SequenceLabelField`` when non-empty.
    feats : ``List[str]``, optional (default = None)
        Per-token morphological feature strings, with features separated by ``|``.
        When ``None`` no ``feats`` field is created.
    dependencies : ``List[Tuple[str, int]]``, optional (default = None)
        A list of (head tag, head index) tuples. Indices are 1 indexed,
        meaning an index of 0 corresponds to that word being the root of
        the dependency tree. Only recorded in the metadata.
    deps : ``List[List[Tuple[str, int]]]``, optional (default = None)
        A list of lists of (head tag, head index) tuples for the enhanced graph.
    ids : ``List[str]``, optional (default = None)
        Token ids, passed to ``self._process_elided_tokens`` for sentences with
        elided tokens.
    misc, multiword_ids, multiword_forms, conllu_metadata : optional
        Stored unchanged in the metadata.
    contains_elided_token : ``bool``, optional (default = False)
        NOTE(review): this argument is not read; the method consults
        ``self.contains_elided_token`` instead - confirm which one is intended.

    # Returns

    An instance containing ``tokens``, the available tag fields, ``feats``,
    ``enhanced_tags`` (when ``deps`` is given) and ``metadata``. Metadata entries
    derived from ``dependencies`` / ``deps`` are ``None`` when those are not given.
    """
    fields: Dict[str, Field] = {}

    token_field = TextField([Token(t) for t in tokens], self._token_indexers)
    fields["tokens"] = token_field

    for name, tag_sequence in (("upos", upos_tags), ("xpos", xpos_tags),
                               ("lemmas", lemmas)):
        if tag_sequence:
            fields[name] = SequenceLabelField(tag_sequence,
                                              token_field,
                                              label_namespace=name)

    # `feats` defaults to None; iterating it unconditionally raised TypeError.
    if feats is not None:
        fields["feats"] = ListField([
            ListField([
                LabelField(feat, label_namespace="feats")
                for feat in atomic_feat.split("|")
            ]) for atomic_feat in feats
        ])

    # Everything below is reported in the metadata, so it must be bound even when
    # the corresponding input is absent.
    head_tags = None
    head_indices = None
    original_to_new_indices = None
    arc_indices = None
    arc_tags = None
    arc_indices_and_tags = None

    # basic dependency tree
    # The basic tree is not used by the parser at the moment, so it is only kept
    # as metadata and no fields are created for it.
    if dependencies is not None:
        head_tags = [x[0] for x in dependencies]
        head_indices = [x[1] for x in dependencies]

    # enhanced dependencies
    if deps is not None:
        enhanced_arc_tags, enhanced_arc_indices = self._convert_deps_to_nested_sequences(
            deps)
        # extra processing is needed if a sentence contains an elided token
        if self.contains_elided_token:
            original_to_new_indices, augmented_heads = self._process_elided_tokens(
                ids, enhanced_arc_indices)
            enhanced_arc_indices = augmented_heads

        assert len(enhanced_arc_tags) == len(
            enhanced_arc_indices), "each arc should have a label"

        # Flatten the per-token nested lists; modifiers are numbered from 1.
        arc_indices = [(head, modifier)
                       for modifier, head_list in enumerate(
                           enhanced_arc_indices, start=1)
                       for head in head_list]
        arc_tags = [
            relation for relation_list in enhanced_arc_tags
            for relation in relation_list
        ]
        assert len(arc_indices) == len(
            arc_tags), "each arc should have a label"
        arc_indices_and_tags = list(zip(arc_indices, arc_tags))

        token_field_with_root = ['root'] + tokens
        fields["enhanced_tags"] = RootedAdjacencyField(
            arc_indices,
            token_field_with_root,
            arc_tags,
            label_namespace="deps")

    fields["metadata"] = MetadataField({
        "tokens": tokens,
        "upos_tags": upos_tags,
        "xpos_tags": xpos_tags,
        "feats": feats,
        "lemmas": lemmas,
        "ids": ids,
        "misc": misc,
        "original_to_new_indices": original_to_new_indices,
        "head_tags": head_tags,
        "head_indices": head_indices,
        "arc_indices": arc_indices,
        "arc_tags": arc_tags,
        "labeled_arcs": arc_indices_and_tags,
        "multiword_ids": multiword_ids,
        "multiword_forms": multiword_forms,
        "conllu_metadata": conllu_metadata
    })
    return Instance(fields)
def test_human_readable_repr(self):
    # The human-readable form of the field should be exactly the labels it was built from.
    expected_labels = ["B", "I", "O", "O", "O"]
    field = SequenceLabelField(expected_labels,
                               self.text,
                               label_namespace="labels")
    readable = field.human_readable_repr()
    assert readable == expected_labels
def text_to_instance(self,
                     source_key: str,
                     target_key: str = None,
                     line_obj: Dict = None) -> Instance:
    """
    Turn a json object into an ``Instance``.

    Parameters
    ----------
    source_key : ``str``, required
        Key in ``line_obj`` holding the (already tokenized) source sequence.
    target_key : ``str``, optional (default = None)
        Key in ``line_obj`` holding the (already tokenized) target sequence. Target
        fields are only created when ``line_obj`` has a non-None value for it.
    line_obj : ``Dict``, required in practice
        Json object with the raw instance info. Must contain ``src_lang``,
        ``tgt_lang``, ``BIO`` and ``source_key``; the default only exists for
        signature compatibility and leads to a ``KeyError``.

    Returns
    -------
    Instance
        With fields ``source_tokens``, ``source_to_target``, ``source_token_ids``,
        ``verb_indicator``, ``language_enc_indicator``, ``language_dec_indicator``,
        ``metadata`` and, when a target is present, ``target_tokens`` and
        ``target_token_ids``.
    """
    # Avoid a shared mutable default argument.
    if line_obj is None:
        line_obj = {}

    # Read source and target
    target_sequence = line_obj.get(target_key, None)
    lang_src_token = line_obj["src_lang"].upper()
    lang_tgt_token = line_obj["tgt_lang"].upper()

    # The source is laid out as [START, <target language>, tokens..., END]; the two
    # leading zeros and the trailing zero below pad for those three extra symbols.
    source_words = line_obj[source_key]

    # Read Predicate Indicator and make Array
    verb_label = [0, 0] + [
        1 if label[-2:] == "-V" else 0 for label in line_obj["BIO"]
    ] + [0]

    # Read Language Indicator and make Array
    lang_src_ix = self._available_languages[lang_src_token]
    lang_tgt_ix = self._available_languages[lang_tgt_token]
    # This array goes to the encoder as a whole
    lang_src_ix_arr = [0, 0] + [lang_src_ix] * len(source_words) + [0]
    # This goes to each one of the decoder steps; it is a single int.
    lang_tgt_ix_arr = lang_tgt_ix

    # Tokenize Source (data comes already tokenized!)
    tokenized_source = [Token(START_SYMBOL), Token(lang_tgt_token)]
    tokenized_source.extend(Token(word) for word in source_words)
    tokenized_source.append(Token(END_SYMBOL))
    source_field = TextField(tokenized_source, self._source_token_indexers)

    # Everything between the START and END symbols (this includes the language token).
    inner_source = tokenized_source[1:-1]

    # For each token in the source sentence, we keep track of the matching token
    # in the target sentence (which will be the OOV symbol if there is no match).
    source_to_target_field = NamespaceSwappingField(inner_source,
                                                    self._target_namespace)

    meta_fields = {"source_tokens": [x.text for x in inner_source]}
    fields_dict = {
        "source_tokens": source_field,
        "source_to_target": source_to_target_field,
    }

    # Process Target info during training...
    if target_sequence is not None:
        tokenized_target = [Token(START_SYMBOL)]
        tokenized_target.extend(Token(word) for word in line_obj[target_key])
        tokenized_target.append(Token(END_SYMBOL))
        target_field = TextField(tokenized_target, self._target_token_indexers)

        fields_dict["target_tokens"] = target_field
        meta_fields["target_tokens"] = [y.text for y in tokenized_target[1:-1]]

        # Ids are computed jointly over source and target, then split back apart.
        source_and_target_token_ids = self._tokens_to_ids(inner_source +
                                                          tokenized_target)
        split_at = len(tokenized_source) - 2
        source_token_ids = source_and_target_token_ids[:split_at]
        fields_dict["source_token_ids"] = ArrayField(np.array(source_token_ids))
        target_token_ids = source_and_target_token_ids[split_at:]
        fields_dict["target_token_ids"] = ArrayField(np.array(target_token_ids))
    else:
        source_token_ids = self._tokens_to_ids(inner_source)
        fields_dict["source_token_ids"] = ArrayField(np.array(source_token_ids))

    # Add Verb Indicator to the Fields
    fields_dict['verb_indicator'] = SequenceLabelField(verb_label, source_field)

    # The verb is the first token flagged in `verb_label`, if any.
    if 1 in verb_label:
        verb = tokenized_source[verb_label.index(1)].text
    else:
        verb = None
    meta_fields["verb"] = verb

    # Add Language Indicator to the Fields
    meta_fields["src_lang"] = lang_src_token
    meta_fields["tgt_lang"] = lang_tgt_token
    meta_fields["original_BIO"] = line_obj.get("BIO", [])
    meta_fields["original_predicate_senses"] = line_obj.get("pred_sense_origin", [])
    meta_fields["predicate_senses"] = line_obj.get("pred_sense", [])
    meta_fields["original_target"] = line_obj.get("seq_tag_tokens", [])

    fields_dict['language_enc_indicator'] = ArrayField(np.array(lang_src_ix_arr))
    fields_dict['language_dec_indicator'] = ArrayField(np.array(lang_tgt_ix_arr))

    fields_dict["metadata"] = MetadataField(meta_fields)
    return Instance(fields_dict)
def test_tag_length_mismatch_raises(self):
    # Three tags are expected to be rejected for `self.text`
    # (presumably because the fixture has a different number of tokens).
    mismatched_tags = ["B", "O", "O"]
    with pytest.raises(ConfigurationError):
        SequenceLabelField(mismatched_tags, self.text)
def text_to_instance(
        self,  # type: ignore
        tokens: List[str],
        lemmas: List[str] = None,
        upos_tags: List[str] = None,
        xpos_tags: List[str] = None,
        feats: List[str] = None,
        dependencies: List[Tuple[str, int]] = None,
        deps: List[List[Tuple[str, int]]] = None,
        ids: List[str] = None,
        misc: List[str] = None,
        multiword_ids: List[str] = None,
        multiword_forms: List[str] = None,
        conllu_metadata: List[str] = None,
        contains_elided_token: bool = False,
) -> Instance:
    """
    Build an ``Instance`` for enhanced dependency parsing from one CoNLL-U sentence,
    encoding the basic tree as per-token relation labels plus head direction and
    distance features.

    # Parameters

    tokens : ``List[str]``, required.
        The tokens in the sentence to be encoded.
    lemmas, upos_tags, xpos_tags : ``List[str]``, optional (default = None)
        Per-token annotations; each becomes a ``SequenceLabelField`` when non-empty.
    feats : ``List[str]``, optional (default = None)
        Per-token morphological feature strings, with features separated by ``|``.
        When ``None`` no ``feats`` field is created.
    dependencies : ``List[Tuple[str, int]]``, optional (default = None)
        A list of (head tag, head index) tuples. Indices are 1 indexed,
        meaning an index of 0 corresponds to that word being the root of
        the dependency tree.
    deps : ``List[List[Tuple[str, int]]]``, optional (default = None)
        A list of lists of (head tag, head index) tuples for the enhanced graph.
    ids : ``List[str]``, optional (default = None)
        Token ids. They are subtracted from head indices, so they are presumably
        numeric despite the annotation - TODO confirm.
    misc, multiword_ids, multiword_forms, conllu_metadata : optional
        Stored unchanged in the metadata.
    contains_elided_token : ``bool``, optional (default = False)
        NOTE(review): this argument is not read; the method consults
        ``self.contains_elided_token`` instead - confirm which one is intended.

    # Returns

    An instance containing ``tokens``, the available tag fields, ``feats``,
    ``deprels`` (when ``dependencies`` is given), ``enhanced_tags`` (when ``deps`` is
    given), ``heads`` (when both head indices and token ids are available) and
    ``metadata``. Metadata entries derived from ``dependencies`` / ``deps`` are
    ``None`` when those are not given.
    """
    fields: Dict[str, Field] = {}

    token_field = TextField([Token(t) for t in tokens], self._token_indexers)
    fields["tokens"] = token_field

    for name, tag_sequence in (("upos", upos_tags), ("xpos", xpos_tags),
                               ("lemmas", lemmas)):
        if tag_sequence:
            fields[name] = SequenceLabelField(tag_sequence,
                                              token_field,
                                              label_namespace=name)

    # `feats` defaults to None; iterating it unconditionally raised TypeError.
    if feats is not None:
        fields["feats"] = ListField([
            ListField([
                LabelField(feat, label_namespace="feats")
                for feat in atomic_feat.split("|")
            ]) for atomic_feat in feats
        ])

    # Everything below is reported in the metadata, so it must be bound even when
    # the corresponding input is absent.
    head_tags = None
    head_indices = None
    original_to_new_indices = None
    arc_indices = None
    arc_tags = None
    arc_indices_and_tags = None

    # basic dependency tree
    if dependencies is not None:
        head_tags = [x[0] for x in dependencies]
        head_indices = [x[1] for x in dependencies]
        # head indices are encoded as direction and distance features further down
        fields["deprels"] = SequenceLabelField(list(head_tags),
                                               token_field,
                                               label_namespace="deprels")

    # enhanced dependencies
    # NOTE: we always assume there is something in the edeps column at the moment.
    if deps is not None:
        enhanced_arc_tags, enhanced_arc_indices = self._convert_deps_to_nested_sequences(
            deps)
        # extra processing is needed if a sentence contains an elided token
        if self.contains_elided_token:
            original_to_new_indices, augmented_heads = self._process_elided_tokens(
                ids, enhanced_arc_indices)
            enhanced_arc_indices = augmented_heads

        assert len(enhanced_arc_tags) == len(
            enhanced_arc_indices), "each arc should have a label"

        # Flatten the per-token nested lists; modifiers are numbered from 1.
        arc_indices = [(head, modifier)
                       for modifier, head_list in enumerate(
                           enhanced_arc_indices, start=1)
                       for head in head_list]
        arc_tags = [
            relation for relation_list in enhanced_arc_tags
            for relation in relation_list
        ]
        assert len(arc_indices) == len(
            arc_tags), "each arc should have a label"
        arc_indices_and_tags = list(zip(arc_indices, arc_tags))

        token_field_with_root = ['root'] + tokens
        fields["enhanced_tags"] = RootedAdjacencyField(
            arc_indices,
            token_field_with_root,
            arc_tags,
            label_namespace="deps")

    # head direction / distance features from the basic tree
    if head_indices is not None:
        if original_to_new_indices:
            # 1-indexed conllu ids as they appear in the sentence, e.g. 13.1 -> 14.
            # The first value is skipped: it is the placeholder for root (0).
            conllu_ids = list(original_to_new_indices.values())[1:]
            # Re-map the heads to the new order; heads missing from the mapping
            # (the "_" head) stay "_".
            basic_heads = [
                original_to_new_indices.get(head, "_")
                for head in head_indices
            ]
        else:
            conllu_ids = ids
            basic_heads = head_indices

        if conllu_ids is not None:
            assert len(conllu_ids) == len(
                basic_heads), "each token should have a head"

            # The head information is embedded like a morphological feature: a
            # combination of a direction label and a distance category.
            sublist_heads = []
            for dep, head in zip(conllu_ids, basic_heads):
                # Defaults are reset for every token so no value leaks over from
                # the previous one. They apply when the basic tree has no
                # information (elided tokens, head "_") or the distance is zero.
                direction_label = "<NULL_DIR>"
                distance_category = "<NULL_DIST>"
                if head != "_":
                    distance = head - dep
                    if distance != 0:
                        # get a qualitative distance category
                        distance_category = self.get_distance_categories(distance)
                        # negative distance: head precedes the token (left-headed)
                        direction_label = "<L>" if distance < 0 else "<R>"

                head_direction_and_distance = direction_label + "|" + distance_category
                sublist_heads.append(
                    ListField([
                        LabelField(head_metadata, label_namespace="heads")
                        for head_metadata in head_direction_and_distance.split("|")
                    ]))
            fields["heads"] = ListField(sublist_heads)

    fields["metadata"] = MetadataField({
        "tokens": tokens,
        "upos_tags": upos_tags,
        "xpos_tags": xpos_tags,
        "feats": feats,
        "lemmas": lemmas,
        "ids": ids,
        "misc": misc,
        "original_to_new_indices": original_to_new_indices,
        "head_tags": head_tags,
        "head_indices": head_indices,
        "arc_indices": arc_indices,
        "arc_tags": arc_tags,
        "labeled_arcs": arc_indices_and_tags,
        "multiword_ids": multiword_ids,
        "multiword_forms": multiword_forms,
        "conllu_metadata": conllu_metadata
    })
    return Instance(fields)
def test_sequence_label_field_raises_on_incorrect_type(self):
    # Labels that are lists must be rejected with a ConfigurationError.
    invalid_labels = [[] for _ in range(5)]
    with pytest.raises(ConfigurationError):
        SequenceLabelField(invalid_labels, self.text)
def text_to_instance(
        self,
        paragraph_num: int,
        paragraph: List[str],
        ner_dict: Dict[Span, str],
        start_ix: int,
        end_ix: int,
        sentence_indices: List[Span],
        document_metadata: Dict[str, Any],
):
    """
    Build an ``Instance`` for one paragraph of a document.

    When ``self.to_scierc_converter`` is set, no ``Instance`` is built and the raw
    arguments are returned as a plain ``dict`` instead.

    Parameters
    ----------
    paragraph_num : ``int``
        Index of the paragraph; stored in the metadata.
    paragraph : ``List[str]``
        The words of the paragraph.
    ner_dict : ``Dict[Span, str]``
        Maps document-level ``(start, end)`` spans to a label. The label is indexed
        (``label[0]`` is the entity type, ``label[-1] == "True"`` marks a salient
        span), so it is presumably a sequence rather than a bare string - TODO confirm.
    start_ix, end_ix : ``int``
        Document positions of the paragraph; ``start_ix`` is subtracted from span
        boundaries to make them paragraph-relative.
    sentence_indices : ``List[Span]``
        Stored in the metadata.
    document_metadata : ``Dict[str, Any]``
        Must contain ``doc_id``, ``entities_to_features_map``, ``cluster_name_to_id``,
        ``relation_to_cluster_ids`` and ``span_to_cluster_ids``.

    Returns
    -------
    An ``Instance`` with the fields ``text``, ``ner_type_labels``, ``metadata``,
    ``spans``, ``span_cluster_labels``, ``span_saliency_labels``,
    ``span_type_labels``, ``span_features`` and, when there are relations,
    ``relation_to_cluster_ids``.
    """
    if self.to_scierc_converter:
        return {
            "paragraph_num": paragraph_num,
            "paragraph": paragraph,
            "ner_dict": ner_dict,
            "start_ix": start_ix,
            "end_ix": end_ix,
            "sentence_indices": sentence_indices,
            "document_metadata": document_metadata,
        }

    text_field = TextField([Token(word) for word in paragraph],
                           self._token_indexers)

    metadata_field = MetadataField({
        "doc_id": document_metadata["doc_id"],
        "paragraph_num": paragraph_num,
        "paragraph": paragraph,
        "start_pos_in_doc": start_ix,
        "end_pos_in_doc": end_ix,
        "ner_dict": ner_dict,
        "sentence_indices": sentence_indices,
        "document_metadata": document_metadata,
        "num_spans": len(ner_dict),
    })

    # Token-level BIO tags from the paragraph-relative entity spans.
    relative_entities = [(span[0] - start_ix, span[1] - start_ix, label[0])
                         for span, label in ner_dict.items()]
    ner_type_labels = spans_to_bio_tags(relative_entities, len(paragraph))
    ner_entity_field = SequenceLabelField(ner_type_labels,
                                          text_field,
                                          label_namespace="ner_type_labels")

    # Pull it all together.
    fields = {
        "text": text_field,
        "ner_type_labels": ner_entity_field,
        "metadata": metadata_field,
    }

    entities_to_features_map = document_metadata["entities_to_features_map"]
    cluster_name_to_id = document_metadata["cluster_name_to_id"]
    relation_to_cluster_ids = document_metadata["relation_to_cluster_ids"]
    span_to_cluster_ids = document_metadata["span_to_cluster_ids"]

    def cluster_label_field(cluster_ids):
        # Multi-hot field over all known clusters.
        return MultiLabelField(
            cluster_ids,
            label_namespace="cluster_labels",
            skip_indexing=True,
            num_labels=len(cluster_name_to_id),
        )

    def feature_label_field(features):
        # Multi-hot field over the 5 section features.
        return MultiLabelField(features,
                               label_namespace="section_feature_labels",
                               num_labels=5)

    spans = []
    span_cluster_labels = []
    span_saliency_labels = []
    span_type_labels = []
    span_features = []

    for (s, e), label in ner_dict.items():
        # The end is shifted by one more than the start when made paragraph-relative.
        spans.append(
            SpanField(int(s - start_ix), int(e - start_ix - 1), text_field))
        span_cluster_labels.append(
            cluster_label_field(span_to_cluster_ids.get((s, e), [])))
        span_saliency_labels.append(1 if label[-1] == "True" else 0)
        span_type_labels.append(label[0])
        span_features.append(
            feature_label_field(entities_to_features_map[(s, e)]))

    if spans:
        fields["spans"] = ListField(spans)
        fields["span_cluster_labels"] = ListField(span_cluster_labels)
        fields["span_saliency_labels"] = SequenceLabelField(
            span_saliency_labels,
            fields["spans"],
            label_namespace="span_saliency_labels")
        fields["span_type_labels"] = SequenceLabelField(
            span_type_labels,
            fields["spans"],
            label_namespace="span_type_labels")
        fields["span_features"] = ListField(span_features)
    else:
        # Some paragraphs may not have anything, so placeholder fields are used.
        fields["spans"] = ListField(
            [SpanField(-1, -1, text_field).empty_field()]).empty_field()
        fields["span_cluster_labels"] = ListField([cluster_label_field([])])
        fields["span_saliency_labels"] = SequenceLabelField(
            [0], fields["spans"], label_namespace="span_saliency_labels")
        fields["span_type_labels"] = SequenceLabelField(
            ["Method"], fields["spans"], label_namespace="span_type_labels")
        fields["span_features"] = ListField([feature_label_field([])])

    if relation_to_cluster_ids:
        fields["relation_to_cluster_ids"] = ListField([
            cluster_label_field(cluster_ids)
            for cluster_ids in relation_to_cluster_ids.values()
        ])

    return Instance(fields)
def text_to_instance(self, sentence: List[str],
                     ner_dict: Dict[Tuple[int, int], str],
                     relation_dict,
                     doc_key: str,
                     dataset: str,
                     sentence_num: int,
                     groups: List[str],
                     start_ix: int,
                     end_ix: int,
                     tree: Dict[str, Any],
                     children_dict: Dict[Tuple[int, int], List[Tuple[int, int]]],
                     dep_children_dict: Dict[Tuple[int, int], List[Tuple[int, int]]],
                     tf_dict: Dict[Tuple[int, int], Any]):
    """
    Build an ``Instance`` for one sentence with span, NER, relation and syntax fields.

    Parameters
    ----------
    sentence : tokens of the sentence; each one is passed through ``self._normalize_word``.
    ner_dict : looked up with every enumerated ``(start, end)`` span to get its NER label.
    relation_dict : looked up with every ordered pair of spans; falsy labels are dropped.
    doc_key, dataset, sentence_num, start_ix, end_ix, tree : stored in the metadata only.
    groups : tokens used to build the ``text`` field (the sentence with its context).
    children_dict : looked up with every enumerated span to get its child spans.
    dep_children_dict : looked up with every ordered token pair ``(i, j)``; falsy labels are dropped.
    tf_dict : looked up with every ordered token pair ``(i, j)``; falsy features are dropped.

    Returns
    -------
    An ``Instance`` with the fields ``text``, ``spans``, ``ner_labels``, ``relation_labels``,
    ``metadata``, ``span_children``, ``dep_span_children`` and ``tf``.

    Note that every dict is indexed with ``[]`` for all candidate keys, so callers must pass
    mappings that do not raise for keys without a label.
    """
    sentence = [self._normalize_word(word) for word in sentence]

    text_field = TextField([Token(word) for word in sentence],
                           self._token_indexers)
    text_field_with_context = TextField([Token(word) for word in groups],
                                        self._token_indexers)

    # Put together the metadata.
    metadata = dict(sentence=sentence,
                    ner_dict=ner_dict,
                    relation_dict=relation_dict,
                    doc_key=doc_key,
                    dataset=dataset,
                    groups=groups,
                    start_ix=start_ix,
                    end_ix=end_ix,
                    sentence_num=sentence_num,
                    tree=tree,
                    children_dict=children_dict,
                    dep_children_dict=dep_children_dict)
    metadata_field = MetadataField(metadata)

    # Generate fields for text spans and their NER labels.
    spans = []
    span_ner_labels = []
    raw_spans = []
    # Position of each span in ``raw_spans``. Built once so that resolving a child span
    # is a dict lookup instead of a linear scan over all spans. ``setdefault`` keeps the
    # first position, which is what ``list.index`` would return.
    span_to_index: Dict[Tuple[int, int], int] = {}
    for start, end in enumerate_spans(sentence,
                                      max_span_width=self._max_span_width):
        span_ix = (start, end)
        span_ner_labels.append(ner_dict[span_ix])
        spans.append(SpanField(start, end, text_field))
        span_to_index.setdefault(span_ix, len(raw_spans))
        raw_spans.append(span_ix)
    span_field = ListField(spans)

    # For every span, the indices (into ``span_field``) of its children. A child that was
    # not enumerated is encoded as -1; a span without children gets a single -1 so that
    # the inner list is never empty. Child spans are tuples per the signature annotation.
    span_children_labels = []
    for span_ix in raw_spans:
        child_indices = [span_to_index.get(child_span, -1)
                         for child_span in children_dict[span_ix]] or [-1]
        span_children_labels.append(
            ListField([IndexField(child_index, span_field)
                       for child_index in child_indices]))

    # Token-pair features: keep only the pairs that carry a non-empty label.
    n_tokens = len(sentence)
    dep_adjs = []
    dep_adjs_indices = []
    tf_indices = []
    tf_features = []
    for i in range(n_tokens):
        for j in range(n_tokens):
            token_pair = (i, j)
            dep_adj_label = dep_children_dict[token_pair]
            if dep_adj_label:
                dep_adjs_indices.append(token_pair)
                dep_adjs.append(dep_adj_label)
            feature = tf_dict[token_pair]
            if feature:
                tf_indices.append(token_pair)
                tf_features.append(feature)

    ner_label_field = SequenceLabelField(span_ner_labels,
                                         span_field,
                                         label_namespace="ner_labels")

    # Relations between spans: keep only the pairs that carry a non-empty label.
    span_tuples = [(span.span_start, span.span_end) for span in spans]
    relations = []
    relation_indices = []
    for i, first_span in enumerate(span_tuples):
        for j, second_span in enumerate(span_tuples):
            relation_label = relation_dict[(first_span, second_span)]
            if relation_label:
                relation_indices.append((i, j))
                relations.append(relation_label)

    relation_label_field = AdjacencyField(indices=relation_indices,
                                          sequence_field=span_field,
                                          labels=relations,
                                          label_namespace="relation_labels")

    # Syntax
    span_children_field = ListField(span_children_labels)
    dep_span_children_field = AdjacencyField(indices=dep_adjs_indices,
                                             sequence_field=text_field,
                                             labels=dep_adjs,
                                             label_namespace="dep_adj_labels")
    tf_field = AdjacencyField(indices=tf_indices,
                              sequence_field=text_field,
                              labels=tf_features,
                              label_namespace="tf_labels")

    fields = dict(text=text_field_with_context,
                  spans=span_field,
                  ner_labels=ner_label_field,
                  relation_labels=relation_label_field,
                  metadata=metadata_field,
                  span_children=span_children_field,
                  dep_span_children=dep_span_children_field,
                  tf=tf_field)

    return Instance(fields)
def text_to_instance(self, sentence: List[str],
                     ner_dict: Dict[Tuple[int, int], str],
                     relation_dict,
                     cluster_dict,
                     trigger_dict,
                     argument_dict,
                     doc_key: str,
                     dataset: str,
                     sentence_num: int,
                     groups: List[str],
                     start_ix: int,
                     end_ix: int,
                     tree: Dict[str, Any],
                     syntax_dict: Dict[Tuple[int, int], str],
                     children_dict: Dict[Tuple[int, int], List[Tuple[int, int]]],
                     dep_children_dict: Dict[Tuple[int, int], List[Tuple[int, int]]],
                     tf_dict: Dict[Tuple[int, int], Any]):
    """
    Build an ``Instance`` for one sentence with NER, coref, event, relation and syntax fields.

    Parameters
    ----------
    sentence : tokens of the sentence; each one is passed through ``self._normalize_word``.
    ner_dict : looked up with every enumerated ``(start, end)`` span to get its NER label;
        also handed to ``self._generate_ner_label`` for the per-token label sequence.
    relation_dict : looked up with every ordered pair of spans; falsy labels are dropped.
    cluster_dict : looked up with every enumerated span to get its coref label.
    trigger_dict : looked up with every token index to get its trigger label.
    argument_dict : looked up with every ``(token index, span)`` pair; falsy labels are dropped.
    doc_key, dataset, sentence_num, start_ix, end_ix, tree : stored in the metadata only.
    groups : tokens used to build the ``text`` field (the sentence with its context).
    syntax_dict : looked up with every enumerated span to get its syntax label.
    children_dict : looked up with every enumerated span to get its child spans.
    dep_children_dict : looked up with every ordered token pair ``(i, j)``; falsy labels are dropped.
    tf_dict : looked up with every ordered token pair ``(i, j)``; falsy features are dropped.

    Returns
    -------
    An ``Instance`` with the fields ``text``, ``spans``, ``ner_labels``, ``coref_labels``,
    ``trigger_labels``, ``argument_labels``, ``relation_labels``, ``metadata``,
    ``span_labels``, ``ner_sequence_labels``, ``syntax_labels``, ``span_children``,
    ``span_tree_labels``, ``dep_span_children`` and ``tf``.

    Raises
    ------
    AssertionError
        If ``syntax_dict`` and ``children_dict`` do not have the same number of entries.

    Note that every dict is indexed with ``[]`` for all candidate keys, so callers must pass
    mappings that do not raise for keys without a label.
    """
    sentence = [self._normalize_word(word) for word in sentence]

    text_field = TextField([Token(word) for word in sentence],
                           self._token_indexers)
    text_field_with_context = TextField([Token(word) for word in groups],
                                        self._token_indexers)

    # feili, NER labels. One label per token
    ner_sequence_labels = self._generate_ner_label(sentence, ner_dict)
    ner_sequence_label_field = SequenceLabelField(
        ner_sequence_labels, text_field, label_namespace="ner_sequence_labels")

    # Put together the metadata.
    metadata = dict(sentence=sentence,
                    ner_dict=ner_dict,
                    relation_dict=relation_dict,
                    cluster_dict=cluster_dict,
                    trigger_dict=trigger_dict,
                    argument_dict=argument_dict,
                    doc_key=doc_key,
                    dataset=dataset,
                    groups=groups,
                    start_ix=start_ix,
                    end_ix=end_ix,
                    sentence_num=sentence_num,
                    seq_dict=ner_sequence_labels,
                    tree=tree,
                    syntax_dict=syntax_dict,
                    children_dict=children_dict,
                    dep_children_dict=dep_children_dict)
    metadata_field = MetadataField(metadata)

    # Trigger labels. One label per token in the input.
    token_trigger_labels = [trigger_dict[i] for i in range(len(text_field))]
    trigger_label_field = SequenceLabelField(token_trigger_labels,
                                             text_field,
                                             label_namespace="trigger_labels")

    # Generate fields for text spans, ner labels, coref labels.
    spans = []
    span_ner_labels = []
    # feili
    span_labels = []
    span_coref_labels = []
    span_syntax_labels = []
    span_tree_labels = []
    raw_spans = []
    # Position of each span in ``raw_spans``. Built once so that resolving a child span
    # is a dict lookup instead of a linear scan over all spans. ``setdefault`` keeps the
    # first position, which is what ``list.index`` would return.
    span_to_index: Dict[Tuple[int, int], int] = {}

    assert len(syntax_dict) == len(children_dict)
    for start, end in enumerate_spans(sentence,
                                      max_span_width=self._max_span_width):
        span_ix = (start, end)
        # '1' marks a span that is part of the tree, '' one that is not.
        span_tree_labels.append(
            '1' if self._is_span_in_tree(span_ix, syntax_dict, children_dict) else '')
        ner_label = ner_dict[span_ix]
        span_ner_labels.append(ner_label)
        # Binary "is this span an entity at all" label.
        span_labels.append('' if ner_label == '' else '1')
        span_coref_labels.append(cluster_dict[span_ix])
        spans.append(SpanField(start, end, text_field))
        span_syntax_labels.append(syntax_dict[span_ix])
        span_to_index.setdefault(span_ix, len(raw_spans))
        raw_spans.append(span_ix)

    span_field = ListField(spans)

    # For every span, the indices (into ``span_field``) of its children. A child that was
    # not enumerated is encoded as -1; a span without children gets a single -1 so that
    # the inner list is never empty. Child spans are tuples per the signature annotation.
    span_children_labels = []
    for span_ix in raw_spans:
        child_indices = [span_to_index.get(child_span, -1)
                         for child_span in children_dict[span_ix]] or [-1]
        span_children_labels.append(
            ListField([IndexField(child_index, span_field)
                       for child_index in child_indices]))

    # Token-pair features: keep only the pairs that carry a non-empty label.
    n_tokens = len(sentence)
    dep_adjs = []
    dep_adjs_indices = []
    tf_indices = []
    tf_features = []
    for i in range(n_tokens):
        for j in range(n_tokens):
            token_pair = (i, j)
            dep_adj_label = dep_children_dict[token_pair]
            if dep_adj_label:
                dep_adjs_indices.append(token_pair)
                dep_adjs.append(dep_adj_label)
            feature = tf_dict[token_pair]
            if feature:
                tf_indices.append(token_pair)
                tf_features.append(feature)

    ner_label_field = SequenceLabelField(span_ner_labels,
                                         span_field,
                                         label_namespace="ner_labels")
    coref_label_field = SequenceLabelField(span_coref_labels,
                                           span_field,
                                           label_namespace="coref_labels")
    # feili
    span_label_field = SequenceLabelField(span_labels,
                                          span_field,
                                          label_namespace="span_labels")

    # Generate labels for relations and arguments. Only store non-null values.
    # For the arguments, by convention the first span specifies the trigger, and the second
    # specifies the argument. Ideally we'd have an adjacency field between (token, span) pairs
    # for the event arguments field, but AllenNLP doesn't make it possible to express
    # adjacencies between two different sequences.
    span_tuples = [(span.span_start, span.span_end) for span in spans]

    relations = []
    relation_indices = []
    for i, first_span in enumerate(span_tuples):
        for j, second_span in enumerate(span_tuples):
            relation_label = relation_dict[(first_span, second_span)]
            if relation_label:
                relation_indices.append((i, j))
                relations.append(relation_label)

    relation_label_field = AdjacencyField(indices=relation_indices,
                                          sequence_field=span_field,
                                          labels=relations,
                                          label_namespace="relation_labels")

    arguments = []
    argument_indices = []
    for i in range(n_tokens):
        for j, span_tuple in enumerate(span_tuples):
            argument_label = argument_dict[(i, span_tuple)]
            if argument_label:
                argument_indices.append((i, j))
                arguments.append(argument_label)

    argument_label_field = AdjacencyFieldAssym(indices=argument_indices,
                                               row_field=text_field,
                                               col_field=span_field,
                                               labels=arguments,
                                               label_namespace="argument_labels")

    # Syntax
    span_syntax_field = SequenceLabelField(span_syntax_labels,
                                           span_field,
                                           label_namespace="span_syntax_labels")
    span_children_field = ListField(span_children_labels)
    span_tree_field = SequenceLabelField(span_tree_labels,
                                         span_field,
                                         label_namespace="span_tree_labels")
    dep_span_children_field = AdjacencyField(indices=dep_adjs_indices,
                                             sequence_field=text_field,
                                             labels=dep_adjs,
                                             label_namespace="dep_adj_labels")
    tf_field = AdjacencyField(indices=tf_indices,
                              sequence_field=text_field,
                              labels=tf_features,
                              label_namespace="tf_labels")

    # Pull it all together.
    fields = dict(text=text_field_with_context,
                  spans=span_field,
                  ner_labels=ner_label_field,
                  coref_labels=coref_label_field,
                  trigger_labels=trigger_label_field,
                  argument_labels=argument_label_field,
                  relation_labels=relation_label_field,
                  metadata=metadata_field,
                  span_labels=span_label_field,
                  ner_sequence_labels=ner_sequence_label_field,
                  syntax_labels=span_syntax_field,
                  span_children=span_children_field,
                  span_tree_labels=span_tree_field,
                  dep_span_children=dep_span_children_field,
                  tf=tf_field)

    return Instance(fields)
def text_to_instance(self,
                     question_text: str,
                     passage_text: str,
                     passage_tokens: List[Token],
                     numbers_in_passage: List[Any],
                     number_words: List[str],
                     number_indices: List[int],
                     number_len: List[int],
                     question_id: str = None,
                     passage_id: str = None,
                     answer_annotations: List[Dict] = None,
                     specific_answer_type: str = None) -> Optional[Instance]:
    """
    Build an ``Instance`` for one question over an already tokenized passage.

    The model input is ``[CLS] question [SEP] passage [SEP]``; the passage is clipped so the
    whole sequence fits in ``self.max_pieces`` tokens.

    Parameters
    ----------
    question_text : raw question; tokenized here with ``self.tokenizer``.
    passage_text : raw passage, stored in the metadata only.
    passage_tokens : passage tokens.
    numbers_in_passage, number_indices, number_len : parallel lists describing the numbers of
        the passage (value, index of the first token in ``passage_tokens``, number of tokens).
        They are not modified in place.
    number_words : stored in the metadata only.
    question_id, passage_id : stored in the metadata only.
    answer_annotations : only the first annotation is used to build the answer fields; it is
        read through its ``'yesno'`` and ``'unit'`` keys and through
        ``DropReader.extract_answer_info_from_annotation``. When falsy, no answer field is added.
    specific_answer_type : compared against ``SPAN_ANSWER_TYPES``, ``MULTIPLE_SPAN`` and
        ``self.bio_types`` to decide which answer heads get supervision.

    Returns
    -------
    The ``Instance``, or ``None`` when the example is discarded:

    * training, ``specific_answer_type`` is a span type, and one of the answer texts cannot be
      found in the question/passage;
    * ``self.discard_impossible_number_questions`` is set, training, a ``"number"`` answer that
      can be reached neither by an add/sub expression nor by counting, and either the answer
      does not occur as a span or
      ``self.keep_impossible_number_questions_which_exist_as_spans`` is off.
    """
    # Tokenize question and passage
    question_tokens = self.tokenizer.tokenize(question_text)
    question_tokens = fill_token_indices(question_tokens, question_text,
                                         self._uncased, self.basic_tokenizer)

    qlen = len(question_tokens)

    qp_tokens = [Token('[CLS]')] + question_tokens + [Token('[SEP]')] + passage_tokens

    # if qp has more than max_pieces tokens (including CLS and SEP), clip the passage
    max_passage_length = -1
    if len(qp_tokens) > self.max_pieces - 1:
        qp_tokens = qp_tokens[:self.max_pieces - 1]
        passage_tokens = passage_tokens[:self.max_pieces - qlen - 3]
        plen = len(passage_tokens)
        number_indices, number_len, numbers_in_passage = \
            clipped_passage_num(number_indices, number_len, numbers_in_passage, plen)
        max_passage_length = token_to_span(passage_tokens[-1])[1] if plen > 0 else 0

    qp_tokens += [Token('[SEP]')]

    # update the indices of the numbers with respect to the question.
    # Not done in-place so they won't change the numbers saved for the passage
    number_indices = [index + qlen + 2 for index in number_indices] + [-1]
    number_len = number_len + [1]
    numbers_in_passage = numbers_in_passage + [0]
    number_tokens = [Token(str(number)) for number in numbers_in_passage]
    extra_number_tokens = [Token(str(num)) for num in self.extra_numbers]

    # Positions of [CLS], the first [SEP] and the final [SEP].
    mask_indices = [0, qlen + 1, len(qp_tokens) - 1]

    fields: Dict[str, Field] = {}

    # Add feature fields
    qp_field = TextField(qp_tokens, self.token_indexers)
    fields["question_passage"] = qp_field

    number_token_indices = \
        [ArrayField(np.arange(start_ind, start_ind + number_len[i]), padding_value=-1)
         for i, start_ind in enumerate(number_indices)]
    fields["number_indices"] = ListField(number_token_indices)
    numbers_in_passage_field = TextField(number_tokens, self.token_indexers)
    extra_numbers_field = TextField(extra_number_tokens, self.token_indexers)
    mask_index_fields: List[Field] = [IndexField(index, qp_field) for index in mask_indices]
    fields["mask_indices"] = ListField(mask_index_fields)

    # Compile question, passage, answer metadata
    metadata = {"original_passage": passage_text,
                "original_question": question_text,
                "original_numbers": numbers_in_passage,
                "original_number_words": number_words,
                "extra_numbers": self.extra_numbers,
                "passage_tokens": passage_tokens,
                "question_tokens": question_tokens,
                "question_passage_tokens": qp_tokens,
                "passage_id": passage_id,
                "question_id": question_id,
                "max_passage_length": max_passage_length}

    # in a word broken up into pieces, every piece except the first should be ignored when calculating the loss
    wordpiece_mask = [not token.text.startswith('##') for token in qp_tokens]
    wordpiece_mask = np.array(wordpiece_mask)
    fields['bio_wordpiece_mask'] = ArrayField(wordpiece_mask, dtype=np.int64)

    if answer_annotations:
        # Get answer type, answer text, tokenize
        # For multi-span, remove repeating answers. Although possible, in the dataset it is mostly mistakes.
        if answer_annotations[0]['yesno']:
            answer_type = YESNO_ANSER_TYPE
            # NOTE(review): for yes/no answers ``answer_texts`` is a plain string, so the
            # loops over it below iterate its characters - confirm this is intended.
            answer_texts = 'true' if answer_annotations[0]['yesno'] == '1' else 'false'
        else:
            answer_type, answer_texts = \
                DropReader.extract_answer_info_from_annotation(answer_annotations[0])
        if answer_type == SPAN_ANSWER_TYPE:
            answer_texts = list(OrderedDict.fromkeys(answer_texts))
        tokenized_answer_texts = []
        for answer_text in answer_texts:
            answer_tokens = self.tokenizer.tokenize(answer_text)
            tokenized_answer_text = ' '.join(token.text for token in answer_tokens)
            if tokenized_answer_text not in tokenized_answer_texts and tokenized_answer_text != '':
                tokenized_answer_texts.append(tokenized_answer_text)

        metadata["answer_annotations"] = answer_annotations
        metadata["answer_texts"] = answer_texts
        metadata["answer_tokens"] = tokenized_answer_texts

        # Find unit text in question
        if answer_annotations[0]['unit'] != '':
            valid_unit_spans = DropReader.find_valid_spans(question_tokens,
                                                           [answer_annotations[0]['unit']])
            # index + 1 since there is an CLS token at the front
            valid_unit_spans = [(unit_span[0] + 1, unit_span[1] + 1)
                                for unit_span in valid_unit_spans]
        else:
            valid_unit_spans = []

        # Find answer text in question and passage, shifted to positions in qp_tokens
        # (+1 for [CLS] in the question, +qlen+2 for [CLS], question and [SEP] in the passage).
        valid_question_spans = [
            (span[0] + 1, span[1] + 1)
            for span in DropReader.find_valid_spans(question_tokens, tokenized_answer_texts)]
        valid_passage_spans = [
            (span[0] + qlen + 2, span[1] + qlen + 2)
            for span in DropReader.find_valid_spans(passage_tokens, tokenized_answer_texts)]

        # throw away an instance in training if a span appearing in the answer is missing from the question and passage
        if self._is_training:
            if specific_answer_type in SPAN_ANSWER_TYPES:
                for tokenized_answer_text in tokenized_answer_texts:
                    temp_spans = DropReader.find_valid_spans(qp_field, [tokenized_answer_text])
                    if len(temp_spans) == 0:
                        return None

        # Whether the non-tagging heads (numbers, single spans) may answer this question.
        all_heads_may_answer = (specific_answer_type != MULTIPLE_SPAN
                                or self.multispan_allow_all_heads_to_answer)

        # Get target numbers
        target_numbers = []
        if all_heads_may_answer:
            for answer_text in answer_texts:
                number = self.word_to_num(answer_text, self.improve_number_extraction)
                if number is not None:
                    target_numbers.append(number)

        # Get possible ways to arrive at target numbers with add/sub
        valid_expressions: List[List[int]] = []
        if answer_type in ["number", "date"]:
            if self.target_number_rounding:
                valid_expressions = \
                    find_valid_add_sub_expressions_with_rounding(
                        self.extra_numbers + numbers_in_passage,
                        target_numbers,
                        self.max_numbers_expression)
            else:
                valid_expressions = \
                    DropReader.find_valid_add_sub_expressions(
                        self.extra_numbers + numbers_in_passage,
                        target_numbers,
                        self.max_numbers_expression)
            if len(target_numbers) == 0:
                # This used to drop into an interactive debugger, which stalls any
                # non-interactive run. Report the example and keep going instead.
                import logging
                logging.getLogger(__name__).warning(
                    "No target number could be extracted from the answer texts %r "
                    "(question_id=%s, answer_type=%s)",
                    answer_texts, question_id, answer_type)

        if self.discard_impossible_number_questions:
            # The train set was verified to have all of its target_numbers lists of length 1.
            # An empty list is treated like an unreachable number instead of raising
            # IndexError on ``target_numbers[0]``.
            if (answer_type == "number" and
                    len(valid_expressions) == 0 and
                    self._is_training and
                    (len(target_numbers) == 0 or self.max_count < target_numbers[0])):
                # The number to predict can't be derived from any head, so we shouldn't train on it.
                # arithmetic - no expressions that yield the number to predict.
                # counting - the maximal count is smaller than the number to predict.

                # However, although the answer is marked in the dataset as a number type answer,
                # maybe it cannot be found due to a bug in DROP's text parsing.
                # So in addition, we try to find the answer as a span in the text.
                # If the answer is indeed a span in the text, we don't discard that question.
                if len(valid_question_spans) == 0 and len(valid_passage_spans) == 0:
                    return None
                if not self.keep_impossible_number_questions_which_exist_as_spans:
                    return None

        # Get possible ways to arrive at target numbers with counting
        valid_counts: List[int] = []
        if answer_type in ["number"]:
            numbers_for_count = list(range(self.max_count + 1))
            valid_counts = DropReader.find_valid_counts(numbers_for_count, target_numbers)

        # -1 means "not a yes/no question".
        valid_yesno: int = -1
        if answer_type in ["yesno"]:
            valid_yesno = 1 if answer_texts == 'true' else 0

        # Update metadata with answer info
        answer_info = {"answer_passage_spans": valid_passage_spans,
                       "answer_question_spans": valid_question_spans,
                       "expressions": valid_expressions,
                       "counts": valid_counts,
                       "unit": valid_unit_spans,
                       "yesno": valid_yesno}
        metadata["answer_info"] = answer_info

        # Add answer fields. Every list gets a (-1, -1) / zero / -1 placeholder when
        # empty, so the corresponding ListField is never empty.
        passage_span_fields: List[Field] = []
        if all_heads_may_answer:
            passage_span_fields = [SpanField(span[0], span[1], qp_field)
                                   for span in valid_passage_spans]
        if not passage_span_fields:
            passage_span_fields.append(SpanField(-1, -1, qp_field))
        fields["answer_as_passage_spans"] = ListField(passage_span_fields)

        question_span_fields: List[Field] = []
        if all_heads_may_answer:
            question_span_fields = [SpanField(span[0], span[1], qp_field)
                                    for span in valid_question_spans]
        if not question_span_fields:
            question_span_fields.append(SpanField(-1, -1, qp_field))
        fields["answer_as_question_spans"] = ListField(question_span_fields)

        add_sub_signs_field: List[Field] = []
        extra_signs_field: List[Field] = []
        n_extra = len(self.extra_numbers)
        for signs_for_one_add_sub_expressions in valid_expressions:
            # The signs of the extra numbers come first, then those of the passage numbers.
            extra_signs = signs_for_one_add_sub_expressions[:n_extra]
            normal_signs = signs_for_one_add_sub_expressions[n_extra:]
            add_sub_signs_field.append(SequenceLabelField(normal_signs, numbers_in_passage_field))
            extra_signs_field.append(SequenceLabelField(extra_signs, extra_numbers_field))
        if not add_sub_signs_field:
            add_sub_signs_field.append(
                SequenceLabelField([0] * len(number_tokens), numbers_in_passage_field))
        if not extra_signs_field:
            extra_signs_field.append(
                SequenceLabelField([0] * len(self.extra_numbers), extra_numbers_field))
        fields["answer_as_expressions"] = ListField(add_sub_signs_field)
        if self.extra_numbers:
            fields["answer_as_expressions_extra"] = ListField(extra_signs_field)

        # Add unit_field
        unit_span_fields: List[Field] = [SpanField(span[0], span[1], qp_field)
                                         for span in valid_unit_spans]
        if not unit_span_fields:
            unit_span_fields.append(SpanField(-1, -1, qp_field))
        fields["answer_as_unit_spans"] = ListField(unit_span_fields)

        count_fields: List[Field] = [LabelField(count_label, skip_indexing=True)
                                     for count_label in valid_counts]
        if not count_fields:
            count_fields.append(LabelField(-1, skip_indexing=True))
        fields["answer_as_counts"] = ListField(count_fields)

        yesno_field: List[Field] = [LabelField(valid_yesno, skip_indexing=True)]
        fields["answer_as_yesno"] = ListField(yesno_field)

        no_answer_bios = SequenceLabelField([0] * len(qp_tokens), sequence_field=qp_field)
        if (specific_answer_type in self.bio_types) and \
                (len(valid_passage_spans) > 0 or len(valid_question_spans) > 0):

            # Used for flexible BIO loss
            # START
            spans_dict = {}
            text_to_disjoint_bios: List[ListField] = []
            flexibility_count = 1
            for tokenized_answer_text in tokenized_answer_texts:
                spans = DropReader.find_valid_spans(qp_tokens, [tokenized_answer_text])
                if len(spans) == 0:
                    # possible if the passage was clipped, but not for all of the answers
                    continue
                spans_dict[tokenized_answer_text] = spans

                disjoint_bios: List[SequenceLabelField] = []
                for span in spans:
                    bios = create_bio_labels([span], len(qp_field))
                    disjoint_bios.append(SequenceLabelField(bios, sequence_field=qp_field))

                text_to_disjoint_bios.append(ListField(disjoint_bios))
                # Number of non-empty subsets of the occurrences of this answer text.
                flexibility_count *= ((2 ** len(spans)) - 1)

            fields["answer_as_text_to_disjoint_bios"] = ListField(text_to_disjoint_bios)

            if flexibility_count < self.flexibility_threshold:
                # generate all non-empty span combinations per each text
                spans_combinations_dict = {}
                for key, spans in spans_dict.items():
                    spans_combinations_dict[key] = all_combinations = []
                    for i in range(1, len(spans) + 1):
                        all_combinations += list(itertools.combinations(spans, i))

                # calculate product between all the combinations per each text
                packed_gold_spans_list = itertools.product(
                    *list(spans_combinations_dict.values()))
                bios_list: List[SequenceLabelField] = []
                for packed_gold_spans in packed_gold_spans_list:
                    gold_spans = [s for sublist in packed_gold_spans for s in sublist]
                    bios = create_bio_labels(gold_spans, len(qp_field))
                    bios_list.append(SequenceLabelField(bios, sequence_field=qp_field))

                fields["answer_as_list_of_bios"] = ListField(bios_list)
                fields["answer_as_text_to_disjoint_bios"] = \
                    ListField([ListField([no_answer_bios])])
            else:
                fields["answer_as_list_of_bios"] = ListField([no_answer_bios])
            # END

            # Used for both "require-all" BIO loss and flexible loss
            bio_labels = create_bio_labels(valid_question_spans + valid_passage_spans,
                                           len(qp_field))
            fields['span_bio_labels'] = SequenceLabelField(bio_labels, sequence_field=qp_field)

            fields["is_bio_mask"] = LabelField(1, skip_indexing=True)
        else:
            fields["answer_as_text_to_disjoint_bios"] = ListField([ListField([no_answer_bios])])
            fields["answer_as_list_of_bios"] = ListField([no_answer_bios])

            # create all 'O' BIO labels for non-span questions
            fields['span_bio_labels'] = no_answer_bios
            fields["is_bio_mask"] = LabelField(0, skip_indexing=True)

    fields["metadata"] = MetadataField(metadata)

    return Instance(fields)
def predictions_to_labeled_instances(
        self, instance: Instance,
        outputs: Dict[str, numpy.ndarray]) -> List[Instance]:
    """
    This function currently only handles BIOUL tags.

    Imagine an NER model predicts three named entities (each one with potentially
    multiple tokens). For each individual entity, we create a new Instance that has
    the label set to only that entity and the rest of the tokens are labeled as outside.
    We then return a list of those Instances.

    For example:

    ```text
    Mary  went to Seattle to visit Microsoft Research
    U-Per  O    O   U-Loc  O   O     B-Org     L-Org
    ```

    We create three instances.

    ```text
    Mary  went to Seattle to visit Microsoft Research
    U-Per  O    O    O     O   O       O         O

    Mary  went to Seattle to visit Microsoft Research
    O      O    O   U-LOC  O   O       O         O

    Mary  went to Seattle to visit Microsoft Research
    O      O    O    O     O   O     B-Org     L-Org
    ```

    We additionally add a flag to these instances to tell the model to only compute loss on
    non-O tags, so that we get gradients that are specific to the particular span prediction
    that each instance represents.

    Parameters
    ----------
    instance : the instance the prediction was made for; it is deep-copied once per entity.
    outputs : model outputs; only ``outputs["tags"]``, the sequence of predicted tag
        strings, is read.

    Returns
    -------
    One new ``Instance`` per predicted entity, each with a ``tags`` field and an
    ``ignore_loss_on_o_tags`` flag. Empty when no ``U-`` or ``B-`` tag was predicted.

    A ``B-`` tag that is never closed by an ``L-`` tag (possible when decoding is not
    constrained) yields a span that runs to the end of the sentence instead of raising
    ``IndexError``.
    """
    predicted_tags = outputs["tags"]
    num_tags = len(predicted_tags)
    predicted_spans = []

    i = 0
    while i < num_tags:
        tag = predicted_tags[i]
        # if its a U, add it to the list
        if tag[0] == "U":
            current_tags = [
                t if idx == i else "O" for idx, t in enumerate(predicted_tags)
            ]
            predicted_spans.append(current_tags)
        # if its a B, keep going until you hit an L.
        elif tag[0] == "B":
            begin_idx = i
            # The bound on ``i`` stops the scan at the last tag when no L follows.
            while tag[0] != "L" and i + 1 < num_tags:
                i += 1
                tag = predicted_tags[i]
            end_idx = i
            current_tags = [
                t if begin_idx <= idx <= end_idx else "O"
                for idx, t in enumerate(predicted_tags)
            ]
            predicted_spans.append(current_tags)
        i += 1

    # Creates a new instance for each contiguous tag
    instances = []
    for labels in predicted_spans:
        new_instance = deepcopy(instance)
        text_field: TextField = instance["tokens"]  # type: ignore
        new_instance.add_field("tags",
                               SequenceLabelField(labels, text_field),
                               self._model.vocab)
        new_instance.add_field("ignore_loss_on_o_tags", FlagField(True))
        instances.append(new_instance)

    return instances
def text_to_instance(self, source_string: str,
                     gold_spans: Dict[Tuple[int, int], str],
                     scene_string: str, answer: str,
                     program: str) -> Instance:  # type: ignore
    """
    Build an ``Instance`` from a raw utterance and, optionally, its gold span labels.

    Every span enumerated over the word pieces of ``source_string`` becomes a ``SpanField``
    (shifted by one for the leading ``[CLS]``). With ``gold_spans`` each span is labeled from
    that mapping, ``"NO-LABEL"`` when absent. Without it, placeholder labels are produced:
    first the entries popped from ``constants[0]`` (as returned by
    ``self._domain_utils.get_constants(program)``), then random draws between ``"NO-LABEL"``
    and ``"span"``.

    The instance holds the fields ``tokens``, ``spans``, ``span_labels`` and ``metadata``.
    """
    tokens = self.tokenizer.tokenize(source_string)
    word_pieces = self._get_wordpieces(source_string)

    wrapped_pieces = [Token('[CLS]')]
    wrapped_pieces.extend(Token(piece) for piece in word_pieces)
    wrapped_pieces.append(Token('[SEP]'))

    text_field = TextField(tokens, self._token_indexers)
    wp_field = TextField(wrapped_pieces, self._token_indexers)

    has_gold = gold_spans is not None
    if not has_gold:
        constants = self._domain_utils.get_constants(program)

    span_fields: List[Field] = []
    labels = []
    for start, end in enumerate_spans(word_pieces):
        # Shift by 1 due to CLS token
        shifted = (start + 1, end + 1)
        span_fields.append(SpanField(shifted[0], shifted[1], wp_field))

        if has_gold:
            labels.append(gold_spans.get(shifted, "NO-LABEL"))
            continue

        # Create random labels for each span so that labels would be collected. When no
        # more true labels are left, draw between NO-LABEL and span. These randomly assigned
        # labels would be ignored during training
        if constants[0]:
            labels.append(constants[0].pop())
            continue
        drawn = np.random.choice(a=["NO-LABEL", "span"], size=1, p=[0.7, 0.3])
        labels.append(drawn[0])

    span_list_field: ListField = ListField(span_fields)

    metadata = {"tokens": word_pieces}
    metadata["scene_str"] = scene_string
    metadata["answer"] = answer
    # Optional entries are only stored when they are non-empty.
    if program:
        metadata["program"] = program
    if gold_spans:
        metadata["gold_spans"] = gold_spans

    fields: Dict[str, Field] = {
        "tokens": text_field,
        "spans": span_list_field,
        "span_labels": SequenceLabelField(labels,
                                          span_list_field,
                                          label_namespace="labels"),
        "metadata": MetadataField(metadata),
    }
    return Instance(fields)
def text_to_instance(
        self,  # type: ignore
        sentences: List[List[str]],
        gold_clusters: Optional[List[List[Tuple[int, int]]]] = None,
        *,
        mention_token_spans: Optional[Sequence[Tuple[int, int]]] = None
) -> Instance:
    # pylint: disable=arguments-differ
    """
    Build a coreference ``Instance`` for one document.

    Parameters
    ----------
    sentences : ``List[List[str]]``, required.
        The tokenized sentences of the document. Every word goes through
        ``self._normalize_word`` and the sentences are flattened into one token sequence.
    gold_clusters : ``Optional[List[List[Tuple[int, int]]]]``, optional (default = None)
        The clusters of the document, each a list of mention spans. The position of a
        cluster in this list is used as its id.
    mention_token_spans : optional, keyword-only
        Known mention spans to use instead of enumerating candidate spans. They are passed
        to ``SpanField`` as given.

    Returns
    -------
    An ``Instance`` containing the following ``Fields``:
        text : ``TextField``
            The flattened document.
        spans : ``ListField[SpanField]``
            The candidate spans: either ``mention_token_spans``, or every span of each
            sentence up to ``self._max_span_width``.
        metadata : ``MetadataField``
            ``original_text`` and, when given, ``clusters``.
        span_labels : ``SequenceLabelField``, only with ``gold_clusters``
            For each span its cluster id, or -1 when it is in no cluster.

    Raises
    ------
    NotImplementedError
        When both ``mention_token_spans`` and ``gold_clusters`` are given.
    """
    have_gold = gold_clusters is not None

    flattened_sentences = []
    for sentence in sentences:
        for word in sentence:
            flattened_sentences.append(self._normalize_word(word))

    text_field = TextField([Token(word) for word in flattened_sentences],
                           self._token_indexers)

    metadata: Dict[str, Any] = {"original_text": flattened_sentences}
    # Mention span -> id of the cluster it belongs to.
    cluster_dict = {}
    if have_gold:
        metadata["clusters"] = gold_clusters
        for cluster_id, cluster in enumerate(gold_clusters):
            for mention in cluster:
                cluster_dict[tuple(mention)] = cluster_id

    span_labels: Optional[List[int]] = [] if have_gold else None

    if mention_token_spans is not None:
        if have_gold:
            raise NotImplementedError(
                "We currently don't handle known mentions plus "
                "gold labels")
        # the mentions spans are already known; we just need to make SpanFields for them
        span_fields: List[Field] = [
            SpanField(start, end, text_field)
            for (start, end) in mention_token_spans
        ]
    else:
        # every possible span in the document up to a certain maximum size is a
        # mention candidate
        span_fields = []
        sentence_offset = 0
        for sentence in sentences:
            candidates = enumerate_spans(sentence,
                                         offset=sentence_offset,
                                         max_span_width=self._max_span_width)
            for start, end in candidates:
                if have_gold:
                    span_labels.append(cluster_dict.get((start, end), -1))
                span_fields.append(SpanField(start, end, text_field))
            sentence_offset += len(sentence)

    span_field = ListField(span_fields)

    fields: Dict[str, Field] = {
        "text": text_field,
        "spans": span_field,
        "metadata": MetadataField(metadata),
    }
    if have_gold:
        fields["span_labels"] = SequenceLabelField(span_labels, span_field)

    return Instance(fields)
def text_to_instance(
        self,  # type: ignore
        tokens: List[Token],
        pos_tags: List[str] = None,
        chunk_tags: List[str] = None,
        ner_tags: List[str] = None) -> Instance:
    """
    Build a tagging ``Instance`` from `pre-tokenized` input (this class has no
    tokenizer of its own).

    The instance always carries ``tokens`` and ``metadata`` (the token texts
    under ``"words"``). Each label type named in ``self.feature_labels`` is
    added as ``pos_tags`` / ``chunk_tags`` / ``ner_tags``, and the label type
    named by ``self.tag_label`` is added as ``tags`` when it was supplied.
    Chunk and NER tags are recoded with ``to_bioul`` when
    ``self.coding_scheme`` is ``"BIOUL"``.

    Raises ``ConfigurationError`` if a label type requested in
    ``self.feature_labels`` was not passed in.
    """
    # pylint: disable=arguments-differ
    sequence = TextField(tokens, self._token_indexers)
    instance_fields: Dict[str, Field] = {
        'tokens': sequence,
        "metadata": MetadataField({"words": [x.text for x in tokens]})
    }

    # Recode the labels if necessary; otherwise keep them as given
    # (the default IOB1).
    coded_chunks, coded_ner = chunk_tags, ner_tags
    if self.coding_scheme == "BIOUL":
        if chunk_tags is not None:
            coded_chunks = to_bioul(chunk_tags,
                                    encoding=self._original_coding_scheme)
        if ner_tags is not None:
            coded_ner = to_bioul(ner_tags,
                                 encoding=self._original_coding_scheme)

    # Add "feature labels" to instance. Each row is:
    # (feature name, field name / label namespace, labels, error if missing)
    feature_specs = (
        ('pos', 'pos_tags', pos_tags,
         "Dataset reader was specified to use pos_tags as "
         "features. Pass them to text_to_instance."),
        ('chunk', 'chunk_tags', coded_chunks,
         "Dataset reader was specified to use chunk tags as "
         "features. Pass them to text_to_instance."),
        ('ner', 'ner_tags', coded_ner,
         "Dataset reader was specified to use NER tags as "
         " features. Pass them to text_to_instance."),
    )
    for feature, field_name, labels, missing_message in feature_specs:
        if feature not in self.feature_labels:
            continue
        if labels is None:
            raise ConfigurationError(missing_message)
        instance_fields[field_name] = SequenceLabelField(
            labels, sequence, field_name)

    # Add "tag label" to instance, if the matching labels were provided.
    if self.tag_label == 'ner':
        tag_source = coded_ner
    elif self.tag_label == 'pos':
        tag_source = pos_tags
    elif self.tag_label == 'chunk':
        tag_source = coded_chunks
    else:
        tag_source = None
    if tag_source is not None:
        instance_fields['tags'] = SequenceLabelField(
            tag_source, sequence, self.label_namespace)

    return Instance(instance_fields)
def text_to_instance(
        self,
        question_text: str,
        passage_text: str,
        passage_tokens: List[Token],
        passage_spans: List[Tuple[int, int]],
        numbers_in_passage: List[Any],
        number_words: List[str],
        number_indices: List[int],
        number_len: List[int],
        question_id: str = None,
        passage_id: str = None,
        answer_annotations: List[Dict] = None) -> Union[Instance, None]:
    """
    Build an ``Instance`` for one question over an already-tokenized passage.

    The question is tokenized with ``self.tokenizer`` and joined with the
    passage as ``[CLS] question [SEP] passage [SEP]``. If that sequence would
    exceed ``self.max_pieces``, the passage (and the passage numbers, via
    ``clipped_passage_num``) is truncated.

    Parameters
    ----------
    question_text : ``str``
        Raw question; tokenized here.
    passage_text : ``str``
        Raw passage; only stored in the metadata.
    passage_tokens : ``List[Token]``
        The passage, already tokenized.
    passage_spans : ``List[Tuple[int, int]]``
        Candidate spans over ``passage_tokens``. Only used when
        ``self.extract_spans`` is set. The code subtracts 1 from each end
        index to make it inclusive, so ends are presumably exclusive on input.
    numbers_in_passage : ``List[Any]``
        Values of the numbers found in the passage.
    number_words : ``List[str]``
        Only stored in the metadata as ``"original_number_words"``.
    number_indices : ``List[int]``
        Start index of each number within ``passage_tokens``.
    number_len : ``List[int]``
        Number of consecutive tokens each number covers, starting at the
        matching entry of ``number_indices``.
    question_id, passage_id : ``str``, optional
        Only stored in the metadata.
    answer_annotations : ``List[Dict]``, optional
        Gold answers. When truthy, answer fields are added. NOTE: each
        annotation's ``'spans'`` entry is rewritten in place.

    Returns
    -------
    An ``Instance`` with ``question_passage``, ``number_indices``,
    ``mask_indices`` and ``metadata``; plus ``passage_spans`` when
    ``self.extract_spans`` is set; plus, when answers are given,
    ``answer_as_passage_spans``, ``answer_as_question_spans``,
    ``answer_as_expressions`` (for ``exp_search`` in ``'add_sub'``,
    ``'template'``, ``'full'``), ``answer_as_expressions_extra`` (``'add_sub'``
    with non-empty ``self.extra_numbers``), ``answer_as_counts`` and
    ``num_spans``. Every path visible here returns an ``Instance``, despite the
    ``Union[Instance, None]`` annotation.
    """
    # Tokenize question and passage
    question_tokens = self.tokenizer.tokenize(question_text)
    qlen = len(question_tokens)
    plen = len(passage_tokens)

    question_passage_tokens = [Token('[CLS]')] + question_tokens + [
        Token('[SEP]')
    ] + passage_tokens
    # Keep one slot free for the closing [SEP] appended below.
    if len(question_passage_tokens) > self.max_pieces - 1:
        question_passage_tokens = question_passage_tokens[:self.max_pieces - 1]
        # max_pieces minus [CLS], the question, and both [SEP] tokens.
        passage_tokens = passage_tokens[:self.max_pieces - qlen - 3]
        plen = len(passage_tokens)
        # presumably drops numbers that fall in the truncated part -- helper
        # is defined elsewhere, confirm there
        number_indices, number_len, numbers_in_passage = \
            clipped_passage_num(number_indices, number_len, numbers_in_passage, plen)

    question_passage_tokens += [Token('[SEP]')]
    # Passage token i sits at position i + qlen + 2 in the joint sequence
    # ([CLS] + question + [SEP] come first). A trailing placeholder number
    # (index -1, length 1, value 0) is appended to all three number lists.
    number_indices = [index + qlen + 2 for index in number_indices] + [-1]
    # Not done in-place so they won't change the numbers saved for the passage
    number_len = number_len + [1]
    numbers_in_passage = numbers_in_passage + [0]
    number_tokens = [Token(str(number)) for number in numbers_in_passage]
    extra_number_tokens = [Token(str(num)) for num in self.extra_numbers]

    # Positions of [CLS], the first [SEP] and the final [SEP].
    mask_indices = [0, qlen + 1, len(question_passage_tokens) - 1]

    if self.extract_spans:
        # adapt indexes to question_passage_tokens sequence
        passage_spans = [(span[0] + qlen + 2, span[1] + qlen + 2)
                         for span in passage_spans]
        # remove spans of truncated part of passage
        passage_spans = [
            span for span in passage_spans
            if span[1] <= len(question_passage_tokens)
        ]
        # make span indexes inclusive
        passage_spans = [(span[0], span[1] - 1) for span in passage_spans]

    fields: Dict[str, Field] = {}

    # Add feature fields
    question_passage_field = TextField(question_passage_tokens,
                                       self.token_indexers)
    fields["question_passage"] = question_passage_field

    if self.extract_spans:
        passage_span_fields = [
            SpanField(span[0], span[1], question_passage_field)
            for span in passage_spans
        ]
        fields["passage_spans"] = ListField(passage_span_fields)

    # One array per number listing the joint-sequence positions it covers;
    # the placeholder number yields the single index -1.
    number_token_indices = \
        [ArrayField(np.arange(start_ind, start_ind + number_len[i]), padding_value=-1)
         for i, start_ind in enumerate(number_indices)]
    fields["number_indices"] = ListField(number_token_indices)
    # These TextFields are not added to `fields`; the first two serve as the
    # sequences the add/sub sign labels below are attached to.
    numbers_in_passage_field = TextField(number_tokens, self.token_indexers)
    extra_numbers_field = TextField(extra_number_tokens, self.token_indexers)
    # NOTE(review): all_numbers_field is not referenced again in this method.
    all_numbers_field = TextField(extra_number_tokens + number_tokens,
                                  self.token_indexers)
    mask_index_fields: List[Field] = [
        IndexField(index, question_passage_field) for index in mask_indices
    ]
    fields["mask_indices"] = ListField(mask_index_fields)

    # Compile question, passage, answer metadata
    metadata = {
        "original_passage": passage_text,
        "original_question": question_text,
        "original_numbers": numbers_in_passage,
        "original_number_words": number_words,
        "extra_numbers": self.extra_numbers,
        "passage_tokens": passage_tokens,
        "question_tokens": question_tokens,
        "question_passage_tokens": question_passage_tokens,
        "passage_id": passage_id,
        "question_id": question_id
    }

    if answer_annotations:
        # Re-tokenize every gold span and rebuild its text with
        # tokenlist_to_passage. This overwrites annotation['spans'] on the
        # caller's dicts.
        for annotation in answer_annotations:
            tokenized_spans = [[
                token.text for token in self.tokenizer.tokenize(answer)
            ] for answer in annotation['spans']]
            annotation['spans'] = [
                tokenlist_to_passage(token_list)
                for token_list in tokenized_spans
            ]

        # Get answer type, answer text, tokenize
        # Only the first annotation is used to derive the training targets.
        answer_type, answer_texts = DropReader.extract_answer_info_from_annotation(
            answer_annotations[0])
        tokenized_answer_texts = []
        num_spans = min(len(answer_texts), self.max_spans)
        for answer_text in answer_texts:
            answer_tokens = self.tokenizer.tokenize(answer_text)
            tokenized_answer_texts.append(' '.join(
                token.text for token in answer_tokens))

        metadata["answer_annotations"] = answer_annotations
        metadata["answer_texts"] = answer_texts
        metadata["answer_tokens"] = tokenized_answer_texts

        # Find answer text in question and passage
        valid_question_spans = DropReader.find_valid_spans(
            question_tokens, tokenized_answer_texts)
        # Shift by 1 for the leading [CLS].
        for span_ind, span in enumerate(valid_question_spans):
            valid_question_spans[span_ind] = (span[0] + 1, span[1] + 1)
        valid_passage_spans = DropReader.find_valid_spans(
            passage_tokens, tokenized_answer_texts)
        # Shift by qlen + 2 for [CLS], the question and the first [SEP].
        for span_ind, span in enumerate(valid_passage_spans):
            valid_passage_spans[span_ind] = (span[0] + qlen + 2,
                                             span[1] + qlen + 2)

        # Get target numbers
        target_numbers = []
        for answer_text in answer_texts:
            number = self.word_to_num(answer_text)
            if number is not None:
                target_numbers.append(number)

        # Get possible ways to arrive at target numbers with add/sub
        # The search runs over self.extra_numbers followed by the passage
        # numbers (including the placeholder 0 appended above).
        valid_expressions: List[List[int]] = []
        exp_strings = None
        if answer_type in ["number", "date"]:
            if self.exp_search == 'full':
                expressions = get_full_exp(
                    list(enumerate(self.extra_numbers + numbers_in_passage)),
                    target_numbers, self.operations, self.op_dict,
                    self.max_depth)
                # Split the (expression, string) pairs into two parallel lists.
                zipped = list(zip(*expressions))
                if zipped:
                    valid_expressions = list(zipped[0])
                    exp_strings = list(zipped[1])
            elif self.exp_search == 'add_sub':
                valid_expressions = \
                    DropReader.find_valid_add_sub_expressions(self.extra_numbers + numbers_in_passage,
                                                              target_numbers,
                                                              self.max_numbers_expression)
            elif self.exp_search == 'template':
                valid_expressions, exp_strings = \
                    get_template_exp(self.extra_numbers + numbers_in_passage,
                                     target_numbers,
                                     self.templates,
                                     self.template_strings)
                # Flatten the list of lists of strings into one list.
                exp_strings = sum(exp_strings, [])

        # Get possible ways to arrive at target numbers with counting
        valid_counts: List[int] = []
        if answer_type in ["number"]:
            numbers_for_count = list(range(self.max_count + 1))
            valid_counts = DropReader.find_valid_counts(
                numbers_for_count, target_numbers)

        # Update metadata with answer info
        answer_info = {
            "answer_passage_spans": valid_passage_spans,
            "answer_question_spans": valid_question_spans,
            "num_spans": num_spans,
            "expressions": valid_expressions,
            "counts": valid_counts
        }
        if self.exp_search in ['template', 'full']:
            answer_info['expr_text'] = exp_strings
        metadata["answer_info"] = answer_info

        # Add answer fields
        # Each list gets a (-1, -1) placeholder span when nothing matched.
        passage_span_fields: List[Field] = [
            SpanField(span[0], span[1], question_passage_field)
            for span in valid_passage_spans
        ]
        if not passage_span_fields:
            passage_span_fields.append(
                SpanField(-1, -1, question_passage_field))
        fields["answer_as_passage_spans"] = ListField(passage_span_fields)

        question_span_fields: List[Field] = [
            SpanField(span[0], span[1], question_passage_field)
            for span in valid_question_spans
        ]
        if not question_span_fields:
            question_span_fields.append(
                SpanField(-1, -1, question_passage_field))
        fields["answer_as_question_spans"] = ListField(
            question_span_fields)

        if self.exp_search == 'add_sub':
            add_sub_signs_field: List[Field] = []
            extra_signs_field: List[Field] = []
            # The first len(self.extra_numbers) signs belong to the extra
            # numbers, the remainder to the passage numbers.
            for signs_for_one_add_sub_expressions in valid_expressions:
                extra_signs = signs_for_one_add_sub_expressions[:len(
                    self.extra_numbers)]
                normal_signs = signs_for_one_add_sub_expressions[
                    len(self.extra_numbers):]
                add_sub_signs_field.append(
                    SequenceLabelField(normal_signs,
                                       numbers_in_passage_field))
                extra_signs_field.append(
                    SequenceLabelField(extra_signs, extra_numbers_field))
            # All-zero sign sequences stand in when no expression was found.
            if not add_sub_signs_field:
                add_sub_signs_field.append(
                    SequenceLabelField([0] * len(number_tokens),
                                       numbers_in_passage_field))
            if not extra_signs_field:
                extra_signs_field.append(
                    SequenceLabelField([0] * len(self.extra_numbers),
                                       extra_numbers_field))
            fields["answer_as_expressions"] = ListField(
                add_sub_signs_field)
            if self.extra_numbers:
                fields["answer_as_expressions_extra"] = ListField(
                    extra_signs_field)
        elif self.exp_search in ['template', 'full']:
            expression_indices = []
            for expression in valid_expressions:
                # An empty expression gets one [-1, -1, -1] row appended
                # in place before conversion to an array.
                if not expression:
                    expression.append(3 * [-1])
                expression_indices.append(
                    ArrayField(np.array(expression), padding_value=-1))
            # With no expressions at all, emit one placeholder array per
            # entry of self.templates.
            if not expression_indices:
                expression_indices = \
                    [ArrayField(np.array([3 * [-1]]), padding_value=-1) for _ in range(len(self.templates))]
            fields["answer_as_expressions"] = ListField(expression_indices)

        count_fields: List[Field] = [
            LabelField(count_label, skip_indexing=True)
            for count_label in valid_counts
        ]
        # -1 marks "no valid count".
        if not count_fields:
            count_fields.append(LabelField(-1, skip_indexing=True))
        fields["answer_as_counts"] = ListField(count_fields)

        fields["num_spans"] = LabelField(num_spans, skip_indexing=True)

    fields["metadata"] = MetadataField(metadata)

    return Instance(fields)
def text_to_instance(self,  # type: ignore
                     tokens: List[str],
                     pos_tags: List[str] = None,
                     gold_tree: Tree = None) -> Instance:
    """
    Build a constituency-parsing ``Instance`` from `pre-tokenized` input
    (this class has no tokenizer of its own).

    Parameters
    ----------
    tokens : ``List[str]``, required.
        The tokens in a given sentence.
    pos_tags : ``List[str]``, optional, (default = None).
        The POS tags for the words in the sentence.
    gold_tree : ``Tree``, optional (default = None).
        The gold parse tree to create span labels from.

    Returns
    -------
    An ``Instance`` containing the following fields:
        tokens : ``TextField``
            The tokens in the sentence.
        pos_tags : ``SequenceLabelField``
            The POS tags of the words in the sentence.
            Only returned if ``use_pos_tags`` is ``True``
        spans : ``ListField[SpanField]``
            A ListField containing all possible subspans of the
            sentence.
        span_labels : ``SequenceLabelField``, optional.
            Only returned if ``gold_tree`` is given. The constituency tag
            of each possible span with respect to the gold parse tree; a
            span not found among the gold spans gets a ``NO-LABEL`` label.

    Raises
    ------
    ConfigurationError
        If ``use_pos_tags`` is set but no ``pos_tags`` were passed.
    """
    # pylint: disable=arguments-differ
    text_field = TextField([Token(x) for x in tokens],
                           token_indexers=self._token_indexers)
    fields: Dict[str, Field] = {"tokens": text_field}

    if self._use_pos_tags:
        if pos_tags is None:
            raise ConfigurationError("use_pos_tags was set to True but no gold pos"
                                     " tags were passed to the dataset reader.")
        fields["pos_tags"] = SequenceLabelField(pos_tags, text_field, "pos_tags")

    # Collect labelled gold spans, leaving out any whose label contains "-POS".
    gold_spans = None
    if gold_tree is not None:
        labelled_spans: Dict[Tuple[int, int], str] = {}
        self._get_gold_spans(gold_tree, 0, labelled_spans)
        gold_spans = {
            span: label
            for (span, label) in labelled_spans.items()
            if "-POS" not in label
        }

    candidate_spans = list(enumerate_spans(tokens))
    spans: List[Field] = [
        SpanField(start, end, text_field) for start, end in candidate_spans
    ]
    gold_labels = []
    if gold_spans is not None:
        gold_labels = [
            gold_spans.get((start, end), "NO-LABEL")
            for start, end in candidate_spans
        ]

    span_list_field: ListField = ListField(spans)
    fields["spans"] = span_list_field
    if gold_tree is not None:
        fields["span_labels"] = SequenceLabelField(gold_labels, span_list_field)

    return Instance(fields)