def make_reading_comprehension_instance_quac(question_list_tokens: List[List[Token]],
                                             passage_tokens: List[Token],
                                             token_indexers: Dict[str, TokenIndexer],
                                             passage_text: str,
                                             token_span_lists: List[List[Tuple[int, int]]] = None,
                                             yesno_list: List[int] = None,
                                             followup_list: List[int] = None,
                                             additional_metadata: Dict[str, Any] = None,
                                             num_context_answers: int = 0) -> Instance:
    """
    Converts a question, a passage, and an optional answer (or answers) to an ``Instance`` for use
    in a reading comprehension model.

    Creates an ``Instance`` with at least these fields: ``question`` and ``passage``, both
    ``TextFields``; and ``metadata``, a ``MetadataField``.  Additionally, if both ``answer_texts``
    and ``char_span_starts`` are given, the ``Instance`` has ``span_start`` and ``span_end``
    fields, which are both ``IndexFields``.

    Parameters
    ----------
    question_list_tokens : ``List[List[Token]]``
        An already-tokenized list of questions.  Each dialog has multiple questions.
    passage_tokens : ``List[Token]``
        An already-tokenized passage that contains the answer to the given question.
    token_indexers : ``Dict[str, TokenIndexer]``
        Determines how the question and passage ``TextFields`` will be converted into tensors that
        get input to a model.  See :class:`TokenIndexer`.
    passage_text : ``str``
        The original passage text.  We need this so that we can recover the actual span from the
        original passage that the model predicts as the answer to the question.  This is used in
        official evaluation scripts.
    token_span_lists : ``List[List[Tuple[int, int]]]``, optional
        Indices into ``passage_tokens`` to use as the answer to the question for training.  This is
        a list of lists, first because there are multiple questions per dialog, and second because
        there might be several possible correct answer spans in the passage.  Currently, we just
        select the last span in this list (i.e., QuAC has multiple annotations on the dev set;
        this will select the last span, which was given by the original annotator).
    yesno_list : ``List[int]``
        List of the affirmation bit for each question-answer pair.
    followup_list : ``List[int]``
        List of the continuation bit for each question-answer pair.
    num_context_answers : ``int``, optional
        How many previous answers to encode into the passage.
    additional_metadata : ``Dict[str, Any]``, optional
        The constructed ``metadata`` field will by default contain ``original_passage``,
        ``token_offsets``, ``question_tokens``, ``passage_tokens``, and ``answer_texts`` keys.  If
        you want any other metadata to be associated with each instance, you can pass that in here.
        This dictionary will get added to the ``metadata`` dictionary we already construct.
    """
    additional_metadata = additional_metadata or {}
    fields: Dict[str, Field] = {}
    passage_offsets = [(token.idx, token.idx + len(token.text)) for token in passage_tokens]
    # This is separate so we can reference it later with a known type.
    passage_field = TextField(passage_tokens, token_indexers)
    fields['passage'] = passage_field
    fields['question'] = ListField([TextField(q_tokens, token_indexers)
                                    for q_tokens in question_list_tokens])
    metadata = {'original_passage': passage_text,
                'token_offsets': passage_offsets,
                'question_tokens': [[token.text for token in question_tokens]
                                    for question_tokens in question_list_tokens],
                'passage_tokens': [token.text for token in passage_tokens]}
    p1_answer_marker_list: List[Field] = []
    p2_answer_marker_list: List[Field] = []
    p3_answer_marker_list: List[Field] = []

    def get_tag(i, i_name):
        # Generate a tag to mark a previous answer span in the passage.
        return "<{0:d}_{1:s}>".format(i, i_name)

    def mark_tag(span_start, span_end, passage_tags, prev_answer_distance):
        # A valid span has non-negative endpoints; the p*_span_* variables are
        # initialized to -1 before the first answer is seen.
        if span_start < 0 or span_end < 0:
            raise ValueError("Previous {0:d}th answer span should have been updated!"
                             .format(prev_answer_distance))
        # Modify "tags" to mark the previous answer span.
        if span_start == span_end:
            passage_tags[prev_answer_distance][span_start] = get_tag(prev_answer_distance, "")
        else:
            passage_tags[prev_answer_distance][span_start] = get_tag(prev_answer_distance, "start")
            passage_tags[prev_answer_distance][span_end] = get_tag(prev_answer_distance, "end")
            for passage_index in range(span_start + 1, span_end):
                passage_tags[prev_answer_distance][passage_index] = get_tag(prev_answer_distance, "in")

    if token_span_lists:
        span_start_list: List[Field] = []
        span_end_list: List[Field] = []
        p1_span_start, p1_span_end, p2_span_start = -1, -1, -1
        p2_span_end, p3_span_start, p3_span_end = -1, -1, -1
        # Loop over the answer spans for each question in the dialog.
        for question_index, answer_span_lists in enumerate(token_span_lists):
            span_start, span_end = answer_span_lists[-1]  # The last one is the original answer.
            span_start_list.append(IndexField(span_start, passage_field))
            span_end_list.append(IndexField(span_end, passage_field))
            prev_answer_marker_lists = [["O"] * len(passage_tokens),
                                        ["O"] * len(passage_tokens),
                                        ["O"] * len(passage_tokens),
                                        ["O"] * len(passage_tokens)]
            if question_index > 0 and num_context_answers > 0:
                mark_tag(p1_span_start, p1_span_end, prev_answer_marker_lists, 1)
                if question_index > 1 and num_context_answers > 1:
                    mark_tag(p2_span_start, p2_span_end, prev_answer_marker_lists, 2)
                    if question_index > 2 and num_context_answers > 2:
                        mark_tag(p3_span_start, p3_span_end, prev_answer_marker_lists, 3)
            p3_span_start = p2_span_start
            p3_span_end = p2_span_end
            p2_span_start = p1_span_start
            p2_span_end = p1_span_end
            p1_span_start = span_start
            p1_span_end = span_end
            if num_context_answers > 2:
                p3_answer_marker_list.append(SequenceLabelField(prev_answer_marker_lists[3],
                                                                passage_field,
                                                                label_namespace="answer_tags"))
            if num_context_answers > 1:
                p2_answer_marker_list.append(SequenceLabelField(prev_answer_marker_lists[2],
                                                                passage_field,
                                                                label_namespace="answer_tags"))
            if num_context_answers > 0:
                p1_answer_marker_list.append(SequenceLabelField(prev_answer_marker_lists[1],
                                                                passage_field,
                                                                label_namespace="answer_tags"))
        fields['span_start'] = ListField(span_start_list)
        fields['span_end'] = ListField(span_end_list)
        if num_context_answers > 0:
            fields['p1_answer_marker'] = ListField(p1_answer_marker_list)
            if num_context_answers > 1:
                fields['p2_answer_marker'] = ListField(p2_answer_marker_list)
                if num_context_answers > 2:
                    fields['p3_answer_marker'] = ListField(p3_answer_marker_list)
        fields['yesno_list'] = ListField([LabelField(yesno, label_namespace="yesno_labels")
                                          for yesno in yesno_list])
        fields['followup_list'] = ListField([LabelField(followup, label_namespace="followup_labels")
                                             for followup in followup_list])
    metadata.update(additional_metadata)
    fields['metadata'] = MetadataField(metadata)
    return Instance(fields)
def make_reading_comprehension_instance(self,
                                        question_tokens: List[Token],
                                        passage_tokens: List[Token],
                                        token_indexers: Dict[str, TokenIndexer],
                                        passage_text: str,
                                        token_spans: List[Tuple[int, int]] = None,
                                        answer_texts: List[str] = None,
                                        additional_metadata: Dict[str, Any] = None) -> Optional[Instance]:
    """
    Converts a question, a passage, and an optional answer (or answers) to an ``Instance`` for use
    in a reading comprehension model.

    Creates an ``Instance`` with at least these fields: ``question`` and ``passage``, both
    ``TextFields``; and ``metadata``, a ``MetadataField``.  Additionally, if both ``answer_texts``
    and ``char_span_starts`` are given, the ``Instance`` has ``span_start`` and ``span_end``
    fields, which are both ``IndexFields``.

    Parameters
    ----------
    question_tokens : ``List[Token]``
        An already-tokenized question.
    passage_tokens : ``List[Token]``
        An already-tokenized passage that contains the answer to the given question.
    token_indexers : ``Dict[str, TokenIndexer]``
        Determines how the question and passage ``TextFields`` will be converted into tensors that
        get input to a model.  See :class:`TokenIndexer`.
    passage_text : ``str``
        The original passage text.  We need this so that we can recover the actual span from the
        original passage that the model predicts as the answer to the question.  This is used in
        official evaluation scripts.
    token_spans : ``List[Tuple[int, int]]``, optional
        Indices into ``passage_tokens`` to use as the answer to the question for training.  This is
        a list because there might be several possible correct answer spans in the passage.
        Currently, we just select the first span in this list, assuming the caller has already
        sorted the spans by its preferred criterion (e.g., how many annotators gave that span).
    answer_texts : ``List[str]``, optional
        All valid answer strings for the given question.  In SQuAD, e.g., the training set has
        exactly one answer per question, but the dev and test sets have several.  TriviaQA has many
        possible answers, which are the aliases for the known correct entity.  This is put into the
        metadata for use with official evaluation scripts, but not used anywhere else.
    additional_metadata : ``Dict[str, Any]``, optional
        The constructed ``metadata`` field will by default contain ``original_passage``,
        ``token_offsets``, ``question_tokens``, ``passage_tokens``, and ``answer_texts`` keys.  If
        you want any other metadata to be associated with each instance, you can pass that in here.
        This dictionary will get added to the ``metadata`` dictionary we already construct.
    """
    additional_metadata = additional_metadata or {}
    fields: Dict[str, Field] = {}
    # This is separate so we can reference it later with a known type.
    passage_field = TextField(passage_tokens, token_indexers)
    fields['passage'] = passage_field
    fields['question'] = TextField(question_tokens, token_indexers)
    metadata = {
        'original_passage': passage_text,
        'question_tokens': [token.text for token in question_tokens],
        'passage_tokens': [token.text for token in passage_tokens],
    }
    if answer_texts:
        metadata['answer_texts'] = answer_texts

    if token_spans:
        metadata["token_spans"] = token_spans
        # Assume spans are sorted by some criterion; take the first one.
        span_start = token_spans[0][0]
        span_end = token_spans[0][1] - 1
        assert span_start <= span_end
        if span_end > len(passage_tokens) - 1:
            return None
        fields['span_start'] = IndexField(span_start, passage_field)
        fields['span_end'] = IndexField(span_end, passage_field)

    metadata.update(additional_metadata)
    fields['metadata'] = MetadataField(metadata)
    return Instance(fields)
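# Usage sketch for the single-span helper above. The body never touches ``self``,
# so for a quick smoke test ``None`` can stand in for it; the end-exclusive input
# convention is visible in the ``token_spans[0][1] - 1`` above. Values are illustrative.
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import Token

squad_instance = make_reading_comprehension_instance(
    None,  # unused ``self``
    question_tokens=[Token(t) for t in "What color is snow ?".split()],
    passage_tokens=[Token(t) for t in "Snow is white .".split()],
    token_indexers={"tokens": SingleIdTokenIndexer()},
    passage_text="Snow is white .",
    token_spans=[(2, 3)],  # the token "white", end-exclusive
    answer_texts=["white"])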
def text_to_instance(self, line: str) -> Instance:  # type: ignore
    tokens = self._tokenizer.tokenize(line)
    return Instance({"line": TextField(tokens, self._token_indexers)})
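# Exercising the one-field method above in isolation. The tokenizer and indexer
# choices are assumptions, not from the source; SimpleNamespace stands in for the
# reader instance since only two attributes are needed.
from types import SimpleNamespace

from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import WordTokenizer

line_reader = SimpleNamespace(_tokenizer=WordTokenizer(),
                              _token_indexers={"tokens": SingleIdTokenIndexer()})
line_instance = text_to_instance(line_reader, "a simple line of text")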
def text_to_instance(self,
                     sentence: List[str],
                     ner_dict: Dict[Tuple[int, int], str],
                     relation_dict,
                     cluster_dict,
                     trigger_dict,
                     argument_dict,
                     doc_key: str,
                     dataset: str,
                     sentence_num: int,
                     groups: List[str],
                     start_ix: int,
                     end_ix: int):
    """
    TODO(dwadden) document me.
    """
    sentence = [self._normalize_word(word) for word in sentence]
    text_field = TextField([Token(word) for word in sentence], self._token_indexers)
    text_field_with_context = TextField([Token(word) for word in groups], self._token_indexers)

    # Put together the metadata.
    metadata = dict(sentence=sentence,
                    ner_dict=ner_dict,
                    relation_dict=relation_dict,
                    cluster_dict=cluster_dict,
                    trigger_dict=trigger_dict,
                    argument_dict=argument_dict,
                    doc_key=doc_key,
                    dataset=dataset,
                    groups=groups,
                    start_ix=start_ix,
                    end_ix=end_ix,
                    sentence_num=sentence_num)
    metadata_field = MetadataField(metadata)

    # Trigger labels. One label per token in the input. Note that every token and
    # span is looked up in the label dicts below, so the caller must supply
    # mappings that default to a null label (e.g. a defaultdict).
    token_trigger_labels = []
    for i in range(len(text_field)):
        token_trigger_labels.append(trigger_dict[i])
    trigger_label_field = SequenceLabelField(token_trigger_labels, text_field,
                                             label_namespace="trigger_labels")

    # Generate fields for text spans, ner labels, coref labels.
    spans = []
    span_ner_labels = []
    span_coref_labels = []
    for start, end in enumerate_spans(sentence, max_span_width=self._max_span_width):
        span_ix = (start, end)
        span_ner_labels.append(ner_dict[span_ix])
        span_coref_labels.append(cluster_dict[span_ix])
        spans.append(SpanField(start, end, text_field))
    span_field = ListField(spans)
    ner_label_field = SequenceLabelField(span_ner_labels, span_field,
                                         label_namespace="ner_labels")
    coref_label_field = SequenceLabelField(span_coref_labels, span_field,
                                           label_namespace="coref_labels")

    # Generate labels for relations and arguments. Only store non-null values.
    # For the arguments, by convention the first span specifies the trigger, and the second
    # specifies the argument. Ideally we'd have an adjacency field between (token, span) pairs
    # for the event arguments field, but AllenNLP doesn't make it possible to express
    # adjacencies between two different sequences.
    n_spans = len(spans)
    span_tuples = [(span.span_start, span.span_end) for span in spans]
    candidate_indices = [(i, j) for i in range(n_spans) for j in range(n_spans)]

    relations = []
    relation_indices = []
    for i, j in candidate_indices:
        span_pair = (span_tuples[i], span_tuples[j])
        relation_label = relation_dict[span_pair]
        if relation_label:
            relation_indices.append((i, j))
            relations.append(relation_label)
    relation_label_field = AdjacencyField(indices=relation_indices,
                                          sequence_field=span_field,
                                          labels=relations,
                                          label_namespace="relation_labels")

    arguments = []
    argument_indices = []
    n_tokens = len(sentence)
    candidate_indices = [(i, j) for i in range(n_tokens) for j in range(n_spans)]
    for i, j in candidate_indices:
        token_span_pair = (i, span_tuples[j])
        argument_label = argument_dict[token_span_pair]
        if argument_label:
            argument_indices.append((i, j))
            arguments.append(argument_label)
    argument_label_field = AdjacencyFieldAssym(indices=argument_indices,
                                               row_field=text_field,
                                               col_field=span_field,
                                               labels=arguments,
                                               label_namespace="argument_labels")

    # Pull it all together.
    fields = dict(text=text_field_with_context,
                  spans=span_field,
                  ner_labels=ner_label_field,
                  coref_labels=coref_label_field,
                  trigger_labels=trigger_label_field,
                  argument_labels=argument_label_field,
                  relation_labels=relation_label_field,
                  metadata=metadata_field)
    return Instance(fields)
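# Since every token index and every (start, end) span is looked up in the label
# dictionaries above, the caller has to supply mappings with a null default.
# A minimal sketch of that convention (the default values are assumptions about
# this reader's label scheme, not confirmed by the source):
from collections import defaultdict

trigger_dict = defaultdict(str)         # token index -> trigger label, "" if none
ner_dict = defaultdict(str)             # (start, end) -> entity label, "" if none
cluster_dict = defaultdict(lambda: -1)  # (start, end) -> cluster id, -1 if none
relation_dict = defaultdict(str)        # (span, span) -> relation label
argument_dict = defaultdict(str)        # (token, span) -> argument role

ner_dict[(0, 1)] = "PER"                # mark one gold entity span
cluster_dict[(0, 1)] = 0                # and put it in cluster 0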
def make_reading_comprehension_instance(question_tokens: List[Token],
                                        passage_tokens: List[Token],
                                        token_indexers: Dict[str, TokenIndexer],
                                        passage_text: str,
                                        token_spans_sent: List[Tuple[int, int]] = None,
                                        sent_labels: List[int] = None,
                                        answer_texts: List[str] = None,
                                        passage_offsets: List[Tuple] = None,
                                        evd_possible_chains: List[List[int]] = None,
                                        ans_sent_idxs: List[int] = None,
                                        article_id: str = None,
                                        para_limit: int = 2250) -> Instance:
    """
    Parameters
    ----------
    question_tokens : ``List[Token]``
        An already-tokenized question.
    passage_tokens : ``List[Token]``
        An already-tokenized passage that contains the answer to the given question.
    token_indexers : ``Dict[str, TokenIndexer]``
        Determines how the question and passage ``TextFields`` will be converted into tensors that
        get input to a model.  See :class:`TokenIndexer`.
    passage_text : ``str``
        The original passage text.  We need this so that we can recover the actual span from the
        original passage that the model predicts as the answer to the question.  This is used in
        official evaluation scripts.
    token_spans_sent : ``List[Tuple[int, int]]``, optional
        Indices into ``passage_tokens`` giving the start and end of each sentence in the passage.
    sent_labels : ``List[int]``, optional
        Binary labels denoting whether each sentence is a supporting fact.
    answer_texts : ``List[str]``, optional
        All valid answer strings for the given question.  This is put into the metadata for use
        with official evaluation scripts, but not used anywhere else.
    passage_offsets : ``List[Tuple]``, optional
        Character offsets of the passage tokens; stored in the metadata.
    evd_possible_chains : ``List[List[int]]``, optional
        Candidate evidence chains (lists of sentence indices); processed into fields and metadata.
    ans_sent_idxs : ``List[int]``, optional
        Indices of the sentences containing the answer; stored in the metadata.
    article_id : ``str``, optional
        The id of the source article; stored in the metadata.
    para_limit : ``int``, optional
        The maximum number of passage tokens to keep.
    """
    fields: Dict[str, Field] = {}
    limit = min(para_limit, len(passage_tokens))
    passage_tokens = passage_tokens[:limit]
    # This is separate so we can reference it later with a known type.
    passage_field = TextField(passage_tokens, token_indexers)
    # sent_spans: list of SpanField(sent_start, sent_end), the start and end offset of each sentence.
    # sent_labels_: list of labels denoting whether each sentence is a supporting fact.
    sent_spans, sent_labels_ = process_sent_spans(token_spans_sent, sent_labels,
                                                  passage_field, para_limit)
    fields['sent_labels'] = ListField(sent_labels_)
    fields['sentence_spans'] = ListField(sent_spans)
    fields['passage'] = passage_field
    fields['question'] = TextField(question_tokens, token_indexers)

    # Filter spans that exceed the para limit so that the info in the metadata is correct.
    token_spans_sent = [(s, e if e < limit else limit - 1)
                        for s, e in token_spans_sent if s < limit]
    sent_labels = sent_labels[:len(token_spans_sent)]

    evd_possible_chains_ = process_evidence_chains(evd_possible_chains, sent_labels_, fields)
    metadata = make_meta_data(passage_text, passage_offsets, question_tokens, passage_tokens,
                              token_spans_sent, sent_labels, answer_texts, evd_possible_chains,
                              evd_possible_chains_, ans_sent_idxs, article_id)
    fields['metadata'] = MetadataField(metadata)
    return Instance(fields)
def __getitem__(self, index):
    # if self.split == 'test':
    #     raise ValueError("blind test mode not supported quite yet")
    item = deepcopy(self.items[index])
    image_id = int(item['img_id'].split('-')[-1])
    with h5py.File(self.tag_feature_path, 'r') as h5:
        tag_features = np.array(h5[str(image_id)]['features'], dtype=np.float32)
        tag_boxes = np.array(h5[str(image_id)]['boxes'], dtype=np.float32)
        tag_obj_indices = np.array(h5[str(image_id)]['obj_indices'], dtype=np.int64)
    with h5py.File(self.non_tag_feature_path, 'r') as h5:
        non_tag_boxes = np.array(h5[str(image_id)]['boxes'], dtype=np.float32)
        non_tag_obj_indices = np.array(h5[str(image_id)]['obj_indices'], dtype=np.int64)
        non_tag_features = np.array(h5[str(image_id)]['features'], dtype=np.float32)

    ###################################################################
    # Load questions and answers
    non_tag_question_annotid2detidx = self.non_tag_question_annotid2detidx[item['annot_id']]
    non_tag_answer_annotid2detidx = self.non_tag_answer_annotid2detidx[item['annot_id']]
    non_tag_rationale_annotid2detidx = self.non_tag_rationale_annotid2detidx[item['annot_id']]
    if self.mode == 'answer':
        question_annotid2detidx = non_tag_question_annotid2detidx
        answer_annotid2detidx = non_tag_answer_annotid2detidx
    else:
        conditioned_label = item['answer_label'] if self.split != 'test' else self.conditioned_answer_choice
        q_len = len(item['question'])
        question_annotid2detidx = {}
        for k, v in non_tag_question_annotid2detidx.items():
            question_annotid2detidx[k] = v
        for k, v in non_tag_answer_annotid2detidx[conditioned_label].items():
            question_annotid2detidx[k + q_len] = v
        answer_annotid2detidx = non_tag_rationale_annotid2detidx
    if self.mode == 'rationale':
        conditioned_label = item['answer_label'] if self.split != 'test' else self.conditioned_answer_choice
        item['question'] += item['answer_choices'][conditioned_label]

    answer_choices = item['{}_choices'.format(self.mode)]
    dets2use, old_det_to_new_ind = self._get_dets_to_use(item)
    non_tag_dets2use, non_tag_old_det_to_new_ind = self._get_non_tag_det_to_use(
        question_annotid2detidx, answer_annotid2detidx, len(non_tag_boxes))
    if self.add_image_as_a_box:
        assert len(dets2use) == np.max(old_det_to_new_ind)
        non_tag_old_det_to_new_ind += 1
    # Shift the non-tag detection indices, effectively appending the non-tag
    # detections to the tag detections.
    non_tag_old_det_to_new_ind[np.where(non_tag_old_det_to_new_ind)[0]] += len(dets2use)
    old_det_to_new_ind = old_det_to_new_ind.tolist()
    non_tag_old_det_to_new_ind = non_tag_old_det_to_new_ind.tolist()

    ###################################################################
    # Load in BERT. We'll get contextual representations of the context and the answer choices.
    # grp_items = {k: np.array(v, dtype=np.float16) for k, v in self.get_h5_group(index).items()}
    with h5py.File(self.h5fn, 'r') as h5:
        grp_items = {k: np.array(v, dtype=np.float16) for k, v in h5[str(index)].items()}

    # Essentially we need to condition on the right answer choice here, if we're doing QA->R.
    # We will always condition on the `conditioned_answer_choice`.
    condition_key = self.conditioned_answer_choice if self.split == "test" and self.mode == "rationale" else ""

    instance_dict = {}
    if 'endingonly' not in self.embs_to_load:
        questions_tokenized, question_tags = zip(*[_my_fix_tokenization(
            item['question'],
            grp_items[f'ctx_{self.mode}{condition_key}{i}'],
            old_det_to_new_ind,
            item['objects'],
            non_tag_old_det_to_new_ind,
            question_annotid2detidx,
            token_indexers=self.token_indexers,
            pad_ind=0 if self.add_image_as_a_box else -1,
        ) for i in range(4)])
        instance_dict['question'] = ListField(questions_tokenized)
        instance_dict['question_tags'] = ListField(question_tags)

    answers_tokenized, answer_tags = zip(*[_my_fix_tokenization(
        answer,
        grp_items[f'answer_{self.mode}{condition_key}{i}'],
        old_det_to_new_ind,
        item['objects'],
        non_tag_old_det_to_new_ind,
        answer_annotid2detidx[i],
        token_indexers=self.token_indexers,
        pad_ind=0 if self.add_image_as_a_box else -1,
    ) for i, answer in enumerate(answer_choices)])
    instance_dict['answers'] = ListField(answers_tokenized)
    instance_dict['answer_tags'] = ListField(answer_tags)
    if self.split != 'test':
        instance_dict['label'] = LabelField(item['{}_label'.format(self.mode)], skip_indexing=True)
    instance_dict['metadata'] = MetadataField({'annot_id': item['annot_id'],
                                               'ind': index,
                                               'movie': item['movie'],
                                               'img_fn': item['img_fn'],
                                               'question_number': item['question_number'],
                                               'img_id': item['img_id']})

    # Node features.
    node_tokenized, node_tags = zip(*[_fix_word(
        i, index, item['annot_id'], self.h5fn_graph, self.h5fn_word, pad_ind=0
    ) for i in range(4)])
    instance_dict['node'] = ListField(node_tokenized)

    # Visual concepts.
    visual_concept_tokenized, visual_concept_tags = zip(*[_fix_visual_concept(
        item['visual_concept'], item['visual_concept_num'], self.h5fn_word, pad_ind=0
    ) for i in range(4)])
    instance_dict['visual_concept'] = ListField(visual_concept_tokenized)

    # Adjacency matrices.
    adj_result, adj_len = zip(*[_fix_adj(
        i, index, item['annot_id'], self.h5fn_graph, pad_ind=0
    ) for i in range(4)])
    instance_dict['adjacent'] = ListField(adj_result)

    ###################################################################
    # The image itself is not loaded or rescaled in this reader.
    # image = load_image(os.path.join(VCR_IMAGES_DIR, item['img_fn']))
    # image, window, img_scale, padding = resize_image(image, random_pad=self.is_train)
    # image = to_tensor_and_normalize(image)
    # c, h, w = image.shape

    ###################################################################
    # Load boxes.
    with open(os.path.join(VCR_IMAGES_DIR, item['metadata_fn']), 'r') as f:
        metadata = json.load(f)
    # Chop off the final dimension, that's the confidence.
    tag_boxes = np.array(metadata['boxes'])[dets2use, :-1]
    if self.add_image_as_a_box:
        # Here we just use a dummy box for the whole-image "background" detection.
        tag_boxes = np.row_stack(([1, 1, 700, 700], tag_boxes))
    non_tag_boxes = non_tag_boxes[non_tag_dets2use]
    boxes = np.concatenate((tag_boxes, non_tag_boxes))

    if self.add_image_as_a_box:
        dets2use = dets2use + 1
        dets2use = np.insert(dets2use, 0, 0)
    tag_det_features = tag_features[dets2use]
    non_tag_det_features = non_tag_features[non_tag_dets2use]
    det_features = np.concatenate((tag_det_features, non_tag_det_features))
    instance_dict['det_features'] = ArrayField(det_features, padding_value=0)
    assert det_features.shape[0] == boxes.shape[0]
    instance_dict['boxes'] = ArrayField(boxes, padding_value=-1)

    instance = Instance(instance_dict)
    instance.index_fields(self.vocab)
    # No image tensor is loaded here; keep the (image, instance) return shape.
    return None, instance
def text_to_instance(self,  # type: ignore
                     tokens: List[str],
                     predicate_indices: List[int],
                     token_representations: FloatTensor = None,
                     labels: List[float] = None):
    """
    Parameters
    ----------
    tokens : ``List[str]``, required.
        The tokens in the sentence to be encoded.
    predicate_indices : ``List[int]``, required.
        A list of ints, where each item denotes the index of a token to predict a value for.
    token_representations : ``FloatTensor``, optional (default = ``None``)
        Precomputed token representations to use in the instance. If ``None``, we use a
        ``Contextualizer`` provided to the dataset reader to calculate the token
        representations. Shape is (seq_len, representation_dim).
    labels : ``List[float]``, optional (default = ``None``)
        The labels of the arcs. ``None`` indicates that labels are not provided.

    Returns
    -------
    An ``Instance`` containing the following fields:
        raw_tokens : ``ListField[MetadataField]``
            The raw str tokens in the sequence. Each MetadataField stores the raw string
            of a single token.
        label_indices : ``SequenceArrayField``
            Array of shape (num_labels,) corresponding to the indices of tokens to predict
            a value for.
        token_representations : ``ArrayField``
            Contains the representation of the tokens.
        labels : ``SequenceArrayField``
            The labels corresponding to each arc represented in ``label_indices``.
    """
    fields: Dict[str, Field] = {}

    # Add raw_tokens to the fields.
    if self._include_raw_tokens:
        fields["raw_tokens"] = ListField([MetadataField(token) for token in tokens])

    # Add label_indices to the fields.
    label_indices_field = SequenceArrayField(
        # Subtract 1 since the original data is 1-indexed.
        # Pad with -1 since 0 (usually the mask token) is a valid label index.
        np.array(predicate_indices, dtype="int64") - 1,
        padding_value=-1)
    fields["label_indices"] = label_indices_field

    if token_representations is None and self._contextualizer:
        # Contextualize the tokens.
        token_representations = self._contextualizer([tokens])[0]

    # Add representations of the tokens at the arc indices to the fields.
    # If we don't have representations, the field is simply omitted.
    if token_representations is not None:
        fields["token_representations"] = ArrayField(token_representations.numpy())

    if labels:
        fields["labels"] = SequenceArrayField(np.array(labels, dtype="float32"))

    return Instance(fields)
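# The only subtle part above is the index bookkeeping. A quick sketch of the
# convention with illustrative values:
import numpy as np

predicate_indices = [1, 4, 7]  # 1-indexed positions, as in the raw data
label_indices = np.array(predicate_indices, dtype="int64") - 1
print(label_indices)           # [0 3 6]; padding later uses -1, since 0 is valid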
def text_to_instance(self, graph, do_print=False) -> Optional[Instance]:
    """
    Does the bulk of the work converting a graph to an ``Instance`` of ``Fields``.
    """
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}
    max_tgt_length = None if self.eval else 60
    d = DecompGraph(graph, drop_syntax=self.drop_syntax, order=self.order)
    list_data = d.get_list_data(bos=START_SYMBOL,
                                eos=END_SYMBOL,
                                bert_tokenizer=self._tokenizer,
                                max_tgt_length=max_tgt_length,
                                semantics_only=self.semantics_only)
    if list_data is None:
        return None
    if do_print:
        self.spot_check(graph, list_data)

    # These four fields are used for the seq2seq model and target-side self copy.
    fields["source_tokens"] = TextField(
        tokens=[Token(x) for x in list_data["src_tokens"]],
        token_indexers=self._source_token_indexers)

    if list_data['src_token_ids'] is not None:
        fields['source_subtoken_ids'] = ArrayField(list_data['src_token_ids'])
        self._number_bert_ids += len(list_data['src_token_ids'])
        self._number_bert_oov_ids += len(
            [bert_id for bert_id in list_data['src_token_ids'] if bert_id == 100])

    if list_data['src_token_subword_index'] is not None:
        fields['source_token_recovery_matrix'] = ArrayField(
            list_data['src_token_subword_index'])

    # Target-side input.  (Exclude the last token, <EOS>.)
    fields["target_tokens"] = TextField(
        tokens=[Token(x) for x in list_data["tgt_tokens"][:-1]],
        token_indexers=self._target_token_indexers)
    if len(list_data['tgt_tokens']) > 60:
        self.over_len += 1

    fields["source_pos_tags"] = SequenceLabelField(
        labels=list_data["src_pos_tags"],
        sequence_field=fields["source_tokens"],
        label_namespace="pos_tags")

    if list_data["tgt_pos_tags"] is not None:
        fields["target_pos_tags"] = SequenceLabelField(
            labels=list_data["tgt_pos_tags"][:-1],
            sequence_field=fields["target_tokens"],
            label_namespace="pos_tags")

    fields["target_node_indices"] = SequenceLabelField(
        labels=list_data["tgt_indices"][:-1],
        sequence_field=fields["target_tokens"],
        label_namespace="node_indices")

    # Target-side output.
    # Include <BOS> here because we want it in the generation vocabulary, so that
    # at the start of inference <BOS> can be correctly initialized.
    fields["generation_outputs"] = TextField(
        tokens=[Token(x) for x in list_data["tgt_tokens_to_generate"]],
        token_indexers=self._generation_token_indexers)

    fields["target_copy_indices"] = SequenceLabelField(
        labels=list_data["tgt_copy_indices"],
        sequence_field=fields["generation_outputs"],
        label_namespace="target_copy_indices")

    fields["target_attention_map"] = AdjacencyField(  # TODO: replace it with ArrayField.
        indices=list_data["tgt_copy_map"],
        sequence_field=fields["generation_outputs"],
        padding_value=0)

    # These two fields are for source copy.
    fields["source_copy_indices"] = SequenceLabelField(
        labels=list_data["src_copy_indices"],
        sequence_field=fields["generation_outputs"],
        label_namespace="source_copy_indices")

    fields["source_attention_map"] = AdjacencyField(  # TODO: replace it with ArrayField.
        indices=list_data["src_copy_map"],
        sequence_field=TextField(
            [Token(x) for x in
             list_data["src_copy_vocab"].get_special_tok_list() + list_data["src_tokens"]],
            None),
        padding_value=0)

    # These two fields are used in the biaffine parser.
    fields["edge_types"] = TextField(
        tokens=[Token(x) for x in list_data["head_tags"]],
        token_indexers=self._edge_type_indexers)

    fields["edge_heads"] = SequenceLabelField(
        labels=list_data["head_indices"],
        sequence_field=fields["edge_types"],
        label_namespace="edge_heads")

    if list_data.get('node_mask', None) is not None:
        # Valid nodes are 1; pads are 0.
        fields['valid_node_mask'] = ArrayField(list_data['node_mask'])

    if list_data.get('edge_mask', None) is not None:
        # A matrix of shape [num_nodes, num_nodes] where entry (i, j) is 1
        # if and only if (1) j < i and (2) j is not an antecedent of i.
        # TODO: try to remove the second constraint.
        fields['edge_head_mask'] = ArrayField(list_data['edge_mask'])

    # Node attributes.
    fields["target_attributes"] = ContinuousLabelField(
        labels=list_data["tgt_attributes"][:-1],
        sequence_field=fields["target_tokens"],
        ontology=NODE_ONTOLOGY)

    # Edge attributes.
    fields["edge_attributes"] = ContinuousLabelField(
        labels=list_data["edge_attributes"][:-1],
        sequence_field=fields["target_tokens"],
        ontology=EDGE_ONTOLOGY)

    # This field is actually needed for scoring later.
    fields["graph"] = MetadataField(list_data['arbor_graph'])

    # Metadata fields, good for debugging.
    fields["src_tokens_str"] = MetadataField(list_data["src_tokens"])
    fields["tgt_tokens_str"] = MetadataField(list_data.get("tgt_tokens", []))
    fields["src_copy_vocab"] = MetadataField(list_data["src_copy_vocab"])
    fields["tag_lut"] = MetadataField(dict(pos=list_data["pos_tag_lut"]))
    fields["source_copy_invalid_ids"] = MetadataField(list_data['src_copy_invalid_ids'])
    fields["node_name_list"] = MetadataField(list_data['node_name_list'])
    fields["target_dynamic_vocab"] = MetadataField(dict())
    fields["instance_meta"] = MetadataField(dict(
        pos_tag_lut=list_data["pos_tag_lut"],
        source_dynamic_vocab=list_data["src_copy_vocab"],
        target_token_indexers=self._target_token_indexers))

    return Instance(fields)
def text_to_instance(self,  # type: ignore
                     rule_text: str,
                     question: str,
                     scenario: str,
                     history: List[Dict[str, str]],
                     utterance_id: str = None,
                     tree_id: str = None,
                     source_url: str = None,
                     answer: str = None,
                     evidence: List[Dict[str, str]] = None) -> Optional[Instance]:
    """
    Turn a rule text, question, scenario, and dialog history into an ``Instance``.

    Parameters
    ----------
    rule_text : ``str``, required
        The rule passage the dialog is grounded in; it forms the CopyNet source side.
    question : ``str``, required
    scenario : ``str``, required
    history : ``List[Dict[str, str]]``, required
        The dialog history, as a list of follow-up question/answer pairs.
    answer : ``str``, optional (default = None)
        The target answer string, used to build the CopyNet target side and the
        classification label.

    Returns
    -------
    Instance
        See the above for a description of the fields that the instance will contain.
    """
    # pylint: disable=arguments-differ
    # For the CopyNet model.
    source_string = rule_text + ' [SEP]'
    target_string = answer
    tokenized_source = self._source_tokenizer.tokenize(source_string)
    tokenized_source.insert(0, Token(START_SYMBOL))
    # No END_SYMBOL is appended: '[SEP]' acts as the end symbol.
    source_field = TextField(tokenized_source, self._source_token_indexers)

    # For each token in the source sentence, we keep track of the matching token
    # in the target sentence (which will be the OOV symbol if there is no match).
    source_to_target_field = NamespaceSwappingField(tokenized_source[1:-1],
                                                    self._target_namespace)
    meta_fields = {"source_tokens": [x.text for x in tokenized_source[1:-1]]}
    fields_dict = {
        "source_tokens": source_field,
        "source_to_target": source_to_target_field,
    }

    # For the BERT model.
    passage_text = rule_text + ' [SEP]'
    question_text = question
    question_text += ' @ss@ ' + scenario
    question_text += ' @hs@ '
    for follow_up_qna in history:
        question_text += '@qs@ '
        question_text += follow_up_qna['follow_up_question'] + ' '
        question_text += follow_up_qna['follow_up_answer'] + ' '
    question_text += '@he@'
    bert_input = passage_text + ' ' + question_text
    bert_input_tokens = self._bert_tokenizer.tokenize(bert_input)
    bert_input_tokens.insert(0, Token(START_SYMBOL))
    fields_dict['bert_input'] = TextField(bert_input_tokens, self._bert_token_indexers)
    meta_fields['passage_tokens'] = self._bert_tokenizer.tokenize(passage_text)

    if target_string is not None:
        tokenized_target = self._target_tokenizer.tokenize(target_string)
        tokenized_target.insert(0, Token(START_SYMBOL))
        tokenized_target.append(Token(END_SYMBOL))
        target_field = TextField(tokenized_target, self._target_token_indexers)
        fields_dict["target_tokens"] = target_field
        meta_fields["target_tokens"] = [y.text for y in tokenized_target[1:-1]]
        source_and_target_token_ids = self._tokens_to_ids(tokenized_source[1:-1] +
                                                          tokenized_target)
        source_token_ids = source_and_target_token_ids[:len(tokenized_source) - 2]
        fields_dict["source_token_ids"] = ArrayField(np.array(source_token_ids))
        target_token_ids = source_and_target_token_ids[len(tokenized_source) - 2:]
        fields_dict["target_token_ids"] = ArrayField(np.array(target_token_ids))
        action = 'More' if answer not in ['Yes', 'No', 'Irrelevant'] else answer
        fields_dict['label'] = LabelField(action)
    else:
        source_token_ids = self._tokens_to_ids(tokenized_source[1:-1])
        fields_dict["source_token_ids"] = ArrayField(np.array(source_token_ids))

    meta_fields['rule_text'] = rule_text
    meta_fields['question'] = question
    meta_fields['scenario'] = scenario
    meta_fields['history'] = history
    fields_dict["metadata"] = MetadataField(meta_fields)

    return Instance(fields_dict)
def text_to_instance(self,  # type: ignore
                     annotation_id: str,
                     documents: Dict[str, List[str]],
                     rationales: Dict[str, List[Tuple[int, int]]],
                     query: str,
                     label: str = None) -> Instance:
    # pylint: disable=arguments-differ
    fields = {}

    tokens = []
    is_evidence = []
    document_to_span_map = {}
    document_to_span_map_whole = {}

    query = query.split("[sep]")
    query = [x.strip() for x in query]

    for docid, docwords in documents.items():
        document_to_span_map_whole[docid] = (len(tokens), len(tokens) + len(docwords))
        tokens += [Token(word) for word in docwords]
        document_to_span_map[docid] = (len(tokens) - len(docwords), len(tokens))
        tokens.append(Token("[SEP]"))

        # Mark every token covered by a rationale span as evidence; the
        # trailing [SEP] is always treated as evidence.
        rationale = [0] * len(docwords)
        if docid in rationales:
            for s, e in rationales[docid]:
                for i in range(s, e):
                    rationale[i] = 1
        is_evidence += rationale + [1]

    always_keep_mask = [1 if t.text.upper() == "[SEP]" else 0 for t in tokens]

    fields["document"] = TextField(tokens, self._token_indexers)
    fields["rationale"] = SequenceLabelField(is_evidence,
                                             sequence_field=fields["document"],
                                             label_namespace="evidence_labels")
    fields["kept_tokens"] = SequenceLabelField(always_keep_mask,
                                               sequence_field=fields["document"],
                                               label_namespace="kept_token_labels")

    metadata = {
        "annotation_id": annotation_id,
        "tokens": tokens,
        "document_to_span_map": document_to_span_map,
        "convert_tokens_to_instance": self.convert_tokens_to_instance,
        "document_to_span_map_whole": document_to_span_map_whole,
        "always_keep_mask": np.array(always_keep_mask),
    }
    fields["metadata"] = MetadataField(metadata)
    # The query holds the answer choices; zip them (and the gold label) with keys A-E.
    fields["label"] = MetadataField(
        {k: v for k, v in zip(["A", "B", "C", "D", "E", "Label"], query + [label])})

    return Instance(fields)
def text_to_instance(self,  # type: ignore
                     tokens: List[Token],
                     pos_tags: List[str] = None,
                     chunk_tags: List[str] = None,
                     ner_tags: List[str] = None,
                     target_verb_lemma: str = None,
                     target_verb_position: int = None,
                     verb_sense: str = None,
                     legal_args: List[str] = None,
                     verb_annotation: List[str] = None,
                     parse: str = None) -> Instance:
    """
    We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
    """
    sequence = TextField(tokens, self._token_indexers)
    instance_fields: Dict[str, Field] = {'tokens': sequence}
    words = [x.text for x in tokens]
    instance_fields["metadata"] = MetadataField({
        "words": words,  # used in ai2's srl model
        "pos_tags": pos_tags,
        "chunk_tags": chunk_tags,
        "ner_tags": ner_tags,
        "target_verb_lemma": target_verb_lemma,
        "target_verb_position": target_verb_position,
        "verb_annotation": verb_annotation,
        "verb_sense": verb_sense,
        "legal_args": legal_args,
        "verb": target_verb_lemma,  # used in ai2's srl model
        "parse": parse  # for constraints on the dev-set srl
    })

    # This is the position of the gold verb predicate.
    # We may or may not use it (the model might predict the predicate), but the reader
    # always sends it.
    # instance_fields["verb_pos"] = IndexField(index=target_verb_position, sequence_field=sequence)
    # TODO: AllenNLP uses a sequence feature field for an indicator vector of the verb position (find this).
    # instance_fields["verb_indicator"] = SequenceFeatureField(index=target_verb_position, sequence_field=sequence)
    verb_indicator = np.zeros(len(tokens))
    verb_indicator[target_verb_position] = 1.0
    instance_fields["verb_indicator"] = ArrayField(array=verb_indicator)

    # Everyone follows the default IOB2 == BIO format here.
    coded_srl = get_bio_from_spans(verb_annotation,
                                   year=self.year,
                                   core_args_only=self.core_args_only)
    coded_chunks = chunk_tags
    coded_ner = ner_tags
    if self.coding_scheme == "BIOUL":
        # coded_srl = get_bio_from_spans(verb_annotation)
        coded_chunks = to_bioul(chunk_tags,
                                encoding=self._original_coding_scheme) if chunk_tags is not None else None
        coded_ner = to_bioul(ner_tags,
                             encoding=self._original_coding_scheme) if ner_tags is not None else None

    if 'pos' in self.feature_labels:
        if pos_tags is None:
            raise ConfigurationError("Dataset reader was specified to use pos_tags as "
                                     "features. Pass them to text_to_instance.")
        instance_fields['pos_tags'] = SequenceLabelField(pos_tags, sequence, "pos_tags")
    if 'chunk' in self.feature_labels:
        if coded_chunks is None:
            raise ConfigurationError("Dataset reader was specified to use chunk tags as "
                                     "features. Pass them to text_to_instance.")
        instance_fields['chunk_tags'] = SequenceLabelField(coded_chunks, sequence, "chunk_tags")
    if 'ner' in self.feature_labels:
        if coded_ner is None:
            raise ConfigurationError("Dataset reader was specified to use NER tags as "
                                     "features. Pass them to text_to_instance.")
        instance_fields['ner_tags'] = SequenceLabelField(coded_ner, sequence, "ner_tags")

    # Add the "tag label" to the instance.
    if self.tag_label == 'srl' and coded_srl is not None:
        instance_fields['tags'] = SequenceLabelField(coded_srl, sequence, self.label_namespace)
    elif self.tag_label == 'pos' and pos_tags is not None:
        instance_fields['tags'] = SequenceLabelField(pos_tags, sequence, self.label_namespace)
    elif self.tag_label == 'chunk' and coded_chunks is not None:
        instance_fields['tags'] = SequenceLabelField(coded_chunks, sequence, self.label_namespace)

    return Instance(instance_fields)
def text_to_instance(self, tokens: List[Token]) -> Instance:  # type: ignore
    """
    We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
    """
    # pylint: disable=arguments-differ
    return Instance({'tokens': TextField(tokens, token_indexers=self._token_indexers)})
def __getitem__(self, index):
    # if self.split == 'test':
    #     raise ValueError("blind test mode not supported quite yet")
    item = deepcopy(self.items[index])

    ###################################################################
    # Load questions and answers
    if self.mode == 'rationale':
        conditioned_label = item['answer_label'] if self.split != 'test' else self.conditioned_answer_choice
        item['question'] += item['answer_choices'][conditioned_label]

    answer_choices = item['{}_choices'.format(self.mode)]
    dets2use, old_det_to_new_ind = self._get_dets_to_use(item)

    ###################################################################
    # Load in BERT. We'll get contextual representations of the context and the answer choices.
    # grp_items = {k: np.array(v, dtype=np.float16) for k, v in self.get_h5_group(index).items()}
    with h5py.File(self.h5fn, 'r') as h5:
        grp_items = {k: np.array(v, dtype=np.float16) for k, v in h5[str(index)].items()}

    # Essentially we need to condition on the right answer choice here, if we're doing QA->R.
    # We will always condition on the `conditioned_answer_choice`.
    condition_key = self.conditioned_answer_choice if self.split == "test" and self.mode == "rationale" else ""

    instance_dict = {}
    if 'endingonly' not in self.embs_to_load:
        questions_tokenized, question_tags = zip(*[_fix_tokenization(
            item['question'],
            grp_items[f'ctx_{self.mode}{condition_key}{i}'],
            old_det_to_new_ind,
            item['objects'],
            token_indexers=self.token_indexers,
            pad_ind=0 if self.add_image_as_a_box else -1
        ) for i in range(4)])
        instance_dict['question'] = ListField(questions_tokenized)
        instance_dict['question_tags'] = ListField(question_tags)

    answers_tokenized, answer_tags = zip(*[_fix_tokenization(
        answer,
        grp_items[f'answer_{self.mode}{condition_key}{i}'],
        old_det_to_new_ind,
        item['objects'],
        token_indexers=self.token_indexers,
        pad_ind=0 if self.add_image_as_a_box else -1
    ) for i, answer in enumerate(answer_choices)])
    instance_dict['answers'] = ListField(answers_tokenized)
    instance_dict['answer_tags'] = ListField(answer_tags)
    if self.split != 'test':
        instance_dict['label'] = LabelField(item['{}_label'.format(self.mode)], skip_indexing=True)
    instance_dict['metadata'] = MetadataField({'annot_id': item['annot_id'],
                                               'ind': index,
                                               'movie': item['movie'],
                                               'img_fn': item['img_fn'],
                                               'question_number': item['question_number']})

    ###################################################################
    # Load the image now and rescale it. Might have to subtract the mean and whatnot here too.
    image = load_image(os.path.join(VCR_IMAGES_DIR, item['img_fn']))
    image, window, img_scale, padding = resize_image(image, random_pad=self.is_train)
    image = to_tensor_and_normalize(image)
    c, h, w = image.shape

    ###################################################################
    # Load boxes.
    with open(os.path.join(VCR_IMAGES_DIR, item['metadata_fn']), 'r') as f:
        metadata = json.load(f)

    # [nobj, 14, 14]
    segms = np.stack([make_mask(mask_size=14,
                                box=metadata['boxes'][i],
                                polygons_list=metadata['segms'][i]) for i in dets2use])

    # Chop off the final dimension, that's the confidence.
    boxes = np.array(metadata['boxes'])[dets2use, :-1]
    # Possibly rescale them if necessary.
    boxes *= img_scale
    boxes[:, :2] += np.array(padding[:2])[None]
    boxes[:, 2:] += np.array(padding[:2])[None]

    obj_labels = [self.coco_obj_to_ind[item['objects'][i]] for i in dets2use.tolist()]
    if self.add_image_as_a_box:
        boxes = np.row_stack((window, boxes))
        segms = np.concatenate((np.ones((1, 14, 14), dtype=np.float32), segms), 0)
        obj_labels = [self.coco_obj_to_ind['__background__']] + obj_labels

    instance_dict['segms'] = ArrayField(segms, padding_value=0)
    instance_dict['objects'] = ListField([LabelField(x, skip_indexing=True) for x in obj_labels])

    # Sanity-check that every box lies inside the (rescaled, padded) image.
    assert np.all((boxes[:, 0] >= 0.) & (boxes[:, 0] < boxes[:, 2]))
    assert np.all((boxes[:, 1] >= 0.) & (boxes[:, 1] < boxes[:, 3]))
    assert np.all(boxes[:, 2] <= w)
    assert np.all(boxes[:, 3] <= h)
    instance_dict['boxes'] = ArrayField(boxes, padding_value=-1)

    instance = Instance(instance_dict)
    instance.index_fields(self.vocab)
    return image, instance
def text_to_instance(self,  # type: ignore
                     tokens: List[str],
                     pos_tags: List[str] = None,
                     gold_tree: Tree = None) -> Instance:
    """
    We take `pre-tokenized` input here, because we don't have a tokenizer in this class.

    # Parameters

    tokens : ``List[str]``, required.
        The tokens in a given sentence.
    pos_tags : ``List[str]``, optional, (default = None).
        The POS tags for the words in the sentence.
    gold_tree : ``Tree``, optional (default = None).
        The gold parse tree to create span labels from.

    # Returns

    An ``Instance`` containing the following fields:
        tokens : ``TextField``
            The tokens in the sentence.
        pos_tags : ``SequenceLabelField``
            The POS tags of the words in the sentence.
            Only returned if ``use_pos_tags`` is ``True``.
        spans : ``ListField[SpanField]``
            A ListField containing all possible subspans of the sentence.
        span_labels : ``SequenceLabelField``, optional.
            The constituency tags for each of the possible spans, with respect to a gold
            parse tree. If a span is not contained within the tree, a span will have a
            ``NO-LABEL`` label.
        gold_tree : ``MetadataField(Tree)``
            The gold NLTK parse tree for use in evaluation.
    """
    if self._convert_parentheses:
        tokens = [PTB_PARENTHESES.get(token, token) for token in tokens]
    text_field = TextField([Token(x) for x in tokens], token_indexers=self._token_indexers)
    fields: Dict[str, Field] = {"tokens": text_field}

    pos_namespace = self._label_namespace_prefix + self._pos_label_namespace
    if self._use_pos_tags and pos_tags is not None:
        pos_tag_field = SequenceLabelField(pos_tags, text_field, label_namespace=pos_namespace)
        fields["pos_tags"] = pos_tag_field
    elif self._use_pos_tags:
        raise ConfigurationError("use_pos_tags was set to True but no gold pos"
                                 " tags were passed to the dataset reader.")

    spans: List[Field] = []
    gold_labels = []
    if gold_tree is not None:
        gold_spans: Dict[Tuple[int, int], str] = {}
        self._get_gold_spans(gold_tree, 0, gold_spans)
    else:
        gold_spans = None
    for start, end in enumerate_spans(tokens):
        spans.append(SpanField(start, end, text_field))
        if gold_spans is not None:
            gold_labels.append(gold_spans.get((start, end), "NO-LABEL"))

    metadata = {"tokens": tokens}
    if gold_tree:
        metadata["gold_tree"] = gold_tree
    if self._use_pos_tags:
        metadata["pos_tags"] = pos_tags
    fields["metadata"] = MetadataField(metadata)

    span_list_field: ListField = ListField(spans)
    fields["spans"] = span_list_field
    if gold_tree is not None:
        fields["span_labels"] = SequenceLabelField(
            gold_labels,
            span_list_field,
            label_namespace=self._label_namespace_prefix + "labels")
    return Instance(fields)
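# Input sketch for the constituency method above: the gold tree is a standard
# NLTK bracketing, from which the tokens and POS tags can be read directly.
# The tree below is a made-up example; with a configured PTB reader, the call
# would be reader.text_to_instance(tokens, pos_tags, tree).
from nltk import Tree

tree = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")
tokens = tree.leaves()                      # ['the', 'dog', 'chased', 'the', 'cat']
pos_tags = [pos for _, pos in tree.pos()]   # ['D', 'N', 'V', 'D', 'N']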
def text_to_instance(self,  # type: ignore
                     item: Dict,
                     entity_map: Dict,
                     literals: Set,
                     logical_forms: List = None) -> Instance:
    qid = MetadataField(item['qid'])
    if item['qid'] in [2102902009000]:  # will exceed the maximum length constraint
        return None
    if not self._use_sparql:
        if 's_expression' in item:
            target_string = item['s_expression']
        else:
            target_string = None
    else:
        if 'sparql_query' in item:
            target_string = item['sparql_query']
        else:
            target_string = None

    item['question'] = item['question'].replace(self._delimiter, ' ')

    # Build the constrained target vocabulary.
    if self._use_constrained_vocab and len(entity_map) > 0:
        if not self._training:
            constrained_vocab = self._get_constrained_vocab(entity_map, literals)
        else:
            logical_form = item['s_expression'] if not self._use_sparql else item['sparql_query']
            domains = item['domains'] if not self._gq1 else None
            constrained_vocab = self._get_constrained_vocab(entity_map,
                                                            literals,
                                                            s_expression=logical_form,
                                                            domains=domains)
    elif len(entity_map) == 0 and self._training:
        vocab = set()
        vocab.update(self._schema_constants)
        vocab = list(vocab)
        random.shuffle(vocab)
        vocab = set(vocab[:200])
        if not self._use_sparql:
            vocab.update([x for x in self._target_tokenizer(item['s_expression'])])
        else:
            vocab.update([x for x in self._target_tokenizer(item['sparql_query'])])
        constrained_vocab = list(vocab)
    else:
        vocab = set()
        vocab.update(self._schema_constants)
        for eid in entity_map:
            vocab.add(eid)
        for l in literals:
            vocab.add(l)
        constrained_vocab = list(vocab)

    # Always fix the position of END_SYMBOL and START_SYMBOL in each constrained vocab,
    # because a consistent global shared end_index / start_index is needed by BeamSearch.
    # Here we also fix the position of all other syntactic constants, for the convenience
    # of computing embeddings.
    for k, v in sorted(self._global_syntax_constants_vocab.items(), key=lambda x: x[1]):
        constrained_vocab.insert(v, k)
    schema_constants = constrained_vocab[:]

    # Divide the schema constants into groups of num_constants_per_group each.
    concat_strings = ['' for _ in range(len(schema_constants) // self._num_constants_per_group + 1)]
    for i in range(len(schema_constants) // self._num_constants_per_group + 1):
        if (i + 1) * self._num_constants_per_group <= len(schema_constants):
            right_index = (i + 1) * self._num_constants_per_group
        else:
            right_index = len(schema_constants)
        for constant in schema_constants[i * self._num_constants_per_group:right_index]:
            if constant in entity_map:
                # Represent an entity by its friendly name.
                constant = entity_map[constant]
            if constant == '.':
                # '.' in sparql means "and".
                constant = 'and'
            concat_strings[i] += ' '.join(re.split(r'\.|_', constant.lower())) + self._delimiter

    # Handle sequences of length > 512 by tokenizing each group separately.
    # _source_tokenizer.tokenize appends the leading [CLS] and trailing [SEP] itself.
    tokenized_sources = [
        self._source_tokenizer.tokenize(item['question'] + '[SEP]' + concat_string)
        for concat_string in concat_strings
    ]
    end = []
    start = []
    for tokenized_source in tokenized_sources:
        flag = False
        for i, token in enumerate(tokenized_source):
            if flag and str(token) == self._delimiter:
                end.append(i - 1)
                start.append(i + 1)
            if str(token) == '[SEP]':
                if not flag:
                    start.append(i + 1)
                flag = True
        start = start[:-1]  # ignore the last ';'

    # source_field = ListField(
    #     [TextField(tokenized_source, self._source_token_indexers) for tokenized_source in tokenized_sources])
    source_field = []
    for tokenized_source in tokenized_sources:
        chunk = TextField(tokenized_source, self._source_token_indexers)
        if len(chunk) > self._source_max_tokens:
            raise ValueError("Source chunk of length {} for qid {} exceeds the maximum of {} tokens."
                             .format(len(chunk), item['qid'], self._source_max_tokens))
        source_field.append(chunk)
    source_field = ListField(source_field)

    # vocab_field = TextField([Token(x) for x in constrained_vocab], self._target_token_indexers)
    vocab_field = MetadataField(constrained_vocab)
    assert len(constrained_vocab) == len(start), str(entity_map)

    instance_dict = {
        # The concatenation of the utterance and the schema constants.
        "source_tokens": source_field,
        # The start position of each schema constant in the concatenated input.
        "schema_start": MetadataField(start),
        # The end position of each schema constant in the concatenated input.
        "schema_end": MetadataField(end),
        "constrained_vocab": vocab_field,
        "ids": qid
    }

    # If you want to use F1 during training, uncomment this!
    # if 'answer' in item:
    #     answer = []
    #     for a in item['answer']:
    #         answer.append(a['answer_argument'])
    #     instance_dict['answer'] = MetadataField(answer)

    if not self._training and self._ranking_mode and logical_forms:
        lfs = []
        for lf in logical_forms:
            try:
                lf_field = self._convert_target_to_indices(lf, constrained_vocab, vocab_field)
                lfs.append(lf_field)
            except Exception:
                pass
        if len(lfs) == 0:
            return None
        instance_dict["candidates"] = ListField(lfs)

    if target_string is not None:
        # The id of each target token in constrained_vocab.
        target_field = self._convert_target_to_indices(target_string, constrained_vocab, vocab_field)
        instance_dict["target_tokens"] = target_field

    return Instance(instance_dict)
def make_reading_comprehension_instance(question_tokens: List[Token],
                                        passage_tokens: List[Token],
                                        token_indexers: Dict[str, TokenIndexer],
                                        passage_text: str,
                                        token_spans: List[Tuple[int, int]] = None,
                                        answer_texts: List[str] = None,
                                        additional_metadata: Dict[str, Any] = None) -> Instance:
    """
    Converts a question, a passage, and an optional answer (or answers) to an ``Instance`` for use
    in a reading comprehension model.

    Creates an ``Instance`` with at least these fields: ``question`` and ``passage``, both
    ``TextFields``; and ``metadata``, a ``MetadataField``.  Additionally, if both ``answer_texts``
    and ``char_span_starts`` are given, the ``Instance`` has ``span_start`` and ``span_end``
    fields, which are both ``ListFields`` of ``IndexFields``.

    Parameters
    ----------
    question_tokens : ``List[Token]``
        An already-tokenized question.
    passage_tokens : ``List[Token]``
        An already-tokenized passage that contains the answer to the given question.
    token_indexers : ``Dict[str, TokenIndexer]``
        Determines how the question and passage ``TextFields`` will be converted into tensors that
        get input to a model.  See :class:`TokenIndexer`.
    passage_text : ``str``
        The original passage text.  We need this so that we can recover the actual span from the
        original passage that the model predicts as the answer to the question.  This is used in
        official evaluation scripts.
    token_spans : ``List[Tuple[int, int]]``, optional
        Indices into ``passage_tokens`` to use as the answer to the question for training.  This is
        a list because there might be several possible correct answer spans in the passage.
        Unlike the single-span version of this helper, we keep all annotated spans here.
    answer_texts : ``List[str]``, optional
        All valid answer strings for the given question.  In SQuAD, e.g., the training set has
        exactly one answer per question, but the dev and test sets have several.  TriviaQA has many
        possible answers, which are the aliases for the known correct entity.  This is put into the
        metadata for use with official evaluation scripts, but not used anywhere else.
    additional_metadata : ``Dict[str, Any]``, optional
        The constructed ``metadata`` field will by default contain ``original_passage``,
        ``token_offsets``, ``question_tokens``, ``passage_tokens``, and ``answer_texts`` keys.  If
        you want any other metadata to be associated with each instance, you can pass that in here.
        This dictionary will get added to the ``metadata`` dictionary we already construct.
    """
    additional_metadata = additional_metadata or {}
    fields: Dict[str, Field] = {}
    passage_offsets = [(token.idx, token.idx + len(token.text)) for token in passage_tokens]
    # This is separate so we can reference it later with a known type.
    passage_field = TextField(passage_tokens, token_indexers)
    fields['passage'] = passage_field
    fields['question'] = TextField(question_tokens, token_indexers)
    metadata = {
        'original_passage': passage_text,
        'token_offsets': passage_offsets,
        'question_tokens': [token.text for token in question_tokens],
        'passage_tokens': [token.text for token in passage_tokens],
    }
    if answer_texts:
        metadata['answer_texts'] = answer_texts

    list_span_start = []
    list_span_end = []
    if token_spans:
        # There may be multiple answer annotations; we keep every annotated span rather
        # than picking a single one, so ``span_start`` and ``span_end`` are ListFields.
        for span_start, span_end in token_spans:
            list_span_start.append(IndexField(span_start, passage_field))
            list_span_end.append(IndexField(span_end, passage_field))
        fields['span_start'] = ListField(list_span_start)
        fields['span_end'] = ListField(list_span_end)

    metadata.update(additional_metadata)
    fields['metadata'] = MetadataField(metadata)
    return Instance(fields)
def text_to_instance(  # type: ignore
        self,
        tokens: List[Token],
        pos_tags: List[str] = None,
        chunk_tags: List[str] = None,
        ner_tags: List[str] = None,
) -> Instance:
    """
    We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
    """
    sequence = TextField(tokens, self._token_indexers)
    instance_fields: Dict[str, Field] = {"tokens": sequence}
    instance_fields["metadata"] = MetadataField({"words": [x.text for x in tokens]})

    # Recode the labels if necessary.
    if self.coding_scheme == "BIOUL":
        coded_chunks = (
            to_bioul(chunk_tags, encoding=self._original_coding_scheme)
            if chunk_tags is not None
            else None
        )
        coded_ner = (
            to_bioul(ner_tags, encoding=self._original_coding_scheme)
            if ner_tags is not None
            else None
        )
    else:
        # the default IOB1
        coded_chunks = chunk_tags
        coded_ner = ner_tags

    # Add "feature labels" to instance
    if "pos" in self.feature_labels:
        if pos_tags is None:
            raise ConfigurationError(
                "Dataset reader was specified to use pos_tags as "
                "features. Pass them to text_to_instance."
            )
        instance_fields["pos_tags"] = SequenceLabelField(pos_tags, sequence, "pos_tags")
    if "chunk" in self.feature_labels:
        if coded_chunks is None:
            raise ConfigurationError(
                "Dataset reader was specified to use chunk tags as "
                "features. Pass them to text_to_instance."
            )
        instance_fields["chunk_tags"] = SequenceLabelField(coded_chunks, sequence, "chunk_tags")
    if "ner" in self.feature_labels:
        if coded_ner is None:
            raise ConfigurationError(
                "Dataset reader was specified to use NER tags as "
                "features. Pass them to text_to_instance."
            )
        instance_fields["ner_tags"] = SequenceLabelField(coded_ner, sequence, "ner_tags")

    # Add "tag label" to instance
    if self.tag_label == "ner" and coded_ner is not None:
        instance_fields["tags"] = SequenceLabelField(coded_ner, sequence, self.label_namespace)
    elif self.tag_label == "pos" and pos_tags is not None:
        instance_fields["tags"] = SequenceLabelField(pos_tags, sequence, self.label_namespace)
    elif self.tag_label == "chunk" and coded_chunks is not None:
        instance_fields["tags"] = SequenceLabelField(coded_chunks, sequence, self.label_namespace)

    return Instance(instance_fields)
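# Sketch of the BIOUL recoding used above (illustrative; uses AllenNLP's standard
# `to_bioul` helper, which converts IOB1 tag sequences to the BIOUL scheme).
from allennlp.data.dataset_readers.dataset_utils import to_bioul

# A two-token PER span followed by a single-token LOC span, in IOB1:
print(to_bioul(["I-PER", "I-PER", "O", "I-LOC"], encoding="IOB1"))
# -> ['B-PER', 'L-PER', 'O', 'U-LOC']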
def text_to_instance(
        self,  # type: ignore
        sentence: List[Token],
        gold_clusters: Optional[List[List[Tuple[int, int]]]] = None,
) -> Instance:
    """
    # Parameters

    sentence : `List[Token]`, required.
        The already tokenised sentence to analyse.
    gold_clusters : `Optional[List[List[Tuple[int, int]]]]`, optional (default = None)
        A list of all clusters in the sentence, represented as word spans. Each cluster
        contains some number of spans, which can be nested and overlap, but will never
        exactly match between clusters.

    # Returns

    An `Instance` containing the following `Fields`:
        text : `TextField`
            The text of the full sentence.
        spans : `ListField[SpanField]`
            A ListField containing the spans represented as `SpanFields`
            with respect to the sentence text.
        span_labels : `SequenceLabelField`, optional
            The id of the cluster which each possible span belongs to, or -1 if it does
            not belong to a cluster. As these labels have variable length (it depends on
            how many spans we are considering), we represent this as a `SequenceLabelField`
            with respect to the `spans` `ListField`.
    """
    metadata: Dict[str, Any] = {"original_text": sentence}
    if gold_clusters is not None:
        metadata["clusters"] = gold_clusters

    text_field = TextField(sentence, self._token_indexers)

    cluster_dict = {}
    if gold_clusters is not None:
        for cluster_id, cluster in enumerate(gold_clusters):
            for mention in cluster:
                cluster_dict[tuple(mention)] = cluster_id

    spans: List[Field] = []
    span_labels: Optional[List[int]] = [] if gold_clusters is not None else None

    for start, end in enumerate_spans(sentence, max_span_width=self._max_span_width):
        if span_labels is not None:
            if (start, end) in cluster_dict:
                span_labels.append(cluster_dict[(start, end)])
            else:
                span_labels.append(-1)
        spans.append(SpanField(start, end, text_field))

    span_field = ListField(spans)
    metadata_field = MetadataField(metadata)

    fields: Dict[str, Field] = {
        "text": text_field,
        "spans": span_field,
        "metadata": metadata_field,
    }
    if span_labels is not None:
        fields["span_labels"] = SequenceLabelField(span_labels, span_field)

    return Instance(fields)
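# Sketch of the span enumeration driving the loop above (illustrative).  Spans are
# inclusive (start, end) token indices, capped at `max_span_width`.
from allennlp.data.dataset_readers.dataset_utils import enumerate_spans
from allennlp.data.tokenizers import Token

demo_sentence = [Token(t) for t in ["The", "cat", "sat"]]
print(enumerate_spans(demo_sentence, max_span_width=2))
# -> [(0, 0), (0, 1), (1, 1), (1, 2), (2, 2)]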
def text_to_instance(
        self,
        sentence: str,
        identifier: str,
        image_ids: List[str],
        logical_form: str = None,
        attention_mode: int = None,
        box_annotation: Dict = None,
        denotation: str = None,
) -> Instance:
    tokenized_sentence = self._tokenizer.tokenize(sentence)
    sentence_field = TextField(tokenized_sentence, self._token_indexers)
    world = VisualReasoningNlvr2Language(None, None, None, None, None, None)

    production_rule_fields: List[Field] = []
    instance_action_ids: Dict[str, int] = {}
    for production_rule in world.all_possible_productions():
        instance_action_ids[production_rule] = len(instance_action_ids)
        field = ProductionRuleField(production_rule, is_global_rule=True)
        production_rule_fields.append(field)
    action_field = ListField(production_rule_fields)

    boxes2 = []
    feats2 = []
    max_num_boxes = 0
    for key in image_ids:
        if self.img_data is not None:
            img_info = self.img_data[key]
        else:
            split_name = "train"
            if "dev" in key:
                split_name = "valid"
            img_info = pickle.load(
                open(os.path.join(self._image_feat_cache_dir,
                                  split_name + "_obj36.tsv", key), "rb"))
        boxes = img_info["boxes"].copy()
        feats = img_info["features"].copy()
        assert len(boxes) == len(feats)

        # Normalize the boxes to [0, 1].
        img_h, img_w = img_info["img_h"], img_info["img_w"]
        boxes[..., (0, 2)] /= img_w
        boxes[..., (1, 3)] /= img_h
        np.testing.assert_array_less(boxes, 1 + 1e-5)
        np.testing.assert_array_less(-boxes, 0 + 1e-5)

        if boxes.shape[0] > self._max_boxes:
            boxes = boxes[:self._max_boxes, :]
            feats = feats[:self._max_boxes, :]
        max_num_boxes = max(max_num_boxes, boxes.shape[0])
        boxes2.append(boxes)
        feats2.append(feats)

    # Zero-pad every image's boxes and features to the same number of boxes.
    boxes3 = [np.zeros((max_num_boxes, img_boxes.shape[-1])) for img_boxes in boxes2]
    feats3 = [np.zeros((max_num_boxes, img_feats.shape[-1])) for img_feats in feats2]
    for i in range(len(boxes2)):
        boxes3[i][:boxes2[i].shape[0], :] = boxes2[i]
        feats3[i][:feats2[i].shape[0], :] = feats2[i]
    boxes2 = boxes3
    feats2 = feats3
    feats = np.stack(feats2)
    boxes = np.stack(boxes2)

    metadata: Dict[str, Any] = {
        "utterance": sentence,
        "tokenized_utterance": tokenized_sentence,
        "identifier": identifier,
    }
    fields: Dict[str, Field] = {
        "sentence": sentence_field,
        "actions": action_field,
        "metadata": MetadataField(metadata),
        "image_id": MetadataField(identifier[:-2]),
        "visual_feat": ArrayField(feats),
        "pos": ArrayField(boxes),
    }
    if denotation is not None:
        fields["denotation"] = LabelField(denotation, skip_indexing=True)
    if logical_form:
        lisp_exp = annotation_to_lisp_exp(logical_form)
        target_sequence = world.logical_form_to_action_sequence(lisp_exp)
        index_field = [IndexField(instance_action_ids[action], action_field)
                       for action in target_sequence]
        fields["target_action_sequence"] = ListField(index_field)

        module_attention = annotation_to_module_attention(logical_form)
        target_attention = target_sequence_to_target_attn(target_sequence, module_attention)
        gold_question_attentions = self._assign_attention_to_tokens(
            target_attention, sentence, attention_mode)
        attn_index_field = [
            ListField([IndexField(att, sentence_field) for att in target_att])
            for target_att in gold_question_attentions
        ]
        fields["gold_question_attentions"] = ListField(attn_index_field)

        if box_annotation is None and len(self.box_annotations) > 0:
            fields["gold_box_annotations"] = MetadataField([])
        elif box_annotation is not None:
            # Compute, for each module in the logical form, the indices of its children
            # (modules exactly one indentation level deeper, where indentation is marked
            # by leading periods).
            modules = logical_form.split("\n")
            children = [[] for _ in modules]
            for j, module in enumerate(modules):
                num_periods = len(module) - len(module.strip("."))
                for k in range(j + 1, len(modules)):
                    num_periods_k = len(modules[k]) - len(modules[k].strip("."))
                    if num_periods_k <= num_periods:
                        break
                    if num_periods_k == num_periods + 1:
                        children[j].append(k)
            # Propagate box annotations upward through the image-selection modules.
            for j in range(len(modules) - 1, -1, -1):
                if modules[j].strip(".") == "in_left_image":
                    box_annotation[j] = {}
                    box_annotation[j]["module"] = modules[j].strip(".")
                    box_annotation[j][0] = box_annotation[j + 1][0]
                    box_annotation[j][1] = []
                elif modules[j].strip(".") == "in_right_image":
                    box_annotation[j] = {}
                    box_annotation[j]["module"] = modules[j].strip(".")
                    box_annotation[j][1] = box_annotation[j + 1][1]
                    box_annotation[j][0] = []
                elif modules[j].strip(".") in {"in_one_image", "in_other_image"}:
                    box_annotation[j] = {}
                    box_annotation[j]["module"] = modules[j].strip(".")
                    box_annotation[j][0] = box_annotation[j + 1][0]
                    box_annotation[j][1] = box_annotation[j + 1][1]
            keys = sorted(list(box_annotation.keys()))
            module_boxes = [(mod,
                             box_annotation[mod]["module"],
                             [box_annotation[mod][0], box_annotation[mod][1]])
                            for mod in keys]
            gold_boxes, gold_counts = target_sequence_to_target_boxes(
                target_sequence, module_boxes, children)
            fields["gold_box_annotations"] = MetadataField(gold_boxes)

        metadata["gold"] = world.action_sequence_to_logical_form(target_sequence)
        fields["valid_target_sequence"] = ArrayField(np.array(1, dtype=np.int32))
    else:
        fields["target_action_sequence"] = ListField([IndexField(0, action_field)])
        fields["gold_question_attentions"] = ListField(
            [ListField([IndexField(0, sentence_field)])])
        fields["valid_target_sequence"] = ArrayField(np.array(0, dtype=np.int32))
        if len(self.box_annotations) > 0:
            fields["gold_box_annotations"] = MetadataField([])
    return Instance(fields)
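# Sketch of the box normalization step above (illustrative toy data).  Boxes are
# (x1, y1, x2, y2) pixel coordinates; dividing x's by the image width and y's by the
# height rescales everything into [0, 1], which the assertions then verify.
import numpy as np

demo_boxes = np.array([[10., 20., 100., 80.],
                       [0., 0., 320., 240.]])
demo_img_w, demo_img_h = 320, 240
demo_boxes[..., (0, 2)] /= demo_img_w
demo_boxes[..., (1, 3)] /= demo_img_h
np.testing.assert_array_less(demo_boxes, 1 + 1e-5)   # nothing exceeds the image bounds
np.testing.assert_array_less(-demo_boxes, 0 + 1e-5)  # nothing is negative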
def text_to_instance(self,  # type: ignore
                     question: str,
                     table_lines: List[List[str]],
                     target_values: List[str],
                     offline_search_output: List[str] = None) -> Instance:
    """
    Reads text inputs and makes an instance. WikitableQuestions dataset provides tables as TSV
    files pre-tagged using CoreNLP, which we use for training.

    Parameters
    ----------
    question : ``str``
        Input question
    table_lines : ``List[List[str]]``
        The table content preprocessed by CoreNLP. See ``TableQuestionContext.read_from_lines``
        for the expected format.
    target_values : ``List[str]``
        Target values of the answer, used for denotation evaluation.
    offline_search_output : ``List[str]``, optional
        List of logical forms, produced by offline search. Not required during test.
    """
    # pylint: disable=arguments-differ
    tokenized_question = self._tokenizer.tokenize(question.lower())
    question_field = TextField(tokenized_question, self._question_token_indexers)
    # TODO(pradeep): We'll need a better way to input CoreNLP processed lines.
    table_context = TableQuestionContext.read_from_lines(table_lines, tokenized_question)
    target_values_field = MetadataField(target_values)
    world = WikiTablesVariableFreeWorld(table_context)
    world_field = MetadataField(world)
    # Note: Not passing any feature extractors when instantiating the field below. This will
    # make it use all the available extractors.
    table_field = KnowledgeGraphField(table_context.get_table_knowledge_graph(),
                                      tokenized_question,
                                      self._table_token_indexers,
                                      tokenizer=self._tokenizer,
                                      include_in_vocab=self._use_table_for_vocab,
                                      max_table_tokens=self._max_table_tokens)
    production_rule_fields: List[Field] = []
    for production_rule in world.all_possible_actions():
        _, rule_right_side = production_rule.split(' -> ')
        is_global_rule = not world.is_instance_specific_entity(rule_right_side)
        field = ProductionRuleField(production_rule, is_global_rule=is_global_rule)
        production_rule_fields.append(field)
    action_field = ListField(production_rule_fields)

    fields = {'question': question_field,
              'table': table_field,
              'world': world_field,
              'actions': action_field,
              'target_values': target_values_field}

    # We'll make each target action sequence a List[IndexField], where the index is into
    # the action list we made above.  We need to ignore the type here because mypy doesn't
    # like `action.rule` - it's hard to tell mypy that the ListField is made up of
    # ProductionRuleFields.
    action_map = {action.rule: i  # type: ignore
                  for i, action in enumerate(action_field.field_list)}
    if offline_search_output:
        action_sequence_fields: List[Field] = []
        for logical_form in offline_search_output:
            try:
                expression = world.parse_logical_form(logical_form)
            except ParsingError as error:
                logger.debug(f'Parsing error: {error.message}, skipping logical form')
                logger.debug(f'Question was: {question}')
                logger.debug(f'Logical form was: {logical_form}')
                logger.debug(f'Table info was: {table_lines}')
                continue
            except:
                logger.error(logical_form)
                raise
            action_sequence = world.get_action_sequence(expression)
            try:
                index_fields: List[Field] = []
                for production_rule in action_sequence:
                    index_fields.append(IndexField(action_map[production_rule], action_field))
                action_sequence_fields.append(ListField(index_fields))
            except KeyError as error:
                logger.debug(f'Missing production rule: {error.args}, skipping logical form')
                logger.debug(f'Question was: {question}')
                logger.debug(f'Table info was: {table_lines}')
                logger.debug(f'Logical form was: {logical_form}')
                continue
            if len(action_sequence_fields) >= self._max_offline_logical_forms:
                break

        if not action_sequence_fields:
            # This is not great, but we're only doing it when we're passed logical form
            # supervision, so we're expecting labeled logical forms, but we can't actually
            # produce the logical forms.  We should skip this instance.  Note that this affects
            # _dev_ and _test_ instances, too, so your metrics could be over-estimates on the
            # full test data.
            return None
        fields['target_action_sequences'] = ListField(action_sequence_fields)
    if self._output_agendas:
        agenda_index_fields: List[Field] = []
        for agenda_string in world.get_agenda(conservative=True):
            agenda_index_fields.append(IndexField(action_map[agenda_string], action_field))
        if not agenda_index_fields:
            agenda_index_fields = [IndexField(-1, action_field)]
        fields['agenda'] = ListField(agenda_index_fields)
    return Instance(fields)
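# Sketch of the action-map pattern above (illustrative; the rules here are toy
# stand-ins for `world.all_possible_actions()` output).  Each production rule gets a
# stable index, and a target action sequence becomes a list of indices into the
# action ListField.
demo_actions = ["S -> A", "A -> select", "A -> filter"]
demo_action_map = {rule: i for i, rule in enumerate(demo_actions)}
demo_target_sequence = ["S -> A", "A -> filter"]
print([demo_action_map[rule] for rule in demo_target_sequence])
# -> [0, 2]; these indices become IndexFields into the action ListField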
def text_to_instance(self,  # type: ignore
                     words: List[str],
                     lemmas: List[str] = None,
                     lemma_rules: List[str] = None,
                     upos_tags: List[str] = None,
                     xpos_tags: List[str] = None,
                     feats: List[str] = None,
                     separate_feats: List[Dict[str, str]] = None,
                     dependencies: List[Tuple[str, int]] = None,
                     ids: List[str] = None,
                     multiword_ids: List[str] = None,
                     multiword_forms: List[str] = None,
                     langs: List[str] = None) -> Instance:
    fields: Dict[str, Field] = {}

    if self.use_lang_ids:
        # use ent_type_ for lang_ids
        tokens = TextField([Token(text=w, ent_type_=l) for w, l in zip(words, langs)],
                           self._token_indexers)
    else:
        tokens = TextField([Token(w) for w in words], self._token_indexers)
    fields["tokens"] = tokens

    names = ["upos", "xpos", "feats", "lemmas", "langs"]
    all_tags = [upos_tags, xpos_tags, feats, lemma_rules, langs]
    for name, field in zip(names, all_tags):
        if field:
            fields[name] = SequenceLabelField(field, tokens, label_namespace=name)

    if dependencies is not None:
        # We don't want to expand the label namespace with an additional dummy token, so we'll
        # always give the 'ROOT_HEAD' token a label of 'root'.
        fields["head_tags"] = SequenceLabelField([x[0] for x in dependencies],
                                                 tokens,
                                                 label_namespace="head_tags")
        fields["head_indices"] = SequenceLabelField([int(x[1]) for x in dependencies],
                                                    tokens,
                                                    label_namespace="head_index_tags")

    if self.use_separate_feats:
        feature_seq = []
        for feat_set in separate_feats:
            dimensions = {dimension.replace('[', '_').replace(']', '_'): "_"
                          for dimension in self.ud_feats_schema}
            if feat_set != "_":
                for dimension in feat_set:
                    dimensions[dimension.replace('[', '_').replace(']', '_')] = feat_set[dimension]
            feature_seq.append(dimensions)
        for dimension in self.ud_feats_schema:
            d = dimension.replace('[', '_').replace(']', '_')
            labels = [f[d] for f in feature_seq]
            fields[d] = SequenceLabelField(labels, tokens, label_namespace=d)

    fields["metadata"] = MetadataField({
        "words": words,
        "upos_tags": upos_tags,
        "xpos_tags": xpos_tags,
        "feats": feats,
        "lemmas": lemmas,
        "lemma_rules": lemma_rules,
        "ids": ids,
        "multiword_ids": multiword_ids,
        "multiword_forms": multiword_forms,
        "langs": langs,
    })
    return Instance(fields)
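# Sketch of the language-id trick above (illustrative toy data).  The spaCy-style
# `ent_type_` slot on `Token` is repurposed to carry a per-token language id.
from allennlp.data.tokenizers import Token

demo_words = ["la", "maison", "is", "red"]
demo_langs = ["fr", "fr", "en", "en"]
demo_tokens = [Token(text=w, ent_type_=l) for w, l in zip(demo_words, demo_langs)]
print([(t.text, t.ent_type_) for t in demo_tokens])
# -> [('la', 'fr'), ('maison', 'fr'), ('is', 'en'), ('red', 'en')]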
def text_to_instance(self,  # type: ignore
                     premise: List[Tuple[str, float]],  # Important type information
                     hypothesis: str,
                     pid: str = None,
                     label: str = None) -> Instance:
    fields: Dict[str, Field] = {}

    if self.shuffle_sentences:
        # Potential improvement: shuffle the input sentences.  Maybe disable this for the
        # last several epochs.
        random.shuffle(premise)

    premise_prob_list = []
    premise_tokens_list = []
    for premise_sent, prob in premise:
        tokenized_cur_sent = self.bert_servant.tokenize(premise_sent, modify_from_corenlp=True)
        if self.max_l is not None:
            # Truncate to the maximum length (default 60).
            tokenized_cur_sent = tokenized_cur_sent[:self.max_l]
        premise_tokens_list.extend(tokenized_cur_sent)
        prob_value = np.ones((len(tokenized_cur_sent), 1), dtype=np.float32) * prob
        premise_prob_list.append(prob_value)

    premise_prob = np.concatenate(premise_prob_list, axis=0)

    hypothesis_tokens_list = self.bert_servant.tokenize(hypothesis, modify_from_corenlp=True)
    if self.max_l is not None:
        hypothesis_tokens_list = hypothesis_tokens_list[:self.max_l]
    hypothesis_prob = np.ones((len(hypothesis_tokens_list), 1), dtype=np.float32)

    assert len(premise_tokens_list) == len(premise_prob)
    assert len(hypothesis_tokens_list) == len(hypothesis_prob)

    paired_tokens_sequence = ['[CLS]'] + premise_tokens_list + ['[SEP]'] + \
                             hypothesis_tokens_list + ['[SEP]']
    token_type_ids = [0] * (2 + len(premise_tokens_list)) + \
                     [1] * (1 + len(hypothesis_tokens_list))

    paired_ids_seq = self.bert_servant.tokens_to_ids(paired_tokens_sequence)
    assert len(paired_ids_seq) == len(token_type_ids)

    fields['paired_sequence'] = BertIndexField(np.asarray(paired_ids_seq, dtype=np.int64))
    fields['paired_token_type_ids'] = BertIndexField(np.asarray(token_type_ids, dtype=np.int64))

    premise_span = (1, 1 + len(premise_tokens_list))  # End is exclusive (important for later use)
    hypothesis_span = (premise_span[1] + 1, premise_span[1] + 1 + len(hypothesis_tokens_list))
    assert len(paired_ids_seq) == 1 + (premise_span[1] - premise_span[0]) + 1 + \
                                  (hypothesis_span[1] - hypothesis_span[0]) + 1

    fields['bert_premise_span'] = MetadataField(premise_span)
    fields['bert_hypothesis_span'] = MetadataField(hypothesis_span)
    fields['premise_probs'] = MetadataField(premise_prob)
    fields['hypothesis_probs'] = MetadataField(hypothesis_prob)

    if label:
        fields['label'] = LabelField(label, label_namespace='labels')
    if pid:
        fields['pid'] = IdField(pid)

    return Instance(fields)
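# Sketch of the [CLS]/[SEP] pairing arithmetic above (illustrative; plain lists stand
# in for the BertServant wordpiece output).  Both spans are end-exclusive.
demo_premise = ["the", "cat", "sat"]
demo_hypothesis = ["a", "cat", "sat"]
demo_paired = ["[CLS]"] + demo_premise + ["[SEP]"] + demo_hypothesis + ["[SEP]"]
demo_type_ids = [0] * (2 + len(demo_premise)) + [1] * (1 + len(demo_hypothesis))
assert len(demo_paired) == len(demo_type_ids)
demo_premise_span = (1, 1 + len(demo_premise))
demo_hypothesis_span = (demo_premise_span[1] + 1,
                        demo_premise_span[1] + 1 + len(demo_hypothesis))
print(demo_paired[demo_premise_span[0]:demo_premise_span[1]])        # ['the', 'cat', 'sat']
print(demo_paired[demo_hypothesis_span[0]:demo_hypothesis_span[1]])  # ['a', 'cat', 'sat']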
def text_to_instance(self, source: Token, targets: List[Token] = None) -> Instance:
    # Use None instead of a mutable default argument, which would be shared across calls.
    targets = targets if targets is not None else []
    fields = {'source': TextField([source], self._syllable_indexers),
              'targets': TextField(targets, self._word_indexers)}
    return Instance(fields)
def text_to_instance(self, inputs):
    fields: Dict[str, Field] = {}
    tokens_list_field: List[TextField] = []
    sent_positions_list_field: List[TextField] = []
    position_list_field: List[TextField] = []
    participant_mask_list_field: List[SequenceLabelField] = []
    after_loc_start_list_field: List[IndexField] = []
    after_loc_end_list_field: List[IndexField] = []
    after_category_list_field: List[IndexField] = []
    after_category_mask_list_field: List[IndexField] = []

    # `category_list` and `category_mask_list` are assumed to be defined at module level.
    category_field = ListField([LabelField(str(l), "labels") for l in category_list])
    category_mask_field = ListField([LabelField(str(l), "labels") for l in category_mask_list])

    token_field_step0 = TextField(inputs[0][0], self._token_indexers)
    before_loc_start_field = IndexField(inputs[6][0], token_field_step0)
    before_loc_end_field = IndexField(inputs[7][0], token_field_step0)
    before_category_field = IndexField(inputs[4][0], category_field)
    before_category_mask_field = IndexField(inputs[5][0], category_mask_field)

    for i in range(len(inputs[0])):
        token_field = TextField(inputs[0][i], self._token_indexers)
        tokens_list_field.append(token_field)
        sent_positions_list_field.append(TextField(inputs[1][i], self._sent_position_indexers))
        position_list_field.append(TextField(inputs[2][i], self._token_position_indexers))
        participant_mask_list_field.append(SequenceLabelField(inputs[3][i], token_field, 'tags'))
        after_loc_start_list_field.append(IndexField(inputs[10][i], token_field))
        after_loc_end_list_field.append(IndexField(inputs[11][i], token_field))
        after_category_list_field.append(IndexField(inputs[8][i], category_field))
        after_category_mask_list_field.append(IndexField(inputs[9][i], category_mask_field))

    fields['tokens_list'] = ListField(tokens_list_field)
    fields['positions_list'] = ListField(position_list_field)
    fields['sent_positions_list'] = ListField(sent_positions_list_field)
    fields['before_loc_start'] = before_loc_start_field
    fields['before_loc_end'] = before_loc_end_field
    fields['after_loc_start_list'] = ListField(after_loc_start_list_field)
    fields['after_loc_end_list'] = ListField(after_loc_end_list_field)
    fields['before_category'] = before_category_field
    fields['after_category_list'] = ListField(after_category_list_field)
    fields['before_category_mask'] = before_category_mask_field
    fields['after_category_mask_list'] = ListField(after_category_mask_list_field)
    return Instance(fields)
def text_to_instance(
        self,  # type: ignore
        sentences: List[List[str]],
        gold_clusters: Optional[List[List[Tuple[int, int]]]] = None) -> Instance:
    # pylint: disable=arguments-differ
    """
    Parameters
    ----------
    sentences : ``List[List[str]]``, required.
        A list of lists representing the tokenised words and sentences in the document.
    gold_clusters : ``Optional[List[List[Tuple[int, int]]]]``, optional (default = None)
        A list of all clusters in the document, represented as word spans. Each cluster
        contains some number of spans, which can be nested and overlap, but will never
        exactly match between clusters.

    Returns
    -------
    An ``Instance`` containing the following ``Fields``:
        text : ``TextField``
            The text of the full document.
        spans : ``ListField[SpanField]``
            A ListField containing the spans represented as ``SpanFields``
            with respect to the document text.
        span_labels : ``SequenceLabelField``, optional
            The id of the cluster which each possible span belongs to, or -1 if it does
            not belong to a cluster. As these labels have variable length (it depends on
            how many spans we are considering), we represent this as a
            ``SequenceLabelField`` with respect to the ``spans`` ``ListField``.
    """
    flattened_sentences = [self._normalize_word(word)
                           for sentence in sentences
                           for word in sentence]
    # Align the clusters to the (normalized) tokens.
    gold_clusters = self.align_clusters_to_tokens(flattened_sentences, gold_clusters)

    def tokenizer(s: str):
        return self.token_indexer.wordpiece_tokenizer(s)

    # We need to try this with the other tokenizer as well.
    flattened_sentences = tokenizer(" ".join(flattened_sentences))

    metadata: Dict[str, Any] = {"original_text": flattened_sentences}
    if gold_clusters is not None:
        metadata["clusters"] = gold_clusters

    text_field = TextField([Token("[CLS]")]
                           + [Token(word) for word in flattened_sentences]
                           + [Token("[SEP]")],
                           self._token_indexers)

    cluster_dict = {}
    if gold_clusters is not None:
        for cluster_id, cluster in enumerate(gold_clusters):
            for mention in cluster:
                cluster_dict[tuple(mention)] = cluster_id

    spans: List[Field] = []
    span_labels: Optional[List[int]] = [] if gold_clusters is not None else None

    sentence_offset = 0
    normal = []
    for sentence in sentences:
        # Enumerate the spans.
        for start, end in enumerate_spans(sentence,
                                          offset=sentence_offset,
                                          max_span_width=self._max_span_width):
            if span_labels is not None:
                if (start, end) in cluster_dict:
                    span_labels.append(cluster_dict[(start, end)])
                else:
                    span_labels.append(-1)
            # Align the spans to the BERT tokenization; the span field needs to be over
            # the flattened sentence.
            normal.append((start, end))
            span_field = TextField([Token("[CLS]")]
                                   + [Token(word) for word in flattened_sentences]
                                   + [Token("[SEP]")],
                                   self._token_indexers)
            spans.append(SpanField(start, end, span_field))
        sentence_offset += len(sentence)

    span_field = ListField(spans)
    metadata_field = MetadataField(metadata)

    fields: Dict[str, Field] = {
        "text": text_field,
        "spans": span_field,
        "metadata": metadata_field,
    }
    if span_labels is not None:
        fields["span_labels"] = SequenceLabelField(span_labels, span_field)

    return Instance(fields)
def text_to_instance(self,  # type: ignore
                     tokens: List[str],
                     lemmas: List[str] = None,
                     pos_tags: List[str] = None,
                     arc_indices: List[Tuple[int, int]] = None,
                     arc_tags: List[str] = None,
                     gold_actions: List[str] = None,
                     arc_descendants: List[str] = None,
                     root_id: List[int] = None,
                     meta_info: List[str] = None,
                     tokens_range: List[Tuple[int, int]] = None,
                     gold_mrps: List[str] = None,
                     deprels: List[str] = None,
                     lex_infos: List[List[str]] = None) -> Instance:
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}
    token_field = TextField([Token(t) for t in tokens], self._token_indexers)
    fields["tokens"] = token_field
    meta_dict = {"tokens": tokens}

    if arc_indices is not None and arc_tags is not None:
        meta_dict["arc_indices"] = arc_indices
        meta_dict["arc_tags"] = arc_tags
        fields["arc_tags"] = TextField([Token(a) for a in arc_tags], self._arc_tag_indexers)

    if gold_actions is not None:
        meta_dict["gold_actions"] = gold_actions
        fields["gold_actions"] = TextField([Token(a) for a in gold_actions],
                                           self._action_indexers)

    if pos_tags is not None and self.pos_tags:
        fields["pos_tags"] = SequenceLabelField(pos_tags, token_field, label_namespace="pos")

    if arc_descendants is not None:
        meta_dict["arc_descendants"] = arc_descendants
    if root_id is not None:
        meta_dict["root_id"] = root_id[0]
    if meta_info is not None:
        meta_dict["meta_info"] = meta_info[0]
    if tokens_range is not None:
        meta_dict["tokens_range"] = tokens_range
    if gold_mrps is not None:
        meta_dict["gold_mrps"] = gold_mrps[0]

    if deprels is not None and self.deprels:
        fields["deprels"] = SequenceLabelField(deprels, token_field, label_namespace="deprels")

    if lex_infos is not None:
        bios, lexcat, ss, ss2 = zip(*tuple(lex_infos))
        if self.bios:
            fields["bios"] = SequenceLabelField(bios, token_field, label_namespace="bios")
        if self.lexcat:
            fields["lexcat"] = SequenceLabelField(lexcat, token_field, label_namespace="lexcat")
        if self.ss:
            fields["ss"] = SequenceLabelField(ss, token_field, label_namespace="ss")
        if self.ss2:
            fields["ss2"] = SequenceLabelField(ss2, token_field, label_namespace="ss2")

    fields["metadata"] = MetadataField(meta_dict)
    return Instance(fields)
def text_to_instance(self,  # type: ignore
                     utterances: List[str],
                     sql_query_labels: List[str] = None) -> Instance:
    # pylint: disable=arguments-differ
    """
    Parameters
    ----------
    utterances : ``List[str]``, required.
        List of utterances in the interaction; the last element is the current utterance.
    sql_query_labels : ``List[str]``, optional
        The SQL queries that are given as labels during training or validation.
    """
    utterance = utterances[-1]
    action_sequence: List[str] = []

    if not utterance:
        return None

    world = AtisWorld(utterances=utterances, database_file=self._database_file)

    if sql_query_labels:
        # If there are multiple SQL queries given as labels, we use the shortest
        # one for training.
        sql_query = min(sql_query_labels, key=len)
        try:
            action_sequence = world.get_action_sequence(sql_query)
        except ParseError:
            logger.debug('Parsing error')

    tokenized_utterance = self._tokenizer.tokenize(utterance.lower())
    utterance_field = TextField(tokenized_utterance, self._token_indexers)

    production_rule_fields: List[Field] = []
    for production_rule in world.all_possible_actions():
        nonterminal, _ = production_rule.split(' ->')
        # The whitespaces are not semantically meaningful, so we filter them out.
        production_rule = ' '.join([token for token in production_rule.split(' ')
                                    if token != 'ws'])
        field = ProductionRuleField(production_rule, self._is_global_rule(nonterminal))
        production_rule_fields.append(field)

    action_field = ListField(production_rule_fields)
    action_map = {action.rule: i  # type: ignore
                  for i, action in enumerate(action_field.field_list)}
    index_fields: List[Field] = []
    world_field = MetadataField(world)
    fields = {'utterance': utterance_field,
              'actions': action_field,
              'world': world_field,
              'linking_scores': ArrayField(world.linking_scores)}

    if sql_query_labels is not None:
        fields['sql_queries'] = MetadataField(sql_query_labels)
        if action_sequence:
            for production_rule in action_sequence:
                index_fields.append(IndexField(action_map[production_rule], action_field))
            action_sequence_field = ListField(index_fields)
            fields['target_action_sequence'] = action_sequence_field
        else:
            # If we are given a SQL query but are unable to parse it, we skip the instance.
            return None
    return Instance(fields)
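# Sketch of the label selection and whitespace filtering above (illustrative; the
# rule string is a hypothetical stand-in for `world.all_possible_actions()` output).
demo_sql_query_labels = ["SELECT city . city_code FROM city ;",
                         "SELECT DISTINCT city . city_code FROM city ;"]
print(min(demo_sql_query_labels, key=len))  # the shortest query is used for training

demo_rule = "select_clause -> [ SELECT ws col_refs ]"
print(' '.join(t for t in demo_rule.split(' ') if t != 'ws'))
# -> "select_clause -> [ SELECT col_refs ]"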
def text_to_instance(
        self,  # type: ignore
        sentence: str,
        structured_representations: List[List[List[JsonDict]]],
        labels: List[str] = None,
        target_sequences: List[List[str]] = None,
        identifier: str = None,
) -> Instance:
    """
    Parameters
    ----------
    sentence : ``str``
        The query sentence.
    structured_representations : ``List[List[List[JsonDict]]]``
        A list of Json representations of all the worlds. See expected format in this class'
        docstring.
    labels : ``List[str]`` (optional)
        List of string representations of the labels (true or false) corresponding to the
        ``structured_representations``. Not required while testing.
    target_sequences : ``List[List[str]]`` (optional)
        List of target action sequences for each element which lead to the correct denotation
        in worlds corresponding to the structured representations.
    identifier : ``str`` (optional)
        The identifier from the dataset if available.
    """
    worlds = []
    for structured_representation in structured_representations:
        boxes = {Box(object_list, box_id)
                 for box_id, object_list in enumerate(structured_representation)}
        worlds.append(NlvrLanguage(boxes))
    tokenized_sentence = self._tokenizer.tokenize(sentence)
    sentence_field = TextField(tokenized_sentence, self._sentence_token_indexers)
    production_rule_fields: List[Field] = []
    instance_action_ids: Dict[str, int] = {}
    # TODO(pradeep): Assuming that possible actions are the same in all worlds. This may
    # change later.
    for production_rule in worlds[0].all_possible_productions():
        instance_action_ids[production_rule] = len(instance_action_ids)
        field = ProductionRuleField(production_rule, is_global_rule=True)
        production_rule_fields.append(field)
    action_field = ListField(production_rule_fields)
    worlds_field = ListField([MetadataField(world) for world in worlds])
    metadata: Dict[str, Any] = {"sentence_tokens": [x.text for x in tokenized_sentence]}
    fields: Dict[str, Field] = {
        "sentence": sentence_field,
        "worlds": worlds_field,
        "actions": action_field,
        "metadata": MetadataField(metadata),
    }
    if identifier is not None:
        fields["identifier"] = MetadataField(identifier)
    # Depending on the type of supervision used for training the parser, we may want either
    # target action sequences or an agenda in our instance. We check if target sequences are
    # provided, and include them if they are. If not, we'll get an agenda for the sentence,
    # and include that in the instance.
    if target_sequences:
        action_sequence_fields: List[Field] = []
        for target_sequence in target_sequences:
            index_fields = ListField([
                IndexField(instance_action_ids[action], action_field)
                for action in target_sequence
            ])
            action_sequence_fields.append(index_fields)
        # TODO(pradeep): Define a max length for this field.
        fields["target_action_sequences"] = ListField(action_sequence_fields)
    elif self._output_agendas:
        # TODO(pradeep): Assuming every world gives the same agenda for a sentence. This is
        # true now, but may change later too.
        agenda = worlds[0].get_agenda_for_sentence(sentence)
        assert agenda, "No agenda found for sentence: %s" % sentence
        # agenda_field contains indices into actions.
        agenda_field = ListField([
            IndexField(instance_action_ids[action], action_field)
            for action in agenda
        ])
        fields["agenda"] = agenda_field
    if labels:
        labels_field = ListField([
            LabelField(label, label_namespace="denotations")
            for label in labels
        ])
        fields["labels"] = labels_field
    return Instance(fields)
def make_marginal_drop_instance(
        question_tokens: List[Token],
        passage_tokens: List[Token],
        number_tokens: List[Token],
        number_indices: List[int],
        token_indexers: Dict[str, TokenIndexer],
        passage_text: str,
        answer_info: Dict[str, Any] = None,
        additional_metadata: Dict[str, Any] = None,
) -> Instance:
    additional_metadata = additional_metadata or {}
    fields: Dict[str, Field] = {}
    passage_offsets = [(token.idx, token.idx + len(token.text)) for token in passage_tokens]
    question_offsets = [(token.idx, token.idx + len(token.text)) for token in question_tokens]

    # This is separate so we can reference it later with a known type.
    passage_field = TextField(passage_tokens, token_indexers)
    question_field = TextField(question_tokens, token_indexers)
    fields["passage"] = passage_field
    fields["question"] = question_field
    number_index_fields: List[Field] = [IndexField(index, passage_field)
                                        for index in number_indices]
    fields["number_indices"] = ListField(number_index_fields)
    # This field is actually not required in the model; it is only used to create the
    # `answer_as_plus_minus_combinations` field, which is a `SequenceLabelField`.  We cannot
    # use the `number_indices` field for creating that, because the `ListField` will not be
    # empty when we want to create a new empty field, which would lead to an error.
    numbers_in_passage_field = TextField(number_tokens, token_indexers)
    metadata = {
        "original_passage": passage_text,
        "passage_token_offsets": passage_offsets,
        "question_token_offsets": question_offsets,
        "question_tokens": [token.text for token in question_tokens],
        "passage_tokens": [token.text for token in passage_tokens],
        "number_tokens": [token.text for token in number_tokens],
        "number_indices": number_indices,
    }
    if answer_info:
        metadata["answer_texts"] = answer_info["answer_texts"]

        passage_span_fields: List[Field] = [SpanField(span[0], span[1], passage_field)
                                            for span in answer_info["answer_passage_spans"]]
        if not passage_span_fields:
            passage_span_fields.append(SpanField(-1, -1, passage_field))
        fields["answer_as_passage_spans"] = ListField(passage_span_fields)

        question_span_fields: List[Field] = [SpanField(span[0], span[1], question_field)
                                             for span in answer_info["answer_question_spans"]]
        if not question_span_fields:
            question_span_fields.append(SpanField(-1, -1, question_field))
        fields["answer_as_question_spans"] = ListField(question_span_fields)

        add_sub_signs_field: List[Field] = []
        for signs_for_one_add_sub_expression in answer_info["signs_for_add_sub_expressions"]:
            add_sub_signs_field.append(
                SequenceLabelField(signs_for_one_add_sub_expression, numbers_in_passage_field))
        if not add_sub_signs_field:
            add_sub_signs_field.append(
                SequenceLabelField([0] * len(number_tokens), numbers_in_passage_field))
        fields["answer_as_add_sub_expressions"] = ListField(add_sub_signs_field)

        count_fields: List[Field] = [LabelField(count_label, skip_indexing=True)
                                     for count_label in answer_info["counts"]]
        if not count_fields:
            count_fields.append(LabelField(-1, skip_indexing=True))
        fields["answer_as_counts"] = ListField(count_fields)

    metadata.update(additional_metadata)
    fields["metadata"] = MetadataField(metadata)
    return Instance(fields)
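# Sketch of the "dummy entry" convention above (illustrative).  When an answer type
# has no annotation, a -1 placeholder keeps the ListField non-empty so it can still
# be batched; the -1 values are masked out downstream.
from allennlp.data.fields import LabelField, ListField, SpanField, TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import Token

demo_passage_field = TextField([Token("7"), Token("cats")],
                               {"tokens": SingleIdTokenIndexer()})
demo_answer_as_passage_spans = ListField([SpanField(-1, -1, demo_passage_field)])
demo_answer_as_counts = ListField([LabelField(-1, skip_indexing=True)])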
def text_to_instance(self,  # pylint: disable=arguments-differ
                     premises: List[str],
                     hypotheses: List[str],
                     answer_index: int = None,
                     relevant_sentence_idxs: List[int] = None) -> Instance:
    fields = {}
    premises_tokens = [self._tokenizer.tokenize(premise)[-self._premise_max_tokens:]
                       for premise in premises]
    hypotheses_tokens = [self._tokenizer.tokenize(hypothesis)[-self._hypothesis_max_tokens:]
                         for hypothesis in hypotheses]
    if premises:
        premises_text_fields = [TextField(premise_tokens, self._token_indexers)
                                for premise_tokens in premises_tokens]
        premises_field = ListField(premises_text_fields)
    else:
        empty_stub = ListField([TextField([Token('dummy')], self._token_indexers)])
        premises_field = empty_stub.empty_field()
    fields['premises'] = premises_field

    hypotheses_text_fields = [TextField(hypothesis_tokens, self._token_indexers)
                              for hypothesis_tokens in hypotheses_tokens]
    hypotheses_field = ListField(hypotheses_text_fields)
    fields['hypotheses'] = hypotheses_field

    # If sentence relevance is available
    if relevant_sentence_idxs is not None:
        relevance_presence_mask = np.zeros(len(premises))
        for idx in relevant_sentence_idxs:
            relevance_presence_mask[idx] = 1
        fields['relevance_presence_mask'] = ArrayField(np.array(relevance_presence_mask))

    # If entailment labels are available
    if answer_index is not None:
        # if answer_index not in range(0, len(hypotheses)):
        #     raise ConfigurationError("Provided label must be in 0 to {}".format(len(hypotheses)))
        fields['answer_index'] = ArrayField(np.array(answer_index),
                                            padding_value=-1,
                                            dtype=np.int64)  # np.long is deprecated in recent NumPy

    paragraph_tokens = [token for premise_tokens in premises_tokens
                        for token in premise_tokens]
    if not paragraph_tokens:
        return None
    paragraph_text_field = TextField(paragraph_tokens, self._token_indexers)
    fields['paragraph'] = paragraph_text_field
    return Instance(fields)
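# Sketch of the relevance mask construction above (illustrative toy data): a binary
# vector over the premises with ones at the annotated relevant sentence indices.
import numpy as np

demo_premises = ["P0", "P1", "P2", "P3"]
demo_relevant_idxs = [1, 3]
demo_mask = np.zeros(len(demo_premises))
for idx in demo_relevant_idxs:
    demo_mask[idx] = 1
print(demo_mask)  # [0. 1. 0. 1.]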