def text_to_instance( self, # type: ignore sentence: str, structured_representations: List[List[List[JsonDict]]], labels: List[str] = None, target_sequences: List[List[str]] = None, identifier: str = None) -> Instance: """ Parameters ---------- sentence : ``str`` The query sentence. structured_representations : ``List[List[List[JsonDict]]]`` A list of Json representations of all the worlds. See expected format in this class' docstring. labels : ``List[str]`` (optional) List of string representations of the labels (true or false) corresponding to the ``structured_representations``. Not required while testing. target_sequences : ``List[List[str]]`` (optional) List of target action sequences for each element which lead to the correct denotation in worlds corresponding to the structured representations. identifier : ``str`` (optional) The identifier from the dataset if available. """ # pylint: disable=arguments-differ worlds = [NlvrWorld(data) for data in structured_representations] tokenized_sentence = self._tokenizer.tokenize(sentence) sentence_field = TextField(tokenized_sentence, self._sentence_token_indexers) production_rule_fields: List[Field] = [] instance_action_ids: Dict[str, int] = {} # TODO(pradeep): Assuming that possible actions are the same in all worlds. This may change # later. for production_rule in worlds[0].all_possible_actions(): instance_action_ids[production_rule] = len(instance_action_ids) field = ProductionRuleField(production_rule, is_global_rule=True) production_rule_fields.append(field) action_field = ListField(production_rule_fields) worlds_field = ListField([MetadataField(world) for world in worlds]) fields: Dict[str, Field] = { "sentence": sentence_field, "worlds": worlds_field, "actions": action_field } if identifier is not None: fields["identifier"] = MetadataField(identifier) # Depending on the type of supervision used for training the parser, we may want either # target action sequences or an agenda in our instance. We check if target sequences are # provided, and include them if they are. If not, we'll get an agenda for the sentence, and # include that in the instance. if target_sequences: action_sequence_fields: List[Field] = [] for target_sequence in target_sequences: index_fields = ListField([ IndexField(instance_action_ids[action], action_field) for action in target_sequence ]) action_sequence_fields.append(index_fields) # TODO(pradeep): Define a max length for this field. fields["target_action_sequences"] = ListField( action_sequence_fields) elif self._output_agendas: # TODO(pradeep): Assuming every world gives the same agenda for a sentence. This is true # now, but may change later too. agenda = worlds[0].get_agenda_for_sentence( sentence, add_paths_to_agenda=False) assert agenda, "No agenda found for sentence: %s" % sentence # agenda_field contains indices into actions. agenda_field = ListField([ IndexField(instance_action_ids[action], action_field) for action in agenda ]) fields["agenda"] = agenda_field if labels: labels_field = ListField([ LabelField(label, label_namespace='denotations') for label in labels ]) fields["labels"] = labels_field return Instance(fields)
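# Note on the supervision fields built above (illustrative, not from the source): each
# action string in a target sequence is looked up in instance_action_ids (rule string ->
# position in the ``actions`` ListField) and wrapped in an IndexField, so
# "target_action_sequences" is a ListField of ListFields of indices. With agenda-based
# supervision, "agenda" is instead a flat list of such indices for the sentence's agenda.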
def text_to_instance(self, tokens: List[Token]) -> Instance:  # type: ignore
    """
    We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
    """
    # pylint: disable=arguments-differ
    return Instance({'tokens': TextField(tokens, token_indexers=self._token_indexers)})
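# A minimal usage sketch (illustrative only; the reader class name and indexer below are
# assumptions, not taken from the source). With pre-tokenized input, a call would look
# roughly like:
#
#     from allennlp.data.tokenizers import Token
#     from allennlp.data.token_indexers import SingleIdTokenIndexer
#     reader = MyReader(token_indexers={"tokens": SingleIdTokenIndexer()})  # hypothetical reader
#     instance = reader.text_to_instance([Token(w) for w in "the cat sat".split()])
#     # instance.fields["tokens"] is a TextField over the three tokens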
def text_to_instance( self, # type: ignore utterances, sql_query=None): # pylint: disable=arguments-differ u""" Parameters ---------- utterances: ``List[str]``, required. List of utterances in the interaction, the last element is the current utterance. sql_query: ``str``, optional The SQL query, given as label during training or validation. """ utterance = utterances[-1] action_sequence = [] if not utterance: return None world = AtisWorld(utterances) if sql_query: try: action_sequence = world.get_action_sequence(sql_query) except ParseError: logger.debug('Parsing error') tokenized_utterance = self._tokenizer.tokenize(utterance.lower()) utterance_field = TextField(tokenized_utterance, self._token_indexers) production_rule_fields = [] for production_rule in world.all_possible_actions(): lhs, _ = production_rule.split(u' ->') is_global_rule = not lhs in [u'number', u'string'] # The whitespaces are not semantically meaningful, so we filter them out. production_rule = u' '.join([ token for token in production_rule.split(u' ') if token != u'ws' ]) field = ProductionRuleField(production_rule, is_global_rule) production_rule_fields.append(field) action_field = ListField(production_rule_fields) action_map = dict((action.rule, i) # type: ignore for i, action in enumerate(action_field.field_list)) index_fields = [] world_field = MetadataField(world) fields = { u'utterance': utterance_field, u'actions': action_field, u'world': world_field, u'linking_scores': ArrayField(world.linking_scores) } if sql_query: if action_sequence: for production_rule in action_sequence: index_fields.append( IndexField(action_map[production_rule], action_field)) action_sequence_field = [] action_sequence_field.append(ListField(index_fields)) fields[u'target_action_sequence'] = ListField( action_sequence_field) else: # If we are given a SQL query, but we are unable to parse it, then we will skip it. return None return Instance(fields)
def __getitem_detector__(self, index):
    item = self.items[index]
    sample = {}
    if self.expanded and index >= self.train_size:
        image_file_name = "COCO_val2014_{:0>12d}.jpg".format(item['image_id'])
    else:
        image_file_name = "COCO_{}2014_{:0>12d}.jpg".format(self.split_name, item['image_id'])

    image_info = self.masks[image_file_name]
    if "train" in image_file_name:
        image_file_path = os.path.join(self.data_root, "train2014", image_file_name)
    elif "val" in image_file_name:
        image_file_path = os.path.join(self.data_root, "val2014", image_file_name)

    ###################################################################
    # Most of this is adapted from VCR.
    # Load the image now and rescale it. Might have to subtract the mean and whatnot here too.
    image = load_image(image_file_path)
    image, window, img_scale, padding = resize_image(image, random_pad=self.is_train)
    image = to_tensor_and_normalize(image)
    c, h, w = image.shape
    ###################################################################
    metadata = self.masks[image_file_name]  # Get the metadata

    # Load boxes. We will use all detections.
    dets2use = np.arange(len(metadata['boxes']))
    # [nobj, 14, 14]
    segms = np.stack([make_mask(mask_size=14, box=metadata['boxes'][i], polygons_list=metadata['segms'][i])
                      for i in dets2use])

    # Chop off the final dimension, that's the confidence.
    boxes = np.array(metadata['boxes'])[dets2use, :-1]
    # Possibly rescale them if necessary.
    boxes *= img_scale
    boxes[:, :2] += np.array(padding[:2])[None]
    boxes[:, 2:] += np.array(padding[:2])[None]

    try:
        metadata['names'] = [i.split(" ")[1][1:-1] for i in metadata["names"]]
    except Exception:
        # Names may already be in the expected format; if parsing fails, keep them as-is.
        pass
    obj_labels = [self.coco_obj_to_ind[metadata['names'][i]] for i in dets2use.tolist()]

    boxes = np.row_stack((window, boxes))
    segms = np.concatenate((np.ones((1, 14, 14), dtype=np.float32), segms), 0)
    obj_labels = [self.coco_obj_to_ind['__background__']] + obj_labels

    sample['segms'] = ArrayField(segms, padding_value=0)
    sample['objects'] = ListField([LabelField(x, skip_indexing=True) for x in obj_labels])

    assert np.all((boxes[:, 0] >= 0.) & (boxes[:, 0] < boxes[:, 2]))
    assert np.all((boxes[:, 1] >= 0.) & (boxes[:, 1] < boxes[:, 3]))
    assert np.all((boxes[:, 2] <= w))
    assert np.all((boxes[:, 3] <= h))
    sample['boxes'] = ArrayField(boxes, padding_value=-1)

    caption_a = item["caption"]
    imageID = item["image_id"]

    # This field is unused by the model; it is only there so the batch size can be inferred.
    sample["label"] = sample['objects']

    if self.expanded and index >= self.train_size:
        coco = self.coco_val
    else:
        coco = self.coco
    rest_anns = coco.loadAnns([i for i in coco.getAnnIds(imgIds=imageID) if i != item['id']])

    if self.args.get("two_sentence", True):
        if random.random() > 0.5:
            item_b = self.items[random.randint(0, len(self.items) - 1)]
            while item_b["image_id"] == imageID:
                item_b = self.items[random.randint(0, len(self.items) - 1)]
            flag = False
        else:
            item_b = rest_anns[random.randint(0, len(rest_anns) - 1)]
            flag = True  # is next sentence

        caption_b = item_b["caption"]
        subword_tokens_a = self.tokenizer.tokenize(caption_a)
        subword_tokens_b = self.tokenizer.tokenize(caption_b)
        bert_example = InputExample(unique_id=index, text_a=subword_tokens_a, text_b=subword_tokens_b,
                                    is_correct=flag, max_seq_length=self.max_seq_length)
    elif not self.args.get("no_next_sentence", False):
        if random.random() < self.args.false_caption_ratio:
            item_b = self.items[random.randint(0, len(self.items) - 1)]
            while item_b["image_id"] == imageID:
                item_b = self.items[random.randint(0, len(self.items) - 1)]
            flag = False
        else:
            item_b = item
            flag = True  # is next sentence

        caption_b = item_b["caption"]
        subword_tokens_b = self.tokenizer.tokenize(caption_b)
        bert_example = InputExample(unique_id=index, text_a=subword_tokens_b, text_b=None,
                                    is_correct=flag, max_seq_length=self.max_seq_length)
    else:
        subword_tokens_a = self.tokenizer.tokenize(caption_a)
        bert_example = InputExample(unique_id=index, text_a=subword_tokens_a, text_b=None,
                                    is_correct=None, max_seq_length=self.max_seq_length)

    bert_feature = InputFeatures.convert_one_example_to_features_pretraining(
        example=bert_example,
        tokenizer=self.tokenizer,
        probability=self.masked_lm_prob)
    bert_feature.insert_field_into_dict(sample)

    return image, Instance(sample)
def text_to_instance( self, task_name: str, domain_name: str, source_string: str, target_string: str = None) -> Instance: # type: ignore task_field = LabelField(task_name, label_namespace="task_labels") domain_field = LabelField(domain_name, label_namespace="domain_labels") # pylint: disable=arguments-differ tokenized_source = self._source_tokenizer.tokenize(source_string) if self._source_add_start_token: tokenized_source.insert(0, Token(START_SYMBOL)) tokenized_source.append(Token(END_SYMBOL)) source_field = TextField(tokenized_source, self._source_token_indexers) inst = Instance({ 'source_tokens': source_field, "task_token": task_field, "domain_token": domain_field, 'upos_tokens': TextField( [Token(START_SYMBOL), Token(END_SYMBOL)], self._upos_token_indexers), 'ner_tokens': TextField( [Token(START_SYMBOL), Token(END_SYMBOL)], self._ner_token_indexers), 'chunk_tokens': TextField( [Token(START_SYMBOL), Token(END_SYMBOL)], self._chunk_token_indexers) }) if target_string is not None: tokenized_target = self._target_tokenizer.tokenize(target_string) tokenized_target.insert(0, Token(START_SYMBOL)) tokenized_target.append(Token(END_SYMBOL)) target_field = TextField(tokenized_target, self._task_to_indexers[task_name]) if task_name == 'upos': inst = Instance({ 'source_tokens': source_field, "task_token": task_field, "domain_token": domain_field, 'upos_tokens': target_field, 'ner_tokens': TextField([Token(START_SYMBOL), Token(END_SYMBOL)], self._ner_token_indexers), 'chunk_tokens': TextField([Token(START_SYMBOL), Token(END_SYMBOL)], self._chunk_token_indexers) }) if task_name == 'ner': inst = Instance({ 'source_tokens': source_field, "task_token": task_field, "domain_token": domain_field, 'upos_tokens': TextField([Token(START_SYMBOL), Token(END_SYMBOL)], self._upos_token_indexers), 'ner_tokens': target_field, 'chunk_tokens': TextField([Token(START_SYMBOL), Token(END_SYMBOL)], self._chunk_token_indexers) }) if task_name == 'chunk': inst = Instance({ 'source_tokens': source_field, "task_token": task_field, "domain_token": domain_field, 'upos_tokens': TextField([Token(START_SYMBOL), Token(END_SYMBOL)], self._upos_token_indexers), 'ner_tokens': TextField([Token(START_SYMBOL), Token(END_SYMBOL)], self._ner_token_indexers), 'chunk_tokens': target_field }) return inst
def formatted_text_to_instance(self, # type: ignore item_id: Any, question_text: str, documents_text_list: List[str], flattened_p1_list: List[int], flattened_p1_list_e1wh: List[int], flattened_p2_list_e1: List[int], flattened_p2_list: List[int], flattened_he_locs_list: List[Tuple[int, int]], flattened_e1wh_locs_list: List[Tuple[int, int]], flattened_e1_locs_list: List[Tuple[int, int]], flattened_ca_locs_list: List[Tuple[int, int]], he_tracks: List[List[int]], e1wh_tracks: List[List[int]], e1_tracks: List[List[int]], ca_tracks: List[List[int]], max_paths: int, max_he_locs: int, max_e1wh_locs: int, max_e1_locs: int, max_ca_locs: int, choice_text_list: List[str], all_choice_locs: List[List[Tuple[int, int]]], all_choice_docidxs: List[List[int]], answer_id: int) -> Instance: # pylint: disable=arguments-differ fields: Dict[str, Field] = {} question_tokens = self._tokenizer.tokenize(question_text) documents_list_tokens = [self._tokenizer.tokenize(dt) for dt in documents_text_list] if len(sum(documents_list_tokens, [])) == 0: documents_list_tokens = [question_tokens] choices_list_tokens = [self._tokenizer.tokenize(x) for x in choice_text_list] fields['question'] = TextField(question_tokens, self._token_indexers) document_text_fields = [TextField(x, self._token_indexers) for x in documents_list_tokens] document_field = ListField(document_text_fields) fields['documents'] = document_field fields['candidates'] = ListField([TextField(x, self._token_indexers) for x in choices_list_tokens]) fields['flattened_p1list'] = ListField([IndexField(x, document_field) for x in flattened_p1_list]) fields['flattened_p1list_e1wh'] = ListField([IndexField(x, document_field) for x in flattened_p1_list_e1wh]) fields['flattened_p2list_e1'] = ListField([IndexField(x, document_field) for x in flattened_p2_list_e1]) fields['flattened_p2list'] = ListField([IndexField(x, document_field) for x in flattened_p2_list]) fields['flat_he_spans'] = ListField([SpanField(x[0], x[1], document_text_fields[flattened_p1_list[xidx]]) for xidx, x in enumerate(flattened_he_locs_list)]) fields['flat_e1wh_spans'] = ListField([SpanField(x[0], x[1], document_text_fields[flattened_p1_list_e1wh[xidx]]) for xidx, x in enumerate(flattened_e1wh_locs_list)]) fields['flat_e1_spans'] = ListField([SpanField(x[0], x[1], document_text_fields[flattened_p2_list_e1[xidx]]) for xidx, x in enumerate(flattened_e1_locs_list)]) fields['flat_choice_spans'] = ListField([SpanField(x[0], x[1], document_text_fields[flattened_p2_list[xidx]]) for xidx, x in enumerate(flattened_ca_locs_list)]) # all choice fields all_choice_docidx_field = [] all_choice_span_fileds = [] for choice_docidxs, choice_spans in zip(all_choice_docidxs, all_choice_locs): all_choice_docidx_field.append(ListField([IndexField(x, document_field) for x in choice_docidxs])) all_choice_span_fileds.append(ListField([SpanField(x[0], x[1], document_text_fields[choice_docidxs[xidx]]) for xidx, x in enumerate(choice_spans)])) fields['all_choice_docidxs'] = ListField(all_choice_docidx_field) fields['all_choice_locs'] = ListField(all_choice_span_fileds) if answer_id is not None: fields['label'] = LabelField(answer_id, skip_indexing=True) metadata = { "id": item_id, "question_text": question_text, "documents_text": documents_text_list, "choice_text_list": choice_text_list, "he_tracks": he_tracks, "e1wh_tracks": e1wh_tracks, "e1_tracks": e1_tracks, "choice_tracks": ca_tracks, "max_num_paths": max_paths, "max_num_he_locs": max_he_locs, "max_num_e1wh_locs": max_e1wh_locs, "max_num_e1_locs": max_e1_locs, 
"max_num_ca_locs": max_ca_locs, } fields["metadata"] = MetadataField(metadata) return Instance(fields)
def text_to_instance(self,  # type: ignore
                     tokens: List[str],
                     pos_tags: List[str] = None,
                     gold_tree: Tree = None) -> Instance:
    """
    We take `pre-tokenized` input here, because we don't have a tokenizer in this class.

    Parameters
    ----------
    tokens : ``List[str]``, required.
        The tokens in a given sentence.
    pos_tags : ``List[str]``, optional, (default = None).
        The POS tags for the words in the sentence.
    gold_tree : ``Tree``, optional (default = None).
        The gold parse tree to create span labels from.

    Returns
    -------
    An ``Instance`` containing the following fields:
        tokens : ``TextField``
            The tokens in the sentence.
        pos_tags : ``SequenceLabelField``
            The POS tags of the words in the sentence.
            Only returned if ``use_pos_tags`` is ``True``.
        spans : ``ListField[SpanField]``
            A ListField containing all possible subspans of the sentence.
        span_labels : ``SequenceLabelField``, optional.
            The constituency tags for each of the possible spans, with respect
            to a gold parse tree. If a span is not contained within the tree,
            it will have a ``NO-LABEL`` label.
        gold_tree : ``MetadataField(Tree)``
            The gold NLTK parse tree for use in evaluation.
    """
    # pylint: disable=arguments-differ
    text_field = TextField([Token(x) for x in tokens], token_indexers=self._token_indexers)
    fields: Dict[str, Field] = {"tokens": text_field}

    if self._use_pos_tags and pos_tags is not None:
        pos_tag_field = SequenceLabelField(pos_tags, text_field, "pos_tags")
        fields["pos_tags"] = pos_tag_field
    elif self._use_pos_tags:
        raise ConfigurationError("use_pos_tags was set to True but no gold pos"
                                 " tags were passed to the dataset reader.")

    spans: List[Field] = []
    gold_labels = []

    if gold_tree is not None:
        gold_spans_with_pos_tags: Dict[Tuple[int, int], str] = {}
        self._get_gold_spans(gold_tree, 0, gold_spans_with_pos_tags)
        gold_spans = {span: label for (span, label) in gold_spans_with_pos_tags.items()
                      if "-POS" not in label}
    else:
        gold_spans = None

    for start, end in enumerate_spans(tokens):
        spans.append(SpanField(start, end, text_field))
        if gold_spans is not None:
            if (start, end) in gold_spans:
                gold_labels.append(gold_spans[(start, end)])
            else:
                gold_labels.append("NO-LABEL")

    metadata = {"tokens": tokens}
    if gold_tree:
        metadata["gold_tree"] = gold_tree
    fields["metadata"] = MetadataField(metadata)

    span_list_field: ListField = ListField(spans)
    fields["spans"] = span_list_field
    if gold_tree is not None:
        fields["span_labels"] = SequenceLabelField(gold_labels, span_list_field)
    return Instance(fields)
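# For reference: enumerate_spans(tokens) above yields every (start, end) token span with
# inclusive indices, so the number of SpanFields grows quadratically with sentence length.
# A rough sketch of the behaviour for a 3-token sentence (illustrative only, not the
# actual library code):
#
#     [(i, j) for i in range(3) for j in range(i, 3)]
#     # -> [(0, 0), (0, 1), (0, 2), (1, 1), (1, 2), (2, 2)]
#
# Each of these spans gets a "NO-LABEL" tag unless it appears in the gold parse tree.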
def text_to_instance(self, rule_text, question, scenario, history,
                     answer=None, evidence=None) -> Instance:  # type: ignore
    """
    Turn a rule text, question, scenario and follow-up history into an ``Instance``.

    Parameters
    ----------
    rule_text : ``str``, required
    question : ``str``, required
    scenario : ``str``, required
    history : ``List[Dict[str, str]]``, required
        Follow-up question/answer pairs; each entry contains a ``follow_up_question``.
    answer : ``str``, optional (default = None)
        The gold answer, used as the target string during training and validation.
    evidence : optional (default = None)
        Not used by this method.

    Returns
    -------
    Instance
    """
    # pylint: disable=arguments-differ
    if answer and answer in ['Yes', 'No', 'Irrelevant']:
        return None

    target_string = answer

    if self.train_using_gold and answer is not None:
        # i.e. during training and validation
        predicted_label = answer if answer in ['Yes', 'No', 'Irrelevant'] else 'More'
        predicted_span_ixs = self.dataset_reader.find_lcs(rule_text, answer, self._source_tokenizer.tokenize)
        if predicted_span_ixs is None:
            return None
        else:
            rule_offsets = [(token.idx, token.idx + len(token.text))
                            for token in self._source_tokenizer.tokenize(rule_text)]
            predicted_span = rule_text[rule_offsets[predicted_span_ixs[0]][0]:
                                       rule_offsets[predicted_span_ixs[1]][1]]
    else:
        predicted_span, predicted_label = self.get_prediction(rule_text, question, scenario, history)

    if self.add_rule:
        if self.embed_span:
            source_string = self.get_embedded_span(rule_text, predicted_span)
        else:
            source_string = rule_text + ' @pss@ ' + predicted_span + ' @pse@'
    else:
        source_string = predicted_span

    if self.add_question:
        source_string += ' @qs@ ' + question + ' @qe'

    if self.add_followup_ques:
        for follow_up_qna in history:
            source_string += ' @fs@ ' + follow_up_qna['follow_up_question'] + ' @fe'

    tokenized_source = self._source_tokenizer.tokenize(source_string)
    tokenized_source.insert(0, Token(START_SYMBOL))
    tokenized_source.append(Token(END_SYMBOL))
    source_field = TextField(tokenized_source, self._source_token_indexers)

    # For each token in the source sentence, we keep track of the matching token
    # in the target sentence (which will be the OOV symbol if there is no match).
    source_to_target_field = NamespaceSwappingField(tokenized_source[1:-1], self._target_namespace)
    meta_fields = {"source_tokens": [x.text for x in tokenized_source[1:-1]]}
    fields_dict = {
        "source_tokens": source_field,
        "source_to_target": source_to_target_field,
    }

    if target_string is not None:
        tokenized_target = self._target_tokenizer.tokenize(target_string)
        tokenized_target.insert(0, Token(START_SYMBOL))
        tokenized_target.append(Token(END_SYMBOL))
        target_field = TextField(tokenized_target, self._target_token_indexers)
        fields_dict["target_tokens"] = target_field
        meta_fields["target_tokens"] = [y.text for y in tokenized_target[1:-1]]

        source_and_target_token_ids = self._tokens_to_ids(tokenized_source[1:-1] + tokenized_target)
        source_token_ids = source_and_target_token_ids[:len(tokenized_source) - 2]
        fields_dict["source_token_ids"] = ArrayField(np.array(source_token_ids))
        target_token_ids = source_and_target_token_ids[len(tokenized_source) - 2:]
        fields_dict["target_token_ids"] = ArrayField(np.array(target_token_ids))
    else:
        source_token_ids = self._tokens_to_ids(tokenized_source[1:-1])
        fields_dict["source_token_ids"] = ArrayField(np.array(source_token_ids))

    meta_fields['label'] = predicted_label
    fields_dict["metadata"] = MetadataField(meta_fields)

    return Instance(fields_dict)
def text_to_instance( self, # type: ignore question: str, table_lines: List[str], example_lisp_string: str = None, dpd_output: List[str] = None, tokenized_question: List[Token] = None) -> Instance: """ Reads text inputs and makes an instance. WikitableQuestions dataset provides tables as TSV files, which we use for training. Parameters ---------- question : ``str`` Input question table_lines : ``List[str]`` The table content itself, as a list of rows. See ``TableQuestionKnowledgeGraph.read_from_lines`` for the expected format. example_lisp_string : ``str``, optional The original (lisp-formatted) example string in the WikiTableQuestions dataset. This comes directly from the ``.examples`` file provided with the dataset. We pass this to SEMPRE for evaluating logical forms during training. It isn't otherwise used for anything. dpd_output : List[str], optional List of logical forms, produced by dynamic programming on denotations. Not required during test. tokenized_question : ``List[Token]``, optional If you have already tokenized the question, you can pass that in here, so we don't duplicate that work. You might, for example, do batch processing on the questions in the whole dataset, then pass the result in here. """ # pylint: disable=arguments-differ tokenized_question = tokenized_question or self._tokenizer.tokenize( question.lower()) question_field = TextField(tokenized_question, self._question_token_indexers) metadata: Dict[str, Any] = { "question_tokens": [x.text for x in tokenized_question] } metadata["original_table"] = "\n".join(table_lines) table_knowledge_graph = TableQuestionKnowledgeGraph.read_from_lines( table_lines, tokenized_question) table_metadata = MetadataField(table_lines) table_field = KnowledgeGraphField( table_knowledge_graph, tokenized_question, self._table_token_indexers, tokenizer=self._tokenizer, feature_extractors=self._linking_feature_extractors, include_in_vocab=self._use_table_for_vocab, max_table_tokens=self._max_table_tokens) world = WikiTablesWorld(table_knowledge_graph) world_field = MetadataField(world) production_rule_fields: List[Field] = [] for production_rule in world.all_possible_actions(): _, rule_right_side = production_rule.split(' -> ') is_global_rule = not world.is_table_entity(rule_right_side) field = ProductionRuleField(production_rule, is_global_rule) production_rule_fields.append(field) action_field = ListField(production_rule_fields) fields = { 'question': question_field, 'metadata': MetadataField(metadata), 'table': table_field, 'world': world_field, 'actions': action_field } if self._include_table_metadata: fields['table_metadata'] = table_metadata if example_lisp_string: fields['example_lisp_string'] = MetadataField(example_lisp_string) # We'll make each target action sequence a List[IndexField], where the index is into # the action list we made above. We need to ignore the type here because mypy doesn't # like `action.rule` - it's hard to tell mypy that the ListField is made up of # ProductionRuleFields. 
action_map = { action.rule: i for i, action in enumerate(action_field.field_list) } # type: ignore if dpd_output: action_sequence_fields: List[Field] = [] for logical_form in dpd_output: if not self._should_keep_logical_form(logical_form): logger.debug(f'Question was: {question}') logger.debug(f'Table info was: {table_lines}') continue try: expression = world.parse_logical_form(logical_form) except ParsingError as error: logger.debug( f'Parsing error: {error.message}, skipping logical form' ) logger.debug(f'Question was: {question}') logger.debug(f'Logical form was: {logical_form}') logger.debug(f'Table info was: {table_lines}') continue except: logger.error(logical_form) raise action_sequence = world.get_action_sequence(expression) try: index_fields: List[Field] = [] for production_rule in action_sequence: index_fields.append( IndexField(action_map[production_rule], action_field)) action_sequence_fields.append(ListField(index_fields)) except KeyError as error: logger.debug( f'Missing production rule: {error.args}, skipping logical form' ) logger.debug(f'Question was: {question}') logger.debug(f'Table info was: {table_lines}') logger.debug(f'Logical form was: {logical_form}') continue if len(action_sequence_fields) >= self._max_dpd_logical_forms: break if not action_sequence_fields: # This is not great, but we're only doing it when we're passed logical form # supervision, so we're expecting labeled logical forms, but we can't actually # produce the logical forms. We should skip this instance. Note that this affects # _dev_ and _test_ instances, too, so your metrics could be over-estimates on the # full test data. return None fields['target_action_sequences'] = ListField( action_sequence_fields) if self._output_agendas: agenda_index_fields: List[Field] = [] for agenda_string in world.get_agenda(): agenda_index_fields.append( IndexField(action_map[agenda_string], action_field)) if not agenda_index_fields: agenda_index_fields = [IndexField(-1, action_field)] fields['agenda'] = ListField(agenda_index_fields) return Instance(fields)
def text_to_instance(self,  # type: ignore
                     utterances: List[str],
                     sql_query_labels: List[str] = None) -> Instance:
    # pylint: disable=arguments-differ
    """
    Parameters
    ----------
    utterances : ``List[str]``, required.
        List of utterances in the interaction; the last element is the current utterance.
    sql_query_labels : ``List[str]``, optional
        The SQL queries that are given as labels during training or validation.
    """
    if self._num_turns_to_concatenate:
        utterances[-1] = f' {END_OF_UTTERANCE_TOKEN} '.join(utterances[-self._num_turns_to_concatenate:])

    utterance = utterances[-1]
    action_sequence: List[str] = []

    if not utterance:
        return None
    world = AtisWorld(utterances=utterances)

    if sql_query_labels:
        # If there are multiple SQL queries given as labels, we use the shortest
        # one for training.
        sql_query = min(sql_query_labels, key=len)
        try:
            action_sequence = world.get_action_sequence(sql_query)
        except ParseError:
            action_sequence = []
            logger.debug('Parsing error')

    tokenized_utterance = self._tokenizer.tokenize(utterance.lower())
    utterance_field = TextField(tokenized_utterance, self._token_indexers)

    production_rule_fields: List[Field] = []
    for production_rule in world.all_possible_actions():
        nonterminal, _ = production_rule.split(' ->')
        # The whitespaces are not semantically meaningful, so we filter them out.
        production_rule = ' '.join([token for token in production_rule.split(' ') if token != 'ws'])
        field = ProductionRuleField(production_rule, self._is_global_rule(nonterminal))
        production_rule_fields.append(field)

    action_field = ListField(production_rule_fields)
    action_map = {action.rule: i  # type: ignore
                  for i, action in enumerate(action_field.field_list)}
    index_fields: List[Field] = []
    world_field = MetadataField(world)
    fields = {'utterance': utterance_field,
              'actions': action_field,
              'world': world_field,
              'linking_scores': ArrayField(world.linking_scores)}

    if sql_query_labels is not None:
        fields['sql_queries'] = MetadataField(sql_query_labels)
        if self._keep_if_unparseable or action_sequence:
            for production_rule in action_sequence:
                index_fields.append(IndexField(action_map[production_rule], action_field))
            if not action_sequence:
                index_fields = [IndexField(-1, action_field)]
            action_sequence_field = ListField(index_fields)
            fields['target_action_sequence'] = action_sequence_field
        else:
            # If we are given a SQL query, but we are unable to parse it, and we do not
            # explicitly ask to keep it, then we skip the instance.
            return None

    return Instance(fields)
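# Sketch of how the target supervision above is encoded (rule strings below are made up
# for illustration): if world.all_possible_actions() returns
#     ['statement -> [query]', 'query -> [select, ...]', ...]
# then action_map maps each rule string to its position in the ``actions`` ListField, and
# every rule in the parsed action sequence becomes an IndexField into that list. An
# unparseable query kept via _keep_if_unparseable is represented by the single index -1,
# which the model can treat as "no gold sequence available".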
def text_to_instance( self, # type: ignore query: List[str], derived_cols: List[Tuple[str, str]], derived_tables: List[str], prelinked_entities: Dict[str, Dict[str, str]] = None, sql: List[str] = None, alignment: List[str] = None) -> Instance: # pylint: disable=arguments-differ fields: Dict[str, Field] = {} tokens = TextField([Token(t) for t in query], self._token_indexers) fields["tokens"] = tokens if sql is not None: action_sequence, all_actions = self._world.get_action_sequence_and_all_actions( query=sql, derived_cols=derived_cols, derived_tables=derived_tables, prelinked_entities=prelinked_entities) if action_sequence is None: return None if alignment is not None: # Modify the alignment according to the action sequence alignment = AttnSupGrammarBasedWorld.modify_alignment( action_sequence=action_sequence, alignment=alignment) else: # having a list of NO_ALIGN is basically equivalent to mask all the alignment alignment = ['NO_ALIGN'] * len(action_sequence) index_fields: List[Field] = [] production_rule_fields: List[Field] = [] for production_rule in all_actions: nonterminal, _ = production_rule.split(' ->') production_rule = ' '.join(production_rule.split(' ')) field = ProductionRuleField( production_rule, self._world.is_global_rule(nonterminal), nonterminal=nonterminal) production_rule_fields.append(field) valid_actions_field = ListField(production_rule_fields) fields["valid_actions"] = valid_actions_field action_map = { action.rule: i # type: ignore for i, action in enumerate(valid_actions_field.field_list) } for production_rule in action_sequence: index_fields.append( IndexField(action_map[production_rule], valid_actions_field)) if not action_sequence: index_fields = [IndexField(-1, valid_actions_field)] # if not action_sequence and re.findall(r"COUNT \( \* \) (?:<|>|<>|=) 0", " ".join(sql)): # index_fields = [IndexField(-2, valid_actions_field)] action_sequence_field = ListField(index_fields) fields["action_sequence"] = action_sequence_field alignment_index_fields: List[IndexField] = [] tmp_tokens_as_strings = [t.text for t in tokens] for aligned_token in alignment: try: aligned_token_index = int( tmp_tokens_as_strings.index(aligned_token)) alignment_index_fields.append( IndexField(aligned_token_index, tokens)) except ValueError as e: # a special "no alignment" index alignment_index_fields.append( IndexField(-1, tokens.empty_field())) fields["alignment_sequence"] = ListField(alignment_index_fields) return Instance(fields)
def text_to_instance( self, # type: ignore tokens: List[Token], pos_tags: List[str] = None, chunk_tags: List[str] = None, ner_tags: List[str] = None) -> Instance: """ We take `pre-tokenized` input here, because we don't have a tokenizer in this class. """ # pylint: disable=arguments-differ sequence = TextField(tokens, self._token_indexers) instance_fields: Dict[str, Field] = {'tokens': sequence} instance_fields["metadata"] = MetadataField( {"words": [x.text for x in tokens]}) # Recode the labels if necessary. if self.coding_scheme == "BIOUL": coded_chunks = to_bioul(chunk_tags, encoding=self._original_coding_scheme ) if chunk_tags is not None else None coded_ner = to_bioul(ner_tags, encoding=self._original_coding_scheme ) if ner_tags is not None else None else: # the default IOB1 coded_chunks = chunk_tags coded_ner = ner_tags # Add "feature labels" to instance if 'pos' in self.feature_labels: if pos_tags is None: raise ConfigurationError( "Dataset reader was specified to use pos_tags as " "features. Pass them to text_to_instance.") instance_fields['pos_tags'] = SequenceLabelField( pos_tags, sequence, "pos_tags") if 'chunk' in self.feature_labels: if coded_chunks is None: raise ConfigurationError( "Dataset reader was specified to use chunk tags as " "features. Pass them to text_to_instance.") instance_fields['chunk_tags'] = SequenceLabelField( coded_chunks, sequence, "chunk_tags") if 'ner' in self.feature_labels: if coded_ner is None: raise ConfigurationError( "Dataset reader was specified to use NER tags as " " features. Pass them to text_to_instance.") instance_fields['ner_tags'] = SequenceLabelField( coded_ner, sequence, "ner_tags") # Add "tag label" to instance if self.tag_label == 'ner' and coded_ner is not None: instance_fields['tags'] = SequenceLabelField( coded_ner, sequence, self.label_namespace) elif self.tag_label == 'pos' and pos_tags is not None: instance_fields['tags'] = SequenceLabelField( pos_tags, sequence, self.label_namespace) elif self.tag_label == 'chunk' and coded_chunks is not None: instance_fields['tags'] = SequenceLabelField( coded_chunks, sequence, self.label_namespace) return Instance(instance_fields)
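# Illustration of the BIOUL recoding above (assuming the reader was constructed with
# coding_scheme="BIOUL" and IOB1-encoded input); this is a hand-worked example, not from
# the source:
#
#     to_bioul(["I-PER", "I-PER", "O", "I-LOC"], encoding="IOB1")
#     # -> ["B-PER", "L-PER", "O", "U-LOC"]
#
# Single-token entities become "U-" tags and entity-final tokens become "L-" tags, which
# gives the tagger explicit span boundaries to predict.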
def text_to_instance(self, line: str) -> Instance:  # type: ignore
    # pylint: disable=arguments-differ
    tokens = self._tokenizer.tokenize(line)
    return Instance({"line": TextField(tokens, self._token_indexers)})
def text_to_instance( self, question_text: str, passage_text: str, passage_tokens: List[Token], passage_spans: List[Tuple[int, int]], numbers_in_passage: List[Any], number_words: List[str], number_indices: List[int], number_len: List[int], question_id: str = None, passage_id: str = None, answer_annotations: List[Dict] = None, count_gold_spans_text: List[str] = None) -> Union[Instance, None]: # Tokenize question and passage question_tokens = self.tokenizer.tokenize(question_text) qlen = len(question_tokens) plen = len(passage_tokens) question_passage_tokens = [Token('[CLS]')] + question_tokens + [ Token('[SEP]') ] + passage_tokens if len(question_passage_tokens) > self.max_pieces - 1: question_passage_tokens = question_passage_tokens[:self. max_pieces - 1] passage_tokens = passage_tokens[:self.max_pieces - qlen - 3] plen = len(passage_tokens) number_indices, number_len, numbers_in_passage = \ clipped_passage_num(number_indices, number_len, numbers_in_passage, plen) question_passage_tokens += [Token('[SEP]')] number_indices = [index + qlen + 2 for index in number_indices] + [-1] # Not done in-place so they won't change the numbers saved for the passage number_len = number_len + [1] numbers_in_passage = numbers_in_passage + [0] number_tokens = [Token(str(number)) for number in numbers_in_passage] extra_number_tokens = [Token(str(num)) for num in self.extra_numbers] mask_indices = [0, qlen + 1, len(question_passage_tokens) - 1] fields: Dict[str, Field] = {} # Add feature fields question_passage_field = TextField(question_passage_tokens, self.token_indexers) fields["question_passage"] = question_passage_field number_token_indices = \ [ArrayField(np.arange(start_ind, start_ind + number_len[i]), padding_value=-1) for i, start_ind in enumerate(number_indices)] fields["number_indices"] = ListField(number_token_indices) numbers_in_passage_field = TextField(number_tokens, self.token_indexers) extra_numbers_field = TextField(extra_number_tokens, self.token_indexers) all_numbers_field = TextField(extra_number_tokens + number_tokens, self.token_indexers) mask_index_fields: List[Field] = [ IndexField(index, question_passage_field) for index in mask_indices ] fields["mask_indices"] = ListField(mask_index_fields) # Compile question, passage, answer metadata metadata = { "original_passage": passage_text, "original_question": question_text, "original_numbers": numbers_in_passage, "original_number_words": number_words, "extra_numbers": self.extra_numbers, "passage_tokens": passage_tokens, "question_tokens": question_tokens, "question_passage_tokens": question_passage_tokens, "passage_id": passage_id, "question_id": question_id } if self.extract_spans: metadata["passage_spans"] = passage_spans if count_gold_spans_text is not None: metadata["count_gold_spans_text"] = count_gold_spans_text if answer_annotations: for annotation in answer_annotations: tokenized_spans = [[ token.text for token in self.tokenizer.tokenize(answer) ] for answer in annotation['spans']] annotation['spans'] = [ tokenlist_to_passage(token_list) for token_list in tokenized_spans ] # Get answer type, answer text, tokenize answer_type, answer_texts = DropReader.extract_answer_info_from_annotation( answer_annotations[0]) tokenized_answer_texts = [] num_spans = min(len(answer_texts), self.max_spans) for answer_text in answer_texts: answer_tokens = self.tokenizer.tokenize(answer_text) tokenized_answer_texts.append(' '.join( token.text for token in answer_tokens)) metadata["answer_annotations"] = answer_annotations metadata["answer_texts"] = 
answer_texts metadata["answer_tokens"] = tokenized_answer_texts # Find answer text in question and passage valid_question_spans = DropReader.find_valid_spans( question_tokens, tokenized_answer_texts) for span_ind, span in enumerate(valid_question_spans): valid_question_spans[span_ind] = (span[0] + 1, span[1] + 1) valid_passage_spans = DropReader.find_valid_spans( passage_tokens, tokenized_answer_texts) for span_ind, span in enumerate(valid_passage_spans): valid_passage_spans[span_ind] = (span[0] + qlen + 2, span[1] + qlen + 2) # Get target numbers target_numbers = [] for answer_text in answer_texts: number = self.word_to_num(answer_text) if number is not None: target_numbers.append(number) # Get possible ways to arrive at target numbers with add/sub valid_expressions: List[List[int]] = [] exp_strings = None if answer_type in ["number", "date"]: if self.exp_search == 'full': expressions = get_full_exp( list(enumerate(self.extra_numbers + numbers_in_passage)), target_numbers, self.operations, self.op_dict, self.max_depth) zipped = list(zip(*expressions)) if zipped: valid_expressions = list(zipped[0]) exp_strings = list(zipped[1]) elif self.exp_search == 'add_sub': valid_expressions = \ DropReader.find_valid_add_sub_expressions(self.extra_numbers + numbers_in_passage, target_numbers, self.max_numbers_expression) elif self.exp_search == 'template': valid_expressions, exp_strings = \ get_template_exp(self.extra_numbers + numbers_in_passage, target_numbers, self.templates, self.template_strings) exp_strings = sum(exp_strings, []) # Get possible ways to arrive at target numbers with counting valid_counts: List[int] = [] if answer_type in ["number"]: numbers_for_count = list(range(self.max_count + 1)) valid_counts = DropReader.find_valid_counts( numbers_for_count, target_numbers) # Update metadata with answer info answer_info = { "answer_passage_spans": valid_passage_spans, "answer_question_spans": valid_question_spans, "num_spans": num_spans, "expressions": valid_expressions, "counts": valid_counts } if self.exp_search in ['template', 'full']: answer_info['expr_text'] = exp_strings metadata["answer_info"] = answer_info # Add answer fields passage_span_fields: List[Field] = [ SpanField(span[0], span[1], question_passage_field) for span in valid_passage_spans ] if not passage_span_fields: passage_span_fields.append( SpanField(-1, -1, question_passage_field)) fields["answer_as_passage_spans"] = ListField(passage_span_fields) question_span_fields: List[Field] = [ SpanField(span[0], span[1], question_passage_field) for span in valid_question_spans ] if not question_span_fields: question_span_fields.append( SpanField(-1, -1, question_passage_field)) fields["answer_as_question_spans"] = ListField( question_span_fields) if self.exp_search == 'add_sub': add_sub_signs_field: List[Field] = [] extra_signs_field: List[Field] = [] for signs_for_one_add_sub_expressions in valid_expressions: extra_signs = signs_for_one_add_sub_expressions[:len( self.extra_numbers)] normal_signs = signs_for_one_add_sub_expressions[ len(self.extra_numbers):] add_sub_signs_field.append( SequenceLabelField(normal_signs, numbers_in_passage_field)) extra_signs_field.append( SequenceLabelField(extra_signs, extra_numbers_field)) if not add_sub_signs_field: add_sub_signs_field.append( SequenceLabelField([0] * len(number_tokens), numbers_in_passage_field)) if not extra_signs_field: extra_signs_field.append( SequenceLabelField([0] * len(self.extra_numbers), extra_numbers_field)) fields["answer_as_expressions"] = ListField( 
add_sub_signs_field) if self.extra_numbers: fields["answer_as_expressions_extra"] = ListField( extra_signs_field) elif self.exp_search in ['template', 'full']: expression_indices = [] for expression in valid_expressions: if not expression: expression.append(3 * [-1]) expression_indices.append( ArrayField(np.array(expression), padding_value=-1)) if not expression_indices: expression_indices = \ [ArrayField(np.array([3 * [-1]]), padding_value=-1) for _ in range(len(self.templates))] fields["answer_as_expressions"] = ListField(expression_indices) count_fields: List[Field] = [ LabelField(count_label, skip_indexing=True) for count_label in valid_counts ] if not count_fields: count_fields.append(LabelField(-1, skip_indexing=True)) fields["answer_as_counts"] = ListField(count_fields) fields["num_spans"] = LabelField(num_spans, skip_indexing=True) fields["metadata"] = MetadataField(metadata) return Instance(fields)
def text_to_instance(self, origin_obj: Any) -> Instance: prev_obj = origin_obj['prev'] fol_obj = origin_obj['follow'] abs_prev_tokens, col_counter, val_counter = abstract_utterance(prev_obj) abs_fol_tokens, _, _ = abstract_utterance(fol_obj, col_counter, val_counter) # token level tokenizing prev_tokens = self._tokenizer.tokenize(" ".join(abs_prev_tokens)) prev_tokens = TextField(prev_tokens, self._token_indexers) fol_tokens = self._tokenizer.tokenize(" ".join(abs_fol_tokens)) fol_tokens = TextField(fol_tokens, self._token_indexers) # char level tokenizing prev_tag_tokens = [] prev_anno: StandardSpan = origin_obj["prev"] for ind, tag in enumerate(prev_anno.tags): if tag is None: prev_tag_tokens.append(prev_tokens[ind].text) elif tag.class_type in COLUMN_BIND_TYPES: prev_tag_tokens.append(tag.header.replace(" ", "_")) elif tag.class_type in VALUE_BIND_TYPES: if len(tag.header) > 0: prev_tag_tokens.append(tag.header[0].replace(" ", "_")) else: prev_tag_tokens.append(tag.origin.replace(" ", "_")) else: prev_tag_tokens.append(prev_tokens[ind].text) fol_char_str = [] fol_anno: StandardSpan = origin_obj["follow"] for ind, tag in enumerate(fol_anno.tags): if tag is None: fol_char_str.append(fol_tokens[ind].text) elif tag.class_type in COLUMN_BIND_TYPES: fol_char_str.append(tag.header.replace(" ", "_")) elif tag.class_type in VALUE_BIND_TYPES: if len(tag.header) > 0: fol_char_str.append(tag.header[0].replace(" ", "_")) else: fol_char_str.append(tag.origin.replace(" ", "_")) else: fol_char_str.append(fol_tokens[ind].text) # split into char-based tokens prev_tag_str = " ".join(prev_tag_tokens) prev_tag_tokens = self._tokenizer.tokenize(prev_tag_str) prev_tag_field = TextField(prev_tag_tokens, self._char_indexers) fol_tag_str = " ".join(fol_char_str) fol_char_str = self._tokenizer.tokenize(fol_tag_str) fol_tag_field = TextField(fol_char_str, self._char_indexers) fields = {'prev_tokens': prev_tokens, 'fol_tokens': fol_tokens, 'prev_tags': prev_tag_field, 'fol_tags': fol_tag_field} metadata = {"origin_obj": origin_obj, "tokens_origin": abs_prev_tokens + abs_fol_tokens} metadata_field = MetadataField(metadata) fields['metadata'] = metadata_field # pre-training object caching prev_snippets = origin_obj['prev'].snippet fol_snippets = origin_obj['follow'].snippet conflict = origin_obj['conflicts'] origin_obj.pop('conflicts') prev_labels = SequenceLabelField(prev_snippets, prev_tokens) fields['prev_labels'] = prev_labels fol_labels = SequenceLabelField(fol_snippets, fol_tokens) fields['fol_labels'] = fol_labels conflict_field = MetadataField(conflict) fields['conflicts'] = conflict_field fields['metadata'].metadata['origin_obj']['prev_labels'] = prev_snippets fields['metadata'].metadata['origin_obj']['fol_labels'] = fol_snippets return Instance(fields)
def _json_blob_to_instance(self, json_obj: JsonDict) -> Instance: question_tokens = self._read_tokens_from_json_list( json_obj['question_tokens']) question_field = TextField(question_tokens, self._question_token_indexers) question_metadata = MetadataField( {"question_tokens": [x.text for x in question_tokens]}) table_knowledge_graph = TableQuestionKnowledgeGraph.read_from_lines( json_obj['table_lines'], question_tokens) entity_tokens = [ self._read_tokens_from_json_list(token_list) for token_list in json_obj['entity_texts'] ] table_field = KnowledgeGraphField( table_knowledge_graph, question_tokens, tokenizer=None, token_indexers=self._table_token_indexers, entity_tokens=entity_tokens, linking_features=json_obj['linking_features'], include_in_vocab=self._use_table_for_vocab, max_table_tokens=self._max_table_tokens) world = WikiTablesWorld(table_knowledge_graph) world_field = MetadataField(world) production_rule_fields: List[Field] = [] for production_rule in world.all_possible_actions(): _, rule_right_side = production_rule.split(' -> ') is_global_rule = not world.is_table_entity(rule_right_side) field = ProductionRuleField(production_rule, is_global_rule) production_rule_fields.append(field) action_field = ListField(production_rule_fields) example_string_field = MetadataField(json_obj['example_lisp_string']) fields = { 'question': question_field, 'metadata': question_metadata, 'table': table_field, 'world': world_field, 'actions': action_field, 'example_lisp_string': example_string_field } if 'target_action_sequences' in json_obj or 'agenda' in json_obj: action_map = { action.rule: i for i, action in enumerate(action_field.field_list) } # type: ignore if 'target_action_sequences' in json_obj: action_sequence_fields: List[Field] = [] for sequence in json_obj['target_action_sequences']: index_fields: List[Field] = [] for production_rule in sequence: index_fields.append( IndexField(action_map[production_rule], action_field)) action_sequence_fields.append(ListField(index_fields)) fields['target_action_sequences'] = ListField( action_sequence_fields) if 'agenda' in json_obj: agenda_index_fields: List[Field] = [] for agenda_action in json_obj['agenda']: agenda_index_fields.append( IndexField(action_map[agenda_action], action_field)) fields['agenda'] = ListField(agenda_index_fields) return Instance(fields)
def text_to_instance(self, source_string: str, target_lang: str, target_string: str = None) -> Instance: """ Turn raw source string and target string into an ``Instance``. Parameters ---------- source_string : ``str``, required target_lang : ``str``, required target_string : ``str``, optional (default = None) Returns ------- Instance See the above for a description of the fields that the instance will contain. """ tokenized_source = self._source_tokenizer.tokenize(source_string) tokenized_source.insert(0, Token(START_SYMBOL)) tokenized_source.append(Token(END_SYMBOL)) source_field = TextField(tokenized_source, self._source_token_indexers) # For each token in the source sentence, we keep track of the matching token # in the target sentence (which will be the OOV symbol if there is no match). source_to_target_field = NamespaceSwappingField( tokenized_source[1:-1], self._target_namespace) meta_fields = { "source_tokens": [x.text for x in tokenized_source[1:-1]] } fields_dict = { "source_tokens": source_field, "source_to_target": source_to_target_field, } if self._provide_trg_lang: lang_id_field = LabelField( target_lang, label_namespace=self._language_id_namespace) metadata_trg_lang = MetadataField(target_lang) fields_dict["target_lang"] = lang_id_field fields_dict["target_language"] = metadata_trg_lang if target_string is not None: tokenized_target = self._target_tokenizer.tokenize(target_string) tokenized_target.insert(0, Token(START_SYMBOL)) tokenized_target.append(Token(END_SYMBOL)) target_field = TextField(tokenized_target, self._target_token_indexers) fields_dict["target_tokens"] = target_field meta_fields["target_tokens"] = [ y.text for y in tokenized_target[1:-1] ] source_and_target_token_ids = self._tokens_to_ids( tokenized_source[1:-1] + tokenized_target) source_token_ids = source_and_target_token_ids[:len( tokenized_source) - 2] fields_dict["source_token_ids"] = ArrayField( np.array(source_token_ids)) target_token_ids = source_and_target_token_ids[len(tokenized_source ) - 2:] fields_dict["target_token_ids"] = ArrayField( np.array(target_token_ids)) else: source_token_ids = self._tokens_to_ids(tokenized_source[1:-1]) fields_dict["source_token_ids"] = ArrayField( np.array(source_token_ids)) fields_dict["metadata"] = MetadataField(meta_fields) return Instance(fields_dict)
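# Sketch of the copy-mechanism bookkeeping above (hand-worked example, not from the
# source): _tokens_to_ids assigns a small, instance-local id to each distinct surface
# form, in order of first appearance, so matching source and target tokens share an id:
#
#     _tokens_to_ids(["a", "b", "a"])            # -> [0, 1, 0]
#     _tokens_to_ids(["a", "b", "a", "b", "c"])  # -> [0, 1, 0, 1, 2]
#
# The second call corresponds to the concatenated source + target tokens, which is then
# split back into source_token_ids and target_token_ids (the target slice also covers the
# START/END tokens). These ids only tell the model which target tokens can be copied from
# which source positions; vocabulary indexing still comes from the TextFields.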
def text_to_instance(self,  # type: ignore
                     document_id: str,
                     part_number: str,
                     sentences: List[List[str]],
                     gold_clusters: Optional[List[List[Tuple[int, int]]]] = None) -> Instance:
    # pylint: disable=arguments-differ
    """
    Parameters
    ----------
    document_id : ``str``, required.
        The id of the document.
    part_number : ``str``, required.
        The part of the document this instance comes from.
    sentences : ``List[List[str]]``, required.
        A list of lists representing the tokenised words and sentences in the document.
    gold_clusters : ``Optional[List[List[Tuple[int, int]]]]``, optional (default = None)
        A list of all clusters in the document, represented as word spans. Each cluster
        contains some number of spans, which can be nested and overlap, but will never
        exactly match between clusters.

    Returns
    -------
    An ``Instance`` containing the following ``Fields``:
        text : ``TextField``
            The text of the full document.
        spans : ``ListField[SpanField]``
            A ListField containing the spans represented as ``SpanFields``
            with respect to the document text.
        span_labels : ``SequenceLabelField``, optional
            The id of the cluster which each possible span belongs to, or -1 if it does
            not belong to a cluster. As these labels have variable length (it depends on
            how many spans we are considering), we represent this as a
            ``SequenceLabelField`` with respect to the ``spans`` ``ListField``.
    """
    flattened_sentences = [self._normalize_word(word)
                           for sentence in sentences
                           for word in sentence]

    metadata: Dict[str, Any] = {
        "document_id": document_id,
        "part_number": part_number,
        "original_text": flattened_sentences,
    }
    if gold_clusters is not None:
        metadata["clusters"] = gold_clusters

    text_field = TextField([Token(word) for word in flattened_sentences], self._token_indexers)

    cluster_dict = {}
    if gold_clusters is not None:
        for cluster_id, cluster in enumerate(gold_clusters):
            for mention in cluster:
                cluster_dict[tuple(mention)] = cluster_id

    spans: List[Field] = []
    span_labels: Optional[List[int]] = [] if gold_clusters is not None else None

    sentence_offset = 0
    for sentence in sentences:
        for start, end in enumerate_spans(sentence,
                                          offset=sentence_offset,
                                          max_span_width=self._max_span_width):
            if span_labels is not None:
                if (start, end) in cluster_dict:
                    span_labels.append(cluster_dict[(start, end)])
                else:
                    span_labels.append(-1)
            spans.append(SpanField(start, end, text_field))
        sentence_offset += len(sentence)

    span_field = ListField(spans)
    metadata_field = MetadataField(metadata)

    fields: Dict[str, Field] = {"text": text_field,
                                "spans": span_field,
                                "metadata": metadata_field}
    if span_labels is not None:
        fields["span_labels"] = SequenceLabelField(span_labels, span_field)

    return Instance(fields)
def text_to_instance( # type: ignore self, tokens: List[Token], verb_label: List[int], img, tags: List[str] = None ) -> Instance: """ We take `pre-tokenized` input here, along with a verb label. The verb label should be a one-hot binary vector, the same length as the tokens, indicating the position of the verb to find arguments for. """ metadata_dict: Dict[str, Any] = {} if self.bert_tokenizer is not None: wordpieces, offsets, start_offsets = self._wordpiece_tokenize_input( [t.text for t in tokens] ) new_verbs = _convert_verb_indices_to_wordpiece_indices(verb_label, offsets) metadata_dict["offsets"] = start_offsets # In order to override the indexing mechanism, we need to set the `text_id` # attribute directly. This causes the indexing to use this id. text_field = TextField( [Token(t, text_id=self.bert_tokenizer.vocab[t]) for t in wordpieces], token_indexers=self._token_indexers, ) verb_indicator = SequenceLabelField(new_verbs, text_field) else: text_field = TextField(tokens, token_indexers=self._token_indexers) verb_indicator = SequenceLabelField(verb_label, text_field) #? Maybe other options??? img_feats = img['features'].copy() img_boxes = img['boxes'].copy() obj_num = img['num_boxes'] assert len(img_feats) == len(img_boxes) == obj_num # Normalize the boxes to 0 ~ 1 img_boxes = img_boxes.copy() img_boxes[:, (0, 2)] /= img['img_w'] img_boxes[:, (1, 3)] /= img['img_h'] np.testing.assert_array_less(img_boxes, 1+1e-5) np.testing.assert_array_less(-img_boxes, 0+1e-5) # Concat box feats to each object features img_concat = np.hstack((img_feats, img_boxes)) img_field = ArrayField(img_concat) fields: Dict[str, Field] = {} fields["tokens"] = text_field fields["verb_indicator"] = verb_indicator fields["img_emb"] = img_field if all([x == 0 for x in verb_label]): verb = None verb_index = None else: verb_index = verb_label.index(1) verb = tokens[verb_index].text metadata_dict["words"] = [x.text for x in tokens] metadata_dict["verb"] = verb metadata_dict["verb_index"] = verb_index if tags: if self.bert_tokenizer is not None: new_tags = _convert_tags_to_wordpiece_tags(tags, offsets) fields["tags"] = SequenceLabelField(new_tags, text_field) else: fields["tags"] = SequenceLabelField(tags, text_field) metadata_dict["gold_tags"] = tags fields["metadata"] = MetadataField(metadata_dict) return Instance(fields)
def text_to_instance(self,  # type: ignore
                     sentence: List[Token],
                     gold_clusters: Optional[List[List[Tuple[int, int]]]] = None,
                     ) -> Instance:
    """
    Parameters
    ----------
    sentence : ``List[Token]``, required.
        The already tokenised sentence to analyse.
    gold_clusters : ``Optional[List[List[Tuple[int, int]]]]``, optional (default = None)
        A list of all clusters in the sentence, represented as word spans. Each cluster
        contains some number of spans, which can be nested and overlap, but will never
        exactly match between clusters.

    Returns
    -------
    An ``Instance`` containing the following ``Fields``:
        text : ``TextField``
            The text of the full sentence.
        spans : ``ListField[SpanField]``
            A ListField containing the spans represented as ``SpanFields``
            with respect to the sentence text.
        span_labels : ``SequenceLabelField``, optional
            The id of the cluster which each possible span belongs to, or -1 if it does
            not belong to a cluster. As these labels have variable length (it depends on
            how many spans we are considering), we represent this as a
            ``SequenceLabelField`` with respect to the ``spans`` ``ListField``.
    """
    metadata: Dict[str, Any] = {"original_text": sentence}
    if gold_clusters is not None:
        metadata["clusters"] = gold_clusters

    text_field = TextField(sentence, self._token_indexers)

    cluster_dict = {}
    if gold_clusters is not None:
        for cluster_id, cluster in enumerate(gold_clusters):
            for mention in cluster:
                cluster_dict[tuple(mention)] = cluster_id

    spans: List[Field] = []
    span_labels: Optional[List[int]] = [] if gold_clusters is not None else None

    for start, end in enumerate_spans(sentence, max_span_width=self._max_span_width):
        if span_labels is not None:
            if (start, end) in cluster_dict:
                span_labels.append(cluster_dict[(start, end)])
            else:
                span_labels.append(-1)
        spans.append(SpanField(start, end, text_field))

    span_field = ListField(spans)
    metadata_field = MetadataField(metadata)

    fields: Dict[str, Field] = {
        "text": text_field,
        "spans": span_field,
        "metadata": metadata_field,
    }
    if span_labels is not None:
        fields["span_labels"] = SequenceLabelField(span_labels, span_field)

    return Instance(fields)
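# Hand-worked illustration of the cluster bookkeeping above (not from the source): for
# gold_clusters = [[(0, 1), (4, 4)], [(2, 3)]] the cluster_dict becomes
# {(0, 1): 0, (4, 4): 0, (2, 3): 1}, and every candidate span emitted by enumerate_spans
# gets the matching cluster id as its label, or -1 if it is not a gold mention. The
# resulting SequenceLabelField is therefore exactly as long as the spans ListField.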
def __getitem__(self, index): if self.image_feature_type == "r2c": return self.__getitem_detector__(index) item = self.items[index] sample = {} if not self.text_only: image_feat_variable, image_boxes, image_dim_variable = self.get_image_features_by_training_index(index) image_feat_variable = ArrayField(image_feat_variable) image_dim_variable = IntArrayField(np.array(image_dim_variable)) sample["image_feat_variable"] = image_feat_variable sample["image_dim_variable"] = image_dim_variable sample["label"] = image_dim_variable else: sample["label"] = IntArrayField(np.array([0])) caption_a = item["caption"] imageID = item["image_id"] if self.expanded and index >= self.train_size: coco = self.coco_val else: coco = self.coco rest_anns = coco.loadAnns([i for i in coco.getAnnIds(imgIds=imageID) if i != item['id']]) if self.args.get("two_sentence", True): if random.random() > 0.5: item_b = self.items[random.randint(0, len(self.items) - 1)] while item_b["image_id"] == imageID: item_b = self.items[random.randint(0, len(self.items) - 1)] flag = False else: item_b = rest_anns[random.randint(0, len(rest_anns) - 1)] flag = True caption_b = item_b["caption"] subword_tokens_a = self.tokenizer.tokenize(caption_a) subword_tokens_b = self.tokenizer.tokenize(caption_b) bert_example = InputExample(unique_id = index, text_a = subword_tokens_a, text_b = subword_tokens_b, is_correct=flag, max_seq_length = self.max_seq_length) elif not self.args.get("no_next_sentence", False): if random.random() < self.args.false_caption_ratio: item_b = self.items[random.randint(0, len(self.items) - 1)] while item_b["image_id"] == imageID: item_b = self.items[random.randint(0, len(self.items) - 1)] flag = False else: item_b = item flag = True caption_b = item_b["caption"] subword_tokens_b = self.tokenizer.tokenize(caption_b) bert_example = InputExample(unique_id = index, text_a = subword_tokens_b, text_b = None, is_correct=flag, max_seq_length = self.max_seq_length) else: caption_b = item["caption"] subword_tokens_b = self.tokenizer.tokenize(caption_b) bert_example = InputExample(unique_id = index, text_a = subword_tokens_b, text_b = None, is_correct=None, max_seq_length = self.max_seq_length) bert_feature = InputFeatures.convert_one_example_to_features_pretraining( example = bert_example, tokenizer=self.tokenizer, probability = self.masked_lm_prob) bert_feature.insert_field_into_dict(sample) return Instance(sample)
def text_to_instance(self,  # type: ignore
                     tokens: List[Token],
                     verb_label: List[int],
                     verb_index: int,
                     constituents: List[List[str]] = None,
                     srl_args: List[List[str]] = None) -> Instance:
    """
    We take `pre-tokenized` input here, along with a verb label. The verb label should be a
    one-hot binary vector, the same length as the tokens, indicating the position of the verb
    to find arguments for.
    """
    # pylint: disable=arguments-differ
    # Input fields.
    text_field = TextField(tokens, token_indexers=self._token_indexers)
    verb_field = SequenceLabelField(verb_label, text_field)
    target_field = IndexField(verb_index, text_field)

    # Span-based output fields.
    span_starts: List[Field] = []
    span_ends: List[Field] = []
    span_mask: List[int] = [1 for _ in range(len(tokens) * self.max_span_width)]
    span_labels: Optional[List[str]] = [] if srl_args is not None else None
    constit_labels: Optional[List[str]] = [] if constituents is not None else None

    for j in range(len(tokens)):
        for diff in range(self.max_span_width):
            width = diff
            if j - diff < 0:
                # This is an invalid span.
                span_mask[j * self.max_span_width + diff] = 0
                width = j

            span_starts.append(IndexField(j - width, text_field))
            span_ends.append(IndexField(j, text_field))

            if srl_args:
                current_label = srl_args[j][diff]
                span_labels.append(current_label)

            if constituents:
                label = constituents[j][diff]
                constit_labels.append(label)

    start_fields = ListField(span_starts)
    end_fields = ListField(span_ends)
    span_mask_fields = SequenceLabelField(span_mask, start_fields)

    fields: Dict[str, Field] = {'tokens': text_field,
                                'verb_indicator': verb_field,
                                'target_index': target_field,
                                'span_starts': start_fields,
                                'span_ends': end_fields,
                                'span_mask': span_mask_fields}

    if srl_args:
        fields['tags'] = SequenceLabelField(span_labels, start_fields)
    if constituents:
        fields['constituents'] = SequenceLabelField(constit_labels, start_fields,
                                                    label_namespace="constit_labels")

    return Instance(fields)
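The nested loop above lays the candidate spans out in a flat list of length ``len(tokens) * max_span_width``: position ``j * max_span_width + diff`` holds the span ending at token ``j`` with width ``diff``, clipped to start at 0 and masked out when it would begin before the sentence. A small self-contained sketch of just that indexing, using invented sizes:

# Sketch of the flattened span layout used above (invented sentence length and span width).
num_tokens, max_span_width = 4, 3
layout = []
for j in range(num_tokens):
    for diff in range(max_span_width):
        valid = j - diff >= 0
        start = j - diff if valid else 0
        layout.append((j * max_span_width + diff, start, j, int(valid)))
# Each tuple is (flat index, span start, span end, mask); e.g. token 0 only has its width-0 span valid.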
def make_marginal_drop_instance(question_tokens: List[Token],
                                passage_tokens: List[Token],
                                number_tokens: List[Token],
                                number_indices: List[int],
                                token_indexers: Dict[str, TokenIndexer],
                                passage_text: str,
                                answer_info: Dict[str, Any] = None,
                                additional_metadata: Dict[str, Any] = None) -> Instance:
    additional_metadata = additional_metadata or {}
    fields: Dict[str, Field] = {}
    passage_offsets = [(token.idx, token.idx + len(token.text)) for token in passage_tokens]
    question_offsets = [(token.idx, token.idx + len(token.text)) for token in question_tokens]

    # This is separate so we can reference it later with a known type.
    passage_field = TextField(passage_tokens, token_indexers)
    question_field = TextField(question_tokens, token_indexers)
    fields["passage"] = passage_field
    fields["question"] = question_field

    number_index_fields: List[Field] = [IndexField(index, passage_field) for index in number_indices]
    fields["number_indices"] = ListField(number_index_fields)

    # This field is actually not required in the model,
    # it is used to create the `answer_as_plus_minus_combinations` field, which is a `SequenceLabelField`.
    # We cannot use the `number_indices` field for creating that, because the `ListField` will not be empty
    # when we want to create a new empty field. That will lead to an error.
    numbers_in_passage_field = TextField(number_tokens, token_indexers)
    metadata = {"original_passage": passage_text,
                "passage_token_offsets": passage_offsets,
                "question_token_offsets": question_offsets,
                "question_tokens": [token.text for token in question_tokens],
                "passage_tokens": [token.text for token in passage_tokens],
                "number_tokens": [token.text for token in number_tokens],
                "number_indices": number_indices}
    if answer_info:
        metadata["answer_texts"] = answer_info["answer_texts"]

        passage_span_fields: List[Field] = [SpanField(span[0], span[1], passage_field)
                                            for span in answer_info["answer_passage_spans"]]
        if not passage_span_fields:
            passage_span_fields.append(SpanField(-1, -1, passage_field))
        fields["answer_as_passage_spans"] = ListField(passage_span_fields)

        question_span_fields: List[Field] = [SpanField(span[0], span[1], question_field)
                                             for span in answer_info["answer_question_spans"]]
        if not question_span_fields:
            question_span_fields.append(SpanField(-1, -1, question_field))
        fields["answer_as_question_spans"] = ListField(question_span_fields)

        add_sub_signs_field: List[Field] = []
        for signs_for_one_add_sub_expression in answer_info["signs_for_add_sub_expressions"]:
            add_sub_signs_field.append(SequenceLabelField(signs_for_one_add_sub_expression,
                                                          numbers_in_passage_field))
        if not add_sub_signs_field:
            add_sub_signs_field.append(SequenceLabelField([0] * len(number_tokens),
                                                          numbers_in_passage_field))
        fields["answer_as_add_sub_expressions"] = ListField(add_sub_signs_field)

        count_fields: List[Field] = [LabelField(count_label, skip_indexing=True)
                                     for count_label in answer_info["counts"]]
        if not count_fields:
            count_fields.append(LabelField(-1, skip_indexing=True))
        fields["answer_as_counts"] = ListField(count_fields)

    metadata.update(additional_metadata)
    fields["metadata"] = MetadataField(metadata)
    return Instance(fields)
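A hedged sketch of calling the helper above with toy inputs; the tokens, offsets, and indexer choice are invented, and ``answer_info`` is omitted so only the unlabeled fields get built. If the helper is defined as a static method on its reader class rather than a free function, it would be called via that class instead.

# Invented toy call; SingleIdTokenIndexer and Token are standard AllenNLP classes.
from allennlp.data import Token
from allennlp.data.token_indexers import SingleIdTokenIndexer

indexers = {"tokens": SingleIdTokenIndexer()}
passage = [Token("It", idx=0), Token("cost", idx=3), Token("7", idx=8), Token("dollars", idx=10)]
question = [Token("How", idx=0), Token("much", idx=4), Token("?", idx=9)]
numbers = [Token("7", idx=8)]
instance = make_marginal_drop_instance(question, passage, numbers,
                                       number_indices=[2],
                                       token_indexers=indexers,
                                       passage_text="It cost 7 dollars")
# With answer_info=None the instance only has passage, question, number_indices and metadata fields.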
def test_instances_must_have_homogeneous_fields(self):
    instance1 = Instance({"tag": (LabelField(1))})
    instance2 = Instance({"words": TextField(["hello"], {})})
    with pytest.raises(ConfigurationError):
        _ = Dataset([instance1, instance2])
def text_to_instance(self,  # type: ignore
                     question: str,
                     table_lines: List[List[str]],
                     target_values: List[str] = None,
                     offline_search_output: List[str] = None) -> Instance:
    """
    Reads text inputs and makes an instance. We pass the ``table_lines`` to
    ``TableQuestionContext.read_from_lines``, which accepts this field either as lines from
    CoreNLP-processed tagged files that come with the dataset, or simply in a tsv format where
    each line corresponds to a row and the cells are tab-separated.

    Parameters
    ----------
    question : ``str``
        Input question.
    table_lines : ``List[List[str]]``
        The table content, optionally preprocessed by CoreNLP. See
        ``TableQuestionContext.read_from_lines`` for the expected format.
    target_values : ``List[str]``, optional
        Target values for the denotations the logical forms should execute to. Not required
        for testing.
    offline_search_output : ``List[str]``, optional
        List of logical forms, produced by offline search. Not required during test.
    """
    tokenized_question = self._tokenizer.tokenize(question.lower())
    question_field = TextField(tokenized_question, self._question_token_indexers)
    metadata: Dict[str, Any] = {"question_tokens": [x.text for x in tokenized_question]}
    table_context = TableQuestionContext.read_from_lines(table_lines, tokenized_question)
    world = WikiTablesLanguage(table_context)
    world_field = MetadataField(world)
    # Note: Not passing any feature extractors when instantiating the field below. This will
    # make it use all the available extractors.
    table_field = KnowledgeGraphField(table_context.get_table_knowledge_graph(),
                                      tokenized_question,
                                      self._table_token_indexers,
                                      tokenizer=self._tokenizer,
                                      include_in_vocab=self._use_table_for_vocab,
                                      max_table_tokens=self._max_table_tokens)

    production_rule_fields: List[Field] = []
    for production_rule in world.all_possible_productions():
        _, rule_right_side = production_rule.split(" -> ")
        is_global_rule = not world.is_instance_specific_entity(rule_right_side)
        field = ProductionRuleField(production_rule, is_global_rule=is_global_rule)
        production_rule_fields.append(field)
    action_field = ListField(production_rule_fields)

    fields = {"question": question_field,
              "metadata": MetadataField(metadata),
              "table": table_field,
              "world": world_field,
              "actions": action_field}

    if target_values is not None:
        target_values_field = MetadataField(target_values)
        fields["target_values"] = target_values_field

    # We'll make each target action sequence a List[IndexField], where the index is into
    # the action list we made above. We need to ignore the type here because mypy doesn't
    # like `action.rule` - it's hard to tell mypy that the ListField is made up of
    # ProductionRuleFields.
    action_map = {action.rule: i  # type: ignore
                  for i, action in enumerate(action_field.field_list)}

    if offline_search_output:
        action_sequence_fields: List[Field] = []
        for logical_form in offline_search_output:
            try:
                action_sequence = world.logical_form_to_action_sequence(logical_form)
                index_fields: List[Field] = []
                for production_rule in action_sequence:
                    index_fields.append(IndexField(action_map[production_rule], action_field))
                action_sequence_fields.append(ListField(index_fields))
            except ParsingError as error:
                logger.debug(f"Parsing error: {error.message}, skipping logical form")
                logger.debug(f"Question was: {question}")
                logger.debug(f"Logical form was: {logical_form}")
                logger.debug(f"Table info was: {table_lines}")
                continue
            except KeyError as error:
                logger.debug(f"Missing production rule: {error.args}, skipping logical form")
                logger.debug(f"Question was: {question}")
                logger.debug(f"Table info was: {table_lines}")
                logger.debug(f"Logical form was: {logical_form}")
                continue
            except:  # noqa
                logger.error(logical_form)
                raise
            if len(action_sequence_fields) >= self._max_offline_logical_forms:
                break

        if not action_sequence_fields:
            # This is not great, but we're only doing it when we're passed logical form
            # supervision, so we're expecting labeled logical forms, but we can't actually
            # produce the logical forms. We should skip this instance. Note that this affects
            # _dev_ and _test_ instances, too, so your metrics could be over-estimates on the
            # full test data.
            return None
        fields["target_action_sequences"] = ListField(action_sequence_fields)

    if self._output_agendas:
        agenda_index_fields: List[Field] = []
        for agenda_string in world.get_agenda(conservative=True):
            agenda_index_fields.append(IndexField(action_map[agenda_string], action_field))
        if not agenda_index_fields:
            agenda_index_fields = [IndexField(-1, action_field)]
        fields["agenda"] = ListField(agenda_index_fields)

    return Instance(fields)
def __call__(self,
             instances: Iterable[Instance],
             num_epochs: int = None,
             shuffle: bool = False) -> Iterator[TensorDict]:
    key = id(instances)
    starting_epoch = self._epochs[key]

    # In order to ensure that we are (almost) constantly streaming data to the model we
    # need to have all of the instances in memory ($$$)
    instance_list = list(instances)

    if (self._batch_size > len(instance_list)) and self._truncate:
        raise ConfigurationError('FancyIterator will not return any data when the batch size '
                                 'is larger than number of instances and truncation is enabled. '
                                 'To fix this either use a smaller batch size (better for '
                                 'training) or disable truncation (better for validation).')

    if num_epochs is None:
        epochs: Iterable[int] = itertools.count(starting_epoch)
    else:
        epochs = range(starting_epoch, starting_epoch + num_epochs)

    for epoch in epochs:
        if shuffle:
            random.shuffle(instance_list)

        # We create queues for each instance in the batch, and greedily fill them to try and
        # ensure each queue's length is roughly equal in size.
        queues: List[Deque[Instance]] = [deque() for _ in range(self._batch_size)]
        queue_lengths = np.zeros(self._batch_size, dtype=int)
        for instance in instance_list:
            # Now we split the instance into chunks.
            chunks, length = self._split(instance)
            # Next we identify which queue is the shortest and add the chunks to that queue.
            destination = np.argmin(queue_lengths)
            queues[destination].extend(chunks)
            queue_lengths[destination] += length

        # We need a NULL instance to replace the output of an exhausted queue if we are evaluating
        prototype = deepcopy(chunks[-1])
        new_fields: Dict[str, Field] = {}
        for name, field in prototype.fields.items():
            if isinstance(field, MetadataField):
                new_fields[name] = field
            else:
                new_fields[name] = field.empty_field()
        blank_instance = Instance(new_fields)

        for batch in self._generate_batches(queues, blank_instance):
            if self._track_epoch:
                add_epoch_number(batch, epoch)
            if self.vocab is not None:
                batch.index_instances(self.vocab)
            padding_lengths = batch.get_padding_lengths()
            yield batch.as_tensor_dict(padding_lengths), 1

        self._epochs[key] = epoch + 1
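The iterator above greedily assigns each instance's chunks to whichever per-slot queue is currently shortest, so the batch slots stay roughly balanced in total length. A self-contained sketch of just that balancing step, with made-up chunk lengths:

# Greedy "shortest queue first" balancing, as in the loop above, with invented lengths.
from collections import deque
import numpy as np

batch_size = 3
queues = [deque() for _ in range(batch_size)]
queue_lengths = np.zeros(batch_size, dtype=int)
for name, length in [("a", 5), ("b", 2), ("c", 4), ("d", 1)]:
    destination = int(np.argmin(queue_lengths))   # pick the shortest queue so far
    queues[destination].append((name, length))
    queue_lengths[destination] += length
# queue_lengths ends up roughly balanced: array([5, 3, 4]).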
def _read(self, file_path: str):
    file_path = cached_path(file_path)
    logger.info("Reading file at %s", file_path)
    with open(file_path) as dataset_file:
        dataset = json.load(dataset_file)
    span_file = open(self._span_file_path)

    logger.info("Reading the dataset")
    for data, best_span in zip(dataset, span_file):
        answer = data['answers'][0]
        question = data['query']
        well_formed_answer = data['wellFormedAnswers'][0]
        passages_json = data['passages']
        passages = [passages_json[i]['passage_text'] for i in range(len(passages_json))]
        passages_is_selected = [passages_json[i]['is_selected'] for i in range(len(passages_json))]

        normalized_answer = util.normalize_text_msmarco(answer)
        tokenized_answer = self._tokenizer.tokenize(normalized_answer)

        # set question field
        normalized_question = util.normalize_text_msmarco(question)
        tokenized_question = self._tokenizer.tokenize(normalized_question)
        question_field = TextField(tokenized_question, self._token_indexers)
        fields = {'question': question_field}

        # get preprocessed span
        start_idx, end_idx, rouge_score, passage_idx = None, None, None, None
        start_idx, end_idx, passage_idx, rouge_score = best_span.strip().split(' ')
        start_idx, end_idx, passage_idx, rouge_score = \
            int(start_idx), int(end_idx), int(passage_idx), float(rouge_score)

        # skip contexts that have fewer than 4 paragraphs
        if len(passages) < 4:
            continue

        # only train on instances with a rouge score larger than 0.9
        if rouge_score > 0.9:
            # rank passages based on tf-idf score
            passage_features = self._tfidf.fit_transform(passages)
            question_features = self._tfidf.transform([normalized_question])
            distances = pairwise_distances(question_features, passage_features, "cosine").ravel()
            sorted_passages = np.lexsort((passages, distances))

            # choose 4 passages with highest tf-idf score
            selected_passages = []

            ## choose golden passage first
            normalized_passage = util.normalize_text_msmarco(passages[passage_idx])
            tokenized_passage = self._tokenizer.tokenize(normalized_passage)
            passage_field = TextField(tokenized_passage, self._token_indexers)
            selected_passages.append(passage_field)

            ## set span field from golden passage
            span_start_field = IndexField(start_idx, passage_field)
            span_end_field = IndexField(end_idx, passage_field)
            fields['span_start'] = span_start_field
            fields['span_end'] = span_end_field

            ## choose three others with highest tf-idf score
            idx = 0
            while len(selected_passages) < 4:
                if sorted_passages[idx] != passage_idx:
                    normalized_passage = util.normalize_text_msmarco(passages[sorted_passages[idx]])
                    tokenized_passage = self._tokenizer.tokenize(normalized_passage)
                    passage_field = TextField(tokenized_passage, self._token_indexers)
                    selected_passages.append(passage_field)
                idx += 1

            fields['passage'] = ListField(selected_passages)
            yield Instance(fields)
def text_to_instance(self,  # type: ignore
                     sentences: List[List[str]],
                     document_id: str,
                     sentence_id: int,
                     gold_clusters: Optional[List[List[Tuple[int, int]]]] = None,
                     user_threshold: Optional[float] = 0.0) -> Instance:
    # pylint: disable=arguments-differ
    """
    Parameters
    ----------
    sentences : ``List[List[str]]``, required.
        A list of lists representing the tokenised words and sentences in the document.
    document_id : ``str``, required.
        A string representing the document ID.
    sentence_id : ``int``, required.
        An int representing the sentence ID.
    gold_clusters : ``Optional[List[List[Tuple[int, int]]]]``, optional (default = None)
        A list of all clusters in the document, represented as word spans. Each cluster
        contains some number of spans, which can be nested and overlap, but will never
        exactly match between clusters.
    user_threshold : ``Optional[float]``, optional (default = 0.0)
        Approximate fraction of gold labels to hold out as simulated user input,
        e.g. 0.5, 0.33, 0.25, 0.125.

    Returns
    -------
    An ``Instance`` containing the following ``Fields``:
        text : ``TextField``
            The text of the full document.
        spans : ``ListField[SpanField]``
            A ListField containing the spans represented as ``SpanFields``
            with respect to the document text.
        span_labels : ``SequenceLabelField``, optional
            The id of the cluster which each possible span belongs to, or -1 if it does
            not belong to a cluster. As these labels have variable length (it depends on
            how many spans we are considering), we represent this as a
            ``SequenceLabelField`` with respect to the ``spans`` ``ListField``.
    """
    flattened_sentences = [self._normalize_word(word)
                           for sentence in sentences
                           for word in sentence]

    metadata: Dict[str, Any] = {"original_text": flattened_sentences,
                                "ID": document_id + ";" + str(sentence_id)}
    if gold_clusters is not None:
        metadata["clusters"] = gold_clusters
        metadata["num_gold_clusters"] = len(gold_clusters)

    text_field = TextField([Token(word) for word in flattened_sentences], self._token_indexers)

    user_threshold_mod = int(1 / user_threshold) \
        if self._simulate_user_inputs and user_threshold > 0 else 0
    cluster_dict = {}
    simulated_user_cluster_dict = {}

    if gold_clusters is not None:
        for cluster_id, cluster in enumerate(gold_clusters):
            for i in range(len(cluster)):
                # use modulo to have a relatively even distribution of user labels across the
                # length of the document (since clusters are sorted), so user-simulated clusters
                # are spread evenly across the document
                if user_threshold_mod == 0 or i % user_threshold_mod != user_threshold_mod - 1:
                    cluster_dict[tuple(cluster[i])] = cluster_id
                simulated_user_cluster_dict[tuple(cluster[i])] = cluster_id

    # Note simulated_user_cluster_dict encompasses ALL gold labels, including those in cluster_dict
    # Consequently user_labels encompasses all gold labels
    spans: List[Field] = []
    if gold_clusters is not None:
        span_labels: Optional[List[int]] = []
        user_labels: Optional[List[int]] = [] \
            if self._simulate_user_inputs and user_threshold > 0 else None
    else:
        span_labels = user_labels = None

    # our must-link and cannot-link constraints, derived from user labels
    # using gold_clusters being None as an indicator of whether we're running training or not
    # TODO: confirm ^^
    must_link: Optional[List[int]] = [] if gold_clusters is not None else None
    cannot_link: Optional[List[int]] = [] if gold_clusters is not None else None

    sentence_offset = 0
    for sentence in sentences:
        for start, end in enumerate_spans(sentence,
                                          offset=sentence_offset,
                                          max_span_width=self._max_span_width):
            if span_labels is not None:
                if (start, end) in cluster_dict:
                    span_labels.append(cluster_dict[(start, end)])
                else:
                    span_labels.append(-1)
                if self._simulate_user_inputs and user_threshold > 0:
                    if (start, end) in simulated_user_cluster_dict:
                        user_labels.append(simulated_user_cluster_dict[(start, end)])
                    else:
                        user_labels.append(-1)
            spans.append(SpanField(start, end, text_field))
        sentence_offset += len(sentence)

    span_field = ListField(spans)
    metadata_field = MetadataField(metadata)

    fields: Dict[str, Field] = {"text": text_field,
                                "spans": span_field,
                                "metadata": metadata_field}
    if span_labels is not None:
        fields["span_labels"] = SequenceLabelField(span_labels, span_field)
    if user_labels is not None:
        fields["user_labels"] = SequenceLabelField(user_labels, span_field)

    return Instance(fields)
def text_to_instance(self, dialog: Dict, ignore_fact: bool = False):
    msg_texts = []
    msg_senders = []
    msg_likes = []
    msg_acts = []
    msg_act_mask = []
    msg_facts = []
    msg_fact_labels = []
    metadata_fact_labels = []

    if len(dialog['messages']) == 0:
        raise ValueError('There are no dialog messages')

    known_entities = [Token(text='ENTITY/' + t.replace(' ', '_'), idx=idx)
                      for idx, t in enumerate(dialog['known_entities'])]
    if len(known_entities) == 0:
        known_entities.append(Token(text='@@YOUKNOWNOTHING@@', idx=0))
    known_entities_field = TextField(known_entities, self._mention_indexers)

    focus_entity = dialog['focus_entity']
    focus_entity_field = TextField(
        [Token(text='ENTITY/' + focus_entity.replace(' ', '_'), idx=0)],
        self._mention_indexers)

    prev_msg = ''
    for msg in dialog['messages']:
        if True:
            if prev_msg == '':
                cur_message = msg['message']
            else:
                if len(prev_msg) > DIALOG_MAX_LENGTH:
                    prev_msg = ' '.join(prev_msg[-DIALOG_MAX_LENGTH:].split(' ')[1:])
                cur_message = prev_msg + ' ' + msg['message']
            prev_msg = cur_message
        else:
            cur_message = msg['message']

        tokenized_msg = self._tokenizer.tokenize(cur_message)
        msg_texts.append(TextField(tokenized_msg, self._token_indexers))
        msg_senders.append(0 if msg['sender'] == USER else 1)
        msg_likes.append(LabelField('liked' if msg['liked'] else 'not_liked',
                                    label_namespace='like_labels'))

        if msg['dialog_acts'] is None:
            dialog_acts = ['@@NODA@@']
            act_mask = 0
        else:
            dialog_acts = msg['dialog_acts']
            act_mask = 1
        dialog_acts_field = MultiLabelFieldListCompat(dialog_acts,
                                                      label_namespace=DIALOG_ACT_LABELS)
        msg_acts.append(dialog_acts_field)
        msg_act_mask.append(act_mask)

        curr_facts_text = []
        curr_facts_labels = []
        curr_metadata_fact_labels = []
        if msg['sender'] == ASSISTANT:
            for idx, f in enumerate(msg['facts']):
                if ignore_fact:
                    fact_text = 'dummy fact'
                else:
                    fact = self._fact_lookup[f['fid']]
                    fact_text = fact.text
                # TODO: These are already space tokenized
                tokenized_fact = self._tokenizer.tokenize(fact_text)
                # 99% of text length is 77
                tokenized_fact = tokenized_fact[:DIALOG_MAX_LENGTH]
                curr_facts_text.append(TextField(tokenized_fact, self._token_indexers))
                if f['used']:
                    curr_facts_labels.append(idx)
                    curr_metadata_fact_labels.append(idx)
        else:
            # Users don't have facts, but lets avoid divide by zero
            curr_facts_text.append(TextField([Token(text='@@NOFACT@@', idx=0)],
                                             self._token_indexers))

        msg_facts.append(ListField(curr_facts_text))
        # Add in a label if there are no correct indices
        if len(curr_facts_labels) == 0:
            curr_metadata_fact_labels.append(-1)
        n_facts = len(curr_facts_text)
        fact_label_arr = np.zeros(n_facts, dtype=np.float32)
        if len(curr_facts_labels) > 0:
            fact_label_arr[curr_facts_labels] = 1
        msg_fact_labels.append(ArrayField(fact_label_arr, dtype=np.float32))
        metadata_fact_labels.append(curr_metadata_fact_labels)

    return Instance({
        'messages': ListField(msg_texts),
        'facts': ListField(msg_facts),
        'fact_labels': ListField(msg_fact_labels),
        'likes': ListField(msg_likes),
        'dialog_acts': ListField(msg_acts),
        'dialog_acts_mask': to_long_field(msg_act_mask),
        'senders': to_long_field(msg_senders),
        'focus_entity': focus_entity_field,
        'known_entities': known_entities_field,
        'metadata': MetadataField({
            'dialog_id': dialog['dialog_id'],
            'n_message': len(msg_texts),
            'fact_labels': metadata_fact_labels,
            'known_entities': dialog['known_entities'],
            'focus_entity': dialog['focus_entity']
        })
    })
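For each message, the reader above encodes which candidate facts were marked as used as a multi-hot float vector over that message's fact list. A tiny self-contained sketch of that encoding, with invented sizes:

# Multi-hot fact-label vector, as built above, with invented numbers.
import numpy as np

n_facts = 5
used_fact_indices = [1, 3]                       # indices of facts marked as used
fact_label_arr = np.zeros(n_facts, dtype=np.float32)
if used_fact_indices:
    fact_label_arr[used_fact_indices] = 1
# fact_label_arr -> array([0., 1., 0., 1., 0.], dtype=float32)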
def __getitem__(self, index):
    if self.complete_shuffle:
        if self.pretraining_include_qa_and_qar:
            index = index // 8
            which = index % 8
        else:
            index = index // 4
            which = index % 4
    else:
        which = None

    item = deepcopy(self.items[index])

    ###################################################################
    # Load questions and answers
    answer_choices = item['{}_choices'.format(self.mode)]

    if self.complete_shuffle and which < 4:
        only_use_answer = True
    else:
        only_use_answer = False

    if self.complete_shuffle and which >= 4:
        only_use_qar = True
    else:
        only_use_qar = False

    dets2use, old_det_to_new_ind = self._get_dets_to_use(item,
                                                         only_use_answer=only_use_answer,
                                                         only_use_qar=only_use_qar)
    # The only_use_qar is ambiguous...

    instance_dict = {}
    if self.split != 'test':
        instance_dict['label'] = LabelField(item['{}_label'.format(self.mode)], skip_indexing=True)
    instance_dict['metadata'] = MetadataField({'annot_id': item['annot_id'],
                                               'ind': index,
                                               'movie': item['movie'],
                                               'img_fn': item['img_fn'],
                                               'question_number': item['question_number']})

    ###################################################################
    # Load image now and rescale it. Might have to subtract the mean and whatnot here too.
    image = load_image(os.path.join(self.vcr_image_dir, item['img_fn']))
    # image = self.imagedatas(item['img_fn'])
    image, window, img_scale, padding = resize_image(image, random_pad=self.is_train)
    image = to_tensor_and_normalize(image)
    c, h, w = image.shape

    ###################################################################
    # Load boxes.
    with open(os.path.join(self.vcr_image_dir, item['metadata_fn']), 'r') as f:
        metadata = json.load(f)

    # [nobj, 14, 14]
    segms = np.stack([make_mask(mask_size=14,
                                box=metadata['boxes'][i],
                                polygons_list=metadata['segms'][i])
                      for i in dets2use])

    # Chop off the final dimension, that's the confidence
    boxes = np.array(metadata['boxes'])[dets2use, :-1]
    # Possibly rescale them if necessary
    boxes *= img_scale
    boxes[:, :2] += np.array(padding[:2])[None]
    boxes[:, 2:] += np.array(padding[:2])[None]

    obj_labels = [self.coco_obj_to_ind[item['objects'][i]] for i in dets2use.tolist()]
    if self.add_image_as_a_box:
        boxes = np.row_stack((window, boxes))
        segms = np.concatenate((np.ones((1, 14, 14), dtype=np.float32), segms), 0)
        obj_labels = [self.coco_obj_to_ind['__background__']] + obj_labels

    examples = data_iter_item(item,
                              tokenizer=self.tokenizer,
                              max_seq_length=self.max_seq_length,
                              endingonly=False,
                              include_qar=self.pretraining_include_qa_and_qar,
                              only_qar=self.only_qar)
    self.getitem_bert_part(examples, item, instance_dict, which)

    if self.use_alignment:
        # Alignment between objects and text
        ######################
        examples_alginment_pack = []
        for i in range(len(examples)):
            if self.pretraining_include_qa_and_qar:
                if i < 4:
                    raw_text_a = item["question"]
                    raw_text_b = item['answer_choices'][i]
                else:
                    raw_text_a = item["question"] + item['answer_choices'][item['answer_label']]
                    raw_text_b = item['rationale_choices'][i - 4]
            elif self.only_qar:
                # This is the correct alignment right now.
                raw_text_a = item["question"] + item['answer_choices'][item['answer_label']]
                raw_text_b = item['rationale_choices'][i]
            else:
                raw_text_a = item["question"]
                raw_text_b = item['answer_choices'][i]

            true_text_a = examples[i][0].text_a
            true_text_b = examples[i][0].text_b
            text_alignment_a = examples[i][1]
            text_alignment_b = examples[i][2]
            examples_alginment_pack.append((raw_text_a, raw_text_b, true_text_a, true_text_b,
                                            text_alignment_a, text_alignment_b))

        image_box_position = []
        if which is not None:
            raw_text_a, raw_text_b, true_text_a, true_text_b, text_alignment_a, text_alignment_b = \
                examples_alginment_pack[which]
            box_record = defaultdict(list)
            self.get_alignment_original(raw_text_a, text_alignment_a, old_det_to_new_ind,
                                        box_record, offset=1)
            self.get_alignment_original(raw_text_b, text_alignment_b, old_det_to_new_ind,
                                        box_record, offset=1 + len(text_alignment_a) + 1)
            image_text_alignment = ListField([IntArrayField(np.array(box_record[i]), padding_value=-1)
                                              for i in range(len(boxes))])
        else:
            for raw_text_a, raw_text_b, true_text_a, true_text_b, text_alignment_a, text_alignment_b \
                    in examples_alginment_pack:
                box_record = defaultdict(list)
                self.get_alignment_original(raw_text_a, text_alignment_a, old_det_to_new_ind,
                                            box_record, offset=1)
                self.get_alignment_original(raw_text_b, text_alignment_b, old_det_to_new_ind,
                                            box_record, offset=1 + len(text_alignment_a) + 1)
                image_box_position.append(ListField([IntArrayField(np.array(box_record[i]), padding_value=-1)
                                                     for i in range(len(boxes))]))
            image_text_alignment = ListField(image_box_position)
        ######################
        instance_dict["image_text_alignment"] = image_text_alignment

    instance_dict['segms'] = ArrayField(segms, padding_value=0)
    instance_dict['objects'] = ListField([LabelField(x, skip_indexing=True) for x in obj_labels])

    if not np.all((boxes[:, 0] >= 0.) & (boxes[:, 0] < boxes[:, 2])):
        import ipdb
        ipdb.set_trace()
    assert np.all((boxes[:, 1] >= 0.) & (boxes[:, 1] < boxes[:, 3]))
    assert np.all((boxes[:, 2] <= w))
    assert np.all((boxes[:, 3] <= h))
    instance_dict['boxes'] = ArrayField(boxes, padding_value=-1)

    instance = Instance(instance_dict)
    instance.index_fields(self.vocab)
    return image, instance