def test_label_field_can_index_with_vocab(self):
    """Index a string label against a vocabulary and check the resulting tensor.

    "entailment" is the first token added to the "labels" namespace and the
    test expects it to be indexed as 0.
    """
    label_vocab = Vocabulary()
    for token in ("entailment", "contradiction", "neutral"):
        label_vocab.add_token_to_namespace(token, namespace="labels")

    field = LabelField("entailment")
    field.index(label_vocab)

    padding_lengths = field.get_padding_lengths()
    indexed = field.as_tensor(padding_lengths).data.cpu().numpy()
    numpy.testing.assert_array_almost_equal(indexed, numpy.array([0]))
def test_label_field_can_index_with_vocab(self):
    """Index a string label against a vocabulary and check the scalar tensor value.

    "entailment" is the first token added to the "labels" namespace and the
    test expects it to be indexed as 0.
    """
    label_vocab = Vocabulary()
    for token in ("entailment", "contradiction", "neutral"):
        label_vocab.add_token_to_namespace(token, namespace="labels")

    field = LabelField("entailment")
    field.index(label_vocab)

    indexed = field.as_tensor(field.get_padding_lengths())
    assert indexed.item() == 0
def test_label_field_empty_field_works(self):
    """The empty field derived from a label field must carry the label -1."""
    placeholder = LabelField("test").empty_field()
    assert placeholder.label == -1
def test_label_field_raises_with_non_integer_labels_and_no_indexing(self):
    """A string label combined with ``skip_indexing=True`` must be rejected."""
    with pytest.raises(ConfigurationError):
        LabelField("non integer field", skip_indexing=True)
def test_pad_returns_one_hot_array(self):
    """Label 5 out of 10 labels must come back as a one-hot array of length 10."""
    field = LabelField(5, num_labels=10)
    one_hot = field.as_array(field.get_padding_lengths())

    # Only position 5 is set.
    expected = numpy.array([1 if position == 5 else 0 for position in range(10)])
    numpy.testing.assert_array_almost_equal(one_hot, expected)
def text_to_instance(
        self,  # type: ignore
        premises: Union[List[str], List[List[str]]],
        choices: List[str],
        coverage: List[List[float]],
        label: int = None,
        question: str = None) -> Instance:
    """Build an ``Instance`` with a padded coverage array and pairwise premise links.

    Args:
        premises: Either one list of premise sentences (shared by every
            choice) or one list of premise sentences per choice.
        choices: The candidate hypotheses.
        coverage: One entry per choice; each entry holds one sequence of
            coverage values per premise. NOTE(review): the code indexes this
            three levels deep, which is deeper than the annotation suggests.
        label: Optional gold label, stored with ``skip_indexing=True``.
        question: Optional question. It selects which keyword arguments are
            passed to ``self.bert_features_from_qa``.

    Returns:
        An ``Instance`` with the fields ``coverage``, ``links_tokens``,
        ``links_token_type_ids`` and, when ``label`` is given, ``label``.

    Raises:
        ValueError: if ``coverage`` does not have one entry per choice, or a
            coverage entry does not have one row per premise.
    """
    number_of_choices = len(choices)
    if isinstance(premises[0], str):
        # A flat list of sentences is shared by all choices.
        premises = [premises] * number_of_choices

    # create an empty dictionary to store the input
    fields: Dict[str, Field] = {}
    all_links = []
    all_link_token_ids = []

    if len(coverage) != len(choices):
        message = "the dimension of coverage and choices did not match"
        logger.error(message)
        # Raise instead of ``exit(0)``: a zero exit status would report success.
        raise ValueError(message)

    max_len = 0
    max_premises = 0
    for arr, p in zip(coverage, premises):
        if len(arr) != len(p):
            message = "the dimension of coverage and premises did not match"
            logger.error(message)
            raise ValueError(message)
        max_premises = max(max_premises, len(p))
        # ``default=0`` keeps a choice without premises from raising in ``max``.
        max_len = max(max_len, max((len(a) for a in arr), default=0))

    # Zero-pad the ragged coverage values into one dense array.
    np_coverage = np.zeros([len(coverage), max_premises, max_len])
    for c_idx, choice_coverage in enumerate(coverage):
        for p_idx, premise_coverage in enumerate(choice_coverage):
            np_coverage[c_idx, p_idx, 0:len(premise_coverage)] = premise_coverage
    fields['coverage'] = ArrayField(np_coverage)

    for premise, hypothesis in zip(premises, choices):
        if len(premise) >= 2:
            # For every premise sentence, encode it together with each OTHER
            # premise sentence and the hypothesis.
            links_2d = []
            links_segment_2d = []
            for i, anchor in enumerate(premise):
                tokenized_links_field = []
                type_ids_of_links = []
                for j, other in enumerate(premise):
                    if i == j:
                        continue
                    if question is None:
                        pp_tokens, pp_token_type_ids = self.bert_features_from_qa(
                            question=anchor,
                            answer=hypothesis,
                            context=other)
                    else:
                        pp_tokens, pp_token_type_ids = self.bert_features_from_qa(
                            question=question,
                            context2=other,
                            answer=hypothesis,
                            context=anchor)
                    pp_tokens_field = TextField(pp_tokens, self._token_indexers)
                    tokenized_links_field.append(pp_tokens_field)
                    type_ids_of_links.append(
                        SequenceLabelField(pp_token_type_ids, pp_tokens_field))
                links_2d.append(ListField(tokenized_links_field))
                links_segment_2d.append(ListField(type_ids_of_links))
            all_links.append(ListField(links_2d))
            all_link_token_ids.append(ListField(links_segment_2d))
        else:
            # Fewer than two premises means there are no pairs to link. Use a
            # placeholder with the same nesting depth as the regular branch
            # (list of lists of fields). The pairwise loop is skipped entirely
            # here because it would otherwise build an empty ``ListField``.
            empty_tokens_field = TextField([], self._token_indexers)
            empty_type_ids_field = SequenceLabelField([], empty_tokens_field)
            all_links.append(ListField([ListField([empty_tokens_field])]))
            all_link_token_ids.append(
                ListField([ListField([empty_type_ids_field])]))

    if label is not None:
        fields['label'] = LabelField(label, skip_indexing=True)
    fields['links_tokens'] = ListField(all_links)
    fields['links_token_type_ids'] = ListField(all_link_token_ids)
    return Instance(fields)
def make_marginal_drop_instance(question_tokens: List[Token],
                                passage_tokens: List[Token],
                                number_tokens: List[Token],
                                number_indices: List[int],
                                token_indexers: Dict[str, TokenIndexer],
                                passage_text: str,
                                answer_info: Dict[str, Any] = None,
                                additional_metadata: Dict[str, Any] = None) -> Instance:
    """Assemble an ``Instance`` from tokenized question/passage text and optional answers.

    Args:
        question_tokens: Tokens of the question.
        passage_tokens: Tokens of the passage.
        number_tokens: Tokens for the numbers; they back the sequence labels
            of the add/sub expressions.
        number_indices: Passage indices, one ``IndexField`` is built per entry.
        token_indexers: Indexers handed to every ``TextField`` built here.
        passage_text: Stored in the metadata as ``original_passage``.
        answer_info: When truthy, read for the keys ``answer_texts``,
            ``answer_passage_spans``, ``answer_question_spans``,
            ``signs_for_add_sub_expressions`` and ``counts``.
        additional_metadata: Extra entries merged into the metadata last, so
            they override entries with the same key.

    Returns:
        The assembled ``Instance``.
    """
    extra_metadata = additional_metadata or {}

    def char_offsets(tokens):
        # (start, end) character offsets computed from ``idx`` and the text length.
        return [(tok.idx, tok.idx + len(tok.text)) for tok in tokens]

    passage_offsets = char_offsets(passage_tokens)
    question_offsets = char_offsets(question_tokens)

    # Kept in local names because the span/index fields below refer to them.
    passage_field = TextField(passage_tokens, token_indexers)
    question_field = TextField(question_tokens, token_indexers)

    fields: Dict[str, Field] = {}
    fields["passage"] = passage_field
    fields["question"] = question_field
    fields["number_indices"] = ListField(
        [IndexField(index, passage_field) for index in number_indices])

    # Not stored as an instance field. It only serves as the sequence that the
    # add/sub ``SequenceLabelField``s are attached to. The original author
    # notes that ``number_indices`` cannot play this role because that
    # ``ListField`` would not be empty when an empty field has to be created.
    numbers_in_passage_field = TextField(number_tokens, token_indexers)

    metadata = {
        "original_passage": passage_text,
        "passage_token_offsets": passage_offsets,
        "question_token_offsets": question_offsets,
        "question_tokens": [tok.text for tok in question_tokens],
        "passage_tokens": [tok.text for tok in passage_tokens],
        "number_tokens": [tok.text for tok in number_tokens],
        "number_indices": number_indices,
    }

    if answer_info:
        metadata["answer_texts"] = answer_info["answer_texts"]

        # Each answer list falls back to a single placeholder field when empty.
        passage_span_fields: List[Field] = [
            SpanField(span[0], span[1], passage_field)
            for span in answer_info["answer_passage_spans"]
        ] or [SpanField(-1, -1, passage_field)]
        fields["answer_as_passage_spans"] = ListField(passage_span_fields)

        question_span_fields: List[Field] = [
            SpanField(span[0], span[1], question_field)
            for span in answer_info["answer_question_spans"]
        ] or [SpanField(-1, -1, question_field)]
        fields["answer_as_question_spans"] = ListField(question_span_fields)

        add_sub_signs_field: List[Field] = [
            SequenceLabelField(signs, numbers_in_passage_field)
            for signs in answer_info["signs_for_add_sub_expressions"]
        ] or [SequenceLabelField([0] * len(number_tokens), numbers_in_passage_field)]
        fields["answer_as_add_sub_expressions"] = ListField(add_sub_signs_field)

        count_fields: List[Field] = [
            LabelField(count_label, skip_indexing=True)
            for count_label in answer_info["counts"]
        ] or [LabelField(-1, skip_indexing=True)]
        fields["answer_as_counts"] = ListField(count_fields)

    metadata.update(extra_metadata)
    fields["metadata"] = MetadataField(metadata)
    return Instance(fields)
def __getitem__(self, index):
    """Load item ``index`` and return it as an indexed ``Instance``.

    The item is parsed from ``self.items[index]`` (JSON). Precomputed arrays
    are read from three HDF5 files (``self.h5fn_answer``,
    ``self.h5fn_rationale``, ``self.h5fn_image``) and box metadata from a JSON
    file under ``VCR_IMAGES_DIR``. The instance is indexed with ``self.vocab``
    before it is returned.

    NOTE(review): the block structure below was reconstructed from a
    whitespace-collapsed source; confirm the ``with``/``if`` nesting against
    the original file.
    """
    item = json.loads(self.items[index])
    instance_dict = {}
    dets2use, old_det_to_new_ind = self._get_dets_to_use(item)
    vcr_tokenizer = VCRTokenizer(old_det_to_new_ind, item['objects'], self.add_image_as_a_box)

    # ------------------------- Q2A data processing -------------------------
    with h5py.File(self.h5fn_answer, 'r') as h5:
        # Every dataset of this item's group is copied into a float16 array.
        grp_items_answer = {k: np.array(v, dtype=np.float16) for k, v in h5[str(index)].items()}
    # Original author's note: arrays are (n, 768); keys seen in the answer file:
    #   answer_answer0..3, ctx_answer0..3
    # and in the rationale file:
    #   answer_rationale0..3, ctx_rationale0..3
    if 'endingonly' not in self.embs_to_load:
        # The question is paired with each of the four ``ctx_answer{i}`` arrays.
        questions_answer_tokenized, question_answer_tags = zip(*[vcr_tokenizer(
            item['question'],
            grp_items_answer[f'ctx_answer{i}']
        ) for i in range(4)])
        instance_dict['question_answer'] = ListField(list(questions_answer_tokenized))
        instance_dict['question_answer_tags'] = ListField(list(question_answer_tags))

    answers_tokenized, answer_tags = zip(*[vcr_tokenizer(
        answer,
        grp_items_answer[f'answer_answer{i}']
    ) for i, answer in enumerate(item['answer_choices'])])
    instance_dict['answers'] = ListField(list(answers_tokenized))
    instance_dict['answer_tags'] = ListField(list(answer_tags))

    # ------------------------- QA2R data processing ------------------------
    with h5py.File(self.h5fn_rationale, 'r') as h5_rationale:
        grp_items_rationale = {k: np.array(v, dtype=np.float16) for k, v in h5_rationale[str(index)].items()}
    # On the test split the keys carry ``self.conditioned_answer_choice`` as an
    # infix and that value also selects the answer; otherwise the gold
    # ``answer_label`` is used and the infix is empty.
    condition_key = self.conditioned_answer_choice if self.split == "test" else ""
    conditioned_label = item['answer_label'] if self.split != 'test' else self.conditioned_answer_choice
    # The rationale "question" is the question followed by the selected answer.
    question_rationale = item['question'] + item['answer_choices'][conditioned_label]
    if 'endingonly' not in self.embs_to_load:
        questions_rationale_tokenized, question_rationale_tags = zip(*[vcr_tokenizer(
            question_rationale,
            grp_items_rationale[f'ctx_rationale{condition_key}{i}']
        ) for i in range(4)])
        instance_dict['question_rationale'] = ListField(list(questions_rationale_tokenized))
        instance_dict['question_rationale_tags'] = ListField(list(question_rationale_tags))

    rationale_tokenized, rationale_tags = zip(*[vcr_tokenizer(
        rationale,
        grp_items_rationale[f'answer_rationale{condition_key}{i}']
    ) for i, rationale in enumerate(item['rationale_choices'])])
    instance_dict['rationales'] = ListField(list(rationale_tokenized))
    instance_dict['rationale_tags'] = ListField(list(rationale_tags))

    # ------------------------- labels / metadata ---------------------------
    # Gold labels are only attached outside the test split.
    if self.split != 'test':
        instance_dict['answer_label'] = LabelField(item['answer_label'], skip_indexing=True)
        instance_dict['rationale_label'] = LabelField(item['rationale_label'], skip_indexing=True)
    # instance_dict['metadata'] = MetadataField({'annot_id': item['annot_id'], 'ind': index, 'movie': item['movie'],
    #                                            'img_fn': item['img_fn'],
    #                                            'question_number': item['question_number']})

    # ------------------------- image processing ----------------------------
    with h5py.File(self.h5fn_image, 'r') as h5_features:
        # Original author's note: "pytoch1.1".
        # The HDF5 group name is the part of ``img_id`` after the last '-'.
        img_id = item['img_id'].split('-')[-1]
        group_image = {k: np.array(v) for k, v in h5_features[img_id].items()}
    # Row 0 plus the rows of the selected detections (shifted by one).
    # NOTE(review): row 0 is presumably the whole-image feature - confirm.
    image_feature = group_image['features'][[0]+(dets2use+1).tolist()]
    # NOTE(review): ``tag_boxes`` is assigned but not used in this method.
    tag_boxes = group_image['boxes']
    # A zero row is always prepended; without ``add_image_as_a_box`` it
    # replaces row 0 instead of being added in front of it.
    zeros = np.zeros((1,2048), dtype=np.float32)
    if self.add_image_as_a_box:
        image_feature = np.concatenate((zeros, image_feature), axis=0)
    else:
        image_feature = np.concatenate((zeros, image_feature[1:]), axis=0)
    instance_dict['image_features'] = ArrayField(image_feature, padding_value=0)

    ###################################################################
    # Load boxes.
    with open(os.path.join(VCR_IMAGES_DIR, item['metadata_fn']), 'r') as f:
        metadata = json.load(f)
    # Chop off the final dimension, that's the confidence
    boxes = np.array(metadata['boxes'])[dets2use, :-1]
    obj_labels = [self.coco_obj_to_ind[item['objects'][i]] for i in dets2use.tolist()]
    if self.add_image_as_a_box:
        boxes = np.row_stack((boxes[0], boxes))
        obj_labels = [self.coco_obj_to_ind['__background__']] + obj_labels
    # Original author's note: "the first object is 0".
    # One more row (a copy of the current first box) and the label 81 are
    # prepended unconditionally, which keeps the number of boxes equal to the
    # number of feature rows built above.
    # NOTE(review): the meaning of label 81 is not visible here - confirm.
    boxes = np.row_stack((boxes[0], boxes))
    obj_labels = [81] + obj_labels
    instance_dict['boxes'] = ArrayField(boxes, padding_value=-1)
    instance_dict['objects'] = ListField([LabelField(x, skip_indexing=True) for x in obj_labels])
    # Sanity check on columns 0 and 2 of every box.
    # NOTE(review): presumably x1 >= 0 and x1 < x2 - confirm the column layout.
    assert np.all((boxes[:, 0] >= 0.) & (boxes[:, 0] < boxes[:, 2]))

    instance = Instance(instance_dict)
    instance.index_fields(self.vocab)
    return instance
def get_abstract_slot_value_field(slot_name, get_abstracted_value):
    """Return a ``LabelField`` holding the abstracted value of one question slot.

    The slot value is read from ``question_label["questionSlots"]`` (a name
    from the surrounding scope) and passed through ``get_abstracted_value``.
    The label namespace is derived from the slot name prefixed with "abst-".
    """
    label_namespace = get_slot_label_namespace("abst-%s" % slot_name)
    raw_slot_value = question_label["questionSlots"][slot_name]
    return LabelField(
        label=get_abstracted_value(raw_slot_value),
        label_namespace=label_namespace,
    )
def test_instances_must_have_homogeneous_fields(self):
    """Batching instances whose field names differ must raise ``ConfigurationError``."""
    tagged = Instance({"tag": LabelField(1, skip_indexing=True)})
    worded = Instance({"words": TextField([Token("hello")], {})})
    with pytest.raises(ConfigurationError):
        Batch([tagged, worded])
def text_to_instance(
        self,  # type: ignore
        premise: List[Tuple[str, float]],  # Important type information
        hypothesis: str,
        pid: str = None,
        label: str = None) -> Instance:
    """Encode scored premise sentences and a hypothesis as one paired sequence.

    The sequence layout is ``[CLS] premise... [SEP] hypothesis... [SEP]``.

    Args:
        premise: ``(sentence, score)`` pairs. Every token of a sentence
            receives that sentence's score in ``premise_probs``. The list is
            not modified, even when ``self.shuffle_sentences`` is set.
        hypothesis: The hypothesis text.
        pid: Optional id; added as an ``IdField`` only when truthy.
        label: Optional label; added as a ``LabelField`` only when truthy.

    Returns:
        An ``Instance`` with the paired ids, the token type ids, the
        premise/hypothesis spans (end exclusive) and the per-token scores.
    """
    fields: Dict[str, Field] = {}

    if self.shuffle_sentences:
        # Shuffle a copy so the caller's list keeps its order.
        premise = list(premise)
        random.shuffle(premise)

    premise_prob_list = []
    premise_tokens_list = []
    for premise_sent, prob in premise:
        tokenized_cur_sent = self.bert_servant.tokenize(
            premise_sent, modify_from_corenlp=True)
        if self.max_l is not None:
            # Truncate every sentence individually.
            tokenized_cur_sent = tokenized_cur_sent[:self.max_l]
        premise_tokens_list.extend(tokenized_cur_sent)
        prob_value = np.ones(
            (len(tokenized_cur_sent), 1), dtype=np.float32) * prob
        premise_prob_list.append(prob_value)

    if premise_prob_list:
        premise_prob = np.concatenate(premise_prob_list, axis=0)
    else:
        # ``np.concatenate`` rejects an empty list; an empty premise yields
        # an empty score column instead of an exception.
        premise_prob = np.zeros((0, 1), dtype=np.float32)

    hypothesis_tokens_list = self.bert_servant.tokenize(
        hypothesis, modify_from_corenlp=True)
    if self.max_l is not None:
        hypothesis_tokens_list = hypothesis_tokens_list[:self.max_l]
    hypothesis_prob = np.ones((len(hypothesis_tokens_list), 1),
                              dtype=np.float32)

    assert len(premise_tokens_list) == len(premise_prob)
    assert len(hypothesis_tokens_list) == len(hypothesis_prob)

    paired_tokens_sequence = (['[CLS]'] + premise_tokens_list + ['[SEP]']
                              + hypothesis_tokens_list + ['[SEP]'])
    # Segment 0 covers [CLS], the premise and the first [SEP]; segment 1
    # covers the hypothesis and the final [SEP].
    token_type_ids = ([0] * (2 + len(premise_tokens_list))
                      + [1] * (1 + len(hypothesis_tokens_list)))

    paired_ids_seq = self.bert_servant.tokens_to_ids(paired_tokens_sequence)
    assert len(paired_ids_seq) == len(token_type_ids)

    fields['paired_sequence'] = BertIndexField(
        np.asarray(paired_ids_seq, dtype=np.int64))
    fields['paired_token_type_ids'] = BertIndexField(
        np.asarray(token_type_ids, dtype=np.int64))

    # End is exclusive (important for later use)
    premise_span = (1, 1 + len(premise_tokens_list))
    hypothesis_span = (premise_span[1] + 1,
                       premise_span[1] + 1 + len(hypothesis_tokens_list))

    assert len(paired_ids_seq) == 1 + (premise_span[1] - premise_span[0]) + 1 + \
        (hypothesis_span[1] - hypothesis_span[0]) + 1

    fields['bert_premise_span'] = MetadataField(premise_span)
    fields['bert_hypothesis_span'] = MetadataField(hypothesis_span)

    fields['premise_probs'] = MetadataField(premise_prob)
    fields['hypothesis_probs'] = MetadataField(hypothesis_prob)

    if label:
        fields['label'] = LabelField(label, label_namespace='labels')

    if pid:
        fields['pid'] = IdField(pid)

    return Instance(fields)
def text_to_instance(self,
                     content: str,
                     candidates: List[str],
                     ground_truths: List[str] = None,
                     real_count: int = 1) -> Iterable[Instance]:
    """Yield one cloze ``Instance`` per ``#idiom#`` blank in ``content``.

    For blank number ``index`` the text is rebuilt with ``[MASK]`` at that
    blank and ``[UNK]`` at every other blank, then cut to a window of at most
    ``self.max_seq_length - 2`` tokens around the mask.

    Args:
        content: Text containing ``real_count`` occurrences of ``#idiom#``.
        candidates: One collection of candidate idioms per blank; every
            candidate must be present in ``self.idiom_list``.
        ground_truths: Optional correct idiom per blank. When truthy, an
            ``answer`` field (position of the truth among that blank's
            candidates) is added.
        real_count: Expected number of blanks.

    Yields:
        One ``Instance`` per blank.
    """
    splits = re.split(r'#idiom#', content)
    assert real_count + 1 == len(splits)
    assert real_count == len(candidates)

    split_tokens = [self.tokenizer.tokenize(item) for item in splits]

    for index, current_candidates in enumerate(candidates):
        # Everything before the current blank; earlier blanks become [UNK].
        before_part_tokens = [Token(token) for token in split_tokens[0]]
        for before_part in split_tokens[1:index + 1]:
            before_part_tokens += [Token('[UNK]')] + [
                Token(token) for token in before_part
            ]

        # Everything after the current blank; later blanks become [UNK].
        after_part_tokens = [
            Token(token) for token in split_tokens[index + 1]
        ]
        for after_part in split_tokens[index + 2:]:
            after_part_tokens += [Token('[UNK]')] + [
                Token(token) for token in after_part
            ]

        # Mark the current blank with [MASK].
        content_tokens = (before_part_tokens + [Token('[MASK]')]
                          + after_part_tokens)

        # Keep at most max_seq_length - 2 tokens around the blank.
        half_length = self.max_seq_length // 2
        if len(before_part_tokens) < half_length:
            start = 0
            end = min(
                len(before_part_tokens) + 1 + len(after_part_tokens),
                self.max_seq_length - 2)
        elif len(after_part_tokens) < half_length:
            end = len(before_part_tokens) + 1 + len(after_part_tokens)
            start = max(0, end - (self.max_seq_length - 2))
        else:
            start = len(before_part_tokens) + 3 - half_length
            end = len(before_part_tokens) + 1 + half_length

        content_tokens = content_tokens[start:end]

        # The cloze text.
        content_field = TextField(content_tokens, self.content_token_indexer)
        # Position of the blank inside the window.
        blank_index = content_tokens.index(Token("[MASK]"))
        blank_index_field = IndexField(blank_index, content_field)

        # Candidate idioms as positions in the idiom list. ``np.int64`` is
        # used because the ``np.long`` alias does not exist in every NumPy
        # release.
        candidate_tokens = np.array(
            [self.idiom_list.index(option) for option in current_candidates],
            dtype=np.int64)
        candidate_field = ArrayField(candidate_tokens, dtype=np.int64)

        fields = {
            "content": content_field,
            "blank_indices": blank_index_field,
            "candidates": candidate_field,
        }

        # Meta information: the readable text with the mask in place.
        meta = {
            "content": '[UNK]'.join(splits[:index + 1]) + "[MASK]"
                       + '[UNK]'.join(splits[index + 1:]),
            "candidates": current_candidates,
        }

        if ground_truths:
            label = current_candidates.index(ground_truths[index])
            fields["answer"] = LabelField(label, skip_indexing=True)
            meta["answer"] = ground_truths[index]

        fields["meta"] = MetadataField(meta)
        yield Instance(fields)
def text_to_instance(
        self,
        annotation_id: str,
        documents: Dict[str, List[str]],
        rationales: Dict[str, List[Tuple[int, int]]],
        query: str = None,
        label: str = None,
) -> Instance:  # type: ignore
    # pylint: disable=arguments-differ
    """Concatenate documents (and an optional query) into one labelled sequence.

    Every document is followed by a ``[SEP]`` token. When a query is given
    and is not a list, its whitespace-split words plus one more ``[SEP]`` are
    appended.

    Args:
        annotation_id: Stored in the metadata.
        documents: Maps a document id to its list of words.
        rationales: Maps a document id to ``(start, end)`` word ranges (end
            exclusive) that are marked as evidence.
        query: Optional query string; ignored when it is ``None`` or a list.
        label: Optional label, added under the ``labels`` namespace.

    Returns:
        An ``Instance`` with the fields ``document``, ``rationale``,
        ``kept_tokens``, ``metadata`` and, when given, ``label``.
    """
    fields = {}

    tokens = []
    is_evidence = []
    document_to_span_map = {}

    for docid, docwords in documents.items():
        tokens += [Token(word) for word in docwords]
        # Token range of this document inside ``tokens`` (end exclusive,
        # recorded before the separator is appended).
        document_to_span_map[docid] = (len(tokens) - len(docwords), len(tokens))
        tokens.append(Token("[SEP]"))

        rationale = [0] * len(docwords)
        if docid in rationales:
            for s, e in rationales[docid]:
                for i in range(s, e):
                    rationale[i] = 1

        # The separator itself is always labelled 1.
        is_evidence += rationale + [1]

    # ``isinstance`` instead of comparing ``type(...)`` so list subclasses are
    # treated as lists as well.
    if query is not None and not isinstance(query, list):
        query_words = query.split()
        tokens += [Token(word) for word in query_words]
        tokens.append(Token("[SEP]"))
        # Query words and their separator are all labelled 1.
        is_evidence += [1] * (len(query_words) + 1)

    # 1 exactly for the [SEP] tokens (compared case-insensitively).
    always_keep_mask = [
        1 if t.text.upper() == "[SEP]" else 0 for t in tokens
    ]

    fields["document"] = TextField(tokens, self._token_indexers)
    fields["rationale"] = SequenceLabelField(
        is_evidence,
        sequence_field=fields["document"],
        label_namespace="evidence_labels")
    fields["kept_tokens"] = SequenceLabelField(
        always_keep_mask,
        sequence_field=fields["document"],
        label_namespace="kept_token_labels")

    metadata = {
        "annotation_id": annotation_id,
        "tokens": tokens,
        "document_to_span_map": document_to_span_map,
        "convert_tokens_to_instance": self.convert_tokens_to_instance,
    }
    fields["metadata"] = MetadataField(metadata)

    if label is not None:
        fields["label"] = LabelField(label, label_namespace="labels")

    return Instance(fields)
def text_to_instance(
        self,
        tokens: List[Token],
        spans: List[Tuple[str, Tuple[int, int]]],  # end ind is exclusive
        span_pairs: List[Tuple[str, str]],
        span_weights: List[float],
        task: str = None,
        span_labels: List[str] = None,
        span_pair_labels: List[str] = None,
        e2e: bool = False,
        **kwargs) -> Instance:
    """Turn tokens, identified spans and span pairs into an ``Instance``.

    Args:
        tokens: The sentence tokens.
        spans: ``(span_id, (start, end))`` entries with an exclusive end.
        span_pairs: Pairs of span ids.
        span_weights: One weight per span, in the order of ``spans``.
        task: Task name; ``self._default_task`` is used when ``None``.
        span_labels: Optional label per span, in the order of ``spans``.
        span_pair_labels: Optional label per span pair.
        e2e: Stored in the metadata.
        **kwargs: Merged into the metadata. When ``brat_doc`` is present, its
            ``build_cluster(inclusive=True)`` result is stored as ``clusters``.

    Returns:
        The assembled ``Instance``.
    """
    text_field = TextField(tokens, token_indexers=self._token_indexers)

    # Order spans by (end, start). The original author explains that spans
    # may have to be dropped during the forward pass because of length limits
    # (e.g. BERT), and the dropped ones should sit at the end of the list.
    order = sorted(range(len(spans)),
                   key=lambda k: (spans[k][1][1], spans[k][1][0]))
    spans = [spans[k] for k in order]
    sid2ind = {entry[0]: position for position, entry in enumerate(spans)}

    # SpanField takes the end index directly, hence the ``- 1`` on the
    # exclusive end.
    span_field = ListField([
        SpanField(start, stop - 1, text_field, check_sentence=False)
        for _, (start, stop) in spans
    ])

    if task is None:
        task = self._default_task
    task_field = LabelField(task, label_namespace='task_labels')

    if len(span_pairs) > 0:
        pair_indices = [[sid2ind[left], sid2ind[right]]
                        for left, right in span_pairs]
    else:
        # use a padding sample as placeholder
        pair_indices = [[-1, -1]]
    span_pair_field = ListField([
        ArrayField(np.array(pair, dtype=np.int64),
                   padding_value=-1,  # the same as span field
                   dtype=np.int64)
        for pair in pair_indices
    ])

    assert len(spans) == len(
        span_weights), 'input and weights length inconsistent'
    # Reorder to stay consistent with the sorted spans.
    span_weights = [span_weights[k] for k in order]
    span_weights_field = ArrayField(
        np.array(span_weights, dtype=np.float32),
        padding_value=0,
        dtype=np.float32)

    fields: Dict[str, Field] = {
        'text': text_field,
        'spans': span_field,
        'task_labels': task_field,
        'span_pairs': span_pair_field,
        'span_weights': span_weights_field
    }

    if span_labels is not None:
        # TODO debug (consti label transformation): the original carried a
        # disabled experiment that, for task 'consti', mapped every span label
        # other than 'NP' and BratDoc.NEG_SPAN_LABEL to 'S'.
        assert len(spans) == len(
            span_labels), 'input and label length inconsistent'
        # Reorder to stay consistent with the sorted spans.
        span_labels = [span_labels[k] for k in order]
        fields['span_labels'] = SequenceLabelField(
            span_labels,
            span_field,
            label_namespace='{}_span_labels'.format(task))

    if span_pair_labels is not None:
        if len(span_pairs) > 0:
            assert len(span_pairs) == len(
                span_pair_labels), 'input and label length inconsistent'
            pair_labels = span_pair_labels
        else:
            # Matches the single placeholder pair built above.
            pair_labels = [self.PADDING_LABEL]
        fields['span_pair_labels'] = SequenceLabelField(
            pair_labels,
            span_pair_field,
            label_namespace='{}_span_pair_labels'.format(task))

    # add meta field
    # e2e is used in forward to decide whether to use end2end training/testing
    metadata_dict: Dict[str, Any] = {
        'task': task,
        'e2e': e2e,
        'max_span_width': self._max_span_width[task]
    }
    if 'brat_doc' in kwargs:
        metadata_dict['clusters'] = kwargs['brat_doc'].build_cluster(
            inclusive=True)
    metadata_dict.update(kwargs)
    fields['metadata'] = MetadataField(metadata_dict)

    return Instance(fields)
def data_to_instance(self, words: List[str], labels: List[str], sentence_boundaries: List[int], doc_index: str):
    """Yield instances enumerating candidate entity spans, sentence by sentence.

    For every sentence a window of subwords (the sentence plus surrounding
    context, bounded by ``self.max_num_subwords``) is built. Every span inside
    the sentence that starts and ends on a word boundary and is at most
    ``self.max_mention_length`` subwords long becomes a candidate, labelled
    with the entity tag decoded from ``labels`` or ``NON_ENTITY``. Candidates
    are split over several instances according to ``self.max_entity_length``.

    Args:
        words: Words of the whole document.
        labels: One tag per word, decoded with the ``IOB1`` scheme.
        sentence_boundaries: Word indices delimiting sentences; consecutive
            pairs are used as ``[start, end)``.
        doc_index: Stored in the ``doc_id`` metadata field.

    Yields:
        ``Instance`` objects.

    Raises:
        AssertionError: if a sentence is not shorter than
            ``self.max_num_subwords``, or if an entity span was never matched
            by a candidate (checked after the last sentence).
    """
    # Without a tokenizer every word is a single "subword".
    if self.tokenizer is None:
        tokens = [[Token(w)] for w in words]
    else:
        tokens = [self.tokenizer.tokenize(w) for w in words]

    # Flat subword list plus mappings in both directions.
    subwords = [sw for token in tokens for sw in token]
    # subword index -> index of the word it belongs to
    subword2token = list(itertools.chain(*[[i] * len(token) for i, token in enumerate(tokens)]))

    # word index -> index of its first subword (one extra entry at the end)
    token2subword = [0] + list(itertools.accumulate(len(token) for token in tokens))
    # Subword positions at which a word starts (or the document ends).
    subword_start_positions = frozenset(token2subword)
    # Sentence boundaries expressed in subword positions.
    subword_sentence_boundaries = [sum(len(token) for token in tokens[:p]) for p in sentence_boundaries]

    # extract entities from IOB tags
    # we need to pass sentence by sentence
    entities: List[Entity] = []
    for s, e in zip(sentence_boundaries[:-1], sentence_boundaries[1:]):
        for ent in Entities([labels[s:e]], scheme=IOB1).entities[0]:
            # Shift sentence-local positions to document-level word positions.
            ent.start += s
            ent.end += s
            entities.append(ent)

    # (subword start, subword end) -> entity tag
    span_to_entity_label: Dict[Tuple[int, int], str] = dict()
    for ent in entities:
        subword_start = token2subword[ent.start]
        subword_end = token2subword[ent.end]
        span_to_entity_label[(subword_start, subword_end)] = ent.tag

    # split data according to sentence boundaries
    for n in range(len(subword_sentence_boundaries) - 1):
        # process (sub) words
        doc_sent_start, doc_sent_end = subword_sentence_boundaries[n:n + 2]
        assert doc_sent_end - doc_sent_start < self.max_num_subwords

        left_length = doc_sent_start
        right_length = len(subwords) - doc_sent_end
        sentence_length = doc_sent_end - doc_sent_start
        half_context_length = int((self.max_num_subwords - sentence_length) / 2)

        # The side with less available text gets at most half of the context
        # budget; the other side may use whatever remains.
        if left_length < right_length:
            left_context_length = min(left_length, half_context_length)
            right_context_length = min(right_length, self.max_num_subwords - left_context_length - sentence_length)
        else:
            right_context_length = min(right_length, half_context_length)
            left_context_length = min(left_length, self.max_num_subwords - right_context_length - sentence_length)

        # Document position of the first subword in the window.
        doc_offset = doc_sent_start - left_context_length
        word_ids = subwords[doc_offset:doc_sent_end + right_context_length]

        if isinstance(self.tokenizer, PretrainedTransformerTokenizer):
            word_ids = self.tokenizer.add_special_tokens(word_ids)

        # process entities
        entity_start_positions = []
        entity_end_positions = []
        entity_ids = []
        entity_position_ids = []
        original_entity_spans = []
        # NOTE(review): this rebinds the ``labels`` parameter; it is safe only
        # because the parameter is fully consumed before this loop starts.
        labels = []

        # Window-relative start positions, restricted to the sentence itself.
        for entity_start in range(left_context_length, left_context_length + sentence_length):
            doc_entity_start = entity_start + doc_offset
            if doc_entity_start not in subword_start_positions:
                continue
            for entity_end in range(entity_start + 1, left_context_length + sentence_length + 1):
                doc_entity_end = entity_end + doc_offset
                if doc_entity_end not in subword_start_positions:
                    continue

                if entity_end - entity_start > self.max_mention_length:
                    continue

                # NOTE(review): the ``+ 1`` presumably accounts for a leading
                # special token - confirm.
                entity_start_positions.append(entity_start + 1)
                entity_end_positions.append(entity_end)
                entity_ids.append(self.entity_id)

                # Positions of the span's subwords, right-padded with -1 up to
                # ``max_mention_length`` entries.
                position_ids = list(range(entity_start + 1, entity_end + 1))
                position_ids += [-1] * (self.max_mention_length - entity_end + entity_start)
                entity_position_ids.append(position_ids)

                # Span in word indices, end exclusive.
                original_entity_spans.append(
                    (subword2token[doc_entity_start], subword2token[doc_entity_end - 1] + 1))

                # ``pop`` removes matched entities so the final assert can
                # detect entities that were never enumerated.
                labels.append(span_to_entity_label.pop((doc_entity_start, doc_entity_end), NON_ENTITY))

        # split instances
        split_size = math.ceil(len(entity_ids) / self.max_entity_length)
        for i in range(split_size):
            # Spread the candidates evenly over the ``split_size`` instances.
            entity_size = math.ceil(len(entity_ids) / split_size)
            start = i * entity_size
            end = start + entity_size
            fields = {
                "word_ids": TextField(word_ids, token_indexers=self.token_indexers),
                "entity_start_positions": TensorField(np.array(entity_start_positions[start:end])),
                "entity_end_positions": TensorField(np.array(entity_end_positions[start:end])),
                "original_entity_spans": TensorField(np.array(original_entity_spans[start:end]), padding_value=-1),
                "labels": ListField([LabelField(l) for l in labels[start:end]]),
                "doc_id": MetadataField(doc_index),
                "input_words": MetadataField(words),
            }

            if self.use_entity_feature:
                fields.update({
                    "entity_ids": TensorField(np.array(entity_ids[start:end]), padding_value=0),
                    "entity_position_ids": TensorField(np.array(entity_position_ids[start:end])),
                })

            yield Instance(fields)

    # Every decoded entity must have been matched by some candidate span.
    assert len(span_to_entity_label) == 0
def get_clause_slot_field(slot_name: str, slot_value: str):
    """Return a ``LabelField`` for a clause slot value.

    The label namespace is derived from the slot name prefixed with "clause-".
    """
    return LabelField(
        label=slot_value,
        label_namespace=get_slot_label_namespace("clause-%s" % slot_name),
    )
def get_num_answers_field(question_label):
    """Return a non-indexed ``LabelField`` holding the number of answer judgments."""
    judgments = question_label["answerJudgments"]
    return LabelField(label=len(judgments), skip_indexing=True)
def predictions_to_labeled_instances(
        self, instance: Instance,
        outputs: Dict[str, numpy.ndarray]) -> List[Instance]:
    """Copy ``instance`` and attach label fields derived from model ``outputs``.

    Two output layouts are handled: a ``best_span`` entry (BiDAF), or an
    ``answer`` entry whose ``answer_type`` is one of ``count``,
    ``passage_span``, ``arithmetic`` or ``question_span`` (NAQANet). Any other
    output leaves the copy unchanged.

    Returns:
        A one-element list with the labelled copy of ``instance``.
    """

    def locate_word_span(char_span, token_offsets):
        # Convert character span indices into word span indices. Later
        # matches overwrite earlier ones; an unmatched side stays ``None``.
        first_word = None
        last_word = None
        for position, offset in enumerate(token_offsets):
            if offset[0] == char_span[0]:
                first_word = position
            if offset[1] == char_span[1]:
                last_word = position
        return first_word, last_word

    labeled = deepcopy(instance)

    # For BiDAF
    if "best_span" in outputs:
        start_label = outputs["best_span"][0]
        end_label = outputs["best_span"][1]
        passage: SequenceField = labeled["passage"]  # type: ignore
        labeled.add_field("span_start", IndexField(int(start_label), passage))
        labeled.add_field("span_end", IndexField(int(end_label), passage))
        return [labeled]

    if "answer" not in outputs:
        return [labeled]

    # For the NAQANet model. It has the fields answer_as_passage_spans,
    # answer_as_question_spans, answer_as_add_sub_expressions and
    # answer_as_counts; one of them is filled in per answer type.
    answer = outputs["answer"]
    answer_type = answer["answer_type"]

    if answer_type == "count":
        # Counting problem.
        counts = ListField(
            [LabelField(int(answer["count"]), skip_indexing=True)])
        labeled.add_field("answer_as_counts", counts)

    elif answer_type == "passage_span":
        # The answer is in the passage.
        # TODO(mattg): Currently we only handle one predicted span.
        char_span = answer["spans"][0]
        passage_offsets = labeled["metadata"].metadata[  # type: ignore
            "passage_token_offsets"]
        word_start, word_end = locate_word_span(char_span, passage_offsets)
        passage_text: SequenceField = labeled["passage"]  # type: ignore
        labeled.add_field(
            "answer_as_passage_spans",
            ListField([SpanField(word_start, word_end, passage_text)]))

    elif answer_type == "arithmetic":
        # The answer is an arithmetic calculation over the passage numbers.
        predicted_numbers = answer["numbers"]
        numbers_field: ListField = instance["number_indices"]  # type: ignore

        # Each number gets a sign label. A sign of -1 is stored as class 2;
        # other signs are stored unchanged.
        sign_labels = [
            2 if number["sign"] == -1 else number["sign"]
            for number in predicted_numbers
        ]
        # There's a dummy number added in the dataset reader to handle
        # passages with no numbers; it has a label of 0 (not included).
        sign_labels.append(0)

        labeled.add_field(
            "answer_as_add_sub_expressions",
            ListField([SequenceLabelField(sign_labels, numbers_field)]))

    elif answer_type == "question_span":
        # The answer is in the question.
        char_span = answer["spans"][0]
        question_offsets = labeled["metadata"].metadata[  # type: ignore
            "question_token_offsets"]
        word_start, word_end = locate_word_span(char_span, question_offsets)
        question_text: SequenceField = labeled["question"]  # type: ignore
        labeled.add_field(
            "answer_as_question_spans",
            ListField([SpanField(word_start, word_end, question_text)]))

    return [labeled]
def get_num_valids_field(question_label):
    """Return a non-indexed ``LabelField`` counting judgments whose ``isValid`` is truthy."""
    valid_count = sum(
        1 for judgment in question_label["answerJudgments"] if judgment["isValid"])
    return LabelField(label=valid_count, skip_indexing=True)
def text_to_instance(self, index: int) -> Instance:  # type: ignore
    """Wrap ``index`` in a non-indexed ``LabelField`` under the key ``index``."""
    index_field = LabelField(index, skip_indexing=True)
    return Instance({"index": index_field})
def get_num_invalids_field(question_label):
    """Return a non-indexed ``LabelField`` holding ``get_num_invalids(question_label)``."""
    invalid_count = get_num_invalids(question_label)
    return LabelField(label=invalid_count, skip_indexing=True)
def get_answer_fields(self, **kwargs: Dict[str, Any]) -> Tuple[Dict[str, Field], bool]:
    """Build the BIO-tagging answer fields for one example.

    Reads these required entries from ``kwargs``: ``seq_tokens``,
    ``seq_wordpieces``, ``question_text_index_to_token_index``,
    ``question_text``, ``passage_text_index_to_token_index``,
    ``passage_text``, ``answer_texts`` and ``old_reader_behavior``.
    ``gold_indexes`` is optional. ``answer_type`` and ``is_training`` are
    read only when ``old_reader_behavior`` is truthy.

    Returns
    -------
    Tuple[Dict[str, Field], bool]
        The fields and a flag telling whether any answer span was found.
        When no span is usable the fields come from
        ``self.get_empty_answer_fields(**kwargs)`` and the flag is ``False``.
    """
    seq_tokens: List[Token] = kwargs['seq_tokens']
    seq_wordpieces: int = kwargs['seq_wordpieces']
    question_text_index_to_token_index: List[int] = kwargs['question_text_index_to_token_index']
    question_text: str = kwargs['question_text']
    passage_text_index_to_token_index: List[int] = kwargs['passage_text_index_to_token_index']
    passage_text: str = kwargs['passage_text']
    answer_texts: List[str] = kwargs['answer_texts']
    gold_indexes = kwargs.get('gold_indexes', {'question': None, 'passage': None})

    fields: Dict[str, Field] = {}

    # Locate every occurrence of each answer text in the question (unless
    # ignored) and in the passage.
    spans_dict = {}
    all_spans = []
    is_missing_answer = False
    for answer_text in answer_texts:
        found_spans = []
        if not self._ignore_question:
            found_spans += find_valid_spans(question_text, [answer_text],
                                            question_text_index_to_token_index,
                                            seq_tokens, seq_wordpieces,
                                            gold_indexes['question'])
        found_spans += find_valid_spans(passage_text, [answer_text],
                                        passage_text_index_to_token_index,
                                        seq_tokens, seq_wordpieces,
                                        gold_indexes['passage'])
        if not found_spans:
            is_missing_answer = True
            continue
        spans_dict[answer_text] = found_spans
        all_spans.extend(found_spans)

    if kwargs['old_reader_behavior']:
        answer_type = kwargs['answer_type']
        is_training = kwargs['is_training']
        # Old behavior: a span-type training example with any unlocatable
        # answer text is treated as having no answer at all.
        if is_training and answer_type in SPAN_ANSWER_TYPES and is_missing_answer:
            all_spans = []

    if not all_spans:
        fields.update(self.get_empty_answer_fields(**kwargs))
        return fields, False

    num_tokens = len(seq_tokens)
    fields['wordpiece_indices'] = self._get_wordpiece_indices_field(seq_wordpieces)
    no_answer_bios = self._get_empty_answer(seq_tokens)

    # One tagging per single span, grouped by the answer text it realizes.
    text_to_disjoint_bios: List[ListField] = []
    flexibility_count = 1
    for answer_text in answer_texts:
        text_spans = spans_dict.get(answer_text, [])
        if not text_spans:
            continue
        single_span_bios: List[LabelsField] = [
            LabelsField(self._create_sequence_labels([span], num_tokens))
            for span in text_spans
        ]
        text_to_disjoint_bios.append(ListField(single_span_bios))
        # Number of non-empty subsets of this text's spans.
        flexibility_count *= (2 ** len(text_spans)) - 1
    fields['answer_as_text_to_disjoint_bios'] = ListField(text_to_disjoint_bios)

    if flexibility_count < self._flexibility_threshold:
        # All non-empty span combinations per answer text ...
        combinations_per_text = {
            text: [
                combination
                for size in range(1, len(text_spans) + 1)
                for combination in itertools.combinations(text_spans, size)
            ]
            for text, text_spans in spans_dict.items()
        }
        # ... and the product of those choices across texts.
        bios_list: List[LabelsField] = []
        for packed_gold_spans in itertools.product(*combinations_per_text.values()):
            gold_spans = [span for group in packed_gold_spans for span in group]
            bios_list.append(LabelsField(self._create_sequence_labels(gold_spans, num_tokens)))

        fields['answer_as_list_of_bios'] = ListField(bios_list)
        fields['answer_as_text_to_disjoint_bios'] = ListField([ListField([no_answer_bios])])
    else:
        fields['answer_as_list_of_bios'] = ListField([no_answer_bios])

    fields['span_bio_labels'] = LabelsField(self._create_sequence_labels(all_spans, num_tokens))
    fields['is_bio_mask'] = LabelField(1, skip_indexing=True)
    return fields, True
def get_slot_value_field(slot_name):
    """Build a ``LabelField`` for the value stored under ``slot_name``.

    The value is looked up in ``question_slots`` and the label namespace is
    obtained from ``get_slot_label_namespace(slot_name)``.
    """
    return LabelField(
        label=question_slots[slot_name],
        label_namespace=get_slot_label_namespace(slot_name),
    )
def text_to_instance(self,  # type: ignore
                     item_id: Any,
                     question_text: str,
                     choice_text_list: List[str],
                     fact_text: str,
                     answer_span: List[str],
                     answer_relations: List[str],
                     answer_starts: List[int] = None,
                     answer_id: int = None,
                     prefetched_sentences: Dict[str, List[str]] = None,
                     prefetched_indices: str = None) -> Instance:
    """Build an ``Instance`` from a question, its choices, a fact and answer spans.

    Parameters
    ----------
    item_id : ``Any``
        Identifier stored in the metadata and used in error messages.
    question_text : ``str``
        The question; tokenized into the ``question`` field.
    choice_text_list : ``List[str]``
        Answer choices; each is tokenized into ``choices_list`` and gets its
        own list of retrieved sentences in ``choice_kb``.
    fact_text : ``str``
        The fact sentence; tokenized into the ``fact`` field.
    answer_span : ``List[str]``
        Answer strings expected to occur in ``fact_text``.
    answer_relations : ``List[str]``
        Relation labels; only used when ``self._add_relation_labels`` is set.
    answer_starts : ``List[int]``, optional
        Character offset of each ``answer_span`` entry within ``fact_text``.
        When missing or empty, the first occurrence of each answer is
        searched for in ``fact_text``.
    answer_id : ``int``, optional
        Stored as an un-indexed ``answer_id`` label when given.
    prefetched_sentences, prefetched_indices : optional
        Passed through to ``self.get_elasticsearch_sentences``.

    Raises
    ------
    ValueError
        If ``answer_starts`` has fewer entries than ``answer_span``, or if an
        answer cannot be found in ``fact_text`` when searching for it.
    """
    fields: Dict[str, Field] = {}
    question_tokens = self._tokenizer.tokenize(question_text)
    fact_tokens = self._tokenizer.tokenize(fact_text)
    choices_tokens_list = [self._tokenizer.tokenize(x) for x in choice_text_list]

    # When both sources are enabled they share the tuple budget equally.
    if self._use_conceptnet and self._use_elastic_search:
        max_sents_per_source = int(self._max_tuples / 2)
    else:
        max_sents_per_source = self._max_tuples

    choice_kb_fields = []
    selected_tuples = []
    for choice in choice_text_list:
        selected_hits = []
        if self._use_elastic_search:
            elastic_search_hits = self.get_elasticsearch_sentences(prefetched_sentences,
                                                                   prefetched_indices,
                                                                   answer_span,
                                                                   choice,
                                                                   question_text,
                                                                   fact_text,
                                                                   max_sents_per_source)
            selected_hits.extend(elastic_search_hits)
        if self._use_conceptnet:
            conceptnet_sentences = self.get_conceptnet_sentences(fact_text,
                                                                 answer_span,
                                                                 choice,
                                                                 max_sents_per_source)
            selected_hits.extend(conceptnet_sentences)
        # add a dummy entry to capture the embedding link
        if self._ignore_spans:
            selected_hits.append(fact_text + " || " + choice)
        else:
            for answer in set(answer_span):
                selected_hits.append(answer + " || " + choice)

        selected_tuples.append(selected_hits)
        kb_fields = [TextField(self._tokenizer.tokenize(hit_text), self._token_indexers)
                     for hit_text in selected_hits]
        choice_kb_fields.append(ListField(kb_fields))

    fields["choice_kb"] = ListField(choice_kb_fields)
    fields['fact'] = TextField(fact_tokens, self._token_indexers)

    if self._add_relation_labels:
        if answer_relations:
            relation_fields = [LabelField(relation, label_namespace="relation_labels")
                               for relation in set(answer_relations)]
            fields["relations"] = ListField(relation_fields)
            selected_relations = self.collate_relations(answer_relations)
            fields["relation_label"] = MultiLabelField(selected_relations, "relation_labels")
        else:
            # No relations: a single un-indexed -1 label keeps the field present.
            fields["relations"] = ListField([LabelField(-1,
                                                        label_namespace="relation_labels",
                                                        skip_indexing=True)])
            fields["relation_label"] = MultiLabelField([], "relation_labels")

    answer_fields = []
    answer_span_fields = []
    # Character (start, end) offsets of every fact token.
    fact_offsets = [(token.idx, token.idx + len(token.text)) for token in fact_tokens]
    for idx, answer in enumerate(answer_span):
        answer_fields.append(TextField(self._tokenizer.tokenize(answer), self._token_indexers))
        if answer_starts:
            if len(answer_starts) <= idx:
                raise ValueError("Only {} answer_starts in json. "
                                 "Expected {} in {}".format(len(answer_starts),
                                                            len(answer_span),
                                                            item_id))
            offset = answer_starts[idx]
        else:
            # str.find (unlike str.index) returns -1 when the answer is
            # absent, so the descriptive error below is actually raised.
            offset = fact_text.find(answer)
            if offset == -1:
                raise ValueError("Span: {} not found in fact: {}".format(answer, fact_text))

        tok_span, err = char_span_to_token_span(fact_offsets,
                                                (offset, offset + len(answer)))
        if err:
            logger.info("Could not find token spans for '{}' in '{}'."
                        "Best guess: {} in {} at {}".format(
                            answer, fact_text, [offset, offset + len(answer)],
                            fact_offsets, tok_span))
        answer_span_fields.append(SpanField(tok_span[0], tok_span[1], fields['fact']))

    fields["answer_text"] = ListField(answer_fields)
    fields["answer_spans"] = ListField(answer_span_fields)
    fields['question'] = TextField(question_tokens, self._token_indexers)
    fields['choices_list'] = ListField(
        [TextField(x, self._token_indexers) for x in choices_tokens_list])
    if answer_id is not None:
        fields['answer_id'] = LabelField(answer_id, skip_indexing=True)

    metadata = {
        "id": item_id,
        "question_text": question_text,
        "fact_text": fact_text,
        "choice_text_list": choice_text_list,
        "question_tokens": [x.text for x in question_tokens],
        "fact_tokens": [x.text for x in fact_tokens],
        "choice_tokens_list": [[x.text for x in ct] for ct in choices_tokens_list],
        "answer_text": answer_span,
        "answer_start": answer_starts,
        "answer_span_fields": [(x.span_start, x.span_end) for x in answer_span_fields],
        "relations": answer_relations,
        "selected_tuples": selected_tuples
    }
    fields["metadata"] = MetadataField(metadata)
    return Instance(fields)
def __getitem__(self, index):
    """Assemble and index the ``Instance`` for dataset item ``index``.

    Reads detection boxes/features and precomputed token embeddings from the
    HDF5 files configured on ``self``, builds question/answer/graph fields
    with the module's ``_my_fix_tokenization``, ``_fix_word``,
    ``_fix_visual_concept`` and ``_fix_adj`` helpers, and indexes the result
    with ``self.vocab``.

    Returns
    -------
    tuple
        ``(None, instance)``. The first element is always ``None``; the raw
        image is not loaded here.
    """
    item = deepcopy(self.items[index])
    image_id = int(item['img_id'].split('-')[-1])
    anno_id = str(item['annot_id'].split('-')[-1])

    # NOTE(review): the two obj_indices arrays are read but not used below.
    # The reads are kept so a missing file or key still fails here.
    # ``int`` replaces the ``np.int`` alias, which NumPy 1.24 removed.
    with h5py.File(self.tag_annot_path, 'r') as h5:
        tag_obj_indices = np.array(h5[str(image_id)]['obj_indices'], dtype=int)
    with h5py.File(self.non_tag_annot_path, 'r') as h5:
        non_tag_obj_indices = np.array(h5[str(image_id)]['obj_indices'], dtype=int)

    # Each feature file stores 'boxes' plus four feature sets
    # ('features0' .. 'features3'), each copied into a 1024-wide slot.
    with h5py.File(self.tag_feature_path, 'r') as h5:
        tag_boxes = np.array(h5[str(anno_id)]['boxes'], dtype=np.float32)
        tag_features = np.zeros([4, tag_boxes.shape[0], 1024])
        for m in range(4):
            tag_features[m, :, :] = np.array(h5[str(anno_id)]['features' + str(m)],
                                             dtype=np.float32)
    with h5py.File(self.non_tag_feature_path, 'r') as h5:
        non_tag_boxes = np.array(h5[str(anno_id)]['boxes'], dtype=np.float32)
        non_tag_features = np.zeros([4, non_tag_boxes.shape[0], 1024])
        for m in range(4):
            non_tag_features[m, :, :] = np.array(h5[str(anno_id)]['features' + str(m)],
                                                 dtype=np.float32)

    ###################################################################
    # Load questions and answers
    non_tag_question_annotid2detidx = self.non_tag_question_annotid2detidx[item['annot_id']]
    non_tag_answer_annotid2detidx = self.non_tag_answer_annotid2detidx[item['annot_id']]
    non_tag_rationale_annotid2detidx = self.non_tag_rationale_annotid2detidx[item['annot_id']]

    if self.mode == 'answer':
        question_annotid2detidx = non_tag_question_annotid2detidx
        answer_annotid2detidx = non_tag_answer_annotid2detidx
    else:
        conditioned_label = item['answer_label'] if self.split != 'test' \
            else self.conditioned_answer_choice
        q_len = len(item['question'])
        # The conditioned answer is appended to the question below, so its
        # token positions are shifted by the question length.
        question_annotid2detidx = {}
        for k, v in non_tag_question_annotid2detidx.items():
            question_annotid2detidx[k] = v
        for k, v in non_tag_answer_annotid2detidx[conditioned_label].items():
            question_annotid2detidx[k + q_len] = v
        answer_annotid2detidx = non_tag_rationale_annotid2detidx

    if self.mode == 'rationale':
        conditioned_label = item['answer_label'] if self.split != 'test' \
            else self.conditioned_answer_choice
        item['question'] += item['answer_choices'][conditioned_label]

    answer_choices = item['{}_choices'.format(self.mode)]
    dets2use, old_det_to_new_ind = self._get_dets_to_use(item)
    non_tag_dets2use, non_tag_old_det_to_new_ind = self._get_non_tag_det_to_use(
        question_annotid2detidx, answer_annotid2detidx, len(non_tag_boxes))

    if self.add_image_as_a_box:
        assert (len(dets2use) == np.max(old_det_to_new_ind))

    if self.add_image_as_a_box:
        non_tag_old_det_to_new_ind += 1

    # shift the non_tag detection idx, effectively as appending the non_tag
    # detections to tag detections
    non_tag_old_det_to_new_ind[np.where(non_tag_old_det_to_new_ind)[0]] += len(dets2use)

    old_det_to_new_ind = old_det_to_new_ind.tolist()
    non_tag_old_det_to_new_ind = non_tag_old_det_to_new_ind.tolist()

    ###################################################################
    # Load in BERT. We'll get contextual representations of the context and
    # the answer choices. The group is read once; nothing above uses it.
    with h5py.File(self.h5fn, 'r') as h5:
        grp_items = {
            k: np.array(v, dtype=np.float16)
            for k, v in h5[str(index)].items()
        }

    # Essentially we need to condition on the right answer choice here, if
    # we're doing QA->R. We will always condition on the
    # `conditioned_answer_choice.`
    condition_key = self.conditioned_answer_choice \
        if self.split == "test" and self.mode == "rationale" else ""

    instance_dict = {}
    if 'endingonly' not in self.embs_to_load:
        questions_tokenized, question_tags = zip(*[
            _my_fix_tokenization(
                item['question'],
                grp_items[f'ctx_{self.mode}{condition_key}{i}'],
                old_det_to_new_ind,
                item['objects'],
                non_tag_old_det_to_new_ind,
                question_annotid2detidx,
                token_indexers=self.token_indexers,
                pad_ind=0 if self.add_image_as_a_box else -1,
            ) for i in range(4)
        ])
        instance_dict['question'] = ListField(questions_tokenized)
        instance_dict['question_tags'] = ListField(question_tags)

    answers_tokenized, answer_tags = zip(*[
        _my_fix_tokenization(
            answer,
            grp_items[f'answer_{self.mode}{condition_key}{i}'],
            old_det_to_new_ind,
            item['objects'],
            non_tag_old_det_to_new_ind,
            answer_annotid2detidx[i],
            token_indexers=self.token_indexers,
            pad_ind=0 if self.add_image_as_a_box else -1,
        ) for i, answer in enumerate(answer_choices)
    ])
    instance_dict['answers'] = ListField(answers_tokenized)
    instance_dict['answer_tags'] = ListField(answer_tags)
    if self.split != 'test':
        instance_dict['label'] = LabelField(item['{}_label'.format(self.mode)],
                                            skip_indexing=True)
    instance_dict['metadata'] = MetadataField({
        'annot_id': item['annot_id'],
        'ind': index,
        'movie': item['movie'],
        'img_fn': item['img_fn'],
        'question_number': item['question_number']
    })

    # node
    node_tokenized, node_tags = zip(*[
        _fix_word(i, index, item['annot_id'], self.h5fn_graph, self.h5fn_word, pad_ind=0)
        for i in range(4)
    ])
    instance_dict['node'] = ListField(node_tokenized)

    # visual concept
    visual_concept_tokenized, visual_concept_tags = zip(*[
        _fix_visual_concept(item['visual_concept'], item['visual_concept_num'],
                            self.h5fn_word, pad_ind=0)
        for i in range(4)
    ])
    instance_dict['visual_concept'] = ListField(visual_concept_tokenized)

    # adj
    adj_result, adj_len = zip(*[
        _fix_adj(i, index, item['annot_id'], self.h5fn_graph, pad_ind=0)
        for i in range(4)
    ])
    instance_dict['adjacent'] = ListField(adj_result)

    ###################################################################
    # Load boxes.
    # NOTE(review): ``metadata`` is parsed but not used below; the read is
    # kept so a missing or corrupt metadata file still raises here.
    with open(os.path.join(VCR_IMAGES_DIR, item['metadata_fn']), 'r') as f:
        metadata = json.load(f)

    if self.add_image_as_a_box:
        # here we just use dummy box for background
        tag_boxes = np.row_stack(([1, 1, 700, 700], tag_boxes))
    non_tag_boxes = non_tag_boxes[non_tag_dets2use]
    boxes = np.concatenate((tag_boxes, non_tag_boxes))

    if self.add_image_as_a_box:
        # Shift detection indices by one and put index 0 in front.
        dets2use = dets2use + 1
        dets2use = np.insert(dets2use, 0, 0)

    tag_det_features = np.zeros([4, len(dets2use), 1024])
    non_tag_det_features = np.zeros([4, len(non_tag_dets2use), 1024])
    for z in range(4):
        tag_det_features[z, :, :] = tag_features[z][dets2use]
        non_tag_det_features[z, :, :] = non_tag_features[z][non_tag_dets2use]
    # Tag detections first, then non-tag detections, along the detection axis.
    det_features = np.concatenate((tag_det_features, non_tag_det_features), 1)

    instance_dict['det_features'] = ArrayField(det_features, padding_value=0)
    instance_dict['boxes'] = ArrayField(boxes, padding_value=-1)

    instance = Instance(instance_dict)
    instance.index_fields(self.vocab)
    return None, instance
def test_as_tensor_returns_integer_tensor(self):
    """An un-indexed integer label comes back from ``as_tensor`` as that value."""
    field = LabelField(5, skip_indexing=True)
    padding_lengths = field.get_padding_lengths()
    assert field.as_tensor(padding_lengths).item() == 5
def text_to_instance(self, string: str, label: int) -> Instance:
    """Tokenize ``string`` and pair it with an un-indexed integer ``label``.

    The resulting ``Instance`` has a ``tokens`` text field and a ``label``
    field created with ``skip_indexing=True``.
    """
    token_field = TextField(self._tokenizer.tokenize(string), self._token_indexers)
    label_field = LabelField(label, skip_indexing=True)
    return Instance({'tokens': token_field, 'label': label_field})
def test_label_field_raises_with_incorrect_label_type(self):
    """Constructing a ``LabelField`` from a list must raise ``ConfigurationError``."""
    bad_label = []
    with pytest.raises(ConfigurationError):
        LabelField(bad_label, skip_indexing=False)
def text_to_instance(
    self,  # type: ignore
    sentence: str,
    structured_representations: List[List[List[JsonDict]]],
    labels: List[str] = None,
    target_sequences: List[List[str]] = None,
    identifier: str = None,
) -> Instance:
    """
    Build an ``Instance`` for one sentence paired with one or more worlds.

    Parameters
    ----------
    sentence : ``str``
        The query sentence.
    structured_representations : ``List[List[List[JsonDict]]]``
        Json representations of all the worlds; each world is a list of
        boxes and each box a list of objects.
    labels : ``List[str]`` (optional)
        String labels, one per world, stored in the ``denotations``
        namespace. Omitted from the instance when empty or ``None``.
    target_sequences : ``List[List[str]]`` (optional)
        Action sequences to supervise on. When absent, an agenda is added
        instead if ``self._output_agendas`` is set.
    identifier : ``str`` (optional)
        Dataset identifier, stored as metadata when given.
    """
    worlds = [
        NlvrLanguage(
            {Box(object_list, box_id) for box_id, object_list in enumerate(representation)}
        )
        for representation in structured_representations
    ]

    sentence_tokens = self._tokenizer.tokenize(sentence)
    sentence_field = TextField(sentence_tokens, self._sentence_token_indexers)

    # TODO(pradeep): Assuming that possible actions are the same in all worlds. This may change
    # later.
    action_ids: Dict[str, int] = {}
    rule_fields: List[Field] = []
    for rule in worlds[0].all_possible_productions():
        action_ids[rule] = len(action_ids)
        rule_fields.append(ProductionRuleField(rule, is_global_rule=True))
    action_field = ListField(rule_fields)

    fields: Dict[str, Field] = {
        "sentence": sentence_field,
        "worlds": ListField([MetadataField(world) for world in worlds]),
        "actions": action_field,
        "metadata": MetadataField(
            {"sentence_tokens": [token.text for token in sentence_tokens]}
        ),
    }
    if identifier is not None:
        fields["identifier"] = MetadataField(identifier)

    # Supervision is either explicit target action sequences (preferred when
    # given) or an agenda derived from the sentence.
    if target_sequences:
        # TODO(pradeep): Define a max length for this field.
        fields["target_action_sequences"] = ListField(
            [
                ListField([IndexField(action_ids[action], action_field) for action in sequence])
                for sequence in target_sequences
            ]
        )
    elif self._output_agendas:
        # TODO(pradeep): Assuming every world gives the same agenda for a sentence. This is true
        # now, but may change later too.
        agenda = worlds[0].get_agenda_for_sentence(sentence)
        assert agenda, "No agenda found for sentence: %s" % sentence
        # Each agenda entry is an index into the actions field.
        fields["agenda"] = ListField(
            [IndexField(action_ids[action], action_field) for action in agenda]
        )

    if labels:
        fields["labels"] = ListField(
            [LabelField(label, label_namespace="denotations") for label in labels]
        )
    return Instance(fields)
def test_printing_doesnt_crash(self):
    """Printing a ``LabelField`` should complete without raising."""
    field = LabelField("label", label_namespace="namespace")
    print(field)
def process_split(split, indexers, pair_input, categorical):
    '''
    Convert one data split into a list of ``Instance`` objects.

    Args:
        - split (list[list]): columns of the split. ``split[0]`` holds the
          tokenized first sentences, ``split[1]`` the tokenized second
          sentences (read only when ``pair_input`` is truthy), ``split[2]``
          the labels and, when ``len(split) == 4``, ``split[3]`` the
          example indices.
        - indexers: token indexers passed to every ``TextField``.
        - pair_input (int): truthy when each example has two sentences.
        - categorical: truthy for class labels (un-indexed ``LabelField``),
          falsy for numeric targets (``NumericField``).

    Returns:
        - list[Instance]: instances with ``input1``, optionally ``input2``,
          ``label`` and optionally ``idx`` fields. Columns are zipped, so
          the result is as long as the shortest column.
    '''
    def _text_fields(sentences):
        # One TextField per tokenized sentence.
        return [TextField(list(map(Token, sent)), token_indexers=indexers)
                for sent in sentences]

    columns = {"input1": _text_fields(split[0])}
    if pair_input:
        columns["input2"] = _text_fields(split[1])

    # Labels always live in column 2. Reading ``split[-1]`` would pick up
    # the idx column whenever the split carries example indices.
    if categorical:
        columns["label"] = [LabelField(label, label_namespace="labels", skip_indexing=True)
                            for label in split[2]]
    else:
        columns["label"] = [NumericField(label) for label in split[2]]

    if len(split) == 4:  # numbered test examples
        columns["idx"] = [LabelField(idx, label_namespace="idxs", skip_indexing=True)
                          for idx in split[3]]

    field_names = list(columns)
    return [Instance(dict(zip(field_names, row))) for row in zip(*columns.values())]
def test_as_tensor_returns_integer_tensor(self):
    """An un-indexed integer label converts to a tensor holding that value."""
    field = LabelField(5, skip_indexing=True)
    tensor = field.as_tensor(field.get_padding_lengths())
    as_array = tensor.data.cpu().numpy()
    numpy.testing.assert_array_almost_equal(as_array, numpy.array([5]))
def make_reading_comprehension_instance(question_text: str,
                                        passage_text: str,
                                        answer_text: str,
                                        label: float,
                                        question_passage_tokens: List[Token],
                                        question_passage_offsets: List[Tuple[int, int]],
                                        token_indexers: Dict[str, TokenIndexer],
                                        id: str = None,
                                        pred_chains: List[Tuple[List, float]] = None,
                                        sp_facts_id: List[int] = None,
                                        article: Dict = None) -> Instance:
    """
    Build an ``Instance`` from an already-tokenized question+passage sequence
    and its label.

    The instance has three fields: ``question_passage`` (a ``TextField``),
    ``label`` (a ``LabelField`` created with ``skip_indexing=True``) and
    ``metadata`` (a ``MetadataField``).

    Parameters
    ----------
    question_text : ``str``
        The question; stored in the metadata only.
    passage_text : ``str``
        The original passage text; stored in the metadata as
        ``original_passage``.
    answer_text : ``str``
        Accepted for interface compatibility; not used by this function.
    label : ``float``
        Label for the pair. It is both wrapped in the ``label`` field and
        stored in the metadata as ``original_label_score``.
    question_passage_tokens : ``List[Token]``
        Tokens of the combined sequence; their ``text`` is also recorded in
        the metadata as ``passage_tokens``.
    question_passage_offsets : ``List[Tuple[int, int]]``
        Stored in the metadata as ``token_offsets``.
    token_indexers : ``Dict[str, TokenIndexer]``
        Determines how the ``TextField`` is converted into tensors. See
        :class:`TokenIndexer`.
    id, pred_chains, sp_facts_id, article : optional
        Stored unchanged in the metadata under ``id``, ``pred_chains``,
        ``sp_set_id`` and ``original_article`` respectively.
    """
    return Instance({
        'question_passage': TextField(question_passage_tokens, token_indexers),
        'label': LabelField(label, skip_indexing=True),
        'metadata': MetadataField({
            'original_passage': passage_text,
            'token_offsets': question_passage_offsets,
            'question_text': question_text,
            'original_label_score': label,
            'id': id,
            'pred_chains': pred_chains,
            'sp_set_id': sp_facts_id,
            'original_article': article,
            'passage_tokens': [token.text for token in question_passage_tokens],
        }),
    })
def test_label_field_empty_field_works(self):
    """The empty counterpart of a label field carries the label ``-1``."""
    empty_label = LabelField("test").empty_field()
    assert empty_label.label == -1
def test_as_tensor_returns_integer_tensor(self):
    """``as_tensor`` on an un-indexed integer label yields a tensor equal to it."""
    field = LabelField(5, skip_indexing=True)
    lengths = field.get_padding_lengths()
    value = field.as_tensor(lengths).item()
    assert value == 5