from typing import Dict, Optional

import numpy as np

from allennlp.data import Instance
from allennlp.data.fields import Field, LabelField, MetadataField, SpanField

# BertIndexField and IdField are this repo's custom Field subclasses for raw
# BERT token-id arrays and plain string identifiers, respectively.


def text_to_instance(self,  # type: ignore
                     example) -> Instance:
    fields: Dict[str, Field] = {}

    joint_tokens_seq = example['paired_c_tokens']
    assert len(joint_tokens_seq) <= 512  # BERT-base maximum sequence length

    segments_ids = example['segment_ids']

    joint_tokens_ids = self.bert_tokenizer.convert_tokens_to_ids(joint_tokens_seq)
    assert len(joint_tokens_ids) == len(segments_ids)

    fields['paired_sequence'] = BertIndexField(
        np.asarray(joint_tokens_ids, dtype=np.int64))
    fields['paired_segments_ids'] = BertIndexField(
        np.asarray(segments_ids, dtype=np.int64))

    # This text span is begin-inclusive and end-exclusive.
    # text1_span = (1, 1 + len(example['query_c_tokens']))  # End is exclusive (important for later use)
    # text2_span = (text1_span[1] + 1, text1_span[1] + 1 + len(example['context_c_tokens']))
    # fields['bert_s1_span'] = SpanField(text1_span[0], text1_span[1], fields['paired_sequence'])
    # fields['bert_s2_span'] = SpanField(text2_span[0], text2_span[1], fields['paired_sequence'])
    # fields['bert_s1_span'] = MetadataField(text1_span)
    # fields['bert_s2_span'] = MetadataField(text2_span)

    # However, the ground-truth span is both begin- and end-inclusive.
    fields['gt_span'] = SpanField(example['start_position'],
                                  example['end_position'],
                                  fields['paired_sequence'])

    fields['fid'] = IdField(example['fid'])
    fields['uid'] = IdField(example['uid'])

    return Instance(fields)

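# A minimal sketch of the `example` dict the span-prediction variant above
# expects. The field names come from the code above; the concrete tokens,
# positions, and the `reader` object are illustrative assumptions, since the
# preprocessing that builds 'paired_c_tokens' / 'segment_ids' lives elsewhere.
example = {
    'paired_c_tokens': ['[CLS]', 'who', 'wrote', 'it', '[SEP]',
                        'harper', 'lee', 'wrote', 'it', '[SEP]'],
    'segment_ids': [0, 0, 0, 0, 0, 1, 1, 1, 1, 1],
    'start_position': 5,  # inclusive, indexes into paired_c_tokens
    'end_position': 6,    # inclusive as well (unlike the exclusive text spans)
    'fid': 'forward-0',
    'uid': 'question-0',
}
# instance = reader.text_to_instance(example)
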
def text_to_instance(self,  # type: ignore
                     query: str,  # Important type information
                     context: str,
                     fid: Optional[str] = None,
                     qid: Optional[str] = None,
                     selection_label: Optional[str] = None) -> Instance:
    fields: Dict[str, Field] = {}

    tokenized_text1 = self.bert_tokenizer.tokenize(query)
    tokenized_text2 = self.bert_tokenizer.tokenize(context)

    # _truncate_seq_pair(tokenized_text1, tokenized_text2, self.max_l)
    tokenized_text1 = tokenized_text1[:self.query_l]
    tokenized_text2 = tokenized_text2[:self.context_l]

    s1_tokens_seq = ['[CLS]'] + tokenized_text1
    s2_tokens_seq = ['[CLS]'] + tokenized_text2

    # text1_len = len(tokenized_text1) + 1
    # text2_len = len(tokenized_text2) + 1
    # segments_ids = [0] * text1_len + [1] * text2_len

    s1_tokens_ids = self.bert_tokenizer.convert_tokens_to_ids(s1_tokens_seq)
    s2_tokens_ids = self.bert_tokenizer.convert_tokens_to_ids(s2_tokens_seq)

    fields['s1_sequence'] = BertIndexField(np.asarray(s1_tokens_ids, dtype=np.int64))
    fields['s2_sequence'] = BertIndexField(np.asarray(s2_tokens_ids, dtype=np.int64))

    text1_span = (1, len(tokenized_text1))  # End is exclusive (important for later use)
    text2_span = (1, len(tokenized_text2))

    fields['bert_s1_span'] = SpanField(text1_span[0], text1_span[1],
                                       fields['s1_sequence'])
    fields['bert_s2_span'] = SpanField(text2_span[0], text2_span[1],
                                       fields['s2_sequence'])

    if selection_label:
        fields['label'] = LabelField(selection_label, label_namespace='labels')

    assert fid is not None
    assert qid is not None
    fields['fid'] = IdField(fid)
    fields['qid'] = IdField(qid)

    return Instance(fields)

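# Hypothetical usage of the unpaired (two single-sentence sequences) variant
# above; `reader` is an assumed instance of this dataset reader with
# `query_l` / `context_l` configured, and the label string is illustrative.
# instance = reader.text_to_instance(
#     query="Who wrote To Kill a Mockingbird?",
#     context="To Kill a Mockingbird was written by Harper Lee.",
#     fid='f-0', qid='q-0',
#     selection_label='true')
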
def text_to_instance(self,  # type: ignore
                     sent1: str,  # Important type information
                     sent2: str,
                     pid: Optional[str] = None,
                     label: Optional[str] = None) -> Instance:
    fields: Dict[str, Field] = {}

    tokenized_text1 = self.bert_tokenizer.tokenize(sent1)
    tokenized_text2 = self.bert_tokenizer.tokenize(sent2)

    # _truncate_seq_pair(tokenized_text1, tokenized_text2, self.max_l)
    tokenized_text1 = tokenized_text1[:self.query_l]
    tokenized_text2 = tokenized_text2[:(self.max_l - len(tokenized_text1))]

    joint_tokens_seq = ['[CLS]'] + tokenized_text1 + ['[SEP]'] \
                       + tokenized_text2 + ['[SEP]']
    text1_len = len(tokenized_text1) + 2  # [CLS] and first [SEP]
    text2_len = len(tokenized_text2) + 1  # trailing [SEP]
    segments_ids = [0] * text1_len + [1] * text2_len

    joint_tokens_ids = self.bert_tokenizer.convert_tokens_to_ids(joint_tokens_seq)
    assert len(joint_tokens_ids) == len(segments_ids)

    fields['paired_sequence'] = BertIndexField(
        np.asarray(joint_tokens_ids, dtype=np.int64))
    fields['paired_segments_ids'] = BertIndexField(
        np.asarray(segments_ids, dtype=np.int64))

    text1_span = (1, 1 + len(tokenized_text1))  # End is exclusive (important for later use)
    text2_span = (text1_span[1] + 1, text1_span[1] + 1 + len(tokenized_text2))

    fields['bert_s1_span'] = MetadataField(text1_span)
    fields['bert_s2_span'] = MetadataField(text2_span)

    if label:
        fields['label'] = LabelField(label, label_namespace='labels')
    if pid:
        fields['pid'] = IdField(pid)

    return Instance(fields)

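# Worked layout for the paired encoding above, assuming WordPiece leaves
# these toy words intact (tokens and indices are illustrative):
#   tokens:    [CLS]  a   b  [SEP]  c   d   e  [SEP]
#   position:    0    1   2    3    4   5   6    7
#   segments:    0    0   0    0    1   1   1    1
#   text1_span = (1, 3)  # tokens a..b, end exclusive
#   text2_span = (4, 7)  # tokens c..e, end exclusive
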
def text_to_instance(self,  # type: ignore
                     seq1: str,  # Important type information
                     seq2: str,
                     fid: Optional[str] = None,
                     oid: Optional[str] = None,
                     retain_item: Optional[str] = None,
                     selection_label: Optional[str] = None) -> Instance:
    fields: Dict[str, Field] = {}

    if self.is_paired:
        tokenized_text1 = self.bert_tokenizer.tokenize(seq1)
        tokenized_text2 = self.bert_tokenizer.tokenize(seq2)

        # 'qc': truncate the query to query_l, then give the context whatever
        # remains of the max_l budget; 'cq' swaps the two roles.
        if self.pair_order == 'qc':
            tokenized_text1 = tokenized_text1[:self.query_l]
            tokenized_text2 = tokenized_text2[:(self.max_l - len(tokenized_text1))]
        elif self.pair_order == 'cq':
            tokenized_text2 = tokenized_text2[:self.query_l]
            tokenized_text1 = tokenized_text1[:(self.max_l - len(tokenized_text2))]
        else:
            raise ValueError(f"Invalid pair ordering: {self.pair_order}")

        joint_tokens_seq = ['[CLS]'] + tokenized_text1 + ['[SEP]'] \
                           + tokenized_text2 + ['[SEP]']
        text1_len = len(tokenized_text1) + 2  # [CLS] and first [SEP]
        text2_len = len(tokenized_text2) + 1  # trailing [SEP]
        segments_ids = [0] * text1_len + [1] * text2_len

        joint_tokens_ids = self.bert_tokenizer.convert_tokens_to_ids(joint_tokens_seq)
        assert len(joint_tokens_ids) == len(segments_ids)

        fields['paired_sequence'] = BertIndexField(
            np.asarray(joint_tokens_ids, dtype=np.int64))
        fields['paired_segments_ids'] = BertIndexField(
            np.asarray(segments_ids, dtype=np.int64))

        text1_span = (1, 1 + len(tokenized_text1))  # End is exclusive (important for later use)
        text2_span = (text1_span[1] + 1, text1_span[1] + 1 + len(tokenized_text2))

        fields['bert_s1_span'] = SpanField(text1_span[0], text1_span[1],
                                           fields['paired_sequence'])
        fields['bert_s2_span'] = SpanField(text2_span[0], text2_span[1],
                                           fields['paired_sequence'])
    else:
        tokenized_text1 = self.bert_tokenizer.tokenize(seq1)
        tokenized_text2 = self.bert_tokenizer.tokenize(seq2)

        # _truncate_seq_pair(tokenized_text1, tokenized_text2, self.max_l)
        tokenized_text1 = tokenized_text1[:self.query_l]
        tokenized_text2 = tokenized_text2[:self.context_l]

        s1_tokens_seq = ['[CLS]'] + tokenized_text1
        s2_tokens_seq = ['[CLS]'] + tokenized_text2

        s1_tokens_ids = self.bert_tokenizer.convert_tokens_to_ids(s1_tokens_seq)
        s2_tokens_ids = self.bert_tokenizer.convert_tokens_to_ids(s2_tokens_seq)

        fields['s1_sequence'] = BertIndexField(np.asarray(s1_tokens_ids, dtype=np.int64))
        fields['s2_sequence'] = BertIndexField(np.asarray(s2_tokens_ids, dtype=np.int64))

        text1_span = (1, len(tokenized_text1))  # End is exclusive (important for later use)
        text2_span = (1, len(tokenized_text2))

        fields['bert_s1_span'] = SpanField(text1_span[0], text1_span[1],
                                           fields['s1_sequence'])
        fields['bert_s2_span'] = SpanField(text2_span[0], text2_span[1],
                                           fields['s2_sequence'])

    # Shared tail: label and identifier fields are the same for both branches.
    if selection_label:
        fields['label'] = LabelField(selection_label, label_namespace='labels')

    assert fid is not None
    assert oid is not None
    fields['fid'] = IdField(fid)
    fields['oid'] = IdField(oid)
    fields['item'] = IdField(retain_item)

    return Instance(fields)

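# Hypothetical usage of the selector variant above; `paired_reader` is an
# assumed instance of this reader constructed with is_paired=True and
# pair_order='qc', and the argument values are illustrative.
# instance = paired_reader.text_to_instance(
#     seq1="Who founded the company?",
#     seq2="The company was founded by J. Doe in 1999.",
#     fid='f-0', oid='o-0', retain_item='item-0',
#     selection_label='true')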