def get_examples(self, filepath):
    """Build positive/negative category-article pairs from a TSV file.

    Each line is tab-separated: the first field is the positive category,
    the last field is the article and every field in between is a negative
    category. For each negative category two examples are emitted, in this
    order: (positive category, article) with label '1', then
    (negative category, article) with label '0'.

    Args:
        filepath: the file of article-category pairs.

    Returns:
        A list of InputExample objects whose guids are consecutive integers
        starting at 0.
    """
    with open(filepath) as handle:
        rows = handle.read().strip().split("\n")

    examples = []
    for row in tqdm(rows):
        fields = row.strip().split("\t")
        positive, negatives, article = fields[0], fields[1:-1], fields[-1]
        for negative in negatives:
            for category, label in ((positive, '1'), (negative, '0')):
                # The running guid always equals the number of examples
                # produced so far.
                examples.append(
                    InputExample(guid=len(examples), text_a=category,
                                 text_b=article, label=label))
    return examples
def get_examples(self, filepath):
    """Expand every evaluation row into one example per known category.

    Each line has the form ``label,text``. The label is a 1-based index
    into ``self.cats`` and may be wrapped in single or double quotes. The
    first and last characters of the text field are dropped (presumably
    surrounding quotes) and the text is truncated to its first 128
    whitespace-separated tokens; an empty text becomes "N/A".

    Args:
        filepath: the file of the evaluation dataset.

    Returns:
        A list of InputExample objects, one per (row, category) pair, with
        label 1 for the row's gold category and 0 otherwise. Guids are
        consecutive integers starting at 1.
    """
    with open(filepath) as handle:
        rows = handle.read().strip().split("\n")

    examples = []
    guid = 0
    for row in tqdm(rows):
        fields = row.strip().split(",", 1)
        raw_label = fields[0]
        if raw_label.startswith(("'", '"')):
            raw_label = raw_label[1:-1]
        gold_index = int(raw_label) - 1

        text = " ".join(fields[1][1:-1].split()[:128])
        if not text.strip():
            text = "N/A"

        for cat in self.cats:
            guid += 1
            is_gold = cat == self.cats[gold_index]
            examples.append(
                InputExample(guid=guid, text_a=cat, text_b=text,
                             label=1 if is_gold else 0))
    return examples
def _create_examples(self, lines_a, lines_b):
    """Build three parallel example lists from two aligned line sequences.

    For every pair ``(line_a, line_b)``:
      * the "original" example carries ``line_a``,
      * the "pos" example carries ``line_b``,
      * the "neg" example carries ``self.get_neg_sent(line_a)``.

    All three kinds are created with ``label=1`` and ``text_b=None``.
    A progress message is printed every 5000 pairs.

    Returns:
        A tuple ``(original_examples, pos_examples, neg_examples)``.
    """
    originals, positives, negatives = [], [], []
    for idx, (sent_a, sent_b) in enumerate(zip(lines_a, lines_b)):
        if (idx + 1) % 5000 == 0:
            print("create examples:{}".format(idx))

        originals.append(
            InputExample(guid=f"train_original_{idx}", text_a=sent_a,
                         text_b=None, label=1))
        positives.append(
            InputExample(guid=f"train_pos_{idx}", text_a=sent_b,
                         text_b=None, label=1))

        negative_sent = self.get_neg_sent(sent_a)
        negatives.append(
            InputExample(guid=f"train_neg_{idx}", text_a=negative_sent,
                         text_b=None, label=1))
    return originals, positives, negatives
def _create_examples(self, path, set_type):
    """Create the dataset examples from the documents under ``path``.

    For every split other than 'test' the directory is two levels deep:
    each sub-directory name is used as the label of the files inside it,
    and documents whose extracted text has 5 characters or fewer are
    dropped. For 'test' (or prediction) data the directory is flat, nothing
    is filtered and the label is None.

    Args:
        path: path of the train, dev or test data set.
        set_type: marks the split type: train, dev or test.

    Returns:
        A list of InputExample objects with guids "<set_type>-<counter>".
    """
    examples = []
    entries = os.listdir(path)

    if set_type == 'test':
        # Test / prediction data: single-level folder, label set to None.
        for name in entries:
            content = self.docx2text(os.path.join(path, name))
            examples.append(
                InputExample(guid="%s-%s" % (set_type, len(examples)),
                             text_a=content, text_b=None, label=None))
        return examples

    for label_dir in entries:
        for name in tqdm(os.listdir(os.path.join(path, label_dir))):
            content = self.docx2text(os.path.join(path, label_dir, name))
            # Filter out meaningless documents with 5 characters or fewer.
            if len(content) <= 5:
                continue
            # The sample counter equals the number of examples kept so far.
            examples.append(
                InputExample(guid="%s-%s" % (set_type, len(examples)),
                             text_a=content, text_b=None, label=label_dir))
    return examples
def _create_examples(self, filename, set_type):
    """Creates examples for the training, dev and test sets.

    Reads a CSV with ``user_id``, ``item_id`` and ``ratings`` columns and,
    for every row, builds one example from the user's reviews and one from
    the item's reviews. Reviews are shuffled and joined with " [SEP] ";
    ids with no stored reviews fall back to the single placeholder "N/A".

    Args:
        filename: path of the CSV file to read.
        set_type: split name used as the guid prefix.

    Returns:
        A tuple ``(user_review_examples, item_review_examples)`` of two
        equally long lists; examples at the same position share the guid
        and the float rating label.
    """
    data = pd.read_csv(filename)
    user_review_examples = []
    item_review_examples = []
    for i in range(data.shape[0]):
        guid = "%s-%s" % (set_type, i)
        row = data.iloc[i]
        user_id = str(row["user_id"])
        item_id = str(row["item_id"])
        label = float(row["ratings"])

        user_reviews = (self.user_reviews[user_id]
                        if user_id in self.user_reviews else ["N/A"])
        # Fix: the item lookup used to test membership in
        # self.user_reviews, which either raised KeyError or silently
        # replaced existing item reviews with "N/A".
        item_reviews = (self.item_reviews[item_id]
                        if item_id in self.item_reviews else ["N/A"])

        # NOTE: shuffles the stored review lists in place, as before.
        random.shuffle(user_reviews)
        random.shuffle(item_reviews)

        user_review_examples.append(
            InputExample(guid=guid, text_a=" [SEP] ".join(user_reviews),
                         label=label))
        item_review_examples.append(
            InputExample(guid=guid, text_a=" [SEP] ".join(item_reviews),
                         label=label))
    return user_review_examples, item_review_examples
def get_FEVER_examples(prefix, hypo_only=False):
    """Load FEVER-NLI records as binary entailment examples.

    Known files in the dataset folder: train_fitems.jsonl,
    dev_fitems.jsonl, test_fitems.jsonl and
    dev_fitems.label.recovered.jsonl. Both the 'dev' and the 'test' prefix
    read dev_fitems.label.recovered.jsonl.

    Args:
        prefix: split name ('train', 'dev' or 'test').
        hypo_only: when True only the hypothesis is used (as text_a) and
            text_b is None; otherwise text_a is the premise and text_b the
            hypothesis.

    Returns:
        A tuple ``(examples, pos_size)`` where ``pos_size`` is the number
        of returned examples labelled 'entailment'.
    """
    path = '/export/home/Dataset/para_entail_datasets/nli_FEVER/nli_fever/'
    filename = path+prefix+'_fitems.jsonl'
    if prefix == 'test' or prefix == 'dev':
        filename = path+'dev_fitems.label.recovered.jsonl'
    print('loading FEVER...', filename)

    examples = []
    guid_id = 0
    pos_size = 0
    with open(filename, 'r') as f:
        for line in json_lines.reader(f):
            # The guid advances for every record, including skipped ones.
            guid_id += 1
            premise = line.get('context')
            hypothesis = line.get('query')
            # Skip records with a missing or empty side; this also covers
            # None, which used to crash in len().
            if not premise or not hypothesis:
                continue
            label = ('entailment' if line.get('label') == 'SUPPORTS'
                     else 'not_entailment')
            # Count positives only after the filter so that pos_size
            # matches the examples actually returned.
            if label == 'entailment':
                pos_size += 1
            if hypo_only:
                examples.append(
                    InputExample(guid=str(guid_id), text_a=hypothesis,
                                 text_b=None, label=label))
            else:
                examples.append(
                    InputExample(guid=str(guid_id), text_a=premise,
                                 text_b=hypothesis, label=label))
    print('FEVER size:', len(examples))
    return examples, pos_size
def load_DocNLI(prefix, hypo_only=False):
    """Load a DocNLI JSON split into a list of InputExample objects.

    The file is a JSON array of objects with 'premise', 'hypothesis' and
    'label' keys.

    Args:
        prefix: file stem; '<prefix>.json' is read from the dataset folder.
        hypo_only: when True only the hypothesis is used (as text_a) and
            text_b is None; otherwise text_a is the premise and text_b the
            hypothesis.

    Returns:
        A list of InputExample objects; every guid is the constant 'ex'.
    """
    filename = '/export/home/Dataset/para_entail_datasets/'+prefix+'.json'
    # Use a context manager so the handle is closed even if parsing fails
    # (it was previously never closed).
    with codecs.open(filename, 'r', 'utf-8') as readfile:
        data = json.load(readfile)

    examples = []
    for dic in data:
        premise = dic.get('premise')
        hypothesis = dic.get('hypothesis')
        label = dic.get('label')
        if hypo_only:
            examples.append(
                InputExample(guid='ex', text_a=hypothesis, text_b=None,
                             label=label))
        else:
            examples.append(
                InputExample(guid='ex', text_a=premise, text_b=hypothesis,
                             label=label))
    return examples
def get_MCTest_examples(prefix, hypo_only=False):
    """Load MCTest entailment pairs from '<prefix>_in_entail.txt'.

    Each useful line has exactly three tab-separated fields:
    label, premise, hypothesis. Lines with another field count, or with an
    empty premise or hypothesis, are skipped. Any label other than
    'entailment' is mapped to 'not_entailment'.

    Args:
        prefix: split name; also used as the guid prefix.
        hypo_only: when True only the hypothesis is used (as text_a) and
            text_b is None; otherwise text_a is the premise and text_b the
            hypothesis.

    Returns:
        A tuple ``(examples, pos_size)`` where ``pos_size`` is the number
        of returned examples labelled 'entailment'.
    """
    path = '/export/home/Dataset/para_entail_datasets/MCTest/'
    filename = path+prefix+'_in_entail.txt'
    print('loading MCTest...', filename)

    guid_id = 0
    pos_size = 0
    examples = []
    # The handle was previously left open; the context manager closes it.
    with codecs.open(filename, 'r', 'utf-8') as readfile:
        for line in readfile:
            # The guid advances for every line, including skipped ones.
            guid_id += 1
            parts = line.strip().split('\t')
            if len(parts) != 3:
                continue
            premise = parts[1]
            hypothesis = parts[2]
            if len(premise) == 0 or len(hypothesis) == 0:
                continue
            label = ('entailment' if parts[0] == 'entailment'
                     else 'not_entailment')
            if label == 'entailment':
                pos_size += 1
            if hypo_only:
                examples.append(
                    InputExample(guid=prefix+str(guid_id), text_a=hypothesis,
                                 text_b=None, label=label))
            else:
                examples.append(
                    InputExample(guid=prefix+str(guid_id), text_a=premise,
                                 text_b=hypothesis, label=label))
    print('MCTest size:', len(examples))
    # NOTE: a previously commented-out experiment down-sampled
    # 'not_entailment' examples for the train split; all examples are
    # returned here.
    return examples, pos_size
def test_mnli_dev_no_contradiction(self):
    """Dev examples from the no-contradiction processor omit contradictions.

    The base MnliProcessor.get_dev_examples is patched to return one
    entailment, one neutral and one contradiction example; only the first
    two are expected back.
    """
    processor = MnliNoContradictionProcessor()
    patch_target = ('crosslangt.nli.dataprep_nli.MnliProcessor'
                    '.get_dev_examples')

    entailment = InputExample('2', 'text c', 'text e', 'entailment')
    neutral = InputExample('3', 'text d', 'text f', 'neutral')
    contradiction = InputExample('1', 'text a', 'text b', 'contradiction')
    stubbed = [entailment, neutral, contradiction]

    with patch(patch_target, return_value=stubbed):
        actual = processor.get_dev_examples('/some/data/dir')
        # All but the contradiction example.
        self.assertListEqual(stubbed[:2], list(actual))
def get_example_from_tensor_dict(self, tensor_dict):
    """Convert one tensor dict into an InputExample.

    Reads the "idx", "sentence1", "sentence2" and "label" entries: the two
    sentences are decoded from UTF-8 bytes and the label is stringified.
    """
    guid = tensor_dict["idx"].numpy()
    first_sentence = tensor_dict["sentence1"].numpy().decode("utf-8")
    second_sentence = tensor_dict["sentence2"].numpy().decode("utf-8")
    label = str(tensor_dict["label"].numpy())
    return InputExample(guid, first_sentence, second_sentence, label)
def _create_examples(self, lines, set_type):
    """Wrap every line in a single-text InputExample with label 1.

    Guids have the form "<set_type>-<index>"; text_b is always None.
    """
    return [
        InputExample(guid="%s-%s" % (set_type, idx), text_a=text,
                     text_b=None, label=1)
        for idx, text in enumerate(lines)
    ]
def __create_examples(self, pmid, text_li, genes, relations):
    """Create one example per ordered gene pair of a tokenised document.

    For each pair ``(g1, g2)`` drawn from ``genes`` (including ``g1 == g2``)
    every token of the form "Gene_<id>" is rewritten: mentions of g1 become
    "Gene_A", mentions of g2 become "Gene_B" (both become "Gene_S" when the
    two genes are the same) and any other gene mention becomes "Gene_N".
    The label is "1" when the pair appears in ``relations`` in either order,
    else "0"; ``self.label_1_count`` / ``self.label_0_count`` are updated
    accordingly. When ``self.testData`` is truthy, a pair whose mirrored
    guid was already emitted is skipped.

    Returns:
        A list of InputExample objects with guid "<pmid>_<g1>_<g2>".
    """
    examples = []
    seen_guids = set()
    for g1 in genes:
        for g2 in genes:
            guid = f"{pmid}_{g1}_{g2}"
            if self.testData and f"{pmid}_{g2}_{g1}" in seen_guids:
                continue

            if (g1, g2) in relations or (g2, g1) in relations:
                label = "1"
                self.label_1_count += 1
            else:
                label = "0"
                self.label_0_count += 1

            if g1 == g2:
                g1_tag = g2_tag = "Gene_S"
            else:
                g1_tag, g2_tag = "Gene_A", "Gene_B"

            # Build the masked token list without touching the input list.
            masked = []
            for word in text_li:
                if word[:5] != "Gene_":
                    masked.append(word)
                    continue
                gene_id = word[5:]
                if gene_id == g1:
                    masked.append(g1_tag)
                elif gene_id == g2:
                    masked.append(g2_tag)
                else:
                    masked.append("Gene_N")

            if self.testData:
                seen_guids.add(guid)
            examples.append(
                InputExample(guid=guid, text_a=" ".join(masked),
                             text_b=None, label=label))
    return examples
def predict(context, replies, tokenizer, model, label_list, args):
    """Score each candidate reply against the context with the model.

    Every reply is encoded together with ``context`` (max length 512) and
    fed through ``model`` as a batch of one.

    Args:
        context: text used as ``text_a`` of every example.
        replies: iterable of candidate replies; each is passed as a
            one-element list in ``text_b``.
        tokenizer: tokenizer providing ``pad_token`` and
            ``convert_tokens_to_ids``.
        model: model called with input_ids / attention_mask /
            token_type_ids; it is switched to eval mode.
        label_list: unused; kept for interface compatibility.
        args: object exposing ``device``.

    Returns:
        A list with, for each reply, the ``np.argmax`` of the flattened
        logits (presumably the predicted class index for a (1, num_labels)
        output; confirm against the model).
    """
    model.eval()
    # Loop-invariant: look up the padding id once.
    pad_token_id = tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0]

    results = []
    # Inference only: skip autograd bookkeeping for every forward pass.
    with torch.no_grad():
        for reply in replies:
            example = InputExample(guid=0, text_a=context, text_b=[reply])
            feature = convert_single_example_to_features(
                example,
                tokenizer,
                max_length=512,
                pad_token=pad_token_id,
                pad_token_segment_id=0,
                mask_padding_with_zero=True)

            inputs = {
                'input_ids': torch.tensor(
                    [feature.input_ids], dtype=torch.long).to(args.device),
                'attention_mask': torch.tensor(
                    [feature.attention_mask],
                    dtype=torch.long).to(args.device),
                'token_type_ids': torch.tensor(
                    [feature.token_type_ids],
                    dtype=torch.long).to(args.device),
            }
            # No labels are passed, so the first output is the logits.
            logits = model(**inputs)[0]
            results.append(np.argmax(logits.detach().cpu().numpy()))
    return results
def load_and_cache_examples(self, x1, x2, task, tokenizer):
    """Build a TensorDataset from two aligned text sequences.

    Every pair ``(t1, t2)`` becomes an InputExample with the placeholder
    label 'negative' and guid "dev_matched-<t1>"; the examples are
    converted to features with a maximum length of 128.

    Args:
        x1: iterable of first texts.
        x2: iterable of second texts, aligned with ``x1``.
        task: key into the module-level ``processors`` / ``output_modes``.
        tokenizer: tokenizer used for feature conversion.

    Returns:
        A TensorDataset of (input_ids, attention_mask, token_type_ids,
        labels).

    Raises:
        KeyError: if the task's output mode is neither "classification"
            nor "regression" (previously an unbound-variable error).
    """
    processor = processors[task]()
    output_mode = output_modes[task]
    label_list = processor.get_labels()

    examples = [
        InputExample(guid="%s-%s" % ('dev_matched', t1), text_a=t1,
                     text_b=t2, label='negative')
        for t1, t2 in zip(x1, x2)
    ]
    features = convert_examples_to_features(
        examples,
        tokenizer,
        label_list=label_list,
        max_length=128,
        output_mode=output_mode,
        pad_on_left=False,  # pad on the left for xlnet
        pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
        pad_token_segment_id=0,
    )

    # Convert to tensors and build the dataset.
    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features],
                                      dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features],
                                      dtype=torch.long)
    if output_mode == "classification":
        all_labels = torch.tensor([f.label for f in features],
                                  dtype=torch.long)
    elif output_mode == "regression":
        all_labels = torch.tensor([f.label for f in features],
                                  dtype=torch.float)
    else:
        # Fail with an explicit error instead of an unbound all_labels.
        raise KeyError(output_mode)

    return TensorDataset(all_input_ids, all_attention_mask,
                         all_token_type_ids, all_labels)
def _create_examples(self, lines, set_type, do_text_b):
    """Creates examples for the training and dev sets.

    The first field of each row is the text and the last field is the
    label. Rows with no fields are reported on stdout and skipped.

    Args:
        lines: iterable of indexable rows.
        set_type: split name used as the guid prefix.
        do_text_b: when truthy every example gets a fixed climate-change
            sentence as text_b; otherwise text_b is None.

    Returns:
        A list of InputExample objects with guids "<set_type>-<row index>".
    """
    # Loop-invariant second segment.
    text_b = ("Climate change and global warming are serious concerns."
              if do_text_b else None)

    examples = []
    for (i, line) in enumerate(lines):
        guid = "%s-%s" % (set_type, i)
        try:
            text_a = line[0]
            label = line[-1]
        except IndexError:
            # Fix: a malformed row used to be printed and then still turned
            # into an example using stale values from the previous row (or
            # crashing on the first row). Report it and skip it instead.
            print(i, line)
            continue
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=text_b,
                         label=label))
    return examples
def get_valid_examples(self, data_dir):
    """Read the machine-translated MultiNLI validation TSV.

    The language is ``self.train_language`` when set, otherwise
    ``self.language``. The first row is treated as a header and skipped.
    Columns 0 and 1 are the two texts and column 2 is the label, with
    "contradictory" normalised to "contradiction".

    Returns:
        A list of InputExample objects with guids "valid-<row index>".
    """
    if self.train_language is not None:
        lg = self.train_language
    else:
        lg = self.language
    tsv_path = os.path.join(
        data_dir, "XNLI-MT-1.0/multinli/multinli.valid.{}.tsv".format(lg))

    rows = enumerate(self._read_tsv(tsv_path))
    next(rows, None)  # drop the header row (index 0)

    examples = []
    for idx, fields in rows:
        text_a, text_b = fields[0], fields[1]
        label = fields[2]
        if label == "contradictory":
            label = "contradiction"
        for text in (text_a, text_b):
            assert isinstance(
                text, str), f"Validation input {text} is not a string"
        assert isinstance(label,
                          str), f"Validation label {label} is not a string"
        examples.append(
            InputExample(guid="%s-%s" % ("valid", idx), text_a=text_a,
                         text_b=text_b, label=label))
    return examples
def _create_examples(self, lines, set_type):
    """Creates examples for the training and dev sets.

    The first row is a header and is skipped. For each remaining row,
    column 0 is an identifier used in the guid, column 1 is the text and
    column 2 is the label. Placeholder markers in the text (e.g.
    '@problem$') are replaced by the matching ``self.*_pattern`` value, and
    the label must be one of ``self.get_labels()``.
    """
    examples = []
    for (i, line) in enumerate(lines):
        if i == 0:
            continue
        guid = "%s-%s-%s" % (str(i), set_type, line[0])

        # Replacements are applied in this exact order.
        substitutions = (
            ('@problem$', self.problem_pattern),
            ('@treatment$', self.treatment_pattern),
            ('@test$', self.test_pattern),
            ("@problem-problem$", self.problem_problem_pattern),
            ("@test-problem$", self.test_problem_pattern),
            ("@test-test$", self.test_test_pattern),
            ('@treatment-test$', self.treatment_test_pattern),
            ('@treatment-treatment$', self.treatment_treatment_pattern),
        )
        text_a = line[1]
        for marker, replacement in substitutions:
            text_a = text_a.replace(marker, replacement)

        label = line[2]
        assert label in self.get_labels()
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
    return examples
def get_train_examples(self, data_dir):
    """See base class.

    Builds the training set from the SNLI train and dev JSONL files under
    ``data_dir`` (both are concatenated), dropping records whose
    'gold_label' is '-'.

    Returns:
        A list of InputExample objects with guids "train-<index>", where
        the index is the record's position after filtering.
    """
    train_data = self.read_data(join(data_dir, 'snli_1.0_train.jsonl'))
    dev_data = self.read_data(join(data_dir, 'snli_1.0_dev.jsonl'))
    lines = [x for x in train_data + dev_data if x['gold_label'] != '-']

    examples = []
    for (i, line) in enumerate(lines):
        # Fix: a leftover TSV header skip (`if i == 0: continue`) used to
        # drop the first real record; JSONL records carry no header row.
        guid = "%s-%s" % ('train', i)
        text_a = line['sentence1']
        text_b = line['sentence2']
        label = line['gold_label']
        assert isinstance(text_a, str) and isinstance(
            text_b, str) and isinstance(label, str)
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=text_b,
                         label=label))
    return examples
def get_example_from_tensor_dict(self, tensor_dict):
    """Build an InputExample from the 'idx', 'sentence1', 'sentence2' and
    'label' entries of a tensor dict (texts are UTF-8 decoded, the label is
    converted to a string)."""
    def decoded(key):
        return tensor_dict[key].numpy().decode('utf-8')

    guid = tensor_dict['idx'].numpy()
    text_a = decoded('sentence1')
    text_b = decoded('sentence2')
    label = str(tensor_dict['label'].numpy())
    return InputExample(guid, text_a, text_b, label)
def create_examples(self, df, set_type):
    """Creates examples for the training and dev sets.

    Column names come from ``self.configs``: key 'input' (default
    ['query', 'text']) gives the text_a column and, optionally, the text_b
    column; key 'output' (default ['label']) gives the label column.
    Rows for which a configured column is missing are reported on stdout
    and skipped.

    Args:
        df: DataFrame holding the data.
        set_type: split name used as the guid prefix.

    Returns:
        A list of InputExample objects with guids
        "<set_type>-<row index label>".
    """
    # Loop-invariant configuration lookups.
    input_columns = self.configs.get('input', ['query', 'text'])
    output_columns = self.configs.get('output', ['label'])

    examples = []
    for idx, row in df.iterrows():
        guid = "%s-%s" % (set_type, idx)
        try:
            text_a = row[input_columns[0]]
            text_b = row[input_columns[1]] if len(input_columns) > 1 else None
            label = row[output_columns[0]]
        except KeyError:
            # Fix: the error used to be printed and an example was still
            # appended using stale or unbound values. Skip the row instead.
            print('No corresponding columns found for config keys {}'.format(input_columns))
            continue
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=text_b,
                         label=label))
    return examples
def load_and_cache_examples(candidates, tokenizer):
    """Turn candidate texts into a TensorDataset for classification.

    Each candidate becomes an unlabeled single-text InputExample whose guid
    is its position. Features are built with a max length of 128 and the
    label list ["0", "1"]. Token type ids and labels are all zeros.

    Returns:
        A TensorDataset of (input_ids, attention_mask, token_type_ids,
        labels).
    """
    max_length = 128
    examples = []
    for position, candidate in enumerate(candidates):
        examples.append(InputExample(guid=str(position), text_a=candidate))

    features = glue_convert_examples_to_features(
        examples,
        tokenizer,
        label_list=["0", "1"],
        max_length=max_length,
        output_mode="classification")

    # Convert to tensors and build the dataset.
    input_ids = torch.tensor([feat.input_ids for feat in features],
                             dtype=torch.long)
    attention_mask = torch.tensor([feat.attention_mask for feat in features],
                                  dtype=torch.long)
    labels = torch.tensor([0 for _ in features], dtype=torch.long)
    token_type_ids = torch.tensor([[0] * max_length for _ in features],
                                  dtype=torch.long)
    return torch.utils.data.TensorDataset(input_ids, attention_mask,
                                          token_type_ids, labels)
def load_and_cache_examples(args, task, tokenizer, evaluate=False):
    """Load (or build and cache) TREC features and return a TensorDataset.

    Features are read from a cache file when it exists and
    ``args.overwrite_cache`` is falsy; otherwise they are built from
    ``trec_dataset`` and saved by the process whose local rank is -1 or 0.
    During distributed training (not evaluation) barriers make the other
    processes wait for the first one to prepare the cache.

    Args:
        args: needs ``local_rank``, ``overwrite_cache``, ``max_seq_length``
            and ``model_type``.
        task: unused here.
        tokenizer: tokenizer used for feature conversion.
        evaluate: selects the test split (True) or the train split (False).

    Returns:
        A TensorDataset of (input_ids, attention_mask, token_type_ids,
        labels).
    """
    on_first_process = args.local_rank in [-1, 0]
    if not on_first_process and not evaluate:
        # Make sure only the first process in distributed training
        # processes the dataset; the others will use the cache.
        torch.distributed.barrier()

    split = 'test' if evaluate else 'train'
    cached_features_file = '/home/ray/transformers/trec/cacha_{}'.format(split)

    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s",
                    cached_features_file)
        features = torch.load(cached_features_file)
    else:
        raw_dataset = trec_dataset('/home/ray/transformers/trec',
                                   train=not evaluate,
                                   test=evaluate)
        examples = [
            InputExample(guid='', text_a=inst['text'], label=inst['label'])
            for inst in raw_dataset
        ]
        pad_token_id = tokenizer.convert_tokens_to_ids(
            [tokenizer.pad_token])[0]
        features = convert_examples_to_features(
            examples,
            tokenizer,
            label_list=['ABBR', 'DESC', 'ENTY', 'HUM', 'LOC', 'NUM'],
            max_length=args.max_seq_length,
            output_mode="classification",
            pad_on_left=False,  # pad on the left for xlnet
            pad_token=pad_token_id,
            pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
        )
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s",
                        cached_features_file)
            torch.save(features, cached_features_file)

    if args.local_rank == 0 and not evaluate:
        # Release the processes waiting at the first barrier.
        torch.distributed.barrier()

    # Convert to tensors and build the dataset.
    def as_long_tensor(values):
        return torch.tensor(values, dtype=torch.long)

    all_input_ids = as_long_tensor([f.input_ids for f in features])
    all_attention_mask = as_long_tensor([f.attention_mask for f in features])
    all_token_type_ids = as_long_tensor([f.token_type_ids for f in features])

    # The output mode is fixed to classification in this function.
    output_mode = "classification"
    label_dtype = {"classification": torch.long,
                   "regression": torch.float}[output_mode]
    all_labels = torch.tensor([f.label for f in features], dtype=label_dtype)

    return TensorDataset(all_input_ids, all_attention_mask,
                         all_token_type_ids, all_labels)
def get_example_from_tensor_dict(self, tensor_dict):
    """See base class.

    Uses the "idx", "question", "sentence" and "label" entries; the two
    texts are UTF-8 decoded and the label is converted to a string.
    """
    guid = tensor_dict["idx"].numpy()
    question = tensor_dict["question"].numpy().decode("utf-8")
    sentence = tensor_dict["sentence"].numpy().decode("utf-8")
    label = str(tensor_dict["label"].numpy())
    return InputExample(guid, question, sentence, label)
def get_example_from_tensor_dict(self, tensor_dict):
    """See base class.

    Uses the "idx", "premise", "hypothesis" and "label" entries; the two
    texts are UTF-8 decoded and the label is converted to a string.
    """
    guid = tensor_dict["idx"].numpy()
    premise = tensor_dict["premise"].numpy().decode("utf-8")
    hypothesis = tensor_dict["hypothesis"].numpy().decode("utf-8")
    label = str(tensor_dict["label"].numpy())
    return InputExample(guid, premise, hypothesis, label)
def _create_pos_examples(self, lines_a, lines_b, set_type):
    """Pair up two aligned line sequences as positive examples.

    The i-th elements of ``lines_a`` and ``lines_b`` become text_a and
    text_b of an InputExample with label 1 and guid "<set_type>-<i>".
    Pairing stops at the shorter sequence.
    """
    return [
        InputExample(guid="%s-%s" % (set_type, idx), text_a=first,
                     text_b=second, label=1)
        for idx, (first, second) in enumerate(zip(lines_a, lines_b))
    ]
def _create_examples(self, lines, set_type):
    """Creates examples for the training and dev sets.

    Every line becomes a single-text InputExample (text_b is None) with
    label 1 and guid "<set_type>-<index>".
    """
    def build(index, text):
        return InputExample(guid="%s-%s" % (set_type, index), text_a=text,
                            text_b=None, label=1)

    return [build(index, text) for index, text in enumerate(lines)]
def create_input_feature(tokenizer, output_mode, example, max_length,
                         mask_padding_with_zero, pad_on_left, pad_token,
                         pad_token_segment_id, label_map):
    """Encode one raw example mapping into padded InputFeatures.

    Args:
        tokenizer: object providing ``encode_plus``.
        output_mode: "classification" (label looked up in ``label_map``) or
            "regression" (label converted to float).
        example: mapping with 'id', 'sentence1', 'label' and optionally
            'sentence2'.
        max_length: target length of all returned sequences.
        mask_padding_with_zero: when truthy, real tokens get mask value 1
            and padding 0; otherwise the values are inverted.
        pad_on_left: pad before (True) or after (False) the tokens.
        pad_token: id used to pad ``input_ids``.
        pad_token_segment_id: id used to pad ``token_type_ids``.
        label_map: mapping from label to class index.

    Returns:
        An InputFeatures instance.

    Raises:
        KeyError: for any other ``output_mode``.
    """
    second_sentence = example['sentence2'] if 'sentence2' in example else None
    example = InputExample(example['id'], example['sentence1'],
                           second_sentence, example['label'])

    encoded = tokenizer.encode_plus(
        example.text_a,
        example.text_b,
        add_special_tokens=True,
        max_length=max_length,
        # We're truncating the first sequence in priority.
        truncation_strategy='only_first',
    )
    input_ids = encoded["input_ids"]
    token_type_ids = encoded["token_type_ids"]

    # Real tokens are attended to; padding tokens are not.
    real_flag, pad_flag = (1, 0) if mask_padding_with_zero else (0, 1)
    attention_mask = [real_flag] * len(input_ids)

    # Pad every sequence up to max_length on the requested side.
    padding_length = max_length - len(input_ids)
    ids_padding = [pad_token] * padding_length
    mask_padding = [pad_flag] * padding_length
    segment_padding = [pad_token_segment_id] * padding_length
    if pad_on_left:
        input_ids = ids_padding + input_ids
        attention_mask = mask_padding + attention_mask
        token_type_ids = segment_padding + token_type_ids
    else:
        input_ids = input_ids + ids_padding
        attention_mask = attention_mask + mask_padding
        token_type_ids = token_type_ids + segment_padding

    for sequence in (input_ids, attention_mask, token_type_ids):
        assert len(sequence) == max_length, \
            "Error with input length {} vs {}".format(len(sequence),
                                                      max_length)

    if output_mode == "classification":
        label = label_map[example.label]
    elif output_mode == "regression":
        label = float(example.label)
    else:
        raise KeyError(output_mode)

    return InputFeatures(input_ids=input_ids,
                         attention_mask=attention_mask,
                         token_type_ids=token_type_ids,
                         label=label)
def get_ANLI_examples(prefix, hypo_only=False):
    """Load the three ANLI rounds (R1, R2, R3) as binary entailment pairs.

    For each round '<round>/<prefix>.jsonl' is read. Label 'e' maps to
    'entailment' and anything else to 'not_entailment'. Records with an
    empty context or hypothesis are skipped, but still advance the guid.

    Args:
        prefix: split file stem.
        hypo_only: when True only the hypothesis is used (as text_a) and
            text_b is None; otherwise text_a is the context and text_b the
            hypothesis.

    Returns:
        A tuple ``(examples, pos_size)`` where ``pos_size`` counts the
        returned 'entailment' examples.
    """
    path = '/export/home/Dataset/para_entail_datasets/ANLI/anli_v0.1/'
    examples = []
    guid_id = pos_size = neg_size = 0

    for folder in ('R1', 'R2', 'R3'):
        filename = path+folder+'/'+prefix+'.jsonl'
        print('loading ANLI...', filename)
        with open(filename, 'r') as f:
            for record in json_lines.reader(f):
                guid_id += 1
                premise = record.get('context')
                hypothesis = record.get('hypothesis')
                if len(premise) == 0 or len(hypothesis) == 0:
                    continue

                if record.get('label') == 'e':
                    label = 'entailment'
                    pos_size += 1
                else:
                    label = 'not_entailment'
                    neg_size += 1

                if hypo_only:
                    text_a, text_b = hypothesis, None
                else:
                    text_a, text_b = premise, hypothesis
                examples.append(
                    InputExample(guid=str(guid_id), text_a=text_a,
                                 text_b=text_b, label=label))

    print('>>pos:neg: ', pos_size, neg_size)
    print('ANLI size:', len(examples))
    # NOTE: a disabled experiment once down-sampled 'not_entailment'
    # examples for the train split; all examples are returned.
    return examples, pos_size
def deal_with_block(block_line_list, filter_label_set, hypo_only=False):
    """Convert one document block into entailment examples.

    The first line must start with 'document>>' and hold the premise in its
    third tab-separated field. Every following non-empty line with exactly
    three tab-separated fields is a hypothesis:
    ``positive>>``/``negative>>``, a filter label, and the hypothesis text.
    Positive hypotheses whose filter label is in ``filter_label_set`` are
    dropped; negative hypotheses are never filtered. Empty hypotheses are
    dropped in both cases.

    Args:
        block_line_list: lines of the block, header line first.
        filter_label_set: filter labels that exclude a positive hypothesis.
        hypo_only: when True only the hypothesis is used (as text_a) and
            text_b is None; otherwise text_a is the premise and text_b the
            hypothesis.

    Returns:
        A tuple ``(examples, pos_count, neg_count)``; positives come first
        in ``examples``. An empty or malformed block (no lines, wrong
        header, fewer than three header fields, empty premise) yields
        ``([], 0, 0)``.
    """
    # Guard against an empty block, which used to raise IndexError.
    if not block_line_list or not block_line_list[0].startswith('document>>'):
        return [], 0, 0

    first_line_parts = block_line_list[0].strip().split('\t')
    # Guard against a header without a premise field (was an IndexError).
    if len(first_line_parts) < 3:
        return [], 0, 0
    premise = first_line_parts[2].strip()
    if len(premise) == 0:
        return [], 0, 0

    pos_hypo_list = []
    neg_hypo_list = []
    for line in block_line_list[1:]:
        stripped = line.strip()
        if not stripped:
            continue
        parts = stripped.split('\t')
        if len(parts) != 3:
            continue
        filter_label = parts[1].strip()
        hypo = parts[2].strip()
        if parts[0] == 'positive>>':
            # Harsh version: drop positives carrying a filtered label.
            if filter_label not in filter_label_set and len(hypo) > 0:
                pos_hypo_list.append(hypo)
        elif parts[0] == 'negative>>':
            # Negative summaries need no filtering in train, dev or test.
            if len(hypo) > 0:
                neg_hypo_list.append(hypo)

    examples = []
    labelled_groups = (('entailment', pos_hypo_list),
                       ('not_entailment', neg_hypo_list))
    for label, hypo_list in labelled_groups:
        for hypo in hypo_list:
            if hypo_only:
                examples.append(
                    InputExample(guid='ex', text_a=hypo, text_b=None,
                                 label=label))
            else:
                examples.append(
                    InputExample(guid='ex', text_a=premise, text_b=hypo,
                                 label=label))
    return examples, len(pos_hypo_list), len(neg_hypo_list)
def _create_examples(self, lines, set_type, LABELS=False):
    """Creates examples for the training and dev sets.

    Column 0 of each row is the question (text_a) and column 2 the response
    (text_b). When ``LABELS`` is truthy the last column is parsed as an
    integer and mapped to "1" if it is positive, else "0"; otherwise the
    example is created without a label argument.
    """
    examples = []
    for (i, line) in enumerate(lines):
        fields = {
            "guid": "%s-%s" % (set_type, str(i)),
            "text_a": line[0],  # question
            "text_b": line[2],  # response
        }
        if LABELS:
            fields["label"] = "1" if int(line[-1]) > 0 else "0"
        examples.append(InputExample(**fields))
    return examples