def get_binary_experimental_setup():
    """Build SBERT training examples and a binary-classification evaluator.

    Merges the "items" and "domains" example sets, wraps each training
    triple in an InputExample, and constructs a
    BinaryClassificationEvaluator from the validation triples.

    Returns:
        tuple: (list of InputExample, BinaryClassificationEvaluator)
    """
    item_train, item_valid = extract_examples("items")
    domain_train, domain_valid = extract_examples("domains")

    # Pool both sources into single train / valid collections.
    raw_train = item_train + domain_train
    raw_valid = item_valid + domain_valid

    print(
        f"{len(raw_train)} training examples to {len(raw_valid)} valid examples"
    )

    # Convert (sent1, sent2, label) triples into sentence-transformers format.
    train_examples = []
    for first, second, target in raw_train:
        train_examples.append(InputExample(texts=[first, second], label=target))

    # The evaluator expects three parallel sequences: sentences1, sentences2, labels.
    sentences1, sentences2, labels = zip(*raw_valid)
    evaluator = evaluation.BinaryClassificationEvaluator(
        sentences1, sentences2, labels, batch_size=128)

    return train_examples, evaluator
def run():
    """Fine-tune a SentenceTransformer bi-encoder on tab-separated sentence pairs.

    Reads the training file named in config, performs a stratified 90/10
    train/validation split, builds a train DataLoader plus a
    BinaryClassificationEvaluator, assembles a Transformer -> Pooling ->
    Dense model, and delegates the training loop to engine.train.
    """
    # All hyper-parameters / paths come from the shared config module.
    training_file = config.TRAINING_FILE
    train_batch_size = config.TRAIN_BATCH_SIZE
    valid_batch_size = config.VALID_BATCH_SIZE
    bert_path = config.BERT_PATH
    max_seq_len = config.MAX_LEN

    frame = pd.read_csv(training_file, sep="\t",
                        names=['idx', 'sent1', 'sent2', 'label'])
    frame['label'] = pd.to_numeric(frame["label"], downcast='float')

    # Stratified split with a fixed seed for reproducibility.
    df_train, df_valid = model_selection.train_test_split(
        frame,
        test_size=0.1,
        random_state=42,
        stratify=frame.label.values,
    )
    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    reader = dataset.Dataset()
    train_dataset = reader.read(df_train, return_pt=True)
    valid_sent1, valid_sent2, valid_labels = reader.read(df_valid)

    train_dataloader = DataLoader(train_dataset, shuffle=True,
                                  batch_size=train_batch_size)
    evaluator = evaluation.BinaryClassificationEvaluator(
        valid_sent1, valid_sent2, valid_labels,
        batch_size=valid_batch_size, show_progress_bar=False)

    # Model pipeline: transformer backbone -> mean pooling -> dense + Tanh.
    word_embedding_model = models.Transformer(bert_path,
                                              max_seq_length=max_seq_len)
    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension())
    dense_model = models.Dense(
        in_features=pooling_model.get_sentence_embedding_dimension(),
        out_features=max_seq_len,
        activation_function=nn.Tanh())
    model = SentenceTransformer(
        modules=[word_embedding_model, pooling_model, dense_model])

    train_loss = losses.CosineSimilarityLoss(model)
    engine.train(train_dataloader, model, train_loss, evaluator)
def run():
    """Evaluate a saved SentenceTransformer on the configured test pair file."""
    test_file = config.TEST_FILE
    batch_size = config.TEST_BATCH_SIZE
    model_dir = config.MODEL_SAVE_PATH

    frame = pd.read_csv(test_file, sep='\t',
                        names=['idx', 'sent1', 'sent2', 'label'])
    frame['label'] = pd.to_numeric(frame['label'], downcast='float')

    # Read the pairs and build the cosine-similarity-based evaluator.
    sent1, sent2, labels = dataset.Dataset().read(frame)
    evaluator = evaluation.BinaryClassificationEvaluator(
        sent1, sent2, labels,
        batch_size=batch_size, show_progress_bar=True)

    # Load the fine-tuned model from disk and run the evaluation.
    model = SentenceTransformer(model_dir)
    model.evaluate(evaluator)
###### Classification ###### # Given (quesiton1, question2), is this a duplicate or not? # The evaluator will compute the embeddings for both questions and then compute # a cosine similarity. If the similarity is above a threshold, we have a duplicate. dev_sentences1 = [] dev_sentences2 = [] dev_labels = [] with open(os.path.join(dataset_path, "classification/dev_pairs.tsv"), encoding='utf8') as fIn: reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE) for row in reader: dev_sentences1.append(row['question1']) dev_sentences2.append(row['question2']) dev_labels.append(int(row['is_duplicate'])) binary_acc_evaluator = evaluation.BinaryClassificationEvaluator( dev_sentences1, dev_sentences2, dev_labels) evaluators.append(binary_acc_evaluator) ###### Duplicate Questions Mining ###### # Given a large corpus of questions, identify all duplicates in that corpus. # For faster processing, we limit the development corpus to only 10,000 sentences. max_dev_samples = 10000 dev_sentences = {} dev_duplicates = [] with open(os.path.join(dataset_path, "duplicate-mining/dev_corpus.tsv"), encoding='utf8') as fIn: reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE) for row in reader: dev_sentences[row['qid']] = row['question']
def main():
    """Parse CLI arguments and fine-tune an SBERT bi-encoder.

    Derives train/eval CSV paths from --dataset and --mask_method, builds a
    Transformer -> Pooling -> Dense SentenceTransformer, and trains it with a
    2-label SoftmaxLoss, evaluating with a BinaryClassificationEvaluator.
    The trained model is written under ./trained_models/.
    """
    parser = argparse.ArgumentParser(description='Start training with SBERT')
    parser.add_argument('--model_path', type=str,
                        help='Path to trained model folder ./models/[MODEL_NAME]')
    parser.add_argument('--dataset', type=str, default='few_rel',
                        help='Name dataset')
    parser.add_argument('--mask_method', type=str, default='bracket',
                        help='Type of masking')
    parser.add_argument('--num_epochs', type=int, default=15,
                        help='Number epochs')
    parser.add_argument('--num_samples', type=int, default=-1,
                        help='Number of samples for test run, default -1 means all data')
    parser.add_argument('--max_seq_length', type=int, default=256,
                        help='Max token length for BERT')
    # Generalization: the training batch size was previously hard-coded to 16;
    # the default keeps the old behavior so existing invocations are unchanged.
    parser.add_argument('--batch_size', type=int, default=16,
                        help='Training batch size (default 16)')
    args = parser.parse_args()

    model_path = args.model_path
    dataset = args.dataset
    mask_method = args.mask_method
    num_samples = args.num_samples
    max_seq_length = args.max_seq_length
    num_epochs = args.num_epochs
    evaluation_steps = 1000  # Frequency of evaluation results
    warmup_steps = 1000  # warm up steps
    sentence_out_embedding_dimension = 256

    # Normalise the model path and derive the model name from its last segment.
    if model_path.endswith('/'):
        model_path = model_path[:-1]
    model_name = model_path.split('/')[-1]

    path_train_data = f'./data/train_samples/{dataset}_train_{mask_method}_train.csv'
    path_eval_data = f'./data/train_samples/{dataset}_val_{mask_method}_test.csv'

    # A positive --num_samples marks a reduced test run; keep its output separate.
    if num_samples > 0:
        model_save_path = f'./trained_models/{model_name}_sbert_bi_{dataset}_test/'
    else:
        model_save_path = f'./trained_models/{model_name}_sbert_bi_{dataset}/'

    ### Define the model
    word_embedding_model = models.Transformer(model_path,
                                              max_seq_length=max_seq_length)

    ### Add special tokens - this helps us add tokens like Doc or query or Entity1 / Entity2
    # but in our case we already added that to the model prior
    #tokens = ["[DOC]", "[QRY]"]
    #word_embedding_model.tokenizer.add_tokens(tokens, special_tokens=True)
    #word_embedding_model.auto_model.resize_token_embeddings(len(word_embedding_model.tokenizer))

    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension())
    dense_model = models.Dense(
        in_features=pooling_model.get_sentence_embedding_dimension(),
        out_features=sentence_out_embedding_dimension,
        activation_function=nn.Tanh())

    # Model pipeline
    model = SentenceTransformer(
        modules=[word_embedding_model, pooling_model, dense_model])

    # Prep DataLoader
    train_examples = load_train_sbert(path_train_data, num_samples)
    train_dataloader = DataLoader(train_examples, shuffle=True,
                                  batch_size=args.batch_size)

    # Prep Evaluator (binary classification on embedding similarity)
    sentences1, sentences2, scores = load_eval_sbert(path_eval_data, num_samples)
    evaluator = evaluation.BinaryClassificationEvaluator(
        sentences1, sentences2, scores)

    # Softmax (cross-entropy) loss over the two duplicate / not-duplicate labels.
    train_loss = losses.SoftmaxLoss(
        model,
        sentence_embedding_dimension=sentence_out_embedding_dimension,
        num_labels=2)

    # Tune the model
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=evaluator,
              epochs=num_epochs,
              evaluation_steps=evaluation_steps,
              warmup_steps=warmup_steps,
              output_path=model_save_path)
# NOTE(review): this chunk begins mid-expression — the dangling ')' below closes
# a models.Pooling(...) call whose opening lies outside this view.
word_embedding_model.get_word_embedding_dimension())
# Bi-encoder: transformer backbone + pooling (no dense head in this variant).
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
print("SentenceTransformer model created")
train_loss = losses.MultipleNegativesRankingLoss(model)

# Set up a set of different performance evaluators
evaluators = []

###### Classification ######
# Given (question1, question2), is this a duplicate or not?
# The evaluator will compute the embeddings for both questions and then compute
# a cosine similarity. If the similarity is above a threshold, we have a duplicate.
binary_acc_evaluator = evaluation.BinaryClassificationEvaluator(
    dev_seqs1, dev_seqs2, dev_labels)
evaluators.append(binary_acc_evaluator)
# Second evaluator over the training pairs (e.g. to watch for overfitting).
binary_acc_evaluator = evaluation.BinaryClassificationEvaluator(
    train_seqs1, train_seqs2, train_labels)
evaluators.append(binary_acc_evaluator)
logging.info("binary acc evaluator added")

dev_seq_dict = {}
dev_duplicates = []
# create dict of id:seq
#for i in range(len(dev_seqs1)):
# NOTE(review): only the first 5 pairs are indexed — looks like a debug
# truncation of the commented-out full loop above; confirm before shipping.
for i in range(0, 5):
    dev_seq_dict[dev_ids1[i]] = dev_seqs1[i]
    dev_seq_dict[dev_ids2[i]] = dev_seqs2[i]
def BertEM(path_train, path_valid, path_test, path_error, epochs_num,
           warmup_steps_num, evaluation_steps_num):
    """Fine-tune a sentence-transformer for entity matching and sweep thresholds.

    Trains on (text_a, text_b, label) pairs from ``path_train`` with a cosine
    similarity loss, validates with a BinaryClassificationEvaluator built from
    ``path_valid``, then scores every ``path_test`` pair by cosine similarity.
    Pairs misclassified at a fixed 0.80 cutoff are written to ``path_error``,
    and accuracy/recall/F1 are printed for thresholds 0.20, 0.22, ... while
    tracking the best F1.

    :param path_train: CSV with columns text_a, text_b, label (training pairs)
    :param path_valid: CSV with the same columns (validation pairs)
    :param path_test: CSV with the same columns (test pairs)
    :param path_error: output CSV path for misclassified test pairs
    :param epochs_num: number of training epochs
    :param warmup_steps_num: LR warm-up steps passed to model.fit
    :param evaluation_steps_num: evaluation frequency during training
    """
    # Progress-bar module alias (kept from the original code).
    bar = progressbar

    # Define the model.
    # NOTE(review): the device is hard-coded; confirm 'cuda:6' exists on the host.
    #model = SentenceTransformer('bert-large-nli-stsb-mean-tokens', device='cuda:1')
    model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens',
                                device='cuda:6')
    #model = SentenceTransformer('roberta-large-nli-stsb-mean-tokens', device='cuda:2')

    data_type = {"text_a": str, "text_b": str}
    train_data = pd.read_csv(path_train, encoding='utf-8', dtype=data_type)
    valid_data = pd.read_csv(path_valid, encoding='utf-8', dtype=data_type)
    test_data = pd.read_csv(path_test, encoding='utf-8', dtype=data_type)

    # Training set: wrap every pair in an InputExample with a float label.
    # (The original throttled each iteration with time.sleep(0.0001) purely so
    # the progress bar could render; removed since it only slowed processing.)
    train_examples = []
    for i in bar.progressbar(range(len(train_data))):
        text_a = str(train_data.iloc[i]['text_a'])
        text_b = str(train_data.iloc[i]['text_b'])
        label_data = float(train_data.iloc[i]['label'])
        train_examples.append(
            InputExample(texts=[text_a, text_b], label=label_data))

    # Validation set: three parallel lists for the evaluator.
    sentence_a = []
    sentence_b = []
    label_valid = []
    for i in bar.progressbar(range(len(valid_data))):
        sentence_a.append(valid_data.iloc[i]['text_a'])
        sentence_b.append(valid_data.iloc[i]['text_b'])
        label_valid.append(float(valid_data.iloc[i]['label']))

    # Define the evaluator: binary classification on cosine similarity.
    evaluator = evaluation.BinaryClassificationEvaluator(
        sentence_a, sentence_b, label_valid)

    # Define dataset, dataloader and loss.
    train_dataset = SentencesDataset(train_examples, model)
    train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=16)
    train_loss = losses.CosineSimilarityLoss(model)

    # Measure wall-clock training time.
    # time.clock() was removed in Python 3.8; perf_counter is the replacement.
    start_time = time.perf_counter()
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              epochs=epochs_num,
              warmup_steps=warmup_steps_num,
              evaluator=evaluator,
              evaluation_steps=evaluation_steps_num,
              use_amp=True)
    end_time = time.perf_counter()

    # ====================== Evaluation on the test set ======================
    # Re-read the test file and force both text columns to str.
    test_data = pd.read_csv(path_test, encoding='utf-8')
    test_data['text_a'] = test_data['text_a'].map(str)
    test_data['text_b'] = test_data['text_b'].map(str)

    # One prediction list per threshold step (filled by compute_pred).
    list_num = 40
    prefix = 'pred_list_'
    test_map = {prefix + str(i): [] for i in range(list_num)}
    label_list = []
    score = 0.20
    # Collect misclassified rows and build the DataFrame once at the end:
    # DataFrame.append-in-a-loop was quadratic and removed in pandas 2.0.
    error_rows = []

    for i in bar.progressbar(range(len(test_data))):
        text_a_embedding = model.encode(test_data.iloc[i]['text_a'],
                                        convert_to_tensor=True)
        text_b_embedding = model.encode(test_data.iloc[i]['text_b'],
                                        convert_to_tensor=True)
        cos_scores = util.pytorch_cos_sim(text_a_embedding, text_b_embedding)[0]
        cos_scores = cos_scores.cpu()

        # Ground-truth label list.
        label = test_data.iloc[i]['label']
        label_list.append(int(label))

        # Record pairs the fixed 0.80 cutoff gets wrong.
        pred_test = 1 if cos_scores >= 0.80 else 0
        if pred_test != label:
            error_rows.append({
                'id': i,
                'text_a': test_data.iloc[i]['text_a'],
                'text_b': test_data.iloc[i]['text_b'],
                'cos_scores': cos_scores,
            })

        # Fill the per-threshold prediction lists.
        compute_pred(score, cos_scores, prefix, test_map)

    error_csv = pd.DataFrame(error_rows,
                             columns=['id', 'text_a', 'text_b', 'cos_scores'])
    error_csv.to_csv(path_error, index=False)  # was index=0; False is explicit

    # Sweep thresholds 0.20, 0.22, ... and keep the settings with the best F1.
    max_f1 = 0
    target_threshold = 0.01
    target_accuracy = 0.01
    target_recall = 0.01
    threshold = 0.20
    for i in range(list_num):
        accuracy, recall, f1 = compute_score(test_map[prefix + str(i)],
                                             label_list)
        if f1 >= max_f1:
            max_f1 = f1
            target_threshold = threshold
            target_accuracy = accuracy
            target_recall = recall
        print('The score > {} result is accuracy: {}, | recall:{}, | f1: {}'.
              format(round(threshold, 2), accuracy, recall, f1))
        threshold += 0.02

    # Summary. The original printed `path_a`, which is undefined in this scope
    # (NameError); the test-set path identifies the dataset instead.
    print('================dataset_name==================', path_test)
    print(
        '================threshold:{}, target_accuracy:{}, target_recall:{}, max_f1:{}'
        .format(target_threshold, target_accuracy, target_recall, max_f1))
    print('================train_time:{}'.format(str(end_time - start_time)))
def train(self, train_df, eval_df):
    """Fine-tune the sentence-pair model and calibrate its decision threshold.

    Trains with either MultipleNegativesRankingLoss or OnlineContrastiveLoss
    (per self.args.loss_func), evaluates during training with a
    BinaryClassificationEvaluator, then reads the evaluator's CSV output to
    pick the threshold that maximizes self.score_type.

    :param train_df: dataframe with columns 'text_a', 'text_b', 'labels'
    :param eval_df: dataframe with columns 'text_a', 'text_b', 'labels'
    :return: the selected threshold (also stored on self.threshold)
    :raises KeyError: if either dataframe is missing a required column
    """
    # format training data
    if "text_a" in train_df.columns and "text_b" in train_df.columns and "labels" in train_df.columns:
        if self.args.do_lower_case:
            train_df.loc[:, 'text_a'] = train_df['text_a'].str.lower()
            train_df.loc[:, 'text_b'] = train_df['text_b'].str.lower()
        # One InputExample per row; the row index doubles as the example guid.
        train_examples = [
            InputExample(str(i), texts=[text_a, text_b], label=label)
            for i, (text_a, text_b, label) in enumerate(
                zip(
                    train_df["text_a"].astype(str),
                    train_df["text_b"].astype(str),
                    train_df["labels"].astype(int),
                ))
        ]
    else:
        raise KeyError(
            'Training data processing - Required columns not found!')

    # format evaluation data
    # Bug fix: the original checked train_df for 'text_a'/'text_b' here, so a
    # malformed eval_df could slip through (or a valid one be rejected).
    if "text_a" in eval_df.columns and "text_b" in eval_df.columns and "labels" in eval_df.columns:
        if self.args.do_lower_case:
            eval_df.loc[:, 'text_a'] = eval_df['text_a'].str.lower()
            eval_df.loc[:, 'text_b'] = eval_df['text_b'].str.lower()
        evaluator = evaluation.BinaryClassificationEvaluator(
            list(eval_df["text_a"]),
            list(eval_df["text_b"]),
            list(eval_df["labels"].astype(int)),
            batch_size=self.args.eval_batch_size)
    else:
        raise KeyError(
            'Evaluation data processing - Required columns not found!')

    # Define train dataset, the dataloader and the train loss
    train_dataloader = DataLoader(train_examples,
                                  shuffle=True,
                                  batch_size=self.args.train_batch_size)
    # (A None loss_func can never equal the string, so the extra
    # `is not None` guard in the original was redundant.)
    if self.args.loss_func == 'MultipleNegativesRankingLoss':
        train_loss = losses.MultipleNegativesRankingLoss(self.model)
    else:
        distance_metric = losses.SiameseDistanceMetric.COSINE_DISTANCE
        train_loss = losses.OnlineContrastiveLoss(
            model=self.model,
            distance_metric=distance_metric,
            margin=self.args.margin)

    # Tune the model
    self.model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        epochs=self.args.num_train_epochs,
        warmup_steps=self.args.warmup_steps,
        optimizer_params={'lr': self.args.learning_rate},
        weight_decay=self.args.weight_decay,
        evaluator=evaluator,
        evaluation_steps=self.args.evaluate_during_training_steps,
        max_grad_norm=self.args.max_grad_norm,
        output_path=self.args.best_model_dir,
        show_progress_bar=self.args.show_progress_bar)

    # Pick the threshold from the best-scoring evaluation row.
    evaluation_file = os.path.join(self.args.best_model_dir,
                                   evaluator.csv_file)
    eval_results_df = pd.read_csv(evaluation_file)
    eval_results_df.sort_values(self.score_type,
                                inplace=True,
                                ascending=False,
                                ignore_index=True)
    self.threshold = eval_results_df.loc[0, self.threshold_type]
    print(
        f'Set model threshold to {self.threshold} acquiring a {self.score_type} of {eval_results_df.loc[0, self.score_type]}'
    )
    return self.threshold