reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
for row in reader:
    train_samples_ContrastiveLoss.append(
        InputExample(texts=[row['question1'], row['question2']],
                     label=int(row['is_duplicate'])))
    if row['is_duplicate'] == '1':
        train_samples_MultipleNegativesRankingLoss.append(
            InputExample(texts=[row['question1'], row['question2']], label=1))
        # If A is a duplicate of B, then B is a duplicate of A
        train_samples_MultipleNegativesRankingLoss.append(
            InputExample(texts=[row['question2'], row['question1']], label=1))

# Create data loader and loss for MultipleNegativesRankingLoss
train_dataset_MultipleNegativesRankingLoss = SentencesDataset(
    train_samples_MultipleNegativesRankingLoss, model=model)
train_dataloader_MultipleNegativesRankingLoss = DataLoader(
    train_dataset_MultipleNegativesRankingLoss,
    shuffle=True,
    batch_size=train_batch_size)
train_loss_MultipleNegativesRankingLoss = losses.MultipleNegativesRankingLoss(model)

# Create data loader and loss for OnlineContrastiveLoss
train_dataset_ContrastiveLoss = SentencesDataset(train_samples_ContrastiveLoss,
                                                 model=model)
train_dataloader_ContrastiveLoss = DataLoader(train_dataset_ContrastiveLoss,
                                              shuffle=True,
                                              batch_size=train_batch_size)
train_loss_ContrastiveLoss = losses.OnlineContrastiveLoss(
    model=model, distance_metric=distance_metric, margin=margin)
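# The two objectives above can be trained jointly: SentenceTransformer.fit
# accepts multiple (dataloader, loss) pairs and alternates between them.
# A minimal sketch; num_epochs, warmup_steps, and model_save_path are assumed
# to be defined elsewhere in the script.
model.fit(train_objectives=[
              (train_dataloader_MultipleNegativesRankingLoss,
               train_loss_MultipleNegativesRankingLoss),
              (train_dataloader_ContrastiveLoss, train_loss_ContrastiveLoss)],
          epochs=num_epochs,
          warmup_steps=warmup_steps,
          output_path=model_save_path)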
def get_dataset2(data_pars=None, model=None, **kw):
    """ JSON data_pars to get dataset
    "data_pars": {
        "data_path": "dataset/GOOG-year.csv",
        "data_type": "pandas",
        "size": [0, 0, 6],
        "output_size": [0, 6]
    },
    """
    # data_path = path_norm(data_pars["data_path"])
    istrain = data_pars.get("is_train", 0)
    mode = "train" if istrain else "test"
    data_type = data_pars[f"{mode}_type"].lower()

    def get_reader(data_type, path):
        if data_type == 'nli':
            Reader = readers.NLIDataReader
        elif data_type == 'sts':
            Reader = readers.STSDataReader
        else:
            Reader = MyCustomReader  # user-supplied reader class for other data types
        return Reader(path)

    def get_filename(mode='test'):
        if mode == 'train':
            fname = 'train.gz' if data_pars["train_type"].lower() == 'nli' else 'sts-train.csv'
        else:
            fname = 'dev.gz' if data_pars["test_type"].lower() == 'nli' else 'sts-dev.csv'
        return fname

    log("############ Dataloader setup #############################")
    if istrain:
        train_fname = get_filename(mode='train')
        train_reader = get_reader(data_type, data_pars['train_path'])
        train_data = SentencesDataset(train_reader.get_examples(train_fname),
                                      model=model.model)
        train_dataloader = DataLoader(train_data, shuffle=True,
                                      batch_size=data_pars["batch_size"])

        val_fname = get_filename(mode='test')
        val_reader = get_reader(data_type, data_pars['test_path'])
        val_data = SentencesDataset(val_reader.get_examples(val_fname),
                                    model=model.model)
        val_dataloader = DataLoader(val_data, shuffle=True,
                                    batch_size=data_pars["batch_size"])

        pars = {"train_num_labels": train_reader.get_num_labels()}
        return train_dataloader, val_dataloader, pars

    else:  # Inference part
        val_reader = get_reader(data_type, data_pars['test_path'])
        pars = {"train_fname": 'train.gz' if data_pars["train_type"].lower() == 'nli'
                               else 'sts-train.csv'}
        return val_reader, pars
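# Hypothetical MyCustomReader for the fallback branch above. The class name and
# the tab-separated file format are assumptions; sentence-transformers only
# requires get_examples() to return InputExample objects (plus get_num_labels()
# when a softmax head is used). os and InputExample are assumed imported above.
class MyCustomReader:
    def __init__(self, dataset_folder):
        self.dataset_folder = dataset_folder

    def get_examples(self, filename):
        examples = []
        with open(os.path.join(self.dataset_folder, filename), encoding='utf-8') as fIn:
            for line in fIn:
                s1, s2, label = line.rstrip('\n').split('\t')
                examples.append(InputExample(texts=[s1, s2], label=float(label)))
        return examples

    def get_num_labels(self):
        return 1  # regression-style labels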
# Use RoBERTa for mapping tokens to embeddings
word_embedding_model = models.RoBERTa('roberta-large')

# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Convert the dataset to a DataLoader ready for training
logging.info("Read AllNLI train dataset")
train_data = SentencesDataset(nli_reader.get_examples('train.gz'), model=model)
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
train_loss = losses.SoftmaxLoss(
    model=model,
    sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
    num_labels=train_num_labels)

logging.info("Read STSbenchmark dev dataset")
dev_data = SentencesDataset(
    examples=sts_reader.get_examples('sts-train-dev.csv'), model=model)
dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=batch_size)
evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)

# Configure the training
num_epochs = 1
word_embedding_model = models.RoBERTa('roberta-base', do_lower_case=False)
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False)
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Load Old Model
# model = SentenceTransformer('checkpoints/sentence_transformers/roberta-base_v7_triplet_epoch1')

triplet_reader = TripletReader('data/v7')

logging.info("Read Train dataset")
train_data = SentencesDataset(
    examples=triplet_reader.get_examples('triplet_train_full.csv'), model=model)
train_dataloader = DataLoader(train_data, shuffle=True,
                              batch_size=train_batch_size)
train_loss = losses.TripletLoss(model=model, triplet_margin=1.0)

logging.info("Read Dev dataset")
dev_data = SentencesDataset(
    examples=triplet_reader.get_examples('triplet_dev.csv'), model=model)
dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=eval_batch_size)
evaluator = TripletEvaluator(dev_dataloader)

# Train the model
for line in lines:
    s1, s2, score = line.split('\t')
    score = float(score.strip()) / 5.0  # normalize 0..5 gold scores to 0..1
    test_samples.append(InputExample(texts=[s1, s2], label=score))

with open('./KorNLUDatasets/KorSTS/tune_train.tsv', 'rt', encoding='utf-8') as fIn:
    lines = fIn.readlines()
    for line in lines:
        s1, s2, score = line.split('\t')
        score = float(score.strip()) / 5.0
        train_samples.append(InputExample(texts=[s1, s2], label=score))

train_dataset = SentencesDataset(train_samples, model)
train_dataloader = DataLoader(train_dataset, shuffle=True,
                              batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)

logging.info("Read STSbenchmark dev dataset")
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples,
                                                             name='sts-dev')

# Configure the training
warmup_steps = math.ceil(
    len(train_dataset) * num_epochs / train_batch_size * 0.1)  # 10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))
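# The snippet above assumes dev_samples already exists. A minimal sketch of how
# it could be built from KorSTS, mirroring the train/test parsing above; the
# dev file name ('sts-dev.tsv') is an assumption.
dev_samples = []
with open('./KorNLUDatasets/KorSTS/sts-dev.tsv', 'rt', encoding='utf-8') as fIn:
    for line in fIn:
        s1, s2, score = line.split('\t')
        dev_samples.append(InputExample(texts=[s1, s2],
                                        label=float(score.strip()) / 5.0))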
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Convert the dataset to a DataLoader ready for training
logging.info("Read AllNLI train dataset")
label2int = {"contradiction": 0, "entailment": 1, "neutral": 2}
train_nli_samples = []
with gzip.open(nli_dataset_path, 'rt', encoding='utf8') as fIn:
    reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
    for row in reader:
        if row['split'] == 'train':
            label_id = label2int[row['label']]
            train_nli_samples.append(
                InputExample(texts=[row['sentence1'], row['sentence2']],
                             label=label_id))

train_data_nli = SentencesDataset(train_nli_samples, model=model)
train_dataloader_nli = DataLoader(train_data_nli, shuffle=True,
                                  batch_size=batch_size)
train_loss_nli = losses.SoftmaxLoss(
    model=model,
    sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
    num_labels=len(label2int))

logging.info("Read STSbenchmark train dataset")
train_sts_samples = []
dev_sts_samples = []
test_sts_samples = []
with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn:
    reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
    for row in reader:
# Read the dataset
model_save_path = "output2/run3/training_agb_roberta-base-nli-mean-tokens-2020-04-10_15-59-19_og_3/"
batch_size = 52
agb_reader = AGBDataReader('datasets/AGB_og')
train_num_labels = agb_reader.get_num_labels()

# Load the trained RoBERTa-based SentenceTransformer and its softmax head
model = SentenceTransformer(model_save_path)
train_loss = losses.SoftmaxLoss(
    model=model,
    sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
    num_labels=train_num_labels)
train_loss.classifier = torch.load(
    os.path.join(model_save_path, "2_Softmax/pytorch_model.bin"))

print("dev")
test_data = SentencesDataset(examples=agb_reader.get_examples('dev_raw.tsv'),
                             model=model, shorten=True)
test_dataloader = DataLoader(test_data, shuffle=False, batch_size=batch_size)
evaluator = LabelAccuracyEvaluator(test_dataloader, softmax_model=train_loss)
model.evaluate(evaluator)

print("test")
test_data = SentencesDataset(examples=agb_reader.get_examples('test_raw.tsv'),
                             model=model, shorten=True)
test_dataloader = DataLoader(test_data, shuffle=False, batch_size=batch_size)
evaluator = LabelAccuracyEvaluator(test_dataloader, softmax_model=train_loss)
model.evaluate(evaluator)
#################################################################################################
#
# Step 3: Train bi-encoder model with both (gold + silver) STSbenchmark dataset - Augmented SBERT
#
#################################################################################################

logging.info(
    "Step 3: Train bi-encoder: {} with STSbenchmark (gold + silver dataset)".format(model_name))

# Convert the dataset to a DataLoader ready for training
logging.info("Read STSbenchmark gold and silver train dataset")
silver_samples = list(InputExample(texts=[data[0], data[1]], label=score)
                      for data, score in zip(silver_data, silver_scores))

train_dataset = SentencesDataset(gold_samples + silver_samples, bi_encoder)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
train_loss = losses.CosineSimilarityLoss(model=bi_encoder)

logging.info("Read STSbenchmark dev dataset")
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples,
                                                             name='sts-dev')

# Configure the training.
warmup_steps = math.ceil(len(train_dataset) * num_epochs / batch_size * 0.1)  # 10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

# Train the bi-encoder model
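# Where silver_data / silver_scores above typically come from in the Augmented
# SBERT recipe: a cross-encoder fine-tuned on the gold pairs scores unlabeled
# sentence pairs. A minimal sketch; `cross_encoder` (a trained
# sentence_transformers.cross_encoder.CrossEncoder) and the unlabeled
# `silver_data` list of (sentence1, sentence2) pairs are assumed to exist
# earlier in the script.
silver_scores = cross_encoder.predict(silver_data)
# CosineSimilarityLoss expects labels in [0, 1]
assert all(0.0 <= score <= 1.0 for score in silver_scores)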
### Configure sentence transformers for training and train on the provided dataset
# Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
word_embedding_model = models.Transformer(model_name)

# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

logging.info("Read Triplet train dataset")
train_dataset = SentencesDataset(
    examples=triplet_reader.get_examples('train.csv', max_examples=100000),
    model=model)
train_dataloader = DataLoader(train_dataset, shuffle=True,
                              batch_size=train_batch_size)
train_loss = losses.TripletLoss(model=model)

logging.info("Read Wikipedia Triplet dev dataset")
evaluator = TripletEvaluator.from_input_examples(
    triplet_reader.get_examples('validation.csv', 1000), name='dev')

warmup_steps = int(len(train_dataset) * num_epochs / train_batch_size * 0.1)  # 10% of train data

# Train the model
"/", "-") + '-' + datetime.now().strftime("%Y-%m-%d_%H-%M-%S") # Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings word_embedding_model = models.Transformer(model_name) # Apply mean pooling to get one fixed sized sentence vector pooling_model = models.Pooling( word_embedding_model.get_word_embedding_dimension(), pooling_mode_mean_tokens=True, pooling_mode_cls_token=False, pooling_mode_max_tokens=False) model = SentenceTransformer(modules=[word_embedding_model, pooling_model]) # Convert the dataset to a DataLoader ready for training train_data = SentencesDataset(ict_reader.get_examples('dev.txt', max_examples=6000), model=model) train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size) train_loss = losses.SoftmaxLoss( model=model, sentence_embedding_dimension=model.get_sentence_embedding_dimension(), num_labels=train_num_labels) # load dev evaluator with sentence similarity task sts-dev # Finetuning Sentence Transformers with inverse cloze task + nips papers (does not seem to improve sentence similarity by much): # Without finetuning (vanilla bert) : Pearson: 0.5917 Spearman: 0.5932 # With ICT finetuning after 3k steps: Pearson: 0.6574 Spearman: 0.6809 # longer training does not improve the sentence similiarity logging.info("Read STSbenchmark dev dataset") dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'), model=model)
train_dataloader = DataLoader(train_data, shuffle=True,
                              batch_size=train_batch_size)
train_loss = losses.MSELoss(model=model)

# TODO: the setup below is copied from the en-de example - how does one
# evaluate the model outside this single example?
###### Load dev sets ######
# Test on the Hindi headlines dataset using Spearman rank correlation
logging.info("Read data/hindi_sbert_sts_train.csv dataset")
evaluators = []
sts_reader = readers.STSDataReader('./data/',
                                   s1_col_idx=0,
                                   s2_col_idx=1,
                                   score_col_idx=2)
dev_data = SentencesDataset(
    examples=sts_reader.get_examples('hindi_sbert_sts_train.csv'), model=model)
dev_dataloader = DataLoader(dev_data, shuffle=False,
                            batch_size=train_batch_size)
evaluator_sts = evaluation.EmbeddingSimilarityEvaluator(
    dev_dataloader, name='Hindi_Headlines_en_hi_sbert')
evaluators.append(evaluator_sts)

model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluation.SequentialEvaluator(
              evaluators, main_score_function=lambda scores: scores[-1]),
          epochs=20,
          evaluation_steps=1000,
          warmup_steps=10000,
          scheduler='warmupconstant',
          output_path=output_path,
def main():
    parser = argparse.ArgumentParser()

    # Input and output configs
    parser.add_argument("--task", default=None, type=str, required=True,
                        help="the task to run bert ranker for")
    parser.add_argument("--data_folder", default=None, type=str, required=True,
                        help="the folder containing data")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="the folder to output predictions")

    # Training procedure
    parser.add_argument("--num_epochs", default=5, type=int, required=False,
                        help="Number of epochs for training.")
    parser.add_argument("--train_batch_size", default=8, type=int, required=False,
                        help="Training batch size.")

    # Model hyperparameters
    parser.add_argument("--transformer_model", default="bert-base-cased", type=str,
                        required=False,
                        help="Bert model to use (default = bert-base-cased).")

    args = parser.parse_args()

    word_embedding_model = models.Transformer(args.transformer_model)
    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False)
    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

    logging.info("Creating train CRR dataset.")
    crr_reader = CRRBenchmarkDataReader('{}/{}'.format(args.data_folder, args.task))

    train_data = SentencesDataset(crr_reader.get_examples("train.tsv"), model)
    train_dataloader = DataLoader(train_data, shuffle=True,
                                  batch_size=args.train_batch_size)
    train_loss = losses.CosineSimilarityLoss(model=model)

    logging.info("Creating dev CRR dataset.")
    dev_data = SentencesDataset(crr_reader.get_examples('valid.tsv'), model)
    dev_dataloader = DataLoader(dev_data, shuffle=False,
                                batch_size=args.train_batch_size)
    evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)

    warmup_steps = math.ceil(
        len(train_data) * args.num_epochs / args.train_batch_size * 0.1)  # 10% of train data for warm-up
    logging.info("Warmup-steps: {}".format(warmup_steps))

    logging.info("Fitting sentenceBERT")
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=evaluator,
              epochs=args.num_epochs,
              evaluation_steps=1000,
              warmup_steps=warmup_steps,
              output_path=args.output_dir + "{}_{}".format(args.transformer_model,
                                                           args.task))
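# Example invocation of the script above; the script name and paths are
# hypothetical, the flags come from the argparse definitions:
#   python train_sbert_crr.py \
#       --task mantis \
#       --data_folder ./data \
#       --output_dir ./output/ \
#       --num_epochs 5 \
#       --train_batch_size 8 \
#       --transformer_model bert-base-cased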
def grid_search_fine_tune_sbert(train_params, train_sents, train_labels,
                                label_names, eval_classifier=None):
    """
    Find the optimal SBERT model by doing a hyperparameter search over random
    seeds, dev percentage, and different types of SBERT models
    """
    output_path = train_params["output_path"]
    all_dev_perc = train_params["all_dev_perc"]
    model_names = train_params["model_names"]
    max_num_epochs = train_params["max_num_epochs"]
    baseline = train_params['baseline']
    patience = train_params['patience']
    seeds = train_params['seeds']

    if eval_classifier is None:
        train_params["eval_classifier"] = "SBERT"
    else:
        train_params["eval_classifier"] = eval_classifier.__class__.__name__

    print(f"Grid Search Fine tuning parameters:\n{json.dumps(train_params, indent=4)}")

    label2int = dict(zip(label_names, range(len(label_names))))
    for dev_perc in all_dev_perc:
        X_train, X_dev, y_train, y_dev = train_test_split(train_sents,
                                                          train_labels,
                                                          test_size=dev_perc,
                                                          stratify=train_labels,
                                                          random_state=100)
        # Load data samples into batches
        train_batch_size = 16
        train_samples = build_data_samples(X_train, label2int, y_train)
        dev_samples = build_data_samples(X_dev, label2int, y_dev)

        for model_name in model_names:
            # Train set config
            model = EarlyStoppingSentenceTransformer(model_name)
            train_dataset = SentencesDataset(train_samples, model=model)
            train_dataloader = DataLoader(train_dataset, shuffle=True,
                                          batch_size=train_batch_size)
            # Dev set config
            dev_dataset = SentencesDataset(dev_samples, model=model)
            dev_dataloader = DataLoader(dev_dataset, shuffle=True,
                                        batch_size=train_batch_size)

            # Define the way the loss is computed
            classifier = SoftmaxClassifier(
                model=model,
                sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
                num_labels=len(label2int))

            warmup_steps = math.ceil(
                len(train_dataset) * max_num_epochs / train_batch_size * 0.1)  # 10% of train data for warm-up

            for seed in seeds:
                set_seeds(seed)
                model_deets = (f"{train_params['eval_classifier']}_model={model_name}"
                               f"_test-perc={dev_perc}_seed={seed}")

                # Train the model
                start = time.time()
                dev_evaluator = CustomLabelAccuracyEvaluator(
                    dataloader=dev_dataloader,
                    softmax_model=classifier,
                    name='lae-dev',
                    label_names=label_names,
                    model_hyper_params={'model_name': model_name,
                                        'dev_perc': dev_perc,
                                        'seed': seed})
                model.fit(
                    train_objectives=[(train_dataloader, classifier)],
                    evaluator=dev_evaluator,
                    epochs=max_num_epochs,
                    evaluation_steps=1000,
                    warmup_steps=warmup_steps,
                    output_path=output_path,
                    model_deets=model_deets,
                    baseline=baseline,
                    patience=patience,
                )
                end = time.time()
                hours, rem = divmod(end - start, 3600)
                minutes, seconds = divmod(rem, 60)
                print("Time taken for fine-tuning:",
                      "{:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))
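# Hypothetical build_data_samples helper used above (not shown in the
# original). Assuming the SoftmaxClassifier operates on single sentences, it
# would pair each sentence with its integer label as one-text InputExamples.
def build_data_samples(sentences, label2int, labels):
    return [InputExample(texts=[sent], label=label2int[label])
            for sent, label in zip(sentences, labels)]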
examples = []
for topic in topics:
    gold = qrel[topic["number"]].items()
    query = topic["title"].strip()
    for item in gold:
        try:
            doc = db.lookup_docno(item[0])
            examples.append(InputExample(texts=[query, doc], label=item[1]))
        except Exception:  # skip judged documents that cannot be looked up
            continue
print("finished")

from torch.utils.data import DataLoader

train_dataset = SentencesDataset(examples, ranker)
train_dl = DataLoader(train_dataset, shuffle=True, batch_size=16)
ranker.fit(train_dataloader=train_dl, epochs=5, output_path="ranker/base")

from tqdm.notebook import tqdm

run = {}
for topic in tqdm(topics):
    number = topic["number"]
    query = topic["title"]
    extracted_ids = [k for k in qrel[number].keys()]
    doc_ids = []
    for id in extracted_ids:
        try:
cnn = models.CNN(
    in_word_embedding_dimension=word_embedding_model.get_word_embedding_dimension(),
    out_channels=256,
    kernel_sizes=[1, 3, 5])

# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(cnn.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)

model = LanguageTransformer(modules=[word_embedding_model, cnn, pooling_model])

# Convert the dataset to a DataLoader ready for training
logging.info("Read STSbenchmark train dataset")
train_data = SentencesDataset(sts_reader.get_examples('sts-train.csv'),
                              model=model)
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)

logging.info("Read STSbenchmark dev dataset")
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    sts_reader.get_examples('sts-dev.csv'))

# Configure the training
num_epochs = 10
warmup_steps = math.ceil(len(train_data) * num_epochs / batch_size * 0.1)  # 10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

# Train the model
model.fit(train_objectives=[(train_dataloader, train_loss)],
# Add two trainable feed-forward networks (DAN)
sent_embeddings_dimension = pooling_model.get_sentence_embedding_dimension()
dan1 = models.Dense(in_features=sent_embeddings_dimension,
                    out_features=sent_embeddings_dimension)
dan2 = models.Dense(in_features=sent_embeddings_dimension,
                    out_features=sent_embeddings_dimension)

model = SentenceTransformer(
    modules=[word_embedding_model, pooling_model, dan1, dan2])

# Convert the dataset to a DataLoader ready for training
logging.info("Read AGB train dataset")
train_data = SentencesDataset(agb_reader.get_examples('train_raw.tsv'),
                              model=model, shorten=False)
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
train_loss = losses.SoftmaxLoss(
    model=model,
    sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
    num_labels=train_num_labels)

logging.info("Read AGB dev dataset")
dev_data = SentencesDataset(examples=agb_reader.get_examples('dev_raw.tsv'),
                            model=model, shorten=False)
dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=batch_size)
progress.update(1)
progress.reset()
progress.close()
logging.info("Textual augmentation completed.")
logging.info("Number of silver pairs generated: {}".format(len(silver_samples)))

###################################################################
#
# Train SBERT model with both (gold + silver) STS benchmark dataset
#
###################################################################

logging.info("Read STSbenchmark (gold + silver) training dataset")
train_dataset = SentencesDataset(gold_samples + silver_samples, model)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)

logging.info("Read STSbenchmark dev dataset")
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples,
                                                             name='sts-dev')

# Configure the training.
warmup_steps = math.ceil(len(train_dataset) * num_epochs / batch_size * 0.1)  # 10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

# Train the SBERT model
def main(args):
    #### Just some code to print debug information to stdout
    logging.basicConfig(format='%(asctime)s - %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S',
                        level=logging.INFO,
                        handlers=[LoggingHandler()])
    #### /print debug information to stdout

    # You can specify any huggingface/transformers pre-trained model here,
    # for example, bert-base-uncased, roberta-base, xlm-roberta-base
    # model_name = sys.argv[1] if len(sys.argv) > 1 else 'bert-base-uncased'
    model_name = args.model_name
    if model_name is None:
        model_name = 'bert-base-chinese'

    # Read the dataset
    batch_size = args.batch_size
    model_output_dir = args.model_output_dir
    # model_save_path = os.path.join(model_output_dir, "bert-base", datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
    model_save_path = model_output_dir

    # Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
    if args.init_model is None:
        word_embedding_model = models.Transformer(model_name)
        # Apply mean pooling to get one fixed sized sentence vector
        pooling_model = models.Pooling(
            word_embedding_model.get_word_embedding_dimension(),
            pooling_mode_mean_tokens=True,
            pooling_mode_cls_token=False,
            pooling_mode_max_tokens=False)
        model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
    else:
        model = SentenceTransformer(args.init_model)

    if args.do_train == 1:
        # Convert the dataset to a DataLoader ready for training
        data_reader = SimTextDataReader()
        logging.info("train_data:%s" % (args.train_data))
        logging.info("cache_data:%s" % (args.cached_data))
        train_data_files = args.train_data.split('#')
        cached_data_file = args.cached_data

        logging.info("Read train dataset")
        if not os.path.isfile(cached_data_file):
            train_examples = []
            for train_file in train_data_files:
                if os.path.isfile(train_file):
                    logging.info("load train file:%s" % (train_file))
                    now_examples = data_reader.get_examples(train_file)
                    train_examples.extend(now_examples)
            train_data = SentencesDataset(train_examples, model=model)
            torch.save(train_data, args.cached_data)
        else:
            train_data = torch.load(cached_data_file)
            logging.info("Load cached dataset %s" % (cached_data_file))

        logging.info("Build train dataset")
        train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
        # train_loss = losses.SoftmaxLoss(model=model, sentence_embedding_dimension=model.get_sentence_embedding_dimension(), num_labels=train_num_labels)
        train_loss = losses.CosineSimilarityLoss(model=model)

        logging.info("Read dev dataset")
        dev_data_files = args.dev_data.split('#')
        dev_examples = []
        for dev_file in dev_data_files:
            if os.path.isfile(dev_file):
                logging.info("load dev file:%s" % (dev_file))
                now_examples = data_reader.get_examples(dev_file)
                dev_examples.extend(now_examples)
        dev_data = SentencesDataset(examples=dev_examples, model=model)

        logging.info("Build dev dataset")
        dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=batch_size)
        evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)

        # Configure the training
        num_epochs = args.num_epochs
        # len(train_dataloader) is already the number of batches, so it must not
        # be divided by the batch size again
        warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)  # 10% of train steps for warm-up
        logging.info("Warmup-steps: {}".format(warmup_steps))

        logging.info("Start training")
        # Train the model
        model.fit(train_objectives=[(train_dataloader, train_loss)],
                  evaluator=evaluator,
                  epochs=num_epochs,
                  evaluation_steps=1000,
                  warmup_steps=warmup_steps,
                  output_path=model_save_path)

    if args.do_predict == 1:
        logging.info("Read predict dataset")
        pred_data_file = args.pred_data
        output_file = os.path.join(args.model_output_dir, "pred_res")
        text_pairs = load_pred_data(pred_data_file)
        with open(output_file, "w", encoding="utf-8") as fp:
            for tpair in text_pairs:
                embedding_pair = model.encode(tpair)
                cos_sim = cosine_similarity(embedding_pair[0], embedding_pair[1])
                fp.write("%s\t%s\t%f\n" % (tpair[0], tpair[1], cos_sim))
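# Hypothetical helpers assumed by the predict branch above (not shown in the
# original): load_pred_data reads tab-separated text pairs, and
# cosine_similarity computes the similarity of two 1-D numpy embeddings
# returned by model.encode.
import numpy as np

def load_pred_data(path):
    pairs = []
    with open(path, encoding="utf-8") as fIn:
        for line in fIn:
            parts = line.rstrip('\n').split('\t')
            if len(parts) >= 2:
                pairs.append((parts[0], parts[1]))
    return pairs

def cosine_similarity(a, b):
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))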
from sentence_transformers import SentenceTransformer, LossFunction, TrainConfig, SentencesDataset, LoggingHandler, EmbeddingSimilarityEvaluator, EmbeddingSimilarity
from sentence_transformers.dataset_readers import STSDataReader
from torch.utils.data import DataLoader
import numpy as np
import logging
import sys

#### Just some code to print debug information to stdout
np.set_printoptions(threshold=100)
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

# Load Sentence model (based on BERT) from URL
model = SentenceTransformer(sys.argv[1])

sts_reader = STSDataReader('datasets/stsbenchmark')
test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"),
                             model=model)
test_dataloader = DataLoader(test_data,
                             shuffle=False,
                             batch_size=8,
                             collate_fn=model.smart_batching_collate)  # pass the bound method, do not call it
evaluator = EmbeddingSimilarityEvaluator(test_dataloader, EmbeddingSimilarity.COSINE)

model.evaluate(evaluator)
###### Read Dataset ######
train_data = ParallelSentencesDataset(student_model=model,
                                      teacher_model=teacher_model)
for train_file in train_files:
    train_data.load_data(train_file)

train_dataloader = DataLoader(train_data, shuffle=True,
                              batch_size=train_batch_size)
train_loss = losses.MSELoss(model=model)

###### Load dev sets ######
# Test on STS 2017.en-de dataset using Spearman rank correlation
logging.info("Read STS2017.en-de dataset")
evaluators = []
sts_reader = readers.STSDataReader('../datasets/',
                                   s1_col_idx=0,
                                   s2_col_idx=1,
                                   score_col_idx=2)
dev_data = SentencesDataset(examples=sts_reader.get_examples('STS2017.en-de.txt.gz'),
                            model=model)
dev_dataloader = DataLoader(dev_data, shuffle=False,
                            batch_size=train_batch_size)
evaluator_sts = evaluation.EmbeddingSimilarityEvaluator(dev_dataloader,
                                                        name='STS2017.en-de')
evaluators.append(evaluator_sts)

# Use XNLI.en-de dataset with MSE evaluation
logging.info("Read XNLI.en-de dataset")
xnli_reader = ParallelSentencesDataset(student_model=model,
                                       teacher_model=teacher_model)
xnli_reader.load_data('../datasets/xnli-en-de.txt.gz')
xnli_dataloader = DataLoader(xnli_reader, shuffle=False,
                             batch_size=train_batch_size)
xnli_mse = evaluation.MSEEvaluator(xnli_dataloader, name='xnli-en-de')
evaluators.append(xnli_mse)
from sentence_transformers import SentencesDataset, SentenceTransformer, losses
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import InputExample
from torch.utils.data import DataLoader
from transformers.data.processors.glue import MrpcProcessor

processor = MrpcProcessor()
model = SentenceTransformer('bert-base-nli-mean-tokens')

train_examples = processor.get_train_examples(
    "/data/misc/cc/1_extraction/1.1_squad_classification/glue_data/MRPC")
train_examples = [InputExample(ie.guid, [ie.text_a, ie.text_b], float(ie.label))
                  for ie in train_examples]
train_data = SentencesDataset(train_examples, model)
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=2)
train_loss = losses.CosineSimilarityLoss(model=model)

dev_examples = processor.get_dev_examples(
    "/data/misc/cc/1_extraction/1.1_squad_classification/glue_data/MRPC")
dev_examples = [InputExample(ie.guid, [ie.text_a, ie.text_b], float(ie.label))
                for ie in dev_examples]
dev_data = SentencesDataset(dev_examples, model)
# evaluate on the dev set (not the training data), without shuffling
dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=2)
evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)

model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=5,
          evaluation_steps=1000,
          warmup_steps=500,
          output_path="out")
print("done")
### Configure sentence transformers for training and train on the provided dataset
# Use BERT for mapping tokens to embeddings
word_embedding_model = models.Transformers(
    model_name_or_path='bert-base-uncased', model_type='bert')

# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

logging.info("Read Triplet train dataset")
train_data = SentencesDataset(
    examples=triplet_reader.get_examples('train.csv'), model=model)
train_dataloader = DataLoader(train_data, shuffle=True,
                              batch_size=train_batch_size)
train_loss = losses.TripletLoss(model=model)

logging.info("Read Wikipedia Triplet dev dataset")
dev_data = SentencesDataset(
    examples=triplet_reader.get_examples('validation.csv', 1000), model=model)
dev_dataloader = DataLoader(dev_data, shuffle=False,
                            batch_size=train_batch_size)
evaluator = TripletEvaluator(dev_dataloader)

warmup_steps = int(len(train_data) * num_epochs / train_batch_size *
                   0.1)  # 10% of train data for warm-up
def BertEM(path_train, path_valid, path_test, path_error, epochs_num,
           warmup_steps_num, evaluation_steps_num):
    # Instantiate the progress bar
    bar = progressbar
    # Load the datasets
    data_type = {"text_a": str, "text_b": str}
    train_data = pd.read_csv(path_train, encoding='utf-8', dtype=data_type)
    valid_data = pd.read_csv(path_valid, encoding='utf-8', dtype=data_type)
    test_data = pd.read_csv(path_test, encoding='utf-8', dtype=data_type)

    # Training set
    train_examples = []
    for i in bar.progressbar(range(len(train_data))):
        time.sleep(0.0001)
        text_a = str(train_data.iloc[i]['text_a'])
        text_b = str(train_data.iloc[i]['text_b'])
        label_data = float(train_data.iloc[i]['label'])
        train_examples.append(InputExample(texts=[text_a, text_b], label=label_data))

    # Validation set
    sentence_a = []
    sentence_b = []
    label_valid = []
    for i in bar.progressbar(range(len(valid_data))):
        time.sleep(0.0001)
        sentence_a.append(valid_data.iloc[i]['text_a'])
        sentence_b.append(valid_data.iloc[i]['text_b'])
        label_valid.append(float(valid_data.iloc[i]['label']))

    # Define the evaluator
    # evaluator = evaluation.EmbeddingSimilarityEvaluator(sentence_a, sentence_b, label_valid)
    evaluator = evaluation.BinaryClassificationEvaluator(sentence_a, sentence_b, label_valid)

    # Define the dataset and loss function
    train_dataset = SentencesDataset(train_examples, model)
    train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=64)
    train_loss = losses.CosineSimilarityLoss(model)

    # Time the training (time.clock() was removed in Python 3.8)
    start_time = time.perf_counter()
    # Train the model
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              epochs=epochs_num,
              warmup_steps=warmup_steps_num,
              evaluator=evaluator,
              evaluation_steps=evaluation_steps_num,
              use_amp=True)
    end_time = time.perf_counter()

    # ================================ Evaluation ================================
    # Re-read the test set and cast all attributes to str
    test_data = pd.read_csv(path_test, encoding='utf-8')
    test_data['text_a'] = test_data['text_a'].map(lambda x: str(x))
    test_data['text_b'] = test_data['text_b'].map(lambda x: str(x))

    # Create a dict of prediction lists, one per threshold
    list_num = 40
    prefix = 'pred_list_'
    test_map = {prefix + str(i): [] for i in range(list_num)}
    label_list = []
    score = 0.20
    error_csv = pd.DataFrame(columns=('id', 'text_a', 'text_b', 'cos_scores'))

    # Run over the test set
    for i in bar.progressbar(range(len(test_data))):
        time.sleep(0.0001)
        text_a_embedding = model.encode(test_data.iloc[i]['text_a'], convert_to_tensor=True)
        text_b_embedding = model.encode(test_data.iloc[i]['text_b'], convert_to_tensor=True)
        cos_scores = util.pytorch_cos_sim(text_a_embedding, text_b_embedding)[0].cpu()

        # Collect the gold label
        label = test_data.iloc[i]['label']
        label_list.append(int(label))

        # Record misclassified pairs (fixed 0.70 threshold)
        pred_test = 1 if cos_scores >= 0.70 else 0
        if pred_test != label:
            error_csv = pd.concat(
                [error_csv,
                 pd.DataFrame({'id': [i],
                               'text_a': [test_data.iloc[i]['text_a']],
                               'text_b': [test_data.iloc[i]['text_b']],
                               'cos_scores': [cos_scores]})],
                ignore_index=True)  # DataFrame.append was removed in pandas 2.0

        # Generate the per-threshold prediction lists
        compute_pred(score, cos_scores, prefix, test_map)

    error_csv.to_csv(path_error, index=False)

    max_f1 = 0
    target_threshold = 0.01
    target_precision = 0.01
    target_recall = 0.01
    threshold = 0.20
    # Print the scores for each threshold
    for i in range(len(test_map.keys())):
        precision, recall, f1 = compute_score(label_list, test_map[prefix + str(i)])
        if f1 >= max_f1:
            max_f1 = f1
            target_threshold = threshold
            target_precision = precision
            target_recall = recall
        print('The score > {} result is precision: {}, | recall: {}, | f1: {}'.format(
            round(threshold, 2), precision, recall, f1))
        threshold += 0.02

    # Print the final results
    print('================dataset_name==================', path_test)
    print('================threshold:{}, target_precision:{}, target_recall:{}, max_f1:{}'.format(
        target_threshold, target_precision, target_recall, max_f1))
    print('================train_time:{}'.format(str(end_time - start_time)))
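# Hypothetical versions of the compute_pred / compute_score helpers used above
# (not shown in the original): compute_pred appends a 0/1 prediction to each
# per-threshold list, stepping the threshold by 0.02 as the print loop does,
# and compute_score returns precision / recall / F1.
def compute_pred(start_score, cos_score, prefix, test_map, step=0.02):
    threshold = start_score
    for key in sorted(test_map.keys(), key=lambda k: int(k[len(prefix):])):
        test_map[key].append(1 if cos_score >= threshold else 0)
        threshold += step

def compute_score(labels, preds):
    tp = sum(1 for l, p in zip(labels, preds) if l == 1 and p == 1)
    fp = sum(1 for l, p in zip(labels, preds) if l == 0 and p == 1)
    fn = sum(1 for l, p in zip(labels, preds) if l == 1 and p == 0)
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return precision, recall, f1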
# Use BERT for mapping tokens to embeddings
word_embedding_model = models.Transformer(model_name)

# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Convert the dataset to a DataLoader ready for training
logging.info("Read AllNLI train dataset")
train_data_nli = SentencesDataset(nli_reader.get_examples('train.gz'), model=model)
train_dataloader_nli = DataLoader(train_data_nli, shuffle=True, batch_size=batch_size)
train_loss_nli = losses.SoftmaxLoss(
    model=model,
    sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
    num_labels=train_num_labels)

logging.info("Read STSbenchmark train dataset")
train_data_sts = SentencesDataset(sts_reader.get_examples('sts-train.csv'), model=model)
train_dataloader_sts = DataLoader(train_data_sts, shuffle=True, batch_size=batch_size)
train_loss_sts = losses.CosineSimilarityLoss(model=model)

logging.info("Read STSbenchmark dev dataset")
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    sts_reader.get_examples('sts-dev.csv'), name='sts-dev')

# Configure the training
num_epochs = 4
def main_worker(gpu, ngpus_per_node, args):
    args.gpu = gpu

    # suppress printing if not master
    if args.multiprocessing_distributed and args.gpu != 0:
        def print_pass(*args):
            pass
        builtins.print = print_pass

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)

    # create model
    print("=> creating model '{}'".format(args.arch))
    model = moco.builder.SB_MoCo(args.moco_dim, args.moco_k, args.moco_m,
                                 args.moco_t, args.mlp)

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
        # comment out the following line for debugging
        # raise NotImplementedError("Only DistributedDataParallel is supported.")
    else:
        pass
        # AllGather implementation (batch shuffle, queue update, etc.) in
        # this code only supports DistributedDataParallel.
        # raise NotImplementedError("Only DistributedDataParallel is supported.")

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)
    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(args.resume,
                                                                checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    SB = SentenceTransformer('bert-base-nli-mean-tokens')
    dataReader = DataReader.DataReader('csv')
    train_dataset = SentencesDataset(dataReader.get_examples('positive_pair.csv'),
                                     model=SB)

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None

    fn = SB.smart_batching_collate  # pass the bound method as collate_fn, do not call it
    train_loader = torch.utils.data.DataLoader(
        train_dataset,               # The training samples.
        sampler=train_sampler,       # Select batches randomly
        batch_size=args.batch_size,  # Trains with this batch size.
        num_workers=args.workers,
        drop_last=True,
        collate_fn=fn)

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, args)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args)

        if not args.multiprocessing_distributed or (args.multiprocessing_distributed
                                                    and args.rank % ngpus_per_node == 0):
            save_checkpoint({
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
            }, is_best=False,
               filename='./checkpoints/moco_{:03d}.pth.tar'.format(epoch))
# Use BERT for mapping tokens to embeddings
word_embedding_model = models.BERT('bert-base-uncased')

# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Convert the dataset to a DataLoader ready for training
logging.info("Read ParaBank train dataset")
train_data = SentencesDataset(nli_reader.get_examples('train.tsv'), model=model)
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
train_loss = losses.SoftmaxLoss(
    model=model,
    sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
    num_labels=train_num_labels)

logging.info("Read ParaBank dev dataset")
dev_data = SentencesDataset(examples=sts_reader.get_examples('dev.tsv'),
                            model=model)
dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=batch_size)
evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)

# Configure the training
num_epochs = 1
def _dataloader_from_examples(examples, model, batch_size=8, shuffle=False):
    data = SentencesDataset(examples, model, show_progress_bar=True)
    return DataLoader(data, shuffle=shuffle, batch_size=batch_size)
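# Example usage of the helper above; `model`, `train_examples`, and
# `dev_examples` are assumed to be defined elsewhere in the script.
train_dataloader = _dataloader_from_examples(train_examples, model,
                                             batch_size=16, shuffle=True)
dev_dataloader = _dataloader_from_examples(dev_examples, model)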
#
# Step 3: Train bi-encoder (SBERT) model with QQP dataset - Augmented SBERT
#
###########################################################################

logging.info(
    "Step 3: Train bi-encoder: {} over labeled QQP (target dataset)".format(model_name))

# Convert the dataset to a DataLoader ready for training
logging.info("Loading BERT labeled QQP dataset")
qqp_train_data = list(
    InputExample(texts=[data[0], data[1]], label=score)
    for (data, score) in zip(silver_data, binary_silver_scores))

train_dataset = SentencesDataset(qqp_train_data, bi_encoder)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
train_loss = losses.MultipleNegativesRankingLoss(bi_encoder)

###### Classification ######
# Given (question1, question2), is this a duplicate or not?
# The evaluator will compute the embeddings for both questions and then compute
# a cosine similarity. If the similarity is above a threshold, we have a duplicate.
logging.info("Read QQP dev dataset")
dev_sentences1 = []
dev_sentences2 = []
dev_labels = []
# Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
transformers_cache_dir = os.path.join(os.path.dirname(__file__), '..', '..',
                                      'cache', 'transformers')
word_embedding_model = models.Transformer(model_name,
                                          cache_dir=transformers_cache_dir)

# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=args.pooling == 'mean',
                               pooling_mode_cls_token=args.pooling == 'cls',
                               pooling_mode_max_tokens=args.pooling == 'max')

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Convert the dataset to a DataLoader ready for training
logging.info("Read CMNLI train dataset")
train_data = SentencesDataset(cmnli_reader.get_examples('train'), model=model)
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
train_loss = losses.SoftmaxLoss(
    model=model,
    sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
    num_labels=train_num_labels)

logging.info("Read CSTSbenchmark dev dataset")
dev_data = SentencesDataset(examples=csts_reader.get_examples('cnsd-sts-dev.txt'),
                            model=model)
dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=batch_size)
evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)

# len(train_dataloader) is already the number of batches, so it must not be
# divided by the batch size again
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)  # 10% of train steps for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

# Train the model
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
postprocessor = PolishSubstituer(
    '/home/xstefan3/arqmath/ARQMath_CLEF2020/Collection/formula_prefix.V0.2.tsv')
postproc_questions = list(
    postprocessor.process_questions(dr.post_parser.map_questions))
all_examples = list(examples_from_questions_tup(postproc_questions))
# all_examples = list(examples_from_questions_tup(dr.post_parser.map_questions))

examples_len = len(all_examples)
train_dev_test_split = (int(0.8 * examples_len), int(0.9 * examples_len))

# single-time preprocessing support
train_data = SentencesDataset(all_examples[:train_dev_test_split[0]], model,
                              show_progress_bar=True)
pickle.dump(train_data, open("train_data.pkl", "wb"))
# train_data = pickle.load(open("train_data.pkl", "rb"))

train_loader = DataLoader(train_data, batch_size=16, shuffle=True)

dev_data = SentencesDataset(
    all_examples[train_dev_test_split[0]:train_dev_test_split[1]], model,
    show_progress_bar=True)
dev_sampler = RandomSampler(dev_data, replacement=True, num_samples=2000)
dev_loader = DataLoader(dev_data, batch_size=16, sampler=dev_sampler)

train_loss = losses.CosineSimilarityLoss(model=model)
evaluator = EmbeddingSimilarityEvaluator(dev_loader,
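# The test slice implied by train_dev_test_split is not used in the snippet
# above; a minimal sketch of how it could be evaluated, mirroring the dev
# setup (the name 'test' and the unshuffled loader are assumptions).
test_data = SentencesDataset(all_examples[train_dev_test_split[1]:], model,
                             show_progress_bar=True)
test_loader = DataLoader(test_data, batch_size=16, shuffle=False)
test_evaluator = EmbeddingSimilarityEvaluator(test_loader, name='test')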