コード例 #1
0
def main(model_path, model_type, extra_dataset):
    # Read the dataset
    train_batch_size = 64
    num_epochs = 20
    model_save_path = model_path + '_continue_training_' + datetime.now(
    ).strftime("%Y_%m_%d_%H_%M_%S")
    n2c2_reader = TripletReader(extra_dataset)

    if model_type.lower() in ["bert"]:
        word_embedding_model = models.BERT(model_path)

        # Apply mean pooling to get one fixed sized sentence vector
        pooling_model = models.Pooling(
            word_embedding_model.get_word_embedding_dimension(),
            pooling_mode_mean_tokens=True,
            pooling_mode_cls_token=False,
            pooling_mode_max_tokens=False)

        embedder = SentenceTransformer(
            modules=[word_embedding_model, pooling_model])
        #### load sentence BERT models and generate sentence embeddings ####
    else:
        #### load sentence BERT models and generate sentence embeddings ####
        embedder = SentenceTransformer(model_path)

    # Load a pre-trained sentence transformer model
    model = SentenceTransformer(model_path)

    # Convert the dataset to a DataLoader ready for training
    logging.info("Read extra training dataset")
    train_data = SentencesDataset(n2c2_reader.get_examples('train.tsv'), model)
    train_dataloader = DataLoader(train_data,
                                  shuffle=True,
                                  batch_size=train_batch_size)
    train_loss = losses.TripletLoss(model=model, triplet_margin=triplet_margin)

    logging.info("Read development dataset")
    dev_data = SentencesDataset(examples=n2c2_reader.get_examples('dev.tsv'),
                                model=model)
    dev_dataloader = DataLoader(dev_data,
                                shuffle=False,
                                batch_size=train_batch_size)
    evaluator = TripletEvaluator(dev_dataloader)

    # Configure the training. We skip evaluation in this example
    warmup_steps = math.ceil(
        len(train_data) * num_epochs / train_batch_size *
        0.1)  #10% of train data for warm-up
    logging.info("Warmup-steps: {}".format(warmup_steps))

    # Train the model
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=evaluator,
              epochs=num_epochs,
              evaluation_steps=math.ceil(len(train_data) / train_batch_size),
              warmup_steps=warmup_steps,
              output_path=model_save_path)
コード例 #2
0
    def train_contrastive_model(self,
                                slang_ind,
                                params=None,
                                fold_name='default'):

        if params is None:
            params = {
                'train_batch_size': 16,
                'num_epochs': 4,
                'triplet_margin': 1,
                'outpath': 'SBERT_contrastive'
            }

        self.prep_contrastive_training(slang_ind, fold_name=fold_name)

        out_dir = self.out_dir + '/' + fold_name + '/SBERT_data/'

        triplet_reader = TripletReader(out_dir,
                                       s1_col_idx=0,
                                       s2_col_idx=1,
                                       s3_col_idx=2,
                                       delimiter=',',
                                       has_header=True)
        output_path = out_dir + params['outpath']

        sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')

        train_data = SentencesDataset(
            examples=triplet_reader.get_examples('contrastive_train.csv'),
            model=sbert_model)
        train_dataloader = DataLoader(train_data,
                                      shuffle=True,
                                      batch_size=params['train_batch_size'])
        train_loss = losses.TripletLoss(
            model=sbert_model, triplet_margin=params['triplet_margin'])

        dev_data = SentencesDataset(
            examples=triplet_reader.get_examples('contrastive_dev.csv'),
            model=sbert_model)
        dev_dataloader = DataLoader(dev_data,
                                    shuffle=False,
                                    batch_size=params['train_batch_size'])
        evaluator = TripletEvaluator(dev_dataloader)

        warmup_steps = int(
            len(train_data) * params['num_epochs'] /
            params['train_batch_size'] * 0.1)  #10% of train data

        # Train the model
        sbert_model.fit(train_objectives=[(train_dataloader, train_loss)],
                        evaluator=evaluator,
                        epochs=params['num_epochs'],
                        evaluation_steps=len(dev_data),
                        warmup_steps=warmup_steps,
                        output_path=output_path)
コード例 #3
0
def run_triplets_model(train_triplets, val_cluster_data, test_cluster_data, output_path, train_batch_size, eval_steps, num_epochs, warmup_frac,
                       use_model_device, model_name='distilbert-base-uncased', out_features=256):
    task = Task.init(project_name='BB Clustering', task_name='bbclustering_triplets')
    if torch.cuda.is_available():
        device = torch.device('cuda')
        print('CUDA is available and using device: ' + str(device))
    else:
        device = torch.device('cpu')
        print('CUDA not available, using device: ' + str(device))
    ### Configure sentence transformers for training and train on the provided dataset
    # Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
    word_embedding_model = models.Transformer(model_name)

    # Apply mean pooling to get one fixed sized sentence vector
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                   pooling_mode_mean_tokens=True,
                                   pooling_mode_cls_token=False,
                                   pooling_mode_max_tokens=False)

    doc_dense_model = models.Dense(in_features=pooling_model.get_sentence_embedding_dimension(), out_features=out_features,
                                   activation_function=nn.Tanh())

    model = CustomSentenceTransformer(modules=[word_embedding_model, pooling_model, doc_dense_model])

    train_dataloader = DataLoader(train_triplets, shuffle=True, batch_size=train_batch_size)
    train_loss = losses.TripletLoss(model=model)

    evaluator = ClusterEvaluator.from_input_examples(val_cluster_data, use_model_device)
    test_evaluator = ClusterEvaluator.from_input_examples(test_cluster_data, use_model_device)

    warmup_steps = int(len(train_dataloader) * num_epochs * warmup_frac)  # 10% of train data

    print("Raw BERT embedding performance")
    model.to(device)
    evaluator(model, output_path)

    # Train the model
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=evaluator,
              test_evaluator=test_evaluator,
              epochs=num_epochs,
              evaluation_steps=eval_steps,
              warmup_steps=warmup_steps,
              output_path=output_path)
コード例 #4
0
torch.cuda.empty_cache()

my_model_path = '/run/media/root/Windows/Users/agnes/Downloads/data/msmarco/train_results/test_wiki'

model_wiki = SentenceTransformer('bert-base-wikipedia-sections-mean-tokens')

dev_dataloader = torch.load(os.path.join(my_model_path, 'dev_dataloader.pth'))
train_dataloader = torch.load(
    os.path.join(my_model_path, 'train_dataloader.pth'))

evaluator = TripletEvaluator(dev_dataloader)

optimizer_class = transformers.AdamW
optimizer_params = {'lr': 2e-4, 'eps': 1e-6, 'correct_bias': False}
train_loss = losses.TripletLoss(model=model_wiki)

num_epochs = 4
warmup_steps = math.ceil(
    len(train_dataloader.dataset) * num_epochs / train_dataloader.batch_size *
    0.05)  #5% of train data for warm-up

model_wiki.fit(train_objectives=[(train_dataloader, train_loss)],
               evaluator=evaluator,
               epochs=num_epochs,
               steps_per_epoch=8000,
               warmup_steps=warmup_steps,
               optimizer_class=optimizer_class,
               optimizer_params=optimizer_params,
               output_path=os.path.join(
                   my_model_path,
コード例 #5
0
with open(my_loc + '/proc_data/val-triples.csv', 'w', encoding='utf-8') as f:
    for trs in val_trips:
        f.write('%s\t%s\t%s\n' % (trs[0], trs[1], trs[2]))

num_epochs = 2
batch_size = 8
model_save_path = my_loc + '/models/finetune_%s_%s' % (fname, emb_type)

triplet_reader = TripletReader(dataset_folder=my_loc + '/proc_data/')

model = SentenceTransformer(emb_type)
train_data = SentencesDataset(triplet_reader.get_examples('train-triples.csv'),
                              model)
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
train_loss = losses.TripletLoss(model=model, triplet_margin=1)

dev_data = SentencesDataset(
    examples=triplet_reader.get_examples('val-triples.csv'), model=model)
dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=batch_size)
evaluator = TripletEvaluator(dev_dataloader)

warmup_steps = math.ceil(len(train_data) * num_epochs / batch_size *
                         0.1)  #10% of train data for warm-up

model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=500,
          warmup_steps=warmup_steps,
          output_path=model_save_path)
コード例 #6
0
def train(triplet_data_dir, output):
    logging.basicConfig(format='%(asctime)s - %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S',
                        level=logging.INFO,
                        handlers=[LoggingHandler()])

    ### Create a torch.DataLoader that passes training batch instances to our model
    train_batch_size = 16
    triplet_reader = TripletReader(triplet_data_dir,
                                   s1_col_idx=1,
                                   s2_col_idx=2,
                                   s3_col_idx=3,
                                   delimiter=',',
                                   quoting=csv.QUOTE_MINIMAL,
                                   has_header=True)
    # output_path = "output/bert-base-wikipedia-sections-mean-tokens-"+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    output_path = output + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    num_epochs = 1

    ### Configure sentence transformers for training and train on the provided dataset
    # Use BERT for mapping tokens to embeddings
    word_embedding_model = models.BERT('bert-base-uncased')

    # Apply mean pooling to get one fixed sized sentence vector
    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False)

    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

    logging.info("Read Triplet train dataset")
    train_data = SentencesDataset(examples=triplet_reader.get_examples(
        'train.csv', 2000000),
                                  model=model)
    train_dataloader = DataLoader(train_data,
                                  shuffle=True,
                                  batch_size=train_batch_size)
    train_loss = losses.TripletLoss(model=model)

    logging.info("Read Wikipedia Triplet dev dataset")
    dev_data = SentencesDataset(examples=triplet_reader.get_examples(
        'validation.csv', 10000),
                                model=model)
    dev_dataloader = DataLoader(dev_data,
                                shuffle=False,
                                batch_size=train_batch_size)
    evaluator = TripletEvaluator(dev_dataloader)

    warmup_steps = int(len(train_data) * num_epochs / train_batch_size *
                       0.1)  #10% of train data

    # Train the model
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=evaluator,
              epochs=num_epochs,
              evaluation_steps=1000,
              warmup_steps=warmup_steps,
              output_path=output_path)

    ##############################################################################
    #
    # Load the stored model and evaluate its performance on STS benchmark dataset
    #
    ##############################################################################

    model = SentenceTransformer(output_path)
    test_data = SentencesDataset(
        examples=triplet_reader.get_examples('test.csv'), model=model)
    test_dataloader = DataLoader(test_data,
                                 shuffle=False,
                                 batch_size=train_batch_size)
    evaluator = TripletEvaluator(test_dataloader)

    model.evaluate(evaluator)
コード例 #7
0
def main():
    parser = argparse.ArgumentParser()

    # Input and output configs
    parser.add_argument("--task", default=None, type=str, required=True,
                        help="the task to run bert ranker for")
    parser.add_argument("--data_folder", default=None, type=str, required=True,
                        help="the folder containing data")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="the folder to output predictions")
    parser.add_argument("--negative_sampler", default="random", type=str, required=False,
                        help="negative sampling procedure to use ['random', 'bm25', 'sentence_transformer']")
    parser.add_argument("--anserini_folder", default="", type=str, required=True,
                        help="Path containing the anserini bin <anserini_folder>/target/appassembler/bin/IndexCollection")
    parser.add_argument("--sentence_bert_ns_model", default="all-MiniLM-L6-v2", type=str, required=False,
                        help="model to use for sentenceBERT negative sampling.")

    parser.add_argument('--denoise_negatives', dest='denoise_negatives', action='store_true')
    parser.add_argument('--no-denoise_negatives', dest='denoise_negatives', action='store_false')
    parser.set_defaults(denoise_negatives=False)
    parser.add_argument("--num_ns_for_denoising", default=100, type=int, required=False,
                        help="Only used for --denoise_negatives. Number of total of samples to retrieve and get the bottom 10.")

    parser.add_argument("--generative_sampling_model", default="all-MiniLM-L6-v2", type=str, required=False,
                        help="model to use for generating negative samples on the go.")

    parser.add_argument('--remove_cand_subsets', dest='remove_cand_subsets', action='store_true')
    parser.add_argument('--dont_remove_cand_subsets', dest='remove_cand_subsets', action='store_false')
    parser.set_defaults(remove_cand_subsets=True)

    #which part of the context we use to sample negatives.
    parser.add_argument('--last_utterance_only', dest='last_utterance_only', action='store_true')
    parser.add_argument('--all_utterances', dest='last_utterance_only', action='store_false')
    parser.set_defaults(last_utterance_only=False)

    # External corpus to augment negative sampling
    parser.add_argument('--external_corpus', dest='use_external_corpus', action='store_true')
    parser.add_argument('--dont_use_external_corpus', dest='use_external_corpus', action='store_false')
    parser.set_defaults(use_external_corpus=False)

    # #Training procedure
    parser.add_argument("--num_epochs", default=3, type=int, required=False,
                        help="Number of epochs for training.")
    parser.add_argument("--train_batch_size", default=8, type=int, required=False,
                        help="Training batch size.")
    # #Model hyperparameters
    parser.add_argument("--transformer_model", default="bert-base-cased", type=str, required=False,
                        help="Bert model to use (default = bert-base-cased).")
    parser.add_argument("--loss", default='MultipleNegativesRankingLoss', type=str, required=False,
                        help="Loss function to use ['MultipleNegativesRankingLoss', 'TripletLoss', 'MarginMSELoss']")

    ## Wandb project name 
    parser.add_argument("--wandb_project", default='train_sentence_transformer', type=str, required=False,
                        help="name of the wandb project")
    parser.add_argument("--seed", default=42, type=int, required=False,
                        help="Random seed.")

    args = parser.parse_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    max_seq_length = 300
    if args.transformer_model == 'all-mpnet-base-v2' or args.transformer_model == 'msmarco-bert-base-dot-v5':
        model = SentenceTransformer(args.transformer_model)
        model.max_seq_length = max_seq_length
    else:
        word_embedding_model = models.Transformer(args.transformer_model, max_seq_length=max_seq_length)
        tokens = ['[UTTERANCE_SEP]', '[TURN_SEP]', '[AUG]']
        word_embedding_model.tokenizer.add_tokens(tokens, special_tokens=True)
        word_embedding_model.auto_model.resize_token_embeddings(len(word_embedding_model.tokenizer))
        pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                pooling_mode_mean_tokens=True,
                                pooling_mode_cls_token=False,
                                pooling_mode_max_tokens=False)
        model = SentenceTransformer(modules=[word_embedding_model, pooling_model])


    eval_only = False
    if eval_only:
        logging.info("Skipping training (eval_only=True)")
    
    else:
        logging.info("Creating train CRR dataset for {} using {}.".format(args.task, args.negative_sampler))
        crr_reader = CRRBenchmarkDataReader('{}/{}'.format(args.data_folder, args.task))
        train_data = crr_reader.get_examples("train.tsv", args.negative_sampler,
                                    args.anserini_folder, args.sentence_bert_ns_model, args.loss, args.output_dir,
                                    True, False,
                                    args.denoise_negatives, args.num_ns_for_denoising,
                                    args.generative_sampling_model,
                                    args.remove_cand_subsets,
                                    args.last_utterance_only,
                                    args.use_external_corpus)
        train_dataloader = DataLoader(train_data, shuffle=True, batch_size=args.train_batch_size)
    
    if args.loss == 'MultipleNegativesRankingLoss':
        train_loss = losses.MultipleNegativesRankingLoss(model=model, similarity_fct=util.dot_score)
    elif args.loss == 'MarginMSELoss':
        train_loss = losses.MarginMSELoss(model=model)
    elif args.loss == 'TripletLoss':
        train_loss = losses.TripletLoss(model=model)
    elif args.loss == 'ContrastiveLoss':
        train_loss = losses.ContrastiveLoss(model=model)
    elif args.loss == 'OnlineContrastiveLoss':
        train_loss = losses.OnlineContrastiveLoss(model=model)


    ns_description = args.negative_sampler
    if args.negative_sampler == 'sentence_transformer':
        ns_description+="_{}".format(args.sentence_bert_ns_model)

    if args.negative_sampler == 'generative':
        ns_description+="_{}".format(args.generative_sampling_model)

    wandb.init(project=args.wandb_project)
    wandb.config.update(args)

    if not eval_only: # this is the eval data for the training, not the actual evaluation
        logging.info("Getting eval data")
        examples_dev = crr_reader.get_examples('valid.tsv', 
            args.negative_sampler, args.anserini_folder, args.sentence_bert_ns_model, args.loss, args.output_dir, eval_data=True)
        examples_dev = examples_dev[0:(11*500)]
        eval_samples = []
        docs = []
        for i, example in enumerate(examples_dev):
            if (i+1)%11==0:
                eval_samples.append({'query': example.texts[0], 
                                    'positive': [example.texts[1]],
                                    'negative': docs
                })
                docs=[]
            else:
                docs.append(example.texts[2])
        evaluator = RerankingEvaluator(eval_samples, write_csv=True, similarity_fct=util.dot_score)
        warmup_steps = math.ceil(len(train_data)*args.num_epochs/args.train_batch_size*0.1) #10% of train data for warm-up
        logging.info("Warmup-steps: {}".format(warmup_steps))

        logging.info("Fitting sentenceBERT for {}".format(args.task))

        model.fit(train_objectives=[(train_dataloader, train_loss)],
            evaluator=evaluator,
            epochs=args.num_epochs,
            evaluation_steps=100,          
            steps_per_epoch=10000,        
            warmup_steps=warmup_steps,
            output_path=args.output_dir+"{}_{}_ns_{}_loss_{}".format(args.transformer_model, args.task, ns_description, args.loss))

    logging.info("Evaluating for full retrieval of responses to dialogue.")

    train = pd.read_csv(args.data_folder+args.task+"/train.tsv", sep="\t")
    test = pd.read_csv(args.data_folder+args.task+"/test.tsv", sep="\t")

    ns_test_sentenceBERT = negative_sampling.SentenceBERTNegativeSampler(list(train["response"].values)+list(test["response"].values), 10, 
                   args.data_folder+args.task+"/test_sentenceBERTembeds", -1, 
                   args.output_dir+"{}_{}_ns_{}_loss_{}".format(args.transformer_model, args.task, ns_description, args.loss),
                   use_cache_for_embeddings=False)
    
    ns_info = [
        (ns_test_sentenceBERT, 
        ["cand_sentenceBERT_{}".format(i) for i in range(10)] + ["sentenceBERT_retrieved_relevant", "sentenceBERT_rank"], 
        'sentenceBERT')
    ]
    examples = []
    examples_cols = ["context", "relevant_response"] + \
        reduce(lambda x,y:x+y, [t[1] for t in ns_info])
    logging.info("Retrieving candidates using different negative sampling strategies for {}.".format(args.task))
    recall_df = []
    for idx, row in enumerate(tqdm(test.itertuples(index=False), total=len(test))):
        context = row[0]
        relevant_response = row[1]
        instance = [context, relevant_response]

        for ns, _ , ns_name in ns_info:
            ns_candidates, scores, had_relevant, rank_relevant, _ = ns.sample(context, [relevant_response])
            for ns in ns_candidates:
                instance.append(ns)
            instance.append(had_relevant)
            instance.append(rank_relevant)
            if had_relevant:
                r10 = 1
            else:
                r10 = 0
            if rank_relevant == 0:
                r1 = 1
            else:
                r1 =0
            recall_df.append([r10, r1])
        examples.append(instance)

    recall_df  = pd.DataFrame(recall_df, columns = ["R@10", "R@1"])
    examples_df = pd.DataFrame(examples, columns=examples_cols)
    logging.info("R@10: {}".format(examples_df[[c for c in examples_df.columns if 'retrieved_relevant' in c]].sum()/examples_df.shape[0]))
    wandb.log({'R@10': (examples_df[[c for c in examples_df.columns if 'retrieved_relevant' in c]].sum()/examples_df.shape[0]).values[0]})
    rank_col = [c for c in examples_df.columns if 'rank' in c][0]
    logging.info("R@1: {}".format(examples_df[examples_df[rank_col]==0].shape[0]/examples_df.shape[0]))
    wandb.log({'R@1': examples_df[examples_df[rank_col]==0].shape[0]/examples_df.shape[0]})
    recall_df.to_csv(args.output_dir+"/recall_df_{}_{}_ns_{}_loss_{}.csv".format(args.transformer_model.replace("/", "-"), args.task, ns_description.replace("/", "-"), args.loss), index=False, sep="\t")