Example #1
def run_fixed_lambda_bbcluster(train_cluster_data, val_cluster_data, test_cluster_data, output_path, train_batch_size, eval_steps,
                               num_epochs, warmup_frac, lambda_val, reg, beta, loss_name, use_model_device, model_name='distilbert-base-uncased', out_features=256):
    task = Task.init(project_name='BB Clustering', task_name='bbclustering_fixed_lambda')
    config_dict = {'lambda_val': lambda_val, 'reg': reg}
    config_dict = task.connect(config_dict)
    if torch.cuda.is_available():
        device = torch.device('cuda')
        print('CUDA is available and using device: '+str(device))
    else:
        device = torch.device('cpu')
        print('CUDA not available, using device: '+str(device))
    ### Configure sentence transformers for training and train on the provided dataset
    # Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
    word_embedding_model = models.Transformer(model_name)

    # Apply mean pooling to get one fixed sized sentence vector
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                   pooling_mode_mean_tokens=True,
                                   pooling_mode_cls_token=False,
                                   pooling_mode_max_tokens=False)

    doc_dense_model = models.Dense(in_features=pooling_model.get_sentence_embedding_dimension(), out_features=out_features,
                                   activation_function=nn.Tanh())

    model = CustomSentenceTransformer(modules=[word_embedding_model, pooling_model, doc_dense_model])
    # model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
    GPUtil.showUtilization()
    if loss_name == 'bbspec':
        loss_model = BBSpectralClusterLossModel(model=model, device=device,
                                                lambda_val=config_dict.get('lambda_val', lambda_val),
                                                reg_const=config_dict.get('reg', reg), beta=beta)
    else:
        loss_model = BBClusterLossModel(model=model, device=device,
                                        lambda_val=config_dict.get('lambda_val', lambda_val),
                                        reg_const=config_dict.get('reg', reg))
    # reg_loss_model = ClusterDistLossModel(model=model)

    train_dataloader = DataLoader(train_cluster_data, shuffle=True, batch_size=train_batch_size)
    GPUtil.showUtilization()
    # train_dataloader2 = DataLoader(train_cluster_data, shuffle=True, batch_size=train_batch_size)
    evaluator = ClusterEvaluator.from_input_examples(val_cluster_data, use_model_device)
    test_evaluator = ClusterEvaluator.from_input_examples(test_cluster_data, use_model_device)
    GPUtil.showUtilization()
    warmup_steps = int(len(train_dataloader) * num_epochs * warmup_frac)  # 10% of train data

    print("Raw BERT embedding performance")
    model.to(device)
    evaluator(model, output_path)
    GPUtil.showUtilization()

    # Train the model
    model.fit(train_objectives=[(train_dataloader, loss_model)],
              evaluator=evaluator,
              test_evaluator=test_evaluator,
              epochs=num_epochs,
              evaluation_steps=eval_steps,
              warmup_steps=warmup_steps,
              output_path=output_path)
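The helper above only trains and saves the encoder. As a minimal follow-up sketch (assuming the checkpoint written to output_path loads with the stock SentenceTransformer class; the path and cluster count below are illustrative), the trained embeddings can be clustered directly:

from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering

# Hypothetical usage: reload the trained checkpoint and cluster a few documents.
# The path and n_clusters are illustrative, not taken from the example above.
clustering_model = SentenceTransformer('output/bbcluster_model')
docs = ['first document', 'second document', 'third document', 'fourth document']
embeddings = clustering_model.encode(docs, convert_to_numpy=True)
labels = AgglomerativeClustering(n_clusters=2).fit_predict(embeddings)
print(labels)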
Example #2
def run():
    train_file = config.TRAINING_FILE
    train_batch = config.TRAIN_BATCH_SIZE
    valid_batch = config.VALID_BATCH_SIZE
    model_path = config.BERT_PATH
    max_length = config.MAX_LEN
    dfs = pd.read_csv(train_file,
                      sep="\t",
                      names=['idx', 'sent1', 'sent2', 'label'])
    dfs['label'] = pd.to_numeric(dfs["label"], downcast='float')
    df_train, df_valid = model_selection.train_test_split(
        dfs,
        test_size=0.1,
        random_state=42,
        stratify=dfs.label.values,
    )

    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    dataset_reader = dataset.Dataset()

    train_dataset = dataset_reader.read(df_train, return_pt=True)
    valid_sentence1, valid_sentence2, valid_labels = dataset_reader.read(
        df_valid)

    train_dataloader = DataLoader(train_dataset,
                                  shuffle=True,
                                  batch_size=train_batch)
    # evaluator = evaluation.EmbeddingSimilarityEvaluator(valid_sentence1, valid_sentence2, valid_labels)
    evaluator = evaluation.BinaryClassificationEvaluator(
        valid_sentence1,
        valid_sentence2,
        valid_labels,
        batch_size=valid_batch,
        show_progress_bar=False)

    word_embedding_model = models.Transformer(model_path,
                                              max_seq_length=max_length)
    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension())
    dense_model = models.Dense(
        in_features=pooling_model.get_sentence_embedding_dimension(),
        out_features=max_length,
        activation_function=nn.Tanh())

    model = SentenceTransformer(
        modules=[word_embedding_model, pooling_model, dense_model])

    train_loss = losses.CosineSimilarityLoss(model)

    engine.train(train_dataloader, model, train_loss, evaluator)
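engine.train is project-specific and not shown; a hedged sketch of what such a wrapper typically does with the objects built above, using only the stock model.fit API (function name and hyperparameters are assumptions):

# Hypothetical stand-in for engine.train; epoch count and warmup are assumptions.
def train(train_dataloader, model, train_loss, evaluator,
          epochs=3, warmup_steps=100, output_path='output/sbert-cosine'):
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=evaluator,
              epochs=epochs,
              warmup_steps=warmup_steps,
              output_path=output_path)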
Example #3
def define_bert_encoder():
    word_embedding_model = models.Transformer('bert-base-uncased',
                                              max_seq_length=200)
    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension())
    dense_model = models.Dense(
        in_features=pooling_model.get_sentence_embedding_dimension(),
        out_features=200,
        activation_function=nn.Tanh())

    bert_model = SentenceTransformer(
        modules=[word_embedding_model, pooling_model, dense_model])
    return bert_model
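Once built, the encoder behaves like any other SentenceTransformer; a small usage sketch:

# Hypothetical usage of the encoder defined above.
bert_model = define_bert_encoder()
embeddings = bert_model.encode(['a short query', 'another sentence'])
print(embeddings.shape)  # (2, 200) because the Dense layer has out_features=200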
Example #4
    def __init__(self):
        word_embedding_model = models.Transformer(
            'sentence-transformers/bert-large-nli-max-tokens',
            max_seq_length=256)
        pooling_model = models.Pooling(
            word_embedding_model.get_word_embedding_dimension())
        dense_model = models.Dense(
            in_features=pooling_model.get_sentence_embedding_dimension(),
            out_features=256,
            activation_function=nn.Tanh())
        self.model = SentenceTransformer(
            modules=[word_embedding_model, pooling_model, dense_model])
        path = 'multinli_1.0/'
        self.MNLI_train_path = path + 'multinli_1.0_train.txt'
        self.MNLI_matched_test_path = path + 'multinli_1.0_dev_matched.txt'
        self.MNLI_mismatched_test_path = path + 'multinli_1.0_dev_mismatched.txt'
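The constructor above only stores the MultiNLI file paths; reading them is left to other methods. A hypothetical loader sketch (the column names sentence1, sentence2 and gold_label are assumed from the public MultiNLI release):

import csv
import pandas as pd
from sentence_transformers import InputExample

# Hypothetical MultiNLI reader; column names are assumptions from the public release.
label_map = {'contradiction': 0, 'entailment': 1, 'neutral': 2}

def load_mnli_examples(path):
    df = pd.read_csv(path, sep='\t', quoting=csv.QUOTE_NONE,
                     usecols=['sentence1', 'sentence2', 'gold_label'])
    examples = []
    for _, row in df.iterrows():
        if row['gold_label'] in label_map:  # skips the unlabeled '-' rows
            examples.append(InputExample(texts=[row['sentence1'], row['sentence2']],
                                         label=label_map[row['gold_label']]))
    return examples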
Example #5
def run_binary_model(train_pairs, val_cluster_data, test_cluster_data, output_path, train_batch_size, eval_steps, num_epochs, warmup_frac,
                       use_model_device, model_name='distilbert-base-uncased', out_features=256):
    task = Task.init(project_name='BB Clustering', task_name='bbclustering_pairs')
    if torch.cuda.is_available():
        device = torch.device('cuda')
        print('CUDA is available and using device: ' + str(device))
    else:
        device = torch.device('cpu')
        print('CUDA not available, using device: ' + str(device))
    ### Configure sentence transformers for training and train on the provided dataset
    # Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
    word_embedding_model = models.Transformer(model_name)

    # Apply mean pooling to get one fixed sized sentence vector
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                   pooling_mode_mean_tokens=True,
                                   pooling_mode_cls_token=False,
                                   pooling_mode_max_tokens=False)

    doc_dense_model = models.Dense(in_features=pooling_model.get_sentence_embedding_dimension(),
                                   out_features=out_features,
                                   activation_function=nn.Tanh())

    model = CustomSentenceTransformer(modules=[word_embedding_model, pooling_model, doc_dense_model])

    train_dataloader = DataLoader(train_pairs, shuffle=True, batch_size=train_batch_size)
    train_loss = BinaryLoss(model=model)

    evaluator = ClusterEvaluator.from_input_examples(val_cluster_data, use_model_device)
    test_evaluator = ClusterEvaluator.from_input_examples(test_cluster_data, use_model_device)

    warmup_steps = int(len(train_dataloader) * num_epochs * warmup_frac)  # 10% of train data

    print("Raw BERT embedding performance")
    model.to(device)
    evaluator(model, output_path)

    # Train the model
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=evaluator,
              test_evaluator=test_evaluator,
              epochs=num_epochs,
              evaluation_steps=eval_steps,
              warmup_steps=warmup_steps,
              output_path=output_path)
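BinaryLoss is project-specific. If it is unavailable, the stock losses.ContrastiveLoss is a comparable pairwise objective for labelled same-cluster/different-cluster pairs; this is a substitute sketch, not the author's loss:

from sentence_transformers import losses

# Assumption: train_pairs are InputExamples labelled 1 (same cluster) or 0 (different cluster).
train_loss = losses.ContrastiveLoss(model=model, margin=0.5)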
Example #6
def do_test(pt_file, model_name, n):
    text = []
    i = 0
    with open(pt_file, 'r', encoding='utf8') as f:
        for l in f:
            text.append(l.split('\t')[1])
            i += 1
            if i >= n:
                break
    psg_word_embedding_model = models.Transformer(model_name)

    # Apply mean pooling to get one fixed sized sentence vector
    psg_pooling_model = models.Pooling(
        psg_word_embedding_model.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False)

    psg_dense_model = models.Dense(
        in_features=psg_pooling_model.get_sentence_embedding_dimension(),
        out_features=256,
        activation_function=nn.Tanh())
    psg_model = CustomSentenceTransformer(
        modules=[psg_word_embedding_model, psg_pooling_model, psg_dense_model])
    if torch.cuda.is_available():
        psg_model.to(torch.device('cuda'))
    psg_features = []
    print('Tokenizing')
    for p in text:
        psg_tkn = psg_model.tokenize(p)
        if torch.cuda.is_available():
            batch_to_device(psg_tkn, torch.device('cuda'))
        psg_features.append(psg_tkn)
    psg_embs = []
    print('Embedding')
    for pfet in psg_features:
        psg_emb = psg_model(pfet)['sentence_embedding']
        psg_emb = psg_emb.to(torch.device('cpu'))  # Tensor.to is not in-place; keep the CPU copy
        psg_embs.append(psg_emb)
    print(psg_embs[:10])
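Tokenising and embedding one passage at a time, as above, is mainly useful for inspecting intermediate features. Assuming CustomSentenceTransformer keeps the stock encode method, the batched equivalent is roughly:

# Simpler batched alternative to the per-passage loop above (batch size is an assumption).
psg_embs = psg_model.encode(text, batch_size=32, convert_to_tensor=True,
                            show_progress_bar=True)
print(psg_embs.shape)  # (n, 256)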
Example #7
# Initialize the WordWeights model. This model must be between the WordEmbeddings and the Pooling model
word_weights = models.WordWeights(vocab=vocab,
                                  word_weights=word_weights,
                                  unknown_word_weight=unknown_word_weight)

# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False)

# Add two trainable feed-forward networks (DAN)
sent_embeddings_dimension = pooling_model.get_sentence_embedding_dimension()
dan1 = models.Dense(in_features=sent_embeddings_dimension,
                    out_features=sent_embeddings_dimension)
dan2 = models.Dense(in_features=sent_embeddings_dimension,
                    out_features=sent_embeddings_dimension)

model = SentenceTransformer(
    modules=[word_embedding_model, word_weights, pooling_model, dan1, dan2])

# Convert the dataset to a DataLoader ready for training
logging.info("Read STSbenchmark train dataset")
train_data = SentencesDataset(sts_reader.get_examples('sts-train.csv'),
                              model=model)
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)

logging.info("Read STSbenchmark dev dataset")
dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'),
                            model=model)
Example #8
nli_sentences = list(nli_sentences)
random.shuffle(nli_sentences)

#To determine the PCA matrix, we need some example sentence embeddings.
#Here, we compute the embeddings for 20k random sentences from the AllNLI dataset
pca_train_sentences = nli_sentences[0:20000]
train_embeddings = model.encode(pca_train_sentences, convert_to_numpy=True)

#Compute PCA on the train embeddings matrix
pca = PCA(n_components=new_dimension)
pca.fit(train_embeddings)
pca_comp = np.asarray(pca.components_)

# We add a dense layer to the model, so that it will produce directly embeddings with the new size
dense = models.Dense(in_features=model.get_sentence_embedding_dimension(),
                     out_features=new_dimension,
                     bias=False,
                     activation_function=torch.nn.Identity())
dense.linear.weight = torch.nn.Parameter(torch.tensor(pca_comp))
model.add_module('dense', dense)

# Evaluate the model with the reduced embedding size
logger.info("Model with {} dimensions:".format(new_dimension))
stsb_evaluator(model)

# If you like, you can store the model on disc by uncommenting the following line
#model.save('models/bert-base-nli-stsb-mean-tokens-128dim')

# You can then load the adapted model that produces 128 dimensional embeddings like this:
#model = SentenceTransformer('models/bert-base-nli-stsb-mean-tokens-128dim')
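After the PCA-initialised dense layer is attached, the model emits lower-dimensional vectors directly; a quick sanity check (assuming new_dimension = 128, as in the stock SBERT dimensionality-reduction example):

# Sanity check: embeddings now come out with the reduced dimensionality.
embs = model.encode(nli_sentences[:8], convert_to_numpy=True)
print(embs.shape)  # (8, 128) when new_dimension == 128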
Example #9
#%%
print("Setting model...")
modules = []
word_embedding_model = models.Transformer(args.model, max_seq_length=128)
modules.append(word_embedding_model)

pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode_cls_token=True,
    pooling_mode_mean_tokens=False)
modules.append(pooling_model)

if args.dense:
    dense_model = None  # stays None for an unknown activation, so the assert below fires
    if args.activation == 'tanh':
        dense_model = models.Dense(
            in_features=pooling_model.get_sentence_embedding_dimension(),
            out_features=256,
            activation_function=nn.Tanh())
    elif args.activation == 'sigmoid':
        dense_model = models.Dense(
            in_features=pooling_model.get_sentence_embedding_dimension(),
            out_features=256,
            activation_function=nn.Sigmoid())
    elif args.activation == 'relu':
        dense_model = models.Dense(
            in_features=pooling_model.get_sentence_embedding_dimension(),
            out_features=256,
            activation_function=nn.ReLU())
    assert dense_model, f"unknown activation function {args.activation}"
    modules.append(dense_model)

model = SentenceTransformer(modules=modules)
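The snippet relies on an args namespace that is not shown; a hypothetical argparse block that would produce it (flag names and defaults are guesses based on the attributes used above):

import argparse

# Hypothetical parser matching the attributes referenced above (args.model, args.dense, args.activation).
parser = argparse.ArgumentParser()
parser.add_argument('--model', type=str, default='bert-base-uncased')
parser.add_argument('--dense', action='store_true',
                    help='append a Dense projection head after pooling')
parser.add_argument('--activation', type=str, default='tanh',
                    choices=['tanh', 'sigmoid', 'relu'])
args = parser.parse_args()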
Example #10
    args = parser.parse_args()

    base_model = args.base_model
    sentence_embedding_dim = args.sentence_embedding_dim
    model_save_path = args.model_save_path
    batch_size = args.batch_size
    epochs = args.epochs
    dataset = args.dataset
    task_type = args.task_type
    masked = args.masked

    word_embedding_model = models.Transformer(base_model, max_seq_length=256)
    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension())
    dense_model = models.Dense(
        in_features=pooling_model.get_sentence_embedding_dimension(),
        out_features=sentence_embedding_dim,
        activation_function=nn.Tanh())

    model = SentenceTransformer(
        modules=[word_embedding_model, pooling_model, dense_model])

    train_examples = ld.load_dataset(dataset_name=dataset,
                                     dataset_type='train')

    train_dataset = SentencesDataset(train_examples, model)
    train_dataloader = DataLoader(train_dataset,
                                  shuffle=True,
                                  batch_size=batch_size)

    train_loss = losses.ContrastiveLoss(model=model)
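The example stops after building the contrastive loss; the usual continuation, sketched with the stock fit API and the variables defined above (the warmup fraction is an assumption):

    # Hypothetical continuation: train with the contrastive objective and save the result.
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              epochs=epochs,
              warmup_steps=int(0.1 * len(train_dataloader) * epochs),
              output_path=model_save_path)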
Example #11
def main():
    parser = argparse.ArgumentParser(description='Start training with SBERT')
    parser.add_argument('--model_path',
                    type=str,
                    help='Path to trained model folder ./models/[MODEL_NAME]')
    parser.add_argument('--dataset',
                    type=str,
                    default='few_rel',
                    help='Name dataset')  
    parser.add_argument('--mask_method',
                    type=str,
                    default='bracket',
                    help='Type of masking')    
    parser.add_argument('--num_epochs',
                    type=int,
                    default=15,
                    help='Number epochs')                                
    parser.add_argument('--num_samples',
                    type=int,
                    default=-1,
                    help='Number of samples for test run, default -1 means all data')
    parser.add_argument('--max_seq_length',
                    type=int,
                    default=256,
                    help='Max token length for BERT')
    args = parser.parse_args()

    model_path = args.model_path
    dataset = args.dataset
    mask_method = args.mask_method
    num_samples = args.num_samples
    max_seq_length = args.max_seq_length
    num_epochs = args.num_epochs
    evaluation_steps = 1000 # Frequency of evaluation results
    warmup_steps = 1000 # warm up steps
    sentence_out_embedding_dimension = 256

    if model_path.endswith('/'):
        model_path = model_path[:-1]
    model_name = model_path.split('/')[-1]

    path_train_data = f'./data/train_samples/{dataset}_train_{mask_method}_train.csv'
    path_eval_data = f'./data/train_samples/{dataset}_val_{mask_method}_test.csv'
    if num_samples>0:
        model_save_path = f'./trained_models/{model_name}_sbert_bi_{dataset}_test/'
    else:
        model_save_path = f'./trained_models/{model_name}_sbert_bi_{dataset}/'
    ### Define the model
    word_embedding_model = models.Transformer(model_path, max_seq_length=max_seq_length)

    ### Add special tokens - this helps us add tokens like Doc or query or Entity1 / Entity2 
    # but in our case we already added that to the model prior
    #tokens = ["[DOC]", "[QRY]"]
    #word_embedding_model.tokenizer.add_tokens(tokens, special_tokens=True)
    #word_embedding_model.auto_model.resize_token_embeddings(len(word_embedding_model.tokenizer))

    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
    dense_model = models.Dense(in_features=pooling_model.get_sentence_embedding_dimension(), 
                        out_features=sentence_out_embedding_dimension, activation_function=nn.Tanh())
    # Model pipeline
    model = SentenceTransformer(modules=[word_embedding_model, pooling_model, dense_model])

    # Prep DataLoader
    train_examples = load_train_sbert(path_train_data, num_samples)
    train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)

    # Prep Evaluator
    sentences1, sentences2, scores = load_eval_sbert(path_eval_data, num_samples)
    #evaluator = evaluation.EmbeddingSimilarityEvaluator(sentences1, sentences2, scores)
    evaluator = evaluation.BinaryClassificationEvaluator(sentences1, sentences2, scores)
    #train_loss = losses.CosineSimilarityLoss(model)
    train_loss = losses.SoftmaxLoss(model, sentence_embedding_dimension=sentence_out_embedding_dimension, num_labels=2)

    #Tune the model
    model.fit(train_objectives=[(train_dataloader, train_loss)],
            evaluator=evaluator,
            epochs=num_epochs,
            evaluation_steps=evaluation_steps,
            warmup_steps=warmup_steps,
            output_path=model_save_path)
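load_train_sbert and load_eval_sbert are project helpers. Given that SoftmaxLoss is configured with num_labels=2, the training file presumably holds labelled sentence pairs; a hypothetical reader sketch (file layout and column names are assumptions):

import pandas as pd
from sentence_transformers import InputExample

# Hypothetical reader for the training CSV; column names are assumptions.
def load_train_sbert(path, num_samples=-1):
    df = pd.read_csv(path)
    if num_samples > 0:
        df = df.head(num_samples)
    return [InputExample(texts=[row['sent1'], row['sent2']], label=int(row['label']))
            for _, row in df.iterrows()]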
Example #12
def train(train_cluster_data, val_cluster_data, test_cluster_data, output_path, eval_steps,
          num_epochs, warmup_frac, lambda_val, reg, use_model_device, max_train_size=-1, train_psg_model=False,
          model_name='distilbert-base-uncased', out_features=256, steps_per_epoch=None, weight_decay=0.01,
          optimizer_class=transformers.AdamW, scheduler='WarmupLinear', optimizer_params={'lr':2e-5},
          show_progress_bar=True, max_grad_norm=1, save_best_model=True):
    tensorboard_writer = SummaryWriter('./tensorboard_logs')
    task = Task.init(project_name='Query Specific BB Clustering', task_name='query_bbc_fixed_lambda')
    config_dict = {'lambda_val': lambda_val, 'reg': reg}
    config_dict = task.connect(config_dict)
    if torch.cuda.is_available():
        device = torch.device('cuda')
        print('CUDA is available and using device: '+str(device))
    else:
        device = torch.device('cpu')
        print('CUDA not available, using device: '+str(device))
    ### Configure sentence transformers for training and train on the provided dataset
    # Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
    query_word_embedding_model = models.Transformer(model_name)

    # Apply mean pooling to get one fixed sized sentence vector
    query_pooling_model = models.Pooling(query_word_embedding_model.get_word_embedding_dimension(),
                                   pooling_mode_mean_tokens=True,
                                   pooling_mode_cls_token=False,
                                   pooling_mode_max_tokens=False)

    query_dense_model = models.Dense(in_features=query_pooling_model.get_sentence_embedding_dimension(),
                                     out_features=out_features,
                                     activation_function=nn.Sigmoid())
    psg_word_embedding_model = models.Transformer(model_name)

    # Apply mean pooling to get one fixed sized sentence vector
    psg_pooling_model = models.Pooling(psg_word_embedding_model.get_word_embedding_dimension(),
                                         pooling_mode_mean_tokens=True,
                                         pooling_mode_cls_token=False,
                                         pooling_mode_max_tokens=False)

    psg_dense_model = models.Dense(in_features=psg_pooling_model.get_sentence_embedding_dimension(),
                                     out_features=out_features,
                                     activation_function=nn.Tanh())

    query_model = CustomSentenceTransformer(modules=[query_word_embedding_model, query_pooling_model,
                                                     query_dense_model])
    psg_model = SentenceTransformer(modules=[psg_word_embedding_model, psg_pooling_model, psg_dense_model])

    model = QuerySpecificClusterModel(query_transformer=query_model, psg_transformer=psg_model, device=device)

    train_dataloader = DataLoader(train_cluster_data, shuffle=True, batch_size=1)
    evaluator = QueryClusterEvaluator.from_input_examples(val_cluster_data, use_model_device)
    test_evaluator = QueryClusterEvaluator.from_input_examples(test_cluster_data, use_model_device)

    warmup_steps = int(len(train_dataloader) * num_epochs * warmup_frac)  # 10% of train data

    print("Untrained performance")
    model.to(device)
    evaluator(model)

    train_dataloader.collate_fn = model.query_batch_collate_fn

    # Train the model
    best_score = -9999999
    if steps_per_epoch is None or steps_per_epoch == 0:
        steps_per_epoch = len(train_dataloader)
    num_train_steps = int(steps_per_epoch * num_epochs)
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': weight_decay},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    data_iter = iter(train_dataloader)
    optimizer = optimizer_class(optimizer_grouped_parameters, **optimizer_params)
    scheduler_obj = model._get_scheduler(optimizer, scheduler=scheduler, warmup_steps=warmup_steps,
                                        t_total=num_train_steps)
    config = {'epochs': num_epochs, 'steps_per_epoch': steps_per_epoch}
    global_step = 0
    loss_model = BBClusterLossModel(model, device, lambda_val, reg)
    for epoch in trange(config.get('epochs'), desc="Epoch", disable=not show_progress_bar):
        training_steps = 0
        running_loss_0 = 0.0
        model.zero_grad()
        model.train()
        if not train_psg_model:
            for m in model.psg_model.modules():
                m.training = False
        for _ in trange(config.get('steps_per_epoch'), desc="Iteration", smoothing=0.05, disable=not show_progress_bar):
            try:
                data = next(data_iter)
            except StopIteration:
                data_iter = iter(train_dataloader)
                data = next(data_iter)
            query_feature, psg_features, labels = data
            if max_train_size > 0 and labels.shape[1] > max_train_size:
                print('skipping instance with '+str(labels.shape[1])+' passages')
                continue
            loss_val = loss_model(query_feature, psg_features, labels)
            running_loss_0 += loss_val.item()
            loss_val.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
            optimizer.zero_grad()
            scheduler_obj.step()
            training_steps += 1
            global_step += 1

            if eval_steps > 0 and training_steps % eval_steps == 0:
                tensorboard_writer.add_scalar('training_loss', running_loss_0 / eval_steps, global_step)
                # logger.report_scalar('Loss', 'training_loss', iteration=global_step, value=running_loss_0/evaluation_steps)
                running_loss_0 = 0.0
                # self._eval_during_training(evaluator, output_path, save_best_model, epoch, training_steps, callback)
                if evaluator is not None:
                    score = evaluator(model, output_path=output_path, epoch=epoch, steps=training_steps)
                    tensorboard_writer.add_scalar('val_ARI', score, global_step)
                    # logger.report_scalar('Training progress', 'val_ARI', iteration=global_step, value=score)
                    if score > best_score:
                        best_score = score
                        if save_best_model:
                            print('Saving model at: ' + output_path)
                            model.save(output_path)
                model.zero_grad()
                model.train()
                if not train_psg_model:
                    for m in model.psg_model.modules():
                        m.training = False
        if evaluator is not None:
            score = evaluator(model, output_path=output_path, epoch=epoch, steps=training_steps)
            tensorboard_writer.add_scalar('val_ARI', score, global_step)
            # logger.report_scalar('Training progress', 'val_ARI', iteration=global_step, value=score)
            if score > best_score:
                best_score = score
                if save_best_model:
                    model.save(output_path)
        if test_evaluator is not None:
            best_model = QuerySpecificClusterModel(output_path)
            if torch.cuda.is_available():
                model.to(torch.device('cpu'))
                best_model.to(device)
                test_ari = test_evaluator(best_model)
                best_model.to(torch.device('cpu'))
                model.to(device)
            else:
                test_ari = test_evaluator(best_model)
            tensorboard_writer.add_scalar('test_ARI', test_ari, global_step)
            # logger.report_scalar('Training progress', 'test_ARI', iteration=global_step, value=test_ari)
    if evaluator is None and output_path is not None:  # No evaluator, but output path: save final model version
        model.save(output_path)
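One caveat about the loop above: setting m.training = False only switches dropout and batch-norm into eval behaviour; it does not stop gradients from reaching psg_model, whose parameters are still in the optimizer. A hedged sketch of how the passage encoder could be frozen outright:

# Assumption: model.psg_model is a torch.nn.Module whose weights should stay fixed.
if not train_psg_model:
    for param in model.psg_model.parameters():
        param.requires_grad = False  # excluded from gradient updates
    model.psg_model.eval()           # dropout/batch-norm in eval mode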
Example #13
    datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))

# Check if the dataset exists. If not, download and extract it
sts_dataset_path = 'datasets/stsbenchmark.tsv.gz'

if not os.path.exists(sts_dataset_path):
    util.http_get('https://sbert.net/datasets/stsbenchmark.tsv.gz',
                  sts_dataset_path)

# Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
word_embedding_model = models.Transformer(model_name, max_seq_length=32)

# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension())
dense = models.Dense(pooling_model.get_sentence_embedding_dimension(),
                     pooling_model.get_sentence_embedding_dimension())
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# We use 1 Million sentences from Wikipedia to train our model
wikipedia_dataset_path = 'datasets/wiki1m_for_simcse.txt'
if not os.path.exists(wikipedia_dataset_path):
    util.http_get(
        'https://huggingface.co/datasets/princeton-nlp/datasets-for-simcse/resolve/main/wiki1m_for_simcse.txt',
        wikipedia_dataset_path)

# train_samples is a list of InputExample objects where we pass the same sentence twice to texts, i.e. texts=[sent, sent]
train_samples = []
with open(wikipedia_dataset_path, 'r', encoding='utf8') as fIn:
    for line in fIn:
        train_samples.append(InputExample(texts=[line.strip(), line.strip()]))
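With every sentence duplicated inside an InputExample, the usual SimCSE-style continuation is to train with MultipleNegativesRankingLoss over in-batch negatives; a sketch using the objects above (batch size and epoch count are assumptions):

from torch.utils.data import DataLoader
from sentence_transformers import losses

# SimCSE-style training: the two identical texts per example get different dropout masks,
# and the other sentences in the batch act as negatives.
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=128, drop_last=True)
train_loss = losses.MultipleNegativesRankingLoss(model)
model.fit(train_objectives=[(train_dataloader, train_loss)],
          epochs=1,
          show_progress_bar=True)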
Example #14
def main(opt):
    ArgumentParser.validate_preprocess_args(opt)
    torch.manual_seed(opt.seed)
    if not (opt.overwrite):
        check_existing_pt_files(opt)

    init_logger(opt.log_file)

    shutil.copy2(opt.config, os.path.dirname(opt.log_file))
    logger.info(opt)
    logger.info("Extracting features...")

    #Prepares the document embedding to initialize memory vectors.

    word_embedding_model = models.Transformer('bert-base-uncased',
                                              max_seq_length=256)
    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension())
    dense_model = models.Dense(
        in_features=pooling_model.get_sentence_embedding_dimension(),
        out_features=256,
        activation_function=nn.Tanh())

    embedder = SentenceTransformer(
        modules=[word_embedding_model, pooling_model, dense_model])
    # embedder = SentenceTransformer('bert-base-nli-mean-tokens')

    kpcorpus = []
    files_path = [  #'data/keyphrase/json/kp20k/kp20k_train.json',
        #   'data/keyphrase/json/kp20k/kp20k_valid.json',
        #   'data/keyphrase/json/kp20k/kp20k_test.json',
        #   'data/keyphrase/json/inspec/inspec_valid.json',
        #   'data/keyphrase/json/inspec/inspec_test.json',
        #   'data/keyphrase/json/krapivin/krapivin_valid.json',
        #   'data/keyphrase/json/krapivin/krapivin_test.json',
        #   'data/keyphrase/json/nus/split/nus_valid.json',
        #   'data/keyphrase/json/nus/split/nus_test.json',
        #   'data/keyphrase/json/semeval/semeval_valid.json',
        #   'data/keyphrase/json/semeval/semeval_test.json',
        #   'data/keyphrase/json/duc/split/duc_valid.json',
        #   'data/keyphrase/json/duc/split/duc_test.json'
        'data/keyphrase/json/twitter_conv/twitter_conv_valid.json',
        'data/keyphrase/json/twitter_conv/twitter_conv_train.json',
        'data/keyphrase/json/twitter_conv/twitter_conv_test.json'
    ]
    for file_path in files_path:
        file = open(file_path, 'r')
        for line in file.readlines():
            dic = json.loads(line)
            # print(dic)
            kpcorpus.append(dic['title'] + ' ' + dic['abstract'])
            # print(kpcorpus)

    num_of_example = len(kpcorpus)
    print("number of examples in corpus: ", num_of_example)
    time_a = time.time()
    corpus_embeddings = embedder.encode(kpcorpus[:num_of_example])
    print("elapsed time: ", time.time() - time_a)
    alldocs_emb = torch.Tensor(corpus_embeddings)
    torch.save(alldocs_emb, './data/alldocs_emb')

    src_nfeats = 0
    tgt_nfeats = 0
    for src, tgt in zip(opt.train_src, opt.train_tgt):
        src_nfeats += count_features(src) if opt.data_type == 'text' \
            else 0
        tgt_nfeats += count_features(tgt)  # tgt always text so far
    logger.info(" * number of source features: %d." % src_nfeats)
    logger.info(" * number of target features: %d." % tgt_nfeats)

    logger.info("Building `Fields` object...")
    fields = inputters.get_fields(opt.data_type,
                                  src_nfeats,
                                  tgt_nfeats,
                                  dynamic_dict=opt.dynamic_dict,
                                  src_truncate=opt.src_seq_length_trunc,
                                  tgt_truncate=opt.tgt_seq_length_trunc)

    src_reader = inputters.str2reader[opt.data_type].from_opt(opt)
    tgt_reader = inputters.str2reader[opt.data_type].from_opt(opt)

    logger.info("Building & saving training data...")
    build_save_dataset('train', fields, src_reader, tgt_reader, opt)

    if opt.valid_src and opt.valid_tgt:
        logger.info("Building & saving validation data...")
        build_save_dataset('valid', fields, src_reader, tgt_reader, opt)
Example #15
def _run_fixed_lambda_bbcluster(train_batch_size,
                                num_epochs,
                                lambda_val,
                                reg,
                                use_model_device,
                                eval_steps,
                                out_path,
                                warmup_frac=0.1,
                                model_name='distilbert-base-uncased',
                                out_features=256):
    exp_task = Task.create(project_name='Optuna Hyperparam optim',
                           task_name='trial')
    config_dict = {'lambda_val': lambda_val, 'reg': reg}
    config_dict = exp_task.connect(config_dict)  # connect to the task created above
    if torch.cuda.is_available():
        device = torch.device('cuda')
        print('CUDA is available and using device: ' + str(device))
    else:
        device = torch.device('cpu')
        print('CUDA not available, using device: ' + str(device))
    word_embedding_model = models.Transformer(model_name)

    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False)

    doc_dense_model = models.Dense(
        in_features=pooling_model.get_sentence_embedding_dimension(),
        out_features=out_features,
        activation_function=nn.Tanh())

    model = CustomSentenceTransformer(
        modules=[word_embedding_model, pooling_model, doc_dense_model])
    loss_model = BBClusterLossModel(model=model,
                                    device=device,
                                    lambda_val=config_dict.get(
                                        'lambda_val', lambda_val),
                                    reg_const=config_dict.get('reg', reg))

    train_dataloader = DataLoader(train_cluster_data,
                                  shuffle=True,
                                  batch_size=train_batch_size)
    evaluator = ClusterEvaluator.from_input_examples(val_cluster_data,
                                                     use_model_device)

    warmup_steps = int(len(train_dataloader) * num_epochs *
                       warmup_frac)  # 10% of train data

    model.to(device)

    # Train the model
    model.fit(train_objectives=[(train_dataloader, loss_model)],
              epochs=num_epochs,
              warmup_steps=warmup_steps,
              evaluator=evaluator,
              evaluation_steps=eval_steps,
              output_path=out_path)
    best_model = CustomSentenceTransformer(out_path)
    return evaluator(best_model)
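The task name suggests this helper is meant to be driven by an Optuna study; a hypothetical objective wrapper (search ranges and fixed arguments below are illustrative):

import optuna

# Hypothetical Optuna objective around the helper above; ranges and settings are illustrative.
def objective(trial):
    lambda_val = trial.suggest_float('lambda_val', 20.0, 200.0)
    reg = trial.suggest_float('reg', 0.0, 10.0)
    return _run_fixed_lambda_bbcluster(train_batch_size=1, num_epochs=1,
                                       lambda_val=lambda_val, reg=reg,
                                       use_model_device=True, eval_steps=100,
                                       out_path='output/optuna_trial')

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)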