Example #1
    def __init__(self, path:str=None, query_transformer:CustomSentenceTransformer=None,
                 psg_transformer:CustomSentenceTransformer=None, device:torch.device=None):
        super(QuerySpecificClusterModel, self).__init__()
        if path is not None:
            self.query_model = CustomSentenceTransformer(path+'/query_model')
            self.psg_model = CustomSentenceTransformer(path+'/psg_model')
        else:
            self.query_model = query_transformer
            self.psg_model = psg_transformer
        self.optim = OptimCluster
        self.device = device
Example #2
def do_test(pt_file, model_name, n):
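    """Embed the first n passages read from a tab-separated file (text taken from the
    second column) with a freshly built transformer + mean-pooling + dense model, and
    print the first 10 embeddings."""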
    text = []
    i = 0
    with open(pt_file, 'r', encoding='utf8') as f:
        for l in f:
            text.append(l.split('\t')[1])
            i += 1
            if i >= n:
                break
    psg_word_embedding_model = models.Transformer(model_name)

    # Apply mean pooling to get one fixed-size sentence vector
    psg_pooling_model = models.Pooling(
        psg_word_embedding_model.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False)

    psg_dense_model = models.Dense(
        in_features=psg_pooling_model.get_sentence_embedding_dimension(),
        out_features=256,
        activation_function=nn.Tanh())
    psg_model = CustomSentenceTransformer(
        modules=[psg_word_embedding_model, psg_pooling_model, psg_dense_model])
    if torch.cuda.is_available():
        psg_model.to(torch.device('cuda'))
    psg_features = []
    print('Tokenizing')
    for p in text:
        psg_tkn = psg_model.tokenize(p)
        if torch.cuda.is_available():
            batch_to_device(psg_tkn, torch.device('cuda'))
        psg_features.append(psg_tkn)
    psg_embs = []
    print('Embedding')
    for pfet in psg_features:
        psg_emb = psg_model(pfet)['sentence_embedding']
        # Tensor.to() is not in-place, so keep the returned CPU copy
        psg_emb = psg_emb.to(torch.device('cpu'))
        psg_embs.append(psg_emb)
    print(psg_embs[:10])
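
# Example invocation (hypothetical file path and arguments):
# do_test('by1test_paratext.tsv', 'distilbert-base-uncased', 100)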
Example #3
def run_fixed_lambda_bbcluster(train_cluster_data, val_cluster_data, test_cluster_data, output_path, train_batch_size, eval_steps,
                               num_epochs, warmup_frac, lambda_val, reg, beta, loss_name, use_model_device, model_name='distilbert-base-uncased', out_features=256):
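    """Train a CustomSentenceTransformer with a fixed-lambda blackbox clustering loss
    (BBSpectralClusterLossModel when loss_name == 'bbspec', otherwise BBClusterLossModel)
    and evaluate clustering quality on the validation and test sets."""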
    task = Task.init(project_name='BB Clustering', task_name='bbclustering_fixed_lambda')
    config_dict = {'lambda_val': lambda_val, 'reg': reg}
    config_dict = task.connect(config_dict)
    if torch.cuda.is_available():
        device = torch.device('cuda')
        print('CUDA is available and using device: '+str(device))
    else:
        device = torch.device('cpu')
        print('CUDA not available, using device: '+str(device))
    ### Configure sentence transformers for training and train on the provided dataset
    # Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
    word_embedding_model = models.Transformer(model_name)

    # Apply mean pooling to get one fixed-size sentence vector
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                   pooling_mode_mean_tokens=True,
                                   pooling_mode_cls_token=False,
                                   pooling_mode_max_tokens=False)

    doc_dense_model = models.Dense(in_features=pooling_model.get_sentence_embedding_dimension(), out_features=out_features,
                                   activation_function=nn.Tanh())

    model = CustomSentenceTransformer(modules=[word_embedding_model, pooling_model, doc_dense_model])
    # model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
    GPUtil.showUtilization()
    if loss_name == 'bbspec':
        loss_model = BBSpectralClusterLossModel(model=model, device=device,
                                                lambda_val=config_dict.get('lambda_val', lambda_val),
                                                reg_const=config_dict.get('reg', reg), beta=beta)
    else:
        loss_model = BBClusterLossModel(model=model, device=device,
                                        lambda_val=config_dict.get('lambda_val', lambda_val),
                                        reg_const=config_dict.get('reg', reg))
    # reg_loss_model = ClusterDistLossModel(model=model)

    train_dataloader = DataLoader(train_cluster_data, shuffle=True, batch_size=train_batch_size)
    GPUtil.showUtilization()
    # train_dataloader2 = DataLoader(train_cluster_data, shuffle=True, batch_size=train_batch_size)
    evaluator = ClusterEvaluator.from_input_examples(val_cluster_data, use_model_device)
    test_evaluator = ClusterEvaluator.from_input_examples(test_cluster_data, use_model_device)
    GPUtil.showUtilization()
    warmup_steps = int(len(train_dataloader) * num_epochs * warmup_frac)  # warmup_frac of the total train steps

    print("Raw BERT embedding performance")
    model.to(device)
    evaluator(model, output_path)
    GPUtil.showUtilization()

    # Train the model
    model.fit(train_objectives=[(train_dataloader, loss_model)],
              evaluator=evaluator,
              test_evaluator=test_evaluator,
              epochs=num_epochs,
              evaluation_steps=eval_steps,
              warmup_steps=warmup_steps,
              output_path=output_path)
Example #4
def run_binary_model(train_pairs, val_cluster_data, test_cluster_data, output_path, train_batch_size, eval_steps, num_epochs, warmup_frac,
                       use_model_device, model_name='distilbert-base-uncased', out_features=256):
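    """Train a CustomSentenceTransformer on passage pairs with BinaryLoss and evaluate
    clustering quality on the validation and test sets."""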
    task = Task.init(project_name='BB Clustering', task_name='bbclustering_pairs')
    if torch.cuda.is_available():
        device = torch.device('cuda')
        print('CUDA is available and using device: ' + str(device))
    else:
        device = torch.device('cpu')
        print('CUDA not available, using device: ' + str(device))
    ### Configure sentence transformers for training and train on the provided dataset
    # Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
    word_embedding_model = models.Transformer(model_name)

    # Apply mean pooling to get one fixed-size sentence vector
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                   pooling_mode_mean_tokens=True,
                                   pooling_mode_cls_token=False,
                                   pooling_mode_max_tokens=False)

    doc_dense_model = models.Dense(in_features=pooling_model.get_sentence_embedding_dimension(),
                                   out_features=out_features,
                                   activation_function=nn.Tanh())

    model = CustomSentenceTransformer(modules=[word_embedding_model, pooling_model, doc_dense_model])

    train_dataloader = DataLoader(train_pairs, shuffle=True, batch_size=train_batch_size)
    train_loss = BinaryLoss(model=model)

    evaluator = ClusterEvaluator.from_input_examples(val_cluster_data, use_model_device)
    test_evaluator = ClusterEvaluator.from_input_examples(test_cluster_data, use_model_device)

    warmup_steps = int(len(train_dataloader) * num_epochs * warmup_frac)  # warmup_frac of the total train steps

    print("Raw BERT embedding performance")
    model.to(device)
    evaluator(model, output_path)

    # Train the model
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=evaluator,
              test_evaluator=test_evaluator,
              epochs=num_epochs,
              evaluation_steps=eval_steps,
              warmup_steps=warmup_steps,
              output_path=output_path)
Example #5
    return dist_mat


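# Command-line evaluation script: load a trained CustomSentenceTransformer and the
# TREC CAR benchmarkY1-test qrels/paratext, then build the requested (top-level or
# hierarchical) cluster data for evaluation.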
parser = argparse.ArgumentParser(description='Eval treccar experiments')
parser.add_argument('-ip', '--input_dir', default='~/trec_dataset')
parser.add_argument('-lv', '--level', default='top')
parser.add_argument('-pg', '--page_title')
parser.add_argument('-mp', '--model_path')
parser.add_argument('-out', '--outdict')
args = parser.parse_args()
input_dir = args.input_dir
level = args.level
page = args.page_title
model_path = args.model_path
outpath = args.outdict
model = CustomSentenceTransformer(model_path)

test_art_qrels = input_dir + '/benchmarkY1/benchmarkY1-test-nodup/test.pages.cbor-article.qrels'
test_top_qrels = input_dir + '/benchmarkY1/benchmarkY1-test-nodup/test.pages.cbor-toplevel.qrels'
test_hier_qrels = input_dir + '/benchmarkY1/benchmarkY1-test-nodup/test.pages.cbor-hierarchical.qrels'
test_paratext = input_dir + '/benchmarkY1/benchmarkY1-test-nodup/by1test_paratext/by1test_paratext.tsv'
test_top_cluster_data, test_hier_cluster_data = prepare_cluster_data2(
    test_art_qrels, test_top_qrels, test_hier_qrels, test_paratext, False, -1, 0)
if level == 'top':
    test_cluster_data = test_top_cluster_data
else:
    test_cluster_data = test_hier_cluster_data
emb_dict = {}
for sample in test_cluster_data:
    print(sample.qid)
Example #6
mean_rand_tf = np.mean(np.array(rand_scores_tf))
mean_nmi_tf = np.mean(np.array(nmi_scores_tf))
mean_ami_tf = np.mean(np.array(ami_scores_tf))
mean_urand_tf = np.mean(np.array(urand_scores_tf))
print('TFIDF')
print("\nRAND: %.5f, NMI: %.5f, AMI: %.5f, URAND: %.5f\n" % (mean_rand_tf, mean_nmi_tf, mean_ami_tf, mean_urand_tf), flush=True)

word_embedding_model = models.Transformer(model_name)
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False)
doc_dense_model = models.Dense(
    in_features=pooling_model.get_sentence_embedding_dimension(),
    out_features=256,
    activation_function=nn.Tanh())

raw_model = CustomSentenceTransformer(modules=[word_embedding_model, pooling_model, doc_dense_model])

anchor_rand, anchor_nmi, anchor_ami, anchor_urand = [], [], [], []
for i, mp in enumerate(model_paths):
    m = CustomSentenceTransformer(mp)
    print('Model: ' + mp.split('/')[-1])
    if i == 0:
        print('This is the anchor model for paired ttest')
        anchor_rand, anchor_nmi, anchor_ami, anchor_urand = get_eval_scores(m, test_cluster_data)
    else:
        mean_rand, mean_nmi, mean_ami, mean_urand = get_eval_scores(m, test_cluster_data, anchor_rand, anchor_nmi, anchor_ami, anchor_urand)

mean_rand, mean_nmi, mean_ami, mean_urand = get_eval_scores(raw_model, test_cluster_data, anchor_rand, anchor_nmi, anchor_ami, anchor_urand)

rand_ttest_tf, nmi_ttest_tf, ami_ttest_tf, urand_ttest_tf = (ttest_rel(anchor_rand, rand_scores_tf), ttest_rel(anchor_nmi, nmi_scores_tf),
                                                             ttest_rel(anchor_ami, ami_scores_tf), ttest_rel(anchor_urand, urand_scores_tf))
Example #7
class QuerySpecificClusterModel(nn.Module):
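    """Pairs a query encoder with a passage encoder; forward() scales passage embeddings
    element-wise by the query embedding. Both sub-models are loaded from `path` when it
    is given, otherwise the supplied transformers are used."""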

    def __init__(self, path:str=None, query_transformer:CustomSentenceTransformer=None,
                 psg_transformer:CustomSentenceTransformer=None, device:torch.device=None):
        super(QuerySpecificClusterModel, self).__init__()
        if path is not None:
            self.query_model = CustomSentenceTransformer(path+'/query_model')
            self.psg_model = CustomSentenceTransformer(path+'/psg_model')
        else:
            self.query_model = query_transformer
            self.psg_model = psg_transformer
        self.optim = OptimCluster
        self.device = device

    def save(self, path):
        self.query_model.save(path+'/query_model')
        self.psg_model.save(path+'/psg_model')

    def _get_scheduler(self, optimizer, scheduler: str, warmup_steps: int, t_total: int):
        """
        Taken from SentenceTransformers
        Returns the correct learning rate scheduler
        """
        scheduler = scheduler.lower()
        if scheduler == 'constantlr':
            return transformers.get_constant_schedule(optimizer)
        elif scheduler == 'warmupconstant':
            return transformers.get_constant_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps)
        elif scheduler == 'warmuplinear':
            return transformers.get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps,
                                                                num_training_steps=t_total)
        elif scheduler == 'warmupcosine':
            return transformers.get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps,
                                                                num_training_steps=t_total)
        elif scheduler == 'warmupcosinewithhardrestarts':
            return transformers.get_cosine_with_hard_restarts_schedule_with_warmup(optimizer,
                                                                                   num_warmup_steps=warmup_steps,
                                                                                   num_training_steps=t_total)
        else:
            raise ValueError("Unknown scheduler {}".format(scheduler))

    def query_batch_collate_fn(self, batch):
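        """Collate a batch of examples into tokenized query features, a list of tokenized
        passage features (one entry per text position), and a label tensor, all moved to
        self.device."""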
        num_texts = len(batch[0].texts)
        queries = []
        texts = [[] for _ in range(num_texts)]
        labels = []

        for example in batch:
            queries.append(example.q_context)
            for idx, text in enumerate(example.texts):
                texts[idx].append(text)
            labels.append(example.label)

        labels = torch.tensor(labels).to(self.device)

        q_tokenized = self.query_model.tokenize(queries)
        batch_to_device(q_tokenized, self.device)

        psg_features = []
        for idx in range(num_texts):
            p_tokenized = self.psg_model.tokenize(texts[idx])
            batch_to_device(p_tokenized, self.device)
            psg_features.append(p_tokenized)

        return q_tokenized, psg_features, labels

    def forward(self, query_feature: Dict[str, Tensor], passage_features: Iterable[Dict[str, Tensor]], labels: Tensor):
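        """Embed the query and each group of passages, then scale every passage embedding
        element-wise by the query embedding, which is intended to lie in [0, 1] (e.g. via
        a sigmoid activation in the query model's dense layer)."""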
        n = labels.shape[1]

        query_embedding = self.query_model(query_feature)['sentence_embedding']
        # the query embedding acts as a scaling vector, so each of its elements should lie in [0, 1]
        psg_embeddings = torch.stack([self.psg_model(passages)['sentence_embedding']
                                      for passages in passage_features], dim=1)
        scaled_psg_embeddings = torch.tile(query_embedding.unsqueeze(1), (1, n, 1)) * psg_embeddings

        return scaled_psg_embeddings
Example #8
def train(train_cluster_data, val_cluster_data, test_cluster_data, output_path, eval_steps,
          num_epochs, warmup_frac, lambda_val, reg, use_model_device, max_train_size=-1, train_psg_model=False,
          model_name='distilbert-base-uncased', out_features=256, steps_per_epoch=None, weight_decay=0.01,
          optimizer_class=transformers.AdamW, scheduler='WarmupLinear', optimizer_params={'lr':2e-5},
          show_progress_bar=True, max_grad_norm=1, save_best_model=True):
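    """Custom training loop for QuerySpecificClusterModel with BBClusterLossModel: builds
    separate query (sigmoid dense) and passage (tanh dense) encoders, trains with warmup
    scheduling and gradient clipping, periodically logs the validation ARI, and saves and
    test-evaluates the best-scoring model."""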
    tensorboard_writer = SummaryWriter('./tensorboard_logs')
    task = Task.init(project_name='Query Specific BB Clustering', task_name='query_bbc_fixed_lambda')
    config_dict = {'lambda_val': lambda_val, 'reg': reg}
    config_dict = task.connect(config_dict)
    if torch.cuda.is_available():
        device = torch.device('cuda')
        print('CUDA is available and using device: '+str(device))
    else:
        device = torch.device('cpu')
        print('CUDA not available, using device: '+str(device))
    ### Configure sentence transformers for training and train on the provided dataset
    # Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
    query_word_embedding_model = models.Transformer(model_name)

    # Apply mean pooling to get one fixed-size sentence vector
    query_pooling_model = models.Pooling(
        query_word_embedding_model.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False)

    query_dense_model = models.Dense(in_features=query_pooling_model.get_sentence_embedding_dimension(),
                                     out_features=out_features,
                                     activation_function=nn.Sigmoid())
    psg_word_embedding_model = models.Transformer(model_name)

    # Apply mean pooling to get one fixed-size sentence vector
    psg_pooling_model = models.Pooling(
        psg_word_embedding_model.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False)

    psg_dense_model = models.Dense(in_features=psg_pooling_model.get_sentence_embedding_dimension(),
                                     out_features=out_features,
                                     activation_function=nn.Tanh())

    query_model = CustomSentenceTransformer(modules=[query_word_embedding_model, query_pooling_model,
                                                     query_dense_model])
    psg_model = SentenceTransformer(modules=[psg_word_embedding_model, psg_pooling_model, psg_dense_model])

    model = QuerySpecificClusterModel(query_transformer=query_model, psg_transformer=psg_model, device=device)

    train_dataloader = DataLoader(train_cluster_data, shuffle=True, batch_size=1)
    evaluator = QueryClusterEvaluator.from_input_examples(val_cluster_data, use_model_device)
    test_evaluator = QueryClusterEvaluator.from_input_examples(test_cluster_data, use_model_device)

    warmup_steps = int(len(train_dataloader) * num_epochs * warmup_frac)  # warmup_frac of the total train steps

    print("Untrained performance")
    model.to(device)
    evaluator(model)

    train_dataloader.collate_fn = model.query_batch_collate_fn

    # Train the model
    best_score = -9999999
    if steps_per_epoch is None or steps_per_epoch == 0:
        steps_per_epoch = len(train_dataloader)
    num_train_steps = int(steps_per_epoch * num_epochs)
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': weight_decay},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    data_iter = iter(train_dataloader)
    optimizer = optimizer_class(optimizer_grouped_parameters, **optimizer_params)
    scheduler_obj = model._get_scheduler(optimizer, scheduler=scheduler, warmup_steps=warmup_steps,
                                        t_total=num_train_steps)
    config = {'epochs': num_epochs, 'steps_per_epoch': steps_per_epoch}
    global_step = 0
    loss_model = BBClusterLossModel(model, device, lambda_val, reg)
    for epoch in trange(config.get('epochs'), desc="Epoch", disable=not show_progress_bar):
        training_steps = 0
        running_loss_0 = 0.0
        model.zero_grad()
        model.train()
        if not train_psg_model:
            for m in model.psg_model.modules():
                m.training = False
        for _ in trange(config.get('steps_per_epoch'), desc="Iteration", smoothing=0.05, disable=not show_progress_bar):
            try:
                data = next(data_iter)
            except StopIteration:
                data_iter = iter(train_dataloader)
                data = next(data_iter)
            query_feature, psg_features, labels = data
            if max_train_size > 0 and labels.shape[1] > max_train_size:
                print('skipping instance with '+str(labels.shape[1])+' passages')
                continue
            loss_val = loss_model(query_feature, psg_features, labels)
            running_loss_0 += loss_val.item()
            loss_val.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
            optimizer.zero_grad()
            scheduler_obj.step()
            training_steps += 1
            global_step += 1

            if eval_steps > 0 and training_steps % eval_steps == 0:
                tensorboard_writer.add_scalar('training_loss', running_loss_0 / eval_steps, global_step)
                # logger.report_scalar('Loss', 'training_loss', iteration=global_step, value=running_loss_0/evaluation_steps)
                running_loss_0 = 0.0
                # self._eval_during_training(evaluator, output_path, save_best_model, epoch, training_steps, callback)
                if evaluator is not None:
                    score = evaluator(model, output_path=output_path, epoch=epoch, steps=training_steps)
                    tensorboard_writer.add_scalar('val_ARI', score, global_step)
                    # logger.report_scalar('Training progress', 'val_ARI', iteration=global_step, value=score)
                    if score > best_score:
                        best_score = score
                        if save_best_model:
                            print('Saving model at: ' + output_path)
                            model.save(output_path)
                model.zero_grad()
                model.train()
                if not train_psg_model:
                    for m in model.psg_model.modules():
                        m.training = False
        if evaluator is not None:
            score = evaluator(model, output_path=output_path, epoch=epoch, steps=training_steps)
            tensorboard_writer.add_scalar('val_ARI', score, global_step)
            # logger.report_scalar('Training progress', 'val_ARI', iteration=global_step, value=score)
            if score > best_score:
                best_score = score
                if save_best_model:
                    model.save(output_path)
        if test_evaluator is not None:
            best_model = QuerySpecificClusterModel(output_path)
            if torch.cuda.is_available():
                model.to(torch.device('cpu'))
                best_model.to(device)
                test_ari = test_evaluator(best_model)
                best_model.to(torch.device('cpu'))
                model.to(device)
            else:
                test_ari = test_evaluator(best_model)
            tensorboard_writer.add_scalar('test_ARI', test_ari, global_step)
            # logger.report_scalar('Training progress', 'test_ARI', iteration=global_step, value=test_ari)
    if evaluator is None and output_path is not None:  # No evaluator, but output path: save final model version
        model.save(output_path)
Example #9
def _run_fixed_lambda_bbcluster(train_batch_size,
                                num_epochs,
                                lambda_val,
                                reg,
                                use_model_device,
                                eval_steps,
                                out_path,
                                warmup_frac=0.1,
                                model_name='distilbert-base-uncased',
                                out_features=256):
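    """Run a single hyperparameter trial: train a CustomSentenceTransformer with
    BBClusterLossModel and return the validation score of the best saved model.
    Assumes train_cluster_data and val_cluster_data are available in the enclosing
    (module) scope, since they are not passed as arguments."""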
    exp_task = Task.create(project_name='Optuna Hyperparam optim',
                           task_name='trial')
    config_dict = {'lambda_val': lambda_val, 'reg': reg}
    config_dict = exp_task.connect(config_dict)
    if torch.cuda.is_available():
        device = torch.device('cuda')
        print('CUDA is available and using device: ' + str(device))
    else:
        device = torch.device('cpu')
        print('CUDA not available, using device: ' + str(device))
    word_embedding_model = models.Transformer(model_name)

    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False)

    doc_dense_model = models.Dense(
        in_features=pooling_model.get_sentence_embedding_dimension(),
        out_features=out_features,
        activation_function=nn.Tanh())

    model = CustomSentenceTransformer(
        modules=[word_embedding_model, pooling_model, doc_dense_model])
    loss_model = BBClusterLossModel(model=model,
                                    device=device,
                                    lambda_val=config_dict.get(
                                        'lambda_val', lambda_val),
                                    reg_const=config_dict.get('reg', reg))

    train_dataloader = DataLoader(train_cluster_data,
                                  shuffle=True,
                                  batch_size=train_batch_size)
    evaluator = ClusterEvaluator.from_input_examples(val_cluster_data,
                                                     use_model_device)

    warmup_steps = int(len(train_dataloader) * num_epochs *
                       warmup_frac)  # warmup_frac of the total train steps

    model.to(device)

    # Train the model
    model.fit(train_objectives=[(train_dataloader, loss_model)],
              epochs=num_epochs,
              warmup_steps=warmup_steps,
              evaluator=evaluator,
              evaluation_steps=eval_steps,
              output_path=out_path)
    best_model = CustomSentenceTransformer(out_path)
    return evaluator(best_model)