import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader
from tqdm import trange

import GPUtil
from clearml import Task  # experiment tracking (ClearML / Trains API)
from sentence_transformers import SentenceTransformer, models

# Project-specific classes and helpers (CustomSentenceTransformer, BBClusterLossModel,
# BBSpectralClusterLossModel, BinaryLoss, ClusterEvaluator, InputTRECCARExample,
# get_trec_dat, get_paratext_dict) are assumed to be importable from elsewhere in this repository.


def run_fixed_lambda_bbcluster(train_cluster_data, val_cluster_data, test_cluster_data, output_path, train_batch_size,
                               eval_steps, num_epochs, warmup_frac, lambda_val, reg, beta, loss_name, use_model_device,
                               model_name='distilbert-base-uncased', out_features=256):
    """Train a clustering model with a fixed lambda using the BB cluster (or BB spectral cluster) loss."""
    task = Task.init(project_name='BB Clustering', task_name='bbclustering_fixed_lambda')
    config_dict = {'lambda_val': lambda_val, 'reg': reg}
    config_dict = task.connect(config_dict)
    if torch.cuda.is_available():
        device = torch.device('cuda')
        print('CUDA is available and using device: ' + str(device))
    else:
        device = torch.device('cpu')
        print('CUDA not available, using device: ' + str(device))

    ### Configure sentence transformers for training and train on the provided dataset
    # Use a Huggingface/transformers model (BERT, RoBERTa, XLNet, XLM-R, ...) to map tokens to embeddings
    word_embedding_model = models.Transformer(model_name)
    # Apply mean pooling to get one fixed-sized sentence vector
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                   pooling_mode_mean_tokens=True,
                                   pooling_mode_cls_token=False,
                                   pooling_mode_max_tokens=False)
    # Project the pooled embedding down to out_features dimensions
    doc_dense_model = models.Dense(in_features=pooling_model.get_sentence_embedding_dimension(),
                                   out_features=out_features,
                                   activation_function=nn.Tanh())
    model = CustomSentenceTransformer(modules=[word_embedding_model, pooling_model, doc_dense_model])
    GPUtil.showUtilization()

    # Choose between the spectral-relaxation loss and the standard BB cluster loss
    if loss_name == 'bbspec':
        loss_model = BBSpectralClusterLossModel(model=model, device=device,
                                                lambda_val=config_dict.get('lambda_val', lambda_val),
                                                reg_const=config_dict.get('reg', reg), beta=beta)
    else:
        loss_model = BBClusterLossModel(model=model, device=device,
                                        lambda_val=config_dict.get('lambda_val', lambda_val),
                                        reg_const=config_dict.get('reg', reg))

    train_dataloader = DataLoader(train_cluster_data, shuffle=True, batch_size=train_batch_size)
    GPUtil.showUtilization()
    evaluator = ClusterEvaluator.from_input_examples(val_cluster_data, use_model_device)
    test_evaluator = ClusterEvaluator.from_input_examples(test_cluster_data, use_model_device)
    GPUtil.showUtilization()
    # Warm up the learning rate for warmup_frac of the total training steps
    warmup_steps = int(len(train_dataloader) * num_epochs * warmup_frac)

    print("Raw BERT embedding performance")
    model.to(device)
    evaluator(model, output_path)
    GPUtil.showUtilization()

    # Train the model
    model.fit(train_objectives=[(train_dataloader, loss_model)],
              evaluator=evaluator,
              test_evaluator=test_evaluator,
              epochs=num_epochs,
              evaluation_steps=eval_steps,
              warmup_steps=warmup_steps,
              output_path=output_path)
def evaluate_treccar(model_path, test_art_qrels, test_top_qrels, test_hier_qrels, test_paratext, level):
    """Evaluate a trained model on TREC CAR clustering at the top-level ('t') or hierarchical ('h') section level."""
    test_page_paras, test_rev_para_top, test_rev_para_hier = get_trec_dat(test_art_qrels, test_top_qrels,
                                                                          test_hier_qrels)
    test_len_paras = np.array([len(test_page_paras[page]) for page in test_page_paras.keys()])
    print('test mean paras: %.2f, std: %.2f, max paras: %.2f' % (np.mean(test_len_paras), np.std(test_len_paras),
                                                                 np.max(test_len_paras)))
    test_ptext_dict = get_paratext_dict(test_paratext)
    test_top_cluster_data = []
    test_hier_cluster_data = []
    max_num_doc_test = max([len(test_page_paras[p]) for p in test_page_paras.keys()])
    test_pages = list(test_page_paras.keys())
    for i in trange(len(test_pages)):
        page = test_pages[i]
        paras = test_page_paras[page]
        paratexts = [test_ptext_dict[p] for p in paras]
        # Map each paragraph to the index of its top-level / hierarchical section to obtain cluster labels
        top_sections = list(set([test_rev_para_top[p] for p in paras]))
        top_labels = [top_sections.index(test_rev_para_top[p]) for p in paras]
        hier_sections = list(set([test_rev_para_hier[p] for p in paras]))
        hier_labels = [hier_sections.index(test_rev_para_hier[p]) for p in paras]
        query_text = ' '.join(page.split('enwiki:')[1].split('%20'))
        # Pad every page to the same number of paragraphs; padded slots get a dummy id, empty text and label -1
        n = len(paras)
        paras = paras + ['dummy'] * (max_num_doc_test - n)
        paratexts = paratexts + [''] * (max_num_doc_test - n)
        top_labels = top_labels + [-1] * (max_num_doc_test - n)
        hier_labels = hier_labels + [-1] * (max_num_doc_test - n)
        test_top_cluster_data.append(InputTRECCARExample(qid=page, q_context=query_text, pids=paras, texts=paratexts,
                                                         label=np.array(top_labels)))
        test_hier_cluster_data.append(InputTRECCARExample(qid=page, q_context=query_text, pids=paras, texts=paratexts,
                                                          label=np.array(hier_labels)))
    print("Top-level datasets")
    print("Test instances: %5d" % len(test_top_cluster_data))

    model = SentenceTransformer(model_path)
    if level == 'h':
        print('Evaluating hierarchical clusters')
        test_evaluator = ClusterEvaluator.from_input_examples(test_hier_cluster_data)
        model.evaluate(test_evaluator)
    else:
        print('Evaluating top-level clusters')
        test_evaluator = ClusterEvaluator.from_input_examples(test_top_cluster_data)
        model.evaluate(test_evaluator)
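# A minimal usage sketch for evaluate_treccar. The paths below are illustrative placeholders, not files
# shipped with this repository; point them at a local TREC CAR test split (article, top-level and
# hierarchical qrels plus a paragraph-text file) and a trained model directory.
#
#     evaluate_treccar(model_path='output/bbcluster_model',
#                      test_art_qrels='test.pages.cbor-article.qrels',
#                      test_top_qrels='test.pages.cbor-toplevel.qrels',
#                      test_hier_qrels='test.pages.cbor-hierarchical.qrels',
#                      test_paratext='test_paratext.tsv',
#                      level='t')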
def run_binary_model(train_pairs, val_cluster_data, test_cluster_data, output_path, train_batch_size, eval_steps,
                     num_epochs, warmup_frac, use_model_device, model_name='distilbert-base-uncased',
                     out_features=256):
    """Train a baseline model on binary same-cluster/different-cluster pairs."""
    task = Task.init(project_name='BB Clustering', task_name='bbclustering_pairs')
    if torch.cuda.is_available():
        device = torch.device('cuda')
        print('CUDA is available and using device: ' + str(device))
    else:
        device = torch.device('cpu')
        print('CUDA not available, using device: ' + str(device))

    ### Configure sentence transformers for training and train on the provided dataset
    # Use a Huggingface/transformers model (BERT, RoBERTa, XLNet, XLM-R, ...) to map tokens to embeddings
    word_embedding_model = models.Transformer(model_name)
    # Apply mean pooling to get one fixed-sized sentence vector
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                   pooling_mode_mean_tokens=True,
                                   pooling_mode_cls_token=False,
                                   pooling_mode_max_tokens=False)
    doc_dense_model = models.Dense(in_features=pooling_model.get_sentence_embedding_dimension(),
                                   out_features=out_features,
                                   activation_function=nn.Tanh())
    model = CustomSentenceTransformer(modules=[word_embedding_model, pooling_model, doc_dense_model])

    train_dataloader = DataLoader(train_pairs, shuffle=True, batch_size=train_batch_size)
    train_loss = BinaryLoss(model=model)
    evaluator = ClusterEvaluator.from_input_examples(val_cluster_data, use_model_device)
    test_evaluator = ClusterEvaluator.from_input_examples(test_cluster_data, use_model_device)
    # Warm up the learning rate for warmup_frac of the total training steps
    warmup_steps = int(len(train_dataloader) * num_epochs * warmup_frac)

    print("Raw BERT embedding performance")
    model.to(device)
    evaluator(model, output_path)

    # Train the model
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=evaluator,
              test_evaluator=test_evaluator,
              epochs=num_epochs,
              evaluation_steps=eval_steps,
              warmup_steps=warmup_steps,
              output_path=output_path)
def evaluate_ng20(model_path, test_cluster_data, gpu_eval):
    """Evaluate a trained model on the 20 Newsgroups test clusters."""
    if torch.cuda.is_available():
        print('CUDA is available')
        device = torch.device('cuda')
    else:
        print('Using CPU')
        device = torch.device('cpu')
    model = SentenceTransformer(model_path)
    model.to(device)
    test_evaluator = ClusterEvaluator.from_input_examples(test_cluster_data, gpu_eval)
    model.evaluate(test_evaluator)
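# A minimal usage sketch for evaluate_ng20. 'output/ng20_model' is an illustrative checkpoint path, and
# test_cluster_data is assumed to be the list of 20 Newsgroups test cluster examples built elsewhere in
# this project.
#
#     evaluate_ng20(model_path='output/ng20_model', test_cluster_data=test_cluster_data, gpu_eval=True)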
def _run_fixed_lambda_bbcluster(train_batch_size, num_epochs, lambda_val, reg, use_model_device, eval_steps, out_path,
                                warmup_frac=0.1, model_name='distilbert-base-uncased', out_features=256):
    """Run a single fixed-lambda training trial for hyperparameter search; returns the best model's validation score.

    Note: train_cluster_data and val_cluster_data are expected to be available in the enclosing scope
    (e.g. loaded at module level by the hyperparameter-search driver).
    """
    exp_task = Task.create(project_name='Optuna Hyperparam optim', task_name='trial')
    config_dict = {'lambda_val': lambda_val, 'reg': reg}
    config_dict = exp_task.connect(config_dict)
    if torch.cuda.is_available():
        device = torch.device('cuda')
        print('CUDA is available and using device: ' + str(device))
    else:
        device = torch.device('cpu')
        print('CUDA not available, using device: ' + str(device))
    word_embedding_model = models.Transformer(model_name)
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                   pooling_mode_mean_tokens=True,
                                   pooling_mode_cls_token=False,
                                   pooling_mode_max_tokens=False)
    doc_dense_model = models.Dense(in_features=pooling_model.get_sentence_embedding_dimension(),
                                   out_features=out_features,
                                   activation_function=nn.Tanh())
    model = CustomSentenceTransformer(modules=[word_embedding_model, pooling_model, doc_dense_model])
    loss_model = BBClusterLossModel(model=model, device=device,
                                    lambda_val=config_dict.get('lambda_val', lambda_val),
                                    reg_const=config_dict.get('reg', reg))
    train_dataloader = DataLoader(train_cluster_data, shuffle=True, batch_size=train_batch_size)
    evaluator = ClusterEvaluator.from_input_examples(val_cluster_data, use_model_device)
    # Warm up the learning rate for warmup_frac of the total training steps (10% by default)
    warmup_steps = int(len(train_dataloader) * num_epochs * warmup_frac)
    model.to(device)

    # Train the model
    model.fit(train_objectives=[(train_dataloader, loss_model)],
              epochs=num_epochs,
              warmup_steps=warmup_steps,
              evaluator=evaluator,
              evaluation_steps=eval_steps,
              output_path=out_path)
    # Reload the best checkpoint saved to out_path and return its validation score
    best_model = CustomSentenceTransformer(out_path)
    return evaluator(best_model)
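# A minimal sketch (not part of the original module) of plugging _run_fixed_lambda_bbcluster into an Optuna
# objective. The search ranges, batch size, epoch count, device flag and output path below are illustrative
# assumptions, not values from this repository.
def _optuna_objective_sketch(trial):
    lambda_val = trial.suggest_float('lambda_val', 1.0, 200.0)
    reg = trial.suggest_float('reg', 0.0, 10.0)
    return _run_fixed_lambda_bbcluster(train_batch_size=32, num_epochs=1, lambda_val=lambda_val, reg=reg,
                                       use_model_device=True, eval_steps=100,
                                       out_path='output/optuna_trial_%d' % trial.number)

# Example driver (requires `import optuna`):
#     study = optuna.create_study(direction='maximize')
#     study.optimize(_optuna_objective_sketch, n_trials=20)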