def _process_biencoder_dataloader(samples, tokenizer, biencoder_params):
    """Tokenize mention samples and wrap the result in a sequential DataLoader.

    Args:
        samples: iterable of raw mention examples understood by
            ``process_mention_data``.
        tokenizer: tokenizer used to encode contexts and candidates.
        biencoder_params: dict providing ``max_context_length``,
            ``max_cand_length``, ``debug`` and ``eval_batch_size``.

    Returns:
        A ``DataLoader`` over the tokenized tensors, iterated in order
        (SequentialSampler) with the configured eval batch size.
    """
    # Only the tensorized data is needed here; the raw processed records
    # (first element of the tuple) are discarded.
    _, dataset = process_mention_data(
        samples,
        tokenizer,
        biencoder_params["max_context_length"],
        biencoder_params["max_cand_length"],
        silent=True,
        logger=None,
        debug=biencoder_params["debug"],
    )
    return DataLoader(
        dataset,
        sampler=SequentialSampler(dataset),
        batch_size=biencoder_params["eval_batch_size"],
    )
def main(params):
    """Train the biencoder ranker.

    Reads train/valid mention data, runs the training loop with gradient
    accumulation, evaluates on the dev set at a fixed interval and after
    every epoch, saves a checkpoint per epoch, and tracks the epoch with
    the best ``normalized_accuracy``.

    Args:
        params: dict of hyper-parameters and paths (output_path, data_path,
            batch sizes, gradient_accumulation_steps, seed, epochs, ...).
            ``params["path_to_model"]`` is mutated to point at the best epoch.
    """
    model_output_path = params["output_path"]
    if not os.path.exists(model_output_path):
        os.makedirs(model_output_path)
    logger = utils.get_logger(params["output_path"])

    # Init model
    reranker = BiEncoderRanker(params)
    tokenizer = reranker.tokenizer
    model = reranker.model

    # utils.save_model(model, tokenizer, model_output_path)

    device = reranker.device
    n_gpu = reranker.n_gpu

    if params["gradient_accumulation_steps"] < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(params["gradient_accumulation_steps"]))

    # An effective batch size of `x`, when we are accumulating the gradient
    # accross `y` batches will be achieved by having a batch size of `z = x / y`
    # args.gradient_accumulation_steps = args.gradient_accumulation_steps // n_gpu
    params["train_batch_size"] = (params["train_batch_size"] //
                                  params["gradient_accumulation_steps"])
    train_batch_size = params["train_batch_size"]
    eval_batch_size = params["eval_batch_size"]
    grad_acc_steps = params["gradient_accumulation_steps"]

    # Fix the random seeds for reproducibility (python, numpy, torch, cuda)
    seed = params["seed"]
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if reranker.n_gpu > 0:
        torch.cuda.manual_seed_all(seed)

    # Load train data
    train_samples = utils.read_dataset("train", params["data_path"])
    logger.info("Read %d train samples." % len(train_samples))

    train_data, train_tensor_data = data.process_mention_data(
        train_samples,
        tokenizer,
        params["max_context_length"],
        params["max_cand_length"],
        context_key=params["context_key"],
        silent=params["silent"],
        logger=logger,
        debug=params["debug"],
    )
    if params["shuffle"]:
        train_sampler = RandomSampler(train_tensor_data)
    else:
        train_sampler = SequentialSampler(train_tensor_data)

    train_dataloader = DataLoader(train_tensor_data,
                                  sampler=train_sampler,
                                  batch_size=train_batch_size)

    # Load eval data
    # TODO: reduce duplicated code here
    valid_samples = utils.read_dataset("valid", params["data_path"])
    logger.info("Read %d valid samples." % len(valid_samples))

    valid_data, valid_tensor_data = data.process_mention_data(
        valid_samples,
        tokenizer,
        params["max_context_length"],
        params["max_cand_length"],
        context_key=params["context_key"],
        silent=params["silent"],
        logger=logger,
        debug=params["debug"],
    )
    valid_sampler = SequentialSampler(valid_tensor_data)
    valid_dataloader = DataLoader(valid_tensor_data,
                                  sampler=valid_sampler,
                                  batch_size=eval_batch_size)

    # evaluate before training (baseline; the result is intentionally unused)
    results = evaluate(
        reranker,
        valid_dataloader,
        params,
        device=device,
        logger=logger,
    )

    number_of_samples_per_dataset = {}

    time_start = time.time()

    utils.write_to_file(os.path.join(model_output_path, "training_params.txt"),
                        str(params))

    logger.info("Starting training")
    logger.info("device: {} n_gpu: {}, distributed training: {}".format(
        device, n_gpu, False))

    optimizer = get_optimizer(model, params)
    scheduler = get_scheduler(params, optimizer, len(train_tensor_data), logger)

    model.train()

    best_epoch_idx = -1
    best_score = -1

    num_train_epochs = params["num_train_epochs"]
    for epoch_idx in trange(int(num_train_epochs), desc="Epoch"):
        tr_loss = 0
        results = None

        if params["silent"]:
            iter_ = train_dataloader
        else:
            iter_ = tqdm(train_dataloader, desc="Batch")

        for step, batch in enumerate(iter_):
            batch = tuple(t.to(device) for t in batch)
            # zeshel batches carry an extra (unused here) field
            if params["zeshel"]:
                context_input, candidate_input, _, _ = batch
            else:
                context_input, candidate_input, _ = batch
            loss, _ = reranker(context_input, candidate_input)

            # if n_gpu > 1:
            #     loss = loss.mean() # mean() to average on multi-gpu.

            # scale the loss so the accumulated gradient matches the
            # effective (pre-division) batch size
            if grad_acc_steps > 1:
                loss = loss / grad_acc_steps

            tr_loss += loss.item()

            # periodic logging of the running average loss
            if (step + 1) % (params["print_interval"] * grad_acc_steps) == 0:
                logger.info("Step {} - epoch {} average loss: {}\n".format(
                    step,
                    epoch_idx,
                    tr_loss / (params["print_interval"] * grad_acc_steps),
                ))
                tr_loss = 0

            loss.backward()

            # optimizer/scheduler step only once per accumulation window
            if (step + 1) % grad_acc_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               params["max_grad_norm"])
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()

            # periodic mid-epoch evaluation; evaluate() may flip the model
            # to eval mode, so switch back to train() afterwards
            if (step + 1) % (params["eval_interval"] * grad_acc_steps) == 0:
                logger.info("Evaluation on the development dataset")
                evaluate(
                    reranker,
                    valid_dataloader,
                    params,
                    device=device,
                    logger=logger,
                )
                model.train()
                logger.info("\n")

        logger.info("***** Saving fine - tuned model *****")
        epoch_output_folder_path = os.path.join(model_output_path,
                                                "epoch_{}".format(epoch_idx))
        utils.save_model(model, tokenizer, epoch_output_folder_path)

        output_eval_file = os.path.join(epoch_output_folder_path,
                                        "eval_results.txt")
        results = evaluate(
            reranker,
            valid_dataloader,
            params,
            device=device,
            logger=logger,
        )

        # keep whichever of (previous best, this epoch) scores higher;
        # argmax over the two-element lists selects score and epoch together
        ls = [best_score, results["normalized_accuracy"]]
        li = [best_epoch_idx, epoch_idx]

        best_score = ls[np.argmax(ls)]
        best_epoch_idx = li[np.argmax(ls)]
        logger.info("\n")

    execution_time = (time.time() - time_start) / 60
    utils.write_to_file(
        os.path.join(model_output_path, "training_time.txt"),
        "The training took {} minutes\n".format(execution_time),
    )
    logger.info("The training took {} minutes\n".format(execution_time))

    # save the best model in the parent_dir
    logger.info("Best performance in epoch: {}".format(best_epoch_idx))
    params["path_to_model"] = os.path.join(model_output_path,
                                           "epoch_{}".format(best_epoch_idx))
    # NOTE(review): this saves the in-memory (last-epoch) model to the parent
    # dir even though path_to_model points at the best epoch's folder —
    # confirm whether the best checkpoint should be reloaded first.
    utils.save_model(reranker.model, tokenizer, model_output_path)

    if params["evaluate"]:
        params["path_to_model"] = model_output_path
        results = evaluate(
            reranker,
            valid_dataloader,
            params,
            device=device,
            logger=logger,
        )
def main(params):
    """Run top-k candidate retrieval with a trained biencoder.

    Loads (or generates) the candidate pool and its encodings, encodes the
    test mentions, retrieves the top-k nearest candidates per mention, and
    optionally saves the predictions to ``output_path``.

    Args:
        params: dict of paths and options (output_path, data_path, mode,
            encode_batch_size, top_k, optional cand_encode_path /
            cand_pool_path, ...).
    """
    output_path = params["output_path"]
    if not os.path.exists(output_path):
        os.makedirs(output_path)
    logger = utils.get_logger(params["output_path"])

    # Init model
    reranker = BiEncoderRanker(params)
    tokenizer = reranker.tokenizer
    device = reranker.device

    cand_encode_path = params.get("cand_encode_path", None)

    # candidate encoding is not pre-computed.
    # load/generate candidate pool to compute candidate encoding.
    cand_pool_path = params.get("cand_pool_path", None)
    candidate_pool = load_or_generate_candidate_pool(
        tokenizer,
        params,
        logger,
        cand_pool_path,
    )

    candidate_encoding = None
    if cand_encode_path is not None:
        # try to load candidate encoding from path
        # if success, avoid computing candidate encoding
        try:
            logger.info("Loading pre-generated candidate encode path.")
            candidate_encoding = torch.load(cand_encode_path)
        except Exception:
            # narrow from bare `except:` so Ctrl-C / SystemExit still
            # propagate; any load failure falls through to re-encoding
            logger.info("Loading failed. Generating candidate encoding.")

    if candidate_encoding is None:
        candidate_encoding = encode_candidate_zeshel(
            reranker,
            candidate_pool,
            params["encode_batch_size"],
            silent=params["silent"],
            logger=logger,
        )
        if cand_encode_path is not None:
            # Save candidate encoding to avoid re-compute.
            # BUGFIX: torch.save takes (obj, path) — the original call had
            # the arguments reversed and never persisted the encoding.
            logger.info("Saving candidate encoding to file " + cand_encode_path)
            torch.save(candidate_encoding, cand_encode_path)

    test_samples = utils.read_dataset(params["mode"], params["data_path"])
    logger.info("Read %d test samples." % len(test_samples))

    test_data, test_tensor_data = data.process_mention_data(
        test_samples,
        tokenizer,
        params["max_context_length"],
        params["max_cand_length"],
        context_key=params['context_key'],
        silent=params["silent"],
        logger=logger,
        debug=params["debug"],
    )
    test_sampler = SequentialSampler(test_tensor_data)
    test_dataloader = DataLoader(test_tensor_data,
                                 sampler=test_sampler,
                                 batch_size=params["encode_batch_size"])

    save_results = params.get("save_topk_result")
    new_data = nnquery.get_topk_predictions(
        reranker,
        test_dataloader,
        candidate_pool,
        candidate_encoding,
        params["silent"],
        logger,
        params["top_k"],
        params.get("zeshel", None),
        save_results,
    )

    if save_results:
        save_data_path = os.path.join(
            params['output_path'],
            'candidates_%s_top%d.t7' % (params['mode'], params['top_k']))
        torch.save(new_data, save_data_path)
def main(params):
    """Build and save a joint-candidate dataset from TF-IDF retrieved candidates.

    For every test mention this collects: its tokenized context, positive
    coreferent contexts (same doc, same gold CUI), all within-document
    contexts, the gold candidate, and the TF-IDF-retrieved candidates —
    each paired with stable integer uids — and saves everything as one
    ``.t7`` file in ``output_path``.

    Uid scheme (relied on throughout): candidates get uids ``0..num_cands-1``
    in ``cand_ids`` order; mentions get uids ``num_cands + i`` in
    ``test_samples`` order, so ``uid - num_cands`` indexes ``context_pool``.

    Args:
        params: dict of paths and options (output_path, data_path, mode,
            max_cand_length, max_context_length, context_key, ...).
    """
    output_path = params["output_path"]
    if not os.path.exists(output_path):
        os.makedirs(output_path)
    logger = utils.get_logger(params["output_path"])

    # Init model (only the tokenizer is used below)
    reranker = BiEncoderRanker(params)
    tokenizer = reranker.tokenizer

    # load entities
    entity_dict, entity_json = load_entity_dict(logger, params)

    # load tfidf candidates
    tfidf_cand_dict = read_tfidf_cands(params["data_path"], params["mode"])

    # load mentions
    test_samples = utils.read_dataset(params["mode"], params["data_path"])
    logger.info("Read %d test samples." % len(test_samples))

    # get only the cands we need to tokenize: every TF-IDF retrieved
    # candidate plus every gold label, de-duplicated
    cand_ids = [c for l in tfidf_cand_dict.values() for c in l]
    cand_ids.extend([x["label_umls_cuid"] for x in test_samples])
    cand_ids = list(set(cand_ids))
    num_cands = len(cand_ids)

    # tokenize the candidates; cand_uid_map assigns uid == row index
    # into candidate_pool
    cand_uid_map = {c : i for i, c in enumerate(cand_ids)}
    candidate_pool = get_candidate_pool_tensor(
        [entity_dict[c] for c in cand_ids],
        tokenizer,
        params["max_cand_length"],
        logger
    )

    # create mention maps; mention uids are offset by num_cands so they
    # share one uid space with candidates
    ctxt_uid_map = {x["mm_mention_id"] : i + num_cands
                        for i, x in enumerate(test_samples)}
    ctxt_cand_map = {x["mm_mention_id"] : x["label_umls_cuid"]
                        for x in test_samples}
    ctxt_doc_map = {x["mm_mention_id"] : x["context_doc_id"]
                        for x in test_samples}
    # invert mention -> doc into doc -> [mentions]
    doc_ctxt_map = defaultdict(list)
    for c, d in ctxt_doc_map.items():
        doc_ctxt_map[d].append(c)

    # create text maps for investigative evaluation: uid -> raw json
    # (entity json for candidate uids, mention sample for mention uids)
    uid_to_json = { uid : entity_json[cuid]
                        for cuid, uid in cand_uid_map.items() }
    uid_to_json.update({i+num_cands : x for i, x in enumerate(test_samples)})

    # tokenize the contexts
    test_data, test_tensor_data = data.process_mention_data(
        test_samples,
        tokenizer,
        params["max_context_length"],
        params["max_cand_length"],
        context_key=params['context_key'],
        silent=params["silent"],
        logger=logger,
        debug=params["debug"],
    )
    context_pool = test_data["context_vecs"]

    # create output variables
    contexts = context_pool
    context_uids = torch.LongTensor(list(ctxt_uid_map.values()))

    # positive coreferent contexts: other mentions in the same document
    # linked to the same gold candidate (the asserts check the uid scheme;
    # all four loops below rely on dict insertion order matching
    # test_samples order)
    pos_coref_ctxts = []
    pos_coref_ctxt_uids = []
    for i, c in enumerate(ctxt_doc_map.keys()):
        assert ctxt_uid_map[c] == i + num_cands
        doc = ctxt_doc_map[c]
        coref_ctxts = [x for x in doc_ctxt_map[doc]
                           if x != c and ctxt_cand_map[x] == ctxt_cand_map[c]]
        coref_ctxt_uids = [ctxt_uid_map[x] for x in coref_ctxts]
        # uid -> row index in context_pool
        coref_ctxt_idxs = [x - num_cands for x in coref_ctxt_uids]
        pos_coref_ctxts.append(context_pool[coref_ctxt_idxs])
        pos_coref_ctxt_uids.append(torch.LongTensor(coref_ctxt_uids))

    # knn contexts: ALL other mentions in the same document
    # (regardless of gold label)
    knn_ctxts = []
    knn_ctxt_uids = []
    for i, c in enumerate(ctxt_doc_map.keys()):
        assert ctxt_uid_map[c] == i + num_cands
        doc = ctxt_doc_map[c]
        wdoc_ctxts = [x for x in doc_ctxt_map[doc] if x != c]
        wdoc_ctxt_uids = [ctxt_uid_map[x] for x in wdoc_ctxts]
        wdoc_ctxt_idxs = [x - num_cands for x in wdoc_ctxt_uids]
        knn_ctxts.append(context_pool[wdoc_ctxt_idxs])
        knn_ctxt_uids.append(torch.LongTensor(wdoc_ctxt_uids))

    # gold candidate per mention
    pos_cands = []
    pos_cand_uids = []
    for i, c in enumerate(ctxt_cand_map.keys()):
        assert ctxt_uid_map[c] == i + num_cands
        pos_cands.append(candidate_pool[cand_uid_map[ctxt_cand_map[c]]])
        pos_cand_uids.append(torch.LongTensor([cand_uid_map[ctxt_cand_map[c]]]))

    # TF-IDF retrieved candidates per mention (may be empty for a mention)
    knn_cands = []
    knn_cand_uids = []
    for i, c in enumerate(ctxt_cand_map.keys()):
        assert ctxt_uid_map[c] == i + num_cands
        tfidf_cands = tfidf_cand_dict.get(c, [])
        tfidf_cand_uids = [cand_uid_map[x] for x in tfidf_cands]
        knn_cands.append(candidate_pool[tfidf_cand_uids])
        knn_cand_uids.append(torch.LongTensor(tfidf_cand_uids))

    # bundle everything into one serializable dict
    tfidf_data = {
        "contexts" : contexts,
        "context_uids": context_uids,
        "pos_coref_ctxts": pos_coref_ctxts,
        "pos_coref_ctxt_uids": pos_coref_ctxt_uids,
        "knn_ctxts": knn_ctxts,
        "knn_ctxt_uids": knn_ctxt_uids,
        "pos_cands": pos_cands,
        "pos_cand_uids": pos_cand_uids,
        "knn_cands": knn_cands,
        "knn_cand_uids": knn_cand_uids,
        "uid_to_json": uid_to_json,
    }

    save_data_path = os.path.join(
        params['output_path'],
        'joint_candidates_%s_tfidf.t7' % (params['mode'])
    )
    torch.save(tfidf_data, save_data_path)