def main(params):
    # create output dir
    now = datetime.now()
    datetime_str = now.strftime("%Y-%m-%d_%H-%M-%S")
    model_output_path = os.path.join(params["output_path"], datetime_str)
    if not os.path.exists(model_output_path):
        os.makedirs(model_output_path)

    # get logger
    logger = utils.get_logger(model_output_path)

    # copy blink source and create rerun script
    blink_copy_path = os.path.join(model_output_path, "blink")
    copy_directory("blink", blink_copy_path)
    cmd = sys.argv
    with open(os.path.join(model_output_path, "rerun.sh"), "w") as f:
        cmd.insert(0, "python")
        f.write(" ".join(cmd))

    # Init models: one cross-encoder for contexts (ctxt), one for candidates (cand)
    ctxt_reranker = CrossEncoderRanker(params)
    ctxt_model = ctxt_reranker.model
    tokenizer = ctxt_reranker.tokenizer

    params["pool_highlighted"] = False  # only `True` for ctxt
    cand_reranker = CrossEncoderRanker(params)
    cand_model = cand_reranker.model

    device = ctxt_reranker.device
    n_gpu = ctxt_reranker.n_gpu

    if params["gradient_accumulation_steps"] < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(params["gradient_accumulation_steps"]))

    # An effective batch size of `x`, when accumulating gradients across `y`
    # batches, is achieved with a per-step batch size of `z = x / y`
    # args.gradient_accumulation_steps = args.gradient_accumulation_steps // n_gpu
    params["train_batch_size"] = (params["train_batch_size"] //
                                  params["gradient_accumulation_steps"])
    train_batch_size = params["train_batch_size"]
    eval_batch_size = params["eval_batch_size"]

    # Fix the random seeds
    seed = params["seed"]
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if ctxt_reranker.n_gpu > 0:
        torch.cuda.manual_seed_all(seed)

    context_length = params["max_context_length"]

    # create train dataloaders
    fname = os.path.join(params["data_path"], "joint_train.t7")
    train_data = torch.load(fname)
    gold_coref_clusters = build_gold_coref_clusters(train_data)
    ctxt_train_dataloader = create_mst_dataloader(
        params,
        gold_coref_clusters,
        train_data["contexts"],
        train_data["pos_coref_ctxts"],
        train_data["pos_coref_ctxt_uids"],
        train_data["knn_ctxts"],
        train_data["knn_ctxt_uids"])
    cand_train_dataloader = create_dataloader(
        params,
        train_data["contexts"],
        train_data["pos_cands"],
        train_data["pos_cand_uids"],
        train_data["knn_cands"],
        train_data["knn_cand_uids"])

    # create validation dataloaders
    fname = os.path.join(params["data_path"], "joint_valid.t7")
    valid_data = torch.load(fname)
    ctxt_valid_dataloader = create_dataloader(
        params,
        valid_data["contexts"],
        valid_data["pos_coref_ctxts"],
        valid_data["pos_coref_ctxt_uids"],
        valid_data["knn_ctxts"],
        valid_data["knn_ctxt_uids"],
        evaluate=True)
    cand_valid_dataloader = create_dataloader(
        params,
        valid_data["contexts"],
        valid_data["pos_cands"],
        valid_data["pos_cand_uids"],
        valid_data["knn_cands"],
        valid_data["knn_cand_uids"],
        evaluate=True)

    # evaluate before training
    ctxt_results = evaluate(
        ctxt_reranker,
        ctxt_valid_dataloader,
        device=device,
        logger=logger,
        context_length=context_length,
        suffix="ctxt",
        silent=params["silent"],
    )
    cand_results = evaluate(
        cand_reranker,
        cand_valid_dataloader,
        device=device,
        logger=logger,
        context_length=context_length,
        suffix="cand",
        silent=params["silent"],
    )

    number_of_samples_per_dataset = {}
    time_start = time.time()
    utils.write_to_file(
        os.path.join(model_output_path, "training_params.txt"), str(params))

    logger.info("Starting training")
    logger.info("device: {} n_gpu: {}, distributed training: {}".format(
        device, n_gpu, False))

    ctxt_optimizer = get_optimizer(ctxt_model, params)
    ctxt_scheduler = get_scheduler(
        params, ctxt_optimizer,
        len(ctxt_train_dataloader) * train_batch_size, logger)
    cand_optimizer = get_optimizer(cand_model, params)
    cand_scheduler = get_scheduler(
        params, cand_optimizer,
        len(cand_train_dataloader) * train_batch_size, logger)

    ctxt_best_epoch_idx = -1
    ctxt_best_score = -1
    cand_best_epoch_idx = -1
    cand_best_score = -1
    num_train_epochs = params["num_train_epochs"]

    for epoch_idx in trange(int(num_train_epochs), desc="Epoch"):
        # train ctxt model
        train_one_epoch_mst(ctxt_train_dataloader,
                            ctxt_reranker,
                            ctxt_optimizer,
                            ctxt_scheduler,
                            logger,
                            params,
                            epoch_idx,
                            device=device,
                            suffix='ctxt')
        # train cand model
        train_one_epoch(cand_train_dataloader,
                        cand_reranker,
                        cand_optimizer,
                        cand_scheduler,
                        logger,
                        params,
                        epoch_idx,
                        device=device,
                        suffix='cand')

        logger.info("***** Saving fine-tuned models *****")
        ctxt_epoch_output_folder_path = os.path.join(
            model_output_path, "epoch_{}".format(epoch_idx), "ctxt")
        utils.save_model(ctxt_model, tokenizer, ctxt_epoch_output_folder_path)
        cand_epoch_output_folder_path = os.path.join(
            model_output_path, "epoch_{}".format(epoch_idx), "cand")
        utils.save_model(cand_model, tokenizer, cand_epoch_output_folder_path)

        ctxt_results = evaluate(
            ctxt_reranker,
            ctxt_valid_dataloader,
            device=device,
            logger=logger,
            context_length=context_length,
            suffix="ctxt",
            silent=params["silent"],
        )
        cand_results = evaluate(
            cand_reranker,
            cand_valid_dataloader,
            device=device,
            logger=logger,
            context_length=context_length,
            suffix="cand",
            silent=params["silent"],
        )

        # track the best epoch for each model
        ctxt_ls = [ctxt_best_score, ctxt_results["normalized_accuracy"]]
        ctxt_li = [ctxt_best_epoch_idx, epoch_idx]
        ctxt_best_score = ctxt_ls[np.argmax(ctxt_ls)]
        ctxt_best_epoch_idx = ctxt_li[np.argmax(ctxt_ls)]

        cand_ls = [cand_best_score, cand_results["normalized_accuracy"]]
        cand_li = [cand_best_epoch_idx, epoch_idx]
        cand_best_score = cand_ls[np.argmax(cand_ls)]
        cand_best_epoch_idx = cand_li[np.argmax(cand_ls)]

        logger.info("\n")

    execution_time = (time.time() - time_start) / 60
    utils.write_to_file(
        os.path.join(model_output_path, "training_time.txt"),
        "The training took {} minutes\n".format(execution_time),
    )
    logger.info("The training took {} minutes\n".format(execution_time))

    # save the best models
    logger.info(
        "Best ctxt performance in epoch: {}".format(ctxt_best_epoch_idx))
    best_ctxt_model_path = os.path.join(
        model_output_path, "epoch_{}".format(ctxt_best_epoch_idx), "ctxt")
    logger.info(
        "Best cand performance in epoch: {}".format(cand_best_epoch_idx))
    best_cand_model_path = os.path.join(
        model_output_path, "epoch_{}".format(cand_best_epoch_idx), "cand")
    copy_directory(best_ctxt_model_path,
                   os.path.join(model_output_path, "best_epoch", "ctxt"))
    copy_directory(best_cand_model_path,
                   os.path.join(model_output_path, "best_epoch", "cand"))
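

# A minimal, hypothetical sketch of how this joint ctxt/cand training entry point
# might be invoked. The keys below are the ones read directly by main() above;
# the values are illustrative assumptions, and CrossEncoderRanker plus the
# optimizer/scheduler/dataloader helpers will expect additional keys not shown here.
def _example_joint_training_invocation():
    example_params = {
        "output_path": "models/joint",   # a timestamped run directory is created inside
        "data_path": "data/joint",       # must contain joint_train.t7 and joint_valid.t7
        "gradient_accumulation_steps": 4,
        "train_batch_size": 32,          # divided by gradient_accumulation_steps in main()
        "eval_batch_size": 32,
        "seed": 42,
        "max_context_length": 128,
        "num_train_epochs": 5,
        "silent": False,
        "pool_highlighted": True,        # main() forces this to False for the cand reranker
    }
    main(example_params)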
def main(params):
    model_output_path = params["output_path"]
    if not os.path.exists(model_output_path):
        os.makedirs(model_output_path)
    logger = utils.get_logger(params["output_path"])

    # Init model
    reranker = CrossEncoderRanker(params)
    tokenizer = reranker.tokenizer
    model = reranker.model

    # utils.save_model(model, tokenizer, model_output_path)

    device = reranker.device
    n_gpu = reranker.n_gpu

    if params["gradient_accumulation_steps"] < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(params["gradient_accumulation_steps"]))

    # An effective batch size of `x`, when accumulating gradients across `y`
    # batches, is achieved with a per-step batch size of `z = x / y`
    # args.gradient_accumulation_steps = args.gradient_accumulation_steps // n_gpu
    params["train_batch_size"] = (params["train_batch_size"] //
                                  params["gradient_accumulation_steps"])
    train_batch_size = params["train_batch_size"]
    eval_batch_size = params["eval_batch_size"]
    grad_acc_steps = params["gradient_accumulation_steps"]

    # Fix the random seeds
    seed = params["seed"]
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if reranker.n_gpu > 0:
        torch.cuda.manual_seed_all(seed)

    max_seq_length = params["max_seq_length"]
    context_length = params["max_context_length"]

    # load training data
    fname = os.path.join(params["data_path"], "train.t7")
    train_data = torch.load(fname)
    context_input = train_data["context_vecs"]
    candidate_input = train_data["candidate_vecs"]
    label_input = train_data["labels"]
    if params["debug"]:
        max_n = 200
        context_input = context_input[:max_n]
        candidate_input = candidate_input[:max_n]
        label_input = label_input[:max_n]

    context_input = modify(context_input, candidate_input, max_seq_length)
    if params["zeshel"]:
        src_input = train_data['worlds'][:len(context_input)]
        train_tensor_data = TensorDataset(context_input, label_input, src_input)
    else:
        train_tensor_data = TensorDataset(context_input, label_input)
    train_sampler = RandomSampler(train_tensor_data)

    train_dataloader = DataLoader(train_tensor_data,
                                  sampler=train_sampler,
                                  batch_size=params["train_batch_size"])

    # load validation data
    fname = os.path.join(params["data_path"], "valid.t7")
    valid_data = torch.load(fname)
    context_input = valid_data["context_vecs"]
    candidate_input = valid_data["candidate_vecs"]
    label_input = valid_data["labels"]
    if params["debug"]:
        max_n = 200
        context_input = context_input[:max_n]
        candidate_input = candidate_input[:max_n]
        label_input = label_input[:max_n]

    context_input = modify(context_input, candidate_input, max_seq_length)
    if params["zeshel"]:
        src_input = valid_data["worlds"][:len(context_input)]
        valid_tensor_data = TensorDataset(context_input, label_input, src_input)
    else:
        valid_tensor_data = TensorDataset(context_input, label_input)
    valid_sampler = SequentialSampler(valid_tensor_data)

    valid_dataloader = DataLoader(valid_tensor_data,
                                  sampler=valid_sampler,
                                  batch_size=params["eval_batch_size"])

    # evaluate before training
    results = evaluate(
        reranker,
        valid_dataloader,
        device=device,
        logger=logger,
        context_length=context_length,
        silent=params["silent"],
    )

    number_of_samples_per_dataset = {}

    time_start = time.time()

    utils.write_to_file(
        os.path.join(model_output_path, "training_params.txt"), str(params))

    logger.info("Starting training")
    logger.info("device: {} n_gpu: {}, distributed training: {}".format(
        device, n_gpu, False))

    optimizer = get_optimizer(model, params)
    scheduler = get_scheduler(params, optimizer, len(train_tensor_data), logger)

    model.train()

    best_epoch_idx = -1
    best_score = -1
    num_train_epochs = params["num_train_epochs"]

    for epoch_idx in trange(int(num_train_epochs), desc="Epoch"):
        tr_loss = 0
        results = None

        if params["silent"]:
            iter_ = train_dataloader
        else:
            iter_ = tqdm(train_dataloader, desc="Batch")

        part = 0
        for step, batch in enumerate(iter_):
            batch = tuple(t.to(device) for t in batch)
            context_input = batch[0]
            label_input = batch[1]
            loss, _ = reranker(context_input, label_input, context_length)

            # if n_gpu > 1:
            #     loss = loss.mean()  # mean() to average on multi-gpu.

            if grad_acc_steps > 1:
                loss = loss / grad_acc_steps

            tr_loss += loss.item()

            if (step + 1) % (params["print_interval"] * grad_acc_steps) == 0:
                logger.info("Step {} - epoch {} average loss: {}\n".format(
                    step,
                    epoch_idx,
                    tr_loss / (params["print_interval"] * grad_acc_steps),
                ))
                tr_loss = 0

            loss.backward()

            if (step + 1) % grad_acc_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               params["max_grad_norm"])
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()

            if (step + 1) % (params["eval_interval"] * grad_acc_steps) == 0:
                logger.info("Evaluation on the development dataset")
                evaluate(
                    reranker,
                    valid_dataloader,
                    device=device,
                    logger=logger,
                    context_length=context_length,
                    silent=params["silent"],
                )
                logger.info("***** Saving fine-tuned model *****")
                epoch_output_folder_path = os.path.join(
                    model_output_path, "epoch_{}_{}".format(epoch_idx, part))
                part += 1
                utils.save_model(model, tokenizer, epoch_output_folder_path)
                model.train()
                logger.info("\n")

        logger.info("***** Saving fine-tuned model *****")
        epoch_output_folder_path = os.path.join(model_output_path,
                                                "epoch_{}".format(epoch_idx))
        utils.save_model(model, tokenizer, epoch_output_folder_path)
        # reranker.save(epoch_output_folder_path)

        output_eval_file = os.path.join(epoch_output_folder_path,
                                        "eval_results.txt")
        results = evaluate(
            reranker,
            valid_dataloader,
            device=device,
            logger=logger,
            context_length=context_length,
            silent=params["silent"],
        )

        ls = [best_score, results["normalized_accuracy"]]
        li = [best_epoch_idx, epoch_idx]

        best_score = ls[np.argmax(ls)]
        best_epoch_idx = li[np.argmax(ls)]
        logger.info("\n")

    execution_time = (time.time() - time_start) / 60
    utils.write_to_file(
        os.path.join(model_output_path, "training_time.txt"),
        "The training took {} minutes\n".format(execution_time),
    )
    logger.info("The training took {} minutes\n".format(execution_time))

    # save the best model in the parent_dir
    logger.info("Best performance in epoch: {}".format(best_epoch_idx))
    params["path_to_model"] = os.path.join(model_output_path,
                                           "epoch_{}".format(best_epoch_idx))
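

# Minimal, standalone sketch of the gradient-accumulation pattern used in the
# training loop above: the loss is divided by grad_acc_steps, gradients are
# accumulated over grad_acc_steps micro-batches, and the optimizer steps once
# per window, so the effective batch size matches the configured one. The tiny
# model and random data below are illustrative stand-ins, not part of this repo.
def _example_gradient_accumulation_sketch():
    import torch

    model = torch.nn.Linear(8, 1)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    grad_acc_steps = 4
    micro_batches = [(torch.randn(2, 8), torch.randn(2, 1)) for _ in range(8)]

    optimizer.zero_grad()
    for step, (x, y) in enumerate(micro_batches):
        loss = torch.nn.functional.mse_loss(model(x), y)
        loss = loss / grad_acc_steps   # average the loss over the accumulation window
        loss.backward()                # gradients accumulate across micro-batches
        if (step + 1) % grad_acc_steps == 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            optimizer.zero_grad()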
def main(parameters):
    # Read model
    reranker = utils.get_reranker(parameters)
    tokenizer = reranker.tokenizer
    model = reranker.model

    device = reranker.device
    n_gpu = reranker.n_gpu

    if parameters["gradient_accumulation_steps"] < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
                parameters["gradient_accumulation_steps"]
            )
        )

    # An effective batch size of `x`, when accumulating gradients across `y`
    # batches, is achieved with a per-step batch size of `z = x / y`
    # args.gradient_accumulation_steps = args.gradient_accumulation_steps // n_gpu
    parameters["train_batch_size"] = (
        parameters["train_batch_size"] // parameters["gradient_accumulation_steps"]
    )
    train_batch_size = parameters["train_batch_size"]
    evaluation_batch_size = parameters["evaluation_batch_size"]
    gradient_accumulation_steps = parameters["gradient_accumulation_steps"]

    # Fix the random seeds
    seed = parameters["seed"]
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    logger = None
    number_of_samples_per_dataset = {}

    if reranker.n_gpu > 0:
        torch.cuda.manual_seed_all(seed)

    time_start = time.time()
    model_output_path = parameters["model_output_path"]

    # Make sure everything is in order with the output directory
    if os.path.exists(model_output_path) and os.listdir(model_output_path):
        print(
            "Output directory ({}) already exists and is not empty.".format(
                model_output_path
            )
        )
        answer = input("Would you like to empty the existing directory? [Y/N]\n")
        if answer.strip() == "Y":
            print("Deleting {}...".format(model_output_path))
            shutil.rmtree(model_output_path)
        else:
            raise ValueError(
                "Output directory ({}) already exists and is not empty.".format(
                    model_output_path
                )
            )

    if not os.path.exists(model_output_path):
        os.makedirs(model_output_path)

    utils.write_to_file(
        os.path.join(model_output_path, "training_parameters.txt"), str(parameters)
    )

    logger = utils.get_logger(model_output_path)
    logger.info("Starting training")
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}".format(device, n_gpu, False)
    )

    ### Load training data
    train_dataset_name = "aida-train"
    train_samples = utils.read_dataset(
        train_dataset_name, parameters["path_to_preprocessed_json_data"]
    )
    train_samples_filtered = utils.filter_samples(train_samples, parameters["top_k"])
    logger.info(
        "Retained {} out of {} samples".format(
            len(train_samples_filtered), len(train_samples)
        )
    )
    number_of_samples_per_dataset[train_dataset_name] = len(train_samples)

    train_data, train_tensor_data = reranker._process_mentions_for_model(
        parameters["context_key"],
        train_samples_filtered,
        tokenizer,
        parameters["max_seq_length"],
        silent=parameters["silent"],
        logger=logger,
        top_k=parameters["top_k"],
        debug=parameters["debug"],
    )

    train_sampler = RandomSampler(train_tensor_data)
    train_dataloader = DataLoader(
        train_tensor_data, sampler=train_sampler, batch_size=train_batch_size
    )
    ###

    ### Loading dev data
    dev_dataset_name = "aida-A"
    dev_samples = utils.read_dataset(
        dev_dataset_name, parameters["path_to_preprocessed_json_data"]
    )
    dev_samples_filtered = utils.filter_samples(dev_samples, parameters["top_k"])
    logger.info(
        "Retained {} out of {} samples".format(
            len(dev_samples_filtered), len(dev_samples)
        )
    )
    number_of_samples_per_dataset[dev_dataset_name] = len(dev_samples)

    dev_data, dev_tensor_data = reranker._process_mentions_for_model(
        parameters["context_key"],
        dev_samples_filtered,  # process the dev split here, not the training split
        tokenizer,
        parameters["max_seq_length"],
        silent=parameters["silent"],
        logger=logger,
        top_k=parameters["top_k"],
        debug=parameters["debug"],
    )

    dev_sampler = SequentialSampler(dev_tensor_data)
    dev_dataloader = DataLoader(
        dev_tensor_data, sampler=dev_sampler, batch_size=evaluation_batch_size
    )
    ###

    logger.info("***** Running training *****")
    logger.info(" Num examples = %d", len(train_samples_filtered))
    logger.info(" Batch size = %d", train_batch_size)
    logger.info(" Gradient accumulation steps = %d", gradient_accumulation_steps)

    optimizer, scheduler = reranker.get_scheduler_and_optimizer(
        parameters, train_tensor_data, logger
    )

    best_epoch_idx = -1
    best_score = -1
    num_train_epochs = parameters["num_train_epochs"]

    model.train()

    for epoch_idx in trange(int(num_train_epochs), desc="Epoch"):
        tr_loss = 0
        results = None

        for step, batch in enumerate(tqdm(train_dataloader, desc="Batch")):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids, entity_mask = batch
            loss, _ = model(
                input_ids, segment_ids, input_mask, label_ids, entity_mask=entity_mask
            )

            # if n_gpu > 1:
            #     loss = loss.mean()  # mean() to average on multi-gpu.

            if gradient_accumulation_steps > 1:
                loss = loss / gradient_accumulation_steps

            tr_loss += loss.item()

            if (step + 1) % (
                parameters["print_tr_loss_opt_steps_interval"]
                * parameters["gradient_accumulation_steps"]
            ) == 0:
                logger.info(
                    "Step {} - epoch {} average loss: {}\n".format(
                        step,
                        epoch_idx,
                        tr_loss
                        / (
                            parameters["print_tr_loss_opt_steps_interval"]
                            * gradient_accumulation_steps
                        ),
                    )
                )
                tr_loss = 0

            loss.backward()

            if (step + 1) % gradient_accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(
                    model.parameters(), parameters["max_grad_norm"]
                )
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()

            if (step + 1) % (
                parameters["dev_evaluation_interval"]
                * gradient_accumulation_steps
                * train_batch_size
            ) == 0:
                logger.info("Evaluation on the development dataset")
                evaluate_model_on_dataset(
                    model,
                    dev_dataloader,
                    dev_dataset_name,
                    device=device,
                    logger=logger,
                    number_of_samples=number_of_samples_per_dataset[dev_dataset_name],
                )
                model.train()
                logger.info("\n")

        logger.info("***** Saving fine-tuned model *****")
        epoch_output_folder_path = os.path.join(
            model_output_path, "epoch_{}".format(epoch_idx)
        )
        utils.save_model(model, tokenizer, epoch_output_folder_path)

        output_eval_file = os.path.join(epoch_output_folder_path, "eval_results.txt")
        results = evaluate_model_on_dataset(
            model,
            dev_dataloader,
            dev_dataset_name,
            device=device,
            logger=logger,
            path_to_file_to_write_results=output_eval_file,
            number_of_samples=number_of_samples_per_dataset[dev_dataset_name],
        )

        ls = [best_score, results["normalized_accuracy"]]
        li = [best_epoch_idx, epoch_idx]

        best_score = ls[np.argmax(ls)]
        best_epoch_idx = li[np.argmax(ls)]
        logger.info("\n")

    execution_time = (time.time() - time_start) / 60
    utils.write_to_file(
        os.path.join(model_output_path, "training_time.txt"),
        "The training took {} minutes\n".format(execution_time),
    )
    logger.info("The training took {} minutes\n".format(execution_time))

    # save the best model in the parent_dir
    logger.info("Best performance in epoch: {}".format(best_epoch_idx))
    parameters["path_to_model"] = os.path.join(
        model_output_path, "epoch_{}".format(best_epoch_idx)
    )
    reranker = utils.get_reranker(parameters)
    utils.save_model(reranker.model, tokenizer, model_output_path)

    if parameters["evaluate"]:
        parameters["path_to_model"] = model_output_path
        evaluate(parameters, logger=logger)
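

# The best-epoch bookkeeping used above (argmax over a two-element list) is
# equivalent to a running maximum that keeps the earlier epoch on ties. A
# minimal sketch with made-up scores:
def _example_best_epoch_tracking_sketch():
    best_epoch_idx, best_score = -1, -1.0
    example_scores = {0: 0.61, 1: 0.74, 2: 0.74}  # epoch -> normalized_accuracy (illustrative)
    for epoch_idx, score in example_scores.items():
        if score > best_score:   # strict '>' keeps the earlier epoch on ties, like np.argmax
            best_score, best_epoch_idx = score, epoch_idx
    assert best_epoch_idx == 1 and best_score == 0.74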