def main(): print("Start...") global opt opt = get_opt() # Set seed torch.manual_seed(opt.seed) np.random.seed(opt.seed) random.seed(opt.seed) opt.cuda = torch.cuda.is_available() and len(opt.gpus) if opt.save_dir and not os.path.exists(opt.save_dir): os.makedirs(opt.save_dir) if torch.cuda.is_available() and not opt.cuda: print( "WARNING: You have a CUDA device, so you should probably run with -gpus 1" ) if opt.cuda: cuda.set_device(opt.gpus[0]) torch.cuda.manual_seed(opt.seed) dicts, supervised_data, rl_data, valid_data, test_data, DEV, EVAL = load_data( opt) print("Building model...") use_critic = opt.start_reinforce is not None print("use_critic: ", use_critic) print("has_baseline: ", opt.has_baseline) if not opt.has_baseline: assert opt.critic_pretrain_epochs == 0 if opt.load_from is None: model, optim = create_model(lib.Seq2SeqModel, dicts, dicts["tgt"].size()) checkpoint = None else: print("Loading from checkpoint at %s" % opt.load_from) checkpoint = torch.load( opt.load_from) #, map_location=lambda storage, loc: storage) model = checkpoint["model"] # config testing for attribute in ["predict_mask", "max_predict_length"]: model.opt.__dict__[attribute] = opt.__dict__[attribute] optim = checkpoint["optim"] optim.start_decay_at = opt.start_decay_at if optim.start_decay_at > opt.end_epoch: print("No decay!") opt.start_epoch = checkpoint["epoch"] + 1 print("model: ", model) print("optim: ", optim) # GPU. if opt.cuda: model.cuda(opt.gpus[0]) # Start reinforce training immediately. print("opt.start_reinforce: ", opt.start_reinforce) # Check if end_epoch is large enough. if use_critic: assert opt.start_epoch + opt.critic_pretrain_epochs - 1 <= \ opt.end_epoch, "Please increase -end_epoch to perform pretraining!" nParams = sum([p.nelement() for p in model.parameters()]) print("* number of parameters: %d" % nParams) if opt.sent_reward == "cr": lib.RetReward.cr = code_retrieval.CrCritic() # Metrics. print("sent_reward: %s" % opt.sent_reward) metrics = {} metrics["xent_loss"] = lib.Loss.weighted_xent_loss metrics["critic_loss"] = lib.Loss.weighted_mse if opt.sent_reward == "bleu": metrics["sent_reward"] = { "train": lib.Reward.wrapped_sentence_bleu, "eval": lib.Reward.wrapped_sentence_bleu } else: metrics["sent_reward"] = { "train": lib.RetReward.retrieval_mrr_train, "eval": lib.RetReward.retrieval_mrr_eval } print("opt.eval: ", opt.eval) print("opt.eval_codenn: ", opt.eval_codenn) print("opt.eval_codenn_all: ", opt.eval_codenn_all) print("opt.collect_anno: ", opt.collect_anno) # Evaluate model if opt.eval: if opt.sent_reward == "cr" and (opt.eval_codenn or opt.eval_codenn_all): raise Exception( "Currently we do not support evaluating MRR on codenn!") if False: # On training set. if opt.sent_reward == "cr": metrics["sent_reward"][ "eval"] = lib.RetReward.retrieval_mrr_train #if opt.collect_anno: # metrics["sent_reward"] = {"train": None, "eval": None} evaluator = lib.Evaluator(model, metrics, dicts, opt) pred_file = opt.load_from.replace(".pt", ".train.pred") if opt.eval_codenn or opt.eval_codenn_all: raise Exception("Invalid eval_codenn!") print("train_data.src: ", len(supervised_data.src)) if opt.predict_mask: pred_file += ".masked" pred_file += ".metric%s" % opt.sent_reward evaluator.eval(supervised_data, pred_file) if True: # On validation set. 
if opt.sent_reward == "cr": metrics["sent_reward"][ "eval"] = lib.RetReward.retrieval_mrr_eval #if opt.collect_anno: # metrics["sent_reward"] = {"train": None, "eval": None} evaluator = lib.Evaluator(model, metrics, dicts, opt) pred_file = opt.load_from.replace(".pt", ".valid.pred") if opt.eval_codenn: pred_file = pred_file.replace("valid", "DEV") valid_data = DEV elif opt.eval_codenn_all: pred_file = pred_file.replace("valid", "DEV_all") print("* Please input valid data = DEV_all") print("valid_data.src: ", len(valid_data.src)) if opt.predict_mask: pred_file += ".masked" pred_file += ".metric%s" % opt.sent_reward evaluator.eval(valid_data, pred_file) if False: # On test set. if opt.sent_reward == "cr": metrics["sent_reward"][ "eval"] = lib.RetReward.retrieval_mrr_eval #if opt.collect_anno: # metrics["sent_reward"] = {"train": None, "eval": None} evaluator = lib.Evaluator(model, metrics, dicts, opt) pred_file = opt.load_from.replace(".pt", ".test.pred") if opt.eval_codenn: pred_file = pred_file.replace("test", "EVAL") test_data = EVAL elif opt.eval_codenn_all: pred_file = pred_file.replace("test", "EVAL_all") print("* Please input test data = EVAL_all") print("test_data.src: ", len(test_data.src)) if opt.predict_mask: pred_file += ".masked" pred_file += ".metric%s" % opt.sent_reward evaluator.eval(test_data, pred_file) else: print("supervised_data.src: ", len(supervised_data.src)) print("supervised_data.tgt: ", len(supervised_data.tgt)) xent_trainer = lib.Trainer(model, supervised_data, valid_data, metrics, dicts, optim, opt, DEV=DEV) if use_critic: start_time = time.time() # Supervised training. print("supervised training..") print("start_epoch: ", opt.start_epoch) xent_trainer.train(opt.start_epoch, opt.start_reinforce - 1, start_time) if opt.sent_reward == "bleu": _valid_data = DEV else: _valid_data = valid_data if opt.has_baseline: # Create critic here to not affect random seed. critic, critic_optim = create_critic(checkpoint, dicts, opt) print("Building critic...") print("Critic: ", critic) print("Critic optim: ", critic_optim) # Pretrain critic. print("pretrain critic...") if opt.critic_pretrain_epochs > 0: reinforce_trainer = lib.ReinforceTrainer( model, critic, supervised_data, _valid_data, metrics, dicts, optim, critic_optim, opt) reinforce_trainer.train( opt.start_reinforce, opt.start_reinforce + opt.critic_pretrain_epochs - 1, True, start_time) else: print("NOTE: do not have a baseline model") critic, critic_optim = None, None # Reinforce training. print("reinforce training...") reinforce_trainer = lib.ReinforceTrainer(model, critic, rl_data, _valid_data, metrics, dicts, optim, critic_optim, opt) reinforce_trainer.train( opt.start_reinforce + opt.critic_pretrain_epochs, opt.end_epoch, False, start_time) else: # Supervised training only. Set opt.start_reinforce to None xent_trainer.train(opt.start_epoch, opt.end_epoch)
def main():
    print('Loading data from "%s"' % opt.data)
    dataset = torch.load(opt.data)

    supervised_data = lib.Dataset(dataset["train_xe"], opt.batch_size, opt.cuda, eval=False)
    bandit_data = lib.Dataset(dataset["train_pg"], opt.batch_size, opt.cuda, eval=False)
    valid_data = lib.Dataset(dataset["valid"], opt.batch_size, opt.cuda, eval=True)
    test_data = lib.Dataset(dataset["test"], opt.batch_size, opt.cuda, eval=True)

    dicts = dataset["dicts"]
    print(" * vocabulary size. source = %d; target = %d" %
          (dicts["src"].size(), dicts["tgt"].size()))
    print(" * number of XENT training sentences. %d" % len(dataset["train_xe"]["src"]))
    print(" * number of PG training sentences. %d" % len(dataset["train_pg"]["src"]))
    print(" * maximum batch size. %d" % opt.batch_size)

    print("Building model...")
    use_critic = opt.start_reinforce is not None

    if opt.load_from is None:
        model, optim = create_model(lib.NMTModel, dicts, dicts["tgt"].size())
        checkpoint = None
    else:
        print("Loading from checkpoint at %s" % opt.load_from)
        checkpoint = torch.load(opt.load_from)
        model = checkpoint["model"]
        optim = checkpoint["optim"]
        opt.start_epoch = checkpoint["epoch"] + 1

    # GPU.
    if opt.cuda:
        model.cuda(opt.gpus[0])

    # Start reinforce training immediately.
    if opt.start_reinforce == -1:
        opt.start_decay_at = opt.start_epoch
        opt.start_reinforce = opt.start_epoch

    # Check if end_epoch is large enough.
    if use_critic:
        assert opt.start_epoch + opt.critic_pretrain_epochs - 1 <= \
            opt.end_epoch, "Please increase -end_epoch to perform pretraining!"

    nParams = sum([p.nelement() for p in model.parameters()])
    print("* number of parameters: %d" % nParams)

    # Metrics.
    metrics = {}
    metrics["nmt_loss"] = lib.Loss.weighted_xent_loss
    metrics["critic_loss"] = lib.Loss.weighted_mse
    metrics["sent_reward"] = lib.Reward.sentence_bleu
    metrics["corp_reward"] = lib.Reward.corpus_bleu
    if opt.pert_func is not None:
        opt.pert_func = lib.PertFunction(opt.pert_func, opt.pert_param)

    # Evaluate model on heldout dataset.
    if opt.eval:
        evaluator = lib.Evaluator(model, metrics, dicts, opt)
        # On validation set.
        pred_file = opt.load_from.replace(".pt", ".valid.pred")
        evaluator.eval(valid_data, pred_file)
        # On test set.
        pred_file = opt.load_from.replace(".pt", ".test.pred")
        evaluator.eval(test_data, pred_file)
    elif opt.eval_sample:
        opt.no_update = True
        critic, critic_optim = create_critic(checkpoint, dicts, opt)
        reinforce_trainer = lib.ReinforceTrainer(model, critic, bandit_data,
                                                 test_data, metrics, dicts,
                                                 optim, critic_optim, opt)
        reinforce_trainer.train(opt.start_epoch, opt.start_epoch, False)
    elif opt.sup_train_on_bandit:
        optim.set_lr(opt.reinforce_lr)
        xent_trainer = lib.Trainer(model, bandit_data, test_data, metrics,
                                   dicts, optim, opt)
        xent_trainer.train(opt.start_epoch, opt.start_epoch)
    else:
        print("okay")
        xent_trainer = lib.Trainer(model, supervised_data, valid_data, metrics,
                                   dicts, optim, opt)
        if use_critic:
            start_time = time.time()
            # Supervised training.
            xent_trainer.train(opt.start_epoch, opt.start_reinforce - 1, start_time)
            # Create critic here to not affect random seed.
            critic, critic_optim = create_critic(checkpoint, dicts, opt)
            # Pretrain critic.
            if opt.critic_pretrain_epochs > 0:
                reinforce_trainer = lib.ReinforceTrainer(
                    model, critic, supervised_data, test_data, metrics,
                    dicts, optim, critic_optim, opt)
                reinforce_trainer.train(
                    opt.start_reinforce,
                    opt.start_reinforce + opt.critic_pretrain_epochs - 1,
                    True, start_time)
            # Reinforce training.
            reinforce_trainer = lib.ReinforceTrainer(model, critic, bandit_data,
                                                     test_data, metrics, dicts,
                                                     optim, critic_optim, opt)
            reinforce_trainer.train(
                opt.start_reinforce + opt.critic_pretrain_epochs,
                opt.end_epoch, False, start_time)
        # Supervised training only.
        else:
            xent_trainer.train(opt.start_epoch, opt.end_epoch)
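# The start_reinforce == -1 case above is a sentinel meaning "begin REINFORCE
# (and learning-rate decay) immediately at the resumed epoch". A hypothetical
# helper restating that in-place mutation, for illustration only:
def resolve_start_reinforce(start_reinforce, start_epoch):
    # -1 is a sentinel for "start reinforce training right away".
    return start_epoch if start_reinforce == -1 else start_reinforce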
def main(): print("Start...") global opt opt = get_opt() # Set seed torch.manual_seed(opt.seed) np.random.seed(opt.seed) random.seed(opt.seed) opt.cuda = len(opt.gpus) if opt.save_dir and not os.path.exists(opt.save_dir): os.makedirs(opt.save_dir) if torch.cuda.is_available() and not opt.cuda: print("WARNING: You have a CUDA device, so you should probably run with -gpus 1") if opt.cuda: cuda.set_device(opt.gpus[0]) torch.cuda.manual_seed(opt.seed) dicts, supervised_data, rl_data, valid_data, test_data, vis_data = load_data(opt) print("Building model...") use_critic = opt.start_reinforce is not None print("use_critic: ", use_critic) if opt.load_from is None: if opt.data_type == 'code': model, optim = create_model(lib.Tree2SeqModel, dicts, dicts["tgt"].size()) elif opt.data_type == 'text': model, optim = create_model(lib.Seq2SeqModel, dicts, dicts["tgt"].size()) elif opt.data_type == 'hybrid': model, optim = create_model(lib.Hybrid2SeqModel, dicts, dicts["tgt"].size()) checkpoint = None print("model: ", model) print("optim: ", optim) else: print("Loading from checkpoint at %s" % opt.load_from) checkpoint = torch.load(opt.load_from, map_location=lambda storage, loc: storage) model = checkpoint["model"] optim = checkpoint["optim"] opt.start_epoch = checkpoint["epoch"] + 1 # GPU. if opt.cuda: model.cuda(opt.gpus[0]) # Start reinforce training immediately. print("opt.start_reinforce: ", opt.start_reinforce) if opt.start_reinforce == -1: opt.start_decay_at = opt.start_epoch opt.start_reinforce = opt.start_epoch # Check if end_epoch is large enough. if use_critic: assert opt.start_epoch + opt.critic_pretrain_epochs - 1 <= \ opt.end_epoch, "Please increase -end_epoch to perform pretraining!" nParams = sum([p.nelement() for p in model.parameters()]) print("* number of parameters: %d" % nParams) # Metrics. metrics = {} metrics["xent_loss"] = lib.Loss.weighted_xent_loss metrics["critic_loss"] = lib.Loss.weighted_mse metrics["sent_reward"] = lib.Reward.sentence_bleu metrics["corp_reward"] = lib.Reward.corpus_bleu if opt.pert_func is not None: opt.pert_func = lib.PertFunction(opt.pert_func, opt.pert_param) print("opt.eval: ", opt.eval) print("opt.eval_sample: ", opt.eval_sample) # Evaluate model on heldout dataset. if opt.eval: evaluator = lib.Evaluator(model, metrics, dicts, opt) # On validation set. if opt.var_length: pred_file = opt.load_from.replace(".pt", ".valid.pred.var"+opt.var_type) else: pred_file = opt.load_from.replace(".pt", ".valid.pred") evaluator.eval(valid_data, pred_file) # On test set. if opt.var_length: pred_file = opt.load_from.replace(".pt", ".test.pred.var"+opt.var_type) else: pred_file = opt.load_from.replace(".pt", ".test.pred") evaluator.eval(test_data, pred_file) elif opt.eval_one: print("eval_one..") evaluator = lib.Evaluator(model, metrics, dicts, opt) # On test set. 
pred_file = opt.load_from.replace(".pt", ".test_one.pred") evaluator.eval(vis_data, pred_file) elif opt.eval_sample: opt.no_update = True critic, critic_optim = create_critic(checkpoint, dicts, opt) reinforce_trainer = lib.ReinforceTrainer(model, critic, rl_data, test_data, metrics, dicts, optim, critic_optim, opt) reinforce_trainer.train(opt.start_epoch, opt.start_epoch, False) else: print("supervised_data.src: ", len(supervised_data.src)) print("supervised_data.tgt: ", len(supervised_data.tgt)) print("supervised_data.trees: ", len(supervised_data.trees)) print("supervised_data.leafs: ", len(supervised_data.leafs)) xent_trainer = lib.Trainer(model, supervised_data, valid_data, metrics, dicts, optim, opt) if use_critic: start_time = time.time() # Supervised training. print("supervised training..") print("start_epoch: ", opt.start_epoch) xent_trainer.train(opt.start_epoch, opt.start_reinforce - 1, start_time) # Create critic here to not affect random seed. critic, critic_optim = create_critic(checkpoint, dicts, opt) # Pretrain critic. print("pretrain critic...") if opt.critic_pretrain_epochs > 0: reinforce_trainer = lib.ReinforceTrainer(model, critic, supervised_data, test_data, metrics, dicts, optim, critic_optim, opt) reinforce_trainer.train(opt.start_reinforce, opt.start_reinforce + opt.critic_pretrain_epochs - 1, True, start_time) # Reinforce training. print("reinforce training...") reinforce_trainer = lib.ReinforceTrainer(model, critic, rl_data, test_data, metrics, dicts, optim, critic_optim, opt) reinforce_trainer.train(opt.start_reinforce + opt.critic_pretrain_epochs, opt.end_epoch, False, start_time) # Supervised training only. else: xent_trainer.train(opt.start_epoch, opt.end_epoch)
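# The data_type branching above can also be written as a dispatch table; a
# sketch using only names from this script (class roles inferred from their
# names), not the original code:
MODEL_BY_DATA_TYPE = {
    'code': lib.Tree2SeqModel,
    'text': lib.Seq2SeqModel,
    'hybrid': lib.Hybrid2SeqModel,
}
# model, optim = create_model(MODEL_BY_DATA_TYPE[opt.data_type], dicts,
#                             dicts["tgt"].size())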
def main():
    if args.wandb_on:
        wandb.init(project=args.wandb_project,
                   name=args.model_name + '-' + args.data_folder.split('/')[2]
                        + '-' + args.loss_type)
        wandb.config.update({'hostname': os.popen('hostname').read().split('.')[0]})
        wandb.config.update(args)

    if args.item2idx_dict is not None:
        item2idx_dict = pd.read_pickle(os.path.join(args.data_folder, args.item2idx_dict))
    else:
        item2idx_dict = None

    print("Loading train data from {}".format(os.path.join(args.data_folder, args.train_data)))
    print("Loading valid data from {}".format(os.path.join(args.data_folder, args.valid_data)))

    train_data = lib.Dataset(os.path.join(args.data_folder, args.train_data))
    valid_data = lib.Dataset(os.path.join(args.data_folder, args.valid_data),
                             itemmap=train_data.itemmap)

    if args.debug:
        train_data.df.to_csv(os.path.join(args.data_folder, 'GRU4Rec-train-data.csv'))
        valid_data.df.to_csv(os.path.join(args.data_folder, 'GRU4Rec-valid-data.csv'))

    make_checkpoint_dir()

    # Set all the parameters according to the defined arguments.
    args.input_size = len(train_data.items)
    args.output_size = args.input_size

    # Loss function (CUDA is used with cross entropy only).
    loss_function = lib.LossFunction(loss_type=args.loss_type, use_cuda=args.cuda)

    if not args.is_eval:  # Training.
        # Initialize the model.
        model = lib.GRU4REC(args.input_size, args.hidden_size, args.output_size,
                            final_act=args.final_act,
                            num_layers=args.num_layers,
                            use_cuda=args.cuda,
                            batch_size=args.batch_size,
                            dropout_input=args.dropout_input,
                            dropout_hidden=args.dropout_hidden,
                            embedding_dim=args.embedding_dim)
        # Weight initialization.
        init_model_weight(model)
        if args.wandb_on:
            wandb.watch(model, log="all")
        # Optimizer.
        optimizer = lib.Optimizer(model.parameters(),
                                  optimizer_type=args.optimizer_type,
                                  lr=args.lr,
                                  weight_decay=args.weight_decay,
                                  momentum=args.momentum,
                                  eps=args.eps)
        # Trainer class.
        trainer = lib.Trainer(model,
                              train_data=train_data,
                              eval_data=valid_data,
                              optim=optimizer,
                              use_cuda=args.cuda,
                              loss_func=loss_function,
                              batch_size=args.batch_size,
                              args=args)
        print('#### START TRAINING....')
        trainer.train(0, args.n_epochs - 1)
    else:  # Testing.
        if args.load_model is not None:
            print("Loading pre-trained model from {}".format(args.load_model))
            try:
                checkpoint = torch.load(args.load_model)
            except Exception:
                checkpoint = torch.load(args.load_model,
                                        map_location=lambda storage, loc: storage)
            model = lib.GRU4REC(args.input_size, args.hidden_size, args.output_size,
                                final_act=args.final_act,
                                num_layers=args.num_layers,
                                use_cuda=args.cuda,
                                batch_size=args.batch_size,
                                dropout_input=args.dropout_input,
                                dropout_hidden=args.dropout_hidden,
                                embedding_dim=args.embedding_dim)
            model.load_state_dict(checkpoint["state_dict"])
            model.gru.flatten_parameters()
            evaluation = lib.Evaluation(model, loss_function, use_cuda=args.cuda,
                                        k=args.k_eval)
            loss, recall, mrr = evaluation.eval(valid_data, args.batch_size)
            print("Final result: recall = {:.2f}, mrr = {:.2f}".format(recall, mrr))
        else:
            print("No Pretrained Model was found!")
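# Hypothetical invocations, assuming the script exposes argparse flags that
# mirror the `args` attributes used above (exact flag names are an assumption):
#
#   # Train:
#   python main.py --data_folder data/ --train_data train.txt \
#       --valid_data valid.txt --loss_type TOP1 --n_epochs 10
#
#   # Evaluate a saved checkpoint:
#   python main.py --data_folder data/ --train_data train.txt \
#       --valid_data valid.txt --is_eval --load_model checkpoint/model.pt \
#       --k_eval 20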
    ),  # average first channels of every tree
).to(device)

with torch.no_grad():
    # Forward pass on a data sample (triggers data-aware initialization,
    # as in the similar call later in this section).
    res = model(torch.as_tensor(data.X_train[:1000], device=device))

if torch.cuda.device_count() > 1:
    model = nn.DataParallel(model)

from qhoptim.pyt import QHAdam
optimizer_params = {'nus': (0.7, 1.0), 'betas': (0.95, 0.998)}

trainer = lib.Trainer(model=model,
                      loss_function=F.cross_entropy,
                      experiment_name=experiment_name,
                      warm_start=False,
                      Optimizer=QHAdam,
                      optimizer_params=optimizer_params,
                      verbose=True,
                      n_last_checkpoints=5)

loss_history_step, auc_history_step = [], []
loss_history, mse_history, auc_history = [], [], []
best_auc = 0
best_step_auc = 0
early_stopping_rounds = 1000
report_frequency = 100

print("------ training starts ------")
for batch in lib.iterate_minibatches(data.X_train, data.y_train,
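                                     # The excerpt breaks off mid-call. The
                                     # kwargs and loop body below are a sketch
                                     # modeled on the fuller training loop later
                                     # in this section; the batch_size value,
                                     # data.X_valid/data.y_valid, and this
                                     # Trainer exposing the same checkpointing
                                     # and evaluate_negative_auc methods are
                                     # assumptions.
                                     batch_size=512, shuffle=True,
                                     epochs=float('inf')):
    metrics = trainer.train_on_batch(*batch, device=device)
    loss_history.append(float(metrics['loss']))

    if trainer.step % report_frequency == 0:
        trainer.save_checkpoint()
        trainer.average_checkpoints(out_tag='avg')
        trainer.load_checkpoint(tag='avg')
        auc = -trainer.evaluate_negative_auc(data.X_valid, data.y_valid,
                                             device=device, batch_size=1024)
        auc_history.append(auc)
        if auc > best_auc:
            best_auc, best_step_auc = auc, trainer.step
            trainer.save_checkpoint(tag='best_auc')
        trainer.load_checkpoint()  # resume from the last raw checkpoint

    if trainer.step > best_step_auc + early_stopping_rounds:
        print('BREAK. There is no improvement for {} steps'.format(
            early_stopping_rounds))
        break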
def main():
    assert (opt.start_epoch <= opt.end_epoch), 'The start epoch should be <= End Epoch'

    log('Loading data from "%s"' % opt.data)
    dataset = torch.load(opt.data)

    supervised_data = lib.Dataset(dataset["train_xe"], opt.batch_size, opt.cuda, eval=False)
    bandit_data = lib.Dataset(dataset["train_pg"], opt.batch_size, opt.cuda, eval=False)
    sup_valid_data = lib.Dataset(dataset["sup_valid"], opt.eval_batch_size, opt.cuda, eval=True)
    bandit_valid_data = lib.Dataset(dataset["bandit_valid"], opt.eval_batch_size, opt.cuda, eval=True)
    test_data = lib.Dataset(dataset["test"], opt.eval_batch_size, opt.cuda, eval=True)

    dicts = dataset["dicts"]
    log(" * vocabulary size. source = %d; target = %d" %
        (dicts["src"].size(), dicts["tgt"].size()))
    log(" * number of XENT training sentences. %d" % len(dataset["train_xe"]["src"]))
    log(" * number of PG training sentences. %d" % len(dataset["train_pg"]["src"]))
    log(" * number of bandit valid sentences. %d" % len(dataset["bandit_valid"]["src"]))
    log(" * number of test sentences. %d" % len(dataset["test"]["src"]))
    log(" * maximum batch size. %d" % opt.batch_size)

    log("Building model...")
    use_critic = opt.start_reinforce is not None

    if opt.load_from is None:
        model, optim = create_model(lib.NMTModel, dicts, dicts["tgt"].size())
        checkpoint = None
    else:
        log("Loading from checkpoint at %s" % opt.load_from)
        checkpoint = torch.load(opt.load_from)
        model = checkpoint["model"]
        optim = checkpoint["optim"]
        opt.start_epoch = checkpoint["epoch"] + 1

    # GPU.
    if opt.cuda:
        model.cuda(opt.gpus[0])

    # Start reinforce training immediately.
    if opt.start_reinforce == -1:
        opt.start_decay_at = opt.start_epoch
        opt.start_reinforce = opt.start_epoch

    nParams = sum([p.nelement() for p in model.parameters()])
    log("* number of parameters: %d" % nParams)

    # Metrics.
    metrics = {}
    metrics["nmt_loss"] = lib.Loss.weighted_xent_loss
    metrics["critic_loss"] = lib.Loss.weighted_mse
    log(" Simulated Feedback: charF score\nEvaluation: charF and Corpus BLEU")
    instance_charF = lib.Reward.charFEvaluator(dict_tgt=dicts["tgt"])
    metrics["sent_reward"] = instance_charF.sentence_charF
    metrics["corp_reward"] = lib.Reward.corpus_bleu

    # Evaluate model on heldout dataset.
    if opt.eval:
        evaluator = lib.Evaluator(model, metrics, dicts, opt, trpro_logger)
        # On bandit test data.
        pred_file = opt.load_from.replace(".pt", ".test.pred")
        tgt_file = opt.load_from.replace(".pt", ".test.tgt")
        evaluator.eval(test_data, pred_file)
        evaluator.eval(test_data, pred_file=None, tgt_file=tgt_file)
    else:
        xent_trainer = lib.Trainer(model, supervised_data, sup_valid_data, metrics,
                                   dicts, optim, opt,
                                   trainprocess_logger=trpro_logger)
        if use_critic:
            start_time = time.time()
            # Supervised training: used when running pretrain+bandit together.
            xent_trainer.train(opt.start_epoch, opt.start_reinforce - 1, start_time)
            # Actor-critic.
            critic, critic_optim = create_critic(checkpoint, dicts, opt)
            reinforce_trainer = lib.ReinforceTrainer(
                model, critic, bandit_data, bandit_valid_data, test_data,
                metrics, dicts, optim, critic_optim, opt,
                trainprocess_logger=trpro_logger,
                stat_logger=stat_logger,
                samples_logger=samples_logger)
            reinforce_trainer.train(opt.start_reinforce, opt.end_epoch, start_time)
            if opt.use_bipnmt:
                stat_logger.close_file()
                samples_logger.close_file()
        else:
            # Supervised training only.
            xent_trainer.train(opt.start_epoch, opt.end_epoch)

    trpro_logger.close_file()
def main(): print("Loading train data from {}".format( os.path.join(args.data_folder, args.train_data))) print("Loading valid data from {}".format( os.path.join(args.data_folder, args.valid_data))) print("Loading test data from {}\n".format( os.path.join(args.data_folder, args.test_data))) train_data = lib.Dataset(os.path.join(args.data_folder, args.train_data)) valid_data = lib.Dataset(os.path.join(args.data_folder, args.valid_data), itemmap=train_data.itemmap) test_data = lib.Dataset(os.path.join(args.data_folder, args.test_data)) if not args.is_eval: make_checkpoint_dir() input_size = len(train_data.items) hidden_size = args.hidden_size num_layers = args.num_layers output_size = input_size batch_size = args.batch_size dropout_input = args.dropout_input dropout_hidden = args.dropout_hidden embedding_dim = args.embedding_dim final_act = args.final_act loss_type = args.loss_type optimizer_type = args.optimizer_type lr = args.lr weight_decay = args.weight_decay momentum = args.momentum eps = args.eps n_epochs = args.n_epochs time_sort = args.time_sort if not args.is_eval: model = lib.GRU4REC(input_size, hidden_size, output_size, final_act=final_act, num_layers=num_layers, use_cuda=args.cuda, batch_size=batch_size, dropout_input=dropout_input, dropout_hidden=dropout_hidden, embedding_dim=embedding_dim) # init weight # See Balazs Hihasi(ICLR 2016), pg.7 init_model(model) optimizer = lib.Optimizer(model.parameters(), optimizer_type=optimizer_type, lr=lr, weight_decay=weight_decay, momentum=momentum, eps=eps) loss_function = lib.LossFunction(loss_type=loss_type, use_cuda=args.cuda) trainer = lib.Trainer(model, train_data=train_data, eval_data=valid_data, optim=optimizer, use_cuda=args.cuda, loss_func=loss_function, args=args) trainer.train(0, n_epochs - 1) else: if args.load_model is not None: print("Loading pre trained model from {}".format(args.load_model)) checkpoint = torch.load(args.load_model) model = checkpoint["model"] model.gru.flatten_parameters() optim = checkpoint["optim"] loss_function = lib.LossFunction(loss_type=loss_type, use_cuda=args.cuda) evaluation = lib.Evaluation(model, loss_function, use_cuda=args.cuda) loss, recall, mrr = evaluation.eval(valid_data) print("Final result: recall = {:.2f}, mrr = {:.2f}".format( recall, mrr)) else: print("Pre trained model is None!")
def main(args) -> None:
    # Create directory
    os.makedirs(pjoin('logs', args.name), exist_ok=True)
    if pexists(pjoin('logs', args.name, 'MY_IS_FINISHED')):
        print('Quit! Already finish running for %s' % args.name)
        return

    # Set seed
    if args.seed is not None:
        lib.utils.seed_everything(args.seed)

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Data
    with lib.utils.Timer(f'Load dataset {args.dataset}'):
        data = lib.DATASETS[args.dataset.upper()](path='./data', fold=args.fold)

    qn = args.quantile_noise if getattr(args, 'quantile_noise', None) is not None \
        else data.get('quantile_noise', 1e-3)
    preprocessor = lib.MyPreprocessor(
        cat_features=data.get('cat_features', None),
        y_normalize=(data['problem'] == 'regression'),
        random_state=1337,
        quantile_transform=True,
        output_distribution=args.quantile_dist,
        quantile_noise=qn,
        n_quantiles=args.n_quantiles,
    )

    X_train, y_train = data['X_train'], data['y_train']
    preprocessor.fit(X_train, y_train)

    if args.data_subsample > 1.:
        args.data_subsample = int(args.data_subsample)

    # Do not subsample data in the pretraining!
    if args.pretrain == 0 and args.data_subsample != 1. \
            and args.data_subsample < X_train.shape[0]:
        print(f'Subsample the data by ds={args.data_subsample}')
        X_train, _, y_train, _ = train_test_split(
            X_train, y_train,
            train_size=args.data_subsample,
            random_state=1377,
            stratify=(y_train if data['problem'] == 'classification' else None))

    use_data_val = ('X_valid' in data and 'y_valid' in data
                    and (not args.split_train_as_val))
    if use_data_val:
        X_valid, y_valid = data['X_valid'], data['y_valid']
    else:
        # Merge with the valid set, and cut it ourselves
        if 'X_valid' in data:
            X_train = pd.concat([X_train, data['X_valid']], axis=0)
            y_train = np.concatenate([y_train, data['y_valid']], axis=0)
        X_train, X_valid, y_train, y_valid = train_test_split(
            X_train, y_train,
            test_size=0.2,
            random_state=1377,
            stratify=(y_train if data['problem'] == 'classification' else None))

    # Transform dataset
    X_train, y_train = preprocessor.transform(X_train, y_train)
    X_valid, y_valid = preprocessor.transform(X_valid, y_valid)
    X_test, y_test = preprocessor.transform(data['X_test'], data['y_test'])

    # Save preprocessor
    with open(pjoin('logs', args.name, 'preprocessor.pkl'), 'wb') as op:
        pickle.dump(preprocessor, op)

    metric = data.get('metric', ('classification_error'
                                 if data['problem'] == 'classification' else 'mse'))

    # Modify args based on the dataset
    args.in_features = X_train.shape[1]
    args.problem = data['problem']
    args.num_classes = data.get('num_classes', 1)
    args.data_addi_tree_dim = data.get('addi_tree_dim', 0)

    # Modify based on if doing pretraining!
    if args.pretrain > 0:
        assert args.pretraining_ratio > 0.
        if args.pretrain == 1:
            args.problem = 'pretrain_mask'
        elif args.pretrain == 2:
            args.problem = 'pretrain_recon'
        elif args.pretrain == 3:
            args.problem = 'pretrain_recon2'
        else:
            raise NotImplementedError('Wrong pretrain: ' + str(args.pretrain))
        metric = 'pretrain_loss'
        args.num_classes = args.in_features
        args.data_addi_tree_dim = (-args.in_features) + 1
        # Use both train/val as training set, and use test as val
        X_train, X_valid = np.concatenate([X_train, X_valid], axis=0), X_test
        y_train, y_valid = X_train, X_valid  # reconstruction targets are the inputs

    print(f'X_train: {X_train.shape}, X_valid: {X_valid.shape}, X_test: {X_test.shape}')

    # Model
    model, step_callbacks = getattr(lib.arch, args.arch + 'Block').load_model_by_hparams(
        args, ret_step_callback=True)

    # Initialize bias before sending to cuda
    if 'init_bias' in args and args.init_bias and args.problem == 'classification':
        model.set_bias(y_train)
    model.to(device)
    # if torch.cuda.device_count() > 1:
    #     model = nn.DataParallel(model)

    # Load from pretrained model. Since last fc layer has diff size
    if getattr(args, 'load_from_pretrain', None) is not None:
        print("=> using pre-trained model '{}'".format(args.load_from_pretrain))
        path = pjoin('logs', args.load_from_pretrain, "checkpoint_best.pth")
        checkpoint = torch.load(path)
        model_state = model.state_dict()
        pretrained_state = {
            k: v
            for k, v in checkpoint['model'].items()
            if k in model_state and v.size() == model_state[k].size()
        }
        print('Pre-load the following weights:')
        print(list(pretrained_state.keys()))
        print('Ignore the following weights:')
        print([k for k in model_state if k not in pretrained_state])
        model_state.update(pretrained_state)
        model.load_state_dict(model_state)

    from qhoptim.pyt import QHAdam
    optimizer_params = {'nus': (0.7, 1.0), 'betas': (0.95, 0.998)}

    trainer = lib.Trainer(
        model=model,
        experiment_name=args.name,
        warm_start=True,  # To handle the interruption on v server
        Optimizer=QHAdam,
        optimizer_params=optimizer_params,
        lr=args.lr,
        lr_warmup_steps=args.lr_warmup_steps,
        verbose=False,
        n_last_checkpoints=5,
        step_callbacks=step_callbacks,  # Temperature annealing
        fp16=args.fp16,
        problem=args.problem,
        pretraining_ratio=args.pretraining_ratio,
        opt_only_last_layer=(args.load_from_pretrain is not None
                             and args.opt_only_last_layer),
        freeze_steps=(0 if args.load_from_pretrain is None else args.freeze_steps),
    )

    assert metric in [
        'negative_auc', 'classification_error', 'mse', 'multiple_mse', 'pretrain_loss'
    ]
    eval_fn = getattr(trainer, 'evaluate_' + metric)

    # Before we start, we will need to select the batch size if unspecified
    if args.batch_size is None or args.batch_size < 0:
        assert device != 'cpu', 'Have to specify batch size when using CPU'
        args.batch_size = choose_batch_size(trainer, X_train, y_train, device,
                                            max_bs=args.max_bs, min_bs=args.min_bs)
    else:
        try:
            with torch.no_grad():
                res = model(torch.as_tensor(X_train[:(2 * args.batch_size)],
                                            device=device))  # trigger data-aware init
        except RuntimeError as e:
            handle_oom_error(e, args)

    # Then show hparams after deciding the batch size
    print("experiment:", args.name)
    print("Args:")
    print(args)

    # Then record hparams
    saved_args = pjoin('logs', args.name, 'hparams.json')
    json.dump(vars(args), open(saved_args, 'w'))
    # Record hparams again, since logs/{args.name} will be deleted!
    os.makedirs(pjoin('logs', 'hparams'), exist_ok=True)
    json.dump(vars(args), open(pjoin('logs', 'hparams', args.name), 'w'))

    # To make sure when rerunning the err history and time are accurate,
    # we save the whole history in training.json.
    recorder = lib.Recorder(path=pjoin('logs', args.name))
    ntf_diff, ntf = 0., None  # Record number of trees assigned to each feature

    st_time = time.time()
    for batch in lib.iterate_minibatches(X_train, y_train,
                                         batch_size=args.batch_size,
                                         shuffle=True,
                                         epochs=float('inf')):
        # Handle removing missing by sampling from a Gaussian!
        try:
            metrics = trainer.train_on_batch(*batch, device=device)
        except RuntimeError as e:
            handle_oom_error(e, args)

        if recorder.loss_history is not None:
            recorder.loss_history.append(float(metrics['loss']))

        if trainer.step % args.report_frequency == 0:
            trainer.save_checkpoint()
            trainer.remove_old_temp_checkpoints()
            trainer.average_checkpoints(out_tag='avg')
            trainer.load_checkpoint(tag='avg')

            err = eval_fn(X_valid, y_valid, device=device,
                          batch_size=args.batch_size * 2)

            # Handle per-task early stopping when metric='multiple_mse'
            if metric == 'multiple_mse':
                # Initialize
                if not isinstance(recorder.best_err, list):
                    recorder.best_err = [float('inf') for _ in range(len(err))]
                    recorder.best_step_err = [0 for _ in range(len(err))]
                for idx, (e, be) in enumerate(zip(err, recorder.best_err)):
                    if e < be:
                        recorder.best_err[idx] = e
                        recorder.best_step_err[idx] = trainer.step
                        trainer.save_checkpoint(tag='best_t%d' % idx)
                if recorder.err_history is not None:
                    recorder.err_history.append(np.mean(err))
            else:
                if err < recorder.best_err:
                    recorder.best_err = err
                    recorder.best_step_err = trainer.step
                    trainer.save_checkpoint(tag='best')
                if recorder.err_history is not None:
                    recorder.err_history.append(err)

            recorder.step = trainer.step
            recorder.run_time += float(time.time() - st_time)
            st_time = time.time()
            recorder.save_record()

            trainer.load_checkpoint()  # last

            if recorder.loss_history is not None and recorder.err_history is not None:
                save_loss_fig(recorder.loss_history, recorder.err_history,
                              pjoin('loss_figs', f'{args.name}.jpg'))

            cur_ntf = trainer.model.get_num_trees_assigned_to_each_feature()
            if cur_ntf is None:  # ODST has no NTF
                ntf_diff = 0.
            else:
                if ntf is not None:
                    ntf_diff = (torch.sum(torch.abs(cur_ntf - ntf)) * 100.0
                                / torch.sum(cur_ntf)).item()
                ntf = cur_ntf

            if trainer.step == 1:
                print("Step\tVal_Err\tTime(s)\tNTF(%)")
            print('{}\t{}\t{:.0f}\t{:.2f}%'.format(trainer.step, np.around(err, 5),
                                                   recorder.run_time, ntf_diff))

            bstep = recorder.best_step_err
            if isinstance(bstep, list):
                bstep = np.max(bstep)

            min_steps = max(bstep, getattr(args, 'anneal_steps', -1))
            if trainer.step > min_steps + args.early_stopping_rounds:
                print('BREAK. There is no improvement for {} steps'.format(
                    args.early_stopping_rounds))
                break

            if args.lr_decay_steps > 0 \
                    and trainer.step > bstep + args.lr_decay_steps \
                    and trainer.step > (recorder.lr_decay_step + args.lr_decay_steps):
                lr_before = trainer.lr
                trainer.decrease_lr(ratio=0.2, min_lr=1e-6)
                recorder.lr_decay_step = trainer.step
                print('LR: %.2e -> %.2e' % (lr_before, trainer.lr))

            if 0 < args.max_rounds < trainer.step:
                print('End. It reaches the maximum rounds %d' % args.max_rounds)
                break

            if recorder.run_time > args.max_time:
                print('End. It reaches the maximum run time %d (s)' % args.max_time)
                break

    print("Best step: ", recorder.best_step_err)
    print("Best Val Error: ", recorder.best_err)

    if args.pretrain:
        # Submit another sbatch job for the real training.
        print('***** FINISH pretraining! *****')
    else:
        max_step = trainer.step

        # Run test time
        if metric != 'multiple_mse':
            trainer.load_checkpoint(tag='best')
            test_err = eval_fn(X_test, y_test, device=device,
                               batch_size=2 * args.batch_size)
        else:
            test_err = []
            for idx in range(len(recorder.best_err)):
                trainer.load_checkpoint(tag='best_t%d' % idx)
                tmp = eval_fn(X_test, y_test, device=device,
                              batch_size=2 * args.batch_size)
                test_err.append(tmp[idx])
        print("Test Error rate: {}".format(test_err))

        # Save csv results
        results = dict()
        results['test_err'] = test_err
        results['val_err'] = recorder.best_err
        results['best_step_err'] = recorder.best_step_err
        results['max_step'] = max_step
        results['time(s)'] = '%d' % recorder.run_time
        results['fold'] = args.fold
        results['fp16'] = args.fp16
        results['batch_size'] = args.batch_size
        results['finetuned'] = int(args.load_from_pretrain is not None)

        # Append the hyperparameters
        rs_hparams = getattr(lib.arch, args.arch + 'Block').get_model_specific_rs_hparams()
        for k in rs_hparams:
            results[k] = getattr(args, k)
        results = getattr(lib.arch, args.arch + 'Block').add_model_specific_results(
            results, args)
        results['name'] = args.name

        os.makedirs('results', exist_ok=True)
        dataset_postfix = f'_ds{args.data_subsample}' if args.data_subsample != 1. else ''
        if metric != 'multiple_mse':
            csv_file = f'results/{args.dataset}{dataset_postfix}_{args.arch}_new10.csv'
            lib.utils.output_csv(csv_file, results)
        else:
            csv_file = f'results/{args.dataset}{dataset_postfix}_{args.arch}_new10.ssv'
            lib.utils.output_csv(csv_file, results, delimiter=';')
        print('output results to %s' % csv_file)

    # Clean up
    open(pjoin('logs', args.name, 'MY_IS_FINISHED'), 'a')
    trainer.remove_old_temp_checkpoints(number_ckpts_to_keep=0)
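# A worked example (all values invented) of the early-stopping window used in
# the training loop above:
#   best_step_err = 4000, anneal_steps = 5000, early_stopping_rounds = 1000
#   min_steps = max(4000, 5000) = 5000
#   -> training may only break once trainer.step exceeds 5000 + 1000 = 6000,
#      so temperature annealing always gets its full budget before early
#      stopping can apply. The lr-decay trigger is independent: it fires once
#      trainer.step exceeds both best_step_err + lr_decay_steps and
#      lr_decay_step + lr_decay_steps.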
def main(): print("Loading train data from {}".format( os.path.join(args.data_folder, args.train_data))) print("Loading valid data from {}".format( os.path.join(args.data_folder, args.valid_data))) train_data = lib.Dataset(os.path.join(args.data_folder, args.train_data)) valid_data = lib.Dataset(os.path.join(args.data_folder, args.valid_data), itemmap=train_data.itemmap) make_checkpoint_dir() #set all the parameters according to the defined arguments input_size = len(train_data.items) hidden_size = args.hidden_size num_layers = args.num_layers output_size = input_size batch_size = args.batch_size dropout_input = args.dropout_input dropout_hidden = args.dropout_hidden embedding_dim = args.embedding_dim final_act = args.final_act loss_type = args.loss_type optimizer_type = args.optimizer_type lr = args.lr weight_decay = args.weight_decay momentum = args.momentum eps = args.eps n_epochs = args.n_epochs time_sort = args.time_sort #loss function loss_function = lib.LossFunction( loss_type=loss_type, use_cuda=args.cuda) #cuda is used with cross entropy only if not args.is_eval: #training #Initialize the model model = lib.GRU4REC(input_size, hidden_size, output_size, final_act=final_act, num_layers=num_layers, use_cuda=args.cuda, batch_size=batch_size, dropout_input=dropout_input, dropout_hidden=dropout_hidden, embedding_dim=embedding_dim) #weights initialization init_model(model) #optimizer optimizer = lib.Optimizer(model.parameters(), optimizer_type=optimizer_type, lr=lr, weight_decay=weight_decay, momentum=momentum, eps=eps) #trainer class trainer = lib.Trainer(model, train_data=train_data, eval_data=valid_data, optim=optimizer, use_cuda=args.cuda, loss_func=loss_function, batch_size=batch_size, args=args) print('#### START TRAINING....') trainer.train(0, n_epochs - 1) else: #testing if args.load_model is not None: print("Loading pre-trained model from {}".format(args.load_model)) try: checkpoint = torch.load(args.load_model) except: checkpoint = torch.load( args.load_model, map_location=lambda storage, loc: storage) model = checkpoint["model"] model.gru.flatten_parameters() evaluation = lib.Evaluation(model, loss_function, use_cuda=args.cuda, k=args.k_eval) loss, recall, mrr = evaluation.eval(valid_data, batch_size) print("Final result: recall = {:.2f}, mrr = {:.2f}".format( recall, mrr)) else: print("No Pretrained Model was found!")