def train(self, model: nn.Module, data_dict: Dict[str, BaseModel.Dataset]) -> None:
    main_metric_results, dev_results, test_results = list(), list(), list()
    self._check_time(start=True)
    try:
        for epoch in range(self.epoch):
            # Fit
            self._check_time()
            loss = self.fit(model, data_dict['train'], epoch=epoch + 1)
            training_time = self._check_time()

            # Observe selected tensors
            if len(model.check_list) > 0 and self.check_epoch > 0 and epoch % self.check_epoch == 0:
                utils.check(model.check_list)

            # Record dev and test results
            dev_result = self.evaluate(model, data_dict['dev'], self.topk[:1], self.metrics)
            test_result = self.evaluate(model, data_dict['test'], self.topk[:1], self.metrics)
            testing_time = self._check_time()
            dev_results.append(dev_result)
            test_results.append(test_result)
            main_metric_results.append(dev_result[self.main_metric])
            logging.info("Epoch {:<5} loss={:<.4f} [{:<.1f} s]\t dev=({}) test=({}) [{:<.1f} s] ".format(
                epoch + 1, loss, training_time, utils.format_metric(dev_result),
                utils.format_metric(test_result), testing_time))

            # Save model and early stop
            if max(main_metric_results) == main_metric_results[-1] or \
                    (hasattr(model, 'stage') and model.stage == 1):
                model.save_model()
            if self.early_stop and self.eval_termination(main_metric_results):
                logging.info("Early stop at %d based on dev result." % (epoch + 1))
                break
    except KeyboardInterrupt:
        logging.info("Early stop manually")
        exit_here = input("Exit completely without evaluation? (y/n) (default n):")
        if exit_here.lower().startswith('y'):
            logging.info(os.linesep + '-' * 45 + ' END: ' + utils.get_time() + ' ' + '-' * 45)
            exit(1)

    # Find the best dev result across iterations
    best_epoch = main_metric_results.index(max(main_metric_results))
    logging.info(os.linesep + "Best Iter(dev)={:>5}\t dev=({}) test=({}) [{:<.1f} s] ".format(
        best_epoch + 1, utils.format_metric(dev_results[best_epoch]),
        utils.format_metric(test_results[best_epoch]), self.time[1] - self.time[0]))
    model.load_model()
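# The loop above delegates early stopping to self.eval_termination. A minimal
# sketch of one plausible criterion, assuming self.early_stop holds the
# patience in epochs and the dev metric is higher-is-better; the actual
# criterion in this codebase may differ:
def eval_termination(self, criterion):
    # Stop once the best dev result is more than `early_stop` epochs old
    best_pos = criterion.index(max(criterion))
    return len(criterion) - best_pos > self.early_stop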
def print_res(self, model: nn.Module, data: BaseModel.Dataset) -> str:
    """
    Construct the final result string before/after training
    :return: test result string
    """
    result_dict = self.evaluate(model, data, self.topk, self.metrics)
    res_str = '(' + utils.format_metric(result_dict) + ')'
    return res_str
def print_res(self, X, y):
    """
    Construct the final test result string before/after training
    :return: test result string
    """
    result_dict = self.evaluate(X, y)
    res_str = '(' + utils.format_metric(result_dict) + ')'
    return res_str
def print_res(self, model, corpus):
    """
    Construct the final test result string before/after training
    :return: test result string
    """
    phase, res_str = 'test', ''
    result_dict = self.evaluate(model, corpus, phase, self.topk, self.metrics)
    res_str += '(' + utils.format_metric(result_dict) + ')'
    return res_str
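# All three print_res variants above flatten a metric dict through
# utils.format_metric. A minimal sketch of such a helper, assuming a dict of
# metric names to float values as produced by evaluate (other call sites in
# this codebase pass lists, which the real helper presumably also handles):
def format_metric(result_dict):
    # e.g. {'HR@10': 0.3412, 'NDCG@10': 0.1987} -> 'HR@10:0.3412,NDCG@10:0.1987'
    return ','.join('{}:{:<.4f}'.format(k, v) for k, v in result_dict.items())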
def main(args):
    logging.info('-' * 45 + ' BEGIN: ' + utils.get_time() + ' ' + '-' * 45)
    exclude = ['check_epoch', 'log_file', 'model_path', 'path', 'pin_memory',
               'regenerate', 'sep', 'train', 'verbose', 'load', 'buffer']
    logging.info(utils.format_arg_str(args, exclude_lst=exclude))

    # Random seed
    np.random.seed(args.random_seed)

    # Read data
    dataloader = DataLoader.DataLoader(args)
    dataloader._load_data()

    # Define model
    model = model_name(args)

    # Run the model on each of the 5 folds and collect per-fold metrics
    evaluations_list = {}
    for i in range(5):
        model.fit(dataloader.train_feature[i], dataloader.train_label[i])
        evaluations = model.print_res(dataloader.test_feature[i], dataloader.test_label[i])
        evaluation_results = model.evaluate(dataloader.test_feature[i], dataloader.test_label[i])
        for key in evaluation_results:
            if key not in evaluations_list:
                evaluations_list[key] = []
            evaluations_list[key].append(evaluation_results[key])
        logging.info('Test Results at fold {}: {}'.format(i, evaluations))

    # Average metrics across folds
    evaluations_all = {}
    for key in evaluations_list:
        evaluations_all[key] = np.mean(evaluations_list[key])
    logging.info("Average results: {}".format(utils.format_metric(evaluations_all)))
def train(self, model, data_processor):
    """
    Train the model
    :param model: the model
    :param data_processor: a DataProcessor instance
    :return:
    """
    # Get train/validation/test data; epoch=-1 means no shuffle
    train_data = data_processor.get_train_data(epoch=-1, model=model)
    validation_data = data_processor.get_validation_data(model=model)
    test_data = data_processor.get_test_data(model=model) if data_processor.unlabel_test == 0 else None
    self._check_time(start=True)  # record the start time

    # Model performance before training
    init_train = self.evaluate(model, train_data, data_processor) \
        if train_data is not None else [-1.0] * len(self.metrics)
    init_valid = self.evaluate(model, validation_data, data_processor) \
        if validation_data is not None else [-1.0] * len(self.metrics)
    init_test = self.evaluate(model, test_data, data_processor) \
        if test_data is not None and data_processor.unlabel_test == 0 else [-1.0] * len(self.metrics)
    logging.info("Init: \t train= %s validation= %s test= %s [%.1f s] " % (
        utils.format_metric(init_train), utils.format_metric(init_valid),
        utils.format_metric(init_test), self._check_time()) + ','.join(self.metrics))

    try:
        for epoch in range(self.epoch):
            self._check_time()
            # Re-fetch training data each epoch: it is shuffled, and for top-n
            # recommendation negative examples are re-sampled
            epoch_train_data = data_processor.get_train_data(epoch=epoch, model=model)
            train_predictions, last_batch, mean_loss, mean_loss_l2 = \
                self.fit(model, epoch_train_data, data_processor, epoch=epoch)

            # Check intermediate results of the model
            if self.check_epoch > 0 and (epoch == 1 or epoch % self.check_epoch == 0):
                last_batch['mean_loss'] = mean_loss
                last_batch['mean_loss_l2'] = mean_loss_l2
                self.check(model, last_batch)
            training_time = self._check_time()

            # Evaluate model performance
            train_result = [mean_loss] + model.evaluate_method(
                train_predictions, train_data, metrics=['rmse'])
            valid_result = self.evaluate(model, validation_data, data_processor) \
                if validation_data is not None else [-1.0] * len(self.metrics)
            test_result = self.evaluate(model, test_data, data_processor) \
                if test_data is not None and data_processor.unlabel_test == 0 else [-1.0] * len(self.metrics)
            testing_time = self._check_time()

            self.train_results.append(train_result)
            self.valid_results.append(valid_result)
            self.test_results.append(test_result)

            # Log current performance
            logging.info("Epoch %5d [%.1f s]\t train= %s validation= %s test= %s [%.1f s] " % (
                epoch + 1, training_time, utils.format_metric(train_result),
                utils.format_metric(valid_result), utils.format_metric(test_result),
                testing_time) + ','.join(self.metrics))

            # Save the model if the current validation result is the best so far
            if utils.best_result(self.metrics[0], self.valid_results) == self.valid_results[-1]:
                model.save_model()
                # model.save_model(model_path='../model/variable_tsne_logic_epoch/variable_tsne_logic_epoch_%d.pt' % (epoch + 1))
            # Check whether to stop training, based on validation results
            if self.eva_termination(model) and self.early_stop == 1:
                logging.info("Early stop at %d based on validation result." % (epoch + 1))
                break
    except KeyboardInterrupt:
        logging.info("Early stop manually")
        save_here = input("Save here? (1/0) (default 0):")
        if str(save_here).lower().startswith('1'):
            model.save_model()

    # Find the best validation result across iterations
    best_valid_score = utils.best_result(self.metrics[0], self.valid_results)
    best_epoch = self.valid_results.index(best_valid_score)
    logging.info("Best Iter(validation)= %5d\t train= %s valid= %s test= %s [%.1f s] " % (
        best_epoch + 1,
        utils.format_metric(self.train_results[best_epoch]),
        utils.format_metric(self.valid_results[best_epoch]),
        utils.format_metric(self.test_results[best_epoch]),
        self.time[1] - self.time[0]) + ','.join(self.metrics))
    best_test_score = utils.best_result(self.metrics[0], self.test_results)
    best_epoch = self.test_results.index(best_test_score)
    logging.info("Best Iter(test)= %5d\t train= %s valid= %s test= %s [%.1f s] " % (
        best_epoch + 1,
        utils.format_metric(self.train_results[best_epoch]),
        utils.format_metric(self.valid_results[best_epoch]),
        utils.format_metric(self.test_results[best_epoch]),
        self.time[1] - self.time[0]) + ','.join(self.metrics))
    model.load_model()
def main():
    # init args
    init_parser = argparse.ArgumentParser(description='Model', add_help=False)
    init_parser.add_argument('--rank', type=int, default=1,
                             help='1=ranking, 0=rating/click')
    init_parser.add_argument('--data_loader', type=str, default='',
                             help='Data loader')
    init_parser.add_argument('--model_name', type=str, default='NLRRec',
                             help='Model name')
    init_parser.add_argument('--runner_name', type=str, default='',
                             help='Runner')
    init_parser.add_argument('--data_processor', type=str, default='',
                             help='Data processor')
    init_args, init_extras = init_parser.parse_known_args()

    # choose model
    model_name = eval(init_args.model_name)
    # choose data_loader
    if init_args.data_loader == '':
        init_args.data_loader = model_name.data_loader
    data_loader_name = eval(init_args.data_loader)
    # choose data_processor
    if init_args.data_processor == '':
        init_args.data_processor = model_name.data_processor
    data_processor_name = eval(init_args.data_processor)
    # choose runner
    if init_args.runner_name == '':
        init_args.runner_name = model_name.runner
    runner_name = eval(init_args.runner_name)

    # cmd line paras
    parser = argparse.ArgumentParser(description='')
    parser = utils.parse_global_args(parser)
    parser = data_loader_name.parse_data_args(parser)
    parser = model_name.parse_model_args(parser, model_name=init_args.model_name)
    parser = runner_name.parse_runner_args(parser)
    parser = data_processor_name.parse_dp_args(parser)
    origin_args, extras = parser.parse_known_args()

    # log, model, result filenames
    paras = sorted(vars(origin_args).items(), key=lambda kv: kv[0])
    log_name_exclude = ['check_epoch', 'eval_batch_size', 'gpu', 'label', 'load',
                        'log_file', 'metrics', 'model_path', 'path', 'pre_gpu',
                        'result_file', 'sep', 'seq_sep', 'train', 'unlabel_test',
                        'verbose', 'dataset', 'random_seed']
    log_file_name = [str(init_args.rank) + str(origin_args.drop_neg),
                     init_args.model_name, origin_args.dataset, str(origin_args.random_seed)] + \
                    [p[0].replace('_', '')[:3] + str(p[1]) for p in paras if p[0] not in log_name_exclude]
    log_file_name = [l.replace(' ', '-').replace('_', '-') for l in log_file_name]
    log_file_name = '_'.join(log_file_name)
    if origin_args.log_file == os.path.join(LOG_DIR, 'log.txt'):
        origin_args.log_file = os.path.join(LOG_DIR, '%s/%s.txt' % (init_args.model_name, log_file_name))
    utils.check_dir_and_mkdir(origin_args.log_file)
    if origin_args.result_file == os.path.join(RESULT_DIR, 'result.npy'):
        origin_args.result_file = os.path.join(RESULT_DIR, '%s/%s.npy' % (init_args.model_name, log_file_name))
    utils.check_dir_and_mkdir(origin_args.result_file)
    if origin_args.model_path == os.path.join(MODEL_DIR, '%s/%s.pt' % (init_args.model_name, init_args.model_name)):
        origin_args.model_path = os.path.join(MODEL_DIR, '%s/%s.pt' % (init_args.model_name, log_file_name))
    utils.check_dir_and_mkdir(origin_args.model_path)
    args = copy.deepcopy(origin_args)

    # logging
    for handler in logging.root.handlers[:]:
        logging.root.removeHandler(handler)
    logging.basicConfig(filename=args.log_file, level=args.verbose)
    logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))
    logging.info(vars(init_args))
    logging.info(vars(origin_args))
    logging.info(extras)
    logging.info('DataLoader: ' + init_args.data_loader)
    logging.info('Model: ' + init_args.model_name)
    logging.info('Runner: ' + init_args.runner_name)
    logging.info('DataProcessor: ' + init_args.data_processor)

    # random seed
    torch.manual_seed(args.random_seed)
    torch.cuda.manual_seed(args.random_seed)
    np.random.seed(args.random_seed)

    # cuda
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu  # default '0'
    logging.info("# cuda devices: %d" % torch.cuda.device_count())

    # create data_loader
    args.load_data = True
    dl_para_dict = utils.get_init_paras_dict(data_loader_name, vars(args))
    logging.info(init_args.data_loader + ': ' + str(dl_para_dict))
    data_loader = data_loader_name(**dl_para_dict)

    # Appending interaction history is handled by the data_loader
    if 'all_his' in origin_args:
        data_loader.append_his(all_his=origin_args.all_his, max_his=origin_args.max_his,
                               neg_his=origin_args.neg_his, neg_column=origin_args.neg_column)

    # For top-n recommendation, keep only positive examples (negatives are
    # sampled during training) and binarize labels to 0/1
    if init_args.rank == 1:
        data_loader.label_01()
        if origin_args.drop_neg == 1:
            data_loader.drop_neg()

    # create data_processor
    args.data_loader, args.rank = data_loader, init_args.rank
    dp_para_dict = utils.get_init_paras_dict(data_processor_name, vars(args))
    logging.info(init_args.data_processor + ': ' + str(dp_para_dict))
    data_processor = data_processor_name(**dp_para_dict)

    # Prepare train/validation/test samples before the model is created and
    # trained, so the same random seed yields the same test negatives across models
    data_processor.get_train_data(epoch=-1, model=model_name)
    data_processor.get_validation_data(model=model_name)
    data_processor.get_test_data(model=model_name)

    # create model
    # Based on what the model needs, derive the dataset features, the total
    # one-hot/multi-hot dimensions, and the min/max value of each feature field
    features, feature_dims, feature_min, feature_max = \
        data_loader.feature_info(include_id=model_name.include_id,
                                 include_item_features=model_name.include_item_features,
                                 include_user_features=model_name.include_user_features)
    args.feature_num, args.feature_dims = len(features), feature_dims
    args.user_feature_num = len([f for f in features if f.startswith('u_')])
    args.item_feature_num = len([f for f in features if f.startswith('i_')])
    args.context_feature_num = len([f for f in features if f.startswith('c_')])
    data_loader_vars = vars(data_loader)
    for key in data_loader_vars:
        if key not in args.__dict__:
            args.__dict__[key] = data_loader_vars[key]
    model_para_dict = utils.get_init_paras_dict(model_name, vars(args))
    logging.info(init_args.model_name + ': ' + str(model_para_dict))
    model = model_name(**model_para_dict)

    # init model paras
    model.apply(model.init_paras)

    # use gpu
    if torch.cuda.device_count() > 0:
        model = model.cuda()

    # create runner
    runner_para_dict = utils.get_init_paras_dict(runner_name, vars(args))
    logging.info(init_args.runner_name + ': ' + str(runner_para_dict))
    runner = runner_name(**runner_para_dict)

    # training/testing
    logging.info('Test Before Training: train= %s validation= %s test= %s' % (
        utils.format_metric(runner.evaluate(
            model, data_processor.get_train_data(epoch=-1, model=model), data_processor)),
        utils.format_metric(runner.evaluate(
            model, data_processor.get_validation_data(model=model), data_processor)),
        utils.format_metric(runner.evaluate(
            model, data_processor.get_test_data(model=model), data_processor))
        if args.unlabel_test == 0 else '-1') + ' ' + ','.join(runner.metrics))

    # If load > 0, load a saved model and continue training
    if args.load > 0:
        model.load_model()
    # If train > 0, train the model; otherwise test directly
    if args.train > 0:
        runner.train(model, data_processor)

    # save test results
    train_result = runner.predict(
        model, data_processor.get_train_data(epoch=-1, model=model), data_processor)
    validation_result = runner.predict(
        model, data_processor.get_validation_data(model=model), data_processor)
    test_result = runner.predict(
        model, data_processor.get_test_data(model=model), data_processor)
    np.save(args.result_file.replace('.npy', '__train.npy'), train_result)
    np.save(args.result_file.replace('.npy', '__validation.npy'), validation_result)
    np.save(args.result_file.replace('.npy', '__test.npy'), test_result)
    logging.info('Save Results to ' + args.result_file)

    all_metrics = ['rmse', 'mae', 'auc', 'f1', 'accuracy', 'precision', 'recall']
    if init_args.rank == 1:
        all_metrics = ['ndcg@1', 'ndcg@5', 'ndcg@10', 'ndcg@20', 'ndcg@50', 'ndcg@100'] \
                      + ['hit@1', 'hit@5', 'hit@10', 'hit@20', 'hit@50', 'hit@100'] \
                      + ['precision@1', 'precision@5', 'precision@10', 'precision@20', 'precision@50', 'precision@100'] \
                      + ['recall@1', 'recall@5', 'recall@10', 'recall@20', 'recall@50', 'recall@100']
    results = [train_result, validation_result, test_result]
    name_map = ['Train', 'Valid', 'Test']
    datasets = [data_processor.get_train_data(epoch=-1, model=model),
                data_processor.get_validation_data(model=model)]
    if args.unlabel_test != 1:
        datasets.append(data_processor.get_test_data(model=model))
    for i, dataset in enumerate(datasets):
        metrics = model.evaluate_method(results[i], datasets[i],
                                        metrics=all_metrics, error_skip=True)
        log_info = 'Test After Training on %s: ' % name_map[i]
        log_metrics = ['%s=%s' % (metric, utils.format_metric(metrics[j]))
                       for j, metric in enumerate(all_metrics)]
        log_info += ', '.join(log_metrics)
        logging.info(os.linesep + log_info + os.linesep)

    if args.verbose <= logging.DEBUG:
        if args.unlabel_test == 0:
            logging.debug(runner.evaluate(
                model, data_processor.get_test_data(model=model), data_processor))
            logging.debug(runner.evaluate(
                model, data_processor.get_test_data(model=model), data_processor))
        else:
            logging.debug(runner.evaluate(
                model, data_processor.get_validation_data(model=model), data_processor))
            logging.debug(runner.evaluate(
                model, data_processor.get_validation_data(model=model), data_processor))

    logging.info('# of params: %d' % model.total_parameters)
    logging.info(vars(origin_args))
    logging.info(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
    return
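# The main() above repeatedly uses utils.get_init_paras_dict to filter the
# flat args namespace down to the parameters a class constructor actually
# accepts. A minimal sketch of such a helper built on the standard inspect
# module (the version in utils may differ in detail):
import inspect

def get_init_paras_dict(class_type, paras_dict):
    # Collect constructor argument names across the whole class hierarchy
    paras_list = []
    for base in inspect.getmro(class_type):
        paras_list.extend(inspect.signature(base.__init__).parameters.keys())
    out_dict = {}
    for para in sorted(set(paras_list)):
        if para in ('self', 'args', 'kwargs'):
            continue
        if para in paras_dict:
            out_dict[para] = paras_dict[para]
    return out_dict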
def main():
    init_parser = argparse.ArgumentParser(description='Model')
    init_parser.add_argument('--model_name', type=str, default='BaseModel',
                             help='Choose model to run.')
    # init_parser.add_argument('--runner_name', type=str, default='BaseRunner',
    #                          help='Choose runner to run.')
    init_args, init_extras = init_parser.parse_known_args()
    model_name = eval(init_args.model_name)
    init_args.runner_name = 'BaseRunner'
    runner_name = eval(init_args.runner_name)

    parser = argparse.ArgumentParser(description='')
    parser = utils.parse_global_args(parser)
    parser = BaseDataLoader.parse_data_args(parser)
    parser = model_name.parse_model_args(parser, model_name=init_args.model_name)
    parser = runner_name.parse_runner_args(parser)
    args, extras = parser.parse_known_args()
    args = model_name.add_model_args(args)

    log_file_name = [init_args.model_name, args.dataset, str(args.random_seed),
                     init_args.runner_name,
                     'optimizer=' + args.optimizer, 'lr=' + str(args.lr),
                     'l2=' + str(args.l2), 'dropout=' + str(args.dropout),
                     'batch_size=' + str(args.batch_size)]
    log_file_name = '__'.join(log_file_name).replace(' ', '__')
    if args.log_file == '../log/log.txt':
        args.log_file = '../log/%s.txt' % log_file_name
    if args.result_file == '../result/result.npy':
        args.result_file = '../result/%s.npy' % log_file_name
    if args.model_path == '../model/%s/%s.ckpt' % (init_args.model_name, init_args.model_name):
        args.model_path = '../model/%s/%s.ckpt' % (init_args.model_name, log_file_name)

    logging.basicConfig(filename=args.log_file, level=args.verbose)
    logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))
    logging.info(init_args)
    logging.info(args)

    tf.set_random_seed(args.random_seed)
    np.random.seed(args.random_seed)
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu

    data = BaseDataLoader(path=args.path, dataset=args.dataset, sep=args.sep,
                          label=args.label, append_id=args.append_id,
                          include_id=args.include_id,
                          balance_train=args.balance_data > 0)

    if init_args.model_name in ['BaseModel']:
        model = model_name(class_num=data.class_num, feature_num=len(data.features),
                           random_seed=args.random_seed, model_path=args.model_path)
    elif init_args.model_name in ['DeepModel', 'NFM', 'WideDeep']:
        model = model_name(class_num=data.class_num, feature_num=len(data.features),
                           feature_dims=data.feature_dims,
                           f_vector_size=args.f_vector_size, layers=eval(args.layers),
                           random_seed=args.random_seed, model_path=args.model_path)
    elif init_args.model_name in ['FSNFM', 'FSWideDeep']:
        model = model_name(class_num=data.class_num, feature_num=len(data.features),
                           feature_dims=data.feature_dims,
                           f_vector_size=args.f_vector_size, layers=eval(args.layers),
                           random_seed=args.random_seed, model_path=args.model_path)
    elif init_args.model_name in ['RecModel', 'BiasedMF', 'CSRecModel']:
        model = model_name(class_num=data.class_num, feature_num=len(data.features),
                           user_num=data.user_num, item_num=data.item_num,
                           u_vector_size=args.u_vector_size, i_vector_size=args.i_vector_size,
                           random_seed=args.random_seed, model_path=args.model_path)
    elif init_args.model_name in ['ACCM', 'CCCC', 'FSACCM', 'FSCCCC']:
        model = model_name(class_num=data.class_num, feature_num=len(data.features),
                           user_num=data.user_num, item_num=data.item_num,
                           user_feature_num=len(data.user_features),
                           item_feature_num=len(data.item_features),
                           feature_dims=data.feature_dims,
                           f_vector_size=args.f_vector_size,
                           cb_hidden_layers=eval(args.cb_hidden_layers),
                           attention_size=args.attention_size,
                           u_vector_size=args.u_vector_size, i_vector_size=args.i_vector_size,
                           cs_ratio=args.cs_ratio,
                           random_seed=args.random_seed, model_path=args.model_path)
    else:
        logging.error('Unknown Model: ' + init_args.model_name)
        return

    if init_args.runner_name in ['BaseRunner']:
        runner = runner_name(optimizer=args.optimizer, learning_rate=args.lr,
                             epoch=args.epoch, batch_size=args.batch_size,
                             eval_batch_size=args.eval_batch_size,
                             dropout=args.dropout, l2=args.l2,
                             metrics=args.metric, check_epoch=args.check_epoch)
    else:
        logging.error('Unknown Runner: ' + init_args.runner_name)
        return

    dp_parser = FSDataProcessor.parse_dp_args(argparse.ArgumentParser(description=''))
    dp_args, extras = dp_parser.parse_known_args()
    data_processor = FSDataProcessor(data, model, runner,
                                     fs_ratio=dp_args.fs_ratio, mode=dp_args.fs_mode)
    logging.info({**vars(args), **vars(dp_args)})

    logging.info('Test Before Training = ' +
                 utils.format_metric(runner.evaluate(model, data.test_data)) +
                 ' ' + ','.join(runner.metrics))
    if args.load > 0:
        runner.load_model(model)
    if args.train > 0:
        runner.train(model, data.train_data, data.validation_data, data.test_data,
                     data_processor=data_processor)
    logging.info('Test After Training = ' +
                 utils.format_metric(runner.evaluate(model, data.test_data)) +
                 ' ' + ','.join(runner.metrics))

    np.save(args.result_file, runner.predict(model, data.test_data))
    logging.info('Save Test Results to ' + args.result_file)

    runner.lrp(model, data.train_data)
    runner.lrp(model, data.test_data)
    logging.debug(runner.evaluate(model, data.test_data))
    logging.debug(runner.evaluate(model, data.test_data))
    return
def main():
    # init args
    init_parser = argparse.ArgumentParser(description='Model')
    init_parser.add_argument('--rank', type=int, default=1,
                             help='1=ranking, 0=rating/click')
    init_parser.add_argument('--data_loader', type=str, default='DataLoader',
                             help='Choose data_loader')
    init_parser.add_argument('--model_name', type=str, default='BaseModel',
                             help='Choose model to run.')
    init_parser.add_argument('--runner', type=str, default='BaseRunner',
                             help='Choose runner')
    init_parser.add_argument('--data_processor', type=str, default='DataProcessor',
                             help='Choose data_processor')
    init_args, init_extras = init_parser.parse_known_args()

    # choose data_loader
    data_loader_name = eval(init_args.data_loader)
    # choose model
    model_name = eval(init_args.model_name)
    # choose runner
    if init_args.model_name in ['NCR']:
        init_args.runner_name = 'ProLogicRunner'
    else:
        init_args.runner_name = 'BaseRunner'
    runner_name = eval(init_args.runner_name)
    # choose data_processor
    if init_args.model_name in ['SVDPP']:
        init_args.data_processor = 'HisDataProcessor'
    elif init_args.model_name in ['NCR', 'RNNModel', 'CompareModel', 'GRU4Rec', 'STAMP']:
        init_args.data_processor = 'ProLogicRecDP'
    data_processor_name = eval(init_args.data_processor)

    # cmd line paras
    parser = argparse.ArgumentParser(description='')
    parser = utils.parse_global_args(parser)
    parser = data_loader_name.parse_data_args(parser)
    parser = model_name.parse_model_args(parser, model_name=init_args.model_name)
    parser = runner_name.parse_runner_args(parser)
    parser = data_processor_name.parse_dp_args(parser)
    args, extras = parser.parse_known_args()

    # log, model, result filenames
    log_file_name = [str(init_args.rank), init_args.model_name, args.dataset,
                     str(args.random_seed),
                     'optimizer=' + args.optimizer, 'lr=' + str(args.lr),
                     'l2=' + str(args.l2), 'dropout=' + str(args.dropout),
                     'batch_size=' + str(args.batch_size)]
    log_file_name = '__'.join(log_file_name).replace(' ', '__')
    if args.log_file == '../log/log.txt':
        args.log_file = '../log/%s.txt' % log_file_name
    if args.result_file == '../result/result.npy':
        args.result_file = '../result/%s.npy' % log_file_name
    if args.model_path == '../model/%s/%s.pt' % (init_args.model_name, init_args.model_name):
        args.model_path = '../model/%s/%s.pt' % (init_args.model_name, log_file_name)

    # logging
    for handler in logging.root.handlers[:]:
        logging.root.removeHandler(handler)
    logging.basicConfig(filename=args.log_file, level=args.verbose)
    logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))

    # convert the namespace into a dictionary, e.g. init_args.model_name -> {'model_name': BaseModel}
    logging.info(vars(init_args))
    logging.info(vars(args))
    logging.info('DataLoader: ' + init_args.data_loader)
    logging.info('Model: ' + init_args.model_name)
    logging.info('Runner: ' + init_args.runner_name)
    logging.info('DataProcessor: ' + init_args.data_processor)

    # random seed
    torch.manual_seed(args.random_seed)
    torch.cuda.manual_seed(args.random_seed)
    np.random.seed(args.random_seed)

    # cuda
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
    logging.info("# cuda devices: %d" % torch.cuda.device_count())

    # create data_loader
    data_loader = data_loader_name(path=args.path, dataset=args.dataset,
                                   label=args.label, sep=args.sep)
    features, feature_dims, feature_min, feature_max = \
        data_loader.feature_info(include_id=model_name.include_id,
                                 include_item_features=model_name.include_item_features,
                                 include_user_features=model_name.include_user_features)

    # create model
    if init_args.model_name in ['BaseModel']:
        model = model_name(label_min=data_loader.label_min, label_max=data_loader.label_max,
                           feature_num=len(features),
                           random_seed=args.random_seed, model_path=args.model_path)
    elif init_args.model_name in ['RecModel', 'BiasedMF', 'SVDPP']:
        model = model_name(label_min=data_loader.label_min, label_max=data_loader.label_max,
                           feature_num=0,
                           user_num=data_loader.user_num, item_num=data_loader.item_num,
                           u_vector_size=args.u_vector_size, i_vector_size=args.i_vector_size,
                           random_seed=args.random_seed, model_path=args.model_path)
    elif init_args.model_name in ['GRU4Rec']:
        model = model_name(neg_emb=args.neg_emb, neg_layer=args.neg_layer,
                           hidden_size=args.hidden_size, num_layers=args.num_layers,
                           p_layers=args.p_layers,
                           label_min=data_loader.label_min, label_max=data_loader.label_max,
                           feature_num=0,
                           user_num=data_loader.user_num, item_num=data_loader.item_num,
                           u_vector_size=args.u_vector_size, i_vector_size=args.i_vector_size,
                           random_seed=args.random_seed, model_path=args.model_path)
    elif init_args.model_name in ['STAMP']:
        model = model_name(neg_emb=args.neg_emb, neg_layer=args.neg_layer,
                           hidden_size=args.hidden_size, num_layers=args.num_layers,
                           p_layers=args.p_layers,
                           label_min=data_loader.label_min, label_max=data_loader.label_max,
                           feature_num=0,
                           user_num=data_loader.user_num, item_num=data_loader.item_num,
                           u_vector_size=args.u_vector_size, i_vector_size=args.i_vector_size,
                           random_seed=args.random_seed, model_path=args.model_path,
                           attention_size=args.attention_size)
    elif init_args.model_name in ['NCR', 'CompareModel']:
        model = model_name(label_min=data_loader.label_min, label_max=data_loader.label_max,
                           feature_num=0,
                           user_num=data_loader.user_num, item_num=data_loader.item_num,
                           u_vector_size=args.u_vector_size, i_vector_size=args.i_vector_size,
                           r_weight=args.r_weight, ppl_weight=args.ppl_weight,
                           pos_weight=args.pos_weight,
                           random_seed=args.random_seed, model_path=args.model_path)
    elif init_args.model_name in ['RNNModel']:
        model = model_name(label_min=data_loader.label_min, label_max=data_loader.label_max,
                           feature_num=0,
                           user_num=data_loader.user_num, item_num=data_loader.item_num,
                           u_vector_size=args.u_vector_size, i_vector_size=args.i_vector_size,
                           random_seed=args.random_seed, model_path=args.model_path)
    else:
        logging.error('Unknown Model: ' + init_args.model_name)
        return

    # init model paras
    model.apply(model.init_paras)

    # use gpu
    if torch.cuda.device_count() > 0:
        model = model.cuda()

    # append user interaction history
    if init_args.model_name in ['NCR', 'RNNModel', 'CompareModel', 'GRU4Rec', 'STAMP']:
        data_loader.append_his(last_n=args.max_his, supply=False, neg=True, neg_column=False)

    # If ranking, keep only observed interactions; negative items are sampled during training
    if init_args.rank == 1:
        data_loader.drop_neg()

    # create data_processor
    if init_args.data_processor in ['ProLogicRecDP']:
        data_processor = data_processor_name(data_loader, model, rank=init_args.rank,
                                             test_neg_n=args.test_neg_n, max_his=args.max_his,
                                             sup_his=0, sparse_his=0)
    elif init_args.data_processor in ['HisDataProcessor']:
        data_processor = data_processor_name(data_loader, model, rank=init_args.rank,
                                             test_neg_n=args.test_neg_n, sup_his=args.sup_his,
                                             max_his=args.max_his, sparse_his=args.sparse_his)
    else:
        data_processor = data_processor_name(data_loader, model, rank=init_args.rank,
                                             test_neg_n=args.test_neg_n)

    # create runner
    # batch_size is the training batch size; eval_batch_size is used for evaluation
    if init_args.runner_name in ['BaseRunner', 'ProLogicRunner']:
        runner = runner_name(optimizer=args.optimizer, learning_rate=args.lr,
                             epoch=args.epoch, batch_size=args.batch_size,
                             eval_batch_size=args.eval_batch_size,
                             dropout=args.dropout, l2=args.l2,
                             metrics=args.metric, check_epoch=args.check_epoch,
                             early_stop=args.early_stop)
    else:
        logging.error('Unknown Runner: ' + init_args.runner_name)
        return

    # training/testing
    logging.info('Test Before Training = ' + utils.format_metric(
        runner.evaluate(model, data_processor.get_test_data(), data_processor)) +
        ' ' + ','.join(runner.metrics))
    if args.load > 0:
        model.load_model()
    if args.train > 0:
        runner.train(model, data_processor, skip_eval=args.skip_eval)
    logging.info('Test After Training = ' + utils.format_metric(
        runner.evaluate(model, data_processor.get_test_data(), data_processor)) +
        ' ' + ','.join(runner.metrics))

    # save test results
    np.save(args.result_file, runner.predict(model, data_processor.get_test_data(), data_processor))
    logging.info('Save Test Results to ' + args.result_file)

    logging.debug(runner.evaluate(model, data_processor.get_test_data(), data_processor))
    logging.debug(runner.evaluate(model, data_processor.get_test_data(), data_processor))
    return
def train(self, model, train_data, validation_data=None, test_data=None, data_processor=None):
    assert train_data is not None
    if model.sess is None:
        self._build_sess(model)
    if data_processor is None:
        data_processor = BaseDataProcessor()
    self._check_time(start=True)

    init_train = self.evaluate(model, train_data) \
        if train_data is not None else [-1.0] * len(self.metrics)
    init_valid = self.evaluate(model, validation_data) \
        if validation_data is not None else [-1.0] * len(self.metrics)
    init_test = self.evaluate(model, test_data) \
        if test_data is not None else [-1.0] * len(self.metrics)
    logging.info("Init: \t train= %s validation= %s test= %s [%.1f s] " % (
        utils.format_metric(init_train), utils.format_metric(init_valid),
        utils.format_metric(init_test), self._check_time()) + ','.join(self.metrics))

    try:
        for epoch in range(self.epoch):
            gc.collect()
            self._check_time()
            epoch_train_data = copy.deepcopy(train_data)
            epoch_train_data = data_processor.epoch_process_train(epoch_train_data, epoch=epoch + 1)
            if self.check_epoch > 0 and (epoch == 1 or epoch % self.check_epoch == 0):
                self.check(model, epoch_train_data)
            self.fit(model, epoch_train_data, epoch=epoch + 1)
            del epoch_train_data
            training_time = self._check_time()

            # output validation
            train_result = self.evaluate(model, train_data) \
                if train_data is not None else [-1.0] * len(self.metrics)
            valid_result = self.evaluate(model, validation_data) \
                if validation_data is not None else [-1.0] * len(self.metrics)
            test_result = self.evaluate(model, test_data) \
                if test_data is not None else [-1.0] * len(self.metrics)
            testing_time = self._check_time()

            self.train_results.append(train_result)
            self.valid_results.append(valid_result)
            self.test_results.append(test_result)

            logging.info("Epoch %5d [%.1f s]\t train= %s validation= %s test= %s [%.1f s] " % (
                epoch + 1, training_time, utils.format_metric(train_result),
                utils.format_metric(valid_result), utils.format_metric(test_result),
                testing_time) + ','.join(self.metrics))

            if utils.best_result(self.metrics[0], self.valid_results) == self.valid_results[-1]:
                self.save_model(model)
            if utils.eva_termination(self.metrics[0], self.valid_results):
                logging.info("Early stop at %d based on validation result." % (epoch + 1))
                break
    except KeyboardInterrupt:
        logging.info("Early stop manually")
        save_here = input("Save here? (1/0) (default 0):")
        if str(save_here).lower().startswith('1'):
            self.save_model(model)

    # Find the best validation result across iterations
    best_valid_score = utils.best_result(self.metrics[0], self.valid_results)
    best_epoch = self.valid_results.index(best_valid_score)
    logging.info("Best Iter(validation)= %5d\t train= %s valid= %s test= %s [%.1f s] " % (
        best_epoch + 1,
        utils.format_metric(self.train_results[best_epoch]),
        utils.format_metric(self.valid_results[best_epoch]),
        utils.format_metric(self.test_results[best_epoch]),
        self.time[1] - self.time[0]) + ','.join(self.metrics))
    best_test_score = utils.best_result(self.metrics[0], self.test_results)
    best_epoch = self.test_results.index(best_test_score)
    logging.info("Best Iter(test)= %5d\t train= %s valid= %s test= %s [%.1f s] " % (
        best_epoch + 1,
        utils.format_metric(self.train_results[best_epoch]),
        utils.format_metric(self.valid_results[best_epoch]),
        utils.format_metric(self.test_results[best_epoch]),
        self.time[1] - self.time[0]) + ','.join(self.metrics))
    self.load_model(model)
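# utils.eva_termination is the early-stopping predicate used above. A minimal
# sketch, assuming training stops once the leading validation metric has been
# monotonically non-improving over the last few epochs; the window sizes here
# are illustrative, not the codebase's actual values:
def eva_termination(metric, valid_results):
    # Reduce each result entry to its leading metric value
    vals = [r[0] if isinstance(r, (list, tuple)) else r for r in valid_results]
    if len(vals) <= 20:
        return False
    recent = vals[-5:]
    if str(metric).lower() in ('rmse', 'mae'):
        return all(a <= b for a, b in zip(recent, recent[1:]))  # error stopped decreasing
    return all(a >= b for a, b in zip(recent, recent[1:]))      # metric stopped increasing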
def train(self, model, corpus):
    try:
        self._check_time(start=True)
        for epoch in range(self.epoch):
            self._check_time()
            # Shuffle training data
            epoch_train_data = copy.deepcopy(corpus.data_df['train'])
            epoch_train_data = epoch_train_data.sample(frac=1).reset_index(drop=True)

            # Fit
            last_batch, mean_loss, mean_l2 = self.fit(model, corpus, epoch_train_data, epoch=epoch + 1)

            # Observe selected tensors
            if self.check_epoch > 0 and epoch % self.check_epoch == 0:
                last_batch['mean_loss'] = mean_loss
                last_batch['mean_l2'] = mean_l2
                model.check(last_batch)
            del epoch_train_data
            training_time = self._check_time()

            # Record dev and test results
            dev_result = self.evaluate(model, corpus, 'dev', [self.main_topk], self.metrics)
            test_result = self.evaluate(model, corpus, 'test', [self.main_topk], self.metrics)
            testing_time = self._check_time()
            self.dev_results.append(dev_result)
            self.test_results.append(test_result)
            logging.info("Epoch {:<3} loss={:<.4f} [{:<.1f} s]\t dev=({}) test=({}) [{:<.1f} s] ".format(
                epoch + 1, mean_loss, training_time, utils.format_metric(dev_result),
                utils.format_metric(test_result), testing_time))

            # Save model and early stop
            main_metric_result = [x[self.main_key] for x in self.dev_results]
            if max(main_metric_result) == main_metric_result[-1] \
                    or (hasattr(model, 'stage') and model.stage == 1):
                model.save_model()
            if self.early_stop and self.eval_termination():
                logging.info("Early stop at %d based on validation result." % (epoch + 1))
                break
    except KeyboardInterrupt:
        logging.info("Early stop manually")
        exit_here = input("Exit completely without evaluation? (y/n) (default n):")
        if exit_here.lower().startswith('y'):
            logging.info(os.linesep + '-' * 45 + ' END: ' + utils.get_time() + ' ' + '-' * 45)
            exit(1)

    # Find the best dev result across iterations
    main_metric_result = [x[self.main_key] for x in self.dev_results]
    best_dev_score = max(main_metric_result)
    best_epoch = main_metric_result.index(best_dev_score)
    logging.info("\nBest Iter(dev)= %5d\t dev=(%s) test=(%s) [%.1f s] " % (
        best_epoch + 1, utils.format_metric(self.dev_results[best_epoch]),
        utils.format_metric(self.test_results[best_epoch]), self.time[1] - self.time[0]))
    model.load_model()
def print_res(self, model, corpus):
    set_name = 'test'
    result = self.evaluate(model, corpus, set_name)
    res_str = utils.format_metric(result)
    return res_str
def train(self, model, corpus):
    assert corpus.data_df['train'] is not None
    self._check_time(start=True)
    try:
        for epoch in range(self.epoch):
            gc.collect()
            self._check_time()
            epoch_train_data = copy.deepcopy(corpus.data_df['train'])
            epoch_train_data = epoch_train_data.sample(frac=1).reset_index(drop=True)
            loss = self.fit(model, corpus, epoch_train_data, epoch=epoch + 1)
            del epoch_train_data
            training_time = self._check_time()

            # output validation
            valid_result = self.evaluate(model, corpus, 'dev')
            test_result = self.evaluate(model, corpus, 'test')
            testing_time = self._check_time()
            for metric in self.metrics:
                self.valid_results[metric].append(valid_result[metric])
                self.test_results[metric].append(test_result[metric])

            logging.info("Epoch {:<3} loss={:<.4f} [{:<.1f} s]\t valid=({}) test=({}) [{:<.1f} s] ".format(
                epoch + 1, loss, training_time, utils.format_metric(valid_result),
                utils.format_metric(test_result), testing_time))

            if max(self.valid_results[self.metrics[0]]) == self.valid_results[self.metrics[0]][-1]:
                model.save_model()
            if self.eva_termination(model) and self.early_stop:
                logging.info("Early stop at %d based on validation result." % (epoch + 1))
                break
    except KeyboardInterrupt:
        logging.info("Early stop manually")
        exit_here = input("Exit completely without evaluation? (y/n) (default n):")
        if exit_here.lower().startswith('y'):
            logging.info(os.linesep + '-' * 45 + ' END: ' + utils.get_time() + ' ' + '-' * 45)
            exit(1)

    # Find the best validation result across iterations
    best_valid_score = max(self.valid_results[self.metrics[0]])
    best_epoch = self.valid_results[self.metrics[0]].index(best_valid_score)
    valid_res_dict, test_res_dict = dict(), dict()
    for metric in self.metrics:
        valid_res_dict[metric] = self.valid_results[metric][best_epoch]
        test_res_dict[metric] = self.test_results[metric][best_epoch]
    logging.info("\nBest Iter(dev)= %5d\t valid=(%s) test=(%s) [%.1f s] " % (
        best_epoch + 1, utils.format_metric(valid_res_dict),
        utils.format_metric(test_res_dict), self.time[1] - self.time[0]))
    best_test_score = max(self.test_results[self.metrics[0]])
    best_epoch = self.test_results[self.metrics[0]].index(best_test_score)
    for metric in self.metrics:
        valid_res_dict[metric] = self.valid_results[metric][best_epoch]
        test_res_dict[metric] = self.test_results[metric][best_epoch]
    logging.info("Best Iter(test)= %5d\t valid=(%s) test=(%s) [%.1f s] \n" % (
        best_epoch + 1, utils.format_metric(valid_res_dict),
        utils.format_metric(test_res_dict), self.time[1] - self.time[0]))
    model.load_model()