# Shared imports for the scripts below. Repo-specific helpers (load_yaml,
# load_numpy, load_dataframe_csv, save_dataframe_csv, find_best_hyperparameters,
# WorkSplitter, inhour, the `models` registry, and the train/predict/evaluate
# utilities) come from the repo's own modules; their import paths are omitted here.
import ast
import time
import timeit

import pandas as pd
import tensorflow as tf


def main(args):
    table_path = load_yaml('config/global.yml', key='path')['tables']
    df = find_best_hyperparameters(table_path + args.problem, 'NDCG')

    R_train = load_numpy(path=args.path, name=args.train)
    R_valid = load_numpy(path=args.path, name=args.valid)
    R_test = load_numpy(path=args.path, name=args.test)

    # Merge the validation interactions into training for the final run.
    R_train = R_train + R_valid

    topK = [5, 10, 15, 20, 50]

    frame = []
    for idx, row in df.iterrows():
        start = timeit.default_timer()
        row = row.to_dict()
        row['metric'] = ['R-Precision', 'NDCG', 'Precision', 'Recall', 'MAP']
        row['topK'] = topK
        result = execute(R_train, R_test, row, models[row['model']], gpu_on=args.gpu)
        stop = timeit.default_timer()
        print('Time: ', stop - start)
        frame.append(result)

    results = pd.concat(frame)
    save_dataframe_csv(results, table_path, args.name)
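# Hypothetical entry point for the script above (a sketch, not part of the
# repo): the flag names mirror the attributes read from `args` in main()
# (problem, path, train, valid, test, name, gpu), but the defaults here are
# illustrative assumptions, not the repo's actual values.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Reproduce final results")
    parser.add_argument('--problem', default='movielens1m/')
    parser.add_argument('--path', default='data/')
    parser.add_argument('--train', default='Rtrain.npz')
    parser.add_argument('--valid', default='Rvalid.npz')
    parser.add_argument('--test', default='Rtest.npz')
    parser.add_argument('--name', default='final_result.csv')
    parser.add_argument('--gpu', action='store_true')
    main(parser.parse_args())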
def main(args):
    table_path = load_yaml('config/global.yml', key='path')['tables']
    df = find_best_hyperparameters(table_path + args.tuning_result_path, 'NDCG')

    R_train = load_numpy(path=args.data_dir, name=args.train_set)
    R_valid = load_numpy(path=args.data_dir, name=args.valid_set)
    R_test = load_numpy(path=args.data_dir, name=args.test_set)

    # Merge the validation interactions into training for the final run.
    R_train = R_train + R_valid

    topK = [5, 10, 15, 20, 50]

    frame = []
    for idx, row in df.iterrows():
        start = timeit.default_timer()
        row = row.to_dict()
        row['metric'] = ['R-Precision', 'NDCG', 'Precision', 'Recall', 'MAP']
        row['topK'] = topK
        result = general(R_train,
                         R_test,
                         row,
                         models[row['model']],
                         measure=row['similarity'],
                         gpu_on=args.gpu,
                         model_folder=args.model_folder)
        stop = timeit.default_timer()
        print('Time: ', stop - start)
        frame.append(result)

    results = pd.concat(frame)
    save_dataframe_csv(results, table_path, args.save_path)
def main(args):
    table_path = load_yaml('config/global.yml', key='path')['tables']
    df = find_best_hyperparameters(table_path + args.tuning_result_path, 'MAP@10')

    R_train = load_numpy(path=args.path, name=args.train)
    R_valid = load_numpy(path=args.path, name=args.valid)
    R_test = load_numpy(path=args.path, name=args.test)

    # Merge the validation interactions into training for the final run.
    R_train = R_train + R_valid

    # Optional binarization of explicit ratings (enable to treat ratings
    # above 3 as positive feedback and the rest as negative):
    # R_train[(R_train <= 3).nonzero()] = 0
    # R_test[(R_test <= 3).nonzero()] = 0
    # R_train[(R_train > 3).nonzero()] = 1
    # R_test[(R_test > 3).nonzero()] = 1

    topK = [5, 10, 15, 20, 50]

    frame = []
    for idx, row in df.iterrows():
        start = timeit.default_timer()
        row = row.to_dict()
        row['metric'] = ['R-Precision', 'NDCG', 'Precision', 'Recall', 'MAP']
        row['topK'] = topK
        result = execute(R_train, R_test, row, models[row['model']])
        stop = timeit.default_timer()
        print('Time: ', stop - start)
        frame.append(result)

    results = pd.concat(frame)
    save_dataframe_csv(results, table_path, args.name)
def main(args):
    # Progress bar
    progress = WorkSplitter()

    # Show hyperparameter settings
    progress.section("Parameter Setting")
    print("Data Directory: {}".format(args.data_dir))
    print("Number of Users Sampled: {}".format(args.num_users_sampled))
    print("Number of Items Sampled: {}".format(args.num_items_sampled))
    print("Number of Max Allowed Iterations: {}".format(args.max_iteration_threshold))
    print("Critiquing Model: {}".format(args.critiquing_model_name))

    R_train = load_numpy(path=args.data_dir, name=args.train_set)
    print("Train U-I Dimensions: {}".format(R_train.shape))

    R_test = load_numpy(path=args.data_dir, name=args.test_set)
    print("Test U-I Dimensions: {}".format(R_test.shape))

    R_train_keyphrase = load_numpy(path=args.data_dir, name=args.train_keyphrase_set).toarray()
    print("Train Keyphrase U-I Dimensions: {}".format(R_train_keyphrase.shape))

    R_train_item_keyphrase = load_numpy(path=args.data_dir, name=args.train_item_keyphrase_set).toarray()

    table_path = load_yaml('config/global.yml', key='path')['tables']
    parameters = find_best_hyperparameters(table_path + args.dataset_name, 'NDCG')
    parameters_row = parameters.loc[parameters['model'] == args.model]

    # Yelp stores the item-keyphrase matrix transposed relative to the other datasets.
    if args.dataset_name == "yelp/":
        R_train_item_keyphrase = R_train_item_keyphrase.T

    start_time = time.time()
    results = critiquing(matrix_Train=R_train,
                         matrix_Test=R_test,
                         keyphrase_freq=R_train_keyphrase,
                         item_keyphrase_freq=R_train_item_keyphrase,
                         num_users_sampled=args.num_users_sampled,
                         num_items_sampled=args.num_items_sampled,
                         max_iteration_threshold=args.max_iteration_threshold,
                         dataset_name=args.dataset_name,
                         model=models[args.model],
                         parameters_row=parameters_row,
                         critiquing_model_name=args.critiquing_model_name,
                         keyphrase_selection_method=args.keyphrase_selection_method,
                         topk=args.topk,
                         lamb=args.lamb)
    print("Final Time Elapsed: {}".format(inhour(time.time() - start_time)))

    save_dataframe_csv(results, table_path, args.save_path)
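# Hypothetical entry point for the critiquing script above (a sketch, not
# part of the repo): the flag names mirror the `args` attributes that main()
# reads; every default value here is an illustrative assumption.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Critiquing experiment")
    parser.add_argument('--data_dir', default='data/')
    parser.add_argument('--train_set', default='Rtrain.npz')
    parser.add_argument('--test_set', default='Rtest.npz')
    parser.add_argument('--train_keyphrase_set', default='Rtrain_keyphrase.npz')
    parser.add_argument('--train_item_keyphrase_set', default='Rtrain_item_keyphrase.npz')
    parser.add_argument('--dataset_name', default='yelp/')
    parser.add_argument('--model', default='PLRec')
    parser.add_argument('--num_users_sampled', type=int, default=100)
    parser.add_argument('--num_items_sampled', type=int, default=5)
    parser.add_argument('--max_iteration_threshold', type=int, default=20)
    parser.add_argument('--critiquing_model_name', default='LP1Simplified')
    parser.add_argument('--keyphrase_selection_method', default='pop')
    parser.add_argument('--topk', type=int, default=10)
    parser.add_argument('--lamb', type=float, default=1.0)
    parser.add_argument('--save_path', default='critiquing_result.csv')
    main(parser.parse_args())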
def main(args):
    table_path = load_yaml('config/global.yml', key='path')['tables']
    df = find_best_hyperparameters(table_path + args.param, 'NDCG')

    R_train = load_numpy(path=args.path, name=args.train)
    R_valid = load_numpy(path=args.path, name=args.valid)

    results = converge(R_train, R_valid, df, table_path, args.name,
                       epochs=500, gpu_on=args.gpu)

    show_training_progress(results, hue='model', metric='NDCG', name="epoch_vs_ndcg")
def main(args):
    table_path = load_yaml('config/global.yml', key='path')['tables']
    df = find_best_hyperparameters(table_path + args.problem, 'NDCG')

    R_train = load_numpy(path=args.path, name=args.train)
    R_valid = load_numpy(path=args.path, name=args.valid)
    R_test = load_numpy(path=args.path, name=args.test)

    # Merge the validation interactions into training for the final run.
    R_train = R_train + R_valid

    topK = [1, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50]

    personalization(R_train, R_test, df, topK, args.problem,
                    args.model_folder, gpu_on=args.gpu)
def main(args):
    table_path = load_yaml('config/global.yml', key='path')['tables']
    df = find_best_hyperparameters(table_path + args.problem, 'NDCG')

    R_train = load_numpy(path=args.path, name=args.train)
    R_valid = load_numpy(path=args.path, name=args.valid)
    R_test = load_numpy(path=args.path, name=args.test)

    # Merge the validation interactions into training for the final run.
    R_train = R_train + R_valid

    topK = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
    metric = ['R-Precision', 'NDCG', 'Clicks', 'Recall', 'Precision', 'MAP']

    usercategory(R_train, R_test, df, topK, metric, args.problem,
                 args.model_folder, gpu_on=args.gpu)
def main(args):
    table_path = load_yaml('config/global.yml', key='path')['tables']

    # The explanation task is tuned on NDCG@10; the rating task on plain NDCG.
    if args.explanation:
        df = find_best_hyperparameters(table_path + args.tuning_result_path, 'NDCG@10')
    else:
        df = find_best_hyperparameters(table_path + args.tuning_result_path, 'NDCG')

    num_users = pd.read_csv(args.data_dir + args.dataset_name + '/'
                            + args.user_col + '.csv')[args.user_col].nunique()
    num_items = pd.read_csv(args.data_dir + args.dataset_name + '/'
                            + args.item_col + '.csv')[args.item_col].nunique()

    df_train = pd.read_csv(args.data_dir + args.dataset_name + '/' + args.train_set)
    df_train = df_train[df_train[args.rating_col] == 1]
    # Keyphrase vectors are stored in the CSV as stringified lists; parse them back.
    df_train[args.keyphrase_vector_col] = df_train[args.keyphrase_vector_col].apply(ast.literal_eval)

    df_test = pd.read_csv(args.data_dir + args.dataset_name + '/' + args.test_set)

    keyphrase_names = pd.read_csv(args.data_dir + args.dataset_name + '/'
                                  + args.keyphrase_set)[args.keyphrase_col].values

    if args.explanation:
        results = explanation_converge(num_users, num_items, args.user_col,
                                       args.item_col, args.rating_col,
                                       args.keyphrase_vector_col, df_train, df_test,
                                       keyphrase_names, df, table_path,
                                       args.save_path, epoch=args.epoch)
    else:
        results = converge(num_users, num_items, args.user_col, args.item_col,
                           args.rating_col, args.keyphrase_vector_col, df_train,
                           df_test, keyphrase_names, df, table_path,
                           args.save_path, epoch=args.epoch)

    show_training_progress(results, hue='model', metric='NDCG', name="epoch_vs_ndcg")
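# A minimal sketch of the keyphrase-vector parsing used above: the CSV stores
# each keyphrase vector as a stringified Python list, and ast.literal_eval
# safely converts it back to a real list. The column name and values here are
# illustrative, not the repo's actual data.
import ast

import pandas as pd

df = pd.DataFrame({'keyphrase_vector': ['[0, 5, 12]', '[3]', '[]']})
df['keyphrase_vector'] = df['keyphrase_vector'].apply(ast.literal_eval)
print(df['keyphrase_vector'][0])  # [0, 5, 12], now a list rather than a string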
def general(train, test, keyphrase_train, keyphrase_test, params, save_path,
            final_explanation=False):
    progress = WorkSplitter()
    table_path = load_yaml('config/global.yml', key='path')['tables']
    df = find_best_hyperparameters(table_path + params['tuning_result_path'], 'NDCG')

    # Resume from an existing results table if one exists; otherwise start fresh.
    try:
        output_df = load_dataframe_csv(table_path, save_path)
    except Exception:
        output_df = pd.DataFrame(columns=['model', 'rank', 'beta', 'lambda_l2',
                                          'lambda_keyphrase', 'lambda_latent',
                                          'lambda_rating', 'topK', 'learning_rate',
                                          'epoch', 'corruption', 'optimizer'])

    for index, row in df.iterrows():
        algorithm = row['model']
        rank = row['rank']
        beta = row['beta']
        lamb_l2 = row['lambda_l2']
        lamb_keyphrase = row['lambda_keyphrase']
        lamb_latent = row['lambda_latent']
        lamb_rating = row['lambda_rating']
        learning_rate = row['learning_rate']
        epoch = row['epoch']
        corruption = row['corruption']
        optimizer = row['optimizer']

        row['topK'] = [5, 10, 15, 20, 50]
        row['metric'] = ['R-Precision', 'NDCG', 'Clicks', 'Recall', 'Precision', 'MAP']

        settings = ("model: {}, rank: {}, beta: {}, lambda_l2: {}, lambda_keyphrase: {}, "
                    "lambda_latent: {}, lambda_rating: {}, learning_rate: {}, "
                    "epoch: {}, corruption: {}, optimizer: {}")
        progress.section(settings.format(algorithm, rank, beta, lamb_l2, lamb_keyphrase,
                                         lamb_latent, lamb_rating, learning_rate,
                                         epoch, corruption, optimizer))

        progress.subsection("Training")
        model = models[algorithm](matrix_train=train,
                                  epoch=epoch,
                                  lamb_l2=lamb_l2,
                                  lamb_keyphrase=lamb_keyphrase,
                                  lamb_latent=lamb_latent,
                                  lamb_rating=lamb_rating,
                                  beta=beta,
                                  learning_rate=learning_rate,
                                  rank=rank,
                                  corruption=corruption,
                                  optimizer=optimizer,
                                  matrix_train_keyphrase=keyphrase_train)

        progress.subsection("Prediction")
        rating_score, keyphrase_score = model.predict(train.todense())

        progress.subsection("Evaluation")
        if final_explanation:
            prediction = predict_keyphrase(keyphrase_score, topK=row['topK'][-2])
            result = evaluate_explanation(prediction, keyphrase_test,
                                          row['metric'], row['topK'])
        else:
            prediction = predict(rating_score, topK=row['topK'][-1], matrix_Train=train)
            result = evaluate(prediction, test, row['metric'], row['topK'])

        result_dict = {'model': algorithm, 'rank': rank, 'beta': beta,
                       'lambda_l2': lamb_l2, 'lambda_keyphrase': lamb_keyphrase,
                       'lambda_latent': lamb_latent, 'lambda_rating': lamb_rating,
                       'learning_rate': learning_rate, 'epoch': epoch,
                       'corruption': corruption, 'optimizer': optimizer}

        for name in result.keys():
            result_dict[name] = [round(result[name][0], 4), round(result[name][1], 4)]

        output_df = output_df.append(result_dict, ignore_index=True)

        # Release the TF1 session and graph before building the next model.
        model.sess.close()
        tf.reset_default_graph()

        # Checkpoint the accumulated results after each model.
        save_dataframe_csv(output_df, table_path, save_path)

    return output_df
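# A minimal sketch (assuming TensorFlow 1.x, matching the
# tf.reset_default_graph() call above) of why each trained model is followed
# by sess.close() and a graph reset: without the reset, variables from the
# previous model linger in the default graph and collide by name when the
# next model is constructed.
import tensorflow as tf

for run in range(2):
    w = tf.get_variable('w', shape=[2, 2])  # same variable name each run
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(w)
    # Dropping the graph lets the next iteration recreate 'w' cleanly;
    # omitting this line raises "Variable w already exists" on the second run.
    tf.reset_default_graph()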
def main(args):
    # Progress bar
    progress = WorkSplitter()

    # Show hyperparameter settings
    progress.section("Parameter Setting")
    print("Data Directory: {}".format(args.data_dir))
    print("Number of Users Sampled: {}".format(args.num_users_sampled))
    print("Number of Items Sampled: {}".format(args.num_items_sampled))
    print("Number of Max Allowed Iterations: {}".format(args.max_iteration_threshold))
    print("Critiquing Model: {}".format(args.critiquing_model_name))

    R_train = load_numpy(path=args.data_dir, name=args.train_set)
    print("Train U-I Dimensions: {}".format(R_train.shape))

    R_test = load_numpy(path=args.data_dir, name=args.test_set)
    print("Test U-I Dimensions: {}".format(R_test.shape))

    R_train_keyphrase = load_numpy(path=args.data_dir, name=args.train_keyphrase_set).toarray()
    print("Train Keyphrase U-I Dimensions: {}".format(R_train_keyphrase.shape))

    R_train_item_keyphrase = load_numpy(path=args.data_dir, name=args.train_item_keyphrase_set).toarray()

    table_path = load_yaml('config/global.yml', key='path')['tables']
    parameters = find_best_hyperparameters(table_path + args.dataset_name, 'NDCG')
    parameters_row = parameters.loc[parameters['model'] == args.model]

    # Grid over the regularization weight and the critique top-k.
    lambs = [0.001, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 30, 50, 70, 90, 100,
             200, 500, 1000, 10000, 100000]
    topks = [10, 20, 50, 100]

    # Yelp stores the item-keyphrase matrix transposed relative to the other datasets.
    if args.dataset_name == "yelp/":
        R_train_item_keyphrase = R_train_item_keyphrase.T

    for topk in topks:
        for lamb in lambs:
            results = critiquing(matrix_Train=R_train,
                                 matrix_Test=R_test,
                                 keyphrase_freq=R_train_keyphrase,
                                 item_keyphrase_freq=R_train_item_keyphrase,
                                 num_users_sampled=args.num_users_sampled,
                                 num_items_sampled=args.num_items_sampled,
                                 max_iteration_threshold=args.max_iteration_threshold,
                                 dataset_name=args.dataset_name,
                                 model=models[args.model],
                                 parameters_row=parameters_row,
                                 critiquing_model_name=args.critiquing_model_name,
                                 keyphrase_selection_method=args.keyphrase_selection_method,
                                 topk=topk,
                                 lamb=lamb)

            topk_path = "topk_" + str(topk) + "/"
            save_name = (args.save_path + topk_path + "tuning_at_lamb_" + str(lamb)
                         + "_with_" + args.keyphrase_selection_method + ".csv")
            save_dataframe_csv(results, table_path, save_name)
def general(num_users, num_items, user_col, item_col, rating_col,
            keyphrase_vector_col, df_train, df_test, keyphrase_names,
            params, save_path):
    progress = WorkSplitter()
    table_path = load_yaml('config/global.yml', key='path')['tables']
    df = find_best_hyperparameters(table_path + params['tuning_result_path'], 'NDCG')

    # Resume from an existing results table if one exists; otherwise start fresh.
    try:
        output_df = load_dataframe_csv(table_path, save_path)
    except Exception:
        output_df = pd.DataFrame(columns=['model', 'rank', 'num_layers',
                                          'train_batch_size', 'predict_batch_size',
                                          'lambda', 'topK', 'learning_rate',
                                          'epoch', 'negative_sampling_size'])

    for index, row in df.iterrows():
        algorithm = row['model']
        rank = row['rank']
        num_layers = row['num_layers']
        train_batch_size = row['train_batch_size']
        predict_batch_size = row['predict_batch_size']
        lamb = row['lambda']
        learning_rate = row['learning_rate']
        epoch = 300
        negative_sampling_size = row['negative_sampling_size']

        row['topK'] = [5, 10, 15, 20, 50]
        row['metric'] = ['R-Precision', 'NDCG', 'Clicks', 'Recall', 'Precision', 'MAP']

        settings = ("model: {0}, rank: {1}, num_layers: {2}, train_batch_size: {3}, "
                    "predict_batch_size: {4}, lambda: {5}, learning_rate: {6}, "
                    "epoch: {7}, negative_sampling_size: {8}")
        progress.section(settings.format(algorithm, rank, num_layers, train_batch_size,
                                         predict_batch_size, lamb, learning_rate,
                                         epoch, negative_sampling_size))

        progress.subsection("Initializing Negative Sampler")
        negative_sampler = Negative_Sampler(df_train[[user_col, item_col, keyphrase_vector_col]],
                                            user_col, item_col, rating_col,
                                            keyphrase_vector_col,
                                            num_items=num_items,
                                            batch_size=train_batch_size,
                                            num_keyphrases=len(keyphrase_names),
                                            negative_sampling_size=negative_sampling_size)

        model = models[algorithm](num_users=num_users,
                                  num_items=num_items,
                                  text_dim=len(keyphrase_names),
                                  embed_dim=rank,
                                  num_layers=num_layers,
                                  negative_sampler=negative_sampler,
                                  lamb=lamb,
                                  learning_rate=learning_rate)

        progress.subsection("Training")
        # Loading/saving of pretrained weights is disabled; train from scratch.
        pretrained_path = load_yaml('config/global.yml', key='path')['pretrained']
        # try:
        #     model.load_model(pretrained_path + params['tuning_result_path'], row['model'])
        # except Exception:
        model.train_model(df_train, user_col, item_col, rating_col, epoch=epoch)
        # model.save_model(pretrained_path + params['tuning_result_path'], row['model'])

        progress.subsection("Prediction")
        prediction, explanation = predict_elementwise(model, df_train, user_col, item_col,
                                                      row['topK'][-1],
                                                      batch_size=row['predict_batch_size'],
                                                      enable_explanation=False,
                                                      keyphrase_names=keyphrase_names)

        R_test = to_sparse_matrix(df_test, num_users, num_items,
                                  user_col, item_col, rating_col)

        result = evaluate(prediction, R_test, row['metric'], row['topK'])

        # Note: not finished yet.
        result_dict = {'model': row['model'],
                       'rank': row['rank'],
                       'num_layers': row['num_layers'],
                       'train_batch_size': row['train_batch_size'],
                       'predict_batch_size': row['predict_batch_size'],
                       'lambda': row['lambda'],
                       'topK': row['topK'][-1],
                       'learning_rate': row['learning_rate'],
                       'epoch': epoch,
                       'negative_sampling_size': row['negative_sampling_size']}

        for name in result.keys():
            result_dict[name] = round(result[name][0], 4)

        output_df = output_df.append(result_dict, ignore_index=True)

        # Release the TF1 session and graph before building the next model.
        model.sess.close()
        tf.reset_default_graph()

        # Checkpoint the accumulated results after each model.
        save_dataframe_csv(output_df, table_path, save_path)

    return output_df