def main(args):
    settings_df = load_dataframe_csv(args.tab_path + args.setting_dir)

    R_train = load_numpy(path=args.data_dir, name=args.train_set)
    R_valid = load_numpy(path=args.data_dir, name=args.valid_set)
    R_test = load_numpy(path=args.data_dir, name=args.test_set)

    index_map = np.load(args.data_dir + args.index)

    item_names = None
    try:
        item_names = load_dataframe_csv(args.data_dir + args.names,
                                        delimiter="::",
                                        names=['ItemID', 'Name', 'Category'])
    except Exception:  # item metadata is optional
        print("Meta-data does not exist")

    attention(R_train, R_valid, R_test, index_map, item_names,
              args.tex_path, args.fig_path, settings_df, args.template_path,
              preference_analysis=args.preference_analysis,
              case_study=args.case_study,
              gpu_on=True)
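# Example invocation (a sketch; the script name and exact flag spellings are
# hypothetical, since the argparse setup is not shown here -- the flags are
# assumed to mirror the attributes read above):
#
#   python attention_analysis.py --data_dir data/movielens/ \
#       --train_set Rtrain.npz --valid_set Rvalid.npz --test_set Rtest.npz \
#       --index Index.npy --names movies.dat \
#       --tab_path tables/ --setting_dir attention_settings.csv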
def main(args):
    # Progress bar
    progress = WorkSplitter()

    # Show hyperparameter settings
    progress.section("Parameter Setting")
    print("Data Directory: {}".format(args.data_dir))
    print("Number of Users Sampled: {}".format(args.num_users_sampled))
    print("Number of Items Sampled: {}".format(args.num_items_sampled))
    print("Number of Max Allowed Iterations: {}".format(args.max_iteration_threshold))
    print("Critiquing Model: {}".format(args.critiquing_model_name))

    R_train = load_numpy(path=args.data_dir, name=args.train_set)
    print("Train U-I Dimensions: {}".format(R_train.shape))

    R_test = load_numpy(path=args.data_dir, name=args.test_set)
    print("Test U-I Dimensions: {}".format(R_test.shape))

    R_train_keyphrase = load_numpy(path=args.data_dir, name=args.train_keyphrase_set).toarray()
    print("Train Item Keyphrase U-I Dimensions: {}".format(R_train_keyphrase.shape))

    R_train_item_keyphrase = load_numpy(path=args.data_dir, name=args.train_item_keyphrase_set).toarray()

    table_path = load_yaml('config/global.yml', key='path')['tables']

    # Hyperparameter lookup from the tuning tables is bypassed here
    # in favor of a fixed setting:
    # parameters = find_best_hyperparameters(table_path+args.dataset_name, 'NDCG')
    # parameters_row = parameters.loc[parameters['model'] == args.model]
    parameters_row = {
        'iter': 10,
        'lambda': 200,
        'rank': 200
    }

    keyphrases_names = load_dataframe_csv(path=args.data_dir, name="Keyphrases.csv")['Phrases'].tolist()

    results = critiquing(matrix_Train=R_train,
                         matrix_Test=R_test,
                         keyphrase_freq=R_train_keyphrase,
                         item_keyphrase_freq=R_train_item_keyphrase.T,
                         num_users_sampled=args.num_users_sampled,
                         num_items_sampled=args.num_items_sampled,
                         max_iteration_threshold=args.max_iteration_threshold,
                         dataset_name=args.dataset_name,
                         model=models[args.model],
                         parameters_row=parameters_row,
                         critiquing_model_name=args.critiquing_model_name,
                         lamb=args.lambdas,
                         keyphrases_names=keyphrases_names,
                         keyphrase_selection_method=args.keyphrase_selection_method)

    save_dataframe_csv(results, table_path, args.save_path)
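# The only assumption main() makes about Keyphrases.csv is that it exposes a
# 'Phrases' column (the only column read above). A minimal, hypothetical file
# with placeholder values:
#
#   Phrases
#   cheese
#   service
#   price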
def hyper_parameter_tuning(train, validation, params, save_path):
    progress = WorkSplitter()
    table_path = load_yaml('config/global.yml', key='path')['tables']

    try:
        df = load_dataframe_csv(table_path, save_path)
    except Exception:  # no previous results; start a fresh table
        df = pd.DataFrame(columns=['model', 'k', 'topK'])

    num_user = train.shape[0]

    for algorithm in params['models']:
        for k in params['k']:
            # Skip settings that are already in the saved results
            if ((df['model'] == algorithm) & (df['k'] == k)).any():
                continue

            format = "model: {}, k: {}"
            progress.section(format.format(algorithm, k))

            progress.subsection("Training")
            model = params['models'][algorithm]()
            model.train(train)

            progress.subsection("Prediction")
            prediction_score = model.predict(train, k=k)
            prediction = predict(prediction_score=prediction_score,
                                 topK=params['topK'][-1],
                                 matrix_Train=train)

            progress.subsection("Evaluation")
            result = evaluate(prediction, validation, params['metric'], params['topK'])

            result_dict = {'model': algorithm, 'k': k}
            for name in result.keys():
                result_dict[name] = [round(result[name][0], 4),
                                     round(result[name][1], 4)]

            df = df.append(result_dict, ignore_index=True)
            save_dataframe_csv(df, table_path, save_path)
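# A minimal sketch of the `params` dict this tuner expects, inferred from the
# keys read above; the model name and value grids are hypothetical:
#
# params = {
#     'models': {'ItemKNN': ItemKNN},      # name -> constructor
#     'k': [50, 100, 200],                 # neighborhood sizes to sweep
#     'topK': [5, 10, 15, 20, 50],         # cutoffs; prediction uses topK[-1]
#     'metric': ['R-Precision', 'NDCG', 'Recall', 'Precision'],
# }
# hyper_parameter_tuning(R_train, R_valid, params, save_path='itemknn_tuning.csv')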
def hyper_parameter_tuning(train, validation, params, save_path, measure='Cosine', gpu_on=True):
    progress = WorkSplitter()
    table_path = load_yaml('config/global.yml', key='path')['tables']

    try:
        df = load_dataframe_csv(table_path, save_path)
    except Exception:  # no previous results; start a fresh table
        df = pd.DataFrame(columns=['model', 'similarity', 'alpha', 'batch_size',
                                   'corruption', 'epoch', 'iteration', 'key_dimension',
                                   'lambda', 'learning_rate', 'mode_dimension',
                                   'normalize', 'rank', 'root', 'topK'])

    num_user = train.shape[0]

    for algorithm in params['models']:
        for alpha in params['alpha']:
            for batch_size in params['batch_size']:
                for corruption in params['corruption']:
                    for epoch in params['epoch']:
                        for iteration in params['iteration']:
                            for key_dim in params['key_dimension']:
                                for lamb in params['lambda']:
                                    for learning_rate in params['learning_rate']:
                                        for mode_dim in params['mode_dimension']:
                                            for rank in params['rank']:
                                                for root in params['root']:
                                                    # Skip settings already in the saved results
                                                    if ((df['model'] == algorithm) &
                                                            (df['alpha'] == alpha) &
                                                            (df['batch_size'] == batch_size) &
                                                            (df['corruption'] == corruption) &
                                                            (df['epoch'] == epoch) &
                                                            (df['iteration'] == iteration) &
                                                            (df['key_dimension'] == key_dim) &
                                                            (df['lambda'] == lamb) &
                                                            (df['learning_rate'] == learning_rate) &
                                                            (df['mode_dimension'] == mode_dim) &
                                                            (df['rank'] == rank) &
                                                            (df['root'] == root)).any():
                                                        continue

                                                    format = "model: {}, alpha: {}, batch_size: {}, " \
                                                             "corruption: {}, epoch: {}, iteration: {}, " \
                                                             "key_dimension: {}, lambda: {}, learning_rate: {}, " \
                                                             "mode_dimension: {}, rank: {}, root: {}"
                                                    progress.section(format.format(
                                                        algorithm, alpha, batch_size, corruption,
                                                        epoch, iteration, key_dim, lamb,
                                                        learning_rate, mode_dim, rank, root))

                                                    RQ, Yt, Bias = params['models'][algorithm](
                                                        train,
                                                        embedded_matrix=np.empty((0)),
                                                        mode_dim=mode_dim,
                                                        key_dim=key_dim,
                                                        batch_size=batch_size,
                                                        learning_rate=learning_rate,
                                                        iteration=iteration,
                                                        epoch=epoch,
                                                        rank=rank,
                                                        corruption=corruption,
                                                        gpu_on=gpu_on,
                                                        lamb=lamb,
                                                        alpha=alpha,
                                                        root=root)
                                                    Y = Yt.T

                                                    progress.subsection("Prediction")
                                                    prediction = predict(matrix_U=RQ,
                                                                         matrix_V=Y,
                                                                         bias=Bias,
                                                                         topK=params['topK'][-1],
                                                                         matrix_Train=train,
                                                                         measure=measure,
                                                                         gpu=gpu_on)

                                                    progress.subsection("Evaluation")
                                                    result = evaluate(prediction, validation,
                                                                      params['metric'], params['topK'])

                                                    result_dict = {'model': algorithm,
                                                                   'alpha': alpha,
                                                                   'batch_size': batch_size,
                                                                   'corruption': corruption,
                                                                   'epoch': epoch,
                                                                   'iteration': iteration,
                                                                   'key_dimension': key_dim,
                                                                   'lambda': lamb,
                                                                   'learning_rate': learning_rate,
                                                                   'mode_dimension': mode_dim,
                                                                   'rank': rank,
                                                                   'similarity': params['similarity'],
                                                                   'root': root}

                                                    for name in result.keys():
                                                        result_dict[name] = [round(result[name][0], 4),
                                                                             round(result[name][1], 4)]

                                                    df = df.append(result_dict, ignore_index=True)
                                                    save_dataframe_csv(df, table_path, save_path)
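# The twelve nested loops above are a plain grid search with a resume check.
# A behavior-preserving sketch of the same sweep using itertools.product
# (the rightmost key varies fastest, matching the original nesting order);
# `...` stands for the unchanged skip/train/predict/evaluate body:

from itertools import product

GRID_KEYS = ['alpha', 'batch_size', 'corruption', 'epoch', 'iteration',
             'key_dimension', 'lambda', 'learning_rate', 'mode_dimension',
             'rank', 'root']

for algorithm in params['models']:
    for (alpha, batch_size, corruption, epoch, iteration, key_dim, lamb,
         learning_rate, mode_dim, rank, root) in product(*(params[key] for key in GRID_KEYS)):
        ...  # same dedupe check, training, prediction, and evaluation as above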
def explanation_parameter_tuning(num_users, num_items, user_col, item_col, rating_col,
                                 keyphrase_vector_col, df_train, df_valid, keyphrase_names,
                                 params, save_path):
    progress = WorkSplitter()
    table_path = load_yaml('config/global.yml', key='path')['tables']

    try:
        df = load_dataframe_csv(table_path, save_path)
    except Exception:  # no previous results; start a fresh table
        df = pd.DataFrame(columns=['model', 'rank', 'num_layers', 'train_batch_size',
                                   'predict_batch_size', 'lambda', 'topK', 'learning_rate',
                                   'epoch', 'negative_sampling_size'])

    for algorithm in params['models']:
        for rank in params['rank']:
            for num_layers in params['num_layers']:
                for train_batch_size in params['train_batch_size']:
                    for predict_batch_size in params['predict_batch_size']:
                        for lamb in params['lambda']:
                            for learning_rate in params['learning_rate']:
                                for epoch in params['epoch']:
                                    for negative_sampling_size in params['negative_sampling_size']:
                                        # Skip settings already in the saved results
                                        if ((df['model'] == algorithm) &
                                                (df['rank'] == rank) &
                                                (df['num_layers'] == num_layers) &
                                                (df['train_batch_size'] == train_batch_size) &
                                                (df['predict_batch_size'] == predict_batch_size) &
                                                (df['lambda'] == lamb) &
                                                (df['learning_rate'] == learning_rate) &
                                                (df['epoch'] == epoch) &
                                                (df['negative_sampling_size'] == negative_sampling_size)).any():
                                            continue

                                        format = "model: {0}, rank: {1}, num_layers: {2}, " \
                                                 "train_batch_size: {3}, predict_batch_size: {4}, " \
                                                 "lambda: {5}, learning_rate: {6}, epoch: {7}, " \
                                                 "negative_sampling_size: {8}"
                                        progress.section(format.format(
                                            algorithm, rank, num_layers, train_batch_size,
                                            predict_batch_size, lamb, learning_rate, epoch,
                                            negative_sampling_size))

                                        progress.subsection("Initializing Negative Sampler")
                                        negative_sampler = Negative_Sampler(
                                            df_train[[user_col, item_col, keyphrase_vector_col]],
                                            user_col, item_col, rating_col, keyphrase_vector_col,
                                            num_items=num_items,
                                            batch_size=train_batch_size,
                                            num_keyphrases=len(keyphrase_names),
                                            negative_sampling_size=negative_sampling_size)

                                        model = params['models'][algorithm](
                                            num_users=num_users,
                                            num_items=num_items,
                                            text_dim=len(keyphrase_names),
                                            embed_dim=rank,
                                            num_layers=num_layers,
                                            negative_sampler=negative_sampler,
                                            lamb=lamb,
                                            learning_rate=learning_rate)

                                        progress.subsection("Training")
                                        model.train_model(df_train, user_col, item_col,
                                                          rating_col, epoch=epoch)

                                        progress.subsection("Prediction")
                                        df_valid_explanation = predict_explanation(
                                            model, df_valid, user_col, item_col,
                                            topk_keyphrase=params['topK'][-1])

                                        progress.subsection("Evaluation")
                                        explanation_result = evaluate_explanation(
                                            df_valid_explanation, df_valid,
                                            params['metric'], params['topK'],
                                            user_col, item_col, rating_col,
                                            keyphrase_vector_col)

                                        result_dict = {'model': algorithm,
                                                       'rank': rank,
                                                       'num_layers': num_layers,
                                                       'train_batch_size': train_batch_size,
                                                       'predict_batch_size': predict_batch_size,
                                                       'lambda': lamb,
                                                       'learning_rate': learning_rate,
                                                       'epoch': epoch,
                                                       'negative_sampling_size': negative_sampling_size}

                                        for name in explanation_result.keys():
                                            result_dict[name] = [round(explanation_result[name][0], 4),
                                                                 round(explanation_result[name][1], 4)]

                                        df = df.append(result_dict, ignore_index=True)

                                        # Release the TF1 session and graph before the next setting
                                        model.sess.close()
                                        tf.reset_default_graph()

                                        save_dataframe_csv(df, table_path, save_path)
def hyper_parameter_tuning(train, validation, params, save_path, measure='Cosine', gpu_on=True):
    progress = WorkSplitter()
    table_path = load_yaml('config/global.yml', key='path')['tables']

    try:
        df = load_dataframe_csv(table_path, save_path)
    except Exception:  # no previous results; start a fresh table
        df = pd.DataFrame(columns=['model', 'rank', 'alpha', 'lambda', 'iter',
                                   'similarity', 'corruption', 'root', 'topK'])

    num_user = train.shape[0]

    for algorithm in params['models']:
        for rank in params['rank']:
            for alpha in params['alpha']:
                for lam in params['lambda']:
                    for corruption in params['corruption']:
                        for root in params['root']:
                            # Skip settings already in the saved results
                            if ((df['model'] == algorithm) &
                                    (df['rank'] == rank) &
                                    (df['alpha'] == alpha) &
                                    (df['lambda'] == lam) &
                                    (df['corruption'] == corruption) &
                                    (df['root'] == root)).any():
                                continue

                            format = "model: {0}, rank: {1}, alpha: {2}, lambda: {3}, corruption: {4}, root: {5}"
                            progress.section(format.format(algorithm, rank, alpha, lam, corruption, root))

                            RQ, Yt, Bias = params['models'][algorithm](train,
                                                                       embeded_matrix=np.empty((0)),
                                                                       iteration=params['iter'],
                                                                       rank=rank,
                                                                       lam=lam,
                                                                       alpha=alpha,
                                                                       corruption=corruption,
                                                                       root=root,
                                                                       gpu_on=gpu_on)
                            Y = Yt.T

                            progress.subsection("Prediction")
                            prediction = predict(matrix_U=RQ,
                                                 matrix_V=Y,
                                                 measure=measure,
                                                 bias=Bias,
                                                 topK=params['topK'][-1],
                                                 matrix_Train=train,
                                                 gpu=gpu_on)

                            progress.subsection("Evaluation")
                            result = evaluate(prediction, validation, params['metric'], params['topK'])

                            result_dict = {'model': algorithm,
                                           'rank': rank,
                                           'alpha': alpha,
                                           'lambda': lam,
                                           'iter': params['iter'],
                                           'similarity': params['similarity'],
                                           'corruption': corruption,
                                           'root': root}

                            for name in result.keys():
                                result_dict[name] = [round(result[name][0], 4),
                                                     round(result[name][1], 4)]

                            df = df.append(result_dict, ignore_index=True)
                            save_dataframe_csv(df, table_path, save_path)
def hyper_parameter_tuning(train, validation, params, save_path, gpu_on=True):
    progress = WorkSplitter()
    table_path = load_yaml('config/global.yml', key='path')['tables']

    try:
        df = load_dataframe_csv(table_path, save_path)
    except Exception:  # no previous results; start a fresh table
        df = pd.DataFrame(columns=['model', 'rank', 'lambda', 'epoch', 'corruption', 'topK'])

    num_user = train.shape[0]

    for algorithm in params['models']:
        for rank in params['rank']:
            for lamb in params['lambda']:
                for corruption in params['corruption']:
                    # Skip settings already in the saved results
                    if ((df['model'] == algorithm) &
                            (df['rank'] == rank) &
                            (df['lambda'] == lamb) &
                            (df['corruption'] == corruption)).any():
                        continue

                    format = "model: {}, rank: {}, lambda: {}, corruption: {}"
                    progress.section(format.format(algorithm, rank, lamb, corruption))

                    RQ, Yt, Bias = params['models'][algorithm](train,
                                                               epoch=params['epoch'],
                                                               lamb=lamb,
                                                               rank=rank,
                                                               corruption=corruption)
                    Y = Yt.T

                    progress.subsection("Prediction")
                    prediction = predict(matrix_U=RQ,
                                         matrix_V=Y,
                                         bias=Bias,
                                         topK=params['topK'][-1],
                                         matrix_Train=train,
                                         gpu=gpu_on)

                    progress.subsection("Evaluation")
                    result = evaluate(prediction, validation, params['metric'], params['topK'])

                    result_dict = {'model': algorithm,
                                   'rank': rank,
                                   'lambda': lamb,
                                   'epoch': params['epoch'],
                                   'corruption': corruption}

                    for name in result.keys():
                        result_dict[name] = [round(result[name][0], 4),
                                             round(result[name][1], 4)]

                    df = df.append(result_dict, ignore_index=True)
                    save_dataframe_csv(df, table_path, save_path)
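# All of these tuners repeat the same resume pattern: reload the results CSV
# and skip any hyperparameter combination whose row is already present. A small
# helper (hypothetical; not part of this codebase) that factors out the check:

import pandas as pd

def row_exists(df, **setting):
    """Return True if `df` already has a row matching every key/value in `setting`."""
    mask = pd.Series(True, index=df.index)
    for column, value in setting.items():
        mask &= (df[column] == value)
    return mask.any()

# Usage inside the loops above ('lambda' is a Python keyword, hence the dict unpack):
#   if row_exists(df, model=algorithm, rank=rank, corruption=corruption, **{'lambda': lamb}):
#       continue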
import pandas as pd

from utils.io import load_dataframe_csv
from plots.rec_plots import precision_recall_curve

topK = [5, 10, 15, 20, 50]

# Plot precision-recall curves for each dataset's result table
for dataset in ['movielens20m', 'netflix', 'yahoo']:
    df = load_dataframe_csv('tables/', '{}_result.csv'.format(dataset))
    precision_recall_curve(df, topK, save=True, folder='analysis/' + dataset, reloaded=True)
def general(train, test, keyphrase_train, keyphrase_test, params, save_path, final_explanation=False):
    progress = WorkSplitter()
    table_path = load_yaml('config/global.yml', key='path')['tables']
    df = find_best_hyperparameters(table_path + params['tuning_result_path'], 'NDCG')

    try:
        output_df = load_dataframe_csv(table_path, save_path)
    except Exception:  # no previous results; start a fresh table
        output_df = pd.DataFrame(columns=['model', 'rank', 'beta', 'lambda_l2', 'lambda_keyphrase',
                                          'lambda_latent', 'lambda_rating', 'topK', 'learning_rate',
                                          'epoch', 'corruption', 'optimizer'])

    for index, row in df.iterrows():
        algorithm = row['model']
        rank = row['rank']
        beta = row['beta']
        lamb_l2 = row['lambda_l2']
        lamb_keyphrase = row['lambda_keyphrase']
        lamb_latent = row['lambda_latent']
        lamb_rating = row['lambda_rating']
        learning_rate = row['learning_rate']
        epoch = row['epoch']
        corruption = row['corruption']
        optimizer = row['optimizer']
        row['topK'] = [5, 10, 15, 20, 50]
        row['metric'] = ['R-Precision', 'NDCG', 'Clicks', 'Recall', 'Precision', 'MAP']

        format = "model: {}, rank: {}, beta: {}, lambda_l2: {}, lambda_keyphrase: {}, " \
                 "lambda_latent: {}, lambda_rating: {}, learning_rate: {}, " \
                 "epoch: {}, corruption: {}, optimizer: {}"
        progress.section(format.format(algorithm, rank, beta, lamb_l2, lamb_keyphrase,
                                       lamb_latent, lamb_rating, learning_rate, epoch,
                                       corruption, optimizer))

        progress.subsection("Training")
        model = models[algorithm](matrix_train=train,
                                  epoch=epoch,
                                  lamb_l2=lamb_l2,
                                  lamb_keyphrase=lamb_keyphrase,
                                  lamb_latent=lamb_latent,
                                  lamb_rating=lamb_rating,
                                  beta=beta,
                                  learning_rate=learning_rate,
                                  rank=rank,
                                  corruption=corruption,
                                  optimizer=optimizer,
                                  matrix_train_keyphrase=keyphrase_train)

        progress.subsection("Prediction")
        rating_score, keyphrase_score = model.predict(train.todense())

        progress.subsection("Evaluation")
        if final_explanation:
            prediction = predict_keyphrase(keyphrase_score, topK=row['topK'][-2])
            result = evaluate_explanation(prediction, keyphrase_test, row['metric'], row['topK'])
        else:
            prediction = predict(rating_score, topK=row['topK'][-1], matrix_Train=train)
            result = evaluate(prediction, test, row['metric'], row['topK'])

        result_dict = {'model': algorithm,
                       'rank': rank,
                       'beta': beta,
                       'lambda_l2': lamb_l2,
                       'lambda_keyphrase': lamb_keyphrase,
                       'lambda_latent': lamb_latent,
                       'lambda_rating': lamb_rating,
                       'learning_rate': learning_rate,
                       'epoch': epoch,
                       'corruption': corruption,
                       'optimizer': optimizer}

        for name in result.keys():
            result_dict[name] = [round(result[name][0], 4),
                                 round(result[name][1], 4)]

        output_df = output_df.append(result_dict, ignore_index=True)

        # Release the TF1 session and graph before the next model
        model.sess.close()
        tf.reset_default_graph()

        save_dataframe_csv(output_df, table_path, save_path)

    return output_df
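# `find_best_hyperparameters` is assumed to return one row per model holding
# the tuning winner (selected on NDCG above); each row must carry every column
# read in the loop. A hypothetical single-row example (model name and values
# are placeholders, not results from this codebase):
#
#   model   rank  beta  lambda_l2  lambda_keyphrase  lambda_latent  lambda_rating  learning_rate  epoch  corruption  optimizer
#   CE-VAE  200   0.2   0.0001     1.0               1.0            1.0            0.0001         300    0.3         Adam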
def hyper_parameter_tuning(train, validation, keyphrase_train, keyphrase_validation,
                           params, save_path, tune_explanation=False):
    progress = WorkSplitter()
    table_path = load_yaml('config/global.yml', key='path')['tables']

    try:
        df = load_dataframe_csv(table_path, save_path)
    except Exception:  # no previous results; start a fresh table
        df = pd.DataFrame(columns=['model', 'rank', 'beta', 'lambda_l2', 'lambda_keyphrase',
                                   'lambda_latent', 'lambda_rating', 'topK', 'learning_rate',
                                   'epoch', 'corruption', 'optimizer'])

    for algorithm in params['models']:
        for rank in params['rank']:
            for beta in params['beta']:
                for lamb_l2 in params['lambda_l2']:
                    for lamb_keyphrase in params['lambda_keyphrase']:
                        for lamb_latent in params['lambda_latent']:
                            for lamb_rating in params['lambda_rating']:
                                for learning_rate in params['learning_rate']:
                                    for epoch in params['epoch']:
                                        for corruption in params['corruption']:
                                            for optimizer in params['optimizer']:
                                                # Skip settings already in the saved results;
                                                # also only explore settings where the latent and
                                                # keyphrase loss weights are tied
                                                if ((df['model'] == algorithm) &
                                                        (df['rank'] == rank) &
                                                        (df['beta'] == beta) &
                                                        (df['lambda_l2'] == lamb_l2) &
                                                        (df['lambda_keyphrase'] == lamb_keyphrase) &
                                                        (df['lambda_latent'] == lamb_latent) &
                                                        (df['lambda_rating'] == lamb_rating) &
                                                        (df['learning_rate'] == learning_rate) &
                                                        (df['epoch'] == epoch) &
                                                        (df['corruption'] == corruption) &
                                                        (df['optimizer'] == optimizer)).any() \
                                                        or (lamb_latent != lamb_keyphrase):
                                                    continue

                                                format = "model: {}, rank: {}, beta: {}, lambda_l2: {}, " \
                                                         "lambda_keyphrase: {}, lambda_latent: {}, lambda_rating: {}, " \
                                                         "learning_rate: {}, epoch: {}, corruption: {}, optimizer: {}"
                                                progress.section(format.format(
                                                    algorithm, rank, beta, lamb_l2, lamb_keyphrase,
                                                    lamb_latent, lamb_rating, learning_rate, epoch,
                                                    corruption, optimizer))

                                                progress.subsection("Training")
                                                model = models[algorithm](
                                                    matrix_train=train,
                                                    epoch=epoch,
                                                    lamb_l2=lamb_l2,
                                                    lamb_keyphrase=lamb_keyphrase,
                                                    lamb_latent=lamb_latent,
                                                    lamb_rating=lamb_rating,
                                                    beta=beta,
                                                    learning_rate=learning_rate,
                                                    rank=rank,
                                                    corruption=corruption,
                                                    optimizer=optimizer,
                                                    matrix_train_keyphrase=keyphrase_train)

                                                progress.subsection("Prediction")
                                                rating_score, keyphrase_score = model.predict(train.todense())

                                                progress.subsection("Evaluation")
                                                if tune_explanation:
                                                    prediction = predict_keyphrase(keyphrase_score,
                                                                                   topK=params['topK'][-1])
                                                    result = evaluate(prediction, keyphrase_validation,
                                                                      params['metric'], params['topK'])
                                                else:
                                                    prediction = predict(rating_score,
                                                                         topK=params['topK'][-1],
                                                                         matrix_Train=train)
                                                    result = evaluate(prediction, validation,
                                                                      params['metric'], params['topK'])

                                                result_dict = {'model': algorithm,
                                                               'rank': rank,
                                                               'beta': beta,
                                                               'lambda_l2': lamb_l2,
                                                               'lambda_keyphrase': lamb_keyphrase,
                                                               'lambda_latent': lamb_latent,
                                                               'lambda_rating': lamb_rating,
                                                               'learning_rate': learning_rate,
                                                               'epoch': epoch,
                                                               'corruption': corruption,
                                                               'optimizer': optimizer}

                                                for name in result.keys():
                                                    result_dict[name] = [round(result[name][0], 4),
                                                                         round(result[name][1], 4)]

                                                df = df.append(result_dict, ignore_index=True)

                                                # Release the TF1 session and graph before the next setting
                                                model.sess.close()
                                                tf.reset_default_graph()

                                                save_dataframe_csv(df, table_path, save_path)
def general(num_users, num_items, user_col, item_col, rating_col, keyphrase_vector_col,
            df_train, df_test, keyphrase_names, params, save_path):
    progress = WorkSplitter()
    table_path = load_yaml('config/global.yml', key='path')['tables']
    df = find_best_hyperparameters(table_path + params['tuning_result_path'], 'NDCG')

    try:
        output_df = load_dataframe_csv(table_path, save_path)
    except Exception:  # no previous results; start a fresh table
        output_df = pd.DataFrame(columns=['model', 'rank', 'num_layers', 'train_batch_size',
                                          'predict_batch_size', 'lambda', 'topK', 'learning_rate',
                                          'epoch', 'negative_sampling_size'])

    for index, row in df.iterrows():
        algorithm = row['model']
        rank = row['rank']
        num_layers = row['num_layers']
        train_batch_size = row['train_batch_size']
        predict_batch_size = row['predict_batch_size']
        lamb = row['lambda']
        learning_rate = row['learning_rate']
        epoch = 300  # final models are retrained for a fixed number of epochs
        negative_sampling_size = row['negative_sampling_size']
        row['topK'] = [5, 10, 15, 20, 50]
        row['metric'] = ['R-Precision', 'NDCG', 'Clicks', 'Recall', 'Precision', 'MAP']

        format = "model: {0}, rank: {1}, num_layers: {2}, train_batch_size: {3}, " \
                 "predict_batch_size: {4}, lambda: {5}, learning_rate: {6}, " \
                 "epoch: {7}, negative_sampling_size: {8}"
        progress.section(format.format(algorithm, rank, num_layers, train_batch_size,
                                       predict_batch_size, lamb, learning_rate, epoch,
                                       negative_sampling_size))

        progress.subsection("Initializing Negative Sampler")
        negative_sampler = Negative_Sampler(df_train[[user_col, item_col, keyphrase_vector_col]],
                                            user_col, item_col, rating_col, keyphrase_vector_col,
                                            num_items=num_items,
                                            batch_size=train_batch_size,
                                            num_keyphrases=len(keyphrase_names),
                                            negative_sampling_size=negative_sampling_size)

        model = models[algorithm](num_users=num_users,
                                  num_items=num_items,
                                  text_dim=len(keyphrase_names),
                                  embed_dim=rank,
                                  num_layers=num_layers,
                                  negative_sampler=negative_sampler,
                                  lamb=lamb,
                                  learning_rate=learning_rate)

        progress.subsection("Training")
        pretrained_path = load_yaml('config/global.yml', key='path')['pretrained']
        # try:
        #     model.load_model(pretrained_path+params['tuning_result_path'], row['model'])
        # except:
        model.train_model(df_train, user_col, item_col, rating_col, epoch=epoch)
        # model.save_model(pretrained_path+params['tuning_result_path'], row['model'])

        progress.subsection("Prediction")
        prediction, explanation = predict_elementwise(model, df_train, user_col, item_col,
                                                      row['topK'][-1],
                                                      batch_size=row['predict_batch_size'],
                                                      enable_explanation=False,
                                                      keyphrase_names=keyphrase_names)

        R_test = to_sparse_matrix(df_test, num_users, num_items,
                                  user_col, item_col, rating_col)

        result = evaluate(prediction, R_test, row['metric'], row['topK'])

        # Not finished yet
        result_dict = {'model': row['model'],
                       'rank': row['rank'],
                       'num_layers': row['num_layers'],
                       'train_batch_size': row['train_batch_size'],
                       'predict_batch_size': row['predict_batch_size'],
                       'lambda': row['lambda'],
                       'topK': row['topK'][-1],
                       'learning_rate': row['learning_rate'],
                       'epoch': epoch,
                       'negative_sampling_size': row['negative_sampling_size']}

        for name in result.keys():
            result_dict[name] = round(result[name][0], 4)

        output_df = output_df.append(result_dict, ignore_index=True)

        # Release the TF1 session and graph before the next model
        model.sess.close()
        tf.reset_default_graph()

        save_dataframe_csv(output_df, table_path, save_path)

    return output_df