def main(args):
    progress = WorkSplitter()
    table_path = 'tables/'

    test = load_numpy(path=args.path, name=args.dataset + args.test)

    df = pd.DataFrame({
        'model': ['AutoRec', 'AutoRec', 'AutoRec',
                  'InitFeatureEmbedAE', 'InitFeatureEmbedAE', 'InitFeatureEmbedAE',
                  'AlterFeatureEmbedAE', 'ConcatFeatureEmbedAE', 'UnionSampleAE',
                  'WRSampleAE', 'BatchSampleAE', 'BridgeLabelAE', 'RefineLabelAE',
                  'DeepAutoRec', 'DeepAutoRec', 'SoftLabelAE', 'HintAE'],
        'way': [None, 'unif', 'combine',
                'user', 'item', 'both',
                None, None, None,
                None, None, None, None,
                None, 'unif', None, None]
    })

    progress.subsection("Reproduce")
    frame = []
    for idx, row in df.iterrows():
        row = row.to_dict()
        row['metric'] = ['NLL', 'AUC']
        row['rank'] = 200
        result = execute(test, row, folder=args.model_folder + args.dataset)
        frame.append(result)

    results = pd.concat(frame)
    save_dataframe_csv(results, table_path, args.name)
def main(args):
    table_path = load_yaml('config/global.yml', key='path')['tables']
    df = find_best_hyperparameters(table_path + args.tuning_result_path, 'MAP@10')

    R_train = load_numpy(path=args.path, name=args.train)
    R_valid = load_numpy(path=args.path, name=args.valid)
    R_test = load_numpy(path=args.path, name=args.test)
    R_train = R_train + R_valid

    # Optional binarization at the 3-star threshold:
    # R_train[(R_train <= 3).nonzero()] = 0
    # R_test[(R_test <= 3).nonzero()] = 0
    # R_train[(R_train > 3).nonzero()] = 1
    # R_test[(R_test > 3).nonzero()] = 1

    topK = [5, 10, 15, 20, 50]

    frame = []
    for idx, row in df.iterrows():
        start = timeit.default_timer()
        row = row.to_dict()
        row['metric'] = ['R-Precision', 'NDCG', 'Precision', 'Recall', 'MAP']
        row['topK'] = topK
        result = execute(R_train, R_test, row, models[row['model']])
        stop = timeit.default_timer()
        print('Time: ', stop - start)
        frame.append(result)

    results = pd.concat(frame)
    save_dataframe_csv(results, table_path, args.name)
def main(args):
    progress = WorkSplitter()
    table_path = 'tables/'

    test = load_numpy(path=args.path, name=args.dataset + args.test)

    df = pd.DataFrame({
        'model': ['BiasedMF', 'BiasedMF', 'BiasedMF', 'PropensityMF',
                  'InitFeatureEmbedMF', 'InitFeatureEmbedMF', 'InitFeatureEmbedMF',
                  'AlterFeatureEmbedMF', 'ConcatFeatureEmbedMF', 'CausalSampleMF',
                  'UnionSampleMF', 'WRSampleMF', 'BatchSampleMF',
                  'BridgeLabelMF', 'RefineLabelMF'],
        'way': [None, 'unif', 'combine', None,
                'user', 'item', 'both',
                None, None, None,
                None, None, None,
                None, None]
    })

    progress.subsection("Reproduce")
    frame = []
    for idx, row in df.iterrows():
        row = row.to_dict()
        row['metric'] = ['NLL', 'AUC']
        row['rank'] = 10
        result = execute(test, row, folder=args.model_folder + args.dataset)
        frame.append(result)

    results = pd.concat(frame)
    save_dataframe_csv(results, table_path, args.name)
def main(args):
    table_path = load_yaml('config/global.yml', key='path')['tables']
    df = find_best_hyperparameters(table_path + args.problem, 'NDCG')

    R_train = load_numpy(path=args.path, name=args.train)
    R_valid = load_numpy(path=args.path, name=args.valid)
    R_test = load_numpy(path=args.path, name=args.test)
    R_train = R_train + R_valid

    topK = [5, 10, 15, 20, 50]

    frame = []
    for idx, row in df.iterrows():
        start = timeit.default_timer()
        row = row.to_dict()
        row['metric'] = ['R-Precision', 'NDCG', 'Precision', 'Recall', 'MAP']
        row['topK'] = topK
        result = execute(R_train, R_test, row, models[row['model']], gpu_on=args.gpu)
        stop = timeit.default_timer()
        print('Time: ', stop - start)
        frame.append(result)

    results = pd.concat(frame)
    save_dataframe_csv(results, table_path, args.name)
def main(args):
    progress = WorkSplitter()
    table_path = 'tables/'

    test = load_numpy(path=args.path, name=args.dataset + args.test)

    df = pd.DataFrame({
        'model': ['RestrictedBatchSampleMF'] * 5,
        'way': [None, 'head_users', 'tail_users', 'head_items', 'tail_items']
    })

    progress.subsection("Gain Analysis")
    frame = []
    for idx, row in df.iterrows():
        row = row.to_dict()
        row['metric'] = ['NLL', 'AUC']
        row['rank'] = 10
        result = execute(test, row, folder=args.model_folder + args.dataset)
        frame.append(result)

    results = pd.concat(frame)
    save_dataframe_csv(results, table_path, args.name)
def main(args):
    table_path = load_yaml('config/global.yml', key='path')['tables']
    df = find_best_hyperparameters(table_path + args.tuning_result_path, 'NDCG')

    R_train = load_numpy(path=args.data_dir, name=args.train_set)
    R_valid = load_numpy(path=args.data_dir, name=args.valid_set)
    R_test = load_numpy(path=args.data_dir, name=args.test_set)
    R_train = R_train + R_valid

    topK = [5, 10, 15, 20, 50]

    frame = []
    for idx, row in df.iterrows():
        start = timeit.default_timer()
        row = row.to_dict()
        row['metric'] = ['R-Precision', 'NDCG', 'Precision', 'Recall', 'MAP']
        row['topK'] = topK
        result = general(R_train,
                         R_test,
                         row,
                         models[row['model']],
                         measure=row['similarity'],
                         gpu_on=args.gpu,
                         model_folder=args.model_folder)
        stop = timeit.default_timer()
        print('Time: ', stop - start)
        frame.append(result)

    results = pd.concat(frame)
    save_dataframe_csv(results, table_path, args.save_path)
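# The start/stop timeit pattern in the drivers above repeats verbatim. A
# minimal context-manager sketch that would factor it out; `timed` is a
# hypothetical helper, not part of this repo:
from contextlib import contextmanager
import timeit

@contextmanager
def timed(label='Time'):
    # Prints elapsed wall-clock time for the enclosed block
    start = timeit.default_timer()
    yield
    print('{}: {}'.format(label, timeit.default_timer() - start))

# Usage sketch:
# with timed():
#     result = execute(R_train, R_test, row, models[row['model']])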
def main(args):
    # Progress bar
    progress = WorkSplitter()

    # Show hyperparameter settings
    progress.section("Parameter Setting")
    print("Data Directory: {}".format(args.data_dir))
    print("Number of Users Sampled: {}".format(args.num_users_sampled))
    print("Number of Items Sampled: {}".format(args.num_items_sampled))
    print("Number of Max Allowed Iterations: {}".format(args.max_iteration_threshold))
    print("Critiquing Model: {}".format(args.critiquing_model_name))

    R_train = load_numpy(path=args.data_dir, name=args.train_set)
    print("Train U-I Dimensions: {}".format(R_train.shape))

    R_test = load_numpy(path=args.data_dir, name=args.test_set)
    print("Test U-I Dimensions: {}".format(R_test.shape))

    R_train_keyphrase = load_numpy(path=args.data_dir, name=args.train_keyphrase_set).toarray()
    print("Train User Keyphrase U-I Dimensions: {}".format(R_train_keyphrase.shape))

    R_train_item_keyphrase = load_numpy(path=args.data_dir, name=args.train_item_keyphrase_set).toarray()

    table_path = load_yaml('config/global.yml', key='path')['tables']
    parameters = find_best_hyperparameters(table_path + args.dataset_name, 'NDCG')
    parameters_row = parameters.loc[parameters['model'] == args.model]

    if args.dataset_name == "yelp/":
        R_train_item_keyphrase = R_train_item_keyphrase.T

    start_time = time.time()
    results = critiquing(matrix_Train=R_train,
                         matrix_Test=R_test,
                         keyphrase_freq=R_train_keyphrase,
                         item_keyphrase_freq=R_train_item_keyphrase,
                         num_users_sampled=args.num_users_sampled,
                         num_items_sampled=args.num_items_sampled,
                         max_iteration_threshold=args.max_iteration_threshold,
                         dataset_name=args.dataset_name,
                         model=models[args.model],
                         parameters_row=parameters_row,
                         critiquing_model_name=args.critiquing_model_name,
                         keyphrase_selection_method=args.keyphrase_selection_method,
                         topk=args.topk,
                         lamb=args.lamb)
    print("Final Time Elapsed: {}".format(inhour(time.time() - start_time)))

    save_dataframe_csv(results, table_path, args.save_path)
def main(args):
    # Progress bar
    progress = WorkSplitter()

    # Show hyperparameter settings
    progress.section("Parameter Setting")
    print("Data Directory: {}".format(args.data_dir))
    print("Number of Users Sampled: {}".format(args.num_users_sampled))
    print("Number of Items Sampled: {}".format(args.num_items_sampled))
    print("Number of Max Allowed Iterations: {}".format(args.max_iteration_threshold))
    print("Critiquing Model: {}".format(args.critiquing_model_name))

    R_train = load_numpy(path=args.data_dir, name=args.train_set)
    print("Train U-I Dimensions: {}".format(R_train.shape))

    R_test = load_numpy(path=args.data_dir, name=args.test_set)
    print("Test U-I Dimensions: {}".format(R_test.shape))

    R_train_keyphrase = load_numpy(path=args.data_dir, name=args.train_keyphrase_set).toarray()
    print("Train User Keyphrase U-I Dimensions: {}".format(R_train_keyphrase.shape))

    R_train_item_keyphrase = load_numpy(path=args.data_dir, name=args.train_item_keyphrase_set).toarray()

    table_path = load_yaml('config/global.yml', key='path')['tables']

    # Hyperparameters are fixed here instead of being read from the tuning results:
    # parameters = find_best_hyperparameters(table_path + args.dataset_name, 'NDCG')
    # parameters_row = parameters.loc[parameters['model'] == args.model]
    parameters_row = {'iter': 10, 'lambda': 200, 'rank': 200}

    keyphrases_names = load_dataframe_csv(path=args.data_dir, name="Keyphrases.csv")['Phrases'].tolist()

    results = critiquing(matrix_Train=R_train,
                         matrix_Test=R_test,
                         keyphrase_freq=R_train_keyphrase,
                         item_keyphrase_freq=R_train_item_keyphrase.T,
                         num_users_sampled=args.num_users_sampled,
                         num_items_sampled=args.num_items_sampled,
                         max_iteration_threshold=args.max_iteration_threshold,
                         dataset_name=args.dataset_name,
                         model=models[args.model],
                         parameters_row=parameters_row,
                         critiquing_model_name=args.critiquing_model_name,
                         lamb=args.lambdas,
                         keyphrases_names=keyphrases_names,
                         keyphrase_selection_method=args.keyphrase_selection_method)

    save_dataframe_csv(results, table_path, args.save_path)
def main(args):
    # Progress bar
    progress = WorkSplitter()

    # Show hyperparameter settings
    progress.section("Parameter Setting")
    print("Data Directory: {}".format(args.data_dir))
    print("Number of Users Sampled: {}".format(args.num_users_sampled))
    print("Number of Items Sampled: {}".format(args.num_items_sampled))
    print("Number of Max Allowed Iterations: {}".format(args.max_iteration_threshold))
    print("Critiquing Model: {}".format(args.critiquing_model_name))

    R_train = load_numpy(path=args.data_dir, name=args.train_set)
    print("Train U-I Dimensions: {}".format(R_train.shape))

    R_test = load_numpy(path=args.data_dir, name=args.test_set)
    print("Test U-I Dimensions: {}".format(R_test.shape))

    R_train_keyphrase = load_numpy(path=args.data_dir, name=args.train_keyphrase_set).toarray()
    print("Train User Keyphrase U-I Dimensions: {}".format(R_train_keyphrase.shape))

    R_train_item_keyphrase = load_numpy(path=args.data_dir, name=args.train_item_keyphrase_set).toarray()
    print("Train Item Keyphrase U-I Dimensions: {}".format(R_train_item_keyphrase.shape))

    # Hyperparameters are fixed here instead of being read from the tuning results:
    # table_path = load_yaml('config/global.yml', key='path')['tables']
    # parameters = find_best_hyperparameters(table_path + args.dataset_name, 'NDCG')
    # parameters_row = parameters.loc[parameters['model'] == args.model]
    parameters_row = pd.DataFrame({'iter': [4], 'lambda': [80], 'rank': [200]})

    results = critiquing(matrix_Train=R_train,
                         matrix_Test=R_test,
                         keyphrase_freq=R_train_keyphrase,
                         item_keyphrase_freq=R_train_item_keyphrase,
                         num_users_sampled=args.num_users_sampled,
                         num_items_sampled=args.num_items_sampled,
                         max_iteration_threshold=args.max_iteration_threshold,
                         dataset_name=args.dataset_name,
                         model=models[args.model],
                         parameters_row=parameters_row,
                         critiquing_model_name=args.critiquing_model_name)

    # table_path = load_yaml('config/global.yml', key='path')['tables']
    table_path = '/home/shuyang/data4/LatentLinearCritiquingforConvRecSys/'
    save_dataframe_csv(results, table_path, args.save_path)
def hyper_parameter_tuning(train, validation, params, save_path):
    progress = WorkSplitter()
    table_path = load_yaml('config/global.yml', key='path')['tables']

    try:
        df = load_dataframe_csv(table_path, save_path)
    except:
        df = pd.DataFrame(columns=['model', 'k', 'topK'])

    num_user = train.shape[0]

    for algorithm in params['models']:
        for k in params['k']:
            if ((df['model'] == algorithm) & (df['k'] == k)).any():
                continue

            format = "model: {}, k: {}"
            progress.section(format.format(algorithm, k))

            progress.subsection("Training")
            model = params['models'][algorithm]()
            model.train(train)

            progress.subsection("Prediction")
            prediction_score = model.predict(train, k=k)
            prediction = predict(prediction_score=prediction_score,
                                 topK=params['topK'][-1],
                                 matrix_Train=train)

            progress.subsection("Evaluation")
            result = evaluate(prediction, validation, params['metric'], params['topK'])

            result_dict = {'model': algorithm, 'k': k}
            for name in result.keys():
                result_dict[name] = [round(result[name][0], 4),
                                     round(result[name][1], 4)]

            df = df.append(result_dict, ignore_index=True)
            save_dataframe_csv(df, table_path, save_path)
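# Re-running the function above resumes where it left off: the results CSV is
# reloaded and any (model, k) combination already recorded is skipped. A
# hedged sketch of that check as a reusable helper; `already_evaluated` is
# hypothetical, the repo inlines this logic instead:
import pandas as pd

def already_evaluated(df, **setting):
    """True if a row matching every key/value pair in `setting` exists."""
    if df.empty:
        return False
    mask = pd.Series(True, index=df.index)
    for key, value in setting.items():
        mask &= (df[key] == value)
    return bool(mask.any())

# e.g. already_evaluated(df, model=algorithm, k=k) would replace the inline test.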
def explanation_converge(num_users, num_items, user_col, item_col, rating_col,
                         keyphrase_vector_col, df_train, df_test, keyphrase_names,
                         df, table_path, file_name, epoch=10):
    progress = WorkSplitter()

    results = pd.DataFrame(columns=['model', 'rank', 'num_layers', 'train_batch_size',
                                    'predict_batch_size', 'lambda', 'topK', 'learning_rate',
                                    'epoch', 'negative_sampling_size', 'optimizer'])

    for run in range(3):
        for idx, row in df.iterrows():
            row = row.to_dict()
            if row['model'] not in explanable_models:
                continue

            progress.section(json.dumps(row))
            row['metric'] = ['NDCG', 'Recall', 'Precision', 'MAP']
            row['topK'] = [10]
            if 'optimizer' not in row.keys():
                row['optimizer'] = 'Adam'

            # Explanation quality is not sensitive to negative samples,
            # so a sampling size of 1 is sufficient here
            negative_sampler = Negative_Sampler(df_train[[user_col, item_col, keyphrase_vector_col]],
                                                user_col, item_col, rating_col, keyphrase_vector_col,
                                                num_items=num_items,
                                                batch_size=row['train_batch_size'],
                                                num_keyphrases=len(keyphrase_names),
                                                negative_sampling_size=1)

            model = models[row['model']](num_users=num_users,
                                         num_items=num_items,
                                         text_dim=len(keyphrase_names),
                                         embed_dim=row['rank'],
                                         num_layers=row['num_layers'],
                                         negative_sampler=negative_sampler,
                                         lamb=row['lambda'],
                                         learning_rate=row['learning_rate'])

            batches = negative_sampler.get_batches()

            epoch_batch = 10
            for i in range(epoch // epoch_batch):
                # Only the first chunk initializes the embeddings
                model.train_model(df_train, user_col, item_col, rating_col,
                                  epoch=epoch_batch, batches=batches,
                                  init_embedding=(i == 0))

                df_valid_explanation = predict_explanation(model, df_test, user_col, item_col,
                                                           topk_keyphrase=row['topK'][0])

                result = evaluate_explanation(df_valid_explanation, df_test,
                                              row['metric'], row['topK'],
                                              user_col, item_col, rating_col,
                                              keyphrase_vector_col)

                # NOTE: not finished yet
                result_dict = {'model': row['model'],
                               'rank': row['rank'],
                               'num_layers': row['num_layers'],
                               'train_batch_size': row['train_batch_size'],
                               'predict_batch_size': row['predict_batch_size'],
                               'lambda': row['lambda'],
                               'topK': row['topK'][0],
                               'learning_rate': row['learning_rate'],
                               'epoch': (i + 1) * epoch_batch,
                               'negative_sampling_size': row['negative_sampling_size'],
                               'optimizer': row['optimizer']}

                for name in result.keys():
                    result_dict[name] = round(result[name][0], 4)

                results = results.append(result_dict, ignore_index=True)
                print("result is \n {}".format(results))

            model.sess.close()
            tf.reset_default_graph()

    save_dataframe_csv(results, table_path, file_name)
    return results
def explanation_parameter_tuning(num_users, num_items, user_col, item_col, rating_col,
                                 keyphrase_vector_col, df_train, df_valid, keyphrase_names,
                                 params, save_path):
    progress = WorkSplitter()
    table_path = load_yaml('config/global.yml', key='path')['tables']

    try:
        df = load_dataframe_csv(table_path, save_path)
    except:
        df = pd.DataFrame(columns=['model', 'rank', 'num_layers', 'train_batch_size',
                                   'predict_batch_size', 'lambda', 'topK', 'learning_rate',
                                   'epoch', 'negative_sampling_size'])

    for algorithm in params['models']:
        for rank in params['rank']:
            for num_layers in params['num_layers']:
                for train_batch_size in params['train_batch_size']:
                    for predict_batch_size in params['predict_batch_size']:
                        for lamb in params['lambda']:
                            for learning_rate in params['learning_rate']:
                                for epoch in params['epoch']:
                                    for negative_sampling_size in params['negative_sampling_size']:
                                        if ((df['model'] == algorithm) &
                                                (df['rank'] == rank) &
                                                (df['num_layers'] == num_layers) &
                                                (df['train_batch_size'] == train_batch_size) &
                                                (df['predict_batch_size'] == predict_batch_size) &
                                                (df['lambda'] == lamb) &
                                                (df['learning_rate'] == learning_rate) &
                                                (df['epoch'] == epoch) &
                                                (df['negative_sampling_size'] == negative_sampling_size)).any():
                                            continue

                                        format = ("model: {0}, rank: {1}, num_layers: {2}, "
                                                  "train_batch_size: {3}, predict_batch_size: {4}, "
                                                  "lambda: {5}, learning_rate: {6}, epoch: {7}, "
                                                  "negative_sampling_size: {8}")
                                        progress.section(format.format(algorithm, rank, num_layers,
                                                                       train_batch_size, predict_batch_size,
                                                                       lamb, learning_rate, epoch,
                                                                       negative_sampling_size))

                                        progress.subsection("Initializing Negative Sampler")
                                        negative_sampler = Negative_Sampler(
                                            df_train[[user_col, item_col, keyphrase_vector_col]],
                                            user_col, item_col, rating_col, keyphrase_vector_col,
                                            num_items=num_items,
                                            batch_size=train_batch_size,
                                            num_keyphrases=len(keyphrase_names),
                                            negative_sampling_size=negative_sampling_size)

                                        model = params['models'][algorithm](
                                            num_users=num_users,
                                            num_items=num_items,
                                            text_dim=len(keyphrase_names),
                                            embed_dim=rank,
                                            num_layers=num_layers,
                                            negative_sampler=negative_sampler,
                                            lamb=lamb,
                                            learning_rate=learning_rate)

                                        progress.subsection("Training")
                                        model.train_model(df_train, user_col, item_col,
                                                          rating_col, epoch=epoch)

                                        progress.subsection("Prediction")
                                        df_valid_explanation = predict_explanation(
                                            model, df_valid, user_col, item_col,
                                            topk_keyphrase=params['topK'][-1])

                                        progress.subsection("Evaluation")
                                        explanation_result = evaluate_explanation(
                                            df_valid_explanation, df_valid, params['metric'],
                                            params['topK'], user_col, item_col, rating_col,
                                            keyphrase_vector_col)

                                        result_dict = {'model': algorithm,
                                                       'rank': rank,
                                                       'num_layers': num_layers,
                                                       'train_batch_size': train_batch_size,
                                                       'predict_batch_size': predict_batch_size,
                                                       'lambda': lamb,
                                                       'learning_rate': learning_rate,
                                                       'epoch': epoch,
                                                       'negative_sampling_size': negative_sampling_size}

                                        for name in explanation_result.keys():
                                            result_dict[name] = [round(explanation_result[name][0], 4),
                                                                 round(explanation_result[name][1], 4)]

                                        df = df.append(result_dict, ignore_index=True)

                                        model.sess.close()
                                        tf.reset_default_graph()

                                        save_dataframe_csv(df, table_path, save_path)
def hyper_parameter_tuning(train, validation, params, save_path, measure='Cosine', gpu_on=True):
    progress = WorkSplitter()
    table_path = load_yaml('config/global.yml', key='path')['tables']

    try:
        df = load_dataframe_csv(table_path, save_path)
    except:
        df = pd.DataFrame(columns=['model', 'similarity', 'alpha', 'batch_size',
                                   'corruption', 'epoch', 'iteration', 'key_dimension',
                                   'lambda', 'learning_rate', 'mode_dimension',
                                   'normalize', 'rank', 'root', 'topK'])

    num_user = train.shape[0]

    for algorithm in params['models']:
        for alpha in params['alpha']:
            for batch_size in params['batch_size']:
                for corruption in params['corruption']:
                    for epoch in params['epoch']:
                        for iteration in params['iteration']:
                            for key_dim in params['key_dimension']:
                                for lamb in params['lambda']:
                                    for learning_rate in params['learning_rate']:
                                        for mode_dim in params['mode_dimension']:
                                            for rank in params['rank']:
                                                for root in params['root']:
                                                    if ((df['model'] == algorithm) &
                                                            (df['alpha'] == alpha) &
                                                            (df['batch_size'] == batch_size) &
                                                            (df['corruption'] == corruption) &
                                                            (df['epoch'] == epoch) &
                                                            (df['iteration'] == iteration) &
                                                            (df['key_dimension'] == key_dim) &
                                                            (df['lambda'] == lamb) &
                                                            (df['learning_rate'] == learning_rate) &
                                                            (df['mode_dimension'] == mode_dim) &
                                                            (df['rank'] == rank) &
                                                            (df['root'] == root)).any():
                                                        continue

                                                    format = ("model: {}, alpha: {}, batch_size: {}, "
                                                              "corruption: {}, epoch: {}, iteration: {}, "
                                                              "key_dimension: {}, lambda: {}, "
                                                              "learning_rate: {}, mode_dimension: {}, "
                                                              "rank: {}, root: {}")
                                                    progress.section(format.format(
                                                        algorithm, alpha, batch_size, corruption,
                                                        epoch, iteration, key_dim, lamb,
                                                        learning_rate, mode_dim, rank, root))

                                                    RQ, Yt, Bias = params['models'][algorithm](
                                                        train,
                                                        embedded_matrix=np.empty((0)),
                                                        mode_dim=mode_dim,
                                                        key_dim=key_dim,
                                                        batch_size=batch_size,
                                                        learning_rate=learning_rate,
                                                        iteration=iteration,
                                                        epoch=epoch,
                                                        rank=rank,
                                                        corruption=corruption,
                                                        gpu_on=gpu_on,
                                                        lamb=lamb,
                                                        alpha=alpha,
                                                        root=root)
                                                    Y = Yt.T

                                                    progress.subsection("Prediction")
                                                    prediction = predict(matrix_U=RQ,
                                                                         matrix_V=Y,
                                                                         bias=Bias,
                                                                         topK=params['topK'][-1],
                                                                         matrix_Train=train,
                                                                         measure=measure,
                                                                         gpu=gpu_on)

                                                    progress.subsection("Evaluation")
                                                    result = evaluate(prediction, validation,
                                                                      params['metric'], params['topK'])

                                                    result_dict = {'model': algorithm,
                                                                   'alpha': alpha,
                                                                   'batch_size': batch_size,
                                                                   'corruption': corruption,
                                                                   'epoch': epoch,
                                                                   'iteration': iteration,
                                                                   'key_dimension': key_dim,
                                                                   'lambda': lamb,
                                                                   'learning_rate': learning_rate,
                                                                   'mode_dimension': mode_dim,
                                                                   'rank': rank,
                                                                   'similarity': params['similarity'],
                                                                   'root': root}

                                                    for name in result.keys():
                                                        result_dict[name] = [round(result[name][0], 4),
                                                                             round(result[name][1], 4)]

                                                    df = df.append(result_dict, ignore_index=True)
                                                    save_dataframe_csv(df, table_path, save_path)
def hyper_parameter_tuning(train, validation, params, save_path, measure='Cosine', gpu_on=True):
    progress = WorkSplitter()
    table_path = load_yaml('config/global.yml', key='path')['tables']

    try:
        df = load_dataframe_csv(table_path, save_path)
    except:
        df = pd.DataFrame(columns=['model', 'rank', 'alpha', 'lambda', 'iter',
                                   'similarity', 'corruption', 'root', 'topK'])

    num_user = train.shape[0]

    for algorithm in params['models']:
        for rank in params['rank']:
            for alpha in params['alpha']:
                for lam in params['lambda']:
                    for corruption in params['corruption']:
                        for root in params['root']:
                            if ((df['model'] == algorithm) &
                                    (df['rank'] == rank) &
                                    (df['alpha'] == alpha) &
                                    (df['lambda'] == lam) &
                                    (df['corruption'] == corruption) &
                                    (df['root'] == root)).any():
                                continue

                            format = "model: {0}, rank: {1}, alpha: {2}, lambda: {3}, corruption: {4}, root: {5}"
                            progress.section(format.format(algorithm, rank, alpha, lam, corruption, root))

                            RQ, Yt, Bias = params['models'][algorithm](train,
                                                                       embeded_matrix=np.empty((0)),
                                                                       iteration=params['iter'],
                                                                       rank=rank,
                                                                       lam=lam,
                                                                       alpha=alpha,
                                                                       corruption=corruption,
                                                                       root=root,
                                                                       gpu_on=gpu_on)
                            Y = Yt.T

                            progress.subsection("Prediction")
                            prediction = predict(matrix_U=RQ,
                                                 matrix_V=Y,
                                                 measure=measure,
                                                 bias=Bias,
                                                 topK=params['topK'][-1],
                                                 matrix_Train=train,
                                                 gpu=gpu_on)

                            progress.subsection("Evaluation")
                            result = evaluate(prediction, validation, params['metric'], params['topK'])

                            result_dict = {'model': algorithm,
                                           'rank': rank,
                                           'alpha': alpha,
                                           'lambda': lam,
                                           'iter': params['iter'],
                                           'similarity': params['similarity'],
                                           'corruption': corruption,
                                           'root': root}

                            for name in result.keys():
                                result_dict[name] = [round(result[name][0], 4),
                                                     round(result[name][1], 4)]

                            df = df.append(result_dict, ignore_index=True)
                            save_dataframe_csv(df, table_path, save_path)
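# The tuning functions above walk a full hyperparameter grid with six to
# twelve nested for-loops. An equivalent, flatter sketch using
# itertools.product; `iter_grid` is an illustrative name, not part of this
# repo:
import itertools

def iter_grid(params, keys):
    """Yield one {key: value} dict per point of the hyperparameter grid."""
    for values in itertools.product(*(params[k] for k in keys)):
        yield dict(zip(keys, values))

# e.g. for setting in iter_grid(params, ['rank', 'alpha', 'lambda',
#                                        'corruption', 'root']):
#          ...
# keeps the loop body at one indentation level regardless of grid dimensionality.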
def converge(Rtrain, Rtest, df, table_path, file_name, epochs=10, gpu_on=True):
    progress = WorkSplitter()
    m, n = Rtrain.shape

    valid_models = autoencoders.keys()

    results = pd.DataFrame(columns=['model', 'rank', 'lambda', 'epoch', 'optimizer'])

    for run in range(3):
        for idx, row in df.iterrows():
            row = row.to_dict()
            if row['model'] not in valid_models:
                continue

            progress.section(json.dumps(row))
            row['metric'] = ['NDCG', 'R-Precision']
            row['topK'] = [50]
            if 'optimizer' not in row.keys():
                row['optimizer'] = 'RMSProp'

            # Some autoencoder constructors take (n, rank); others take (m, n, rank)
            try:
                model = autoencoders[row['model']](n, row['rank'],
                                                   batch_size=100,
                                                   lamb=row['lambda'],
                                                   optimizer=Regularizer[row['optimizer']])
            except:
                model = autoencoders[row['model']](m, n, row['rank'],
                                                   batch_size=100,
                                                   lamb=row['lambda'],
                                                   optimizer=Regularizer[row['optimizer']])

            batches = model.get_batches(Rtrain, 100)

            epoch_batch = 50
            for i in range(epochs // epoch_batch):
                model.train_model(Rtrain,
                                  corruption=row['corruption'],
                                  epoch=epoch_batch,
                                  batches=batches)

                RQ = model.get_RQ(Rtrain)
                Y = model.get_Y().T
                Bias = model.get_Bias()

                prediction = predict(matrix_U=RQ,
                                     matrix_V=Y,
                                     bias=Bias,
                                     topK=row['topK'][0],
                                     matrix_Train=Rtrain,
                                     measure='Cosine',
                                     gpu=gpu_on)

                result = evaluate(prediction, Rtest, row['metric'], row['topK'])

                # NOTE: not finished yet
                result_dict = {'model': row['model'],
                               'rank': row['rank'],
                               'lambda': row['lambda'],
                               'optimizer': row['optimizer'],
                               'epoch': (i + 1) * epoch_batch}

                for name in result.keys():
                    result_dict[name] = round(result[name][0], 4)

                results = results.append(result_dict, ignore_index=True)

            model.sess.close()
            tf.reset_default_graph()

    save_dataframe_csv(results, table_path, file_name)
    return results
def hyper_parameter_tuning(train, validation, params, unif_train, save_path, seed, way, dataset, gpu_on):
    progress = WorkSplitter()
    table_path = 'tables/'

    data_name = save_path.split('/')[0]
    save_dir = 'tables/' + data_name + '/'
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    for algorithm in params['models']:
        if algorithm in ['AutoRec']:
            df = pd.DataFrame(columns=['model', 'rank', 'batch_size', 'lambda', 'iter'])
            for rank in params['rank']:
                for batch_size in params['batch_size']:
                    for lam in params['lambda']:
                        format = "model: {0}, rank: {1}, batch_size: {2}, lambda: {3}"
                        progress.section(format.format(algorithm, rank, batch_size, lam))

                        RQ, X, xBias, Y, yBias = params['models'][algorithm](
                            train, validation,
                            matrix_unif_train=unif_train,
                            iteration=params['iter'],
                            rank=rank, gpu_on=gpu_on, lam=lam, seed=seed,
                            batch_size=batch_size, way=way, dataset=dataset)

                        progress.subsection("Prediction")
                        prediction = predict(matrix_U=RQ, matrix_V=Y.T, matrix_Valid=validation,
                                             bias=yBias, gpu=gpu_on)

                        progress.subsection("Evaluation")
                        result = evaluate(prediction, validation, params['metric'], gpu=gpu_on)

                        result_dict = {'model': algorithm, 'rank': rank,
                                       'batch_size': batch_size, 'lambda': lam,
                                       'iter': params['iter']}
                        for name in result.keys():
                            result_dict[name] = round(result[name][0], 8)
                        df = df.append(result_dict, ignore_index=True)
                        save_dataframe_csv(df, table_path, save_path)

        elif algorithm in ['InitFeatureEmbedAE', 'ConcatFeatureEmbedAE']:
            df = pd.DataFrame(columns=['model', 'batch_size', 'lambda', 'iter'])
            for batch_size in params['batch_size']:
                for lam in params['lambda']:
                    format = "model: {0}, batch_size: {1}, lambda: {2}"
                    progress.section(format.format(algorithm, batch_size, lam))

                    RQ, X, xBias, Y, yBias = params['models'][algorithm](
                        train, validation,
                        matrix_unif_train=unif_train,
                        iteration=params['iter'],
                        rank=params['rank'], gpu_on=gpu_on, lam=lam, seed=seed,
                        batch_size=batch_size, way=way, dataset=dataset)

                    progress.subsection("Prediction")
                    prediction = predict(matrix_U=RQ, matrix_V=Y.T, matrix_Valid=validation,
                                         bias=yBias, gpu=gpu_on)

                    progress.subsection("Evaluation")
                    result = evaluate(prediction, validation, params['metric'], gpu=gpu_on)

                    result_dict = {'model': algorithm, 'batch_size': batch_size,
                                   'lambda': lam, 'iter': params['iter']}
                    for name in result.keys():
                        result_dict[name] = round(result[name][0], 8)
                    df = df.append(result_dict, ignore_index=True)
                    save_dataframe_csv(df, table_path, save_path)

        elif algorithm in ['UnionSampleAE', 'RefineLabelAE']:
            df = pd.DataFrame(columns=['model', 'confidence', 'iter'])
            for conf in params['confidence']:
                format = "model: {0}, confidence: {1}"
                progress.section(format.format(algorithm, conf))

                RQ, X, xBias, Y, yBias = params['models'][algorithm](
                    train, validation,
                    matrix_unif_train=unif_train,
                    iteration=params['iter'],
                    rank=params['rank'], gpu_on=gpu_on, lam=params['lambda'], seed=seed,
                    batch_size=params['batch_size'], way=way, confidence=conf, dataset=dataset)

                progress.subsection("Prediction")
                prediction = predict(matrix_U=RQ, matrix_V=Y.T, matrix_Valid=validation,
                                     bias=yBias, gpu=gpu_on)

                progress.subsection("Evaluation")
                result = evaluate(prediction, validation, params['metric'], gpu=gpu_on)

                result_dict = {'model': algorithm, 'confidence': conf, 'iter': params['iter']}
                for name in result.keys():
                    result_dict[name] = round(result[name][0], 8)
                df = df.append(result_dict, ignore_index=True)
                save_dataframe_csv(df, table_path, save_path)

        elif algorithm in ['BatchSampleAE']:
            df = pd.DataFrame(columns=['model', 'step', 'iter'])
            for step in params['step']:
                format = "model: {0}, step: {1}"
                progress.section(format.format(algorithm, step))

                RQ, X, xBias, Y, yBias = params['models'][algorithm](
                    train, validation,
                    matrix_unif_train=unif_train,
                    iteration=params['iter'],
                    rank=params['rank'], gpu_on=gpu_on, lam=params['lambda'], seed=seed,
                    batch_size=params['batch_size'], way=way, step=step, dataset=dataset)

                progress.subsection("Prediction")
                prediction = predict(matrix_U=RQ, matrix_V=Y.T, matrix_Valid=validation,
                                     bias=yBias, gpu=gpu_on)

                progress.subsection("Evaluation")
                result = evaluate(prediction, validation, params['metric'], gpu=gpu_on)

                result_dict = {'model': algorithm, 'step': step, 'iter': params['iter']}
                for name in result.keys():
                    result_dict[name] = round(result[name][0], 8)
                df = df.append(result_dict, ignore_index=True)
                save_dataframe_csv(df, table_path, save_path)

        elif algorithm in ['BridgeLabelAE']:
            df = pd.DataFrame(columns=['model', 'lambda', 'lambda2', 'iter'])
            for lam in params['lambda']:
                for lam2 in params['lambda2']:
                    format = "model: {0}, lambda: {1}, lambda2: {2}"
                    progress.section(format.format(algorithm, lam, lam2))

                    RQ, X, xBias, Y, yBias = params['models'][algorithm](
                        train, validation,
                        matrix_unif_train=unif_train,
                        iteration=params['iter'],
                        rank=params['rank'], gpu_on=gpu_on, lam=lam, lam2=lam2, seed=seed,
                        batch_size=params['batch_size'], way=way, dataset=dataset)

                    progress.subsection("Prediction")
                    prediction = predict(matrix_U=RQ, matrix_V=Y.T, matrix_Valid=validation,
                                         bias=yBias, gpu=gpu_on)

                    progress.subsection("Evaluation")
                    result = evaluate(prediction, validation, params['metric'], gpu=gpu_on)

                    result_dict = {'model': algorithm, 'lambda': lam, 'lambda2': lam2,
                                   'iter': params['iter']}
                    for name in result.keys():
                        result_dict[name] = round(result[name][0], 8)
                    df = df.append(result_dict, ignore_index=True)
                    save_dataframe_csv(df, table_path, save_path)

        elif algorithm in ['SoftLabelAE']:
            df = pd.DataFrame(columns=['model', 'confidence', 'tau', 'iter'])
            for conf in params['confidence']:
                for tau in params['tau']:
                    format = "model: {0}, confidence: {1}, tau: {2}"
                    progress.section(format.format(algorithm, conf, tau))

                    RQ, X, xBias, Y, yBias, Z, zBias, K, kBias = params['models'][algorithm](
                        train, validation,
                        matrix_unif_train=unif_train,
                        iteration=params['iter'],
                        rank=params['rank'], rank2=params['rank2'],
                        gpu_on=gpu_on, lam=params['lambda'], seed=seed,
                        batch_size=params['batch_size'], confidence=conf, tau=tau,
                        dataset=dataset)

                    progress.subsection("Prediction")
                    prediction = predict(matrix_U=RQ, matrix_V=K.T, matrix_Valid=validation,
                                         bias=yBias, gpu=gpu_on)

                    progress.subsection("Evaluation")
                    result = evaluate(prediction, validation, params['metric'], gpu=gpu_on)

                    result_dict = {'model': algorithm, 'confidence': conf, 'tau': tau,
                                   'iter': params['iter']}
                    for name in result.keys():
                        result_dict[name] = round(result[name][0], 8)
                    df = df.append(result_dict, ignore_index=True)
                    save_dataframe_csv(df, table_path, save_path)

        elif algorithm in ['HintAE']:
            df = pd.DataFrame(columns=['model', 'confidence', 'iter'])
            for conf in params['confidence']:
                format = "model: {0}, confidence: {1}"
                progress.section(format.format(algorithm, conf))

                RQ, X, xBias, Y, yBias, Z, zBias, K, kBias = params['models'][algorithm](
                    train, validation,
                    matrix_unif_train=unif_train,
                    iteration=params['iter'],
                    rank=params['rank'], rank2=params['rank2'],
                    gpu_on=gpu_on, lam=params['lambda'], seed=seed,
                    batch_size=params['batch_size'], confidence=conf, dataset=dataset)

                progress.subsection("Prediction")
                prediction = predict(matrix_U=RQ, matrix_V=K.T, matrix_Valid=validation,
                                     bias=yBias, gpu=gpu_on)

                progress.subsection("Evaluation")
                result = evaluate(prediction, validation, params['metric'], gpu=gpu_on)

                result_dict = {'model': algorithm, 'confidence': conf, 'iter': params['iter']}
                for name in result.keys():
                    result_dict[name] = round(result[name][0], 8)
                df = df.append(result_dict, ignore_index=True)
                save_dataframe_csv(df, table_path, save_path)
def general(num_users, num_items, user_col, item_col, rating_col, keyphrase_vector_col,
            df_train, df_test, keyphrase_names, params, save_path):
    progress = WorkSplitter()
    table_path = load_yaml('config/global.yml', key='path')['tables']
    df = find_best_hyperparameters(table_path + params['tuning_result_path'], 'NDCG')

    try:
        output_df = load_dataframe_csv(table_path, save_path)
    except:
        output_df = pd.DataFrame(columns=['model', 'rank', 'num_layers', 'train_batch_size',
                                          'predict_batch_size', 'lambda', 'topK', 'learning_rate',
                                          'epoch', 'negative_sampling_size'])

    for index, row in df.iterrows():
        algorithm = row['model']
        rank = row['rank']
        num_layers = row['num_layers']
        train_batch_size = row['train_batch_size']
        predict_batch_size = row['predict_batch_size']
        lamb = row['lambda']
        learning_rate = row['learning_rate']
        epoch = 300
        negative_sampling_size = row['negative_sampling_size']
        row['topK'] = [5, 10, 15, 20, 50]
        row['metric'] = ['R-Precision', 'NDCG', 'Clicks', 'Recall', 'Precision', 'MAP']

        format = ("model: {0}, rank: {1}, num_layers: {2}, train_batch_size: {3}, "
                  "predict_batch_size: {4}, lambda: {5}, learning_rate: {6}, epoch: {7}, "
                  "negative_sampling_size: {8}")
        progress.section(format.format(algorithm, rank, num_layers, train_batch_size,
                                       predict_batch_size, lamb, learning_rate, epoch,
                                       negative_sampling_size))

        progress.subsection("Initializing Negative Sampler")
        negative_sampler = Negative_Sampler(df_train[[user_col, item_col, keyphrase_vector_col]],
                                            user_col, item_col, rating_col, keyphrase_vector_col,
                                            num_items=num_items,
                                            batch_size=train_batch_size,
                                            num_keyphrases=len(keyphrase_names),
                                            negative_sampling_size=negative_sampling_size)

        model = models[algorithm](num_users=num_users,
                                  num_items=num_items,
                                  text_dim=len(keyphrase_names),
                                  embed_dim=rank,
                                  num_layers=num_layers,
                                  negative_sampler=negative_sampler,
                                  lamb=lamb,
                                  learning_rate=learning_rate)

        progress.subsection("Training")
        pretrained_path = load_yaml('config/global.yml', key='path')['pretrained']
        # Loading/saving pretrained weights is disabled; the model is always retrained:
        # try:
        #     model.load_model(pretrained_path + params['tuning_result_path'], row['model'])
        # except:
        model.train_model(df_train, user_col, item_col, rating_col, epoch=epoch)
        #     model.save_model(pretrained_path + params['tuning_result_path'], row['model'])

        progress.subsection("Prediction")
        prediction, explanation = predict_elementwise(model, df_train, user_col, item_col,
                                                      row['topK'][-1],
                                                      batch_size=row['predict_batch_size'],
                                                      enable_explanation=False,
                                                      keyphrase_names=keyphrase_names)

        R_test = to_sparse_matrix(df_test, num_users, num_items, user_col, item_col, rating_col)

        result = evaluate(prediction, R_test, row['metric'], row['topK'])

        # NOTE: not finished yet
        result_dict = {'model': row['model'],
                       'rank': row['rank'],
                       'num_layers': row['num_layers'],
                       'train_batch_size': row['train_batch_size'],
                       'predict_batch_size': row['predict_batch_size'],
                       'lambda': row['lambda'],
                       'topK': row['topK'][-1],
                       'learning_rate': row['learning_rate'],
                       'epoch': epoch,
                       'negative_sampling_size': row['negative_sampling_size']}

        for name in result.keys():
            result_dict[name] = round(result[name][0], 4)

        output_df = output_df.append(result_dict, ignore_index=True)

        model.sess.close()
        tf.reset_default_graph()

        save_dataframe_csv(output_df, table_path, save_path)

    return output_df
def hyper_parameter_tuning(train, validation, params, save_path, gpu_on=True):
    progress = WorkSplitter()
    table_path = load_yaml('config/global.yml', key='path')['tables']

    try:
        df = load_dataframe_csv(table_path, save_path)
    except:
        df = pd.DataFrame(columns=['model', 'rank', 'lambda', 'epoch', 'corruption', 'topK'])

    num_user = train.shape[0]

    for algorithm in params['models']:
        for rank in params['rank']:
            for lamb in params['lambda']:
                for corruption in params['corruption']:
                    if ((df['model'] == algorithm) &
                            (df['rank'] == rank) &
                            (df['lambda'] == lamb) &
                            (df['corruption'] == corruption)).any():
                        continue

                    format = "model: {}, rank: {}, lambda: {}, corruption: {}"
                    progress.section(format.format(algorithm, rank, lamb, corruption))

                    RQ, Yt, Bias = params['models'][algorithm](train,
                                                               epoch=params['epoch'],
                                                               lamb=lamb,
                                                               rank=rank,
                                                               corruption=corruption)
                    Y = Yt.T

                    progress.subsection("Prediction")
                    prediction = predict(matrix_U=RQ,
                                         matrix_V=Y,
                                         bias=Bias,
                                         topK=params['topK'][-1],
                                         matrix_Train=train,
                                         gpu=gpu_on)

                    progress.subsection("Evaluation")
                    result = evaluate(prediction, validation, params['metric'], params['topK'])

                    result_dict = {'model': algorithm,
                                   'rank': rank,
                                   'lambda': lamb,
                                   'epoch': params['epoch'],
                                   'corruption': corruption}

                    for name in result.keys():
                        result_dict[name] = [round(result[name][0], 4),
                                             round(result[name][1], 4)]

                    df = df.append(result_dict, ignore_index=True)
                    save_dataframe_csv(df, table_path, save_path)
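# The bare `except:` used above (and in the other tuning functions) to fall
# back to an empty results frame also swallows KeyboardInterrupt and genuine
# bugs. A narrower sketch, assuming the repo's load_dataframe_csv raises an
# IOError/FileNotFoundError when the CSV does not exist yet:
import pandas as pd

def load_or_init(table_path, save_path, columns):
    """Load an existing results CSV, or start an empty one."""
    try:
        return load_dataframe_csv(table_path, save_path)
    except (IOError, FileNotFoundError):
        return pd.DataFrame(columns=columns)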
def critiquing(num_users, num_items, user_col, item_col, rating_col, keyphrase_vector_col,
               df_train, keyphrase_names, params, num_users_sampled, load_path, save_path):
    progress = WorkSplitter()
    table_path = load_yaml('config/global.yml', key='path')['tables']
    df = pd.read_csv(table_path + load_path)

    dfs_fmap = []

    for index, row in df.iterrows():
        if row['model'] not in critiquing_models:
            continue

        algorithm = row['model']
        rank = row['rank']
        num_layers = row['num_layers']
        train_batch_size = row['train_batch_size']
        predict_batch_size = row['predict_batch_size']
        lamb = row['lambda']
        learning_rate = row['learning_rate']
        epoch = 200
        negative_sampling_size = 1

        format = ("model: {0}, rank: {1}, num_layers: {2}, train_batch_size: {3}, "
                  "predict_batch_size: {4}, lambda: {5}, learning_rate: {6}, epoch: {7}, "
                  "negative_sampling_size: {8}")
        progress.section(format.format(algorithm, rank, num_layers, train_batch_size,
                                       predict_batch_size, lamb, learning_rate, epoch,
                                       negative_sampling_size))

        progress.subsection("Initializing Negative Sampler")
        negative_sampler = Negative_Sampler(df_train[[user_col, item_col, keyphrase_vector_col]],
                                            user_col, item_col, rating_col, keyphrase_vector_col,
                                            num_items=num_items,
                                            batch_size=train_batch_size,
                                            num_keyphrases=len(keyphrase_names),
                                            negative_sampling_size=negative_sampling_size)

        model = critiquing_models[algorithm](num_users=num_users,
                                             num_items=num_items,
                                             text_dim=len(keyphrase_names),
                                             embed_dim=rank,
                                             num_layers=num_layers,
                                             negative_sampler=negative_sampler,
                                             lamb=lamb,
                                             learning_rate=learning_rate)

        pretrained_path = load_yaml('config/global.yml', key='path')['pretrained']
        # Reuse pretrained weights when available; otherwise train and save them
        try:
            model.load_model(pretrained_path + params['model_saved_path'], row['model'])
        except:
            model.train_model(df_train, user_col, item_col, rating_col, epoch=epoch)
            model.save_model(pretrained_path + params['model_saved_path'], row['model'])

        df_fmap = critiquing_evaluation(model, algorithm, num_users, num_items,
                                        num_users_sampled, topk=[5, 10, 20])

        dfs_fmap.append(df_fmap)

        model.sess.close()
        tf.reset_default_graph()

    df_output_fmap = pd.concat(dfs_fmap)
    save_dataframe_csv(df_output_fmap, table_path, name=save_path + '_FMAP.csv')
def main(args):
    # Progress bar
    progress = WorkSplitter()

    # Show hyperparameter settings
    progress.section("Parameter Setting")
    print("Data Directory: {}".format(args.data_dir))
    print("Number of Users Sampled: {}".format(args.num_users_sampled))
    print("Number of Items Sampled: {}".format(args.num_items_sampled))
    print("Number of Max Allowed Iterations: {}".format(args.max_iteration_threshold))
    print("Critiquing Model: {}".format(args.critiquing_model_name))

    R_train = load_numpy(path=args.data_dir, name=args.train_set)
    print("Train U-I Dimensions: {}".format(R_train.shape))

    R_test = load_numpy(path=args.data_dir, name=args.test_set)
    print("Test U-I Dimensions: {}".format(R_test.shape))

    R_train_keyphrase = load_numpy(path=args.data_dir, name=args.train_keyphrase_set).toarray()
    print("Train User Keyphrase U-I Dimensions: {}".format(R_train_keyphrase.shape))

    R_train_item_keyphrase = load_numpy(path=args.data_dir, name=args.train_item_keyphrase_set).toarray()

    table_path = load_yaml('config/global.yml', key='path')['tables']
    parameters = find_best_hyperparameters(table_path + args.dataset_name, 'NDCG')
    parameters_row = parameters.loc[parameters['model'] == args.model]

    lambs = [0.001, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 30, 50, 70, 90, 100,
             200, 500, 1000, 10000, 100000]
    topks = [10, 20, 50, 100]

    if args.dataset_name == "yelp/":
        R_train_item_keyphrase = R_train_item_keyphrase.T

    for topk in topks:
        for lamb in lambs:
            results = critiquing(matrix_Train=R_train,
                                 matrix_Test=R_test,
                                 keyphrase_freq=R_train_keyphrase,
                                 item_keyphrase_freq=R_train_item_keyphrase,
                                 num_users_sampled=args.num_users_sampled,
                                 num_items_sampled=args.num_items_sampled,
                                 max_iteration_threshold=args.max_iteration_threshold,
                                 dataset_name=args.dataset_name,
                                 model=models[args.model],
                                 parameters_row=parameters_row,
                                 critiquing_model_name=args.critiquing_model_name,
                                 keyphrase_selection_method=args.keyphrase_selection_method,
                                 topk=topk,
                                 lamb=lamb)

            topk_path = "topk_" + str(topk) + "/"
            save_name = (args.save_path + topk_path + "tuning_at_lamb_" + str(lamb) +
                         "_with_" + args.keyphrase_selection_method + ".csv")
            save_dataframe_csv(results, table_path, save_name)
def hyper_parameter_tuning(train, validation, keyphrase_train, keyphrase_validation,
                           params, save_path, tune_explanation=False):
    progress = WorkSplitter()
    table_path = load_yaml('config/global.yml', key='path')['tables']

    try:
        df = load_dataframe_csv(table_path, save_path)
    except:
        df = pd.DataFrame(columns=['model', 'rank', 'beta', 'lambda_l2', 'lambda_keyphrase',
                                   'lambda_latent', 'lambda_rating', 'topK', 'learning_rate',
                                   'epoch', 'corruption', 'optimizer'])

    for algorithm in params['models']:
        for rank in params['rank']:
            for beta in params['beta']:
                for lamb_l2 in params['lambda_l2']:
                    for lamb_keyphrase in params['lambda_keyphrase']:
                        for lamb_latent in params['lambda_latent']:
                            for lamb_rating in params['lambda_rating']:
                                for learning_rate in params['learning_rate']:
                                    for epoch in params['epoch']:
                                        for corruption in params['corruption']:
                                            for optimizer in params['optimizer']:
                                                # Skip settings already evaluated, and tie the
                                                # latent and keyphrase penalties together
                                                if ((df['model'] == algorithm) &
                                                        (df['rank'] == rank) &
                                                        (df['beta'] == beta) &
                                                        (df['lambda_l2'] == lamb_l2) &
                                                        (df['lambda_keyphrase'] == lamb_keyphrase) &
                                                        (df['lambda_latent'] == lamb_latent) &
                                                        (df['lambda_rating'] == lamb_rating) &
                                                        (df['learning_rate'] == learning_rate) &
                                                        (df['epoch'] == epoch) &
                                                        (df['corruption'] == corruption) &
                                                        (df['optimizer'] == optimizer)).any() \
                                                        or (lamb_latent != lamb_keyphrase):
                                                    continue

                                                format = ("model: {}, rank: {}, beta: {}, lambda_l2: {}, "
                                                          "lambda_keyphrase: {}, lambda_latent: {}, "
                                                          "lambda_rating: {}, learning_rate: {}, "
                                                          "epoch: {}, corruption: {}, optimizer: {}")
                                                progress.section(format.format(
                                                    algorithm, rank, beta, lamb_l2, lamb_keyphrase,
                                                    lamb_latent, lamb_rating, learning_rate, epoch,
                                                    corruption, optimizer))

                                                progress.subsection("Training")
                                                model = models[algorithm](matrix_train=train,
                                                                          epoch=epoch,
                                                                          lamb_l2=lamb_l2,
                                                                          lamb_keyphrase=lamb_keyphrase,
                                                                          lamb_latent=lamb_latent,
                                                                          lamb_rating=lamb_rating,
                                                                          beta=beta,
                                                                          learning_rate=learning_rate,
                                                                          rank=rank,
                                                                          corruption=corruption,
                                                                          optimizer=optimizer,
                                                                          matrix_train_keyphrase=keyphrase_train)

                                                progress.subsection("Prediction")
                                                rating_score, keyphrase_score = model.predict(train.todense())

                                                progress.subsection("Evaluation")
                                                if tune_explanation:
                                                    prediction = predict_keyphrase(keyphrase_score,
                                                                                   topK=params['topK'][-1])
                                                    result = evaluate(prediction, keyphrase_validation,
                                                                      params['metric'], params['topK'])
                                                else:
                                                    prediction = predict(rating_score,
                                                                         topK=params['topK'][-1],
                                                                         matrix_Train=train)
                                                    result = evaluate(prediction, validation,
                                                                      params['metric'], params['topK'])

                                                result_dict = {'model': algorithm,
                                                               'rank': rank,
                                                               'beta': beta,
                                                               'lambda_l2': lamb_l2,
                                                               'lambda_keyphrase': lamb_keyphrase,
                                                               'lambda_latent': lamb_latent,
                                                               'lambda_rating': lamb_rating,
                                                               'learning_rate': learning_rate,
                                                               'epoch': epoch,
                                                               'corruption': corruption,
                                                               'optimizer': optimizer}

                                                for name in result.keys():
                                                    result_dict[name] = [round(result[name][0], 4),
                                                                         round(result[name][1], 4)]

                                                df = df.append(result_dict, ignore_index=True)

                                                model.sess.close()
                                                tf.reset_default_graph()

                                                save_dataframe_csv(df, table_path, save_path)
def general(train, test, keyphrase_train, keyphrase_test, params, save_path, final_explanation=False):
    progress = WorkSplitter()
    table_path = load_yaml('config/global.yml', key='path')['tables']
    df = find_best_hyperparameters(table_path + params['tuning_result_path'], 'NDCG')

    try:
        output_df = load_dataframe_csv(table_path, save_path)
    except:
        output_df = pd.DataFrame(columns=['model', 'rank', 'beta', 'lambda_l2', 'lambda_keyphrase',
                                          'lambda_latent', 'lambda_rating', 'topK', 'learning_rate',
                                          'epoch', 'corruption', 'optimizer'])

    for index, row in df.iterrows():
        algorithm = row['model']
        rank = row['rank']
        beta = row['beta']
        lamb_l2 = row['lambda_l2']
        lamb_keyphrase = row['lambda_keyphrase']
        lamb_latent = row['lambda_latent']
        lamb_rating = row['lambda_rating']
        learning_rate = row['learning_rate']
        epoch = row['epoch']
        corruption = row['corruption']
        optimizer = row['optimizer']
        row['topK'] = [5, 10, 15, 20, 50]
        row['metric'] = ['R-Precision', 'NDCG', 'Clicks', 'Recall', 'Precision', 'MAP']

        format = ("model: {}, rank: {}, beta: {}, lambda_l2: {}, lambda_keyphrase: {}, "
                  "lambda_latent: {}, lambda_rating: {}, learning_rate: {}, "
                  "epoch: {}, corruption: {}, optimizer: {}")
        progress.section(format.format(algorithm, rank, beta, lamb_l2, lamb_keyphrase,
                                       lamb_latent, lamb_rating, learning_rate, epoch,
                                       corruption, optimizer))

        progress.subsection("Training")
        model = models[algorithm](matrix_train=train,
                                  epoch=epoch,
                                  lamb_l2=lamb_l2,
                                  lamb_keyphrase=lamb_keyphrase,
                                  lamb_latent=lamb_latent,
                                  lamb_rating=lamb_rating,
                                  beta=beta,
                                  learning_rate=learning_rate,
                                  rank=rank,
                                  corruption=corruption,
                                  optimizer=optimizer,
                                  matrix_train_keyphrase=keyphrase_train)

        progress.subsection("Prediction")
        rating_score, keyphrase_score = model.predict(train.todense())

        progress.subsection("Evaluation")
        if final_explanation:
            prediction = predict_keyphrase(keyphrase_score, topK=row['topK'][-2])
            result = evaluate_explanation(prediction, keyphrase_test, row['metric'], row['topK'])
        else:
            prediction = predict(rating_score, topK=row['topK'][-1], matrix_Train=train)
            result = evaluate(prediction, test, row['metric'], row['topK'])

        result_dict = {'model': algorithm,
                       'rank': rank,
                       'beta': beta,
                       'lambda_l2': lamb_l2,
                       'lambda_keyphrase': lamb_keyphrase,
                       'lambda_latent': lamb_latent,
                       'lambda_rating': lamb_rating,
                       'learning_rate': learning_rate,
                       'epoch': epoch,
                       'corruption': corruption,
                       'optimizer': optimizer}

        for name in result.keys():
            result_dict[name] = [round(result[name][0], 4),
                                 round(result[name][1], 4)]

        output_df = output_df.append(result_dict, ignore_index=True)

        model.sess.close()
        tf.reset_default_graph()

        save_dataframe_csv(output_df, table_path, save_path)

    return output_df
def critiquing(train_set, keyphrase_train_set, item_keyphrase_train_set, params,
               num_users_sampled, load_path, save_path, critiquing_function):
    progress = WorkSplitter()
    table_path = load_yaml('config/global.yml', key='path')['tables']
    df = pd.read_csv(table_path + load_path)

    dfs_fmap = []

    for index, row in df.iterrows():
        if row['model'] not in critiquing_models:
            continue

        algorithm = row['model']
        rank = row['rank']
        beta = row['beta']
        lamb_l2 = row['lambda_l2']
        lamb_keyphrase = row['lambda_keyphrase']
        lamb_latent = row['lambda_latent']
        lamb_rating = row['lambda_rating']
        learning_rate = row['learning_rate']
        epoch = row['epoch']
        corruption = row['corruption']
        optimizer = row['optimizer']

        format = ("model: {}, rank: {}, beta: {}, lambda_l2: {}, lambda_keyphrase: {}, "
                  "lambda_latent: {}, lambda_rating: {}, learning_rate: {}, "
                  "epoch: {}, corruption: {}, optimizer: {}")
        progress.section(format.format(algorithm, rank, beta, lamb_l2, lamb_keyphrase,
                                       lamb_latent, lamb_rating, learning_rate, epoch,
                                       corruption, optimizer))

        progress.subsection("Training")
        model = critiquing_models[algorithm](matrix_train=train_set,
                                             epoch=epoch,
                                             lamb_l2=lamb_l2,
                                             lamb_keyphrase=lamb_keyphrase,
                                             lamb_latent=lamb_latent,
                                             lamb_rating=lamb_rating,
                                             beta=beta,
                                             learning_rate=learning_rate,
                                             rank=rank,
                                             corruption=corruption,
                                             optimizer=optimizer,
                                             matrix_train_keyphrase=keyphrase_train_set)

        num_users, num_items = train_set.shape

        df_fmap = critiquing_evaluation(train_set, keyphrase_train_set, item_keyphrase_train_set,
                                        model, algorithm, num_users, num_items,
                                        num_users_sampled, critiquing_function,
                                        topk=[5, 10, 20])

        df_fmap['model'] = algorithm
        df_fmap['rank'] = rank
        df_fmap['beta'] = beta
        df_fmap['lambda_l2'] = lamb_l2
        df_fmap['lambda_keyphrase'] = lamb_keyphrase
        df_fmap['lambda_latent'] = lamb_latent
        df_fmap['lambda_rating'] = lamb_rating
        df_fmap['learning_rate'] = learning_rate
        df_fmap['epoch'] = epoch
        df_fmap['corruption'] = corruption
        df_fmap['optimizer'] = optimizer

        dfs_fmap.append(df_fmap)

        model.sess.close()
        tf.reset_default_graph()

    df_output_fmap = pd.concat(dfs_fmap)
    save_dataframe_csv(df_output_fmap, table_path, name=save_path + '_FMAP.csv')
def hyper_parameter_tuning(train, validation, params, unif_train, save_path, seed, way, dataset, gpu_on):
    progress = WorkSplitter()
    table_path = 'tables/'

    data_name = save_path.split('/')[0]
    save_dir = 'tables/' + data_name + '/'
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    for algorithm in params['models']:
        if algorithm in ['BiasedMF', 'PropensityMF']:
            df = pd.DataFrame(columns=['model', 'batch_size', 'lambda', 'iter'])
            for batch_size in params['batch_size']:
                for lam in params['lambda']:
                    format = "model: {0}, batch_size: {1}, lambda: {2}"
                    progress.section(format.format(algorithm, batch_size, lam))

                    RQ, Y, uBias, iBias = params['models'][algorithm](
                        train, validation,
                        matrix_unif_train=unif_train,
                        iteration=params['iter'],
                        rank=params['rank'], gpu_on=gpu_on, lam=lam, seed=seed,
                        batch_size=batch_size, way=way, dataset=dataset)

                    progress.subsection("Prediction")
                    prediction = predict(matrix_U=RQ, matrix_V=Y, matrix_Valid=validation,
                                         ubias=uBias, ibias=iBias, gpu=gpu_on)

                    progress.subsection("Evaluation")
                    result = evaluate(prediction, validation, params['metric'], gpu=gpu_on)

                    result_dict = {'model': algorithm, 'batch_size': batch_size,
                                   'lambda': lam, 'iter': params['iter']}
                    for name in result.keys():
                        result_dict[name] = round(result[name][0], 8)
                    df = df.append(result_dict, ignore_index=True)
                    save_dataframe_csv(df, table_path, save_path)

        elif algorithm in ['InitFeatureEmbedMF', 'AlterFeatureEmbedMF', 'WRSampleMF']:
            df = pd.DataFrame(columns=['model', 'lambda', 'iter'])
            for lam in params['lambda']:
                format = "model: {0}, lambda: {1}"
                progress.section(format.format(algorithm, lam))

                RQ, Y, uBias, iBias = params['models'][algorithm](
                    train, validation,
                    matrix_unif_train=unif_train,
                    iteration=params['iter'],
                    rank=params['rank'], gpu_on=gpu_on, lam=lam, seed=seed,
                    batch_size=params['batch_size'], way=way, dataset=dataset)

                progress.subsection("Prediction")
                prediction = predict(matrix_U=RQ, matrix_V=Y, matrix_Valid=validation,
                                     ubias=uBias, ibias=iBias, gpu=gpu_on)

                progress.subsection("Evaluation")
                result = evaluate(prediction, validation, params['metric'], gpu=gpu_on)

                result_dict = {'model': algorithm, 'lambda': lam, 'iter': params['iter']}
                for name in result.keys():
                    result_dict[name] = round(result[name][0], 8)
                df = df.append(result_dict, ignore_index=True)
                save_dataframe_csv(df, table_path, save_path)

        elif algorithm in ['CausalSampleMF', 'BridgeLabelMF']:
            df = pd.DataFrame(columns=['model', 'lambda', 'lambda2', 'iter'])
            for lam in params['lambda']:
                for lam2 in params['lambda2']:
                    format = "model: {0}, lambda: {1}, lambda2: {2}"
                    progress.section(format.format(algorithm, lam, lam2))

                    RQ, Y, uBias, iBias = params['models'][algorithm](
                        train, validation,
                        matrix_unif_train=unif_train,
                        iteration=params['iter'],
                        rank=params['rank'], gpu_on=gpu_on, lam=lam, lam2=lam2, seed=seed,
                        batch_size=params['batch_size'], way=way, dataset=dataset)

                    progress.subsection("Prediction")
                    prediction = predict(matrix_U=RQ, matrix_V=Y, matrix_Valid=validation,
                                         ubias=uBias, ibias=iBias, gpu=gpu_on)

                    progress.subsection("Evaluation")
                    result = evaluate(prediction, validation, params['metric'], gpu=gpu_on)

                    result_dict = {'model': algorithm, 'lambda': lam, 'lambda2': lam2,
                                   'iter': params['iter']}
                    for name in result.keys():
                        result_dict[name] = round(result[name][0], 8)
                    df = df.append(result_dict, ignore_index=True)
                    save_dataframe_csv(df, table_path, save_path)

        elif algorithm in ['UnionSampleMF', 'RefineLabelMF']:
            df = pd.DataFrame(columns=['model', 'confidence', 'iter'])
            for conf in params['confidence']:
                format = "model: {0}, confidence: {1}"
                progress.section(format.format(algorithm, conf))

                RQ, Y, uBias, iBias = params['models'][algorithm](
                    train, validation,
                    matrix_unif_train=unif_train,
                    iteration=params['iter'],
                    rank=params['rank'], gpu_on=gpu_on, lam=params['lambda'], seed=seed,
                    batch_size=params['batch_size'], way=way, confidence=conf, dataset=dataset)

                progress.subsection("Prediction")
                prediction = predict(matrix_U=RQ, matrix_V=Y, matrix_Valid=validation,
                                     ubias=uBias, ibias=iBias, gpu=gpu_on)

                progress.subsection("Evaluation")
                result = evaluate(prediction, validation, params['metric'], gpu=gpu_on)

                result_dict = {'model': algorithm, 'confidence': conf, 'iter': params['iter']}
                for name in result.keys():
                    result_dict[name] = round(result[name][0], 8)
                df = df.append(result_dict, ignore_index=True)
                save_dataframe_csv(df, table_path, save_path)

        elif algorithm in ['BatchSampleMF']:
            df = pd.DataFrame(columns=['model', 'step', 'iter'])
            for step in params['step']:
                format = "model: {0}, step: {1}"
                progress.section(format.format(algorithm, step))

                RQ, Y, uBias, iBias = params['models'][algorithm](
                    train, validation,
                    matrix_unif_train=unif_train,
                    iteration=params['iter'],
                    rank=params['rank'], gpu_on=gpu_on, lam=params['lambda'], seed=seed,
                    batch_size=params['batch_size'], way=way, step=step, dataset=dataset)

                progress.subsection("Prediction")
                prediction = predict(matrix_U=RQ, matrix_V=Y, matrix_Valid=validation,
                                     ubias=uBias, ibias=iBias, gpu=gpu_on)

                progress.subsection("Evaluation")
                result = evaluate(prediction, validation, params['metric'], gpu=gpu_on)

                result_dict = {'model': algorithm, 'step': step, 'iter': params['iter']}
                for name in result.keys():
                    result_dict[name] = round(result[name][0], 8)
                df = df.append(result_dict, ignore_index=True)
                save_dataframe_csv(df, table_path, save_path)
def multiple_run_tune(default_params, tune_params, save_path):
    # Set up data stream
    start = time.time()
    print('Setting up data stream')
    data_continuum = continuum(default_params.data, default_params.cl_type, default_params)
    data_end = time.time()
    print('data setup time: {}'.format(data_end - start))

    # Set up storing table
    table_path = load_yaml('config/global.yml', key='path')['tables']
    metric_list = ['Avg_End_Acc'] + ['Avg_End_Fgt'] + ['Time'] + \
        ["Batch" + str(i) for i in range(default_params.num_val, data_continuum.task_nums)]
    param_list = list(tune_params.keys()) + metric_list
    table_columns = ['Run'] + param_list

    table_path = table_path + default_params.data
    os.makedirs(table_path, exist_ok=True)
    if not save_path:
        save_path = default_params.model_name + '_' + default_params.data_name + '.csv'
    df = pd.DataFrame(columns=table_columns)

    # Store lists
    accuracy_list = []
    params_keep = []
    for run in range(default_params.num_runs):
        tmp_acc = []
        tune_data = []
        run_start = time.time()
        data_continuum.new_run()

        # Prepare validation data loaders
        test_loaders = setup_test_loader(data_continuum.test_data(), default_params)
        tune_test_loaders = test_loaders[:default_params.num_val]
        test_loaders = test_loaders[default_params.num_val:]

        for i, (x_train, y_train, labels) in enumerate(data_continuum):
            if i < default_params.num_val:
                # Collect tuning data
                tune_data.append((x_train, y_train, labels))
                if len(tune_data) == default_params.num_val:
                    # Tune
                    best_params = tune_hyper(tune_data, tune_test_loaders,
                                             default_params, tune_params)
                    params_keep.append(best_params)
                    final_params = vars(default_params)
                    final_params.update(best_params)
                    final_params = SimpleNamespace(**final_params)
                    # Set up
                    print('Tuning is done. Best hyperparameter set is {}'.format(best_params))
                    model = setup_architecture(final_params)
                    model = maybe_cuda(model, final_params.cuda)
                    opt = setup_opt(final_params.optimizer, model,
                                    final_params.learning_rate, final_params.weight_decay)
                    agent = agents[final_params.agent](model, opt, final_params)
                    print('Training Start')
            else:
                print("----------run {} training batch {}-------------".format(run, i))
                print('size: {}, {}'.format(x_train.shape, y_train.shape))
                agent.train_learner(x_train, y_train)
                acc_array = agent.evaluate(test_loaders)
                tmp_acc.append(acc_array)

        run_end = time.time()
        print("-----------run {}-----------avg_end_acc {}-----------train time {}".format(
            run, np.mean(tmp_acc[-1]), run_end - run_start))
        accuracy_list.append(np.array(tmp_acc))

        # Store result
        result_dict = {'Run': run}
        result_dict.update(best_params)
        end_task_acc = tmp_acc[-1]
        for i in range(data_continuum.task_nums - default_params.num_val):
            result_dict["Batch" + str(i + default_params.num_val)] = end_task_acc[i]
        result_dict['Avg_End_Acc'] = np.mean(tmp_acc[-1])
        result_dict['Avg_End_Fgt'] = single_run_avg_end_fgt(np.array(tmp_acc))
        result_dict['Time'] = run_end - run_start
        df = df.append(result_dict, ignore_index=True)
        save_dataframe_csv(df, table_path, save_path)

    accuracy_list = np.array(accuracy_list)
    avg_end_acc, avg_end_fgt, avg_acc, avg_bwtp, avg_fwt = compute_performance(accuracy_list)
    end = time.time()

    final_result = {'Run': 'Final Result'}
    final_result['Avg_End_Acc'] = avg_end_acc
    final_result['Avg_End_Fgt'] = avg_end_fgt
    final_result['Time'] = end - start
    df = df.append(final_result, ignore_index=True)
    save_dataframe_csv(df, table_path, save_path)

    print('----------- Total {} run: {}s -----------'.format(default_params.num_runs, end - start))
    print('----------- Avg_End_Acc {} Avg_End_Fgt {} Avg_Acc {} Avg_Bwtp {} Avg_Fwt {}-----------'
          .format(avg_end_acc, avg_end_fgt, avg_acc, avg_bwtp, avg_fwt))
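# NOTE: the drivers above accumulate results via
# df.append(result_dict, ignore_index=True), which pandas deprecated in 1.4
# and removed in 2.0, so the code as written assumes pandas < 2.0. A drop-in
# sketch for newer pandas; `append_row` is a hypothetical helper:
import pandas as pd

def append_row(df, row_dict):
    """Equivalent of df.append(row_dict, ignore_index=True) on pandas >= 2.0."""
    return pd.concat([df, pd.DataFrame([row_dict])], ignore_index=True)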
def main(args):
    progress = WorkSplitter()

    progress.section("Parameter Setting")
    print("Data Path: {}".format(args.data_dir))
    reviewJsonToronto = args.data_dir + args.data_name

    progress.section("Load data")
    df = get_yelp_df(path='', filename=reviewJsonToronto, sampling=True)
    print('Data loaded successfully')

    progress.section("Matrix Generation")
    rating_matrix, timestamp_matrix, I_C_matrix, IC_dictionary = get_rating_timestamp_matrix(df)

    # Build the rating-minus-user-average matrix
    rating_array = rating_matrix.toarray()
    user_average_array = rating_array.sum(axis=1) / np.count_nonzero(rating_array, axis=1)

    init_UI = np.zeros(rating_array.shape)
    init_UI[rating_array.nonzero()] = 1

    # Spread each user's average over that user's rated entries
    for i in range(user_average_array.shape[0]):
        init_UI[i] = init_UI[i] * (user_average_array[i] - 0.001)

    user_average_array = init_UI
    ratingWuserAvg_array = rating_array - user_average_array
    ratingWuserAvg_matrix = sparse.csr_matrix(ratingWuserAvg_array)

    progress.section("Split for training")
    rtrain_implicit, rvalid_implicit, rtest_implicit, rtrain_userAvg_implicit, \
        rvalid_userAvg_implicit, rtest_userAvg_implicit, nonzero_index, rtime, \
        item_idx_matrix_train_implicit, item_idx_matrix_valid_implicit, \
        item_idx_matrix_test_implicit = time_ordered_splitModified(
            rating_matrix=rating_matrix,
            ratingWuserAvg_matrix=ratingWuserAvg_matrix,
            timestamp_matrix=timestamp_matrix,
            ratio=[0.5, 0.2, 0.3],
            implicit=True,
            remove_empty=False,
            threshold=3,
            sampling=False,
            sampling_ratio=0.1,
            trainSampling=0.95)

    rtrain, rvalid, rtest, rtrain_userAvg, rvalid_userAvg, rtest_userAvg, nonzero_index, \
        rtime, item_idx_matrix_train, item_idx_matrix_valid, item_idx_matrix_test = \
        time_ordered_splitModified(
            rating_matrix=rating_matrix,
            ratingWuserAvg_matrix=ratingWuserAvg_matrix,
            timestamp_matrix=timestamp_matrix,
            ratio=[0.5, 0.2, 0.3],
            implicit=False,
            remove_empty=False,
            threshold=3,
            sampling=False,
            sampling_ratio=0.1,
            trainSampling=0.95)

    rtrain = rtrain + rvalid + rtest
    rtrain_implicit = rtrain_implicit + rvalid_implicit + rtest_implicit

    progress.section("Get UC Matrix")
    U_C_matrix_explicit, U_C_matrix_implicit = get_UC_Matrix(I_C_matrix, rtrain_implicit)

    progress.section("Get IK Similarity")
    IK_MATRIX = ikGeneration(df)
    IK_similarity = train(IK_MATRIX)

    # progress.section("Get IC Similarity")
    # IC_similarity = train(I_C_matrix)

    progress.section("Get IP, IS, ID Dictionary")
    intersection_yonge_and_finch, intersection_bloor_and_bathurst, intersection_spadina_and_dundas, \
        intersection_queen_and_spadina, intersection_bloor_and_yonge, \
        intersection_dundas_and_yonge = get_intersection()

    IP_df, IP_dictionary = get_IP_matrix_dictionary(df, IK_similarity)
    IS_dictionary = get_IS_dictionary(df)

    business_ids = list(set(df['business_num_id']))
    ID_dictionary_yonge_and_finch = get_ID_dictionary(df, business_ids, intersection_yonge_and_finch)
    ID_dictionary_bloor_and_bathurst = get_ID_dictionary(df, business_ids, intersection_bloor_and_bathurst)
    ID_dictionary_spadina_and_dundas = get_ID_dictionary(df, business_ids, intersection_spadina_and_dundas)
    ID_dictionary_queen_and_spadina = get_ID_dictionary(df, business_ids, intersection_queen_and_spadina)
    ID_dictionary_bloor_and_yonge = get_ID_dictionary(df, business_ids, intersection_bloor_and_yonge)
    ID_dictionary_dundas_and_yonge = get_ID_dictionary(df, business_ids, intersection_dundas_and_yonge)

    progress.section("User item predict")
    user_item_prediction_score = predict(rtrain, 110, IK_similarity, item_similarity_en=True)
    UI_Prediction_Matrix = prediction(user_item_prediction_score, rtrain)

    progress.section("Save datafiles csv")
    save_dataframe_csv(df, args.data_dir, "Dataframe")

    progress.section("Save datafiles JSON")
    saveDictToJson(IC_dictionary, args.data_dir, 'icDictionary', trainOrTest='train')
    saveDictToJson(IP_dictionary, args.data_dir, 'ipDictionary', trainOrTest='train')
    saveDictToJson(IS_dictionary, args.data_dir, 'isDictionary', trainOrTest='train')
    saveDictToJson(ID_dictionary_yonge_and_finch, args.data_dir, 'idDictionary_yongefinch', trainOrTest='train')
    saveDictToJson(ID_dictionary_bloor_and_bathurst, args.data_dir, 'idDictionary_bloorbathurst', trainOrTest='train')
    saveDictToJson(ID_dictionary_spadina_and_dundas, args.data_dir, 'idDictionary_spadinadundas', trainOrTest='train')
    saveDictToJson(ID_dictionary_queen_and_spadina, args.data_dir, 'idDictionary_queenspadina', trainOrTest='train')
    saveDictToJson(ID_dictionary_bloor_and_yonge, args.data_dir, 'idDictionary_blooryonge', trainOrTest='train')
    saveDictToJson(ID_dictionary_dundas_and_yonge, args.data_dir, 'idDictionary_dundasyonge', trainOrTest='train')

    progress.section("Save datafiles Numpy")
    save_numpy_csr(rtrain, args.data_dir, "rtrain")
    save_numpy_csr(I_C_matrix, args.data_dir, "icmatrix")
    # save_numpy(user_item_prediction_score, args.data_dir, "predictionScore")
    save_numpy(IK_similarity, args.data_dir, "IKbased_II_similarity")  # Tina requested this name
    save_numpy(UI_Prediction_Matrix, args.data_dir, "UI_prediction_matrix")