def wrsampleae(matrix_train, matrix_valid, matrix_unif_train, iteration=100,
               lam=0.01, rank=50, seed=0, batch_size=256, gpu_on=True, **unused):
    progress = WorkSplitter()

    progress.section("WRSampleAE: Set the random seed")
    np.random.seed(seed)
    tf.set_random_seed(seed)

    progress.section("WRSampleAE: Training")
    m, n = matrix_train.shape

    # Mark the entries observed in S_c, then merge in the uniform set S_t.
    marks = sparse.csr_matrix(matrix_train.shape)
    marks[(matrix_train != 0).nonzero()] = 1
    matrix_train += matrix_unif_train

    model = WRSampleAE(n, rank, m, lamb=lam, batch_size=batch_size, gpu_on=gpu_on)

    metric_names = ['NLL', 'AUC']

    RQ, X, xBias, Y, yBias = model.train_model(matrix_train, marks, matrix_valid,
                                               iteration, metric_names)

    model.sess.close()
    tf.reset_default_graph()

    return RQ, X, xBias, Y, yBias
def batchsampleae(matrix_train, matrix_valid, matrix_unif_train, iteration=100,
                  lam=0.01, rank=50, seed=0, batch_size=256, gpu_on=True, step=3,
                  **unused):
    progress = WorkSplitter()

    progress.section("BatchSampleAE: Set the random seed")
    np.random.seed(seed)
    tf.set_random_seed(seed)

    progress.section("BatchSampleAE: Training")
    m, n = matrix_train.shape

    model = BatchSampleAE(n, rank, lamb=lam, batch_size=batch_size, step=step,
                          gpu_on=gpu_on)

    metric_names = ['NLL', 'AUC']

    RQ, X, xBias, Y, yBias = model.train_model(matrix_train, matrix_unif_train,
                                               matrix_valid, iteration, metric_names)

    model.sess.close()
    tf.reset_default_graph()

    return RQ, X, xBias, Y, yBias
def sensitivity(train, validation, params):
    progress = WorkSplitter()

    progress.section("PMI-PLRec Default")
    RQ, Yt, _ = params['models']['NCE-PLRec'](train,
                                              embeded_matrix=np.empty((0)),
                                              iteration=params['iter'],
                                              rank=params['rank'],
                                              lam=params['lambda'],
                                              root=1.0)
    Y = Yt.T

    default_prediction = predict(matrix_U=RQ,
                                 matrix_V=Y,
                                 topK=params['topK'][-1],
                                 matrix_Train=train,
                                 gpu=True)

    default_result = evaluate(default_prediction, validation, params['metric'],
                              params['topK'])

    print("-")
    print("Rank: {0}".format(params['rank']))
    print("Lambda: {0}".format(params['lambda']))
    print("SVD Iteration: {0}".format(params['iter']))
    print("Evaluation Ranking Topk: {0}".format(params['topK']))
    for key in default_result.keys():
        print("{0} :{1}".format(key, default_result[key]))

    sensitivity_results = dict()

    for root in tqdm(params['root']):
        progress.section("PMI-PLRec, Root: " + str(root))
        RQ, Yt, _ = params['models']['NCE-PLRec'](train,
                                                  embeded_matrix=np.empty((0)),
                                                  iteration=params['iter'],
                                                  rank=params['rank'],
                                                  lam=params['lambda'],
                                                  root=root)
        Y = Yt.T

        prediction = predict(matrix_U=RQ,
                             matrix_V=Y,
                             topK=params['topK'][-1],
                             matrix_Train=train,
                             gpu=True)

        result = evaluate(prediction, validation, params['metric'], params['topK'])
        sensitivity_results[root] = result

        print("-")
        print("Root: {0}".format(root))
        print("Rank: {0}".format(params['rank']))
        print("Lambda: {0}".format(params['lambda']))
        print("SVD Iteration: {0}".format(params['iter']))
        print("Evaluation Ranking Topk: {0}".format(params['topK']))
        for key in result.keys():
            print("{0} :{1}".format(key, result[key]))

    return default_result, sensitivity_results
def main(args): # Progress bar progress = WorkSplitter() progress.section("Load Data") if args.emb_type == 'bert': emb_size = 768 elif args.emb_type == 'xlmr': emb_size = 1024 # Load Data start_time = time.time() print("WARNING: Embedding size is set to", emb_size) data = Data(args, args.path, args.train, args.valid,emb_size, is_lb=True) print("Elapsed: {0}".format(inhour(time.time() - start_time))) #build model progress.section("Build Model") if args.network_architecture == 'embedding_net': model = EmbeddingNet(data.n_token, data.n_feature, emb_size, [1024, 2000, 1000, 500, 100],corruption=args.corruption) elif args.network_architecture == 'embedding_highway_net': model = EmbeddingHighWayNet(data.n_token, data.n_feature, emb_size, [1024, 2000, 1000, 500, 100]) else: raise NotImplementedError('either use embedding_net or embedding_highway_net') model.cuda() print(model) model.load_state_dict(torch.load(args.checkpoint)) print(model) lb_loader = data.instance_a_lb_loader(args.batch) lbs = {'user_lb': list(), 'tweet_lb': list()} preds = [] model.eval() with torch.no_grad(): lb_iterator = tqdm(lb_loader, desc="lb") for _, batch in enumerate(lb_iterator): token, feature, tweet_lb, user_lb, embedding = batch[0].float().cuda(), batch[1].float().cuda(), batch[2], batch[3], batch[4].float().cuda()#,batch[4].cuda() pred = torch.sigmoid(model(token,feature,embedding)).detach().cpu().numpy() if "Valid" in args.valid: lbs['tweet_lb'] += tweet_lb else: lbs['tweet_lb'] += tweet_lb[0] lbs['user_lb'] += user_lb[0] preds.append(pred) final_csv = pd.DataFrame(lbs) preds = np.float64(np.vstack(preds)) if not os.path.exists(args.spath): os.makedirs(args.spath) print("Generating CSVs...") for i, engage in enumerate(["reply", "retweet", "comment", "like"]): final_csv[engage] = preds[:,i] final_csv[['tweet_lb','user_lb',engage]].to_csv(os.path.join(args.spath, engage+'.csv'),index=False, header=False)
def main(args): # Progress bar progress = WorkSplitter() # Show hyperparameter settings progress.section("Parameter Setting") print("Data Directory: {}".format(args.data_dir)) print("Number of Users Sampled: {}".format(args.num_users_sampled)) print("Number of Items Sampled: {}".format(args.num_items_sampled)) print("Number of Max Allowed Iterations: {}".format( args.max_iteration_threshold)) print("Critiquing Model: {}".format(args.critiquing_model_name)) R_train = load_numpy(path=args.data_dir, name=args.train_set) print("Train U-I Dimensions: {}".format(R_train.shape)) R_test = load_numpy(path=args.data_dir, name=args.test_set) print("Test U-I Dimensions: {}".format(R_test.shape)) R_train_keyphrase = load_numpy(path=args.data_dir, name=args.train_keyphrase_set).toarray() print("Train Item Keyphrase U-I Dimensions: {}".format( R_train_keyphrase.shape)) R_train_item_keyphrase = load_numpy( path=args.data_dir, name=args.train_item_keyphrase_set).toarray() table_path = load_yaml('config/global.yml', key='path')['tables'] parameters = find_best_hyperparameters(table_path + args.dataset_name, 'NDCG') parameters_row = parameters.loc[parameters['model'] == args.model] if args.dataset_name == "yelp/": R_train_item_keyphrase = R_train_item_keyphrase.T start_time = time.time() results = critiquing( matrix_Train=R_train, matrix_Test=R_test, keyphrase_freq=R_train_keyphrase, item_keyphrase_freq=R_train_item_keyphrase, num_users_sampled=args.num_users_sampled, num_items_sampled=args.num_items_sampled, max_iteration_threshold=args.max_iteration_threshold, dataset_name=args.dataset_name, model=models[args.model], parameters_row=parameters_row, critiquing_model_name=args.critiquing_model_name, keyphrase_selection_method=args.keyphrase_selection_method, topk=args.topk, lamb=args.lamb) print("Final Time Elapsed: {}".format(inhour(time.time() - start_time))) table_path = load_yaml('config/global.yml', key='path')['tables'] save_dataframe_csv(results, table_path, args.save_path)
def main(args): progress = WorkSplitter() progress.section("Tune Parameters") params = load_yaml(args.grid) params['models'] = {params['models']: models[params['models']]} train = load_numpy(path=args.path, name=args.dataset + args.train) unif_train = load_numpy(path=args.path, name=args.dataset + args.unif_train) valid = load_numpy(path=args.path, name=args.dataset + args.valid) hyper_parameter_tuning(train, valid, params, unif_train=unif_train, save_path=args.dataset + args.name, gpu_on=args.gpu, seed=args.seed, way=args.way, dataset=args.dataset)
def main(args):
    # Progress bar
    progress = WorkSplitter()

    # Show hyperparameter settings
    progress.section("Parameter Setting")
    print("Data Path: {}".format(args.path))
    print("Train File Name: {}".format(args.train))
    if args.validation:
        print("Valid File Name: {}".format(args.valid))
    print("Algorithm: {}".format(args.model))
    print("Lambda Diversity: {}".format(args.lambda_diversity))
    print("Lambda Serendipity: {}".format(args.lambda_serendipity))
    print("Nearest Neighbor Number: {}".format(args.k))
    print("Evaluation Ranking Topk: {}".format(args.topk))

    # Load Data
    progress.section("Loading Data")
    start_time = time.time()
    R_train = load_numpy(path=args.path, name=args.train)
    print("Elapsed: {}".format(inhour(time.time() - start_time)))
    print("Train U-I Dimensions: {}".format(R_train.shape))

    progress.section("Train")
    model = models[args.model]()
    model.train(R_train)

    progress.section("Predict")
    prediction_score = model.predict(R_train,
                                     k=args.k,
                                     lambda_diversity=args.lambda_diversity,
                                     lambda_serendipity=args.lambda_serendipity)

    prediction = predict(prediction_score=prediction_score,
                         topK=args.topk,
                         matrix_Train=R_train)

    if args.validation:
        progress.section("Create Metrics")
        start_time = time.time()

        metric_names = ['R-Precision', 'NDCG', 'Clicks', 'Recall', 'Precision', 'MAP']

        R_valid = load_numpy(path=args.path, name=args.valid)
        result = evaluate(prediction, R_valid, metric_names, [args.topk])

        print("-")
        for metric in result.keys():
            print("{}:{}".format(metric, result[metric]))
        print("Elapsed: {}".format(inhour(time.time() - start_time)))
def main(args): # Progress bar progress = WorkSplitter() # Show hyperparameter settings progress.section("Parameter Setting") print("Data Directory: {}".format(args.data_dir)) print("Number of Users Sampled: {}".format(args.num_users_sampled)) print("Number of Items Sampled: {}".format(args.num_items_sampled)) print("Number of Max Allowed Iterations: {}".format(args.max_iteration_threshold)) print("Critiquing Model: {}".format(args.critiquing_model_name)) R_train = load_numpy(path=args.data_dir, name=args.train_set) print("Train U-I Dimensions: {}".format(R_train.shape)) R_test = load_numpy(path=args.data_dir, name=args.test_set) print("Test U-I Dimensions: {}".format(R_test.shape)) R_train_keyphrase = load_numpy(path=args.data_dir, name=args.train_keyphrase_set).toarray() print("Train Item Keyphrase U-I Dimensions: {}".format(R_train_keyphrase.shape)) R_train_item_keyphrase = load_numpy(path=args.data_dir, name=args.train_item_keyphrase_set).toarray() table_path = load_yaml('config/global.yml', key='path')['tables'] # parameters = find_best_hyperparameters(table_path+args.dataset_name, 'NDCG') # parameters_row = parameters.loc[parameters['model'] == args.model] parameters_row = { 'iter' : 10, 'lambda' : 200, 'rank' : 200 } keyphrases_names = load_dataframe_csv(path = args.data_dir, name = "Keyphrases.csv")['Phrases'].tolist() results = critiquing(matrix_Train=R_train, matrix_Test=R_test, keyphrase_freq=R_train_keyphrase, item_keyphrase_freq=R_train_item_keyphrase.T, num_users_sampled=args.num_users_sampled, num_items_sampled=args.num_items_sampled, max_iteration_threshold=args.max_iteration_threshold, dataset_name=args.dataset_name, model=models[args.model], parameters_row=parameters_row, critiquing_model_name=args.critiquing_model_name, lamb = args.lambdas, keyphrases_names = keyphrases_names, keyphrase_selection_method = args.keyphrase_selection_method) table_path = load_yaml('config/global.yml', key='path')['tables'] save_dataframe_csv(results, table_path, args.save_path)
def main(args): progress = WorkSplitter() progress.section("Parameter Setting") print("Data Path: {}".format(args.data_dir)) print("Implicit User Feedback: {}".format(args.implicit)) progress.section("Load Raw Data") rating_matrix, timestamp_matrix = get_yelp_df( args.data_dir + args.data_name, sampling=True, top_user_num=args.top_user_num, top_item_num=args.top_item_num) progress.section("Split CSR Matrices") rtrain, rvalid, rtest, nonzero_index, rtime = time_ordered_split( rating_matrix=rating_matrix, timestamp_matrix=timestamp_matrix, ratio=args.ratio, implicit=args.implicit) import ipdb ipdb.set_trace() progress.section("Save NPZ") save_numpy(rtrain, args.data_dir, "Rtrain") save_numpy(rvalid, args.data_dir, "Rvalid") save_numpy(rtest, args.data_dir, "Rtest") save_numpy(rtime, args.data_dir, "Rtime") save_array(nonzero_index, args.data_dir, "Index")
def main(args): # Progress bar progress = WorkSplitter() # Show hyperparameter settings progress.section("Parameter Setting") print("Data Directory: {}".format(args.data_dir)) print("Number of Users Sampled: {}".format(args.num_users_sampled)) print("Number of Items Sampled: {}".format(args.num_items_sampled)) print("Number of Max Allowed Iterations: {}".format( args.max_iteration_threshold)) print("Critiquing Model: {}".format(args.critiquing_model_name)) R_train = load_numpy(path=args.data_dir, name=args.train_set) print("Train U-I Dimensions: {}".format(R_train.shape)) R_test = load_numpy(path=args.data_dir, name=args.test_set) print("Test U-I Dimensions: {}".format(R_test.shape)) R_train_keyphrase = load_numpy(path=args.data_dir, name=args.train_keyphrase_set).toarray() print("Train User Keyphrase U-I Dimensions: {}".format( R_train_keyphrase.shape)) R_train_item_keyphrase = load_numpy( path=args.data_dir, name=args.train_item_keyphrase_set).toarray() print("Train Item Keyphrase U-I Dimensions: {}".format( R_train_item_keyphrase.shape)) # table_path = load_yaml('config/global.yml', key='path')['tables'] # parameters = find_best_hyperparameters(table_path+args.dataset_name, 'NDCG') # parameters_row = parameters.loc[parameters['model'] == args.model] parameters_row = pd.DataFrame({'iter': [4], 'lambda': [80], 'rank': [200]}) results = critiquing(matrix_Train=R_train, matrix_Test=R_test, keyphrase_freq=R_train_keyphrase, item_keyphrase_freq=R_train_item_keyphrase, num_users_sampled=args.num_users_sampled, num_items_sampled=args.num_items_sampled, max_iteration_threshold=args.max_iteration_threshold, dataset_name=args.dataset_name, model=models[args.model], parameters_row=parameters_row, critiquing_model_name=args.critiquing_model_name) # table_path = load_yaml('config/global.yml', key='path')['tables'] table_path = '/home/shuyang/data4/LatentLinearCritiquingforConvRecSys/' save_dataframe_csv(results, table_path, args.save_path)
def hyper_parameter_tuning(train, validation, params, save_path):
    progress = WorkSplitter()
    table_path = load_yaml('config/global.yml', key='path')['tables']

    try:
        df = load_dataframe_csv(table_path, save_path)
    except:
        df = pd.DataFrame(columns=['model', 'k', 'topK'])

    num_user = train.shape[0]

    for algorithm in params['models']:
        for k in params['k']:
            if ((df['model'] == algorithm) & (df['k'] == k)).any():
                continue

            format = "model: {}, k: {}"
            progress.section(format.format(algorithm, k))

            progress.subsection("Training")
            model = params['models'][algorithm]()
            model.train(train)

            progress.subsection("Prediction")
            prediction_score = model.predict(train, k=k)

            prediction = predict(prediction_score=prediction_score,
                                 topK=params['topK'][-1],
                                 matrix_Train=train)

            progress.subsection("Evaluation")
            result = evaluate(prediction, validation, params['metric'], params['topK'])

            result_dict = {'model': algorithm, 'k': k}

            for name in result.keys():
                result_dict[name] = [round(result[name][0], 4),
                                     round(result[name][1], 4)]

            df = df.append(result_dict, ignore_index=True)

            save_dataframe_csv(df, table_path, save_path)
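# A minimal usage sketch for hyper_parameter_tuning() above. The file names, the
# 'UserKNN' model key, and the grid values are hypothetical placeholders; only the
# expected params keys ('models', 'k', 'topK', 'metric') follow from the code.
def _hyper_parameter_tuning_demo():
    train = load_numpy(path='data/', name='Rtrain.npz')       # hypothetical path
    validation = load_numpy(path='data/', name='Rvalid.npz')  # hypothetical path
    params = {
        'models': {'UserKNN': models['UserKNN']},  # assumes such a key exists in `models`
        'k': [50, 100, 200],                       # neighborhood sizes to sweep
        'topK': [5, 10, 50],                       # ranking cutoffs for evaluation
        'metric': ['R-Precision', 'NDCG'],         # metric names passed to evaluate()
    }
    hyper_parameter_tuning(train, validation, params, save_path='userknn_tuning.csv')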
def biasedmf(matrix_train, matrix_valid, matrix_unif_train, iteration=100,
             lam=0.01, rank=50, seed=0, batch_size=500, way=None, gpu_on=True,
             **unused):
    progress = WorkSplitter()

    progress.section("BiasedMF: Set the random seed")
    np.random.seed(seed)
    tf.set_random_seed(seed)

    progress.section("BiasedMF: Training")
    m, n = matrix_train.shape
    model = BiasedMF(m, n, rank, lamb=lam, batch_size=batch_size, gpu_on=gpu_on)

    metric_names = ['NLL', 'AUC']

    if way == 'unif':
        RQ, Y, user_bias, item_bias, _ = model.train_model(matrix_unif_train,
                                                           matrix_valid, iteration,
                                                           metric_names)
    elif way == 'combine':
        matrix_train += matrix_unif_train
        RQ, Y, user_bias, item_bias, _ = model.train_model(matrix_train,
                                                           matrix_valid, iteration,
                                                           metric_names)
    else:
        RQ, Y, user_bias, item_bias, prediction = model.train_model(matrix_train,
                                                                    matrix_valid,
                                                                    iteration,
                                                                    metric_names)

    # if gpu_on:
    #     np.savetxt('Matlab/biasedmf_prediction.txt', cp.asnumpy(prediction))
    # else:
    #     np.savetxt('Matlab/biasedmf_prediction.txt', prediction)

    model.sess.close()
    tf.reset_default_graph()

    return RQ, Y, user_bias, item_bias
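# A minimal usage sketch for biasedmf() above. The path and file names are hypothetical
# (the S_c/S_t/S_va naming mirrors the Yahoo R3 preprocessing script in this repo);
# only the call signature follows from the function itself.
def _biasedmf_demo():
    S_c = load_numpy(path='data/yahooR3/', name='S_c.npz')    # biased training set
    S_t = load_numpy(path='data/yahooR3/', name='S_t.npz')    # uniformly collected training set
    S_va = load_numpy(path='data/yahooR3/', name='S_va.npz')  # validation set
    RQ, Y, user_bias, item_bias = biasedmf(S_c, S_va, S_t,
                                           iteration=100, lam=0.01, rank=50,
                                           batch_size=500, way='combine')
    return RQ, Y, user_bias, item_bias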
def main(args): progress = WorkSplitter() progress.section("Yahoo R3: Load Raw Data") user_df = pd.read_csv(args.path + args.dataset + args.user, sep=args.sep, header=None, names=args.names) random_df = pd.read_csv(args.path + args.dataset + args.random, sep=args.sep, header=None, names=args.names) if args.implicit: """ If only implicit (clicks, views, binary) feedback, convert to implicit feedback """ user_df['rating'].loc[user_df['rating'] < args.threshold] = -1 user_df['rating'].loc[user_df['rating'] >= args.threshold] = 1 random_df['rating'].loc[random_df['rating'] < args.threshold] = -1 random_df['rating'].loc[random_df['rating'] >= args.threshold] = 1 progress.section("Yahoo R3: Randomly Split Random Set") m, n = max(user_df['uid']) + 1, max(user_df['iid']) + 1 unif_train, validation, test = seed_randomly_split(df=random_df, ratio=args.ratio, split_seed=args.seed, shape=(m, n)) progress.section("Yahoo R3: Save NPZ") save_dir = args.path + args.dataset train = sparse.csr_matrix( (user_df['rating'], (user_df['uid'], user_df['iid'])), shape=(m, n), dtype='float32') save_numpy(train, save_dir, "S_c") save_numpy(unif_train, save_dir, "S_t") save_numpy(validation, save_dir, "S_va") save_numpy(test, save_dir, "S_te") progress.section("Yahoo R3: Statistics of Data Sets") print('* S_c #num: %6d, pos: %.6f, neg: %.6f' % (train.count_nonzero(), np.sum(train == 1) / train.count_nonzero(), 1 - np.sum(train == 1) / train.count_nonzero())) print('* S_t #num: %6d, pos: %.6f, neg: %.6f' % (unif_train.count_nonzero(), np.sum(unif_train == 1) / unif_train.count_nonzero(), 1 - np.sum(unif_train == 1) / unif_train.count_nonzero())) print('* S_va #num: %6d, pos: %.6f, neg: %.6f' % (validation.count_nonzero(), np.sum(validation == 1) / validation.count_nonzero(), 1 - np.sum(validation == 1) / validation.count_nonzero())) print('* S_te #num: %6d, pos: %.6f, neg: %.6f' % (test.count_nonzero(), np.sum(test == 1) / test.count_nonzero(), 1 - np.sum(test == 1) / test.count_nonzero()))
def execute(train, test, params, model, gpu_on=True, analytical=False):
    progress = WorkSplitter()

    columns = ['model', 'rank', 'lambda', 'epoch', 'corruption', 'topK']

    progress.section("\n".join([":".join((str(k), str(params[k]))) for k in columns]))

    df = pd.DataFrame(columns=columns)

    progress.subsection("Train")
    RQ, Yt, Bias = model(train,
                         epoch=params['epoch'],
                         lamb=params['lambda'],
                         rank=params['rank'],
                         corruption=params['corruption'])
    Y = Yt.T

    progress.subsection("Prediction")
    prediction = predict(matrix_U=RQ,
                         matrix_V=Y,
                         bias=Bias,
                         topK=params['topK'][-1],
                         matrix_Train=train,
                         gpu=gpu_on)

    progress.subsection("Evaluation")
    result = evaluate(prediction, test, params['metric'], params['topK'],
                      analytical=analytical)

    if analytical:
        return result
    else:
        result_dict = params

        for name in result.keys():
            result_dict[name] = [round(result[name][0], 4), round(result[name][1], 4)]

        df = df.append(result_dict, ignore_index=True)

        return df
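# A minimal usage sketch for execute() above. The data paths and the 'VAE-CF' model
# key are hypothetical; only the params keys ('model', 'rank', 'lambda', 'epoch',
# 'corruption', 'topK', 'metric') follow from the function itself.
def _execute_demo():
    train = load_numpy(path='data/', name='Rtrain.npz')  # hypothetical path
    test = load_numpy(path='data/', name='Rtest.npz')    # hypothetical path
    params = {
        'model': 'VAE-CF',  # label recorded in the result row (hypothetical key)
        'rank': 200,
        'lambda': 0.01,
        'epoch': 300,
        'corruption': 0.4,
        'topK': [5, 10, 50],
        'metric': ['R-Precision', 'NDCG'],
    }
    df = execute(train, test, params, models['VAE-CF'])  # assumes 'VAE-CF' exists in `models`
    print(df)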
def uncertainty(Rtrain, df_input, rank):
    progress = WorkSplitter()
    m, n = Rtrain.shape

    valid_models = vaes.keys()

    results = []

    for run in range(1):
        for idx, row in df_input.iterrows():
            row = row.to_dict()

            if row['model'] not in valid_models:
                continue

            progress.section(json.dumps(row))

            if 'optimizer' not in row.keys():
                row['optimizer'] = 'RMSProp'

            model = vaes[row['model']](n,
                                       rank,
                                       batch_size=100,
                                       lamb=row['lambda'],
                                       optimizer=Regularizer[row['optimizer']])

            model.train_model(Rtrain, corruption=row['corruption'], epoch=row['iter'])

            data_batches = model.get_batches(Rtrain, batch_size=100)

            progress.subsection("Predict")
            for batch in tqdm(data_batches):
                batch_size = batch.shape[0]
                _, stds = model.uncertainty(batch.todense())
                num_rated = np.squeeze(np.asarray(np.sum(batch, axis=1)))
                std = np.mean(stds, axis=1)
                results.append(pd.DataFrame({'model': [row['model']] * batch_size,
                                             'numRated': num_rated,
                                             'std': std}))

    return pd.concat(results)
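# A minimal sketch of how uncertainty() above might be driven. The file names and the
# 'VAE-CF' model key are hypothetical; the required df_input columns ('model', 'lambda',
# 'corruption', 'iter', optionally 'optimizer') follow from the loop in the function.
def _uncertainty_demo():
    Rtrain = load_numpy(path='data/', name='Rtrain.npz')  # hypothetical path
    df_input = pd.DataFrame({'model': ['VAE-CF'],
                             'lambda': [0.001],
                             'corruption': [0.4],
                             'iter': [300],
                             'optimizer': ['RMSProp']})
    df_std = uncertainty(Rtrain, df_input, rank=200)
    df_std.to_csv('uncertainty_vs_num_rated.csv', index=False)  # hypothetical output file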
def wrsamplemf(matrix_train, matrix_valid, matrix_unif_train, iteration=100,
               lam=0.01, rank=50, seed=0, batch_size=500, gpu_on=True, **unused):
    progress = WorkSplitter()

    progress.section("WRSampleMF: Set the random seed")
    np.random.seed(seed)
    tf.set_random_seed(seed)

    progress.section("WRSampleMF: Training")
    m, n = matrix_train.shape

    # Mark the entries observed in S_c, then merge in the uniform set S_t.
    marks = sparse.csr_matrix(matrix_train.shape)
    marks[(matrix_train != 0).nonzero()] = 1
    matrix_train += matrix_unif_train

    num_samples = len(matrix_train.nonzero()[0])

    model = WRSampleMF(m, n, rank, num_samples, lamb=lam, batch_size=batch_size,
                       gpu_on=gpu_on)

    metric_names = ['NLL', 'AUC']

    RQ, Y, user_bias, item_bias, confidence, user_item_pairs, prediction = model.train_model(
        matrix_train, marks, matrix_valid, iteration, metric_names)

    # np.savetxt('Matlab/wrsamplemf_samples.txt', user_item_pairs)
    # np.savetxt('Matlab/wrsamplemf_weights.txt', confidence)

    model.sess.close()
    tf.reset_default_graph()

    return RQ, Y, user_bias, item_bias
def main(args):
    # Progress bar
    progress = WorkSplitter()

    # Load Data
    progress.section("Load Data")
    start_time = time.time()
    data = Data(args.path, args.train, args.valid, is_lb=True)
    print("Elapsed: {0}".format(inhour(time.time() - start_time)))

    # Build model
    progress.section("Build Model")
    model = FeatureNet(data.n_token, data.n_feature, [1024, 2000, 1000, 500, 100])
    model.cuda()
    print(model)
    model.load_state_dict(torch.load(args.checkpoint))
    print(model)

    lb_loader = data.instance_a_lb_loader(args.batch)

    lbs = {'user_lb': list(), 'tweet_lb': list()}
    preds = []

    model.eval()
    with torch.no_grad():
        lb_iterator = tqdm(lb_loader, desc="lb")
        for _, batch in enumerate(lb_iterator):
            token, feature, tweet_lb, user_lb = (batch[0].float().cuda(),
                                                 batch[1].float().cuda(),
                                                 batch[2],
                                                 batch[3])
            pred = torch.sigmoid(model(token, feature)).detach().cpu().numpy()
            lbs['tweet_lb'] += tweet_lb[0]
            lbs['user_lb'] += user_lb[0]
            preds.append(pred)

    final_csv = pd.DataFrame(lbs)
    preds = np.float64(np.vstack(preds))

    if not os.path.exists(args.spath):
        os.makedirs(args.spath)

    print("Generating CSVs...")
    for i, engage in enumerate(["reply", "retweet", "comment", "like"]):
        final_csv[engage] = preds[:, i]
        final_csv[['tweet_lb', 'user_lb', engage]].to_csv(
            os.path.join(args.spath, engage + '.csv'), index=False, header=False)
def execute(train, test, params, model, analytical=False):
    progress = WorkSplitter()

    columns = ['model', 'k', 'topK']

    progress.section("\n".join([":".join((str(k), str(params[k]))) for k in columns]))

    df = pd.DataFrame(columns=columns)

    progress.subsection("Train")
    model = model()
    model.train(train)

    progress.subsection("Prediction")
    prediction_score = model.predict(train, k=params['k'])

    prediction = predict(prediction_score=prediction_score,
                         topK=params['topK'][-1],
                         matrix_Train=train)

    progress.subsection("Evaluation")
    result = evaluate(prediction, test, params['metric'], params['topK'],
                      analytical=analytical)

    if analytical:
        return result
    else:
        result_dict = params

        for name in result.keys():
            result_dict[name] = [round(result[name][0], 4), round(result[name][1], 4)]

        df = df.append(result_dict, ignore_index=True)

        return df
def main(args): progress = WorkSplitter() progress.section("Load Raw Data") rating_matrix = load_pandas(row_name='userId', col_name='itemId', value_name=None, path=args.path, name=args.name, shape=args.shape) timestamp_matrix = load_pandas(row_name='userId', col_name='itemId', value_name='Timestamp', path=args.path, name=args.name, shape=args.shape) progress.section("Split CSR Matrices") rtrain, rvalid, rtest, nonzero_index, rtime = time_ordered_split( rating_matrix=rating_matrix, timestamp_matrix=timestamp_matrix, ratio=args.ratio, implicit=args.implicit, sampling=True, percentage=0.2) progress.section("Save NPZ") save_numpy(rtrain, args.path, "Rtrain") save_numpy(rvalid, args.path, "Rvalid") save_numpy(rtest, args.path, "Rtest") save_numpy(rtime, args.path, "Rtime") save_array(nonzero_index, args.path, "Index")
def main(args): progress = WorkSplitter() progress.section("Load Raw Data") rating_matrix = load_pandas_without_names( path=args.path, name=args.name, row_name='userId', sep='\t', col_name='trackId', value_name='rating', shape=args.shape, names=['userId', 'trackId', 'rating']) progress.section("Split CSR Matrices") rtrain, rvalid, rtest, nonzero_index = split_seed_randomly( rating_matrix=rating_matrix, ratio=args.ratio, threshold=80, implicit=args.implicit, sampling=True, percentage=0.2) print("Done splitting Yahoo dataset") progress.section("Save NPZ") save_numpy(rtrain, args.path, "Rtrain") save_numpy(rvalid, args.path, "Rvalid") save_numpy(rtest, args.path, "Rtest") save_array(nonzero_index, args.path, "Index") print("Done saving data for yahoo after splitting")
def initfeatureembedae(matrix_train, matrix_valid, iteration=100, lam=0.01,
                       rank=50, seed=0, batch_size=256, way='both', dataset=None,
                       gpu_on=True, **unused):
    progress = WorkSplitter()

    progress.section("InitFeatureEmbedAE: Set the random seed")
    np.random.seed(seed)
    tf.set_random_seed(seed)

    progress.section("InitFeatureEmbedAE: Load the variables trained on S_t")
    X = np.load('latent/' + dataset + 'unif_X_AutoRec_200.npy')
    xBias = np.load('latent/' + dataset + 'unif_xB_AutoRec_200.npy')
    Y = np.load('latent/' + dataset + 'unif_Y_AutoRec_200.npy')
    yBias = np.load('latent/' + dataset + 'unif_yB_AutoRec_200.npy')

    progress.section("InitFeatureEmbedAE: Training")
    m, n = matrix_train.shape
    model = InitFeatureEmbedAE(n, rank, lamb=lam, batch_size=batch_size,
                               gpu_on=gpu_on, init_X=X, init_Y=Y,
                               init_xBias=xBias, init_yBias=yBias, way=way)

    metric_names = ['NLL', 'AUC']

    RQ, X, xBias, Y, yBias = model.train_model(matrix_train, matrix_valid,
                                               iteration, metric_names)

    model.sess.close()
    tf.reset_default_graph()

    return RQ, X, xBias, Y, yBias
def restrictedbatchsamplemf(matrix_train, matrix_valid, matrix_unif_train,
                            iteration=100, lam=0.01, rank=50, seed=0,
                            batch_size=500, gpu_on=True, step=3, way=None,
                            **unused):
    progress = WorkSplitter()

    progress.section("RestrictedBatchSampleMF: Set the random seed")
    np.random.seed(seed)
    tf.set_random_seed(seed)

    progress.section("RestrictedBatchSampleMF: Training")
    m, n = matrix_train.shape
    model = BatchSampleMF(m, n, rank, lamb=lam, batch_size=batch_size, step=step,
                          gpu_on=gpu_on, way=way)

    metric_names = ['NLL', 'AUC']

    RQ, Y, user_bias, item_bias = model.train_model(matrix_train, matrix_unif_train,
                                                    matrix_valid, iteration,
                                                    metric_names)

    model.sess.close()
    tf.reset_default_graph()

    return RQ, Y, user_bias, item_bias
def propensitymf(matrix_train, matrix_valid, matrix_unif_train, iteration=100,
                 lam=0.01, rank=50, seed=0, batch_size=500, gpu_on=True, **unused):
    progress = WorkSplitter()

    progress.section("PropensityMF: Set the random seed")
    np.random.seed(seed)
    tf.set_random_seed(seed)

    progress.section("PropensityMF: Calculating Propensity Score")
    m, n = matrix_train.shape
    # P(O=1): overall observation rate in S_c.
    P_O = matrix_train.count_nonzero() / (m * n)
    # P(Y | O=1): label distribution [negative, positive] among observed entries of S_c.
    P_YO = np.array([np.sum(matrix_train == -1) / matrix_train.count_nonzero(),
                     1 - np.sum(matrix_train == -1) / matrix_train.count_nonzero()])
    # P(Y): label distribution estimated from the uniformly collected S_t.
    P_Y = np.array([np.sum(matrix_unif_train == -1) / matrix_unif_train.count_nonzero(),
                    1 - np.sum(matrix_unif_train == -1) / matrix_unif_train.count_nonzero()])
    # Inverse propensity 1 / P(O=1 | Y), obtained via Bayes' rule.
    invP = 1 / (P_YO * P_O / P_Y)

    # Note: Propensity MF uses S_c and S_t as training set
    matrix_train += matrix_unif_train

    progress.section("PropensityMF: Training")
    m, n = matrix_train.shape
    model = PropensityMF(m, n, rank, lamb=lam, batch_size=batch_size, gpu_on=gpu_on)

    metric_names = ['NLL', 'AUC']

    RQ, Y, user_bias, item_bias = model.train_model(matrix_train, matrix_valid,
                                                    invP, iteration, metric_names)

    model.sess.close()
    tf.reset_default_graph()

    return RQ, Y, user_bias, item_bias
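# A toy, self-contained illustration (hypothetical numbers) of the inverse-propensity
# weights computed in propensitymf() above: by Bayes' rule,
# P(O=1 | Y=y) = P(Y=y | O=1) * P(O=1) / P(Y=y), and invP = 1 / P(O=1 | Y).
def _propensity_weight_demo():
    P_O = 0.05                      # 5% of user-item pairs observed in S_c
    P_YO = np.array([0.2, 0.8])     # among observed pairs: 20% negative, 80% positive
    P_Y = np.array([0.6, 0.4])      # uniform data S_t: 60% negative, 40% positive

    P_O_given_Y = P_YO * P_O / P_Y  # P(O=1 | Y) for [negative, positive]
    invP = 1 / P_O_given_Y          # weights used to reweight the training loss
    print(P_O_given_Y)              # [0.01666667 0.1] -> positives ~6x more likely to be observed
    print(invP)                     # [60. 10.]        -> negatives receive the larger weight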
def unionsamplemf(matrix_train, matrix_valid, matrix_unif_train, iteration=100,
                  lam=0.01, rank=50, seed=0, batch_size=500, confidence=0.9,
                  gpu_on=True, **unused):
    progress = WorkSplitter()

    progress.section("UnionSampleMF: Set the random seed")
    np.random.seed(seed)
    tf.set_random_seed(seed)

    progress.section("UnionSampleMF: Training")
    m, n = matrix_train.shape
    model = UnionSampleMF(m, n, rank, lamb=lam, batch_size=batch_size,
                          gpu_on=gpu_on, confidence=confidence)

    metric_names = ['NLL', 'AUC']

    # Mark the entries observed in S_c, then merge in the uniform set S_t.
    marks = sparse.csr_matrix(matrix_train.shape)
    marks[(matrix_train != 0).nonzero()] = 1
    matrix_train += matrix_unif_train

    RQ, Y, user_bias, item_bias = model.train_model(matrix_train, marks,
                                                    matrix_valid, iteration,
                                                    metric_names)

    model.sess.close()
    tf.reset_default_graph()

    return RQ, Y, user_bias, item_bias
def main(args):
    progress = WorkSplitter()

    raw = pd.read_csv(args.path + args.name,
                      names=['user', 'item', 'rating', 'timestamp'])
    raw['userID'] = pd.factorize(raw.user)[0]
    raw['itemID'] = pd.factorize(raw.item)[0]

    progress.section("Load Raw Data")
    rating_matrix = getSparseMatrix(raw, row_name='userID', col_name='itemID',
                                    value_name='rating')
    timestamp_matrix = getSparseMatrix(raw, row_name='userID', col_name='itemID',
                                       value_name='timestamp')

    progress.section("Split CSR Matrices")
    rtrain, rvalid, rtest, nonzero_index, rtime = time_ordered_split(
        rating_matrix=rating_matrix,
        timestamp_matrix=timestamp_matrix,
        ratio=args.ratio,
        implicit=args.implicit)

    progress.section("Save NPZ")
    save_numpy(rtrain, args.path, "Rtrain")
    save_numpy(rvalid, args.path, "Rvalid")
    save_numpy(rtest, args.path, "Rtest")
    save_numpy(rtime, args.path, "Rtime")
    save_array(nonzero_index, args.path, "Index")
def autorec(matrix_train, matrix_valid, matrix_unif_train, iteration=100,
            lam=0.01, rank=50, seed=0, batch_size=256, way=None, gpu_on=True,
            **unused):
    progress = WorkSplitter()

    progress.section("AutoRec: Set the random seed")
    np.random.seed(seed)
    tf.set_random_seed(seed)

    progress.section("AutoRec: Training")
    m, n = matrix_train.shape
    model = AutoRec(n, rank, lamb=lam, batch_size=batch_size, gpu_on=gpu_on)

    metric_names = ['NLL', 'AUC']

    if way == 'unif':
        RQ, X, xBias, Y, yBias = model.train_model(matrix_unif_train, matrix_valid,
                                                   iteration, metric_names)
    elif way == 'combine':
        matrix_train += matrix_unif_train
        RQ, X, xBias, Y, yBias = model.train_model(matrix_train, matrix_valid,
                                                   iteration, metric_names)
    else:
        RQ, X, xBias, Y, yBias = model.train_model(matrix_train, matrix_valid,
                                                   iteration, metric_names)

    model.sess.close()
    tf.reset_default_graph()

    return RQ, X, xBias, Y, yBias
def causalsamplemf(matrix_train, matrix_valid, matrix_unif_train, iteration=100,
                   lam=0.01, lam2=0.01, rank=50, seed=0, batch_size=500,
                   gpu_on=True, **unused):
    progress = WorkSplitter()

    progress.section("CausalSampleMF: Set the random seed")
    np.random.seed(seed)
    tf.set_random_seed(seed)

    progress.section("CausalSampleMF: Training")
    m, n = matrix_train.shape

    # Create new item IDs for S_t (i.e., shift them into [n, n*2))
    unif_user_item_matrix = lil_matrix(matrix_unif_train)
    unif_user_item_pairs = np.asarray(unif_user_item_matrix.nonzero()).T
    unif_label = np.asarray(matrix_unif_train[unif_user_item_pairs[:, 0],
                                              unif_user_item_pairs[:, 1]]).T
    unif_user_item_pairs[:, 1] += n

    # Create a new CSR matrix covering the union of S_c and S_t
    norm_user_item_matrix = lil_matrix(matrix_train)
    norm_user_item_pairs = np.asarray(norm_user_item_matrix.nonzero()).T
    norm_label = np.asarray(matrix_train[norm_user_item_pairs[:, 0],
                                         norm_user_item_pairs[:, 1]]).T

    user_item_pairs = np.vstack((unif_user_item_pairs, norm_user_item_pairs))
    labels = np.vstack((unif_label, norm_label))

    matrix_train = sparse.csr_matrix(
        (labels[:, 0], (user_item_pairs[:, 0], user_item_pairs[:, 1])),
        shape=(m, n * 2), dtype='float32')

    model = CausalSampleMF(m, n, rank, lamb=lam, lamb2=lam2, batch_size=batch_size,
                           gpu_on=gpu_on)

    metric_names = ['NLL', 'AUC']

    RQ, Y, user_bias, item_bias = model.train_model(matrix_train, matrix_valid,
                                                    iteration, metric_names)

    model.sess.close()
    tf.reset_default_graph()

    return RQ, Y, user_bias, item_bias
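# A toy, self-contained illustration (made-up matrices) of the item-ID shift used in
# causalsamplemf() above: entries of the uniform set S_t are re-indexed into columns
# [n, 2n), so the stacked matrix holds S_c in the first n columns and S_t in the last n.
def _item_id_shift_demo():
    n = 3  # number of items
    S_c = sparse.csr_matrix(np.array([[1, 0, -1],
                                      [0, 1,  0]], dtype='float32'))  # biased feedback
    S_t = sparse.csr_matrix(np.array([[0, -1, 0],
                                      [1,  0, 0]], dtype='float32'))  # uniform feedback

    rows_c, cols_c = S_c.nonzero()
    rows_t, cols_t = S_t.nonzero()
    rows = np.concatenate([rows_c, rows_t])
    cols = np.concatenate([cols_c, cols_t + n])  # shift S_t item IDs by n
    vals = np.concatenate([S_c[rows_c, cols_c].A1, S_t[rows_t, cols_t].A1])

    stacked = sparse.csr_matrix((vals, (rows, cols)), shape=(2, 2 * n), dtype='float32')
    print(stacked.toarray())
    # [[ 1.  0. -1.  0. -1.  0.]
    #  [ 0.  1.  0.  1.  0.  0.]]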
def softlabelae(matrix_train, matrix_valid, iteration=100, lam=0.01, rank=50,
                rank2=50, tau=2, seed=0, batch_size=256, confidence=0.9,
                dataset=None, gpu_on=True, **unused):
    progress = WorkSplitter()

    progress.section("SoftLabelAE: Set the random seed")
    np.random.seed(seed)
    tf.set_random_seed(seed)

    progress.section("SoftLabelAE: Load the variables trained on S_t")
    X = np.load('latent/' + dataset + 'unif_X_DeepAutoRec_200.npy')
    Y = np.load('latent/' + dataset + 'unif_Y_DeepAutoRec_200.npy')
    Z = np.load('latent/' + dataset + 'unif_Z_DeepAutoRec_200.npy')
    K = np.load('latent/' + dataset + 'unif_K_DeepAutoRec_200.npy')
    xBias = np.load('latent/' + dataset + 'unif_xB_DeepAutoRec_200.npy')
    yBias = np.load('latent/' + dataset + 'unif_yB_DeepAutoRec_200.npy')
    zBias = np.load('latent/' + dataset + 'unif_zB_DeepAutoRec_200.npy')
    kBias = np.load('latent/' + dataset + 'unif_kB_DeepAutoRec_200.npy')

    progress.section("SoftLabelAE: Training")
    m, n = matrix_train.shape
    model = SoftLabelAE(n, rank, rank2, lamb=lam, batch_size=batch_size,
                        gpu_on=gpu_on, init_X=X, init_Y=Y, init_Z=Z, init_K=K,
                        init_xBias=xBias, init_yBias=yBias, init_zBias=zBias,
                        init_kBias=kBias, tau=tau, confidence=confidence)

    metric_names = ['NLL', 'AUC']

    RQ, X, xBias, Y, yBias, Z, zBias, K, kBias = model.train_model(
        matrix_train, matrix_valid, iteration, metric_names)

    model.sess.close()
    tf.reset_default_graph()

    return RQ, X, xBias, Y, yBias, Z, zBias, K, kBias
def main(args): progress = WorkSplitter() progress.section("Parameter Setting") print("Data Path: {}".format(args.path)) print("Validation: {}".format(args.validation)) print("Implicit: {}".format(args.implicit)) progress.section("Load Raw Data") rating_matrix = load_pandas(path=args.path, name=args.name, shape=args.shape) timestamp_matrix = load_pandas(path=args.path, value_name='timestamp', name=args.name, shape=args.shape) progress.section("Split CSR Matrices") rtrain, rvalid, rtest, _, _, rtime = split_user_randomly( rating_matrix=rating_matrix, timestamp_matrix=timestamp_matrix, ratio=args.split_user_ratio, implicit=args.implicit) if args.validation: rtrain, rvalid, _, _, _ = time_ordered_split( rating_matrix=rtrain, timestamp_matrix=rtime, ratio=args.split_train_valid_ratio, implicit=False, remove_empty=False) ractive, rtest, _, _, _ = time_ordered_split( rating_matrix=rtest, timestamp_matrix=rtime, ratio=args.split_active_test_ratio, implicit=False, remove_empty=False) progress.section("Save NPZ") save_numpy(rtrain, args.path, "Rtrain") save_numpy(rvalid, args.path, "Rvalid") save_numpy(ractive, args.path, "Ractive") save_numpy(rtest, args.path, "Rtest") save_numpy(rtime, args.path, "Rtime")
def bridgelabelmf(matrix_train, matrix_valid, iteration=100, lam=0.01, lam2=0.01,
                  rank=50, seed=0, batch_size=500, gpu_on=True, dataset=None,
                  **unused):
    progress = WorkSplitter()

    progress.section("BridgeLabelMF: Set the random seed")
    np.random.seed(seed)
    tf.set_random_seed(seed)

    progress.section("BridgeLabelMF: Load the variables trained on S_c/S_t")
    norm_RQ = np.load('latent/' + dataset + 'U_BiasedMF_10.npy')
    norm_Y = np.load('latent/' + dataset + 'V_BiasedMF_10.npy')
    norm_uBias = np.load('latent/' + dataset + 'uB_BiasedMF_10.npy')
    norm_iBias = np.load('latent/' + dataset + 'iB_BiasedMF_10.npy')

    unif_RQ = np.load('latent/' + dataset + 'unif_U_BiasedMF_10.npy')
    unif_Y = np.load('latent/' + dataset + 'unif_V_BiasedMF_10.npy')
    unif_uBias = np.load('latent/' + dataset + 'unif_uB_BiasedMF_10.npy')
    unif_iBias = np.load('latent/' + dataset + 'unif_iB_BiasedMF_10.npy')

    progress.section("BridgeLabelMF: Training")
    m, n = matrix_train.shape
    model = BridgeLabelMF(m, n, rank, lamb=lam, lamb2=lam2, batch_size=batch_size,
                          gpu_on=gpu_on, norm_init_U=norm_RQ, norm_init_V=norm_Y,
                          norm_init_uBias=norm_uBias, norm_init_iBias=norm_iBias,
                          unif_init_U=unif_RQ, unif_init_V=unif_Y,
                          unif_init_uBias=unif_uBias, unif_init_iBias=unif_iBias)

    metric_names = ['NLL', 'AUC']

    RQ, Y, user_bias, item_bias = model.train_model(matrix_train, matrix_valid,
                                                    iteration, metric_names)

    model.sess.close()
    tf.reset_default_graph()

    return RQ, Y, user_bias, item_bias