def wrsampleae(matrix_train, matrix_valid, matrix_unif_train, iteration=100,
               lam=0.01, rank=50, seed=0, batch_size=256, gpu_on=True, **unused):
    """Train a WRSampleAE autoencoder on the union of the normal and
    uniformly-logged training sets.

    :param matrix_train: m*n interaction matrix from the normal logging policy
    :param matrix_valid: validation matrix used for metric tracking
    :param matrix_unif_train: interaction matrix collected uniformly at random;
        summed into the training matrix before fitting
    :param iteration: number of training iterations
    :param lam: L2 regularization weight
    :param rank: latent dimension of the autoencoder
    :param seed: random seed for numpy and tensorflow
    :param batch_size: mini-batch size
    :param gpu_on: whether to run TensorFlow on GPU
    :return: learned parameters (RQ, X, xBias, Y, yBias)
    """
    progress = WorkSplitter()
    progress.section("WRSampleAE: Set the random seed")
    np.random.seed(seed)
    tf.set_random_seed(seed)
    progress.section("WRSampleAE: Training")
    m, n = matrix_train.shape
    # Mark entries observed in the normal training set so the model can
    # distinguish them from the uniform samples added below.
    marks = sparse.csr_matrix(matrix_train.shape)
    marks[(matrix_train != 0).nonzero()] = 1
    # NOTE(review): sparse `+=` rebinds to a newly allocated matrix here, so
    # the caller's matrix_train should be unaffected — confirm for csr input.
    matrix_train += matrix_unif_train
    model = WRSampleAE(n, rank, m, lamb=lam, batch_size=batch_size, gpu_on=gpu_on)
    metric_names = ['NLL', 'AUC']
    RQ, X, xBias, Y, yBias = model.train_model(matrix_train, marks, matrix_valid,
                                               iteration, metric_names)
    model.sess.close()
    tf.reset_default_graph()
    return RQ, X, xBias, Y, yBias
def plrec(matrix_train, iteration=4, lamb=80, rank=200, seed=1, **unused):
    """
    Function used to achieve generalized projected lrec w/o item-attribute
    embedding.

    :param matrix_train: user-item matrix with shape m*n
    :param iteration: number of power iterations in randomized svd
    :param lamb: parameter of penalty
    :param rank: latent dimension size
    :param seed: the seed of the pseudo random number generator to use when
        shuffling the data
    :return: prediction in sparse matrix
    """
    progress = WorkSplitter()

    progress.subsection("Randomized SVD")
    tic = time.time()
    _, sigma, Qt = randomized_svd(matrix_train,
                                  n_components=rank,
                                  n_iter=iteration,
                                  random_state=seed)
    # Project users onto the sqrt(sigma)-scaled right singular vectors.
    projection = sparse.csc_matrix(Qt.T * np.sqrt(sigma))
    RQ = matrix_train.dot(projection)
    print("Elapsed: {}".format(inhour(time.time() - tic)))

    progress.subsection("Closed-Form Linear Optimization")
    tic = time.time()
    # Ridge-regression normal equations solved in closed form.
    gram = RQ.T.dot(RQ) + lamb * sparse.identity(rank, dtype=np.float32)
    Y = sparse.linalg.inv(gram.tocsc()).dot(RQ.T).dot(matrix_train)
    print("Elapsed: {}".format(inhour(time.time() - tic)))

    return np.array(RQ.todense()), np.array(Y.todense()), None
def pure_svd(matrix_train, embeded_matrix=np.empty((0)), iteration=4, rank=200,
             fb=False, seed=1, **unused):
    """
    PureSVD algorithm.

    :param matrix_train: rating matrix
    :param matrix_train: rating matrix
    :param embeded_matrix: item or user embedding matrix(side info)
    :param iteration: number of random SVD iterations
    :param rank: SVD top K eigenvalue ranks
    :param fb: facebook package or sklearn package. boolean
    :param seed: Random initialization seed
    :param unused: args that not applicable for this algorithm
    :return:
    """
    progress = WorkSplitter()

    matrix_input = matrix_train
    if embeded_matrix.shape[0] > 0:
        # Stack side information as extra rows of the input matrix.
        matrix_input = vstack((matrix_input, embeded_matrix.T))

    progress.subsection("Randomized SVD")
    tic = time.time()
    if fb:
        _, _, Qt = pca(matrix_input, k=rank, n_iter=iteration, raw=True)
    else:
        _, _, Qt = randomized_svd(matrix_input,
                                  n_components=rank,
                                  n_iter=iteration,
                                  power_iteration_normalizer='QR',
                                  random_state=seed)
    # Plain projection of the input onto the right singular vectors.
    RQ = matrix_input.dot(sparse.csc_matrix(Qt).T)
    print("Elapsed: {0}".format(inhour(time.time() - tic)))

    return np.array(RQ.todense()), Qt, None
def nceplrec(matrix_train, embeded_matrix=np.empty((0)), iteration=4, lam=80,
             rank=200, seed=1, root=1.1, gpu_on=True, **unused):
    """
    NCE-PLRec: projected LREC fit on a PMI-transformed interaction matrix.

    :param matrix_train: user-item matrix with shape m*n
    :param embeded_matrix: item-attribute matrix with length n (each row
        represents one item)
    :param iteration: number of power iterations in randomized svd
    :param lam: parameter of penalty
    :param rank: latent dimension size
    :param seed: seed for the randomized SVD
    :param root: smoothing parameter forwarded to the PMI builders
        (presumably a popularity root — confirm against get_pmi_matrix)
    :param gpu_on: build the PMI matrix on GPU when True
    :return: dense user factors, dense item factors, and None (no bias)
    """
    progress = WorkSplitter()
    matrix_input = matrix_train
    if embeded_matrix.shape[0] > 0:
        # Append side information as extra rows.
        matrix_input = vstack((matrix_input, embeded_matrix.T))
    progress.subsection("Create PMI matrix")
    if gpu_on:
        pmi_matrix = get_pmi_matrix_gpu(matrix_input, root)
    else:
        pmi_matrix = get_pmi_matrix(matrix_input, root)
    progress.subsection("Randomized SVD")
    start_time = time.time()
    P, sigma, Qt = randomized_svd(pmi_matrix,
                                  n_components=rank,
                                  n_iter=iteration,
                                  random_state=seed)
    # Plain
    # RQ = matrix_input.dot(sparse.csc_matrix(Qt).T)
    # sqrt sigma injection: scale the projection by sqrt of singular values.
    RQ = matrix_input.dot(sparse.csc_matrix(Qt.T * np.sqrt(sigma)))
    print("Elapsed: {0}".format(inhour(time.time() - start_time)))
    progress.subsection("Closed-Form Linear Optimization")
    start_time = time.time()
    # Ridge-regression normal equations solved in closed form.
    pre_inv = RQ.T.dot(RQ) + lam * sparse.identity(rank, dtype=np.float32)
    inverse = inv(pre_inv)
    Y = inverse.dot(RQ.T).dot(matrix_input)
    print("Elapsed: {0}".format(inhour(time.time() - start_time)))
    return np.array(RQ.todense()), np.array(Y.todense()), None
def main(args):
    """Load the Netflix ratings, split them in time order, and persist the
    resulting train/valid/test/time matrices plus the nonzero index."""
    progress = WorkSplitter()

    progress.section("Load Raw Data")
    rating_matrix, timestamp_matrix = load_netflix(path=args.folder,
                                                   shape=args.shape)

    progress.section("Split CSR Matrices")
    rtrain, rvalid, rtest, nonzero_index, rtime = time_ordered_split(
        rating_matrix=rating_matrix,
        timestamp_matrix=timestamp_matrix,
        ratio=args.ratio,
        implicit=args.implicit)

    progress.section("Save NPZ")
    for name, matrix in (("Rtrain", rtrain), ("Rvalid", rvalid),
                         ("Rtest", rtest), ("Rtime", rtime)):
        save_numpy(matrix, args.path, name)
    save_array(nonzero_index, args.path, "Index")
def main(args):
    """Run a trained embedding model checkpoint over the leaderboard set and
    write one CSV of sigmoid predictions per engagement type.

    :param args: parsed CLI namespace; uses emb_type, path, train, valid,
        network_architecture, corruption, checkpoint, batch, spath
    """
    # Progress bar
    progress = WorkSplitter()

    progress.section("Load Data")
    if args.emb_type == 'bert':
        emb_size = 768
    elif args.emb_type == 'xlmr':
        emb_size = 1024
    else:
        # Fix: emb_size was previously left unbound for any other value,
        # which crashed later with a confusing NameError.
        raise ValueError("Unsupported emb_type: {0}".format(args.emb_type))

    # Load Data
    start_time = time.time()
    print("WARNING: Embedding size is set to", emb_size)
    data = Data(args, args.path, args.train, args.valid, emb_size, is_lb=True)
    print("Elapsed: {0}".format(inhour(time.time() - start_time)))

    # build model
    progress.section("Build Model")
    if args.network_architecture == 'embedding_net':
        model = EmbeddingNet(data.n_token, data.n_feature, emb_size,
                             [1024, 2000, 1000, 500, 100],
                             corruption=args.corruption)
    elif args.network_architecture == 'embedding_highway_net':
        model = EmbeddingHighWayNet(data.n_token, data.n_feature, emb_size,
                                    [1024, 2000, 1000, 500, 100])
    else:
        raise NotImplementedError('either use embedding_net or embedding_highway_net')
    model.cuda()
    print(model)
    model.load_state_dict(torch.load(args.checkpoint))
    print(model)

    lb_loader = data.instance_a_lb_loader(args.batch)
    lbs = {'user_lb': list(), 'tweet_lb': list()}
    preds = []
    model.eval()
    with torch.no_grad():
        lb_iterator = tqdm(lb_loader, desc="lb")
        for _, batch in enumerate(lb_iterator):
            token = batch[0].float().cuda()
            feature = batch[1].float().cuda()
            tweet_lb, user_lb = batch[2], batch[3]
            embedding = batch[4].float().cuda()
            pred = torch.sigmoid(model(token, feature, embedding)).detach().cpu().numpy()
            # Validation files appear to store labels flat while leaderboard
            # files nest them one level deep — presumably a loader difference.
            # NOTE(review): the "Valid" branch never collects user_lb; the
            # DataFrame below will fail if the column lengths differ — confirm.
            if "Valid" in args.valid:
                lbs['tweet_lb'] += tweet_lb
            else:
                lbs['tweet_lb'] += tweet_lb[0]
                lbs['user_lb'] += user_lb[0]
            preds.append(pred)

    final_csv = pd.DataFrame(lbs)
    preds = np.float64(np.vstack(preds))
    if not os.path.exists(args.spath):
        os.makedirs(args.spath)
    print("Generating CSVs...")
    # Column i of the stacked prediction matrix corresponds to engagement i.
    for i, engage in enumerate(["reply", "retweet", "comment", "like"]):
        final_csv[engage] = preds[:, i]
        final_csv[['tweet_lb', 'user_lb', engage]].to_csv(
            os.path.join(args.spath, engage + '.csv'), index=False, header=False)
def sensitivity(train, validation, params):
    """Measure how NCE-PLRec metrics respond to the PMI `root` parameter:
    train once with root=1.0 as the baseline, then once per value in
    params['root'], evaluating each fit on the validation set.

    :param train: training interaction matrix
    :param validation: validation matrix to evaluate against
    :param params: dict with keys 'models', 'iter', 'rank', 'lambda',
        'topK', 'metric', 'root'
    :return: (default_result, {root: result}) metric dictionaries
    """
    progress = WorkSplitter()
    progress.section("PMI-PLRec Default")
    RQ, Yt, _ = params['models']['NCE-PLRec'](train,
                                              embeded_matrix=np.empty((0)),
                                              iteration=params['iter'],
                                              rank=params['rank'],
                                              lam=params['lambda'],
                                              root=1.0)
    Y = Yt.T
    # Rank with the largest topK; evaluate() slices out the smaller cutoffs.
    default_prediction = predict(matrix_U=RQ,
                                 matrix_V=Y,
                                 topK=params['topK'][-1],
                                 matrix_Train=train,
                                 gpu=True)
    default_result = evaluate(default_prediction, validation,
                              params['metric'], params['topK'])
    print("-")
    print("Rank: {0}".format(params['rank']))
    print("Lambda: {0}".format(params['lambda']))
    print("SVD Iteration: {0}".format(params['iter']))
    print("Evaluation Ranking Topk: {0}".format(params['topK']))
    for key in default_result.keys():
        print("{0} :{1}".format(key, default_result[key]))
    sensitivity_results = dict()
    for root in tqdm(params['root']):
        progress.section("PMI-PLRec, Root: " + str(root))
        RQ, Yt, _ = params['models']['NCE-PLRec'](train,
                                                  embeded_matrix=np.empty((0)),
                                                  iteration=params['iter'],
                                                  rank=params['rank'],
                                                  lam=params['lambda'],
                                                  root=root)
        Y = Yt.T
        prediction = predict(matrix_U=RQ,
                             matrix_V=Y,
                             topK=params['topK'][-1],
                             matrix_Train=train,
                             gpu=True)
        result = evaluate(prediction, validation, params['metric'], params['topK'])
        sensitivity_results[root] = result
        print("-")
        print("Root: {0}".format(root))
        print("Rank: {0}".format(params['rank']))
        print("Lambda: {0}".format(params['lambda']))
        print("SVD Iteration: {0}".format(params['iter']))
        print("Evaluation Ranking Topk: {0}".format(params['topK']))
        for key in result.keys():
            print("{0} :{1}".format(key, result[key]))
    return default_result, sensitivity_results
def hintae(matrix_train, matrix_valid, iteration=100, lam=0.01, rank=50,
           rank2=50, seed=0, batch_size=256, confidence=0.9, dataset=None,
           gpu_on=True, **unused):
    """Train a HintAE autoencoder initialized from DeepAutoRec parameters
    previously trained on the uniformly-logged set S_t.

    :param matrix_train: m*n training interaction matrix
    :param matrix_valid: validation matrix used for metric tracking
    :param iteration: number of training iterations
    :param lam: L2 regularization weight
    :param rank: first latent dimension of HintAE
    :param rank2: second latent dimension of HintAE
    :param seed: random seed for numpy and tensorflow
    :param batch_size: mini-batch size
    :param confidence: confidence value forwarded to HintAE (presumably a
        weight on the hint targets — confirm against the model class)
    :param dataset: dataset prefix used to locate the pretrained latents
    :param gpu_on: whether to run TensorFlow on GPU
    :return: learned parameters (RQ, X, xBias, Y, yBias, Z, zBias, K, kBias)
    """
    progress = WorkSplitter()
    progress.section("HintAE: Set the random seed")
    np.random.seed(seed)
    tf.set_random_seed(seed)
    progress.section("HintAE: Load the variables trained on S_t")
    # Pretrained DeepAutoRec (rank 200) weights fitted on the uniform set.
    X = np.load('latent/' + dataset + 'unif_X_DeepAutoRec_200.npy')
    xBias = np.load('latent/' + dataset + 'unif_xB_DeepAutoRec_200.npy')
    Y = np.load('latent/' + dataset + 'unif_Y_DeepAutoRec_200.npy')
    yBias = np.load('latent/' + dataset + 'unif_yB_DeepAutoRec_200.npy')
    progress.section("HintAE: Training")
    m, n = matrix_train.shape
    model = HintAE(n, rank, rank2, lamb=lam, batch_size=batch_size,
                   gpu_on=gpu_on, init_X=X, init_Y=Y, init_xBias=xBias,
                   init_yBias=yBias, confidence=confidence)
    metric_names = ['NLL', 'AUC']
    RQ, X, xBias, Y, yBias, Z, zBias, K, kBias = model.train_model(
        matrix_train, matrix_valid, iteration, metric_names)
    model.sess.close()
    tf.reset_default_graph()
    return RQ, X, xBias, Y, yBias, Z, zBias, K, kBias
def initfeatureembedmf(matrix_train, matrix_valid, iteration=100, lam=0.01,
                       rank=50, seed=0, batch_size=500, gpu_on=True,
                       way='both', dataset=None, **unused):
    """Train a BiasedMF variant whose embeddings and biases are initialized
    from factors pretrained on the uniformly-logged set S_t.

    :param matrix_train: m*n training interaction matrix
    :param matrix_valid: validation matrix used for metric tracking
    :param iteration: number of training iterations
    :param lam: L2 regularization weight
    :param rank: latent dimension
    :param seed: random seed for numpy and tensorflow
    :param batch_size: mini-batch size
    :param gpu_on: whether to run TensorFlow on GPU
    :param way: forwarded to InitFeatureEmbedMF (presumably selects which
        side(s) are initialized/frozen — confirm against the model class)
    :param dataset: dataset prefix used to locate the pretrained latents
    :return: user factors, item factors, user bias, item bias
    """
    progress = WorkSplitter()
    progress.section("InitFeatureEmbedMF: Set the random seed")
    np.random.seed(seed)
    tf.set_random_seed(seed)
    progress.section("InitFeatureEmbedMF: Load the variables trained on S_t")
    # Pretrained BiasedMF (rank 10) factors fitted on the uniform set.
    RQ = np.load('latent/' + dataset + 'unif_U_BiasedMF_10.npy')
    Y = np.load('latent/' + dataset + 'unif_V_BiasedMF_10.npy')
    uBias = np.load('latent/' + dataset + 'unif_uB_BiasedMF_10.npy')
    iBias = np.load('latent/' + dataset + 'unif_iB_BiasedMF_10.npy')
    progress.section("InitFeatureEmbedMF: Training")
    m, n = matrix_train.shape
    model = InitFeatureEmbedMF(m, n, rank, lamb=lam, batch_size=batch_size,
                               gpu_on=gpu_on, init_U=RQ, init_V=Y,
                               init_uBias=uBias, init_iBias=iBias, way=way)
    metric_names = ['NLL', 'AUC']
    RQ, Y, user_bias, item_bias = model.train_model(matrix_train, matrix_valid,
                                                    iteration, metric_names)
    model.sess.close()
    tf.reset_default_graph()
    return RQ, Y, user_bias, item_bias
def pop(matrix_train, **unused):
    """
    Popularity baseline: rank every item by its total interaction count,
    identically for all users.

    :param matrix_train: user-item matrix with shape m*n
    :param unused: args that are not applicable for this algorithm
    :return: (RQ, Y, None) where RQ is an all-ones m*1 user matrix and Y is
        the 1*n vector of per-item popularity, so RQ.dot(Y) scores every item
        by popularity for every user
    """
    # Fix: removed an unused `WorkSplitter()` local that served no purpose.
    m, n = matrix_train.shape
    # Column sums = per-item popularity (interaction counts for binary input).
    item_popularity = np.array(np.sum(matrix_train, axis=0)).flatten()
    RQ = np.ones((m, 1))
    Y = item_popularity.reshape((1, n))
    return RQ, Y, None
def bpr(matrix_train, embeded_matrix=np.empty((0)), iteration=100, lam=80,
        rank=200, seed=1, **unused):
    """Bayesian Personalized Ranking wrapper.

    :param matrix_train: m*n interaction matrix
    :param embeded_matrix: optional side-information rows stacked onto input
    :param iteration: number of training iterations
    :param lam: L2 regularization weight
    :param rank: latent dimension
    :return: user factors, item factors (rank*n), and None (no bias)
    """
    progress = WorkSplitter()

    matrix_input = matrix_train
    if embeded_matrix.shape[0] > 0:
        matrix_input = vstack((matrix_input, embeded_matrix.T))

    num_users, num_items = matrix_input.shape
    model = BPR(num_users, num_items, rank, lamb=lam, batch_size=500)
    model.train_model(matrix_input, iteration)

    user_factors = model.get_RQ()
    item_factors = model.get_Y().T
    model.sess.close()
    tf.reset_default_graph()

    return user_factors, item_factors, None
def cml(matrix_train, embeded_matrix=np.empty((0)), iteration=100, lam=80,
        rank=200, seed=1, **unused):
    """Collaborative Metric Learning wrapper.

    :param matrix_train: m*n interaction matrix
    :param embeded_matrix: optional side-information rows stacked onto input
    :param iteration: number of training iterations
    :param lam: covariance-loss weight
    :param rank: embedding dimension
    :return: user embeddings, item embeddings (rank*n), and None (no bias)
    """
    progress = WorkSplitter()

    matrix_input = matrix_train
    if embeded_matrix.shape[0] > 0:
        matrix_input = vstack((matrix_input, embeded_matrix.T))

    num_users, num_items = matrix_input.shape
    model = CollaborativeMetricLearning(num_users=num_users,
                                        num_items=num_items,
                                        embed_dim=rank,
                                        cov_loss_weight=lam)
    model.train_model(matrix_input, iteration)

    user_factors = model.get_RQ()
    item_factors = model.get_Y().T
    model.sess.close()
    tf.reset_default_graph()

    return user_factors, item_factors, None
def main(args):
    """Load the Yahoo ratings, split them with a seeded random split, and
    persist the resulting train/valid/test matrices as NPZ files."""
    progress = WorkSplitter()

    progress.section("Load Raw Data")
    rating_matrix = load_yahoo(path=args.path, name=args.name, shape=args.shape)

    progress.section("Split CSR Matrices")
    rtrain, rvalid, rtest, nonzero_index = split_seed_randomly(
        rating_matrix=rating_matrix, ratio=args.ratio, implicit=args.implicit)
    print("Done splitting Yahoo dataset")

    progress.section("Save NPZ")
    for name, matrix in (("Rtrain", rtrain), ("Rvalid", rvalid),
                         ("Rtest", rtest)):
        save_numpy(matrix, args.path, name)
    save_array(nonzero_index, args.path, "Index")
    print("Done saving data for yahoo after splitting")
def cdae(matrix_train, embeded_matrix=np.empty((0)), iteration=100, lam=80,
         rank=200, corruption=0.8, seed=1, **unused):
    """Collaborative Denoising AutoEncoder wrapper.

    :param matrix_train: m*n interaction matrix
    :param embeded_matrix: optional side-information rows stacked onto input
    :param iteration: number of training iterations
    :param lam: L2 regularization weight
    :param rank: latent dimension
    :param corruption: input corruption rate during training
    :return: user factors, item factors, and the output bias
    """
    progress = WorkSplitter()

    matrix_input = matrix_train
    if embeded_matrix.shape[0] > 0:
        matrix_input = vstack((matrix_input, embeded_matrix.T))

    num_users, num_items = matrix_input.shape
    model = CDAE(num_users, num_items, rank, 100, lamb=lam)
    model.train_model(matrix_input, corruption, iteration)

    user_factors = model.get_RQ(matrix_input)
    item_factors = model.get_Y()
    output_bias = model.get_Bias()
    model.sess.close()
    tf.reset_default_graph()

    return user_factors, item_factors, output_bias
def acf(matrix_train, embeded_matrix=np.empty((0)), epoch=300, iteration=100,
        lamb=80, rank=100, key_dim=3, batch_size=32, optimizer="Adam",
        learning_rate=0.001, seed=1, root=1, fb=False, **unused):
    """Attentive Collaborative Filtering, seeded with PMI-SVD item embeddings.

    :param matrix_train: m*n rating matrix
    :param embeded_matrix: optional side-information rows stacked onto input
    :param epoch: number of ACF training epochs
    :param iteration: number of power iterations in the randomized SVD
    :param lamb: L2 regularization weight
    :param rank: latent dimension
    :param key_dim: key dimension forwarded to ACF (presumably the attention
        key size — confirm against the model class)
    :param batch_size: mini-batch size
    :param optimizer: key into the Optimizer registry
    :param learning_rate: optimizer learning rate
    :param seed: randomized SVD seed (sklearn path only)
    :param root: parameter forwarded to get_pmi_matrix
    :param fb: use fbpca's pca instead of sklearn's randomized SVD
    :return: user factors, item factors (rank*n), and None (no bias)
    """
    print(epoch, lamb, rank)
    progress = WorkSplitter()
    matrix_input = matrix_train
    if embeded_matrix.shape[0] > 0:
        matrix_input = vstack((matrix_input, embeded_matrix.T))
    progress.subsection("Create PMI matrix")
    pmi_matrix = get_pmi_matrix(matrix_input, root)
    progress.subsection("Randomized SVD")
    start_time = time.time()
    if fb:
        P, sigma, Qt = pca(pmi_matrix, k=rank, n_iter=iteration, raw=True)
    else:
        P, sigma, Qt = randomized_svd(pmi_matrix,
                                      n_components=rank,
                                      n_iter=iteration,
                                      power_iteration_normalizer='QR',
                                      random_state=seed)
    # Scale right singular vectors by sqrt(sigma) to form item embeddings.
    Q = Qt.T * np.sqrt(sigma)
    m, n = matrix_input.shape
    model = ACF(m, n, rank, key_dim, lamb=lamb, batch_size=batch_size,
                learning_rate=learning_rate, optimizer=Optimizer[optimizer],
                item_embeddings=Q)
    model.train_model(matrix_input, epoch)
    print("Elapsed: {0}".format(inhour(time.time() - start_time)))
    RQ = model.get_RQ()
    Y = model.get_Y().T
    model.sess.close()
    tf.reset_default_graph()
    return RQ, Y, None
def autorec(matrix_train, embeded_matrix=np.empty((0)), iteration=100, lam=80,
            rank=200, optimizer='RMSProp', seed=1, **unused):
    """AutoRec wrapper.

    :param matrix_train: m*n interaction matrix
    :param embeded_matrix: optional side-information rows stacked onto input
    :param iteration: number of training iterations
    :param lam: L2 regularization weight
    :param rank: latent dimension
    :param optimizer: key into the Regularizer registry
    :return: user factors, item factors, and the output bias
    """
    progress = WorkSplitter()

    matrix_input = matrix_train
    if embeded_matrix.shape[0] > 0:
        matrix_input = vstack((matrix_input, embeded_matrix.T))

    _, num_items = matrix_input.shape
    model = AutoRec(num_items, rank, 100, lamb=lam,
                    optimizer=Regularizer[optimizer])
    model.train_model(matrix_input, iteration)

    user_factors = model.get_RQ(matrix_input)
    item_factors = model.get_Y()
    output_bias = model.get_Bias()
    model.sess.close()
    tf.reset_default_graph()

    return user_factors, item_factors, output_bias
def vae_cf(matrix_train, embedded_matrix=np.empty((0)), iteration=100, lam=80,
           rank=200, corruption=0.5, optimizer="RMSProp", seed=1, **unused):
    """Variational autoencoder for collaborative filtering (multinomial
    likelihood) wrapper.

    :param matrix_train: m*n interaction matrix
    :param embedded_matrix: optional side-information rows stacked onto input
    :param iteration: number of training iterations
    :param lam: L2 regularization weight
    :param rank: latent dimension
    :param corruption: input corruption rate during training
    :param optimizer: key into the Regularizer registry
    :return: user factors, item factors, and the output bias
    """
    progress = WorkSplitter()

    matrix_input = matrix_train
    if embedded_matrix.shape[0] > 0:
        matrix_input = vstack((matrix_input, embedded_matrix.T))

    _, num_items = matrix_input.shape
    model = VAE(num_items, rank, 100, lamb=lam,
                observation_distribution="Multinomial",
                optimizer=Regularizer[optimizer])
    model.train_model(matrix_input, corruption, iteration)

    user_factors = model.get_RQ(matrix_input)
    item_factors = model.get_Y()
    output_bias = model.get_Bias()
    model.sess.close()
    tf.reset_default_graph()

    return user_factors, item_factors, output_bias
def refinelabelmf(matrix_train, matrix_valid, iteration=100, lam=0.01,
                  confidence=0.9, rank=50, seed=0, batch_size=500, gpu_on=True,
                  dataset=None, **unused):
    """Train a RefineLabelMF model initialized from two sets of pretrained
    BiasedMF factors: one fitted on the normal set S_c and one on the
    uniformly-logged set S_t.

    :param matrix_train: m*n training interaction matrix
    :param matrix_valid: validation matrix used for metric tracking
    :param iteration: number of training iterations
    :param lam: L2 regularization weight
    :param confidence: confidence value forwarded to RefineLabelMF
        (presumably weighting the refined labels — confirm)
    :param rank: latent dimension
    :param seed: random seed for numpy and tensorflow
    :param batch_size: mini-batch size
    :param gpu_on: whether to run TensorFlow on GPU
    :param dataset: dataset prefix used to locate the pretrained latents
    :return: user factors, item factors, user bias, item bias
    """
    progress = WorkSplitter()
    progress.section("RefineLabelMF: Set the random seed")
    np.random.seed(seed)
    tf.set_random_seed(seed)
    progress.section("RefineLabelMF: Load the variables trained on S_c/S_t")
    # BiasedMF (rank 10) factors fitted on the normal logging policy.
    norm_RQ = np.load('latent/' + dataset + 'U_BiasedMF_10.npy')
    norm_Y = np.load('latent/' + dataset + 'V_BiasedMF_10.npy')
    norm_uBias = np.load('latent/' + dataset + 'uB_BiasedMF_10.npy')
    norm_iBias = np.load('latent/' + dataset + 'iB_BiasedMF_10.npy')
    # BiasedMF (rank 10) factors fitted on the uniform logging policy.
    unif_RQ = np.load('latent/' + dataset + 'unif_U_BiasedMF_10.npy')
    unif_Y = np.load('latent/' + dataset + 'unif_V_BiasedMF_10.npy')
    unif_uBias = np.load('latent/' + dataset + 'unif_uB_BiasedMF_10.npy')
    unif_iBias = np.load('latent/' + dataset + 'unif_iB_BiasedMF_10.npy')
    progress.section("RefineLabelMF: Training")
    m, n = matrix_train.shape
    model = RefineLabelMF(m, n, rank, lamb=lam, confidence=confidence,
                          batch_size=batch_size, gpu_on=gpu_on,
                          norm_init_U=norm_RQ, norm_init_V=norm_Y,
                          norm_init_uBias=norm_uBias, norm_init_iBias=norm_iBias,
                          unif_init_U=unif_RQ, unif_init_V=unif_Y,
                          unif_init_uBias=unif_uBias, unif_init_iBias=unif_iBias)
    metric_names = ['NLL', 'AUC']
    RQ, Y, user_bias, item_bias, refined_label, user_item_pairs, prediction = model.train_model(matrix_train, matrix_valid, iteration, metric_names)
    # if gpu_on:
    #     np.savetxt('Matlab/refinelabelmf_prediction.txt', cp.asnumpy(prediction))
    # else:
    #     np.savetxt('Matlab/refinelabelmf_prediction.txt', prediction)
    model.sess.close()
    tf.reset_default_graph()
    return RQ, Y, user_bias, item_bias
def cml_normalized(matrix_train, time_stamp_matrix=None,
                   embeded_matrix=np.empty((0)), iteration=100, lam=80,
                   rank=200, seed=1, **unused):
    """Normalized Collaborative Metric Learning driven by interaction order.

    :param matrix_train: m*n interaction matrix
    :param time_stamp_matrix: sparse matrix of interaction timestamps; when
        None (the default, matching the old behavior) it is loaded from
        'datax/Rtime.npz'
    :param embeded_matrix: optional side-information rows stacked onto input
    :param iteration: number of training iterations
    :param lam: covariance-loss weight
    :param rank: embedding dimension
    :return: user embeddings, item embeddings (rank*n), and None (no bias)
    """
    progress = WorkSplitter()
    matrix_input = matrix_train
    # Fix: the time_stamp_matrix argument used to be ignored and always
    # overwritten by a load from a hard-coded path; honor it when provided.
    if time_stamp_matrix is None:
        from utils.io import load_numpy
        time_stamp_matrix = load_numpy(path='datax/', name='Rtime.npz')
    # Per-user interaction order derived from timestamps of observed entries.
    orders = get_orders(time_stamp_matrix.multiply(matrix_train))
    if embeded_matrix.shape[0] > 0:
        matrix_input = vstack((matrix_input, embeded_matrix.T))
    m, n = matrix_input.shape
    model = NormalizedCollaborativeMetricLearning(num_users=m, num_items=n,
                                                  embed_dim=rank,
                                                  cov_loss_weight=lam)
    model.train_model(matrix_input, orders, iteration)
    RQ = model.get_RQ()
    Y = model.get_Y().T
    # Fix: close the TF session before resetting the graph, matching every
    # other model wrapper in this file (avoids a session leak).
    model.sess.close()
    tf.reset_default_graph()
    return RQ, Y, None
def als(matrix_train, embeded_matrix=np.empty((0)), iteration=4, lam=80,
        rank=200, alpha=100, seed=1, **unused):
    """
    Weighted alternating least squares.

    :param matrix_train: rating matrix
    :param embeded_matrix: item or user embedding matrix(side info)
    :param iteration: number of alternative solving
    :param lam: regularization parameter
    :param rank: SVD top K eigenvalue ranks
    :param alpha: re-weighting parameter
    :param seed: Random initialization seed
    :return: user factors (m*rank), item factors (rank*n), None
    """
    progress = WorkSplitter()
    progress.subsection("Alternative Item-wised Optimization")
    matrix_input = matrix_train
    if embeded_matrix.shape[0] > 0:
        matrix_input = vstack((matrix_input, embeded_matrix.T))

    m, n = matrix_train.shape
    matrix_coo = matrix_train.tocoo()
    # Users/items with no observations at all ("cold") get zero factors.
    cold_rows, cold_cols = get_cold(matrix_coo, m, n)

    # Fix: honor the `seed` parameter (it was previously ignored in favor of
    # a hard-coded np.random.seed(1); the default seed=1 keeps old behavior).
    np.random.seed(seed)
    U = torch.tensor(
        np.random.normal(0, 0.01, size=(m, rank)).astype(np.float32)).float()
    V = torch.tensor(
        np.random.normal(0, 0.01, size=(n, rank)).astype(np.float32)).float()
    U[cold_rows] = 0
    V[cold_cols] = 0

    # Fix: `xrange` is Python 2 only; the rest of this file is Python 3.
    for i in range(iteration):
        progress.subsubsection("Iteration: {0}".format(i))
        # Alternate: solve item factors given users, then users given items.
        solve(matrix_input.T, U, V, lam=lam, rank=rank, alpha=alpha)
        solve(matrix_input, V, U, lam=lam, rank=rank, alpha=alpha)

    return U.numpy(), V.numpy().T, None
def biasedmf(matrix_train, matrix_valid, matrix_unif_train, iteration=100,
             lam=0.01, rank=50, seed=0, batch_size=500, way=None, gpu_on=True,
             **unused):
    """Train a biased matrix factorization model on the normal set, the
    uniform set, or their sum, depending on `way`.

    :param matrix_train: m*n training matrix from the normal logging policy
    :param matrix_valid: validation matrix used for metric tracking
    :param matrix_unif_train: training matrix from uniform logging
    :param iteration: number of training iterations
    :param lam: L2 regularization weight
    :param rank: latent dimension
    :param seed: random seed for numpy and tensorflow
    :param batch_size: mini-batch size
    :param way: None -> train on matrix_train; 'unif' -> matrix_unif_train;
        'combine' -> element-wise sum of both
    :param gpu_on: whether to run TensorFlow on GPU
    :return: user factors, item factors, user bias, item bias
    """
    progress = WorkSplitter()
    progress.section("BiasedMF: Set the random seed")
    np.random.seed(seed)
    tf.set_random_seed(seed)
    progress.section("BiasedMF: Training")
    m, n = matrix_train.shape
    model = BiasedMF(m, n, rank, lamb=lam, batch_size=batch_size, gpu_on=gpu_on)
    metric_names = ['NLL', 'AUC']
    if way == 'unif':
        RQ, Y, user_bias, item_bias, _ = model.train_model(
            matrix_unif_train, matrix_valid, iteration, metric_names)
    elif way == 'combine':
        # NOTE(review): sparse `+=` rebinds to a new matrix, so the caller's
        # matrix_train should be unaffected — confirm for the input type.
        matrix_train += matrix_unif_train
        RQ, Y, user_bias, item_bias, _ = model.train_model(
            matrix_train, matrix_valid, iteration, metric_names)
    else:
        RQ, Y, user_bias, item_bias, prediction = model.train_model(
            matrix_train, matrix_valid, iteration, metric_names)
    # if gpu_on:
    #     np.savetxt('Matlab/biasedmf_prediction.txt', cp.asnumpy(prediction))
    # else:
    #     np.savetxt('Matlab/biasedmf_prediction.txt', prediction)
    model.sess.close()
    tf.reset_default_graph()
    return RQ, Y, user_bias, item_bias
def ce_vae(matrix_train, matrix_train_keyphrase, embeded_matrix=np.empty((0)),
           epoch=100, lamb_l2=80.0, lamb_keyphrase=1.0, lamb_latent=5.0,
           lamb_rating=1.0, beta=0.2, learning_rate=0.0001, rank=200,
           corruption=0.5, optimizer="RMSProp", seed=1, **unused):
    """Train a CE-VAE on ratings plus per-user keyphrase labels.

    :param matrix_train: m*n rating matrix
    :param matrix_train_keyphrase: m*k user-keyphrase matrix
    :param embeded_matrix: optional side-information rows stacked onto ratings
    :param epoch: number of training epochs
    :param lamb_l2: L2 regularization weight
    :param lamb_keyphrase: keyphrase loss weight
    :param lamb_latent: latent loss weight
    :param lamb_rating: rating loss weight
    :param beta: weight forwarded to CE_VAE (presumably a KL/beta-VAE
        coefficient — confirm against the model class)
    :param learning_rate: optimizer learning rate
    :param rank: latent dimension
    :param corruption: input corruption rate during training
    :param optimizer: key into the Optimizer registry
    :return: the trained CE_VAE model object (not factor matrices)
    """
    progress = WorkSplitter()
    matrix_input = matrix_train
    if embeded_matrix.shape[0] > 0:
        matrix_input = vstack((matrix_input, embeded_matrix.T))
    matrix_input_keyphrase = matrix_train_keyphrase
    model = CE_VAE(observation_dim=matrix_input.shape[1],
                   keyphrase_dim=matrix_input_keyphrase.shape[1],
                   latent_dim=rank, batch_size=128, lamb_l2=lamb_l2,
                   lamb_keyphrase=lamb_keyphrase, lamb_latent=lamb_latent,
                   lamb_rating=lamb_rating, beta=beta,
                   learning_rate=learning_rate,
                   observation_distribution="Gaussian",
                   optimizer=Optimizer[optimizer])
    model.train_model(matrix_input, matrix_input_keyphrase, corruption, epoch)
    return model
def uncertainty(Rtrain, df_input, rank):
    """For each VAE hyperparameter row in df_input, train the model on Rtrain
    and record the mean predictive std per user alongside the user's row sum.

    :param Rtrain: m*n training interaction matrix (sparse)
    :param df_input: DataFrame of rows with 'model', 'lambda', 'corruption',
        'iter', and optionally 'optimizer'
    :param rank: latent dimension used for every model
    :return: concatenated DataFrame with columns model / numRated / std
    """
    progress = WorkSplitter()
    m, n = Rtrain.shape
    valid_models = vaes.keys()
    results = []
    for run in range(1):
        for idx, row in df_input.iterrows():
            row = row.to_dict()
            # Skip rows whose model name has no registered VAE constructor.
            if row['model'] not in valid_models:
                continue
            progress.section(json.dumps(row))
            if 'optimizer' not in row.keys():
                row['optimizer'] = 'RMSProp'
            model = vaes[row['model']](n, rank, batch_size=100,
                                       lamb=row['lambda'],
                                       optimizer=Regularizer[row['optimizer']])
            model.train_model(Rtrain, corruption=row['corruption'],
                              epoch=row['iter'])
            data_batches = model.get_batches(Rtrain, batch_size=100)
            progress.subsection("Predict")
            for batch in tqdm(data_batches):
                batch_size = batch.shape[0]
                _, stds = model.uncertainty(batch.todense())
                # Row sums: equals the rated-item count for binary matrices.
                num_rated = np.squeeze(np.asarray(np.sum(batch, axis=1)))
                # Collapse per-item stds to one scalar per user.
                std = np.mean(stds, axis=1)
                results.append(
                    pd.DataFrame({
                        'model': [row['model']] * batch_size,
                        'numRated': num_rated,
                        'std': std
                    }))
    return pd.concat(results)
def wrsamplemf(matrix_train, matrix_valid, matrix_unif_train, iteration=100,
               lam=0.01, rank=50, seed=0, batch_size=500, gpu_on=True,
               **unused):
    """Train a WRSampleMF matrix factorization model on the union of the
    normal and uniformly-logged training sets.

    :param matrix_train: m*n training matrix from the normal logging policy
    :param matrix_valid: validation matrix used for metric tracking
    :param matrix_unif_train: matrix from uniform logging, summed into the
        training matrix before fitting
    :param iteration: number of training iterations
    :param lam: L2 regularization weight
    :param rank: latent dimension
    :param seed: random seed for numpy and tensorflow
    :param batch_size: mini-batch size
    :param gpu_on: whether to run TensorFlow on GPU
    :return: user factors, item factors, user bias, item bias
    """
    progress = WorkSplitter()
    progress.section("WRSampleMF: Set the random seed")
    np.random.seed(seed)
    tf.set_random_seed(seed)
    progress.section("WRSampleMF: Training")
    m, n = matrix_train.shape
    # Mark entries observed in the normal training set so the model can
    # distinguish them from the uniform samples added below.
    marks = sparse.csr_matrix(matrix_train.shape)
    marks[(matrix_train != 0).nonzero()] = 1
    matrix_train += matrix_unif_train
    # Total observed pairs after combining the two sources.
    num_samples = len(matrix_train.nonzero()[0])
    model = WRSampleMF(m, n, rank, num_samples, lamb=lam,
                       batch_size=batch_size, gpu_on=gpu_on)
    metric_names = ['NLL', 'AUC']
    RQ, Y, user_bias, item_bias, confidence, user_item_pairs, prediction = model.train_model(
        matrix_train, marks, matrix_valid, iteration, metric_names)
    # np.savetxt('Matlab/wrsamplemf_samples.txt', user_item_pairs)
    # np.savetxt('Matlab/wrsamplemf_weights.txt', confidence)
    model.sess.close()
    tf.reset_default_graph()
    return RQ, Y, user_bias, item_bias
def main(args):
    """Run a trained FeatureNet checkpoint over the leaderboard set and write
    one CSV of sigmoid predictions per engagement type.

    :param args: parsed CLI namespace; uses path, train, valid, checkpoint,
        batch, spath
    """
    # Progress bar
    progress = WorkSplitter()

    # Load Data
    progress.section("Load Data")
    start_time = time.time()
    data = Data(args.path, args.train, args.valid, is_lb=True)
    print("Elapsed: {0}".format(inhour(time.time() - start_time)))

    # build model
    progress.section("Build Model")
    model = FeatureNet(data.n_token, data.n_feature, [1024, 2000, 1000, 500, 100])
    # Fix: model.cuda() was called twice; once is enough.
    model.cuda()
    print(model)
    model.load_state_dict(torch.load(args.checkpoint))
    print(model)

    lb_loader = data.instance_a_lb_loader(args.batch)
    lbs = {'user_lb': list(), 'tweet_lb': list()}
    preds = []
    model = model.eval()
    with torch.no_grad():
        lb_iterator = tqdm(lb_loader, desc="lb")
        for _, batch in enumerate(lb_iterator):
            token = batch[0].float().cuda()
            feature = batch[1].float().cuda()
            tweet_lb, user_lb = batch[2], batch[3]
            pred = torch.sigmoid(model(token, feature)).detach().cpu().numpy()
            # Labels appear to be nested one level deep — taking element 0.
            lbs['tweet_lb'] += tweet_lb[0]
            lbs['user_lb'] += user_lb[0]
            preds.append(pred)

    final_csv = pd.DataFrame(lbs)
    preds = np.float64(np.vstack(preds))
    if not os.path.exists(args.spath):
        os.makedirs(args.spath)
    print("Generating CSVs...")
    # Column i of the stacked prediction matrix corresponds to engagement i.
    for i, engage in enumerate(["reply", "retweet", "comment", "like"]):
        final_csv[engage] = preds[:, i]
        final_csv[['tweet_lb', 'user_lb', engage]].to_csv(
            os.path.join(args.spath, engage + '.csv'), index=False, header=False)
def lookup(train, validation, params, measure='Cosine', gpu_on=True):
    """Evaluate every model in params['models'] from cached latent factors and
    collect the metrics into one DataFrame (one row per model).

    :param train: m*n training interaction matrix (used to mask predictions)
    :param validation: validation matrix to evaluate against
    :param params: dict with keys 'models', 'rank', 'topK', 'metric'
    :param measure: similarity measure forwarded to predict()
    :param gpu_on: whether prediction runs on GPU
    :return: DataFrame with a 'model' column and one column per metric, each
        cell holding the rounded [value, interval] pair from evaluate()
    """
    progress = WorkSplitter()
    rows = []
    for algorithm in params['models']:
        RQ = np.load('latent/U_{0}_{1}.npy'.format(algorithm, params['rank']))
        Y = np.load('latent/V_{0}_{1}.npy'.format(algorithm, params['rank']))
        bias_path = 'latent/B_{0}_{1}.npy'.format(algorithm, params['rank'])
        # The bias vector is optional: not every model saves one.
        Bias = np.load(bias_path) if os.path.isfile(bias_path) else None

        progress.subsection("Prediction")
        prediction = predict(matrix_U=RQ,
                             matrix_V=Y,
                             measure=measure,
                             bias=Bias,
                             topK=params['topK'][-1],
                             matrix_Train=train,
                             gpu=gpu_on)

        progress.subsection("Evaluation")
        result = evaluate(prediction, validation, params['metric'], params['topK'])

        result_dict = {'model': algorithm}
        for name in result.keys():
            result_dict[name] = [
                round(result[name][0], 4),
                round(result[name][1], 4)
            ]
        rows.append(result_dict)

    # Fix: DataFrame.append was deprecated and removed in pandas 2.0;
    # build the frame once from the collected row dicts instead.
    if rows:
        return pd.DataFrame(rows)
    return pd.DataFrame(columns=['model'])
def unionsamplemf(matrix_train, matrix_valid, matrix_unif_train, iteration=100,
                  lam=0.01, rank=50, seed=0, batch_size=500, confidence=0.9,
                  gpu_on=True, **unused):
    """Train a UnionSampleMF model on the union of the normal and
    uniformly-logged training sets.

    :param matrix_train: m*n training matrix from the normal logging policy
    :param matrix_valid: validation matrix used for metric tracking
    :param matrix_unif_train: matrix from uniform logging, summed into the
        training matrix before fitting
    :param iteration: number of training iterations
    :param lam: L2 regularization weight
    :param rank: latent dimension
    :param seed: random seed for numpy and tensorflow
    :param batch_size: mini-batch size
    :param confidence: confidence value forwarded to UnionSampleMF
        (presumably weighting the uniform samples — confirm)
    :param gpu_on: whether to run TensorFlow on GPU
    :return: user factors, item factors, user bias, item bias
    """
    progress = WorkSplitter()
    progress.section("UnionSampleMF: Set the random seed")
    np.random.seed(seed)
    tf.set_random_seed(seed)
    progress.section("UnionSampleMF: Training")
    m, n = matrix_train.shape
    model = UnionSampleMF(m, n, rank, lamb=lam, batch_size=batch_size,
                          gpu_on=gpu_on, confidence=confidence)
    metric_names = ['NLL', 'AUC']
    # Mark entries observed in the normal training set so the model can
    # distinguish them from the uniform samples added below.
    marks = sparse.csr_matrix(matrix_train.shape)
    marks[(matrix_train != 0).nonzero()] = 1
    matrix_train += matrix_unif_train
    RQ, Y, user_bias, item_bias = model.train_model(matrix_train, marks,
                                                    matrix_valid, iteration,
                                                    metric_names)
    model.sess.close()
    tf.reset_default_graph()
    return RQ, Y, user_bias, item_bias
def restrictedbatchsamplemf(matrix_train, matrix_valid, matrix_unif_train,
                            iteration=100, lam=0.01, rank=50, seed=0,
                            batch_size=500, gpu_on=True, step=3, way=None,
                            **unused):
    """Train a BatchSampleMF model that consumes the normal and uniform
    training sets as separate inputs (rather than their union).

    :param matrix_train: m*n training matrix from the normal logging policy
    :param matrix_valid: validation matrix used for metric tracking
    :param matrix_unif_train: matrix from uniform logging, passed alongside
        the normal matrix to the model's training loop
    :param iteration: number of training iterations
    :param lam: L2 regularization weight
    :param rank: latent dimension
    :param seed: random seed for numpy and tensorflow
    :param batch_size: mini-batch size
    :param gpu_on: whether to run TensorFlow on GPU
    :param step: value forwarded to BatchSampleMF (presumably controls how
        often uniform batches are interleaved — confirm)
    :param way: sampling mode forwarded to BatchSampleMF
    :return: user factors, item factors, user bias, item bias
    """
    progress = WorkSplitter()
    progress.section("RestrictedBatchSampleMF: Set the random seed")
    np.random.seed(seed)
    tf.set_random_seed(seed)
    progress.section("RestrictedBatchSampleMF: Training")
    m, n = matrix_train.shape
    model = BatchSampleMF(m, n, rank, lamb=lam, batch_size=batch_size,
                          step=step, gpu_on=gpu_on, way=way)
    metric_names = ['NLL', 'AUC']
    RQ, Y, user_bias, item_bias = model.train_model(matrix_train,
                                                    matrix_unif_train,
                                                    matrix_valid, iteration,
                                                    metric_names)
    model.sess.close()
    tf.reset_default_graph()
    return RQ, Y, user_bias, item_bias
def autorec(matrix_train, matrix_valid, matrix_unif_train, iteration=100,
            lam=0.01, rank=50, seed=0, batch_size=256, way=None, gpu_on=True,
            **unused):
    """Train an AutoRec model on the normal set, the uniform set, or their
    sum, depending on `way`.

    :param matrix_train: m*n training matrix from the normal logging policy
    :param matrix_valid: validation matrix used for metric tracking
    :param matrix_unif_train: training matrix from uniform logging
    :param iteration: number of training iterations
    :param lam: L2 regularization weight
    :param rank: latent dimension
    :param seed: random seed for numpy and tensorflow
    :param batch_size: mini-batch size
    :param way: None -> train on matrix_train; 'unif' -> matrix_unif_train;
        'combine' -> element-wise sum of both
    :param gpu_on: whether to run TensorFlow on GPU
    :return: learned parameters (RQ, X, xBias, Y, yBias)
    """
    progress = WorkSplitter()
    progress.section("AutoRec: Set the random seed")
    np.random.seed(seed)
    tf.set_random_seed(seed)
    progress.section("AutoRec: Training")
    m, n = matrix_train.shape
    model = AutoRec(n, rank, lamb=lam, batch_size=batch_size, gpu_on=gpu_on)
    metric_names = ['NLL', 'AUC']
    if way == 'unif':
        RQ, X, xBias, Y, yBias = model.train_model(matrix_unif_train,
                                                   matrix_valid, iteration,
                                                   metric_names)
    elif way == 'combine':
        # NOTE(review): sparse `+=` rebinds to a new matrix, so the caller's
        # matrix_train should be unaffected — confirm for the input type.
        matrix_train += matrix_unif_train
        RQ, X, xBias, Y, yBias = model.train_model(matrix_train, matrix_valid,
                                                   iteration, metric_names)
    else:
        RQ, X, xBias, Y, yBias = model.train_model(matrix_train, matrix_valid,
                                                   iteration, metric_names)
    model.sess.close()
    tf.reset_default_graph()
    return RQ, X, xBias, Y, yBias
def causalsamplemf(matrix_train, matrix_valid, matrix_unif_train,
                   iteration=100, lam=0.01, lam2=0.01, rank=50, seed=0,
                   batch_size=500, gpu_on=True, **unused):
    """Train a CausalSampleMF model where the uniformly-logged interactions
    are treated as a second copy of the item catalog: items from S_t are
    re-indexed into [n, 2n) and both sets are merged into one m x 2n matrix.

    :param matrix_train: m*n training matrix from the normal logging policy
    :param matrix_valid: validation matrix used for metric tracking
    :param matrix_unif_train: m*n matrix from uniform logging (S_t)
    :param iteration: number of training iterations
    :param lam: L2 regularization weight (primary)
    :param lam2: second regularization weight forwarded to CausalSampleMF
    :param rank: latent dimension
    :param seed: random seed for numpy and tensorflow
    :param batch_size: mini-batch size
    :param gpu_on: whether to run TensorFlow on GPU
    :return: user factors, item factors, user bias, item bias
    """
    progress = WorkSplitter()
    progress.section("CausalSampleMF: Set the random seed")
    np.random.seed(seed)
    tf.set_random_seed(seed)
    progress.section("CausalSampleMF: Training")
    m, n = matrix_train.shape
    # Create new item IDs for S_t (i.e., [n, n*2)
    unif_user_item_matrix = lil_matrix(matrix_unif_train)
    unif_user_item_pairs = np.asarray(unif_user_item_matrix.nonzero()).T
    unif_label = np.asarray(matrix_unif_train[unif_user_item_pairs[:, 0],
                                              unif_user_item_pairs[:, 1]]).T
    # Shift S_t item IDs past the original catalog.
    unif_user_item_pairs[:, 1] += n
    # Create new csr matrix including union of S_c and S_t
    norm_user_item_matrix = lil_matrix(matrix_train)
    norm_user_item_pairs = np.asarray(norm_user_item_matrix.nonzero()).T
    norm_label = np.asarray(matrix_train[norm_user_item_pairs[:, 0],
                                         norm_user_item_pairs[:, 1]]).T
    user_item_pairs = np.vstack((unif_user_item_pairs, norm_user_item_pairs))
    labels = np.vstack((unif_label, norm_label))
    # Rebuild a single m x 2n sparse matrix holding both interaction sets.
    matrix_train = sparse.csr_matrix(
        (labels[:, 0], (user_item_pairs[:, 0], user_item_pairs[:, 1])),
        shape=(m, n * 2), dtype='float32')
    model = CausalSampleMF(m, n, rank, lamb=lam, lamb2=lam2,
                           batch_size=batch_size, gpu_on=gpu_on)
    metric_names = ['NLL', 'AUC']
    RQ, Y, user_bias, item_bias = model.train_model(matrix_train, matrix_valid,
                                                    iteration, metric_names)
    model.sess.close()
    tf.reset_default_graph()
    return RQ, Y, user_bias, item_bias