def _update(self, X, vad_data, **kwargs):
    '''Alternating model training with optional validation-set evaluation.

    X : sparse user-by-item matrix (X.T.tocsr() is taken, so a scipy
        sparse matrix is expected).
    vad_data : held-out validation matrix, or None to skip evaluation.
    **kwargs : forwarded to rec_eval.normalized_dcg_at_k (e.g. k,
        batch_users).
    '''
    n_users = X.shape[0]
    XT = X.T.tocsr()  # pre-compute this
    old_ndcg = -np.inf  # best validation NDCG accepted so far
    for i in xrange(self.max_iter):
        if self.verbose:
            print('ITERATION #%d' % i)
        # Update user factors (theta) with item factors held fixed.
        start_t = _writeline_and_time('\tUpdating user factors...')
        self.theta = recompute_factors(self.beta, self.theta, X,
                                       self.lam_theta / self.lam_y,
                                       self.lam_y, self.mu, self.n_jobs,
                                       batch_size=self.batch_size)
        if self.verbose:
            print('\r\tUpdating user factors: time=%.2f'
                  % (time.time() - start_t))
        # Update item factors (beta) on X^T with user factors held fixed.
        start_t = _writeline_and_time('\tUpdating item factors...')
        self.beta = recompute_factors(self.theta, self.beta, XT,
                                      self.lam_beta / self.lam_y,
                                      self.lam_y, self.mu, self.n_jobs,
                                      batch_size=self.batch_size)
        if self.verbose:
            print('\r\tUpdating item factors: time=%.2f'
                  % (time.time() - start_t))
            sys.stdout.flush()
        if self.verbose:
            start_t = _writeline_and_time('\tUpdating consideration prior...')
        # Accumulate a_row_batch over user batches, then update the
        # per-item prior mu in closed form using self.a and self.b.
        # NOTE(review): the update matches the mode of a Beta(a, b)
        # posterior — confirm against a_row_batch's semantics.
        start_idx = range(0, n_users, self.batch_size)
        end_idx = start_idx[1:] + [n_users]
        A_sum = np.zeros_like(self.mu)
        for lo, hi in zip(start_idx, end_idx):
            A_sum += a_row_batch(X[lo:hi], self.theta[lo:hi], self.beta,
                                 self.lam_y, self.mu).sum(axis=0)
        self.mu = (self.a + A_sum - 1) / (self.a + self.b + n_users - 2)
        if self.verbose:
            print('\r\tUpdating consideration prior: time=%.2f'
                  % (time.time() - start_t))
            sys.stdout.flush()
        if vad_data is not None:
            vad_ndcg = rec_eval.normalized_dcg_at_k(X, vad_data, self.theta,
                                                    self.beta, **kwargs)
            if self.verbose:
                print('\tValidation NDCG@k: %.4f' % vad_ndcg)
                sys.stdout.flush()
            # Early stopping: quit as soon as validation NDCG drops.
            if self.early_stopping and old_ndcg > vad_ndcg:
                break  # we will not save the parameter for this iteration
            old_ndcg = vad_ndcg
        if self.save_params:
            self._save_params(i)
    pass
def _validate(self, X, vad_data, **kwargs):
    '''Compute the validation metric (NDCG@k).

    X : training user-item matrix, forwarded to the evaluator.
    vad_data : held-out validation matrix.
    **kwargs : forwarded to rec_eval.normalized_dcg_at_k (e.g. k,
        batch_users).

    Returns the validation NDCG@k.
    '''
    vad_ndcg = rec_eval.normalized_dcg_at_k(X, vad_data, self.theta,
                                            self.beta, **kwargs)
    if self.verbose:
        print('\tValidation NDCG@k: %.5f' % vad_ndcg)
        # Flush so the progress line appears immediately, matching the
        # other _validate implementations in this file.
        sys.stdout.flush()
    return vad_ndcg
def _validate(self, X, vad_data, **kwargs):
    '''Evaluate the current factors on the validation set.

    Computes NDCG@k via rec_eval and returns it; prints it first when
    verbose mode is on.
    '''
    ndcg = rec_eval.normalized_dcg_at_k(
        X, vad_data, self.theta, self.beta, **kwargs)
    if self.verbose:
        print('\tValidation NDCG@k: %.4f' % ndcg)
        sys.stdout.flush()
    return ndcg
def train(sppmi_mat, data_dir=DATA_DIR): ''' Train ''' # load file n_users = len(uid2idx) print('n_users:{}'.format(n_users)) n_items = len(sid2idx) print('n_items:{}'.format(n_items)) fn_train = os.path.join(data_dir, 'data_train.txt') fn_dev = os.path.join(data_dir, 'data_dev.txt') fn_test = os.path.join(data_dir, 'data_test.txt') train_data = util.load_input_data(fn_train, shape=(n_users, n_items)) vad_data = util.load_input_data(fn_dev, shape=(n_users, n_items)) test_data = util.load_input_data(fn_test, shape=(n_users, n_items)) # train model coder = cofacto.CoFacto(n_components=N_COMPONENTS, max_iter=max_iter, batch_size=1000, init_std=0.01, n_jobs=N_JOBS, random_state=98765, save_params=True, save_dir=save_dir, early_stopping=True, verbose=True, lam_theta=lam_theta, lam_beta=lam_beta, lam_gamma=lam_gamma, c0=c0, c1=c1) coder.fit(train_data, sppmi_mat, vad_data=vad_data, batch_users=5000, k=100) # test model n_params = len(glob.glob(os.path.join(save_dir, '*.npz'))) last_iter_num = n_params - 1 params = np.load( os.path.join(save_dir, 'CoFacto_K%d_iter%d.npz' % (N_COMPONENTS, last_iter_num))) U, V = params['U'], params['V'] print('Test Recall@20: %.4f' % rec_eval.recall_at_k( train_data, test_data, U, V, k=20, vad_data=vad_data)) print('Test Recall@50: %.4f' % rec_eval.recall_at_k( train_data, test_data, U, V, k=50, vad_data=vad_data)) print('Test NDCG@100: %.4f' % rec_eval.normalized_dcg_at_k( train_data, test_data, U, V, k=100, vad_data=vad_data)) print('Test MAP@100: %.4f' % rec_eval.map_at_k( train_data, test_data, U, V, k=100, vad_data=vad_data)) # save model_save_fn = os.path.join( DATA_DIR, 'Model_K{}_{}.npz'.format(N_COMPONENTS, DATA_SET_NAME)) np.savez(model_save_fn, U=U, V=V) print('saved')
def _validate(self, X, pi, vad_data, **kwargs):
    '''Compute NDCG@k on the validation data under the exposure model.

    The exposure parameters (nu, pi, alpha) are handed to the evaluator
    as a params/func pair via the `mu` keyword.
    '''
    exposure = dict(params=[self.nu, pi, self.alpha], func=get_mu)
    ndcg = rec_eval.normalized_dcg_at_k(
        X, vad_data, self.theta, self.beta, mu=exposure, **kwargs)
    if self.verbose:
        print('\tValidation NDCG@k: %.4f' % ndcg)
        sys.stdout.flush()
    return ndcg
def _update(self, X, pi, vad_data, **kwargs):
    '''Model training and evaluation on validation set.

    Alternates updates of user factors (theta), item factors (beta) and
    user consideration factors (nu); optionally evaluates NDCG@k on
    vad_data each iteration, with early stopping when enabled.

    pi : passed to recompute_factors / update_nu / get_mu — presumably
        item-side exposure covariates; confirm against those helpers.
    '''
    XT = X.T.tocsr()  # pre-compute this
    old_ndcg = -np.inf  # best validation NDCG accepted so far
    for i in xrange(self.max_iter):
        if self.verbose:
            print('ITERATION #%d' % i)
        # Update user factors with item factors held fixed.
        start_t = _writeline_and_time('\tUpdating user factors...')
        self.theta = recompute_factors(self.beta, self.theta, pi, self.nu,
                                       self.alpha, X,
                                       self.lam_theta / self.lam_y,
                                       self.lam_y, self.n_jobs,
                                       batch_size=self.batch_size)
        if self.verbose:
            print('\r\tUpdating user factors: time=%.2f'
                  % (time.time() - start_t))
        # Update item factors on X^T; note the (nu, pi) argument order is
        # swapped relative to the user-factor update above.
        start_t = _writeline_and_time('\tUpdating item factors...')
        self.beta = recompute_factors(self.theta, self.beta, self.nu, pi,
                                      self.alpha, XT,
                                      self.lam_beta / self.lam_y,
                                      self.lam_y, self.n_jobs,
                                      batch_size=self.batch_size)
        if self.verbose:
            print('\r\tUpdating item factors: time=%.2f'
                  % (time.time() - start_t))
        # Update the user consideration factors in place.
        start_t = _writeline_and_time('\tUpdating user consideration factors...\n')
        self.update_nu(XT, pi)
        if self.verbose:
            print('\tUpdating user consideration factors: time=%.2f'
                  % (time.time() - start_t))
            sys.stdout.flush()
        if vad_data is not None:
            # Exposure model handed to the evaluator as a params/func
            # pair — presumably evaluated as func(*params) inside
            # rec_eval; confirm.
            mu = dict(params=[self.nu, pi, self.alpha], func=get_mu)
            vad_ndcg = rec_eval.normalized_dcg_at_k(X, vad_data, self.theta,
                                                    self.beta, mu=mu,
                                                    **kwargs)
            if self.verbose:
                print('\tValidation NDCG@k: %.4f' % vad_ndcg)
                sys.stdout.flush()
            # Early stopping: quit as soon as validation NDCG drops.
            if self.early_stopping and old_ndcg > vad_ndcg:
                break  # we will not save the parameter for this iteration
            old_ndcg = vad_ndcg
        if self.save_params:
            self._save_params(i)
    pass
def factorize(S, num_factors, X=None, vad_data=None, num_iters=10,
              init_std=0.01, lambda_U_reg=1e-2, lambda_V_reg=100,
              lambda_W_reg=1e-2, dtype='float32', random_state=None,
              verbose=False,
              recompute_factors=batched_inv_joblib.recompute_factors_batched,
              fixed_item_embeddings=False, V=None, *args, **kwargs):
    '''Alternating factorization of the sparse matrix S into U and V,
    with optional early stopping on validation NDCG@10.

    Parameters
    ----------
    S : sparse user-by-item matrix (must support .shape and .T.tocsr()).
    num_factors : dimensionality of the latent factors.
    X : optional (num_items, num_factors) matrix forwarded to
        recompute_factors for the item update.
    vad_data : validation matrix; when given (and item embeddings are
        not fixed), NDCG@10 is evaluated each iteration and the loop
        stops as soon as it decreases.
    random_state : int seed, a full numpy random-state tuple, or None.
    recompute_factors : callable performing one half of the alternating
        update.
    fixed_item_embeddings : when True, V is never updated (caller must
        supply V).
    V : optional pre-initialized item factors; sampled fresh when None.

    Returns
    -------
    (U, V, old_ndcg) : final factors and the last accepted validation
    NDCG (-inf when never evaluated).
    '''
    num_users, num_items = S.shape
    if X is not None:
        assert X.shape == (num_items, num_factors)
    if verbose:
        print("Precompute S^T (if necessary)")
        start_time = time.time()
    ST = S.T.tocsr()
    if verbose:
        print(" took %.3f seconds" % (time.time() - start_time))
        start_time = time.time()
    if type(random_state) is int:
        np.random.seed(random_state)
    elif random_state is not None:
        np.random.setstate(random_state)
    U = None
    # BUG FIX: `not V` raises "the truth value of an array is ambiguous"
    # when a pre-initialized V is passed as a numpy array; the sentinel
    # for "no V supplied" is None, so test for that explicitly.
    if not fixed_item_embeddings and V is None:
        V = np.random.randn(num_items, num_factors).astype(dtype) * init_std
    old_ndcg = -np.inf
    for i in xrange(num_iters):
        if verbose:
            print("Iteration %d:" % i)
            start_t = _write_and_time('\tUpdating user factors...')
        U = recompute_factors(V, S, lambda_U_reg, dtype=dtype,
                              *args, **kwargs)
        if verbose:
            print('\r\tUpdating user factors: time=%.2f'
                  % (time.time() - start_t))
        if not fixed_item_embeddings:
            # The three separate `fixed_item_embeddings` checks of the
            # original are merged into one guarded block; behavior is
            # unchanged.
            start_t = _write_and_time('\tUpdating item factors...')
            V = recompute_factors(U, ST, lambda_V_reg, X=X, dtype=dtype,
                                  *args, **kwargs)
            if verbose:
                print('\r\tUpdating item factors: time=%.2f'
                      % (time.time() - start_t))
        if vad_data is not None and not fixed_item_embeddings:
            vad_ndcg = rec_eval.normalized_dcg_at_k(S, vad_data, U, V,
                                                    k=10, batch_users=5000)
            if verbose:
                print("\tValidation NDCG@k: %.5f" % vad_ndcg)
                sys.stdout.flush()
            if old_ndcg > vad_ndcg:
                break  # we will not save the parameter for this iteration
            old_ndcg = vad_ndcg
    return U, V, old_ndcg
save_dir = 'out/seed_%d' % (seed_train_test) for filename in glob.glob(os.path.join(save_dir, '*')): print filename save = False with open(filename, 'rb') as f: model = pickle.load(f) W = model.Ew H = model.Eh #model.score={} - erase the score for s in range(10): if ~np.isin('ndcg@100s' + str(s), model.score.keys()): save = True ndcg = rec_eval.normalized_dcg_at_k(Y_train > 0, Y_test > s, W, H, k=100) model.score['ndcg@100s' + str(s)] = ndcg if save == True: model.save_dir = save_dir model.save_model() #%% Read scores appended_data = [] for seed_train_test in Seed_train_test: save_dir = 'out/seed_%d' % (seed_train_test) for filename in glob.glob(os.path.join(save_dir, '*')): with open(filename, 'rb') as f: model = pickle.load(f) df_name = pd.DataFrame.from_dict([{
def factorize(S, num_factors, X=None, vad_data=None, num_iters=10,
              init_std=0.01, lambda_U_reg=1e-2, lambda_V_reg=100,
              lambda_W_reg=1e-2, dtype='float32', random_state=None,
              verbose=False,
              recompute_factors=batched_inv_joblib.recompute_factors_batched,
              fixed_item_embeddings=False, V=None, *args, **kwargs):
    '''Alternating factorization of the sparse matrix S into U and V,
    with optional early stopping on validation NDCG@100.

    Parameters
    ----------
    S : sparse user-by-item matrix (must support .shape and .T.tocsr()).
    num_factors : dimensionality of the latent factors.
    X : optional (num_items, num_factors) matrix forwarded to
        recompute_factors for the item update.
    vad_data : validation matrix; when given (and item embeddings are
        not fixed), NDCG@100 is evaluated each iteration and the loop
        stops as soon as it decreases.
    random_state : int seed, a full numpy random-state tuple, or None.
    recompute_factors : callable performing one half of the alternating
        update.
    fixed_item_embeddings : when True, V is never updated (caller must
        supply V).
    V : optional pre-initialized item factors; sampled fresh when None.

    Returns
    -------
    (U, V, old_ndcg) : final factors and the last accepted validation
    NDCG (-inf when never evaluated).
    '''
    num_users, num_items = S.shape
    if X is not None:
        assert X.shape == (num_items, num_factors)
    if verbose:
        print("Precompute S^T (if necessary)")
        start_time = time.time()
    ST = S.T.tocsr()
    if verbose:
        print(" took %.3f seconds" % (time.time() - start_time))
        start_time = time.time()
    if type(random_state) is int:
        np.random.seed(random_state)
    elif random_state is not None:
        np.random.setstate(random_state)
    U = None
    # BUG FIX: `not V` raises "the truth value of an array is ambiguous"
    # when a pre-initialized V is passed as a numpy array; the sentinel
    # for "no V supplied" is None, so test for that explicitly.
    if not fixed_item_embeddings and V is None:
        V = np.random.randn(num_items, num_factors).astype(dtype) * init_std
    old_ndcg = -np.inf
    for i in xrange(num_iters):
        if verbose:
            print("Iteration %d:" % i)
            start_t = _write_and_time('\tUpdating user factors...')
        U = recompute_factors(V, S, lambda_U_reg, dtype=dtype,
                              *args, **kwargs)
        if verbose:
            print('\r\tUpdating user factors: time=%.2f'
                  % (time.time() - start_t))
        if not fixed_item_embeddings:
            # The three separate `fixed_item_embeddings` checks of the
            # original are merged into one guarded block; behavior is
            # unchanged.
            start_t = _write_and_time('\tUpdating item factors...')
            V = recompute_factors(U, ST, lambda_V_reg, X=X, dtype=dtype,
                                  *args, **kwargs)
            if verbose:
                print('\r\tUpdating item factors: time=%.2f'
                      % (time.time() - start_t))
        if vad_data is not None and not fixed_item_embeddings:
            vad_ndcg = rec_eval.normalized_dcg_at_k(S, vad_data, U, V,
                                                    k=100, batch_users=5000)
            if verbose:
                print("\tValidation NDCG@k: %.5f" % vad_ndcg)
                sys.stdout.flush()
            if old_ndcg > vad_ndcg:
                break  # we will not save the parameter for this iteration
            old_ndcg = vad_ndcg
    return U, V, old_ndcg
model = sio.loadmat(d + '/model.mat')['sol'][0][0][0][0][0] U = model[0] C = model[1] G = model[2] vad_data = None try: train_data = T1_train test_data = T1_test print 'Testing USER-GAME MATRIX' print 'Test Recall@20: %f' % rec_eval.recall_at_k( train_data, test_data, U, G, k=20, vad_data=vad_data) print 'Test Recall@50: %f' % rec_eval.recall_at_k( train_data, test_data, U, G, k=50, vad_data=vad_data) print 'Test NDCG@100: %f' % rec_eval.normalized_dcg_at_k( train_data, test_data, U, G, k=100, vad_data=vad_data) print 'Test MAP@100: %f' % rec_eval.map_at_k( train_data, test_data, U, G, k=100, vad_data=vad_data) except: print 'Error' try: train_data = T2_train test_data = T2_test print 'Testing USER-GROUP MATRIX' print 'Test Recall@20: %f' % rec_eval.recall_at_k( train_data, test_data, U, C, k=20, vad_data=vad_data) print 'Test Recall@50: %f' % rec_eval.recall_at_k( train_data, test_data, U, C, k=50, vad_data=vad_data) print 'Test NDCG@100: %f' % rec_eval.normalized_dcg_at_k(
def _update(self, X, vad_data, **kwargs):
    '''Model training and evaluation on validation set.

    Alternates user-factor (theta) and item-factor (beta) updates, then
    refreshes the per-item consideration prior mu; optionally evaluates
    NDCG@k on vad_data each iteration, with early stopping when enabled.
    '''
    n_users = X.shape[0]
    XT = X.T.tocsr()  # pre-compute this
    old_ndcg = -np.inf  # best validation NDCG accepted so far
    for i in xrange(self.max_iter):
        if self.verbose:
            print('ITERATION #%d' % i)
        # Update user factors with item factors held fixed.
        start_t = _writeline_and_time('\tUpdating user factors...')
        self.theta = recompute_factors(self.beta, self.theta, X,
                                       self.lam_theta / self.lam_y,
                                       self.lam_y, self.mu, self.n_jobs,
                                       batch_size=self.batch_size)
        if self.verbose:
            print('\r\tUpdating user factors: time=%.2f'
                  % (time.time() - start_t))
        # Update item factors on X^T with user factors held fixed.
        start_t = _writeline_and_time('\tUpdating item factors...')
        self.beta = recompute_factors(self.theta, self.beta, XT,
                                      self.lam_beta / self.lam_y,
                                      self.lam_y, self.mu, self.n_jobs,
                                      batch_size=self.batch_size)
        if self.verbose:
            print('\r\tUpdating item factors: time=%.2f'
                  % (time.time() - start_t))
            sys.stdout.flush()
        if self.verbose:
            start_t = _writeline_and_time('\tUpdating consideration prior...')
        # Accumulate a_row_batch over user batches, then update mu in
        # closed form from self.a / self.b.
        # NOTE(review): the update matches the mode of a Beta(a, b)
        # posterior — confirm against a_row_batch's semantics.
        start_idx = range(0, n_users, self.batch_size)
        end_idx = start_idx[1:] + [n_users]
        A_sum = np.zeros_like(self.mu)
        for lo, hi in zip(start_idx, end_idx):
            A_sum += a_row_batch(X[lo:hi], self.theta[lo:hi], self.beta,
                                 self.lam_y, self.mu).sum(axis=0)
        self.mu = (self.a + A_sum - 1) / (self.a + self.b + n_users - 2)
        if self.verbose:
            print('\r\tUpdating consideration prior: time=%.2f'
                  % (time.time() - start_t))
            sys.stdout.flush()
        if vad_data is not None:
            vad_ndcg = rec_eval.normalized_dcg_at_k(X, vad_data, self.theta,
                                                    self.beta, **kwargs)
            if self.verbose:
                print('\tValidation NDCG@k: %.4f' % vad_ndcg)
                sys.stdout.flush()
            # Early stopping: quit as soon as validation NDCG drops.
            if self.early_stopping and old_ndcg > vad_ndcg:
                break  # we will not save the parameter for this iteration
            old_ndcg = vad_ndcg
        if self.save_params:
            self._save_params(i)
    pass
S, num_factors, vad_data=vad_data, num_iters=num_iters, init_std=0.01, lambda_U_reg=lam_theta, lambda_V_reg=lam_beta, dtype='float32', random_state=98765, verbose=False, recompute_factors=batched_inv_joblib.recompute_factors_batched, batch_size=batch_size, n_jobs=n_jobs) if vad_ndcg > best_ndcg: best_ndcg = vad_ndcg U_best = U.copy() V_best = V.copy() best_alpha = alpha print best_alpha, best_ndcg test_data, test_raw = load_data(os.path.join(DATA_DIR, 'test.csv')) # alpha = 10 gives the best validation performance print 'Test Recall@10: %.4f' % rec_eval.recall_at_k( train_data, test_data, U_best, V_best, k=10, vad_data=vad_data) print 'Test Recall@10: %.4f' % rec_eval.recall_at_k( train_data, test_data, U_best, V_best, k=10, vad_data=vad_data) print 'Test NDCG@10: %.4f' % rec_eval.normalized_dcg_at_k( train_data, test_data, U_best, V_best, k=10, vad_data=vad_data) print 'Test MAP@10: %.4f' % rec_eval.map_at_k( train_data, test_data, U_best, V_best, k=10, vad_data=vad_data)
# In[31]: test_data, _ = load_data(os.path.join(DATA_DIR, 'test.csv')) test_data.data = np.ones_like(test_data.data) # In[32]: n_params = len(glob.glob(os.path.join(save_dir, '*.npz'))) params = np.load( os.path.join(save_dir, 'CoFacto_K%d_iter%d.npz' % (n_components, n_params - 1))) U, V = params['U'], params['V'] # In[33]: print 'Test Recall@20: %.4f' % rec_eval.recall_at_k( train_data, test_data, U, V, k=20, vad_data=vad_data) print 'Test Recall@50: %.4f' % rec_eval.recall_at_k( train_data, test_data, U, V, k=50, vad_data=vad_data) print 'Test NDCG@5: %.4f' % rec_eval.normalized_dcg_at_k( train_data, test_data, U, V, k=5, vad_data=vad_data) print 'Test MAP@10: %.4f' % rec_eval.map_at_k( train_data, test_data, U, V, k=10, vad_data=vad_data) # In[34]: np.savez('CoFactor_K100_ML20M.npz', U=U, V=V) # In[ ]:
def _update(self, X, pi, vad_data, **kwargs):
    '''Model training and evaluation on validation set.

    One iteration updates, in order: user factors (theta), item factors
    (beta), and the user consideration factors (nu via update_nu). When
    vad_data is given, NDCG@k is computed afterwards and used for early
    stopping.

    pi : passed to recompute_factors / update_nu / get_mu — presumably
        item-side exposure covariates; confirm against those helpers.
    '''
    XT = X.T.tocsr()  # pre-compute this
    old_ndcg = -np.inf  # best validation NDCG accepted so far
    for i in xrange(self.max_iter):
        if self.verbose:
            print('ITERATION #%d' % i)
        # User-factor update (item factors fixed).
        start_t = _writeline_and_time('\tUpdating user factors...')
        self.theta = recompute_factors(self.beta, self.theta, pi, self.nu,
                                       self.alpha, X,
                                       self.lam_theta / self.lam_y,
                                       self.lam_y, self.n_jobs,
                                       batch_size=self.batch_size)
        if self.verbose:
            print('\r\tUpdating user factors: time=%.2f'
                  % (time.time() - start_t))
        # Item-factor update on X^T; the (nu, pi) argument order is
        # swapped relative to the user update.
        start_t = _writeline_and_time('\tUpdating item factors...')
        self.beta = recompute_factors(self.theta, self.beta, self.nu, pi,
                                      self.alpha, XT,
                                      self.lam_beta / self.lam_y,
                                      self.lam_y, self.n_jobs,
                                      batch_size=self.batch_size)
        if self.verbose:
            print('\r\tUpdating item factors: time=%.2f'
                  % (time.time() - start_t))
        # In-place update of the user consideration factors.
        start_t = _writeline_and_time(
            '\tUpdating user consideration factors...\n')
        self.update_nu(XT, pi)
        if self.verbose:
            print('\tUpdating user consideration factors: time=%.2f'
                  % (time.time() - start_t))
            sys.stdout.flush()
        if vad_data is not None:
            # Exposure model handed to the evaluator as a params/func
            # pair — presumably evaluated as func(*params) inside
            # rec_eval; confirm.
            mu = dict(params=[self.nu, pi, self.alpha], func=get_mu)
            vad_ndcg = rec_eval.normalized_dcg_at_k(X, vad_data, self.theta,
                                                    self.beta, mu=mu,
                                                    **kwargs)
            if self.verbose:
                print('\tValidation NDCG@k: %.4f' % vad_ndcg)
                sys.stdout.flush()
            # Early stopping: quit as soon as validation NDCG drops.
            if self.early_stopping and old_ndcg > vad_ndcg:
                break  # we will not save the parameter for this iteration
            old_ndcg = vad_ndcg
        if self.save_params:
            self._save_params(i)
    pass