Example #1
0
    def _update(self, X, vad_data, **kwargs):
        '''Run the alternating updates, validating after every iteration.

        Each iteration recomputes the user factors (``self.theta``), the
        item factors (``self.beta``) and then ``self.mu`` (called the
        "consideration prior" in the progress messages). When ``vad_data``
        is given, NDCG is evaluated on it; with ``self.early_stopping`` set,
        the loop stops at the first iteration whose NDCG is lower than the
        previous one. Parameters are saved per iteration when
        ``self.save_params`` is set.

        Parameters
        ----------
        X : matrix with one row per user (``X.shape[0]`` users). It must
            support ``.T.tocsr()`` and row slicing -- presumably a
            scipy.sparse matrix, TODO confirm.
        vad_data : validation data forwarded to
            ``rec_eval.normalized_dcg_at_k``, or None to skip validation.
        **kwargs : forwarded unchanged to ``rec_eval.normalized_dcg_at_k``.
        '''
        n_users = X.shape[0]
        XT = X.T.tocsr()  # pre-compute this
        old_ndcg = -np.inf
        for i in range(self.max_iter):
            if self.verbose:
                print('ITERATION #%d' % i)
                start_t = _writeline_and_time('\tUpdating user factors...')
            self.theta = recompute_factors(self.beta, self.theta, X,
                                           self.lam_theta / self.lam_y,
                                           self.lam_y,
                                           self.mu,
                                           self.n_jobs,
                                           batch_size=self.batch_size)
            if self.verbose:
                print('\r\tUpdating user factors: time=%.2f'
                      % (time.time() - start_t))
                start_t = _writeline_and_time('\tUpdating item factors...')
            self.beta = recompute_factors(self.theta, self.beta, XT,
                                          self.lam_beta / self.lam_y,
                                          self.lam_y,
                                          self.mu,
                                          self.n_jobs,
                                          batch_size=self.batch_size)
            if self.verbose:
                print('\r\tUpdating item factors: time=%.2f'
                      % (time.time() - start_t))
                sys.stdout.flush()

            if self.verbose:
                start_t = _writeline_and_time('\tUpdating consideration prior...')

            # Accumulate the column sums of a_row_batch over user batches.
            # Batch bounds are computed directly instead of concatenating a
            # range slice with a list, which raises TypeError on Python 3.
            A_sum = np.zeros_like(self.mu)
            for lo in range(0, n_users, self.batch_size):
                hi = min(lo + self.batch_size, n_users)
                A_sum += a_row_batch(X[lo:hi], self.theta[lo:hi], self.beta,
                                     self.lam_y, self.mu).sum(axis=0)
            # NOTE(review): this has the form of the mode of a Beta posterior
            # with prior parameters (self.a, self.b) -- confirm against the
            # model derivation.
            self.mu = (self.a + A_sum - 1) / (self.a + self.b + n_users - 2)
            if self.verbose:
                print('\r\tUpdating consideration prior: time=%.2f'
                      % (time.time() - start_t))
                sys.stdout.flush()

            if vad_data is not None:
                vad_ndcg = rec_eval.normalized_dcg_at_k(X, vad_data,
                                                        self.theta,
                                                        self.beta,
                                                        **kwargs)

                if self.verbose:
                    print('\tValidation NDCG@k: %.4f' % vad_ndcg)
                    sys.stdout.flush()
                if self.early_stopping and old_ndcg > vad_ndcg:
                    # Stop before saving: this iteration's parameters are
                    # not written out (self.theta/self.beta/self.mu have
                    # nevertheless already been overwritten in memory).
                    break
                old_ndcg = vad_ndcg
            if self.save_params:
                self._save_params(i)
Example #2
0
 def _validate(self, X, vad_data, **kwargs):
     '''Return NDCG@k on the validation data, printing it when verbose.

     Extra keyword arguments are forwarded unchanged to
     ``rec_eval.normalized_dcg_at_k``.
     '''
     ndcg = rec_eval.normalized_dcg_at_k(
         X, vad_data, self.theta, self.beta, **kwargs)
     if not self.verbose:
         return ndcg
     print('\tValidation NDCG@k: %.5f' % ndcg)
     return ndcg
Example #3
0
 def _validate(self, X, vad_data, **kwargs):
     '''Score the current factors on ``vad_data`` and return the NDCG.

     The value is also printed when ``self.verbose`` is set; ``kwargs`` go
     straight to ``rec_eval.normalized_dcg_at_k``.
     '''
     score = rec_eval.normalized_dcg_at_k(X,
                                          vad_data,
                                          self.theta,
                                          self.beta,
                                          **kwargs)
     if self.verbose:
         message = '\tValidation NDCG@k: %.5f' % score
         print(message)
     return score
Example #4
0
 def _validate(self, X, vad_data, **kwargs):
     '''Evaluate NDCG@k on the validation set and return it.

     When ``self.verbose`` is set the value is printed and stdout flushed.
     '''
     ndcg = rec_eval.normalized_dcg_at_k(
         X, vad_data, self.theta, self.beta, **kwargs)
     if not self.verbose:
         return ndcg
     print('\tValidation NDCG@k: %.4f' % ndcg)
     sys.stdout.flush()
     return ndcg
Example #5
0
def train(sppmi_mat, data_dir=DATA_DIR):
    '''
        Train a CoFacto model, print test metrics and save the factors.

        Loads `data_train.txt`, `data_dev.txt` and `data_test.txt` from
        `data_dir`, fits `cofacto.CoFacto` with early stopping on the dev
        split, reloads the parameters of the last saved iteration from
        `save_dir`, prints Recall@20, Recall@50, NDCG@100 and MAP@100 on the
        test split and writes `U` and `V` to an `.npz` file under `DATA_DIR`.

        sppmi_mat: passed as the second positional argument of `coder.fit`
            (presumably the item co-occurrence SPPMI matrix -- TODO confirm).
        data_dir: directory holding the three input files.

        Relies on module-level names that are not set here: `uid2idx`,
        `sid2idx`, `max_iter`, `save_dir`, `lam_theta`, `lam_beta`,
        `lam_gamma`, `c0`, `c1`, `N_COMPONENTS`, `N_JOBS`, `DATA_SET_NAME`.
    '''
    # load file
    n_users = len(uid2idx)
    print('n_users:{}'.format(n_users))
    n_items = len(sid2idx)
    print('n_items:{}'.format(n_items))
    fn_train = os.path.join(data_dir, 'data_train.txt')
    fn_dev = os.path.join(data_dir, 'data_dev.txt')
    fn_test = os.path.join(data_dir, 'data_test.txt')
    train_data = util.load_input_data(fn_train, shape=(n_users, n_items))
    vad_data = util.load_input_data(fn_dev, shape=(n_users, n_items))
    test_data = util.load_input_data(fn_test, shape=(n_users, n_items))
    # train model
    coder = cofacto.CoFacto(n_components=N_COMPONENTS,
                            max_iter=max_iter,
                            batch_size=1000,
                            init_std=0.01,
                            n_jobs=N_JOBS,
                            random_state=98765,
                            save_params=True,
                            save_dir=save_dir,
                            early_stopping=True,
                            verbose=True,
                            lam_theta=lam_theta,
                            lam_beta=lam_beta,
                            lam_gamma=lam_gamma,
                            c0=c0,
                            c1=c1)
    coder.fit(train_data,
              sppmi_mat,
              vad_data=vad_data,
              batch_users=5000,
              k=100)
    # test model
    # NOTE(review): the last iteration is inferred from the number of .npz
    # files, which assumes save_dir holds only this run's files, numbered
    # from 0 -- confirm save_dir is clean before training.
    n_params = len(glob.glob(os.path.join(save_dir, '*.npz')))
    last_iter_num = n_params - 1
    params_fn = os.path.join(
        save_dir, 'CoFacto_K%d_iter%d.npz' % (N_COMPONENTS, last_iter_num))
    # np.load on an .npz archive keeps the file open until it is closed;
    # the arrays are read when indexed, so they stay valid after the block.
    with np.load(params_fn) as params:
        U, V = params['U'], params['V']
    print('Test Recall@20: %.4f' % rec_eval.recall_at_k(
        train_data, test_data, U, V, k=20, vad_data=vad_data))
    print('Test Recall@50: %.4f' % rec_eval.recall_at_k(
        train_data, test_data, U, V, k=50, vad_data=vad_data))
    print('Test NDCG@100: %.4f' % rec_eval.normalized_dcg_at_k(
        train_data, test_data, U, V, k=100, vad_data=vad_data))
    print('Test MAP@100: %.4f' % rec_eval.map_at_k(
        train_data, test_data, U, V, k=100, vad_data=vad_data))
    # save
    # NOTE(review): inputs are read from `data_dir` but the model is written
    # under the global DATA_DIR -- confirm this is intended.
    model_save_fn = os.path.join(
        DATA_DIR, 'Model_K{}_{}.npz'.format(N_COMPONENTS, DATA_SET_NAME))
    np.savez(model_save_fn, U=U, V=V)
    print('saved')
Example #6
0
 def _validate(self, X, vad_data, **kwargs):
     '''Return the validation NDCG@k for the current factors.

     Prints the value and flushes stdout when ``self.verbose`` is set.
     '''
     score = rec_eval.normalized_dcg_at_k(
         X, vad_data, self.theta, self.beta, **kwargs)
     if self.verbose:
         report = '\tValidation NDCG@k: %.4f' % score
         print(report)
         sys.stdout.flush()
     return score
Example #7
0
 def _validate(self, X, pi, vad_data, **kwargs):
     '''Return the validation NDCG@k, passing a ``mu`` spec to the metric.

     ``mu`` bundles ``[self.nu, pi, self.alpha]`` with ``get_mu`` and is
     handed to ``rec_eval.normalized_dcg_at_k`` together with ``kwargs``.
     The score is printed (and stdout flushed) when ``self.verbose`` is set.
     '''
     mu_spec = {'params': [self.nu, pi, self.alpha], 'func': get_mu}
     score = rec_eval.normalized_dcg_at_k(
         X, vad_data, self.theta, self.beta, mu=mu_spec, **kwargs)
     if not self.verbose:
         return score
     print('\tValidation NDCG@k: %.4f' % score)
     sys.stdout.flush()
     return score
Example #8
0
    def _update(self, X, pi, vad_data, **kwargs):
        '''Alternating training loop with per-iteration validation.

        Every pass refreshes the user factors, then the item factors, then
        calls ``self.update_nu``. When ``vad_data`` is supplied, NDCG is
        computed on it and, with ``self.early_stopping`` set, the loop ends
        as soon as the score drops below the previous one. Parameters are
        saved per iteration when ``self.save_params`` is set.
        '''
        prev_ndcg = -np.inf
        X_t = X.T.tocsr()  # transposed once, reused by every iteration

        for it in xrange(self.max_iter):
            if self.verbose:
                print('ITERATION #%d' % it)
                tic = _writeline_and_time('\tUpdating user factors...')
            user_reg = self.lam_theta / self.lam_y
            self.theta = recompute_factors(
                self.beta, self.theta, pi, self.nu, self.alpha, X,
                user_reg, self.lam_y, self.n_jobs,
                batch_size=self.batch_size)
            if self.verbose:
                elapsed = time.time() - tic
                print('\r\tUpdating user factors: time=%.2f' % elapsed)
                tic = _writeline_and_time('\tUpdating item factors...')
            item_reg = self.lam_beta / self.lam_y
            self.beta = recompute_factors(
                self.theta, self.beta, self.nu, pi, self.alpha, X_t,
                item_reg, self.lam_y, self.n_jobs,
                batch_size=self.batch_size)
            if self.verbose:
                elapsed = time.time() - tic
                print('\r\tUpdating item factors: time=%.2f' % elapsed)
                tic = _writeline_and_time(
                    '\tUpdating user consideration factors...\n')
            self.update_nu(X_t, pi)
            if self.verbose:
                elapsed = time.time() - tic
                print('\tUpdating user consideration factors: time=%.2f'
                      % elapsed)
                sys.stdout.flush()

            if vad_data is not None:
                mu_spec = {'params': [self.nu, pi, self.alpha],
                           'func': get_mu}
                vad_ndcg = rec_eval.normalized_dcg_at_k(
                    X, vad_data, self.theta, self.beta,
                    mu=mu_spec, **kwargs)
                if self.verbose:
                    print('\tValidation NDCG@k: %.4f' % vad_ndcg)
                    sys.stdout.flush()
                if self.early_stopping and prev_ndcg > vad_ndcg:
                    # leave before saving this iteration's parameters
                    break
                prev_ndcg = vad_ndcg
            if self.save_params:
                self._save_params(it)
Example #9
0
def factorize(S,
              num_factors,
              X=None,
              vad_data=None,
              num_iters=10,
              init_std=0.01,
              lambda_U_reg=1e-2,
              lambda_V_reg=100,
              lambda_W_reg=1e-2,
              dtype='float32',
              random_state=None,
              verbose=False,
              recompute_factors=batched_inv_joblib.recompute_factors_batched,
              fixed_item_embeddings=False,
              V=None,
              *args,
              **kwargs):
    '''Alternately recompute user factors U and item factors V for S.

    Parameters
    ----------
    S : matrix of shape (num_users, num_items); must support ``.T.tocsr()``
        (presumably scipy.sparse -- TODO confirm).
    num_factors : number of columns of the factor matrices.
    X : optional array of shape (num_items, num_factors), forwarded as
        ``X=`` to the item-factor update.
    vad_data : optional validation data; when given (and item embeddings
        are not fixed) NDCG@10 is computed after each iteration and the
        loop stops at the first iteration whose NDCG decreases.
    num_iters : maximum number of iterations.
    init_std : scale of the random normal initialisation of V.
    lambda_U_reg, lambda_V_reg : third positional argument of
        ``recompute_factors`` for the user and item updates respectively.
    lambda_W_reg : accepted for interface compatibility; unused here.
    dtype : dtype of the initial V, also forwarded to ``recompute_factors``.
    random_state : int seed, a state for ``np.random.set_state``, or None.
    verbose : print progress and timings.
    recompute_factors : callable performing one factor update.
    fixed_item_embeddings : when true, V is never updated or validated.
    V : optional initial item factors; drawn at random when None and item
        embeddings are not fixed.
    *args, **kwargs : forwarded to every ``recompute_factors`` call.

    Returns
    -------
    (U, V, old_ndcg) : the factors as of the last executed iteration and
        the last non-decreasing validation NDCG (``-np.inf`` when no
        validation ran). After an early stop, U and V come from the
        iteration whose NDCG dropped, while ``old_ndcg`` is the previous
        iteration's score.
    '''
    _, num_items = S.shape
    if X is not None:
        assert X.shape == (num_items, num_factors)

    if verbose:
        print("Precompute S^T (if necessary)")
        start_time = time.time()

    ST = S.T.tocsr()

    if verbose:
        print("  took %.3f seconds" % (time.time() - start_time))

    if isinstance(random_state, int):
        np.random.seed(random_state)
    elif random_state is not None:
        # numpy's setter is set_state; np.random.setstate does not exist.
        np.random.set_state(random_state)

    U = None
    # `not V` raises ValueError for a multi-element ndarray, so test
    # explicitly for "no initial item factors supplied".
    # NOTE(review): with fixed_item_embeddings=True and V=None, None is
    # passed to recompute_factors below -- callers must supply V then.
    if not fixed_item_embeddings and V is None:
        V = np.random.randn(num_items, num_factors).astype(dtype) * init_std

    old_ndcg = -np.inf
    for i in range(num_iters):
        if verbose:
            print("Iteration %d:" % i)
            start_t = _write_and_time('\tUpdating user factors...')
        U = recompute_factors(V, S, lambda_U_reg, dtype=dtype, *args, **kwargs)

        if verbose:
            print('\r\tUpdating user factors: time=%.2f' %
                  (time.time() - start_t))
            if not fixed_item_embeddings:
                start_t = _write_and_time('\tUpdating item factors...')
        if not fixed_item_embeddings:
            V = recompute_factors(U,
                                  ST,
                                  lambda_V_reg,
                                  X=X,
                                  dtype=dtype,
                                  *args,
                                  **kwargs)
        if verbose and not fixed_item_embeddings:
            print('\r\tUpdating item factors: time=%.2f' %
                  (time.time() - start_t))
        if vad_data is not None and not fixed_item_embeddings:
            vad_ndcg = rec_eval.normalized_dcg_at_k(S,
                                                    vad_data,
                                                    U,
                                                    V,
                                                    k=10,
                                                    batch_users=5000)
            if verbose:
                print("\tValidation NDCG@k: %.5f" % vad_ndcg)
                sys.stdout.flush()
            if old_ndcg > vad_ndcg:
                break  # early stop: validation NDCG decreased
            old_ndcg = vad_ndcg

    return U, V, old_ndcg
Example #10
0
    save_dir = 'out/seed_%d' % (seed_train_test)

    # For every saved model, compute any missing NDCG@100 scores (one per
    # threshold s on Y_test) and re-save the model only if something was added.
    for filename in glob.glob(os.path.join(save_dir, '*')):
        print(filename)
        # NOTE(review): pickle.load can execute arbitrary code; only point
        # this at model files produced by a trusted run.
        with open(filename, 'rb') as f:
            model = pickle.load(f)
        W = model.Ew
        H = model.Eh
        #model.score={} - erase the score
        save = False
        for s in range(10):
            score_key = 'ndcg@100s' + str(s)
            # Plain dict membership; np.isin on dict.keys() does not test
            # membership correctly with Python 3 key views.
            if score_key not in model.score:
                save = True
                ndcg = rec_eval.normalized_dcg_at_k(Y_train > 0,
                                                    Y_test > s,
                                                    W,
                                                    H,
                                                    k=100)
                model.score[score_key] = ndcg
        if save:
            model.save_dir = save_dir
            model.save_model()

#%% Read scores
appended_data = []
for seed_train_test in Seed_train_test:
    save_dir = 'out/seed_%d' % (seed_train_test)
    for filename in glob.glob(os.path.join(save_dir, '*')):
        with open(filename, 'rb') as f:
            model = pickle.load(f)
        df_name = pd.DataFrame.from_dict([{
Example #11
0
def factorize(S, num_factors, X=None, vad_data=None, num_iters=10, init_std=0.01,
              lambda_U_reg=1e-2, lambda_V_reg=100, lambda_W_reg=1e-2,
              dtype='float32', random_state=None, verbose=False,
              recompute_factors=batched_inv_joblib.recompute_factors_batched,
              fixed_item_embeddings=False,
              V=None,
              *args, **kwargs):
    '''Alternately recompute user factors U and item factors V for S.

    Parameters
    ----------
    S : matrix of shape (num_users, num_items); must support ``.T.tocsr()``
        (presumably scipy.sparse -- TODO confirm).
    num_factors : number of columns of the factor matrices.
    X : optional array of shape (num_items, num_factors), forwarded as
        ``X=`` to the item-factor update.
    vad_data : optional validation data; when given (and item embeddings
        are not fixed) NDCG@100 is computed after each iteration and the
        loop stops at the first iteration whose NDCG decreases.
    num_iters : maximum number of iterations.
    init_std : scale of the random normal initialisation of V.
    lambda_U_reg, lambda_V_reg : third positional argument of
        ``recompute_factors`` for the user and item updates respectively.
    lambda_W_reg : accepted for interface compatibility; unused here.
    dtype : dtype of the initial V, also forwarded to ``recompute_factors``.
    random_state : int seed, a state for ``np.random.set_state``, or None.
    verbose : print progress and timings.
    recompute_factors : callable performing one factor update.
    fixed_item_embeddings : when true, V is never updated or validated.
    V : optional initial item factors; drawn at random when None and item
        embeddings are not fixed.
    *args, **kwargs : forwarded to every ``recompute_factors`` call.

    Returns
    -------
    (U, V, old_ndcg) : the factors as of the last executed iteration and
        the last non-decreasing validation NDCG (``-np.inf`` when no
        validation ran). After an early stop, U and V come from the
        iteration whose NDCG dropped, while ``old_ndcg`` is the previous
        iteration's score.
    '''
    _, num_items = S.shape
    if X is not None:
        assert X.shape == (num_items, num_factors)

    if verbose:
        print("Precompute S^T (if necessary)")
        start_time = time.time()

    ST = S.T.tocsr()

    if verbose:
        print("  took %.3f seconds" % (time.time() - start_time))

    if isinstance(random_state, int):
        np.random.seed(random_state)
    elif random_state is not None:
        # numpy's setter is set_state; np.random.setstate does not exist.
        np.random.set_state(random_state)

    U = None
    # `not V` raises ValueError for a multi-element ndarray, so test
    # explicitly for "no initial item factors supplied".
    # NOTE(review): with fixed_item_embeddings=True and V=None, None is
    # passed to recompute_factors below -- callers must supply V then.
    if not fixed_item_embeddings and V is None:
        V = np.random.randn(num_items, num_factors).astype(dtype) * init_std

    old_ndcg = -np.inf
    for i in range(num_iters):
        if verbose:
            print("Iteration %d:" % i)
            start_t = _write_and_time('\tUpdating user factors...')
        U = recompute_factors(V, S, lambda_U_reg, dtype=dtype, *args, **kwargs)

        if verbose:
            print('\r\tUpdating user factors: time=%.2f'
                  % (time.time() - start_t))
            if not fixed_item_embeddings:
                start_t = _write_and_time('\tUpdating item factors...')
        if not fixed_item_embeddings:
            V = recompute_factors(U, ST, lambda_V_reg, X=X, dtype=dtype,
                                  *args, **kwargs)
        if verbose and not fixed_item_embeddings:
            print('\r\tUpdating item factors: time=%.2f'
                  % (time.time() - start_t))
        if vad_data is not None and not fixed_item_embeddings:
            vad_ndcg = rec_eval.normalized_dcg_at_k(S, vad_data, U, V,
                                                    k=100,
                                                    batch_users=5000)
            if verbose:
                print("\tValidation NDCG@k: %.5f" % vad_ndcg)
                sys.stdout.flush()
            if old_ndcg > vad_ndcg:
                break  # early stop: validation NDCG decreased
            old_ndcg = vad_ndcg

    return U, V, old_ndcg
Example #12
0
# Load the stored solution struct from the MATLAB file; the chain of [0]
# indexes unwraps the nested arrays loadmat returns around it.
model = sio.loadmat(d + '/model.mat')['sol'][0][0][0][0][0]

# Factor matrices: U is paired with G for the user-game evaluation below
# and with C for the user-group evaluation that follows.
U = model[0]
C = model[1]
G = model[2]

vad_data = None
try:
    train_data = T1_train
    test_data = T1_test
    print('Testing USER-GAME MATRIX')
    print('Test Recall@20: %f' % rec_eval.recall_at_k(
        train_data, test_data, U, G, k=20, vad_data=vad_data))
    print('Test Recall@50: %f' % rec_eval.recall_at_k(
        train_data, test_data, U, G, k=50, vad_data=vad_data))
    print('Test NDCG@100: %f' % rec_eval.normalized_dcg_at_k(
        train_data, test_data, U, G, k=100, vad_data=vad_data))
    print('Test MAP@100: %f' % rec_eval.map_at_k(
        train_data, test_data, U, G, k=100, vad_data=vad_data))
except Exception as err:
    # Still best-effort, but no longer swallows KeyboardInterrupt/SystemExit
    # and reports what actually went wrong.
    print('Error: %s' % err)

try:
    train_data = T2_train
    test_data = T2_test

    print 'Testing USER-GROUP MATRIX'
    print 'Test Recall@20: %f' % rec_eval.recall_at_k(
        train_data, test_data, U, C, k=20, vad_data=vad_data)
    print 'Test Recall@50: %f' % rec_eval.recall_at_k(
        train_data, test_data, U, C, k=50, vad_data=vad_data)
    print 'Test NDCG@100: %f' % rec_eval.normalized_dcg_at_k(
Example #13
0
    def _update(self, X, vad_data, **kwargs):
        '''Model training and evaluation on validation set

        Each iteration recomputes the user factors (``self.theta``), the
        item factors (``self.beta``) and then ``self.mu`` (called the
        "consideration prior" in the progress messages). When ``vad_data``
        is given, NDCG is evaluated on it; with ``self.early_stopping`` set,
        the loop stops at the first iteration whose NDCG is lower than the
        previous one. Parameters are saved per iteration when
        ``self.save_params`` is set.

        Parameters
        ----------
        X : matrix with one row per user (``X.shape[0]`` users). It must
            support ``.T.tocsr()`` and row slicing -- presumably a
            scipy.sparse matrix, TODO confirm.
        vad_data : validation data forwarded to
            ``rec_eval.normalized_dcg_at_k``, or None to skip validation.
        **kwargs : forwarded unchanged to ``rec_eval.normalized_dcg_at_k``.
        '''
        n_users = X.shape[0]
        XT = X.T.tocsr()  # pre-compute this
        old_ndcg = -np.inf
        for i in range(self.max_iter):
            if self.verbose:
                print('ITERATION #%d' % i)
                start_t = _writeline_and_time('\tUpdating user factors...')
            self.theta = recompute_factors(self.beta, self.theta, X,
                                           self.lam_theta / self.lam_y,
                                           self.lam_y,
                                           self.mu,
                                           self.n_jobs,
                                           batch_size=self.batch_size)
            if self.verbose:
                print('\r\tUpdating user factors: time=%.2f'
                      % (time.time() - start_t))
                start_t = _writeline_and_time('\tUpdating item factors...')
            self.beta = recompute_factors(self.theta, self.beta, XT,
                                          self.lam_beta / self.lam_y,
                                          self.lam_y,
                                          self.mu,
                                          self.n_jobs,
                                          batch_size=self.batch_size)
            if self.verbose:
                print('\r\tUpdating item factors: time=%.2f'
                      % (time.time() - start_t))
                sys.stdout.flush()

            if self.verbose:
                start_t = _writeline_and_time('\tUpdating consideration prior...')

            # Accumulate the column sums of a_row_batch over user batches.
            # Batch bounds are computed directly instead of concatenating a
            # range slice with a list, which raises TypeError on Python 3.
            A_sum = np.zeros_like(self.mu)
            for lo in range(0, n_users, self.batch_size):
                hi = min(lo + self.batch_size, n_users)
                A_sum += a_row_batch(X[lo:hi], self.theta[lo:hi], self.beta,
                                     self.lam_y, self.mu).sum(axis=0)
            # NOTE(review): this has the form of the mode of a Beta posterior
            # with prior parameters (self.a, self.b) -- confirm against the
            # model derivation.
            self.mu = (self.a + A_sum - 1) / (self.a + self.b + n_users - 2)
            if self.verbose:
                print('\r\tUpdating consideration prior: time=%.2f'
                      % (time.time() - start_t))
                sys.stdout.flush()

            if vad_data is not None:
                vad_ndcg = rec_eval.normalized_dcg_at_k(X, vad_data,
                                                        self.theta,
                                                        self.beta,
                                                        **kwargs)

                if self.verbose:
                    print('\tValidation NDCG@k: %.4f' % vad_ndcg)
                    sys.stdout.flush()
                if self.early_stopping and old_ndcg > vad_ndcg:
                    # Stop before saving: this iteration's parameters are
                    # not written out (self.theta/self.beta/self.mu have
                    # nevertheless already been overwritten in memory).
                    break
                old_ndcg = vad_ndcg
            if self.save_params:
                self._save_params(i)
Example #14
0
        S,
        num_factors,
        vad_data=vad_data,
        num_iters=num_iters,
        init_std=0.01,
        lambda_U_reg=lam_theta,
        lambda_V_reg=lam_beta,
        dtype='float32',
        random_state=98765,
        verbose=False,
        recompute_factors=batched_inv_joblib.recompute_factors_batched,
        batch_size=batch_size,
        n_jobs=n_jobs)
    if vad_ndcg > best_ndcg:
        best_ndcg = vad_ndcg
        U_best = U.copy()
        V_best = V.copy()
        best_alpha = alpha
print best_alpha, best_ndcg

# Evaluate the best factors from the search above on the test split.
test_data, test_raw = load_data(os.path.join(DATA_DIR, 'test.csv'))
# alpha = 10 gives the best validation performance
# NOTE(review): Recall@10 is evaluated and printed twice with identical
# arguments -- possibly a copy-paste slip for another cutoff; confirm.
print('Test Recall@10: %.4f' % rec_eval.recall_at_k(
    train_data, test_data, U_best, V_best, k=10, vad_data=vad_data))
print('Test Recall@10: %.4f' % rec_eval.recall_at_k(
    train_data, test_data, U_best, V_best, k=10, vad_data=vad_data))
print('Test NDCG@10: %.4f' % rec_eval.normalized_dcg_at_k(
    train_data, test_data, U_best, V_best, k=10, vad_data=vad_data))
print('Test MAP@10: %.4f' % rec_eval.map_at_k(
    train_data, test_data, U_best, V_best, k=10, vad_data=vad_data))
# In[31]:

# Reload the test split and overwrite its stored values with ones.
test_data, _ = load_data(os.path.join(DATA_DIR, 'test.csv'))
test_data.data = np.ones_like(test_data.data)

# In[32]:

# The last saved iteration is inferred from the number of .npz files.
# NOTE(review): assumes save_dir holds only this run's files, numbered from 0.
n_params = len(glob.glob(os.path.join(save_dir, '*.npz')))

params_path = os.path.join(
    save_dir, 'CoFacto_K%d_iter%d.npz' % (n_components, n_params - 1))
# np.load on an .npz archive keeps the file open until it is closed; the
# arrays are read when indexed, so U and V stay valid after the block.
with np.load(params_path) as params:
    U, V = params['U'], params['V']

# In[33]:

print('Test Recall@20: %.4f' % rec_eval.recall_at_k(
    train_data, test_data, U, V, k=20, vad_data=vad_data))
print('Test Recall@50: %.4f' % rec_eval.recall_at_k(
    train_data, test_data, U, V, k=50, vad_data=vad_data))
print('Test NDCG@5: %.4f' % rec_eval.normalized_dcg_at_k(
    train_data, test_data, U, V, k=5, vad_data=vad_data))
print('Test MAP@10: %.4f' % rec_eval.map_at_k(
    train_data, test_data, U, V, k=10, vad_data=vad_data))

# In[34]:

np.savez('CoFactor_K100_ML20M.npz', U=U, V=V)

# In[ ]:
Example #16
0
    def _update(self, X, pi, vad_data, **kwargs):
        '''Train by alternating updates and validate after each iteration.

        One iteration recomputes ``self.theta`` (reported as user factors),
        then ``self.beta`` (item factors), then calls ``self.update_nu``.
        If ``vad_data`` is not None, NDCG is evaluated on it with a ``mu``
        spec built from ``self.nu``, ``pi`` and ``self.alpha``; with
        ``self.early_stopping`` set, a drop in NDCG ends training before
        that iteration's parameters are saved.
        '''
        item_major = X.T.tocsr()  # computed once up front
        last_ndcg = -np.inf

        for step in xrange(self.max_iter):
            if self.verbose:
                print('ITERATION #%d' % step)
                started = _writeline_and_time('\tUpdating user factors...')
            self.theta = recompute_factors(
                self.beta, self.theta, pi, self.nu, self.alpha, X,
                self.lam_theta / self.lam_y, self.lam_y, self.n_jobs,
                batch_size=self.batch_size)
            if self.verbose:
                took = time.time() - started
                print('\r\tUpdating user factors: time=%.2f' % took)
                started = _writeline_and_time('\tUpdating item factors...')
            self.beta = recompute_factors(
                self.theta, self.beta, self.nu, pi, self.alpha, item_major,
                self.lam_beta / self.lam_y, self.lam_y, self.n_jobs,
                batch_size=self.batch_size)
            if self.verbose:
                took = time.time() - started
                print('\r\tUpdating item factors: time=%.2f' % took)
                started = _writeline_and_time(
                    '\tUpdating user consideration factors...\n')
            self.update_nu(item_major, pi)
            if self.verbose:
                took = time.time() - started
                print('\tUpdating user consideration factors: time=%.2f'
                      % took)
                sys.stdout.flush()

            if vad_data is not None:
                mu_spec = {'params': [self.nu, pi, self.alpha],
                           'func': get_mu}
                vad_ndcg = rec_eval.normalized_dcg_at_k(
                    X, vad_data, self.theta, self.beta,
                    mu=mu_spec, **kwargs)
                if self.verbose:
                    print('\tValidation NDCG@k: %.4f' % vad_ndcg)
                    sys.stdout.flush()
                if self.early_stopping and last_ndcg > vad_ndcg:
                    # quit without saving this iteration's parameters
                    break
                last_ndcg = vad_ndcg
            if self.save_params:
                self._save_params(step)