Example #1
0
def build_fm_interaction():
    """Train a libFM classifier from on-disk files and log its test AUC.

    Reads the test labels from ``test_y_file``, runs pywFM (MCMC, 100
    iterations) on ``train_x_file`` / ``test_x_file``, prints the shape of
    the learned pairwise-interaction matrix and the test AUC, and appends
    the AUC and elapsed time to ``<project_path>result/exp_result``.

    Relies on the module-level names ``project_path``, ``train_x_file``,
    ``test_x_file`` and ``test_y_file``. Returns nothing.
    """
    begin = datetime.datetime.now()
    # Close the label file deterministically instead of leaking the handle.
    with open(test_y_file) as label_file:
        test_y = np.loadtxt(label_file, dtype=int)
    fm = pywFM.FM(task='classification',
                  num_iter=100,
                  learning_method='mcmc',
                  temp_path=project_path + "model\\m_fm\\tmp\\")

    # The four positional data arguments are None because the data is
    # supplied through train_path / test_path instead.
    model = fm.run(None,
                   None,
                   None,
                   None,
                   train_path=train_x_file,
                   test_path=test_x_file,
                   model_path=project_path +
                   "model\\m_fm\\model_file\\fm_model",
                   out_path=project_path + "model\\m_fm\\model_file\\fm.out")
    end = datetime.datetime.now()

    print(model.pairwise_interactions.shape)
    prob_test = model.predictions
    auc_test = metrics.roc_auc_score(test_y, prob_test)
    print(auc_test)

    # "with" guarantees the log is flushed and closed even if a write fails.
    with open(project_path + "result/exp_result", "a") as log_file:
        log_file.write("fm: sparse_id + gbdt + 100 iters:" + '\n')
        log_file.write("auc_test: " + str(auc_test) + '\n')
        log_file.write("time: " + str(end - begin) + '\n' + '\n')

    print(model.pairwise_interactions.shape)
Example #2
0
def test():
    """Smoke-test pywFM classification on a seven-row toy dataset.

    Trains on the first five rows with SGD for 20 iterations, predicts the
    last two rows, and prints the training slice and the test AUC. Relies
    on the module-level ``project_path`` for libFM's temp/model/output
    files. Returns nothing.
    """
    features = np.matrix([
        #     Users  |     Movies     |    Movie Ratings   | Time | Last Movies Rated
        #    A  B  C | TI  NH  SW  ST | TI   NH   SW   ST  |      | TI  NH  SW  ST
        [1, 0, 0, 1, 0, 0, 0, 0.3, 0.3, 0.3, 0, 13, 0, 0, 0, 0],
        [1, 0, 0, 0, 1, 0, 0, 0.3, 0.3, 0.3, 0, 14, 1, 0, 0, 0],
        [1, 0, 0, 0, 0, 1, 0, 0.3, 0.3, 0.3, 0, 16, 0, 1, 0, 0],
        [0, 1, 0, 0, 0, 1, 0, 0, 0, 0.5, 0.5, 5, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 0, 1, 0, 0, 0.5, 0.5, 8, 0, 0, 1, 0],
        [0, 0, 1, 1, 0, 0, 0, 0.5, 0, 0.5, 0, 9, 0, 0, 0, 0],
        [0, 0, 1, 0, 0, 1, 0, 0.5, 0, 0.5, 0, 12, 1, 0, 0, 0]
    ])
    target = [0, 1, 1, 0, 1, 0, 1]

    fm = pywFM.FM(task='c',
                  num_iter=20,
                  learning_method='sgd',
                  temp_path=project_path + "model\\m_fm\\tmp\\")
    # print() with a single argument behaves the same on Python 2 and 3.
    print(features[:5])
    # split features and target for train/test
    # first 5 are train, last 2 are test
    model = fm.run(features[:5],
                   target[:5],
                   features[5:],
                   target[5:],
                   model_path=project_path +
                   "model\\m_fm\\model_file\\fm_model",
                   out_path=project_path + "model\\m_fm\\model_file\\fm.out")
    prob_test = model.predictions

    auc_test = metrics.roc_auc_score(target[5:], prob_test)
    print(auc_test)
Example #3
0
def test():
    """Smoke-test pywFM regression on a seven-row toy dataset.

    Trains on the first five rows for 5 iterations, predicts the last two,
    and prints the predictions, the learned weights and an AUC computed
    from the two held-out targets. Returns nothing.
    """
    features = np.matrix([
        #     Users  |     Movies     |    Movie Ratings   | Time | Last Movies Rated
        #    A  B  C | TI  NH  SW  ST | TI   NH   SW   ST  |      | TI  NH  SW  ST
        [1, 0, 0, 1, 0, 0, 0, 0.3, 0.3, 0.3, 0, 13, 0, 0, 0, 0],
        [1, 0, 0, 0, 1, 0, 0, 0.3, 0.3, 0.3, 0, 14, 1, 0, 0, 0],
        [1, 0, 0, 0, 0, 1, 0, 0.3, 0.3, 0.3, 0, 16, 0, 1, 0, 0],
        [0, 1, 0, 0, 0, 1, 0, 0, 0, 0.5, 0.5, 5, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 0, 1, 0, 0, 0.5, 0.5, 8, 0, 0, 1, 0],
        [0, 0, 1, 1, 0, 0, 0, 0.5, 0, 0.5, 0, 9, 0, 0, 0, 0],
        [0, 0, 1, 0, 0, 1, 0, 0.5, 0, 0.5, 0, 12, 1, 0, 0, 0]
    ])
    target = [5, 3, 1, 4, 5, 1, 5]
    fm = pywFM.FM(task='regression', num_iter=5)

    # split features and target for train/test
    # first 5 are train, last 2 are test
    model = fm.run(features[:5], target[:5], features[5:], target[5:])

    print(model.predictions)
    # you can also get the model weights
    print(model.weights)
    prob_test = model.predictions
    # NOTE(review): AUC on regression targets only works here because the
    # two held-out ratings (1 and 5) happen to form exactly two classes.
    auc_test = metrics.roc_auc_score(target[5:], prob_test)
    # Was a Python 2 print statement; the rest of the function already used
    # the function form, so the block did not parse on Python 3.
    print(auc_test)
Example #4
0
    def fit(self, X, y):
        """Fit a libFM regression on (X, y) and cache its parameters.

        The input is first converted with ``self.prepare_fm``; the same
        matrix is passed to libFM as both train and test set. On return
        the instance holds ``mu`` (global bias), ``W`` (one weight per
        agent), ``V`` (pairwise factors, ``self.rank`` columns) and ``V2``
        (element-wise square of ``V``). When libFM yields no global bias,
        ``mu`` is 0 and ``W`` / ``V`` are filled with random values.
        """
        # Importing here is not meant for production; libFM itself must be
        # installed separately: https://github.com/srendle/libfm
        import pywFM
        X_fm = self.prepare_fm(X)
        self.chrono.save('prepare data in sparse FM format')

        os.environ['LIBFM_PATH'] = 'XXX'  # If applicable
        # rlog would hold the per-epoch RMSE, which is not needed here.
        # No learning_method is given, so pywFM's default is used
        # (presumably MCMC - confirm against the installed pywFM).
        fm_options = dict(task='regression',
                          num_iter=self.nb_iterations,
                          k2=self.rank,
                          rlog=False)
        model = pywFM.FM(**fm_options).run(X_fm, y, X_fm, y)
        self.chrono.save('train FM')

        nb_agents = self.nb_users + self.nb_works + self.nb_tags
        # Rows libFM did not report, e.g. when X_fm ends with all-zero columns.
        missing = nb_agents - len(model.weights)

        if model.global_bias is not None:
            self.mu = model.global_bias
            self.W = np.pad(np.array(model.weights), (0, missing),
                            mode='constant')
            self.V = np.pad(model.pairwise_interactions,
                            [(0, missing), (0, 0)],
                            mode='constant')
        else:
            # Training failed (for example, the libfm binary does not exist).
            self.mu = 0
            self.W = np.random.random(nb_agents)
            self.V = np.random.random((nb_agents, self.rank))
        self.V2 = np.power(self.V, 2)
Example #5
0
def cross_validationMCMC(data, k_indices, k, num_iter, std_init):
    """
        Runs one fold of cross validation with libFM's default learner
        (no learning_method is passed to pywFM; the original author calls it MCMC).
        Row k of k_indices is the test fold, all other rows form the training fold.
        @param data : the DataFrame containing all our training data (on which we do the CV)
        @param k_indices : 2-D array whose rows are the index labels of each fold
        @param k : the index of the fold used as test set (NOT the number of folds)
        @param num_iter : the number of iterations of the algorithm
        @param std_init : the standard deviation for the initialisation of the data
        @return loss_te : the value of model.rlog.rmse at position num_iter - 1.

    """
    # get k'th subgroup in test, others in train
    te_indices = k_indices[k]
    tr_indices = k_indices[np.arange(k_indices.shape[0]) != k].reshape(-1)

    train = data.loc[tr_indices]
    # Sort into a new frame: sorting the .loc selection in place triggers
    # pandas' SettingWithCopyWarning and depends on copy/view semantics.
    test = data.loc[te_indices].sort_values(['Movie', 'User'],
                                            ascending=[True, True])

    # format the DataFrames into the Sparse matrices we need to run with pywFM
    features_tr, target_tr = df_to_sparse(train)
    features_te, target_te = df_to_sparse(test)

    # running the model
    fm = pywFM.FM(task='regression', num_iter=num_iter, init_stdev=std_init)

    model = fm.run(features_tr, target_tr, features_te, target_te)

    # getting the RMSE at the last run step.
    loss_te = model.rlog.rmse[num_iter - 1]

    return loss_te
def ALSBias_pywFM(train, test, num_iter=100, std_init = 0.43, rank = 7, r0_reg = 0.5, r1_reg = 15, r2_reg = 25):
    """
        Trains a libFM regression with the ALS learner and predicts the test set.
        N.B. The default parameter values are the best ones the authors found.

        @param train : the DataFrame containing all our training data.
        @param test : the DataFrame containing all our testing data.
        @param num_iter : the number of iterations of the algorithm
        @param std_init : the standard deviation used to initialise the factors (init_stdev)
        @param rank : the number of pairwise factors (k2)
        @param r0_reg : the regularization of the global bias term w0
        @param r1_reg : the regularization of the one-way (user/item bias) terms w
        @param r2_reg : the regularization of the pairwise factors
        @return np.array(pred) : the prediction values for all the data within the test set

    """
    # Collect the learner settings first so the constructor call stays short.
    als_settings = {
        'task': 'regression',
        'learning_method': 'als',
        'num_iter': num_iter,
        'init_stdev': std_init,
        'k2': rank,
        'r0_regularization': r0_reg,
        'r1_regularization': r1_reg,
        'r2_regularization': r2_reg,
    }
    fm = pywFM.FM(**als_settings)

    # Convert both frames to the sparse representation pywFM consumes.
    features_tr, target_tr = df_to_sparse(train)
    features_te, target_te = df_to_sparse(test)

    # Fit on train, predict on test, and hand the predictions back as an array.
    fitted = fm.run(features_tr, target_tr, features_te, target_te)
    return np.array(fitted.predictions)
def MCMC_pywFM(train, test, num_iter=100, std_init = 0.5):
    """
        Trains a libFM regression with pywFM's default learner and predicts the test set.
        (No learning_method is passed; the original author labels this the MCMC variant.)
        N.B. The default parameter values are the best ones the authors found.

        @param train : the DataFrame containing all our training data.
        @param test : the DataFrame containing all our testing data.
        @param num_iter : the number of iterations of the algorithm
        @param std_init : the standard deviation used to initialise the factors (init_stdev)

        @return np.array(pred) : the prediction values for all the data within the test set

    """
    fm = pywFM.FM(task='regression', num_iter=num_iter, init_stdev=std_init)

    # Convert both frames to the sparse representation pywFM consumes.
    features_tr, target_tr = df_to_sparse(train)
    features_te, target_te = df_to_sparse(test)

    # Fit on train, predict on test, and hand the predictions back as an array.
    fitted = fm.run(features_tr, target_tr, features_te, target_te)
    return np.array(fitted.predictions)
Example #8
0
def cross_validationMCMC(data, target, k_indices, k, num_iter, std_init):
    """
        Runs one fold of cross validation for a libFM classifier (pywFM default learner).
        Row k of k_indices is the test fold, all other rows form the training fold.
        @param data : the DataFrame containing all our training data (on which we do the CV)
        @param target : the labels, indexable by the entries of k_indices
        @param k_indices : 2-D array whose rows are the indices of each fold
        @param k : the index of the fold used as test set
        @param num_iter : the number of iterations of the algorithm
        @param std_init : the standard deviation for the initialisation of the data
        @return : eval_gini(predictions, test labels) for this fold.

    """
    # Row k is held out; every other row is flattened into the training set.
    held_out = k_indices[k]
    is_train_row = np.arange(k_indices.shape[0]) != k
    kept = k_indices[is_train_row].reshape(-1)

    features_fit = data.loc[kept]
    features_eval = data.loc[held_out]
    labels_fit = target[kept]
    labels_eval = target[held_out]

    # running the model
    fm = pywFM.FM(task='classification',
                  num_iter=num_iter,
                  init_stdev=std_init)
    fitted = fm.run(features_fit, labels_fit, features_eval, labels_eval)

    # Score the held-out predictions with the project's Gini metric.
    return eval_gini(fitted.predictions, labels_eval)
Example #9
0
def _new_libfm_classifier():
    """Build the libFM classifier configuration shared by every fit below."""
    return pywFM.FM(task='classification',
                    learning_method='mcmc',
                    num_iter=1000,
                    init_stdev=0.7,
                    k0=1,
                    k1=1,
                    k2=8,
                    verbose=10)


def make_mf_libfm(X, y, X_test, n_round=3):
    """Build a libFM meta-feature for the train set and for the test set.

    For each of ``n_round`` rounds, a 4-fold stratified split is drawn
    (seed ``42 + i * 1000``), a libFM classifier is fit on three folds and
    its predictions for the held-out fold are accumulated, so that every
    training row receives one out-of-fold prediction per round. A final
    model fit on all of ``X`` predicts ``X_test``.

    :param X: training features, indexable by an array of row indices.
    :param y: training labels, indexable the same way.
    :param X_test: test features.
    :param n_round: number of repeated cross-validation rounds.
    :return: ``(mf_tr / n_round, mf_te)`` - the round-averaged out-of-fold
        predictions for ``X`` and the predictions for ``X_test``.
    """
    mf_tr = np.zeros(X.shape[0])
    for i in range(n_round):
        # NOTE(review): this is the pre-0.18 scikit-learn StratifiedKFold
        # signature (labels in the constructor, object is iterable). It is
        # kept because the import lives outside this block.
        skf = StratifiedKFold(y,
                              n_folds=4,
                              shuffle=True,
                              random_state=42 + i * 1000)
        for ind_tr, ind_te in skf:
            X_tr = X[ind_tr]
            X_te = X[ind_te]

            y_tr = y[ind_tr]
            y_te = y[ind_te]

            model = _new_libfm_classifier().run(X_tr, y_tr, X_te, y_te)
            y_pred = model.predictions

            mf_tr[ind_te] += y_pred
            score = roc_auc_score(y_te, y_pred)
            print('pred[{}] score:{}'.format(i, score))

    # libFM requires test labels; zeros are placeholders that do not affect
    # the returned predictions.
    model = _new_libfm_classifier().run(X, y, X_test,
                                        np.zeros(X_test.shape[0]))
    mf_te = model.predictions

    return (mf_tr / n_round, mf_te)
Example #10
0
File: b.py Project: Sdoof/notebooks
def dofit_pywFM(num_iter=100, lr=0.1, k2=8, learning_method='sgda'):
    """Fit a libFM classifier on the first 1000 rows and return its outputs.

    :param num_iter: number of libFM iterations.
    :param lr: learning rate passed as ``learn_rate``.
    :param k2: number of pairwise factors.
    :param learning_method: libFM learning method name.
    :return: dict with keys ``predictions``, ``global_bias``, ``weights``,
        ``pairwise_interactions`` and ``rlog``, taken from ``fm.run``.
    """
    # NOTE(review): this injects whatever get_nn_data() returns into the
    # module globals; X_fit, y_fit, X_test, X_eval and y_eval used below are
    # expected to come from there.
    globals().update(get_nn_data())
    fm = pywFM.FM('classification', num_iter=num_iter, init_stdev=0.1, k0=True,
            k1=True, k2=k2, learning_method=learning_method, learn_rate=lr,
            r0_regularization=0, r1_regularization=0, r2_regularization=0,
            rlog=False, verbose=True, silent=False, temp_path=None)
    # Placeholder test labels. np.empty would hand libFM uninitialised
    # memory (arbitrary values, possibly NaN/inf); zeros are deterministic.
    y_fake_test = np.zeros(X_test.shape[0])
    predictions, global_bias, weights, pairwise_interactions, rlog = \
    fm.run(X_fit[:1000], y_fit[:1000], X_test[:1000], y_fake_test[:1000],
            X_eval[:1000], y_eval[:1000])
    # auc = sklearn.metrics.roc_auc_score(y_fit, predictions)
    return {'predictions': predictions, 'global_bias': global_bias, 'weights': weights, 'pairwise_interactions': pairwise_interactions, 'rlog': rlog}
Example #11
0
File: bow.py Project: umpot/quora
def _encode_question_overlap(vectorizer, frame):
    """Encode each question pair as +1 (word in both) / -1 (word in one)."""
    counts = (vectorizer.transform(frame[question1]) +
              vectorizer.transform(frame[question2]))
    # With binary counts the sum is 0, 1 or 2. Recode the 1s first: the
    # original order (2 -> 1, then 1 -> -1) turned every non-zero entry
    # into -1 and lost the shared/unshared distinction.
    counts[counts == 1] = -1
    counts[counts == 2] = 1
    return counts


def test_pywfm_on_bow():
    """Train libFM on a bag-of-words encoding of question pairs.

    Uses the first fold from ``create_folds``, oversamples it, fits a
    binary ``CountVectorizer`` on all training questions, trains a libFM
    classifier and prints the log-loss on the test part of the fold.
    Returns nothing.
    """
    df = load_train()

    folds = create_folds(df)

    train, test = folds[0]

    train, test = oversample(train, test, 42)

    questions = list(train[question1]) + list(train[question2])
    print('Creating Vectorizer...')
    # The corpus belongs in fit(), not in the constructor: the first
    # positional parameter of CountVectorizer is `input`, not the documents.
    c = CountVectorizer(binary=True, stop_words='english')
    print('Fitting Vectorizer...')
    c.fit(questions)

    train_arr = _encode_question_overlap(c, train)
    test_arr = _encode_question_overlap(c, test)

    train_target = train[TARGET]
    test_target = test[TARGET]

    fm = pywFM.FM(task='classification',
                  num_iter=100,
                  verbose=10,
                  r1_regularization=0.1,
                  learn_rate=0.1)

    res = fm.run(train_arr, train_target, test_arr, test_target)
    prob = res.predictions
    prob_0 = [1 - x for x in prob]
    # return res

    # Transpose to get one [P(0), P(1)] row per sample. Reshaping the
    # (2, n) array to (n, 2) would interleave values from different samples.
    proba = np.array([prob_0, prob]).T

    loss = log_loss(test[TARGET], proba)
    print(loss)
def work(enu,us):
    """Score one customer's test rows with libFM and update their CSV.

    Selects the rows of ``data_test_FM`` for customer ``us``, joins the
    user and item feature tables, and - when any row remains - trains an
    ALS libFM regression on the module-level ``X`` / ``y`` and predicts
    those rows. The predictions are written to the ``FM_PRECISION`` column
    of ``<d>/FM/local_test_<us>.csv``, which is rewritten with only the
    ARTICLE_ID, FM_PRECISION and FM_PRECISION_tf columns. ``enu`` is
    printed at the end in every case.
    """
    rows = data_test_FM[data_test_FM.CUST_ID==us]
    rows = pd.merge(rows, user_FM, on='CUST_ID')
    rows = pd.merge(rows, item_FM, on='ARTICLE_ID')
    if len(rows) > 0 :
        feature_frame = rows.drop(columns=['CUST_ID','ARTICLE_ID'])
        X_test = sparse.csr_matrix(feature_frame.to_numpy())

        os.environ['LIBFM_PATH']='/home/slide/bouaroun/libfm/bin/'
        fm = pywFM.FM(task='regression', num_iter=150, learning_method='als', learn_rate=0.05, r2_regularization=0.001)
        # libFM needs test labels; random ratings in 1..5 act as placeholders.
        placeholder_y = np.array([random.randint(1,5) for i in range(len(rows))])
        model = fm.run(X, y, X_test, placeholder_y)

        # The per-customer file is read, extended and written back in place.
        csv_path = d+'/FM/local_test_{}.csv'.format(us)
        scored = pd.read_csv(csv_path)
        scored['FM_PRECISION'] = model.predictions
        scored = scored[['ARTICLE_ID','FM_PRECISION','FM_PRECISION_tf']]
        scored.to_csv(csv_path, index=False)
    print(enu)
Example #13
0
def cross_validationALSBias(data, k_indices, k, num_iter, std_init, rank,
                            r0_reg, r1_reg, r2_reg):
    """
        Runs one fold of cross validation using libFM's ALS learner.
        Row k of k_indices is the test fold, all other rows form the training fold; the model is
        trained with the parameters (std_init, rank, r0_reg, r1_reg, r2_reg) for num_iter iterations.
        @param data : the DataFrame containing all our training data (on which we do the CV)
        @param k_indices : 2-D array whose rows are the index labels of each fold
        @param k : the index of the fold used as test set (NOT the number of folds)
        @param num_iter : the number of iterations of the algorithm
        @param std_init : the standard deviation for the initialisation of W and Z
        @param rank : the number of columns of W and Z
        @param r0_reg : the regularization parameter for the global bias term w0
        @param r1_reg : the regularization parameter of the user/item bias term w
        @param r2_reg : the regularization parameter for the ALS regularization (size of the entries of W and Z)
        @return loss_te : the value of model.rlog.rmse at position num_iter - 1.

    """
    # get k'th subgroup in test, others in train
    te_indices = k_indices[k]
    tr_indices = k_indices[np.arange(k_indices.shape[0]) != k].reshape(-1)

    train = data.loc[tr_indices]
    # Sort into a new frame: sorting the .loc selection in place triggers
    # pandas' SettingWithCopyWarning and depends on copy/view semantics.
    test = data.loc[te_indices].sort_values(['Movie', 'User'],
                                            ascending=[True, True])

    # format the DataFrames into the Sparse matrices we need to run with pywFM
    features_tr, target_tr = df_to_sparse(train)
    features_te, target_te = df_to_sparse(test)

    # running the model
    fm = pywFM.FM(task='regression',
                  learning_method='als',
                  num_iter=num_iter,
                  init_stdev=std_init,
                  k2=rank,
                  r0_regularization=r0_reg,
                  r1_regularization=r1_reg,
                  r2_regularization=r2_reg)

    model = fm.run(features_tr, target_tr, features_te, target_te)

    # getting the RMSE at the last run step.
    loss_te = model.rlog.rmse[num_iter - 1]

    return loss_te
Example #14
0
def demo_libfm():
    """Fit a libFM classifier on the module-level split and print its outputs.

    Uses ``X_tr``, ``y_tr``, ``X_te`` and ``y_te`` from the enclosing
    module. The libFM binaries are located through the ``LIBFM_PATH``
    environment variable, e.g.::

        export PYTHONPATH=~/ai_group/zhihu2019
        export LIBFM_PATH=/root/ai_group/libfm/bin/

    Prints the predictions (with their type), the one-way weights and the
    pairwise interaction factors. Returns nothing.
    """
    settings = {
        'task': 'classification',
        'learning_method': 'mcmc',
        'num_iter': 100,
        'init_stdev': 0.7,
        'k0': 1,
        'k1': 1,
        'k2': 16,
        'verbose': 10,
    }
    fitted = pywFM.FM(**settings).run(X_tr, y_tr, X_te, y_te)

    predictions = fitted.predictions
    print(predictions, type(predictions))
    # The learned parameters are available on the returned model as well.
    print(fitted.weights)
    print(fitted.pairwise_interactions)
Example #15
0
def build_fm_interaction():
    """Train a libFM classifier from on-disk files and record its metrics.

    Reads the test labels from ``test_Y_file``, runs pywFM (MCMC, 100
    iterations) on ``train_X_file`` / ``test_X_file``, saves the predicted
    probabilities to ``<constants.project_path>result/10_9_fm_pred`` and
    prints and appends accuracy, log-loss, AUC and elapsed time to
    ``<project_path>result/oct_result``.

    Relies on the module-level names ``project_path``, ``constants``,
    ``train_X_file``, ``test_X_file`` and ``test_Y_file``. Returns nothing.
    """
    begin = datetime.datetime.now()
    # Close the label file deterministically instead of leaking the handle.
    with open(test_Y_file) as label_file:
        test_y = np.loadtxt(label_file, dtype=int)
    fm = pywFM.FM(task='classification',
                  num_iter=100,
                  learning_method='mcmc',
                  temp_path=project_path + "model\\m_fm\\tmp\\")

    # The four positional data arguments are None because the data is
    # supplied through train_path / test_path instead.
    model = fm.run(None,
                   None,
                   None,
                   None,
                   train_path=train_X_file,
                   test_path=test_X_file,
                   model_path=project_path +
                   "model\\m_fm\\model_file\\fm_model",
                   out_path=project_path + "model\\m_fm\\model_file\\fm.out")
    end = datetime.datetime.now()

    print(model.pairwise_interactions.shape)
    y_pred = model.predictions
    auc_test = metrics.roc_auc_score(test_y, y_pred)
    # accuracy_score needs hard labels; passing probabilities raises
    # "Classification metrics can't handle a mix of binary and continuous
    # targets", so threshold at 0.5 first.
    y_label = (np.asarray(y_pred) >= 0.5).astype(int)
    accuracy = metrics.accuracy_score(test_y, y_label)
    logloss = metrics.log_loss(test_y, y_pred)
    with open(constants.project_path + "result/10_9_fm_pred", "w") as pred_file:
        np.savetxt(pred_file, y_pred, fmt='%.5f')

    rcd = str(end) + '\n'
    rcd += "fm: new basic" + '\n'
    rcd += "accuracy: " + str(accuracy) + '\n'
    rcd += "logloss: " + str(logloss) + '\n'
    rcd += "auc_test: " + str(auc_test) + '\n'
    rcd += "time: " + str(end - begin) + '\n' + '\n'
    print(rcd)

    with open(project_path + "result/oct_result", "a") as log_file:
        log_file.write(rcd)

    print(model.pairwise_interactions.shape)
import pandas as pd
import csv
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import mean_squared_error
from data_loader import loadData
from data_loader import loadTest

# Train a libFM regression on the one-hot encoded interaction records and
# write one prediction per test record to a submission CSV.
(train, y_train), (valid, y_valid) = loadData("../item_recom/train_info.tsv", 1.1)

dictv = DictVectorizer()

test = loadTest("../item_recom/test_info.tsv")

print("convert to one-hot represerntation")
# Learn the feature vocabulary from train and test together so both are
# encoded with the same columns; the transformed matrix is not needed here.
dictv.fit(train + test)
X_train = dictv.transform(train)
# X_valid = dictv.transform(valid)
X_test = dictv.transform(test)
# libFM requires test labels; a constant 2.5 is used as a placeholder.
y_test = np.ones(len(test)) * 2.5

fm = pywFM.FM(task='regression', num_iter=1200, k2=48, learning_method='mcmc')

model = fm.run(X_train, y_train, X_test, y_test)
# print("FM RMSE: %.6f" % math.sqrt(mean_squared_error(y_valid, model.predictions)))
#
with open('../submissions/sixteenth.csv', 'w') as csvfile:
    fieldnames = ['uid#iid', 'pred']
    writer = csv.DictWriter(csvfile, fieldnames)
    writer.writeheader()
    # Predictions come back in test order, so pair them up directly
    # (xrange does not exist on Python 3).
    for record, pred in zip(test, model.predictions):
        writer.writerow({
            'uid#iid': "%s#%s" % (record["1_user_id"], record["2_item_id"]),
            'pred': "%f" % pred,
        })
            method=params["scale_features_method"])

        # das3h features with lr
        lr = LogisticRegression(max_iter=1000, solver="liblinear")
        lr.fit(X_train_, y_train_das3h, sample_weight=sample_weight)
        # metrics test
        y_test_pred_probas_das3h_lr = lr.predict_proba(X_test_)[:, 1]
        logs[f"fold{i}"]["das3h_lr"] = compute_metrics(
            y_test_das3h, y_test_pred_probas_das3h_lr)
        # metrics train
        y_train_pred_probas_das3h_lr = lr.predict_proba(X_train_)[:, 1]
        logs[f"fold{i}"]["das3h_lr_train"] = compute_metrics(
            y_train_das3h, y_train_pred_probas_das3h_lr)

        # das3h
        fm = pywFM.FM(**params_fm)
        model = fm.run(X_train_, y_train_das3h, X_test_, y_test_das3h)
        y_test_pred_probas_das3h = np.array(model.predictions)
        logs[f"fold{i}"]["das3h"] = compute_metrics(y_test_das3h,
                                                    y_test_pred_probas_das3h)

        # item-avg
        item_avg_train = item_avg_predictor(task_sessions_train)
        # metrics test
        y_test_pred_item_avg_probas = [
            item_avg_train(item) for item in task_sessions_test["task"]
        ]
        logs[f"fold{i}"]["item_avg"] = compute_metrics(
            task_sessions_test["solved"], y_test_pred_item_avg_probas)
        # metrics train
        y_train_pred_item_avg_probas = [
Example #18
0
import pywFM
import numpy as np
import pandas as pd

# Toy movie-rating dataset: one row per (user, movie) rating event.
features = np.matrix([
    # Users   | Movies      | Movie Ratings       | Time | Last Movies Rated
    # A  B  C | TI NH SW ST | TI   NH   SW   ST   |      | TI NH SW ST
    [1, 0, 0,   1, 0, 0, 0,   0.3, 0.3, 0.3, 0,     13,    0, 0, 0, 0],
    [1, 0, 0,   0, 1, 0, 0,   0.3, 0.3, 0.3, 0,     14,    1, 0, 0, 0],
    [1, 0, 0,   0, 0, 1, 0,   0.3, 0.3, 0.3, 0,     16,    0, 1, 0, 0],
    [0, 1, 0,   0, 0, 1, 0,   0,   0,   0.5, 0.5,   5,     0, 0, 0, 0],
    [0, 1, 0,   0, 0, 0, 1,   0,   0,   0.5, 0.5,   8,     0, 0, 1, 0],
    [0, 0, 1,   1, 0, 0, 0,   0.5, 0,   0.5, 0,     9,     0, 0, 0, 0],
    [0, 0, 1,   0, 0, 1, 0,   0.5, 0,   0.5, 0,     12,    1, 0, 0, 0],
])
target = [5, 3, 1, 4, 5, 1, 5]

fm = pywFM.FM(task='regression', num_iter=5)

# The first five rows train the model; the remaining two are held out.
n_train = 5
model = fm.run(features[:n_train], target[:n_train],
               features[n_train:], target[n_train:])
print(model.predictions)
# The learned one-way weights are exposed on the model too.
print(model.weights)
Example #19
0
def DAS3H(a, active, tw, isKfold, model_params):
	"""Train and evaluate a DAS3H-family model and save the metrics as JSON.

	A logistic regression is used when model_params['dim'] == 0, otherwise a
	libFM classifier (MCMC) with 'dim' pairwise factors. The data is split by
	matching column 0 of the feature matrix against the train/test lists in
	the split information; column 3 is used as the label.

	Parameters
	----------
	a : project object providing dataprocessor.loadLCData, loadDAS3HData,
		loadSplitInfo, LCDataDir and LC_params.
	active : iterable of feature-group names (e.g. 'users', 'items',
		'skills', 'wins', 'fails', 'attempts'); also selects the result prefix.
	tw : time-window mode; 'tw_kc' and 'tw_items' are the values tested here.
	isKfold : when truthy, evaluate every fold 0..model_params['kFold']-1;
		otherwise evaluate the single split stored under key '0'.
	model_params : dict read for the keys 'dim', 'iter' and (k-fold only)
		'kFold'; it is also passed to getLegend for the output file names.

	Returns
	-------
	dict with keys 'LC_params', 'model_params' and 'results'. In k-fold mode
	'results' maps run_id -> {metric name: value}; otherwise it maps metric
	name -> value directly.
	"""

	dim = model_params['dim']

	# FM parameters
	FM_params = {
		'task': 'classification',
		'num_iter': model_params['iter'],
		'rlog': True,
		'learning_method': 'mcmc',
		'k2': dim
	}

	print(active)
	print(tw)
	# Name the configuration after the known model it corresponds to; any
	# other combination gets the initials of the active groups plus 't1'
	# (tw_kc) or 't2' (anything else).
	prefix = ''
	if set(active) == {'users', 'items'} and dim == 0:
		prefix = 'IRT'
	elif set(active) == {'users', 'items'} and dim > 0:
		prefix = 'MIRTb'
	elif set(active) == {'skills', 'attempts'}:
		prefix = 'AFM'
	elif set(active) == {'skills', 'wins', 'fails'}:
		prefix = 'PFA'
	elif set(active) == {'items', 'skills', 'wins', 'fails'}:
		prefix = 'KTM'
	elif set(active) == {'users', 'items', 'skills', 'wins', 'attempts'} and ( tw == 'tw_kc'):
		prefix = 'DAS3H'
	elif set(active) == {'users', 'items', 'wins', 'attempts'} and ( tw == 'tw_items'):
		prefix = 'DASH'
	else:
		for f in active:
			prefix += f[0]
		if tw == 'tw_kc':
			prefix += 't1'
		else:
			prefix += 't2'
	print(prefix)


	# NOTE(review): none of the four values unpacked here is used below.
	[df, QMatrix, StaticInformation, DictList] = a.dataprocessor.loadLCData()
	# NOTE(review): features_suffix is not defined in this function; it must
	# exist at module level - confirm.
	X, dict_data = a.loadDAS3HData(active, features_suffix, 0.8, tw=tw)
	# NOTE(review): this y is never read; each branch recomputes y_train/y_test.
	y = X[:,3].toarray().flatten()

	# str(isKfold)[0] is 'T' or 'F' when isKfold is a bool.
	saveDir = os.path.join(a.LCDataDir, 'das3h', 'results_K'+str(isKfold)[0], prefix)
	prepareFolder(saveDir)

	# Metrics computed on the raw predicted probabilities.
	metrics1 = {'MAE':metrics.mean_absolute_error,
	'MSE':metrics.mean_squared_error,
	'AUC':metrics.roc_auc_score,
	}

	# Metrics computed on predictions thresholded with "> 0.5".
	metrics2 = {'Accuracy':metrics.accuracy_score,
	'Precision':metrics.precision_score,
	'AP':metrics.average_precision_score,
	'Recall':metrics.recall_score,
	'F1-score':metrics.f1_score,
	}

	# Keras metric fed with predictions thresholded with ">= 0.5"
	# (note: not the same comparison as metrics2).
	metrics_tf1 = {'tf_Accuracy':tf.keras.metrics.Accuracy(),
	}

	# Keras metrics fed with the raw predicted probabilities.
	metrics_tf2 = {'tf_Precision':tf.keras.metrics.Precision(thresholds = 0.5),
	'tf_Recall':tf.keras.metrics.Recall(thresholds = 0.5),
	'tf_MSE':tf.keras.metrics.MeanSquaredError(),
	'tf_MAE':tf.keras.metrics.MeanAbsoluteError(),
	'tf_RMSE':tf.keras.metrics.RootMeanSquaredError(),
	'tf_AUC':tf.keras.metrics.AUC(),
	'tf_AUC_1000': tf.keras.metrics.AUC(num_thresholds=1000)
	}


	results={'LC_params':a.LC_params,'model_params':model_params,'results':{}}

	if isKfold:
		# Create one output folder per fold. The split information is reloaded
		# on every iteration and replaces the dict_data returned by
		# loadDAS3HData above.
		for run_id in range(model_params['kFold']):
			prepareFolder(os.path.join(saveDir, str(run_id)))
			dict_data = a.loadSplitInfo(model_params['kFold'])

		for run_id in range(model_params['kFold']):
			users_train = dict_data[str(run_id)]['train']
			users_test = dict_data[str(run_id)]['test']

			# Select rows whose column 0 is in the fold's train / test list.
			X_train = X[np.where(np.isin(X[:,0].toarray().flatten(),users_train))]
			y_train = X_train[:,3].toarray().flatten()
			X_test = X[np.where(np.isin(X[:,0].toarray().flatten(),users_test))]
			y_test = X_test[:,3].toarray().flatten()

			if model_params['dim'] == 0:
				print('fitting...')
				# NOTE(review): max_iter is hard-coded to 400 here but uses
				# model_params['iter'] in the non-k-fold branch - confirm.
				model = LogisticRegression(solver="newton-cg", max_iter=400)
				model.fit(X_train[:,5:], y_train) # the 5 first columns are the non-sparse dataset
				y_pred_test = model.predict_proba(X_test[:,5:])[:, 1]
			else:
				fm = pywFM.FM(**FM_params)
				model = fm.run(X_train[:,5:], y_train, X_test[:,5:], y_test)
				y_pred_test = np.array(model.predictions)
				model.rlog.to_csv(os.path.join(saveDir, str(run_id), 'rlog.csv'))

			results['results'][run_id] = {}
			temp = results['results'][run_id]
			for metric in metrics1:
				temp[metric] = metrics1[metric](y_test, y_pred_test)

			for metric in metrics2:
				temp[metric] = metrics2[metric](y_test, (y_pred_test>0.5).astype(int))

			# The Keras metric objects are shared across folds, so their state
			# is reset before each use.
			for metric in metrics_tf1:
				m = metrics_tf1[metric]
				m.reset_states()
				m.update_state(y_test, tf.greater_equal(y_pred_test,0.5))
				temp[metric] = m.result().numpy()

			for metric in metrics_tf2:
				m = metrics_tf2[metric]
				m.reset_states()
				m.update_state(y_test, y_pred_test)
				temp[metric] = m.result().numpy()
	else:
		# Single split: use the train/test lists stored under key '0'.
		X_train = X[np.where(np.isin(X[:,0].toarray().flatten(),dict_data['0']['train']))]
		y_train = X_train[:,3].toarray().flatten()
		X_test = X[np.where(np.isin(X[:,0].toarray().flatten(),dict_data['0']['test']))]
		y_test = X_test[:,3].toarray().flatten()

		# NOTE(review): this branch drops only the first 4 columns ([:,4:])
		# while the k-fold branch drops 5 ([:,5:]) and the inline comment
		# below still says 5 - confirm which offset is intended.
		if model_params['dim'] == 0:
			print('fitting...')
			model = LogisticRegression(solver="newton-cg", max_iter=model_params['iter'])
			model.fit(X_train[:,4:], y_train) # the 5 first columns are the non-sparse dataset
			y_pred_test = model.predict_proba(X_test[:,4:])[:, 1]
		else:
			fm = pywFM.FM(**FM_params)
			model = fm.run(X_train[:,4:], y_train, X_test[:,4:], y_test)
			y_pred_test = np.array(model.predictions)
			model.rlog.to_csv(os.path.join(saveDir, 'rlog'+getLegend(model_params)+'.csv'))

		# Metrics are stored directly under 'results' (no run_id level).
		temp = results['results']
		for metric in metrics1:
			temp[metric] = metrics1[metric](y_test, y_pred_test)

		for metric in metrics2:
			temp[metric] = metrics2[metric](y_test, (y_pred_test>0.5).astype(int))

		for metric in metrics_tf1:
			m = metrics_tf1[metric]
			m.reset_states()
			m.update_state(y_test, tf.greater_equal(y_pred_test,0.5))
			temp[metric] = m.result().numpy()

		for metric in metrics_tf2:
			m = metrics_tf2[metric]
			m.reset_states()
			m.update_state(y_test, y_pred_test)
			temp[metric] = m.result().numpy()
	saveDict(results, saveDir, 'results'+getLegend(model_params)+'.json')
	return results
Example #20
0
# A simple FM example to test the FM library and show the expected data
# structure (a toy dataset is used because this project's own data has too
# many features to show)
# Toy movie-rating dataset: one row per (user, movie) rating event.
features = np.matrix([
    # Users   | Movies      | Movie Ratings       | Time | Last Movies Rated
    # A  B  C | TI NH SW ST | TI   NH   SW   ST   |      | TI NH SW ST
    [1, 0, 0,   1, 0, 0, 0,   0.3, 0.3, 0.3, 0,     13,    0, 0, 0, 0],
    [1, 0, 0,   0, 1, 0, 0,   0.3, 0.3, 0.3, 0,     14,    1, 0, 0, 0],
    [1, 0, 0,   0, 0, 1, 0,   0.3, 0.3, 0.3, 0,     16,    0, 1, 0, 0],
    [0, 1, 0,   0, 0, 1, 0,   0,   0,   0.5, 0.5,   5,     0, 0, 0, 0],
    [0, 1, 0,   0, 0, 0, 1,   0,   0,   0.5, 0.5,   8,     0, 0, 1, 0],
    [0, 0, 1,   1, 0, 0, 0,   0.5, 0,   0.5, 0,     9,     0, 0, 0, 0],
    [0, 0, 1,   0, 0, 1, 0,   0.5, 0,   0.5, 0,     12,    1, 0, 0, 0],
])
target = [5, 3, 1, 4, 5, 1, 5]

fm = pywFM.FM(task='regression', num_iter=5)

# The first five rows train the model; the remaining two are held out.
n_train = 5
model = fm.run(features[:n_train], target[:n_train],
               features[n_train:], target[n_train:])
print(model.predictions)
# The learned one-way weights are exposed on the model too.
print(model.weights)

# Data preprocessing

# A small data test
#n=20000
#sample = pd.read_csv('train.csv',iterator=True)
#sample = sample.get_chunk(n)
# All data
Example #21
0
import pywFM
import numpy as np
import pandas as pd
import os

# pywFM locates the libFM executables through this environment variable.
os.environ['LIBFM_PATH'] = '/Users/jilljenn/code/libfm/bin/'

# Toy movie-rating dataset: one row per (user, movie) rating event.
features = np.matrix([
    # Users  |     Movies     |    Movie Ratings   | Time | Last Movies Rated
    # A  B  C | TI  NH  SW  ST | TI   NH   SW   ST  |      | TI  NH  SW  ST
    [1, 0, 0, 1, 0, 0, 0, 0.3, 0.3, 0.3, 0, 13, 0, 0, 0, 0],
    [1, 0, 0, 0, 1, 0, 0, 0.3, 0.3, 0.3, 0, 14, 1, 0, 0, 0],
    [1, 0, 0, 0, 0, 1, 0, 0.3, 0.3, 0.3, 0, 16, 0, 1, 0, 0],
    [0, 1, 0, 0, 0, 1, 0, 0, 0, 0.5, 0.5, 5, 0, 0, 0, 0],
    [0, 1, 0, 0, 0, 0, 1, 0, 0, 0.5, 0.5, 8, 0, 0, 1, 0],
    [0, 0, 1, 1, 0, 0, 0, 0.5, 0, 0.5, 0, 9, 0, 0, 0, 0],
    [0, 0, 1, 0, 0, 1, 0, 0.5, 0, 0.5, 0, 12, 1, 0, 0, 0],
])
target = [0, 1, 1, 0, 0, 0, 1]

fm = pywFM.FM(task='classification', num_iter=50, rlog=False)

# The first five rows train the model; the remaining two are held out.
n_train = 5
model = fm.run(features[:n_train], target[:n_train],
               features[n_train:], target[n_train:])
print(model.predictions)
# The learned one-way weights are exposed on the model too.
print(model.weights)
Example #22
0
                  x4=train_ens["exf"],
                  x5=train_ens["knn"],
                  y=np.ravel(train_y_ens))
    with pm.Model() as model:
        pm.glm.glm('y ~ 0 + x1 + x2 + x3 + x4 + x5',
                   data_1,
                   family=pm.glm.families.Binomial())
        start = pm.find_MAP()
        step = pm.Metropolis()
        trace_m = pm.sample(2000, step, start=start, progressbar=True)
    predicted_test[:, i] = np.median(
        trace_m.x1) * test_fin_data["rf"] + np.median(
            trace_m.x2) * test_fin_data["gbm"] + np.median(
                trace_m.x3
            ) * test_fin_data["sgd"] + test_fin_data["exf"] * np.median(
                trace_m.x4) + test_fin_data["knn"] * np.median(trace_m.x5)

# Average the per-column test predictions into one final prediction per row.
predicted_test_fin = predicted_test.mean(axis=1)

# NOTE(review): the score is computed but neither stored nor printed, and it
# uses predicted_test2 rather than the predicted_test_fin computed just
# above - confirm which variable was intended.
roc_auc_score(test_y, predicted_test2)

################ Factorization Machines ####################

import pywFM as fm

# Only the task is set, so every other libFM option keeps pywFM's default.
fm_logit = fm.FM(task="classification")

# NOTE(review): the model returned by run() is discarded here, so its
# predictions and weights are not available afterwards.
fm_logit.run(train_data, np.ravel(train_y), test_data, np.ravel(test_y))

# Drop the column selected by [[0]] from both frames, in place.
# presumably this removes a leftover index column - verify against the loader.
train_fin_data.drop(train_fin_data[[0]], axis=1, inplace=True)
train_y.drop(train_y[[0]], axis=1, inplace=True)
# Encode the data, train a libFM classifier with a validation split and
# write the test predictions to a timestamped, tab-separated submission file.
train_x, train_y, test_x = ont_hot(data)
print('##########################')
print(train_x.shape)
print(train_y.shape)
print(test_x.shape)

# train = data[data['FLAG'] >= 0]
#
# test = data[data['FLAG'] < 0]
# train_y = train['FLAG'].values

# fm = FM(num_factors=10, num_iter=300, verbose=True, task='classification', initial_learning_rate=0.01, learning_rate_schedule="optimal")
# fm.fit(train_x, train_y)
# y_pred = fm.predict(test_x)
train_x, valid_x, train_y, valid_y = train_test_split(train_x, train_y, test_size=0.2, random_state=2018)
fm = pywFM.FM(task='classification', num_iter=100, k2=10, verbose=True)
# libFM requires test labels; ones are placeholders for the unlabeled test set.
test_y = np.ones(test_x.shape[0])
# run() returns the fitted model; the predictions live on that object, not
# on the FM wrapper (the original read fm.predictions, which does not exist).
model = fm.run(train_x, train_y, test_x, test_y, valid_x, valid_y)
y_pred = model.predictions
# print(roc_auc_score(valid_y, ))

sub = pd.DataFrame()
sub['USRID'] = test_x['USRID']
sub['target'] = y_pred

sub.to_csv('./submit/%s.csv'%str(datetime.now().strftime("%Y-%m-%d_%H-%M-%S")),index=None,sep='\t')

# print('train set:', roc_auc_score(train_y, fm.predict(train_x)))

# help(pylibfm.FM)
Example #24
0

# Encode both frames to sparse .npz files on disk.
df_to_sparse(df_train, 'X_train.npz')
print('Train done')
df_to_sparse(df_test, 'X_test.npz')
print('Test done')

# Reload the encoded matrices and report their shapes.
X_train, X_test = load_npz('X_train.npz'), load_npz('X_test.npz')
print(X_train.shape)
print(X_test.shape)

fm_settings = {
    'task': 'regression',
    'num_iter': 500,
    'k2': 20,
    'rlog': False,
    'learning_method': 'mcmc',
    'r1_regularization': 0.1,
    'r2_regularization': 0.1,
}
fm = pywFM.FM(**fm_settings)
model = fm.run(X_train, df_train['outcome'], X_test, df_test['outcome'])

# Root of the mean squared error on the test outcomes.
print(mean_squared_error(df_test['outcome'], model.predictions) ** 0.5)
# Show the first test row next to its true outcome and its prediction.
print(X_test[0], df_test['outcome'][0], model.predictions[0])

# Persist the learned parameters: global bias, one-way weights and
# pairwise interaction factors.
bundle = dict(mu=model.global_bias,
              W=model.weights,
              V=model.pairwise_interactions)
with open('fm.pickle', 'wb') as f:
    pickle.dump(bundle, f, pickle.HIGHEST_PROTOCOL)
Example #25
0
    X_fm = hstack([X[agent] for agent in active_agents]).tocsr()
    save_npz(SPARSE_NPZ, X_fm)
    return X_fm


# Encode the train and test frames; the 'outcome' column is the label.
X_train = df_to_sparse(df_train, 'X_train.npz')
y_train = df_train['outcome']
print('Encoding train done')
X_test = df_to_sparse(df_test, 'X_test.npz')
y_test = df_test['outcome']
print('Encoding test done')

# libFM classifier trained with MCMC; the pairwise dimension k2 is only
# passed when a positive embedding size was requested on the command line.
params = dict(task='classification',
              num_iter=options.iter,
              rlog=True,
              learning_method='mcmc')
if options.d > 0:
    params.update(k2=options.d)

fm = pywFM.FM(**params)
model = fm.run(X_train, y_train, X_test, y_test)

# Accuracy uses rounded probabilities; AUC and NLL use them as they are.
ACC = accuracy_score(y_test, np.round(model.predictions))
AUC = roc_auc_score(y_test, model.predictions)
NLL = log_loss(y_test, model.predictions)
for metric_name, metric_value in (('accuracy', ACC), ('AUC', AUC), ('NLL', NLL)):
    print(metric_name, metric_value)
Example #26
0
# NOTE(review): fit_transform re-fits the PCA on the test data, so the test
# components may not share a basis with train_pca (computed outside this
# view) - confirm whether pca.transform(Test) was intended.
test_pca = pca.fit_transform(Test)
print('Explained variance: %.4f' % pca.explained_variance_ratio_.sum())

# Wrap the components in DataFrames that share the row index of the
# original frames, then append them as extra columns.
train_pca = pd.DataFrame(train_pca)
train_pca.index = train.index
test_pca = pd.DataFrame(test_pca)
test_pca.index = test.index

train = pd.concat([train, train_pca], axis=1)
test = pd.concat([test, test_pca], axis=1)

# pywfm
fm_settings = {
    'task': 'classification',
    'num_iter': 1000,
    'init_stdev': 0.1,
    'k2': 5,
    'learning_method': 'mcmc',
    'verbose': False,
    'silent': False,
}
clf = pywFM.FM(**fm_settings)

# Flatten the labels to a 1-D array.
y = np.asarray(y)
y.shape = (len(y), )

# libFM requires test labels; zeros are placeholders for the unlabeled set.
y1 = np.zeros(len(testid))

model = clf.run(x_train=train, y_train=y, x_test=test, y_test=y1)

# Submission frame: one id column plus the predicted target.
sub = pd.DataFrame()
sub['id'] = testid
sub['target'] = model.predictions