Example #1
def pyFM_cv(verbose=True, t=Timer()):  # note: the default Timer() is created once, at definition time
    # pyFM hyperparameter grid
    factors = np.linspace(20, 200, 10, dtype=np.int64)
    learning_rates = np.logspace(-3, -3, 1)  # a single value, 1e-3
    params = dict()
    rmses = dict()

    for k in factors:
        params['k'] = k
        for rate in learning_rates:
            params['rate'] = rate
            algo = pylibfm.FM(num_factors=k,
                              num_iter=200,
                              verbose=verbose,
                              task="regression",
                              initial_learning_rate=rate,
                              learning_rate_schedule="optimal")
            rmse = pyFM_cv_algo(algo)
            print(
                "------Time:{}, rmse: {}, factors: {}, learning_rates: {}------\n\n"
                .format(t.now(), rmse, k, rate))
            rmses[rmse] = dict(params)  # copy: the params dict is mutated on each iteration

    # Find the model with least RMSE
    lowest_rmse = min(rmses.keys())
    best_params = rmses[lowest_rmse]

    print("Best pyFM rmse: {}. Params: factors: {}, learning_rates: {}".format(
        lowest_rmse, best_params['k'], best_params['rate']))
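pyFM_cv_algo is not defined in this snippet; a minimal sketch of what it might look like, assuming a train/validation split already one-hot encoded with DictVectorizer (X_train, y_train, X_valid, y_valid are hypothetical globals):

from math import sqrt
from sklearn.metrics import mean_squared_error

def pyFM_cv_algo(algo):
    # Hypothetical helper: fit the FM on the training fold and
    # return the RMSE on the held-out fold.
    algo.fit(X_train, y_train)
    preds = algo.predict(X_valid)
    return sqrt(mean_squared_error(y_valid, preds))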
Example #2
def pyFMJob(data_path,
            params,
            N,
            vectorizer,
            with_timestamps=False,
            with_authors=False):
    rmses = []
    logging.info("Evaluating with params: {0}".format(params))
    for i in range(1, 4 + 1):  # four train/validation folds
        train_data, y_tr, _ = loadData('train/train_N' + str(N) + '.' + str(i),
                                       data_path=data_path,
                                       with_timestamps=with_timestamps,
                                       with_authors=with_authors)
        X_tr = vectorizer.transform(train_data)
        fm = pylibfm.FM(num_factors=params['f'],
                        num_iter=params['mi'],
                        k0=params['bias'],
                        k1=params['oneway'],
                        init_stdev=params['init_stdev'],
                        validation_size=params['val_size'],
                        learning_rate_schedule=params['lr_s'],
                        initial_learning_rate=params['lr'],
                        power_t=params['invscale_pow'],
                        t0=params['optimal_denom'],
                        shuffle_training=params['shuffle'],
                        seed=params['seed'],
                        task='regression',
                        verbose=True)
        fm.fit(X_tr, y_tr)
        val_data, y_va, _ = loadData('val/val_N' + str(N) + '.' + str(i),
                                     data_path=data_path,
                                     with_timestamps=with_timestamps,
                                     with_authors=with_authors)
        X_va = vectorizer.transform(val_data)
        preds = fm.predict(X_va)
        rmse = sqrt(mean_squared_error(y_va, preds))
        print("FM RMSE: %.4f" % rmse)
        rmses.append(rmse)
    return mean(rmses)
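pyFMJob expects a fully populated params dict; the keys correspond to the defaults dict used by pyFM_tuning in Example #29 below:

params = {'f': 100, 'mi': 20, 'bias': True, 'oneway': True,
          'init_stdev': 0.1, 'val_size': 0.01, 'lr_s': 'optimal', 'lr': 0.01,
          'invscale_pow': 0.5, 'optimal_denom': 0.001, 'shuffle': True,
          'seed': 28}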
Example #3
def pyfm_predict(train_actual, predict):
    """
    Matrix Factorization using SGD with pyFM library.
    Compute the predictions on a test_set after training on a train_set using the method Svd++  Matrix Factorization using SGD with pyFM library
    Args:
        train_actual (pandas.DataFrame): train set
        predict (pandas.DataFrame): test set
    Hyperparameters:
        num_factors : The number of factors.
        num_iter : The number of iteration of the SGD procedure
        initial_learning_rate:

    Returns:
        numpy array: predictions
    """
    print("pyfm")
    predict_data, y_predict = create_input_pyfm(predict)
    train_actual_data, y_train_actual = create_input_pyfm(train_actual)
    v = DictVectorizer()
    X_train = v.fit_transform(train_actual_data)
    X_test = v.transform(predict_data)
    # Hyperparameters
    num_factors = 20
    num_iter = 200
    task = 'regression'
    initial_learning_rate = 0.001
    learning_rate_schedule = 'optimal'
    fm = pylibfm.FM(num_factors=num_factors,
                    num_iter=num_iter,
                    task=task,
                    initial_learning_rate=initial_learning_rate,
                    learning_rate_schedule=learning_rate_schedule)
    fm.fit(X_train, y_train_actual)
    preds = fm.predict(X_test)
    return np.clip(preds, 1, 5)
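create_input_pyfm is defined elsewhere in the project; a minimal sketch of a compatible implementation, assuming the DataFrames carry user, item and rating columns (the column names are an assumption):

def create_input_pyfm(df):
    # Hypothetical helper: pyFM consumes a list of feature dicts plus a label array.
    data = [{'user': str(u), 'item': str(i)}
            for u, i in zip(df['user'], df['item'])]
    y = df['rating'].values.astype(np.float64)
    return data, y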
Example #4
def main(para_dname):

    data_dir = 'data'
    output_dir = 'output'
    dataset_name = para_dname
    # dataset_name = 'ml-1m'
    # dataset_name = 'douban'
    # dataset_name = 'yahoo_music'

    train_X, test_X, train_y, test_y = load_dataset(data_dir, dataset_name)
    v = DictVectorizer()
    train_X = v.fit_transform(train_X)
    test_X = v.transform(test_X)

    fm = pylibfm.FM(num_factors=10,
                    num_iter=1000,
                    verbose=True,
                    task='regression',
                    initial_learning_rate=0.001,
                    learning_rate_schedule='optimal',
                    validation_size=0.1)
    train_loss, val_loss = fm.fit(train_X, train_y)  # note: assumes a pylibfm variant whose fit returns per-iteration losses
    train_loss = np.sqrt(np.array(train_loss))
    val_loss = np.sqrt(np.array(val_loss))
    np.save(os.path.join(output_dir, dataset_name + '_trloss'), train_loss)
    np.save(os.path.join(output_dir, dataset_name + '_valloss'), val_loss)
    preds = fm.predict(test_X)
    test_loss = math.sqrt(mean_squared_error(test_y, preds))
    print(preds)
    print('Test loss: %.5f' % test_loss)

    return 0
Example #5
    def __init__(self,
                 iter=100,
                 factor=10,
                 use_info=True,
                 path='./',
                 external_fm=None):
        from pyfm import pylibfm
        self.__use_info = use_info
        # temp code, load ml-100k's info
        if self.__use_info:
            self.__info = Info(path)

        # Build and train a Factorization Machine
        if external_fm:
            print('Use external FM: %s' % type(external_fm), file=sys.stderr)
            self.__fm = external_fm
        else:
            print('iter=%d, factor=%d, use_info=%d' % (iter, factor, use_info),
                  file=sys.stderr)
            self.__fm = pylibfm.FM(num_factors=factor,
                                   num_iter=iter,
                                   verbose=True,
                                   task="regression",
                                   initial_learning_rate=0.001,
                                   learning_rate_schedule="optimal")
Example #6
def FM(train_data, test_data, y_train):
    import numpy as np
    from sklearn.feature_extraction import DictVectorizer
    from pyfm import pylibfm
    import time

    start_time = time.time()

    # Transforms lists of feature-value mappings to one-hot encoded vectors
    v = DictVectorizer()
    X_train = v.fit_transform(train_data)
    X_test = v.transform(test_data)

    # Build and train a Factorization Machine
    fm = pylibfm.FM(num_factors=5,
                    num_iter=10,
                    verbose=True,
                    task="regression",
                    initial_learning_rate=0.1,
                    learning_rate_schedule="optimal")

    fm.fit(X_train, y_train)
    preds = fm.predict(X_test)

    # Time taken for training (seconds)
    print("--- %s seconds : Time taken for training ---" %
          (time.time() - start_time))

    np.savetxt('predictions.txt', preds)
    return preds
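A toy invocation of FM, with the ratings expressed as lists of feature dicts (the format DictVectorizer consumes); the values are made up:

import numpy as np

train_data = [{'user': '1', 'item': '10'}, {'user': '2', 'item': '20'}]
test_data = [{'user': '1', 'item': '20'}]
y_train = np.array([4.0, 3.0])
preds = FM(train_data, test_data, y_train)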
Example #7
 def __init__(self, n_factor=16, n_iter=10, use_attrs=True):
     self.fm = pylibfm.FM(num_factors=n_factor,
                          num_iter=n_iter,
                          task="regression",
                          initial_learning_rate=0.001,
                          learning_rate_schedule="optimal")
     self.use_attrs = use_attrs
     self.v = DictVectorizer()
Example #8
def cf_models(cf_lib, N, data_path, params):
    # cache the CF models, since the CF params are always the same
    cf_models = {}

    all_data, y_all, items = loadData(
        "eval_all_N" + str(N) + ".data",
        data_path='TwitterRatings/funkSVD/data_with_authors/',
        with_timestamps=False,
        with_authors=True)  # data path intentionally hardcoded
    if cf_lib == "pyFM":
        v = DictVectorizer()
        X_all = v.fit_transform(all_data)  # fit the vectorizer vocabulary on the full dataset
        for i in range(1, 4 + 1):
            train_data, y_tr, _ = loadData(
                'train/train_N' + str(N) + '.' + str(i),
                data_path='TwitterRatings/funkSVD/data_with_authors/',
                with_timestamps=False,
                with_authors=True)
            X_tr = v.transform(train_data)
            fm = pylibfm.FM(num_factors=params['f'],
                            num_iter=params['mi'],
                            k0=params['bias'],
                            k1=params['oneway'],
                            init_stdev=params['init_stdev'],
                            validation_size=params['val_size'],
                            learning_rate_schedule=params['lr_s'],
                            initial_learning_rate=params['lr'],
                            power_t=params['invscale_pow'],
                            t0=params['optimal_denom'],
                            shuffle_training=params['shuffle'],
                            seed=params['seed'],
                            task='regression',
                            verbose=True)
            fm.fit(X_tr, y_tr)
            cf_models[i] = fm

        return cf_models, v, items

    elif cf_lib == "implicit":
        all_c = consumption(ratings_path=data_path + 'eval_all_N' + str(N) +
                            '.data',
                            rel_thresh=0,
                            with_ratings=True)
        items_ids = list(
            set([
                itemId for userId, itemsDict in all_c.items()
                for itemId in itemsDict
            ]))
        idcoder = IdCoder(items_ids, all_c.keys())
        for i in range(1, 4 + 1):
            ones, row, col = get_data(data_path=data_path,
                                      all_c=all_c,
                                      idcoder=idcoder,
                                      fold=i,
                                      N=N,
                                      mode="tuning")
            matrix = csr_matrix((ones, (row, col)), dtype=np.float64)
            user_items = matrix.T.tocsr()
            model = implicit.als.AlternatingLeastSquares(
                factors=params['f'],
                regularization=params['lamb'],
                iterations=params['mi'],
                dtype=np.float64)
            model.fit(matrix)
            cf_models[i] = model

        return cf_models, idcoder, items
Example #9
 def train(self, n_epochs: int, learning_rate: float = 0.001,
           random_seed: int = 42, hybrid: bool = False, verbose: bool = True):
     self._build_train_test_ds(hybrid)
     self.fm = pylibfm.FM(num_factors=self.k,
                          num_iter=n_epochs,
                          verbose=verbose,
                          task='regression',
                          initial_learning_rate=learning_rate,
                          seed=random_seed)
     self.fm.fit(self.X_train, self.y_train)
Example #10
 def __init__(self):
     self.fm = pylibfm.FM(k1=False,
                          validation_size=0.005,
                          num_factors=10,
                          num_iter=8,
                          verbose=True,
                          task="classification",
                          initial_learning_rate=0.001,
                          init_stdev=0.002,
                          learning_rate_schedule="optimal")
Example #11
def train(training_data, labels):
    # Use one-hot encoding
    label_encoder = LabelEncoder()
    vectorizer = DictVectorizer()
    train_event_x = vectorizer.fit_transform(training_data)
    train_event_y = label_encoder.fit_transform(labels)
    # Create and train the model.

    pctr_estimator = pylibfm.FM()
    pctr_estimator.fit(train_event_x, train_event_y)
    model = (pctr_estimator, label_encoder, vectorizer)
    print('Training done')
    return model
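The returned model tuple bundles the estimator with its encoders; a matching predict helper might look like this (a sketch, not part of the original source):

def predict(model, events):
    # Unpack the tuple produced by train() and score new feature dicts.
    pctr_estimator, label_encoder, vectorizer = model
    x = vectorizer.transform(events)
    return pctr_estimator.predict(x)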
Example #12
    def __init__(self, rec_name, dataset, uses_features):
        super(FMRec, self).__init__(rec_name, dataset, uses_features)

        # init
        self.one_hot_columns = None

        # default rec
        self.fm = pylibfm.FM(num_factors=50,
                             num_iter=10,
                             task="regression",
                             initial_learning_rate=0.001,
                             learning_rate_schedule="optimal",
                             verbose=True)
Example #13
def train(training_data, labels):
    # label-encode the targets and one-hot encode the features
    label_encoder = LabelEncoder()
    vectorizer = DictVectorizer()
    train_event_x = vectorizer.fit_transform(training_data)
    train_event_y = label_encoder.fit_transform(labels)
    # balanced weight for the positive class: n_samples / (2 * n_positives)
    global weights
    weights = len(labels) / (sum(labels) * 2)

    # Create and train the model using Factorization Machine Algorithm
    pctr_estimator = pylibfm.FM()
    pctr_estimator.fit(train_event_x, train_event_y)
    model = (pctr_estimator, label_encoder, vectorizer)
    print('Training done')
    return model
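For intuition on the balanced weight computed above: with 1000 samples of which 100 are positive, weights = 1000 / (100 * 2) = 5.0, i.e. each positive sample counts five times as much as it would under uniform weighting.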
Example #14
 def __init__(self,
              num_factors=10,
              num_iter=1,
              k0=True,
              k1=True,
              init_stdev=0.1,
              validation_size=0.01,
              learning_rate_schedule="optimal",
              initial_learning_rate=0.01,
              power_t=0.5,
              t0=0.001,
              task='classification',
              verbose=True,
              shuffle_training=True,
              seed=28):
     super(BaseEstimator, self).__init__()  # note: naming BaseEstimator here skips BaseEstimator.__init__ in the MRO
     self.num_factors = num_factors
     self.num_iter = num_iter
     self.k0 = k0
     self.k1 = k1
     self.init_stdev = init_stdev
     self.validation_size = validation_size
     self.learning_rate_schedule = learning_rate_schedule
     self.initial_learning_rate = initial_learning_rate
     self.power_t = power_t
     self.t0 = t0
     self.task = task
     self.verbose = verbose
     self.shuffle_training = shuffle_training
     self.seed = seed
     self.fm = pylibfm.FM(
         num_factors=self.num_factors,
         num_iter=self.num_iter,
         k0=self.k0,
         k1=self.k1,
         init_stdev=self.init_stdev,
         validation_size=self.validation_size,
         learning_rate_schedule=self.learning_rate_schedule,
         initial_learning_rate=self.initial_learning_rate,
         power_t=self.power_t,
         t0=self.t0,
         task=self.task,
         verbose=self.verbose,
         shuffle_training=self.shuffle_training,
         seed=self.seed)
Example #15
def main():
    print(getCurrentTime(), "running...")
    dok_train = np.load(r"%s\..\input\sparse_train.npy" % runningPath)[()]  # train covers '2018-09-17' through '2018-09-23'
    dok_verify = np.load(r"%s\..\input\sparse_verify.npy" % runningPath)[()]  # verify contains only '2018-09-24'
    dok_test = np.load(r"%s\..\input\sparse_test.npy" % runningPath)[()]

    train_label = pd.read_csv(r'%s\..\input\train_label.txt' % runningPath)
    verify_label = pd.read_csv(r'%s\..\input\verify_label.txt' % runningPath)

    fm = pylibfm.FM(num_factors=50,
                    num_iter=10,
                    verbose=True,
                    task="classification",
                    initial_learning_rate=0.0001,
                    learning_rate_schedule="optimal")

    fm.fit(dok_train, train_label['is_trade'])

    Y_predicted = fm.predict(dok_verify)

    y_true = verify_label['is_trade']  # same label column as training
    pyfm_logloss = -np.sum(y_true * np.log(Y_predicted) +
                           (1 - y_true) * np.log(1 - Y_predicted)) / Y_predicted.shape[0]
    print(getCurrentTime(), "pyFM logloss %.6f" % pyfm_logloss)

    
    return 
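The hand-rolled binary log loss above matches sklearn's implementation; a quick cross-check, assuming the predicted probabilities stay strictly inside (0, 1):

from sklearn.metrics import log_loss
sk_logloss = log_loss(verify_label['is_trade'], Y_predicted)  # should agree with pyfm_logloss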
Example #16
def pyfm(train, test, **arg):
    print('[PYFM] applying')

    # Get the args
    num_factors = arg['num_factors']
    num_iter = arg['num_iter']
    task = arg['task']
    initial_learning_rate = arg['initial_learning_rate']
    learning_rate_schedule = arg['learning_rate_schedule']

    (train_data, y_train, train_users, train_items) = prepare_data(train)
    (test_data, y_test, test_users, test_items) = prepare_data(test)

    v = DictVectorizer()
    X_train = v.fit_transform(train_data)
    X_test = v.transform(test_data)

    fm = pylibfm.FM(num_factors=num_factors,
                    num_iter=num_iter,
                    task=task,
                    initial_learning_rate=initial_learning_rate,
                    learning_rate_schedule=learning_rate_schedule)

    fm.fit(X_train, y_train)

    preds = fm.predict(X_test)

    preds = np.clip(preds, 1, 5)  # clamp predictions to the valid rating range

    df_return = test.copy()

    df_return.Rating = preds

    print('[PYFM] done')

    return df_return
Example #17
def peretrain():
    X_train = []
    y = []
    for user in os.listdir("userinfo"):
        for file in os.listdir("userinfo/" + user):
            # like.txt / dislike.txt hold whitespace-separated ids of marked users
            if file not in ("like.txt", "dislike.txt"):
                continue
            label = 1 if file == "like.txt" else 0
            with open("userinfo/" + user + '/' + file, "r") as f:
                dann = [int(i) for i in f.read().split() if checkint(i)]
            for d in dann:
                X_train.append({
                    "who_marked": str(user),
                    "marked_user": str(d)
                })
                y.append(label)
    v = DictVectorizer()
    X = v.fit_transform(X_train)
    y = np.array(y, dtype=np.float64)
    fm = pylibfm.FM(num_factors=20,
                    num_iter=len(y) * 3,
                    verbose=False,
                    task="regression",
                    initial_learning_rate=0.001,
                    learning_rate_schedule="optimal")
    fm.fit(X, y)
    return fm
Example #18
def dofit_pyfm():
    d = get_subsample()
    globals().update(d)  # injects X_train, y_train, ... into module globals
    clf = pylibfm.FM(num_factors=4,
                     num_iter=100,
                     verbose=True,
                     task="classification",
                     initial_learning_rate=0.00001,
                     learning_rate_schedule="optimal")
    scaler = preproc.StandardScaler(with_mean=False)  # with_mean=False keeps sparse data sparse
    scaler.fit(X_train)

    def transx(x):
        x = scaler.transform(x)
        return scipy.sparse.csr_matrix(x)

    clf._fit_old = clf.fit
    clf.fit = lambda x, y: clf._fit_old(transx(x), y)
    clf._predict_old = clf.predict
    clf.predict = lambda x: clf._predict_old(transx(x))
    clf.fit(X_train, y_train)
    return {'clf': clf}
Example #19
def benchmark(task='regression', content=False):
    losses = []
    total_time = 0
    for k in range(5):
        print('== Fold %d' % (k+1))
        # set RNG
        np.random.seed(0)
        random.seed(0)
        # Load data
        (train_data, y_train) = load_ml_100k("u%d.base" % (k+1), content)
        (test_data, y_test) = load_ml_100k("u%d.test" % (k+1), content)
        if task == "classification":
            y_test = np.greater(y_test, 3)
        # Transform to matrix
        v = DictVectorizer()
        x_train = v.fit_transform(train_data)
        x_test = v.transform(test_data)
        # Build and train a Factorization Machine
        fm = pylibfm.FM(num_iter=20, verbose=True, task=task,
                        initial_learning_rate=0.005,
                        learning_rate_schedule="constant", seed=0)
        start = time.time()
        fm.fit(x_train, y_train)
        used = time.time() - start
        total_time += used
        # Evaluate
        predictions = fm.predict(x_test)
        if task == "regression":
            losses.append(root_mean_squared_error(y_test, predictions))
            print("FM RMSE: %.4f" % losses[-1])
        elif task == "classification":
            losses.append(log_loss(y_test, predictions))
            print("FM log loss: %.4f" % losses[-1])
        print("Time used: %.4fs" % used)
    print('== Summary')
    print('Mean loss: %.4f' % np.mean(losses))  # RMSE for regression, log loss for classification
    print('Total time: %.4fs' % total_time)
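load_ml_100k is a project helper; a minimal sketch for the plain (content=False) case, assuming the standard tab-separated MovieLens-100k split files:

import numpy as np

def load_ml_100k(filename, content=False):
    # Hypothetical loader: each line is user \t item \t rating \t timestamp.
    # The real helper presumably appends content features when content=True.
    data, y = [], []
    with open(filename) as f:
        for line in f:
            user, item, rating, _ = line.strip().split('\t')
            data.append({'user_id': user, 'item_id': item})
            y.append(float(rating))
    return data, np.array(y)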
Example #20
            val_w = sample_weight[test]

            # reg = LogisticRegression(C=0.1, solver='sag', n_jobs=-1)
            # pred_x = cross_val_predict(reg, trn_x, trn_y, cv=5, n_jobs=-1)
            # trn_x = np.c_[trn_x, pred_x]
            """
            clf = TFFMClassifier(order=6,
                                 rank=10,
                                 optimizer=tf.train.AdagradOptimizer(0.01),
                                 n_epochs=100,
                                 batch_size=10000,
                                 init_std=0.001,
                                 input_type='sparse'
                                 )
            """
            clf = pylibfm.FM(**params)

            clf.fit(trn_x, trn_y)

            _score = log_loss(val_y, clf.predict(val_x), sample_weight=val_w)
            _score2 = -roc_auc_score(
                val_y, clf.predict(val_x), sample_weight=val_w)
            # logger.debug('   _score: %s' % _score)
            list_score.append(_score)
            list_score2.append(_score2)
            break
        score = (np.mean(list_score), np.min(list_score), np.max(list_score))
        score2 = (np.mean(list_score2), np.min(list_score2),
                  np.max(list_score2))

        logger.info('param: %s' % (params))
Example #21
v = DictVectorizer()
# X_origin = train.loc[:,['iid','uid']].astype(np.string_).to_dict(orient='records')
# X = v.fit_transform(X_origin)
y = np.array(train.loc[:, ['score']]).flatten().astype(np.float64)
#X_train, X_test, y_train, y_test = train_test_split(X, y)

X_merge = pd.concat([train.loc[:, ['uid', 'iid']], test])
X_merge_hot = v.fit_transform(
    X_merge.astype(np.string_).to_dict(orient='records'))
train_hot = X_merge_hot[0:train.shape[0]]
test_hot = X_merge_hot[train.shape[0]:X_merge_hot.shape[0]]

print "data is ready"
fm = pylibfm.FM(num_factors=100,
                num_iter=30,
                verbose=True,
                task="regression",
                initial_learning_rate=0.01,
                learning_rate_schedule="optimal")
#y_train = y_train.astype(np.float64)
fm.fit(train_hot, y)
print('fit well')
joblib.dump(fm, "fm_model10,0000_iter_30.m")

print "start predict"
y_pred = fm.predict(test_hot)

df_fm = pd.DataFrame(y_pred, columns=['score'])
df_fm.to_csv("fm_result100_0000_iter_30.csv", index=False)

#
# ground_truth = np.around(preds)
Example #22
from pyfm import pylibfm
from sklearn.datasets import load_iris
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split

iris_data = load_iris()
X = iris_data['data']
y = iris_data['target'] == 2
data = [{v: k for k, v in dict(zip(i, range(len(i)))).items()} for i in X]
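Each row becomes a {column_index: value} dict for DictVectorizer; a quick illustration of the comprehension on a single iris row (note that duplicate values within a row would collapse in the intermediate dict, silently dropping a feature):

row = [5.1, 3.5, 1.4, 0.2]
print({v: k for k, v in dict(zip(row, range(len(row)))).items()})
# -> {0: 5.1, 1: 3.5, 2: 1.4, 3: 0.2}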

X_train, X_test, y_train, y_test = train_test_split(data,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=0)

v = DictVectorizer()
X_train = v.fit_transform(X_train)
X_test = v.transform(X_test)

fm = pylibfm.FM(num_factors=50,
                num_iter=1000,
                verbose=True,
                task="classification",
                initial_learning_rate=0.0001,
                learning_rate_schedule="optimal")

fm.fit(X_train, y_train)

y_preds = fm.predict(X_test)

print("Validation log loss: %.4f" % log_loss(y_test, y_preds))
Example #23
def hybrid_protocol_evaluation(data_path, data_path_context, cf_lib, solr,
                               params_cb, params_cf, params_hy, N):
    test_c = consumption(ratings_path=data_path + 'test/test_N20.data',
                         rel_thresh=0,
                         with_ratings=True)
    train_c = consumption(ratings_path=data_path + 'eval_train_N20.data',
                          rel_thresh=0,
                          with_ratings=False)
    all_c = consumption(ratings_path=data_path + 'eval_all_N20.data',
                        rel_thresh=0,
                        with_ratings=True)
    MRRs = dict((N, []) for N in [5, 10, 15, 20])
    nDCGs = dict((N, []) for N in [5, 10, 15, 20])
    APs = dict((N, []) for N in [5, 10, 15, 20])
    Rprecs = dict((N, []) for N in [5, 10, 15, 20])

    if cf_lib == "pyFM":
        all_data, y_all, items = loadData("eval_all_N20.data",
                                          data_path=data_path_context,
                                          with_timestamps=False,
                                          with_authors=True)
        v = DictVectorizer()
        X_all = v.fit_transform(all_data)

        train_data, y_tr, _ = loadData('eval_train_N20.data',
                                       data_path=data_path_context,
                                       with_timestamps=False,
                                       with_authors=True)
        X_tr = v.transform(train_data)
        fm = pylibfm.FM(num_factors=params_cf['f'],
                        num_iter=params_cf['mi'],
                        k0=params_cf['bias'],
                        k1=params_cf['oneway'],
                        init_stdev=params_cf['init_stdev'],
                        validation_size=params_cf['val_size'],
                        learning_rate_schedule=params_cf['lr_s'],
                        initial_learning_rate=params_cf['lr'],
                        power_t=params_cf['invscale_pow'],
                        t0=params_cf['optimal_denom'],
                        shuffle_training=params_cf['shuffle'],
                        seed=params_cf['seed'],
                        task='regression',
                        verbose=True)
        fm.fit(X_tr, y_tr)

    elif cf_lib == "implicit":
        items_ids = list(
            set([
                itemId for userId, itemsDict in all_c.items()
                for itemId in itemsDict
            ]))
        idcoder = IdCoder(items_ids, all_c.keys())
        ones, row, col = get_data(data_path=data_path,
                                  all_c=all_c,
                                  idcoder=idcoder,
                                  fold=0,
                                  N=20,
                                  mode="testing")
        matrix = csr_matrix((ones, (row, col)), dtype=np.float64)
        user_items = matrix.T.tocsr()
        model = implicit.als.AlternatingLeastSquares(
            factors=params_cf['f'],
            regularization=params_cf['lamb'],
            iterations=params_cf['mi'],
            dtype=np.float64)
        model.fit(matrix)

    p = 0
    for userId in test_c:
        logging.info("#u: {0}/{1}".format(p, len(test_c)))
        p += 1
        if cf_lib == "pyFM":
            user_rows = [{
                'user_id': str(userId),
                'item_id': str(itemId)
            } for itemId in items]
            X_te = v.transform(user_rows)
            preds = fm.predict(X_te)
            recs_cf = [
                itemId for _, itemId in sorted(zip(preds, items), reverse=True)
            ]
        elif cf_lib == "implicit":
            recommends = model.recommend(userid=int(
                idcoder.coder('user', userId)),
                                         user_items=user_items,
                                         N=200)
            recs_cf = [idcoder.decoder('item', tupl[0]) for tupl in recommends]

        recs_cb = []
        for itemId in train_c[userId]:
            encoded_params = urlencode(params_cb)
            url = solr + '/mlt?q=goodreadsId:' + itemId + "&" + encoded_params
            response = json.loads(urlopen(url).read().decode('utf8'))
            try:
                docs = response['response']['docs']
            except TypeError as e:
                continue
            recs_cb.append([str(doc['goodreadsId'][0]) for doc in docs])

        recs_cb = flatten_list(list_of_lists=recs_cb, rows=params_cb['rows'])

        recs_cf = remove_consumed(user_consumption=train_c[userId],
                                  rec_list=recs_cf)
        recs_cf = recs_cf[:200]
        recs_cb = remove_consumed(user_consumption=train_c[userId],
                                  rec_list=recs_cb)
        recs_cb = recs_cb[:200]

        recs_hy = hybridize_recs(recs_cb=recs_cb,
                                 recs_cf=recs_cf,
                                 weight_cb=params_hy['weight_cb'],
                                 weight_cf=params_hy['weight_cf'])
        recs_hy = remove_consumed(user_consumption=train_c[userId],
                                  rec_list=recs_hy)
        recs_hy = recs_cleaner(solr=solr,
                               consumpt=train_c[userId],
                               recs=recs_hy[:100])
        recs_hy = user_ranked_recs(user_recs=recs_hy,
                                   user_consumpt=test_c[userId])

        for N in [5, 10, 15, 20]:
            mini_recs = dict((k, recs_hy[k]) for k in list(recs_hy.keys())[:N])
            MRRs[N].append(MRR(recs=mini_recs, rel_thresh=1))
            nDCGs[N].append(
                nDCG(recs=mini_recs, alt_form=False, rel_thresh=False))
            APs[N].append(AP_at_N(n=N, recs=recs_hy, rel_thresh=1))
            Rprecs[N].append(R_precision(n_relevants=N, recs=mini_recs))

    for N in [5, 10, 15, 20]:
        with open('TwitterRatings/hybrid/clean/protocol.txt', 'a') as file:
            file.write( "N=%s, nDCG=%s, MAP=%s, MRR=%s, R-precision=%s\n" % \
             (N, mean(nDCGs[N]), mean(APs[N]), mean(MRRs[N]), mean(Rprecs[N])) )
Example #24
    column_names = ['userId', 'movieId', 'timestamp']
    label_name = 'rating'

    X, y = data[column_names].values, data[label_name].values

    pos = 80000
    X_train, X_test = X[:pos], X[pos:]
    y_train, y_test = y[:pos], y[pos:]

    # build model
    from pyfm import pylibfm
    model = pylibfm.FM(
        num_factors=8,
        num_iter=10,
        validation_size=0.0,
        task='regression',
        reg_0=0.0,
        reg_w=0.01,
        reg_v=0.05,
    )
    self = pyfm_model_wrapper(model)

    self.fit(X_train, y_train)
    y_pred = self.predict(X_test)

    print('MAE: %.4f' % np.mean(np.abs(y_pred - y_test)))

    dir(self.model)  # inspect the fitted model's attributes

    self.model.w0  # global bias term
    self.model.w   # linear weights
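These attributes map onto the factorization machine equation y(x) = w0 + sum_i w_i x_i + sum_{i<j} <v_i, v_j> x_i x_j. In coreylynch's pyFM the fitted model exposes w0 (global bias), w (linear weights) and v (pairwise factors); treat the exact names and shapes as an assumption for other forks:

print(self.model.w0)       # scalar bias
print(self.model.w.shape)  # (n_features,) linear weights
print(self.model.v.shape)  # (num_factors, n_features) pairwise factors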
Example #25
y_train = data.loc[data['cv'] < threshold, 'conversion']
y_test = data.loc[data['cv'] >= threshold, 'conversion']

# one-hot "bag of items": map each basket to a {itemName: 1, ...} dict
dictTrain = list(
    map(lambda ind: dict.fromkeys(getNames(train, ind), 1), train.index))
dictTest = list(
    map(lambda ind: dict.fromkeys(getNames(test, ind), 1), test.index))

v = DictVectorizer()
X_train = v.fit_transform(dictTrain)
X_test = v.transform(dictTest)

# =============================================================================
# Factorization Machine
# =============================================================================
fm = pylibfm.FM(num_factors=20, num_iter=50, task="classification")
fm.fit(X_train, y_train)

# =========================================================================
# Compute MPR
# =========================================================================
data_test = pd.concat([test, y_test], axis=1)
data_test['setSize'] = list(map(len, data_test['itemSet']))
data_test = data_test.loc[(data_test['setSize'] > 1) &
                          (data_test['conversion'] == 1), ]

data_test['target2'] = -1
for ind in data_test.index:
    data_test.loc[ind, 'target2'] = data_test.loc[ind, 'itemSet'][-1]
    data_test.at[ind, 'itemSet'] = data_test.loc[ind, 'itemSet'][:-1]
Example #26
def preprocessData(data):
    feature = data.iloc[:, :-1]  # features: all columns but the last
    label = data.iloc[:, -1]  # label: the last column
    # min-max scale each column; without normalization the predictions are all NaN
    feature = minmax_scale(feature, axis=0)
    return feature, label


train = pd.read_csv(trainData, header=None)
test = pd.read_csv(testData, header=None)
X_train, y_train = preprocessData(train)
X_test, y_test = preprocessData(test)
X_train = [{v: k
            for k, v in dict(zip(i, range(len(i)))).items()} for i in X_train]
X_test = [{v: k
           for k, v in dict(zip(i, range(len(i)))).items()} for i in X_test]
v = DictVectorizer()
X_train = v.fit_transform(X_train)
X_test = v.transform(X_test)

fm = pylibfm.FM(num_factors=15,
                num_iter=300,
                verbose=False,
                task="classification",
                initial_learning_rate=0.01,
                learning_rate_schedule="optimal")
fm.fit(X_train, y_train)
y_pred_label = [get_label(i) for i in fm.predict(X_test)]
print(y_pred_label)
print(accuracy_score(y_test, y_pred_label))
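get_label is not shown in this snippet; a minimal sketch, assuming it simply thresholds the predicted probability at 0.5:

def get_label(p):
    # Hypothetical helper: binarize the FM's predicted probability.
    return 1 if p >= 0.5 else 0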
Example #27
        "age": 33
    },
    {
        "user": "******",
        "item": "20",
        "age": 55
    },
    {
        "user": "******",
        "item": "10",
        "age": 20
    },
]
v = DictVectorizer()
X = v.fit_transform(train)
print(type(X))
print(v.get_feature_names())
print(X)
# print(X.toarray())
# [[ 19.   0.   0.   0.   1.   1.   0.   0.   0.]
#  [ 33.   0.   0.   1.   0.   0.   1.   0.   0.]
#  [ 55.   0.   1.   0.   0.   0.   0.   1.   0.]
#  [ 20.   1.   0.   0.   0.   0.   0.   0.   1.]]
y = np.repeat(1.0, X.shape[0])
print(y)
fm = pylibfm.FM()
fm.fit(X, y)
pred = fm.predict(v.transform({"user": "******", "item": "10", "age": 24}))
print(pred)
print(v.transform({"user": "******", "item": "10", "age": 24}))
print(v.transform({"user": "******", "item": "10", "age": 24}).toarray())
Example #28
 test = data.loc[data['cv']>=threshold,['target','itemSet']]
 
 y_train = data.loc[data['cv']<threshold,'conversion']
 y_test = data.loc[data['cv']>=threshold,'conversion']
 
 dictTrain = list(map(lambda ind: dict.fromkeys(getNames(train, ind), 1), train.index))
 dictTest = list(map(lambda ind: dict.fromkeys(getNames(test, ind), 1), test.index))
 
 v = DictVectorizer()
 X_train = v.fit_transform(dictTrain)
 X_test = v.transform(dictTest)
 
 # =========================================================================
 # Factorization Machine
 # =========================================================================
 fm = pylibfm.FM(num_factors=numTraits, num_iter=100, task="classification")
 fm.fit(X_train, y_train)
 
 # =========================================================================
 # Compute MPR
 # =========================================================================
 data_test = pd.concat([test,y_test],axis=1)
 data_test = data_test.loc[data_test['conversion']==1,]
 
 percentileRank = []
 precisionAt5 = 0
 precisionAt10 = 0
 precisionAt20 = 0
 for ind in data_test.index:
     subdata = data_test.loc[ind,]
     true_target = subdata['target']
Example #29
def pyFM_tuning(data_path, N, with_timestamps=False, with_authors=False):

    all_data, y_all, _ = loadData("eval_all_N" + str(N) + ".data",
                                  data_path=data_path,
                                  with_timestamps=with_timestamps,
                                  with_authors=with_authors)
    v = DictVectorizer()
    X_all = v.fit_transform(all_data)

    defaults = {'f': 100, 'mi': 20, 'bias': True, 'oneway': True,
                'init_stdev': 0.1, 'val_size': 0.01, 'lr_s': 'optimal',
                'lr': 0.01, 'invscale_pow': 0.5, 'optimal_denom': 0.001,
                'shuffle': True, 'seed': 28}  # changed from the original defaults of f: 20, mi: 1
    results = dict((param, {}) for param in defaults.keys())

    for param in [
            'mi', 'f', 'bias', 'oneway', 'init_stdev', 'val_size', 'lr_s',
            'lr', 'invscale_pow', 'optimal_denom', 'shuffle', 'seed'
    ]:

        if param == 'mi':
            for i in [1, 5, 10, 20, 50, 100, 150, 200]:
                defaults['mi'] = i
                results['mi'][i] = pyFMJob(data_path=data_path,
                                           params=defaults,
                                           N=N,
                                           vectorizer=v,
                                           with_timestamps=with_timestamps,
                                           with_authors=with_authors)
            defaults['mi'] = opt_value(results=results['mi'], metric='rmse')

        elif param == 'f':
            for i in range(20, 2020, 20):
                defaults['f'] = i
                results['f'][i] = pyFMJob(data_path=data_path,
                                          params=defaults,
                                          N=N,
                                          vectorizer=v,
                                          with_timestamps=with_timestamps,
                                          with_authors=with_authors)
            defaults['f'] = opt_value(results=results['f'], metric='rmse')

        elif param == 'bias':
            for i in [True, False]:
                defaults['bias'] = i
                results['bias'][i] = pyFMJob(data_path=data_path,
                                             params=defaults,
                                             N=N,
                                             vectorizer=v,
                                             with_timestamps=with_timestamps,
                                             with_authors=with_authors)
            defaults['bias'] = opt_value(results=results['bias'],
                                         metric='rmse')

        elif param == 'oneway':
            for i in [True, False]:
                defaults['oneway'] = i
                results['oneway'][i] = pyFMJob(data_path=data_path,
                                               params=defaults,
                                               N=N,
                                               vectorizer=v,
                                               with_timestamps=with_timestamps,
                                               with_authors=with_authors)
            defaults['oneway'] = opt_value(results=results['oneway'],
                                           metric='rmse')

        elif param == 'init_stdev':
            for i in [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0]:
                defaults['init_stdev'] = i
                results['init_stdev'][i] = pyFMJob(
                    data_path=data_path,
                    params=defaults,
                    N=N,
                    vectorizer=v,
                    with_timestamps=with_timestamps,
                    with_authors=with_authors)
            defaults['init_stdev'] = opt_value(results=results['init_stdev'],
                                               metric='rmse')

        elif param == 'val_size':
            for i in [0.001, 0.01, 0.1, 0.5, 0.8, 0.9]:
                defaults['val_size'] = i
                results['val_size'][i] = pyFMJob(
                    data_path=data_path,
                    params=defaults,
                    N=N,
                    vectorizer=v,
                    with_timestamps=with_timestamps,
                    with_authors=with_authors)
            defaults['val_size'] = opt_value(results=results['val_size'],
                                             metric='rmse')

        elif param == 'lr_s':
            for i in ['constant', 'optimal', 'invscaling']:
                defaults['lr_s'] = i

                if i == 'optimal':
                    for j in [
                            0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5
                    ]:
                        defaults['optimal_denom'] = j
                        results['optimal_denom'][j] = pyFMJob(
                            data_path=data_path,
                            params=defaults,
                            N=N,
                            vectorizer=v,
                            with_timestamps=with_timestamps,
                            with_authors=with_authors)
                    defaults['optimal_denom'] = opt_value(
                        results=results['optimal_denom'], metric='rmse')
                    results['lr_s'][i] = results['optimal_denom'][
                        defaults['optimal_denom']]

                elif i == 'invscaling':
                    for j in [0.001, 0.05, 0.1, 0.5, 0.8, 1.0]:
                        defaults['invscale_pow'] = j
                        results['invscale_pow'][j] = pyFMJob(
                            data_path=data_path,
                            params=defaults,
                            N=N,
                            vectorizer=v,
                            with_timestamps=with_timestamps,
                            with_authors=with_authors)
                    defaults['invscale_pow'] = opt_value(
                        results=results['invscale_pow'], metric='rmse')
                    results['lr_s'][i] = results['invscale_pow'][
                        defaults['invscale_pow']]

                elif i == 'constant':
                    results['lr_s'][i] = pyFMJob(
                        data_path=data_path,
                        params=defaults,
                        N=N,
                        vectorizer=v,
                        with_timestamps=with_timestamps,
                        with_authors=with_authors)

            defaults['lr_s'] = opt_value(results=results['lr_s'],
                                         metric='rmse')

        elif param == 'lr':
            for i in [0.001, 0.003, 0.005, 0.01, 0.02, 0.03, 0.04,
                      0.05]:  #0.07, 0.08, 0.1]:
                defaults['lr'] = i
                results['lr'][i] = pyFMJob(data_path=data_path,
                                           params=defaults,
                                           N=N,
                                           vectorizer=v,
                                           with_timestamps=with_timestamps,
                                           with_authors=with_authors)
            defaults['lr'] = opt_value(results=results['lr'], metric='rmse')

        elif param == 'shuffle':
            for i in [True, False]:
                defaults['shuffle'] = i
                results['shuffle'][i] = pyFMJob(
                    data_path=data_path,
                    params=defaults,
                    N=N,
                    vectorizer=v,
                    with_timestamps=with_timestamps,
                    with_authors=with_authors)
            defaults['shuffle'] = opt_value(results=results['shuffle'],
                                            metric='rmse')

        elif param == 'seed':
            for i in [10, 20, 28, 30, 50]:
                defaults['seed'] = i
                results['seed'][i] = pyFMJob(data_path=data_path,
                                             params=defaults,
                                             N=N,
                                             vectorizer=v,
                                             with_timestamps=with_timestamps,
                                             with_authors=with_authors)
            defaults['seed'] = opt_value(results=results['seed'],
                                         metric='rmse')

    # Real testing
    train_data, y_tr, _ = loadData('eval_train_N' + str(N) + '.data',
                                   data_path=data_path,
                                   with_timestamps=with_timestamps,
                                   with_authors=with_authors)
    X_tr = v.transform(train_data)
    fm = pylibfm.FM(num_factors=defaults['f'],
                    num_iter=defaults['mi'],
                    k0=defaults['bias'],
                    k1=defaults['oneway'],
                    init_stdev=defaults['init_stdev'],
                    validation_size=defaults['val_size'],
                    learning_rate_schedule=defaults['lr_s'],
                    initial_learning_rate=defaults['lr'],
                    power_t=defaults['invscale_pow'],
                    t0=defaults['optimal_denom'],
                    shuffle_training=defaults['shuffle'],
                    seed=defaults['seed'],
                    task='regression',
                    verbose=True)
    fm.fit(X_tr, y_tr)

    test_data, y_te, _ = loadData('test/test_N' + str(N) + '.data',
                                  data_path=data_path,
                                  with_timestamps=with_timestamps,
                                  with_authors=with_authors)
    X_te = v.transform(test_data)
    preds = fm.predict(X_te)
    rmse = sqrt(mean_squared_error(y_te, preds))
    print("FM RMSE: %.4f" % rmse)

    with open(
            'TwitterRatings/pyFM/opt_params_tmstmp' + str(with_timestamps) +
            '_auth' + str(with_authors) + '.txt', 'w') as f:
        for param in defaults:
            f.write("{param}:{value}\n".format(param=param,
                                               value=defaults[param]))
        f.write("RMSE:{rmse}".format(rmse=rmse))

    with open(
            'TwitterRatings/pyFM/params_rmses_tmstmp' + str(with_timestamps) +
            '_auth' + str(with_authors) + '.txt', 'w') as f:
        for param in results:
            for value in results[param]:
                f.write("{param}={value}\t : {RMSE}\n".format(
                    param=param, value=value, RMSE=results[param][value]))

    return defaults
Example #30
def pyFM_protocol_evaluation(data_path,
                             params,
                             with_timestamps=False,
                             with_authors=False):
    solr = "http://localhost:8983/solr/grrecsys"
    # userId = '33120270'
    all_data, y_all, items = loadData("eval_all_N20.data",
                                      data_path=data_path,
                                      with_timestamps=with_timestamps,
                                      with_authors=with_authors)
    v = DictVectorizer()
    X_all = v.fit_transform(all_data)

    test_c = consumption(
        ratings_path='TwitterRatings/funkSVD/data/test/test_N20.data',
        rel_thresh=0,
        with_ratings=True)
    train_c = consumption(
        ratings_path='TwitterRatings/funkSVD/data/eval_train_N20.data',
        rel_thresh=0,
        with_ratings=False)
    MRRs = dict((N, []) for N in [5, 10, 15, 20])
    nDCGs = dict((N, []) for N in [5, 10, 15, 20])
    APs = dict((N, []) for N in [5, 10, 15, 20])
    Rprecs = dict((N, []) for N in [5, 10, 15, 20])

    train_data, y_tr, _ = loadData('eval_train_N20.data',
                                   data_path=data_path,
                                   with_timestamps=with_timestamps,
                                   with_authors=with_authors)
    X_tr = v.transform(train_data)
    fm = pylibfm.FM(num_factors=params['f'],
                    num_iter=params['mi'],
                    k0=params['bias'],
                    k1=params['oneway'],
                    init_stdev=params['init_stdev'],
                    validation_size=params['val_size'],
                    learning_rate_schedule=params['lr_s'],
                    initial_learning_rate=params['lr'],
                    power_t=params['invscale_pow'],
                    t0=params['optimal_denom'],
                    shuffle_training=params['shuffle'],
                    seed=params['seed'],
                    task='regression',
                    verbose=True)
    fm.fit(X_tr, y_tr)

    p = 0
    for userId in test_c:
        logging.info("#u: {0}/{1}".format(p, len(test_c)))
        p += 1
        user_rows = [{
            'user_id': str(userId),
            'item_id': str(itemId)
        } for itemId in items]
        X_te = v.transform(user_rows)
        preds = fm.predict(X_te)
        book_recs = [
            itemId for _, itemId in sorted(zip(preds, items), reverse=True)
        ]
        book_recs = remove_consumed(user_consumption=train_c[userId],
                                    rec_list=book_recs)
        book_recs = recs_cleaner(solr=solr,
                                 consumpt=train_c[userId],
                                 recs=book_recs[:100])
        recs = user_ranked_recs(user_recs=book_recs,
                                user_consumpt=test_c[userId])

        for N in [5, 10, 15, 20]:
            mini_recs = dict((k, recs[k]) for k in list(recs.keys())[:N])
            MRRs[N].append(MRR(recs=mini_recs, rel_thresh=1))
            nDCGs[N].append(
                nDCG(recs=mini_recs, alt_form=False, rel_thresh=False))
            APs[N].append(AP_at_N(n=N, recs=recs, rel_thresh=1))
            Rprecs[N].append(R_precision(n_relevants=N, recs=mini_recs))

    for N in [5, 10, 15, 20]:
        with open(
                'TwitterRatings/pyFM/clean/protocol_tmstmp' +
                str(with_timestamps) + '_auth' + str(with_authors) + '.txt',
                'a') as file:
            file.write( "N=%s, nDCG=%s, MAP=%s, MRR=%s, R-precision=%s\n" % \
             (N, mean(nDCGs[N]), mean(APs[N]), mean(MRRs[N]), mean(Rprecs[N])) )