Exemple #1
0
    def runFM(self, X_train, y_train, X_test, y_test):
        """Compare warm-started vs. freshly retrained MCMC FM regressors.

        Experiment 1 advances a single ``mcmc.FMRegression`` one
        ``step_size`` chunk of MCMC iterations at a time (warm start via
        ``n_more_iter``); experiment 2 retrains a fresh model from scratch
        for every iteration count. Returns
        ``(rmse_test, hyper_param, rmse_test_re, hyper_param_re)``.

        NOTE(review): depends on ``n_iter``, ``rank``, ``std_dev``, ``seed``
        and ``step_size`` defined outside this method (globals or enclosing
        scope — not visible in this file chunk; confirm against the module).
        The ``print`` statements are Python 2 syntax.

        Original author's example data, kept for reference:
        X_train = sp.csc_matrix(np.array([[6, 1],
                                    [2, 3],
                                    [3, 0],
                                    [6, 1],
                                    [4, 5]]), dtype=np.float64)
        y_train = np.array([298, 266, 29, 298, 848], dtype=np.float64)
        X_test = X_train
        y_test = y_train
        """
        """
        X, y, coef = make_user_item_regression(label_stdev=.4, random_state=seed)
        from sklearn.cross_validation import train_test_split
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.33, random_state=seed)
        X_train = sp.csc_matrix(X_train)
        X_test = sp.csc_matrix(X_test)
        X_test = X_train
        y_test = y_train
        """
        print "params => iter: {} - rank: {} - std-dev: {} - seed: {}".format(
            n_iter, rank, std_dev, seed)

        start_time = time.time()

        # n_iter=0: allocate the model without sampling yet; iterations are
        # added incrementally below via n_more_iter.
        self.fm_wu = mcmc.FMRegression(
            n_iter=0, rank=rank, random_state=seed)  #, init_stdev=std_dev)
        # initalize coefs
        self.fm_wu.fit_predict(X_train, y_train, X_test)

        rmse_test = []
        rmse_new = []
        # One row per warm-start step: alpha, lambda_w, mu_w plus per-rank
        # lambda_V/mu_V entries (3 + 2 * rank columns total).
        hyper_param = np.zeros((n_iter - 1, 3 + 2 * rank), dtype=np.float64)
        for nr, i in enumerate(range(1, n_iter)):
            # Reseed each step so chains differ between steps.
            self.fm_wu.random_state = i * seed
            y_pred = self.fm_wu.fit_predict(X_train,
                                            y_train,
                                            X_test,
                                            n_more_iter=step_size)
            rmse_test.append(np.sqrt(mean_squared_error(y_pred, y_test)))
            hyper_param[nr, :] = self.fm_wu.hyper_param_

        print '------- restart ----------'
        # Experiment 2: retrain from scratch for every iteration count.
        values = np.arange(1, n_iter)
        rmse_test_re = []
        hyper_param_re = np.zeros((len(values), 3 + 2 * rank),
                                  dtype=np.float64)
        for nr, i in enumerate(values):
            self.fm = mcmc.FMRegression(
                n_iter=i, rank=rank, random_state=seed)  #, init_stdev=std_dev)
            y_pred = self.fm.fit_predict(X_train, y_train, X_test)
            rmse_test_re.append(np.sqrt(mean_squared_error(y_pred, y_test)))
            hyper_param_re[nr, :] = self.fm.hyper_param_

        print "Process finished in {} seconds".format(time.time() - start_time)
        print "Min RMSE on warmup model: {}".format(rmse_test[-1])
        print "Min RMSE on retrained model: {}".format(rmse_test_re[-1])

        return rmse_test, hyper_param, rmse_test_re, hyper_param_re
Exemple #2
0
def fm(X_train, X_dev_test, y_train, y_dev_test, X_test, y_test):
    """Fit MCMC factorization machines on raw and standardized targets.

    Trains two ``mcmc.FMRegression`` models (rank 32, 300 MCMC iterations,
    fixed seed): one on the raw targets and one on targets standardized
    with ``StandardScaler`` (predictions mapped back to the original scale
    before scoring). Prints the test RMSE of each run.

    Note: ``X_dev_test``/``y_dev_test`` are accepted for signature
    compatibility with the sibling helpers in this file but are unused here.
    """
    seed = 333

    # Baseline: raw target values.
    fm = mcmc.FMRegression(n_iter=300, rank=32, random_state=seed)
    y_pred = fm.fit_predict(X_train, y_train, X_test)
    # Fix: the baseline RMSE was computed and silently discarded; print it
    # so the two runs can actually be compared.
    print(np.sqrt(mean_squared_error(y_pred, y_test)))

    # Second run: standardized target values.
    scaler = StandardScaler()
    y_train_norm = scaler.fit_transform(y_train.reshape(-1, 1)).ravel()
    fm = mcmc.FMRegression(n_iter=300, rank=32, random_state=seed)
    y_pred = fm.fit_predict(X_train, y_train_norm, X_test)
    # Undo the scaling on predictions so RMSE is in the original units.
    print(np.sqrt(mean_squared_error(scaler.inverse_transform(y_pred),
                                     y_test)))
Exemple #3
0
    def predict_fastfm(self):
        """Predict ratings for ``self.records_to_predict`` with fastFM.

        Builds one feature matrix from the training plus prediction
        records, one-hot encodes the first two (categorical) columns, then
        trains the solver named by ``Constants.FASTFM_METHOD`` ('mcmc',
        'als' or 'sgd') and stores its output in ``self.predictions``.
        """
        # Attach each record's precomputed context-topic vector first,
        # when context features are enabled.
        if Constants.USE_CONTEXT:
            for record in self.records_to_predict:
                review_id = record[Constants.REVIEW_ID_FIELD]
                record[Constants.CONTEXT_TOPICS_FIELD] = \
                    self.context_topics_map[review_id]

        all_records = self.train_records + self.records_to_predict
        x_matrix, y_vector = fastfm_recommender.records_to_matrix(
            all_records, self.context_rich_topics)

        # Fit the encoder on the combined matrix so train and test share
        # the same one-hot vocabulary.
        encoder = OneHotEncoder(categorical_features=[0, 1], sparse=True)
        encoder.fit(x_matrix)

        num_train = len(self.train_records)
        x_train = encoder.transform(x_matrix[:num_train])
        y_train = y_vector[:num_train]
        x_test = encoder.transform(x_matrix[num_train:])

        rank = Constants.FM_NUM_FACTORS
        method = Constants.FASTFM_METHOD
        if method == 'mcmc':
            # The MCMC solver fits and predicts in a single pass.
            solver = mcmc.FMRegression(rank=rank)
            self.predictions = solver.fit_predict(x_train, y_train, x_test)
        elif method == 'als':
            solver = als.FMRegression(rank=rank)
            solver.fit(x_train, y_train)
            self.predictions = solver.predict(x_test)
        elif method == 'sgd':
            solver = sgd.FMRegression(rank=rank)
            solver.fit(x_train, y_train)
            self.predictions = solver.predict(x_test)
Exemple #4
0
def test_fm_regression():
    """A rank-2 FM given 1000 MCMC iterations should nearly recover y."""
    w0, w, V, y, X = get_test_problem()

    regressor = mcmc.FMRegression(n_iter=1000, rank=2, init_stdev=0.1)
    predictions = regressor.fit_predict(X, y, X)

    assert metrics.r2_score(predictions, y) > 0.99
Exemple #5
0
def test_find_init_stdev():
    """``find_init_stdev`` picks a sane stdev with or without validation data."""
    from sklearn.model_selection import train_test_split

    X, y, coef = make_user_item_regression(label_stdev=.5)
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.33,
                                                        random_state=44)
    X_train = sp.csc_matrix(X_train)
    X_test = sp.csc_matrix(X_test)

    model = mcmc.FMRegression(n_iter=10, rank=5)

    # Search over a reasonable range, then over a deliberately bad one.
    best_init_stdev, mse = mcmc.find_init_stdev(model,
                                                X_train,
                                                y_train,
                                                stdev_range=[0.2, 0.5, 1.0])
    best_init_stdev_bad, _ = mcmc.find_init_stdev(model,
                                                  X_train,
                                                  y_train,
                                                  stdev_range=[5.])
    print('--' * 30)

    # Same search scored against the held-out validation split.
    best_init_stdev_vali, mse_vali = mcmc.find_init_stdev(
        model, X_train, y_train, X_test, y_test, stdev_range=[0.2, 0.5, 1.0])

    assert best_init_stdev < best_init_stdev_bad
    assert best_init_stdev_vali == best_init_stdev
    assert mse_vali > mse
Exemple #6
0
def test_clone():
    """sklearn ``clone`` must preserve the hyperparameters of both estimators."""
    from sklearn.base import clone

    for factory in (mcmc.FMRegression, mcmc.FMClassification):
        original = factory()
        duplicate = clone(original)
        assert original.get_params() == duplicate.get_params()
Exemple #7
0
def test_mcmc_warm_start():
    """Warm-starting 50 + 50 MCMC iterations should match 100 straight ones.

    Fix: ``sklearn.cross_validation`` was removed in scikit-learn 0.20;
    ``train_test_split`` now comes from ``sklearn.model_selection``, as the
    sibling ``test_find_init_stdev`` in this file already does. The local
    error names were also corrected — they previously said "10"/"5"
    iterations while the models actually run 100/50.
    """
    X, y, coef = make_user_item_regression(label_stdev=0)
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=44)
    X_train = sp.csc_matrix(X_train)
    X_test = sp.csc_matrix(X_test)

    # Reference: 100 iterations in one go.
    fm = mcmc.FMRegression(n_iter=100, rank=2)
    y_pred = fm.fit_predict(X_train, y_train, X_test)
    error_100_iter = mean_squared_error(y_pred, y_test)

    # 50 iterations...
    fm = mcmc.FMRegression(n_iter=50, rank=2)
    y_pred = fm.fit_predict(X_train, y_train, X_test)
    error_50_iter = mean_squared_error(y_pred, y_test)

    # ...then 50 more via warm start must land (almost) in the same place.
    y_pred = fm.fit_predict(X_train, y_train, X_test, n_more_iter=50)
    error_50_plus_50_iter = mean_squared_error(y_pred, y_test)
    print(error_50_iter, error_50_plus_50_iter, error_100_iter)
    print(fm.hyper_param_)
    assert_almost_equal(error_100_iter, error_50_plus_50_iter, decimal=2)
Exemple #8
0
def fm_rank(X_train, X_dev_test, y_train, y_dev_test):
    """Sweep the FM rank and plot the dev-test RMSE for each candidate."""
    n_iter = 100
    seed = 333
    ranks = [4, 8, 16, 32, 64]

    rmse_test = []
    for rank in ranks:
        model = mcmc.FMRegression(n_iter=n_iter, rank=rank, random_state=seed)
        predictions = model.fit_predict(X_train, y_train, X_dev_test)
        rmse = np.sqrt(mean_squared_error(predictions, y_dev_test))
        rmse_test.append(rmse)
        print("rank:{}\trmse:{:.3f}".format(rank, rmse))

    plt.plot(ranks, rmse_test, label="dev test rmse", color="r")
    plt.legend()
    plt.show()
Exemple #9
0
def fm_candidate_columns():
    """Compare five MovieLens feature-column subsets by FM test RMSE.

    Loads the MovieLens frame via ``lens_data.get_lens_data()``, derives
    year columns from the date fields, then for each candidate column set
    vectorizes the features, trains an MCMC FM on standardized ratings,
    and plots the resulting test RMSE as a bar chart (labels A-E).
    """
    lens, _, _, _ = lens_data.get_lens_data()
    # DictVectorizer one-hot encodes string columns, so cast the ids.
    lens['user_id'] = lens['user_id'].astype(str)
    lens['movie_id'] = lens['movie_id'].astype(str)
    # Rating year from 'date'; release year from 'release_date'
    # (presumably formats like YYYY-MM-DD and DD-MM-YYYY — TODO confirm).
    lens['year'] = lens['date'].apply(str).str.split('-').str.get(0)
    lens['release_year'] = lens['release_date'].apply(str).str.split(
        '-').str.get(2)
    candidate_columns = [
        [
            'user_id', 'movie_id', 'release_year', 'age', 'sex', 'year',
            'rating'
        ],  # A
        ['user_id', 'movie_id', 'age', 'sex', 'year', 'rating'],  # B
        ['user_id', 'movie_id', 'sex', 'year', 'rating'],  # C
        ['user_id', 'movie_id', 'age', 'sex', 'rating'],  # D
        ['user_id', 'movie_id', 'rating'],  # E
    ]
    rmse_test = []
    n_iter = 500
    seed = 123
    rank = 8
    for column in candidate_columns:
        filtered_lens = lens[column].dropna()
        v = DictVectorizer()
        # 'rating' is the target, so drop it from the feature dicts.
        X_more_feature = v.fit_transform(
            list(filtered_lens.drop('rating', axis=1).T.to_dict().values()))
        y_more_feature = filtered_lens['rating'].tolist()
        X_mf_train, X_mf_test, y_mf_train, y_mf_test = train_test_split(
            X_more_feature, y_more_feature, test_size=0.1, random_state=42)
        # Standardize the ratings; predictions are inverse-transformed
        # before scoring so RMSE stays in rating units.
        scaler = StandardScaler()
        y_mf_train_norm = scaler.fit_transform(
            np.array(y_mf_train).reshape(-1, 1)).ravel()
        fm = mcmc.FMRegression(n_iter=n_iter, rank=rank, random_state=seed)
        # Allocates and initalizes the model and hyper parameter.
        fm.fit_predict(X_mf_train, y_mf_train_norm, X_mf_test)
        y_pred = fm.fit_predict(X_mf_train, y_mf_train_norm, X_mf_test)
        rmse_test.append(
            np.sqrt(
                mean_squared_error(
                    scaler.inverse_transform(y_pred.reshape(-1, 1)),
                    y_mf_test)))
    print(rmse_test)
    # Plot the RMSE of each candidate column set.
    ind = np.arange(len(rmse_test))
    bar = plt.bar(ind, height=rmse_test)
    plt.xticks(ind, ('A', 'B', 'C', 'D', 'E'))
    plt.ylim((0.88, 0.90))
    plt.show()
Exemple #10
0
def fm_n_iter(X_train, X_dev_test, y_train, y_dev_test):
    """Run fastFM one MCMC step at a time and plot RMSE and hyperparameters.

    :param X_train: training features
    :param X_dev_test: dev-test (validation) features
    :param y_train: training target values (labels)
    :param y_dev_test: dev-test target values (labels)
    :return: None
    """
    # fastFM run parameters.
    n_iter = 300
    step_size = 1
    seed = 123
    rank = 4
    # Initialize the MCMC FM model: n_iter=0 allocates the model and
    # hyperparameters without sampling yet.
    fm = mcmc.FMRegression(n_iter=0, rank=rank, random_state=seed)
    fm.fit_predict(X_train, y_train, X_dev_test)

    rmse_dev_test = []
    rmse_test = []
    # One row per step: alpha, lambda_w, mu_w plus per-rank lambda_V/mu_V
    # entries (3 + 2 * rank columns).
    hyper_param = np.zeros((n_iter - 1, 3 + 2 * rank), dtype=np.float64)
    # Advance the sampler step_size iterations at a time, recording the
    # prediction RMSE and the hyperparameters after each step.
    for nr, i in enumerate(range(1, n_iter)):
        fm.random_state = i * seed
        # Fit (via MCMC warm start) and predict in one call.
        y_pred = fm.fit_predict(X_train,
                                y_train,
                                X_dev_test,
                                n_more_iter=step_size)
        rmse_test.append(np.sqrt(mean_squared_error(y_pred, y_dev_test)))
        hyper_param[nr, :] = fm.hyper_param_
    # The first few samples have not settled yet, so skip them (burn-in).
    values = np.arange(1, n_iter)
    x = values * step_size
    burn_in = 5
    x = x[burn_in:]
    # Plot the RMSE and the hyperparameter traces.
    fig, axes = plt.subplots(nrows=2, ncols=2, sharex=True, figsize=(15, 8))
    axes[0, 0].plot(x, rmse_test[burn_in:], label='dev test rmse', color="r")
    axes[0, 0].legend()
    axes[0, 1].plot(x, hyper_param[burn_in:, 0], label='alpha', color="b")
    axes[0, 1].legend()
    axes[1, 0].plot(x, hyper_param[burn_in:, 1], label='lambda_w', color="g")
    axes[1, 0].legend()
    axes[1, 1].plot(x, hyper_param[burn_in:, 3], label='mu_w', color="g")
    axes[1, 1].legend()
    # Print the std-dev of the dev-test labels as a reference point: the
    # model is informative if its error falls below this value.
    print("np.std(y_dev_test) = {}".format(np.std(y_dev_test)))
    plt.show()
Exemple #11
0
def fastFMJob(data_path, params, N, vectorizer, solver):
    """Cross-validate one fastFM configuration over four train/val folds.

    For each fold ``i`` in 1..4, loads ``train/train_N<N>.<i>`` and
    ``val/val_N<N>.<i>``, vectorizes both with the supplied ``vectorizer``,
    trains the requested ``solver`` ('mcmc', 'als' or 'sgd') with the
    hyperparameters in ``params``, and records the validation RMSE.

    Refactor: the RMSE computation / logging / accumulation that was
    duplicated verbatim in all three solver branches is now shared; only
    model construction and fitting differ per solver.

    :param params: dict with keys 'mi' (n_iter), 'init_stdev', 'f' (rank),
        plus 'l2_reg_w'/'l2_reg_V'/'l2_reg' for als/sgd and 'step_size'
        for sgd.
    :return: mean RMSE across the four folds
    """
    rmses = []
    logging.info("Evaluando con params: {0}".format(params))
    for i in range(1, 4 + 1):
        train_data, y_tr, _ = loadData('train/train_N' + str(N) + '.' + str(i),
                                       data_path=data_path,
                                       with_timestamps=False,
                                       with_authors=False)
        val_data, y_va, _ = loadData('val/val_N' + str(N) + '.' + str(i),
                                     data_path=data_path,
                                     with_timestamps=False,
                                     with_authors=False)
        X_tr = vectorizer.transform(train_data)
        X_va = vectorizer.transform(val_data)

        if solver == "mcmc":
            fm = mcmc.FMRegression(n_iter=params['mi'],
                                   init_stdev=params['init_stdev'],
                                   rank=params['f'],
                                   random_state=123,
                                   copy_X=True)
            # MCMC fits and predicts in a single pass.
            preds = fm.fit_predict(X_tr, y_tr, X_va)
        elif solver == "als":
            fm = als.FMRegression(n_iter=params['mi'],
                                  init_stdev=params['init_stdev'],
                                  rank=params['f'],
                                  random_state=123,
                                  l2_reg_w=params['l2_reg_w'],
                                  l2_reg_V=params['l2_reg_V'],
                                  l2_reg=params['l2_reg'])
            fm.fit(X_tr, y_tr)
            preds = fm.predict(X_va)
        elif solver == "sgd":
            fm = sgd.FMRegression(n_iter=params['mi'],
                                  init_stdev=params['init_stdev'],
                                  rank=params['f'],
                                  random_state=123,
                                  l2_reg_w=params['l2_reg_w'],
                                  l2_reg_V=params['l2_reg_V'],
                                  l2_reg=params['l2_reg'],
                                  step_size=params['step_size'])
            fm.fit(X_tr, y_tr)
            preds = fm.predict(X_va)
        else:
            # Unknown solver: skip the fold, matching the original's
            # behavior of appending nothing.
            continue

        rmse = sqrt(mean_squared_error(y_va, preds))
        logging.info("FM RMSE: {0}. Solver: {1}".format(rmse, solver))
        rmses.append(rmse)
    return mean(rmses)
Exemple #12
0
def predict(train_records, test_records):
    """
    Makes a prediction for the testing set based on the topic probability vector
    of each record and the rating. The topic model is built using the training
    set. This function uses the FastFM Factorization Machines Module for Python

    :param train_records: the training set
    :param test_records: the testing set
    :return: a list with the predictions for the testing set
    """
    all_records = train_records + test_records

    topics = [(i, 1) for i in range(num_topics)]
    feature_matrix, targets = records_to_matrix(all_records, topics)
    print(feature_matrix)

    # Fit the encoder on the combined matrix so both splits share the same
    # one-hot vocabulary for the two categorical id columns.
    encoder = OneHotEncoder(categorical_features=[0, 1], sparse=True)
    encoder.fit(feature_matrix)

    num_train = len(train_records)
    x_train = encoder.transform(feature_matrix[:num_train])
    y_train = targets[:num_train]
    x_test = encoder.transform(feature_matrix[num_train:])

    # First model: MCMC solver, printed only for inspection.
    mc_regressor = mcmc.FMRegression()
    mcmc_predictions = mc_regressor.fit_predict(x_train, y_train, x_test)
    print('********')
    print(x_test.todense())
    print(mcmc_predictions)

    # Second model: ALS solver; these are the predictions returned.
    als_fm = als.FMRegression(
        n_iter=1000, init_stdev=0.1, rank=2, l2_reg_w=0.1, l2_reg_V=0.5)
    als_fm.fit(x_train, y_train)
    y_pred = als_fm.predict(x_test)
    print(y_pred)

    return y_pred
Exemple #13
0
def fastFM_tuning(data_path, N, solver):
    """Greedy one-parameter-at-a-time tuning of a fastFM regressor.

    Each hyperparameter is swept in turn while the others are held at
    their current best values; ``fastFMJob`` supplies the cross-validated
    RMSE for each candidate and ``opt_value`` selects the winner. The
    tuned model is then retrained on the full training data, evaluated on
    the test split, and both the optimal parameters and the full sweep
    history are written under ``TwitterRatings/fastFM/<solver>/clean/``.

    Fixes over the original:
    * ``[1, 5, 8, 10] + range(20, 2020, 20)`` raises ``TypeError`` on
      Python 3 (list + range cannot be concatenated); the range is now
      wrapped in ``list()``.
    * The five near-identical sweep loops and the three near-identical
      result-dumping blocks are factored into shared local code.

    :param data_path: base directory passed through to ``loadData``
    :param N: dataset size suffix used in the data file names
    :param solver: 'mcmc', 'als' or 'sgd'
    :return: dict of the tuned hyperparameter values
    """
    all_data, y_all, _ = loadData("eval_all_N" + str(N) + ".data",
                                  data_path=data_path,
                                  with_timestamps=False,
                                  with_authors=False)
    v = DictVectorizer()
    X_all = v.fit_transform(all_data)

    # Starting values. The regularization knobs only exist for als/sgd,
    # and step_size only for sgd.
    defaults = {'mi': 100, 'init_stdev': 0.1, 'f': 8}
    if solver in ("als", "sgd"):
        defaults.update({'l2_reg_w': 0.1, 'l2_reg_V': 0.1, 'l2_reg': 0})
    if solver == "sgd":
        defaults['step_size'] = 0.1

    results = {param: {} for param in defaults}

    def sweep(param, candidates):
        # Evaluate every candidate for `param` (others fixed at their
        # current best), record the RMSEs, then lock in the optimum.
        for value in candidates:
            defaults[param] = value
            results[param][value] = fastFMJob(data_path=data_path,
                                              params=defaults,
                                              N=N,
                                              vectorizer=v,
                                              solver=solver)
        defaults[param] = opt_value(results=results[param], metric='rmse')

    sweep('mi', [1, 5, 10, 20, 50, 100, 150, 200])
    sweep('f', [1, 5, 8, 10] + list(range(20, 2020, 20)))
    sweep('init_stdev',
          [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0])

    if solver != "mcmc":
        sweep('l2_reg_w',
              [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0])
        sweep('l2_reg_V',
              [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0])
        sweep('l2_reg',
              [0.0, 0.001, 0.003, 0.005, 0.01, 0.02, 0.03, 0.04, 0.05,
               0.07, 0.08, 0.1])

    if solver == "sgd":
        sweep('step_size',
              [0.001, 0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008,
               0.009, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08,
               0.09, 0.1, 0.5])

    # Real testing
    train_data, y_tr, _ = loadData('eval_train_N' + str(N) + '.data',
                                   data_path=data_path,
                                   with_timestamps=False,
                                   with_authors=False)
    test_data, y_te, _ = loadData('test/test_N' + str(N) + '.data',
                                  data_path=data_path,
                                  with_timestamps=False,
                                  with_authors=False)
    X_tr = v.transform(train_data)
    X_te = v.transform(test_data)

    if solver == "mcmc":
        fm = mcmc.FMRegression(n_iter=defaults['mi'],
                               init_stdev=defaults['init_stdev'],
                               rank=defaults['f'],
                               random_state=123,
                               copy_X=True)
        # MCMC fits and predicts in a single pass.
        preds = fm.fit_predict(X_tr, y_tr, X_te)
    elif solver == "als":
        fm = als.FMRegression(n_iter=defaults['mi'],
                              init_stdev=defaults['init_stdev'],
                              rank=defaults['f'],
                              random_state=123,
                              l2_reg_w=defaults['l2_reg_w'],
                              l2_reg_V=defaults['l2_reg_V'],
                              l2_reg=defaults['l2_reg'])
        fm.fit(X_tr, y_tr)
        preds = fm.predict(X_te)
    elif solver == "sgd":
        fm = sgd.FMRegression(n_iter=defaults['mi'],
                              init_stdev=defaults['init_stdev'],
                              rank=defaults['f'],
                              random_state=123,
                              l2_reg_w=defaults['l2_reg_w'],
                              l2_reg_V=defaults['l2_reg_V'],
                              l2_reg=defaults['l2_reg'],
                              step_size=defaults['step_size'])
        fm.fit(X_tr, y_tr)
        preds = fm.predict(X_te)

    rmse = sqrt(mean_squared_error(y_te, preds))
    logging.info("FM RMSE: {0}. Solver: {1}".format(rmse, solver))

    # Persist the winning parameters and the whole sweep history.
    out_dir = 'TwitterRatings/fastFM/' + solver + '/clean/'
    with open(out_dir + 'opt_params.txt', 'w') as f:
        for param in defaults:
            f.write("{param}:{value}\n".format(param=param,
                                               value=defaults[param]))
        f.write("RMSE:{rmse}".format(rmse=rmse))
    with open(out_dir + 'params_rmses.txt', 'w') as f:
        for param in results:
            for value in results[param]:
                f.write("{param}={value}\t : {RMSE}\n".format(
                    param=param, value=value, RMSE=results[param][value]))

    return defaults
Exemple #14
0
    # NOTE(review): this is the body of a function whose signature is not
    # visible in this chunk; it relies on X_train/y_train/X_test/y_test and
    # n_iter from the enclosing scope, and the print below is Python 2
    # syntax. Comments only — the code is untouched.
    rank = 4
    seed = 333
    step_size = 1

    """
    X, y, coef = make_user_item_regression(label_stdev=.4, random_state=seed)
    from sklearn.cross_validation import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=seed)
    X_train = sp.csc_matrix(X_train)
    X_test = sp.csc_matrix(X_test)
    X_test = X_train
    y_test = y_train
    """

    # n_iter=0 allocates the model; iterations are added below via
    # n_more_iter (MCMC warm start).
    fm = mcmc.FMRegression(n_iter=0, rank=rank, random_state=seed)
    # initalize coefs
    fm.fit_predict(X_train, y_train, X_test)

    rmse_test = []
    rmse_new = []
    # One row per step: alpha, lambda_w, mu_w plus per-rank lambda_V/mu_V.
    hyper_param = np.zeros((n_iter -1, 3 + 2 * rank), dtype=np.float64)
    for nr, i in enumerate(range(1, n_iter)):
        # Reseed each step so consecutive chains differ.
        fm.random_state = i * seed
        y_pred = fm.fit_predict(X_train, y_train, X_test, n_more_iter=step_size)
        rmse_test.append(np.sqrt(mean_squared_error(y_pred, y_test)))
        hyper_param[nr, :] = fm.hyper_param_

    print '------- restart ----------'
    # Second experiment: retrain from scratch per iteration count
    # (fragment is truncated here in this chunk).
    values = np.arange(1, n_iter)
    rmse_test_re = []
Exemple #15
0
from sklearn.metrics import mean_squared_error
from fastFM import mcmc
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split

# Rank-sweep script: trains an MCMC FM for each candidate rank and plots
# the dev-test RMSE. NOTE(review): X_train/y_train/X_dev_test/y_dev_test
# and plt must be defined earlier in the full script — not visible here.
n_iter = 100
seed = 333

rmse_test = []
# Candidate ranks: 4, 8, 16, 32, 64.
ranks = [4, 8, 16, 32, 64]

# Fit and predict at each rank, collecting dev-test RMSE.
for rank in ranks:
    # Fix: the keyword was misspelled "n_ter", which fastFM's
    # sklearn-style constructor rejects with a TypeError.
    fm = mcmc.FMRegression(n_iter=n_iter, rank=rank, random_state=seed)
    y_pred = fm.fit_predict(X_train, y_train, X_dev_test)
    rmse = np.sqrt(mean_squared_error(y_pred, y_dev_test))
    rmse_test.append(rmse)
    print('rank:{}\trmse:{:.3f}'.format(rank, rmse))

# Plot the RMSE for each rank.
plt.plot(ranks, rmse_test, label='dev test rmse', color="r")
plt.legend()
Exemple #16
0
def _build_mcmc_model(param):
    """Construct an MCMC ``FMRegression`` from a parameter dict.

    ``param`` must provide 'n_iter', 'init_stdev', 'rank' and
    'random_state'.
    """
    return mcmc.FMRegression(
        n_iter=param['n_iter'],
        init_stdev=param['init_stdev'],
        rank=param['rank'],
        random_state=param['random_state'],
    )
Exemple #17
0
# NOTE(review): Python 2 script fragment. It relies on names defined
# earlier in the full script (X_merge_hot, X_origin, v, train, XGBClassifier,
# joblib, pd, math) that are not visible in this chunk. Comments only —
# the code is untouched.

# Split the pre-encoded one-hot matrix back into its train/test halves.
train_hot = X_merge_hot[0:train.shape[0]]
test_hot = X_merge_hot[train.shape[0]:X_merge_hot.shape[0]]

X = v.fit_transform(X_origin)
y = np.array(train.loc[:, ['score']]).flatten()
X_train, X_test, y_train, y_test = train_test_split(X, y)
print "start fit"
#fm = als.FMRegression(n_iter=1000, init_stdev=0.1, rank=2, l2_reg_w=0.1, l2_reg_V=0.5)
# Baseline: gradient-boosted classifier on the same split.
xgbc = XGBClassifier()
xgbc.fit(X_train, y_train)
print 'The accuracy of eXtreme Gradient Boosting Classifier on testing set:', xgbc.score(
    X_test, y_test)

# MCMC factorization machine; fit and predict in one call.
fm = mcmc.FMRegression(n_iter=100,
                       init_stdev=0.1,
                       rank=8,
                       random_state=123,
                       copy_X=True)
#y_pred = fm.fit_predict(train_hot, y,test_hot)
y_pred = fm.fit_predict(X_train, y_train, X_test)
# Persist the fitted model for later reuse.
joblib.dump(fm, "fast_fm_model_mcmc.m")
print "start predict"
#y_pred = fm.predict(test_hot)

# Dump the predictions and report MSE / RMSE on the held-out split.
df_fm = pd.DataFrame(y_pred, columns=['score'])
df_fm.to_csv("fast_fm_result_mcmc.csv", index=False)
print y_pred.shape
from sklearn.metrics import mean_squared_error
print 'mse:', mean_squared_error(y_test, y_pred)

R_real = math.sqrt(mean_squared_error(y_test, y_pred))
Exemple #18
0
from sklearn.metrics import mean_squared_error
from fastFM import mcmc
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split

# NOTE(review): script fragment — seed, X_train/y_train/X_test/y_test and
# lens come from earlier context not visible in this chunk. Comments only.

# Baseline FM on the raw targets (RMSE computed but not printed here).
fm = mcmc.FMRegression(n_iter=300, rank=32, random_state=seed)
y_pred = fm.fit_predict(X_train, y_train, X_test)
np.sqrt(mean_squared_error(y_pred, y_test))

from sklearn.preprocessing import StandardScaler

# Same model on standardized targets; predictions are mapped back to the
# original scale before scoring.
scaler = StandardScaler()
y_train_norm = scaler.fit_transform(y_train.reshape(-1, 1)).ravel()
fm = mcmc.FMRegression(n_iter=300, rank=32, random_state=seed)
y_pred = fm.fit_predict(X_train, y_train_norm, X_test)
np.sqrt(mean_squared_error(scaler.inverse_transform(y_pred), y_test))

# Derive feature columns from the MovieLens frame; ids become strings so
# DictVectorizer one-hot encodes them. The two pairs below are duplicated
# verbatim in the original source (the repetition is a no-op).
lens['user_id'] = lens['user_id'].astype(str)
lens['movie_id'] = lens['movie_id'].astype(str)
lens['year'] = lens['date'].apply(str).str.split('-').str.get(0)
lens['release_year'] = lens['release_date'].apply(str).str.split('-').str.get(2)
lens['year'] = lens['date'].apply(str).str.split('-').str.get(0)
lens['release_year'] = lens['release_date'].apply(str).str.split('-').str.get(2)


candidate_columns = [
    ['user_id', 'movie_id', 'release_year', 'age', 'sex', 'year', 'rating'], #A
    ['user_id', 'movie_id', 'age', 'sex', 'year', 'rating'], #B
    ['user_id', 'movie_id', 'sex', 'year', 'rating'], #C
    ['user_id', 'movie_id', 'age', 'sex', 'rating'], #D