def train_fm_model():
    """Fit an MCMC factorization machine on the Avazu CV split and pickle it.

    Returns the fitted ``mcmc.FMClassification`` model after logging its
    validation AUC and log-loss.
    """
    X_train, y_train = load_processed_data(
        pathify('data', 'processed', 'avazu-cv-train.csv'), label_col='click')
    X_val, y_val = load_processed_data(
        pathify('data', 'processed', 'avazu-cv-val.csv'), label_col='click')

    # One-hot encode categoricals; categories unseen at fit time become
    # all-zero rows instead of raising.
    encoder = OneHotEncoder(handle_unknown='ignore').fit(X_train)
    X_train = csr_matrix(encoder.transform(X_train))
    X_val = csr_matrix(encoder.transform(X_val))

    # fastFM classification expects labels in {-1, +1}.
    y_train[y_train == 0] = -1
    y_val[y_val == 0] = -1
    y_train = np.array(y_train)
    y_val = np.array(y_val)

    fm = mcmc.FMClassification(n_iter=50, init_stdev=0.1,
                               random_state=123, rank=2)
    # MCMC solver fits and predicts in a single pass.
    y_pred = fm.fit_predict_proba(X_train, y_train, X_val)

    auc_score = cal_auc(y_val, y_pred)
    log.info("auc_score: {:.4f}".format(auc_score))
    log_loss = cal_logloss(y_val, y_pred)
    log.info("log_loss: {:.4f}".format(log_loss))

    save_pickle(fm, pathify('models', 'avazu-fm.pickle'))
    return fm
def test_clone():
    """sklearn.base.clone must reproduce every fastFM hyper-parameter."""
    from sklearn.base import clone

    for estimator in (mcmc.FMRegression(), mcmc.FMClassification()):
        duplicate = clone(estimator)
        assert estimator.get_params() == duplicate.get_params()
def test_fm_classification_proba():
    """Thresholding fit_predict_proba at 0.5 should reproduce fit_predict labels."""
    w0, w, V, y, X = get_test_problem()

    # Binarize the regression target into {-1, +1} — an easier problem than
    # the default one.
    y_labels = np.ones_like(y)
    y_labels[y < np.mean(y)] = -1

    fm = mcmc.FMClassification(n_iter=1000, init_stdev=0.1, rank=2)
    proba = fm.fit_predict_proba(X, y_labels, X)
    labels = fm.fit_predict(X, y_labels, X)

    # Map probabilities to hard {-1, +1} labels.
    proba[proba < .5] = -1
    proba[proba != -1] = 1
    assert_array_equal(labels, proba)
def test_linear_fm_classification():
    """A rank-0 (purely linear) FM should still separate the easy problem well."""
    w0, w, V, y, X = get_test_problem()

    # Binarize the regression target into {-1, +1} — an easier problem than
    # the default one.
    y_labels = np.ones_like(y)
    y_labels[y < np.mean(y)] = -1

    fm = mcmc.FMClassification(n_iter=1000, init_stdev=0.1, rank=0)
    scores = fm.fit_predict_proba(X, y_labels, X)

    fpr, tpr, thresholds = metrics.roc_curve(y_labels, scores)
    assert metrics.auc(fpr, tpr) > 0.95

    # Smoke check: predict() works on a slice after the MCMC fit.
    y_pred = fm.predict(X[:2, ])
def __init__(self, learning_method='mcmc', num_iter=100, init_stdev=0.1, k2=8, learn_rate=0, r0_regularization=0.1, r1_regularization=0.1, r2_regularization=0.1, seed=123, model_path=None):
    """Build the underlying fastFM classifier for the requested solver.

    learning_method: one of 'mcmc', 'als', 'sgd' (case-insensitive).
    num_iter/init_stdev/k2/seed: shared fastFM hyper-parameters (rank == k2).
    learn_rate: SGD step size (ignored by MCMC/ALS).
    r0/r1/r2_regularization: L2 penalties for bias / linear / pairwise terms
        (ignored by MCMC, which integrates them out).
    model_path: optional location for persisting the model.

    Raises TypeError for an unrecognized learning_method.
    """
    method = learning_method.upper()
    # Hyper-parameters common to every fastFM solver.
    common = dict(n_iter=num_iter, init_stdev=init_stdev, rank=k2,
                  random_state=seed)
    # Regularization knobs shared by the ALS and SGD solvers.
    reg = dict(l2_reg=r0_regularization, l2_reg_w=r1_regularization,
               l2_reg_V=r2_regularization)
    if method == 'MCMC':
        self.fm = mcmc.FMClassification(**common)
    elif method == 'ALS':
        self.fm = als.FMClassification(**dict(common, **reg))
    elif method == 'SGD':
        self.fm = sgd.FMClassification(step_size=learn_rate,
                                       **dict(common, **reg))
    else:
        raise TypeError('method should be one of {sgd, als, mcmc}')
    self.__method = method  # __* means private attribute (name-mangled)
    self.__model_path = model_path
def demo_fastfm():
    """Smoke-test fastFM MCMC classification on the module-level train/test split."""
    clf = mcmc.FMClassification(n_iter=100, init_stdev=0.1, rank=16,
                                random_state=123, copy_X=True)
    # fastFM wants sparse feature matrices and a numpy label array.
    y_pred = clf.fit_predict_proba(sparse.csr_matrix(X_tr),
                                   np.array(y_tr),
                                   sparse.csr_matrix(X_te))
    print(y_pred, type(y_pred))
# Report MSE of the regression predictions computed earlier (out of view).
print('mse:', mean_squared_error(y_test, y_pred))

import numpy as np

# Convert dataset to binary classification task.
# fastFM classifiers expect labels in {-1, +1}.
y_labels = np.ones_like(y)
y_labels[y < np.mean(y)] = -1
X_train, X_test, y_train, y_test = train_test_split(X, y_labels)

# SGD solver: conventional fit / predict / predict_proba API.
from fastFM import sgd
fm = sgd.FMClassification(n_iter=1000, init_stdev=0.1, l2_reg_w=0,
                          l2_reg_V=0, rank=2, step_size=0.1)
fm.fit(X_train, y_train)
y_pred = fm.predict(X_test)
y_pred_proba = fm.predict_proba(X_test)
from sklearn.metrics import accuracy_score, roc_auc_score
print('acc:', accuracy_score(y_test, y_pred))
print('auc:', roc_auc_score(y_test, y_pred_proba))

# MCMC solver: fits and predicts in a single pass (no separate fit()).
from fastFM import mcmc
fm = mcmc.FMClassification(n_iter=1000, rank=2, init_stdev=0.1)
y_pred = fm.fit_predict(X_train, y_train, X_test)
y_pred_proba = fm.fit_predict_proba(X_train, y_train, X_test)
from sklearn.metrics import accuracy_score, roc_auc_score
print('acc:', accuracy_score(y_test, y_pred))
print('auc:', roc_auc_score(y_test, y_pred_proba))
## Initial test of MCMC without hyper-parameter optimization (kept for reference):
# fm.fit(trainX, trainY)
# testY = fm.predict(testX)
# print(testY)
# y_pred = fm.fit_predict(trainX, trainY, testX)
# y_pred_proba = fm.fit_predict_proba(trainX, trainY, testX)

fm = None

# Hyper-parameter grids explored during tuning:
# n_iter = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 500, 1000, 2000, 3000]
# rank_iter = [1, 2, 3, ..., 25, 50]
# stdev_iter = [0, 0.05, 0.1, 0.5, 1.0]

# Fit the tuned MCMC factorization machine and score the test split.
fm = mcmc.FMClassification(n_iter=80, rank=25, init_stdev=0.1)
y_pred = fm.fit_predict(trainX, trainY, testX)
y_pred_proba = fm.fit_predict_proba(trainX, trainY, testX)

# AUC on the held-out validation fold.
y_pred_proba_auc = fm.fit_predict_proba(X_t, y_t, X_val)
fpr, tpr, thresholds = metrics.roc_curve(y_val, y_pred_proba_auc, pos_label=1)
auc = metrics.auc(fpr, tpr)
# BUG FIX: the original `print "AUC: " %(auc)` raised TypeError at runtime —
# the format string has no conversion specifier. Also use print(...) so the
# line parses under both Python 2 and 3.
print("AUC: %f" % auc)

# submission = pd.DataFrame({'id': test_df['id'], 'ACTION': y_pred_proba_auc})
# submission.to_csv('/Users/admin/Dropbox/EE379K/project/ee379k_project/submissions/submission_FMoptimized.csv', index = False)
print("Saving results at submission_FMoptimized.csv")
# coding:utf-8
"""Train a fastFM MCMC classifier on XGBoost-derived feature CSVs and print
the validation click probabilities."""
import pandas as pd
import numpy as np
from fastFM import mcmc

train = pd.read_csv('../data/dup/train_xgb11U.csv')
valid = pd.read_csv('../data/dup/valid_xgb11U.csv')
test = pd.read_csv('../data/dup/test_xgb11U.csv')

# Missing feature values default to zero.
train.fillna(0, inplace=True)
valid.fillna(0, inplace=True)
test.fillna(0, inplace=True)

# Split the label column away from each feature frame (pop = select + drop).
train_Y = train.pop('label')
valid_Y = valid.pop('label')
test_Y = test.pop('label')

fm = mcmc.FMClassification(n_iter=50, random_state=133)
# NOTE(review): fastFM generally expects scipy.sparse feature matrices;
# confirm this fastFM version accepts a dense DataFrame here.
y = fm.fit_predict_proba(train, train_Y, valid)
# print(...) form parses under both Python 2 and 3 (original was `print y`).
print(y)