def ffm_model():
    """Fit an FFM model on the training frame and write test-set scores to disk.

    The names ``process``, ``train``, ``test``, ``ffm`` and ``pd`` are not
    defined here and must be provided by the enclosing module.

    Side effects:
        * adds a ``predicted_score`` column to the module-level ``test`` frame;
        * reads ``input/test.txt``;
        * writes ``result/result0422_ffm.txt`` (space separated, no index).
    """
    X = process(train)
    Y = train['is_trade']
    # X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=0)

    ffm_data = ffm.FFMData(X, Y)
    # Validation data is a fixed row slice of the same X/Y used for fitting,
    # so it overlaps the training data.
    ffm_data_test = ffm.FFMData(X[418028:468028], Y[418028:468028])

    model = ffm.FFM(eta=0.1, lam=0.0001, k=4)
    # NOTE(review): `maximum=True` is passed together with a loss metric;
    # confirm against the ffm wrapper that this is the intended early-stopping
    # direction for 'logloss'.
    model.fit(ffm_data, num_iter=200, val_data=ffm_data_test, metric='logloss',
              early_stopping=6, maximum=True)

    t = process(test)
    ffm_test = ffm.FFMData(t)
    pred = model.predict_proba(ffm_test)
    test['predicted_score'] = pred
    sub1 = test[['instance_id', 'predicted_score']]

    # Raw string: "\s" in a normal string literal is an invalid escape sequence.
    sub = pd.read_csv("input/test.txt", sep=r"\s+")
    # Left merge keeps every instance_id from the input file; any value that is
    # missing afterwards (including absent predictions) is filled with 0.
    sub = pd.merge(sub, sub1, on=['instance_id'], how='left')
    sub = sub.fillna(0)
    sub[['instance_id', 'predicted_score']].to_csv('result/result0422_ffm.txt', sep=" ", index=False)
def ffm(df_train, category_features):
    """Train an FFM model for a few iterations and print train/test log loss.

    Args:
        df_train: frame containing at least the ``day`` and ``is_trade``
            columns plus the feature columns.
        category_features: names of columns to cast to ``str`` before the
            frame is handed to ``FFMFormatPandas``.

    Rows with ``18 <= day <= 23`` define the size of the training split; the
    encoded data is then cut positionally at that row count.
    NOTE(review): this assumes ``df_train`` is ordered so that those rows come
    first - confirm against the caller.

    ``FFMFormatPandas`` and ``log_loss`` must be provided by the enclosing
    module. Nothing is returned; metrics are printed once per iteration.
    """
    # Local import: inside this body the name `ffm` refers to the library
    # module, not to this function.
    import ffm

    train = df_train[(df_train['day'] >= 18) & (df_train['day'] <= 23)]
    excluded = [
        'is_trade', 'item_category_list', 'item_property_list',
        'predict_category_property', 'instance_id', 'realtime',
        'context_timestamp',
    ]
    col = [c for c in train if c not in excluded]

    # Work on a copy so the string casts below do not modify the caller's frame.
    raw_ffm_data = df_train.copy()
    for cols in category_features:
        raw_ffm_data[cols] = raw_ffm_data[cols].astype(str)

    data_ffm = FFMFormatPandas(raw_ffm_data[col])
    data_ffm_y = raw_ffm_data['is_trade'].tolist()

    # Positional split: the first `train_num` rows train, the rest evaluate.
    train_num = train[col].shape[0]
    X_train_ffm = data_ffm[:train_num]
    X_test_ffm = data_ffm[train_num:]
    y_train_ffm = data_ffm_y[:train_num]
    y_test_ffm = data_ffm_y[train_num:]

    ffm_train = ffm.FFMData(X_train_ffm, y_train_ffm)
    ffm_test = ffm.FFMData(X_test_ffm, y_test_ffm)

    n_iter = 5
    ffmmodel = ffm.FFM(eta=0.02, lam=0.0001, k=6)
    ffmmodel.init_model(ffm_train)
    for i in range(n_iter):
        print('iteration %d : ' % i)
        ffmmodel.iteration(ffm_train)
        y_pred = ffmmodel.predict(ffm_test)
        t_pred = ffmmodel.predict(ffm_train)
        logloss = log_loss(y_test_ffm, y_pred)
        t_logloss = log_loss(y_train_ffm, t_pred)
        print('train log_loss %.4f' % (t_logloss), end='\t')
        print('test log_loss %.4f' % (logloss))
def fit(self, X, y):
    """Train an FFM model on ``X``/``y`` and return it.

    :param X: samples in (field, index, value) format
    :param y: labels, 0 or 1
    :return: the trained ``ffm.FFM`` model

    The model is built from ``self.eta``, ``self.l2`` and ``self.factor`` and
    trained for ``self.n_iter`` iterations with a ``tqdm`` progress bar. The
    trained model is also stored on ``FFM._model``.
    NOTE(review): ``FFM`` is presumably the enclosing class, which would make
    this a class-level attribute shared by all instances - confirm.
    """
    training_set = ffm.FFMData(X, y)
    trained = ffm.FFM(self.eta, self.l2, self.factor)
    trained.init_model(training_set)

    for _ in tqdm(range(self.n_iter)):
        trained.iteration(training_set)

    FFM._model = trained
    return trained
def build_model(train_X, train_y, test_X, test_y):
    """Build an FFM model and train it on the given training data.

    The model is configured from the module-level ``MODEL_PARAMETERS`` and
    trained for ``ITERATIONS`` iterations.

    Returns:
        A tuple ``(model, 1, 1)``. The two trailing values are placeholders
        standing in for the train and test ROC AUC (see the TODO below).
    """
    fit_data = ffm.FFMData(train_X, train_y)
    # Still constructed although unused while the metric computation below is
    # disabled.
    eval_data = ffm.FFMData(test_X, test_y)

    model = ffm.FFM(**MODEL_PARAMETERS)
    model.init_model(fit_data)
    for _ in range(ITERATIONS):
        model.iteration(fit_data)

    # TODO temporary fix. replace this line of code with a commented line
    # return model, roc_auc_score(train_y, model.predict(train_ffm_data)), roc_auc_score(test_y, model.predict(test_ffm_data))
    placeholder_train_auc, placeholder_test_auc = 1, 1
    return model, placeholder_train_auc, placeholder_test_auc
def ffm_test(ffmdata, data):
    """Train an FFM on a 70/30 split and print log loss after every iteration.

    Args:
        ffmdata: samples passed to ``train_test_split`` and ``ffm.FFMData``.
        data: frame whose ``is_trade`` column supplies the labels.

    ``train_test_split``, ``ffm`` and ``log_loss`` must be provided by the
    enclosing module. Nothing is returned.
    """
    # FFM
    X_train, X_test, y_train, y_test = train_test_split(
        ffmdata, data['is_trade'].values, test_size=0.3, random_state=888)

    n_iter = 20
    ffm_train = ffm.FFMData(X_train, y_train)
    ffm_valid = ffm.FFMData(X_test, y_test)
    model = ffm.FFM(eta=0.05, lam=0.01, k=10)
    model.init_model(ffm_train)

    for i in range(n_iter):
        model.iteration(ffm_train)
        train_pred = model.predict(ffm_train)
        test_pred = model.predict(ffm_valid)
        train_log = log_loss(y_train, train_pred)
        test_log = log_loss(y_test, test_pred)
        # The values are log losses, so label them as such (they were
        # previously printed under "auc" labels).
        print('iteration_%d: ' % i,
              'train_logloss %.4f' % train_log,
              'test_logloss %.4f' % test_log)
def ffm_0():
    """Train an FFM on a three-sample toy set and print the training log loss.

    ``ffm`` and ``log_loss`` must be provided by the enclosing module.
    Nothing is returned.
    """
    # (field, index, value) triples, one list per sample
    X = [
        [(1, 2, 1), (2, 3, 1), (3, 5, 1)],
        [(1, 0, 1), (2, 3, 1), (3, 7, 1)],
        [(1, 1, 1), (2, 3, 1), (3, 7, 1), (3, 9, 1)],
    ]
    y = [1, 1, 0]
    ffm_data = ffm.FFMData(X, y)

    # train the model for 10 iterations
    n_iter = 10
    model = ffm.FFM(eta=0.1, lam=0.0001, k=4)
    model.init_model(ffm_data)
    for i in range(n_iter):
        print('iteration %d, ' % i, end='')
        model.iteration(ffm_data)
        y_pred = model.predict(ffm_data)
        train_logloss = log_loss(y, y_pred)
        # The metric is log loss, so it is labelled as such rather than "auc".
        print('train log_loss %.4f' % train_logloss)
def main():
    """Load the data, build a formatted text column, then train and predict.

    ``load_data``, ``read_libffm_file`` and ``ffm`` must be provided by the
    enclosing module. Results are printed; nothing is returned.
    """
    # load data
    train_set, validation_set, test_set, features = load_data()
    print(train_set)

    # train_set.save('no-header.csv', format='csv')
    # train_set = gl.SFrame.read_csv('no-header.csv', delimiter=',', verbose=False, column_type_hints=str)
    # train_set = train_set.add_row_number()

    def transform_row(row):
        # One "<id>:<column>:<value>" token for every column except 'id'.
        return [':'.join([str(row['id']), str(k), v]) for k, v in row.items() if k != 'id']

    train_set['formatted_data'] = train_set.apply(lambda row: ' '.join(sorted(transform_row(row))))
    print(train_set['formatted_data'])

    # train_set['answer'].save('output')
    # train_set['answer'].save('training_set.csv', format='csv')
    # train_set['answer'].export_csv('output.csv', delimiter=' ', header=False, line_terminator='\n')

    # trainfile = 'no-header.csv'
    print("converting")
    # NOTE(review): `trainfile` is not assigned inside this function (its
    # assignment above is commented out); unless it exists at module level
    # this line raises NameError - confirm.
    train = read_libffm_file(trainfile)
    print(train)

    # Train a model
    m = ffm.FFM()
    # NOTE(review): `features` is passed both as the second positional
    # argument and as the `features` keyword, and is also what `predict`
    # receives below - verify against the wrapper's `fit`/`predict` signatures.
    m.fit(train, features, target='click', features=features, nr_iters=10)
    yhat = m.predict(features)
    # Function-call form, consistent with the other prints in this function.
    print(yhat)
def fit(self, X_trn, y_trn, X_val, y_val, model_path=None):
    """Train an FFM one iteration at a time with AUC-based early stopping.

    Training continues while the most recent validation AUC equals the best
    validation AUC seen so far and fewer than ``self.nb_epochs_max`` such
    epochs have been counted. Whenever the validation AUC is at its best and
    ``model_path`` is truthy, the model is saved to that path.

    Args:
        X_trn, y_trn: training samples and labels.
        X_val, y_val: validation samples and labels.
        model_path: optional path handed to ``save_model``.

    Returns:
        ``(auc_trn_max, auc_val_max, nb_epochs)``: the best training AUC, the
        best validation AUC, and the number of counted epochs (held as a
        float, since the counter starts at ``0.``).
    """
    log = logging.getLogger(str(self))

    model = ffmlib.FFM(eta=self.learning_rate, lam=self.reg, k=self.factor_size)
    trn_data = ffmlib.FFMData(X_trn, y_trn)
    val_data = ffmlib.FFMData(X_val, y_val)
    model.init_model(trn_data)

    best_trn = best_val = last_val = epochs = 0.
    while last_val == best_val and epochs < self.nb_epochs_max:
        started = time()
        model.iteration(trn_data)
        elapsed = time() - started

        trn_auc = roc_auc_score(y_trn, model.predict(trn_data))
        last_val = roc_auc_score(y_val, model.predict(val_data))
        log.info('AUC trn: %.3lf AUC val: %.3lf (%.3lf seconds)' % (trn_auc, last_val, elapsed))

        best_trn = max(trn_auc, best_trn)
        best_val = max(last_val, best_val)

        # True when this epoch matched or set the best validation AUC.
        at_best = last_val == best_val
        epochs += int(at_best)
        if at_best and model_path:
            log.info('Saving %s' % model_path)
            model.save_model(model_path)

    # Drop the references to the model and data before returning.
    del model, trn_data, val_data
    return best_trn, best_val, epochs
def main():
    """Convert two libffm text files, save them, then train an FFM and predict.

    Reads ``lib/bigdata.tr.txt`` and ``lib/bigdata.te.txt`` through
    ``read_libffm_file``, writes ``examples/small.tr.sframe`` and
    ``examples/small.te.sframe``, trains on every column except ``y`` and
    prints the predictions for the validation data.

    ``read_libffm_file`` and ``ffm`` must be provided by the enclosing module.
    """
    # # load data
    # train_set, validation_set, test_set, features = load_data()
    # # train_set['click'] = train_set['click'].astype(int)
    # print(train_set['click'])
    # # Train a model
    # m = ffm.FFM()
    # m.fit(train_set, features, target='click', features=features, nr_iters=25)
    # yhat = m.predict(features)
    # print yhat
    ########################################################################################################################
    trainfile = 'lib/bigdata.tr.txt'
    validfile = 'lib/bigdata.te.txt'
    train = read_libffm_file(trainfile)
    valid = read_libffm_file(validfile)
    print(train)

    train['y'] = train['y'].astype(int)
    del train['features.0']
    # Give the validation frame the same columns as the training frame.
    valid = valid[train.column_names()]

    train.save('examples/small.tr.sframe')
    valid.save('examples/small.te.sframe')

    features = [c for c in train.column_names() if c != 'y']

    # Train a model
    m = ffm.FFM()
    m.fit(train, valid, target='y', features=features, nr_iters=15)
    yhat = m.predict(valid)
    # Function-call form, consistent with `print(train)` above.
    print(yhat)
# Script fragment: encode the frame for FFM, split it positionally, train for
# a few iterations and print train/test log loss after each one.
# `category_features`, `raw_ffm_data`, `col`, `X`, `FFMFormatPandas` and
# `log_loss` come from code outside this fragment.

# Cast the categorical columns to strings (modifies `raw_ffm_data` in place).
for cols in category_features:
    raw_ffm_data[cols] = raw_ffm_data[cols].astype(str)

data_ffm = FFMFormatPandas(raw_ffm_data[col])
data_ffm_y = raw_ffm_data['is_trade'].tolist()

# The first `train_num` rows are the training part, the remainder the test part.
train_num = X.shape[0]
X_train_ffm, X_test_ffm = data_ffm[:train_num], data_ffm[train_num:]
y_train_ffm, y_test_ffm = data_ffm_y[:train_num], data_ffm_y[train_num:]

import ffm

ffm_train = ffm.FFMData(X_train_ffm, y_train_ffm)
ffm_test = ffm.FFMData(X_test_ffm, y_test_ffm)

n_iter = 5
ffmmodel = ffm.FFM(eta=0.02, lam=0.0001, k=6)
ffmmodel.init_model(ffm_train)

for i in range(n_iter):
    print('iteration %d : ' % i)
    ffmmodel.iteration(ffm_train)

    y_pred = ffmmodel.predict(ffm_test)
    t_pred = ffmmodel.predict(ffm_train)

    # AUC variants kept for reference:
    #   auc = roc_auc_score(y_test_ffm, y_pred)
    #   t_auc = roc_auc_score(y_train_ffm, t_pred)
    logloss = log_loss(y_test_ffm, y_pred)
    t_logloss = log_loss(y_train_ffm, t_pred)

    print('train log_loss %.4f' % (t_logloss), end='\t')
    print('test log_loss %.4f' % (logloss))
# Toy example: fit an FFM with early stopping, then save and reload the model.
# `ffm` comes from code outside this fragment.

# Each sample is a list of (field, index, value) triples.
X = [
    [(1, 2, 1), (2, 3, 1), (3, 5, 1)],
    [(1, 0, 1), (2, 3, 1), (3, 7, 1)],
    [(1, 1, 1), (2, 3, 1), (3, 7, 1), (3, 9, 1)],
    [(1, 0, 1), (2, 3, 1), (3, 5, 1)],
]
y = [1, 1, 0, 1]

# The same samples serve as both training and validation data here.
ffm_data = ffm.FFMData(X, y)
ffm_data_test = ffm.FFMData(X, y)

model = ffm.FFM(eta=0.1, lam=0.0001, k=4)
model.fit(
    ffm_data,
    num_iter=10,
    val_data=ffm_data_test,
    metric='auc',
    early_stopping=6,
    maximum=True,
)
print(model.predict_proba(ffm_data_test))

# Write the model to disk and read it back before predicting again.
model.save_model('result/ololo.bin')
model = ffm.read_model('result/ololo.bin')
print(model.predict(ffm_data_test))
import ffm
import graphlab as gl
from convert import read_libffm_file

# Load the frames written by examples/criteo_process.py.
train = gl.SFrame('criteo_train_transformed')
valid = gl.SFrame('criteo_valid_transformed')

# Only dictionary-typed columns are currently supported as features.
features = []
for c in train.column_names():
    if train[c].dtype() == dict:
        features.append(c)

# Fit the model on the training frame, with `valid` as the second argument.
m = ffm.FFM()
m.fit(train, valid, target='X1', features=features, nr_iters=15)

# Score the validation frame.
yhat = m.predict(valid)