def ffm(df_train, category_features):
    """Train an FFM model and print train/test log loss after every iteration.

    Rows of ``df_train`` whose ``day`` lies in [18, 23] determine the size of
    the training part; the split itself is positional (the first ``train_num``
    converted rows are used for training, the remainder for testing).

    :param df_train: DataFrame holding a ``day`` column, the ``is_trade``
        target and the feature columns.
    :param category_features: column names that are cast to ``str`` before
        the data is converted with ``FFMFormatPandas``.
    :return: None; metrics are printed to stdout.
    """
    # Local import: inside this function the name ``ffm`` must refer to the
    # FFM library, not to this function itself.
    import ffm

    excluded = {
        'is_trade', 'item_category_list', 'item_property_list',
        'predict_category_property', 'instance_id', 'realtime',
        'context_timestamp',
    }
    train = df_train[(df_train['day'] >= 18) & (df_train['day'] <= 23)]
    col = [c for c in train if c not in excluded]

    # Work on a copy so the caller's DataFrame keeps its original dtypes;
    # previously the string casts below were written straight into df_train.
    raw_ffm_data = df_train.copy()
    for feature in category_features:
        raw_ffm_data[feature] = raw_ffm_data[feature].astype(str)

    data_ffm = FFMFormatPandas(raw_ffm_data[col])
    data_ffm_y = raw_ffm_data['is_trade'].tolist()

    # NOTE(review): the positional split assumes the day 18-23 rows are the
    # leading rows of df_train -- confirm against the caller.
    train_num = train.shape[0]
    X_train_ffm = data_ffm[:train_num]
    X_test_ffm = data_ffm[train_num:]
    y_train_ffm = data_ffm_y[:train_num]
    y_test_ffm = data_ffm_y[train_num:]

    ffm_train = ffm.FFMData(X_train_ffm, y_train_ffm)
    ffm_test = ffm.FFMData(X_test_ffm, y_test_ffm)

    n_iter = 5
    ffmmodel = ffm.FFM(eta=0.02, lam=0.0001, k=6)
    ffmmodel.init_model(ffm_train)
    for i in range(n_iter):
        print('iteration %d : ' % i)
        ffmmodel.iteration(ffm_train)
        y_pred = ffmmodel.predict(ffm_test)
        t_pred = ffmmodel.predict(ffm_train)
        logloss = log_loss(y_test_ffm, y_pred)
        t_logloss = log_loss(y_train_ffm, t_pred)
        print('train log_loss %.4f' % (t_logloss), end='\t')
        print('test log_loss %.4f' % (logloss))
def ffm_model():
    """Fit an FFM model on the global ``train`` frame and write a submission.

    Reads the module-level ``train`` and ``test`` objects, converts them with
    ``process``, fits the model with early stopping, stores the predictions in
    ``test['predicted_score']`` and writes ``instance_id``/``predicted_score``
    pairs to ``result/result0422_ffm.txt``.

    :return: None
    """
    X = process(train)
    Y = train['is_trade']

    ffm_data = ffm.FFMData(X, Y)
    # NOTE(review): the validation set is a hard-coded slice of the rows that
    # are also used for training, so the early-stopping metric is measured on
    # data the model has seen. A held-out split (e.g. train_test_split) was
    # previously considered here.
    ffm_data_test = ffm.FFMData(X[418028:468028], Y[418028:468028])

    model = ffm.FFM(eta=0.1, lam=0.0001, k=4)
    # NOTE(review): ``maximum=True`` is combined with ``metric='logloss'``;
    # confirm against the ffm library that this is the intended direction.
    model.fit(
        ffm_data,
        num_iter=200,
        val_data=ffm_data_test,
        metric='logloss',
        early_stopping=6,
        maximum=True,
    )

    t = process(test)
    ffm_test = ffm.FFMData(t)
    pred = model.predict_proba(ffm_test)
    test['predicted_score'] = pred
    sub1 = test[['instance_id', 'predicted_score']]

    # Raw string: "\s" in a normal literal is an invalid escape sequence.
    sub = pd.read_csv("input/test.txt", sep=r"\s+")
    sub = pd.merge(sub, sub1, on=['instance_id'], how='left')
    # Instances without a prediction after the left join get a score of 0.
    sub = sub.fillna(0)
    sub[['instance_id', 'predicted_score']].to_csv(
        'result/result0422_ffm.txt', sep=" ", index=False)
def build_model(train_X, train_y, test_X, test_y):
    """
    Build an FFM model and train it on the given train dataset.

    The model is configured from MODEL_PARAMETERS and trained for ITERATIONS
    passes over the train data. The two scores in the returned tuple are
    currently the placeholder value 1 (see the TODO below).
    """
    train_ffm_data = ffm.FFMData(train_X, train_y)
    # Built but not evaluated while the score computation below is disabled.
    test_ffm_data = ffm.FFMData(test_X, test_y)

    model = ffm.FFM(**MODEL_PARAMETERS)
    model.init_model(train_ffm_data)
    for _ in range(ITERATIONS):
        model.iteration(train_ffm_data)

    # TODO temporary fix. replace this line of code with a commented line
    # return model, roc_auc_score(train_y, model.predict(train_ffm_data)), roc_auc_score(test_y, model.predict(test_ffm_data))
    placeholder_score = 1
    return model, placeholder_score, placeholder_score
def predict_ffm():
    """
    Run the prediction process in an endless loop.

    Once, before the loop:
    - Load the FFM model and the feature mappings from disk

    On every pass:
    - Get posts and events from the database
    - Create the dataset and extend its events with the posts
    - Convert the dataset to FFM format (this also refreshes the mappings)
    - Get FFM predictions
    - Save recommendations to the database
    """
    log_tag = "FFM predict"
    database_url = config['database_url']
    database = config['database_name']

    utils.log(log_tag, "Prepare model...")
    model_dir = config['model_path']
    model = ffm.read_model(model_dir + "model.bin")
    mappings = joblib.load(model_dir + "mappings.pkl")

    # The loop has no exit condition and no pause between passes.
    while True:
        utils.log(log_tag, "Get posts...")
        posts = get_posts(database_url, database)

        utils.log(log_tag, "Create dataset...")
        events = utils.get_events(database_url, database)
        dataset = create_dataset(posts, events)

        utils.log(log_tag, "Extend events...")
        dataset = extend_events(dataset, posts)

        # The mappings returned here are carried over into the next pass.
        mappings, ffm_dataset_X, ffm_dataset_y = create_ffm_dataset(
            dataset, mappings)
        ffm_dataset = ffm.FFMData(ffm_dataset_X, ffm_dataset_y)
        dataset["prediction"] = model.predict(ffm_dataset)

        utils.log(log_tag, "Save recommendations...")
        recommendations = dataset[["user_id", "post_permlink", "prediction"]]
        save_recommendations(recommendations, database_url, database)
def ffm_test(ffmdata, data):
    """Train an FFM model on a 70/30 split and print log loss per iteration.

    :param ffmdata: samples already converted to the format accepted by
        ``ffm.FFMData``.
    :param data: object whose ``is_trade`` column supplies the labels.
    :return: None; metrics are printed to stdout.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        ffmdata, data['is_trade'].values, test_size=0.3, random_state=888)

    n_iter = 20
    train_set = ffm.FFMData(X_train, y_train)
    test_set = ffm.FFMData(X_test, y_test)

    model = ffm.FFM(eta=0.05, lam=0.01, k=10)
    model.init_model(train_set)
    for i in range(n_iter):
        model.iteration(train_set)
        train_pred = model.predict(train_set)
        test_pred = model.predict(test_set)
        train_log = log_loss(y_train, train_pred)
        test_log = log_loss(y_test, test_pred)
        # The reported metric is log loss; the labels used to say "auc".
        print('iteration_%d: ' % i,
              'train_logloss %.4f' % train_log,
              'test_logloss %.4f' % test_log)
def FFMData(X, y):
    '''
    Wrap samples and labels into an ``ffm.FFMData`` object.

    :param X: samples in (field, index, value) format
    :param y: labels passed through to ``ffm.FFMData``
    :return: the ``ffm.FFMData`` instance built from ``X`` and ``y``
    '''
    wrapped = ffm.FFMData(X, y)
    return wrapped
def predict(self):
    """Score the test set with the saved model and write the result to CSV.

    Reads the ``id`` column of ``<data_dir>/test.csv``, adds a ``target``
    column with the model predictions and saves the frame to
    ``self.predict_path_tst``.
    """
    test_csv = '%s/test.csv' % self.data_dir
    df = pd.read_csv(test_csv, usecols=['id'])

    # Only the third element returned by get_features is used here.
    _, _, X = self.get_features(test=True)

    model = ffmlib.read_model(self.model_path)
    # No labels are available at prediction time; zeros are passed instead.
    dummy_labels = np.zeros(len(X))
    ffm_tst = ffmlib.FFMData(X, dummy_labels)
    df['target'] = model.predict(ffm_tst)

    mean_target = df['target'].mean()
    self.logger.info('Mean target: %.3lf' % mean_target)

    out_path = self.predict_path_tst
    df.to_csv(out_path, index=False)
    self.logger.info('Saved %s' % out_path)
def fit(self, X, y):
    '''
    Train an FFM model for ``self.n_iter`` iterations.

    The trained model is also stored on the class as ``FFM._model``.

    :param X: samples in (field, index, value) format
    :param y: labels, 0 or 1
    :return: the trained ``ffm.FFM`` model
    '''
    train_data = ffm.FFMData(X, y)
    learner = ffm.FFM(self.eta, self.l2, self.factor)
    learner.init_model(train_data)

    # tqdm only adds a progress bar around the training passes.
    for _ in tqdm(range(self.n_iter)):
        learner.iteration(train_data)

    FFM._model = learner
    return learner
def fit(self, X_trn, y_trn, X_val, y_val, model_path=None):
    """Train an FFM model until the validation AUC stops being the best seen.

    One training pass is run per loop turn. Training continues while the
    latest validation AUC equals the running maximum and fewer than
    ``self.nb_epochs_max`` such epochs have been counted. When ``model_path``
    is given, the model is saved after every epoch that matches the best
    validation AUC.

    :return: tuple ``(best train AUC, best validation AUC, number of epochs
        whose validation AUC matched the running best)``
    """
    logger = logging.getLogger(str(self))

    model = ffmlib.FFM(eta=self.learning_rate, lam=self.reg, k=self.factor_size)
    trn_data = ffmlib.FFMData(X_trn, y_trn)
    val_data = ffmlib.FFMData(X_val, y_val)
    model.init_model(trn_data)

    # All trackers start at 0. so the loop condition holds on entry.
    auc_trn_max = auc_val_max = auc_val = nb_epochs = 0.
    while auc_val == auc_val_max and nb_epochs < self.nb_epochs_max:
        started = time()
        model.iteration(trn_data)
        finished = time()

        auc_trn = roc_auc_score(y_trn, model.predict(trn_data))
        auc_val = roc_auc_score(y_val, model.predict(val_data))
        logger.info('AUC trn: %.3lf AUC val: %.3lf (%.3lf seconds)'
                    % (auc_trn, auc_val, finished - started))

        auc_trn_max = max(auc_trn, auc_trn_max)
        auc_val_max = max(auc_val, auc_val_max)

        # True when this epoch's validation AUC is the best so far.
        is_best = auc_val == auc_val_max
        nb_epochs += int(is_best)
        if is_best and model_path:
            logger.info('Saving %s' % model_path)
            model.save_model(model_path)

    # Drop the references to the model and the data sets before returning.
    del model, trn_data, val_data
    return auc_trn_max, auc_val_max, nb_epochs
def ffm_0():
    """Train an FFM model on a three-sample toy set and print its log loss.

    The training log loss is printed after each of the 10 iterations.

    :return: None
    """
    # Each sample is a list of 3-tuples; presumably (field, index, value).
    X = [
        [(1, 2, 1), (2, 3, 1), (3, 5, 1)],
        [(1, 0, 1), (2, 3, 1), (3, 7, 1)],
        [(1, 1, 1), (2, 3, 1), (3, 7, 1), (3, 9, 1)],
    ]
    y = [1, 1, 0]
    ffm_data = ffm.FFMData(X, y)

    # train the model for 10 iterations
    n_iter = 10
    model = ffm.FFM(eta=0.1, lam=0.0001, k=4)
    model.init_model(ffm_data)
    for i in range(n_iter):
        print('iteration %d, ' % i, end='')
        model.iteration(ffm_data)
        y_pred = model.predict(ffm_data)
        # The metric is log loss; the label used to call it "auc".
        train_logloss = log_loss(y, y_pred)
        print('train logloss %.4f' % train_logloss)
early_stopping_rounds=100, verbose_eval=50, categorical_feature=category_features) raw_ffm_data = df_train for cols in category_features: raw_ffm_data[cols] = raw_ffm_data[cols].astype(str) data_ffm = FFMFormatPandas(raw_ffm_data[col]) data_ffm_y = raw_ffm_data['is_trade'].tolist() train_num = X.shape[0] X_train_ffm = data_ffm[:train_num] X_test_ffm = data_ffm[train_num:] y_train_ffm = data_ffm_y[:train_num] y_test_ffm = data_ffm_y[train_num:] import ffm ffm_train = ffm.FFMData(X_train_ffm, y_train_ffm) ffm_test = ffm.FFMData(X_test_ffm, y_test_ffm) n_iter = 5 ffmmodel = ffm.FFM(eta=0.02, lam=0.0001, k=6) ffmmodel.init_model(ffm_train) for i in range(n_iter): print('iteration %d : ' % i) ffmmodel.iteration(ffm_train) y_pred = ffmmodel.predict(ffm_test) t_pred = ffmmodel.predict(ffm_train) #auc = roc_auc_score(y_test_ffm, y_pred) logloss = log_loss(y_test_ffm, y_pred)
import ffm

# Toy data set: every sample is a list of (field, index, value) triples.
X = [
    [(1, 2, 1), (2, 3, 1), (3, 5, 1)],
    [(1, 0, 1), (2, 3, 1), (3, 7, 1)],
    [(1, 1, 1), (2, 3, 1), (3, 7, 1), (3, 9, 1)],
    [(1, 0, 1), (2, 3, 1), (3, 5, 1)],
]
y = [1, 1, 0, 1]

# The same samples serve as both training and validation data.
ffm_data = ffm.FFMData(X, y)
ffm_data_test = ffm.FFMData(X, y)

model = ffm.FFM(eta=0.1, lam=0.0001, k=4)
model.fit(
    ffm_data,
    num_iter=10,
    val_data=ffm_data_test,
    metric='auc',
    early_stopping=6,
    maximum=True,
)
print(model.predict_proba(ffm_data_test))

# Save the fitted model, then load it back from the same file.
model.save_model('result/ololo.bin')
model = ffm.read_model('result/ololo.bin')
def predict(cls, X, y):
    """Predict with the model stored on the class as ``cls._model``.

    :param X: samples passed to ``ffm.FFMData``
    :param y: labels passed to ``ffm.FFMData``
    :return: the result of ``cls._model.predict`` for the wrapped data
    """
    wrapped = ffm.FFMData(X, y)
    predictions = cls._model.predict(wrapped)
    return predictions
def predict(self, X, model_path):
    """Load the model stored at ``model_path`` and predict for ``X``.

    :param X: samples passed to ``ffmlib.FFMData``
    :param model_path: path handed to ``ffmlib.read_model``
    :return: the predictions returned by the loaded model
    """
    model = ffmlib.read_model(model_path)
    # No labels are available at prediction time; zeros are passed instead.
    dummy_labels = np.zeros(len(X))
    data = ffmlib.FFMData(X, dummy_labels)
    yp = model.predict(data)
    # Drop the data set reference before returning.
    del data
    return yp
def transform_convert(self, df):
    """Transform ``df`` and wrap the result into an ``ffm.FFMData`` object.

    :param df: input handed to ``self.transform``
    :return: ``ffm.FFMData`` built from the ``(X, y)`` pair that
        ``self.transform`` returns
    """
    features, labels = self.transform(df)
    return ffm.FFMData(features, labels)