Ejemplo n.º 1
0
def ffm(df_train, category_features):
    """Train an FFM model on ``df_train`` and print log loss per iteration.

    Rows with ``18 <= day <= 23`` are counted to obtain ``train_num``; the
    first ``train_num`` encoded rows are used for training and the remaining
    rows for evaluation.
    NOTE(review): this positional split assumes the day 18-23 rows come first
    in ``df_train`` -- confirm against the caller.

    :param df_train: frame holding the features, a ``day`` column and the
        ``is_trade`` label column.
    :param category_features: column names that are cast to ``str`` before
        encoding.
    :return: None; train/test log loss is printed for every iteration.
    """
    # Local import: inside this body the name ``ffm`` refers to the library
    # module, not to this function.
    import ffm

    train = df_train[(df_train['day'] >= 18) & (df_train['day'] <= 23)]
    excluded = (
        'is_trade', 'item_category_list', 'item_property_list',
        'predict_category_property', 'instance_id', 'realtime',
        'context_timestamp',
    )
    col = [c for c in train if c not in excluded]

    # Work on a copy so the string casts below do not modify the caller's
    # frame in place.
    raw_ffm_data = df_train.copy()
    for cols in category_features:
        raw_ffm_data[cols] = raw_ffm_data[cols].astype(str)

    data_ffm = FFMFormatPandas(raw_ffm_data[col])
    data_ffm_y = raw_ffm_data['is_trade'].tolist()

    # Number of rows inside the training day window.
    train_num = train.shape[0]
    X_train_ffm = data_ffm[:train_num]
    X_test_ffm = data_ffm[train_num:]
    y_train_ffm = data_ffm_y[:train_num]
    y_test_ffm = data_ffm_y[train_num:]

    ffm_train = ffm.FFMData(X_train_ffm, y_train_ffm)
    ffm_test = ffm.FFMData(X_test_ffm, y_test_ffm)

    n_iter = 5
    ffmmodel = ffm.FFM(eta=0.02, lam=0.0001, k=6)
    ffmmodel.init_model(ffm_train)
    for i in range(n_iter):
        print('iteration %d : ' % i)
        ffmmodel.iteration(ffm_train)
        # Evaluate after every pass so the two losses can be compared.
        y_pred = ffmmodel.predict(ffm_test)
        t_pred = ffmmodel.predict(ffm_train)
        logloss = log_loss(y_test_ffm, y_pred)
        t_logloss = log_loss(y_train_ffm, t_pred)
        print('train log_loss %.4f' % (t_logloss), end='\t')
        print('test log_loss %.4f' % (logloss))
Ejemplo n.º 2
0
def ffm_model():
    """Fit an FFM model on the module-level ``train`` data and write a submission.

    Reads the module-level ``train`` and ``test`` objects, encodes them with
    ``process``, fits the model with early stopping, stores the predicted
    probabilities in ``test['predicted_score']`` (this mutates ``test``) and
    writes ``instance_id`` / ``predicted_score`` to
    ``result/result0422_ffm.txt``.

    :return: None
    """
    X = process(train)
    Y = train['is_trade']

    ffm_data = ffm.FFMData(X, Y)
    # NOTE(review): the validation set is a hard-coded slice of the same rows
    # used for fitting, so the early-stopping metric is computed on data the
    # model has already seen -- consider a real held-out split.
    ffm_data_test = ffm.FFMData(X[418028:468028], Y[418028:468028])

    model = ffm.FFM(eta=0.1, lam=0.0001, k=4)
    # NOTE(review): ``maximum=True`` combined with ``metric='logloss'`` looks
    # suspicious if ``maximum`` means "higher is better" -- confirm against
    # the ffm wrapper before changing it.
    model.fit(ffm_data,
              num_iter=200,
              val_data=ffm_data_test,
              metric='logloss',
              early_stopping=6,
              maximum=True)

    t = process(test)
    ffm_test = ffm.FFMData(t)
    pred = model.predict_proba(ffm_test)

    test['predicted_score'] = pred
    sub1 = test[['instance_id', 'predicted_score']]
    # Raw string: "\s" is not a valid escape sequence in a normal string
    # literal and triggers a warning on recent Python versions.
    sub = pd.read_csv("input/test.txt", sep=r"\s+")
    # Left merge keeps every instance_id of the input file; ids without a
    # prediction get a score of 0 through fillna.
    sub = pd.merge(sub, sub1, on=['instance_id'], how='left')
    sub = sub.fillna(0)
    sub[['instance_id', 'predicted_score']].to_csv('result/result0422_ffm.txt',
                                                   sep=" ",
                                                   index=False)
Ejemplo n.º 3
0
def build_model(train_X, train_y, test_X, test_y):
  """
    Wrap the train and test sets as FFM data, then run ITERATIONS training
    passes of a model configured from MODEL_PARAMETERS.

    Returns a tuple (model, 1, 1); the two scores are placeholders for now.
  """
  training_set = ffm.FFMData(train_X, train_y)
  # Built but not evaluated yet, see the TODO below.
  held_out_set = ffm.FFMData(test_X, test_y)

  fitted = ffm.FFM(**MODEL_PARAMETERS)
  fitted.init_model(training_set)

  for _ in range(ITERATIONS):
    fitted.iteration(training_set)

  # TODO temporary fix: the placeholder scores below stand in for
  # roc_auc_score(train_y, model.predict(train data)) and
  # roc_auc_score(test_y, model.predict(test data)).
  placeholder_score = 1
  return fitted, placeholder_score, placeholder_score
Ejemplo n.º 4
0
def predict_ffm():
    """
    Run the prediction loop.

    The model and the mappings are loaded once from ``config['model_path']``.
    The loop then repeats forever:
    - fetch posts and events from the configured database
    - build the dataset and extend it with the posts
    - convert it to FFM data (this also refreshes the mappings)
    - predict and store the result in the ``prediction`` column
    - save user_id / post_permlink / prediction as recommendations

    The function never returns on its own.
    """
    log_tag = "FFM predict"
    db_url, db_name = config['database_url'], config['database_name']

    utils.log(log_tag, "Prepare model...")
    predictor = ffm.read_model(config['model_path'] + "model.bin")
    field_mappings = joblib.load(config['model_path'] + "mappings.pkl")

    while True:
        utils.log(log_tag, "Get posts...")
        fetched_posts = get_posts(db_url, db_name)

        utils.log(log_tag, "Create dataset...")
        fetched_events = utils.get_events(db_url, db_name)
        frame = create_dataset(fetched_posts, fetched_events)

        utils.log(log_tag, "Extend events...")
        frame = extend_events(frame, fetched_posts)

        # The mappings returned here are carried over to the next round.
        field_mappings, features, labels = create_ffm_dataset(
            frame, field_mappings)
        frame["prediction"] = predictor.predict(ffm.FFMData(features, labels))

        utils.log(log_tag, "Save recommendations...")
        recommendations = frame[["user_id", "post_permlink", "prediction"]]
        save_recommendations(recommendations, db_url, db_name)
Ejemplo n.º 5
0
def ffm_test(ffmdata, data):
    """Train an FFM model on a 70/30 split and print log loss per iteration.

    :param ffmdata: encoded samples, split together with the labels by
        ``train_test_split``.
    :param data: object whose ``data['is_trade'].values`` supplies the labels.
    :return: None; one line with the train and test log loss is printed for
        each of the 20 iterations.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        ffmdata, data['is_trade'].values, test_size=0.3, random_state=888)
    n_iter = 20
    train_data = ffm.FFMData(X_train, y_train)
    test_data = ffm.FFMData(X_test, y_test)
    model = ffm.FFM(eta=0.05, lam=0.01, k=10)
    model.init_model(train_data)
    for i in range(n_iter):
        model.iteration(train_data)
        train_pred = model.predict(train_data)
        test_pred = model.predict(test_data)
        train_log = log_loss(y_train, train_pred)
        test_log = log_loss(y_test, test_pred)
        # The values are log losses, so they are labelled as such (the
        # previous labels called them AUC).
        print('iteration_%d: ' % i, 'train_logloss %.4f' % train_log,
              'test_logloss %.4f' % test_log)
Ejemplo n.º 6
0
 def FFMData(X, y):
     '''
     Wrap samples and labels in an ffm.FFMData object.
     :param X: samples in (field, index, value) format
     :param y: labels passed through unchanged to ffm.FFMData
     :return: the ffm.FFMData instance built from X and y
     '''
     wrapped = ffm.FFMData(X, y)
     return wrapped
Ejemplo n.º 7
0
 def predict(self):
     """Predict targets for the test set and save them as CSV.

     Reads the ``id`` column of ``<data_dir>/test.csv``, takes the third
     value returned by ``self.get_features(test=True)`` as features, loads
     the model from ``self.model_path`` and writes ids plus predictions to
     ``self.predict_path_tst``.
     """
     ids_path = '%s/test.csv' % self.data_dir
     submission = pd.read_csv(ids_path, usecols=['id'])
     _first, _second, features = self.get_features(test=True)
     predictor = ffmlib.read_model(self.model_path)
     # No labels exist at prediction time, so zeros are passed instead.
     dummy_labels = np.zeros(len(features))
     test_data = ffmlib.FFMData(features, dummy_labels)
     submission['target'] = predictor.predict(test_data)
     mean_target = submission['target'].mean()
     self.logger.info('Mean target: %.3lf' % mean_target)
     submission.to_csv(self.predict_path_tst, index=False)
     self.logger.info('Saved %s' % self.predict_path_tst)
Ejemplo n.º 8
0
 def fit(self, X, y):
     '''
     Train an FFM model for self.n_iter passes and keep it on the FFM class.
     :param X: samples in (field, index, value) format
     :param y: labels, 0 or 1
     :return: the trained model (also stored as FFM._model)
     '''
     training_set = ffm.FFMData(X, y)
     trained = ffm.FFM(self.eta, self.l2, self.factor)
     trained.init_model(training_set)
     # tqdm only wraps the range to show progress.
     for _ in tqdm(range(self.n_iter)):
         trained.iteration(training_set)
     FFM._model = trained
     return trained
Ejemplo n.º 9
0
 def fit(self, X_trn, y_trn, X_val, y_val, model_path=None):
     """Train with early stopping driven by the validation AUC.

     One training pass is run per loop round. The loop ends as soon as the
     validation AUC of a round differs from the running maximum, or when
     the number of rounds that matched the maximum reaches
     ``self.nb_epochs_max``. If ``model_path`` is truthy, the model is
     saved after every round that matched the maximum.

     Returns ``(auc_trn_max, auc_val_max, nb_epochs)``; ``nb_epochs`` is a
     float because the counter starts at ``0.``.
     """
     log = logging.getLogger(str(self))
     model = ffmlib.FFM(eta=self.learning_rate, lam=self.reg, k=self.factor_size)
     trn_data = ffmlib.FFMData(X_trn, y_trn)
     val_data = ffmlib.FFMData(X_val, y_val)
     model.init_model(trn_data)

     auc_trn_max = 0.
     auc_val_max = 0.
     auc_val = 0.
     nb_epochs = 0.
     while auc_val == auc_val_max and nb_epochs < self.nb_epochs_max:
         started = time()
         model.iteration(trn_data)
         elapsed = time() - started
         auc_trn = roc_auc_score(y_trn, model.predict(trn_data))
         auc_val = roc_auc_score(y_val, model.predict(val_data))
         log.info('AUC trn: %.3lf AUC val: %.3lf (%.3lf seconds)' % (auc_trn, auc_val, elapsed))
         auc_trn_max = max(auc_trn, auc_trn_max)
         auc_val_max = max(auc_val, auc_val_max)
         # True when this round reached the best validation AUC so far.
         is_best = auc_val == auc_val_max
         nb_epochs += int(is_best)
         if is_best and model_path:
             log.info('Saving %s' % model_path)
             model.save_model(model_path)
     del model, trn_data, val_data
     return auc_trn_max, auc_val_max, nb_epochs
Ejemplo n.º 10
0
def ffm_0():
    """Train an FFM model on a tiny in-line dataset and print the train loss.

    Each sample is a list of (field, index, value) tuples. The model is
    trained for ``n_iter`` passes and the training log loss is printed after
    every pass.

    :return: None
    """
    X = [
        [(1, 2, 1), (2, 3, 1), (3, 5, 1)],
        [(1, 0, 1), (2, 3, 1), (3, 7, 1)],
        [(1, 1, 1), (2, 3, 1), (3, 7, 1), (3, 9, 1)],
    ]
    y = [1, 1, 0]
    ffm_data = ffm.FFMData(X, y)
    # train the model for 10 iterations
    n_iter = 10
    model = ffm.FFM(eta=0.1, lam=0.0001, k=4)
    model.init_model(ffm_data)
    for i in range(n_iter):
        print('iteration %d, ' % i, end='')
        # The training pass and the evaluation belong inside the loop;
        # outside of it only a single pass would run.
        model.iteration(ffm_data)
        y_pred = model.predict(ffm_data)
        # The metric is log loss, so it is labelled as such.
        loss = log_loss(y, y_pred)
        print('train log_loss %.4f' % loss)
Ejemplo n.º 11
0
                early_stopping_rounds=100,
                verbose_eval=50,
                categorical_feature=category_features)

# Script fragment: encode df_train for FFM, split it by position and train
# for a few iterations. `df_train`, `category_features`, `col` and `X` are
# defined before this fragment.

# NOTE: this is an alias, not a copy -- the casts below modify df_train too.
raw_ffm_data = df_train
for cols in category_features:
    raw_ffm_data[cols] = raw_ffm_data[cols].astype(str)
data_ffm = FFMFormatPandas(raw_ffm_data[col])
data_ffm_y = raw_ffm_data['is_trade'].tolist()
# The first X.shape[0] encoded rows are used for training, the rest for
# evaluation.
# NOTE(review): presumably X holds the training rows and they come first in
# df_train -- confirm.
train_num = X.shape[0]
X_train_ffm = data_ffm[:train_num]
X_test_ffm = data_ffm[train_num:]
y_train_ffm = data_ffm_y[:train_num]
y_test_ffm = data_ffm_y[train_num:]
import ffm
ffm_train = ffm.FFMData(X_train_ffm, y_train_ffm)
ffm_test = ffm.FFMData(X_test_ffm, y_test_ffm)

n_iter = 5

ffmmodel = ffm.FFM(eta=0.02, lam=0.0001, k=6)
ffmmodel.init_model(ffm_train)

# One training pass per round, followed by predictions on both sets.
for i in range(n_iter):
    print('iteration %d : ' % i)
    ffmmodel.iteration(ffm_train)

    y_pred = ffmmodel.predict(ffm_test)
    t_pred = ffmmodel.predict(ffm_train)
    #auc = roc_auc_score(y_test_ffm, y_pred)
    # Log loss on the evaluation rows; t_pred is not used in the visible part.
    logloss = log_loss(y_test_ffm, y_pred)
Ejemplo n.º 12
0
import ffm

# Prepare the data.
# Each sample is a list of (field, index, value) tuples.

X = [
    [(1, 2, 1), (2, 3, 1), (3, 5, 1)],
    [(1, 0, 1), (2, 3, 1), (3, 7, 1)],
    [(1, 1, 1), (2, 3, 1), (3, 7, 1), (3, 9, 1)],
    [(1, 0, 1), (2, 3, 1), (3, 5, 1)],
]

# One label per sample in X.
y = [1, 1, 0, 1]

# The same samples are used for training and for validation here.
ffm_data = ffm.FFMData(X, y)
ffm_data_test = ffm.FFMData(X, y)

# Fit with AUC on the validation data as the metric and early stopping
# enabled.
model = ffm.FFM(eta=0.1, lam=0.0001, k=4)
model.fit(ffm_data,
          num_iter=10,
          val_data=ffm_data_test,
          metric='auc',
          early_stopping=6,
          maximum=True)

print(model.predict_proba(ffm_data_test))

# Save the model, then load it back from the same path.
model.save_model('result/ololo.bin')

model = ffm.read_model('result/ololo.bin')
Ejemplo n.º 13
0
 def predict(cls, X, y):
     """Wrap X and y as FFM data and return the predictions of cls._model."""
     wrapped = ffm.FFMData(X, y)
     scores = cls._model.predict(wrapped)
     return scores
Ejemplo n.º 14
0
 def predict(self, X, model_path):
     """Load the model stored at model_path and return its predictions for X."""
     model = ffmlib.read_model(model_path)
     # Labels are unknown at prediction time, so zeros are passed instead.
     dummy_labels = np.zeros(len(X))
     data = ffmlib.FFMData(X, dummy_labels)
     predictions = model.predict(data)
     del data
     return predictions
Ejemplo n.º 15
0
	def transform_convert(self, df):
		"""Run self.transform on df and wrap the resulting pair as ffm.FFMData."""
		features, labels = self.transform(df)
		wrapped = ffm.FFMData(features, labels)
		return wrapped