Beispiel #1
0
class FactorizationMachineBasedRecommender(BaseEstimator, ClassifierMixin):
    """Estimator-style wrapper around a dense-input, order-6 ``TFFMClassifier``.

    The wrapped classifier is built in the constructor and stored on
    ``self.model``. ``predict_proba`` exposes only column 1 of the wrapped
    model's probability output, and ``score`` reports ROC AUC on it.
    """

    def __init__(self, show_progress=False):
        """Remember the progress flag and create the wrapped classifier."""
        self.show_progress = show_progress

        adam = tf.train.AdamOptimizer(learning_rate=0.01)
        self.model = TFFMClassifier(
            order=6,
            rank=10,
            optimizer=adam,
            n_epochs=100,
            batch_size=-1,
            init_std=0.001,
            input_type='dense',
        )

    def fit(self, X, y=None):
        """Train the wrapped model on ``X`` / ``y`` and return ``self``."""
        self.model.fit(X, y, show_progress=self.show_progress)
        return self

    def predict_proba(self, X, y=None):
        """Return column 1 of the wrapped model's ``predict_proba`` output.

        ``y`` is accepted but not used.
        """
        class_probabilities = self.model.predict_proba(X)
        return class_probabilities[:, 1]

    def fit_predict_proba(self, X, y=None):
        """Fit on ``X`` / ``y`` and return the scores for the same ``X``."""
        self.fit(X, y)
        return self.predict_proba(X)

    def score(self, X, y=None, **kwargs):
        """Return the ROC AUC of the predicted scores; ``kwargs`` are ignored."""
        return roc_auc_score(y, self.predict_proba(X, y))
Beispiel #2
0
 def __init__(self, show_progress=False):
     """Remember the progress flag and build the dense order-6 classifier."""
     self.show_progress = show_progress

     # Plain hyper-parameters are collected first; the optimizer is created
     # in the constructor call itself.
     hyperparameters = dict(
         order=6,
         rank=10,
         n_epochs=100,
         batch_size=-1,
         init_std=0.001,
         input_type='dense',
     )
     self.model = TFFMClassifier(
         optimizer=tf.train.AdamOptimizer(learning_rate=0.01),
         **hyperparameters)
Beispiel #3
0
 def fit(self, training_data, y):
     """Create a fresh order-2, sparse-input classifier and train it.

     The new classifier replaces ``self.model`` before training starts.
     Nothing is returned.
     """
     adam = tf.train.AdamOptimizer(learning_rate=0.01)
     classifier = TFFMClassifier(
         order=2,
         rank=64,
         optimizer=adam,
         n_epochs=100,
         batch_size=-1,
         init_std=0.001,
         input_type='sparse',
         verbose=2,
     )
     self.model = classifier
     classifier.fit(X=training_data, y=y)
Beispiel #4
0
    def test_decision_function_order_4(self):
        """Check an order-4 model's ``decision_function`` against brute force.

        The model is fitted for one epoch, its bias and the four weight
        tensors are evaluated in the model's session, and the result of
        ``self.bruteforce_inference`` on those parameters must match
        ``decision_function`` on the same data.
        """
        fm = TFFMClassifier(order=4, rank=10, n_epochs=1)
        fm.fit(self.X, self.y)

        bias = fm.b.eval(session=fm.session)
        weights = [
            fm.w[position].eval(session=fm.session) for position in range(4)
        ]

        expected = self.bruteforce_inference(self.X, weights, bias)
        observed = fm.decision_function(self.X)

        np.testing.assert_almost_equal(observed, expected)
Beispiel #5
0
    def decision_function_order_4(self, input_type, use_diag=False):
        """Compare an order-4 model's decision function with brute force.

        Args:
            input_type: ``'dense'`` feeds ``self.X`` unchanged; any other
                value feeds it converted to a SciPy CSR matrix.
            use_diag: Forwarded to the model and to the brute-force reference.
        """
        # init_std is deliberately large (1.0): with a small init_std the
        # higher-order terms contribute almost nothing, so the comparison
        # would effectively exercise only the low-order implementation.
        fm = TFFMClassifier(
            order=4,
            rank=10,
            optimizer=tf.train.AdamOptimizer(learning_rate=0.1),
            n_epochs=0,
            input_type=input_type,
            init_std=1.0,
            seed=0,
            use_diag=use_diag,
        )

        features = self.X if input_type == 'dense' else sp.csr_matrix(self.X)

        fm.fit(features, self.y)
        bias = fm.intercept
        weights = fm.weights

        # The reference always works on the original self.X, whatever
        # representation the model was fed.
        expected = self.bruteforce_inference(
            self.X, weights, bias, use_diag=use_diag)

        observed = fm.decision_function(features)
        fm.destroy()

        np.testing.assert_almost_equal(observed, expected, decimal=4)
Beispiel #6
0
class FM_Rec(RecModel):
    """Recommender backed by an order-2, sparse-input ``TFFMClassifier``.

    ``self.model`` is ``None`` until ``fit`` has been called.
    """

    def __init__(self):
        """Start without a trained model."""
        self.model = None

    def fit(self, training_data, y):
        """Build a new classifier, store it on ``self.model`` and train it."""
        adam = tf.train.AdamOptimizer(learning_rate=0.01)
        classifier = TFFMClassifier(
            order=2,
            rank=64,
            optimizer=adam,
            n_epochs=100,
            batch_size=-1,
            init_std=0.001,
            input_type='sparse',
            verbose=2,
        )
        self.model = classifier
        classifier.fit(X=training_data, y=y)

    def predict(self, predict_users):
        """Return the stored model's ``predict_proba`` output for the input."""
        probabilities = self.model.predict_proba(predict_users)
        return probabilities
Beispiel #7
0
    def decision_function_order_4(self, input_type, use_diag=False):
        """Verify the order-4 decision function against a brute-force result.

        Args:
            input_type: ``'dense'`` uses ``self.X`` directly; otherwise the
                data is wrapped in a SciPy CSR matrix before being fed in.
            use_diag: Passed through to the model and the reference.
        """
        # Why init_std=1.0: a small init_std makes the higher-order terms
        # negligible, which would reduce this to a test of the low-order
        # code path only.
        settings = dict(
            order=4,
            rank=10,
            n_epochs=0,
            input_type=input_type,
            init_std=1.0,
            seed=0,
            use_diag=use_diag,
        )
        fm = TFFMClassifier(
            optimizer=tf.train.AdamOptimizer(learning_rate=0.1), **settings)

        if input_type != 'dense':
            model_input = sp.csr_matrix(self.X)
        else:
            model_input = self.X

        fm.fit(model_input, self.y)
        bias = fm.intercept
        weights = fm.weights

        expected = self.bruteforce_inference(
            self.X, weights, bias, use_diag=use_diag)

        observed = fm.decision_function(model_input)
        fm.destroy()

        np.testing.assert_almost_equal(observed, expected, decimal=4)
Beispiel #8
0
    def decision_function_order_4(self, input_type):
        """Compare the order-4 decision function with brute-force inference.

        The model is fitted for one epoch. ``input_type`` of ``'dense'``
        feeds ``self.X`` as is; any other value feeds a CSR copy of it.
        """
        fm = TFFMClassifier(
            order=4,
            rank=10,
            optimizer=tf.train.AdamOptimizer(learning_rate=0.1),
            n_epochs=1,
            input_type=input_type,
        )

        features = self.X if input_type == 'dense' else sp.csr_matrix(self.X)

        fm.fit(features, self.y)
        bias = fm.intercept
        weights = fm.weights

        # The reference is computed from the original self.X.
        expected = self.bruteforce_inference(self.X, weights, bias)

        observed = fm.decision_function(features)
        fm.destroy()
        np.testing.assert_almost_equal(observed, expected, decimal=6)
Beispiel #9
0
    def decision_function_order_4(self, input_type):
        """Check that ``decision_function`` matches the brute-force reference.

        Args:
            input_type: ``'dense'`` passes ``self.X`` through unchanged;
                otherwise it is converted to a SciPy CSR matrix first.
        """
        settings = dict(order=4, rank=10, n_epochs=1, input_type=input_type)
        fm = TFFMClassifier(
            optimizer=tf.train.AdamOptimizer(learning_rate=0.1), **settings)

        if input_type != 'dense':
            model_input = sp.csr_matrix(self.X)
        else:
            model_input = self.X

        fm.fit(model_input, self.y)
        bias = fm.intercept
        weights = fm.weights

        expected = self.bruteforce_inference(self.X, weights, bias)

        observed = fm.decision_function(model_input)
        fm.destroy()
        np.testing.assert_almost_equal(observed, expected, decimal=6)
# Convert each feature set to a NumPy array and replace NaN with zeros
# (np.nan_to_num also maps +/-inf to large finite values).
ax_tr = np.nan_to_num(np.array(X_tr))
ax_te = np.nan_to_num(np.array(X_te))
ax_te_cs = np.nan_to_num(np.array(X_te_cs))

# Second-order FM with the tuned hyper-parameters; training logs are written
# to the log_dir given below.
model = TFFMClassifier(
    order=2,
    rank=7,
    optimizer=tf.train.AdamOptimizer(learning_rate=0.001),
    n_epochs=100,
    batch_size=1024,
    init_std=0.001,
    reg=0.01,
    input_type='dense',
    log_dir='/home/asif/01_tffm/logs/',
    verbose=1,
    seed=12345,
)

# Cold-start scenario: rebuild a frame with the training column names ...
cold_start = pd.DataFrame(ax_te_cs, columns=X_tr.columns)

# ... and zero every historical buy/click column, keeping the 'Category'
# columns, to simulate having category information only.
for column in cold_start.columns:
    if 'Category' in column:
        continue
    if 'buy' in column or 'click' in column:
        cold_start[column] = 0
Beispiel #11
0
    # Labels for this split, selected with the train/test index arrays.
    y_train = y[train_index]
    y_test = y[test_index]
    # Disabled debug output. NOTE(review): it refers to yy_train / yy_test,
    # which are not defined in this fragment.
    # print("train len: %d, test len: %d" % (len(yy_train), len(yy_test)))
    # print("train sum: %d, test sum: %d" % (sum(yy_train), sum(yy_test)))
    # Feature rows for the same split.
    features_train = features[train_index]
    features_test = features[test_index]

    # Build the "u_all" and "ad_all" inputs for the training rows.
    # Presumably user-side and ad-side feature groups; confirm against
    # get_selected_input.
    sparse_u_train = get_selected_input(features_train, "u_all", cfg)
    sparse_ad_train = get_selected_input(features_train, "ad_all", cfg)

    # Same two inputs for the test rows.
    sparse_u_test = get_selected_input(features_test, "u_all", cfg)
    sparse_ad_test = get_selected_input(features_test, "ad_all", cfg)

    # Stack both groups column-wise and convert to CSR.
    sparse_x_train = hstack([sparse_u_train, sparse_ad_train]).tocsr()
    sparse_x_test = hstack([sparse_u_test, sparse_ad_test]).tocsr()

# Train a second-order FM on the sparse train split and report ROC AUC on
# the test split.
order = 2
model = TFFMClassifier(order=order,
                       rank=10,
                       optimizer=tf.train.AdamOptimizer(learning_rate=0.001),
                       n_epochs=50,
                       batch_size=50,
                       init_std=0.001,
                       reg=0.01,
                       input_type='sparse',
                       seed=42)
try:
    model.fit(sparse_x_train, y_train, show_progress=False)
    # Hard class labels, kept under the original name.
    predictions = model.predict(sparse_x_test)
    # ROC AUC ranks samples by score, so it needs the positive-class
    # probability (column 1). Thresholded labels collapse the curve to a
    # single operating point.
    positive_scores = model.predict_proba(sparse_x_test)[:, 1]
    # The printed metric is ROC AUC, so it is labelled as such rather than
    # as "accuracy".
    print('[order={}] auc: {}'.format(order,
                                      roc_auc_score(y_test, positive_scores)))
finally:
    # Release the model's resources even if training or scoring fails.
    model.destroy()
Beispiel #12
0
            # Map this fold's train/test selectors onto row positions of
            # x_train.
            idx = np.arange(x_train.shape[0], dtype=int)
            _train = idx[train]
            _test = idx[test]
            trn_x = x_train[_train, :]
            val_x = x_train[_test, :]
            trn_y = y_train[train]
            val_y = y_train[test]

            # Number of validation rows per order_id.
            list_idx = df.loc[test].reset_index(drop=True).groupby(
                'order_id').apply(lambda x: x.index.values.shape[0]).tolist()
            # np.int was only an alias of the builtin int and was removed in
            # NumPy 1.24; the builtin gives the same dtype.
            list_idx = np.array(list_idx, dtype=int)

            clf = TFFMClassifier(order=2,
                                 rank=10,
                                 optimizer=tf.train.AdamOptimizer(learning_rate=0.01),
                                 n_epochs=100,
                                 batch_size=100000,
                                 init_std=0.001,
                                 input_type='sparse'
                                 )

            clf.fit(trn_x, trn_y, show_progress=True)
            # Positive-class probability for the validation rows, also stored
            # into the out-of-fold prediction vector.
            pred = clf.predict_proba(val_x)[:, 1]
            all_pred[test] = pred

            _score = log_loss(val_y, pred)
            # AUC and F1 are stored negated. Presumably so that lower is
            # better for all three score lists; confirm with the caller.
            _score2 = - roc_auc_score(val_y, pred)
            _, _score3, _ = f1_metric(val_y.astype(int), pred.astype(float))
            logger.debug('   _score: %s' % _score3)
            list_score.append(_score)
            list_score2.append(_score2)
            list_score3.append(- 1 * _score3)
Beispiel #13
0
print('Non-zeros rate: {}'.format(np.mean(x_train != 0)))
print('Classes balance: {} / {}'.format(np.mean(y_train == 0),
                                        np.mean(y_train == 1)))

# Hold out 30% of the samples for evaluation, with a fixed seed.
X_tr, X_te, y_tr, y_te = train_test_split(x_train,
                                          y_train,
                                          random_state=42,
                                          test_size=0.3)

from tffm import TFFMClassifier, TFFMRegressor

for order in [3]:
    model = TFFMClassifier(
        order=order,
        rank=5,
        optimizer=tf.train.AdagradOptimizer(learning_rate=0.00001),
        n_epochs=5,
        batch_size=1024,
        init_std=0.001,
        reg=0.01,
        input_type='sparse',
        # session_config=tf.ConfigProto(log_device_placement=True, device_count={'GPU':0}),
        seed=42)
    try:
        model.fit(X_tr, y_tr, show_progress=True)
        # Hard class labels; still printed below as before.
        predictions = model.predict(X_te)
        # log-loss and ROC AUC are defined on probabilities/scores, not on
        # thresholded labels, so both use the positive-class probability
        # (column 1).
        positive_proba = model.predict_proba(X_te)[:, 1]
        print('[order={}] logloss: {}'.format(order, log_loss(y_te, positive_proba)))
        print('[order={}] auc: {}'.format(order, roc_auc_score(y_te, positive_proba)))
        print(predictions)
    finally:
        # this will close tf.Session and free resources, even on failure
        model.destroy()
def tafm(data, oscar, subset_quantile):
  """Train one FM classifier per quantile subset and score it on Oscar data.

  For each entry of ``subset_quantile`` the function subsets ``data`` with
  ``subsetdata``, splits it 80/20, one-hot encodes both parts with
  ``OneHotEncoding`` and fits an order-2 ``TFFMClassifier``. The fitted model
  is scored with weighted F1 on the held-out part and on three windows of
  ``oscar``: the last 5 distinct years, the last 10, and all years.

  Args:
    data: Frame with at least ``Film`` and ``Win`` columns.
    oscar: Frame with at least ``Year`` and ``Win`` columns.
    subset_quantile: Iterable of quantile values handed to ``subsetdata``.
      May be empty, in which case an empty result frame is returned.

  Returns:
    A ``(results, oscar_pred)`` tuple. ``results`` has one row per quantile
    with the columns ``N_Movies``, ``Time``, ``Model_Accuracy``,
    ``Oscar_Rec5``, ``Oscar_Rec10`` and ``Oscar_Rec15``. ``oscar_pred`` is the
    Oscar window with the best weighted F1 seen so far plus a
    ``Predictions`` column, or an empty frame if no window scored above 0.
  """
  result_columns = ["N_Movies", "Time", "Model_Accuracy",
                    "Oscar_Rec5", "Oscar_Rec10", "Oscar_Rec15"]
  result_rows = []
  oscar_pred = pd.DataFrame()
  os_acc = 0

  for n_iteration, quantile in enumerate(subset_quantile, start=1):
    print("---Running iteration " + str(n_iteration) + " ---")
    # Only the subset itself is needed; the filter threshold is discarded.
    _, subset_data = subsetdata(data, quantile)

    # Share of the original films that survive the subsetting.
    n_movies = round(subset_data.Film.unique().shape[0]/data.Film.unique().shape[0], 2)

    # Split into train and test parts (80/20, fixed seed).
    is_feature = subset_data.columns != "Win"
    xtrain, xtest, ytrain, ytest = train_test_split(
        subset_data.loc[:, is_feature], subset_data.loc[:, "Win"],
        test_size=0.2, random_state=42)

    encoder, xtrain_enc, xtest_enc = OneHotEncoding(xtrain, xtest)

    # The timer covers training and all predictions below.
    start = time.time()
    tf.reset_default_graph()
    order = 2
    model = TFFMClassifier(
        order=order,
        rank=10,
        optimizer=tf.train.FtrlOptimizer(0.1, l1_regularization_strength=0.001),
        n_epochs=50,
        init_std=0.001,
        reg=0.01,
        batch_size=50,
        input_type='sparse'
    )
    try:
      model.fit(xtrain_enc, ytrain, show_progress=True)
      predictions = model.predict(xtest_enc)

      model_acc = f1_score(ytest, predictions, average='weighted')

      # Score on the last 5 years, the last 10 years and all years.
      unique_years = list(np.unique(oscar.Year))
      oscar_accuracy = []
      for subset_yr in (unique_years[-5:], unique_years[-10:], unique_years):
        oscar_subset = oscar >> mask(X.Year.isin(subset_yr))
        oscar_x = oscar_subset.loc[:, oscar_subset.columns != "Win"]
        oscar_y = oscar_subset.loc[:, "Win"]
        oscar_train = encoder.transform(oscar_x)

        oscar_predictions = model.predict(oscar_train)
        oscar_acc = f1_score(oscar_y, oscar_predictions, average="weighted")
        oscar_accuracy.append(oscar_acc)

        # Keep the predictions of the best-scoring window seen so far,
        # across all quantiles.
        if oscar_acc > os_acc:
          os_acc = oscar_acc
          oscar_pred = oscar_subset.copy()
          oscar_pred["Predictions"] = oscar_predictions

      spent_time = time.time() - start
    finally:
      # Release the model's TF session so sessions do not pile up across
      # iterations, including when an iteration fails.
      model.destroy()

    result_rows.append([n_movies, spent_time, model_acc] + oscar_accuracy)

  # Passing the column names to the constructor also works for zero rows,
  # where assigning ``results.columns`` afterwards would raise.
  results = pd.DataFrame(result_rows, columns=result_columns)

  return results, oscar_pred
# Flatten the target to a 1-D vector with one entry per row.
y = np.reshape(y, (y.shape[0],))

# Feature matrix: drop the four listed columns, then cast to float32.
X = (
    data_train_FM
    .drop(columns=['FREQUENCY', 'CUST_ID', 'ARTICLE_ID', 'AGE'])
    .to_numpy()
    .astype(np.float32)
)

# The source frame is no longer needed once X has been extracted.
del data_train_FM

# Hyper-parameters of the factorization machine.
rank, l_r, reg, epoch = 20, 0.05, 0.001, 200

model_tf = TFFMClassifier(
    order=2,
    rank=rank,
    optimizer=tf.train.AdamOptimizer(learning_rate=l_r),
    reg=reg,
    n_epochs=epoch,
    init_std=0.0001,
)

# De-duplicated test protocol, enriched with each customer's AGE.
protocol = pd.read_csv(d + '/test_protocol.csv').drop_duplicates()

data_train = pd.read_csv(d + '/train_model.csv')
data_train = data_train[['CUST_ID', 'AGE']].drop_duplicates()

protocol = pd.merge(protocol, data_train, on='CUST_ID')

# Baseline precision columns plus the customer/article keys; rows are
# de-duplicated before the columns are selected.
data_reco_baselines = pd.read_csv(d+'/data_reco_baselines.csv').drop_duplicates()
data_reco_baselines = data_reco_baselines[[
    'ARM_PRECISION', 'K50_PRECISION', 'ALS_PRECISION', 'BPR_PRECISION',
    'VAES_PRECISION', 'SPEC_PRECISION', 'CUST_ID', 'ARTICLE_ID',
]]
Beispiel #16
0
print('Dataset shape: {}'.format(x_train.shape))
print('Non-zeros rate: {}'.format(np.mean(x_train != 0)))
print('Classes balance: {} / {}'.format(np.mean(y_train == 0), np.mean(y_train == 1)))

# Hold out 30% of the samples for evaluation, with a fixed seed.
X_tr, X_te, y_tr, y_te = train_test_split(x_train, y_train, random_state=42, test_size=0.3)


from tffm import TFFMClassifier, TFFMRegressor

for order in [3]:
    model = TFFMClassifier(
        order=order,
        rank=5,
        optimizer=tf.train.AdagradOptimizer(learning_rate=0.00001),
        n_epochs=5,
        batch_size=1024,
        init_std=0.001,
        reg=0.01,
        input_type='sparse',
        # session_config=tf.ConfigProto(log_device_placement=True, device_count={'GPU':0}),
        seed=42
    )
    try:
        model.fit(X_tr, y_tr, show_progress=True)
        # Hard class labels; still printed below as before.
        predictions = model.predict(X_te)
        # log-loss and ROC AUC are defined on probabilities/scores, not on
        # thresholded labels, so both use the positive-class probability
        # (column 1).
        positive_proba = model.predict_proba(X_te)[:, 1]
        print('[order={}] logloss: {}'.format(order, log_loss(y_te, positive_proba)))
        print('[order={}] auc: {}'.format(order, roc_auc_score(y_te, positive_proba)))
        print(predictions)
    finally:
        # this will close tf.Session and free resources, even on failure
        model.destroy()
Beispiel #17
0
    # Cross-validation over the parameter grid. In the code visible here
    # `params` is only logged; the classifier below uses hard-coded
    # hyper-parameters.
    logger.info('cv_start')
    for params in ParameterGrid(all_params):
        logger.info('param: %s' % (params))
        # Only the first fold of `cv` is evaluated.
        for train_idx, test_idx in list(cv)[:1]:
            # Dump the training fold in svmlight format through a gzip stream.
            with gzip.open('train_fm.svm', 'wb') as f:
                dump_svmlight_file(data[train_idx], target[train_idx], f)
            # NOTE(review): `output` is not defined in this fragment. If the
            # grid yields more than one combination, this `del` would run
            # again on an already-deleted name. Confirm against the
            # enclosing code.
            del output
            gc.collect()
            # Dump the test fold the same way.
            with gzip.open('test_svm.svm', 'wb') as f:
                dump_svmlight_file(data[test_idx], target[test_idx], f)

            # The model is trained from the in-memory arrays; the dumps
            # above are not read back in this fragment.
            model = TFFMClassifier(
                order=2,
                rank=10,
                optimizer=tf.train.AdamOptimizer(learning_rate=0.01),
                n_epochs=50,
                batch_size=100000,
                init_std=0.001,
                reg=0.001,
                input_type='sparse')
            """
            model = FMClassification()
            """
            model.fit(data[train_idx], target[train_idx], show_progress=True)
            # Positive-class probability (column 1) for the test fold.
            ans = model.predict_proba(data[test_idx])[:, 1]

            score = roc_auc_score(target[test_idx], ans)
            logger.info('score: %s' % score)
            # Presumably mcc_optimize returns a (threshold, score) pair that
            # fills the two %s placeholders; verify.
            logger.info('all thresh: %s, score: %s' %
                        mcc_optimize(ans, target[test_idx]))
            # Recomputes the same ROC AUC as above.
            score = roc_auc_score(target[test_idx], ans)
# Report the sigmoid-kernel SVM result; svmauc3 is computed outside this
# fragment.
print ("-------------Training the sigmoid SVM-------------")
print("sigmoid SVM auc is",svmauc3)

from sklearn import svm
# Polynomial-kernel Nu-SVC: fit on the training split, predict the test split.
clf4 = svm.NuSVC(kernel='poly')
y_pred_svm4 = clf4.fit(X=dataset_train_X,y=dataset_train_y).predict(dataset_test_X)
# NOTE(review): the AUC is computed from hard class predictions rather than
# from decision scores. Confirm this is intended; the same pattern is used
# for the FM below.
svmauc4 = roc_auc_score(y_true=dataset_test_y,y_score=y_pred_svm4)
# The banner is printed after training has already finished.
print ("-------------Training the poly SVM-------------")
print("poly SVM auc is",svmauc4)

from tffm import TFFMClassifier
# Third-order factorization machine on dense input.
model = TFFMClassifier(
    order=3,
    rank=16,
    optimizer=tf.train.AdamOptimizer(learning_rate=0.01),
    n_epochs=200,
    batch_size=-1,
    init_std=0.001,
    input_type='dense'
)

model.fit(dataset_train_X, dataset_train_y, show_progress=True)
predict = model.predict(X=dataset_test_X)

# NOTE(review): `predict` holds the output of model.predict, not of
# predict_proba; see the note on svmauc4 above.
tfm_auc_3d = roc_auc_score(y_true=dataset_test_y,y_score=predict)
print("3d fm is",tfm_auc_3d)


from tffm import TFFMClassifier
model = TFFMClassifier(
    order=4,
    def test_case_1(self):
        """Run the preprocessing pipeline and fit an order-6 dense FM on it.

        Builds a feature pipeline (meta-feature join, imputation, binning and
        one-hot encoding), transforms a stratified training split with it,
        asserts that the result is non-empty and then fits a
        ``TFFMClassifier``. ``X``, ``y``, ``members`` and ``songs`` come from
        outside this method.
        """
        categorical_features = [
            'source_system_tab', 'source_screen_name', 'city', 'gender'
        ]

        # 'language' gets its own pipeline because it is imputed with -1
        # rather than with the string 'missing'.
        categorical_features_lang = ['language']

        numerical_features = ['bd', 'song_length', 'days_registered']

        # Numerical columns: mean-impute, then discretize into 4 one-hot bins.
        num_features_pipeline = Pipeline([
            ('impute', SimpleImputer(missing_values=np.nan, strategy='mean')),
            ('discretize', KBinsDiscretizer(n_bins=4, encode='onehot-dense'))
        ])

        # Categorical columns: fill gaps with 'missing', then one-hot encode;
        # categories unseen during fit are ignored at transform time.
        cat_features_pipeline = Pipeline([
            ('impute',
             SimpleImputer(missing_values=np.nan,
                           strategy='constant',
                           fill_value='missing')),
            ('onehot', OneHotEncoder(handle_unknown='ignore', sparse=False))
        ])

        # Same as above, but with -1 as the fill value.
        cat_features_pipeline_lang = Pipeline([
            ('impute',
             SimpleImputer(missing_values=np.nan,
                           strategy='constant',
                           fill_value=-1)),
            ('onehot', OneHotEncoder(handle_unknown='ignore', sparse=False))
        ])

        # Route each column group to its pipeline.
        preprocessor = ColumnTransformer(
            transformers=[('num', num_features_pipeline, numerical_features),
                          ('cat', cat_features_pipeline, categorical_features),
                          ('cat_lang', cat_features_pipeline_lang,
                           categorical_features_lang)])

        # Meta information is added first, then the column preprocessing runs.
        unified_pipeline = Pipeline(
            steps=[('add_meta_info',
                    MetaFeaturesExtractor(user_meta=members, item_meta=songs)
                    ), ('preprocessing', preprocessor)])

        # test_size=0.90 leaves only 10% of the rows for training.
        # Presumably this keeps the test fast; confirm.
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.90,
                                                            random_state=42,
                                                            stratify=y)

        X_train = unified_pipeline.fit_transform(X_train, y_train)

        self.assertTrue(len(X_train) > 0)

        model = TFFMClassifier(
            order=6,
            rank=10,
            optimizer=tf.train.AdamOptimizer(learning_rate=0.001),
            n_epochs=100,
            batch_size=-1,
            init_std=0.001,
            input_type='dense')

        # Labels are passed as a plain array via `.values`.
        model.fit(X_train, y_train.values, show_progress=True)