class FactorizationMachineBasedRecommender(BaseEstimator, ClassifierMixin):
    """Estimator-style wrapper around a dense-input ``TFFMClassifier``.

    The wrapped classifier is created once in ``__init__`` with fixed
    hyperparameters (order 6, rank 10, Adam with learning rate 0.01,
    100 epochs, ``batch_size=-1``).
    """

    def __init__(self, show_progress=False):
        """Store the progress flag and build the wrapped classifier.

        Args:
            show_progress: forwarded to ``TFFMClassifier.fit`` by ``fit``.
        """
        self.show_progress = show_progress
        self.model = TFFMClassifier(
            order=6,
            rank=10,
            optimizer=tf.train.AdamOptimizer(learning_rate=0.01),
            n_epochs=100,
            batch_size=-1,
            init_std=0.001,
            input_type='dense',
        )

    def fit(self, X, y=None):
        """Fit the wrapped classifier on ``X`` / ``y`` and return ``self``."""
        self.model.fit(X, y, show_progress=self.show_progress)
        return self

    def predict_proba(self, X, y=None):
        """Return column 1 of the wrapped model's ``predict_proba`` output.

        ``y`` is accepted but not used.
        """
        # presumably column 1 holds the positive-class probability -- confirm
        class_probabilities = self.model.predict_proba(X)
        return class_probabilities[:, 1]

    def fit_predict_proba(self, X, y=None):
        """Fit on ``X`` / ``y``, then return the scores for the same ``X``."""
        self.fit(X, y)
        scores = self.predict_proba(X)
        return scores

    def score(self, X, y=None, **kwargs):
        """Return the ROC AUC of the predicted scores for ``X`` against ``y``.

        Extra keyword arguments are accepted but not used.
        """
        scores = self.predict_proba(X, y)
        return roc_auc_score(y, scores)
def __init__(self, show_progress=False):
    """Store the progress flag and build the wrapped ``TFFMClassifier``.

    Args:
        show_progress: stored as ``self.show_progress``.
    """
    self.show_progress = show_progress
    # Fixed hyperparameters: order-6, rank-10 model on dense input,
    # optimized with Adam (learning rate 0.01) for 100 epochs.
    self.model = TFFMClassifier(
        order=6,
        rank=10,
        optimizer=tf.train.AdamOptimizer(learning_rate=0.01),
        n_epochs=100,
        batch_size=-1,
        init_std=0.001,
        input_type='dense',
    )
def fit(self, training_data, y):
    """Create a fresh sparse-input ``TFFMClassifier`` and fit it.

    Args:
        training_data: feature matrix passed to the classifier as ``X``.
        y: target values passed to the classifier as ``y``.
    """
    # A new model is built on every call, replacing any previous one.
    self.model = TFFMClassifier(
        order=2,
        rank=64,
        optimizer=tf.train.AdamOptimizer(learning_rate=0.01),
        n_epochs=100,
        batch_size=-1,
        init_std=0.001,
        input_type='sparse',
        verbose=2,
    )
    self.model.fit(X=training_data, y=y)
def test_decision_function_order_4(self):
    """Compare ``decision_function`` of an order-4 model with brute force.

    The model is fitted for one epoch on ``self.X`` / ``self.y``; its bias
    and the four weight tensors are then evaluated in the model's session
    and fed to ``self.bruteforce_inference`` to produce the reference values.
    """
    order = 4
    model = TFFMClassifier(order=order, rank=10, n_epochs=1)
    model.fit(self.X, self.y)

    session = model.session
    b = model.b.eval(session=session)
    w = [model.w[i].eval(session=session) for i in range(order)]

    expected = self.bruteforce_inference(self.X, w, b)
    observed = model.decision_function(self.X)
    np.testing.assert_almost_equal(observed, expected)
def decision_function_order_4(self, input_type, use_diag=False):
    """Check the order-4 ``decision_function`` against brute-force inference.

    Args:
        input_type: ``'dense'`` uses ``self.X`` as is; any other value
            converts it to a CSR sparse matrix first.
        use_diag: forwarded to both the model and ``bruteforce_inference``.
    """
    # init_std is deliberately large (1.0): with a small init_std the
    # contribution of the higher-order terms is negligible, so the test
    # would essentially cover only the low-order implementation.
    clf = TFFMClassifier(
        order=4,
        rank=10,
        optimizer=tf.train.AdamOptimizer(learning_rate=0.1),
        n_epochs=0,
        input_type=input_type,
        init_std=1.0,
        seed=0,
        use_diag=use_diag,
    )

    features = self.X if input_type == 'dense' else sp.csr_matrix(self.X)
    clf.fit(features, self.y)

    bias = clf.intercept
    weights = clf.weights
    # The reference is always computed from the dense matrix.
    expected = self.bruteforce_inference(self.X, weights, bias, use_diag=use_diag)
    observed = clf.decision_function(features)
    clf.destroy()

    np.testing.assert_almost_equal(observed, expected, decimal=4)
class FM_Rec(RecModel):
    """Recommender model backed by a second-order ``TFFMClassifier``."""

    def __init__(self):
        """Start without a model; one is created by ``fit``."""
        self.model = None

    def fit(self, training_data, y):
        """Build a new sparse-input classifier and fit it.

        Args:
            training_data: feature matrix passed to the classifier as ``X``.
            y: target values passed to the classifier as ``y``.
        """
        self.model = TFFMClassifier(
            order=2,
            rank=64,
            optimizer=tf.train.AdamOptimizer(learning_rate=0.01),
            n_epochs=100,
            batch_size=-1,
            init_std=0.001,
            input_type='sparse',
            verbose=2,
        )
        self.model.fit(X=training_data, y=y)

    def predict(self, predict_users):
        """Return the fitted model's ``predict_proba`` output for the input."""
        probabilities = self.model.predict_proba(predict_users)
        return probabilities
def decision_function_order_4(self, input_type, use_diag=False):
    """Check the order-4 ``decision_function`` against brute-force inference.

    Args:
        input_type: ``'dense'`` uses ``self.X`` as is; any other value
            converts it to a CSR sparse matrix first.
        use_diag: forwarded to both the model and ``bruteforce_inference``.
    """
    # init_std is deliberately large (1.0): with a small init_std the
    # contribution of the higher-order terms is negligible, so the test
    # would essentially cover only the low-order implementation.
    clf = TFFMClassifier(
        order=4,
        rank=10,
        optimizer=tf.train.AdamOptimizer(learning_rate=0.1),
        n_epochs=0,
        input_type=input_type,
        init_std=1.0,
        seed=0,
        use_diag=use_diag,
    )

    features = self.X if input_type == 'dense' else sp.csr_matrix(self.X)
    clf.fit(features, self.y)

    bias = clf.intercept
    weights = clf.weights
    # The reference is always computed from the dense matrix.
    expected = self.bruteforce_inference(self.X, weights, bias, use_diag=use_diag)
    observed = clf.decision_function(features)
    clf.destroy()

    np.testing.assert_almost_equal(observed, expected, decimal=4)
def decision_function_order_4(self, input_type):
    """Check the order-4 ``decision_function`` against brute-force inference.

    The model is fitted for one epoch before the comparison.

    Args:
        input_type: ``'dense'`` uses ``self.X`` as is; any other value
            converts it to a CSR sparse matrix first.
    """
    clf = TFFMClassifier(
        order=4,
        rank=10,
        optimizer=tf.train.AdamOptimizer(learning_rate=0.1),
        n_epochs=1,
        input_type=input_type,
    )

    features = self.X if input_type == 'dense' else sp.csr_matrix(self.X)
    clf.fit(features, self.y)

    bias = clf.intercept
    weights = clf.weights
    # The reference is always computed from the dense matrix.
    expected = self.bruteforce_inference(self.X, weights, bias)
    observed = clf.decision_function(features)
    clf.destroy()

    np.testing.assert_almost_equal(observed, expected, decimal=6)
def decision_function_order_4(self, input_type):
    """Check the order-4 ``decision_function`` against brute-force inference.

    The model is fitted for one epoch before the comparison.

    Args:
        input_type: ``'dense'`` uses ``self.X`` as is; any other value
            converts it to a CSR sparse matrix first.
    """
    clf = TFFMClassifier(
        order=4,
        rank=10,
        optimizer=tf.train.AdamOptimizer(learning_rate=0.1),
        n_epochs=1,
        input_type=input_type,
    )

    features = self.X if input_type == 'dense' else sp.csr_matrix(self.X)
    clf.fit(features, self.y)

    bias = clf.intercept
    weights = clf.weights
    # The reference is always computed from the dense matrix.
    expected = self.bruteforce_inference(self.X, weights, bias)
    observed = clf.decision_function(features)
    clf.destroy()

    np.testing.assert_almost_equal(observed, expected, decimal=6)
# Convert each split to a NumPy array with NaN replaced by zeros.
ax_tr = np.nan_to_num(np.array(X_tr))
ax_te = np.nan_to_num(np.array(X_te))
ax_te_cs = np.nan_to_num(np.array(X_te_cs))

# Model definition with the optimized hyperparameters.
model = TFFMClassifier(
    order=2,
    rank=7,
    optimizer=tf.train.AdamOptimizer(learning_rate=0.001),
    n_epochs=100,
    batch_size=1024,
    init_std=0.001,
    reg=0.01,
    input_type='dense',
    log_dir='/home/asif/01_tffm/logs/',
    verbose=1,
    seed=12345,
)

# Cold-start data: same column layout as the training frame.
cold_start = pd.DataFrame(ax_te_cs, columns=X_tr.columns)

# What happens if we only have access to categories and no historical
# click/purchase data? Zero every 'buy'/'click' column of the cold-start
# set, except the ones whose name contains 'Category'.
for column in cold_start.columns:
    if 'Category' in column:
        continue
    if 'buy' in column or 'click' in column:
        cold_start[column] = 0
# Split targets and features for the current fold.
y_train = y[train_index]
y_test = y[test_index]
features_train = features[train_index]
features_test = features[test_index]

# Build the sparse user ("u_all") and ad ("ad_all") inputs for both splits.
sparse_u_train = get_selected_input(features_train, "u_all", cfg)
sparse_ad_train = get_selected_input(features_train, "ad_all", cfg)
sparse_u_test = get_selected_input(features_test, "u_all", cfg)
sparse_ad_test = get_selected_input(features_test, "ad_all", cfg)

# Concatenate user and ad blocks column-wise and convert to CSR.
sparse_x_train = hstack([sparse_u_train, sparse_ad_train]).tocsr()
sparse_x_test = hstack([sparse_u_test, sparse_ad_test]).tocsr()

order = 2
model = TFFMClassifier(
    order=order,
    rank=10,
    optimizer=tf.train.AdamOptimizer(learning_rate=0.001),
    n_epochs=50,
    batch_size=50,
    init_std=0.001,
    reg=0.01,
    input_type='sparse',
    seed=42)
model.fit(sparse_x_train, y_train, show_progress=False)

# Hard class labels are kept under the original name for any later use.
predictions = model.predict(sparse_x_test)
# ROC AUC is a ranking metric: score it on the class-1 probabilities, not on
# thresholded labels, and label the printed value as what it actually is.
probabilities = model.predict_proba(sparse_x_test)[:, 1]
print('[order={}] auc: {}'.format(order, roc_auc_score(y_test, probabilities)))

# Close the model's session and release its resources.
model.destroy()
# Map the fold selectors onto explicit row positions of x_train.
idx = np.arange(x_train.shape[0], dtype=int)
_train = idx[train]
_test = idx[test]
trn_x = x_train[_train, :]
val_x = x_train[_test, :]
trn_y = y_train[train]
val_y = y_train[test]

# Number of validation rows per order_id (after resetting the index).
list_idx = df.loc[test].reset_index(drop=True).groupby(
    'order_id').apply(lambda x: x.index.values.shape[0]).tolist()
# The `np.int` alias was removed in NumPy 1.24; the builtin `int` selects the
# same dtype and matches the `np.arange(..., dtype=int)` call above.
list_idx = np.array(list_idx, dtype=int)

clf = TFFMClassifier(
    order=2,
    rank=10,
    optimizer=tf.train.AdamOptimizer(learning_rate=0.01),
    n_epochs=100,
    batch_size=100000,
    init_std=0.001,
    input_type='sparse',
)
clf.fit(trn_x, trn_y, show_progress=True)

# Column 1 of predict_proba is used as the score for the validation rows.
pred = clf.predict_proba(val_x)[:, 1]
all_pred[test] = pred

_score = log_loss(val_y, pred)
# AUC and F1 are negated so that, like log loss, lower means better.
_score2 = - roc_auc_score(val_y, pred)
_, _score3, _ = f1_metric(val_y.astype(int), pred.astype(float))
logger.debug(' _score: %s', _score3)

list_score.append(_score)
list_score2.append(_score2)
list_score3.append(- 1 * _score3)
# Basic dataset statistics.
print('Non-zeros rate: {}'.format(np.mean(x_train != 0)))
print('Classes balance: {} / {}'.format(np.mean(y_train == 0), np.mean(y_train == 1)))

# Hold out 30% of the data for evaluation.
X_tr, X_te, y_tr, y_te = train_test_split(x_train, y_train, random_state=42, test_size=0.3)

from tffm import TFFMClassifier, TFFMRegressor

for order in [3]:
    model = TFFMClassifier(
        order=order,
        rank=5,
        optimizer=tf.train.AdagradOptimizer(learning_rate=0.00001),
        n_epochs=5,
        batch_size=1024,
        init_std=0.001,
        reg=0.01,
        input_type='sparse',
        # session_config=tf.ConfigProto(log_device_placement=True, device_count={'GPU':0}),
        seed=42)
    model.fit(X_tr, y_tr, show_progress=True)

    # Hard class labels, still printed below as before.
    predictions = model.predict(X_te)
    # Log loss and ROC AUC must be computed from probabilities: feeding them
    # thresholded 0/1 labels yields a clipped, uninformative log loss and
    # throws away the ranking information AUC measures.
    probabilities = model.predict_proba(X_te)[:, 1]
    print('[order={}] logloss: {}'.format(order, log_loss(y_te, probabilities)))
    print('[order={}] auc: {}'.format(order, roc_auc_score(y_te, probabilities)))
    print(predictions)

    # this will close tf.Session and free resources
    model.destroy()
def tafm(data, oscar, subset_quantile):
    """Train and evaluate a TFFM classifier on several subsets of ``data``.

    For every entry of ``subset_quantile`` the data is subset with
    ``subsetdata``, split 80/20 into train/test, one-hot encoded with
    ``OneHotEncoding`` and used to fit a second-order ``TFFMClassifier``.
    The model is scored with the weighted F1 on the test split and on the
    ``oscar`` rows of the last 5, the last 10 and all distinct years.

    Args:
        data: frame with at least ``Film`` and ``Win`` columns.
        oscar: frame with at least ``Year`` and ``Win`` columns.
        subset_quantile: iterable of values handed to ``subsetdata``; may be
            empty, in which case an empty result frame is returned.

    Returns:
        A ``(results, oscar_pred)`` tuple. ``results`` has one row per
        quantile with the columns ``N_Movies`` (share of distinct films kept,
        rounded to 2 decimals), ``Time`` (seconds spent fitting and scoring),
        ``Model_Accuracy`` and ``Oscar_Rec5`` / ``Oscar_Rec10`` /
        ``Oscar_Rec15`` (weighted F1 scores). ``oscar_pred`` is a copy of the
        Oscar subset with the best F1 seen across all iterations, with a
        ``Predictions`` column added; it stays an empty frame if no score
        exceeded 0.
    """
    result_columns = ["N_Movies", "Time", "Model_Accuracy",
                      "Oscar_Rec5", "Oscar_Rec10", "Oscar_Rec15"]
    result_rows = []
    oscar_pred = pd.DataFrame()
    os_acc = 0  # best Oscar F1 seen so far, across quantiles and year windows

    for n_iteration, quantile in enumerate(subset_quantile, start=1):
        print("---Running iteration " + str(n_iteration) + " ---")

        # Subset the original data; the first return value is not needed here.
        _, subset_data = subsetdata(data, quantile)
        n_movies = round(
            subset_data.Film.unique().shape[0] / data.Film.unique().shape[0], 2)

        # Split into train and test data.
        xtrain, xtest, ytrain, ytest = train_test_split(
            subset_data.loc[:, subset_data.columns != "Win"],
            subset_data.loc[:, "Win"],
            test_size=0.2,
            random_state=42)

        # Encode the features.
        encoder, xtrain_enc, xtest_enc = OneHotEncoding(xtrain, xtest)

        start = time.time()
        tf.reset_default_graph()
        model = TFFMClassifier(
            order=2,
            rank=10,
            optimizer=tf.train.FtrlOptimizer(0.1, l1_regularization_strength=0.001),
            n_epochs=50,
            init_std=0.001,
            reg=0.01,
            batch_size=50,
            input_type='sparse'
        )
        model.fit(xtrain_enc, ytrain, show_progress=True)
        predictions = model.predict(xtest_enc)

        # Evaluation metric on the held-out split.
        model_acc = f1_score(ytest, predictions, average='weighted')

        # Score the Oscar rows of the last 5, last 10 and all distinct years.
        unique_years = list(np.unique(oscar.Year))
        oscar_years = [unique_years[-5:], unique_years[-10:], unique_years]
        oscar_accuracy = []
        for subset_yr in oscar_years:
            oscar_subset = oscar >> mask(X.Year.isin(subset_yr))
            oscar_x = oscar_subset.loc[:, oscar_subset.columns != "Win"]
            oscar_y = oscar_subset.loc[:, "Win"]
            oscar_train = encoder.transform(oscar_x)
            oscar_predictions = model.predict(oscar_train)
            oscar_acc = f1_score(oscar_y, oscar_predictions, average="weighted")
            oscar_accuracy.append(oscar_acc)
            # Keep the predictions of the best-scoring subset seen so far.
            if oscar_acc > os_acc:
                os_acc = oscar_acc
                oscar_pred = oscar_subset.copy()
                oscar_pred["Predictions"] = oscar_predictions

        spent_time = time.time() - start
        # Close the model's TF session so sessions do not pile up across
        # iterations (reset_default_graph alone does not close it).
        model.destroy()

        result_rows.append([n_movies, spent_time, model_acc] + oscar_accuracy)

    # Passing the names to the constructor also works for zero rows, whereas
    # assigning `.columns` on an empty frame raises a length mismatch.
    results = pd.DataFrame(result_rows, columns=result_columns)
    return results, oscar_pred
# Flatten the target to a 1-D vector with one entry per row.
y = np.reshape(y, (y.shape[0], ))

# Feature matrix: drop the FREQUENCY, CUST_ID, ARTICLE_ID and AGE columns
# and cast what is left to float32.
X = (
    data_train_FM
    .drop(columns=['FREQUENCY', 'CUST_ID', 'ARTICLE_ID', 'AGE'])
    .to_numpy()
    .astype(np.float32)
)
del data_train_FM

# Hyperparameters of the factorization machine.
rank = 20
l_r = 0.05
reg = 0.001
epoch = 200
model_tf = TFFMClassifier(
    order=2,
    rank=rank,
    optimizer=tf.train.AdamOptimizer(learning_rate=l_r),
    reg=reg,
    n_epochs=epoch,
    init_std=0.0001,
)

# Test protocol, de-duplicated and joined with each customer's AGE.
protocol = pd.read_csv(d + '/test_protocol.csv').drop_duplicates()
data_train = pd.read_csv(d + '/train_model.csv')
data_train = data_train[['CUST_ID', 'AGE']].drop_duplicates()
protocol = protocol.merge(data_train, on='CUST_ID')

# Baseline precision columns, de-duplicated on the full rows before the
# column selection.
data_reco_baselines = (
    pd.read_csv(d+'/data_reco_baselines.csv')
    .drop_duplicates()
    [['ARM_PRECISION', 'K50_PRECISION', 'ALS_PRECISION', 'BPR_PRECISION',
      'VAES_PRECISION', 'SPEC_PRECISION', 'CUST_ID', 'ARTICLE_ID']]
)
# Basic dataset statistics.
print('Dataset shape: {}'.format(x_train.shape))
print('Non-zeros rate: {}'.format(np.mean(x_train != 0)))
print('Classes balance: {} / {}'.format(np.mean(y_train == 0), np.mean(y_train == 1)))

# Hold out 30% of the data for evaluation.
X_tr, X_te, y_tr, y_te = train_test_split(x_train, y_train, random_state=42, test_size=0.3)

from tffm import TFFMClassifier, TFFMRegressor

for order in [3]:
    model = TFFMClassifier(
        order=order,
        rank=5,
        optimizer=tf.train.AdagradOptimizer(learning_rate=0.00001),
        n_epochs=5,
        batch_size=1024,
        init_std=0.001,
        reg=0.01,
        input_type='sparse',
        # session_config=tf.ConfigProto(log_device_placement=True, device_count={'GPU':0}),
        seed=42
    )
    model.fit(X_tr, y_tr, show_progress=True)

    # Hard class labels, still printed below as before.
    predictions = model.predict(X_te)
    # Log loss and ROC AUC must be computed from probabilities: feeding them
    # thresholded 0/1 labels yields a clipped, uninformative log loss and
    # throws away the ranking information AUC measures.
    probabilities = model.predict_proba(X_te)[:, 1]
    print('[order={}] logloss: {}'.format(order, log_loss(y_te, probabilities)))
    print('[order={}] auc: {}'.format(order, roc_auc_score(y_te, probabilities)))
    print(predictions)

    # this will close tf.Session and free resources
    model.destroy()
# Cross-validation run: for every parameter combination, dump the first CV
# split to gzip-compressed svmlight files, fit a TFFM classifier on the
# training part and log ROC AUC plus the result of mcc_optimize.
logger.info('cv_start')
# NOTE(review): `params` is only logged in this fragment; the classifier
# below is built with fixed hyperparameters -- confirm this is intended.
for params in ParameterGrid(all_params):
    logger.info('param: %s' % (params))
    # Only the first split of `cv` is evaluated.
    for train_idx, test_idx in list(cv)[:1]:
        with gzip.open('train_fm.svm', 'wb') as f:
            dump_svmlight_file(data[train_idx], target[train_idx], f)
        # NOTE(review): `output` is not defined in this fragment; unless it is
        # rebound elsewhere, `del output` raises NameError on a second pass.
        del output
        gc.collect()
        with gzip.open('test_svm.svm', 'wb') as f:
            dump_svmlight_file(data[test_idx], target[test_idx], f)
        model = TFFMClassifier( order=2, rank=10, optimizer=tf.train.AdamOptimizer(learning_rate=0.01), n_epochs=50, batch_size=100000, init_std=0.001, reg=0.001, input_type='sparse')
        # The bare string below is a no-op expression (a disabled alternative).
        """ model = FMClassification() """
        model.fit(data[train_idx], target[train_idx], show_progress=True)
        # Column 1 of predict_proba is used as the score.
        ans = model.predict_proba(data[test_idx])[:, 1]
        score = roc_auc_score(target[test_idx], ans)
        logger.info('score: %s' % score)
        # mcc_optimize must return exactly two values to fill both %s slots.
        logger.info('all thresh: %s, score: %s' % mcc_optimize(ans, target[test_idx]))
        # Recomputes the same AUC as above.
        score = roc_auc_score(target[test_idx], ans)
print ("-------------Training the sigmoid SVM-------------") print("sigmoid SVM auc is",svmauc3) from sklearn import svm clf4 = svm.NuSVC(kernel='poly') y_pred_svm4 = clf4.fit(X=dataset_train_X,y=dataset_train_y).predict(dataset_test_X) svmauc4 = roc_auc_score(y_true=dataset_test_y,y_score=y_pred_svm4) print ("-------------Training the poly SVM-------------") print("poly SVM auc is",svmauc4) from tffm import TFFMClassifier model = TFFMClassifier( order=3, rank=16, optimizer=tf.train.AdamOptimizer(learning_rate=0.01), n_epochs=200, batch_size=-1, init_std=0.001, input_type='dense' ) model.fit(dataset_train_X, dataset_train_y, show_progress=True) predict = model.predict(X=dataset_test_X) tfm_auc_3d = roc_auc_score(y_true=dataset_test_y,y_score=predict) print("3d fm is",tfm_auc_3d) from tffm import TFFMClassifier model = TFFMClassifier( order=4,
def test_case_1(self):
    """Preprocess a 10% training split and fit an order-6 ``TFFMClassifier``.

    Numerical features are mean-imputed and discretized into 4 one-hot bins;
    categorical features are imputed with a constant and one-hot encoded.
    The test asserts that the transformed training data is non-empty and
    then runs ``fit``.
    """
    numerical_features = ['bd', 'song_length', 'days_registered']
    categorical_features = [
        'source_system_tab', 'source_screen_name', 'city', 'gender'
    ]
    categorical_features_lang = ['language']

    def one_hot_pipeline(fill_value):
        # Constant imputation followed by dense one-hot encoding.
        return Pipeline([
            ('impute', SimpleImputer(missing_values=np.nan,
                                     strategy='constant',
                                     fill_value=fill_value)),
            ('onehot', OneHotEncoder(handle_unknown='ignore', sparse=False)),
        ])

    num_features_pipeline = Pipeline([
        ('impute', SimpleImputer(missing_values=np.nan, strategy='mean')),
        ('discretize', KBinsDiscretizer(n_bins=4, encode='onehot-dense')),
    ])
    cat_features_pipeline = one_hot_pipeline('missing')
    cat_features_pipeline_lang = one_hot_pipeline(-1)

    preprocessor = ColumnTransformer(transformers=[
        ('num', num_features_pipeline, numerical_features),
        ('cat', cat_features_pipeline, categorical_features),
        ('cat_lang', cat_features_pipeline_lang, categorical_features_lang),
    ])

    meta_extractor = MetaFeaturesExtractor(user_meta=members, item_meta=songs)
    unified_pipeline = Pipeline(steps=[
        ('add_meta_info', meta_extractor),
        ('preprocessing', preprocessor),
    ])

    # 90% of the rows go to the unused test part; only 10% are trained on.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.90, random_state=42, stratify=y)
    X_train = unified_pipeline.fit_transform(X_train, y_train)
    self.assertTrue(len(X_train) > 0)

    model = TFFMClassifier(
        order=6,
        rank=10,
        optimizer=tf.train.AdamOptimizer(learning_rate=0.001),
        n_epochs=100,
        batch_size=-1,
        init_std=0.001,
        input_type='dense',
    )
    model.fit(X_train, y_train.values, show_progress=True)