def pyFM_cv(verbose=True, t=Timer()):
    # pyFM hyperparameter grid
    factors = np.linspace(20, 200, 10, dtype=np.int64)
    learning_rates = np.logspace(-3, -3, 1)  # a single rate, 0.001
    rmses = dict()
    for k in factors:
        for rate in learning_rates:
            algo = pylibfm.FM(num_factors=k,
                              num_iter=200,
                              verbose=verbose,
                              task="regression",
                              initial_learning_rate=rate,
                              learning_rate_schedule="optimal")
            rmse = pyFM_cv_algo(algo)
            print("------Time:{}, rmse: {}, factors: {}, learning_rates: {}------\n\n"
                  .format(t.now(), rmse, k, rate))
            # Store a fresh dict per configuration: reusing one mutable dict
            # would make every entry reflect only the final parameter values.
            rmses[rmse] = {'k': k, 'rate': rate}
    # Find the model with the lowest RMSE
    lowest_rmse = min(rmses.keys())
    best_params = rmses[lowest_rmse]
    print("Best pyFM rmse: {}. Params: factors: {}, learning_rates: {}".format(
        lowest_rmse, best_params['k'], best_params['rate']))
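# A minimal, self-contained sketch of the shared-dict pitfall fixed above
# (illustrative only, not part of the original code): storing one mutable
# dict for every configuration makes every stored entry reflect only the
# final parameter values.
shared = {}
store = {}
for k in (10, 20):
    shared['k'] = k
    store[k] = shared                        # every value aliases the same dict
print(store[10]['k'])                        # prints 20, not 10
store = {k: {'k': k} for k in (10, 20)}      # a fresh dict per configuration
print(store[10]['k'])                        # prints 10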
def pyFMJob(data_path, params, N, vectorizer, with_timestamps=False, with_authors=False):
    rmses = []
    logging.info("Evaluating with params: {0}".format(params))
    for i in range(1, 4 + 1):
        train_data, y_tr, _ = loadData('train/train_N' + str(N) + '.' + str(i),
                                       data_path=data_path,
                                       with_timestamps=with_timestamps,
                                       with_authors=with_authors)
        X_tr = vectorizer.transform(train_data)
        fm = pylibfm.FM(num_factors=params['f'], num_iter=params['mi'], k0=params['bias'],
                        k1=params['oneway'], init_stdev=params['init_stdev'],
                        validation_size=params['val_size'], learning_rate_schedule=params['lr_s'],
                        initial_learning_rate=params['lr'], power_t=params['invscale_pow'],
                        t0=params['optimal_denom'], shuffle_training=params['shuffle'],
                        seed=params['seed'], task='regression', verbose=True)
        fm.fit(X_tr, y_tr)
        val_data, y_va, _ = loadData('val/val_N' + str(N) + '.' + str(i),
                                     data_path=data_path,
                                     with_timestamps=with_timestamps,
                                     with_authors=with_authors)
        X_va = vectorizer.transform(val_data)
        preds = fm.predict(X_va)
        rmse = sqrt(mean_squared_error(y_va, preds))
        print("FM RMSE: %.4f" % rmse)
        rmses.append(rmse)
    return mean(rmses)
def pyfm_predict(train_actual, predict):
    """
    Factorization Machine trained with SGD using the pyFM library.

    Trains on a train set and computes predictions on a test set.

    Args:
        train_actual (pandas.DataFrame): train set
        predict (pandas.DataFrame): test set

    Hyperparameters:
        num_factors: the number of latent factors
        num_iter: the number of iterations of the SGD procedure
        initial_learning_rate: the starting SGD learning rate

    Returns:
        numpy array: predictions
    """
    print("pyfm")
    predict_data, y_predict = create_input_pyfm(predict)
    train_actual_data, y_train_actual = create_input_pyfm(train_actual)
    v = DictVectorizer()
    X_train = v.fit_transform(train_actual_data)
    X_test = v.transform(predict_data)
    # Hyperparameters
    num_factors = 20
    num_iter = 200
    task = 'regression'
    initial_learning_rate = 0.001
    learning_rate_schedule = 'optimal'
    fm = pylibfm.FM(num_factors=num_factors,
                    num_iter=num_iter,
                    task=task,
                    initial_learning_rate=initial_learning_rate,
                    learning_rate_schedule=learning_rate_schedule)
    fm.fit(X_train, y_train_actual)
    preds = fm.predict(X_test)
    # Ratings live on a 1-5 scale, so clamp the raw FM outputs
    return np.clip(preds, 1, 5)
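# create_input_pyfm is not shown in this snippet; a minimal sketch of the
# shape it presumably returns (the column names 'user', 'item', 'rating'
# are assumptions): one dict of string-valued categorical features per
# rating, which DictVectorizer one-hot encodes, plus the ratings as floats.
import numpy as np

def create_input_pyfm_sketch(df):
    data = [{'user': str(u), 'item': str(i)}
            for u, i in zip(df['user'], df['item'])]
    y = df['rating'].astype(np.float64).values
    return data, y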
def main(para_dname):
    data_dir = 'data'
    output_dir = 'output'
    dataset_name = para_dname
    # dataset_name = 'ml-1m'
    # dataset_name = 'douban'
    # dataset_name = 'yahoo_music'
    train_X, test_X, train_y, test_y = load_dataset(data_dir, dataset_name)
    v = DictVectorizer()
    train_X = v.fit_transform(train_X)
    test_X = v.transform(test_X)
    fm = pylibfm.FM(num_factors=10,
                    num_iter=1000,
                    verbose=True,
                    task='regression',
                    initial_learning_rate=0.001,
                    learning_rate_schedule='optimal',
                    validation_size=0.1)
    # Note: stock pylibfm's fit() returns None; unpacking per-iteration losses
    # here relies on a modified build of pyFM.
    train_loss, val_loss = fm.fit(train_X, train_y)
    train_loss = np.sqrt(np.array(train_loss))
    val_loss = np.sqrt(np.array(val_loss))
    np.save(os.path.join(output_dir, dataset_name + '_trloss'), train_loss)
    np.save(os.path.join(output_dir, dataset_name + '_valloss'), val_loss)
    preds = fm.predict(test_X)
    test_loss = math.sqrt(mean_squared_error(test_y, preds))
    print(preds)
    print('Test loss: %.5f' % test_loss)
    return 0
def __init__(self, iter=100, factor=10, use_info=True, path='./', external_fm=None):
    from pyfm import pylibfm
    self.__use_info = use_info
    # temp code, load ml-100k's info
    if self.__use_info:
        self.__info = Info(path)
    # Build and train a Factorization Machine
    if external_fm:
        print('Use external FM: %s' % type(external_fm), file=sys.stderr)
        self.__fm = external_fm
    else:
        print('iter=%d, factor=%d, use_info=%d' % (iter, factor, use_info),
              file=sys.stderr)
        self.__fm = pylibfm.FM(num_factors=factor,
                               num_iter=iter,  # note: `iter` shadows the builtin
                               verbose=True,
                               task="regression",
                               initial_learning_rate=0.001,
                               learning_rate_schedule="optimal")
def FM(train_data, test_data, y_train):
    import numpy as np
    from sklearn.feature_extraction import DictVectorizer
    from pyfm import pylibfm
    import time

    start_time = time.time()
    # Transform lists of feature-value mappings into one-hot encoded vectors
    v = DictVectorizer()
    X_train = v.fit_transform(train_data)
    X_test = v.transform(test_data)
    # Build and train a Factorization Machine
    fm = pylibfm.FM(num_factors=5,
                    num_iter=10,
                    verbose=True,
                    task="regression",
                    initial_learning_rate=0.1,
                    learning_rate_schedule="optimal")
    fm.fit(X_train, y_train)
    preds = fm.predict(X_test)
    # Time taken for training (seconds)
    print("--- %s seconds : Time taken for training ---" % (time.time() - start_time))
    np.savetxt('predictions.txt', preds)
    return preds
def __init__(self, n_factor=16, n_iter=10, use_attrs=True):
    self.fm = pylibfm.FM(num_factors=n_factor,
                         num_iter=n_iter,
                         task="regression",
                         initial_learning_rate=0.001,
                         learning_rate_schedule="optimal")
    self.use_attrs = use_attrs
    self.v = DictVectorizer()
def cf_models(cf_lib, N, data_path, params):
    # Caching, since the CF params are always the same
    cf_models = {}
    all_data, y_all, items = loadData(
        "eval_all_N" + str(N) + ".data",
        data_path='TwitterRatings/funkSVD/data_with_authors/',  # hardcoded path
        with_timestamps=False,
        with_authors=True)
    if cf_lib == "pyFM":
        v = DictVectorizer()
        X_all = v.fit_transform(all_data)
        for i in range(1, 4 + 1):
            train_data, y_tr, _ = loadData(
                'train/train_N' + str(N) + '.' + str(i),
                data_path='TwitterRatings/funkSVD/data_with_authors/',
                with_timestamps=False,
                with_authors=True)
            X_tr = v.transform(train_data)
            fm = pylibfm.FM(num_factors=params['f'], num_iter=params['mi'], k0=params['bias'],
                            k1=params['oneway'], init_stdev=params['init_stdev'],
                            validation_size=params['val_size'], learning_rate_schedule=params['lr_s'],
                            initial_learning_rate=params['lr'], power_t=params['invscale_pow'],
                            t0=params['optimal_denom'], shuffle_training=params['shuffle'],
                            seed=params['seed'], task='regression', verbose=True)
            fm.fit(X_tr, y_tr)
            cf_models[i] = fm
        return cf_models, v, items
    elif cf_lib == "implicit":
        all_c = consumption(ratings_path=data_path + 'eval_all_N' + str(N) + '.data',
                            rel_thresh=0,
                            with_ratings=True)
        items_ids = list(set([itemId for userId, itemsDict in all_c.items()
                              for itemId in itemsDict]))
        idcoder = IdCoder(items_ids, all_c.keys())
        for i in range(1, 4 + 1):
            ones, row, col = get_data(data_path=data_path, all_c=all_c, idcoder=idcoder,
                                      fold=i, N=N, mode="tuning")
            matrix = csr_matrix((ones, (row, col)), dtype=np.float64)
            user_items = matrix.T.tocsr()
            model = implicit.als.AlternatingLeastSquares(factors=params['f'],
                                                         regularization=params['lamb'],
                                                         iterations=params['mi'],
                                                         dtype=np.float64)
            model.fit(matrix)
            cf_models[i] = model
        return cf_models, idcoder, items
def train(self, n_epochs: int, learning_rate: float = 0.001, random_seed: int = 42,
          hybrid: bool = False, verbose: bool = True):
    self._build_train_test_ds(hybrid)
    self.fm = pylibfm.FM(num_factors=self.k,
                         num_iter=n_epochs,
                         verbose=verbose,
                         task='regression',
                         initial_learning_rate=learning_rate,
                         seed=random_seed)
    self.fm.fit(self.X_train, self.y_train)
def __init__(self):
    self.fm = pylibfm.FM(k1=False,  # disable the one-way (linear) terms
                         validation_size=0.005,
                         num_factors=10,
                         num_iter=8,
                         verbose=True,
                         task="classification",
                         initial_learning_rate=0.001,
                         init_stdev=0.002,
                         learning_rate_schedule="optimal")
def train(training_data, labels):
    # One-hot encode the features and integer-encode the labels
    label_encoder = LabelEncoder()
    vectorizer = DictVectorizer()
    train_event_x = vectorizer.fit_transform(training_data)
    train_event_y = label_encoder.fit_transform(labels)
    # Create and train the model.
    pctr_estimator = pylibfm.FM()
    pctr_estimator.fit(train_event_x, train_event_y)
    model = (pctr_estimator, label_encoder, vectorizer)
    print('Training done')
    return model
def __init__(self, rec_name, dataset, uses_features):
    super(FMRec, self).__init__(rec_name, dataset, uses_features)
    # init
    self.one_hot_columns = None
    # default rec
    self.fm = pylibfm.FM(num_factors=50,
                         num_iter=10,
                         task="regression",
                         initial_learning_rate=0.001,
                         learning_rate_schedule="optimal",
                         verbose=True)
def train(training_data, labels):
    # One-hot encode the features and integer-encode the labels
    label_encoder = LabelEncoder()
    vectorizer = DictVectorizer()
    train_event_x = vectorizer.fit_transform(training_data)
    train_event_y = label_encoder.fit_transform(labels)
    # Balanced class weight; note that pylibfm's fit() takes no sample
    # weights, so this global is only computed here for use elsewhere.
    global weights
    weights = len(labels) / (sum(labels) * 2)
    # Create and train the model using the Factorization Machine algorithm
    pctr_estimator = pylibfm.FM()
    pctr_estimator.fit(train_event_x, train_event_y)
    model = (pctr_estimator, label_encoder, vectorizer)
    print('Training done')
    return model
def __init__(self, num_factors=10, num_iter=1, k0=True, k1=True, init_stdev=0.1,
             validation_size=0.01, learning_rate_schedule="optimal",
             initial_learning_rate=0.01, power_t=0.5, t0=0.001,
             task='classification', verbose=True, shuffle_training=True, seed=28):
    # super(BaseEstimator, self).__init__() would skip BaseEstimator's own
    # __init__; call the next class in the MRO instead.
    super().__init__()
    self.num_factors = num_factors
    self.num_iter = num_iter
    self.k0 = k0
    self.k1 = k1
    self.init_stdev = init_stdev
    self.validation_size = validation_size
    self.learning_rate_schedule = learning_rate_schedule
    self.initial_learning_rate = initial_learning_rate
    self.power_t = power_t
    self.t0 = t0
    self.task = task
    self.verbose = verbose
    self.shuffle_training = shuffle_training
    self.seed = seed
    self.fm = pylibfm.FM(
        num_factors=self.num_factors,
        num_iter=self.num_iter,
        k0=self.k0,
        k1=self.k1,
        init_stdev=self.init_stdev,
        validation_size=self.validation_size,
        learning_rate_schedule=self.learning_rate_schedule,
        initial_learning_rate=self.initial_learning_rate,
        power_t=self.power_t,
        t0=self.t0,
        task=self.task,
        verbose=self.verbose,
        shuffle_training=self.shuffle_training,
        seed=self.seed)
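# Hypothetical usage of the estimator above (the class name FMClassifier is
# an assumption; the snippet does not show it). pylibfm expects sparse
# input, so the feature dicts are vectorized first.
import numpy as np
from sklearn.feature_extraction import DictVectorizer

v = DictVectorizer()
X = v.fit_transform([{'user': str(u), 'item': it}
                     for u, it in [(1, 'a'), (2, 'b'), (3, 'a'),
                                   (1, 'b'), (2, 'c'), (3, 'c')]])
y = np.array([1.0, 0.0, 1.0, 0.0, 1.0, 0.0])
clf = FMClassifier(num_factors=4, num_iter=5)
clf.fm.fit(X, y)
print(clf.fm.predict(X))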
def main():
    print(getCurrentTime(), "running...")
    # train covers '2018-09-17' through '2018-09-23'
    # (newer NumPy additionally needs allow_pickle=True for these loads)
    dok_train = np.load(r"%s\..\input\sparse_train.npy" % runningPath)[()]
    # verify contains only '2018-09-24'
    dok_verify = np.load(r"%s\..\input\sparse_verify.npy" % runningPath)[()]
    dok_test = np.load(r"%s\..\input\sparse_test.npy" % runningPath)[()]
    train_label = pd.read_csv(r'%s\..\input\train_label.txt' % runningPath)
    verify_label = pd.read_csv(r'%s\..\input\verify_label.txt' % runningPath)
    fm = pylibfm.FM(num_factors=50,
                    num_iter=10,
                    verbose=True,
                    task="classification",
                    initial_learning_rate=0.0001,
                    learning_rate_schedule="optimal")
    fm.fit(dok_train, train_label['is_trade'])
    Y_predicted = fm.predict(dok_verify)
    y_true = verify_label['is_trade']
    pyfm_logloss = -np.sum(y_true * np.log(Y_predicted) +
                           (1 - y_true) * np.log(1 - Y_predicted)) / Y_predicted.shape[0]
    print(getCurrentTime(), "pyFM logloss %.6f" % pyfm_logloss)
    return
def pyfm(train, test, **arg):
    print('[PYFM] applying')
    # Get the args
    num_factors = arg['num_factors']
    num_iter = arg['num_iter']
    task = arg['task']
    initial_learning_rate = arg['initial_learning_rate']
    learning_rate_schedule = arg['learning_rate_schedule']
    (train_data, y_train, train_users, train_items) = prepare_data(train)
    (test_data, y_test, test_users, test_items) = prepare_data(test)
    v = DictVectorizer()
    X_train = v.fit_transform(train_data)
    X_test = v.transform(test_data)
    fm = pylibfm.FM(num_factors=num_factors,
                    num_iter=num_iter,
                    task=task,
                    initial_learning_rate=initial_learning_rate,
                    learning_rate_schedule=learning_rate_schedule)
    fm.fit(X_train, y_train)
    preds = fm.predict(X_test)
    # Clamp predictions to the 1-5 rating scale
    for i in range(len(preds)):
        if preds[i] > 5:
            preds[i] = 5
        elif preds[i] < 1:
            preds[i] = 1
    df_return = test.copy()
    df_return.Rating = preds
    print('[PYFM] done')
    return df_return
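# Equivalent vectorized form of the clamping loop above: np.clip bounds all
# predictions to the 1-5 scale in one call (same result; the loop is kept
# above to preserve the original code). A tiny self-contained demo:
import numpy as np
demo_preds = np.array([0.3, 2.7, 6.1])
print(np.clip(demo_preds, 1, 5))  # [1.  2.7 5. ]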
def peretrain():
    X_train = []
    y = []
    # Build (who_marked, marked_user) pairs labeled 0 for dislikes, 1 for likes
    for user in os.listdir("userinfo"):
        for file in os.listdir("userinfo/" + user):
            if file == "dislike.txt":
                dann = [int(i)
                        for i in open("userinfo/" + user + '/' + file, "r").read().split()
                        if checkint(i)]
                for d in dann:
                    X_train.append({"who_marked": str(user), "marked_user": str(d)})
                    y.append(0)
            elif file == "like.txt":
                dann = [int(i)
                        for i in open("userinfo/" + user + '/' + file, "r").read().split()
                        if checkint(i)]
                for d in dann:
                    X_train.append({"who_marked": str(user), "marked_user": str(d)})
                    y.append(1)
    v = DictVectorizer()
    X = v.fit_transform(X_train)
    y = np.array(y, dtype=np.float64)
    fm = pylibfm.FM(num_factors=20,
                    num_iter=len(y) * 3,
                    verbose=False,
                    task="regression",
                    initial_learning_rate=0.001,
                    learning_rate_schedule="optimal")
    fm.fit(X, y)
    return fm
def dofit_pyfm():
    d = get_subsample()
    globals().update(d)  # pulls X_train / y_train into module scope
    clf = pylibfm.FM(num_factors=4,
                     num_iter=100,
                     verbose=True,
                     task="classification",
                     initial_learning_rate=0.00001,
                     learning_rate_schedule="optimal")
    scaler = preproc.StandardScaler(with_mean=False)  # hmm
    scaler.fit(X_train)

    def transx(x):
        # Scale, then hand pylibfm the CSR matrix it expects
        x = scaler.transform(x)
        return scipy.sparse.csr_matrix(x)

    # Wrap fit/predict so inputs are always scaled and sparse
    clf._fit_old = clf.fit
    clf.fit = lambda x, y: clf._fit_old(transx(x), y)
    clf._predict_old = clf.predict
    clf.predict = lambda x: clf._predict_old(transx(x))
    clf.fit(X_train, y_train)
    return {'clf': clf}
def benchmark(task='regression', content=False):
    losses = []
    total_time = 0
    for k in range(5):
        print('== Fold %d' % (k + 1))
        # Set RNG seeds for reproducibility
        np.random.seed(0)
        random.seed(0)
        # Load data
        (train_data, y_train) = load_ml_100k("u%d.base" % (k + 1), content)
        (test_data, y_test) = load_ml_100k("u%d.test" % (k + 1), content)
        if task == "classification":
            y_test = np.greater(y_test, 3)
        # Transform to matrix
        v = DictVectorizer()
        x_train = v.fit_transform(train_data)
        x_test = v.transform(test_data)
        # Build and train a Factorization Machine
        fm = pylibfm.FM(num_iter=20,
                        verbose=True,
                        task=task,
                        initial_learning_rate=0.005,
                        learning_rate_schedule="constant",
                        seed=0)
        start = time.time()
        fm.fit(x_train, y_train)
        used = time.time() - start
        total_time += used
        # Evaluate
        predictions = fm.predict(x_test)
        if task == "regression":
            losses.append(root_mean_squared_error(y_test, predictions))
            print("FM RMSE: %.4f" % losses[-1])
        elif task == "classification":
            losses.append(log_loss(y_test, predictions))
            print("FM log loss: %.4f" % losses[-1])
        print("Time used: %.4fs" % used)
    print('== Summary')
    # RMSE for regression, log loss for classification
    print('Mean loss: %.4f' % np.mean(losses))
    print('Total time: %.4fs' % total_time)
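# root_mean_squared_error was added in scikit-learn 1.4; on older versions
# the benchmark above can fall back to an equivalent helper (a compatibility
# shim, not part of pyFM):
try:
    from sklearn.metrics import root_mean_squared_error
except ImportError:
    from sklearn.metrics import mean_squared_error

    def root_mean_squared_error(y_true, y_pred):
        return mean_squared_error(y_true, y_pred) ** 0.5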
        val_w = sample_weight[test]
        # reg = LogisticRegression(C=0.1, solver='sag', n_jobs=-1)
        # pred_x = cross_val_predict(reg, trn_x, trn_y, cv=5, n_jobs=-1)
        # trn_x = np.c_[trn_x, pred_x]
        """
        clf = TFFMClassifier(order=6,
                             rank=10,
                             optimizer=tf.train.AdagradOptimizer(0.01),
                             n_epochs=100,
                             batch_size=10000,
                             init_std=0.001,
                             input_type='sparse')
        """
        clf = pylibfm.FM(**params)
        clf.fit(trn_x, trn_y)
        _score = log_loss(val_y, clf.predict(val_x), sample_weight=val_w)
        _score2 = -roc_auc_score(val_y, clf.predict(val_x), sample_weight=val_w)
        # logger.debug(' _score: %s' % _score)
        list_score.append(_score)
        list_score2.append(_score2)
        break
    score = (np.mean(list_score), np.min(list_score), np.max(list_score))
    score2 = (np.mean(list_score2), np.min(list_score2), np.max(list_score2))
    logger.info('param: %s' % (params))
v = DictVectorizer()
# X_origin = train.loc[:, ['iid', 'uid']].astype(np.string_).to_dict(orient='records')
# X = v.fit_transform(X_origin)
y = np.array(train.loc[:, ['score']]).flatten().astype(np.float64)
# X_train, X_test, y_train, y_test = train_test_split(X, y)
X_merge = pd.concat([train.loc[:, ['uid', 'iid']], test])
X_merge_hot = v.fit_transform(X_merge.astype(np.string_).to_dict(orient='records'))
train_hot = X_merge_hot[0:train.shape[0]]
test_hot = X_merge_hot[train.shape[0]:X_merge_hot.shape[0]]
print("data is ready")
fm = pylibfm.FM(num_factors=100,
                num_iter=30,
                verbose=True,
                task="regression",
                initial_learning_rate=0.01,
                learning_rate_schedule="optimal")
# y_train = y_train.astype(np.float64)
fm.fit(train_hot, y)
print('fit well')
joblib.dump(fm, "fm_model10,0000_iter_30.m")
print("start predict")
y_pred = fm.predict(test_hot)
df_fm = pd.DataFrame(y_pred, columns=['score'])
df_fm.to_csv("fm_result100_0000_iter_30.csv", index=False)
# ground_truth = np.around(preds)
from pyfm import pylibfm
from sklearn.feature_extraction import DictVectorizer

iris_data = load_iris()
X = iris_data['data']
y = iris_data['target'] == 2
# Turn each row into a {column_index: value} dict so DictVectorizer can
# encode it (feature values that repeat within a row collide here).
data = [{v: k for k, v in dict(zip(i, range(len(i)))).items()} for i in X]
X_train, X_test, y_train, y_test = train_test_split(data, y,
                                                    test_size=0.3,
                                                    random_state=0)
v = DictVectorizer()
X_train = v.fit_transform(X_train)
X_test = v.transform(X_test)
fm = pylibfm.FM(num_factors=50,
                num_iter=1000,
                verbose=True,
                task="classification",
                initial_learning_rate=0.0001,
                learning_rate_schedule="optimal")
fm.fit(X_train, y_train)
y_preds = fm.predict(X_test)

from sklearn.metrics import log_loss
print("Validation log loss: %.4f" % log_loss(y_test, y_preds))
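# What the dict comprehension above builds for one iris row (values are
# illustrative): feature values keyed by column index, so DictVectorizer
# treats each column as a numeric feature. Note that equal values within a
# row collide in the intermediate value->index dict.
row = [5.1, 3.5, 1.4, 0.2]
d = {v: k for k, v in dict(zip(row, range(len(row)))).items()}
print(d)  # {0: 5.1, 1: 3.5, 2: 1.4, 3: 0.2}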
def hybrid_protocol_evaluation(data_path, data_path_context, cf_lib, solr,
                               params_cb, params_cf, params_hy, N):
    test_c = consumption(ratings_path=data_path + 'test/test_N20.data',
                         rel_thresh=0,
                         with_ratings=True)
    train_c = consumption(ratings_path=data_path + 'eval_train_N20.data',
                          rel_thresh=0,
                          with_ratings=False)
    all_c = consumption(ratings_path=data_path + 'eval_all_N20.data',
                        rel_thresh=0,
                        with_ratings=True)
    MRRs = dict((N, []) for N in [5, 10, 15, 20])
    nDCGs = dict((N, []) for N in [5, 10, 15, 20])
    APs = dict((N, []) for N in [5, 10, 15, 20])
    Rprecs = dict((N, []) for N in [5, 10, 15, 20])
    if cf_lib == "pyFM":
        all_data, y_all, items = loadData("eval_all_N20.data",
                                          data_path=data_path_context,
                                          with_timestamps=False,
                                          with_authors=True)
        v = DictVectorizer()
        X_all = v.fit_transform(all_data)
        train_data, y_tr, _ = loadData('eval_train_N20.data',
                                       data_path=data_path_context,
                                       with_timestamps=False,
                                       with_authors=True)
        X_tr = v.transform(train_data)
        fm = pylibfm.FM(num_factors=params_cf['f'], num_iter=params_cf['mi'], k0=params_cf['bias'],
                        k1=params_cf['oneway'], init_stdev=params_cf['init_stdev'],
                        validation_size=params_cf['val_size'], learning_rate_schedule=params_cf['lr_s'],
                        initial_learning_rate=params_cf['lr'], power_t=params_cf['invscale_pow'],
                        t0=params_cf['optimal_denom'], shuffle_training=params_cf['shuffle'],
                        seed=params_cf['seed'], task='regression', verbose=True)
        fm.fit(X_tr, y_tr)
    elif cf_lib == "implicit":
        items_ids = list(set([itemId for userId, itemsDict in all_c.items()
                              for itemId in itemsDict]))
        idcoder = IdCoder(items_ids, all_c.keys())
        ones, row, col = get_data(data_path=data_path, all_c=all_c, idcoder=idcoder,
                                  fold=0, N=20, mode="testing")
        matrix = csr_matrix((ones, (row, col)), dtype=np.float64)
        user_items = matrix.T.tocsr()
        model = implicit.als.AlternatingLeastSquares(factors=params_cf['f'],
                                                     regularization=params_cf['lamb'],
                                                     iterations=params_cf['mi'],
                                                     dtype=np.float64)
        model.fit(matrix)
    p = 0
    for userId in test_c:
        logging.info("#u: {0}/{1}".format(p, len(test_c)))
        p += 1
        if cf_lib == "pyFM":
            user_rows = [{'user_id': str(userId), 'item_id': str(itemId)} for itemId in items]
            X_te = v.transform(user_rows)
            preds = fm.predict(X_te)
            recs_cf = [itemId for _, itemId in sorted(zip(preds, items), reverse=True)]
        elif cf_lib == "implicit":
            recommends = model.recommend(userid=int(idcoder.coder('user', userId)),
                                         user_items=user_items,
                                         N=200)
            recs_cf = [idcoder.decoder('item', tupl[0]) for tupl in recommends]
        recs_cb = []
        for itemId in train_c[userId]:
            encoded_params = urlencode(params_cb)
            url = solr + '/mlt?q=goodreadsId:' + itemId + "&" + encoded_params
            response = json.loads(urlopen(url).read().decode('utf8'))
            try:
                docs = response['response']['docs']
            except TypeError:
                continue
            recs_cb.append([str(doc['goodreadsId'][0]) for doc in docs])
        recs_cb = flatten_list(list_of_lists=recs_cb, rows=params_cb['rows'])
        recs_cf = remove_consumed(user_consumption=train_c[userId], rec_list=recs_cf)
        recs_cf = recs_cf[:200]
        recs_cb = remove_consumed(user_consumption=train_c[userId], rec_list=recs_cb)
        recs_cb = recs_cb[:200]
        recs_hy = hybridize_recs(recs_cb=recs_cb, recs_cf=recs_cf,
                                 weight_cb=params_hy['weight_cb'],
                                 weight_cf=params_hy['weight_cf'])
        recs_hy = remove_consumed(user_consumption=train_c[userId], rec_list=recs_hy)
        recs_hy = recs_cleaner(solr=solr, consumpt=train_c[userId], recs=recs_hy[:100])
        recs_hy = user_ranked_recs(user_recs=recs_hy, user_consumpt=test_c[userId])
        for N in [5, 10, 15, 20]:
            # list(...) keeps this working on Python 3, where dict views
            # cannot be sliced
            mini_recs = dict((k, recs_hy[k]) for k in list(recs_hy.keys())[:N])
            MRRs[N].append(MRR(recs=mini_recs, rel_thresh=1))
            nDCGs[N].append(nDCG(recs=mini_recs, alt_form=False, rel_thresh=False))
            APs[N].append(AP_at_N(n=N, recs=recs_hy, rel_thresh=1))
            Rprecs[N].append(R_precision(n_relevants=N, recs=mini_recs))
    for N in [5, 10, 15, 20]:
        with open('TwitterRatings/hybrid/clean/protocol.txt', 'a') as file:
            file.write("N=%s, nDCG=%s, MAP=%s, MRR=%s, R-precision=%s\n" %
                       (N, mean(nDCGs[N]), mean(APs[N]), mean(MRRs[N]), mean(Rprecs[N])))
column_names = ['userId', 'movieId', 'timestamp']
label_name = 'rating'
X, y = data[column_names].values, data[label_name].values
pos = 80000
X_train, X_test = X[:pos], X[pos:]
y_train, y_test = y[:pos], y[pos:]

# build model
from pyfm import pylibfm
# Note: the reg_0/reg_w/reg_v arguments are not part of stock pylibfm.FM;
# this assumes a build of pyFM extended with regularization terms.
model = pylibfm.FM(
    num_factors=8,
    num_iter=10,
    validation_size=0.0,
    task='regression',
    reg_0=0.0,
    reg_w=0.01,
    reg_v=0.05,
)
self = pyfm_model_wrapper(model)
self.fit(X_train, y_train)
y_pred = self.predict(X_test)
np.mean(np.abs(y_pred - y_test))  # MAE on the held-out split

dir(self.model)
self.model.w0  # global bias
self.model.w   # one-way weights
y_train = data.loc[data['cv'] < threshold, 'conversion']
y_test = data.loc[data['cv'] >= threshold, 'conversion']
dictTrain = list(map(lambda ind: dict.fromkeys(getNames(train, ind), 1), train.index))
dictTest = list(map(lambda ind: dict.fromkeys(getNames(test, ind), 1), test.index))
v = DictVectorizer()
X_train = v.fit_transform(dictTrain)
X_test = v.transform(dictTest)
# =============================================================================
# Factorization Machine
# =============================================================================
fm = pylibfm.FM(num_factors=20, num_iter=50, task="classification")
fm.fit(X_train, y_train)
# =============================================================================
# Compute MPR
# =============================================================================
data_test = pd.concat([test, y_test], axis=1)
data_test['setSize'] = list(map(len, data_test['itemSet']))
data_test = data_test.loc[(data_test['setSize'] > 1) & (data_test['conversion'] == 1), ]
data_test['target2'] = -1
# Hold out the last item of each set as the prediction target
for ind in data_test.index:
    data_test.loc[ind, 'target2'] = data_test.loc[ind, 'itemSet'][-1]
    data_test.at[ind, 'itemSet'] = data_test.loc[ind, 'itemSet'][:-1]
    '''
    feature = data.iloc[:, :-1]  # extract the features
    label = data.iloc[:, -1]
    # Normalize the array column-wise; without this the predictions are all NaN
    feature = minmax_scale(feature, axis=0)
    return feature, label

train = pd.read_csv(trainData, header=None)
test = pd.read_csv(testData, header=None)
X_train, y_train = preprocessData(train)
X_test, y_test = preprocessData(test)
# Turn each row into a {column_index: value} dict for DictVectorizer
X_train = [{v: k for k, v in dict(zip(i, range(len(i)))).items()} for i in X_train]
X_test = [{v: k for k, v in dict(zip(i, range(len(i)))).items()} for i in X_test]
v = DictVectorizer()
X_train = v.fit_transform(X_train)
X_test = v.transform(X_test)
fm = pylibfm.FM(num_factors=15,
                num_iter=300,
                verbose=False,
                task="classification",
                initial_learning_rate=0.01,
                learning_rate_schedule="optimal")
fm.fit(X_train, y_train)
y_pred_label = [get_label(i) for i in fm.predict(X_test)]
print(y_pred_label)
print(accuracy_score(y_test, y_pred_label))
"age": 33 }, { "user": "******", "item": "20", "age": 55 }, { "user": "******", "item": "10", "age": 20 }, ] v = DictVectorizer() X = v.fit_transform(train) print(type(X)) print(print(v.get_feature_names())) print(X) # print(X.toarray()) # [[ 19. 0. 0. 0. 1. 1. 0. 0. 0.] # [ 33. 0. 0. 1. 0. 0. 1. 0. 0.] # [ 55. 0. 1. 0. 0. 0. 0. 1. 0.] # [ 20. 1. 0. 0. 0. 0. 0. 0. 1.]] y = np.repeat(1.0, X.shape[0]) print(y) fm = pylibfm.FM() fm.fit(X, y) pred = fm.predict(v.transform({"user": "******", "item": "10", "age": 24})) print(pred) print(v.transform({"user": "******", "item": "10", "age": 24})) print(v.transform({"user": "******", "item": "10", "age": 24}).toarray())
test = data.loc[data['cv'] >= threshold, ['target', 'itemSet']]
y_train = data.loc[data['cv'] < threshold, 'conversion']
y_test = data.loc[data['cv'] >= threshold, 'conversion']
dictTrain = list(map(lambda ind: dict.fromkeys(getNames(train, ind), 1), train.index))
dictTest = list(map(lambda ind: dict.fromkeys(getNames(test, ind), 1), test.index))
v = DictVectorizer()
X_train = v.fit_transform(dictTrain)
X_test = v.transform(dictTest)
# =============================================================================
# Factorization Machine
# =============================================================================
fm = pylibfm.FM(num_factors=numTraits, num_iter=100, task="classification")
fm.fit(X_train, y_train)
# =============================================================================
# Compute MPR
# =============================================================================
data_test = pd.concat([test, y_test], axis=1)
data_test = data_test.loc[data_test['conversion'] == 1, ]
percentileRank = []
precisionAt5 = 0
precisionAt10 = 0
precisionAt20 = 0
for ind in data_test.index:
    subdata = data_test.loc[ind, ]
    true_target = subdata['target']
def pyFM_tuning(data_path, N, with_timestamps=False, with_authors=False):
    all_data, y_all, _ = loadData("eval_all_N" + str(N) + ".data",
                                  data_path=data_path,
                                  with_timestamps=with_timestamps,
                                  with_authors=with_authors)
    v = DictVectorizer()
    X_all = v.fit_transform(all_data)
    # Changed from pyFM's stock defaults: f was 20, mi was 1
    defaults = {'f': 100, 'mi': 20, 'bias': True, 'oneway': True, 'init_stdev': 0.1,
                'val_size': 0.01, 'lr_s': 'optimal', 'lr': 0.01,
                'invscale_pow': 0.5, 'optimal_denom': 0.001, 'shuffle': True, 'seed': 28}
    results = dict((param, {}) for param in defaults.keys())
    # Tune one parameter at a time, greedily fixing the best value found
    for param in ['mi', 'f', 'bias', 'oneway', 'init_stdev', 'val_size', 'lr_s',
                  'lr', 'invscale_pow', 'optimal_denom', 'shuffle', 'seed']:
        if param == 'mi':
            for i in [1, 5, 10, 20, 50, 100, 150, 200]:
                defaults['mi'] = i
                results['mi'][i] = pyFMJob(data_path=data_path, params=defaults, N=N, vectorizer=v,
                                           with_timestamps=with_timestamps, with_authors=with_authors)
            defaults['mi'] = opt_value(results=results['mi'], metric='rmse')
        elif param == 'f':
            for i in range(20, 2020, 20):
                defaults['f'] = i
                results['f'][i] = pyFMJob(data_path=data_path, params=defaults, N=N, vectorizer=v,
                                          with_timestamps=with_timestamps, with_authors=with_authors)
            defaults['f'] = opt_value(results=results['f'], metric='rmse')
        elif param == 'bias':
            for i in [True, False]:
                defaults['bias'] = i
                results['bias'][i] = pyFMJob(data_path=data_path, params=defaults, N=N, vectorizer=v,
                                             with_timestamps=with_timestamps, with_authors=with_authors)
            defaults['bias'] = opt_value(results=results['bias'], metric='rmse')
        elif param == 'oneway':
            for i in [True, False]:
                defaults['oneway'] = i
                results['oneway'][i] = pyFMJob(data_path=data_path, params=defaults, N=N, vectorizer=v,
                                               with_timestamps=with_timestamps, with_authors=with_authors)
            defaults['oneway'] = opt_value(results=results['oneway'], metric='rmse')
        elif param == 'init_stdev':
            for i in [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0]:
                defaults['init_stdev'] = i
                results['init_stdev'][i] = pyFMJob(data_path=data_path, params=defaults, N=N, vectorizer=v,
                                                   with_timestamps=with_timestamps, with_authors=with_authors)
            defaults['init_stdev'] = opt_value(results=results['init_stdev'], metric='rmse')
        elif param == 'val_size':
            for i in [0.001, 0.01, 0.1, 0.5, 0.8, 0.9]:
                defaults['val_size'] = i
                results['val_size'][i] = pyFMJob(data_path=data_path, params=defaults, N=N, vectorizer=v,
                                                 with_timestamps=with_timestamps, with_authors=with_authors)
            defaults['val_size'] = opt_value(results=results['val_size'], metric='rmse')
        elif param == 'lr_s':
            for i in ['constant', 'optimal', 'invscaling']:
                defaults['lr_s'] = i
                if i == 'optimal':
                    for j in [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5]:
                        defaults['optimal_denom'] = j
                        results['optimal_denom'][j] = pyFMJob(data_path=data_path, params=defaults, N=N, vectorizer=v,
                                                              with_timestamps=with_timestamps, with_authors=with_authors)
                    defaults['optimal_denom'] = opt_value(results=results['optimal_denom'], metric='rmse')
                    results['lr_s'][i] = results['optimal_denom'][defaults['optimal_denom']]
                elif i == 'invscaling':
                    for j in [0.001, 0.05, 0.1, 0.5, 0.8, 1.0]:
                        defaults['invscale_pow'] = j
                        results['invscale_pow'][j] = pyFMJob(data_path=data_path, params=defaults, N=N, vectorizer=v,
                                                             with_timestamps=with_timestamps, with_authors=with_authors)
                    defaults['invscale_pow'] = opt_value(results=results['invscale_pow'], metric='rmse')
                    results['lr_s'][i] = results['invscale_pow'][defaults['invscale_pow']]
                elif i == 'constant':
                    results['lr_s'][i] = pyFMJob(data_path=data_path, params=defaults, N=N, vectorizer=v,
                                                 with_timestamps=with_timestamps, with_authors=with_authors)
            defaults['lr_s'] = opt_value(results=results['lr_s'], metric='rmse')
        elif param == 'lr':
            for i in [0.001, 0.003, 0.005, 0.01, 0.02, 0.03, 0.04, 0.05]:  # 0.07, 0.08, 0.1]:
                defaults['lr'] = i
                results['lr'][i] = pyFMJob(data_path=data_path, params=defaults, N=N, vectorizer=v,
                                           with_timestamps=with_timestamps, with_authors=with_authors)
            defaults['lr'] = opt_value(results=results['lr'], metric='rmse')
        elif param == 'shuffle':
            for i in [True, False]:
                defaults['shuffle'] = i
                results['shuffle'][i] = pyFMJob(data_path=data_path, params=defaults, N=N, vectorizer=v,
                                                with_timestamps=with_timestamps, with_authors=with_authors)
            defaults['shuffle'] = opt_value(results=results['shuffle'], metric='rmse')
        elif param == 'seed':
            for i in [10, 20, 28, 30, 50]:
                defaults['seed'] = i
                results['seed'][i] = pyFMJob(data_path=data_path, params=defaults, N=N, vectorizer=v,
                                             with_timestamps=with_timestamps, with_authors=with_authors)
            defaults['seed'] = opt_value(results=results['seed'], metric='rmse')
    # Real testing
    train_data, y_tr, _ = loadData('eval_train_N' + str(N) + '.data',
                                   data_path=data_path,
                                   with_timestamps=with_timestamps,
                                   with_authors=with_authors)
    X_tr = v.transform(train_data)
    fm = pylibfm.FM(num_factors=defaults['f'], num_iter=defaults['mi'], k0=defaults['bias'],
                    k1=defaults['oneway'], init_stdev=defaults['init_stdev'],
                    validation_size=defaults['val_size'], learning_rate_schedule=defaults['lr_s'],
                    initial_learning_rate=defaults['lr'], power_t=defaults['invscale_pow'],
                    t0=defaults['optimal_denom'], shuffle_training=defaults['shuffle'],
                    seed=defaults['seed'], task='regression', verbose=True)
    fm.fit(X_tr, y_tr)
    test_data, y_te, _ = loadData('test/test_N' + str(N) + '.data',
                                  data_path=data_path,
                                  with_timestamps=with_timestamps,
                                  with_authors=with_authors)
    X_te = v.transform(test_data)
    preds = fm.predict(X_te)
    rmse = sqrt(mean_squared_error(y_te, preds))
    print("FM RMSE: %.4f" % rmse)
    with open('TwitterRatings/pyFM/opt_params_tmstmp' + str(with_timestamps) +
              '_auth' + str(with_authors) + '.txt', 'w') as f:
        for param in defaults:
            f.write("{param}:{value}\n".format(param=param, value=defaults[param]))
        f.write("RMSE:{rmse}".format(rmse=rmse))
    with open('TwitterRatings/pyFM/params_rmses_tmstmp' + str(with_timestamps) +
              '_auth' + str(with_authors) + '.txt', 'w') as f:
        for param in results:
            for value in results[param]:
                f.write("{param}={value}\t : {RMSE}\n".format(param=param, value=value,
                                                              RMSE=results[param][value]))
    return defaults
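# opt_value is not shown in this snippet; presumably it returns the candidate
# parameter value whose recorded RMSE is lowest. A minimal sketch under that
# assumption (the name and signature mirror the calls above):
def opt_value_sketch(results, metric='rmse'):
    # results maps candidate parameter value -> mean RMSE; lower is better
    return min(results, key=results.get)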
def pyFM_protocol_evaluation(data_path, params, with_timestamps=False, with_authors=False):
    solr = "http://localhost:8983/solr/grrecsys"
    # userId = '33120270'
    all_data, y_all, items = loadData("eval_all_N20.data",
                                      data_path=data_path,
                                      with_timestamps=with_timestamps,
                                      with_authors=with_authors)
    v = DictVectorizer()
    X_all = v.fit_transform(all_data)
    test_c = consumption(ratings_path='TwitterRatings/funkSVD/data/test/test_N20.data',
                         rel_thresh=0,
                         with_ratings=True)
    train_c = consumption(ratings_path='TwitterRatings/funkSVD/data/eval_train_N20.data',
                          rel_thresh=0,
                          with_ratings=False)
    MRRs = dict((N, []) for N in [5, 10, 15, 20])
    nDCGs = dict((N, []) for N in [5, 10, 15, 20])
    APs = dict((N, []) for N in [5, 10, 15, 20])
    Rprecs = dict((N, []) for N in [5, 10, 15, 20])
    train_data, y_tr, _ = loadData('eval_train_N20.data',
                                   data_path=data_path,
                                   with_timestamps=with_timestamps,
                                   with_authors=with_authors)
    X_tr = v.transform(train_data)
    fm = pylibfm.FM(num_factors=params['f'], num_iter=params['mi'], k0=params['bias'],
                    k1=params['oneway'], init_stdev=params['init_stdev'],
                    validation_size=params['val_size'], learning_rate_schedule=params['lr_s'],
                    initial_learning_rate=params['lr'], power_t=params['invscale_pow'],
                    t0=params['optimal_denom'], shuffle_training=params['shuffle'],
                    seed=params['seed'], task='regression', verbose=True)
    fm.fit(X_tr, y_tr)
    p = 0
    for userId in test_c:
        logging.info("#u: {0}/{1}".format(p, len(test_c)))
        p += 1
        user_rows = [{'user_id': str(userId), 'item_id': str(itemId)} for itemId in items]
        X_te = v.transform(user_rows)
        preds = fm.predict(X_te)
        book_recs = [itemId for _, itemId in sorted(zip(preds, items), reverse=True)]
        book_recs = remove_consumed(user_consumption=train_c[userId], rec_list=book_recs)
        book_recs = recs_cleaner(solr=solr, consumpt=train_c[userId], recs=book_recs[:100])
        recs = user_ranked_recs(user_recs=book_recs, user_consumpt=test_c[userId])
        for N in [5, 10, 15, 20]:
            # list(...) keeps this working on Python 3, where dict views
            # cannot be sliced
            mini_recs = dict((k, recs[k]) for k in list(recs.keys())[:N])
            MRRs[N].append(MRR(recs=mini_recs, rel_thresh=1))
            nDCGs[N].append(nDCG(recs=mini_recs, alt_form=False, rel_thresh=False))
            APs[N].append(AP_at_N(n=N, recs=recs, rel_thresh=1))
            Rprecs[N].append(R_precision(n_relevants=N, recs=mini_recs))
    for N in [5, 10, 15, 20]:
        with open('TwitterRatings/pyFM/clean/protocol_tmstmp' + str(with_timestamps) +
                  '_auth' + str(with_authors) + '.txt', 'a') as file:
            file.write("N=%s, nDCG=%s, MAP=%s, MRR=%s, R-precision=%s\n" %
                       (N, mean(nDCGs[N]), mean(APs[N]), mean(MRRs[N]), mean(Rprecs[N])))