def create_fm_model_by_file(cls, train_path, model_output, valid_path=None, iterations=100,
                            thread_count=4, task='binary', k=16, lr=0.1, metric='auc',
                            stop_window=100):
    """Train an xLearn factorization machine from files and return the model handle.

    Args:
        train_path: Training data location; converted with ``str()`` before use.
        model_output: Where ``fit`` stores the trained model; converted with ``str()``.
        valid_path: Optional validation data location. Only registered when truthy.
        iterations: Passed to xLearn as ``'epoch'``.
        thread_count: Passed to xLearn as ``'nthread'``.
        task, k, lr, metric, stop_window: Forwarded unchanged under the same keys.

    Returns:
        The object created by ``xl.create_fm()`` after ``fit`` has been called.

    The regularisation strength ``'lambda'`` is fixed at 0.002.
    """
    # Input files can be produced with scikit-learn, e.g.:
    #   from sklearn.datasets import dump_svmlight_file
    #   dump_svmlight_file(X_tr, y_tr, 'train.libsvm')
    #   dump_svmlight_file(X_te, y_te, 'test.libsvm')
    hyper_params = {
        'task': task,
        'lr': lr,
        'k': k,
        'lambda': 0.002,
        'metric': metric,
        'epoch': iterations,
        'stop_window': stop_window,
        'nthread': thread_count,
    }

    model = xl.create_fm()
    model.setTrain(str(train_path))
    if valid_path:
        model.setValidate(str(valid_path))
    model.fit(hyper_params, str(model_output))
    return model
def create_model(self):
    """Train an FM model on the files under ``self.OPT_HOME`` and hand the result to ``move_to_opt``.

    Reads ``train.txt``, ``validate.txt`` and ``test.txt`` from ``self.OPT_HOME``,
    writes ``model.out`` and ``output.txt`` using relative paths, then passes
    ``<self.MY_HOME>/model.out`` to ``self.move_to_opt``.
    """
    print("model start")
    model = xl.create_fm()

    def in_opt_home(name):
        # Same "/"-joined path construction for every data file.
        return "/".join([self.OPT_HOME, name])

    train_file = in_opt_home('train.txt')
    validate_file = in_opt_home('validate.txt')
    test_file = in_opt_home('test.txt')

    hyper_params = {
        'task': 'binary',
        'epoch': 10,
        'lr': 0.2,
        'lambda': 0.002,
        'metric': 'auc',
    }

    # Training phase.
    model.setTrain(train_file)
    model.setValidate(validate_file)
    model.fit(hyper_params, 'model.out')

    # Prediction phase on the test file, with sigmoid applied to the scores.
    model.setTest(test_file)
    model.setSigmoid()
    model.predict('model.out', "output.txt")

    # NOTE(review): assumes the working directory is self.MY_HOME, since the
    # model was written to the relative path 'model.out' - confirm.
    model_file = "/".join([self.MY_HOME, 'model.out'])
    print(model_file)
    self.move_to_opt(model_file)
def run():
    """Train an FM model on ./dataset/train.txt and predict on ./dataset/test.txt.

    Outputs:
        ./model.out   trained model
        ./model.txt   text dump of the model
        ./output.txt  predictions, converted to the 0-1 range
    """
    fm_model = xl.create_fm()
    train_path = './dataset/train.txt'
    test_path = './dataset/test.txt'
    fm_model.setTrain(train_path)
    fm_model.setValidate(test_path)

    # Parameters:
    param = {
        'task': 'binary',
        'epoch': 10,
        'lr': 0.2,
        'k': 4,
        'lambda': 0.002,
        'metric': 'auc'
    }

    # The text-model path has to be registered before fit(); setting it
    # afterwards means the training run never writes the text dump.
    fm_model.setTXTModel('./model.txt')

    # Start to train
    # The trained model will be stored in model.out
    fm_model.fit(param, './model.out')

    # Prediction task
    fm_model.setTest(test_path)  # Set the path of test dataset
    fm_model.setSigmoid()        # Convert output to 0-1

    # Start to predict
    # The output result will be stored in output.txt
    fm_model.predict("./model.out", "./output.txt")
def demo_xlearn_1():
    """Train an FM model on module-level data and print its test-set predictions.

    Dumps ``X_tr``/``y_tr`` and ``X_te``/``y_te`` in libsvm format under
    ``BASE_DIR + '/tmp'``, trains with the test split as validation data,
    predicts into a temporary file and prints the parsed prediction array.
    """
    train_file = BASE_DIR + '/tmp/train.libsvm'
    test_file = BASE_DIR + '/tmp/test.libsvm'
    model_file = BASE_DIR + '/tmp/model.out'

    fm_model = xl.create_fm()

    # Convert train and test data to libsvm format.
    dump_svmlight_file(X_tr, y_tr, train_file)
    dump_svmlight_file(X_te, y_te, test_file)

    # Set training and validation data.
    fm_model.setTrain(train_file)
    fm_model.setValidate(test_file)

    # Define params and train.
    param = {'task': 'binary', 'lr': 0.1, 'k': 16, 'lambda': 0.0002, 'metric': 'auc', 'epoch': 100}
    fm_model.fit(param, model_file)

    # Predict. The context manager guarantees the temporary output file is
    # removed even when prediction or parsing raises.
    fm_model.setTest(test_file)
    with tempfile.NamedTemporaryFile(delete=True) as temp_output_file:
        fm_model.predict(model_file, temp_output_file.name)
        y_pred = pd.read_csv(temp_output_file.name, header=None)[0].values

    print(y_pred, type(y_pred))
def val(self, path_trn, val_prop=0.2, xlfm_params=XLFM_PARAMS_DEFAULT):
    """Split the training file, then fit an FM model with the held-out part as validation.

    Args:
        path_trn: Input handed to ``self._split_features``.
        val_prop: Second argument of ``self._split_features``
            (presumably the validation fraction - confirm against that helper).
        xlfm_params: Parameter dict forwarded to ``fit``.

    The model is stored as ``model-ffm-best-val.out`` inside ``self.artifacts_dir``.
    """
    self.logger.info('Preparing datasets')
    train_split, valid_split = self._split_features(path_trn, val_prop)

    model = xl.create_fm()
    model.setTrain(train_split)
    model.setValidate(valid_split)

    model_out = '%s/model-ffm-best-val.out' % self.artifacts_dir
    model.fit(xlfm_params, model_out)
def __init__(self, model_type):
    """Create the xLearn model selected by ``model_type``.

    Args:
        model_type: One of ``"FM"``, ``"FFM"`` or ``"linear"``.

    Raises:
        AssertionError: If ``model_type`` is none of the three accepted values.
    """
    assert model_type in ["FM","FFM","linear"]  # only these three model types are allowed
    self.model_type = model_type

    if model_type == "FM":
        factory = xl.create_fm
    elif model_type == "FFM":
        factory = xl.create_ffm
    else:
        # Fall back to the default linear model.
        factory = xl.create_linear
    self.model = factory()
def creat_model(model_type):
    """Return a new xLearn model object for ``model_type``.

    ``"FM"`` and ``"FFM"`` select the matching model; any other value falls
    back to the linear model.
    """
    if model_type == "FM":
        return xl.create_fm()
    if model_type == "FFM":
        return xl.create_ffm()
    # Default: linear model.
    return xl.create_linear()
def simple_predict2(self):
    """Score this user's prediction file with the v0.3 model and attach business ids.

    Returns:
        A DataFrame with ``self._xrestaurants['business_id']`` next to an
        ``xlearn_stars`` column read from the prediction output.
    """
    user_id = self._user_id
    predict_path = 'xlearn_dataset/predict_' + user_id + '.txt'
    output_path = 'xlearn_dataset/output_' + user_id + '.txt'

    scorer = xl.create_fm()
    scorer.setTest(predict_path)  # dataset to score
    # Predictions are written to the per-user output file.
    scorer.predict('final_model/xLearn_v0.3.out', output_path)

    scores = pd.read_csv(output_path, header=None, names=['xlearn_stars'])
    return pd.concat([self._xrestaurants['business_id'], scores], axis=1)
def train(self):
    """Fit an FM model on ``self.fp_train`` and predict on ``self.fp_valid``.

    The model is stored at ``self.fp_fm_model``; predictions go to
    ``../data/fm/output_fm.txt``. Hyper-parameters come from
    ``CONFIG['fm']['params']``.
    """
    # %% fm
    fm_model = xl.create_fm()
    fm_model.setTrain(self.fp_train)
    fm_model.setSigmoid()  # binary classification: map scores to probabilities
    param = CONFIG['fm']['params']
    fp_pred_fm = "../data/fm/output_fm.txt"

    print('starting training...')
    fm_model.fit(param, self.fp_fm_model)

    # predict() scores the dataset registered through setTest(). The previous
    # code called setValidate() here, which has no effect once fit() has run
    # and left predict() without any input file.
    fm_model.setTest(self.fp_valid)
    fm_model.predict(self.fp_fm_model, fp_pred_fm)
def predict(self, num=10):
    """Score this user's prediction file with the v0.1 model and return the top rows.

    Args:
        num: Number of rows to keep after sorting.

    Returns:
        ``self._xrestaurants`` with a ``final_stars`` column appended, sorted by
        that column in descending order and cut to the first ``num`` rows.
    """
    user_id = self._user_id
    predict_path = 'xlearn_dataset/predict_' + user_id + '.txt'
    output_path = 'xlearn_dataset/output_' + user_id + '.txt'

    scorer = xl.create_fm()
    scorer.setTest(predict_path)  # dataset to score
    # Predictions are written to the per-user output file.
    scorer.predict('final_model/xLearn_v0.1.out', output_path)

    scores = pd.read_csv(output_path, header=None, names=['final_stars'])
    combined = pd.concat([self._xrestaurants, scores], axis=1)
    ranked = combined.sort_values('final_stars', ascending=False)
    return ranked[:num]
def predict(cls, model_path, test_path, set_sigmoid=True) -> np.ndarray:
    """Score ``test_path`` with a stored FM model and return the predictions.

    Args:
        model_path: Location of the trained model; converted with ``str()``.
        test_path: Location of the data to score; converted with ``str()``.
        set_sigmoid: When true, ``setSigmoid()`` is applied so scores are
            mapped into the (0-1) range.

    Returns:
        The first column of the prediction file as a NumPy array.
    """
    fm_model = xl.create_fm()
    if set_sigmoid:
        fm_model.setSigmoid()  # map raw scores into (0-1)
    fm_model.setTest(str(test_path))

    # The context manager removes the temporary output file even when
    # prediction or parsing raises.
    with tempfile.NamedTemporaryFile(delete=True) as temp_output_file:
        fm_model.predict(str(model_path), temp_output_file.name)
        y_pred = pd.read_csv(temp_output_file.name, header=None)[0].values
    return y_pred
def __init__(self, X_train, y_train, categorical_cols, categorical_universe):
    """Set up file locations, the one-hot encoder and the FM model, then export the training data.

    Args:
        X_train, y_train: Forwarded to ``self.transform_input(..., 'train')``.
        categorical_cols: Stored as ``self.categorical_cols``.
        categorical_universe: Array of category values; reshaped to a single
            column to fit the ``OneHotEncoder``.
    """
    # Fixed artefact locations under data/fm/.
    self.model_file = "data/fm/model.txt"
    self.weights_file = "data/fm/model_wt.out"
    self.x_train_file = "data/fm/x_train.txt"
    self.x_test_file = "data/fm/x_test.txt"

    self.categorical_cols = categorical_cols
    self.categorical_universe = categorical_universe

    universe_column = self.categorical_universe.reshape(-1, 1)
    self.enc = OneHotEncoder().fit(universe_column)

    fm = xl.create_fm()
    self.model = fm
    fm.setTXTModel(self.model_file)

    # Runs last: every attribute above is already in place when it is called.
    self.transform_input(X_train, y_train, 'train')
def __init__(self, URM_train, train_svm_file_path, approximate_recommender: BaseRecommender,
             ICM_train=None, UCM_train=None, item_feature_fields=None, user_feature_fields=None,
             valid_svm_file_path=None, max_items_to_predict=1000, model_filename="model.out",
             model_type="ffm", temp_relative_folder="temp/", verbose=True):
    """Build the field layout, resolve model paths and create the xLearn model.

    Field numbering stored in ``self.fields``:
        0            one entry per row of ``URM_train``
        1            one entry per column of ``URM_train``
        2 ...        ``item_feature_fields`` shifted by 2 (when given)
        after those  ``user_feature_fields`` shifted past the largest item
                     field, or by 2 when no item fields are given

    Args:
        URM_train: Matrix whose ``shape`` gives the number of rows and columns.
        train_svm_file_path: Training file registered with ``setTrain``.
        approximate_recommender: Stored as ``self.approximate_recommender``.
        ICM_train, UCM_train: Stored unchanged.
        item_feature_fields, user_feature_fields: Optional integer arrays of
            field ids; ``None`` means the block is absent.
        valid_svm_file_path: Optional validation file for ``setValidate``.
        max_items_to_predict: Stored unchanged.
        model_filename: File name inside the ``model`` folder.
        model_type: ``"ffm"`` or ``"fm"``.
        temp_relative_folder: Folder name below the ffm data directory.
        verbose: Forwarded to the parent constructor.

    Raises:
        ValueError: If ``model_type`` is neither ``"ffm"`` nor ``"fm"``.
    """
    self.ICM_train = ICM_train
    self.UCM_train = UCM_train

    field_blocks = [
        np.full(shape=URM_train.shape[0], fill_value=0),
        np.full(shape=URM_train.shape[1], fill_value=1),
    ]
    if item_feature_fields is not None:
        item_feature_fields = item_feature_fields + 2
        field_blocks.append(item_feature_fields)
    if user_feature_fields is not None:
        # Without item feature fields the highest field in use is 1, so user
        # feature fields start at 2; previously this path crashed on None.
        if item_feature_fields is not None:
            user_offset = np.max(item_feature_fields) + 1
        else:
            user_offset = 2
        user_feature_fields = user_feature_fields + user_offset
        field_blocks.append(user_feature_fields)
    # Only the blocks that exist are concatenated; None entries used to make
    # np.concatenate raise.
    self.fields = np.concatenate(field_blocks)

    self.approximate_recommender = approximate_recommender
    self.max_items_to_predict = max_items_to_predict

    # Set path of temp folder and model_path
    root_path = get_project_root_path()
    fm_data_path = os.path.join(root_path, "resources", "ffm_data")
    self.temp_folder = os.path.join(fm_data_path, temp_relative_folder)
    self.model_folder = os.path.join(fm_data_path, "model")
    self.model_path = os.path.join(self.model_folder, model_filename)

    if model_type == "ffm":
        self.model = xl.create_ffm()
    elif model_type == "fm":
        self.model = xl.create_fm()
    else:
        raise ValueError(
            "model_type is inexistent, choose between ffm and fm")

    self.model.setTrain(train_svm_file_path)
    if valid_svm_file_path is not None:
        self.model.setValidate(valid_svm_file_path)

    super().__init__(URM_train, verbose)
def xlearn(self):
    """Train an FM regressor with xLearn and return the RMSE on the test split.

    Reads ``train.libsvm`` and ``test.libsvm`` from ``self._data_path`` and
    writes ``model.out`` and ``output.txt`` into ``self._xlearn_path``.

    Returns:
        Root mean squared error between ``self.y_test`` and the predictions.
    """
    train_file = join(self._data_path, 'train.libsvm')
    test_file = join(self._data_path, 'test.libsvm')
    model_file = join(self._xlearn_path, 'model.out')
    output_file = join(self._xlearn_path, 'output.txt')

    # Training task
    model = xl.create_fm()
    model.setTrain(train_file)
    model.setValidate(test_file)
    if self.onlyResults:
        model.setQuiet()  # presumably silences xLearn's console output - confirm
    hyper_params = {'task':'reg', 'lr':0.1, 'lambda':0.002, 'metric':'rmse', 'opt':'ftrl'}
    model.fit(hyper_params, model_file)

    # Prediction task
    model.setTest(test_file)
    # NOTE(review): sigmoid is applied although the task is 'reg'; confirm the
    # targets really live in the 0-1 range.
    model.setSigmoid()
    model.predict(model_file, output_file)

    raw_output = pd.read_csv(output_file, header=None)
    predictions = raw_output.values.flatten()

    if self.onlyResults:
        print("Completed xLearn evaluation.")
    mse = mean_squared_error(self.y_test, predictions)
    return np.sqrt(mse)
def fit(self, df, label, eva_df=None, eva_label=None, path='datasource/train.ffm',
        overwrite_path=True, eva_path='datasource/valid.ffm',
        model_path='datasource/ffm_model.out', overwrite_eva_path=True):
    """Encode the data to files and train the xLearn model chosen by ``self.model_type``.

    Args:
        df, label: Training features and labels. Their indexes are reset in
            place to ``0..n-1``.
        eva_df, eva_label: Optional evaluation features and labels; supply
            both or neither. Their indexes are reset in place as well.
        path: File the encoded training data is written to.
        eva_path: File the encoded evaluation data is written to.
        model_path: Where the trained model is stored; kept as
            ``self.model_path``.
        overwrite_path, overwrite_eva_path: Accepted for compatibility; not
            used by this method.

    Raises:
        ValueError: If only one of ``eva_df`` / ``eva_label`` is given, or if
            ``self.model_type`` is not ``'lr'``, ``'fm'`` or ``'ffm'``.
    """
    if (eva_df is None) ^ (eva_label is None):
        raise ValueError(
            'params eva_df, eva_label must be all None or all have value.')

    df.index = range(df.shape[0])
    label.index = range(label.shape[0])

    # Each model type maps to its own factory; 'lr' and 'ffm' used to be swapped.
    if self.model_type == 'lr':
        self.clf = xl.create_linear()
    elif self.model_type == 'fm':
        self.clf = xl.create_fm()
    elif self.model_type == 'ffm':
        self.clf = xl.create_ffm()
    else:
        raise ValueError(self.model_type, ' is an invalid value for param cat.')

    self.fe = FFMEncoder(df)
    self.fe.fit(df, self.cutoff)
    self.fe.transform(df, label, path)

    has_eval = eva_df is not None
    if has_eval:
        eva_df.index = range(eva_df.shape[0])
        eva_label.index = range(eva_label.shape[0])
        self.fe.transform(eva_df, eva_label, eva_path)

    self.clf.setTrain(path)
    if has_eval:
        self.clf.setValidate(eva_path)
    self.clf.fit(self.params, model_path)
    self.model_path = model_path
def xl_fm(train_file, test_file, trainY, testY, rank, reg, epoch):
    """Train an FM regressor with xLearn and return its RMSE on the test set.

    Args:
        train_file: Training data file.
        test_file: Test data file.
        trainY: Not used by this function.
        testY: True test targets, compared against the predictions.
        rank: Passed to xLearn as ``'k'``.
        reg: Passed to xLearn as ``'lambda'``.
        epoch: Passed to xLearn as ``'epoch'``.

    Uses the module-level ``model_file`` and ``output_file`` paths; any
    existing file at either path is deleted first.
    """
    # Start from a clean slate so stale artefacts are never read back.
    for stale_path in (model_file, output_file):
        if os.path.exists(stale_path):
            os.unlink(stale_path)

    hyper_params = {
        'task': 'reg',
        'metric': 'rmse',
        'epoch': epoch,
        'k': rank,
        'lambda': reg,
    }

    model = xl.create_fm()
    model.setTrain(train_file)
    # model.setValidate(train_file)
    model.fit(hyper_params, model_file)

    model.setTest(test_file)
    model.predict(model_file, output_file)

    pred = pd.read_csv(output_file, header=None).values.flatten()
    squared_error = (testY - pred)**2
    return np.mean(squared_error)**0.5
def run_xlearn():
    """Train the xLearn model selected by the module-level settings.

    ``MODEL`` picks the model kind (``'LM'``, ``'FM'`` or ``'FFM'``); ``TRAIN``
    and ``TEST`` are the training and validation files. Early stopping is
    disabled when ``WINDOW`` is 0. The model is written to ``./xlearn.model``.

    Raises:
        AssertionError: If ``MODEL`` is none of the three supported values.
    """
    if MODEL == 'LM':
        factory = xl.create_linear
    elif MODEL == 'FM':
        factory = xl.create_fm
    else:
        assert MODEL == 'FFM'
        factory = xl.create_ffm
    model = factory()

    model.setTrain(TRAIN)
    model.setValidate(TEST)
    if WINDOW == 0:
        model.disableEarlyStop()

    hyper_params = {
        'task': TASK,
        'epoch': EPOCH,
        'opt': OPT,
        'metric': METRIC,
        'k': K,
        'lr': LEARNING_RATE,
        'lambda': LAMBDA,
    }
    model.fit(hyper_params, './xlearn.model')
def __init__(self, X_train, X_test, y_train, y_test, categorical_cols):
    """Create the FM model and export the one-hot encoded train/test data as libsvm files.

    Args:
        X_train, X_test: Feature sets, encoded with ``self.one_hot_categorical_col``.
        y_train, y_test: Matching labels.
        categorical_cols: Stored as ``self.categorical_cols``.
    """
    # Fixed artefact locations under data/fm/.
    self.model_file = "data/fm/model.txt"
    self.weights_file = "data/fm/model_wt.out"
    self.x_train_file = "data/fm/x_train.txt"
    self.x_test_file = "data/fm/x_test.txt"
    self.categorical_cols = categorical_cols

    fm = xl.create_fm()
    self.model = fm
    fm.setTXTModel(self.model_file)

    # Encode both splits before anything is written, train first.
    encoded_train = self.one_hot_categorical_col(X_train)
    encoded_test = self.one_hot_categorical_col(X_test)

    exports = (
        (encoded_train, y_train, self.x_train_file),
        (encoded_test, y_test, self.x_test_file),
    )
    for features, labels, destination in exports:
        dump_svmlight_file(features, labels, destination, zero_based=True, multilabel=False)
def _build_model(self):
    """Train an FM regressor on ``<self._dataset_path>/fm_libsvm.csv``.

    The hyper-parameters are also saved to ``fm_parameter.json``. The text
    model goes to ``self._txt_model_file`` and the trained model to
    ``self._out_model_file``.
    """
    print("Build Model")
    training_file = self._dataset_path + '/fm_libsvm.csv'

    model = xl.create_fm()
    model.setTrain(training_file)

    hyper_params = {
        'task': 'reg',
        'lr': 0.15,
        'epoch': 10,
        'lambda': 0.002,
        'metric': 'rmse',
        'nthread': os.cpu_count(),
        'k': self._num_dim,
    }

    # Keep a record of the exact parameters used for this run.
    with open('fm_parameter.json', 'w') as file:
        json.dump(hyper_params, file, indent=4)

    # The text-model path is registered before training starts.
    model.setTXTModel(self._txt_model_file)
    model.fit(hyper_params, self._out_model_file)
def xl_objective(params, method="fm"):
    """Hyperopt objective: mean validation RMSE of an xLearn model over all folds.

    Args:
        params: Hyper-parameter dict. Updated in place: ``'task'`` and
            ``'metric'`` are set, ``'epoch'`` and ``'k'`` are cast to ``int``.
        method: ``"linear"`` or ``"fm"``.

    Returns:
        The mean RMSE across the folds given by the module-level
        ``train_fpaths``, ``valid_fpaths`` and ``valid_target_fpaths``.

    Raises:
        ValueError: If ``method`` is neither ``"linear"`` nor ``"fm"``.

    The call counter ``xl_objective.i`` must be initialised by the caller.
    """
    xl_objective.i += 1

    params['task'] = 'reg'
    params['metric'] = 'rmse'
    # remember hyperopt casts as floats
    params['epoch'] = int(params['epoch'])
    params['k'] = int(params['k'])

    # Compare by value: `is` only matched when the strings happened to be the
    # same object, and an unknown method used to leave xl_model unbound.
    if method == "linear":
        xl_model = xl.create_linear()
    elif method == "fm":
        xl_model = xl.create_fm()
    else:
        raise ValueError(
            "method must be 'linear' or 'fm', got {!r}".format(method))

    # The scratch files are the same for every fold.
    preds_fname = os.path.join(XLEARN_DIR, 'tmp_output.txt')
    model_fname = os.path.join(XLEARN_DIR, "tmp_model.out")

    results = []
    for train, valid, target in zip(train_fpaths, valid_fpaths, valid_target_fpaths):
        xl_model.setTrain(train)
        xl_model.setTest(valid)
        xl_model.setQuiet()
        xl_model.fit(params, model_fname)
        xl_model.predict(model_fname, preds_fname)

        y_valid = np.loadtxt(target)
        predictions = np.loadtxt(preds_fname)
        loss = np.sqrt(mean_squared_error(y_valid, predictions))
        results.append(loss)

    error = np.mean(results)
    print("INFO: iteration {} error {:.3f}".format(xl_objective.i, error))
    return error
convert_to_ffm(
    test_data_Label,
    'Test',
    list(num_col_te),
    list(cat_col_te),
    list(all_col_te),
)


# # 2. Package Comparison

# ## (1) xlearn
# Supports FFM, LR and FM.

# In[31]:


import xlearn as xl


# - FM

# In[32]:


# Factorization machine (create_fm), trained on the converted files.
fm_model = xl.create_fm()
fm_model.setTrain("Train_ffm.txt")     # training data
fm_model.setValidate("Test_ffm.txt")   # validation data

# Hyper-parameters: binary classification, learning rate 0.2,
# regularisation lambda 0.002, accuracy as the evaluation metric.
param = {
    'task': 'binary',
    'lr': 0.2,
    'lambda': 0.002,
    'metric': 'acc',
}

# Train the model; the result is stored in ./model_fm.out.
fm_model.fit(param, "./model_fm.out")

# Prediction task
fm_model.setTest("Test_ffm.txt")   # test data
fm_model.setSigmoid()              # convert output to 0-1

# Start to predict
import xlearn as xl
import pandas as pd
from sklearn.datasets import dump_svmlight_file

# Build the libsvm test file from the 30 top-ranked features.
df = pd.read_csv("data/ts_FE.csv")
features = pd.read_csv('feature.csv')
x_columns = features.head(30)['feature'].tolist()  # first 30 rows of the feature-importance ranking
y = df.click  # label column
dummy = pd.get_dummies(df[x_columns])
# DataFrame.as_matrix() was removed in pandas 1.0; .values returns the same
# NumPy array and works on old and new pandas alike.
mat = dummy.values
dump_svmlight_file(mat, y, 'test.libsvm', zero_based=False)

xfm = xl.create_fm()
# Training is disabled here; 'model.out' is expected to exist already.
# xfm.setTrain("train.libsvm")
# param = {'task':'binary', 'lr':0.0001, 'lambda':0.01, 'k':8, 'epoch':150}
# xfm.fit(param, 'model.out')
# xfm.setTXTModel("model.txt")
xfm.setSigmoid()
xfm.setTest("test.libsvm")
xfm.predict('model.out', "output.txt")

# Pair each id from data/test.csv with its predicted click score.
train = pd.read_csv('data/test.csv', dtype={'id': 'U'})
click = pd.read_csv('output.txt', names=['click'])
out = pd.DataFrame({'id': train['id'], 'click': click['click']})
out.to_csv('Y_out.csv', index=False, sep=',')
import xlearn as xl

# Training task: a factorization machine (create_fm) on the house-price data.
ffm_model = xl.create_fm()
ffm_model.setTrain("./house_price_train.txt")  # training data

# Hyper-parameters:
#   task    regression
#   lr      learning rate 0.2
#   lambda  regularisation 0.002
#   metric  evaluation metric rmse
param = {
    'task': 'reg',
    'lr': 0.2,
    'lambda': 0.002,
    'metric': 'rmse',
}

# Use cross-validation
ffm_model.cv(param)
import os
import xlearn as xl
from config import Config
from utils import GenResultFromTxt

conf = Config()

if __name__=="__main__":
    # sparse feature and dense features
    sparse_features = ['uid', 'user_city', 'item_id', 'author_id', 'item_city', 'channel',
                       'music_id', 'did', 'gender']
    dense_features = ['video_duration', 'beauty']
    target = ['finish', 'like']

    # One prediction run per target, each with its own test file, model and output.
    for tar in target:
        test_path = os.path.join(conf.data_path, "FFM_" + tar + "_test.txt")
        model_path = os.path.join(conf.model_save_path, "FFM_"+tar+".model")
        save_path = os.path.join(conf.result_save_path, "FFM_"+tar+".output")

        print("==> Predict model")
        fm_model = xl.create_fm()
        fm_model.setTest(test_path)
        fm_model.setSigmoid()
        fm_model.predict(model_path, save_path)

    # Merge both per-target outputs into the final result file.
    print("==> Generate result file")
    result_dir = conf.result_save_path
    finish_output_path = os.path.join(result_dir, "FFM_finish.output")
    like_output_path = os.path.join(result_dir, "FFM_like.output")
    result_save_path = os.path.join(result_dir, "FFM_result322.csv")
    GenResultFromTxt(conf.test_data_path, finish_output_path, like_output_path, result_save_path)
    print("==> Finish generating result file")
for i, x in enumerate(catdict.keys()): if(catdict[x]==0): # numerical datastring = datastring + " "+str(i)+":"+ str(i)+":"+ str(datarow[x]) else: # categorical datastring = datastring + " "+str(i)+":"+ str(int(datarow[x]))+":1" datastring += '\n' if n < n1: file_train.write(datastring) elif n < n_train: file_valid.write(datastring) else: file_test.write(datastring) convert_to_ffm(df, numerics=[], categories=cols, features=cols, Label='click', n_train=n_train, train_size=0.8) ##================ FM ==================## ## setting fm_model = xl.create_fm() # Use factorization machine fm_model.setTrain(fp_train) # Training data fm_model.setValidate(fp_valid) # Validation data fm_model.setSigmoid() param = {'task': 'binary', 'k': 20, 'lr': 0.02, 'lambda': 0.002, 'epoch': 100, 'opt': 'adagrad' } ## training fm_model.fit(param, fp_model_fm)
import xlearn as xl

# Shared hyper-parameters and data files for all three models.
param = {'task':'binary', 'lr':0.2, 'epoch': 20, 'k':2, 'lambda':0.002, 'metric':'auc'}
train_data = "../../data/criteo_conversion_logs/small_train.txt"
test_data = "../../data/criteo_conversion_logs/small_test.txt"


def _configure_and_fit(model, model_out):
    """Register the shared data files on ``model`` and train it into ``model_out``."""
    model.setTrain(train_data)
    model.setValidate(test_data)
    model.setTest(test_data)
    model.setSigmoid()
    model.fit(param, model_out)


# Linear model
lr_model = xl.create_linear()
_configure_and_fit(lr_model, './lr_model.out')

# Factorization machine
fm_model = xl.create_fm()
_configure_and_fit(fm_model, './fm_model.out')

# Field-aware factorization machine
ffm_model = xl.create_ffm()
_configure_and_fit(ffm_model, './ffm_model.out')