def mapConf2Model(name):
    conf = d_name_conf[name]
    model_name = name.split('_')[0]
    # if model_name != 'lr' and model_name != 'fm' and model_name != 'DINN':
    #     conf['layer_sizes'] = [FIELD_SIZES, 10, 1]
    if model_name in ('lr', 'fm'):
        conf['input_dim'] = INPUT_DIM
    print('conf', conf)
    if model_name == 'ffm':
        return FFM(**conf)
    elif model_name == 'fwfm':
        conf['layer_sizes'] = [FIELD_SIZES, 10, 1]
        return FwFM(**conf)
    elif model_name == 'fwfm3':
        conf['layer_sizes'] = [FIELD_SIZES, 10, 1]
        return FwFM3(**conf)
    elif model_name == 'fm':
        return FM(**conf)
    elif model_name == 'lr':
        return LR(**conf)
    elif model_name == 'fwfmoh':
        return FwFM_LE(**conf)
    elif model_name == 'MTLfwfm':
        conf['index_lines'] = utils.index_lines
        conf['num_lines'] = FIELD_SIZES[utils.index_lines]
        conf['layer_sizes'] = [FIELD_SIZES, 10, 1]
        return MultiTask_FwFM(**conf)
    elif model_name == 'DINN':
        conf['layer_sizes'] = [FIELD_SIZES, 10, 1]
        return DINN(**conf)
def mapConf2Model(name):
    conf = d_name_conf[name]
    model_name = name.split('_')[0]
    if model_name == 'ffm':
        return FFM(**conf)
    elif model_name == 'fwfm':
        return FwFM(**conf)
    elif model_name == 'fm':
        return FM(**conf)
    elif model_name == 'lr':
        return LR(**conf)
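# Usage sketch for the dispatchers above ('fwfm_demo' and 'lr_demo' are
# hypothetical config names, assumed to be keys of d_name_conf): the prefix
# before the first '_' selects the model class, and the rest of the name is
# free to distinguish experiments.
model_fwfm = mapConf2Model('fwfm_demo')  # builds FwFM(**d_name_conf['fwfm_demo'])
model_lr = mapConf2Model('lr_demo')      # builds LR(..., input_dim=INPUT_DIM)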
def test(config_file, meta_data_file, id_map, dataToken, batch_data_dir,
         max_doc_length=30, model_name=None, restore_path=None, no_doc_index=False):
    np.random.seed(RANDOM_SEED_NP)
    data = DataDUELoader(meta_data_file=meta_data_file,
                         batch_data_dir=batch_data_dir,
                         id_map=id_map,
                         dataToken=dataToken,
                         max_doc_length=max_doc_length,
                         no_doc_index=no_doc_index)

    model_spec = json_reader(config_file)
    model = FM(feature_shape=(0 if no_doc_index else data.D) + data.U + data.V + 1,
               feature_dim=(0 if no_doc_index else 1) + 1 + max_doc_length,
               label_dim=data.E,
               model_spec=model_spec,
               model_name=model_name)
    model.initialization()

    def performance(model_local, data_local):
        preds = model_local.predict(data_generator=data_local)
        labels = []
        for data_batched in data_local.generate(batch_size=model_spec["batch_size"],
                                                random_shuffle=False):
            labels.append(data_batched["label"])
        labels = np.concatenate(labels, axis=0)
        # one-hot to index
        trues = np.argmax(labels, axis=-1)
        perf = evaluate(preds=preds, trues=trues)
        return perf

    if restore_path is not None:
        if not isinstance(restore_path, list):
            restore_paths = [restore_path]
        else:
            restore_paths = restore_path
        for restore_path in restore_paths:
            model.restore(restore_path)
            perf = performance(model_local=model, data_local=data)
            print("ckpt_path: %s" % restore_path)
            print("performance: %s" % str(perf))
    else:
        perf = performance(model_local=model, data_local=data)
        print("random initialization")
        print("performance: %s" % str(perf))
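# Hedged sketch of calling test() above; every path below is a placeholder, and
# id_map/dataToken must be built the same way as for training. restore_path may
# be a single checkpoint path or a list: each checkpoint is restored and scored
# in turn, while restore_path=None scores the randomly initialized model.
# test(config_file="../config/fm.json", meta_data_file="../data/meta.json",
#      id_map=id_map, dataToken=dataToken, batch_data_dir="../data/test/",
#      model_name="fm_eval", restore_path=["../ckpt/epoch_10", "../ckpt/epoch_20"])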
def train(config_file, meta_data_file, id_map, dataToken, batch_data_dir_train,
          batch_data_dir_valid=None, max_doc_length=30, model_name=None,
          restore_path=None, no_doc_index=False):
    np.random.seed(RANDOM_SEED_NP)
    data_train = DataDUELoader(meta_data_file=meta_data_file,
                               batch_data_dir=batch_data_dir_train,
                               id_map=id_map,
                               dataToken=dataToken,
                               max_doc_length=max_doc_length,
                               no_doc_index=no_doc_index)
    if batch_data_dir_valid is not None:
        data_valid = DataDUELoader(meta_data_file=meta_data_file,
                                   batch_data_dir=batch_data_dir_valid,
                                   id_map=id_map,
                                   dataToken=dataToken,
                                   max_doc_length=max_doc_length,
                                   no_doc_index=no_doc_index)
    else:
        data_valid = None

    model_spec = json_reader(config_file)
    model = FM(feature_shape=(0 if no_doc_index else data_train.D) + data_train.U + data_train.V + 1,
               feature_dim=(0 if no_doc_index else 1) + 1 + max_doc_length,
               label_dim=data_train.E,
               model_spec=model_spec,
               model_name=model_name)
    model.initialization()
    if restore_path is not None:
        model.restore(restore_path)

    # train
    results = model.train(data_generator=data_train, data_generator_valid=data_valid)
    print("train_results: %s" % str(results))
    best_epoch = read(directory="../summary/" + model.model_name,
                      main_indicator="epoch_losses_valid_00")[0]
    print("best_epoch by validation loss: %d" % best_epoch)
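# Hedged sketch of a training run with train() above (paths are placeholders).
# Per-epoch losses are written under ../summary/<model_name>, which read() then
# scans for the epoch with the lowest validation loss.
# train(config_file="../config/fm.json", meta_data_file="../data/meta.json",
#       id_map=id_map, dataToken=dataToken,
#       batch_data_dir_train="../data/train/", batch_data_dir_valid="../data/valid/",
#       model_name="fm_v1", restore_path=None)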
        'l2_weight': 0,
        'random_seed': 0
    }
    model = LR(**lr_params)
elif algo == 'fm':
    fm_params = {
        'input_dim': input_dim,
        'factor_order': 10,
        'opt_algo': 'gd',
        'learning_rate': 0.1,
        'l2_w': 0,
        'l2_v': 0,
    }
    model = FM(**fm_params)
elif algo == 'fnn':
    fnn_params = {
        'layer_sizes': [field_sizes, 10, 1],
        'layer_acts': ['tanh', 'none'],
        'drop_out': [0, 0],
        'opt_algo': 'gd',
        'learning_rate': 0.1,
        'layer_l2': [0, 0],
        'random_seed': 0
    }
    model = FNN(**fnn_params)
elif algo == 'ccpm':
    ccpm_params = {
        'layer_sizes': [field_sizes, 10, 5, 3],
    lr = LR(**lr_params)
    lr.fit(Xi_train, Xv_train, y_train, Xi_valid, Xv_valid, y_valid)
elif algo == 'fm':
    fm_params = {
        "feature_size": feature_size,
        "field_size": field_size,
        "embedding_size": 15,
        "epoch": 20,
        "batch_size": 1024,
        "learning_rate": 0.001,
        "optimizer_type": "adam",
        "l2_w_reg": 0.01,
        "l2_v_reg": 0.01,
        "verbose": True
    }
    fm = FM(**fm_params)
    fm.fit(Xi_train, Xv_train, y_train, Xi_valid, Xv_valid, y_valid)
elif algo == 'deepfm':
    deepfm_params = {
        "feature_size": feature_size,
        "field_size": field_size,
        "embedding_size": 15,
        "deep_layers": [256, 128, 64],
        "epoch": 20,
        "batch_size": 1024,
        "learning_rate": 0.001,
        "optimizer_type": "adam",
        "l2_reg": 0.01,
        "dropout_deep": [0.5, 0.5, 0.5, 0.5],
        "verbose": True
    }
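    # Note on the data layout assumed by fit() above: Xi_* holds one feature
    # index per field for each sample (shape [n_samples, field_size]) and Xv_*
    # the matching feature values; feature_size is the number of distinct
    # features over all fields. A hedged scoring sketch, assuming predict()
    # mirrors fit()'s (Xi, Xv) signature:
    # y_pred = fm.predict(Xi_valid, Xv_valid)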
import pandas as pd
import numpy as np
from models import LR, FM, MLP, WideAndDeep, DeepFM

if __name__ == '__main__':
    np.random.seed(2019)
    data_dir = "../data/movie_lens_100k/"
    train = pd.read_csv(data_dir + 'ua.base', sep='\t',
                        names=['user_id', 'movie_id', 'ratings', 'time'])
    test = pd.read_csv(data_dir + 'ua.test', sep='\t',
                       names=['user_id', 'movie_id', 'ratings', 'time'])
    data = pd.concat([train, test], axis=0)
    y_train = train['ratings'].values.reshape(-1, 1)  # single column
    y_test = test['ratings'].values.reshape(-1, 1)

    features = ['user_id', 'movie_id']
    features_sizes = [data[f].nunique() for f in features]

    print("FM")
    ls = []
    for _ in range(10):
        model = FM(features_sizes)
        # model = LR(features_sizes)
        # model = DeepFM(features_sizes, deep_layers=(10, 10), k=10)
        # subtract 1 because ids must start from 0, while the data starts from 1
        best_score = model.fit(train[features] - 1, test[features] - 1,
                               y_train, y_test,
                               lr=0.0005, N_EPOCH=150, batch_size=500,
                               early_stopping_rounds=30)
        ls.append(best_score)
    print(pd.Series(ls).mean(), pd.Series(ls).min())
    print(str(ls))
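# In the script above, features_sizes gives each field's vocabulary size, so
# the model can allocate one embedding table per field. For MovieLens-100k
# (ua.base plus ua.test covers the full dataset) this works out to [943, 1682]
# (users, movies), though the exact values come from nunique() at run time.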
    data[c] = lbl.fit_transform(list(data[c]))

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data[features], data['plays'],
                                                    test_size=0.2, random_state=42)
y_train = y_train.values.reshape((-1, 1))
y_test = y_test.values.reshape((-1, 1))

# model = LR(features_sizes, loss_type='rmse')  # , hash_size=r
# model = FM(features_sizes, k=24)
# model = MLP(features_sizes, deep_layers=(12, 12), k=24)  # best: (12, 12), k=24
# model = FM(features_sizes, k=24, FM_ignore_interaction=[(0, 2), (0, 3), (0, 4)])
# FM_ignore_interaction lists field-index pairs whose pairwise terms are
# excluded from the FM second-order sum (inferred from the name); the list
# below drops every interaction involving field 0.
model = FM(features_sizes, k=24, FM_ignore_interaction=[(0, 1), (0, 2), (0, 3), (0, 4)])
# model = DeepFM(features_sizes, deep_layers=(12, 12), k=24)
# model = NFM(features_sizes, k=24)
print(model)
best_score = model.fit(X_train, X_test, y_train, y_test,
                       lr=0.0005, N_EPOCH=50, batch_size=5000,
                       early_stopping_rounds=5)  # 0.0005 -> 0.001 (1e-3, bs=1000)
'''
ls = []
Rounds = 1
train_features = ['msno', 'song_id']
features_sizes = [train_data[c].nunique() for c in train_features]

from sklearn.preprocessing import LabelEncoder
lbl = LabelEncoder()
enc = ColdStartEncoder()
for c in train_features:
    train_data[c] = lbl.fit_transform(list(train_data[c]))

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_data[train_features],
                                                    train_data['target'],
                                                    test_size=0.125, random_state=42)
y_train = y_train.values.reshape((-1, 1))
y_test = y_test.values.reshape((-1, 1))
'''

# <Model>
# model = LR(features_sizes, loss_type='binary', metric_type='auc')
model = FM(features_sizes, k=8, loss_type='binary', metric_type='auc')
# model = FM(features_sizes, k=8, loss_type='binary', metric_type='auc', FM_ignore_interaction=[(0, 2), (0, 3), (0, 4)])  # FMDE
# model = MLP(features_sizes, k=8, loss_type='binary', metric_type='auc', deep_layers=(32, 8))
# model = NFM(features_sizes, k=8, loss_type='binary', metric_type='auc')
# model = WideAndDeep(features_sizes, k=8, loss_type='binary', metric_type='auc', deep_layers=(8, 8))
# model = DeepFM(features_sizes, k=8, loss_type='binary', metric_type='auc', deep_layers=(8, 8))
# model = AFM(features_sizes, k=8, loss_type='binary', metric_type='auc', attention_FM=8)
# model = DeepAFM(features_sizes, k=8, loss_type='binary', metric_type='auc', attention_FM=8, deep_layers=(8, 8))
print(model)
# [BUG fix] Older versions required the caller to pass a copy; patched 06/27 so
# the data is copied internally and the caller's input is not modified.
best_score = model.fit(X_train[train_features], X_test[train_features], y_train, y_test,
                       lr=0.0005, N_EPOCH=50, batch_size=4096,
                       early_stopping_rounds=5)  # 0.0005 -> 0.001 (1e-3, bs=1000)
y_pred = model.predict(X_test)
y_pred = 1. / (1. + np.exp(-1. * y_pred))  # sigmoid transform
from sklearn.metrics import roc_auc_score, log_loss
print("ROC-AUC score on valid set: %.4f" % roc_auc_score(y_test, y_pred))
# print(log_loss(y_test, y_pred))
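# Note: ROC-AUC is rank-based, so the monotonic sigmoid above does not change
# it; the transform matters only for probability metrics such as log_loss.
# A hedged hard-label sketch (the 0.5 cutoff is an assumption, not from the source):
# y_label = (y_pred > 0.5).astype(int)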