def mapConf2Model(name):
    """Build the model registered under ``name``.

    The model family is the part of ``name`` before the first underscore;
    the constructor keyword arguments are taken from ``d_name_conf[name]``
    and completed with the family-specific entries added below.

    Args:
        name: Key into ``d_name_conf``.

    Returns:
        The constructed model, or ``None`` when the family prefix is not one
        of the families handled here.

    Raises:
        Whatever ``d_name_conf`` raises for an unknown ``name`` (``KeyError``
        if it is a plain dict).
    """
    # Work on a shallow copy: the entries added below must not leak back
    # into the shared ``d_name_conf`` registry between calls.
    conf = dict(d_name_conf[name])
    model_name = name.split('_')[0]

    if model_name in ('lr', 'fm'):
        conf['input_dim'] = INPUT_DIM
    # Single-argument form prints the same text under Python 2 and Python 3.
    print('conf %s' % (conf,))

    if model_name == 'MTLfwfm':
        conf['index_lines'] = utils.index_lines
        conf['num_lines'] = FIELD_SIZES[utils.index_lines]
    if model_name in ('fwfm', 'fwfm3', 'MTLfwfm', 'DINN'):
        conf['layer_sizes'] = [FIELD_SIZES, 10, 1]

    # Keep an if/elif chain (rather than a dispatch dict) so that only the
    # selected model class name has to be resolvable at call time.
    if model_name == 'ffm':
        return FFM(**conf)
    elif model_name == 'fwfm':
        return FwFM(**conf)
    elif model_name == 'fwfm3':
        return FwFM3(**conf)
    elif model_name == 'fm':
        return FM(**conf)
    elif model_name == 'lr':
        return LR(**conf)
    elif model_name == 'fwfmoh':
        return FwFM_LE(**conf)
    elif model_name == 'MTLfwfm':
        return MultiTask_FwFM(**conf)
    elif model_name == 'DINN':
        return DINN(**conf)
    return None
def mapConf2Model(name):
    """Instantiate the model whose configuration is stored under ``name``.

    The text of ``name`` before its first underscore selects the model
    class; ``d_name_conf[name]`` is expanded into the constructor's keyword
    arguments.

    Args:
        name: Key into ``d_name_conf``.

    Returns:
        An ``FFM``, ``FwFM``, ``FM`` or ``LR`` instance, or ``None`` when the
        prefix matches none of them.
    """
    kwargs = d_name_conf[name]
    family = name.partition('_')[0]

    if family == 'ffm':
        return FFM(**kwargs)
    if family == 'fwfm':
        return FwFM(**kwargs)
    if family == 'fm':
        return FM(**kwargs)
    if family == 'lr':
        return LR(**kwargs)
    return None
Beispiel #3
0
def test(config_file,
         meta_data_file,
         id_map,
         dataToken,
         batch_data_dir,
         max_doc_length=30,
         model_name=None,
         restore_path=None,
         no_doc_index=False):
    """Evaluate an FM model on the batches found in ``batch_data_dir``.

    The model is built from the JSON spec in ``config_file`` and sized from
    the loader's ``D``/``U``/``V``/``E`` attributes. It is evaluated once per
    checkpoint in ``restore_path``, or once right after
    ``model.initialization()`` when no checkpoint is given. Results are
    printed; nothing is returned.

    Args:
        config_file: Path handed to ``json_reader`` to obtain the model spec.
        meta_data_file, id_map, dataToken, batch_data_dir: Forwarded
            unchanged to ``DataDUELoader``.
        max_doc_length: Forwarded to the loader and added into
            ``feature_dim``.
        model_name: Forwarded to ``FM``.
        restore_path: ``None``, a single checkpoint path, or a list of paths.
        no_doc_index: When true, the ``D`` term is left out of
            ``feature_shape`` and one slot is left out of ``feature_dim``.
    """
    np.random.seed(RANDOM_SEED_NP)
    data = DataDUELoader(meta_data_file=meta_data_file,
                         batch_data_dir=batch_data_dir,
                         id_map=id_map,
                         dataToken=dataToken,
                         max_doc_length=max_doc_length,
                         no_doc_index=no_doc_index)

    model_spec = json_reader(config_file)
    shape_doc_term = 0 if no_doc_index else data.D
    dim_doc_term = 0 if no_doc_index else 1
    model = FM(feature_shape=shape_doc_term + data.U + data.V + 1,
               feature_dim=dim_doc_term + 1 + max_doc_length,
               label_dim=data.E,
               model_spec=model_spec,
               model_name=model_name)
    model.initialization()

    def _score(fm, loader):
        # Predict first, then replay the loader unshuffled to collect the
        # matching labels.
        preds = fm.predict(data_generator=loader)
        batches = loader.generate(batch_size=model_spec["batch_size"],
                                  random_shuffle=False)
        label_chunks = [batch["label"] for batch in batches]
        # one-hot to index #
        trues = np.argmax(np.concatenate(label_chunks, axis=0), axis=-1)
        return evaluate(preds=preds, trues=trues)

    if restore_path is None:
        perf = _score(model, data)
        print("random initialization")
        print("performance: %s" % str(perf))
        return

    checkpoints = restore_path if isinstance(restore_path, list) else [restore_path]
    for ckpt in checkpoints:
        model.restore(ckpt)
        perf = _score(model, data)
        print("ckpt_path: %s" % ckpt)
        print("performance: %s" % str(perf))
Beispiel #4
0
def train(config_file,
          meta_data_file,
          id_map,
          dataToken,
          batch_data_dir_train,
          batch_data_dir_valid=None,
          max_doc_length=30,
          model_name=None,
          restore_path=None,
          no_doc_index=False):
    """Train an FM model and print the training results and best epoch.

    The model is built from the JSON spec in ``config_file`` and sized from
    the training loader's ``D``/``U``/``V``/``E`` attributes. After training,
    the summary under ``"../summary/" + model.model_name`` is read and the
    first element of the result is printed as the best epoch. Nothing is
    returned.

    Args:
        config_file: Path handed to ``json_reader`` to obtain the model spec.
        meta_data_file, id_map, dataToken: Forwarded unchanged to every
            ``DataDUELoader``.
        batch_data_dir_train: Batch directory for the training loader.
        batch_data_dir_valid: Batch directory for the validation loader;
            ``None`` means no validation loader is built and ``None`` is
            passed to ``model.train``.
        max_doc_length: Forwarded to the loaders and added into
            ``feature_dim``.
        model_name: Forwarded to ``FM``.
        restore_path: Optional checkpoint restored before training starts.
        no_doc_index: When true, the ``D`` term is left out of
            ``feature_shape`` and one slot is left out of ``feature_dim``.
    """
    np.random.seed(RANDOM_SEED_NP)

    def _loader_for(batch_dir):
        # Both loaders share every argument except the batch directory.
        return DataDUELoader(meta_data_file=meta_data_file,
                             batch_data_dir=batch_dir,
                             id_map=id_map,
                             dataToken=dataToken,
                             max_doc_length=max_doc_length,
                             no_doc_index=no_doc_index)

    data_train = _loader_for(batch_data_dir_train)
    data_valid = (None if batch_data_dir_valid is None
                  else _loader_for(batch_data_dir_valid))

    model_spec = json_reader(config_file)
    shape_doc_term = 0 if no_doc_index else data_train.D
    dim_doc_term = 0 if no_doc_index else 1
    model = FM(feature_shape=shape_doc_term + data_train.U + data_train.V + 1,
               feature_dim=dim_doc_term + 1 + max_doc_length,
               label_dim=data_train.E,
               model_spec=model_spec,
               model_name=model_name)
    model.initialization()
    if restore_path is not None:
        model.restore(restore_path)

    # train #
    results = model.train(data_generator=data_train,
                          data_generator_valid=data_valid)
    print("train_results: %s" % str(results))

    summary_dir = "../summary/" + model.model_name
    summary = read(directory=summary_dir,
                   main_indicator="epoch_losses_valid_00")
    best_epoch = summary[0]
    print("best_epoch by validation loss: %d" % best_epoch)
Beispiel #5
0
        'l2_weight': 0,
        'random_seed': 0
    }

    model = LR(**lr_params)
elif algo == 'fm':
    fm_params = {
        'input_dim': input_dim,
        'factor_order': 10,
        'opt_algo': 'gd',
        'learning_rate': 0.1,
        'l2_w': 0,
        'l2_v': 0,
    }

    model = FM(**fm_params)
elif algo == 'fnn':
    fnn_params = {
        'layer_sizes': [field_sizes, 10, 1],
        'layer_acts': ['tanh', 'none'],
        'drop_out': [0, 0],
        'opt_algo': 'gd',
        'learning_rate': 0.1,
        'layer_l2': [0, 0],
        'random_seed': 0
    }

    model = FNN(**fnn_params)
elif algo == 'ccpm':
    ccpm_params = {
        'layer_sizes': [field_sizes, 10, 5, 3],
Beispiel #6
0
    lr = LR(**lr_params)
    lr.fit(Xi_train, Xv_train, y_train, Xi_valid, Xv_valid, y_valid)
elif algo == 'fm':
    fm_params = {
        "feature_size": feature_size,
        "field_size": field_size,
        "embedding_size": 15,
        "epoch": 20,
        "batch_size": 1024,
        "learning_rate": 0.001,
        "optimizer_type": "adam",
        "l2_w_reg": 0.01,
        "l2_v_reg": 0.01,
        "verbose": True
    }
    fm = FM(**fm_params)
    fm.fit(Xi_train, Xv_train, y_train, Xi_valid, Xv_valid, y_valid)
elif algo == 'deepfm':
    deepfm_params = {
        "feature_size": feature_size,
        "field_size": field_size,
        "embedding_size": 15,
        "deep_layers": [256, 128, 64],
        "epoch": 20,
        "batch_size": 1024,
        "learning_rate": 0.001,
        "optimizer_type": "adam",
        "l2_reg": 0.01,
        "dropout_deep": [0.5, 0.5, 0.5, 0.5],
        "verbose": True
    }
import pandas as pd
import numpy as np
from models import LR,FM,MLP,WideAndDeep,DeepFM


if __name__ == '__main__':
    # Fits an FM model ten times on the ua.base / ua.test split read from
    # data_dir and prints the mean and minimum of the scores returned by
    # fit(), followed by the raw list.

    np.random.seed(2019)
    data_dir = "../data/movie_lens_100k/"
    columns = ['user_id', 'movie_id', 'ratings', 'time']
    train = pd.read_csv(data_dir + 'ua.base', sep='\t', names=columns)
    test = pd.read_csv(data_dir + 'ua.test', sep='\t', names=columns)
    data = pd.concat([train, test], axis=0)
    y_train = train['ratings'].values.reshape(-1, 1)  # single column
    y_test = test['ratings'].values.reshape(-1, 1)

    features = ['user_id', 'movie_id']
    # Cardinality of each feature over train and test together.
    features_sizes = [data[f].nunique() for f in features]
    print("DFM")
    ls = []
    for _ in range(10):
        model = FM(features_sizes)
        # Alternatives to swap in:
        #   model = LR(features_sizes)
        #   model = DeepFM(features_sizes, deep_layers=(10, 10), k=10)
        # The "- 1" is needed because ids must start at 0, whereas the ids
        # in the data start at 1.
        best_score = model.fit(train[features] - 1,
                               test[features] - 1,
                               y_train,
                               y_test,
                               lr=0.0005,
                               N_EPOCH=150,
                               batch_size=500,
                               early_stopping_rounds=30)
        ls.append(best_score)
    # Build the Series once and reuse it for both statistics.
    scores = pd.Series(ls)
    print(scores.mean(), scores.min())
    print(str(ls))
Beispiel #8
0
    data[c] = lbl.fit_transform(list(data[c]))

# Hold out 20% of the rows as the evaluation split; random_state pins the
# shuffle so the split is the same on every run.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data[features],
                                                    data['plays'],
                                                    test_size=0.2,
                                                    random_state=42)
# Reshape the targets into a single column.
y_train = y_train.values.reshape((-1, 1))
y_test = y_test.values.reshape((-1, 1))

# Model variants that were tried; exactly one assignment to `model` is active.
#model=LR(features_sizes,loss_type='rmse')#,hash_size=r)
#model=FM(features_sizes,k=24)
#model=MLP(features_sizes,deep_layers=(12,12),k=24) best(12,12) k=24
#model=FM(features_sizes,k=24,FM_ignore_interaction=[(0,2),(0,3),(0,4)])
# NOTE(review): FM_ignore_interaction presumably lists pairs of field indices
# whose interaction term is left out -- confirm against the FM implementation.
model = FM(features_sizes,
           k=24,
           FM_ignore_interaction=[(0, 1), (0, 2), (0, 3), (0, 4)])
#model=DeepFM(features_sizes,deep_layers=(12,12),k=24)
#model = NFM(features_sizes, k=24)
print(model)
# Fit on the training split, passing the held-out split alongside it; the
# value returned by fit() is kept as best_score.
best_score = model.fit(X_train,
                       X_test,
                       y_train,
                       y_test,
                       lr=0.0005,
                       N_EPOCH=50,
                       batch_size=5000,
                       early_stopping_rounds=5)  #0.0005->0.001(1e-3 bs=1000)
'''
ls=[]
Rounds=1
Beispiel #9
0
train_features=['msno','song_id']
features_sizes=[train_data[c].nunique() for c in train_features]
from sklearn.preprocessing import LabelEncoder
lbl = LabelEncoder()
enc = ColdStartEncoder()
for c in train_features:
    train_data[c]=lbl.fit_transform(list(train_data[c]))
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_data[train_features], train_data['target'], test_size = 0.125, random_state = 42)
y_train=y_train.values.reshape((-1,1))
y_test=y_test.values.reshape((-1,1))
'''

#<Model>
# Model variants that were tried; exactly one assignment to `model` is active.
#model=LR(features_sizes,loss_type='binary',metric_type='auc')
model=FM(features_sizes,k=8,loss_type='binary',metric_type='auc')
#model=FM(features_sizes,k=8,loss_type='binary',metric_type='auc',FM_ignore_interaction=[(0,2),(0,3),(0,4)]) #FMDE
#model=MLP(features_sizes,k=8,loss_type='binary',metric_type='auc',deep_layers=(32,8))
#model=NFM(features_sizes,k=8,loss_type='binary',metric_type='auc')
#model=WideAndDeep(features_sizes,k=8,loss_type='binary',metric_type='auc',deep_layers=(8,8))
#model=DeepFM(features_sizes,k=8,loss_type='binary',metric_type='auc',deep_layers=(8,8))
#model=AFM(features_sizes,k=8,loss_type='binary',metric_type='auc',attention_FM=8)
#model=DeepAFM(features_sizes,k=8,loss_type='binary',metric_type='auc',attention_FM=8,deep_layers=(8,8))
print(model)
#[BUG fix] Older versions required passing in a copy of the data; patched on 06/27 so that fit copies internally and the caller's data is not affected.
best_score = model.fit(X_train[train_features], X_test[train_features], y_train, y_test, lr=0.0005, N_EPOCH=50, batch_size=4096,early_stopping_rounds=5)#0.0005->0.001(1e-3 bs=1000)
# NOTE(review): fit() receives X_test[train_features] but predict() receives
# X_test as-is -- confirm X_test holds exactly the train_features columns.
y_pred=model.predict(X_test)
y_pred=1./(1.+np.exp(-1.*y_pred))#sigmoid transform
from sklearn.metrics import roc_auc_score,log_loss
# AUC of the sigmoid-transformed predictions against the held-out targets.
print("ROC-AUC score on valid set: %.4f" %roc_auc_score(y_test,y_pred))
#print(log_loss(y_test,y_pred))