Example #1
def test_long_dense_vector():
    feature_columns = [
        SparseFeat(
            'user_id',
            4,
        ),
        SparseFeat(
            'item_id',
            5,
        ),
        DenseFeat("pic_vec", 5)
    ]
    fixlen_feature_names = get_feature_names(feature_columns)

    user_id = np.array([[1], [0], [1]])
    item_id = np.array([[3], [2], [1]])
    pic_vec = np.array([[0.1, 0.5, 0.4, 0.3, 0.2], [0.1, 0.5, 0.4, 0.3, 0.2],
                        [0.1, 0.5, 0.4, 0.3, 0.2]])
    label = np.array([1, 0, 1])

    input_dict = {'user_id': user_id, 'item_id': item_id, 'pic_vec': pic_vec}
    model_input = [input_dict[name] for name in fixlen_feature_names]

    model = DeepFM(feature_columns, feature_columns[:-1])
    model.compile('adagrad', 'binary_crossentropy')
    model.fit(model_input, label)
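
This test assumes the standard DeepCTR imports. A minimal sketch of what it needs (the module path is an assumption; releases before 0.8 exported the feature classes from deepctr.inputs rather than deepctr.feature_column):

import numpy as np
from deepctr.models import DeepFM
from deepctr.feature_column import SparseFeat, DenseFeat, get_feature_names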
Example #2
def test_DeepFM(use_fm, hidden_size, sparse_feature_num):
    model_name = "DeepFM"
    sample_size = 64
    feature_dim_dict = {"sparse": {}, 'dense': []}
    # Note: this test reuses sparse_feature_num as the dense feature count, too.
    for name, num in zip(["sparse", "dense"], [sparse_feature_num, sparse_feature_num]):
        if name == "sparse":
            for i in range(num):
                feature_dim_dict[name][name + '_' + str(i)] = np.random.randint(1, 10)
        else:
            for i in range(num):
                feature_dim_dict[name].append(name + '_' + str(i))

    sparse_input = [np.random.randint(0, dim, sample_size)
                    for dim in feature_dim_dict['sparse'].values()]
    dense_input = [np.random.random(sample_size)
                   for name in feature_dim_dict['dense']]
    y = np.random.randint(0, 2, sample_size)
    x = sparse_input + dense_input

    model = DeepFM(feature_dim_dict, use_fm=use_fm,
                   hidden_size=hidden_size, keep_prob=0.5)
    model.compile('adam', 'binary_crossentropy',
                  metrics=['binary_crossentropy'])
    model.fit(x, y, batch_size=100, epochs=1, validation_split=0.5)

    print(model_name+" test train valid pass!")
    model.save_weights(model_name + '_weights.h5')
    model.load_weights(model_name + '_weights.h5')
    print(model_name+" test save load weight pass!")
    save_model(model,  model_name + '.h5')
    model = load_model(model_name + '.h5', custom_objects)
    print(model_name + " test save load model pass!")

    print(model_name + " test pass!")
Example #3
def test_long_dense_vector():
    # Build the feature columns
    feature_columns = [
        SparseFeat(
            'user_id',
            4,
        ),
        SparseFeat(
            'item_id',
            5,
        ),
        DenseFeat("pic_vec", 5)
    ]
    fixlen_feature_names = get_feature_names(feature_columns)

    # Build the sample arrays
    user_id = np.array([[1], [0], [1]])
    item_id = np.array([[3], [2], [1]])
    pic_vec = np.array([[0.1, 0.5, 0.4, 0.3, 0.2], [0.1, 0.5, 0.4, 0.3, 0.2],
                        [0.1, 0.5, 0.4, 0.3, 0.2]])
    label = np.array([1, 0, 1])
    input_dict = {'user_id': user_id, 'item_id': item_id, 'pic_vec': pic_vec}
    model_input = [input_dict[name] for name in fixlen_feature_names]

    # Create the DeepFM model
    model = DeepFM(feature_columns, feature_columns[:-1])

    # model.summary()
    #tf.keras.utils.plot_model(model, "test_compu")

    # Train the model
    model.compile('adagrad', 'binary_crossentropy')
    model.fit(model_input, label)
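
As a usage note: DeepCTR models are ordinary tf.keras models with named inputs, so the ordered-list construction above can usually be replaced by passing the dict directly (this is standard Keras behavior, not something this test exercises):

model.fit(input_dict, label)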
Example #4
def run_deepfm_model():
    train, test, train_model_input, test_model_input, dnn_feature_columns, \
        linear_feature_columns, feature_names, target = read_data_as_model()

    # Define the model, then train, predict, and evaluate
    model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary')
    model.compile(
        "adam",
        "binary_crossentropy",
        metrics=['binary_crossentropy'],
    )

    model.fit(
        train_model_input,
        train[target].values,
        batch_size=256,
        epochs=10,
        verbose=2,
        validation_split=0.2,
    )
    pred_ans = model.predict(test_model_input, batch_size=256)
    print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4))
    print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))
    return pred_ans, test[target].values, round(
        roc_auc_score(test[target].values, pred_ans), 4), 'deepfm'
Example #5
def train_deepFM():
    k = featureengineer.k
    # Fill missing values, then encode the features
    data, appsnum, tags_nums = trainmodel.data, trainmodel.appsnum, trainmodel.tags_nums
    data[trainmodel.sparse_features] = data[trainmodel.sparse_features].fillna('-1')
    for feat in trainmodel.dense_features:
        data[feat].fillna(data[feat].dropna().mean(), inplace=True)

    for feat in trainmodel.sparse_features:
        data[feat] = data[feat].astype(str)
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
    mms = MinMaxScaler(feature_range=(0, 1))
    data[trainmodel.dense_features] = mms.fit_transform(data[trainmodel.dense_features])


    # Convert the data into DeepCTR feature columns
    fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=data[feat].max()+1, embedding_dim=8)
                              for i, feat in enumerate(trainmodel.sparse_features)] + \
                             [DenseFeat(feat, 1, ) for feat in trainmodel.dense_features]

    lgbOut_feature_columns = [SparseFeat(feat, vocabulary_size=data[feat].max()+1, embedding_dim=1)
                              for i, feat in enumerate(trainmodel.lgbOut_Features)]

    key2index_len = {'applist': appsnum+1, 'new_tag': tags_nums}
    varlen_features = [VarLenSparseFeat('%s' % i, vocabulary_size=key2index_len[i],
                                        maxlen=k, embedding_dim=8, combiner='mean',
                                        weight_name=None)
                       for i in trainmodel.var_features]

    dnn_feature_columns = fixlen_feature_columns + varlen_features
    linear_feature_columns = fixlen_feature_columns + varlen_features + lgbOut_feature_columns

    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

    sparse_dense_features = trainmodel.sparse_features + trainmodel.dense_features + trainmodel.lgbOut_Features

    train, test = train_test_split(data, test_size=0.2)


    train_model_input = {name: train[name] for name in sparse_dense_features}
    test_model_input = {name: test[name] for name in sparse_dense_features}
    for x in trainmodel.var_features:
        if x == 'applist':
            train_model_input[x] = np.array(train[x].tolist())
            test_model_input[x] = np.array(test[x].tolist())
        if x == 'new_tag':
            train_model_input[x] = np.array(train[x].tolist())-appsnum
            test_model_input[x] = np.array(test[x].tolist())-appsnum
    # Model
    model = DeepFM(linear_feature_columns=linear_feature_columns, dnn_feature_columns=dnn_feature_columns,
                   dnn_hidden_units=(50, 30, 30), l2_reg_linear=0.001, l2_reg_embedding=0.001,
                   l2_reg_dnn=0, init_std=0.0001, seed=1024, dnn_dropout=0.1, dnn_activation='relu', dnn_use_bn=True,
                   task='binary')
    model.compile("adam", "binary_crossentropy",metrics=['AUC'], )

    history = model.fit(train_model_input, train['target'].values,
                        batch_size=256, epochs=1, verbose=2, validation_split=0.2, )

    pred_ans = model.predict(test_model_input, batch_size=256)
    print("test AUC", round(roc_auc_score(test['target'].values, pred_ans), 4))
Example #6
def test_DeepFM(use_fm, hidden_size):
    name = "DeepFM"
    sample_size = 64
    feature_dim_dict = {
        'sparse': {
            'sparse_1': 2,
            'sparse_2': 5,
            'sparse_3': 10
        },
        'dense': ['dense_1', 'dense_2', 'dense_3']
    }
    sparse_input = [
        np.random.randint(0, dim, sample_size)
        for dim in feature_dim_dict['sparse'].values()
    ]
    dense_input = [
        np.random.random(sample_size) for name in feature_dim_dict['dense']
    ]
    y = np.random.randint(0, 2, sample_size)
    x = sparse_input + dense_input

    model = DeepFM(
        feature_dim_dict,
        use_fm=use_fm,
        hidden_size=hidden_size,
        keep_prob=0.5,
    )
    model.compile('adam',
                  'binary_crossentropy',
                  metrics=['binary_crossentropy'])
    model.fit(x, y, batch_size=100, epochs=1, validation_split=0.5)
    print(name + " test train valid pass!")
    model.save_weights(name + '_weights.h5')
    model.load_weights(name + '_weights.h5')
    print(name + " test save load weight pass!")
    save_model(model, name + '.h5')
    model = load_model(name + '.h5', custom_objects)
    print(name + " test save load model pass!")

    print(name + " test pass!")
Example #7
def model_generate(train_X, train_y, val_X, val_y, linear_feature_columns,
                   dnn_feature_columns):
    model = DeepFM(linear_feature_columns,
                   dnn_feature_columns,
                   embedding_size=32)
    model.compile("adam",
                  "binary_crossentropy",
                  metrics=[roc_auc_score_pyfunc, log_loss_pyfunc])
    history = model.fit(train_X,
                        train_y,
                        validation_data=(val_X, val_y),
                        batch_size=4096,
                        epochs=5,
                        callbacks=[EarlyStopping()])
    return model, history
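
roc_auc_score_pyfunc and log_loss_pyfunc are project-local metrics; a hedged sketch of how such wrappers are commonly written (the actual implementations may differ):

import tensorflow as tf
from sklearn.metrics import roc_auc_score, log_loss

def roc_auc_score_pyfunc(y_true, y_pred):
    # Compute AUC on the host with sklearn; fails on single-class batches.
    return tf.py_function(lambda t, p: roc_auc_score(t.numpy(), p.numpy()),
                          (y_true, y_pred), tf.double)

def log_loss_pyfunc(y_true, y_pred):
    return tf.py_function(lambda t, p: log_loss(t.numpy(), p.numpy()),
                          (y_true, y_pred), tf.double)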
Example #8
def train_model(train, test, linear_feature, dnn_feature):

    model = DeepFM(linear_feature, dnn_feature, task='binary')
    model.compile(
        "adam",
        "binary_crossentropy",
        metrics=['AUC'],
    )
    history = model.fit(
        *train,
        batch_size=512,
        epochs=5,
        verbose=2,
        validation_split=0.1,
    )
    pred_ans = model.predict(test[0], batch_size=512)
    print("test LogLoss", round(log_loss(test[1], pred_ans), 4))
    print("test AUC", round(roc_auc_score(test[1], pred_ans), 4))
Example #9
def deepfm_model(linear_feature_columns, dnn_feature_columns,
                 train_model_input, train, test_model_input, test):
    cols = ['model', 'RMSE', 'MAE', 'MSE', 'AUC', 'score']
    df_result = pd.DataFrame(columns=cols, index=range(1))
    model = DeepFM(linear_feature_columns,
                   dnn_feature_columns,
                   dnn_hidden_units=config.deepfm_att["dnn_hidden_units"],
                   init_std=config.deepfm_att["init_std"],
                   seed=config.deepfm_att["seed"],
                   dnn_dropout=config.deepfm_att["dnn_dropout"],
                   dnn_activation=config.deepfm_att["dnn_activation"],
                   task=config.deepfm_att["task"],
                   fm_group=config.deepfm_att["fm_group"],
                   dnn_use_bn=config.deepfm_att["dnn_use_bn"])

    model.compile("adam", "mse", metrics=['mse'])

    history = model.fit(train_model_input,
                        train[target].values,
                        batch_size=256,
                        epochs=config.model_epoch['epoch'],
                        verbose=2,
                        validation_split=0.2)

    pred_ans = model.predict(test_model_input, batch_size=256)
    save_model(model, 'saved_deepfm.h5')  # save_model
    auc = roc_auc_score(test[target].values, pred_ans)

    # Use .loc[row, col] so values are written into df_result itself
    # (attribute assignment on .loc[0] modifies a temporary copy).
    df_result.loc[0, 'model'] = "DeepFM"
    df_result.loc[0, 'RMSE'] = np.round(
        math.sqrt(mean_squared_error(test[target].values, pred_ans)), 3)
    df_result.loc[0, 'MAE'] = np.round(
        mean_absolute_error(test[target].values, pred_ans), 3)
    df_result.loc[0, 'MSE'] = np.round(
        mean_squared_error(test[target].values, pred_ans), 3)
    df_result.loc[0, 'AUC'] = np.round(auc, 3)
    #df_result.loc[0].score=(1/df_result.iloc[0]['RMSE'])*(1/df_result.iloc[0]['MAE'])*(2*df_result.iloc[0]['AUC'])
    return df_result
Example #10
    seed=1024,
    dnn_dropout=0.3,
    dnn_activation='selu',
    dnn_use_bn=True,
)

import tensorflow.keras as keras
import tensorflow as tf
opt = tf.keras.optimizers.Adam(learning_rate=1e-5)
model.compile(
    opt,
    "binary_crossentropy",
    metrics=['accuracy'],
)

history = model.fit(train_model_input,
                    train[target].values,
                    batch_size=128,
                    epochs=30,
                    verbose=1,
                    validation_data=(valid_model_input, valid[target].values))


def plot_learning_curves(history):
    pd.DataFrame(history.history).plot(figsize=(8, 5))
    plt.grid(True)
    plt.gca().set_ylim(0, 1)
    plt.show()


plot_learning_curves(history)
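
The plotting helper assumes pandas and matplotlib are already imported, e.g.:

import pandas as pd
import matplotlib.pyplot as plt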
Example #11
model_input = {name: data[name].values for name in feature_names}
# model_input['Genres'] = genres_list

# %%
for i in model_input:
    print(i, model_input[i].dtype)

# %%
model_input

# %%
data[['Rating']].values

# %%
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='regression')
model.compile(
    "adam",
    "mse",
    metrics=['mse'],
)
history = model.fit(
    model_input,
    data[['Rating']].values,
    batch_size=256,
    epochs=10,
    verbose=2,
    validation_split=0.2,
)

# %%
Example #12
print(model_input)
# print(model_input.shape)
# 4. Define the model, compile, and train
model = DeepFM(
    {
        "sparse": sparse_feat_list,
        "dense": dense_feat_list,
        "sequence": sequence_feature
    },
    final_activation='linear',
    embedding_size=8,
    use_fm=False,
    hidden_size=(64, 64))

model.compile(
    "adam",
    "mape",
    metrics=['mape'],
)
history = model.fit(
    model_input,
    df_train[target].values,
    batch_size=2048,
    epochs=200,
    verbose=2,
    validation_split=0.2,
)
pred = model.predict(model_input)
print(pred)
print(smape(df_train[target].values, pred))
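
smape is a project helper; a hedged sketch under the usual symmetric-MAPE definition (the original may scale differently):

import numpy as np

def smape(y_true, y_pred):
    y_true, y_pred = np.ravel(y_true), np.ravel(y_pred)
    return 100.0 * np.mean(2.0 * np.abs(y_pred - y_true) /
                           (np.abs(y_true) + np.abs(y_pred)))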
Example #13
def deepctr_cv(X_train,
               y_train,
               folds,
               logger,
               cv_path,
               X_test=None,
               optional_data=None,
               prep=True,
               split_conf=None):

    scores = []
    preds = []

    meta = np.zeros_like(y_train).astype("float64")
    if split_conf is None:
        X_tr, X_te, main_conf, _ = prep_for_embedding(X_train,
                                                      X_test,
                                                      conf,
                                                      prep=prep)
        X_train, X_test = X_tr, X_te
    else:
        main_conf = split_conf

    cat_cols = [c for c, _, _ in main_conf[0]]
    cat_fs = [SingleFeat(c, d) for c, d, _ in main_conf[0]]
    num_fs = [SingleFeat(c, 0) for c in conf.num_cols]

    # Guard: X_test defaults to None in the signature.
    if X_test is not None:
        X_test = split_df(X_test, cat_cols, conf.num_cols)

    for num_fold, (tr_ind, tes_ind) in enumerate(folds):
        if num_fold > 0:
            break
        logger.info(f"fold_{num_fold}")

        fold_path = cv_path / f"fold{num_fold}"
        seed_path = fold_path
        Path(fold_path).mkdir(exist_ok=True, parents=True)

        callbacks = [CSVLogger(str(fold_path / 'epochs.csv'))]

        X_cv_train, X_cv_test = X_train.iloc[tr_ind], X_train.iloc[tes_ind]
        y_cv_train, y_cv_test = y_train.iloc[tr_ind], y_train.iloc[tes_ind]
        X_cv_train = split_df(X_cv_train, cat_cols, conf.num_cols)
        X_cv_test = split_df(X_cv_test, cat_cols, conf.num_cols)

        model = DeepFM({
            'sparse': cat_fs,
            'dense': num_fs
        },
                       final_activation='sigmoid')
        model.compile("adam", "binary_crossentropy", metrics=['accuracy'])
        model.fit(X_cv_train,
                  y_cv_train,
                  callbacks=callbacks,
                  batch_size=2048,
                  epochs=10,
                  verbose=1,
                  validation_data=(X_cv_test, y_cv_test))
        model.save_weights(str(seed_path / 'weights.h5'), save_format='hdf5')
        gc.collect()

        if X_test is not None:
            pred = model.predict(X_test, batch_size=2048)
            pred = pred[:, 0]
            np.save(seed_path / f"pred.npy", pred)

        train_oof = model.predict(X_cv_test, batch_size=2048)
        train_oof = train_oof[:, 0]
        auc = roc_auc_score(y_cv_test.values, train_oof)
        logger.info(f"{num_fold}: auc {auc}")
        np.save(seed_path / f"train_oof.npy", train_oof)

        # auc = roc_auc_score(y_cv_test, train_oof)
        # logger.info(f"seed_average: auc {auc}")
        scores.append(auc)
        np.save(fold_path / f"tes_ind.npy", tes_ind)
        meta[tes_ind] += train_oof
        del X_cv_train, y_cv_train, X_cv_test, y_cv_test

        if X_test is not None:
            preds.append(pred)

    scores = np.array(scores)
    preds = np.array(preds)
    pred = rank_average(preds)
    logger.info(f"{scores.mean()}, {scores.std()}")
    return scores, pred, meta
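
rank_average is a project helper; a hedged sketch of a typical rank-average blend (the original may differ):

from scipy.stats import rankdata

def rank_average(preds):
    # Average the per-fold predictions after converting each to ranks in (0, 1].
    ranked = np.array([rankdata(p) / len(p) for p in preds])
    return ranked.mean(axis=0)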
Example #14
class DeepFMHelper:
    def __init__(self):
        self.min_max_scaler = MinMaxScaler(feature_range=(0, 1))
        self.cat_features = [
            "user_Вид тура_last",
            "user_Звездность_last",
            "tour_Страна",
            "tour_Страна тура",
            "user_Тип заявки_last",
        ]
        self.dense_features = None
        self.fixlen_feature_columns = None
        self.feature_names = None
        self.model = None

    def fit(self, X, y):
        X_ = X.copy()
        self.dense_features = list(X_.columns.difference(self.cat_features))

        logger.debug("MinMaxScaler")
        self.min_max_scaler.fit(X_[self.dense_features])
        X_[self.dense_features] = self.min_max_scaler.transform(
            X_[self.dense_features])

        self._column_mapping(X_)
        X_.columns = [self.columns_mapping[col] for col in X_.columns]

        self.fixlen_feature_columns = [
            SparseFeat(
                self.columns_mapping[feat],
                vocabulary_size=X_[self.columns_mapping[feat]].max() + 1,
                embedding_dim=4,
            ) for i, feat in enumerate(self.cat_features)
        ] + [
            DenseFeat(
                self.columns_mapping[feat],
                1,
            ) for feat in self.dense_features
        ]
        self.feature_names = get_feature_names(self.fixlen_feature_columns)

        logger.debug("Compile DeepFM model")
        self.model = DeepFM(self.fixlen_feature_columns,
                            self.fixlen_feature_columns,
                            task="binary")
        self.model.compile(
            "adam",
            "binary_crossentropy",
            metrics=["binary_crossentropy"],
        )

        logger.debug("Fit DeepFM")
        train_model_input = {
            name: X_[name].values
            for name in self.feature_names
        }
        self.model.fit(
            train_model_input,
            y,
            batch_size=256,
            epochs=3,
            verbose=2,
            validation_split=0.2,
        )

    def predict_proba(self, X):
        X_ = X.copy()
        X_[self.dense_features] = self.min_max_scaler.transform(
            X_[self.dense_features])
        X_.columns = [self.columns_mapping[col] for col in X_.columns]
        model_input = {name: X_[name].values for name in self.feature_names}
        pred = self.inference(model_input)
        pred = pred[:, 0].numpy()
        return pred

    def _column_mapping(self, X):
        symbols = (
            "абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ",
            "abvgdeejzijklmnoprstufhzcss_y_euaABVGDEEJZIJKLMNOPRSTUFHZCSS_Y_EUA",
        )

        tr = {ord(a): ord(b) for a, b in zip(*symbols)}

        self.columns_mapping = dict(
            zip(
                X.columns,
                [
                    col.translate(tr).replace(" ", "_").replace("$", "dollar")
                    for col in X.columns
                ],
            ))

    @tf.function()
    def inference(self, test_model_input):
        return self.model(test_model_input)

    def save_model(self):
        self.model.save_weights("backend/data/DeepFM_w.h5")
        with open("backend/data/DeepFM_data.pkl", "wb") as f_out:
            pickle.dump(
                (
                    self.columns_mapping,
                    self.min_max_scaler,
                    self.dense_features,
                    self.fixlen_feature_columns,
                    self.feature_names,
                ),
                f_out,
            )

    def load_model(self):
        with open("data/DeepFM_data.pkl", "rb") as f_in:
            (
                self.columns_mapping,
                self.min_max_scaler,
                self.dense_features,
                self.fixlen_feature_columns,
                self.feature_names,
            ) = pickle.load(f_in)
        self.model = DeepFM(self.fixlen_feature_columns,
                            self.fixlen_feature_columns,
                            task="binary")
        self.model.compile(
            "adam",
            "binary_crossentropy",
            metrics=["binary_crossentropy"],
        )
        self.model.load_weights("data/DeepFM_w.h5")
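
A hypothetical round trip with this helper (X_train, y_train, and X_valid are placeholder names):

helper = DeepFMHelper()
helper.fit(X_train, y_train)           # X_train: DataFrame, y_train: 0/1 labels
proba = helper.predict_proba(X_valid)  # probabilities for the positive class
helper.save_model()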
Example #15
        model.compile("adam",
                      "binary_crossentropy",
                      metrics=['binary_crossentropy'])

        filepath = 'model_save/deep_fm_sample-ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}.h5'
        checkpoint = ModelCheckpoint(filepath,
                                     monitor='val_loss',
                                     verbose=1,
                                     save_best_only=True,
                                     mode='min')

        history = model.fit(
            model_input,
            label,
            callbacks=[checkpoint],
            batch_size=batch_size,
            epochs=50,
            verbose=1,
            validation_split=0.2,
        )

    elif mode == 'test':
        model = DeepFM({
            "sparse": sparse_feature_dim,
            "dense": []
        },
                       final_activation='sigmoid')
        model.load_weights(
            'model_save/deep_fm_sample-ep002-loss0.175-val_loss0.171.h5')

        # model = load_model('model_save/deep_fm_sample-ep001-loss0.192-val_loss0.176.h5')
Example #16
               dnn_dropout=0.5,
               dnn_activation='relu',
               dnn_use_bn=True,
               task='binary')
try:
    model.load_weights(checkpoint_path)
    print('load weights')
except Exception:
    # Fall back to training from scratch if no checkpoint is available.
    pass
model.compile(optimizer="adam",
              loss="binary_crossentropy",
              metrics=['accuracy', 'AUC'])
history = model.fit(train_model_input,
                    train[target],
                    batch_size=8192,
                    epochs=5,
                    verbose=1,
                    shuffle=True,
                    callbacks=[cp_callback],
                    validation_data=(val_model_input, val[target]))

data['predict'] = 0
data.loc[train_index, 'predict'] = model.predict(train_model_input,
                                                 batch_size=8192)
data.loc[val_index, 'predict'] = model.predict(val_model_input,
                                               batch_size=8192)
data.loc[test_index, 'predict'] = model.predict(test_model_input,
                                                batch_size=8192)

p = 88.5
pred_val = data.loc[val_index, 'predict']
print("val LogLoss", round(log_loss(val[target], pred_val), 4))
Example #17
    linear_feature_columns = fixlen_feature_columns

    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

    # 3. Generate input data for the model

    train, valid = train_test_split(data, test_size=0.2, random_state=10)

    train_model_input = {name: train[name] for name in feature_names}
    valid_model_input = {name: valid[name] for name in feature_names}
    test_model_input = {name: test[name] for name in feature_names}
    
    # 4. Define the model, then train, predict, and evaluate
    # dnn_hidden_units sets the number of hidden layers and units per layer
    model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary',
                   dnn_hidden_units=[100, 100], dnn_dropout=0.2)
    model.compile("adam", "binary_crossentropy",
                  metrics=['binary_crossentropy'])

    history = model.fit(train_model_input, train[target].values,
                        batch_size=256, epochs=2, verbose=2,
                        validation_data=(valid_model_input, valid['target']))
    pred_ans = model.predict(valid_model_input, batch_size=256)
    print("valid LogLoss", round(log_loss(valid[target].values, pred_ans), 4))
    print("valid AUC", round(roc_auc_score(valid[target].values, pred_ans), 4))
    # Predict on the test set and write the results to a CSV file
    result = model.predict(test_model_input, batch_size=256)

    result = pd.DataFrame(result, columns=['label'])
    submit = pd.DataFrame(test['ID'], columns=['ID'])
    submit = submit.join(result)
    submit.to_csv(sys.path[0] + "\\tem\\" + "result" + '.csv', index=False)
Example #18
               use_fm=False,
               dnn_hidden_units=(128, 128),
               dnn_dropout=0)
# model = DCN(dnn_feature_columns, embedding_size=8)
model.compile(
    Adam(learning_rate=0.005),  # 'lr' is a deprecated alias in TF 2.x
    "binary_crossentropy",
    metrics=['binary_crossentropy'],
)
#es = EarlyStopping(monitor='val_binary_crossentropy')
history = model.fit(train_model_input,
                    train[target].values,
                    validation_split=0.3,
                    callbacks=[
                        EarlyStopping(monitor='val_loss',
                                      patience=3,
                                      verbose=0,
                                      mode='auto')
                    ],
                    batch_size=4096,
                    epochs=10,
                    verbose=1)

pred_ans = model.predict(test_model_input, batch_size=2**14)
pred_finish = (pred_ans * 2).astype(int)

print('Region ID={}'.format(RIGIONID))
print("test accuracy",
      round(accuracy_score(test[target].values, pred_finish), 4))
print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4))
print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))
Example #19
model.compile(
    optimizer=tf.keras.optimizers.Adam(3e-4),
    loss="binary_crossentropy",
    #     loss=multi_category_focal_loss2(alpha=0.1),
    metrics=[auroc],
)

dirpath = Path('checkpoint')
if dirpath.exists() and dirpath.is_dir():
    shutil.rmtree(dirpath)
os.mkdir('checkpoint')

hist = model.fit(train_and_val_model_input,
                 y_train_val,
                 batch_size=BATCH_SIZE,
                 epochs=EPOCHS,
                 verbose=2,
                 validation_split=0.1,
                 callbacks=get_callbacks())

# In[ ]:

best_epoch = np.argmax(hist.history["val_auroc"]) + 1
model.load_weights('checkpoint/epoch_{:02d}.hdf5'.format(best_epoch))
print(hist.history["val_auroc"])
print('loading epoch_{:02d}.hdf5'.format(best_epoch))

pred_ans = model.predict(test_model_input, verbose=1, batch_size=BATCH_SIZE)

pred_ans = pred_ans.flatten()
ans = pd.DataFrame(
Example #20
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

train_model_input = {name: train[name] for name in sparse_features}
test_model_input = {name: test[name] for name in sparse_features}

#history = LossHistory()
model = DeepFM(linear_feature_columns,
               dnn_feature_columns,
               task='binary',
               dnn_hidden_units=(200, 80))
model.compile(
    "adam",
    "binary_crossentropy",
    metrics=['binary_crossentropy'],
)

for x in range(5):
    history = model.fit(train_model_input,
                        train[target].values,
                        batch_size=256,
                        epochs=1,
                        verbose=1)
    pred_ans = model.predict(test_model_input, batch_size=256)
    print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4))
    print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))
    print("test RMSE", round(rmse(test[target].values, pred_ans), 4))
    print("test RIG", round(rig(test[target].values, pred_ans)[0], 4))

    model.save("./trained_data/normal_model_" + str(x))
    with open('./trained_data/normal_history_' + str(x), 'wb') as file_pi:
        pickle.dump(history.history, file_pi)
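
rmse and rig are project helpers; hedged sketches under the usual definitions (RIG taken as relative information gain over the base-rate entropy, returned as a 1-tuple to match rig(...)[0]; assumes both classes are present):

from sklearn.metrics import log_loss

def rmse(y_true, y_pred):
    return np.sqrt(np.mean((np.ravel(y_true) - np.ravel(y_pred)) ** 2))

def rig(y_true, y_pred):
    p = np.mean(y_true)
    base_entropy = -(p * np.log(p) + (1 - p) * np.log(1 - p))
    return (1 - log_loss(y_true, y_pred) / base_entropy,)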
Example #21
            fixlen_feature_names,
            len(labels["valid"]),
            mode="train",
        )
        for file_count, data_batch in enumerate(train_generator):
            print("epoch", epoch, "filecount", file_count)
            train_model_input, train_model_labels = data_batch

            X_shuffled, Y_shuffled = shuffle_batch(
                train_model_input,
                train_model_labels)  # using AutoInt's convention

            history = model.fit(
                X_shuffled,
                Y_shuffled,
                batch_size=batch_size,
                epochs=1,
                verbose=1,
                callbacks=callbacks,
            )

            history_epoch[epoch][file_count] = [
                history.history, history.params
            ]

            if epoch < epochs_skip_es:
                continue

            valid_generator = get_data_generator(
                base_path,
                cross_path,
                fixlen_feature_names,
Example #22
class DeepModel:
    def __init__(self, model_name, model_architecture="DeepFM"):
        self.model_name = model_name
        self.model_architecture = model_architecture

        self.model = None
        self.history = None
        self.data = None
        self.callbacks = []

    # requires tf2
    # def set_notebook_mode(self):
    #    progress_bar_cb = tfa.callbacks.TQDMProgressBar() #TQDMNotebookCallback(leave_inner=True, leave_outer=True)
    #    self.callbacks.append(progress_bar_cb)

    def prepare_data(self,
                     data_source,
                     sparse_features,
                     target,
                     test_size=0.1):
        self.data = Data(sparse_features,
                         target,
                         data_format="deepctr",
                         test_size=test_size)
        self.data.ingest(data_source)
        self.data.prepare()

    def build(self, task):
        assert task in ['regression', 'binary']
        if self.model_architecture == "DeepFM":
            self.model = DeepFM(
                self.data.linear_feature_columns,
                self.data.dnn_feature_columns,
                task=task,
            )
        else:
            raise NotImplementedError(
                'At the current stage of the development, only a DeepFM is supported'
            )

        task_attr = {
            'regression': {
                'loss': 'mse',
                'metrics': 'mse'
            },
            'binary': {
                'loss': 'binary_crossentropy',
                'metrics': 'accuracy'
            }
        }
        if task == "regression":
            loss = "mse"
            metrics = "mse"
        elif task == "binary":
            loss = "binary_crossentropy"
            metrics = "accuracy"

        self.model.compile(optimizer="adam",
                           loss=task_attr[task]['loss'],
                           metrics=task_attr[task]['metrics'])

    def train(self, batch_size=256, epochs=10, validation_split=0.1):
        #class_weights = class_weight.compute_class_weight(
        #    "balanced", np.unique(self.data.y_train[:, 0]), self.data.y_train[:, 0]
        #)
        self.history = self.model.fit(
            self.data.X_train,
            self.data.y_train,
            batch_size=batch_size,
            epochs=epochs,
            validation_split=validation_split,
            verbose=2,
            #class_weight=class_weights,
            callbacks=self.callbacks,
        )

    def evaluate(self):
        self.model.evaluate(self.data.X_test,
                            self.data.y_test,
                            batch_size=4096)

    def prepare_input(self, df):
        df = df.copy()
        for feat in self.data.sparse_features:
            lbe = self.data.encoders[feat]
            df[feat] = lbe.transform(df[feat])

        X = {name: df[name].values for name in self.data.feature_names}
        return X

    def predict(self, X, batch_size=256):
        return self.model.predict(X, batch_size=batch_size)
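
Hypothetical usage of DeepModel (the file path and feature names are placeholders):

dm = DeepModel("movielens_dfm")
dm.prepare_data("./movielens_sample.txt",
                sparse_features=["movie_id", "user_id", "gender"],
                target=["rating"])
dm.build(task="regression")
dm.train(epochs=5)
dm.evaluate()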
Example #23
    linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
    dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns

    feature_names = get_feature_names(linear_feature_columns +
                                      dnn_feature_columns)

    # 3. Generate input data for the model
    model_input = {name: data[name] for name in sparse_features}
    model_input["genres"] = genres_list
    model_input["genres_weight"] = np.random.randn(data.shape[0], max_len, 1)

    # 4. Define the model, compile, and train
    model = DeepFM(linear_feature_columns,
                   dnn_feature_columns,
                   task='regression')

    model.compile(
        "adam",
        "mse",
        metrics=['mse'],
    )

    tensorboard_callback = tf.keras.callbacks.TensorBoard()
    history = model.fit(model_input,
                        data[target].values,
                        batch_size=256,
                        epochs=10,
                        verbose=2,
                        validation_split=0.2,
                        callbacks=[tensorboard_callback])
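
The genres/genres_weight inputs imply a weighted variable-length feature. A hedged sketch of the column definition this snippet presupposes, written against the current DeepCTR API where VarLenSparseFeat wraps a SparseFeat (key2index and max_len are assumed from the usual movielens preprocessing):

varlen_feature_columns = [VarLenSparseFeat(
    SparseFeat('genres', vocabulary_size=len(key2index) + 1, embedding_dim=4),
    maxlen=max_len, combiner='mean', weight_name='genres_weight')]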
Example #24
model.compile(
    optimizer=tf.keras.optimizers.Adam(3e-4),
    # loss="binary_crossentropy",
    loss=multi_category_focal_loss2(alpha=0.1),
    metrics=[auroc],
)

dirpath = Path('checkpoint')
if dirpath.exists() and dirpath.is_dir():
    shutil.rmtree(dirpath)
os.mkdir('checkpoint')

hist = model.fit(online_train_model_input,
                 train_df['label'].values,
                 batch_size=BATCH_SIZE,
                 epochs=EPOCHS,
                 verbose=2,
                 validation_split=0.1,
                 shuffle=True,
                 callbacks=get_callbacks())

# In[ ]:

best_epoch = np.argmax(hist.history["val_auroc"]) + 1
model.load_weights('checkpoint/epoch_{:02d}.hdf5'.format(best_epoch))
print(hist.history["val_auroc"])
print('loading epoch_{:02d}.hdf5'.format(best_epoch))

y_pre = model.predict(online_test_model_input,
                      verbose=1,
                      batch_size=BATCH_SIZE)
res = pd.DataFrame()
Example #25
from deepctr.models import DeepFM

if __name__ == "__main__":

    data = pd.read_csv("./movielens_sample.txt")
    sparse_features = ["movie_id", "user_id",
                       "gender", "age", "occupation", "zip"]
    target = ['rating']

    # 1. Label-encode the sparse features and apply simple transformations to the dense features
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
    # 2. Count the number of unique values for each sparse field
    sparse_feature_dim = {feat: data[feat].nunique()
                          for feat in sparse_features}
    # 3. Generate input data for the model
    train, test = train_test_split(data, test_size=0.2)
    train_model_input = [train[feat].values for feat in sparse_feature_dim]
    test_model_input = [test[feat].values for feat in sparse_feature_dim]
    # 4. Define the model, then train, predict, and evaluate
    model = DeepFM({"sparse": sparse_feature_dim, "dense": []},
                   final_activation='linear')
    model.compile("adam", "mse", metrics=['mse'],)

    history = model.fit(train_model_input, train[target].values,
                        batch_size=256, epochs=1, verbose=2, validation_split=0.2,)
    pred_ans = model.predict(test_model_input, batch_size=256)
    print("test MSE", round(mean_squared_error(
        test[target].values, pred_ans), 4))
Example #26
model.compile(optimizer=opt, loss=tf.losses.BinaryCrossentropy(),
              metrics=[tf.keras.metrics.AUC()])

log_dir="logs"+ os.path.sep + NNconfig_dic["model_name"] + "_res" + os.path.sep \
              + datetime.now().strftime("%Y%m%d-%H%M%S")
NN_config_path = "logs" + os.path.sep + NNconfig_dic["model_name"] + "_res" + os.path.sep \
              + datetime.now().strftime("%Y%m%d-%H%M%S") + "_" + "NNconfig.json"


# if worker_index == 0:
#     if not os.path.exists("logs" + os.path.sep + NNconfig_dic["model_name"] + "_res"):
#         os.makedirs("logs" + os.path.sep + NNconfig_dic["model_name"] + "_res")
#     with open(NN_config_path, "w+") as conf:
#         json.dump(NNconfig_dic, conf)

callbacks = [tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1, profile_batch = 3),]


# %%

# TODO: edit epoch etc.
model.fit(D_train, epochs=epochs, verbose=1 if worker_index == 0 else 0, validation_data=D_valid,
                    steps_per_epoch=max(len_train // batchsize + 1, num_workers) // num_workers , 
                    validation_steps=max(len_valid // batchsize + 1, num_workers) // num_workers,
                    callbacks = callbacks)

# %%
if worker_index == 0:
    save_path = '/models/save/CTR/1'
    model.save(save_path)
Example #27
fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=train_data[feat].nunique(),
                                     embedding_dim=size_dict[field_info[feat]],
                                     dtype='int32', group_name=field_info[feat])
                          for feat in sparse_features] + \
                         [DenseFeat(feat, 1) for feat in dense_features]

dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns

feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

train, test = train_test_split(train_data, test_size=0.2, random_state=2020)
train_model_input = {name: train[name] for name in feature_names}
test_model_input = {name: test[name] for name in feature_names}

model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary', dnn_hidden_units=(2, 256), dnn_dropout=0.0)
opt = keras.optimizers.Adam(learning_rate=0.001)
model.compile(loss="binary_crossentropy", metrics=['binary_crossentropy'], optimizer=opt)

history = model.fit(train_model_input, train[target].values, batch_size=64, epochs=10, verbose=1, validation_split=0.2, class_weight={0:1, 1:3})
pred_ans = model.predict(test_model_input, batch_size=64)
print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4))
print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))

ans = pd.Series(pred_ans.reshape((-1,)))
ans[ans >= 0.5] = 1
ans[ans < 0.5] = 0

pd.Series(test[target].transportation_issues).value_counts()

ans.value_counts()

print(classification_report(test[target], ans))

model.summary()
Example #28
    sparse_feature_dim = {feat: data[feat].nunique()
                          for feat in sparse_features}
    # 3. Generate input data for the model
    model_input = [data[feat].values for feat in sparse_feature_dim]

    if mode == 'train':
        # 4. Define the model, compile, and train
        model = DeepFM({"sparse": sparse_feature_dim, "dense": []},
                       final_activation='sigmoid')

        model.compile("adam", "binary_crossentropy", metrics=['binary_crossentropy'])

        filepath = 'model_save/deep_fm_sample-ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}.h5'
        checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

        history = model.fit(model_input, data[target].values, callbacks=[checkpoint],
                            batch_size=batch_size, epochs=50, verbose=1, validation_split=0.2,)

    elif mode == 'test':
        model = DeepFM({"sparse": sparse_feature_dim, "dense": []},
                       final_activation='sigmoid')
        model.load_weights('model_save/deep_fm_sample-ep001-loss0.184-val_loss0.172.h5')

        # model = load_model('model_save/deep_fm_sample-ep001-loss0.192-val_loss0.176.h5')

        data = pd.read_csv("./data/sample/validation.txt")

        # 1. Label-encode the sparse features and apply simple transformations to the dense features
        for feat in sparse_features:
            lbe = LabelEncoder()
            data[feat] = lbe.fit_transform(data[feat])
        # 2. Count the number of unique values for each sparse field
Example #29
                                    sparse_feature_names=sparse_feature_names,
                                    label=label,
                                    mode=tf.estimator.ModeKeys.TRAIN)
    val_input_fn = tfrecord_to_fn(os.path.join(save_dir, 'val_tfrecord'),
                                  dense_feature_names=dense_feature_names,
                                  sparse_feature_names=sparse_feature_names,
                                  label=label,
                                  mode=tf.estimator.ModeKeys.EVAL)
    pred_input_fn = tfrecord_to_fn(os.path.join(save_dir, 'test_tfrecord'),
                                   dense_feature_names=dense_feature_names,
                                   sparse_feature_names=sparse_feature_names,
                                   label=None,
                                   mode=tf.estimator.ModeKeys.PREDICT)
    train_datasets = train_input_fn()
    val_datasets = val_input_fn()
    pred_datasets = pred_input_fn()

    tensorboard_callback = tf.keras.callbacks.TensorBoard()
    earlystop_callback = tf.keras.callbacks.EarlyStopping('val_loss',
                                                          patience=10)
    # Version caveat: validation_data can only be used in eager mode
    history = model.fit(train_datasets,
                        epochs=1000,
                        verbose=1,
                        validation_data=val_datasets,
                        callbacks=[tensorboard_callback, earlystop_callback])
    #
    print('train finish')
    # res = model.predict(pred_datasets)
    # print(res)
Example #30
                                      dnn_feature_columns)

    # 3. Generate input data for the model

    train, test = train_test_split(data, test_size=0.2)

    train_model_input = {name: train[name] for name in feature_names}
    test_model_input = {name: test[name] for name in feature_names}

    # 4. Define the model, then train, predict, and evaluate
    model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary')
    model = multi_gpu_model(model, gpus=2)

    model.compile(
        "adam",
        "binary_crossentropy",
        metrics=['binary_crossentropy'],
    )

    history = model.fit(
        train_model_input,
        train[target].values,
        batch_size=256,
        epochs=10,
        verbose=2,
        validation_split=0.2,
    )
    pred_ans = model.predict(test_model_input, batch_size=256)
    print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4))
    print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))
Example #31
def test_DFM_avazu(data, train, test):
    print("\nTesting DFM on avazu dataset...\n")

    results_activation_function = {"auc": [], "logloss": [], "rmse": []}
    results_dropout = {"auc": [], "logloss": [], "rmse": []}
    results_number_of_neurons = {"auc": [], "logloss": [], "rmse": []}

    auc = 0
    logloss = 0
    rmse = 0

    features_labels = train.columns

    sparse_features_labels = features_labels[1:23]
    target_label = features_labels[0]

    dnn_feature_columns = [
        SparseFeat(
            feat,
            vocabulary_size=data[feat].nunique(),
            embedding_dim=4,
        ) for feat in sparse_features_labels
    ]
    linear_feature_columns = [
        SparseFeat(
            feat,
            vocabulary_size=data[feat].nunique(),
            embedding_dim=4,
        ) for feat in sparse_features_labels
    ]

    feature_names = get_feature_names(linear_feature_columns +
                                      dnn_feature_columns)

    train_model_input = {name: train[name] for name in feature_names}
    test_model_input = {name: test[name] for name in feature_names}

    true_y = test[target_label].values

    print("\t\t-- ACTIVATION FUNCTIONS --\t\t")
    for dnn_activation in dnn_activation_list:
        print("\nTesting {dnn_activation}...".format(
            dnn_activation=dnn_activation))

        model = DeepFM(linear_feature_columns,
                       dnn_feature_columns,
                       dnn_activation=dnn_activation,
                       task='binary')
        model.compile(
            "adam",
            "binary_crossentropy",
            metrics=['binary_crossentropy'],
        )
        model.fit(
            train_model_input,
            train[target_label].values,
            batch_size=256,
            epochs=10,
            verbose=0,
            validation_split=TEST_PROPORTION,
        )
        pred_y = model.predict(test_model_input, batch_size=256)

        auc = compute_auc(true_y, pred_y)
        logloss = compute_log_loss(true_y, pred_y)
        rmse = compute_rmse(true_y, pred_y)

        results_activation_function["auc"].append(auc)
        results_activation_function["logloss"].append(logloss)
        results_activation_function["rmse"].append(rmse)

    print("\t\t-- DROPOUT RATES --\t\t")
    for dnn_dropout in dnn_dropout_list:
        print("\nTesting {dnn_dropout}...".format(dnn_dropout=dnn_dropout))

        model = DeepFM(linear_feature_columns,
                       dnn_feature_columns,
                       dnn_dropout=dnn_dropout,
                       task='binary')
        model.compile(
            "adam",
            "binary_crossentropy",
            metrics=['binary_crossentropy'],
        )
        model.fit(
            train_model_input,
            train[target_label].values,
            batch_size=256,
            epochs=10,
            verbose=0,
            validation_split=TEST_PROPORTION,
        )
        pred_y = model.predict(test_model_input, batch_size=256)

        auc = compute_auc(true_y, pred_y)
        logloss = compute_log_loss(true_y, pred_y)
        rmse = compute_rmse(true_y, pred_y)

        results_dropout["auc"].append(auc)
        results_dropout["logloss"].append(logloss)
        results_dropout["rmse"].append(rmse)

    print("\t\t-- HIDDEN UNITS --\t\t")
    for dnn_hidden_units in dnn_hidden_units_list:
        print("\nTesting {dnn_hidden_units}...".format(
            dnn_hidden_units=dnn_hidden_units))

        model = DeepFM(linear_feature_columns,
                       dnn_feature_columns,
                       dnn_hidden_units=dnn_hidden_units,
                       task='binary')
        model.compile(
            "adam",
            "binary_crossentropy",
            metrics=['binary_crossentropy'],
        )
        model.fit(
            train_model_input,
            train[target_label].values,
            batch_size=256,
            epochs=10,
            verbose=0,
            validation_split=TEST_PROPORTION,
        )
        pred_y = model.predict(test_model_input, batch_size=256)

        auc = compute_auc(true_y, pred_y)
        logloss = compute_log_loss(true_y, pred_y)
        rmse = compute_rmse(true_y, pred_y)

        results_number_of_neurons["auc"].append(auc)
        results_number_of_neurons["logloss"].append(logloss)
        results_number_of_neurons["rmse"].append(rmse)

    if PLOT:
        create_plots("DFM", "avazu", results_activation_function,
                     "Activation Function", "activation_func",
                     dnn_activation_list)
        create_plots("DFM", "avazu", results_dropout, "Dropout Rate",
                     "dropout", dnn_dropout_list)
        create_plots("DFM", "avazu", results_number_of_neurons,
                     "Number of Neurons per layer", "nr_neurons",
                     dnn_hidden_units_list)
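
compute_auc, compute_log_loss, and compute_rmse are project helpers; hedged sketches of the obvious implementations:

import numpy as np
from sklearn.metrics import roc_auc_score, log_loss, mean_squared_error

def compute_auc(y_true, y_pred):
    return roc_auc_score(y_true, y_pred)

def compute_log_loss(y_true, y_pred):
    return log_loss(y_true, y_pred)

def compute_rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))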