コード例 #1
0
ファイル: DeepFM_test.py プロジェクト: nwf5d/DeepCTR
def test_DeepFM(use_fm, hidden_size, sparse_feature_num):
    """Smoke-test DeepFM on random data.

    Builds ``sparse_feature_num`` sparse features (each with a random
    dimension drawn from ``np.random.randint(1, 10)``) and the same number
    of dense features, fits the model for one epoch, then round-trips both
    the weights and the whole model through ``.h5`` files.
    """
    model_name = "DeepFM"
    sample_size = 64

    # Sparse feature dimensions are drawn first so the random-call order
    # matches the feature order.
    sparse_dims = {}
    for idx in range(sparse_feature_num):
        sparse_dims['sparse' + '_' + str(idx)] = np.random.randint(1, 10)
    dense_names = ['dense' + '_' + str(idx)
                   for idx in range(sparse_feature_num)]
    feature_dim_dict = {"sparse": sparse_dims, 'dense': dense_names}

    # One random column of length sample_size per feature, plus binary labels.
    sparse_input = []
    for dim in sparse_dims.values():
        sparse_input.append(np.random.randint(0, dim, sample_size))
    dense_input = [np.random.random(sample_size) for _ in dense_names]
    y = np.random.randint(0, 2, sample_size)
    x = sparse_input + dense_input

    model = DeepFM(feature_dim_dict, use_fm=use_fm,
                   hidden_size=hidden_size, keep_prob=0.5)
    model.compile('adam', 'binary_crossentropy',
                  metrics=['binary_crossentropy'])
    model.fit(x, y, batch_size=100, epochs=1, validation_split=0.5)
    print(model_name + " test train valid pass!")

    # Weights-only round trip.
    weights_file = model_name + '_weights.h5'
    model.save_weights(weights_file)
    model.load_weights(weights_file)
    print(model_name + " test save load weight pass!")

    # Full-model round trip.
    model_file = model_name + '.h5'
    save_model(model, model_file)
    model = load_model(model_file, custom_objects)
    print(model_name + " test save load model pass!")

    print(model_name + " test pass!")
コード例 #2
0
def test_DeepFM(use_fm, hidden_size):
    """Smoke-test DeepFM with three fixed sparse and three dense features.

    Fits the model for one epoch on random data, then round-trips both the
    weights and the whole model through ``.h5`` files.
    """
    model_name = "DeepFM"
    sample_size = 64

    sparse_dims = {'sparse_1': 2, 'sparse_2': 5, 'sparse_3': 10}
    dense_names = ['dense_1', 'dense_2', 'dense_3']
    feature_dim_dict = {'sparse': sparse_dims, 'dense': dense_names}

    # One random column of length sample_size per feature, plus binary labels.
    sparse_input = []
    for dim in sparse_dims.values():
        sparse_input.append(np.random.randint(0, dim, sample_size))
    dense_input = [np.random.random(sample_size) for _ in dense_names]
    y = np.random.randint(0, 2, sample_size)
    x = sparse_input + dense_input

    model = DeepFM(feature_dim_dict, use_fm=use_fm,
                   hidden_size=hidden_size, keep_prob=0.5)
    model.compile('adam', 'binary_crossentropy',
                  metrics=['binary_crossentropy'])
    model.fit(x, y, batch_size=100, epochs=1, validation_split=0.5)
    print(model_name + " test train valid pass!")

    # Weights-only round trip.
    weights_file = model_name + '_weights.h5'
    model.save_weights(weights_file)
    model.load_weights(weights_file)
    print(model_name + " test save load weight pass!")

    # Full-model round trip.
    model_file = model_name + '.h5'
    save_model(model, model_file)
    model = load_model(model_file, custom_objects)
    print(model_name + " test save load model pass!")

    print(model_name + " test pass!")
コード例 #3
0
# Compilation failed with embedding_size=8 and use_fm=True, so those
# arguments have been temporarily removed.
model = DeepFM(linear_feature_columns,
               dnn_feature_columns,
               fm_group=fixlen_feature_columns,
               dnn_hidden_units=(256, 256, 256),
               l2_reg_linear=0.001,
               l2_reg_embedding=0.001,
               l2_reg_dnn=0,
               init_std=0.0001,
               seed=1024,
               dnn_dropout=0.5,
               dnn_activation='relu',
               dnn_use_bn=True,
               task='binary')
# Best-effort resume: try to load weights from checkpoint_path and fall back
# to training from the freshly initialised model when that fails. Catching
# Exception (rather than a bare except) lets KeyboardInterrupt/SystemExit
# propagate, and the failure reason is reported instead of silently dropped.
try:
    model.load_weights(checkpoint_path)
except Exception as exc:
    print('could not load weights from {}: {}'.format(checkpoint_path, exc))
else:
    print('load weights')
model.compile(optimizer="adam",
              loss="binary_crossentropy",
              metrics=['accuracy', 'AUC'])
# Train for 5 epochs, validating on the held-out split after each epoch;
# cp_callback is invoked by Keras during training.
history = model.fit(train_model_input,
                    train[target],
                    batch_size=8192,
                    epochs=5,
                    verbose=1,
                    shuffle=True,
                    callbacks=[cp_callback],
                    validation_data=(val_model_input, val[target]))
コード例 #4
0
        # 4.Define Model,compile and train
        model = DeepFM({"sparse": sparse_feature_dim, "dense": []},
                       final_activation='sigmoid')

        model.compile("adam", "binary_crossentropy", metrics=['binary_crossentropy'])

        filepath = 'model_save/deep_fm_sample-ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}.h5'
        checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

        history = model.fit(model_input, data[target].values, callbacks=[checkpoint],
                            batch_size=batch_size, epochs=50, verbose=1, validation_split=0.2,)

    elif mode == 'test':
        model = DeepFM({"sparse": sparse_feature_dim, "dense": []},
                       final_activation='sigmoid')
        model.load_weights('model_save/deep_fm_sample-ep001-loss0.184-val_loss0.172.h5')

        # model = load_model('model_save/deep_fm_sample-ep001-loss0.192-val_loss0.176.h5')

        data = pd.read_csv("./data/sample/validation.txt")

        # 1.Label Encoding for sparse features,and do simple Transformation for dense features
        for feat in sparse_features:
            lbe = LabelEncoder()
            data[feat] = lbe.fit_transform(data[feat])
        # 2.count #unique features for each sparse field
        sparse_feature_dim = {feat: data[feat].nunique()
                              for feat in sparse_features}
        # 3.generate input data for model
        model_input = [data[feat].values for feat in sparse_feature_dim]
コード例 #5
0
            epochs=50,
            verbose=1,
            validation_data=generate_arrays_from_file(
                './data/feature_mapped_combined_valid.data',
                batch_size=batch_size),
            validation_steps=int(np.ceil(num_valid / batch_size)))

    elif mode == 'test':
        # model.load_weights('model_save/deep_fm_fn-ep002-loss0.148-val_loss0.174.h5')  # auc: 0.718467 batch_size=6000
        #model.load_weights('model_save/deep_fm_fn-ep001-loss0.149-val_loss0.175.h5')  # auc: 0.714243  batch_size = 2048
        # model.load_weights('model_save/deep_fm_fn-ep005-loss0.147-val_loss0.173.h5')  # auc: 0.722535  batch_size = 10000
        # model.load_weights('model_save/deep_fm_fn_bs10000-ep001-loss0.155-val_loss0.153.h5')  # auc: 0.738023
        #model.load_weights('model_save/deep_fm_fn_bs15000-ep001-loss0.156-val_loss0.152.h5')  # auc: 0.739935
        #model.load_weights('model_save/deep_fm_fn-ep002-loss0.154-val_loss0.154-bs15000-ee20-hz[128, 128].h5')  # auc: 0.741590
        model.load_weights(
            'model_save/deep_fm_fn-ep020-loss0.153-val_loss0.153-bs15000-ee20-hz[5, 600].h5'
        )  # auc: 0.742558

        labels = []
        preds = []

        reader = pd.read_csv("./data/feature_mapped_combined_valid.data",
                             header=None,
                             chunksize=chunk_size)

        for df in reader:
            print('df size: %d' % df.shape[0])
            df = shuffle(df)
            cnt = 0
            while cnt < df.shape[0]:
                end = cnt + batch_size
コード例 #6
0
    shutil.rmtree(dirpath)
os.mkdir('checkpoint')

# Train on the online training set, holding out 10% for validation.
train_labels = train_df['label'].values
hist = model.fit(
    online_train_model_input,
    train_labels,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    verbose=2,
    validation_split=0.1,
    shuffle=True,
    callbacks=get_callbacks(),
)

# In[ ]:

# Pick the epoch with the highest validation AUROC (epochs are 1-based in
# the checkpoint file names) and restore its weights.
best_epoch = np.argmax(hist.history["val_auroc"]) + 1
best_checkpoint_name = 'epoch_{:02d}.hdf5'.format(best_epoch)
model.load_weights('checkpoint/' + best_checkpoint_name)
print(hist.history["val_auroc"])
print('loading ' + best_checkpoint_name)

# Score the online test set and write the submission file.
y_pre = model.predict(
    online_test_model_input,
    verbose=1,
    batch_size=BATCH_SIZE,
)
res = pd.DataFrame()
res['id'] = test_id
res['probability'] = y_pre
res.to_csv('submission_DeepFM_fibinet_feature.csv', index=False)

# pred_ans = pred_ans.flatten()
# ans = pd.DataFrame(data={'id': np.array(
#     [i for i in range(1, pred_ans.shape[0]+1)]), 'probability': pred_ans})
# ans.to_csv('submission_DeepFM.csv', index=False, header=True)
コード例 #7
0
ファイル: deepfm.py プロジェクト: marchinho11/travelhack
class DeepFMHelper:
    """Bundle a DeepFM binary classifier with its preprocessing state.

    The helper keeps everything needed to turn a raw feature frame into
    model input: the list of categorical columns, a ``MinMaxScaler`` for the
    remaining (dense) columns, a mapping from original column names to
    ASCII-safe names, and the DeepCTR feature-column definitions.
    """

    def __init__(self):
        self.min_max_scaler = MinMaxScaler(feature_range=(0, 1))
        # Columns treated as categorical (sparse) features; every other
        # column of the frame passed to fit() is treated as dense.
        self.cat_features = [
            "user_Вид тура_last",
            "user_Звездность_last",
            "tour_Страна",
            "tour_Страна тура",
            "user_Тип заявки_last",
        ]
        # The attributes below are populated by fit() or load_model().
        self.dense_features = None
        self.fixlen_feature_columns = None
        self.feature_names = None
        self.columns_mapping = None
        self.model = None

    def fit(self, X, y):
        """Fit the scaler and train a DeepFM model on ``X`` / ``y``.

        ``X`` is copied, so the caller's frame is not modified.
        NOTE(review): categorical columns are presumably already encoded as
        non-negative integers, since the vocabulary size is taken as
        ``max + 1`` -- confirm against the caller.
        """
        X_ = X.copy()
        self.dense_features = list(X_.columns.difference(self.cat_features))

        logger.debug("MinMaxScaler")
        self.min_max_scaler.fit(X_[self.dense_features])
        X_[self.dense_features] = self.min_max_scaler.transform(
            X_[self.dense_features])

        # Rename columns to their ASCII-safe equivalents before they are
        # used as feature names.
        self._column_mapping(X_)
        X_.columns = [self.columns_mapping[col] for col in X_.columns]

        sparse_columns = [
            SparseFeat(
                self.columns_mapping[feat],
                vocabulary_size=X_[self.columns_mapping[feat]].max() + 1,
                embedding_dim=4,
            ) for feat in self.cat_features
        ]
        dense_columns = [
            DenseFeat(
                self.columns_mapping[feat],
                1,
            ) for feat in self.dense_features
        ]
        self.fixlen_feature_columns = sparse_columns + dense_columns
        self.feature_names = get_feature_names(self.fixlen_feature_columns)

        logger.debug("Compile DeepFM model")
        self.model = self._build_model()

        logger.debug("Fit DeepFM")
        train_model_input = {
            name: X_[name].values
            for name in self.feature_names
        }
        self.model.fit(
            train_model_input,
            y,
            batch_size=256,
            epochs=3,
            verbose=2,
            validation_split=0.2,
        )

    def predict_proba(self, X):
        """Return the first output column of the model for ``X`` as a NumPy array.

        ``X`` is copied, scaled with the fitted scaler and renamed with the
        stored column mapping before being passed to the model.
        """
        X_ = X.copy()
        X_[self.dense_features] = self.min_max_scaler.transform(
            X_[self.dense_features])
        X_.columns = [self.columns_mapping[col] for col in X_.columns]
        model_input = {name: X_[name].values for name in self.feature_names}
        pred = self.inference(model_input)
        pred = pred[:, 0].numpy()
        return pred

    def _build_model(self):
        """Create and compile a DeepFM model from the stored feature columns."""
        model = DeepFM(self.fixlen_feature_columns,
                       self.fixlen_feature_columns,
                       task="binary")
        model.compile(
            "adam",
            "binary_crossentropy",
            metrics=["binary_crossentropy"],
        )
        return model

    def _column_mapping(self, X):
        """Store an ASCII-safe name for every column of ``X``.

        Cyrillic letters are transliterated character by character, spaces
        become underscores and ``$`` becomes ``dollar``. The result is kept
        in ``self.columns_mapping`` (original name -> mapped name).
        """
        symbols = (
            "абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ",
            "abvgdeejzijklmnoprstufhzcss_y_euaABVGDEEJZIJKLMNOPRSTUFHZCSS_Y_EUA",
        )

        tr = {ord(a): ord(b) for a, b in zip(*symbols)}

        self.columns_mapping = {
            col: col.translate(tr).replace(" ", "_").replace("$", "dollar")
            for col in X.columns
        }

    @tf.function()
    def inference(self, test_model_input):
        """Call the model on ``test_model_input`` inside a ``tf.function``."""
        return self.model(test_model_input)

    def save_model(self):
        """Write the model weights and the pickled preprocessing state to disk."""
        self.model.save_weights("backend/data/DeepFM_w.h5")
        with open("backend/data/DeepFM_data.pkl", "wb") as f_out:
            pickle.dump(
                (
                    self.columns_mapping,
                    self.min_max_scaler,
                    self.dense_features,
                    self.fixlen_feature_columns,
                    self.feature_names,
                ),
                f_out,
            )

    def load_model(self):
        """Restore the preprocessing state and model weights from disk.

        NOTE(review): save_model() writes under ``backend/data/`` while this
        method reads from ``data/`` -- presumably the two are run from
        different working directories; confirm.
        """
        # SECURITY: pickle.load can execute arbitrary code; only load
        # artifacts produced by a trusted save_model() run.
        with open("data/DeepFM_data.pkl", "rb") as f_in:
            (
                self.columns_mapping,
                self.min_max_scaler,
                self.dense_features,
                self.fixlen_feature_columns,
                self.feature_names,
            ) = pickle.load(f_in)
        self.model = self._build_model()
        self.model.load_weights("data/DeepFM_w.h5")
コード例 #8
0
            model_input,
            label,
            callbacks=[checkpoint],
            batch_size=batch_size,
            epochs=50,
            verbose=1,
            validation_split=0.2,
        )

    elif mode == 'test':
        model = DeepFM({
            "sparse": sparse_feature_dim,
            "dense": []
        },
                       final_activation='sigmoid')
        model.load_weights(
            'model_save/deep_fm_sample-ep002-loss0.175-val_loss0.171.h5')

        # model = load_model('model_save/deep_fm_sample-ep001-loss0.192-val_loss0.176.h5')

        data = pd.read_csv("./data/sample/feature_mapped.data", header=None)
        label = data[0].values
        model_input = [
            data[feat + 1].values for feat in range(len(sparse_feature_dim))
        ]

        pred = model.predict(model_input, batch_size, 1)
        label = label.flatten().tolist()
        pred = pred.flatten().tolist()
        with open('data/pctr', 'w') as fw:
            for i in range(len(pred)):
                if i % 10000 == 0: