def test_DeepFM(use_fm, hidden_size, sparse_feature_num):
    """Smoke-test DeepFM on random data: train, then round-trip weights and model.

    Args:
        use_fm: Forwarded to ``DeepFM`` as ``use_fm``.
        hidden_size: Forwarded to ``DeepFM`` as ``hidden_size``.
        sparse_feature_num: Number of sparse fields to generate; the same
            count is also used for the dense fields.

    Side effects:
        Writes ``DeepFM_weights.h5`` and ``DeepFM.h5`` into the current
        working directory and prints progress messages.
    """
    model_name = "DeepFM"
    sample_size = 64

    # Sparse fields map a generated name to a vocabulary size drawn from
    # [1, 10); dense fields are described by their names only.
    sparse_dims = {
        'sparse_' + str(idx): np.random.randint(1, 10)
        for idx in range(sparse_feature_num)
    }
    dense_names = ['dense_' + str(idx) for idx in range(sparse_feature_num)]
    feature_dim_dict = {"sparse": sparse_dims, 'dense': dense_names}

    # One random column per field, followed by random binary labels.
    sparse_input = [
        np.random.randint(0, vocab_size, sample_size)
        for vocab_size in sparse_dims.values()
    ]
    dense_input = [np.random.random(sample_size) for _ in dense_names]
    y = np.random.randint(0, 2, sample_size)
    x = sparse_input + dense_input

    model = DeepFM(
        feature_dim_dict,
        use_fm=use_fm,
        hidden_size=hidden_size,
        keep_prob=0.5,
    )
    model.compile('adam', 'binary_crossentropy',
                  metrics=['binary_crossentropy'])
    model.fit(x, y, batch_size=100, epochs=1, validation_split=0.5)
    print(model_name+" test train valid pass!")

    # Weights-only round trip.
    weights_file = model_name + '_weights.h5'
    model.save_weights(weights_file)
    model.load_weights(weights_file)
    print(model_name+" test save load weight pass!")

    # Whole-model round trip.
    model_file = model_name + '.h5'
    save_model(model, model_file)
    model = load_model(model_file, custom_objects)
    print(model_name + " test save load model pass!")

    print(model_name + " test pass!")
def test_DeepFM(use_fm, hidden_size):
    """Smoke-test DeepFM on a fixed 3-sparse / 3-dense schema with random data.

    Trains for one epoch, then round-trips the weights and the whole model
    through ``.h5`` files.

    Args:
        use_fm: Forwarded to ``DeepFM`` as ``use_fm``.
        hidden_size: Forwarded to ``DeepFM`` as ``hidden_size``.

    Side effects:
        Writes ``DeepFM_weights.h5`` and ``DeepFM.h5`` into the current
        working directory and prints progress messages.
    """
    name = "DeepFM"
    sample_size = 64
    feature_dim_dict = {
        'sparse': {
            'sparse_1': 2,
            'sparse_2': 5,
            'sparse_3': 10
        },
        'dense': ['dense_1', 'dense_2', 'dense_3']
    }

    sparse_input = [
        np.random.randint(0, dim, sample_size)
        for dim in feature_dim_dict['sparse'].values()
    ]
    # Use a throwaway loop variable: the original comprehension reused
    # `name`, shadowing the local that builds the file names and messages
    # below (and, under Python 2, where list-comprehension variables leak,
    # overwriting it with the last dense feature name).
    dense_input = [
        np.random.random(sample_size) for _ in feature_dim_dict['dense']
    ]
    y = np.random.randint(0, 2, sample_size)
    x = sparse_input + dense_input

    model = DeepFM(
        feature_dim_dict,
        use_fm=use_fm,
        hidden_size=hidden_size,
        keep_prob=0.5,
    )
    model.compile('adam',
                  'binary_crossentropy',
                  metrics=['binary_crossentropy'])
    model.fit(x, y, batch_size=100, epochs=1, validation_split=0.5)
    print(name + " test train valid pass!")

    # Weights-only round trip.
    weights_path = name + '_weights.h5'
    model.save_weights(weights_path)
    model.load_weights(weights_path)
    print(name + " test save load weight pass!")

    # Whole-model round trip.
    model_path = name + '.h5'
    save_model(model, model_path)
    model = load_model(model_path, custom_objects)
    print(name + " test save load model pass!")

    print(name + " test pass!")
# Compilation failed with embedding_size=8 and use_fm=True, so those two
# arguments are temporarily left out of the constructor call.
model = DeepFM(
    linear_feature_columns,
    dnn_feature_columns,
    fm_group=fixlen_feature_columns,
    dnn_hidden_units=(256, 256, 256),
    l2_reg_linear=0.001,
    l2_reg_embedding=0.001,
    l2_reg_dnn=0,
    init_std=0.0001,
    seed=1024,
    dnn_dropout=0.5,
    dnn_activation='relu',
    dnn_use_bn=True,
    task='binary',
)

# Best-effort resume: if the checkpoint cannot be loaded, train from scratch.
# Catch Exception rather than using a bare `except` so that Ctrl-C and
# SystemExit still propagate, and report why loading was skipped instead of
# hiding it.
try:
    model.load_weights(checkpoint_path)
except Exception as err:
    print('weights not loaded: %s' % (err,))
else:
    print('load weights')

model.compile(optimizer="adam",
              loss="binary_crossentropy",
              metrics=['accuracy', 'AUC'])

history = model.fit(
    train_model_input,
    train[target],
    batch_size=8192,
    epochs=5,
    verbose=1,
    shuffle=True,
    callbacks=[cp_callback],
    validation_data=(val_model_input, val[target]),
)
# 4.Define Model,compile and train model = DeepFM({"sparse": sparse_feature_dim, "dense": []}, final_activation='sigmoid') model.compile("adam", "binary_crossentropy", metrics=['binary_crossentropy']) filepath = 'model_save/deep_fm_sample-ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}.h5' checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min') history = model.fit(model_input, data[target].values, callbacks=[checkpoint], batch_size=batch_size, epochs=50, verbose=1, validation_split=0.2,) elif mode == 'test': model = DeepFM({"sparse": sparse_feature_dim, "dense": []}, final_activation='sigmoid') model.load_weights('model_save/deep_fm_sample-ep001-loss0.184-val_loss0.172.h5') # model = load_model('model_save/deep_fm_sample-ep001-loss0.192-val_loss0.176.h5') data = pd.read_csv("./data/sample/validation.txt") # 1.Label Encoding for sparse features,and do simple Transformation for dense features for feat in sparse_features: lbe = LabelEncoder() data[feat] = lbe.fit_transform(data[feat]) # 2.count #unique features for each sparse field sparse_feature_dim = {feat: data[feat].nunique() for feat in sparse_features} # 3.generate input data for model model_input = [data[feat].values for feat in sparse_feature_dim]
epochs=50, verbose=1, validation_data=generate_arrays_from_file( './data/feature_mapped_combined_valid.data', batch_size=batch_size), validation_steps=int(np.ceil(num_valid / batch_size))) elif mode == 'test': # model.load_weights('model_save/deep_fm_fn-ep002-loss0.148-val_loss0.174.h5') # auc: 0.718467 batch_size=6000 #model.load_weights('model_save/deep_fm_fn-ep001-loss0.149-val_loss0.175.h5') # auc: 0.714243 batch_size = 2048 # model.load_weights('model_save/deep_fm_fn-ep005-loss0.147-val_loss0.173.h5') # auc: 0.722535 batch_size = 10000 # model.load_weights('model_save/deep_fm_fn_bs10000-ep001-loss0.155-val_loss0.153.h5') # auc: 0.738023 #model.load_weights('model_save/deep_fm_fn_bs15000-ep001-loss0.156-val_loss0.152.h5') # auc: 0.739935 #model.load_weights('model_save/deep_fm_fn-ep002-loss0.154-val_loss0.154-bs15000-ee20-hz[128, 128].h5') # auc: 0.741590 model.load_weights( 'model_save/deep_fm_fn-ep020-loss0.153-val_loss0.153-bs15000-ee20-hz[5, 600].h5' ) # auc: 0.742558 labels = [] preds = [] reader = pd.read_csv("./data/feature_mapped_combined_valid.data", header=None, chunksize=chunk_size) for df in reader: print('df size: %d' % df.shape[0]) df = shuffle(df) cnt = 0 while cnt < df.shape[0]: end = cnt + batch_size
# Recreate the checkpoint directory before training.
# NOTE(review): presumably `dirpath` is the previous 'checkpoint' directory
# and this removal was guarded by an existence check outside this excerpt —
# confirm against the surrounding code.
shutil.rmtree(dirpath)
os.mkdir('checkpoint')

train_labels = train_df['label'].values
hist = model.fit(
    online_train_model_input,
    train_labels,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    verbose=2,
    validation_split=0.1,
    shuffle=True,
    callbacks=get_callbacks(),
)

# In[ ]:

# Epoch numbers in the checkpoint file names are 1-based, hence the +1 on
# the argmax over the per-epoch validation AUROC history.
val_auroc_history = hist.history["val_auroc"]
best_epoch = np.argmax(val_auroc_history) + 1
model.load_weights('checkpoint/epoch_{:02d}.hdf5'.format(best_epoch))
print(val_auroc_history)
print('loading epoch_{:02d}.hdf5'.format(best_epoch))

# Score the test set with the restored weights and write the submission.
y_pre = model.predict(
    online_test_model_input,
    verbose=1,
    batch_size=BATCH_SIZE,
)
res = pd.DataFrame()
res['id'] = test_id
res['probability'] = y_pre
res.to_csv('submission_DeepFM_fibinet_feature.csv', index=False)

# Earlier submission variant, kept for reference:
# pred_ans = pred_ans.flatten()
# ans = pd.DataFrame(data={'id': np.array(
#     [i for i in range(1, pred_ans.shape[0]+1)]), 'probability': pred_ans})
# ans.to_csv('submission_DeepFM.csv', index=False, header=True)
class DeepFMHelper:
    """Bundle a DeepFM binary classifier with its fitted preprocessing state.

    ``fit`` min-max scales every non-categorical column, renames all columns
    through ``columns_mapping`` and trains the model. ``predict_proba``
    replays the same preprocessing before inference. ``save_model`` and
    ``load_model`` persist the model weights plus the preprocessing state.
    """

    def __init__(self):
        self.min_max_scaler = MinMaxScaler(feature_range=(0, 1))
        # Columns turned into SparseFeat inputs; every other column of the
        # training frame is treated as dense.
        self.cat_features = [
            "user_Вид тура_last",
            "user_Звездность_last",
            "tour_Страна",
            "tour_Страна тура",
            "user_Тип заявки_last",
        ]
        self.dense_features = None
        self.fixlen_feature_columns = None
        self.feature_names = None
        # Original column name -> sanitized name; filled in by fit() (via
        # _column_mapping) or by load_model().
        self.columns_mapping = None
        self.model = None

    def fit(self, X, y):
        """Fit the scaler and train DeepFM on ``X`` / ``y``.

        Args:
            X: DataFrame containing every column in ``self.cat_features``
                plus the dense columns. It is copied, not modified.
                Categorical columns are presumably integer-encoded already,
                since the vocabulary size is taken as ``max + 1`` — confirm
                against the caller.
            y: Binary targets passed straight to ``model.fit``.
        """
        X_ = X.copy()
        self.dense_features = list(X_.columns.difference(self.cat_features))

        logger.debug("MinMaxScaler")
        self.min_max_scaler.fit(X_[self.dense_features])
        X_[self.dense_features] = self.min_max_scaler.transform(
            X_[self.dense_features])

        # Rename the columns to sanitized names before building the feature
        # columns, which are keyed by the sanitized names.
        self._column_mapping(X_)
        X_.columns = [self.columns_mapping[col] for col in X_.columns]

        sparse_columns = [
            SparseFeat(
                self.columns_mapping[feat],
                vocabulary_size=X_[self.columns_mapping[feat]].max() + 1,
                embedding_dim=4,
            ) for feat in self.cat_features
        ]
        dense_columns = [
            DenseFeat(
                self.columns_mapping[feat],
                1,
            ) for feat in self.dense_features
        ]
        self.fixlen_feature_columns = sparse_columns + dense_columns
        self.feature_names = get_feature_names(self.fixlen_feature_columns)

        logger.debug("Compile DeepFM model")
        self.model = self._build_model()

        logger.debug("Fit DeepFM")
        train_model_input = {
            name: X_[name].values
            for name in self.feature_names
        }
        self.model.fit(
            train_model_input,
            y,
            batch_size=256,
            epochs=3,
            verbose=2,
            validation_split=0.2,
        )

    def predict_proba(self, X):
        """Return the first output column of the model for each row of ``X``.

        Args:
            X: DataFrame with the same columns that were passed to ``fit``.
                It is copied, not modified.

        Returns:
            1-D NumPy array (``pred[:, 0].numpy()``).

        Raises:
            RuntimeError: If neither ``fit`` nor ``load_model`` has run yet.
        """
        if self.model is None or self.columns_mapping is None:
            raise RuntimeError(
                "DeepFMHelper is not fitted: call fit() or load_model() "
                "before predict_proba()")
        X_ = X.copy()
        X_[self.dense_features] = self.min_max_scaler.transform(
            X_[self.dense_features])
        X_.columns = [self.columns_mapping[col] for col in X_.columns]
        model_input = {name: X_[name].values for name in self.feature_names}
        pred = self.inference(model_input)
        pred = pred[:, 0].numpy()
        return pred

    def _column_mapping(self, X):
        """Store ``self.columns_mapping`` for the columns of ``X``.

        Cyrillic letters are transliterated character by character, spaces
        become ``_`` and ``$`` becomes ``dollar``.
        """
        symbols = (
            "абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ",
            "abvgdeejzijklmnoprstufhzcss_y_euaABVGDEEJZIJKLMNOPRSTUFHZCSS_Y_EUA",
        )
        # str.maketrans builds the same ordinal -> ordinal table as a manual
        # zip over the two strings, and rejects strings of unequal length.
        tr = str.maketrans(*symbols)
        # NOTE(review): several Cyrillic letters share one Latin target
        # (e.g. both 'ш' and 'щ' map to 's'), so distinct column names could
        # collide after mapping — confirm this cannot happen for real inputs.
        self.columns_mapping = {
            col: col.translate(tr).replace(" ", "_").replace("$", "dollar")
            for col in X.columns
        }

    @tf.function()
    def inference(self, test_model_input):
        """Call the model on ``test_model_input`` inside a ``tf.function``."""
        # NOTE(review): tf.function resolves `self.model` while tracing, so
        # replacing the model on an instance that has already run inference
        # may keep using the old one — confirm if fit/load are ever mixed.
        return self.model(test_model_input)

    def _build_model(self):
        """Build and compile a DeepFM over ``self.fixlen_feature_columns``."""
        model = DeepFM(self.fixlen_feature_columns,
                       self.fixlen_feature_columns,
                       task="binary")
        model.compile(
            "adam",
            "binary_crossentropy",
            metrics=["binary_crossentropy"],
        )
        return model

    def save_model(self):
        """Write the model weights and the pickled preprocessing state."""
        # NOTE(review): this writes under "backend/data/" while load_model()
        # reads from "data/"; presumably the two run from different working
        # directories — confirm this is intentional.
        self.model.save_weights("backend/data/DeepFM_w.h5")
        with open("backend/data/DeepFM_data.pkl", "wb") as f_out:
            pickle.dump(
                (
                    self.columns_mapping,
                    self.min_max_scaler,
                    self.dense_features,
                    self.fixlen_feature_columns,
                    self.feature_names,
                ),
                f_out,
            )

    def load_model(self):
        """Restore the preprocessing state, rebuild the model, load weights."""
        # SECURITY: pickle.load executes arbitrary code from the file; only
        # load state files produced by save_model() from a trusted location.
        with open("data/DeepFM_data.pkl", "rb") as f_in:
            (
                self.columns_mapping,
                self.min_max_scaler,
                self.dense_features,
                self.fixlen_feature_columns,
                self.feature_names,
            ) = pickle.load(f_in)
        self.model = self._build_model()
        self.model.load_weights("data/DeepFM_w.h5")
model_input, label, callbacks=[checkpoint], batch_size=batch_size, epochs=50, verbose=1, validation_split=0.2, ) elif mode == 'test': model = DeepFM({ "sparse": sparse_feature_dim, "dense": [] }, final_activation='sigmoid') model.load_weights( 'model_save/deep_fm_sample-ep002-loss0.175-val_loss0.171.h5') # model = load_model('model_save/deep_fm_sample-ep001-loss0.192-val_loss0.176.h5') data = pd.read_csv("./data/sample/feature_mapped.data", header=None) label = data[0].values model_input = [ data[feat + 1].values for feat in range(len(sparse_feature_dim)) ] pred = model.predict(model_input, batch_size, 1) label = label.flatten().tolist() pred = pred.flatten().tolist() with open('data/pctr', 'w') as fw: for i in range(len(pred)): if i % 10000 == 0: