def run_deepfm_model():
    train, test, train_model_input, test_model_input, dnn_feature_columns, \
        linear_feature_columns, feature_names, target = read_data_as_model()

    # Define model, train, predict and evaluate
    model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary')
    model.compile("adam", "binary_crossentropy",
                  metrics=['binary_crossentropy'], )
    model.fit(train_model_input, train[target].values,
              batch_size=256, epochs=10, verbose=2, validation_split=0.2, )

    pred_ans = model.predict(test_model_input, batch_size=256)
    print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4))
    print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))
    return (pred_ans, test[target].values,
            round(roc_auc_score(test[target].values, pred_ans), 4), 'deepfm')
def train_deepFM():
    k = featureengineer.k
    # Fill missing values and encode features
    data, appsnum, tags_nums = trainmodel.data, trainmodel.appsnum, trainmodel.tags_nums
    data[trainmodel.sparse_features] = data[trainmodel.sparse_features].fillna('-1')
    for feat in trainmodel.dense_features:
        data[feat].fillna(data[feat].dropna().mean(), inplace=True)
    for feat in trainmodel.sparse_features:
        data[feat] = data[feat].apply(lambda x: str(x))
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
    mms = MinMaxScaler(feature_range=(0, 1))
    data[trainmodel.dense_features] = mms.fit_transform(data[trainmodel.dense_features])

    # Convert the data into DeepCTR feature columns
    fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=data[feat].max() + 1, embedding_dim=8)
                              for feat in trainmodel.sparse_features] + \
                             [DenseFeat(feat, 1) for feat in trainmodel.dense_features]
    lgbOut_feature_columns = [SparseFeat(feat, vocabulary_size=data[feat].max() + 1, embedding_dim=1)
                              for feat in trainmodel.lgbOut_Features]
    key2index_len = {'applist': appsnum + 1, 'new_tag': tags_nums}
    varlen_features = [VarLenSparseFeat(name, vocabulary_size=key2index_len[name], maxlen=k,
                                        embedding_dim=8, combiner='mean', weight_name=None)
                       for name in trainmodel.var_features]
    dnn_feature_columns = fixlen_feature_columns + varlen_features
    linear_feature_columns = fixlen_feature_columns + varlen_features + lgbOut_feature_columns
    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)
    sparse_dense_features = trainmodel.sparse_features + trainmodel.dense_features + trainmodel.lgbOut_Features

    train, test = train_test_split(data, test_size=0.2)
    train_model_input = {name: train[name] for name in sparse_dense_features}
    test_model_input = {name: test[name] for name in sparse_dense_features}
    for x in trainmodel.var_features:
        if x == 'applist':
            train_model_input[x] = np.array(train[x].tolist())
            test_model_input[x] = np.array(test[x].tolist())
        if x == 'new_tag':
            train_model_input[x] = np.array(train[x].tolist()) - appsnum
            test_model_input[x] = np.array(test[x].tolist()) - appsnum

    # Model
    model = DeepFM(linear_feature_columns=linear_feature_columns,
                   dnn_feature_columns=dnn_feature_columns,
                   dnn_hidden_units=(50, 30, 30),
                   l2_reg_linear=0.001, l2_reg_embedding=0.001, l2_reg_dnn=0,
                   init_std=0.0001, seed=1024, dnn_dropout=0.1,
                   dnn_activation='relu', dnn_use_bn=True, task='binary')
    model.compile("adam", "binary_crossentropy", metrics=['AUC'])
    history = model.fit(train_model_input, train['target'].values,
                        batch_size=256, epochs=1, verbose=2, validation_split=0.2)
    pred_ans = model.predict(test_model_input, batch_size=256)
    print("test AUC", round(roc_auc_score(test['target'].values, pred_ans), 4))
def train_model(train, test, linear_feature, dnn_feature):
    model = DeepFM(linear_feature, dnn_feature, task='binary')
    model.compile("adam", "binary_crossentropy", metrics=['AUC'], )
    history = model.fit(*train, batch_size=512, epochs=5, verbose=2,
                        validation_split=0.1, )
    pred_ans = model.predict(test[0], batch_size=512)
    print("test LogLoss", round(log_loss(test[1], pred_ans), 4))
    print("test AUC", round(roc_auc_score(test[1], pred_ans), 4))
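# A minimal usage sketch for train_model above (illustrative, not from the
# original code): it assumes `train` is an (inputs, labels) pair that
# model.fit can unpack, `test` is indexed the same way, and deepctr >= 0.8
# (feature columns live in deepctr.feature_column).
import numpy as np
from deepctr.models import DeepFM
from deepctr.feature_column import SparseFeat

rng = np.random.RandomState(0)
n = 1000
feature_columns = [SparseFeat('user_id', vocabulary_size=100, embedding_dim=4),
                   SparseFeat('item_id', vocabulary_size=200, embedding_dim=4)]
inputs = {'user_id': rng.randint(0, 100, n), 'item_id': rng.randint(0, 200, n)}
labels = rng.randint(0, 2, n)
train_model((inputs, labels), (inputs, labels), feature_columns, feature_columns)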
def deepfm_model(linear_feature_columns, dnn_feature_columns,
                 train_model_input, train, test_model_input, test):
    cols = ['model', 'RMSE', 'MAE', 'MSE', 'AUC', 'score']
    df_result = pd.DataFrame(columns=cols, index=range(1))
    model = DeepFM(linear_feature_columns, dnn_feature_columns,
                   dnn_hidden_units=config.deepfm_att["dnn_hidden_units"],
                   init_std=config.deepfm_att["init_std"],
                   seed=config.deepfm_att["seed"],
                   dnn_dropout=config.deepfm_att["dnn_dropout"],
                   dnn_activation=config.deepfm_att["dnn_activation"],
                   task=config.deepfm_att["task"],
                   fm_group=config.deepfm_att["fm_group"],
                   dnn_use_bn=config.deepfm_att["dnn_use_bn"])
    model.compile("adam", "mse", metrics=['mse'])
    history = model.fit(train_model_input, train[target].values, batch_size=256,
                        epochs=config.model_epoch['epoch'], verbose=2,
                        validation_split=0.2)
    pred_ans = model.predict(test_model_input, batch_size=256)
    save_model(model, 'saved_deepfm.h5')  # save the model
    auc = roc_auc_score(test[target].values, pred_ans)
    df_result.loc[0].model = "DeepFM"
    df_result.loc[0].RMSE = np.round(
        math.sqrt(mean_squared_error(test[target].values, pred_ans)), 3)
    df_result.loc[0].MAE = np.round(
        mean_absolute_error(test[target].values, pred_ans), 3)
    df_result.loc[0].MSE = np.round(
        mean_squared_error(test[target].values, pred_ans), 3)
    df_result.loc[0].AUC = np.round(auc, 3)
    # df_result.loc[0].score = (1/df_result.iloc[0]['RMSE']) * (1/df_result.iloc[0]['MAE']) * (2*df_result.iloc[0]['AUC'])
    return df_result
    except:
        pass
    model.compile(optimizer="adam", loss="binary_crossentropy",
                  metrics=['accuracy', 'AUC'])
    history = model.fit(train_model_input, train[target], batch_size=8192,
                        epochs=5, verbose=1, shuffle=True,
                        callbacks=[cp_callback],
                        validation_data=(val_model_input, val[target]))
    data['predict'] = 0
    data.loc[train_index, 'predict'] = model.predict(train_model_input, batch_size=8192)
    data.loc[val_index, 'predict'] = model.predict(val_model_input, batch_size=8192)
    data.loc[test_index, 'predict'] = model.predict(test_model_input, batch_size=8192)
    p = 88.5
    pred_val = data.loc[val_index, 'predict']
    print("val LogLoss", round(log_loss(val[target], pred_val), 4))
    threshold_val = round(np.percentile(pred_val, p), 4)
    pred_val2 = [1 if i > threshold_val else 0 for i in pred_val]
    print("val F1 >%s" % threshold_val, round(f1_score(val[target], pred_val2), 4))
    pred_train_val = data.loc[data['isTest'] != 1, 'predict']
    print(
        "train_val LogLoss",
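# A small hedged helper equivalent to the percentile-threshold F1 evaluation
# above (names illustrative, not from the original code):
import numpy as np
from sklearn.metrics import f1_score

def f1_at_percentile(y_true, y_score, p=88.5):
    # Binarize scores at the p-th percentile, mirroring threshold_val above.
    thr = round(np.percentile(y_score, p), 4)
    y_bin = (np.asarray(y_score) > thr).astype(int)
    return thr, f1_score(y_true, y_bin)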
                                  dnn_feature_columns)

# 3. Generate input data for the model
train, test = train_test_split(data, test_size=0.2)
train_model_input = {name: train[name] for name in feature_names}
test_model_input = {name: test[name] for name in feature_names}

# 4. Define model, train, predict and evaluate
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary')
model = multi_gpu_model(model, gpus=2)
model.compile("adam", "binary_crossentropy",
              metrics=['binary_crossentropy'], )
history = model.fit(train_model_input, train[target].values,
                    batch_size=256, epochs=10, verbose=2,
                    validation_split=0.2, )
pred_ans = model.predict(test_model_input, batch_size=256)
print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4))
print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))
                 batch_size=BATCH_SIZE, epochs=EPOCHS, verbose=2,
                 validation_split=0.1, shuffle=True,
                 callbacks=get_callbacks())

# In[ ]:

best_epoch = np.argmax(hist.history["val_auroc"]) + 1
model.load_weights('checkpoint/epoch_{:02d}.hdf5'.format(best_epoch))
print(hist.history["val_auroc"])
print('loading epoch_{:02d}.hdf5'.format(best_epoch))
y_pre = model.predict(online_test_model_input, verbose=1, batch_size=BATCH_SIZE)

res = pd.DataFrame()
res['id'] = test_id
res['probability'] = y_pre
res.to_csv('submission_DeepFM_fibinet_feature.csv', index=False)

# pred_ans = pred_ans.flatten()
# ans = pd.DataFrame(data={'id': np.array(
#     [i for i in range(1, pred_ans.shape[0]+1)]), 'probability': pred_ans})
# ans.to_csv('submission_DeepFM.csv', index=False, header=True)
# del model
# gc.collect()

# # In[ ]:
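# The training cells above call a get_callbacks() helper that is not shown.
# A minimal sketch of what it plausibly returns, assuming the model was
# compiled with tf.keras.metrics.AUC(name='auroc') so that 'val_auroc' exists
# in hist.history, and that checkpoints go to 'checkpoint/':
import os
from tensorflow.keras.callbacks import ModelCheckpoint

def get_callbacks():
    os.makedirs('checkpoint', exist_ok=True)
    # Keras fills {epoch:02d} with the 1-based epoch number, matching the
    # 'epoch_{:02d}.hdf5'.format(best_epoch) lookup above.
    return [ModelCheckpoint('checkpoint/epoch_{epoch:02d}.hdf5',
                            save_weights_only=True)]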
model.load_weights('model_save/deep_fm_sample-ep001-loss0.184-val_loss0.172.h5')
# model = load_model('model_save/deep_fm_sample-ep001-loss0.192-val_loss0.176.h5')

data = pd.read_csv("./data/sample/validation.txt")

# 1. Label-encode the sparse features (dense features would get a simple transformation).
# Note: fitting fresh LabelEncoders at validation time only reproduces the
# training encoding if the category sets match exactly.
for feat in sparse_features:
    lbe = LabelEncoder()
    data[feat] = lbe.fit_transform(data[feat])

# 2. Count the unique values of each sparse field
sparse_feature_dim = {feat: data[feat].nunique() for feat in sparse_features}

# 3. Generate input data for the model
model_input = [data[feat].values for feat in sparse_feature_dim]

pred = model.predict(model_input, batch_size, 1)
label = data[target].values.flatten().tolist()
pred = pred.flatten().tolist()

with open('data/pctr', 'w') as fw:
    for i in range(len(pred)):
        if i % 10000 == 0:
            print('label: %f, pred: %f' % (label[i], pred[i]))
        fw.write(str(i + 1) + ',' + str(label[i]) + ',' + str(pred[i]) + '\n')
# The with-block closes the file; no explicit fw.close() is needed.

AUC = auc.auc(label, pred)
print('auc: %f' % AUC)
print("demo done")
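# As a cross-check on the local `auc` module used above, sklearn's
# roc_auc_score computes the same quantity (a swap-in, not the original
# code's method):
from sklearn.metrics import roc_auc_score
print('sklearn auc: %f' % roc_auc_score(label, pred))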
print(model_input)
# print(model_input.shape)

# 4. Define model, compile and train
model = DeepFM(
    {
        "sparse": sparse_feat_list,
        "dense": dense_feat_list,
        "sequence": sequence_feature
    },
    final_activation='linear',
    embedding_size=8,
    use_fm=False,
    hidden_size=(64, 64))
model.compile("adam", "mape", metrics=['mape'], )
history = model.fit(model_input, df_train[target].values,
                    batch_size=2048, epochs=200, verbose=2,
                    validation_split=0.2, )
pred = model.predict(model_input)
print(pred)
print(smape(df_train[target].values, pred))
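# smape() used above is not defined in this snippet. The usual symmetric mean
# absolute percentage error, written out here as an assumption:
import numpy as np

def smape(y_true, y_pred):
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    denom = np.maximum((np.abs(y_true) + np.abs(y_pred)) / 2.0, 1e-12)
    return float(np.mean(np.abs(y_true - y_pred) / denom) * 100.0)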
X = data[feats]
y = data['Rating']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

sparse_features = ['UserID', 'MovieID', 'Gender', 'Occupation', 'day', 'weekday']
dense_features = ['hour', 'Age']
fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=data[feat].nunique(), embedding_dim=4)
                          for feat in sparse_features] + \
                         [DenseFeat(feat, 1) for feat in dense_features]
dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns

# Rating prediction trained with an MSE loss is a regression problem: DeepFM
# emits a single continuous output here, so 'multiclass'/'accuracy' in the
# original code were misleading.
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='regression')
model.compile('adam', 'mse', metrics=['mse'])

feature_names = get_feature_names(fixlen_feature_columns)
train_feed_dict = {name: X_train[name] for name in feature_names}
test_feed_dict = {name: X_test[name] for name in feature_names}
model.fit(train_feed_dict, y_train, batch_size=256, epochs=10, validation_split=0.2)
pred_ans = model.predict(test_feed_dict, batch_size=256)
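# A hedged evaluation sketch for the regression predictions above, using the
# sklearn metrics already common in this file (not part of the original code):
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error

rmse = np.sqrt(mean_squared_error(y_test.values, pred_ans))
mae = mean_absolute_error(y_test.values, pred_ans)
print("test RMSE", round(rmse, 4), "test MAE", round(mae, 4))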
linear_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

# 3. Generate input data for the model
train, valid = train_test_split(data, test_size=0.2, random_state=10)
train_model_input = {name: train[name] for name in feature_names}
valid_model_input = {name: valid[name] for name in feature_names}
test_model_input = {name: test[name] for name in feature_names}

# 4. Define model, train, predict and evaluate.
# dnn_hidden_units sets the number of hidden layers and the number of units per layer.
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary',
               dnn_hidden_units=[100, 100], dnn_dropout=0.2)
model.compile("adam", "binary_crossentropy",
              metrics=['binary_crossentropy'], )
history = model.fit(train_model_input, train[target].values,
                    batch_size=256, epochs=2, verbose=2,
                    validation_data=(valid_model_input, valid['target']))
pred_ans = model.predict(valid_model_input, batch_size=256)
print("valid LogLoss", round(log_loss(valid[target].values, pred_ans), 4))
print("valid AUC", round(roc_auc_score(valid[target].values, pred_ans), 4))

# Predict on the test set and write the result to a CSV file
result = model.predict(test_model_input, batch_size=256)
result = pd.DataFrame(result, columns=['label'])
submit = pd.DataFrame(test['ID'], columns=['ID'])
submit = submit.join(result)
submit.to_csv(sys.path[0] + "\\tem\\" + "result" + '.csv', index=False)
from deepctr.models import DeepFM

if __name__ == "__main__":
    data = pd.read_csv("./movielens_sample.txt")
    sparse_features = ["movie_id", "user_id", "gender", "age", "occupation", "zip"]
    target = ['rating']

    # 1. Label-encode the sparse features (dense features would get a simple transformation)
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])

    # 2. Count the unique values of each sparse field
    sparse_feature_dim = {feat: data[feat].nunique() for feat in sparse_features}

    # 3. Generate input data for the model
    train, test = train_test_split(data, test_size=0.2)
    train_model_input = [train[feat].values for feat in sparse_feature_dim]
    test_model_input = [test[feat].values for feat in sparse_feature_dim]

    # 4. Define model, train, predict and evaluate
    model = DeepFM({"sparse": sparse_feature_dim, "dense": []},
                   final_activation='linear')
    model.compile("adam", "mse", metrics=['mse'], )
    history = model.fit(train_model_input, train[target].values,
                        batch_size=256, epochs=1, verbose=2, validation_split=0.2, )
    pred_ans = model.predict(test_model_input, batch_size=256)
    print("test MSE", round(mean_squared_error(test[target].values, pred_ans), 4))
class RecommenderDeepNN:
    '''
    Recommender for the Yelp dataset using the DeepFM model.

    Parameters
    ----------
    category: 'restaurants',
        Keep only businesses of a certain category.
        Options: 'restaurants', 'automotive', 'shopping'
    min_review: 5,
        Keep only businesses with more review_count than this value
    min_category: 50,
        Keep only categories that apply to more than this number of businesses
    weight: False,
        Whether or not to use weights for the attribute matrix in the DeepFM
    scaler: 'minmax',
        Scaler for dense features
    optimizer: "adam",
        Optimizer for the DeepFM
    loss: 'mse',
        Loss function for the DeepFM
    batch_size: 256,
    epochs: 10,
    train_size: 0.8,
    deepfm__dnn_hidden_units: (128, 128),
    deepfm__l2_reg_linear: 1e-05,
    deepfm__l2_reg_embedding: 1e-05,
    deepfm__l2_reg_dnn: 0,
    deepfm__seed: 1024,
    deepfm__dnn_dropout: 0,
    deepfm__dnn_activation: 'relu'

    Example
    -------
    deepnn = RecommenderDeepNN(deepfm__seed=2048)
    deepnn.load_data(config.JSON_BUSINESS, config.CSV_RATINGS)
    deepnn.fit()
    deepnn.topN(260, n=5)

    deepnn = RecommenderDeepNN(scaler='standard', train_size=0.99)
    deepnn.fit(config.JSON_BUSINESS, config.CSV_RATINGS)
    '''

    def __init__(self, **kwargs):
        '''
        Parameters
        ----------
        path_business:
            Path to the business.json file that contains 'attributes' and
            'categories' as dictionaries for all businesses
        path_ratings:
            Path to the ratings.csv file that contains 'user_id', 'business_id'
            and 'stars'. The review text is not needed here.
        '''
        self.path_business = ""
        self.path_ratings = ""
        self.features_sparse = features_sparse
        self.features_dense = features_dense
        self.params = params_deepnn
        self.params_deepfm = {}
        self.business = None
        self.data = None
        self.attr2index = {}
        self.raw_to_iid = {}
        self.iid_to_raw = {}
        self.raw_to_uid = {}
        self.uid_to_raw = {}
        # Label encoders
        self.lbe_user = None
        self.lbe_item = None
        self.model = None
        self.features_linear = []
        self.features_dnn = []
        self.model_input = {}
        self.update_params(**kwargs)

    def load_data(self, path_business, path_ratings):
        ''' Load data and transform it to a usable format. '''
        print("Loading data ...")
        self.path_business = path_business
        self.path_ratings = path_ratings
        df = pd.read_json(self.path_business, lines=True, encoding='utf-8')
        df_ratings = pd.read_csv(self.path_ratings)
        df_ratings.rename({'stars': 'rating'}, axis=1, inplace=True)
        to_keep = config.Keywords_Categories[self.params['category']]
        keeprows = utils.filter_business_with_categories(df, to_keep)
        df = df[keeprows]
        # Map user_id and business_id encodings to integers
        self.uid_to_raw = dict(df_ratings['user_id'].drop_duplicates().reset_index()['user_id'])
        self.raw_to_uid = {k: v for v, k in self.uid_to_raw.items()}
        self.iid_to_raw = dict(df['business_id'])
        self.raw_to_iid = {k: v for v, k in self.iid_to_raw.items()}
        self.business = df[['business_id', 'name', 'stars', 'review_count', 'categories']]
        df = df[df['review_count'] > self.params['min_review']]
        # The join has to be "right", otherwise there will be NaNs.
        # Also, use df.set_index() because df is smaller in size.
        df = df_ratings.join(df[['business_id', 'stars', 'review_count', 'categories']].set_index('business_id'),
                             on='business_id', how='right')
        df['user_id'] = df['user_id'].map(self.raw_to_uid)
        df['business_id'] = df['business_id'].map(self.raw_to_iid)
        self.lbe_user = LabelEncoder()
        self.lbe_item = LabelEncoder()
        df['user_id'] = self.lbe_user.fit_transform(df['user_id'])
        df['business_id'] = self.lbe_item.fit_transform(df['business_id'])
        # x = lbe_user.inverse_transform(df_ratings['user_id'])
        # y = lbe_item.inverse_transform(df_ratings['business_id'])
        if(self.params['scaler'] == 'minmax'):
            scaler = MinMaxScaler(feature_range=(0, 1))
        elif(self.params['scaler'] == 'standard'):
            scaler = StandardScaler()
        df[self.features_dense] = scaler.fit_transform(df[self.features_dense])
        lbe = LabelEncoder()
        for var in self.features_sparse:
            if(var not in ['business_id', 'user_id']):
                df[var] = lbe.fit_transform(df[var])
        self.data = df
        del df, df_ratings

    def _compile_business_categories(self, df_business):
        ''' Find all the categories that apply to the businesses in the DataFrame df_business. '''
        categories = Counter()
        for line in df_business['categories']:
            if(isinstance(line, str)):
                categories.update(re.split(', ', line))
        categories = pd.DataFrame.from_dict(categories, orient='index', columns=['count'])
        return categories

    def _build_category_dict(self, drop_categories=[]):
        attrs = self._compile_business_categories(self.data)
        attrs = attrs[attrs['count'] > self.params['min_category']].sort_values(by='count', ascending=False)
        for cat in drop_categories:
            attrs.drop(cat, inplace=True)
        self.attr2index = {k: v + 1 for v, k in enumerate(attrs.index.to_list())}
        del attrs

    def _category_vectorizer(self, x):
        '''
        Label-encode the categories of any business x into a list of indices.
        The mapping is given by the dictionary attr2index {category: index}.
        '''
        if(isinstance(x, str)):
            spt = re.split(', ', x)
            return list(map(lambda c: self.attr2index[c] if c in self.attr2index else 0, spt))
        else:
            return []

    def _get_category_matrix(self, df):
        attrs_matrix = [self._category_vectorizer(x) for x in df['categories'].values]
        attrs_max_len = max(np.array(list(map(len, attrs_matrix))))
        attrs_matrix = pad_sequences(attrs_matrix, maxlen=attrs_max_len, padding='post', )
        print("Matrix takes {:5.2f} MB".format(attrs_matrix.nbytes / 1024. / 1024.))
        return attrs_matrix, attrs_max_len

    def _build_model(self):
        to_drop = config.Keywords_Categories[self.params['category']]
        self._build_category_dict(drop_categories=to_drop)
        attrs_matrix, attrs_max_len = self._get_category_matrix(self.data)
        vars_fixlen = [SparseFeat(var, self.data[var].nunique(), embedding_dim=4)
                       for var in self.features_sparse]
        vars_fixlen += [DenseFeat(var, 1, ) for var in self.features_dense]
        vars_varlen = [VarLenSparseFeat(SparseFeat('categories',
                                                   vocabulary_size=len(self.attr2index) + 1,
                                                   embedding_dim=4),
                                        maxlen=attrs_max_len, combiner='mean',
                                        weight_name='attrs_weight' if self.params['weight'] else None)]
        self.features_linear = vars_fixlen + vars_varlen
        self.features_dnn = vars_fixlen + vars_varlen
        self.model = DeepFM(self.features_linear, self.features_dnn,
                            task='regression', **self.params_deepfm)
        return attrs_matrix, attrs_max_len

    def get_feature_names(self):
        return get_feature_names(self.features_linear + self.features_dnn)

    def _set_params_deepfm(self):
        for k, v in self.params.items():
            spt = k.split('__')
            if(len(spt) > 1):
                self.params_deepfm[spt[1]] = v

    def update_params(self, recompile=True, **kwargs):
        '''
        Update parameters for the recommender and rebuild the DeepFM model
        unless recompile is set to False.

        Example
        -------
        deepnn.update_params(epochs=20, deepfm__l2_reg_linear=2e-4)
        '''
        for (k, v) in kwargs.items():
            if(k in self.params):
                self.params[k] = v
            else:
                raise ValueError('{0} is not a valid parameter for RecommenderDeepNN.'.format(k))
        self._set_params_deepfm()
        if(recompile == True and self.model is not None):
            self.model = DeepFM(self.features_linear, self.features_dnn,
                                task='regression', **self.params_deepfm)

    def fit(self, path_business=None, path_ratings=None):
        if(self.data is None):
            self.load_data(path_business, path_ratings)
        model_input = self._get_model_input(self.data)
        self.model.compile(self.params['optimizer'], self.params['loss'],
                           metrics=[self.params['loss']], )
        self.model.fit(model_input, self.data['rating'].values,
                       batch_size=self.params['batch_size'],
                       epochs=self.params['epochs'],
                       validation_split=1 - self.params['train_size'],
                       verbose=2)

    def _get_model_input(self, df):
        if(self.model is None):
            attrs_matrix, attrs_max_len = self._build_model()
        else:
            attrs_matrix, attrs_max_len = self._get_category_matrix(df)
        features = self.get_feature_names()
        model_input = {name: df[name] for name in features}
        model_input['categories'] = attrs_matrix
        if(self.params['weight']):
            model_input['attrs_weight'] = np.random.randn(df.shape[0], attrs_max_len, 1)
        return model_input

    def predictAllItemsForUser(self, uid):
        ''' Return the predicted ratings of all businesses for any user (uid). '''
        df = self.data.drop_duplicates('business_id').drop('user_id', axis=1)
        df['user_id'] = uid
        model_input = self._get_model_input(df)
        pred = self.model.predict(model_input, batch_size=self.params['batch_size'])
        return pd.DataFrame(pred, index=df['business_id'], columns=['pred'])

    def topN(self, uid, n=5):
        inner_uid = self.lbe_user.transform([uid])[0]
        pred = self.predictAllItemsForUser(inner_uid)
        topn = pred.nlargest(n, columns='pred')
        top_n_iid = self.lbe_item.inverse_transform(topn.index)
        predictions = topn['pred'].to_list()
        n_reviews = self.data['user_id'].value_counts()[inner_uid]
        print()
        print("UserID: {0}, Rated: {1}".format(uid, n_reviews))
        print("--------------------------------")
        topN_business = self.business.loc[top_n_iid]
        for i, (_, business) in enumerate(topN_business.iterrows()):
            print(business['name'])
            print(business['categories'])
            print("Pred: %4.2f  Avg: %3.1f out of %d reviews\n" %
                  (predictions[i], business['stars'], business['review_count']))
class DeepModel:
    def __init__(self, model_name, model_architecture="DeepFM"):
        self.model_name = model_name
        self.model_architecture = model_architecture
        self.model = None
        self.history = None
        self.data = None
        self.callbacks = []

    # requires tf2
    # def set_notebook_mode(self):
    #     progress_bar_cb = tfa.callbacks.TQDMProgressBar()  # TQDMNotebookCallback(leave_inner=True, leave_outer=True)
    #     self.callbacks.append(progress_bar_cb)

    def prepare_data(self, data_source, sparse_features, target, test_size=0.1):
        self.data = Data(sparse_features, target, data_format="deepctr", test_size=test_size)
        self.data.ingest(data_source)
        self.data.prepare()

    def build(self, task):
        assert task in ['regression', 'binary']
        if self.model_architecture == "DeepFM":
            self.model = DeepFM(
                self.data.linear_feature_columns,
                self.data.dnn_feature_columns,
                task=task,
            )
        else:
            raise NotImplementedError(
                'At the current stage of development, only DeepFM is supported'
            )
        # Pick loss and metrics from the task; the original also set them in a
        # redundant if/elif block whose results were never used.
        task_attr = {
            'regression': {'loss': 'mse', 'metrics': 'mse'},
            'binary': {'loss': 'binary_crossentropy', 'metrics': 'accuracy'},
        }
        self.model.compile(optimizer="adam",
                           loss=task_attr[task]['loss'],
                           metrics=task_attr[task]['metrics'])

    def train(self, batch_size=256, epochs=10, validation_split=0.1):
        # class_weights = class_weight.compute_class_weight(
        #     "balanced", np.unique(self.data.y_train[:, 0]), self.data.y_train[:, 0]
        # )
        self.history = self.model.fit(
            self.data.X_train,
            self.data.y_train,
            batch_size=batch_size,
            epochs=epochs,
            validation_split=validation_split,
            verbose=2,
            # class_weight=class_weights,
            callbacks=self.callbacks,
        )

    def evaluate(self):
        self.model.evaluate(self.data.X_test, self.data.y_test, batch_size=4096)

    def prepare_input(self, df):
        df = df.copy()
        for feat in self.data.sparse_features:
            lbe = self.data.encoders[feat]
            df[feat] = lbe.transform(df[feat])
        X = {name: df[name].values for name in self.data.feature_names}
        return X

    def predict(self, X, batch_size=256):
        return self.model.predict(X, batch_size=batch_size)
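# A minimal usage sketch for DeepModel (the file path and column names are
# hypothetical; the Data class is assumed to handle ingestion, encoding and
# the train/test split):
dm = DeepModel(model_name="ctr_deepfm")
dm.prepare_data("data/interactions.csv",
                sparse_features=["user_id", "item_id"],
                target=["click"])
dm.build(task="binary")
dm.train(batch_size=512, epochs=3)
dm.evaluate()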
hist = model.fit(train_and_val_model_input, y_train_val,
                 batch_size=BATCH_SIZE, epochs=EPOCHS, verbose=2,
                 validation_split=0.1, callbacks=get_callbacks())

# In[ ]:

best_epoch = np.argmax(hist.history["val_auroc"]) + 1
model.load_weights('checkpoint/epoch_{:02d}.hdf5'.format(best_epoch))
print(hist.history["val_auroc"])
print('loading epoch_{:02d}.hdf5'.format(best_epoch))
pred_ans = model.predict(test_model_input, verbose=1, batch_size=BATCH_SIZE)
pred_ans = pred_ans.flatten()
ans = pd.DataFrame(data={
    'id': np.array([i for i in range(1, pred_ans.shape[0] + 1)]),
    'probability': pred_ans
})
ans.to_csv('submission_DeepFM_countFeature.csv', index=False, header=True)

# del model
# gc.collect()

# # In[ ]:
# # EPOCHS = np.argmax(hist.history["val_auroc"])+1
def test_DFM_avazu(data, train, test):
    print("\nTesting DFM on avazu dataset...\n")
    results_activation_function = {"auc": [], "logloss": [], "rmse": []}
    results_dropout = {"auc": [], "logloss": [], "rmse": []}
    results_number_of_neurons = {"auc": [], "logloss": [], "rmse": []}
    auc = 0
    logloss = 0
    rmse = 0

    features_labels = train.columns
    sparse_features_labels = features_labels[1:23]
    target_label = features_labels[0]
    dnn_feature_columns = [SparseFeat(feat, vocabulary_size=data[feat].nunique(), embedding_dim=4)
                           for feat in sparse_features_labels]
    linear_feature_columns = [SparseFeat(feat, vocabulary_size=data[feat].nunique(), embedding_dim=4)
                              for feat in sparse_features_labels]
    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)
    train_model_input = {name: train[name] for name in feature_names}
    test_model_input = {name: test[name] for name in feature_names}
    true_y = test[target_label].values

    print("\t\t-- ACTIVATION FUNCTIONS --\t\t")
    for dnn_activation in dnn_activation_list:
        print("\nTesting {dnn_activation}...".format(dnn_activation=dnn_activation))
        model = DeepFM(linear_feature_columns, dnn_feature_columns,
                       dnn_activation=dnn_activation, task='binary')
        model.compile("adam", "binary_crossentropy",
                      metrics=['binary_crossentropy'], )
        model.fit(train_model_input, train[target_label].values,
                  batch_size=256, epochs=10, verbose=0,
                  validation_split=TEST_PROPORTION, )
        pred_y = model.predict(test_model_input, batch_size=256)
        auc = compute_auc(true_y, pred_y)
        logloss = compute_log_loss(true_y, pred_y)
        rmse = compute_rmse(true_y, pred_y)
        results_activation_function["auc"].append(auc)
        results_activation_function["logloss"].append(logloss)
        results_activation_function["rmse"].append(rmse)

    print("\t\t-- DROPOUT RATES --\t\t")
    for dnn_dropout in dnn_dropout_list:
        print("\nTesting {dnn_dropout}...".format(dnn_dropout=dnn_dropout))
        model = DeepFM(linear_feature_columns, dnn_feature_columns,
                       dnn_dropout=dnn_dropout, task='binary')
        model.compile("adam", "binary_crossentropy",
                      metrics=['binary_crossentropy'], )
        model.fit(train_model_input, train[target_label].values,
                  batch_size=256, epochs=10, verbose=0,
                  validation_split=TEST_PROPORTION, )
        pred_y = model.predict(test_model_input, batch_size=256)
        auc = compute_auc(true_y, pred_y)
        logloss = compute_log_loss(true_y, pred_y)
        rmse = compute_rmse(true_y, pred_y)
        results_dropout["auc"].append(auc)
        results_dropout["logloss"].append(logloss)
        results_dropout["rmse"].append(rmse)

    print("\t\t-- HIDDEN UNITS --\t\t")
    for dnn_hidden_units in dnn_hidden_units_list:
        print("\nTesting {dnn_hidden_units}...".format(dnn_hidden_units=dnn_hidden_units))
        model = DeepFM(linear_feature_columns, dnn_feature_columns,
                       dnn_hidden_units=dnn_hidden_units, task='binary')
        model.compile("adam", "binary_crossentropy",
                      metrics=['binary_crossentropy'], )
        model.fit(train_model_input, train[target_label].values,
                  batch_size=256, epochs=10, verbose=0,
                  validation_split=TEST_PROPORTION, )
        pred_y = model.predict(test_model_input, batch_size=256)
        auc = compute_auc(true_y, pred_y)
        logloss = compute_log_loss(true_y, pred_y)
        rmse = compute_rmse(true_y, pred_y)
        results_number_of_neurons["auc"].append(auc)
        results_number_of_neurons["logloss"].append(logloss)
        results_number_of_neurons["rmse"].append(rmse)

    if PLOT:
        create_plots("DFM", "avazu", results_activation_function,
                     "Activation Function", "activation_func", dnn_activation_list)
        create_plots("DFM", "avazu", results_dropout,
                     "Dropout Rate", "dropout", dnn_dropout_list)
        create_plots("DFM", "avazu", results_number_of_neurons,
                     "Number of Neurons per layer", "nr_neurons", dnn_hidden_units_list)
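# test_DFM_avazu reads dnn_activation_list, dnn_dropout_list,
# dnn_hidden_units_list, TEST_PROPORTION and PLOT from module scope; they are
# not shown here. Example values (assumptions, for illustration only):
dnn_activation_list = ['relu', 'sigmoid', 'tanh']
dnn_dropout_list = [0.0, 0.25, 0.5]
dnn_hidden_units_list = [(64,), (128, 128), (256, 256, 256)]
TEST_PROPORTION = 0.2
PLOT = False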
def deepctr_cv(X_train, y_train, folds, logger, cv_path, X_test=None,
               optional_data=None, prep=True, split_conf=None):
    scores = []
    preds = []
    meta = np.zeros_like(y_train).astype("float64")
    if split_conf is None:
        X_tr, X_te, main_conf, _ = prep_for_embedding(X_train, X_test, conf, prep=prep)
        X_train, X_test = X_tr, X_te
    else:
        main_conf = split_conf
    cat_cols = [c for c, _, _ in main_conf[0]]
    cat_fs = [SingleFeat(c, d) for c, d, _ in main_conf[0]]
    num_fs = [SingleFeat(c, 0) for c in conf.num_cols]
    X_test = split_df(X_test, cat_cols, conf.num_cols)

    for num_fold, (tr_ind, tes_ind) in enumerate(folds):
        if num_fold > 0:
            break
        logger.info(f"fold_{num_fold}")
        fold_path = cv_path / f"fold{num_fold}"
        seed_path = fold_path
        Path(fold_path).mkdir(exist_ok=True, parents=True)
        callbacks = [CSVLogger(str(fold_path / 'epochs.csv'))]
        X_cv_train, X_cv_test = X_train.iloc[tr_ind], X_train.iloc[tes_ind]
        y_cv_train, y_cv_test = y_train.iloc[tr_ind], y_train.iloc[tes_ind]
        X_cv_train = split_df(X_cv_train, cat_cols, conf.num_cols)
        X_cv_test = split_df(X_cv_test, cat_cols, conf.num_cols)

        model = DeepFM({'sparse': cat_fs, 'dense': num_fs}, final_activation='sigmoid')
        model.compile("adam", "binary_crossentropy", metrics=['accuracy'])
        model.fit(X_cv_train, y_cv_train, callbacks=callbacks, batch_size=2048,
                  epochs=10, verbose=1, validation_data=(X_cv_test, y_cv_test))
        model.save_weights(str(seed_path / 'weights.h5'), save_format='hdf5')
        gc.collect()

        if X_test is not None:
            pred = model.predict(X_test, batch_size=2048)
            pred = pred[:, 0]
            np.save(seed_path / "pred.npy", pred)
        train_oof = model.predict(X_cv_test, batch_size=2048)
        train_oof = train_oof[:, 0]
        auc = roc_auc_score(y_cv_test.values, train_oof)
        logger.info(f"{num_fold}: auc {auc}")
        np.save(seed_path / "train_oof.npy", train_oof)
        # auc = roc_auc_score(y_cv_test, train_oof)
        # logger.info(f"seed_average: auc {auc}")
        scores.append(auc)
        np.save(fold_path / "tes_ind.npy", tes_ind)
        meta[tes_ind] += train_oof
        del X_cv_train, y_cv_train, X_cv_test, y_cv_test
        if X_test is not None:
            preds.append(pred)

    scores = np.array(scores)
    preds = np.array(preds)
    pred = rank_average(preds)
    logger.info(f"{scores.mean()}, {scores.std()}")
    return scores, pred, meta
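# rank_average() used above is not defined in this snippet. A common
# implementation averages rank-transformed per-model predictions; this sketch
# is an assumption, not the original helper:
import numpy as np
from scipy.stats import rankdata

def rank_average(preds):
    """preds: array-like of shape (n_models, n_samples)."""
    ranks = np.vstack([rankdata(p) for p in preds])
    avg = ranks.mean(axis=0)
    return avg / avg.max()  # rescale into (0, 1]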