def get_xy_random2(X, y, cols_family=None):
    """Build DeepCTR model inputs from a feature matrix and binary target.

    Args:
        X: 2-D array-like of features, e.g. np.random.rand(100, 30).
        y: target vector, e.g. np.random.binomial(n=1, p=0.5, size=[100]);
            a pandas Series or a plain 1-D numpy array is accepted.
        cols_family: mapping with keys 'colsparse' and 'coldense' listing the
            sparse / dense feature column names (string column indices).

    Returns:
        (X_train, y_train, linear_feat_col, dnn_feat_col)
    """
    # Avoid the mutable-default-argument pitfall; an empty mapping still
    # raises KeyError below, matching the original contract.
    if cols_family is None:
        cols_family = {}

    ## PREPROCESSING STEPS
    # DeepCTR feature lookup requires string column names.
    cols = [str(i) for i in range(X.shape[1])]
    data = pd.DataFrame(X, columns=cols)

    cols_sparse_features = cols_family['colsparse']
    cols_dense_features = cols_family['coldense']

    # Wrap columns in DeepCTR's SparseFeat / DenseFeat descriptors.
    sparse_feat_l = [SparseFeat(feat, vocabulary_size=data[feat].nunique(), embedding_dim=4)
                     for feat in cols_sparse_features]
    dense_feat_l = [DenseFeat(feat, dimension=1) for feat in cols_dense_features]
    feature_col = sparse_feat_l + dense_feat_l

    linear_feat_col = feature_col  # features used by the linear part of the model
    dnn_feat_col = feature_col     # features used by the deep part of the model
    feature_names = get_feature_names(linear_feat_col + dnn_feat_col)

    train_model_input = {name: data[name] for name in feature_names}
    # BUGFIX: the original unconditionally used y.values, which fails when y
    # is a plain numpy array (as the example in the signature suggests).
    y_train = y.values if hasattr(y, 'values') else np.asarray(y)
    return train_model_input, y_train, linear_feat_col, dnn_feat_col
def test_long_dense_vector():
    """DeepFM should accept a multi-dimensional DenseFeat alongside sparse ids."""
    feature_columns = [
        SparseFeat(
            'user_id',
            4,
        ),
        SparseFeat(
            'item_id',
            5,
        ),
        DenseFeat("pic_vec", 5),
    ]
    fixlen_feature_names = get_feature_names(feature_columns)

    # Three toy samples: two id columns plus one identical 5-dim dense vector.
    input_dict = {
        'user_id': np.array([[1], [0], [1]]),
        'item_id': np.array([[3], [2], [1]]),
        'pic_vec': np.array([[0.1, 0.5, 0.4, 0.3, 0.2]] * 3),
    }
    label = np.array([1, 0, 1])
    model_input = [input_dict[name] for name in fixlen_feature_names]

    # The dense vector (last column) is kept out of the second argument.
    model = DeepFM(feature_columns, feature_columns[:-1])
    model.compile('adagrad', 'binary_crossentropy')
    model.fit(model_input, label)
def test_long_dense_vector():
    """Fit DeepFM on a toy batch that includes a 5-dim dense vector feature."""
    # Build the feature columns
    feature_columns = [
        SparseFeat(
            'user_id',
            4,
        ),
        SparseFeat(
            'item_id',
            5,
        ),
        DenseFeat("pic_vec", 5)
    ]
    fixlen_feature_names = get_feature_names(feature_columns)
    # Build the samples
    user_id = np.array([[1], [0], [1]])
    item_id = np.array([[3], [2], [1]])
    pic_vec = np.array([[0.1, 0.5, 0.4, 0.3, 0.2],
                        [0.1, 0.5, 0.4, 0.3, 0.2],
                        [0.1, 0.5, 0.4, 0.3, 0.2]])
    label = np.array([1, 0, 1])
    input_dict = {'user_id': user_id, 'item_id': item_id, 'pic_vec': pic_vec}
    model_input = [input_dict[name] for name in fixlen_feature_names]
    # Create the model (the dense vector column is excluded from the second argument)
    model = DeepFM(feature_columns, feature_columns[:-1])
    # model.summary()
    # tf.keras.utils.plot_model(model, "test_compu")
    # Train the model
    model.compile('adagrad', 'binary_crossentropy')
    model.fit(model_input, label)
def get_xy_fd(hash_flag=False):
    """Construct a toy dataset with var-len behaviour sequences and lengths."""
    # Fixed-length base features plus one dense score.
    feature_columns = [
        SparseFeat('user', 3, embedding_dim=10),
        SparseFeat('gender', 2, embedding_dim=4),
        SparseFeat('item_id', 3 + 1, embedding_dim=8),
        SparseFeat('cate_id', 2 + 1, embedding_dim=4),
        DenseFeat('pay_score', 1),
    ]
    # Notice: History behavior sequence feature name must start with "hist_".
    feature_columns += [
        VarLenSparseFeat(
            SparseFeat('hist_item_id', vocabulary_size=3 + 1, embedding_dim=8,
                       embedding_name='item_id'),
            maxlen=4, length_name="seq_length"),
        VarLenSparseFeat(
            SparseFeat('hist_cate_id', 2 + 1, embedding_dim=4,
                       embedding_name='cate_id'),
            maxlen=4, length_name="seq_length"),
    ]
    behavior_feature_list = ["item_id", "cate_id"]

    # Raw inputs; 0 is the mask value in the id columns.
    inputs = {
        'user': np.array([0, 1, 2]),
        'gender': np.array([0, 1, 0]),
        'item_id': np.array([1, 2, 3]),
        'cate_id': np.array([1, 2, 2]),
        'pay_score': np.array([0.1, 0.2, 0.3]),
        'hist_item_id': np.array([[1, 2, 3, 0], [3, 2, 1, 0], [1, 2, 0, 0]]),
        'hist_cate_id': np.array([[1, 2, 2, 0], [2, 2, 1, 0], [1, 2, 0, 0]]),
        # the actual length of the behavior sequence
        'seq_length': np.array([3, 3, 2]),
    }
    x = {name: inputs[name] for name in get_feature_names(feature_columns)}
    y = np.array([1, 0, 1])
    return x, y, feature_columns, behavior_feature_list
def get_xy_random():
    """Generate a random binary-classification dataset in DeepCTR input format."""
    X = np.random.rand(100, 30)
    y = np.random.binomial(n=1, p=0.5, size=[100])

    ## PREPROCESSING STEPS
    # DeepCTR expects string column names, so stringify the indices.
    cols = [str(i) for i in range(X.shape[1])]
    data = pd.DataFrame(X, columns=cols)
    data['y'] = y

    # All 30 columns are continuous, hence no sparse features here.
    cols_sparse_features = []
    cols_dense_features = list(cols)

    # Wrap columns in DeepCTR's SparseFeat / DenseFeat descriptors.
    sparse_feat_l = [SparseFeat(f, vocabulary_size=data[f].nunique(), embedding_dim=4)
                     for f in cols_sparse_features]
    dense_feat_l = [DenseFeat(f, dimension=1) for f in cols_dense_features]
    feature_col = sparse_feat_l + dense_feat_l

    linear_feat_col = feature_col  # features used by the linear part of the model
    dnn_feat_col = feature_col     # features used by the deep part of the model
    feature_names = get_feature_names(linear_feat_col + dnn_feat_col)

    # Stratified train / val / test split on the label.
    train_full, test = train_test_split(data, random_state=2021, stratify=data['y'])
    train, val = train_test_split(train_full, random_state=2021,
                                  stratify=train_full['y'])

    target = 'y'
    X_train = {name: train[name] for name in feature_names}
    X_val = {name: val[name] for name in feature_names}
    X_test = {name: test[name] for name in feature_names}
    ## END OF PREPROCESSING STEPS
    y_train = train[target].values
    y_val = val[target].values
    y_test = test[target].values
    return X_train, X_val, X_test, y_train, y_val, y_test, linear_feat_col, dnn_feat_col
def fit(self, X, y):
    """Scale dense columns, build DeepCTR feature columns, and train a DeepFM.

    Args:
        X: feature DataFrame; columns not in ``self.cat_features`` are dense.
        y: binary target vector passed straight to ``model.fit``.
    """
    X_ = X.copy()
    # Everything that is not categorical is treated as dense.
    self.dense_features = list(X_.columns.difference(self.cat_features))

    logger.debug("MinMaxScaler")
    self.min_max_scaler.fit(X_[self.dense_features])
    X_[self.dense_features] = self.min_max_scaler.transform(X_[self.dense_features])

    # Rename columns according to the internal mapping.
    self._column_mapping(X_)
    X_.columns = [self.columns_mapping[col] for col in X_.columns]

    sparse_cols = [
        SparseFeat(
            self.columns_mapping[feat],
            vocabulary_size=X_[self.columns_mapping[feat]].max() + 1,
            embedding_dim=4,
        )
        for feat in self.cat_features
    ]
    dense_cols = [
        DenseFeat(
            self.columns_mapping[feat],
            1,
        )
        for feat in self.dense_features
    ]
    self.fixlen_feature_columns = sparse_cols + dense_cols
    self.feature_names = get_feature_names(self.fixlen_feature_columns)

    logger.debug("Compile DeepFM model")
    self.model = DeepFM(self.fixlen_feature_columns,
                        self.fixlen_feature_columns,
                        task="binary")
    self.model.compile(
        "adam",
        "binary_crossentropy",
        metrics=["binary_crossentropy"],
    )

    logger.debug("Fit DeepFM")
    train_model_input = {name: X_[name].values for name in self.feature_names}
    self.model.fit(
        train_model_input,
        y,
        batch_size=256,
        epochs=3,
        verbose=2,
        validation_split=0.2,
    )
def run_base_experiment(data_path, dataset_type, model_params, model_type, opt):
    """Train and evaluate a DeepCTR model on the critero or taboola dataset.

    Args:
        data_path: path to the raw dataset.
        dataset_type: 'critero' selects the critero loader; anything else taboola.
        model_params: extra keyword arguments forwarded to the model constructor.
        model_type: DeepCTR model class (e.g. DeepFM).
        opt: optimizer passed to Keras ``compile``.

    Returns:
        The Keras ``History`` object from training (previously discarded;
        returning None callers are unaffected).
    """
    if dataset_type == 'critero':
        data_df, sparse_features, dense_features, target = load_citero_dataset(data_path)
    else:
        data_df, sparse_features, dense_features, target = load_taboola_dataset(data_path)
    data_df = prepare_data_for_train(data_df, sparse_features, dense_features)

    # Wrap columns in DeepCTR feature descriptors (index was unused: plain loop).
    fixlen_feature_columns = [
        SparseFeat(feat, vocabulary_size=data_df[feat].nunique(), embedding_dim=10)
        for feat in sparse_features
    ] + [DenseFeat(feat, 1, ) for feat in dense_features]
    dnn_feature_columns = fixlen_feature_columns
    linear_feature_columns = fixlen_feature_columns
    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

    # 3. generate input data for model
    train, test = train_test_split(data_df, test_size=0.2)
    train_model_input = {name: train[name] for name in feature_names}
    test_model_input = {name: test[name] for name in feature_names}
    batch_size = 1024

    # 4. Define Model, train, predict and evaluate
    model = model_type(linear_feature_columns, dnn_feature_columns, seed=1024,
                       **model_params)
    model.compile(
        optimizer=opt,
        loss="binary_crossentropy",
        metrics=['binary_crossentropy', 'accuracy'],
    )
    history = model.fit(
        train_model_input,
        train[target].values,
        batch_size=batch_size,
        epochs=10,
        verbose=1,
        validation_split=0.2,
    )
    pred_ans = model.predict(test_model_input, batch_size=batch_size)
    print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4))
    print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))
    # Return the training history instead of the dead trailing `pass`.
    return history
def get_xy_fd():
    """Build a small hand-crafted dataset with item / category behaviour history."""
    feature_columns = [
        SparseFeat('user', 3, embedding_dim=10),
        SparseFeat('gender', 2, embedding_dim=4),
        SparseFeat('item_id', 3 + 1, embedding_dim=8),
        SparseFeat('cate_id', 2 + 1, embedding_dim=4),
        DenseFeat('pay_score', 1),
    ]
    feature_columns += [
        VarLenSparseFeat(
            SparseFeat('hist_item_id', vocabulary_size=3 + 1, embedding_dim=8,
                       embedding_name='item_id'),
            maxlen=4),
        VarLenSparseFeat(
            SparseFeat('hist_cate_id', 2 + 1, embedding_dim=4,
                       embedding_name='cate_id'),
            maxlen=4),
    ]
    # Base sparse features referenced by the var-len history features.
    behavior_feature_list = ["item_id", "cate_id"]

    # Map feature name -> raw input data (0 is the mask value in id columns).
    feature_dict = {
        'user': np.array([0, 1, 2]),
        'gender': np.array([0, 1, 0]),
        'item_id': np.array([1, 2, 3]),
        'cate_id': np.array([1, 2, 2]),
        'hist_item_id': np.array([[1, 2, 3, 0], [3, 2, 1, 0], [1, 2, 0, 0]]),
        'hist_cate_id': np.array([[1, 2, 2, 0], [2, 2, 1, 0], [1, 2, 0, 0]]),
        'pay_score': np.array([0.1, 0.2, 0.3]),
    }
    x = {name: feature_dict[name] for name in get_feature_names(feature_columns)}
    y = np.array([1, 0, 1])
    return x, y, feature_columns, behavior_feature_list
def read_data_as_model(csv_path='GiveMeSomeCredit/cs-training.csv'):
    """Load the GiveMeSomeCredit data and prepare DeepCTR model inputs.

    Args:
        csv_path: location of the training CSV. New optional parameter that
            defaults to the previously hard-coded path, so existing callers
            are unaffected.

    Returns:
        (train, test, train_model_input, test_model_input,
         dnn_feature_columns, linear_feature_columns, feature_names, target)
    """
    data = pd.read_csv(csv_path)
    sparse_features = [
        'NumberOfTime30-59DaysPastDueNotWorse', 'NumberOfTimes90DaysLate',
        'NumberOfTime60-89DaysPastDueNotWorse', 'NumberOfDependents'
    ]
    dense_features = [
        'RevolvingUtilizationOfUnsecuredLines', 'age', 'DebtRatio',
        'MonthlyIncome', 'NumberOfOpenCreditLinesAndLoans',
        'NumberRealEstateLoansOrLines'
    ]
    # Missing values become -1 in both groups (a distinct category / sentinel).
    data[sparse_features] = data[sparse_features].fillna(-1, )
    data[dense_features] = data[dense_features].fillna(-1, )
    target = ['SeriousDlqin2yrs']

    # 1. Label Encoding for sparse features, and simple transformation
    # (min-max scaling) for dense features.
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
    mms = MinMaxScaler(feature_range=(0, 1))
    data[dense_features] = mms.fit_transform(data[dense_features])

    # 2. Count unique values per sparse field and record dense field names
    # (enumerate index was unused: plain loop).
    fixlen_feature_columns = [
        SparseFeat(feat, vocabulary_size=data[feat].nunique(), embedding_dim=4)
        for feat in sparse_features
    ] + [DenseFeat(feat, 1, ) for feat in dense_features]
    dnn_feature_columns = fixlen_feature_columns
    linear_feature_columns = fixlen_feature_columns
    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

    # 3. Generate input data for the model.
    train, test = train_test_split(data, test_size=0.2, random_state=1234)
    train_model_input = {name: train[name] for name in feature_names}
    test_model_input = {name: test[name] for name in feature_names}
    return train, test, train_model_input, test_model_input, dnn_feature_columns, linear_feature_columns, feature_names, target
def get_xy_fd(hash_flag=False):
    """Build a toy dataset with item / item_gender behaviour history."""
    feature_columns = [
        SparseFeat('user', 3),
        SparseFeat('gender', 2),
        SparseFeat('item', 3 + 1),
        SparseFeat('item_gender', 2 + 1),
        DenseFeat('score', 1),
    ]
    feature_columns += [
        VarLenSparseFeat(
            SparseFeat('hist_item', vocabulary_size=3 + 1, embedding_dim=8,
                       embedding_name='item'),
            maxlen=4),
        VarLenSparseFeat(
            SparseFeat('hist_item_gender', 2 + 1, embedding_dim=4,
                       embedding_name='item_gender'),
            maxlen=4),
    ]
    behavior_feature_list = ["item", "item_gender"]

    # Raw inputs; 0 is the mask value in the id columns.
    feature_dict = {
        'user': np.array([0, 1, 2]),
        'gender': np.array([0, 1, 0]),
        'item': np.array([1, 2, 3]),
        'item_gender': np.array([1, 2, 1]),
        'score': np.array([0.1, 0.2, 0.3]),
        'hist_item': np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]]),
        'hist_item_gender': np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]]),
    }
    x = {name: feature_dict[name] for name in get_feature_names(feature_columns)}
    y = [1, 0, 1]  # labels stay a plain list in this variant
    return x, y, feature_columns, behavior_feature_list
def get_xy_fd():
    """Toy dataset: driver / passenger profile plus price & destination history.

    Fix: unlike the sibling ``get_xy_fd`` examples, the hist_* columns here
    declared neither ``embedding_name`` nor a vocabulary size consistent with
    their base features (they used 5 vs. 20 / 10000). The history features now
    share the base embedding tables, which the "hist_" behaviour convention
    relies on.

    Returns:
        (x, y, feature_columns, behavior_feature_list)
    """
    feature_columns = [
        SparseFeat('driver_age', 7, embedding_dim=32),
        SparseFeat('pax_age', 7, embedding_dim=32),
        SparseFeat('des_id', 10000, embedding_dim=32),
        SparseFeat('price_id', 20, embedding_dim=32),
    ]
    # Notice: History behavior sequence feature name must start with "hist_".
    feature_columns += [
        VarLenSparseFeat(
            SparseFeat('hist_price_id', vocabulary_size=20, embedding_dim=32,
                       embedding_name='price_id'),
            maxlen=3),
        VarLenSparseFeat(
            SparseFeat('hist_des_id', vocabulary_size=10000, embedding_dim=32,
                       embedding_name='des_id'),
            maxlen=3),
    ]
    behavior_feature_list = ["price_id", "des_id"]

    driver_age = np.array([0, 1, 2])
    pax_age = np.array([0, 1, 0])
    pax_des = np.array([1, 2, 3])    # 0 is mask value
    pax_price = np.array([1, 2, 2])  # 0 is mask value
    hist_price_seq = np.array([[1, 2, 3], [3, 2, 1], [1, 2, 0]])
    hist_des_seq = np.array([[1, 2, 2], [2, 2, 1], [1, 2, 0]])

    feature_dict = {'driver_age': driver_age, 'pax_age': pax_age,
                    'des_id': pax_des, 'price_id': pax_price,
                    'hist_price_id': hist_price_seq, 'hist_des_id': hist_des_seq}
    x = {name: feature_dict[name] for name in get_feature_names(feature_columns)}
    y = np.array([1, 0, 1])
    return x, y, feature_columns, behavior_feature_list
def get_xy_fd(use_neg=False, hash_flag=False):
    """Build a toy behaviour-sequence dataset, optionally with negative history.

    Args:
        use_neg: also emit neg_hist_* columns and their feature descriptors.
        hash_flag: forwarded to the base SparseFeat columns as ``use_hash``.
    """
    feature_columns = [
        SparseFeat('user', 3, embedding_dim=10, use_hash=hash_flag),
        SparseFeat('gender', 2, embedding_dim=4, use_hash=hash_flag),
        SparseFeat('item_id', 3 + 1, embedding_dim=8, use_hash=hash_flag),
        SparseFeat('cate_id', 2 + 1, embedding_dim=4, use_hash=hash_flag),
        DenseFeat('pay_score', 1),
    ]
    feature_columns += [
        VarLenSparseFeat(
            SparseFeat('hist_item_id', vocabulary_size=3 + 1, embedding_dim=8,
                       embedding_name='item_id'),
            maxlen=4, length_name="seq_length"),
        VarLenSparseFeat(
            SparseFeat('hist_cate_id', 2 + 1, embedding_dim=4,
                       embedding_name='cate_id'),
            maxlen=4, length_name="seq_length"),
    ]
    behavior_feature_list = ["item_id", "cate_id"]

    # Raw inputs; 0 is the mask value in the id columns.
    feature_dict = {
        'user': np.array([0, 1, 2]),
        'gender': np.array([0, 1, 0]),
        'item_id': np.array([1, 2, 3]),
        'cate_id': np.array([1, 2, 2]),
        'hist_item_id': np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]]),
        'hist_cate_id': np.array([[1, 2, 2, 0], [1, 2, 2, 0], [1, 2, 0, 0]]),
        'pay_score': np.array([0.1, 0.2, 0.3]),
        "seq_length": np.array([3, 3, 2]),  # true lengths of the sequences
    }

    if use_neg:
        # The negative-sampled histories mirror the positive ones here.
        feature_dict['neg_hist_item_id'] = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
        feature_dict['neg_hist_cate_id'] = np.array([[1, 2, 2, 0], [1, 2, 2, 0], [1, 2, 0, 0]])
        feature_columns += [
            VarLenSparseFeat(
                SparseFeat('neg_hist_item_id', vocabulary_size=3 + 1,
                           embedding_dim=8, embedding_name='item_id'),
                maxlen=4, length_name="seq_length"),
            VarLenSparseFeat(
                SparseFeat('neg_hist_cate_id', 2 + 1, embedding_dim=4,
                           embedding_name='cate_id'),
                maxlen=4, length_name="seq_length"),
        ]

    x = {name: feature_dict[name] for name in get_feature_names(feature_columns)}
    y = np.array([1, 0, 1])
    return x, y, feature_columns, behavior_feature_list
def get_xy_fd(hash_flag=False):
    """Build a toy dataset with two behaviour sessions per sample."""
    base_columns = [
        SparseFeat('user', 3, use_hash=hash_flag),
        SparseFeat('gender', 2, use_hash=hash_flag),
        SparseFeat('item', 3 + 1, use_hash=hash_flag),
        SparseFeat('item_gender', 2 + 1, use_hash=hash_flag),
        DenseFeat('score', 1),
    ]
    # One (item, item_gender) pair of var-len columns per session, each
    # sharing the corresponding base embedding via embedding_name.
    session_columns = []
    for sess in ('sess_0', 'sess_1'):
        session_columns += [
            VarLenSparseFeat(
                SparseFeat(sess + '_item', 3 + 1, embedding_dim=4,
                           use_hash=hash_flag, embedding_name='item'),
                maxlen=4),
            VarLenSparseFeat(
                SparseFeat(sess + '_item_gender', 2 + 1, embedding_dim=4,
                           use_hash=hash_flag, embedding_name='item_gender'),
                maxlen=4),
        ]
    feature_columns = base_columns + session_columns
    behavior_feature_list = ["item", "item_gender"]

    # Raw inputs; 0 is the mask value in the id columns.
    feature_dict = {
        'user': np.array([0, 1, 2]),
        'gender': np.array([0, 1, 0]),
        'item': np.array([1, 2, 3]),
        'item_gender': np.array([1, 2, 1]),
        'score': np.array([0.1, 0.2, 0.3]),
        'sess_0_item': np.array([[1, 2, 3, 0], [1, 2, 3, 0], [0, 0, 0, 0]]),
        'sess_0_item_gender': np.array([[1, 1, 2, 0], [2, 1, 1, 0], [0, 0, 0, 0]]),
        'sess_1_item': np.array([[1, 2, 3, 0], [0, 0, 0, 0], [0, 0, 0, 0]]),
        'sess_1_item_gender': np.array([[1, 1, 2, 0], [0, 0, 0, 0], [0, 0, 0, 0]]),
    }
    x = {name: feature_dict[name] for name in get_feature_names(feature_columns)}
    # Number of non-empty sessions per sample (not a declared feature column).
    x["sess_length"] = np.array([2, 1, 0])
    y = [1, 0, 1]
    return x, y, feature_columns, behavior_feature_list
VarLenSparseFeat(sparsefeat=SparseFeat('hist_merchant_id', vocabulary_size=1993, embedding_dim=8, embedding_name='merchant_id'), maxlen=M), VarLenSparseFeat(sparsefeat=SparseFeat('hist_action_type', vocabulary_size=4, embedding_dim=4, embedding_name='action_type'), maxlen=M)] history_features = ['merchant_id', 'action_type'] print(len(feature_columns)) # 使用DIN模型 model = DIN(feature_columns, history_features) # 使用Adam优化器,二分类的交叉熵 model.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy']) # model.compile(loss=[binary_focal_loss(alpha=.25, gamma=2)], metrics=["accuracy"]) # 组装train_model_input,得到feature names,将train_X转换为字典格式 feature_names = list(train_X.columns) train_model_input = {name: train_X[name].values for name in get_feature_names(feature_columns)} print("########################################") # histroy输入必须是二维数组 from tqdm import tqdm for fea in ['hist_merchant_id', 'hist_action_type']: list = [] for i in tqdm(train_model_input[fea]): list.append(i) train_model_input[fea] = np.array(list) history = model.fit(train_model_input, train_y.values, verbose=True, epochs=10, validation_split=0.2, batch_size=512) # 转换test__model_input test_data['action_type'] = 3
# 1. Label-encode each sparse feature in place (values become 0..n-1).
for feat in sparse_features:
    lbe = LabelEncoder()
    data[feat] = lbe.fit_transform(data[feat])
# Scale dense features into [0, 1].
mms = MinMaxScaler(feature_range=(0, 1))
data[dense_features] = mms.fit_transform(data[dense_features])

# 2.count #unique features for each sparse field,and record dense feature field name
# (max()+1 is used as the vocabulary size, so ids are assumed to be 0-based).
fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=data[feat].max() + 1, embedding_dim=4)
                          for i, feat in enumerate(sparse_features)] + \
                         [DenseFeat(feat, 1, ) for feat in dense_features]
dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns

feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)  # list of string

# 3.generate input data for model
train, test = train_test_split(data, test_size=0.2, random_state=2020)
train_model_input = {name: train[name] for name in feature_names}
test_model_input = {name: test[name] for name in feature_names}

# 4.Define Model,train,predict and evaluate
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary')
model.compile("adam", "binary_crossentropy",
              metrics=['binary_crossentropy'], )
history = model.fit(train_model_input, train[target].values,
                    batch_size=256, epochs=10, verbose=2,
                    validation_split=0.2, )
pred_ans = model.predict(test_model_input, batch_size=256)
print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4))
print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))
print('data.columns', data.columns.tolist())
print('unique date_: ', data['date_'].unique())
# Days before 14 form the training set; day 14 is held out as validation.
train = data[data['date_'] < 14]
val = data[data['date_'] == 14]

# Initialize the feedid embedding table from the pretrained feed embeddings.
pretrained_feed_embedding_initializer = tf.initializers.identity(feed_embedding)
# 2. Count unique values per sparse field and record dense feature names.
# BUGFIX: the original used `feat is not 'feedid'`, which identity-compares a
# string (SyntaxWarning; only works via CPython interning). Use `!=`.
fixlen_feature_columns = [SparseFeat('feedid',
                                     vocabulary_size=data['feedid'].max() + 1,
                                     embedding_dim=512,
                                     embeddings_initializer=pretrained_feed_embedding_initializer)] + [
    SparseFeat(feat, vocabulary_size=data[feat].max() + 1, embedding_dim=embedding_dim)
    for feat in sparse_features if feat != 'feedid'] + [
    DenseFeat(feat, 1) for feat in dense_features]
dnn_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(dnn_feature_columns)

# 3. Generate model input dicts for train / val / test.
train_model_input = {name: train[name] for name in feature_names}
val_model_input = {name: val[name] for name in feature_names}
userid_list = val['userid'].astype(str).tolist()
test_model_input = {name: test[name] for name in feature_names}
train_labels = [train[y].values for y in target]
val_labels = [val[y].values for y in target]

# 4. Define the multi-task MMOE model (four binary heads).
train_model = MMOE(dnn_feature_columns, num_tasks=4, expert_dim=8,
                   dnn_hidden_units=(128, 128),
                   tasks=['binary', 'binary', 'binary', 'binary'])
train_model.compile("adagrad", loss='binary_crossentropy')
# print(train_model.summary())
def get_xy_from_txt(file_path="data/movielens_sample_din.txt"):
    """Load a behaviour-sequence sample from a CSV file and build model inputs.

    The file is comma-delimited; history columns encode sequences as
    '|'-separated integers (e.g. "1|2|3|0").

    Args:
        file_path: path to the sample file.

    Returns:
        (x, y, feature_columns, behavior_feature_list)
    """
    feature_columns = [
        SparseFeat('user', 3, embedding_dim=10),
        SparseFeat('gender', 2, embedding_dim=4),
        SparseFeat('item_id', 3 + 1, embedding_dim=8),
        SparseFeat('cate_id', 2 + 1, embedding_dim=4),
        DenseFeat('pay_score', 1)
    ]
    feature_columns += [
        VarLenSparseFeat(SparseFeat('hist_item_id', vocabulary_size=3 + 1,
                                    embedding_dim=8, embedding_name='item_id'),
                         maxlen=4),
        VarLenSparseFeat(SparseFeat('hist_cate_id', 2 + 1, embedding_dim=4,
                                    embedding_name='cate_id'),
                         maxlen=4)
    ]
    behavior_feature_list = ["item_id", "cate_id"]
    # head = ['label', 'user', 'gender', 'item_id', 'cate_id', 'hist_item_id', 'hist_cate_id', 'pay_score']
    data = pd.read_csv(file_path, delimiter=',')

    def to_int_array(value):
        # BUGFIX: the original loop variable shadowed the builtin `str`;
        # a comprehension avoids the shadowing and the manual append loop.
        return np.array([int(tok) for tok in value.split('|')])

    data['hist_item_id'] = data['hist_item_id'].apply(to_int_array)
    data['hist_cate_id'] = data['hist_cate_id'].apply(to_int_array)

    uid = np.array(data['user'])
    ugender = np.array(data['gender'])
    iid = np.array(data['item_id'])  # 0 is mask value
    cate_id = np.array(data['cate_id'])  # 0 is mask value
    pay_score = np.array(data['pay_score'])
    print("hist_cate_id: ", type(data['hist_cate_id']), type(data['hist_cate_id'][0]),
          np.shape(data['hist_cate_id'][0]), data['hist_cate_id'])
    print("------------" * 10)
    # Stack the per-row sequences into 2-D arrays.
    hist_iid = np.array(data['hist_item_id'].tolist())
    hist_cate_id = np.array(data['hist_cate_id'].tolist())
    print("uid: ", type(uid), uid)
    print("hist_cate_id: ", type(hist_cate_id), type(hist_cate_id[0]),
          np.shape(hist_cate_id[0]), hist_cate_id)

    feature_dict = {
        'user': uid,
        'gender': ugender,
        'item_id': iid,
        'cate_id': cate_id,
        'hist_item_id': hist_iid,
        'hist_cate_id': hist_cate_id,
        'pay_score': pay_score
    }
    x = {name: feature_dict[name] for name in get_feature_names(feature_columns)}
    y = np.array(data.pop('label'))
    return x, y, feature_columns, behavior_feature_list
untrainable_features_columns = [] dense_features = [] else: print('plz input dataset name') sys.exit() udg_features = 'userId' target = ['rating'] behavior_feature_list = ['itemId', 'category'] fixlen_feature_columns = [SparseFeat(feat, train[feat].nunique(), embedding_dim=int(sys.argv[5])) for feat in sparse_features] + [DenseFeat(feat, 1,) for feat in dense_features] linear_feature_columns = fixlen_feature_columns dnn_feature_columns = fixlen_feature_columns fixlen_feature_names = get_feature_names(fixlen_feature_columns) train_model_input = {name: train[name] for name in fixlen_feature_names} test_model_input = {name: test[name] for name in fixlen_feature_names} if sys.argv[1] in ['DIEN', 'DIEN_UDG', 'DIN', 'DIN_UDG']: test_model_input, test_label, max_len = get_input(test, 0, 'test') train_model_input, train_label, _ = get_input(train, max_len, 'train') fixlen_feature_columns = [SparseFeat(feat, train[feat].nunique()+1, embedding_dim=int(sys.argv[5])) for feat in sparse_features] fixlen_feature_columns += [DenseFeat(feat, 1,) for feat in dense_features] fixlen_feature_columns += [ VarLenSparseFeat(SparseFeat('hist_itemId', train['itemId'].nunique() + 1, embedding_dim = int(sys.argv[5]), embedding_name='itemId'), maxlen=max_len, length_name='seq_length'), VarLenSparseFeat(SparseFeat('hist_category', train['category'].nunique() + 1, embedding_dim = int(sys.argv[5]), embedding_name='category'), maxlen=max_len, length_name='seq_length'),
def get_xy_dataset(data_sample=None):
    """Download a public sample dataset and return DeepCTR train/val/test inputs.

    Args:
        data_sample: one of 'avazu', 'criteo', 'movielens'.

    Returns:
        (X_train, X_val, X_test, y_train, y_val, y_test,
         linear_feat_col, dnn_feat_col)

    Raises:
        ValueError: for an unknown ``data_sample`` (previously this fell
            through to a confusing NameError at the split step).
    """
    if data_sample == "avazu":
        df = pd.read_csv(
            'https://raw.githubusercontent.com/shenweichen/DeepCTR/master/examples/avazu_sample.txt'
        )
        # Split the raw hour column (YYMMDDHH) into day and hour-of-day.
        df['day'] = df['hour'].apply(lambda x: str(x)[4:6])
        df['hour'] = df['hour'].apply(lambda x: str(x)[6:])
        sparse_features = [
            'hour', 'C1', 'banner_pos', 'site_id', 'site_domain',
            'site_category', 'app_id', 'app_domain', 'app_category',
            'device_id', 'device_model', 'device_type', 'device_conn_type',
            # 'device_ip',
            'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21',
        ]
        df[sparse_features] = df[sparse_features].fillna('-1', )
        target = ['click']
        # 1. Label Encoding for sparse features.
        for feat in sparse_features:
            lbe = LabelEncoder()
            df[feat] = lbe.fit_transform(df[feat])
        # 2. Count unique values per sparse field; group_name labels each
        # field as user / item / context metadata.
        field_info = dict(C14='user', C15='user', C16='user', C17='user',
                          C18='user', C19='user', C20='user', C21='user',
                          C1='user', banner_pos='context', site_id='context',
                          site_domain='context', site_category='context',
                          app_id='item', app_domain='item', app_category='item',
                          device_model='user', device_type='user',
                          device_conn_type='context', hour='context',
                          device_id='user')
        fixlen_feat_col = [
            SparseFeat(name, vocabulary_size=df[name].nunique(),
                       embedding_dim=16, use_hash=False, dtype='int32',
                       group_name=field_info[name])
            for name in sparse_features
        ]
        dnn_feat_col = fixlen_feat_col
        linear_feat_col = fixlen_feat_col
        feature_names = get_feature_names(linear_feat_col + dnn_feat_col)
    elif data_sample == "criteo":
        df = pd.read_csv(
            'https://raw.githubusercontent.com/shenweichen/DeepCTR/master/examples/criteo_sample.txt'
        )
        sparse_features = ['C' + str(i) for i in range(1, 27)]
        dense_features = ['I' + str(i) for i in range(1, 14)]
        df[sparse_features] = df[sparse_features].fillna('-1', )
        df[dense_features] = df[dense_features].fillna(0, )
        target = ['label']
        # 1. Label Encoding for sparse features, and simple transformation
        # (min-max scaling) for dense features.
        for feat in sparse_features:
            lbe = LabelEncoder()
            df[feat] = lbe.fit_transform(df[feat])
        mms = MinMaxScaler(feature_range=(0, 1))
        df[dense_features] = mms.fit_transform(df[dense_features])
        # 2. Count unique values per sparse field; record dense field names.
        fixlen_feat_col = [
            SparseFeat(feat, vocabulary_size=df[feat].nunique(), embedding_dim=4)
            for feat in sparse_features
        ] + [DenseFeat(feat, 1, ) for feat in dense_features]
        dnn_feat_col = fixlen_feat_col
        linear_feat_col = fixlen_feat_col
        feature_names = get_feature_names(linear_feat_col + dnn_feat_col)
    elif data_sample == "movielens":
        df = pd.read_csv(
            "https://raw.githubusercontent.com/shenweichen/DeepCTR/master/examples/movielens_sample.txt"
        )
        sparse_features = ["movie_id", "user_id", "gender", "age",
                           "occupation", "zip"]
        target = ['rating']
        # 1. Label Encoding for sparse features.
        for feat in sparse_features:
            lbe = LabelEncoder()
            df[feat] = lbe.fit_transform(df[feat])
        # 2. Count unique values per sparse field.
        fixlen_feat_col = [
            SparseFeat(feat, df[feat].nunique(), embedding_dim=4)
            for feat in sparse_features
        ]
        linear_feat_col = fixlen_feat_col
        dnn_feat_col = fixlen_feat_col
        feature_names = get_feature_names(linear_feat_col + dnn_feat_col)
    else:
        raise ValueError(
            "data_sample must be 'avazu', 'criteo' or 'movielens', got %r"
            % (data_sample,))

    # 3. Stratified train / val / test split and model input dicts.
    train_full, test = train_test_split(df, random_state=2021,
                                        stratify=df[target])
    train, val = train_test_split(train_full, random_state=2021,
                                  stratify=train_full[target])
    train_model_input = {name: train[name] for name in feature_names}
    val_model_input = {name: val[name] for name in feature_names}
    test_model_input = {name: test[name] for name in feature_names}
    X_train, y_train = train_model_input, train[target].values
    X_val, y_val = val_model_input, val[target].values
    X_test, y_test = test_model_input, test[target].values
    return X_train, X_val, X_test, y_train, y_val, y_test, linear_feat_col, dnn_feat_col
#数据加载 data = pd.read_csv("deepfm_movielens_full.csv") sparse_features = ["MovieID", "UserID", "Genres", "Age", "OccupationID", "Zip-code"] target = ['Rating'] # 对特征标签进行编码 for feature in sparse_features: lbe = LabelEncoder() data[feature] = lbe.fit_transform(data[feature]) # 计算每个特征中的 不同特征值的个数 fixlen_feature_columns = [SparseFeat(feature, data[feature].nunique()) for feature in sparse_features] print(fixlen_feature_columns) linear_feature_columns = fixlen_feature_columns dnn_feature_columns = fixlen_feature_columns feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns) # 将数据集切分成训练集和测试集 train, test = train_test_split(data, test_size=0.2) train_model_input = {name:train[name].values for name in feature_names} test_model_input = {name:test[name].values for name in feature_names} # 使用DeepFM进行训练 model = DeepFM(linear_feature_columns, dnn_feature_columns, task='regression') model.compile("adam", "mse", metrics=['mse'], ) history = model.fit(train_model_input, train[target].values, batch_size=256, epochs=1, verbose=True, validation_split=0.2, ) # 使用DeepFM进行预测 pred_ans = model.predict(test_model_input, batch_size=256) # 输出RMSE或MSE mse = round(mean_squared_error(test[target].values, pred_ans), 4) rmse = mse ** 0.5
def main(model_dir, data_dir, train_steps, model_name):
    """Train one of the supported DeepCTR models on the criteo sample.

    Args:
        model_dir: directory where the trained weights are saved.
        data_dir: directory containing 'criteo_sample.txt'.
        train_steps: number of training epochs.
        model_name: name of the DeepCTR model class to use (e.g. 'DeepFM').
    """
    data = pd.read_csv(os.path.join(data_dir, 'criteo_sample.txt'))
    sparse_features = ['C' + str(i) for i in range(1, 27)]
    dense_features = ['I' + str(i) for i in range(1, 14)]
    data[sparse_features] = data[sparse_features].fillna('-1', )
    data[dense_features] = data[dense_features].fillna(0, )
    target = ['label']

    # 1. Label Encoding for sparse features, and simple transformation
    # (min-max scaling) for dense features.
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
    mms = MinMaxScaler(feature_range=(0, 1))
    data[dense_features] = mms.fit_transform(data[dense_features])

    # 2. Count unique values per sparse field; record dense field names.
    fixlen_feature_columns = [
        SparseFeat(feat, vocabulary_size=data[feat].nunique(), embedding_dim=4)
        for feat in sparse_features
    ] + [DenseFeat(feat, 1, ) for feat in dense_features]
    dnn_feature_columns = fixlen_feature_columns
    linear_feature_columns = fixlen_feature_columns
    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

    # 3. Generate input data for the model.
    train, test = train_test_split(data, test_size=0.2, random_state=2020)
    train_model_input = {name: train[name] for name in feature_names}
    test_model_input = {name: test[name] for name in feature_names}

    # 4. Define the model via a dispatch table instead of an 18-branch
    # if/elif chain; every class here takes the same constructor arguments.
    model_classes = {
        'DeepFM': DeepFM, 'FNN': FNN, 'WDL': WDL, 'MLR': MLR, 'NFM': NFM,
        'DIN': DIN, 'CCPM': CCPM, 'PNN': PNN, 'AFM': AFM, 'DCN': DCN,
        'DIEN': DIEN, 'DSIN': DSIN, 'xDeepFM': xDeepFM, 'AutoInt': AutoInt,
        'ONN': ONN, 'FGCNN': FGCNN, 'FiBiNET': FiBiNET, 'FLEN': FLEN,
    }
    model_cls = model_classes.get(model_name)
    if model_cls is None:
        # Same message and early return as the original chain's else branch.
        print(model_name + ' is not supported now.')
        return
    model = model_cls(linear_feature_columns, dnn_feature_columns, task='binary')

    gpus = int(os.getenv('SM_NUM_GPUS', '0'))
    print('gpus:', gpus)
    if gpus > 1:
        from tensorflow.keras.utils import multi_gpu_model
        model = multi_gpu_model(model, gpus=gpus)

    model.compile(
        "adam",
        "binary_crossentropy",
        metrics=['binary_crossentropy'],
    )
    history = model.fit(
        train_model_input,
        train[target].values,
        batch_size=256,
        epochs=train_steps,
        verbose=2,
        validation_split=0.2,
    )
    pred_ans = model.predict(test_model_input, batch_size=256)
    # Metrics can fail on degenerate test splits (e.g. a single class);
    # report whichever can be computed, as the original did.
    try:
        print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4))
    except Exception as e:
        print(e)
    try:
        print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))
    except Exception as e:
        print(e)
    model.save_weights(os.path.join(model_dir, 'DeepFM_w.h5'))
def get_feature_names(self):
    """Return the feature names for the combined linear + DNN feature columns.

    Delegates to DeepCTR's module-level ``get_feature_names`` over the
    concatenation of ``self.features_linear`` and ``self.features_dnn``.
    """
    return get_feature_names(self.features_linear + self.features_dnn)