def generate_din_feature_columns(data, sparse_features, dense_features):
    """Build DIN feature columns for sparse, dense and behavior-sequence inputs.

    Args:
        data: input frame (kept for signature compatibility; not read here).
        sparse_features: categorical column names; entries in ``time_feat`` are skipped.
        dense_features: numeric column names.

    Returns:
        (feature_names, linear_feature_columns, dnn_feature_columns)
    """
    # Globally shared dict of fitted LabelEncoders, keyed by feature name.
    feat_lbe_dict = get_glv('feat_lbe_dict')

    # +1 reserves an extra index beyond the encoder's classes (padding/unknown).
    # Note: the original iterated with enumerate() but never used the index.
    sparse_feature_columns = [
        SparseFeat(feat,
                   vocabulary_size=len(feat_lbe_dict[feat].classes_) + 1,
                   embedding_dim=EMBED_DIM)
        for feat in sparse_features if feat not in time_feat
    ]
    dense_feature_columns = [DenseFeat(feat, 1) for feat in dense_features]

    # The behavior sequence shares the 'item_id' embedding table via embedding_name.
    var_feature_columns = [
        VarLenSparseFeat(
            SparseFeat('hist_item_id',
                       vocabulary_size=len(feat_lbe_dict['item_id'].classes_) + 1,
                       embedding_dim=EMBED_DIM,
                       embedding_name='item_id'),
            maxlen=max_seq_len)
    ]

    # DNN side
    dnn_feature_columns = sparse_feature_columns + dense_feature_columns + var_feature_columns
    # FM side (identical column set)
    linear_feature_columns = sparse_feature_columns + dense_feature_columns + var_feature_columns

    # all feature names (get_feature_names de-duplicates the combined list)
    feature_names = get_feature_names(dnn_feature_columns + linear_feature_columns)
    return feature_names, linear_feature_columns, dnn_feature_columns
def simple_pre(df):
    """Encode sparse columns, scale dense columns, and build deepctr feature columns.

    Mutates `df` in place. Returns
    (linear_feature_columns, dnn_feature_columns, feature_names).
    """
    # Integer-encode every categorical column in place.
    for column in config.sparse_features:
        encoder = LabelEncoder()
        df[column] = encoder.fit_transform(df[column])

    # Squash all dense numerical columns into [0, 1].
    scaler = MinMaxScaler(feature_range=(0, 1))
    df[config.dense_features] = scaler.fit_transform(df[config.dense_features])

    # One SparseFeat per categorical field, sized by its cardinality.
    # Sparse fields become embeddings; dense fields feed the DNN input directly.
    fixlen_feature_columns = []
    for column in config.sparse_features:
        fixlen_feature_columns.append(
            SparseFeat(column, df[column].nunique(), embedding_dim=4))

    linear_feature_columns = fixlen_feature_columns
    dnn_feature_columns = fixlen_feature_columns
    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)
    return linear_feature_columns, dnn_feature_columns, feature_names
def get_xy_fd(hash_flag=False):
    """Build a tiny synthetic multi-session dataset.

    Returns (x, y, feature_columns, behavior_feature_list); x also carries
    "sess_length", the number of valid sessions per sample.
    """
    feature_columns = [
        SparseFeat('user', 3, hash_flag),
        SparseFeat('gender', 2, hash_flag),
        SparseFeat('item', 3 + 1, hash_flag),
        SparseFeat('item_gender', 2 + 1, hash_flag),
        DenseFeat('score', 1),
    ]
    # Two behavior sessions; each shares the base embeddings via embedding_name.
    for sess in ('sess_0', 'sess_1'):
        feature_columns.append(
            VarLenSparseFeat('%s_item' % sess, 3 + 1, 4,
                             use_hash=hash_flag, embedding_name='item'))
        feature_columns.append(
            VarLenSparseFeat('%s_item_gender' % sess, 2 + 1, 4,
                             use_hash=hash_flag, embedding_name='item_gender'))
    behavior_feature_list = ["item", "item_gender"]

    sess1_iid = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [0, 0, 0, 0]])
    sess1_igender = np.array([[1, 1, 2, 0], [2, 1, 1, 0], [0, 0, 0, 0]])
    sess2_iid = np.array([[1, 2, 3, 0], [0, 0, 0, 0], [0, 0, 0, 0]])
    sess2_igender = np.array([[1, 1, 2, 0], [0, 0, 0, 0], [0, 0, 0, 0]])
    sess_number = np.array([2, 1, 0])  # valid sessions per sample

    feature_dict = {
        'user': np.array([0, 1, 2]),
        'gender': np.array([0, 1, 0]),
        'item': np.array([1, 2, 3]),         # 0 is mask value
        'item_gender': np.array([1, 2, 1]),  # 0 is mask value
        'sess_0_item': sess1_iid,
        'sess_0_item_gender': sess1_igender,
        'score': np.array([0.1, 0.2, 0.3]),
        'sess_1_item': sess2_iid,
        'sess_1_item_gender': sess2_igender,
    }
    x = {name: feature_dict[name] for name in get_feature_names(feature_columns)}
    x["sess_length"] = sess_number
    y = [1, 0, 1]
    return x, y, feature_columns, behavior_feature_list
def test_long_dense_vector():
    """Smoke-test DeepFM with a multi-dimensional DenseFeat input."""
    feature_columns = [
        SparseFeat('user_id', 4),
        SparseFeat('item_id', 5),
        DenseFeat("pic_vec", 5),  # 5-dimensional dense vector feature
    ]
    fixlen_feature_names = get_feature_names(feature_columns)

    user_id = np.array([[1], [0], [1]])
    item_id = np.array([[3], [2], [1]])
    pic_vec = np.array([[0.1, 0.5, 0.4, 0.3, 0.2]] * 3)
    label = np.array([1, 0, 1])

    input_dict = {'user_id': user_id, 'item_id': item_id, 'pic_vec': pic_vec}
    model_input = [input_dict[name] for name in fixlen_feature_names]

    # Linear side deliberately omits the dense vector (feature_columns[:-1]).
    model = DeepFM(feature_columns, feature_columns[:-1])
    model.compile('adagrad', 'binary_crossentropy')
    model.fit(model_input, label)
def train_deepFM():
    """Preprocess the frame held on `trainmodel`, build DeepFM feature columns,
    train for one epoch, and print the held-out test AUC.

    Reads module-level state: featureengineer.k (sequence max length),
    trainmodel.data / appsnum / tags_nums and the feature-name lists on
    `trainmodel`.
    """
    k = featureengineer.k

    # --- missing-value filling + encoding ---
    data, appsnum, tags_nums = trainmodel.data, trainmodel.appsnum, trainmodel.tags_nums
    data[trainmodel.sparse_features] = data[trainmodel.sparse_features].fillna('-1', )
    for feat in trainmodel.dense_features:
        data[feat].fillna(data[feat].dropna().mean(), inplace=True)
    for feat in trainmodel.sparse_features:
        # Cast to str first so LabelEncoder sees a homogeneous dtype
        # (was apply(lambda x: str(x)); astype(str) is the idiomatic equivalent).
        data[feat] = data[feat].astype(str)
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
    mms = MinMaxScaler(feature_range=(0, 1))
    data[trainmodel.dense_features] = mms.fit_transform(data[trainmodel.dense_features])

    # --- feature-column definitions ---
    # vocabulary_size = max code + 1: label-encoded codes are dense integers.
    fixlen_feature_columns = [
        SparseFeat(feat, vocabulary_size=data[feat].max() + 1, embedding_dim=8)
        for feat in trainmodel.sparse_features
    ] + [DenseFeat(feat, 1) for feat in trainmodel.dense_features]
    # LightGBM-derived features go to the linear side only (embedding_dim=1).
    lgbOut_feature_columns = [
        SparseFeat(feat, vocabulary_size=data[feat].max() + 1, embedding_dim=1)
        for feat in trainmodel.lgbOut_Features
    ]
    key2index_len = {'applist': appsnum + 1, 'new_tag': tags_nums}
    varlen_features = [
        VarLenSparseFeat('%s' % name, vocabulary_size=key2index_len[name],
                         maxlen=k, embedding_dim=8, combiner='mean',
                         weight_name=None)
        for name in trainmodel.var_features
    ]
    dnn_feature_columns = fixlen_feature_columns + varlen_features
    linear_feature_columns = fixlen_feature_columns + varlen_features + lgbOut_feature_columns
    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)
    sparse_dense_features = (trainmodel.sparse_features + trainmodel.dense_features
                             + trainmodel.lgbOut_Features)

    # --- model inputs ---
    train, test = train_test_split(data, test_size=0.2)
    train_model_input = {name: train[name] for name in sparse_dense_features}
    test_model_input = {name: test[name] for name in sparse_dense_features}
    for x in trainmodel.var_features:
        if x == 'applist':
            train_model_input[x] = np.array(train[x].tolist())
            test_model_input[x] = np.array(test[x].tolist())
        if x == 'new_tag':
            # Tag ids appear to be encoded after the app vocabulary, so they are
            # shifted back by appsnum — TODO confirm against the encoder.
            train_model_input[x] = np.array(train[x].tolist()) - appsnum
            test_model_input[x] = np.array(test[x].tolist()) - appsnum

    # --- model: build, train one epoch, evaluate ---
    model = DeepFM(linear_feature_columns=linear_feature_columns,
                   dnn_feature_columns=dnn_feature_columns,
                   dnn_hidden_units=(50, 30, 30), l2_reg_linear=0.001,
                   l2_reg_embedding=0.001, l2_reg_dnn=0, init_std=0.0001,
                   seed=1024, dnn_dropout=0.1, dnn_activation='relu',
                   dnn_use_bn=True, task='binary')
    model.compile("adam", "binary_crossentropy", metrics=['AUC'], )
    model.fit(train_model_input, train['target'].values, batch_size=256,
              epochs=1, verbose=2, validation_split=0.2, )
    pred_ans = model.predict(test_model_input, batch_size=256)
    print("test AUC", round(roc_auc_score(test['target'].values, pred_ans), 4))
def get_xy_fd():
    """Build a tiny synthetic dataset with an explicit integer hist_len feature."""
    feature_columns = [
        SparseFeat('user', 3, embedding_dim=10),
        SparseFeat('gender', 2, embedding_dim=4),
        SparseFeat('item', 3 + 1, embedding_dim=8),
        SparseFeat('item_gender', 2 + 1, embedding_dim=4),
        DenseFeat('score', 1),
        VarLenSparseFeat(
            SparseFeat('hist_item', vocabulary_size=3 + 1, embedding_dim=20,
                       embedding_name='item'),
            maxlen=4),
        VarLenSparseFeat(
            SparseFeat('hist_item_gender', 2 + 1, embedding_dim=4,
                       embedding_name='item_gender'),
            maxlen=4),
        DenseFeat('hist_len', 1, dtype="int64"),
    ]
    behavior_feature_list = ["item"]

    feature_dict = {
        'user': np.array([0, 1, 2]),
        'gender': np.array([0, 1, 0]),
        'item': np.array([1, 2, 3]),         # 0 is mask value
        'item_gender': np.array([1, 2, 1]),  # 0 is mask value
        'score': np.array([0.1, 0.2, 0.3]),
        'hist_item': np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]]),
        'hist_item_gender': np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]]),
        'hist_len': np.array([3, 3, 2]),
    }
    x = {name: feature_dict[name] for name in get_feature_names(feature_columns)}
    y = [1, 1, 1]
    return x, y, feature_columns, behavior_feature_list
def get_xy_fd():
    """Build a tiny synthetic DIN-style dataset.

    Returns:
        x: dict of feature name -> numpy input array.
        y: binary labels.
        feature_columns: fixed- and variable-length feature definitions.
        behavior_feature_list: base feature names the history sequences refer to.
    """
    # 固定长度的离散特征 (fixed-length sparse/dense features)
    feature_columns = [
        SparseFeat('user', 3, embedding_dim=10),
        SparseFeat('gender', 2, embedding_dim=4),
        SparseFeat('item_id', 3 + 1, embedding_dim=8),
        SparseFeat('cate_id', 2 + 1, embedding_dim=4),
        DenseFeat('pay_score', 1)
    ]
    # 不固定长度的离散特征 (variable-length behavior sequences; they share the
    # base embedding tables via embedding_name)
    feature_columns += [
        VarLenSparseFeat(SparseFeat('hist_item_id', vocabulary_size=3 + 1,
                                    embedding_dim=8, embedding_name='item_id'),
                         maxlen=4),
        VarLenSparseFeat(SparseFeat('hist_cate_id', 2 + 1, embedding_dim=4,
                                    embedding_name='cate_id'),
                         maxlen=4)
    ]

    # BUG FIX: the base sparse feature is named 'item_id', not 'item'; the
    # behavior list must match declared feature names for the history lookup.
    behavior_feature_list = ["item_id", "cate_id"]

    uid = np.array([0, 1, 2])
    ugender = np.array([0, 1, 0])
    iid = np.array([1, 2, 3])  # 0 is mask value
    cate_id = np.array([1, 2, 1])  # 0 is mask value
    pay_score = np.array([0.1, 0.2, 0.3])
    hist_iid = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
    hist_cate_id = np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]])

    feature_dict = {
        'user': uid,
        'gender': ugender,
        'item_id': iid,
        'cate_id': cate_id,
        'hist_item_id': hist_iid,
        'hist_cate_id': hist_cate_id,
        'pay_score': pay_score
    }
    x = {
        name: feature_dict[name]
        for name in get_feature_names(feature_columns)
    }
    print('x=', x)
    y = [1, 0, 1]
    return x, y, feature_columns, behavior_feature_list
def get_xy_fd(hash_flag=False):
    """Build a tiny synthetic DIN-style dataset (default embedding dims).

    Note: hash_flag is accepted for interface parity but unused here.
    """
    feature_columns = [
        SparseFeat('user', 3),
        SparseFeat('gender', 2),
        SparseFeat('item', 3 + 1),
        SparseFeat('item_gender', 2 + 1),
        DenseFeat('score', 1),
        VarLenSparseFeat(
            SparseFeat('hist_item', vocabulary_size=3 + 1, embedding_dim=8,
                       embedding_name='item'),
            maxlen=4),
        VarLenSparseFeat(
            SparseFeat('hist_item_gender', 2 + 1, embedding_dim=4,
                       embedding_name='item_gender'),
            maxlen=4),
    ]
    behavior_feature_list = ["item", "item_gender"]

    feature_dict = {
        'user': np.array([0, 1, 2]),
        'gender': np.array([0, 1, 0]),
        'item': np.array([1, 2, 3]),         # 0 is mask value
        'item_gender': np.array([1, 2, 1]),  # 0 is mask value
        'hist_item': np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]]),
        'hist_item_gender': np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]]),
        'score': np.array([0.1, 0.2, 0.3]),
    }
    x = {name: feature_dict[name] for name in get_feature_names(feature_columns)}
    y = [1, 0, 1]
    return x, y, feature_columns, behavior_feature_list
def prepare(self):
    """Encode sparse features, build deepctr feature columns, and split inputs.

    Populates: self.encoders, self.linear_feature_columns,
    self.dnn_feature_columns, self.feature_names, self.X_train, self.y_train,
    self.X_test, self.y_test.

    Raises:
        ValueError: if self.data_format is not "deepctr".
    """
    if self.data_format != "deepctr":
        # BUG FIX: the original `raise ("...")` tried to raise a plain str,
        # which itself fails with TypeError; raise a proper exception.
        raise ValueError("Not supported dataset:" + self.data_format)

    # 1. Label-encode every sparse feature, keeping the fitted encoder so the
    # same mapping can be reapplied later.
    for feat in self.sparse_features:
        lbe = LabelEncoder()
        self.input[feat] = lbe.fit_transform(self.input[feat])
        self.encoders[feat] = lbe

    # 2. One SparseFeat per field, sized by its cardinality.
    fixlen_feature_columns = [
        SparseFeat(feat, self.input[feat].nunique(), embedding_dim=4)
        for feat in self.sparse_features
    ]
    self.linear_feature_columns = fixlen_feature_columns
    self.dnn_feature_columns = fixlen_feature_columns
    self.feature_names = get_feature_names(
        self.linear_feature_columns + self.dnn_feature_columns)

    # 3. Train/test split and model-ready dicts of numpy arrays.
    train, test = train_test_split(self.input, test_size=self.test_size)
    self.X_train = {name: train[name].values for name in self.feature_names}
    self.y_train = train[self.target].values
    self.X_test = {name: test[name].values for name in self.feature_names}
    self.y_test = test[self.target].values
def get_xy_fd(use_neg=False, hash_flag=False):
    """Build a tiny DIEN-style dataset; optionally add negative-sampled history.

    x carries "seq_length" with the true behavior length per sample.
    """
    feature_columns = [
        SparseFeat('user', 3, hash_flag),
        SparseFeat('gender', 2, hash_flag),
        SparseFeat('item', 3 + 1, hash_flag),
        SparseFeat('item_gender', 2 + 1, hash_flag),
        DenseFeat('score', 1),
        VarLenSparseFeat('hist_item', 3 + 1, maxlen=4, embedding_name='item'),
        VarLenSparseFeat('hist_item_gender', 3 + 1, maxlen=4,
                         embedding_name='item_gender'),
    ]
    behavior_feature_list = ["item", "item_gender"]

    behavior_length = np.array([3, 3, 2])
    feature_dict = {
        'user': np.array([0, 1, 2]),
        'gender': np.array([0, 1, 0]),
        'item': np.array([1, 2, 3]),         # 0 is mask value
        'item_gender': np.array([1, 2, 1]),  # 0 is mask value
        'hist_item': np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]]),
        'hist_item_gender': np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]]),
        'score': np.array([0.1, 0.2, 0.3]),
    }

    if use_neg:
        # Negative histories mirror the positive ones in this toy example.
        feature_dict['neg_hist_item'] = np.array(
            [[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
        feature_dict['neg_hist_item_gender'] = np.array(
            [[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]])
        feature_columns += [
            VarLenSparseFeat('neg_hist_item', 3 + 1, maxlen=4,
                             embedding_name='item'),
            VarLenSparseFeat('neg_hist_item_gender', 3 + 1, maxlen=4,
                             embedding_name='item_gender'),
        ]

    x = {name: feature_dict[name] for name in get_feature_names(feature_columns)}
    x["seq_length"] = behavior_length
    y = [1, 0, 1]
    return x, y, feature_columns, behavior_feature_list
def client_restful_criteo():
    """Send one preprocessed Criteo sample to a TF-Serving REST endpoint.

    Side effects: reads ./data/criteo_sample.txt, prints the request payload
    and the parsed JSON response from http://localhost:8501.
    """
    data = pd.read_csv('./data/criteo_sample.txt')
    sparse_features = ['C' + str(i) for i in range(1, 27)]
    dense_features = ['I' + str(i) for i in range(1, 14)]

    # Missing categoricals become the '-1' token; missing dense values become 0.
    data[sparse_features] = data[sparse_features].fillna('-1', )
    data[dense_features] = data[dense_features].fillna(0, )
    target = ['label']

    # 1.Label Encoding for sparse features,and do simple Transformation for dense features
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
    mms = MinMaxScaler(feature_range=(0, 1))
    data[dense_features] = mms.fit_transform(data[dense_features])

    # 2.count #unique features for each sparse field,and record dense feature field name
    fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=data[feat].nunique(),embedding_dim=4)
                              for i,feat in enumerate(sparse_features)] + [DenseFeat(feat, 1,)
                              for feat in dense_features]
    dnn_feature_columns = fixlen_feature_columns
    linear_feature_columns = fixlen_feature_columns
    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

    # Earlier payload shapes, kept for reference:
    # model_input = [data[name].iloc[0] for name in feature_names]
    # model_input = [{name:data[name].iloc[0]} for name in feature_names]
    # One instance: a single {feature name: scalar} dict built from row 0.
    model_input = [{name:data[name].iloc[0] for name in feature_names}]
    print(model_input)

    # NpEncoder presumably converts numpy scalars to JSON-native types — confirm.
    data = json.dumps({"signature_name": "serving_default", "instances": model_input}, cls=NpEncoder)
    headers = {"content-type": "application/json"}
    json_response = requests.post('http://localhost:8501/v1/models/criteo:predict', data=data, headers=headers)
    json_response = json.loads(json_response.text)
    print(json_response)
mms = MinMaxScaler(feature_range=(0, 1)) data[dense_features] = mms.fit_transform(data[dense_features]) # 2.count #unique features for each sparse field,and record dense feature field name fixlen_feature_columns = [ SparseFeat(feat, data[feat].nunique()) for feat in sparse_features ] + [DenseFeat( feat, 1, ) for feat in dense_features] dnn_feature_columns = fixlen_feature_columns linear_feature_columns = fixlen_feature_columns feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns) # 3.generate input data for model train, test = train_test_split(data, test_size=0.2) train_model_input = {name: train[name] for name in feature_names} test_model_input = {name: test[name] for name in feature_names} # 4.Define Model,train,predict and evaluate model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary') model = multi_gpu_model(model, gpus=2) model.compile( "adam", "binary_crossentropy",
def get_xy_fd(use_neg=False, hash_flag=False):
    """Build a tiny synthetic behavior-sequence dataset.

    Args:
        use_neg: also add negative-sampled history sequences
            ('neg_hist_item_id' / 'neg_hist_cate_id').
        hash_flag: enable on-the-fly hashing for the sparse features.

    Returns:
        (x, y, feature_columns, behavior_feature_list)
    """
    # Fixed-length features; vocabularies reserve index 0 as the mask value.
    feature_columns = [
        SparseFeat('user', 3, embedding_dim=10, use_hash=hash_flag),
        SparseFeat('gender', 2, embedding_dim=4, use_hash=hash_flag),
        SparseFeat('item_id', 3 + 1, embedding_dim=8, use_hash=hash_flag),
        SparseFeat('cate_id', 2 + 1, embedding_dim=4, use_hash=hash_flag),
        DenseFeat('pay_score', 1)
    ]
    # Behavior sequences share the base embedding tables via embedding_name and
    # carry their true lengths in the shared "seq_length" input.
    feature_columns += [
        VarLenSparseFeat(SparseFeat('hist_item_id', vocabulary_size=3 + 1,
                                    embedding_dim=8, embedding_name='item_id'),
                         maxlen=4, length_name="seq_length"),
        VarLenSparseFeat(SparseFeat('hist_cate_id', 2 + 1, embedding_dim=4,
                                    embedding_name='cate_id'),
                         maxlen=4, length_name="seq_length")
    ]
    behavior_feature_list = ["item_id", "cate_id"]

    uid = np.array([0, 1, 2])
    ugender = np.array([0, 1, 0])
    iid = np.array([1, 2, 3])  # 0 is mask value
    cate_id = np.array([1, 2, 2])  # 0 is mask value
    score = np.array([0.1, 0.2, 0.3])
    hist_iid = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
    hist_cate_id = np.array([[1, 2, 2, 0], [1, 2, 2, 0], [1, 2, 0, 0]])
    behavior_length = np.array([3, 3, 2])

    feature_dict = {
        'user': uid,
        'gender': ugender,
        'item_id': iid,
        'cate_id': cate_id,
        'hist_item_id': hist_iid,
        'hist_cate_id': hist_cate_id,
        'pay_score': score,
        "seq_length": behavior_length
    }

    if use_neg:
        # Negative histories mirror the positive ones in this toy example.
        feature_dict['neg_hist_item_id'] = np.array([[1, 2, 3, 0],
                                                     [1, 2, 3, 0],
                                                     [1, 2, 0, 0]])
        feature_dict['neg_hist_cate_id'] = np.array([[1, 2, 2, 0],
                                                     [1, 2, 2, 0],
                                                     [1, 2, 0, 0]])
        feature_columns += [
            VarLenSparseFeat(SparseFeat('neg_hist_item_id',
                                        vocabulary_size=3 + 1,
                                        embedding_dim=8,
                                        embedding_name='item_id'),
                             maxlen=4, length_name="seq_length"),
            VarLenSparseFeat(SparseFeat('neg_hist_cate_id', 2 + 1,
                                        embedding_dim=4,
                                        embedding_name='cate_id'),
                             maxlen=4, length_name="seq_length")
        ]

    x = {
        name: feature_dict[name]
        for name in get_feature_names(feature_columns)
    }
    y = [1, 0, 1]
    return x, y, feature_columns, behavior_feature_list
def feature_construct(path, embedding_dim=16, data_sample=100000, test_size=0.2):
    """Load the data, encode sparse fields, and build grouped feature columns.

    Returns ((train_input, train_target), (test_input, test_target),
    linear_feature_columns, dnn_feature_columns).
    """
    df = load_data(path).sample(data_sample, random_state=SEED)

    # Derive 'day' and redefine 'hour' from the raw 'hour' string.
    df['day'] = df['hour'].apply(lambda x: str(x)[4:6])
    df['hour'] = df['hour'].apply(lambda x: str(x)[6:])

    target = ['click']
    sparse_features = [
        'hour', 'C1', 'banner_pos', 'site_id', 'site_domain', 'site_category',
        'app_id', 'app_domain', 'app_category', 'device_id', 'device_model',
        'device_type', 'device_conn_type', 'C14', 'C15', 'C16', 'C17', 'C18',
        'C19', 'C20', 'C21'
    ]
    # group_name assigns each field to a user/item/context group.
    field_info = {
        'C14': 'user', 'C15': 'user', 'C16': 'user', 'C17': 'user',
        'C18': 'user', 'C19': 'user', 'C20': 'user', 'C21': 'user',
        'C1': 'user', 'device_model': 'user', 'device_type': 'user',
        'device_id': 'user',
        'banner_pos': 'context', 'site_id': 'context',
        'site_domain': 'context', 'site_category': 'context',
        'device_conn_type': 'context', 'hour': 'context',
        'app_id': 'item', 'app_domain': 'item', 'app_category': 'item',
    }

    # Integer-encode every sparse field.
    for col in sparse_features:
        df[col] = LabelEncoder().fit_transform(df[col])

    fixlen_feature_columns = [
        SparseFeat(col, df[col].nunique(), embedding_dim=embedding_dim,
                   group_name=field_info[col])
        for col in sparse_features
    ]
    linear_feature_columns = fixlen_feature_columns
    dnn_feature_columns = fixlen_feature_columns
    feature_names = get_feature_names(linear_feature_columns +
                                      dnn_feature_columns)

    df_train, df_test = train_test_split(df, test_size=test_size)
    target_train = df_train[target].values
    target_test = df_test[target].values
    train_model_input = {name: df_train[name].values for name in feature_names}
    test_model_input = {name: df_test[name].values for name in feature_names}

    return (train_model_input, target_train), (
        test_model_input,
        target_test), linear_feature_columns, dnn_feature_columns
def structural_feature(train, test):
    """Derive interaction features, normalize/encode, and build feature columns.

    `test` is tagged with label == -1 so the concatenated frame can be split
    back afterwards. Returns
    (train, test, feature_names, linear_feature_columns, dnn_feature_columns).
    """
    test['label'] = -1
    data = pd.concat([train, test], axis=0)

    # ---- feature engineering ----
    data['hour'] = data['date'].dt.hour
    del data['date']

    # Pairwise D-column interactions.
    data['D1+D2'] = data['D1'] + data['D2']
    data['D1-D2'] = data['D1'] - data['D2']
    data['D1/D2'] = data['D1'] / data['D2']

    # Group aggregates over A/B/C columns.
    data['B_sum'] = data['B1'] + data['B2'] + data['B3']
    data['A_*'] = data['A1'] * data['A2'] * data['A3']
    data['B_*'] = data['B1'] * data['B2'] * data['B3']
    data['A_+'] = data['A1'] + data['A2'] + data['A3']
    data['B_+'] = data['B1'] + data['B2'] + data['B3']
    data['C_+'] = data['C1'] + data['C2'] + data['C3']

    # Manual min-max scaling of selected raw columns.
    normalization_columns = [
        'A1', 'A2', 'A3', 'B1', 'B2', 'B3', 'C1', 'C2', 'C3', 'E2', 'E3',
        'E5', 'E7', 'E9', 'E10', 'E13', 'E16', 'E17', 'E19', 'E21', 'E22'
    ]
    for column in normalization_columns:
        data[column] = (data[column] - data[column].min(axis=0)) / (
            data[column].max(axis=0) - data[column].min(axis=0))

    sparse_features = [
        'D1', 'D2', 'E4', 'E8', 'E11', 'E15', 'E18', 'E25', 'hour'
    ]
    # BUG FIX: 'E16' and 'E17' were listed twice, producing duplicate
    # DenseFeat columns; each dense feature now appears exactly once.
    dense_features = [
        'E1', 'E2', 'E3', 'E5', 'E6', 'E7', 'E9', 'E10', 'E12', 'E13', 'E14',
        'E16', 'E17', 'E19', 'E20', 'E21', 'E22', 'E23', 'E24', 'A1', 'A2',
        'A3', 'B1', 'B2', 'B3', 'C1', 'C2', 'C3'
    ]
    data[sparse_features] = data[sparse_features].fillna('-1', )
    data[dense_features] = data[dense_features].fillna(0, )

    # 1.Label Encoding for sparse features, and simple transformation for dense features
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
    mms = MinMaxScaler(feature_range=(0, 1))
    data[dense_features] = mms.fit_transform(data[dense_features])

    # 2.count #unique features for each sparse field, and record dense fields
    fixlen_feature_columns = [
        SparseFeat(feat, data[feat].nunique()) for feat in sparse_features
    ] + [DenseFeat(feat, 1) for feat in dense_features]
    dnn_feature_columns = fixlen_feature_columns
    linear_feature_columns = fixlen_feature_columns
    feature_names = get_feature_names(linear_feature_columns +
                                      dnn_feature_columns)
    # ---- end feature engineering ----

    train = data[data.label != -1]
    test = data[data.label == -1]
    del test['label']

    # Move 'label' to the last column of train.
    l = train['label']
    del train['label']
    train['label'] = l
    return train, test, feature_names, linear_feature_columns, dnn_feature_columns
# Split features/target and hold out 20% for evaluation.
feats = [i for i in data.columns if i != 'Rating']
X = data[feats]
y = data['Rating']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

sparse_features = [
    'UserID', 'MovieID', 'Gender', 'Occupation', 'day', 'weekday'
]
dense_features = ['hour', 'Age']

# One SparseFeat per categorical field (embedding_dim=4) plus scalar DenseFeats.
fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=data[feat].nunique(),embedding_dim=4) for feat in sparse_features] + \
    [DenseFeat(feat, 1) for feat in dense_features]
dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns

# NOTE(review): task='multiclass' combined with an 'mse' loss looks
# inconsistent for rating prediction — confirm the installed deepctr version
# accepts this task string.
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='multiclass')
model.compile('adam', 'mse', metrics=['accuracy'])
feature_names = get_feature_names(fixlen_feature_columns)
train_feed_dict = {name: X_train[name] for name in feature_names}
test_feed_dict = {name: X_test[name] for name in feature_names}
model.fit(train_feed_dict, y_train, batch_size=256, epochs=10, validation_split=0.2)
pred_ans = model.predict(test_feed_dict, batch_size=256)
def _preprocess_movielens(df, **kw):
    """Preprocess MovieLens data into deepctr inputs.

    kw flags: `multiple_value` enables the multi-valued 'genres' feature;
    `hash_feature` switches to on-the-fly hashing.

    Returns (df, linear_cols, dnn_cols, train, test, target, ytrue).
    """
    multiple_value = kw.get('multiple_value')
    sparse_col = ["movie_id", "user_id", "gender", "age", "occupation", "zip"]
    target = ['rating']
    # 1.Label Encoding for sparse features,and do simple Transformation for dense features
    for feat in sparse_col:
        lbe = LabelEncoder()
        df[feat] = lbe.fit_transform(df[feat])
    if not multiple_value:
        # 2.count #unique features for each sparse field
        fixlen_cols = [SparseFeat(feat, df[feat].nunique(), embedding_dim=4)
                       for feat in sparse_col]
        linear_cols = fixlen_cols
        dnn_cols = fixlen_cols
        train, test = train_test_split(df, test_size=0.2)
        ytrue = test[target].values
    else:
        ytrue = df[target].values
        hash_feature = kw.get('hash_feature', False)
        if not hash_feature:
            def split(x):
                # Map each '|'-separated genre token to a 1-based index.
                key_ans = x.split('|')
                for key in key_ans:
                    if key not in key2index:
                        # Notice : input value 0 is a special "padding",so we do not use 0 to encode valid feature for sequence input
                        key2index[key] = len(key2index) + 1
                return list(map(lambda x: key2index[x], key_ans))

            # preprocess the sequence feature
            key2index = {}
            genres_list = list(map(split, df['genres'].values))
            genres_length = np.array(list(map(len, genres_list)))
            max_len = max(genres_length)
            # Notice : padding=`post`
            genres_list = pad_sequences(genres_list, maxlen=max_len,
                                        padding='post', )
            fixlen_cols = [SparseFeat(feat, df[feat].nunique(), embedding_dim=4)
                           for feat in sparse_col]
            use_weighted_sequence = False
            if use_weighted_sequence:
                varlen_cols = [VarLenSparseFeat(SparseFeat('genres', vocabulary_size=len(
                    key2index) + 1, embedding_dim=4), maxlen=max_len, combiner='mean',
                    weight_name='genres_weight')]  # Notice : value 0 is for padding for sequence input feature
            else:
                varlen_cols = [VarLenSparseFeat(SparseFeat('genres', vocabulary_size=len(
                    key2index) + 1, embedding_dim=4), maxlen=max_len, combiner='mean',
                    weight_name=None)]  # Notice : value 0 is for padding for sequence input feature
            linear_cols = fixlen_cols + varlen_cols
            dnn_cols = fixlen_cols + varlen_cols
            # generate input data for model
            model_input = {name: df[name] for name in sparse_col}
            # NOTE(review): 'genres' is declared as a varlen column but the
            # assignment below is commented out, so model_input lacks it in
            # this branch — confirm whether that is intentional.
            # model_input["genres"] = genres_list
            model_input["genres_weight"] = np.random.randn(df.shape[0], max_len, 1)
        else:
            df[sparse_col] = df[sparse_col].astype(str)
            # 1.Use hashing encoding on the fly for sparse features,and process sequence features
            genres_list = list(map(lambda x: x.split('|'), df['genres'].values))
            genres_length = np.array(list(map(len, genres_list)))
            max_len = max(genres_length)
            # Notice : padding=`post`
            genres_list = pad_sequences(genres_list, maxlen=max_len,
                                        padding='post', dtype=str, value=0)
            # 2.set hashing space for each sparse field and generate feature config for sequence feature
            fixlen_cols = [
                SparseFeat(feat, df[feat].nunique() * 5, embedding_dim=4,
                           use_hash=True, dtype='string')
                for feat in sparse_col]
            varlen_cols = [
                VarLenSparseFeat(
                    SparseFeat('genres', vocabulary_size=100, embedding_dim=4,
                               use_hash=True, dtype="string"),
                    maxlen=max_len, combiner='mean',
                )]  # Notice : value 0 is for padding for sequence input feature
            linear_cols = fixlen_cols + varlen_cols
            dnn_cols = fixlen_cols + varlen_cols
            feature_names = get_feature_names(linear_cols + dnn_cols)
            # 3.generate input data for model
            model_input = {name: df[name] for name in feature_names}
            model_input['genres'] = genres_list
        # Multi-value path uses the same dict for train and test.
        train, test = model_input, model_input
    return df, linear_cols, dnn_cols, train, test, target, ytrue
def loadData(trainFile, testFile, embedding_dim, multivalue_len, multiClass=False):
    """Load train/test CSVs and build deepctr model inputs and feature columns.

    Relies on module-level names not visible here: a `split` tokenizer and one
    `<feature>Dict` vocabulary per multi-value feature — TODO confirm.

    Returns:
        (train_model_input, train, test_model_input, test,
         dnn_feature_columns, linear_feature_columns, behavior_feature_list)
    """
    train = pd.read_csv(trainFile)
    test = pd.read_csv(testFile)
    ##1. feature type declarion
    sparse_features = [
        "BaseAdGroupId", "Criteria", 'placementType', 'Week', 'IsRestrict',
        'IsNegative', 'AccountTimeZone', 'AccountCurrencyCode',
        'BiddingStrategyType', 'CampaignId', 'Month'
    ]
    dense_features = [
        'adClicks', 'adConversions', 'adCtr', 'adConversionRate',
        'adActiveViewImpressions', 'adActiveViewMeasurability',
        'adActiveViewMeasurableCost', 'adActiveViewViewability',
        'adImpressions', 'adActiveViewCpm', 'adAverageCpc', 'adAverageCpe',
        'adCpcBid', 'adActiveViewMeasurableImpressions', 'adActiveViewCtr',
        'adAverageCpm', 'adAverageCpv', 'adCost', 'plaClicks',
        'plaConversions', 'plaCtr', 'plaConversionRate',
        'plaActiveViewImpressions', 'plaActiveViewMeasurability',
        'plaActiveViewMeasurableCost', 'plaActiveViewViewability',
        'plaImpressions', 'plaCpcBid', 'plaActiveViewMeasurableImpressions',
        'plaActiveViewCtr', 'plaActiveViewCpm', 'plaAverageCpc',
        'plaAverageCpe', 'plaAverageCpm', 'plaAverageCpv', 'plaCost',
        'histListLen'
    ]
    multivalue_features = [
        'locationName', 'languageCode', 'hist_BaseAdGroupId'
    ]
    # NOTE(review): this overrides the longer list declared above, so only
    # these three sparse features are actually used — confirm intent.
    sparse_features = ["BaseAdGroupId", "Criteria", 'placementType']
    target = ['Ctr']
    # 2. Missing value process.
    train[sparse_features + multivalue_features] = train[sparse_features +
                                                         multivalue_features].fillna('-1', )
    train[dense_features + target] = train[dense_features + target].fillna(0, )
    test[sparse_features + multivalue_features] = test[sparse_features +
                                                       multivalue_features].fillna('-1', )
    test[dense_features + target] = test[dense_features + target].fillna(0, )
    # Normalize float-like ids such as "123.0" to "123".
    train["BaseAdGroupId"] = train["BaseAdGroupId"].apply(lambda x: str(
        (int(x))))
    test["BaseAdGroupId"] = test["BaseAdGroupId"].apply(lambda x: str(
        (int(x))))
    # 3. sparse features transformation
    for feat in sparse_features:
        lbe = LabelEncoder()
        train[feat] = lbe.fit_transform(train[feat])
        # NOTE(review): re-fitting on test assigns codes independent of the
        # train mapping — likely should be lbe.transform(test[feat]); confirm.
        test[feat] = lbe.fit_transform(test[feat])
    # 4. dense features transformation (damped log-sqrt for large values)
    for numFeature in dense_features:
        train[numFeature] = train[numFeature].apply(
            lambda x: x if x < 2 else math.sqrt(math.log(x)))
        test[numFeature] = test[numFeature].apply(
            lambda x: x if x < 2 else math.sqrt(math.log(x)))
    # 5. multivalue features transformation
    # exec() materializes <feat>_train_list / <feat>_test_list variables,
    # tokenized via the module-level `split` helper and padded to multivalue_len.
    for feat in multivalue_features:
        exec(
            '{}_train_list = list([split(x,{}Dict) for x in train[feat].values])'
            .format(feat, feat, feat))
        exec(
            '{}_test_list = list([split(x,{}Dict) for x in test[feat].values])'
            .format(feat, feat, feat))
        exec('{}_length = np.array(list(map(len, {}_train_list)))'.format(
            feat, feat))
        exec('{}_maxlen = max({}_length)'.format(feat, feat))
        exec(
            '{}_train_list = pad_sequences({}_train_list, maxlen=multivalue_len, padding="post",)'
            .format(feat, feat, feat))
        exec(
            '{}_test_list = pad_sequences({}_test_list, maxlen=multivalue_len, padding="post",)'
            .format(feat, feat, feat))
    # 6. feature colums
    # Vocabulary covers train and test ids combined.
    fixlen_feature_columns = [
        SparseFeat(feat, vocabulary_size=(train[feat].append(
            test[feat], ignore_index=True)).nunique(),
                   embedding_dim=embedding_dim)
        for i, feat in enumerate(sparse_features)
    ] + [DenseFeat(
        feat,
        1,
    ) for feat in dense_features]
    varlen_feature_columns = []
    for feat in multivalue_features:
        exec(
            'varlen_feature_columns.append(VarLenSparseFeat("{}", maxlen= multivalue_len,vocabulary_size=len({}Dict) + 1,embedding_dim=embedding_dim, combiner="mean",weight_name=None))'
            .format(str(feat), feat))
    dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns
    linear_feature_columns = fixlen_feature_columns
    feature_names = get_feature_names(linear_feature_columns +
                                      dnn_feature_columns)
    # 7.generate input data for model
    train_model_input = {
        name: train[name]
        for name in sparse_features + dense_features
    }
    test_model_input = {
        name: test[name]
        for name in sparse_features + dense_features
    }
    for feat in multivalue_features:
        name = str(feat)
        exec('train_model_input["{}"] = {}_train_list'.format(name, feat))
        exec('test_model_input["{}"] = {}_test_list'.format(name, feat))
    behavior_feature_list = ["BaseAdGroupId"]
    return train_model_input, train, test_model_input, test, dnn_feature_columns, linear_feature_columns, behavior_feature_list
def test_DFM_avazu(data, train, test):
    """Benchmark DeepFM on the avazu dataset across three hyper-parameter sweeps.

    Sweeps activation function, dropout rate and hidden-unit layout
    independently, recording AUC / log-loss / RMSE for each setting, and
    optionally plots the results via ``create_plots``.

    Args:
        data: full dataframe; used only to size embedding vocabularies
            (``nunique`` per sparse field).
        train: training split. NOTE(review): assumes the target is column 0
            and exactly 22 label-encoded sparse fields follow it (columns
            1..22) -- confirm against the preprocessing step.
        test: evaluation split with the same column layout as ``train``.
    """
    print("\nTesting DFM on avazu dataset...\n")

    features_labels = train.columns
    sparse_features_labels = features_labels[1:23]
    target_label = features_labels[0]

    # The same column definitions feed both the linear (FM) side and the
    # DNN side of DeepFM; SparseFeat is a namedtuple, so sharing one list
    # is equivalent to the two identical comprehensions it replaces.
    feature_columns = [
        SparseFeat(feat, vocabulary_size=data[feat].nunique(), embedding_dim=4)
        for feat in sparse_features_labels
    ]
    linear_feature_columns = feature_columns
    dnn_feature_columns = feature_columns

    feature_names = get_feature_names(linear_feature_columns +
                                      dnn_feature_columns)
    train_model_input = {name: train[name] for name in feature_names}
    test_model_input = {name: test[name] for name in feature_names}
    train_y = train[target_label].values
    true_y = test[target_label].values

    def _fit_and_score(model):
        """Compile, train and evaluate one model; return (auc, logloss, rmse)."""
        model.compile(
            "adam",
            "binary_crossentropy",
            metrics=['binary_crossentropy'],
        )
        model.fit(
            train_model_input,
            train_y,
            batch_size=256,
            epochs=10,
            verbose=0,
            validation_split=TEST_PROPORTION,
        )
        pred_y = model.predict(test_model_input, batch_size=256)
        return (compute_auc(true_y, pred_y),
                compute_log_loss(true_y, pred_y),
                compute_rmse(true_y, pred_y))

    def _sweep(header, values, make_model):
        """Run one hyper-parameter sweep; return the per-value metrics dict."""
        results = {"auc": [], "logloss": [], "rmse": []}
        print(header)
        for value in values:
            print("\nTesting {value}...".format(value=value))
            auc, logloss, rmse = _fit_and_score(make_model(value))
            results["auc"].append(auc)
            results["logloss"].append(logloss)
            results["rmse"].append(rmse)
        return results

    results_activation_function = _sweep(
        "\t\t-- ACTIVATION FUNCTIONS --\t\t",
        dnn_activation_list,
        lambda act: DeepFM(linear_feature_columns, dnn_feature_columns,
                           dnn_activation=act, task='binary'))

    results_dropout = _sweep(
        "\t\t-- DROPOUT RATES --\t\t",
        dnn_dropout_list,
        lambda p: DeepFM(linear_feature_columns, dnn_feature_columns,
                         dnn_dropout=p, task='binary'))

    results_number_of_neurons = _sweep(
        "\t\t-- HIDDEN UNITS --\t\t",
        dnn_hidden_units_list,
        lambda units: DeepFM(linear_feature_columns, dnn_feature_columns,
                             dnn_hidden_units=units, task='binary'))

    if PLOT:
        create_plots("DFM", "avazu", results_activation_function,
                     "Activation Function", "activation_func",
                     dnn_activation_list)
        create_plots("DFM", "avazu", results_dropout, "Dropout Rate",
                     "dropout", dnn_dropout_list)
        create_plots("DFM", "avazu", results_number_of_neurons,
                     "Number of Neurons per layer", "nr_neurons",
                     dnn_hidden_units_list)
def run(data, ziel, line0, grid, loop):
    """Train and evaluate grid-searched CTR models on a discretised target.

    Args:
        data: input dataframe. Modified along the way: POI columns are
            renamed to positional aliases, features are encoded/scaled and
            an 'income_0' class-label column is added.
        ziel: name of the continuous target column ("Ziel" is German for
            "target"); it is binned into ordinal classes using ``line0``.
            Assumed free of NaNs (the original row-wise binning also
            required this).
        line0: list of inner bin boundaries for the target.
        grid: hyper-parameter grid forwarded to ``model_gridsearch``.
        loop: number of training repetitions averaged per model.

    Returns:
        Tuple ``(logloss, auc1, xlabel)``: per-model mean log-loss and mean
        AUC over the ``loop`` repetitions, plus the x-axis labels produced
        by the grid search.
    """
    print('++++', '\n', grid)

    # Rename POI columns to stable positional aliases so the downstream
    # feature list does not depend on the raw column spellings.
    poi_feature_transfer = []
    for idx in range(len(poi_feature)):
        alias = 'poi_feature_%d' % idx
        poi_feature_transfer.append(alias)
        data = data.rename(columns={poi_feature[idx]: alias})

    features = ['provname', 'prefname', 'cntyname', 'townname', 'villname',
                'dispincm', 'urbcode_1', 'hauslvl'] + poi_feature_transfer

    # Partition into sparse (categorical) vs dense (numeric) fields.
    # x_category maps a field name to 1 when it is numeric; unknown fields
    # are treated as dense.
    sparse_features = []
    dense_features = []
    for f in features:
        if f not in x_category or x_category[f] == 1:
            dense_features.append(f)
        else:
            sparse_features.append(f)

    data[sparse_features] = data[sparse_features].fillna(-1)
    data[dense_features] = data[dense_features].fillna(0)

    # Discretise the continuous target into len(line0)+1 ordinal classes:
    # class k  <=>  y_limit[k] < value <= y_limit[k+1].
    y_limit = [np.min(data[ziel]) - 1] + line0 + [np.max(data[ziel])]
    # np.digitize(..., right=True) returns i with y_limit[i-1] < v <= y_limit[i];
    # subtracting 1 reproduces the original O(rows*bins) iterrows search
    # in one vectorised call.
    data['income_0'] = np.digitize(data[ziel], y_limit, right=True) - 1
    target = ['income_0']

    # 1. Label-encode sparse features; min-max scale dense features.
    for feat in sparse_features:
        data[feat] = LabelEncoder().fit_transform(data[feat])
    mms = MinMaxScaler(feature_range=(0, 1))
    data[dense_features] = mms.fit_transform(data[dense_features])

    # 2. Count unique values per sparse field and record dense field names.
    fixlen_feature_columns = (
        [SparseFeat(feat, data[feat].nunique()) for feat in sparse_features] +
        [DenseFeat(feat, 1) for feat in dense_features])
    dnn_feature_columns = fixlen_feature_columns
    linear_feature_columns = fixlen_feature_columns
    fixlen_feature_names = get_feature_names(linear_feature_columns +
                                             dnn_feature_columns)

    # 3. Train/test split and model inputs (list-of-columns format).
    train, test = train_test_split(data, test_size=0.2)
    train_model_input = [train[name] for name in fixlen_feature_names]
    test_model_input = [test[name] for name in fixlen_feature_names]

    # 4. Grid-search models; train each `loop` times and average metrics.
    (models, model_names, xlabel) = model_gridsearch(
        linear_feature_columns, dnn_feature_columns, grid)
    logloss, auc1 = [], []
    print(ziel, line0, len(data))
    for name, model in zip(model_names, models):
        ll_avg, auc_avg = [], []
        for _ in range(loop):
            model.compile("adam", 'binary_crossentropy',
                          metrics=['binary_crossentropy'])
            model.fit(train_model_input, train[target].values,
                      batch_size=256, epochs=10, verbose=0,
                      validation_split=0.2)
            pred_ans = model.predict(test_model_input, batch_size=256)
            ll = round(log_loss(test[target].values, pred_ans), 4)
            auc = round(roc_auc_score(test[target].values, pred_ans), 4)
            print("test LogLoss", ll)
            print("test AUC", auc)
            ll_avg.append(ll)
            auc_avg.append(auc)
        logloss.append(np.mean(ll_avg))
        auc1.append(np.mean(auc_avg))
    return (logloss, auc1, xlabel)