def __init__(self, dim, maxlen, indexes):
    """Configure feature columns and compile a single-session DSIN model.

    Args:
        dim: embedding size; also used as the width of each DNN hidden layer.
        maxlen: maximum behavior-session length (``sess_len_max``).
        indexes: sequence whose first three entries are the item, city and
            action vocabulary maps (sized via ``len(...)``).
    """
    self.dim = dim
    self.maxlen = maxlen
    self.item_index = indexes[0]
    self.city_index = indexes[1]
    self.action_index = indexes[2]

    use_hash = True  # feature hashing enabled for every sparse field
    sparse_feats = [
        SingleFeat('item', len(self.item_index) + 1, use_hash),
        SingleFeat('city', len(self.city_index) + 1, use_hash),
        SingleFeat('position', 25 + 1, use_hash),  # positions 0..25, 0 reserved
        SingleFeat('action', len(self.action_index) + 1, use_hash),
    ]
    self.feature_dim_dict = {
        "sparse": sparse_feats,
        "dense": [SingleFeat('price', False)],
    }
    self.behavior_feature_list = ["item", "city", "position", "action"]

    self.model = DSIN(
        self.feature_dim_dict,
        self.behavior_feature_list,
        sess_max_count=1,
        sess_len_max=self.maxlen,
        embedding_size=self.dim,
        att_head_num=1,
        # attention width covers one embedding per behavior feature
        att_embedding_size=self.dim * len(self.behavior_feature_list),
        dnn_hidden_units=[self.dim, self.dim, self.dim],
        dnn_dropout=0.5,
    )
    self.model.compile('adam', 'binary_crossentropy', metrics=['acc'])
def get_xy_fd(use_neg=False, hash_flag=False):
    """Build a tiny toy dataset for a DIN/DIEN-style model.

    Returns ``(x, y, feature_dim_dict, behavior_feature_list)``. When
    ``use_neg`` is True, negative-sampled behavior sequences are inserted
    into ``x`` before the behavior-length array.
    """
    feature_dim_dict = {
        "sparse": [SingleFeat('user', 3, hash_flag),
                   SingleFeat('gender', 2, hash_flag),
                   SingleFeat('item', 3 + 1, hash_flag),
                   SingleFeat('item_gender', 2 + 1, hash_flag)],
        "dense": [SingleFeat('score', 0)],
    }
    behavior_feature_list = ["item", "item_gender"]

    uid = np.array([0, 1, 2])
    ugender = np.array([0, 1, 0])
    iid = np.array([1, 2, 3])        # 0 is the mask value
    igender = np.array([1, 2, 1])    # 0 is the mask value
    score = np.array([0.1, 0.2, 0.3])
    hist_iid = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
    hist_igender = np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]])
    behavior_length = np.array([3, 3, 2])  # valid (non-padded) history length

    feature_dict = {
        'user': uid, 'gender': ugender, 'item': iid, 'item_gender': igender,
        'hist_item': hist_iid, 'hist_item_gender': hist_igender, 'score': score,
    }

    # Input order: sparse, dense, then one history array per behavior feature.
    x = [feature_dict[feat.name] for feat in feature_dim_dict["sparse"]]
    x += [feature_dict[feat.name] for feat in feature_dim_dict["dense"]]
    x += [feature_dict['hist_' + feat] for feat in behavior_feature_list]

    if use_neg:
        feature_dict['neg_hist_item'] = np.array(
            [[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
        feature_dict['neg_hist_item_gender'] = np.array(
            [[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]])
        x += [feature_dict['neg_hist_' + feat] for feat in behavior_feature_list]

    x += [behavior_length]
    y = [1, 0, 1]
    return x, y, feature_dim_dict, behavior_feature_list
def get_xy_fd():
    """Return a minimal fixture with a single behavior feature ('item').

    Produces ``(x, y, feature_dim_dict, behavior_feature_list)`` where x is
    [sparse inputs..., history array, history lengths].
    """
    feature_dim_dict = {
        "sparse": [SingleFeat('user', 4),
                   SingleFeat('gender', 2),
                   SingleFeat('item', 4),
                   SingleFeat('item_gender', 2)],
        "dense": [],
    }
    behavior_feature_list = ["item"]

    feature_dict = {
        'user': np.array([1, 2, 3]),
        'gender': np.array([0, 1, 0]),
        'item': np.array([0, 1, 2]),
        'item_gender': np.array([0, 1, 0]),
        'hist_item': np.array([[0, 1, 2, 3], [0, 1, 2, 3], [0, 1, 2, 3]]),
        'hist_item_gender': np.array([[0, 1, 0, 1], [0, 1, 1, 1], [0, 0, 1, 0]]),
    }
    hist_length = np.array([4, 4, 4])  # every history row is fully used

    x = [feature_dict[feat.name] for feat in feature_dim_dict["sparse"]]
    x += [feature_dict['hist_' + feat] for feat in behavior_feature_list]
    x += [hist_length]
    y = [1, 0, 1]
    return x, y, feature_dim_dict, behavior_feature_list
def sparse_feat_list_gen(data, sparse_features, hashing):
    """Build SingleFeat configs for each sparse column of ``data``.

    With ``hashing`` enabled, the hash space is oversized to 5x the column's
    cardinality and the raw string values are fed straight to the hash;
    otherwise the exact unique-value count is used.
    """
    if hashing:
        return [
            SingleFeat(col, data[col].nunique() * 5, hash_flag=True, dtype='string')
            for col in sparse_features
        ]
    return [SingleFeat(col, data[col].nunique()) for col in sparse_features]
def get_train_and_feature_list(data, sparse_features, multivalue_cols, name_col='C4'):
    """Build hashed sparse-feature configs plus padded multi-value sequence
    inputs.

    Returns (model_input, sparse_feature_list, sequence_feature,
    sequence_input_lens). Mutates ``data`` in place: each multi-value column
    gets ``name_col`` appended as an extra token, and ``name_col`` itself is
    cast to float at the end.
    """
    # NOTE(review): dtype='float32' with hash_flag=True looks inconsistent
    # with the "input is string" comment — confirm against SingleFeat's API.
    sparse_feature_list = [SingleFeat(feat, 1e3, hash_flag=True, dtype='float32')  # since the input is string
                           for feat in sparse_features]
    sequence_feature = []
    sequence_input = []
    sequence_input_lens = []
    for f in multivalue_cols:
        print(data.iloc[0][f])
        print(len(data.columns))
        # Append the row's name_col value as one more '|'-separated token of
        # the multi-value field before splitting.
        data[f] = data[f] + "|" + data[name_col].map(str)
        print(data.iloc[0][f])
        # Split and reverse each token list (most recent token first).
        genres_list = list(map(lambda x: list(reversed(x.split('|'))), data[f].values))
        genres_length = np.array(list(map(len, genres_list)))
        print("{0}: mean len {1}, max len {2}".format(f, np.mean(genres_length), np.max(genres_length)))
        max_len = max(genres_length)
        # Enforce a floor of 51 on the padded length.
        max_len = max(max_len, 51)
        # print(max_len)
        # Notice : padding=`post`
        genres_list = pad_sequences(genres_list, maxlen=max_len, padding='post', dtype=str, value=0)
        # print(genres_list)
        # sequence_feature += [VarLenFeat(f, len(key2index) + 1, max_len, 'mean')]
        sequence_feature += [VarLenFeat(f, 1e3, max_len, 'mean', hash_flag=True, dtype="string")]
        sequence_input.append(genres_list)
        sequence_input_lens.append(max_len)
    data[name_col] = data[name_col].map(float)
    sparse_input = [data[feat.name].values for feat in sparse_feature_list]
    # NOTE(review): genres_length here is from the LAST multivalue column
    # only — confirm this is intended when len(multivalue_cols) > 1.
    model_input = sparse_input + sequence_input + [genres_length]
    # print("eseseswes {0}".format(sequence_input))
    return model_input, sparse_feature_list, sequence_feature, sequence_input_lens
def get_xy_fd(hash_flag=False):
    """Build a tiny two-session DSIN fixture.

    Returns ``(x, y, feature_dim_dict, behavior_feature_list)`` where x is
    [sparse..., dense..., session-1 behaviors, session-2 behaviors,
    session count].
    """
    feature_dim_dict = {
        "sparse": [SingleFeat('user', 3, hash_flag),
                   SingleFeat('gender', 2, hash_flag),
                   SingleFeat('item', 3 + 1, hash_flag),
                   SingleFeat('item_gender', 2 + 1, hash_flag)],
        "dense": [SingleFeat('score', 0)],
    }
    behavior_feature_list = ["item", "item_gender"]

    # 0 is reserved as the padding/mask value in all id arrays below.
    feature_dict = {
        'user': np.array([0, 1, 2]),
        'gender': np.array([0, 1, 0]),
        'item': np.array([1, 2, 3]),
        'item_gender': np.array([1, 2, 1]),
        'score': np.array([0.1, 0.2, 0.3]),
        'sess1_item': np.array([[1, 2, 3, 0], [1, 2, 3, 0], [0, 0, 0, 0]]),
        'sess1_item_gender': np.array([[1, 1, 2, 0], [2, 1, 1, 0], [0, 0, 0, 0]]),
        'sess2_item': np.array([[1, 2, 3, 0], [0, 0, 0, 0], [0, 0, 0, 0]]),
        'sess2_item_gender': np.array([[1, 1, 2, 0], [0, 0, 0, 0], [0, 0, 0, 0]]),
    }
    sess_number = np.array([2, 1, 0])  # valid sessions per sample

    x = [feature_dict[feat.name] for feat in feature_dim_dict["sparse"]]
    x += [feature_dict[feat.name] for feat in feature_dim_dict["dense"]]
    for prefix in ('sess1_', 'sess2_'):
        x += [feature_dict[prefix + feat] for feat in behavior_feature_list]
    x += [sess_number]
    y = [1, 0, 1]
    return x, y, feature_dim_dict, behavior_feature_list
def get_test_data(sample_size=1000, sparse_feature_num=1, dense_feature_num=1,
                  sequence_feature=('max', 'mean', 'sum'), classification=True,
                  include_length=False):
    """Generate random model inputs and targets for unit tests.

    Returns ``(x, y, feature_dim_dict)``. ``x`` is ordered
    [sparse..., dense..., sequence..., optional sequence lengths]; ``y`` is
    binary labels when ``classification`` else uniform floats.
    """
    feature_dim_dict = {"sparse": [], 'dense': [], 'sequence': []}

    # Randomly sized vocabularies / sequence lengths in [1, 10).
    for i in range(sparse_feature_num):
        vocab = np.random.randint(1, 10)
        feature_dim_dict['sparse'].append(SingleFeat('sparse_' + str(i), vocab))
    for i in range(dense_feature_num):
        feature_dim_dict['dense'].append(SingleFeat('dense_' + str(i), 0))
    for i, mode in enumerate(sequence_feature):
        vocab = np.random.randint(1, 10)
        seq_max = np.random.randint(1, 10)
        feature_dim_dict['sequence'].append(
            VarLenFeat('sequence_' + str(i), vocab, seq_max, mode))

    sparse_input = [np.random.randint(0, dim, sample_size)
                    for feat, dim in feature_dim_dict['sparse']]
    dense_input = [np.random.random(sample_size)
                   for name in feature_dim_dict['dense']]

    sequence_input = []
    sequence_len_input = []
    for var in feature_dim_dict['sequence']:
        s_input, s_len_input = gen_sequence(var.dimension, var.maxlen,
                                            sample_size)
        sequence_input.append(s_input)
        sequence_len_input.append(s_len_input)

    if classification:
        y = np.random.randint(0, 2, sample_size)
    else:
        y = np.random.random(sample_size)

    x = sparse_input + dense_input + sequence_input
    if include_length:
        x += sequence_len_input
    return x, y, feature_dim_dict
def get_xy_fd():
    """Build a tiny DIN fixture without a behavior-length input.

    Returns ``(x, y, feature_dim_dict, behavior_feature_list)`` where x is
    [sparse..., dense..., one history array per behavior feature].
    """
    feature_dim_dict = {
        "sparse": [SingleFeat('user', 3),
                   SingleFeat('gender', 2),
                   SingleFeat('item', 3 + 1),
                   SingleFeat('item_gender', 2 + 1)],
        "dense": [SingleFeat('score', 0)],
    }
    behavior_feature_list = ["item", "item_gender"]

    # 0 is reserved as the padding/mask value in the id arrays below.
    feature_dict = {
        'user': np.array([0, 1, 2]),
        'gender': np.array([0, 1, 0]),
        'item': np.array([1, 2, 3]),
        'item_gender': np.array([1, 2, 1]),
        'hist_item': np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]]),
        'hist_item_gender': np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]]),
        'score': np.array([0.1, 0.2, 0.3]),
    }

    x = [feature_dict[feat.name] for feat in feature_dim_dict["sparse"]]
    x += [feature_dict[feat.name] for feat in feature_dim_dict["dense"]]
    x += [feature_dict['hist_' + feat] for feat in behavior_feature_list]
    y = [1, 0, 1]
    return x, y, feature_dim_dict, behavior_feature_list
def get_feature_list(sparse_features, multivalue_cols, max_len=50):
    """Return hashed feature configs.

    Gives ``(sparse SingleFeat list, multi-value VarLenFeat list)``; every
    feature uses a fixed 1000-slot hash space over string inputs, and each
    multi-value feature is mean-pooled over at most ``max_len`` tokens.
    """
    singles = [
        SingleFeat(col, 1000, hash_flag=True, dtype='string')  # input is string
        for col in sparse_features
    ]
    var_lens = []
    for col in multivalue_cols:
        var_lens.append(
            VarLenFeat(col, 1000, max_len, 'mean', hash_flag=True,
                       dtype="string"))
    return singles, var_lens
def test_DCN_invalid(embedding_size=8, cross_num=0, hidden_size=()):
    """DCN construction must raise ValueError when both the cross part
    (cross_num=0) and the deep part (empty dnn_hidden_units) are disabled."""
    # NOTE(review): the three dense feats all reuse the name 'dense_1' —
    # presumably a copy-paste slip; kept as-is since the test only checks
    # that construction raises.
    feature_dim_dict = {
        'sparse': [SingleFeat('sparse_1', 2),
                   SingleFeat('sparse_2', 5),
                   SingleFeat('sparse_3', 10)],
        'dense': [SingleFeat('dense_1', 1),
                  SingleFeat('dense_1', 1),
                  SingleFeat('dense_1', 1)],
    }
    with pytest.raises(ValueError):
        _ = DCN(feature_dim_dict,
                embedding_size=embedding_size,
                cross_num=cross_num,
                dnn_hidden_units=hidden_size,
                dnn_dropout=0.5)
sparse_features = [ 'userid', 'adgroup_id', 'pid', 'cms_segid', 'cms_group_id', 'final_gender_code', 'age_level', 'pvalue_level', 'shopping_level', 'occupation', 'new_user_class_level', 'campaign_id', 'customer' ] dense_features = ['price'] for feat in tqdm(sparse_features): lbe = LabelEncoder() # or Hash data[feat] = lbe.fit_transform(data[feat]) mms = StandardScaler() data[dense_features] = mms.fit_transform(data[dense_features]) sparse_feature_list = [ SingleFeat(feat, data[feat].nunique() + 1) for feat in sparse_features + ['cate_id', 'brand'] ] dense_feature_list = [SingleFeat(feat, 1) for feat in dense_features] sess_feature = ['cate_id', 'brand'] sess_input = [ pad_sequences(sess_input_dict[feat], maxlen=DIN_SESS_MAX_LEN, padding='post') for feat in sess_feature ] neg_sess_input = [ pad_sequences(neg_sess_input_dict[feat], maxlen=DIN_SESS_MAX_LEN, padding='post') for feat in sess_feature
# Preprocess sparse/dense columns and split model inputs for train/test.
# Missing sparse values become the sentinel string '-1'; missing dense
# values become 0.
data[sparse_features] = data[sparse_features].fillna('-1', )
data[dense_features] = data[dense_features].fillna(0, )
target = ['label']

# 1.Label Encoding for sparse features,and do simple Transformation for dense features
for feat in sparse_features:
    lbe = LabelEncoder()
    data[feat] = lbe.fit_transform(data[feat])
mms = MinMaxScaler(feature_range=(0, 1))
data[dense_features] = mms.fit_transform(data[dense_features])

# 2.count #unique features for each sparse field,and record dense feature field name
sparse_feature_list = [
    SingleFeat(feat, data[feat].nunique()) for feat in sparse_features
]
# Dense features use dimension 0 (marker for "not an embedding field").
dense_feature_list = [SingleFeat(
    feat, 0, ) for feat in dense_features]

# 3.generate input data for model
train, test = train_test_split(data, test_size=0.2)
# Input order must match the feature lists: all sparse columns, then dense.
train_model_input = [train[feat.name].values for feat in sparse_feature_list] + \
    [train[feat.name].values for feat in dense_feature_list]
test_model_input = [test[feat.name].values for feat in sparse_feature_list] + \
    [test[feat.name].values for feat in dense_feature_list]

# 4.Define Model,train,predict and evaluate
# key2index = {} # df_train[col] = df_train[col].apply(str) # # all_data[col] = all_data[col].apply(str) # lst = list(map(split, df_train[col].values)) # lst_all = list(map(split, df_train[col].values)) # lst_length = np.array(list(map(len, lst_all))) # max_len = max(lst_length) # # Notice : padding=`post` # lst = pad_sequences(lst, maxlen=max_len, padding='post') # multi_values_input += [lst] # # 2.count #unique features for each sparse field and generate feature config for sequence feature # sequence_feature += [VarLenFeat(col, len(key2index) + 1, max_len, 'mean')] # # Notice : value 0 is for padding for sequence input feature sparse_feat_list = [ SingleFeat(feat, df_train[feat].nunique()) for feat in sparse_features ] dense_feat_list = [SingleFeat(feat, 0) for feat in dense_features] # 3.generate input data for model sparse_input = [df_train[feat.name].values for feat in sparse_feat_list] dense_input = [df_train[feat.name].values for feat in dense_feat_list] model_input = sparse_input + dense_input + multi_values_input print(model_input) # print(model_input.shape) # 4.Define Model,compile and train model = DeepFM( { "sparse": sparse_feat_list, "dense": dense_feat_list,
data = pd.merge(sample_sub, user, how='left', on='userid', ) data = pd.merge(data, ad, how='left', on='adgroup_id') sparse_features = ['userid', 'adgroup_id', 'pid', 'cms_segid', 'cms_group_id', 'final_gender_code', 'age_level', 'pvalue_level', 'shopping_level', 'occupation', 'new_user_class_level', 'campaign_id', 'customer'] dense_features = ['price'] for feat in tqdm(sparse_features): lbe = LabelEncoder() # or Hash data[feat] = lbe.fit_transform(data[feat]) mms = StandardScaler() data[dense_features] = mms.fit_transform(data[dense_features]) sparse_feature_list = [SingleFeat(feat, data[feat].max( ) + 1) for feat in sparse_features + ['cate_id', 'brand']] dense_feature_list = [SingleFeat(feat, 1) for feat in dense_features] sess_feature = ['cate_id', 'brand'] sess_input = [pad_sequences( sess_input_dict[feat], maxlen=DIN_SESS_MAX_LEN, padding='post') for feat in sess_feature] neg_sess_input = [pad_sequences(neg_sess_input_dict[feat], maxlen=DIN_SESS_MAX_LEN, padding='post') for feat in sess_feature] model_input = [data[feat.name].values for feat in sparse_feature_list] + \ [data[feat.name].values for feat in dense_feature_list] sess_lists = sess_input + neg_sess_input + [np.array(sess_input_length)] model_input += sess_lists if not os.path.exists('../model_input/'):
genres_list = list(map(lambda x: x.split('|'), data['genres'].values)) genres_length = np.array(list(map(len, genres_list))) max_len = max(genres_length) # Notice : padding=`post` genres_list = pad_sequences(genres_list, maxlen=max_len, padding='post', dtype=str, value=0) # 2.set hashing space for each sparse field and generate feature config for sequence feature sparse_feat_list = [ SingleFeat(feat, data[feat].nunique() * 5, hash_flag=True, dtype='string') for feat in sparse_features ] sequence_feature = [ VarLenFeat('genres', 100, max_len, 'mean', hash_flag=True, dtype="string") ] # Notice : value 0 is for padding for sequence input feature # 3.generate input data for model sparse_input = [data[feat.name].values for feat in sparse_feat_list] dense_input = [] sequence_input = [genres_list] model_input = sparse_input + dense_input + \ sequence_input # make sure the order is right # 4.Define Model,compile and train model = DeepFM({
# Preprocess columns, build feature configs, and construct a DeepFM model.
# Missing sparse values become the sentinel string '-1'; missing dense
# values become 0.
data[sparse_features] = data[sparse_features].fillna('-1', )
data[dense_features] = data[dense_features].fillna(0, )
target = ['label']

# 1.Label Encoding for sparse features,and do simple Transformation for dense features
for feat in sparse_features:
    lbe = LabelEncoder()
    # Cast to compact dtypes up front to shrink the in-memory frame.
    data[feat] = lbe.fit_transform(data[feat]).astype(np.int32)
mms = MinMaxScaler(feature_range=(0, 1))
data[dense_features] = mms.fit_transform(data[dense_features]).astype(np.float32)

# 2.count #unique features for each sparse field,and record dense feature field name
sparse_feature_list = [SingleFeat(feat, data[feat].nunique())
                       for feat in sparse_features]
# Dense features use dimension 0 (marker for "not an embedding field").
dense_feature_list = [SingleFeat(feat, 0,) for feat in dense_features]

# 3.generate input data for model
train, test = train_test_split(data, test_size=0.2)
# Input order must match the feature lists: all sparse columns, then dense.
train_model_input = [train[feat.name].values for feat in sparse_feature_list] + \
    [train[feat.name].values for feat in dense_feature_list]
test_model_input = [test[feat.name].values for feat in sparse_feature_list] + \
    [test[feat.name].values for feat in dense_feature_list]

# 4.Define Model,train,predict and evaluate
model = DeepFM({"sparse": sparse_feature_list,
                "dense": dense_feature_list},
               task='binary', embedding_size=4, dnn_hidden_units=(64, 64))
if __name__ == "__main__": data = pd.read_csv("./movielens_sample.txt") sparse_features = [ "movie_id", "user_id", "gender", "age", "occupation", "zip" ] target = ['rating'] # 1.Label Encoding for sparse features,and do simple Transformation for dense features for feat in sparse_features: lbe = LabelEncoder() data[feat] = lbe.fit_transform(data[feat]) # 2.count #unique features for each sparse field sparse_feat_list = [ SingleFeat(feat, data[feat].nunique()) for feat in sparse_features ] # 3.generate input data for model train, test = train_test_split(data, test_size=0.2) train_model_input = [train[feat.name].values for feat in sparse_feat_list] test_model_input = [test[feat.name].values for feat in sparse_feat_list] # 4.Define Model,train,predict and evaluate model = DeepFM({"sparse": sparse_feat_list}, task='regression') model.compile( "adam", "mse", metrics=['mse'], ) history = model.fit(
# Load the raw TSV and prepare model inputs using feature hashing for the
# sparse fields (no label encoding needed — raw strings are hashed).
data = pd.read_csv(file, sep='\t', header=None, names=names, dtype=dtypes)
data[sparse_features] = data[sparse_features].fillna('-1', )
data[dense_features] = data[dense_features].fillna(0, )
target = ['label']

# 1.do simple Transformation for dense features
mms = MinMaxScaler(feature_range=(0, 1))
data[dense_features] = mms.fit_transform(data[dense_features])

# 2.set hashing space for each sparse field,and record dense feature field name
sparse_feature_list = [
    SingleFeat(feat, 1000, hash_flag=True, dtype='string')  # since the input is string
    for feat in sparse_features
]
# Dense features use dimension 0 (marker for "not an embedding field").
dense_feature_list = [SingleFeat(
    feat, 0, ) for feat in dense_features]

# 3.generate input data for model
train, test = train_test_split(data, test_size=0.2)
# Input order must match the feature lists: all sparse columns, then dense.
train_model_input = [train[feat.name].values for feat in sparse_feature_list] + \
    [train[feat.name].values for feat in dense_feature_list]
test_model_input = [test[feat.name].values for feat in sparse_feature_list] + \
    [test[feat.name].values for feat in dense_feature_list]
from deepctr.models import DeepFM, DCN from deepctr.utils import SingleFeat #对稠密特征归一化 mms = MinMaxScaler(feature_range=(0, 1)) X_train[dense_features] = mms.fit_transform(X_train[dense_features]) X_test[dense_features] = mms.transform(X_test[dense_features]) #对稀疏特征编码 for feat in sparse_features: lbe = LabelEncoder() X_train[feat] = lbe.fit_transform(X_train[feat]) X_test[feat] = lbe.transform(X_test[feat]) sparse_feature_list = [ SingleFeat(feat, X_train[feat].nunique()) # since the input is string for feat in sparse_features ] dense_feature_list = [SingleFeat( feat, 0, ) for feat in dense_features] train_model_input = [X_train[feat.name].values for feat in sparse_feature_list] + \ [X_train[feat.name].values for feat in dense_feature_list] test_model_input = [X_test[feat.name].values for feat in sparse_feature_list] + \ [X_test[feat.name].values for feat in dense_feature_list] model = DeepFM({ "sparse": sparse_feature_list, "dense": dense_feature_list