def get_feature_columns(self):
    """Load the stage sample CSV and build DeepCTR feature columns.

    Reads the concatenated sample file for ``self.stage``/``self.action``,
    label-encodes the sparse id columns in-place on ``self.df``, and returns
    ``(dnn_feature_columns, linear_feature_columns)``.
    """
    file_name = "{stage}_{action}_{day}_concate_sample.csv".format(
        stage=self.stage, action=self.action, day=STAGE_END_DAY[self.stage])
    stage_dir = os.path.join(FLAGS.root_path, self.stage, file_name)
    self.df = pd.read_csv(stage_dir)

    sparse_features = ["userid", "feedid", "authorid", "bgm_singer_id", "bgm_song_id"]
    # Missing ids become the literal string '-1' so LabelEncoder treats them
    # as one extra category.
    self.df[sparse_features] = self.df[sparse_features].fillna('-1')
    for feat in sparse_features:
        lbe = LabelEncoder()
        self.df[feat] = lbe.fit_transform(self.df[feat])

    # Sparse id embeddings feed the DNN tower.
    # NOTE(review): dtype=str looks odd after label encoding (values are ints)
    # — kept as-is; confirm against the SparseFeat API in use.
    dnn_feature_columns = [
        SparseFeat(feat, self.df[feat].nunique(), FLAGS.embed_dim, dtype=str)
        for feat in sparse_features
    ]

    # Dense features feed the linear part: playback seconds, device, and the
    # per-action behaviour statistics (feed-level and user-level sums).
    linear_feature_columns = [DenseFeat(name='videoplayseconds'),
                              DenseFeat(name='device')]
    for b in FEA_COLUMN_LIST:
        linear_feature_columns.append(DenseFeat(b + "sum"))
        linear_feature_columns.append(DenseFeat(b + "sum_user"))

    return dnn_feature_columns, linear_feature_columns
def get_test_data(sample_size=1000, sparse_feature_num=1, dense_feature_num=1,
                  sequence_feature=('sum', 'mean', 'max'), classification=True,
                  include_length=False, hash_flag=False, prefix=''):
    """Generate random inputs, labels and feature columns for model tests.

    Returns ``(x, y, feature_columns)`` where ``x`` is a list of arrays:
    sparse/dense inputs first, then sequence inputs, then (when
    ``include_length``) the sequence-length arrays.
    """
    feature_columns = []
    for i in range(sparse_feature_num):
        dim = np.random.randint(1, 10)
        feature_columns.append(
            SparseFeat(prefix + 'sparse_feature_' + str(i), dim, hash_flag, torch.int32))
    for i in range(dense_feature_num):
        feature_columns.append(
            DenseFeat(prefix + 'dense_feature_' + str(i), 1, torch.float32))
    for i, mode in enumerate(sequence_feature):
        dim = np.random.randint(1, 10)
        maxlen = np.random.randint(1, 10)
        feature_columns.append(
            VarLenSparseFeat(prefix + 'sequence_' + str(i), dim, maxlen, mode))

    model_input = []
    sequence_input = []
    sequence_len_input = []
    for fc in feature_columns:
        if isinstance(fc, SparseFeat):
            model_input.append(np.random.randint(0, fc.dimension, sample_size))
        elif isinstance(fc, DenseFeat):
            model_input.append(np.random.random(sample_size))
        else:
            s_input, s_len_input = gen_sequence(fc.dimension, fc.maxlen, sample_size)
            sequence_input.append(s_input)
            sequence_len_input.append(s_len_input)

    if classification:
        y = np.random.randint(0, 2, sample_size)
        # Re-draw until at least 30% positives so AUC-style metrics are defined.
        while sum(y) < 0.3 * sample_size:
            y = np.random.randint(0, 2, sample_size)
    else:
        y = np.random.random(sample_size)

    x = model_input + sequence_input

    if include_length:
        # FIX: the original also drew random `dim`/`maxlen` here that were
        # never used — dead code that only consumed RNG state; removed.
        for i in range(len(sequence_feature)):
            feature_columns.append(
                SparseFeat(prefix + 'sequence_' + str(i) + '_seq_length', 1,
                           embedding=False))
        x += sequence_len_input
    return x, y, feature_columns
def fit_test(self, train_X, train_Y, val_X, val_Y, test_X, test_Y, cat_cols):
    """Train an xDeepFM on numpy feature matrices and print the test log-loss.

    Columns listed in ``cat_cols`` are treated as sparse (categorical), the
    remainder as dense; column indices double as feature names. Sets
    ``self.device`` and ``self.model`` as side effects.
    """
    sparse_features = cat_cols
    dense_features = [idx for idx in range(train_X.shape[1]) if idx not in cat_cols]

    sparse_feature_columns = [
        SparseFeat(str(feat),
                   # +1 leaves room for an unseen/padding index.
                   vocabulary_size=len(set(train_X[:, feat])) + 1,
                   embedding_dim=4)
        for feat in sparse_features
    ]
    dense_feature_columns = [DenseFeat(str(feat), 1) for feat in dense_features]
    dnn_feature_columns = sparse_feature_columns + dense_feature_columns
    linear_feature_columns = sparse_feature_columns + dense_feature_columns
    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

    train_model_input = {name: train_X[:, int(name)] for name in feature_names}
    val_model_input = {name: val_X[:, int(name)] for name in feature_names}
    test_model_input = {name: test_X[:, int(name)] for name in feature_names}

    # BUG FIX: previously self.device was assigned only when CUDA was
    # available, so CPU-only machines hit an AttributeError below.
    self.device = 'cpu'
    use_cuda = True
    if use_cuda and torch.cuda.is_available():
        print('cuda ready...')
        self.device = 'cuda:0'

    self.model = xDeepFM(linear_feature_columns, dnn_feature_columns,
                         task='binary', device=self.device)
    self.model.compile(
        Adam(self.model.parameters(), 0.0001),
        "binary_crossentropy",
        metrics=['binary_crossentropy'],
    )
    # NOTE(review): `es` is created but never passed to fit() — confirm whether
    # the custom fit supports callbacks or this is leftover code.
    es = EarlyStopping(monitor='val_binary_crossentropy', min_delta=0,
                       verbose=1, patience=30, mode='min')
    lbe = LabelEncoder()
    self.model.fit(train_model_input, lbe.fit_transform(train_Y),
                   batch_size=512, epochs=21, verbose=2,
                   validation_data=(val_model_input, lbe.transform(val_Y)))
    pred_ans = self.model.predict(test_model_input, batch_size=256)
    print(f'{log_loss(test_Y, pred_ans):.5f}')
def get_xy_fd(hash_flag=False):
    """Construct a tiny fixed behavior-sequence toy dataset.

    Returns ``(x, y, feature_columns, behavior_feature_list)`` where ``x``
    maps feature names to numpy arrays for four users with length-4 item /
    category histories (0 is the padding / mask value).
    """
    base_columns = [
        SparseFeat('user', 4, embedding_dim=4, use_hash=hash_flag),
        SparseFeat('gender', 2, embedding_dim=4, use_hash=hash_flag),
        SparseFeat('item_id', 3 + 1, embedding_dim=8, use_hash=hash_flag),
        SparseFeat('cate_id', 2 + 1, embedding_dim=4, use_hash=hash_flag),
        DenseFeat('pay_score', 1),
    ]
    # History sequences share embeddings with their base id features via
    # embedding_name; both reuse the common length feature "seq_length".
    history_columns = [
        VarLenSparseFeat(SparseFeat('hist_item_id', vocabulary_size=3 + 1,
                                    embedding_dim=8, embedding_name='item_id'),
                         maxlen=4, length_name="seq_length"),
        VarLenSparseFeat(SparseFeat('hist_cate_id', vocabulary_size=2 + 1,
                                    embedding_dim=4, embedding_name='cate_id'),
                         maxlen=4, length_name="seq_length"),
    ]
    feature_columns = base_columns + history_columns
    behavior_feature_list = ["item_id", "cate_id"]

    feature_dict = {
        'user': np.array([0, 1, 2, 3]),
        'gender': np.array([0, 1, 0, 1]),
        'item_id': np.array([1, 2, 3, 2]),        # 0 is the mask value
        'cate_id': np.array([1, 2, 1, 2]),        # 0 is the mask value
        'pay_score': np.array([0.1, 0.2, 0.3, 0.2]),
        'hist_item_id': np.array([[1, 2, 3, 0], [1, 2, 3, 0],
                                  [1, 2, 0, 0], [1, 2, 0, 0]]),
        'hist_cate_id': np.array([[1, 1, 2, 0], [2, 1, 1, 0],
                                  [2, 1, 0, 0], [1, 2, 0, 0]]),
        "seq_length": np.array([3, 3, 2, 2]),     # true history lengths
    }
    x = {name: feature_dict[name] for name in get_feature_names(feature_columns)}
    y = np.array([1, 0, 1, 0])
    return x, y, feature_columns, behavior_feature_list
def get_test_data(sample_size=1000, embedding_size=4, sparse_feature_num=1,
                  dense_feature_num=1, sequence_feature=('sum', 'mean', 'max'),
                  classification=True, include_length=False, hash_flag=False,
                  prefix=''):
    """Generate a random model-input dict, labels and feature columns.

    Returns ``(model_input, y, feature_columns)`` where ``model_input`` maps
    feature names to numpy arrays. A ``'weight'`` entry in
    ``sequence_feature`` adds a weighted variable-length feature.
    """
    # BUG FIX: the original used a mutable list default and .pop()'d 'weight'
    # out of it, permanently mutating the default across calls. Work on a copy
    # (the default is now an immutable tuple).
    sequence_feature = list(sequence_feature)
    feature_columns = []
    model_input = {}

    if 'weight' in sequence_feature:
        feature_columns.append(
            VarLenSparseFeat(SparseFeat(prefix + "weighted_seq",
                                        vocabulary_size=2,
                                        embedding_dim=embedding_size),
                             maxlen=3,
                             length_name=prefix + "weighted_seq" + "_seq_length",
                             weight_name=prefix + "weight"))
        s_input, s_len_input = gen_sequence(2, 3, sample_size)
        model_input[prefix + "weighted_seq"] = s_input
        model_input[prefix + 'weight'] = np.random.randn(sample_size, 3, 1)
        model_input[prefix + "weighted_seq" + "_seq_length"] = s_len_input
        sequence_feature.remove('weight')

    for i in range(sparse_feature_num):
        dim = np.random.randint(1, 10)
        feature_columns.append(
            SparseFeat(prefix + 'sparse_feature_' + str(i), dim, embedding_size,
                       dtype=torch.int32))
    for i in range(dense_feature_num):
        feature_columns.append(
            DenseFeat(prefix + 'dense_feature_' + str(i), 1, dtype=torch.float32))
    for i, mode in enumerate(sequence_feature):
        dim = np.random.randint(1, 10)
        maxlen = np.random.randint(1, 10)
        feature_columns.append(
            VarLenSparseFeat(SparseFeat(prefix + 'sequence_' + mode,
                                        vocabulary_size=dim,
                                        embedding_dim=embedding_size),
                             maxlen=maxlen, combiner=mode))

    for fc in feature_columns:
        if isinstance(fc, SparseFeat):
            model_input[fc.name] = np.random.randint(0, fc.vocabulary_size, sample_size)
        elif isinstance(fc, DenseFeat):
            model_input[fc.name] = np.random.random(sample_size)
        else:
            s_input, s_len_input = gen_sequence(fc.vocabulary_size, fc.maxlen,
                                                sample_size)
            model_input[fc.name] = s_input
            if include_length:
                # BUG FIX: the original keyed every length array on the stale
                # loop index `i`, so multiple sequence features overwrote each
                # other's lengths. Derive the key from the feature's own name.
                length_key = fc.name + '_seq_length'
                fc.length_name = length_key
                model_input[length_key] = s_len_input

    if classification:
        y = np.random.randint(0, 2, sample_size)
    else:
        y = np.random.random(sample_size)
    return model_input, y, feature_columns
def get_xy_fd():
    """Build a fixed three-sample toy dataset for behavior-sequence models.

    Returns ``(x, y, feature_columns, behavior_feature_list)``; history
    arrays are padded to length 4 with 0 as the mask value, and
    ``"seq_length"`` holds the true history lengths.
    """
    feature_columns = [
        SparseFeat('user', 3, embedding_dim=8),
        SparseFeat('gender', 2, embedding_dim=8),
        SparseFeat('item', 3 + 1, embedding_dim=8),
        SparseFeat('item_gender', 2 + 1, embedding_dim=8),
        DenseFeat('score', 1),
        # Both history features share the "seq_length" length feature.
        VarLenSparseFeat(SparseFeat('hist_item', 3 + 1, embedding_dim=8),
                         4, length_name="seq_length"),
        VarLenSparseFeat(SparseFeat('hist_item_gender', 2 + 1, embedding_dim=8),
                         4, length_name="seq_length"),
    ]
    behavior_feature_list = ["item", "item_gender"]

    feature_dict = {
        'user': np.array([0, 1, 2]),
        'gender': np.array([0, 1, 0]),
        'item': np.array([1, 2, 3]),              # 0 is the mask value
        'item_gender': np.array([1, 2, 1]),       # 0 is the mask value
        'score': np.array([0.1, 0.2, 0.3]),
        'hist_item': np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]]),
        'hist_item_gender': np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]]),
        "seq_length": np.array([3, 3, 2]),
    }
    x = {name: feature_dict[name] for name in get_feature_names(feature_columns)}
    y = np.array([1, 0, 1])
    return x, y, feature_columns, behavior_feature_list
# Dense NaNs become 0.0 before scaling.
# (Assumes `data`, `sparse_features`, `dense_features` were defined earlier in
# the script — not visible in this chunk.)
data[dense_features] = data[dense_features].fillna(0, )
target = ['label']

# 1. Label-encode sparse features; min-max scale dense features to [0, 1].
for feat in sparse_features:
    lbe = LabelEncoder()
    data[feat] = lbe.fit_transform(data[feat])
mms = MinMaxScaler(feature_range=(0, 1))
data[dense_features] = mms.fit_transform(data[dense_features])

# 2. One SparseFeat per sparse field (sized by its cardinality) plus one
#    DenseFeat per dense field. The same columns feed both towers.
fixlen_feature_columns = [
    SparseFeat(feat, data[feat].nunique()) for feat in sparse_features
] + [DenseFeat(
    feat,
    1,
) for feat in dense_features]
dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

# 3. Generate input data for the model.
# shuffle=False keeps the original row order — a chronological-style split.
train, test = train_test_split(data,
                               test_size=0.2,
                               random_state=2020,
                               shuffle=False)
# litez: a dictionary of pd.Series keyed by feature name.
train_model_input = {name: train[name] for name in feature_names}
def task(action):
    """Train a DeepFM for one action label and return test predictions.

    Loads the per-action train csv plus the shared test csv, label-encodes
    sparse columns, min-max scales dense columns, fits MyDeepFM and returns
    the predicted probabilities for the test rows.
    """
    print('-----------action-----------', action)
    USE_FEAT = [action] + SELECT_FRTS
    train = pd.read_csv(ROOT_PATH + f'/train_data_for_{action}.csv')[USE_FEAT]
    # Shuffle rows with a fixed seed for reproducibility.
    train = train.sample(frac=1, random_state=42).reset_index(drop=True)
    print("posi prop:")
    print(sum((train[action] == 1) * 1) / train.shape[0])
    test = pd.read_csv(ROOT_PATH + '/test_data.csv')[SELECT_FRTS]
    target = [action]
    # Test rows get a dummy label so train/test can be concatenated and
    # preprocessed together.
    test[target[0]] = 0
    test = test[USE_FEAT]
    data = pd.concat((train, test)).reset_index(drop=True)
    print(train.shape, test.shape, data.shape)

    dense_features = DENSE_FEATURE
    sparse_features = [i for i in USE_FEAT
                       if i not in dense_features and i not in target]
    data[sparse_features] = data[sparse_features].fillna(0)
    data[dense_features] = data[dense_features].fillna(0)

    # 1. Label-encode sparse features; min-max scale dense features.
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
    mms = MinMaxScaler(feature_range=(0, 1))
    data[dense_features] = mms.fit_transform(data[dense_features])

    # 2. Count unique values per sparse field; record dense field names.
    fixlen_feature_columns = [
        SparseFeat(feat, data[feat].nunique()) for feat in sparse_features
    ] + [DenseFeat(feat, 1) for feat in dense_features]
    # BUG FIX: this assignment was commented out in the original, but
    # dnn_feature_columns is used immediately below and by the model —
    # the function raised a NameError.
    dnn_feature_columns = fixlen_feature_columns
    linear_feature_columns = fixlen_feature_columns
    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

    # 3. Split the concatenated frame back into train/test model inputs
    # (RHS is evaluated before `train` is rebound, so train.shape is the
    # original train length).
    train, test = (data.iloc[:train.shape[0]].reset_index(drop=True),
                   data.iloc[train.shape[0]:].reset_index(drop=True))
    train_model_input = {name: train[name] for name in feature_names}
    test_model_input = {name: test[name] for name in feature_names}

    # eval_ratio of 0. means no hold-out rows: eval_df is empty.
    eval_ratio = 0.
    eval_df = train[int((1 - eval_ratio) * train.shape[0]):].reset_index(drop=True)
    userid_list = eval_df['userid'].astype(str).tolist()
    print('val len:', len(userid_list))

    # 4. Define model, train, predict.
    device = 'cpu'
    use_cuda = True
    if use_cuda and torch.cuda.is_available():
        print('cuda ready...')
        device = 'cuda:0'
    model = MyDeepFM(linear_feature_columns=linear_feature_columns,
                     dnn_feature_columns=dnn_feature_columns,
                     use_fm=True,
                     dnn_hidden_units=(256, 128),
                     l2_reg_linear=1e-1,
                     l2_reg_embedding=0.00001,
                     l2_reg_dnn=0,
                     init_std=0.0001,
                     seed=1024,
                     dnn_dropout=0.,
                     dnn_activation='relu',
                     dnn_use_bn=False,
                     task='binary',
                     device=device)
    model.compile("adagrad", "binary_crossentropy", metrics=["auc"])
    history = model.fit(train_model_input,
                        train[target].values,
                        batch_size=1024,
                        epochs=NUM_EPOCH_DICT[action],
                        verbose=1,
                        validation_split=eval_ratio,
                        userid_list=userid_list)
    pred_ans = model.predict(test_model_input, 128)
    # Free GPU memory before the next action's run.
    torch.cuda.empty_cache()
    return pred_ans
# Inner factorization dimension; non-positive means "disabled" (None).
INNER_DIM = args.inner_dim
if INNER_DIM <= 0:
    INNER_DIM = None
BATCH = args.batch
OUTER_DIM = args.embd_dim  # embedding dimension for every sparse field

#data = pd.read_csv('../../preprocessed/criteo_train.csv')
# NOTE(review): pickle.load is only safe on trusted files; the file handle is
# also never closed explicitly.
data = pickle.load(open('../preprocessed/preprocessed_avazu.pkl','rb'))
header_names = ['id', 'click', 'hour', 'C1', 'banner_pos', 'site_id',
                'site_domain', 'site_category', 'app_id', 'app_domain',
                'app_category', 'device_id', 'device_ip', 'device_model',
                'device_type', 'device_conn_type', 'C14', 'C15', 'C16',
                'C17', 'C18', 'C19', 'C20', 'C21']
# Everything after 'hour' is treated as categorical; 'hour' is the only
# dense feature and 'click' is the binary target.
sparse_features = header_names[3:]
dense_features = ['hour']
target = ['click']

# 2. Count unique values per sparse field; record dense field names.
fixlen_feature_columns = [SparseFeat(feat, data[feat].nunique(),
                                     embedding_dim=OUTER_DIM)
                          for feat in sparse_features] + \
                         [DenseFeat(feat, 1, ) for feat in dense_features]
dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

# 3. Generate input data for the model.
train, test = train_test_split(data, test_size=0.1, random_state=42)
train_model_input = {name: train[name] for name in feature_names}
test_model_input = {name: test[name] for name in feature_names}
train_model_labels = train[target].values
test_model_labels = test[target].values

# Memory optimization: drop the full frame now that train/test copies exist.
import gc
del data
data = None
gc.collect()

# 4. Define model, train, predict and evaluate.
# Sparse NaNs become the string '-1' (an extra category for LabelEncoder);
# dense NaNs become 0. (`data`, `sparse_features`, `dense_features` are
# defined earlier in the script — not visible in this chunk.)
data[sparse_features] = data[sparse_features].fillna('-1', )
data[dense_features] = data[dense_features].fillna(0, )
target = ['label']

# 1. Label-encode sparse features; min-max scale dense features to [0, 1].
for feat in sparse_features:
    lbe = LabelEncoder()
    data[feat] = lbe.fit_transform(data[feat])
mms = MinMaxScaler(feature_range=(0, 1))
data[dense_features] = mms.fit_transform(data[dense_features])

# 2. Count unique values per sparse field; record dense field names.
fixlen_feature_columns = [SparseFeat(feat, data[feat].nunique())
                          for feat in sparse_features] + \
                         [DenseFeat(feat, 1,) for feat in dense_features]
dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns
# NOTE(review): get_fixlen_feature_names and list-style model inputs are the
# legacy DeepCTR (<0.8) API — newer versions use get_feature_names and dicts.
fixlen_feature_names = get_fixlen_feature_names(linear_feature_columns +
                                                dnn_feature_columns)

# 3. Generate input data for the model (lists ordered by feature name).
train, test = train_test_split(data, test_size=0.2)
train_model_input = [train[name] for name in fixlen_feature_names]
test_model_input = [test[name] for name in fixlen_feature_names]
# Criteo-style dense columns are named I1..I13. (`data` and
# `sparse_features` are defined earlier in the script — not visible here.)
dense_features = ['I' + str(i) for i in range(1, 14)]
# Sparse NaNs become the string '-1' (an extra category); dense NaNs become 0.
data[sparse_features] = data[sparse_features].fillna('-1', )
data[dense_features] = data[dense_features].fillna(0, )
target = ['label']

# 1. Label-encode sparse features; min-max scale dense features to [0, 1].
for feat in sparse_features:
    lbe = LabelEncoder()
    data[feat] = lbe.fit_transform(data[feat])
mms = MinMaxScaler(feature_range=(0, 1))
data[dense_features] = mms.fit_transform(data[dense_features])

# 2. Count unique values per sparse field; record dense field names.
fixlen_feature_columns = [SparseFeat(feat, data[feat].nunique())
                          for feat in sparse_features] + \
                         [DenseFeat(feat, 1, ) for feat in dense_features]
dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

# 3. Generate input data for the model (random 80/20 split, unseeded).
train, test = train_test_split(data, test_size=0.2)
train_model_input = {name: train[name] for name in feature_names}
test_model_input = {name: test[name] for name in feature_names}

# 4. Define model, train, predict and evaluate.
import numpy as np
import pandas as pd
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from deepctr_torch.models import *
from deepctr_torch.inputs import SparseFeat, DenseFeat, VarLenSparseFeat, get_feature_names
import torch
import torch.nn.functional as F

# Toy feature definitions for a behavior-sequence model (legacy DeepCTR API:
# VarLenSparseFeat takes name and dimension positionally).
feature_columns = [
    SparseFeat('user', 3),
    SparseFeat('gender', 2),
    SparseFeat('item', 3 + 1),
    SparseFeat('item_gender', 2 + 1),
    DenseFeat('score', 1)
]
feature_columns += [
    VarLenSparseFeat('hist_item', 3 + 1, maxlen=4, embedding_name='item'),
    # BUG FIX: the vocabulary was 3 + 1, but this column shares the
    # 'item_gender' embedding whose vocabulary is 2 + 1 (values are 0..2);
    # the mismatch was a copy-paste from hist_item.
    VarLenSparseFeat('hist_item_gender', 2 + 1, maxlen=4,
                     embedding_name='item_gender')
]
behavior_feature_list = ["item", "item_gender"]

# Fixed inputs for three samples; 0 is the mask/padding value in id arrays.
uid = np.array([0, 1, 2])
ugender = np.array([0, 1, 0])
iid = np.array([1, 2, 3])  # 0 is mask value
igender = np.array([1, 2, 1])  # 0 is mask value
score = np.array([0.1, 0.2, 0.3])