def get_xy_fd():
    feature_dim_dict = {
        "sparse": [
            SingleFeat('user_age', 4),
            SingleFeat('user_gender', 2),
            SingleFeat('item_id', 4),
            SingleFeat('item_gender', 4)
        ]
    }
    # raw features: single-value features
    # history behavior features: multi-value features
    behavior_feature_list = ["item_id", "item_gender"]

    # single-value feature input
    user_age = np.array([1, 2, 3])
    user_gender = np.array([0, 1, 0])
    item_id = np.array([0, 1, 2])
    item_gender = np.array([0, 1, 0])

    # multi-value feature input
    hist_item_id = np.array([[0, 1, 2, 3], [0, 1, 2, 3], [0, 1, 2, 0]])
    hist_item_gender = np.array([[0, 1, 0, 1], [0, 1, 1, 1], [0, 0, 1, 0]])

    # valid length of the behavior sequence of every sample
    hist_length = np.array([4, 4, 3])

    feature_dict = {
        'user_age': user_age,
        'user_gender': user_gender,
        'item_id': item_id,
        'item_gender': item_gender,
        'hist_item_id': hist_item_id,
        'hist_item_gender': hist_item_gender,
    }

    # Notice the concatenation order: single-value features + multi-value features + length.
    # Since the historical sequences of the different behavior features in DIN share the same
    # length (they are all expanded from item_id), a single length vector is enough.
    x = [feature_dict[feat.name] for feat in feature_dim_dict["sparse"]] + \
        [feature_dict['hist_' + feat] for feat in behavior_feature_list] + [hist_length]
    y = [1, 0, 1]
    return x, y, feature_dim_dict, behavior_feature_list
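# A minimal usage sketch for the fixture above (an assumption, not part of the original file:
# it presumes `from deepctr.models import DIN` and a DIN constructor that takes the feature
# config dict and the behavior feature list as its first two arguments -- verify against the
# installed deepctr version before running):
if __name__ == "__main__":
    x, y, feature_dim_dict, behavior_feature_list = get_xy_fd()
    model = DIN(feature_dim_dict, behavior_feature_list)
    model.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy'])
    history = model.fit(x, y, verbose=1, epochs=10, validation_split=0.5)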
def test_DCN_invalid(embedding_size=8, cross_num=0, hidden_size=()):
    feature_dim_dict = {
        'sparse': [
            SingleFeat('sparse_1', 2),
            SingleFeat('sparse_2', 5),
            SingleFeat('sparse_3', 10)
        ],
        'dense': [
            SingleFeat('dense_1', 1),
            SingleFeat('dense_2', 1),
            SingleFeat('dense_3', 1)
        ]
    }
    # With cross_num=0 and an empty hidden_size, the model has neither a cross part
    # nor a deep part, so the constructor is expected to raise a ValueError.
    with pytest.raises(ValueError):
        _ = DCN(
            feature_dim_dict,
            embedding_size=embedding_size,
            cross_num=cross_num,
            hidden_size=hidden_size,
            keep_prob=0.5,
        )
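# For contrast, a sketch (not part of the original test file) of a configuration that should
# build without raising: at least one of cross_num / hidden_size must be non-trivial. It reuses
# the same keyword names as the invalid test above; the values are illustrative only.
def test_DCN_valid_sketch(embedding_size=8, cross_num=2, hidden_size=(32, 32)):
    feature_dim_dict = {
        'sparse': [SingleFeat('sparse_1', 2), SingleFeat('sparse_2', 5)],
        'dense': [SingleFeat('dense_1', 1)]
    }
    model = DCN(feature_dim_dict, embedding_size=embedding_size,
                cross_num=cross_num, hidden_size=hidden_size, keep_prob=0.5)
    model.compile('adam', 'binary_crossentropy')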
data[sparse_features] = data[sparse_features].fillna('-1', )
data[dense_features] = data[dense_features].fillna(0, )
target = ['label']

# 1. Label Encoding for sparse features, and do simple Transformation for dense features
for feat in sparse_features:
    lbe = LabelEncoder()
    data[feat] = lbe.fit_transform(data[feat])
mms = MinMaxScaler(feature_range=(0, 1))
data[dense_features] = mms.fit_transform(data[dense_features])

# 2. count #unique features for each sparse field, and record dense feature field names
sparse_feature_list = [
    SingleFeat(feat, data[feat].nunique()) for feat in sparse_features
]
dense_feature_list = [SingleFeat(feat, 0) for feat in dense_features]

# 3. generate input data for model
train, test = train_test_split(data, test_size=0.2)
train_model_input = [train[feat.name].values for feat in sparse_feature_list] + \
    [train[feat.name].values for feat in dense_feature_list]
test_model_input = [test[feat.name].values for feat in sparse_feature_list] + \
    [test[feat.name].values for feat in dense_feature_list]

# 4. Define Model, train, predict and evaluate
model = DeepFM({
    "sparse": sparse_feature_list,
    "dense": dense_feature_list
})
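# Step 4 continued: a minimal sketch of the compile / fit / evaluate flow that typically follows
# the model definition above. This is not part of the original snippet; log_loss and
# roc_auc_score from sklearn.metrics are assumed to be imported, and the hyperparameters are
# illustrative only.
model.compile("adam", "binary_crossentropy", metrics=['binary_crossentropy'])
history = model.fit(train_model_input, train[target].values,
                    batch_size=256, epochs=10, verbose=2, validation_split=0.2)
pred_ans = model.predict(test_model_input, batch_size=256)
print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4))
print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))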
def get_input(use_count=False, use_unique=False, use_video=False, use_audio=False,
              use_title=False, ONLINE_FLAG=False, SAMPLE_FLAG=True,
              VALIDATION_FRAC=0.2, target='finish'):
    train_file = 'track2/sample_train.txt' if SAMPLE_FLAG else 'track2/final_track2_train.txt'
    test_file = 'track2/sample_test_no_answer.txt' if SAMPLE_FLAG else 'track2/final_track2_test_no_anwser.txt'
    video_file = 'track2/sample_video_features.csv' if SAMPLE_FLAG else 'track2/track2_video_features_mms.csv'
    face_file = 'track2/sample_face2.csv' if SAMPLE_FLAG else 'track2/face_df2.csv'
    audio_file = 'track2/sample_audio_features.csv' if SAMPLE_FLAG else 'track2/track2_audio_features.csv'
    title_file = 'track2/sample_title.txt' if SAMPLE_FLAG else 'track2/track2_title.txt'

    data = pd.read_csv(train_file, sep='\t', names=[
        'uid', 'user_city', 'item_id', 'author_id', 'item_city', 'channel',
        'finish', 'like', 'music_id', 'did', 'creat_time', 'video_duration'
    ])
    print('training set read completed.')

    if ONLINE_FLAG:
        test_data = pd.read_csv(test_file, sep='\t', names=[
            'uid', 'user_city', 'item_id', 'author_id', 'item_city', 'channel',
            'finish', 'like', 'music_id', 'did', 'creat_time', 'video_duration'
        ])
        train_size = data.shape[0]
        data = data.append(test_data).reset_index(drop=True)
    else:
        train_size = int(data.shape[0] * (1 - VALIDATION_FRAC))
    print('test set read completed.')

    sparse_features = [
        'uid', 'user_city', 'item_id', 'author_id', 'item_city', 'channel',
        'music_id', 'did',
    ]
    dense_features = []

    data['video_duration'] = pd.qcut(data['video_duration'], q=10,
                                     labels=False, duplicates='drop')
    sparse_features.append('video_duration')

    data['creat_time'] = data['creat_time'] % (24 * 3600) / 3600
    data['creat_time'] = pd.qcut(data['creat_time'], q=24,
                                 labels=False, duplicates='drop')
    sparse_features.append('creat_time')

    if use_count:
        data['uid-author_id'] = data['uid'].astype(str) + '-' + data['author_id'].astype(str)
        data['uid-did'] = data['uid'].astype(str) + '-' + data['did'].astype(str)
        data['did-channel'] = data['did'].astype(str) + '-' + data['channel'].astype(str)

        # count features
        cols = ['uid', 'did', 'item_id', 'author_id', 'uid-author_id']
        for c in cols:
            data[c + '_cnt'] = data[c].map(data[c].value_counts())
            data[c + '_cnt'] = pd.qcut(data[c + '_cnt'], q=10,
                                       labels=False, duplicates='drop')
            sparse_features.append(c + '_cnt')

        # expanding-mean (target mean) features
        df = get_expanding_mean(data[:train_size], data[train_size:], [
            'uid-author_id', 'uid-did', 'did-channel', 'uid', 'did', 'item_id'
        ], 'finish')
        dense_features += list(df.columns)
        data = pd.concat([data, df], axis=1)

    if use_unique:
        data['uid_icity_nunique'] = data['uid'].map(
            data.groupby('uid')['item_city'].nunique())
        data['uid_icity_nunique'] = pd.qcut(data['uid_icity_nunique'], q=10,
                                            labels=False, duplicates='drop')
        sparse_features.append('uid_icity_nunique')

        data['uid_item_nunique'] = data['uid'].map(
            data.groupby('uid')['item_id'].nunique())
        data['uid_item_nunique'] = pd.qcut(data['uid_item_nunique'], q=10,
                                           labels=False, duplicates='drop')
        sparse_features.append('uid_item_nunique')

        data['uid_author_nunique'] = data['uid'].map(
            data.groupby('uid')['author_id'].nunique())
        data['uid_author_nunique'] = pd.qcut(data['uid_author_nunique'], q=10,
                                             labels=False, duplicates='drop')
        sparse_features.append('uid_author_nunique')

        data['uid_music_nunique'] = data['uid'].map(
            data.groupby('uid')['music_id'].nunique())
        data['uid_music_nunique'] = pd.qcut(data['uid_music_nunique'], q=10,
                                            labels=False, duplicates='drop')
        sparse_features.append('uid_music_nunique')

        data['item_ucity_nunique'] = data['item_id'].map(
            data.groupby('item_id')['user_city'].nunique())
        data['item_ucity_nunique'] = pd.qcut(data['item_ucity_nunique'], q=10,
                                             labels=False, duplicates='drop')
        sparse_features.append('item_ucity_nunique')

        data['item_uid_nunique'] = data['item_id'].map(
            data.groupby('item_id')['uid'].nunique())
        data['item_uid_nunique'] = pd.qcut(data['item_uid_nunique'], q=30,
                                           labels=False, duplicates='drop')
        sparse_features.append('item_uid_nunique')

        data['author_uid_nunique'] = data['author_id'].map(
            data.groupby('author_id')['uid'].nunique())
        data['author_uid_nunique'] = pd.qcut(data['author_uid_nunique'], q=20,
                                             labels=False, duplicates='drop')
        sparse_features.append('author_uid_nunique')

    print('generate stats feats completed.')

    if use_video:
        video_feats = pd.read_csv(video_file)
        print('video feats read completed.')
        data = pd.merge(data, video_feats, how='left', on='item_id')
        for i in range(128):
            col = 'vd' + str(i)
            data[col].fillna(0, inplace=True)
        print('merge video feats completed.')

    if use_audio:
        audio_feats = pd.read_csv(audio_file)
        print('audio feats read completed.')
        data = pd.merge(data, audio_feats, how='left', on='item_id')
        for i in range(128):
            col = 'ad' + str(i)
            data[col].fillna(0, inplace=True)
        print('merge audio feats completed.')

    if use_title:
        max_len = 47
        title_feats = pd.read_json(title_file, lines=True)
        print('title feats read completed')

        def get_title_len(d):
            return sum(d.values())

        title_feats['title_len'] = title_feats['title_features'].apply(get_title_len)
        prior = title_feats['title_len'].mean()
        dense_features.append('title_len')

        title_feats['title_features'] = title_feats['title_features'].apply(
            lambda x: list(x.keys()))
        data = pd.merge(data, title_feats, how='left', on='item_id')
        for row in data.loc[data.title_features.isna(), 'title_features'].index:
            data.at[row, 'title_features'] = []
        data['title_len'].fillna(prior, inplace=True)
        print('merge title feats completed')

    data[sparse_features] = data[sparse_features].fillna('-1', )
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
    if len(dense_features) > 0:
        mms = MinMaxScaler(feature_range=(0, 1))
        data[dense_features] = mms.fit_transform(data[dense_features])

    sparse_feature_list = [
        SingleFeat(feat, data[feat].nunique()) for feat in sparse_features
    ]
    dense_feature_list = [SingleFeat(feat, 0) for feat in dense_features]
    sequence_feature_list = []
    if use_title:
        sequence_feature_list.append(
            VarLenFeat('title', 134545, max_len, 'sum'))
    print('data preprocess completed.')

    train = data.iloc[:train_size]
    test = data.iloc[train_size:]
    train_model_input = [train[feat.name].values for feat in sparse_feature_list] + \
        [train[feat.name].values for feat in dense_feature_list]
    test_model_input = [test[feat.name].values for feat in sparse_feature_list] + \
        [test[feat.name].values for feat in dense_feature_list]

    if use_title:
        train_model_input += [
            pad_sequences(train['title_features'], maxlen=max_len, padding='post')
        ]
        test_model_input += [
            pad_sequences(test['title_features'], maxlen=max_len, padding='post')
        ]
    if use_video:
        vd_cols = ['vd' + str(i) for i in range(128)]
        video_input = data[vd_cols].values
        train_model_input += [video_input[:train_size]]
        test_model_input += [video_input[train_size:]]
    if use_audio:
        ad_cols = ['ad' + str(i) for i in range(128)]
        audio_input = data[ad_cols].values
        train_model_input += [audio_input[:train_size]]
        test_model_input += [audio_input[train_size:]]
    print('input process completed.')
    print(f'use sparse feats: [{",".join(sparse_features)}]')
[{",".join(dense_features)}]') train_labels, test_labels = train[target].values, test[target].values feature_dim_dict = { "sparse": sparse_feature_list, "dense": dense_feature_list, "sequence": sequence_feature_list } if ONLINE_FLAG: return feature_dim_dict, train_model_input, train_labels, test_model_input, test_labels, test_data return feature_dim_dict, train_model_input, train_labels, test_model_input, test_labels
def model_pool(defaultfilename='./input/final_track1_train.txt',
               defaulttestfile='./input/final_track1_test_no_anwser.txt',
               defaultcolumnname=['uid', 'user_city', 'item_id', 'author_id',
                                  'item_city', 'channel', 'finish', 'like',
                                  'music_id', 'did', 'creat_time', 'video_duration'],
               defaulttarget=['finish', 'like'],
               defaultmodel="AFM",
               PERCENT=100):
    sparse_features = []
    dense_features = []
    target = defaulttarget

    # 1. read the train file in chunks and concatenate a PERCENT sample of each chunk
    data = pd.read_csv(defaultfilename, sep='\t', names=defaultcolumnname, iterator=True)
    take = []
    loop = True
    while loop:
        try:
            chunk = data.get_chunk(10000000)
            chunk = chunk.sample(frac=PERCENT / 100., replace=True, random_state=1)
            take.append(chunk)
            gc.collect()
        except StopIteration:
            loop = False
            print('stop iteration')
    data = pd.concat(take, ignore_index=True, copy=False)
    train_size = data.shape[0]
    print(train_size)
    take.clear()
    del chunk, take

    # 2. split columns into dense (float) and sparse (int) features
    for column in data.columns:
        if column in defaulttarget:
            continue
        if data[column].dtype in [numpy.float_, numpy.float64]:
            dense_features.append(column)
        if data[column].dtype in [numpy.int_, numpy.int64]:
            sparse_features.append(column)
    # sparse_features = list(set(sparse_features))
    # dense_features = list(set(dense_features))

    # 3. Remove na values
    data[sparse_features] = data[sparse_features].fillna('-1')
    data[dense_features] = data[dense_features].fillna(0)

    # 4. Label Encoding for sparse features, and do simple Transformation for dense features
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])

    # 5. Dense normalize
    if dense_features:
        mms = MinMaxScaler(feature_range=(0, 1))
        data[dense_features] = mms.fit_transform(data[dense_features])

    # 6. generate feature config for model
    sparse_feature_list = [SingleFeat(feat, data[feat].nunique()) for feat in sparse_features]
    dense_feature_list = [SingleFeat(feat, 0) for feat in dense_features]

    # 6. choose a model
    import pkgutil
    import mdeepctr.models
    # modelnames = [name for _, name, _ in pkgutil.iter_modules(mdeepctr.__path__)]
    # modelname = input("choose a model: " + ",".join(modelnames) + "\n")
    # if not modelname:
    modelname = defaultmodel

    # 7. build a model
    model = getattr(mdeepctr.models, modelname)({"sparse": sparse_feature_list,
                                                 "dense": dense_feature_list},
                                                final_activation='sigmoid',
                                                output_dim=len(defaulttarget))

    # 8. eval predict
    def auc(y_true, y_pred):
        return tf.py_func(roc_auc_score, (y_true, y_pred), tf.double)

    loss_weights = [1.0] * len(defaulttarget)  # equal weight for each target
    model.compile("adam", loss="binary_crossentropy", loss_weights=loss_weights, metrics=[auc])

    train_model_input = [data[feat.name].values for feat in sparse_feature_list] + \
        [data[feat.name].values for feat in dense_feature_list]
    train_labels = [data[target].values for target in defaulttarget]

    my_callbacks = [EarlyStopping(monitor='loss', min_delta=1e-2,
                                  patience=1, verbose=1, mode='min')]
    history = model.fit(train_model_input, train_labels,
                        batch_size=2**14, epochs=3, verbose=1, callbacks=my_callbacks)
    del train_model_input, train_labels, data
    # import objgraph
    # objgraph.show_refs([data], filename='data-graph.png')

    # 2. read the test file
    test_data = pd.read_csv(defaulttestfile, sep='\t', names=defaultcolumnname)
    raw_test_data = test_data.copy()
    # data = data.append(test_data)
    test_size = test_data.shape[0]
    print(test_size)

    # 3. Remove na values
    test_data[sparse_features] = test_data[sparse_features].fillna('-1')
    test_data[dense_features] = test_data[dense_features].fillna(0)
    # 4. Label Encoding for sparse features, and do simple Transformation for dense features
    # NOTE: the test set is label-encoded and scaled independently of the training data here.
    for feat in sparse_features:
        lbe = LabelEncoder()
        test_data[feat] = lbe.fit_transform(test_data[feat])

    # 5. Dense normalize
    if dense_features:
        mms = MinMaxScaler(feature_range=(0, 1))
        test_data[dense_features] = mms.fit_transform(test_data[dense_features])

    # test = test_data
    test_model_input = [test_data[feat.name].values for feat in sparse_feature_list] + \
        [test_data[feat.name].values for feat in dense_feature_list]
    pred_ans = model.predict(test_model_input, batch_size=2**14)

    result = raw_test_data[['uid', 'item_id', 'finish', 'like']].copy()
    result.rename(columns={'finish': 'finish_probability',
                           'like': 'like_probability'}, inplace=True)
    result['finish_probability'] = pred_ans[0]
    result['like_probability'] = pred_ans[1]
    output = "%s-result.csv" % (modelname)
    result[['uid', 'item_id', 'finish_probability', 'like_probability']].to_csv(
        output, index=None, float_format='%.6f')
    return history
sparse_features = ['uid', 'user_city', 'item_id', 'author_id', 'item_city',
                   'channel', 'music_id', 'did', ]
dense_features = ['video_duration']  # 'creat_time',

data[sparse_features] = data[sparse_features].fillna('-1', )
data[dense_features] = data[dense_features].fillna(0, )
target = ['finish', 'like']

for feat in sparse_features:
    lbe = LabelEncoder()
    data[feat] = lbe.fit_transform(data[feat])
mms = MinMaxScaler(feature_range=(0, 1))
data[dense_features] = mms.fit_transform(data[dense_features])

sparse_feature_list = [SingleFeat(feat, data[feat].nunique())
                       for feat in sparse_features]
dense_feature_list = [SingleFeat(feat, 0) for feat in dense_features]

train = data.iloc[:train_size]
test = data.iloc[train_size:]

train_model_input = [train[feat.name].values for feat in sparse_feature_list] + \
    [train[feat.name].values for feat in dense_feature_list]
test_model_input = [test[feat.name].values for feat in sparse_feature_list] + \
    [test[feat.name].values for feat in dense_feature_list]

train_labels = [train[target[0]].values, train[target[1]].values]
test_labels = [test[target[0]].values, test[target[1]].values]
key2index = {}
genres_list = list(map(split, data['genres'].values))
genres_length = np.array(list(map(len, genres_list)))
max_len = max(genres_length)
# Notice : padding=`post`
genres_list = pad_sequences(genres_list, maxlen=max_len, padding='post', )

# 2. count #unique features for each sparse field and generate feature config for sequence feature
sparse_feat_list = [
    SingleFeat(feat, data[feat].nunique()) for feat in sparse_features
]
sequence_feature = [VarLenFeat('genres', len(key2index) + 1, max_len, 'mean')]
# Notice : value 0 is for padding for sequence input feature

# 3. generate input data for model
sparse_input = [data[feat.name].values for feat in sparse_feat_list]
dense_input = []
sequence_input = [genres_list]
model_input = sparse_input + dense_input + \
    sequence_input  # make sure the order is right

# 4. Define Model, compile and train
model = DeepFM({
    "sparse": sparse_feat_list,
    "sequence": sequence_feature
})
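# The split helper referenced at the top of this snippet is not shown in the excerpt. A
# plausible implementation, assuming genres are '|'-separated strings and that key2index
# should map each genre to a non-zero index so that 0 stays reserved for padding:
def split(x):
    key_ans = x.split('|')
    for key in key_ans:
        if key not in key2index:
            key2index[key] = len(key2index) + 1  # index 0 is reserved for padding
    return list(map(lambda k: key2index[k], key_ans))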
def model_pool(defaultfilename='./input/final_track2_train.txt',
               defaulttestfile='./input/final_track2_test_no_anwser.txt',
               defaultcolumnname=[
                   'uid', 'user_city', 'item_id', 'author_id', 'item_city',
                   'channel', 'finish', 'like', 'music_id', 'did',
                   'creat_time', 'video_duration'
               ],
               defaulttarget=['finish', 'like'],
               defaultmodel="AFM",
               PERCENT=1):
    # 1. read the train file in chunks, keeping the first PERCENT*100 rows of each chunk
    data = pd.read_csv(defaultfilename, sep='\t', names=defaultcolumnname, iterator=True)
    take = []
    loop = True
    while loop:
        try:
            chunk = data.get_chunk(10000)
            chunk = chunk.take(list(range(min(chunk.shape[0], PERCENT * 100))), axis=0)
            take.append(chunk)
        except StopIteration:
            loop = False
            print('stop iteration')
    data = pd.concat(take, ignore_index=True)
    train_size = data.shape[0]
    print(train_size)

    # 2. read the test file and append it for joint preprocessing
    test_data = pd.read_csv(defaulttestfile, sep='\t', names=defaultcolumnname, )
    data = data.append(test_data)
    test_size = test_data.shape[0]
    print(test_size)

    sparse_features = []
    dense_features = []
    target = defaulttarget
    for column in data.columns:
        if column in defaulttarget:
            continue
        if data[column].dtype in [numpy.float_, numpy.float64]:
            dense_features.append(column)
        if data[column].dtype in [numpy.int_, numpy.int64]:
            sparse_features.append(column)

    # 3. Remove na values
    data[sparse_features] = data[sparse_features].fillna('-1', )
    data[dense_features] = data[dense_features].fillna(0, )

    # 4. Label Encoding for sparse features, and do simple Transformation for dense features
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])

    # 5. Dense normalize
    if dense_features:
        mms = MinMaxScaler(feature_range=(0, 1))
        data[dense_features] = mms.fit_transform(data[dense_features])

    # 6. generate feature config for model
    sparse_feature_list = [
        SingleFeat(feat, data[feat].nunique()) for feat in sparse_features
    ]
    dense_feature_list = [SingleFeat(feat, 0) for feat in dense_features]

    # 7. split back into train / test
    train = data.iloc[:train_size]
    test = data.iloc[train_size:]

    # 8. generate input data
    print(train.columns)
    train_model_input = [train[feat.name].values for feat in sparse_feature_list] + \
        [train[feat.name].values for feat in dense_feature_list]
    test_model_input = [test[feat.name].values for feat in sparse_feature_list] + \
        [test[feat.name].values for feat in dense_feature_list]
    train_labels = [train[target].values for target in defaulttarget]
    test_labels = test[target]

    # 6. choose a model
    import pkgutil
    import mdeepctr.models
    # modelnames = [name for _, name, _ in pkgutil.iter_modules(mdeepctr.__path__)]
    # modelname = input("choose a model: " + ",".join(modelnames) + "\n")
    # if not modelname:
    modelname = defaultmodel

    # 7. build a model
    model = getattr(mdeepctr.models, modelname)({
        "sparse": sparse_feature_list,
        "dense": dense_feature_list
    },
        final_activation='sigmoid',
        output_dim=len(defaulttarget))
    # 8. eval predict
    def auc(y_true, y_pred):
        return tf.py_func(roc_auc_score, (y_true, y_pred), tf.double)

    # model.compile("adagrad", loss="binary_crossentropy", metrics=[auc])
    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=[auc])
    my_callbacks = [
        EarlyStopping(monitor='loss', min_delta=1e-2, patience=3, verbose=1, mode='min')
    ]
    history = model.fit(train_model_input, train_labels,
                        batch_size=4096, epochs=100, verbose=1, callbacks=my_callbacks)

    pred_ans = model.predict(test_model_input, batch_size=2**14)
    # nsamples, nx, ny = numpy.asarray(pred_ans).shape
    # pred_ans = numpy.asarray(pred_ans).reshape((nx*ny, nsamples))
    # print(test_labels.shape)
    # print(pred_ans.shape)
    # logloss = round(log_loss(test_labels, pred_ans), 4)
    # try:
    #     roc_auc = round(roc_auc_score(test_labels, pred_ans), 4)
    # except:
    #     roc_auc = 0

    result = test_data[['uid', 'item_id', 'finish', 'like']].copy()
    result.rename(columns={
        'finish': 'finish_probability',
        'like': 'like_probability'
    }, inplace=True)
    result['finish_probability'] = pred_ans[0]
    result['like_probability'] = pred_ans[1]
    output = "%s-result.csv" % (modelname)
    result[['uid', 'item_id', 'finish_probability',
            'like_probability']].to_csv(output, index=None, float_format='%.6f')
    return history
def deepctr_cv(X_train, y_train, folds, logger, cv_path,
               X_test=None, optional_data=None, prep=True, split_conf=None):
    scores = []
    preds = []
    meta = np.zeros_like(y_train).astype("float64")

    if split_conf is None:
        X_tr, X_te, main_conf, _ = prep_for_embedding(X_train, X_test, conf, prep=prep)
        X_train, X_test = X_tr, X_te
    else:
        main_conf = split_conf

    cat_cols = [c for c, _, _ in main_conf[0]]
    cat_fs = [SingleFeat(c, d) for c, d, _ in main_conf[0]]
    num_fs = [SingleFeat(c, 0) for c in conf.num_cols]
    X_test = split_df(X_test, cat_cols, conf.num_cols)

    for num_fold, (tr_ind, tes_ind) in enumerate(folds):
        if num_fold > 0:
            break
        logger.info(f"fold_{num_fold}")
        fold_path = cv_path / f"fold{num_fold}"
        seed_path = fold_path
        Path(fold_path).mkdir(exist_ok=True, parents=True)
        callbacks = [CSVLogger(str(fold_path / 'epochs.csv'))]

        X_cv_train, X_cv_test = X_train.iloc[tr_ind], X_train.iloc[tes_ind]
        y_cv_train, y_cv_test = y_train.iloc[tr_ind], y_train.iloc[tes_ind]
        X_cv_train = split_df(X_cv_train, cat_cols, conf.num_cols)
        X_cv_test = split_df(X_cv_test, cat_cols, conf.num_cols)

        model = DeepFM({
            'sparse': cat_fs,
            'dense': num_fs
        }, final_activation='sigmoid')
        model.compile("adam", "binary_crossentropy", metrics=['accuracy'])
        model.fit(X_cv_train, y_cv_train,
                  callbacks=callbacks,
                  batch_size=2048,
                  epochs=10,
                  verbose=1,
                  validation_data=(X_cv_test, y_cv_test))
        model.save_weights(str(seed_path / 'weights.h5'), save_format='hdf5')
        gc.collect()

        if X_test is not None:
            pred = model.predict(X_test, batch_size=2048)
            pred = pred[:, 0]
            np.save(seed_path / "pred.npy", pred)

        train_oof = model.predict(X_cv_test, batch_size=2048)
        train_oof = train_oof[:, 0]
        auc = roc_auc_score(y_cv_test.values, train_oof)
        logger.info(f"{num_fold}: auc {auc}")
        np.save(seed_path / "train_oof.npy", train_oof)
        # auc = roc_auc_score(y_cv_test, train_oof)
        # logger.info(f"seed_average: auc {auc}")
        scores.append(auc)
        np.save(fold_path / "tes_ind.npy", tes_ind)
        meta[tes_ind] += train_oof

        del X_cv_train, y_cv_train, X_cv_test, y_cv_test
        if X_test is not None:
            preds.append(pred)

    scores = np.array(scores)
    preds = np.array(preds)
    pred = rank_average(preds)
    logger.info(f"{scores.mean()}, {scores.std()}")
    return scores, pred, meta
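# A minimal invocation sketch, not part of the original module. Names such as conf,
# prep_for_embedding, split_df and rank_average are assumed to come from the surrounding
# project, and X_train / y_train / X_test are assumed to be pandas objects prepared elsewhere;
# the fold generator and output path below are illustrative only.
import logging
from pathlib import Path
from sklearn.model_selection import StratifiedKFold

folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42).split(X_train, y_train)
scores, test_pred, oof = deepctr_cv(X_train, y_train, folds,
                                    logger=logging.getLogger(__name__),
                                    cv_path=Path("output/deepfm_cv"),
                                    X_test=X_test)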