def get_xy_fd(hash_flag=False):
    """Build a small synthetic (x, y) dataset plus feature-column config for a DIN-style model."""
    # Fixed-length sparse features and one dense feature.
    feature_columns = [
        SparseFeat('user', 3, embedding_dim=10),
        SparseFeat('gender', 2, embedding_dim=4),
        SparseFeat('item_id', 3 + 1, embedding_dim=8),
        SparseFeat('cate_id', 2 + 1, embedding_dim=4),
        DenseFeat('pay_score', 1),
    ]
    # Notice: History behavior sequence feature name must start with "hist_".
    feature_columns += [
        VarLenSparseFeat(
            SparseFeat('hist_item_id', vocabulary_size=3 + 1, embedding_dim=8,
                       embedding_name='item_id'),
            maxlen=4, length_name="seq_length"),
        VarLenSparseFeat(
            SparseFeat('hist_cate_id', 2 + 1, embedding_dim=4,
                       embedding_name='cate_id'),
            maxlen=4, length_name="seq_length"),
    ]
    behavior_feature_list = ["item_id", "cate_id"]

    feature_dict = {
        'user': np.array([0, 1, 2]),
        'gender': np.array([0, 1, 0]),
        'item_id': np.array([1, 2, 3]),     # 0 is mask value
        'cate_id': np.array([1, 2, 2]),     # 0 is mask value
        'hist_item_id': np.array([[1, 2, 3, 0], [3, 2, 1, 0], [1, 2, 0, 0]]),
        'hist_cate_id': np.array([[1, 2, 2, 0], [2, 2, 1, 0], [1, 2, 0, 0]]),
        'pay_score': np.array([0.1, 0.2, 0.3]),
        'seq_length': np.array([3, 3, 2]),  # the actual length of the behavior sequence
    }
    x = {name: feature_dict[name] for name in get_feature_names(feature_columns)}
    y = np.array([1, 0, 1])
    return x, y, feature_columns, behavior_feature_list
def get_xy_fd():
    """Build a tiny hand-crafted dataset and feature-column config for a DIN-style model."""
    feature_columns = [
        SparseFeat('user', 3, embedding_dim=10),
        SparseFeat('gender', 2, embedding_dim=4),
        SparseFeat('item_id', 3 + 1, embedding_dim=8),
        SparseFeat('cate_id', 2 + 1, embedding_dim=4),
        DenseFeat('pay_score', 1),
    ]
    # Variable-length history sequences; they share embeddings with the
    # corresponding base features via embedding_name.
    feature_columns += [
        VarLenSparseFeat(
            SparseFeat('hist_item_id', vocabulary_size=3 + 1, embedding_dim=8,
                       embedding_name='item_id'),
            maxlen=4),
        VarLenSparseFeat(
            SparseFeat('hist_cate_id', 2 + 1, embedding_dim=4,
                       embedding_name='cate_id'),
            maxlen=4),
    ]
    # Base sparse features that the var-len history features refer to.
    behavior_feature_list = ["item_id", "cate_id"]

    # Feature name -> raw input data (0 is the mask/padding value for ids).
    feature_dict = {
        'user': np.array([0, 1, 2]),
        'gender': np.array([0, 1, 0]),
        'item_id': np.array([1, 2, 3]),
        'cate_id': np.array([1, 2, 2]),
        'hist_item_id': np.array([[1, 2, 3, 0], [3, 2, 1, 0], [1, 2, 0, 0]]),
        'hist_cate_id': np.array([[1, 2, 2, 0], [2, 2, 1, 0], [1, 2, 0, 0]]),
        'pay_score': np.array([0.1, 0.2, 0.3]),
    }
    x = {name: feature_dict[name] for name in get_feature_names(feature_columns)}
    y = np.array([1, 0, 1])
    return x, y, feature_columns, behavior_feature_list
def test_feature_column_sparsefeat_vocabulary_path():
    """vocabulary_path set on a SparseFeat must survive wrapping in a VarLenSparseFeat."""
    vocab_path = "./dummy_test.csv"
    sf = SparseFeat('user_id', 4, vocabulary_path=vocab_path)
    if sf.vocabulary_path != vocab_path:
        raise ValueError("sf.vocabulary_path is invalid")
    vlsf = VarLenSparseFeat(sf, 6)
    if vlsf.vocabulary_path != vocab_path:
        raise ValueError("vlsf.vocabulary_path is invalid")
def get_xy_fd(hash_flag=False):
    """Toy input data and feature columns for a DIN-style attention model."""
    feature_columns = [
        SparseFeat('user', 3),
        SparseFeat('gender', 2),
        SparseFeat('item', 3 + 1),
        SparseFeat('item_gender', 2 + 1),
        DenseFeat('score', 1),
    ]
    # History ("hist_*") behavior sequences, padded with 0.
    feature_columns += [
        VarLenSparseFeat(
            SparseFeat('hist_item', vocabulary_size=3 + 1, embedding_dim=8,
                       embedding_name='item'),
            maxlen=4),
        VarLenSparseFeat(
            SparseFeat('hist_item_gender', 2 + 1, embedding_dim=4,
                       embedding_name='item_gender'),
            maxlen=4),
    ]
    behavior_feature_list = ["item", "item_gender"]

    feature_dict = {
        'user': np.array([0, 1, 2]),
        'gender': np.array([0, 1, 0]),
        'item': np.array([1, 2, 3]),         # 0 is mask value
        'item_gender': np.array([1, 2, 1]),  # 0 is mask value
        'hist_item': np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]]),
        'hist_item_gender': np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]]),
        'score': np.array([0.1, 0.2, 0.3]),
    }
    x = {name: feature_dict[name] for name in get_feature_names(feature_columns)}
    y = [1, 0, 1]
    return x, y, feature_columns, behavior_feature_list
def get_xy_fd():
    """Toy ride-hailing dataset: driver/passenger profile plus price/destination history."""
    feature_columns = [
        SparseFeat('driver_age', 7, embedding_dim=32),
        SparseFeat('pax_age', 7, embedding_dim=32),
        SparseFeat('des_id', 10000, embedding_dim=32),
        SparseFeat('price_id', 20, embedding_dim=32),
    ]
    # Notice: History behavior sequence feature name must start with "hist_".
    # NOTE(review): these hist_* features use vocabulary_size=5 and no
    # embedding_name, so they do NOT share embeddings with price_id/des_id and
    # their vocab sizes differ from the base features — confirm this is intended.
    feature_columns += [
        VarLenSparseFeat(SparseFeat('hist_price_id', vocabulary_size=5, embedding_dim=32), maxlen=3),
        VarLenSparseFeat(SparseFeat('hist_des_id', vocabulary_size=5, embedding_dim=32), maxlen=3),
    ]
    behavior_feature_list = ["price_id", "des_id"]

    feature_dict = {
        'driver_age': np.array([0, 1, 2]),
        'pax_age': np.array([0, 1, 0]),
        'des_id': np.array([1, 2, 3]),    # 0 is mask value
        'price_id': np.array([1, 2, 2]),  # 0 is mask value
        'hist_price_id': np.array([[1, 2, 3], [3, 2, 1], [1, 2, 0]]),
        'hist_des_id': np.array([[1, 2, 2], [2, 2, 1], [1, 2, 0]]),
    }
    x = {name: feature_dict[name] for name in get_feature_names(feature_columns)}
    y = np.array([1, 0, 1])
    return x, y, feature_columns, behavior_feature_list
def training_set_construct(self):
    """Load reading history, label-encode features and build train/test model inputs.

    Returns (train_model_input, train_label, test_model_input, test_label,
    train_set, test_set, user_info, item_info).
    """
    # Load raw interaction data.
    data = pd.read_csv('./data/read_history.csv')
    # Number of negative samples per positive sample.
    negsample = 0

    # Label-encode sparse features; +1 so that id 0 stays unused (padding/mask).
    features = ['user_id', 'item_id', 'gender', 'age', 'city']
    feature_max_idx = {}
    for feat in features:
        encoder = LabelEncoder()
        data[feat] = encoder.fit_transform(data[feat]) + 1
        feature_max_idx[feat] = data[feat].max() + 1

    # Per-user and per-item profile tables.
    user_info = data[["user_id", "gender", "age", "city"]].drop_duplicates('user_id')
    item_info = data[["item_id"]].drop_duplicates('item_id')
    user_info.set_index("user_id", inplace=True)

    # Build raw train/test samples, then convert them to model inputs.
    train_set, test_set = gen_data_set(data, negsample)
    train_model_input, train_label = gen_model_input(train_set, user_info, self.SEQ_LEN)
    test_model_input, test_label = gen_model_input(test_set, user_info, self.SEQ_LEN)

    # User-tower feature columns.
    self.user_feature_columns = [
        SparseFeat('user_id', feature_max_idx['user_id'], 16),
        SparseFeat("gender", feature_max_idx['gender'], 16),
        SparseFeat("age", feature_max_idx['age'], 16),
        SparseFeat("city", feature_max_idx['city'], 16),
        VarLenSparseFeat(
            SparseFeat('hist_item_id', feature_max_idx['item_id'],
                       self.embedding_dim, embedding_name="item_id"),
            self.SEQ_LEN, 'mean', 'hist_len'),
    ]
    # Item-tower feature columns.
    self.item_feature_columns = [
        SparseFeat('item_id', feature_max_idx['item_id'], self.embedding_dim)
    ]

    return (train_model_input, train_label, test_model_input, test_label,
            train_set, test_set, user_info, item_info)
def train_youtube_model(train_model_input, train_label, embedding_dim,
                        feature_max_idx, his_seq_maxlen, batch_size, epochs,
                        verbose, validation_split):
    """Build a YoutubeDNN two-tower model and fit it on the given inputs."""
    # User-tower feature columns.
    user_feature_columns = [
        SparseFeat('user_id', feature_max_idx['user_id'], embedding_dim),
        VarLenSparseFeat(
            SparseFeat('hist_doc_ids', feature_max_idx['article_id'],
                       embedding_dim, embedding_name="click_doc_id"),
            his_seq_maxlen, 'mean', 'hist_len'),
        SparseFeat('u_city', feature_max_idx['city'], embedding_dim),
        SparseFeat('u_age', feature_max_idx['age'], embedding_dim),
        SparseFeat('u_gender', feature_max_idx['gender'], embedding_dim),
        DenseFeat('u_example_age', 1, ),
    ]
    # Item (document) tower; article category/profile features could be added later.
    doc_feature_columns = [
        SparseFeat('click_doc_id', feature_max_idx['article_id'], embedding_dim)
    ]

    model = YoutubeDNN(user_feature_columns, doc_feature_columns, num_sampled=5,
                       user_dnn_hidden_units=(64, embedding_dim))
    model.compile(optimizer="adam", loss=sampledsoftmaxloss)
    # validation_split=0 trains on the full data with no hold-out.
    model.fit(train_model_input, train_label, batch_size=batch_size,
              epochs=epochs, verbose=verbose, validation_split=validation_split)
    return model
def _build_model(self):
    """Assemble DeepFM feature columns (fixed-length, dense and var-len) and the model.

    Returns (attrs_matrix, attrs_max_len) from the category-matrix build step.
    """
    to_drop = config.Keywords_Categories[self.params['category']]
    self._build_category_dict(drop_categories=to_drop)
    attrs_matrix, attrs_max_len = self._get_category_matrix(self.data)

    # Fixed-length columns: one SparseFeat per sparse column, one DenseFeat per dense column.
    fixlen = [SparseFeat(name, self.data[name].nunique(), embedding_dim=4)
              for name in self.features_sparse]
    fixlen += [DenseFeat(name, 1, ) for name in self.features_dense]

    # Var-len 'categories' feature, optionally weighted.
    weight_col = 'attrs_weight' if self.params['weight'] else None
    varlen = [VarLenSparseFeat(
        SparseFeat('categories', vocabulary_size=len(self.attr2index) + 1,
                   embedding_dim=4),
        maxlen=attrs_max_len, combiner='mean', weight_name=weight_col)]

    # Linear part and DNN part use the same column set.
    self.features_linear = fixlen + varlen
    self.features_dnn = fixlen + varlen
    self.model = DeepFM(self.features_linear, self.features_dnn,
                        task='regression', **self.params_deepfm)
    return attrs_matrix, attrs_max_len
def get_xy_fd(hash_flag=False):
    """Synthetic (x, y) data plus user/item feature columns for a two-tower model."""
    user_feature_columns = [
        SparseFeat('user', 3),
        SparseFeat('gender', 2),
        VarLenSparseFeat(
            SparseFeat('hist_item', vocabulary_size=3 + 1, embedding_dim=4,
                       embedding_name='item'),
            maxlen=4, length_name="hist_len"),
    ]
    item_feature_columns = [SparseFeat('item', 3 + 1, embedding_dim=4, )]

    x = {
        'user': np.array([0, 1, 2, 1]),
        'gender': np.array([0, 1, 0, 1]),
        'item': np.array([1, 2, 3, 1]),  # 0 is mask value
        'hist_item': np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0], [3, 0, 0, 0]]),
        "hist_len": np.array([3, 3, 2, 1]),
    }
    y = np.array([1, 1, 1, 1])
    return x, y, user_feature_columns, item_feature_columns
dim = 8 else: dim = 4 if column == 'user_id': feature_columns += [SparseFeat(column, 212062 + 1, embedding_dim=dim)] elif column == 'merchant_id': feature_columns += [SparseFeat(column, 1993 + 1, embedding_dim=dim)] elif column == 'action_type': feature_columns += [SparseFeat(column, 4 + 1, embedding_dim=dim)] else: feature_columns += [DenseFeat(column, 1)] # maxlen为历史信息的长度,vocabulary_size为onehot的长度 feature_columns += [ VarLenSparseFeat(sparsefeat=SparseFeat('hist_merchant_id', vocabulary_size=1993, embedding_dim=8, embedding_name='merchant_id'), maxlen=M), VarLenSparseFeat(sparsefeat=SparseFeat('hist_action_type', vocabulary_size=4, embedding_dim=4, embedding_name='action_type'), maxlen=M)] history_features = ['merchant_id', 'action_type'] print(len(feature_columns)) # 使用DIN模型 model = DIN(feature_columns, history_features) # 使用Adam优化器,二分类的交叉熵 model.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy']) # model.compile(loss=[binary_focal_loss(alpha=.25, gamma=2)], metrics=["accuracy"]) # 组装train_model_input,得到feature names,将train_X转换为字典格式 feature_names = list(train_X.columns) train_model_input = {name: train_X[name].values for name in get_feature_names(feature_columns)} print("########################################")
# 1.Use hashing encoding on the fly for sparse features,and process sequence features genres_list = list(map(lambda x: x.split('|'), data['genres'].values)) genres_length = np.array(list(map(len, genres_list))) max_len = max(genres_length) # Notice : padding=`post` genres_list = pad_sequences(genres_list, maxlen=max_len, padding='post', dtype=object, value=0).astype(str) # 2.set hashing space for each sparse field and generate feature config for sequence feature fixlen_feature_columns = [SparseFeat(feat, data[feat].nunique() * 5, embedding_dim=4, use_hash=True, dtype='string') for feat in sparse_features] varlen_feature_columns = [ VarLenSparseFeat(SparseFeat('genres', vocabulary_size=100, embedding_dim=4, use_hash=True, dtype="string"), maxlen=max_len, combiner='mean', )] # Notice : value 0 is for padding for sequence input feature linear_feature_columns = fixlen_feature_columns + varlen_feature_columns dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns) # 3.generate input data for model model_input = {name: data[name] for name in feature_names} model_input['genres'] = genres_list # 4.Define Model,compile and train model = DeepFM(linear_feature_columns, dnn_feature_columns, task='regression') model.compile("adam", "mse", metrics=['mse'], ) history = model.fit(model_input, data[target].values, batch_size=256, epochs=10, verbose=2, validation_split=0.2, )
# Build SDM train/test sample sets and convert them to model inputs.
train_set, test_set = gen_data_set_sdm(data, seq_short_len=SEQ_LEN_short, seq_prefer_len=SEQ_LEN_prefer)

train_model_input, train_label = gen_model_input_sdm(train_set, user_profile, SEQ_LEN_short, SEQ_LEN_prefer)
test_model_input, test_label = gen_model_input_sdm(test_set, user_profile, SEQ_LEN_short, SEQ_LEN_prefer)

# 2.count #unique features for each sparse field and generate feature config for sequence feature
embedding_dim = 32
# for sdm,we must provide `VarLenSparseFeat` with name "prefer_xxx" and "short_xxx" and their length
user_feature_columns = [SparseFeat('user_id', feature_max_idx['user_id'], 16),
                        SparseFeat("gender", feature_max_idx['gender'], 16),
                        SparseFeat("age", feature_max_idx['age'], 16),
                        SparseFeat("occupation", feature_max_idx['occupation'], 16),
                        SparseFeat("zip", feature_max_idx['zip'], 16),
                        VarLenSparseFeat(SparseFeat('short_movie_id', feature_max_idx['movie_id'],
                                                    embedding_dim, embedding_name="movie_id"),
                                         SEQ_LEN_short, 'mean', 'short_sess_length'),
                        VarLenSparseFeat(SparseFeat('prefer_movie_id', feature_max_idx['movie_id'],
                                                    embedding_dim, embedding_name="movie_id"),
                                         SEQ_LEN_prefer, 'mean', 'prefer_sess_length'),
                        VarLenSparseFeat(SparseFeat('short_genres', feature_max_idx['genres'],
                                                    embedding_dim, embedding_name="genres"),
                                         SEQ_LEN_short, 'mean', 'short_sess_length'),
                        VarLenSparseFeat(SparseFeat('prefer_genres', feature_max_idx['genres'],
                                                    embedding_dim, embedding_name="genres"),
                                         SEQ_LEN_prefer, 'mean', 'prefer_sess_length'),
                        ]

item_feature_columns = [SparseFeat('movie_id', feature_max_idx['movie_id'], embedding_dim)]

# NOTE(review): forces the Keras learning phase to training mode globally — confirm still required.
K.set_learning_phase(True)
embedding_dim=embedding_size), SparseFeat("movie_id", vocabulary_size=movie_num, embedding_dim=embedding_size), SparseFeat("current_label", vocabulary_size=genre_num, embedding_dim=embedding_size), SparseFeat("release_year", vocabulary_size=year_num, embedding_dim=embedding_size), ] feature_columns += [ VarLenSparseFeat(SparseFeat("user_recent_click_movie_ids", vocabulary_size=movie_num, embedding_dim=embedding_size, embedding_name='movie_id'), maxlen=20), VarLenSparseFeat(SparseFeat("user_recent_click_labels", vocabulary_size=genre_num, embedding_dim=embedding_size, embedding_name='current_label'), maxlen=20), VarLenSparseFeat(SparseFeat("user_like_genres", vocabulary_size=genre_num, embedding_dim=embedding_size, embedding_name='current_label'), maxlen=2), ] dnn_feature_columns = feature_columns
def get_xy_from_txt(file_path="data/movielens_sample_din.txt"):
    """Load a DIN-style sample file and build model inputs.

    The file is comma-separated with columns
    label,user,gender,item_id,cate_id,hist_item_id,hist_cate_id,pay_score,
    where the hist_* columns hold '|'-separated id sequences.

    Returns (x, y, feature_columns, behavior_feature_list).
    """
    feature_columns = [
        SparseFeat('user', 3, embedding_dim=10),
        SparseFeat('gender', 2, embedding_dim=4),
        SparseFeat('item_id', 3 + 1, embedding_dim=8),
        SparseFeat('cate_id', 2 + 1, embedding_dim=4),
        DenseFeat('pay_score', 1)
    ]
    feature_columns += [
        VarLenSparseFeat(SparseFeat('hist_item_id', vocabulary_size=3 + 1, embedding_dim=8,
                                    embedding_name='item_id'), maxlen=4),
        VarLenSparseFeat(SparseFeat('hist_cate_id', 2 + 1, embedding_dim=4,
                                    embedding_name='cate_id'), maxlen=4)
    ]
    behavior_feature_list = ["item_id", "cate_id"]

    data = pd.read_csv(file_path, delimiter=',')

    def to_int_array(cell):
        # '1|2|3' -> np.array([1, 2, 3]).  (The previous loop used `str` as its
        # loop variable, shadowing the builtin.)
        return np.array([int(tok) for tok in cell.split('|')])

    data['hist_item_id'] = data['hist_item_id'].apply(to_int_array)
    data['hist_cate_id'] = data['hist_cate_id'].apply(to_int_array)

    uid = np.array(data['user'])
    ugender = np.array(data['gender'])
    iid = np.array(data['item_id'])        # 0 is mask value
    cate_id = np.array(data['cate_id'])    # 0 is mask value
    pay_score = np.array(data['pay_score'])

    print("hist_cate_id: ", type(data['hist_cate_id']), type(data['hist_cate_id'][0]),
          np.shape(data['hist_cate_id'][0]), data['hist_cate_id'])
    print("------------" * 10)

    # Stack the per-row 1-D arrays into 2-D history matrices.
    hist_iid = np.array(data['hist_item_id'].tolist())
    hist_cate_id = np.array(data['hist_cate_id'].tolist())
    print("uid: ", type(uid), uid)
    print("hist_cate_id: ", type(hist_cate_id), type(hist_cate_id[0]),
          np.shape(hist_cate_id[0]), hist_cate_id)

    feature_dict = {
        'user': uid, 'gender': ugender, 'item_id': iid, 'cate_id': cate_id,
        'hist_item_id': hist_iid, 'hist_cate_id': hist_cate_id,
        'pay_score': pay_score
    }
    x = {name: feature_dict[name] for name in get_feature_names(feature_columns)}
    y = np.array(data.pop('label'))
    return x, y, feature_columns, behavior_feature_list
fixlen_feature_columns = [SparseFeat(feat, train[feat].nunique(), embedding_dim=int(sys.argv[5])) for feat in sparse_features] + [DenseFeat(feat, 1,) for feat in dense_features] linear_feature_columns = fixlen_feature_columns dnn_feature_columns = fixlen_feature_columns fixlen_feature_names = get_feature_names(fixlen_feature_columns) train_model_input = {name: train[name] for name in fixlen_feature_names} test_model_input = {name: test[name] for name in fixlen_feature_names} if sys.argv[1] in ['DIEN', 'DIEN_UDG', 'DIN', 'DIN_UDG']: test_model_input, test_label, max_len = get_input(test, 0, 'test') train_model_input, train_label, _ = get_input(train, max_len, 'train') fixlen_feature_columns = [SparseFeat(feat, train[feat].nunique()+1, embedding_dim=int(sys.argv[5])) for feat in sparse_features] fixlen_feature_columns += [DenseFeat(feat, 1,) for feat in dense_features] fixlen_feature_columns += [ VarLenSparseFeat(SparseFeat('hist_itemId', train['itemId'].nunique() + 1, embedding_dim = int(sys.argv[5]), embedding_name='itemId'), maxlen=max_len, length_name='seq_length'), VarLenSparseFeat(SparseFeat('hist_category', train['category'].nunique() + 1, embedding_dim = int(sys.argv[5]), embedding_name='category'), maxlen=max_len, length_name='seq_length'), VarLenSparseFeat(SparseFeat('neg_hist_itemId', train['itemId'].nunique() + 1, embedding_dim = int(sys.argv[5]), embedding_name='itemId'), maxlen=max_len, length_name='seq_length'), VarLenSparseFeat(SparseFeat('neg_hist_category', train['category'].nunique() + 1, embedding_dim = int(sys.argv[5]), embedding_name='category'), maxlen=max_len, length_name='seq_length') ] behavior_feature_list = ['itemId', 'category'] if sys.argv[1] == 'DeepFM_UDG': model = DeepFM_UDG(linear_feature_columns, dnn_feature_columns, untrainable_features_columns,
def get_xy_fd(use_neg=False, hash_flag=False):
    """Toy DIEN-style dataset; optionally adds negative-sampled history sequences."""
    feature_columns = [
        SparseFeat('user', 3, embedding_dim=10, use_hash=hash_flag),
        SparseFeat('gender', 2, embedding_dim=4, use_hash=hash_flag),
        SparseFeat('item_id', 3 + 1, embedding_dim=8, use_hash=hash_flag),
        SparseFeat('cate_id', 2 + 1, embedding_dim=4, use_hash=hash_flag),
        DenseFeat('pay_score', 1),
    ]
    feature_columns += [
        VarLenSparseFeat(
            SparseFeat('hist_item_id', vocabulary_size=3 + 1, embedding_dim=8,
                       embedding_name='item_id'),
            maxlen=4, length_name="seq_length"),
        VarLenSparseFeat(
            SparseFeat('hist_cate_id', 2 + 1, embedding_dim=4,
                       embedding_name='cate_id'),
            maxlen=4, length_name="seq_length"),
    ]
    behavior_feature_list = ["item_id", "cate_id"]

    feature_dict = {
        'user': np.array([0, 1, 2]),
        'gender': np.array([0, 1, 0]),
        'item_id': np.array([1, 2, 3]),     # 0 is mask value
        'cate_id': np.array([1, 2, 2]),     # 0 is mask value
        'hist_item_id': np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]]),
        'hist_cate_id': np.array([[1, 2, 2, 0], [1, 2, 2, 0], [1, 2, 0, 0]]),
        'pay_score': np.array([0.1, 0.2, 0.3]),
        "seq_length": np.array([3, 3, 2]),  # true lengths of the histories
    }

    if use_neg:
        # Negative-sampled counterparts share embeddings with the positives.
        feature_dict['neg_hist_item_id'] = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
        feature_dict['neg_hist_cate_id'] = np.array([[1, 2, 2, 0], [1, 2, 2, 0], [1, 2, 0, 0]])
        feature_columns += [
            VarLenSparseFeat(
                SparseFeat('neg_hist_item_id', vocabulary_size=3 + 1,
                           embedding_dim=8, embedding_name='item_id'),
                maxlen=4, length_name="seq_length"),
            VarLenSparseFeat(
                SparseFeat('neg_hist_cate_id', 2 + 1, embedding_dim=4,
                           embedding_name='cate_id'),
                maxlen=4, length_name="seq_length"),
        ]

    x = {name: feature_dict[name] for name in get_feature_names(feature_columns)}
    y = np.array([1, 0, 1])
    return x, y, feature_columns, behavior_feature_list
padding='post', ) # 2.count #unique features for each sparse field and generate feature config for sequence feature fixlen_feature_columns = [ SparseFeat(feat, data[feat].nunique(), embedding_dim=4) for feat in sparse_features ] use_weighted_sequence = False if use_weighted_sequence: varlen_feature_columns = [ VarLenSparseFeat(SparseFeat('genres', vocabulary_size=len(key2index) + 1, embedding_dim=4), maxlen=max_len, combiner='mean', weight_name='genres_weight') ] # Notice : value 0 is for padding for sequence input feature else: varlen_feature_columns = [ VarLenSparseFeat(SparseFeat('genres', vocabulary_size=len(key2index) + 1, embedding_dim=4), maxlen=max_len, combiner='mean', weight_name=None) ] # Notice : value 0 is for padding for sequence input feature linear_feature_columns = fixlen_feature_columns + varlen_feature_columns dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns
def main(data_path):
    """Load MovieLens-1M, build SDM feature columns and train inputs.

    Fix: `prefer_genres` previously used SEQ_LEN_short as its maxlen although
    its data and length_name belong to the long-term ("prefer") sequence; it
    now uses SEQ_LEN_prefer, matching `prefer_movie_id`.
    """
    unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
    user = pd.read_csv(data_path + "ml-1m/users.dat", sep="::", header=None, names=unames)
    rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
    ratings = pd.read_csv(data_path + "ml-1m/ratings.dat", sep="::", header=None, names=rnames)
    mnames = ['movie_id', 'title', 'genres']
    movies = pd.read_csv(data_path + "ml-1m/movies.dat", sep="::", header=None, names=mnames)
    data = pd.merge(pd.merge(ratings, movies), user)

    sparse_features = ["movie_id", "user_id", "gender", "age", "occupation", "zip", "genres"]
    SEQ_LEN_short = 5    # recent (short-term) session length
    SEQ_LEN_prefer = 50  # long-term (preference) session length

    # 1.Label Encoding for sparse features,and process sequence features with
    # `gen_date_set` and `gen_model_input`
    features = ['user_id', 'movie_id', 'gender', 'age', 'occupation', 'zip', 'genres']
    feature_max_idx = {}
    for feature in features:
        lbe = LabelEncoder()
        data[feature] = lbe.fit_transform(data[feature]) + 1  # keep id 0 free for padding
        feature_max_idx[feature] = data[feature].max() + 1

    user_profile = data[[
        "user_id", "gender", "age", "occupation", "zip", "genres"
    ]].drop_duplicates('user_id')
    item_profile = data[["movie_id"]].drop_duplicates('movie_id')
    user_profile.set_index("user_id", inplace=True)

    train_set, test_set = gen_data_set_sdm(data, seq_short_len=SEQ_LEN_short,
                                           seq_prefer_len=SEQ_LEN_prefer)
    train_model_input, train_label = get_model_input_sdm(
        train_set, user_profile, SEQ_LEN_short, SEQ_LEN_prefer)
    test_model_input, test_label = get_model_input_sdm(test_set, user_profile,
                                                       SEQ_LEN_short, SEQ_LEN_prefer)
    print(train_model_input)

    # 2.count #unique features for each sparse field and generate feature config
    embedding_dim = 32
    user_feature_columns = [
        SparseFeat('user_id', feature_max_idx['user_id'], 16),
        SparseFeat('gender', feature_max_idx['gender'], 16),
        SparseFeat('age', feature_max_idx['age'], 16),
        SparseFeat('occupation', feature_max_idx['occupation'], 16),
        SparseFeat('zip', feature_max_idx['zip'], 16),
        VarLenSparseFeat(SparseFeat('short_movie_id', feature_max_idx['movie_id'],
                                    embedding_dim, embedding_name="movie_id"),
                         SEQ_LEN_short, combiner='mean', length_name='short_sess_length'),
        VarLenSparseFeat(SparseFeat('prefer_movie_id', feature_max_idx['movie_id'],
                                    embedding_dim, embedding_name="movie_id"),
                         SEQ_LEN_prefer, combiner='mean', length_name='prefer_sess_length'),
        VarLenSparseFeat(SparseFeat('short_genres', feature_max_idx['genres'],
                                    embedding_dim, embedding_name='genres'),
                         SEQ_LEN_short, combiner='mean', length_name='short_sess_length'),
        # Fixed: was SEQ_LEN_short, but this is the long-term ("prefer") sequence.
        VarLenSparseFeat(SparseFeat('prefer_genres', feature_max_idx['genres'],
                                    embedding_dim, embedding_name='genres'),
                         SEQ_LEN_prefer, combiner='mean', length_name='prefer_sess_length')
    ]
    item_feature_columns = [
        SparseFeat('movie_id', feature_max_idx['movie_id'], embedding_dim)
    ]

    K.set_learning_phase(True)

    model = SDM(
        user_feature_columns, item_feature_columns,
        history_feature_list=['movie_id', 'genres'],
        units=embedding_dim, num_sampled=100, )

    # Gradient clipping to stabilise training.
    optimizer = tf.keras.optimizers.Adam(lr=0.001, clipnorm=5.0)
if re.search('query', feat_name) != None: continue if re.search('sparse', feat_name) != None: # 是sparse特征 if feat_name[-6:] == 'weight': select_columns_name.append(feat_name) continue select_columns_name.append(feat_name) for key in vocabulary_size.keys(): if key in feat_name: vocabulary_size_val = vocabulary_size[key] embedding_name = key break varlen_feature_columns.append(VarLenSparseFeat( SparseFeat(feat_name, vocabulary_size=vocabulary_size_val + 1, embedding_dim=4, use_hash=False, embedding_name=embedding_name), maxlen=1, combiner='mean', weight_name=feat_name + '_weight', weight_norm=False)) else: # 是dense特征 if feat_name[-6:] == 'weight': select_columns_name.append(feat_name) fixed_feature_columns.append(DenseFeat(feat_name, 1, )) # dense 特征 else: continue # if use_hour_features: # 复现最优结果 # for feat_name in all_columns: # if feat_name[-6:] == 'weight' or feat_name in ['ctr_label', 'cvr_label']: # select_columns_name.append(feat_name) # continue # for key in vocabulary_size.keys(): # if key in feat_name:
# 2.set hashing space for each sparse field and generate feature config for sequence feature fixlen_feature_columns = [ SparseFeat(feat, data[feat].nunique() * 5, embedding_dim=4, use_hash=True, dtype='string') for feat in sparse_features ] varlen_feature_columns = [ VarLenSparseFeat( SparseFeat('genres', vocabulary_size=100, embedding_dim=4, use_hash=True, dtype="string"), maxlen=max_len, combiner='mean', ) ] # Notice : value 0 is for padding for sequence input feature linear_feature_columns = fixlen_feature_columns + varlen_feature_columns dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns) # 3.generate input data for model model_input = {name: data[name] for name in feature_names} model_input['genres'] = genres_list print("model_input:", model_input) # 4.Define Model,compile and train
def get_xy_fd_sdm(hash_flag=False):
    """Toy dataset for SDM: short/long ("prefer") session sequences plus their lengths."""
    user_feature_columns = [
        SparseFeat('user', 3),
        SparseFeat('gender', 2),
        VarLenSparseFeat(SparseFeat('prefer_item', vocabulary_size=100, embedding_dim=8,
                                    embedding_name='item'),
                         maxlen=6, length_name="prefer_sess_length"),
        VarLenSparseFeat(SparseFeat('prefer_cate', vocabulary_size=100, embedding_dim=8,
                                    embedding_name='cate'),
                         maxlen=6, length_name="prefer_sess_length"),
        VarLenSparseFeat(SparseFeat('short_item', vocabulary_size=100, embedding_dim=8,
                                    embedding_name='item'),
                         maxlen=4, length_name="short_sess_length"),
        VarLenSparseFeat(SparseFeat('short_cate', vocabulary_size=100, embedding_dim=8,
                                    embedding_name='cate'),
                         maxlen=4, length_name="short_sess_length"),
    ]
    item_feature_columns = [SparseFeat('item', 100, embedding_dim=8, )]

    x = {
        'user': np.array([0, 1, 2, 1]),
        'gender': np.array([0, 1, 0, 1]),
        'item': np.array([1, 2, 3, 1]),  # 0 is mask value
        'prefer_item': np.array([[1, 2, 3, 4, 5, 6], [1, 2, 3, 4, 5, 0],
                                 [1, 2, 3, 3, 0, 0], [1, 2, 4, 0, 0, 0]]),
        "prefer_cate": np.array([[1, 2, 3, 4, 5, 6], [1, 2, 3, 4, 5, 0],
                                 [1, 2, 3, 3, 0, 0], [1, 2, 4, 0, 0, 0]]),
        'short_item': np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0], [3, 0, 0, 0]]),
        'short_cate': np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0], [3, 0, 0, 0]]),
        'prefer_sess_length': np.array([6, 5, 4, 3]),
        'short_sess_length': np.array([3, 3, 2, 1]),
    }
    y = np.array([1, 1, 1, 0])
    history_feature_list = ['item', 'cate']
    return x, y, user_feature_columns, item_feature_columns, history_feature_list
def get_xy_fd(hash_flag=False):
    """Toy DSIN-style dataset with two behavior sessions (sess_0_*, sess_1_*)."""
    feature_columns = [
        SparseFeat('user', 3, use_hash=hash_flag),
        SparseFeat('gender', 2, use_hash=hash_flag),
        SparseFeat('item', 3 + 1, use_hash=hash_flag),
        SparseFeat('item_gender', 2 + 1, use_hash=hash_flag),
        DenseFeat('score', 1),
    ]
    # Session 0 behavior sequences.
    feature_columns += [
        VarLenSparseFeat(SparseFeat('sess_0_item', 3 + 1, embedding_dim=4,
                                    use_hash=hash_flag, embedding_name='item'), maxlen=4),
        VarLenSparseFeat(SparseFeat('sess_0_item_gender', 2 + 1, embedding_dim=4,
                                    use_hash=hash_flag, embedding_name='item_gender'), maxlen=4),
    ]
    # Session 1 behavior sequences.
    feature_columns += [
        VarLenSparseFeat(SparseFeat('sess_1_item', 3 + 1, embedding_dim=4,
                                    use_hash=hash_flag, embedding_name='item'), maxlen=4),
        VarLenSparseFeat(SparseFeat('sess_1_item_gender', 2 + 1, embedding_dim=4,
                                    use_hash=hash_flag, embedding_name='item_gender'), maxlen=4),
    ]
    behavior_feature_list = ["item", "item_gender"]

    feature_dict = {
        'user': np.array([0, 1, 2]),
        'gender': np.array([0, 1, 0]),
        'item': np.array([1, 2, 3]),         # 0 is mask value
        'item_gender': np.array([1, 2, 1]),  # 0 is mask value
        'score': np.array([0.1, 0.2, 0.3]),
        'sess_0_item': np.array([[1, 2, 3, 0], [1, 2, 3, 0], [0, 0, 0, 0]]),
        'sess_0_item_gender': np.array([[1, 1, 2, 0], [2, 1, 1, 0], [0, 0, 0, 0]]),
        'sess_1_item': np.array([[1, 2, 3, 0], [0, 0, 0, 0], [0, 0, 0, 0]]),
        'sess_1_item_gender': np.array([[1, 1, 2, 0], [0, 0, 0, 0], [0, 0, 0, 0]]),
    }
    x = {name: feature_dict[name] for name in get_feature_names(feature_columns)}
    # Number of valid sessions per sample; passed alongside the feature columns.
    x["sess_length"] = np.array([2, 1, 0])
    y = [1, 0, 1]
    return x, y, feature_columns, behavior_feature_list
def get_test_data(sample_size=1000, embedding_size=4, sparse_feature_num=1,
                  dense_feature_num=1,
                  sequence_feature=('sum', 'mean', 'max', 'weight'),
                  classification=True, include_length=False, hash_flag=False,
                  prefix='', use_group=False):
    """Generate random feature columns plus matching model input for tests.

    Returns (model_input, y, feature_columns).

    Fix: the original used a mutable default list for `sequence_feature` and
    mutated it in place with .pop(...), so after the first call every later
    call silently lost the 'weight' mode.  The default is now a tuple and the
    function works on a local copy.
    """
    sequence_feature = list(sequence_feature)  # local copy; never mutate caller's arg

    feature_columns = []
    model_input = {}

    if 'weight' in sequence_feature:
        # One weighted var-len feature with an explicit length input.
        feature_columns.append(
            VarLenSparseFeat(SparseFeat(prefix + "weighted_seq",
                                        vocabulary_size=2,
                                        embedding_dim=embedding_size),
                             maxlen=3,
                             length_name=prefix + "weighted_seq" + "_seq_length",
                             weight_name=prefix + "weight"))
        s_input, s_len_input = gen_sequence(2, 3, sample_size)
        model_input[prefix + "weighted_seq"] = s_input
        model_input[prefix + 'weight'] = np.random.randn(sample_size, 3, 1)
        model_input[prefix + "weighted_seq" + "_seq_length"] = s_len_input
        sequence_feature.remove('weight')  # handled; remaining modes are combiners

    for i in range(sparse_feature_num):
        # Optionally spread sparse features across 3 embedding groups.
        group_name = str(i % 3) if use_group else DEFAULT_GROUP_NAME
        dim = np.random.randint(1, 10)
        feature_columns.append(
            SparseFeat(prefix + 'sparse_feature_' + str(i), dim, embedding_size,
                       use_hash=hash_flag, dtype=tf.int32, group_name=group_name))

    for i in range(dense_feature_num):
        feature_columns.append(
            DenseFeat(prefix + 'dense_feature_' + str(i), 1, dtype=tf.float32))

    for i, mode in enumerate(sequence_feature):
        dim = np.random.randint(1, 10)
        maxlen = np.random.randint(1, 10)
        feature_columns.append(
            VarLenSparseFeat(SparseFeat(prefix + 'sequence_' + mode,
                                        vocabulary_size=dim,
                                        embedding_dim=embedding_size),
                             maxlen=maxlen, combiner=mode))

    # Random model input matching each declared column.
    for fc in feature_columns:
        if isinstance(fc, SparseFeat):
            model_input[fc.name] = np.random.randint(0, fc.vocabulary_size, sample_size)
        elif isinstance(fc, DenseFeat):
            model_input[fc.name] = np.random.random(sample_size)
        else:
            s_input, s_len_input = gen_sequence(fc.vocabulary_size, fc.maxlen, sample_size)
            model_input[fc.name] = s_input
            if include_length:
                # NOTE(review): `i` here is the index left over from the last
                # enumerate() loop, so every sequence feature shares one length
                # key — kept as-is to preserve existing behavior.
                fc.length_name = prefix + "sequence_" + str(i) + '_seq_length'
                model_input[prefix + "sequence_" + str(i) + '_seq_length'] = s_len_input

    if classification:
        y = np.random.randint(0, 2, sample_size)
    else:
        y = np.random.random(sample_size)

    return model_input, y, feature_columns
test_model_input, test_label = gen_model_input(test_set, user_profile, SEQ_LEN) # 2.count #unique features for each sparse field and generate feature config for sequence feature embedding_dim = 16 user_feature_columns = [ SparseFeat('user_id', feature_max_idx['user_id'], embedding_dim), SparseFeat("gender", feature_max_idx['gender'], embedding_dim), SparseFeat("age", feature_max_idx['age'], embedding_dim), SparseFeat("occupation", feature_max_idx['occupation'], embedding_dim), SparseFeat("zip", feature_max_idx['zip'], embedding_dim), VarLenSparseFeat( SparseFeat('hist_movie_id', feature_max_idx['movie_id'], embedding_dim, embedding_name="movie_id"), SEQ_LEN, 'mean', 'hist_len'), ] item_feature_columns = [ SparseFeat('movie_id', feature_max_idx['movie_id'], embedding_dim) ] # 3.Define Model and train K.set_learning_phase(True) import tensorflow as tf if tf.__version__ >= '2.0.0': tf.compat.v1.disable_eager_execution()
def train():
    """End-to-end YoutubeDNN retrieval pipeline on the MovieLens-style data.

    Loads data, label-encodes the sparse features, builds train/test behavior
    sequences, trains a YoutubeDNN two-tower model with sampled softmax,
    retrieves top-10 items per test user via faiss inner-product search, and
    prints precision / recall / coverage / popularity metrics.
    """
    data = load_data()
    item_set = set(data['movie_id'].unique())
    SEQ_LEN = 50

    # 1. Label Encoding for sparse features, and process sequence features
    #    with `gen_data_set` and `gen_model_input`.
    features = ['user_id', 'movie_id', 'gender', 'age', 'occupation', 'zip']
    feature_max_idx = {}
    for feature in features:
        lbe = LabelEncoder()
        # +1 keeps 0 free as the padding/mask value.
        data[feature] = lbe.fit_transform(data[feature]) + 1
        feature_max_idx[feature] = data[feature].max() + 1

    user_profile = data[["user_id", "gender", "age", "occupation",
                         "zip"]].drop_duplicates('user_id')
    item_profile = data[["movie_id"]].drop_duplicates('movie_id')
    user_profile.set_index("user_id", inplace=True)

    train_set, test_set = gen_data_set(data, 0)
    train_model_input, train_label = gen_model_input(train_set, user_profile,
                                                     SEQ_LEN)
    test_model_input, test_label = gen_model_input(test_set, user_profile,
                                                   SEQ_LEN)

    # 2. Count #unique features for each sparse field and generate feature
    #    config for the sequence feature.
    embedding_dim = 16
    user_feature_columns = [
        SparseFeat('user_id', feature_max_idx['user_id'], embedding_dim),
        SparseFeat("gender", feature_max_idx['gender'], embedding_dim),
        SparseFeat("age", feature_max_idx['age'], embedding_dim),
        SparseFeat("occupation", feature_max_idx['occupation'], embedding_dim),
        SparseFeat("zip", feature_max_idx['zip'], embedding_dim),
        VarLenSparseFeat(
            SparseFeat('hist_movie_id', feature_max_idx['movie_id'],
                       embedding_dim, embedding_name="movie_id"), SEQ_LEN,
            'mean', 'hist_len'),
    ]
    item_feature_columns = [
        SparseFeat('movie_id', feature_max_idx['movie_id'], embedding_dim)
    ]

    # 3. Define model and train. Sampled softmax needs graph mode on TF 2.x.
    K.set_learning_phase(True)
    import tensorflow as tf
    if tf.__version__ >= '2.0.0':
        tf.compat.v1.disable_eager_execution()

    model = YoutubeDNN(user_feature_columns, item_feature_columns,
                       num_sampled=5,
                       user_dnn_hidden_units=(64, embedding_dim))
    model.compile(optimizer="adam", loss=sampledsoftmaxloss)

    history = model.fit(train_model_input, train_label,
                        batch_size=256, epochs=50, verbose=1,
                        validation_split=0.0, )

    # 4. Generate user features for testing and full item features for
    #    retrieval.
    test_user_model_input = test_model_input
    all_item_model_input = {"movie_id": item_profile['movie_id'].values}

    user_embedding_model = Model(inputs=model.user_input,
                                 outputs=model.user_embedding)
    item_embedding_model = Model(inputs=model.item_input,
                                 outputs=model.item_embedding)

    user_embs = user_embedding_model.predict(test_user_model_input,
                                             batch_size=2 ** 12)
    # user_embs = user_embs[:, i, :]  # i in [0, k_max) if MIND
    item_embs = item_embedding_model.predict(all_item_model_input,
                                             batch_size=2 ** 12)

    # 5. ANN search by faiss and evaluate the result.
    test_true_label = {line[0]: [line[2]] for line in test_set}

    # Inner-product index; normalize_L2 would turn this into cosine search.
    index = faiss.IndexFlatIP(embedding_dim)
    # faiss.normalize_L2(item_embs)
    index.add(item_embs)
    # faiss.normalize_L2(user_embs)
    D, I = index.search(np.ascontiguousarray(user_embs), 10)

    recommend_dict = {}
    for i, uid in enumerate(test_user_model_input['user_id']):
        recommend_dict.setdefault(uid, [])
        # BUGFIX: narrowed from a bare `except:` which also swallowed
        # KeyboardInterrupt/SystemExit; only an out-of-range faiss id can
        # fail the lookup below.
        try:
            pred = [item_profile['movie_id'].values[x] for x in I[i]]
            recommend_dict[uid] = pred
        except IndexError:
            print(i)

    # Ground truth: the behavior-sequence items of each test user.
    test_user_items = dict()
    for ts in test_set:
        if ts[0] not in test_user_items:
            test_user_items[ts[0]] = set(ts[1])

    # Item popularity counted over the training sequences.
    item_popularity = dict()
    for ts in train_set:
        for item in ts[1]:
            if item in item_popularity:
                item_popularity[item] += 1
            else:
                item_popularity.setdefault(item, 1)

    precision = metric.precision(recommend_dict, test_user_items)
    recall = metric.recall(recommend_dict, test_user_items)
    coverage = metric.coverage(recommend_dict, item_set)
    popularity = metric.popularity(item_popularity, recommend_dict)
    print("precision:{:.4f}, recall:{:.4f}, coverage:{:.4f}, popularity:{:.4f}".format(
        precision, recall, coverage, popularity))
def _model_fn(features, labels, mode, config):
    """Estimator model_fn building a DIN-style attention model.

    Splits the closed-over `dnn_feature_columns` into sparse / dense /
    variable-length groups, looks up embeddings, applies attention pooling of
    the history sequences against the query items, and feeds the concatenated
    result through a DNN to a single logit handed to `deepctr_model_fn`.

    :param features: dict of input tensors from the input_fn.
    :param labels: label tensor (may be None at predict time).
    :param mode: one of tf.estimator.ModeKeys.
    :param config: RunConfig (unused here, required by the Estimator API).
    """
    with variable_scope(DNN_SCOPE_NAME):
        # Re-express the tf.feature_column definitions as DeepCTR feature
        # column objects, bucketed by kind.
        sparse_feature_columns = []
        dense_feature_columns = []
        varlen_sparse_feature_columns = []
        for feat in dnn_feature_columns:
            new_feat_name = list(feat.parse_example_spec.keys())[0]
            if new_feat_name in ['hist_price_id', 'hist_des_id']:
                # NOTE(review): vocabulary_size=100 / embedding_dim=32 /
                # maxlen=3 are hard-coded for these two history features —
                # confirm they match the actual data.
                varlen_sparse_feature_columns.append(
                    VarLenSparseFeat(SparseFeat(new_feat_name,
                                                vocabulary_size=100,
                                                embedding_dim=32,
                                                use_hash=False),
                                     maxlen=3))
            elif is_embedding(feat):
                sparse_feature_columns.append(
                    SparseFeat(new_feat_name,
                               vocabulary_size=feat[0]._num_buckets + 1,
                               embedding_dim=feat.dimension))
            else:
                dense_feature_columns.append(DenseFeat(new_feat_name))

        # Variable-length columns named "hist_<x>" for x in
        # history_feature_list are attended over; the rest are pooled.
        history_feature_columns = []
        sparse_varlen_feature_columns = []
        history_fc_names = list(
            map(lambda x: "hist_" + x, history_feature_list))
        for fc in varlen_sparse_feature_columns:
            feature_name = fc.name
            if feature_name in history_fc_names:
                history_feature_columns.append(fc)
            else:
                sparse_varlen_feature_columns.append(fc)

        my_feature_columns = (sparse_feature_columns + dense_feature_columns
                              + varlen_sparse_feature_columns)
        embedding_dict = create_embedding_matrix(my_feature_columns,
                                                 l2_reg_embedding, seed,
                                                 prefix="")

        # Query = current item embeddings; keys = history embeddings (masked).
        query_emb_list = embedding_lookup(embedding_dict, features,
                                          sparse_feature_columns,
                                          history_feature_list,
                                          history_feature_list, to_list=True)
        keys_emb_list = embedding_lookup(embedding_dict, features,
                                         history_feature_columns,
                                         history_fc_names, history_fc_names,
                                         to_list=True)
        dnn_input_emb_list = embedding_lookup(
            embedding_dict, features, sparse_feature_columns,
            mask_feat_list=history_feature_list, to_list=True)
        dense_value_list = get_dense_input(features, dense_feature_columns)

        # Non-history variable-length features get combiner pooling.
        sequence_embed_dict = varlen_embedding_lookup(
            embedding_dict, features, sparse_varlen_feature_columns)
        sequence_embed_list = get_varlen_pooling_list(
            sequence_embed_dict, features, sparse_varlen_feature_columns,
            to_list=True)
        dnn_input_emb_list += sequence_embed_list

        keys_emb = concat_func(keys_emb_list, mask=True)
        deep_input_emb = concat_func(dnn_input_emb_list)
        query_emb = concat_func(query_emb_list, mask=True)

        # Attention pooling of the history keys w.r.t. the query item.
        hist = AttentionSequencePoolingLayer(
            att_hidden_size, att_activation,
            weight_normalization=att_weight_normalization,
            supports_masking=True)([query_emb, keys_emb])

        deep_input_emb = tf.keras.layers.Concatenate()(
            [NoMask()(deep_input_emb), hist])
        deep_input_emb = tf.keras.layers.Flatten()(deep_input_emb)
        dnn_input = combined_dnn_input([deep_input_emb], dense_value_list)
        output = DNN(dnn_hidden_units, dnn_activation, l2_reg_dnn,
                     dnn_dropout, dnn_use_bn, seed=seed)(dnn_input)
        final_logit = tf.keras.layers.Dense(
            1, use_bias=False,
            kernel_initializer=tf.keras.initializers.glorot_normal(seed))(
            output)

    return deepctr_model_fn(features, mode, final_logit, labels, task,
                            linear_optimizer, dnn_optimizer,
                            training_chief_hooks=training_chief_hooks)
def train_sdm_model(train_model_input, train_label, embedding_dim,
                    feature_max_idx, SEQ_LEN_short, SEQ_LEN_prefer,
                    batch_size, epochs, verbose, validation_split):
    """Build the SDM model and fit it on the prepared input.

    :param train_model_input: dict of feature name -> input array.
    :param train_label: training labels.
    :param embedding_dim: embedding dimension for the sequence/item features.
    :param feature_max_idx: dict of feature name -> vocabulary size.
    :param SEQ_LEN_short: max length of the short-term behavior sequences.
    :param SEQ_LEN_prefer: max length of the long-term behavior sequences.
    :param batch_size: fit batch size.
    :param epochs: fit epochs.
    :param verbose: fit verbosity.
    :param validation_split: fraction of data held out for validation;
        0 trains on the full data with no hold-out.
    :return: the trained model.
    """
    # Plain sparse user-profile features (fixed 16-dim embeddings).
    user_feature_columns = [
        SparseFeat('user_id', feature_max_idx['user_id'], 16),
        SparseFeat('gender', feature_max_idx['gender'], 16),
        SparseFeat('age', feature_max_idx['age'], 16),
        SparseFeat('city', feature_max_idx['city'], 16),
    ]

    # Short-term and long-term behavior sequences. Each shares its embedding
    # table with a base feature via embedding_name; positional VarLenSparseFeat
    # args are (maxlen, combiner, length_name).
    sequence_specs = [
        ('short_doc_id', 'article_id', 'doc_id', SEQ_LEN_short, 'short_sess_length'),
        ('prefer_doc_id', 'article_id', 'doc_id', SEQ_LEN_prefer, 'prefer_sess_length'),
        ('short_cat1', 'cat_1', 'cat_1', SEQ_LEN_short, 'short_sess_length'),
        ('prefer_cat1', 'cat_1', 'cat_1', SEQ_LEN_prefer, 'prefer_sess_length'),
        ('short_cat2', 'cat_2', 'cat_2', SEQ_LEN_short, 'short_sess_length'),
        ('prefer_cat2', 'cat_2', 'cat_2', SEQ_LEN_prefer, 'prefer_sess_length'),
    ]
    for feat_name, vocab_key, emb_name, max_len, len_name in sequence_specs:
        user_feature_columns.append(
            VarLenSparseFeat(
                SparseFeat(feat_name, feature_max_idx[vocab_key],
                           embedding_dim, embedding_name=emb_name),
                max_len, 'mean', len_name))

    item_feature_columns = [
        SparseFeat('doc_id', feature_max_idx['article_id'], embedding_dim)
    ]

    # Define and compile the model with sampled-softmax loss.
    model = Sdm(user_feature_columns, item_feature_columns,
                history_feature_list=['doc_id', 'cat1', 'cat2'])
    model.compile(optimizer="adam", loss=sampledsoftmaxloss)

    # Train; validation_split controls the hold-out fraction.
    history = model.fit(train_model_input, train_label,
                        batch_size=batch_size, epochs=epochs,
                        verbose=verbose,
                        validation_split=validation_split)

    return model