def get_mov_combined_features():

    MOV_DICT_SIZE = paddle.dataset.movielens.max_movie_id() + 1

    mov_id = layers.data(name='movie_id', shape=[1], dtype='int64')

    mov_emb = layers.embedding(
        input=mov_id,
        dtype='float32',
        size=[MOV_DICT_SIZE, 32],
        param_attr='movie_table',
        is_sparse=IS_SPARSE)

    mov_fc = layers.fc(input=mov_emb, size=32)

    CATEGORY_DICT_SIZE = len(paddle.dataset.movielens.movie_categories())

    category_id = layers.data(
        name='category_id', shape=[1], dtype='int64', lod_level=1)

    mov_categories_emb = layers.embedding(
        input=category_id, size=[CATEGORY_DICT_SIZE, 32], is_sparse=IS_SPARSE)

    mov_categories_hidden = layers.sequence_pool(
        input=mov_categories_emb, pool_type="sum")

    MOV_TITLE_DICT_SIZE = len(paddle.dataset.movielens.get_movie_title_dict())

    mov_title_id = layers.data(
        name='movie_title', shape=[1], dtype='int64', lod_level=1)

    mov_title_emb = layers.embedding(
        input=mov_title_id, size=[MOV_TITLE_DICT_SIZE, 32], is_sparse=IS_SPARSE)

    # The movie title is a sequence of integers, where each integer is the
    # index of the corresponding word in the title dictionary. This sequence
    # is fed into a sequence_conv_pool layer, which applies convolution and
    # pooling along the time dimension. As a result, the output has a fixed
    # length even though the input sequences vary in length.
    mov_title_conv = nets.sequence_conv_pool(
        input=mov_title_emb,
        num_filters=32,
        filter_size=3,
        act="tanh",
        pool_type="sum")

    concat_embed = layers.concat(
        input=[mov_fc, mov_categories_hidden, mov_title_conv], axis=1)

    mov_combined_features = layers.fc(input=concat_embed, size=200, act="tanh")

    return mov_combined_features
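# The comment above makes a claim worth seeing in isolation: sequence_conv_pool
# collapses variable-length sequences into fixed-size vectors. Below is a
# minimal, hedged sketch of that behaviour, assuming the paddle.fluid 1.x
# static-graph API; the vocabulary size, sequence lengths, and names
# ('title', toy ids) are illustrative assumptions, not part of the model above.
import numpy as np
import paddle.fluid as fluid
from paddle.fluid import layers, nets

title = layers.data(name='title', shape=[1], dtype='int64', lod_level=1)
title_emb = layers.embedding(input=title, size=[100, 32])
title_feat = nets.sequence_conv_pool(
    input=title_emb, num_filters=32, filter_size=3, act="tanh", pool_type="sum")

place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())

# Two titles of different lengths (4 and 7 tokens) packed into one LoD tensor.
ids = np.random.randint(0, 100, size=(11, 1)).astype('int64')
batch = fluid.create_lod_tensor(ids, [[4, 7]], place)
out = exe.run(feed={'title': batch}, fetch_list=[title_feat])[0]
print(out.shape)  # (2, 32): one fixed-length vector per variable-length title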
def textcnn(token_ids, vocab_size, num_classes, emb_dim, num_filters,
            mlp_hid_dim):
    """Forward pass of the TextCNN model.

    Args:
        token_ids: LoD tensor of shape [-1, 1] holding samples of different lengths.
        vocab_size: vocabulary size.
        num_classes: number of classes.
        emb_dim: dimension of the word embeddings.
        num_filters: number of convolution filters per filter size.
        mlp_hid_dim: hidden dimension of the MLP.

    Returns:
        prediction: predicted probability distribution over the classes.
    """
    # Look up the word embeddings of the input tokens.
    emb = layers.embedding(input=token_ids, size=[vocab_size, emb_dim])
    # Convolution & max-pooling with filter size 3.
    res_size3 = nets.sequence_conv_pool(
        input=emb, num_filters=num_filters, filter_size=3,
        act="tanh", pool_type="max")
    # Convolution & max-pooling with filter size 4.
    res_size4 = nets.sequence_conv_pool(
        input=emb, num_filters=num_filters, filter_size=4,
        act="tanh", pool_type="max")
    # Convolution & max-pooling with filter size 5.
    res_size5 = nets.sequence_conv_pool(
        input=emb, num_filters=num_filters, filter_size=5,
        act="tanh", pool_type="max")
    # Map the pooled feature vectors to the MLP hidden layer.
    hidden = layers.fc(
        input=[res_size3, res_size4, res_size5], size=mlp_hid_dim)
    # Map the hidden layer to the class probabilities.
    prediction = fluid.layers.fc(
        input=hidden, size=num_classes, act="softmax")
    return prediction
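# A hedged, minimal usage sketch for textcnn() above, assuming the paddle.fluid
# 1.x static-graph API and that `layers`, `nets`, and `fluid` are imported at
# module level as in the snippet. The vocabulary size, class count, layer sizes,
# and toy token ids below are illustrative assumptions, not values from the source.
import numpy as np
import paddle.fluid as fluid
from paddle.fluid import layers, nets

token_ids = layers.data(name='token_ids', shape=[1], dtype='int64', lod_level=1)
prediction = textcnn(token_ids, vocab_size=1000, num_classes=2,
                     emb_dim=128, num_filters=64, mlp_hid_dim=96)

place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())

# Two samples of lengths 3 and 5 packed into a single LoD tensor of shape [8, 1].
words = np.arange(8, dtype='int64').reshape(8, 1)
batch = fluid.create_lod_tensor(words, [[3, 5]], place)
probs = exe.run(feed={'token_ids': batch}, fetch_list=[prediction])[0]
print(probs.shape)  # (2, 2): one softmax distribution per sample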
def get_mov_combined_features():

    MOV_DICT_SIZE = paddle.dataset.movielens.max_movie_id() + 1

    mov_id = layers.data(name='movie_id', shape=[1], dtype='int64')

    mov_emb = layers.embedding(
        input=mov_id,
        dtype='float32',
        size=[MOV_DICT_SIZE, 32],
        param_attr='movie_table',
        is_sparse=IS_SPARSE)

    mov_fc = layers.fc(input=mov_emb, size=32)

    CATEGORY_DICT_SIZE = len(paddle.dataset.movielens.movie_categories())

    category_id = layers.data(
        name='category_id', shape=[1], dtype='int64', lod_level=1)

    mov_categories_emb = layers.embedding(
        input=category_id, size=[CATEGORY_DICT_SIZE, 32], is_sparse=IS_SPARSE)

    mov_categories_hidden = layers.sequence_pool(
        input=mov_categories_emb, pool_type="sum")

    MOV_TITLE_DICT_SIZE = len(paddle.dataset.movielens.get_movie_title_dict())

    mov_title_id = layers.data(
        name='movie_title', shape=[1], dtype='int64', lod_level=1)

    mov_title_emb = layers.embedding(
        input=mov_title_id, size=[MOV_TITLE_DICT_SIZE, 32], is_sparse=IS_SPARSE)

    mov_title_conv = nets.sequence_conv_pool(
        input=mov_title_emb,
        num_filters=32,
        filter_size=3,
        act="tanh",
        pool_type="sum")

    concat_embed = layers.concat(
        input=[mov_fc, mov_categories_hidden, mov_title_conv], axis=1)

    # FIXME(dzh) : need tanh operator
    mov_combined_features = layers.fc(input=concat_embed, size=200, act="tanh")

    return mov_combined_features
def model():
    """model"""
    user_phone_brand_id = layers.data(name='user_phone_brand', shape=[1], dtype='int64')
    user_gender_id = layers.data(name='user_gender', shape=[1], dtype='int64')
    user_age_id = layers.data(name='user_age', shape=[1], dtype='int64')
    user_status_id = layers.data(name='user_status', shape=[1], dtype="int64")
    user_trade_id = fluid.layers.data(name='user_trade', shape=[1], dtype='int64')
    user_cater_id = fluid.layers.data(name='user_cater', shape=[1], dtype='int64')
    user_income_id = fluid.layers.data(name='user_income', shape=[1], dtype='int64')
    user_city_id = fluid.layers.data(name='user_city', shape=[1], dtype='int64')
    user_click_id = fluid.layers.data(name='user_click', shape=[1], dtype='int64')
    user_b_click_id = fluid.layers.data(name='user_b_click', shape=[1], dtype='int64')
    user_c_click_id = fluid.layers.data(name='user_c_click', shape=[1], dtype='int64')
    user_d_click_id = fluid.layers.data(name='user_d_click', shape=[1], dtype='int64')
    week_id = layers.data(name='week', shape=[1], dtype="int64")
    hour_id = layers.data(name='hour', shape=[1], dtype='int64')
    content_b_c_d_id = layers.data(name='content_b_c_d', shape=[1], dtype='int64')
    content_tags_id = layers.data(name='content_tags', shape=[1], dtype='int64', lod_level=1)
    content_subtags_id = layers.data(name='content_subtags', shape=[1], dtype='int64', lod_level=1)
    user_content_tag_click_id = layers.data(name='user_content_tag_click', shape=[1], dtype='int64')
    user_content_subtag_click_id = layers.data(name='user_content_subtag_click', shape=[1], dtype='int64')
    content_pctr_discrete_id = layers.data(name='content_pctr_discrete', shape=[1], dtype='int64')
    # dnn_score_discrete_id = layers.data(name='dnn_score_discrete', shape=[1], dtype='int64')
    content_pctr = layers.data(name='content_pctr', shape=[1], dtype='float32')
    # dnn_score = layers.data(name='dnn_score', shape=[1], dtype='float32')
    # content_emb = layers.data(name='content_emb', shape=[64], dtype='float32')
    # user_emb = layers.data(name='user_emb', shape=[64], dtype='float32')
    user_click_tags_id = layers.data(
        name='user_click_tags_id', shape=[1], dtype='int64', lod_level=1)
    user_click_subtags_id = layers.data(
        name='user_click_subtags_id', shape=[1], dtype='int64', lod_level=1)
    candidate_title_word = layers.data(name='candidate_title', shape=[1], dtype='int64', lod_level=1)
    candidate_subtitle_word = layers.data(name='candidate_subtitle', shape=[1], dtype='int64', lod_level=1)
    candidate_title_len_id = layers.data(name='candidate_title_len', shape=[1], dtype='int64')
    candidate_subtitle_len_id = layers.data(name='candidate_subtitle_len', shape=[1], dtype='int64')
    click_title_list = layers.data(name='click_title_list', shape=[1], dtype='int64', lod_level=2)
    click_subtitle_list = layers.data(name='click_subtitle_list', shape=[1], dtype='int64', lod_level=2)
    click_title_len_list = layers.data(name='click_title_len_list', shape=[1], dtype='int64', lod_level=1)
    click_subtitle_len_list = layers.data(name='click_subtitle_len_list', shape=[1], dtype='int64', lod_level=1)
    label = layers.data(name='label', shape=[1], dtype='int64')

    # dnn_score_discrete_id.name, dnn_score.name, content_emb.name, user_emb.name,
    load_list = [
        user_phone_brand_id, user_gender_id, user_age_id, user_status_id,
        user_trade_id, user_cater_id, user_income_id, user_city_id,
        user_click_id, user_b_click_id, user_c_click_id, user_d_click_id,
        week_id, hour_id, content_b_c_d_id, content_tags_id,
        content_subtags_id, user_content_tag_click_id,
        user_content_subtag_click_id, content_pctr_discrete_id,
        content_pctr, user_click_tags_id, user_click_subtags_id,
        candidate_title_word, candidate_subtitle_word,
        candidate_title_len_id, candidate_subtitle_len_id,
        click_title_list, click_subtitle_list,
        click_title_len_list, click_subtitle_len_list, label]
    feed_order = [x.name for x in load_list]

    user_phone_brand_emb = layers.embedding(
        input=user_phone_brand_id, dtype='float32', size=[7, EMB_LEN],
        param_attr='user_phone_brand_emb', is_sparse=True)
    user_gender_emb = layers.embedding(
        input=user_gender_id, dtype='float32', size=[3, EMB_LEN],
        param_attr='user_gender_emb', is_sparse=True)
    user_age_emb = layers.embedding(
        input=user_age_id, dtype='float32', size=[8, EMB_LEN],
        param_attr='user_age_emb', is_sparse=True)
    user_status_emb = layers.embedding(
        input=user_status_id, dtype='float32', size=[3, EMB_LEN],
        is_sparse=True, param_attr='user_status_emb')
    user_trade_emb = layers.embedding(
        input=user_trade_id, dtype='float32', size=[24, EMB_LEN],
        is_sparse=True, param_attr='user_trade_emb')
    user_cater_emb = layers.embedding(
        input=user_cater_id, dtype='float32', size=[4, EMB_LEN],
        is_sparse=True, param_attr='user_cater_emb')
    user_income_emb = layers.embedding(
        input=user_income_id, dtype='float32', size=[6, EMB_LEN],
        is_sparse=True, param_attr='user_income_emb')
    user_city_emb = layers.embedding(
        input=user_city_id, dtype='float32', size=[4000, EMB_LEN],
        is_sparse=True, param_attr='user_city_emb')
    user_click_emb = layers.embedding(
        input=user_click_id, dtype='float32', size=[6, EMB_LEN],
        is_sparse=True, param_attr='user_click_emb')
    user_b_click_emb = layers.embedding(
        input=user_b_click_id, dtype='float32', size=[6, EMB_LEN],
        is_sparse=True, param_attr='user_b_click_emb')
    user_c_click_emb = layers.embedding(
        input=user_c_click_id, dtype='float32', size=[6, EMB_LEN],
        is_sparse=True, param_attr='user_c_click_emb')
    user_d_click_emb = layers.embedding(
        input=user_d_click_id, dtype='float32', size=[6, EMB_LEN],
        is_sparse=True, param_attr='user_d_click_emb')
    week_emb = layers.embedding(
        input=week_id, dtype='float32', size=[8, EMB_LEN],
        is_sparse=True, param_attr='week_emb')
    hour_emb = layers.embedding(
        input=hour_id, dtype='float32', size=[24, EMB_LEN],
        is_sparse=True, param_attr='hour_emb')
    content_b_c_d_emb = layers.embedding(
        input=content_b_c_d_id, dtype='float32', size=[3, EMB_LEN],
        is_sparse=True, param_attr='content_b_c_d_emb')
    content_tags_emb = layers.embedding(
        input=content_tags_id, size=[11, EMB_LEN], dtype='float32',
        is_sparse=True,
        param_attr=fluid.ParamAttr(
            name="content_tags_emb",
            learning_rate=0.5,
            regularizer=fluid.regularizer.L2Decay(1.0)))
    content_tags_emb_avg = fluid.layers.sequence_pool(
        input=content_tags_emb, pool_type='average')
    content_subtags_emb = layers.embedding(
        input=content_subtags_id, size=[65, EMB_LEN], dtype='float32',
        is_sparse=True,
        param_attr=fluid.ParamAttr(
            name="content_subtags_emb",
            learning_rate=0.5,
            regularizer=fluid.regularizer.L2Decay(1.0)))
    content_subtags_emb_avg = fluid.layers.sequence_pool(
        input=content_subtags_emb, pool_type='average')
    user_content_tag_click_emb = layers.embedding(
        input=user_content_tag_click_id, dtype='float32',
        size=[11 * 6, EMB_LEN], is_sparse=True,
        param_attr='user_content_tag_click_emb')
    user_content_subtag_click_emb = layers.embedding(
        input=user_content_subtag_click_id, dtype='float32',
        size=[65 * 6, EMB_LEN], is_sparse=True,
        param_attr='user_content_subtag_click_emb')
    content_pctr_discrete_emb = layers.embedding(
        input=content_pctr_discrete_id, dtype='float32',
        size=[55, EMB_LEN], is_sparse=True,
        param_attr='content_pctr_discrete_emb')
    # dnn_score_discrete_emb = layers.embedding(
    #     input=dnn_score_discrete_id, dtype='float32',
    #     size=[21, EMB_LEN], is_sparse=True, param_attr='dnn_score_discrete_emb')
    user_click_tags_id_emb = layers.embedding(
        input=user_click_tags_id, size=[11 * 6, EMB_LEN], dtype='float32',
        is_sparse=True, param_attr="user_content_tag_click_emb")
    user_click_tags_id_emb_avg = fluid.layers.sequence_pool(
        input=user_click_tags_id_emb, pool_type='average')
    user_click_subtags_id_emb = layers.embedding(
        input=user_click_subtags_id, size=[65 * 6, EMB_LEN], dtype='float32',
        is_sparse=True, param_attr="user_content_subtag_click_emb")
    user_click_subtags_id_emb_avg = fluid.layers.sequence_pool(
        input=user_click_subtags_id_emb, pool_type='average')

    # Build the features of the candidate content.
    cand_title_emb = layers.embedding(
        input=candidate_title_word, size=[19962, EMB_LEN], dtype='float32',
        is_sparse=False, param_attr='word_embedding')
    cand_title_conv_pool = nets.sequence_conv_pool(
        input=cand_title_emb, num_filters=NUM_FILTERS, filter_size=3,
        act="relu", pool_type="average",
        param_attr='title_emb_conv', bias_attr='title_emb_conv_b')
    cand_subtitle_emb = layers.embedding(
        input=candidate_subtitle_word, size=[19962, EMB_LEN], dtype='float32',
        is_sparse=False, param_attr='word_embedding')
    cand_subtitle_conv_pool = nets.sequence_conv_pool(
        input=cand_subtitle_emb, num_filters=NUM_FILTERS, filter_size=3,
        act="relu", pool_type="average",
        param_attr='subtitle_emb_conv', bias_attr='subtitle_emb_conv_b')
    cand_title_len_emb = layers.embedding(
        input=candidate_title_len_id, size=[100, EMB_LEN], dtype='float32',
        is_sparse=True, param_attr='title_len_emb')
    cand_subtitle_len_emb = layers.embedding(
        input=candidate_subtitle_len_id, size=[100, EMB_LEN], dtype='float32',
        is_sparse=True, param_attr='subtitle_len_emb')
    cand_title_inf = layers.concat(
        input=[cand_title_conv_pool, cand_subtitle_conv_pool,
               cand_title_len_emb, cand_subtitle_len_emb], axis=-1)
    cand_title_feature = layers.fc(
        input=cand_title_inf, size=32, act="relu",
        param_attr='title_feature_list')  # shared parameters

    # Build the features of the user's historically clicked content.
    click_title_emb = layers.embedding(
        input=click_title_list, size=[19962, EMB_LEN], dtype='float32',
        is_sparse=False, param_attr='word_embedding')
    click_title_drnn = fluid.layers.DynamicRNN()
    with click_title_drnn.block():
        title_emb = click_title_drnn.step_input(click_title_emb)
        click_title_conv_pool = nets.sequence_conv_pool(
            input=title_emb, num_filters=NUM_FILTERS, filter_size=3,
            act="relu", pool_type="average",
            param_attr='title_emb_conv', bias_attr='title_emb_conv_b')
        click_title_drnn.output(click_title_conv_pool)
    click_title_conv_pool_list = click_title_drnn()
    click_subtitle_emb = layers.embedding(
        input=click_subtitle_list, size=[19962, EMB_LEN], dtype='float32',
        is_sparse=False, param_attr='word_embedding')
    click_subtitle_drnn = fluid.layers.DynamicRNN()
    with click_subtitle_drnn.block():
        subtitle_emb = click_subtitle_drnn.step_input(click_subtitle_emb)
        click_subtitle_conv_pool = nets.sequence_conv_pool(
            input=subtitle_emb, num_filters=NUM_FILTERS, filter_size=3,
            act="relu", pool_type="average",
            param_attr='subtitle_emb_conv', bias_attr='subtitle_emb_conv_b')
        click_subtitle_drnn.output(click_subtitle_conv_pool)
    click_subtitle_conv_pool_list = click_subtitle_drnn()
    click_title_len_emb_list = layers.embedding(
        input=click_title_len_list, size=[100, EMB_LEN], dtype='float32',
        is_sparse=True, param_attr='title_len_emb')
    click_subtitle_len_emb_list = layers.embedding(
        input=click_subtitle_len_list, size=[100, EMB_LEN], dtype='float32',
        is_sparse=True, param_attr='subtitle_len_emb')
    click_title_inf_list = layers.concat(
        input=[click_title_conv_pool_list, click_subtitle_conv_pool_list,
               click_title_len_emb_list, click_subtitle_len_emb_list], axis=-1)
    click_title_feature_list = layers.fc(
        input=click_title_inf_list, size=32, act="relu",
        param_attr='title_feature_list')  # shared parameters
    user_click_title_feature = layers.sequence_pool(
        input=click_title_feature_list, pool_type="average")

    user_emb_feature = layers.concat(
        input=[user_phone_brand_emb, user_gender_emb, user_age_emb,
               user_status_emb, user_trade_emb, user_cater_emb,
               user_income_emb, user_city_emb, user_click_emb,
               user_b_click_emb, user_c_click_emb, user_d_click_emb], axis=1)
    content_emb_feature = layers.concat(
        input=[content_b_c_d_emb, content_tags_emb_avg,
               content_subtags_emb_avg, content_pctr_discrete_emb,
               cand_title_feature], axis=1)
    cross_emb_feature = layers.concat(
        input=[user_content_tag_click_emb, user_content_subtag_click_emb,
               user_click_tags_id_emb_avg, user_click_subtags_id_emb_avg,
               user_click_title_feature], axis=1)
    env_emb_feature = layers.concat(input=[week_emb, hour_emb], axis=1)

    combined_features = layers.concat(
        input=[user_emb_feature, content_emb_feature,
               cross_emb_feature, env_emb_feature], axis=1)
    fc1 = layers.fc(input=combined_features, size=200, act='relu',
                    param_attr='fc1', bias_attr='fc1_b')
    fc2 = layers.fc(input=fc1, size=200, act="relu",
                    param_attr='fc2', bias_attr='fc2_b')
    fc3 = layers.fc(input=fc2, size=200, act="relu",
                    param_attr='fc3', bias_attr='fc3_b')

    content_pctr_discrete_id_one_hot = layers.one_hot(
        content_pctr_discrete_id, 55, allow_out_of_range=False)
    final_layer = layers.concat(
        input=[fc3, content_pctr, content_pctr_discrete_id_one_hot], axis=1)
    predict = layers.fc(
        input=final_layer, size=2, act="softmax",
        param_attr='final_predict', bias_attr='final_predict_b')

    auc = fluid.layers.auc(input=predict, label=label, num_thresholds=2 ** 12)
    cost = layers.cross_entropy(input=predict, label=label)
    avg_cost = layers.reduce_mean(cost)

    loader = fluid.io.DataLoader.from_generator(
        feed_list=load_list, capacity=256, use_double_buffer=True,
        iterable=True)

    return {'predict': predict, 'avg_cost': avg_cost,
            'feed_order': feed_order, 'loader': loader, 'auc': auc}
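# A hedged sketch of how the dict returned by model() might be wired up for
# training, assuming the paddle.fluid 1.x static-graph API and that EMB_LEN and
# NUM_FILTERS are defined at module level as in the snippet above. The optimizer,
# the learning rate, and `train_sample_list_reader` (an assumed reader yielding
# lists of samples ordered like net['feed_order']) are illustrative, not part of
# the original code.
import paddle.fluid as fluid

net = model()
fluid.optimizer.Adam(learning_rate=1e-3).minimize(net['avg_cost'])

places = fluid.cpu_places()
exe = fluid.Executor(places[0])
exe.run(fluid.default_startup_program())

# Feed batches through the DataLoader built inside model().
net['loader'].set_sample_list_generator(train_sample_list_reader, places=places)
for data in net['loader']():
    loss_val, = exe.run(fluid.default_main_program(),
                        feed=data,
                        fetch_list=[net['avg_cost']])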