Example #1
def create_feature_columns(note_emb_size=10, note_user_emb_size=6):
    # Create the categorical columns first

    creator_ids = fc.categorical_column_with_hash_bucket("last_note_creators",
                                                         hash_bucket_size=2000,
                                                         dtype=tf.string)
    note_ids = fc.categorical_column_with_hash_bucket("last_note_ids",
                                                      20000,
                                                      dtype=tf.int64)

    creator_id = fc.categorical_column_with_hash_bucket("note_open_id", 2000)
    note_id = fc.categorical_column_with_hash_bucket("note_id",
                                                     20000,
                                                     dtype=tf.int64)

    video_duration = fc.numeric_column("note_video_duration")
    video_duration_bucket = fc.bucketized_column(source_column=video_duration,
                                                 boundaries=[5, 10, 30, 60])

    note_emb = fc.shared_embedding_columns([note_ids, note_id],
                                           note_emb_size,
                                           combiner='sum')
    creator_emb = fc.shared_embedding_columns([creator_ids, creator_id],
                                              note_user_emb_size,
                                              combiner='sum')

    my_feature_columns = note_emb + creator_emb + [video_duration_bucket]
    print("*" * 100)
    print("feature columns:")
    for i in my_feature_columns:
        print(i)
    print("*" * 100)
    return my_feature_columns
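A rough usage sketch (not part of the example above): the returned columns can be fed straight into a canned estimator, assuming an input_fn that yields the feature keys used above ("last_note_ids", "note_id", "last_note_creators", "note_open_id", "note_video_duration") plus a label tensor.

# Hedged usage sketch: wiring the returned columns into a DNNClassifier.
columns = create_feature_columns(note_emb_size=10, note_user_emb_size=6)
estimator = tf.estimator.DNNClassifier(feature_columns=columns,
                                       hidden_units=[128, 64],
                                       n_classes=2)
# estimator.train(input_fn=train_input_fn)  # train_input_fn is assumed, not shown here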
Example #2
def create_feature_columns():
  # user feature
  bids = fc.categorical_column_with_hash_bucket("behaviorBids", 10240, dtype=tf.int64)
  c1ids = fc.categorical_column_with_hash_bucket("behaviorC1ids", 100, dtype=tf.int64)
  cids = fc.categorical_column_with_hash_bucket("behaviorCids", 10240, dtype=tf.int64)
  sids = fc.categorical_column_with_hash_bucket("behaviorSids", 10240, dtype=tf.int64)
  pids = fc.categorical_column_with_hash_bucket("behaviorPids", 1000000, dtype=tf.int64)
  bids_weighted = fc.weighted_categorical_column(bids, "bidWeights")
  c1ids_weighted = fc.weighted_categorical_column(c1ids, "c1idWeights")
  cids_weighted = fc.weighted_categorical_column(cids, "cidWeights")
  sids_weighted = fc.weighted_categorical_column(sids, "sidWeights")
  pids_weighted = fc.weighted_categorical_column(pids, "pidWeights")

  # item feature
  pid = fc.categorical_column_with_hash_bucket("productId", 1000000, dtype=tf.int64)
  sid = fc.categorical_column_with_hash_bucket("sellerId", 10240, dtype=tf.int64)
  bid = fc.categorical_column_with_hash_bucket("brandId", 10240, dtype=tf.int64)
  c1id = fc.categorical_column_with_hash_bucket("cate1Id", 100, dtype=tf.int64)
  cid = fc.categorical_column_with_hash_bucket("cateId", 10240, dtype=tf.int64)

  # context feature
  matchScore = fc.numeric_column("matchScore", default_value=0.0)
  popScore = fc.numeric_column("popScore", default_value=0.0)
  brandPrefer = fc.numeric_column("brandPrefer", default_value=0.0)
  cate2Prefer = fc.numeric_column("cate2Prefer", default_value=0.0)
  catePrefer = fc.numeric_column("catePrefer", default_value=0.0)
  sellerPrefer = fc.numeric_column("sellerPrefer", default_value=0.0)
  matchType = fc.indicator_column(fc.categorical_column_with_identity("matchType", 9, default_value=0))
  position = fc.indicator_column(fc.categorical_column_with_identity("position", 201, default_value=200))
  triggerNum = fc.indicator_column(fc.categorical_column_with_identity("triggerNum", 51, default_value=50))
  triggerRank = fc.indicator_column(fc.categorical_column_with_identity("triggerRank", 51, default_value=50))
  sceneType = fc.indicator_column(fc.categorical_column_with_identity("type", 2, default_value=0))
  hour = fc.indicator_column(fc.categorical_column_with_identity("hour", 24, default_value=0))
  phoneBrand = fc.indicator_column(fc.categorical_column_with_hash_bucket("phoneBrand", 1000))
  phoneResolution = fc.indicator_column(fc.categorical_column_with_hash_bucket("phoneResolution", 500))
  phoneOs = fc.indicator_column(
    fc.categorical_column_with_vocabulary_list("phoneOs", ["android", "ios"], default_value=0))
  tab = fc.indicator_column(fc.categorical_column_with_vocabulary_list("tab",
        ["ALL", "TongZhuang", "XieBao", "MuYing", "NvZhuang", "MeiZhuang", "JuJia", "MeiShi"], default_value=0))

  pid_embed = fc.shared_embedding_columns([pids_weighted, pid], 64, combiner='sum', shared_embedding_collection_name="pid")
  bid_embed = fc.shared_embedding_columns([bids_weighted, bid], 32, combiner='sum', shared_embedding_collection_name="bid")
  cid_embed = fc.shared_embedding_columns([cids_weighted, cid], 32, combiner='sum', shared_embedding_collection_name="cid")
  c1id_embed = fc.shared_embedding_columns([c1ids_weighted, c1id], 10, combiner='sum', shared_embedding_collection_name="c1id")
  sid_embed = fc.shared_embedding_columns([sids_weighted, sid], 32, combiner='sum', shared_embedding_collection_name="sid")
  global my_feature_columns
  my_feature_columns = [matchScore, matchType, position, triggerNum, triggerRank, sceneType, hour, phoneBrand, phoneResolution,
             phoneOs, tab, popScore, sellerPrefer, brandPrefer, cate2Prefer, catePrefer]
  my_feature_columns += pid_embed
  my_feature_columns += sid_embed
  my_feature_columns += bid_embed
  my_feature_columns += cid_embed
  my_feature_columns += c1id_embed
  print("feature columns:", my_feature_columns)
  return my_feature_columns
Example #3
def test_shared_embedding_column_with_hash_bucket():
    # 1. Input features
    color_data = {
        'range': [[2, 2], [5, 5], [0, -1], [0, 0]],
        'id': [[2], [5], [-1], [0]]
    }
    builder = _LazyBuilder(color_data)
    # 2. Feature columns (Sparse)
    color_column = feature_column.categorical_column_with_hash_bucket(
        'range', 7, dtype=tf.int32)
    color_column_tensor = color_column._get_sparse_tensors(builder)
    # 2. Feature columns (Sparse)
    color_column2 = feature_column.categorical_column_with_hash_bucket(
        'id', 7, dtype=tf.int32)
    color_column_tensor2 = color_column2._get_sparse_tensors(builder)
    with tf.Session() as session:
        #session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('not use input_layer' + '_' * 40)
        print(session.run([color_column_tensor.id_tensor]))
        print(session.run([color_column_tensor2.id_tensor]))

    # 2. Feature columns (Dense)
    color_column_embed = feature_column.shared_embedding_columns(
        [color_column2, color_column], 3, combiner='sum')
    print(type(color_column_embed))
    # 3. Feature tensor
    color_dense_tensor = feature_column.input_layer(color_data,
                                                    color_column_embed)

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('use input_layer' + '_' * 40)
        print(session.run(color_dense_tensor))
Example #4
    def transform(self, output_tensors):
        input_tensor = list()
        for input_tensor_name in self.parameters.get("input_tensor"):
            input_tensor.append(output_tensors.get(input_tensor_name))
        if self.parameters.has_key("dimension"):
            dimension = self.parameters.get("dimension")
        else:
            msg = "parameters error, embedding_column must need dimension"
            logger.error(msg)
            raise ParametersError(msg)
        ckpt_to_load_from = None
        tensor_name_in_ckpt = None
        if self.parameters.has_key("ckpt_to_load_from") and self.parameters.has_key("tensor_name_in_ckpt"):
            ckpt_to_load_from = self.parameters.get("ckpt_to_load_from")
            tensor_name_in_ckpt = self.parameters.get("tensor_name_in_ckpt")

        combiner = self.parameters.get("combiner", "mean")
        shared_embedding_columns = fc.shared_embedding_columns(
            categorical_columns=input_tensor,
            dimension=dimension,
            combiner=combiner,
            ckpt_to_load_from=ckpt_to_load_from,
            tensor_name_in_ckpt=tensor_name_in_ckpt
        )
        for output_tensor_name, output_tensor in zip(self.parameters.get("output_tensor"), shared_embedding_columns):
            output_tensors[output_tensor_name] = output_tensor
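For illustration only, a parameters dict that would drive this transform might look like the following; the key names are exactly the ones the method reads, while the values are hypothetical:

# Hypothetical configuration (column/tensor names are made up):
parameters = {
    "input_tensor": ["behaviorPids_column", "productId_column"],  # categorical columns built by earlier transforms
    "output_tensor": ["behaviorPids_emb", "productId_emb"],       # keys under which the shared embeddings are stored
    "dimension": 64,
    "combiner": "sum",
    # optional warm start from a checkpoint:
    # "ckpt_to_load_from": "path/to/model.ckpt",
    # "tensor_name_in_ckpt": "embedding/weights",
}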
Example #5
def test_shared_embedding_column_with_hash_bucket():
    color_data = {
        'color': [[2, 2], [5, 5], [0, -1], [0, 0]],
        'color2': [[2], [5], [-1], [0]]
    }  # 4 sample rows
    builder = _LazyBuilder(color_data)
    color_column = feature_column.categorical_column_with_hash_bucket(
        'color', 7, dtype=tf.int32)
    color_column_tensor = color_column._get_sparse_tensors(builder)
    color_column2 = feature_column.categorical_column_with_hash_bucket(
        'color2', 7, dtype=tf.int32)
    color_column_tensor2 = color_column2._get_sparse_tensors(builder)
    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('not use input_layer' + '_' * 40)
        print(session.run([color_column_tensor.id_tensor]))
        print(session.run([color_column_tensor2.id_tensor]))

    # Convert the sparse columns to dense, i.e. one-hot form (multi-hot here, since a row can hold several ids)
    color_column_embed = feature_column.shared_embedding_columns(
        [color_column2, color_column], 3, combiner='sum')
    print(type(color_column_embed))
    color_dense_tensor = feature_column.input_layer(color_data,
                                                    color_column_embed)

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('use input_layer' + '_' * 40)
        print(session.run(color_dense_tensor))
Example #6
def create_feature_columns(note_emb_size=10, note_user_emb_size=6):
    # Create the categorical columns first

    creator_ids = fc.categorical_column_with_hash_bucket("last_note_creators",
                                                         hash_bucket_size=2000,
                                                         dtype=tf.string)
    note_ids = fc.categorical_column_with_hash_bucket("last_note_ids",
                                                      20000,
                                                      dtype=tf.int64)

    creator_id = fc.categorical_column_with_hash_bucket("note_open_id", 2000)
    note_id = fc.categorical_column_with_hash_bucket("note_id",
                                                     20000,
                                                     dtype=tf.int64)

    video_duration = fc.numeric_column("note_video_duration")
    video_duration_bucket = fc.bucketized_column(source_column=video_duration,
                                                 boundaries=[5, 10, 30, 60])

    note_emb = fc.shared_embedding_columns([note_ids, note_id],
                                           note_emb_size,
                                           combiner='sum')
    creator_emb = fc.shared_embedding_columns([creator_ids, creator_id],
                                              note_user_emb_size,
                                              combiner='sum')

    # phoneBrandId = fc.categorical_column_with_hash_bucket("phoneBrand", 1000)
    # phoneBrand = fc.embedding_column(phoneBrandId, 20)
    # phoneResolutionId = fc.categorical_column_with_hash_bucket("phoneResolution", 500)
    # phoneResolution = fc.embedding_column(phoneResolutionId, 10)
    # phoneOs = fc.indicator_column(fc.categorical_column_with_vocabulary_list("phoneOs", ["android", "ios"], default_value=0))
    # gender = fc.indicator_column(fc.categorical_column_with_identity("gender", num_buckets=3, default_value=0))
    # city_id = fc.categorical_column_with_hash_bucket("city", 700)
    # city = fc.embedding_column(city_id, 16)
    # hour = fc.indicator_column(fc.categorical_column_with_identity("hour", 24, default_value=0))

    my_feature_columns = note_emb + creator_emb + [video_duration_bucket]
    print("*" * 100)
    print("feature columns:")
    for i in my_feature_columns:
        print(i)
    print("*" * 100)
    return my_feature_columns
Example #7
def create_user_feature_columns():
  gender = fc.indicator_column(fc.categorical_column_with_identity("gender", num_buckets=3, default_value=0))
  age_class = fc.indicator_column(fc.categorical_column_with_identity("age_class", num_buckets=7, default_value=0))
  has_baby = fc.indicator_column(fc.categorical_column_with_identity("has_baby", num_buckets=2, default_value=0))
  baby_gender = fc.indicator_column(fc.categorical_column_with_identity("baby_gender", num_buckets=3, default_value=0))
  baby_age = fc.indicator_column(fc.categorical_column_with_identity("baby_age", num_buckets=7, default_value=0))
  grade = fc.indicator_column(fc.categorical_column_with_identity("grade", num_buckets=7, default_value=0))
  rfm_type = fc.indicator_column(fc.categorical_column_with_identity("bi_rfm_type", num_buckets=12, default_value=0))
  cate1_price_prefer = fc.indicator_column(fc.categorical_column_with_identity("cate1_price_prefer", num_buckets=6, default_value=0))
  cate2_price_prefer = fc.indicator_column(fc.categorical_column_with_identity("cate2_price_prefer", num_buckets=6, default_value=0))
  cate3_price_prefer = fc.indicator_column(fc.categorical_column_with_identity("cate3_price_prefer", num_buckets=6, default_value=0))
  city_id = fc.categorical_column_with_hash_bucket("city", 700)
  city = fc.shared_embedding_columns([city_id], 16)
  cols = [gender, age_class, has_baby, baby_gender, baby_age, grade, rfm_type, cate1_price_prefer, cate2_price_prefer, cate3_price_prefer]
  return cols + city
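Note that city is created with shared_embedding_columns even though only one column uses the table, so the call returns a one-element list (hence cols + city). If no sharing is intended, a plain embedding_column is the more direct choice; a roughly equivalent sketch of the last lines of the function:

# Inside create_user_feature_columns, the single-column shared embedding could instead be:
city_id = fc.categorical_column_with_hash_bucket("city", 700)
city = fc.embedding_column(city_id, 16)   # a single embedding column rather than a list
return cols + [city]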
Example #8
def create_interaction_feature_columns(shared_embedding_dim=60):
  # user embedding features
  phoneBrandId = fc.categorical_column_with_hash_bucket("phoneBrand", 1000)
  phoneBrand = fc.shared_embedding_columns([phoneBrandId], shared_embedding_dim)
  phoneResolutionId = fc.categorical_column_with_hash_bucket("phoneResolution", 500)
  phoneResolution = fc.shared_embedding_columns([phoneResolutionId], shared_embedding_dim)
  bids = fc.categorical_column_with_hash_bucket("behaviorBids", 10240, dtype=tf.int64)
  c1ids = fc.categorical_column_with_hash_bucket("behaviorC1ids", 100, dtype=tf.int64)
  cids = fc.categorical_column_with_hash_bucket("behaviorCids", 10240, dtype=tf.int64)
  sids = fc.categorical_column_with_hash_bucket("behaviorSids", 10240, dtype=tf.int64)
  pids = fc.categorical_column_with_hash_bucket("behaviorPids", 1000000, dtype=tf.int64)
  bids_weighted = fc.weighted_categorical_column(bids, "bidWeights")
  c1ids_weighted = fc.weighted_categorical_column(c1ids, "c1idWeights")
  cids_weighted = fc.weighted_categorical_column(cids, "cidWeights")
  sids_weighted = fc.weighted_categorical_column(sids, "sidWeights")
  pids_weighted = fc.weighted_categorical_column(pids, "pidWeights")

  # item embedding features
  pid = fc.categorical_column_with_hash_bucket("productId", 1000000, dtype=tf.int64)
  sid = fc.categorical_column_with_hash_bucket("sellerId", 10240, dtype=tf.int64)
  bid = fc.categorical_column_with_hash_bucket("brandId", 10240, dtype=tf.int64)
  c1id = fc.categorical_column_with_hash_bucket("cate1Id", 100, dtype=tf.int64)
  c2id = fc.categorical_column_with_hash_bucket("cate2Id", 500, dtype=tf.int64)
  cid = fc.categorical_column_with_hash_bucket("cateId", 10240, dtype=tf.int64)

  # shared embedding
  pid_emb = fc.shared_embedding_columns([pids_weighted, pid], shared_embedding_dim, combiner='sum')
  bid_emb = fc.shared_embedding_columns([bids_weighted, bid], shared_embedding_dim, combiner='sum')
  cid_emb = fc.shared_embedding_columns([cids_weighted, cid], shared_embedding_dim, combiner='sum')
  c1id_emb = fc.shared_embedding_columns([c1ids_weighted, c1id], shared_embedding_dim, combiner='sum')
  sid_emb = fc.shared_embedding_columns([sids_weighted, sid], shared_embedding_dim, combiner='sum')
  c2id_emb = fc.shared_embedding_columns([c2id], shared_embedding_dim)

  columns = phoneBrand
  columns += phoneResolution
  columns += pid_emb
  columns += sid_emb
  columns += bid_emb
  columns += cid_emb
  columns += c1id_emb
  columns += c2id_emb
  print("interaction feature columns:", columns)
  return columns
Example #9
def create_feature_columns(dataset, embed_size=32, hash_size=10000):
    n_users = dataset.user.nunique()
    n_items = dataset.item.nunique()
    genre_list = dataset.genre1.unique()
    users = fc.categorical_column_with_vocabulary_list("user",
                                                       np.arange(n_users),
                                                       default_value=-1,
                                                       dtype=tf.int64)
    items = fc.categorical_column_with_vocabulary_list("item",
                                                       np.arange(n_items),
                                                       default_value=-1,
                                                       dtype=tf.int64)
    gender = fc.categorical_column_with_vocabulary_list("gender", ["M", "F"])
    age = fc.categorical_column_with_vocabulary_list(
        "age", [1, 18, 25, 35, 45, 50, 56], dtype=tf.int64)
    occupation = fc.categorical_column_with_vocabulary_list("occupation",
                                                            np.arange(21),
                                                            dtype=tf.int64)
    genre1 = fc.categorical_column_with_vocabulary_list("genre1", genre_list)
    genre2 = fc.categorical_column_with_vocabulary_list("genre2", genre_list)
    genre3 = fc.categorical_column_with_vocabulary_list("genre3", genre_list)

    wide_cols = [
        users, items, gender, age, occupation, genre1, genre2, genre3,
        fc.crossed_column([gender, age, occupation],
                          hash_bucket_size=hash_size),
        fc.crossed_column([age, genre1], hash_bucket_size=hash_size)
    ]

    embed_cols = [users, items, age, occupation]
    deep_cols = list()
    for col in embed_cols:
        deep_cols.append(fc.embedding_column(col, embed_size))

    shared_embed_cols = [genre1, genre2, genre3]
    deep_cols.extend(fc.shared_embedding_columns(shared_embed_cols,
                                                 embed_size))
    deep_cols.append(fc.indicator_column(gender))

    label = fc.numeric_column("label", default_value=0.0, dtype=tf.float32)
    feat_columns = [label]
    feat_columns += wide_cols
    feat_columns += deep_cols
    feat_spec = fc.make_parse_example_spec(feat_columns)
    return wide_cols, deep_cols, feat_spec
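Since make_parse_example_spec turns the columns into a dict of FixedLenFeature/VarLenFeature parsing specs, feat_spec can drive Example parsing directly. A minimal sketch of an input_fn, assuming the data is stored as TFRecord files of serialized tf.train.Example protos:

# Sketch only: file names and batch size are assumptions.
def input_fn(filenames, feat_spec, batch_size=256):
    dataset = tf.data.TFRecordDataset(filenames)
    dataset = dataset.batch(batch_size)

    def parse(serialized_batch):
        features = tf.parse_example(serialized_batch, feat_spec)
        labels = features.pop("label")
        return features, labels

    return dataset.map(parse)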
Example #10
def create_embedding_feature_columns(shared_embedding_dim=64):
    '''
    Description: for the case where several features need to share the same embedding space.
    :return: the shared embedding columns
    '''
    # clicked category ids (behaviour sequence)
    c1ids = fc.categorical_column_with_hash_bucket("behaviorC1ids",
                                                   100,
                                                   dtype=tf.int64)
    # assign weights to the c1ids, a bit like attention
    c1ids_weighted = fc.weighted_categorical_column(c1ids, "c1idWeights")
    # category id
    c1id = fc.categorical_column_with_hash_bucket("cate1Id",
                                                  100,
                                                  dtype=tf.int64)
    # Both c1ids_weighted and c1id are built from category ids; sharing the embedding guarantees
    # that they live in the same embedding space, not that the two features are identical.
    # c1id_emb is a list of length 2; each element produces a shared_embedding_dim-dimensional
    # tensor, so the concatenated width is 2 * shared_embedding_dim.
    c1id_emb = fc.shared_embedding_columns([c1ids_weighted, c1id],
                                           shared_embedding_dim,
                                           combiner='sum')
    return c1id_emb
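To make the comments above concrete, the two shared-embedding outputs can be inspected with input_layer on toy data; everything in this sketch (feature values, dimension 8) is made up:

# Toy demonstration: the weighted behaviour sequence and the single category id are
# embedded in the same space, and input_layer concatenates the two outputs.
features = {
    "behaviorC1ids": [[11, 27, 33], [5, 8, -1]],           # -1 is a padding value
    "c1idWeights":   [[0.5, 0.3, 0.2], [0.9, 0.1, 0.0]],   # weights parallel to behaviorC1ids; padded slot gets 0
    "cate1Id":       [[27], [5]],
}
c1id_emb = create_embedding_feature_columns(shared_embedding_dim=8)
dense_tensor = fc.input_layer(features, c1id_emb)           # shape [2, 16]
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(tf.tables_initializer())
    print(sess.run(dense_tensor))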
def shared_embedding_column_with_hash_bucket():

    features = {'L1': [[410387, 415955, 412596, 416526, 416805, 408844, 418514, 411611, 415266],
                       [410387, 415955, 412596, 416526, 416805, 408844, 418514, 411611, 415266]],
                'LW1': [[44.0, 33.0, 17.0, 6.0, 3.0, 2.0, 1.0, 1.0, 1.0],
                        [44.0, 33.0, 17.0, 6.0, 3.0, 2.0, 1.0, 1.0, 1.0]],
                'a2': [[410387], [415955]]
                }
    """
    这两个编码的映射hash_bucket_size要是统一的一个值,这里是40
    """
    brandlist = tf.feature_column.categorical_column_with_hash_bucket(key='L1', hash_bucket_size=40, dtype=tf.int64)
    brandweighteds = tf.feature_column.weighted_categorical_column(brandlist, 'LW1', dtype=tf.float32)
    brand = tf.feature_column.categorical_column_with_hash_bucket(key='a2', hash_bucket_size=40, dtype=tf.int64)
    brand_embed = feature_column.shared_embedding_columns([brandweighteds, brand], 5, combiner='sum',
                                                          shared_embedding_collection_name="brand")
    print(brand_embed)
    color_dense_tensor = feature_column.input_layer(features, brand_embed)

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('shared_embedding_columns' + '_' * 40)
        print(session.run(color_dense_tensor))
def shared_embedding_column_with_hash_bucket():
    color_data = {'color': [[2, 2], [5, 5], [0, -1], [0, 0]],  # 4 sample rows, shape=[4, 2]
                  'color2': [[2], [5], [-1], [0]]}  # 4 sample rows, shape=[4, 1]
    builder = _LazyBuilder(color_data)

    # categorical_column1
    color_column = feature_column.categorical_column_with_hash_bucket('color', 7, dtype=tf.int32)
    print(color_column)
    # tensor1
    color_column_tensor = color_column._get_sparse_tensors(builder)

    # categorical_column2
    color_column2 = feature_column.categorical_column_with_hash_bucket('color2', 7, dtype=tf.int32)
    print(color_column2)

    # tensor2
    color_column_tensor2 = color_column2._get_sparse_tensors(builder)

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('categorical_column_with_hash_bucket' + '_' * 40)
        print(session.run([color_column_tensor.id_tensor]))
        print(session.run([color_column_tensor2.id_tensor]))
    print('not use input_layer' + '_' * 40)


    color_column_embed = feature_column.shared_embedding_columns([color_column2, color_column], 3, combiner='sum')
    print(type(color_column_embed))
    print((color_column_embed))

    color_dense_tensor = feature_column.input_layer(color_data, color_column_embed)

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('shared_embedding_columns' + '_' * 40)
        print(session.run(color_dense_tensor))
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
Building distributed TensorFlow models: an ESMM case study for CVR prediction
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

from tensorflow import feature_column as fc
# user field: the user's historically browsed product ID list, mapped by hashing
pids = fc.categorical_column_with_hash_bucket("behaviorPids",
                                              10240,
                                              dtype=tf.int64)
# item field: productId, the current candidate product ID, mapped the same way
pid = fc.categorical_column_with_hash_bucket("productId",
                                             1000000,
                                             dtype=tf.int64)
pid_embed = fc.shared_embedding_columns([pids, pid],
                                        100,
                                        combiner='sum',
                                        shared_embedding_collection_name="pid")
"""
那么如何实现weighted sum pooling操作呢?答案就是使用weighted_categorical_column函数。我们必须在构建样本时添加一个额外的权重特征,
权重特征表示行为序列中每个产品的权重,因此权重特征是一个与行为序列平行的列表(向量),两者的维度必须相同。
另外,如果行为序列中有填充的默认值-1,那么权重特征中这些默认值对应的权重必须为0。代码示例如下:
"""
from tensorflow import feature_column as fc
# user field
pids = fc.categorical_column_with_hash_bucket("behaviorPids",
                                              10240,
                                              dtype=tf.int64)
pids_weighted = fc.weighted_categorical_column(pids, "pidWeights")
# item field
pid = fc.categorical_column_with_hash_bucket("productId",
                                             1000000,
        featrues, [p_x_c_identy])
    sess.run(tf.global_variables_initializer())
    sess.run(tf.tables_initializer())
    print(sess.run([p_x_c_identy_dense_tensor]))
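As a concrete, made-up illustration of the parallel weight feature described above: every row of pidWeights has the same length as the corresponding row of behaviorPids, and the padded positions (-1) carry weight 0 so they drop out of the weighted sum.

# Toy feature dict (values are illustrative only):
features = {
    "behaviorPids": [[132, 48, 77, -1],      # shorter behaviour sequences are padded with -1
                     [901, 23, -1, -1]],
    "pidWeights":   [[0.4, 0.3, 0.3, 0.0],   # weights parallel to behaviorPids; padded slots get 0
                     [0.8, 0.2, 0.0, 0.0]],
    "productId":    [[48], [901]],
}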
"""--------------------------------------------- Senior_column -------------------------------------------------"""
"""shared_embedding_columns"""
with tf.Session() as sess:
    color_data = {
        'color': [[2, 2], [5, 5], [0, -1], [0, 0]],
        'color2': [[2], [5], [-1], [0]]
    }  # 4 sample rows
    color_column = feature_column.categorical_column_with_hash_bucket(
        'color', 7, dtype=tf.int32)
    color_column2 = feature_column.categorical_column_with_hash_bucket(
        'color2', 7, dtype=tf.int32)
    color_column_embed = feature_column.shared_embedding_columns(
        [color_column2, color_column], 3, combiner='sum')
    print(type(color_column_embed))
    color_dense_tensor = feature_column.input_layer(color_data,
                                                    color_column_embed)
    sess.run(tf.global_variables_initializer())
    sess.run(tf.tables_initializer())
    print(sess.run(color_dense_tensor))
"""weighted_categorical_column"""
features = {
    'color': [['R'], ['A'], ['G'], ['B'], ['R']],
    'weight': [[1.0], [5.0], [4.0], [8.0], [3.0]]
}

color_f_c = tf.feature_column.categorical_column_with_vocabulary_list(
    'color', ['R', 'G', 'B', 'A'], dtype=tf.string, default_value=-1)
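The snippet breaks off here in the source. A minimal continuation, under the assumption that the weighted column is wrapped in an indicator column so the weights show up directly in the dense output:

# Sketch: multi-hot output scaled by the per-row weights.
color_weighted = tf.feature_column.weighted_categorical_column(color_f_c, 'weight')
color_dense = tf.feature_column.input_layer(features, [tf.feature_column.indicator_column(color_weighted)])
with tf.Session() as session:
    session.run(tf.global_variables_initializer())
    session.run(tf.tables_initializer())
    print(session.run(color_dense))   # e.g. the 'A' row with weight 5.0 becomes [0., 0., 0., 5.]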
Example #15
def create_feature_columns():
  # user feature
  bids = fc.categorical_column_with_hash_bucket("behaviorBids", 10000, dtype=tf.int64)
  c1ids = fc.categorical_column_with_hash_bucket("behaviorC1ids", 100, dtype=tf.int64)
  cids = fc.categorical_column_with_hash_bucket("behaviorCids", 10000, dtype=tf.int64)
  sids = fc.categorical_column_with_hash_bucket("behaviorSids", 10000, dtype=tf.int64)
  pids = fc.categorical_column_with_hash_bucket("behaviorPids", 500000, dtype=tf.int64)

  gender = fc.indicator_column(fc.categorical_column_with_identity("gender", num_buckets=3, default_value=0))
  age_class = fc.indicator_column(fc.categorical_column_with_identity("age_class", num_buckets=7, default_value=0))
  has_baby = fc.indicator_column(fc.categorical_column_with_identity("has_baby", num_buckets=2, default_value=0))
  baby_gender = fc.indicator_column(fc.categorical_column_with_identity("baby_gender", num_buckets=3, default_value=0))
  baby_age = fc.indicator_column(fc.categorical_column_with_identity("baby_age", num_buckets=7, default_value=0))
  grade = fc.indicator_column(fc.categorical_column_with_identity("grade", num_buckets=7, default_value=0))
  rfm_type = fc.indicator_column(fc.categorical_column_with_identity("bi_rfm_type", num_buckets=12, default_value=0))
  cate1_price_prefer = fc.indicator_column(fc.categorical_column_with_identity("cate1_price_prefer", num_buckets=6, default_value=0))
  cate2_price_prefer = fc.indicator_column(fc.categorical_column_with_identity("cate2_price_prefer", num_buckets=6, default_value=0))
  cate3_price_prefer = fc.indicator_column(fc.categorical_column_with_identity("cate3_price_prefer", num_buckets=6, default_value=0))
  city_id = fc.categorical_column_with_hash_bucket("city", 700)
  city = fc.shared_embedding_columns([city_id], 16)

  # item feature
  pid = fc.categorical_column_with_hash_bucket("productId", 500000, dtype=tf.int64)
  sid = fc.categorical_column_with_hash_bucket("sellerId", 10000, dtype=tf.int64)
  bid = fc.categorical_column_with_hash_bucket("brandId", 10000, dtype=tf.int64)
  c1id = fc.categorical_column_with_hash_bucket("cate1Id", 100, dtype=tf.int64)
  cid = fc.categorical_column_with_hash_bucket("cateId", 10000, dtype=tf.int64)
  c2id = fc.categorical_column_with_hash_bucket("cate2Id", 10000, dtype=tf.int64)
  modified_time = fc.numeric_column("modified_time", default_value=0.0)
  modified_time_sqrt = fc.numeric_column("modified_time_sqrt", default_value=0.0)
  modified_time_square = fc.numeric_column("modified_time_square", default_value=0.0)
  props_sex = fc.indicator_column(
    fc.categorical_column_with_vocabulary_list("props_sex", ["男", "女", "通用", "情侣"], default_value=0))
  brand_grade = fc.indicator_column(
    fc.categorical_column_with_vocabulary_list("brand_grade", ["A类品牌", "B类品牌", "C类品牌", "D类品牌"], default_value=0))
  shipment_rate = fc.numeric_column("shipment_rate", default_value=0.0)
  shipping_rate = fc.numeric_column("shipping_rate", default_value=0.0)
  ipv_ntile = fc.bucketized_column(fc.numeric_column("ipv_ntile", dtype=tf.int64, default_value=99), boundaries=[1, 2, 3, 4, 5, 10, 20, 50, 80])
  pay_ntile = fc.bucketized_column(fc.numeric_column("pay_ntile", dtype=tf.int64, default_value=99), boundaries=[1, 2, 3, 4, 5, 10, 20, 50, 80])
  price = fc.numeric_column("price_norm", default_value=0.0)
  ctr_1d = fc.numeric_column("ctr_1d", default_value=0.0)
  cvr_1d = fc.numeric_column("cvr_1d", default_value=0.0)
  uv_cvr_1d = fc.numeric_column("uv_cvr_1d", default_value=0.0)
  ctr_1w = fc.numeric_column("ctr_1w", default_value=0.0)
  cvr_1w = fc.numeric_column("cvr_1w", default_value=0.0)
  uv_cvr_1w = fc.numeric_column("uv_cvr_1w", default_value=0.0)
  ctr_2w = fc.numeric_column("ctr_2w", default_value=0.0)
  cvr_2w = fc.numeric_column("cvr_2w", default_value=0.0)
  uv_cvr_2w = fc.numeric_column("uv_cvr_2w", default_value=0.0)
  ctr_1m = fc.numeric_column("ctr_1m", default_value=0.0)
  cvr_1m = fc.numeric_column("cvr_1m", default_value=0.0)
  uv_cvr_1m = fc.numeric_column("uv_cvr_1m", default_value=0.0)
  pay_qty_1d = fc.numeric_column("pay_qty_1d", default_value=0.0)
  pay_qty_1w = fc.numeric_column("pay_qty_1w", default_value=0.0)
  pay_qty_2w = fc.numeric_column("pay_qty_2w", default_value=0.0)
  pay_qty_1m = fc.numeric_column("pay_qty_1m", default_value=0.0)
  cat2_pay_qty = fc.numeric_column("cat2_pay_qty_1d", default_value=0.0)
  cat1_pay_qty = fc.numeric_column("cat1_pay_qty_1d", default_value=0.0)
  brd_pay_qty = fc.numeric_column("brd_pay_qty_1d", default_value=0.0)
  slr_pay_qty_1d = fc.numeric_column("slr_pay_qty_1d", default_value=0.0)
  slr_pay_qty_1w = fc.numeric_column("slr_pay_qty_1w", default_value=0.0)
  slr_pay_qty_2w = fc.numeric_column("slr_pay_qty_2w", default_value=0.0)
  slr_pay_qty_1m = fc.numeric_column("slr_pay_qty_1m", default_value=0.0)
  slr_brd_pay_qty_1d = fc.numeric_column("slr_brd_pay_qty_1d", default_value=0.0)
  slr_brd_pay_qty_1w = fc.numeric_column("slr_brd_pay_qty_1w", default_value=0.0)
  slr_brd_pay_qty_2w = fc.numeric_column("slr_brd_pay_qty_2w", default_value=0.0)
  slr_brd_pay_qty_1m = fc.numeric_column("slr_brd_pay_qty_1m", default_value=0.0)
  weighted_ipv = fc.numeric_column("weighted_ipv", default_value=0.0)
  cat1_weighted_ipv = fc.numeric_column("cat1_weighted_ipv", default_value=0.0)
  cate_weighted_ipv = fc.numeric_column("cate_weighted_ipv", default_value=0.0)
  slr_weighted_ipv = fc.numeric_column("slr_weighted_ipv", default_value=0.0)
  brd_weighted_ipv = fc.numeric_column("brd_weighted_ipv", default_value=0.0)
  cms_scale = fc.numeric_column("cms_scale", default_value=0.0)
  cms_scale_sqrt = fc.numeric_column("cms_scale_sqrt", default_value=0.0)

  # context feature
  matchScore = fc.numeric_column("matchScore", default_value=0.0)
  popScore = fc.numeric_column("popScore", default_value=0.0)
  brandPrefer = fc.numeric_column("brandPrefer", default_value=0.0)
  cate2Prefer = fc.numeric_column("cate2Prefer", default_value=0.0)
  catePrefer = fc.numeric_column("catePrefer", default_value=0.0)
  sellerPrefer = fc.numeric_column("sellerPrefer", default_value=0.0)
  matchType = fc.indicator_column(fc.categorical_column_with_identity("matchType", 9, default_value=0))
  position = fc.bucketized_column(fc.numeric_column("position", dtype=tf.int64, default_value=301),
    boundaries=[1, 2, 3, 4, 5, 6, 7, 8, 9, 15, 20, 30, 40, 50, 80, 100, 150, 200, 300])
  triggerNum = fc.indicator_column(fc.categorical_column_with_identity("triggerNum", 41, default_value=40))
  triggerRank = fc.indicator_column(fc.categorical_column_with_identity("triggerRank", 41, default_value=40))
  sceneType = fc.indicator_column(fc.categorical_column_with_identity("type", 2, default_value=0))
  hour = fc.indicator_column(fc.categorical_column_with_identity("hour", 24, default_value=0))
  phoneBrandId = fc.categorical_column_with_hash_bucket("phoneBrand", 1000)
  phoneBrand = fc.shared_embedding_columns([phoneBrandId], 20)
  phoneResolutionId = fc.categorical_column_with_hash_bucket("phoneResolution", 500)
  phoneResolution = fc.shared_embedding_columns([phoneResolutionId], 10)
  phoneOs = fc.indicator_column(
    fc.categorical_column_with_vocabulary_list("phoneOs", ["android", "ios"], default_value=0))
  tab = fc.indicator_column(fc.categorical_column_with_vocabulary_list("tab",
        ["ALL", "TongZhuang", "XieBao", "MuYing", "NvZhuang", "MeiZhuang", "JuJia", "MeiShi"], default_value=0))

  pid_embed = fc.shared_embedding_columns([pids, pid], 64, combiner='sqrtn', shared_embedding_collection_name="pid")
  bid_embed = fc.shared_embedding_columns([bids, bid], 32, combiner='sqrtn', shared_embedding_collection_name="bid")
  cid_embed = fc.shared_embedding_columns([cids, cid], 48, combiner='sqrtn', shared_embedding_collection_name="cid")
  c1id_embed = fc.shared_embedding_columns([c1ids, c1id], 10, combiner='sqrtn', shared_embedding_collection_name="c1id")
  sid_embed = fc.shared_embedding_columns([sids, sid], 32, combiner='sqrtn', shared_embedding_collection_name="sid")
  c2id_embed = fc.shared_embedding_columns([c2id], 32, shared_embedding_collection_name="c2id")
  feature_columns = [matchScore, matchType, position, triggerNum, triggerRank, sceneType, hour,
    phoneOs, tab, popScore, sellerPrefer, brandPrefer, cate2Prefer, catePrefer,
    gender, age_class, has_baby, baby_gender, baby_age, grade, rfm_type, price, props_sex, brand_grade,
    cate1_price_prefer, cate2_price_prefer, cate3_price_prefer, modified_time, modified_time_sqrt, modified_time_square,
    shipment_rate, shipping_rate, ipv_ntile, pay_ntile, uv_cvr_1d, uv_cvr_1w, uv_cvr_2w, uv_cvr_1m,
    ctr_1d, ctr_1w, ctr_2w, ctr_1m, cvr_1d, cvr_1w, cvr_2w, cvr_1m,
    pay_qty_1d, pay_qty_1w, pay_qty_2w, pay_qty_1m, cat2_pay_qty, cat1_pay_qty, brd_pay_qty,
    slr_pay_qty_1d, slr_pay_qty_1w, slr_pay_qty_2w, slr_pay_qty_1m,
    slr_brd_pay_qty_1d, slr_brd_pay_qty_1w, slr_brd_pay_qty_2w, slr_brd_pay_qty_1m,
    weighted_ipv, cat1_weighted_ipv, cate_weighted_ipv, slr_weighted_ipv, brd_weighted_ipv,
    cms_scale, cms_scale_sqrt]
  feature_columns += pid_embed
  feature_columns += sid_embed
  feature_columns += bid_embed
  feature_columns += cid_embed
  feature_columns += c1id_embed
  feature_columns += c2id_embed
  feature_columns += city
  feature_columns += phoneResolution
  feature_columns += phoneBrand
  print("feature columns:", feature_columns)
  return feature_columns
Example #16
def create_feature_columns():
    c2id = fc.categorical_column_with_hash_bucket("cate2Id",
                                                  5000,
                                                  dtype=tf.int64)
    modified_time = fc.numeric_column("modified_time", default_value=0.0)
    modified_time_sqrt = fc.numeric_column("modified_time_sqrt",
                                           default_value=0.0)
    modified_time_square = fc.numeric_column("modified_time_square",
                                             default_value=0.0)
    props_sex = fc.indicator_column(
        fc.categorical_column_with_vocabulary_list("props_sex",
                                                   ["男", "女", "通用", "情侣"],
                                                   default_value=0))
    brand_grade = fc.indicator_column(
        fc.categorical_column_with_vocabulary_list(
            "brand_grade", ["A类品牌", "B类品牌", "C类品牌", "D类品牌"], default_value=0))
    shipment_rate = fc.numeric_column("shipment_rate", default_value=0.0)
    shipping_rate = fc.numeric_column("shipping_rate", default_value=0.0)
    ipv_ntile = fc.bucketized_column(
        fc.numeric_column("ipv_ntile", dtype=tf.int64, default_value=99),
        boundaries=[1, 2, 3, 4, 5, 10, 20, 50, 80])
    pay_ntile = fc.bucketized_column(
        fc.numeric_column("pay_ntile", dtype=tf.int64, default_value=99),
        boundaries=[1, 2, 3, 4, 5, 10, 20, 50, 80])
    price = fc.numeric_column("price_norm", default_value=0.0)
    ctr_1d = fc.numeric_column("ctr_1d", default_value=0.0)
    cvr_1d = fc.numeric_column("cvr_1d", default_value=0.0)
    uv_cvr_1d = fc.numeric_column("uv_cvr_1d", default_value=0.0)
    ctr_1w = fc.numeric_column("ctr_1w", default_value=0.0)
    cvr_1w = fc.numeric_column("cvr_1w", default_value=0.0)
    uv_cvr_1w = fc.numeric_column("uv_cvr_1w", default_value=0.0)
    ctr_2w = fc.numeric_column("ctr_2w", default_value=0.0)
    cvr_2w = fc.numeric_column("cvr_2w", default_value=0.0)
    uv_cvr_2w = fc.numeric_column("uv_cvr_2w", default_value=0.0)
    ctr_1m = fc.numeric_column("ctr_1m", default_value=0.0)
    cvr_1m = fc.numeric_column("cvr_1m", default_value=0.0)
    uv_cvr_1m = fc.numeric_column("uv_cvr_1m", default_value=0.0)
    pay_qty_1d = fc.numeric_column("pay_qty_1d", default_value=0.0)
    pay_qty_1w = fc.numeric_column("pay_qty_1w", default_value=0.0)
    pay_qty_2w = fc.numeric_column("pay_qty_2w", default_value=0.0)
    pay_qty_1m = fc.numeric_column("pay_qty_1m", default_value=0.0)
    cat2_pay_qty = fc.numeric_column("cat2_pay_qty_1d", default_value=0.0)
    cat1_pay_qty = fc.numeric_column("cat1_pay_qty_1d", default_value=0.0)
    brd_pay_qty = fc.numeric_column("brd_pay_qty_1d", default_value=0.0)
    slr_pay_qty_1d = fc.numeric_column("slr_pay_qty_1d", default_value=0.0)
    slr_pay_qty_1w = fc.numeric_column("slr_pay_qty_1w", default_value=0.0)
    slr_pay_qty_2w = fc.numeric_column("slr_pay_qty_2w", default_value=0.0)
    slr_pay_qty_1m = fc.numeric_column("slr_pay_qty_1m", default_value=0.0)
    slr_brd_pay_qty_1d = fc.numeric_column("slr_brd_pay_qty_1d",
                                           default_value=0.0)
    slr_brd_pay_qty_1w = fc.numeric_column("slr_brd_pay_qty_1w",
                                           default_value=0.0)
    slr_brd_pay_qty_2w = fc.numeric_column("slr_brd_pay_qty_2w",
                                           default_value=0.0)
    slr_brd_pay_qty_1m = fc.numeric_column("slr_brd_pay_qty_1m",
                                           default_value=0.0)
    weighted_ipv = fc.numeric_column("weighted_ipv", default_value=0.0)
    cat1_weighted_ipv = fc.numeric_column("cat1_weighted_ipv",
                                          default_value=0.0)
    cate_weighted_ipv = fc.numeric_column("cate_weighted_ipv",
                                          default_value=0.0)
    slr_weighted_ipv = fc.numeric_column("slr_weighted_ipv", default_value=0.0)
    brd_weighted_ipv = fc.numeric_column("brd_weighted_ipv", default_value=0.0)
    cms_scale = fc.numeric_column("cms_scale", default_value=0.0)
    cms_scale_sqrt = fc.numeric_column("cms_scale_sqrt", default_value=0.0)

    # context feature
    matchScore = fc.numeric_column("matchScore", default_value=0.0)
    popScore = fc.numeric_column("popScore", default_value=0.0)
    brandPrefer = fc.numeric_column("brandPrefer", default_value=0.0)
    cate2Prefer = fc.numeric_column("cate2Prefer", default_value=0.0)
    catePrefer = fc.numeric_column("catePrefer", default_value=0.0)
    sellerPrefer = fc.numeric_column("sellerPrefer", default_value=0.0)
    matchType = fc.indicator_column(
        fc.categorical_column_with_identity("matchType", 9, default_value=0))
    position = fc.bucketized_column(fc.numeric_column("position",
                                                      dtype=tf.int64,
                                                      default_value=301),
                                    boundaries=[
                                        1, 2, 3, 4, 5, 6, 7, 8, 9, 15, 20, 30,
                                        40, 50, 80, 100, 150, 200, 300
                                    ])
    triggerNum = fc.indicator_column(
        fc.categorical_column_with_identity("triggerNum", 41,
                                            default_value=40))
    triggerRank = fc.indicator_column(
        fc.categorical_column_with_identity("triggerRank",
                                            41,
                                            default_value=40))
    sceneType = fc.indicator_column(
        fc.categorical_column_with_identity("type", 2, default_value=0))
    hour = fc.indicator_column(
        fc.categorical_column_with_identity("hour", 24, default_value=0))
    phoneBrandId = fc.categorical_column_with_hash_bucket("phoneBrand", 1000)
    phoneBrand = fc.shared_embedding_columns([phoneBrandId], 20)
    phoneResolutionId = fc.categorical_column_with_hash_bucket(
        "phoneResolution", 500)
    phoneResolution = fc.shared_embedding_columns([phoneResolutionId], 10)
    phoneOs = fc.indicator_column(
        fc.categorical_column_with_vocabulary_list("phoneOs",
                                                   ["android", "ios"],
                                                   default_value=0))
    tab = fc.indicator_column(
        fc.categorical_column_with_vocabulary_list("tab", [
            "ALL", "TongZhuang", "XieBao", "MuYing", "NvZhuang", "MeiZhuang",
            "JuJia", "MeiShi"
        ],
                                                   default_value=0))

    c2id_embed = fc.shared_embedding_columns(
        [c2id], 16, shared_embedding_collection_name="c2id")
    feature_columns = [
        matchScore, matchType, position, triggerNum, triggerRank, sceneType,
        hour, phoneOs, tab, popScore, sellerPrefer, brandPrefer, cate2Prefer,
        catePrefer, price, props_sex, brand_grade, modified_time,
        modified_time_sqrt, modified_time_square, shipment_rate, shipping_rate,
        ipv_ntile, pay_ntile, uv_cvr_1d, uv_cvr_1w, uv_cvr_2w, uv_cvr_1m,
        ctr_1d, ctr_1w, ctr_2w, ctr_1m, cvr_1d, cvr_1w, cvr_2w, cvr_1m,
        pay_qty_1d, pay_qty_1w, pay_qty_2w, pay_qty_1m, cat2_pay_qty,
        cat1_pay_qty, brd_pay_qty, slr_pay_qty_1d, slr_pay_qty_1w,
        slr_pay_qty_2w, slr_pay_qty_1m, slr_brd_pay_qty_1d, slr_brd_pay_qty_1w,
        slr_brd_pay_qty_2w, slr_brd_pay_qty_1m, weighted_ipv,
        cat1_weighted_ipv, cate_weighted_ipv, slr_weighted_ipv,
        brd_weighted_ipv, cms_scale, cms_scale_sqrt
    ]
    feature_columns += c2id_embed
    feature_columns += phoneResolution
    feature_columns += phoneBrand
    return feature_columns