Esempio n. 1
0
def test_shared_embedding_column_with_hash_bucket():
    """Demo: hash two integer features and embed them via shared columns.

    First prints the hashed sparse ids of each feature, then prints the dense
    tensor that ``input_layer`` produces from ``shared_embedding_columns``.
    """
    # Four sample rows: 'color' holds two ids per row, 'color2' holds one.
    features = {
        'color': [[2, 2], [5, 5], [0, -1], [0, 0]],
        'color2': [[2], [5], [-1], [0]],
    }
    lazy_builder = _LazyBuilder(features)

    hashed_color = feature_column.categorical_column_with_hash_bucket(
        'color', 7, dtype=tf.int32)
    hashed_color2 = feature_column.categorical_column_with_hash_bucket(
        'color2', 7, dtype=tf.int32)
    sparse_color = hashed_color._get_sparse_tensors(lazy_builder)
    sparse_color2 = hashed_color2._get_sparse_tensors(lazy_builder)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.tables_initializer())
        print('not use input_layer' + '_' * 40)
        for sparse in (sparse_color, sparse_color2):
            print(sess.run([sparse.id_tensor]))

    # Turn the sparse categorical columns into dense embedding columns
    # (embedding dimension 3, per-row embeddings combined with 'sum').
    shared_columns = feature_column.shared_embedding_columns(
        [hashed_color2, hashed_color], 3, combiner='sum')
    print(type(shared_columns))
    dense_output = feature_column.input_layer(features, shared_columns)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.tables_initializer())
        print('use input_layer' + '_' * 40)
        print(sess.run(dense_output))
Esempio n. 2
0
def create_feature_columns():
  """Build the model's feature columns.

  The resulting list is also stored in the module-level global
  ``my_feature_columns`` and printed before being returned.

  Returns:
    list of feature columns (numeric, indicator and embedding columns).
  """
  global my_feature_columns

  def score(key):
    # Plain numeric feature defaulting to 0.0.
    return fc.numeric_column(key, default_value=0.0)

  def preference(key):
    # Numeric feature passed through `truncate` as its normalizer.
    return fc.numeric_column(key, default_value=0.0, normalizer_fn=truncate)

  def one_hot_id(key, num_buckets, default):
    # Integer-identity categorical feature wrapped in an indicator column.
    return fc.indicator_column(
      fc.categorical_column_with_identity(key, num_buckets, default_value=default))

  # User features.
  phone_brand = fc.embedding_column(
    fc.categorical_column_with_hash_bucket("phoneBrand", 1000), 20)
  phone_resolution = fc.embedding_column(
    fc.categorical_column_with_hash_bucket("phoneResolution", 500), 10)
  phone_os = fc.indicator_column(
    fc.categorical_column_with_vocabulary_list("phoneOs", ["android", "ios"], default_value=0))

  # Context features, listed in the order the model consumes them.
  my_feature_columns = [
    score("matchScore"),
    one_hot_id("matchType", 9, 0),
    one_hot_id("position", 201, 200),
    one_hot_id("triggerNum", 51, 50),
    one_hot_id("triggerRank", 51, 50),
    one_hot_id("type", 2, 0),
    one_hot_id("hour", 24, 0),
    phone_brand,
    phone_resolution,
    phone_os,
    score("popScore"),
    preference("sellerPrefer"),
    preference("brandPrefer"),
    preference("cate2Prefer"),
    preference("catePrefer"),
  ]
  print("feature columns:", my_feature_columns)
  return my_feature_columns
Esempio n. 3
0
    def create_features_columns(self):
        """Register the userID/itemID embedding columns and the parse spec.

        Both ids are hashed as int64 features and embedded with
        ``FLAGS.embed_size`` dimensions. The columns are stored in
        ``self.all_columns`` and the derived example-parsing spec in
        ``self.feature_spec``.

        Returns:
            self, so that calls can be chained.
        """

        def embed_hashed_id(key, bucket_count):
            # Each column gets its own initializer instance (same settings).
            hashed = fc.categorical_column_with_hash_bucket(
                key=key, hash_bucket_size=bucket_count, dtype=tf.int64)
            init = tf.uniform_unit_scaling_initializer(
                factor=1e-5, seed=1, dtype=tf.float32)
            return fc.embedding_column(
                hashed, dimension=FLAGS.embed_size, initializer=init)

        user_column = embed_hashed_id("userID", FLAGS.user_did_size)
        item_column = embed_hashed_id("itemID", FLAGS.item_uuid_size)

        self.all_columns["userID"] = user_column
        self.all_columns["itemID"] = item_column

        registered = self.all_columns.values()
        self.feature_spec = tf.feature_column.make_parse_example_spec(
            registered)
        return self
def get_feature_columns():
    """Build the feature columns for the DNN and linear parts of the model.

    Returns:
        tuple ``(dnn_feature_columns, linear_feature_columns)``: the first is
        a list of embedding columns over hashed int64 ids, the second a list
        of numeric columns.
    """

    def hashed_id(key, bucket_count):
        return fc.categorical_column_with_hash_bucket(
            key, bucket_count, tf.int64)

    # DNN features: the first three embeddings are norm-clipped via max_norm,
    # the two bgm embeddings are not.
    dnn_feature_columns = [
        fc.embedding_column(hashed_id("userid", 40000), FLAGS.embed_dim,
                            max_norm=FLAGS.embed_l2),
        fc.embedding_column(hashed_id("feedid", 240000), FLAGS.embed_dim,
                            max_norm=FLAGS.embed_l2),
        fc.embedding_column(hashed_id("authorid", 40000), FLAGS.embed_dim,
                            max_norm=FLAGS.embed_l2),
        fc.embedding_column(hashed_id("bgm_singer_id", 40000),
                            FLAGS.embed_dim),
        fc.embedding_column(hashed_id("bgm_song_id", 60000),
                            FLAGS.embed_dim),
    ]

    # Linear features.
    linear_feature_columns = [
        fc.numeric_column("videoplayseconds", default_value=0.0),
        fc.numeric_column("device", default_value=0.0),
    ]

    # Behaviour statistics: one feed-level and one user-level sum per action.
    for action in FEA_COLUMN_LIST:
        for suffix in ("sum", "sum_user"):
            linear_feature_columns.append(
                fc.numeric_column(action + suffix, default_value=0.0))

    return dnn_feature_columns, linear_feature_columns
Esempio n. 5
0
    def make_feature_layer(self):
        """Build the list of feature columns for this dataset.

        Column order in the result: numeric columns, numeric-categorical
        columns, text-categorical columns, then boolean columns. Every
        non-numeric column is hashed and embedded into a single dimension.

        Returns:
            list of feature columns.
        """

        def hashed_embedding(name, bucket_count):
            # One-dimensional embedding over a hashed categorical column.
            hashed = feature_column.categorical_column_with_hash_bucket(
                name, hash_bucket_size=bucket_count)
            return feature_column.embedding_column(hashed, dimension=1)

        feature_cols = [
            feature_column.numeric_column(col) for col in self.numeric_column
        ]

        # Categorical columns: hash space is three times the number of
        # distinct values observed in self.data.
        for group in (self.categorical_column_num,
                      self.categorical_column_text):
            for col in group:
                unique_count = self.data[col].nunique()
                feature_cols.append(
                    hashed_embedding(col, int(3 * unique_count)))

        # Boolean columns use a fixed bucket count, so no pass over the data
        # is needed (the original computed an unused nunique() here).
        feature_cols.extend(
            hashed_embedding(col, 3) for col in self.bool_column)
        return feature_cols
Esempio n. 6
0
def test_shared_embedding_column_with_hash_bucket():
    """Demo: shared embedding over two hashed integer features.

    Prints the hashed sparse ids of 'range' and 'id', then the dense tensor
    produced by ``input_layer`` from the shared embedding columns.
    """
    # Step 1: raw input features (four rows).
    features = {
        'range': [[2, 2], [5, 5], [0, -1], [0, 0]],
        'id': [[2], [5], [-1], [0]],
    }
    lazy_builder = _LazyBuilder(features)

    # Step 2a: sparse (categorical) feature columns and their id tensors.
    range_column = feature_column.categorical_column_with_hash_bucket(
        'range', 7, dtype=tf.int32)
    id_column = feature_column.categorical_column_with_hash_bucket(
        'id', 7, dtype=tf.int32)
    range_sparse = range_column._get_sparse_tensors(lazy_builder)
    id_sparse = id_column._get_sparse_tensors(lazy_builder)

    with tf.Session() as sess:
        # Only the tables initializer is run in this first session.
        sess.run(tf.tables_initializer())
        print('not use input_layer' + '_' * 40)
        for sparse in (range_sparse, id_sparse):
            print(sess.run([sparse.id_tensor]))

    # Step 2b: dense columns sharing an embedding of dimension 3.
    shared_columns = feature_column.shared_embedding_columns(
        [id_column, range_column], 3, combiner='sum')
    print(type(shared_columns))

    # Step 3: feature tensor.
    dense_output = feature_column.input_layer(features, shared_columns)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.tables_initializer())
        print('use input_layer' + '_' * 40)
        print(sess.run(dense_output))
Esempio n. 7
0
def create_feature_columns(note_emb_size=10, note_user_emb_size=6):
    """Build shared-embedding columns for notes/creators plus a duration bucket.

    Args:
        note_emb_size: embedding dimension shared by the note-id columns.
        note_user_emb_size: embedding dimension shared by the creator columns.

    Returns:
        list: note shared embeddings, creator shared embeddings, then the
        bucketized video-duration column. The list is also printed.
    """

    def hashed(key, bucket_count, **kwargs):
        return fc.categorical_column_with_hash_bucket(
            key, hash_bucket_size=bucket_count, **kwargs)

    # Categorical columns: history lists first, then the candidate note.
    history_creators = hashed("last_note_creators", 2000, dtype=tf.string)
    history_notes = hashed("last_note_ids", 20000, dtype=tf.int64)
    candidate_creator = hashed("note_open_id", 2000)
    candidate_note = hashed("note_id", 20000, dtype=tf.int64)

    duration_bucket = fc.bucketized_column(
        source_column=fc.numeric_column("note_video_duration"),
        boundaries=[5, 10, 30, 60])

    # History and candidate columns share one embedding per entity type.
    note_emb = fc.shared_embedding_columns(
        [history_notes, candidate_note], note_emb_size, combiner='sum')
    creator_emb = fc.shared_embedding_columns(
        [history_creators, candidate_creator], note_user_emb_size,
        combiner='sum')

    my_feature_columns = note_emb + creator_emb + [duration_bucket]

    banner = "*" * 100
    print(banner)
    print("feature columns:")
    for column in my_feature_columns:
        print(column)
    print(banner)
    return my_feature_columns
def build_ama_ele_columns():
    """Build 32-dimensional embedding columns for 'user_id' and 'item_id'.

    Returns:
        tuple ``(feature_columns, feat_field_size)`` where the second element
        is the number of columns in the first.
    """
    # (feature key, hash bucket size); the 'seq' and 'seq_cate' features are
    # currently not included.
    hashed_fields = (
        ('user_id', 200000),
        ('item_id', 1000),
    )
    feature_columns = [
        fc.embedding_column(
            fc.categorical_column_with_hash_bucket(
                key, hash_bucket_size=bucket_count),
            dimension=32)
        for key, bucket_count in hashed_fields
    ]
    return feature_columns, len(feature_columns)
Esempio n. 9
0
def build_model_columns(embedding_size):
    """Build indicator and embedding columns for the user and item ids.

    Args:
        embedding_size: dimension of each id embedding.

    Returns:
        tuple ``(linear_feature_columns, embedding_feature_columns)``; both
        lists are ordered user id first, item id second.
    """
    linear_feature_columns = []
    embedding_feature_columns = []

    # (feature key, hash bucket size)
    for key, bucket_count in (('u_id', 500000), ('i_id', 100000)):
        hashed = feature_column.categorical_column_with_hash_bucket(
            key, bucket_count, dtype=tf.dtypes.int64)
        embedded = feature_column.embedding_column(hashed, embedding_size)
        linear_feature_columns.append(feature_column.indicator_column(hashed))
        embedding_feature_columns.append(embedded)

    return linear_feature_columns, embedding_feature_columns
Esempio n. 10
0
    def create_features_columns(self):
        """Build the user-side and item-side feature columns.

        Columns are registered in ``self.user_columns`` and
        ``self.item_columns``; the example-parsing spec covering both is
        stored in ``self.feature_spec``.

        Returns:
            self, so that calls can be chained.
        """
        # Dense vector features.
        user_vector = fc.numeric_column(key="user_vector",
                                        shape=(128, ),
                                        default_value=[0.0] * 128,
                                        dtype=tf.float32)
        item_vector = fc.numeric_column(key="item_vector",
                                        shape=(128, ),
                                        default_value=[0.0] * 128,
                                        dtype=tf.float32)

        # Bucketized feature: raw age -> bucket id -> embedding.
        age_raw = fc.numeric_column(key="age",
                                    shape=(1, ),
                                    default_value=[0],
                                    dtype=tf.int64)
        # Fix: bucketize the age column itself (the original referenced the
        # undefined name `input_fc`, raising NameError on every call).
        age_bucket = fc.bucketized_column(
            age_raw, boundaries=[0, 10, 20, 30, 40, 50, 60, 70, 80])
        age = fc.embedding_column(age_bucket, dimension=32, combiner='mean')

        # Identity-categorical feature.
        city_ids = fc.categorical_column_with_identity(key="city",
                                                       num_buckets=1000,
                                                       default_value=0)
        city = fc.embedding_column(city_ids, dimension=32, combiner='mean')

        # Hashed features.
        device_ids = fc.categorical_column_with_hash_bucket(
            key="device_id", hash_bucket_size=1000000, dtype=tf.int64)
        device_id = fc.embedding_column(device_ids,
                                        dimension=32,
                                        combiner='mean')

        item_ids = fc.categorical_column_with_hash_bucket(
            key="item_id", hash_bucket_size=10000, dtype=tf.int64)
        # Fix: embed the item-id column (the original embedded `device_id`,
        # which at that point was already an embedding column).
        item_id = fc.embedding_column(item_ids, dimension=32, combiner='mean')

        self.user_columns["user_vector"] = user_vector
        self.user_columns["age"] = age
        self.user_columns["city"] = city
        self.user_columns["device_id"] = device_id
        self.item_columns["item_vector"] = item_vector
        self.item_columns["item_id"] = item_id

        # Fix: dict views cannot be concatenated with `+` on Python 3, so
        # materialize them as lists first.
        all_columns = (list(self.user_columns.values()) +
                       list(self.item_columns.values()))
        self.feature_spec = tf.feature_column.make_parse_example_spec(
            all_columns)

        return self
def test_categorical_column_with_hash_bucket():
    """Demo: hash a string feature into 7 buckets and view it as multi-hot.

    Prints the hashed sparse ids, then the dense tensor obtained through an
    indicator column and ``input_layer``.
    """
    # Five sample rows with two string values each.
    features = {
        'color': [['R', 'G'], ['G', 'A'], ['B', 'G'], ['A', 'G'], ['A', '']]
    }
    lazy_builder = _LazyBuilder(features)

    hashed_color = feature_column.categorical_column_with_hash_bucket(
        'color', 7)
    hashed_ids = hashed_color._get_sparse_tensors(lazy_builder).id_tensor

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.tables_initializer())
        print(sess.run([hashed_ids]))

    # Convert the sparse column to a dense multi-hot representation.
    multi_hot_column = feature_column.indicator_column(hashed_color)
    dense_output = feature_column.input_layer(features, [multi_hot_column])

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.tables_initializer())
        print('use input_layer' + '_' * 40)
        print(sess.run([dense_output]))
Esempio n. 12
0
    def _generate_cat_column(name, data, vocab_threshold=50, bucket_size=100):
        """Generate a feature column from a categorical string data set

        Parameters
        ----------
        name : str
            Name of categorical columns
        data : np.ndarray | list
            String data array
        vocab_threshold : int
            Number of unique entries in the data array below which this
            will use a vocabulary list, above which a hash bucket will be used.
        bucket_size : int
            Hash bucket size.

        Returns
        -------
        f_col : IndicatorColumn
            Categorical feature column.
        """

        # Build the set of distinct values once and reuse it.
        unique_values = set(data)

        if len(unique_values) < vocab_threshold:
            # Sort the vocabulary: plain set iteration order for strings can
            # differ between Python processes, which would change the
            # vocabulary index assigned to each value from run to run.
            f_col = feature_column.categorical_column_with_vocabulary_list(
                name, sorted(unique_values))
        else:
            f_col = feature_column.categorical_column_with_hash_bucket(
                name, bucket_size)

        f_col = feature_column.indicator_column(f_col)

        return f_col
def fc_transform(feature_name, hash_bucket_size, dtype=tf.string):
    """Return a DenseFeatures layer embedding one hashed feature.

    Args:
        feature_name: key of the input feature.
        hash_bucket_size: number of hash buckets for the feature.
        dtype: dtype of the raw feature values.

    Returns:
        ``tf.keras.layers.DenseFeatures`` layer wrapping a 4-dimensional
        embedding column, i.e. the layer that turns the feature column into
        a tensor.
    """
    hashed = feature_column.categorical_column_with_hash_bucket(
        feature_name, hash_bucket_size=hash_bucket_size, dtype=dtype)
    embedded = feature_column.embedding_column(hashed, 4)
    return tf.keras.layers.DenseFeatures([embedded])
 def _build_census_wide_columns(numeric_range=None):
     """Build the wide-side feature columns from ALI_DISPLAY_ADS_CONFIG.

     Args:
         numeric_range: mapping from column name to a (low, high) pair used
             to place 1000 evenly spaced bucket boundaries. It is indexed for
             every entry of 'wide_bucket_cols'.

     Returns:
         tuple ``(feature_columns, feat_field_size)``: base columns followed
         by cross columns, and the length of that list.
     """
     base_columns = []

     # Multi-hot categorical columns: at least 1000 buckets, otherwise the
     # configured vocabulary size plus 10000.
     for col in ALI_DISPLAY_ADS_CONFIG['wide_muti_hot_cols']:
         if ALI_DISPLAY_ADS_CONFIG['vocab_size'][col] <= 1000:
             bucket_count = 1000
         else:
             bucket_count = ALI_DISPLAY_ADS_CONFIG['vocab_size'][col] + 10000
         hashed = fc.categorical_column_with_hash_bucket(
             col, hash_bucket_size=bucket_count)
         base_columns.append(fc.indicator_column(hashed))

     # Numeric columns bucketized over their value range.
     for col in ALI_DISPLAY_ADS_CONFIG['wide_bucket_cols']:
         source = fc.numeric_column(col)
         edges = list(
             np.linspace(numeric_range[col][0], numeric_range[col][1], 1000))
         base_columns.append(fc.bucketized_column(source, boundaries=edges))

     # Pairwise crosses, hashed into 10000 buckets.
     cross_columns = [
         fc.indicator_column(
             fc.crossed_column([pair[0], pair[1]], hash_bucket_size=10000))
         for pair in ALI_DISPLAY_ADS_CONFIG['wide_cross_cols']
     ]

     feature_columns = base_columns + cross_columns
     return feature_columns, len(feature_columns)
Esempio n. 15
0
def fc_transform(feature_name, hash_bucket_size, dtype=tf.string):
    """Build a DenseFeatures layer for a single hashed, embedded feature.

    Args:
        feature_name: key of the input feature.
        hash_bucket_size: number of hash buckets for the feature.
        dtype: dtype of the raw feature values.

    Returns:
        ``tf.keras.layers.DenseFeatures`` layer over a 4-dimensional
        embedding column.
    """
    hashed_column = feature_column.categorical_column_with_hash_bucket(
        feature_name, hash_bucket_size=hash_bucket_size, dtype=dtype)
    # The embedding is trainable, so its variables need to be initialized by
    # the caller.
    embedding = feature_column.embedding_column(hashed_column, 4)
    # DenseFeatures converts feature columns into a dense tensor.
    dense_layer = tf.keras.layers.DenseFeatures([embedding])
    return dense_layer
def test_categorical_column_with_hash_bucket():
    """Demo: hash an int32 feature into 7 buckets and densify it.

    Prints the hashed sparse ids, then the dense tensor built through an
    indicator column and ``input_layer``.
    """
    # Source data: four sample rows with one value each.
    features = {'color': [[2], [5], [-1], [0]]}
    lazy_builder = _LazyBuilder(features)

    # Categorical column and its sparse representation.
    hashed_color = feature_column.categorical_column_with_hash_bucket(
        'color', 7, dtype=tf.int32)
    sparse_color = hashed_color._get_sparse_tensors(lazy_builder)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.tables_initializer())
        print(sess.run([sparse_color.id_tensor]))

    # indicator_column turns the sparse column into a dense multi-hot one.
    multi_hot_column = feature_column.indicator_column(hashed_color)

    # input_layer ties the source data to the declared column and yields a
    # new tensor.
    dense_output = feature_column.input_layer(features, [multi_hot_column])

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.tables_initializer())
        print('use input_layer' + '_' * 40)
        print(sess.run([dense_output]))
Esempio n. 17
0
def test_weighted_categorical_column():
    """Parse the first decodable record of 'new.tf.rec.base64' and print a feature.

    Each line of the file is expected to be a base64-encoded serialized
    ``Example`` proto. Lines that fail to decode or parse are reported on
    stderr and skipped; processing stops after the first valid record, whose
    'u_pocs_l1_norm' feature is printed both as a tensor and as its value.
    """
    # Use a context manager so the file is closed even if parsing raises.
    with open("new.tf.rec.base64", "r") as f_in:
        for line in f_in:
            try:
                b = base64.b64decode(line.strip())
            except Exception as e:
                # stderr.write() needs a str; passing the exception object
                # itself raised TypeError and hid the real error.
                sys.stderr.write(str(e) + "\n")
                continue

            exa = example_pb2.Example()
            print("before parse proto...........")
            try:
                exa.ParseFromString(b)
            except Exception as e:
                # Exceptions have no .str() method; use str(e).
                sys.stderr.write(str(e) + "\n")
                continue
            print("after parse proto........")

            u_pocs_l1_norm = feature_column.categorical_column_with_hash_bucket(
                "u_pocs_l1_norm", 3000)
            u_pocs_l1_norm_weighted = feature_column.weighted_categorical_column(
                u_pocs_l1_norm, weight_feature_key='u_pocs_l1_norm_val')
            feature_columns = [u_pocs_l1_norm_weighted]
            features = tf.parse_single_example(
                b, tf.feature_column.make_parse_example_spec(feature_columns))
            print(features["u_pocs_l1_norm"])
            with tf.Session() as session:
                session.run(tf.global_variables_initializer())
                session.run(tf.tables_initializer())
                print(session.run(features["u_pocs_l1_norm"]))
            # Only the first valid record is inspected.
            break
Esempio n. 18
0
def test_categorical_column_with_hash_bucket():
    """Demo: hashed int32 categorical column, sparse and dense views.

    Prints the hashed sparse ids, then the dense tensor built through an
    indicator column and ``input_layer``. Only the tables initializer is run
    in each session.
    """
    # Step 1: input features.
    features = {'color': [[2], [5], [-1], [0]]}
    lazy_builder = _LazyBuilder(features)

    # Step 2a: sparse feature column.
    hashed_color = feature_column.categorical_column_with_hash_bucket(
        'color', 7, dtype=tf.int32)
    sparse_color = hashed_color._get_sparse_tensors(lazy_builder)

    with tf.Session() as sess:
        sess.run(tf.tables_initializer())
        print(sess.run([sparse_color.id_tensor]))

    # Step 2b: dense feature column derived from the categorical one.
    dense_column = feature_column.indicator_column(hashed_color)

    # Step 3: feature tensor.
    dense_output = feature_column.input_layer(features, [dense_column])

    with tf.Session() as sess:
        sess.run(tf.tables_initializer())
        print('use input_layer' + '_' * 40)
        print(sess.run([dense_output]))
Esempio n. 19
0
 def hash_embedding(self, hash_bucket, embedding_dim, name):
     """Return embedding and indicator columns over one hashed string feature.

     Args:
         hash_bucket: number of hash buckets.
         embedding_dim: dimension of the embedding column.
         name: key of the input feature.

     Returns:
         tuple ``(embedding_column, indicator_column)`` built on the same
         hashed categorical column.
     """
     hashed = feature_column.categorical_column_with_hash_bucket(
         name, hash_bucket, dtype=tf.string)
     embedded = feature_column.embedding_column(
         hashed, dimension=embedding_dim, combiner='mean')
     one_hot = feature_column.indicator_column(hashed)
     return embedded, one_hot
Esempio n. 20
0
 def hashed_columns(self, hashed_columns_dict):
     """Register one indicator column per hashed feature.

     Args:
         hashed_columns_dict: mapping from feature name to hash bucket size.

     Returns:
         The indicator column created for the last entry of the mapping, or
         None when the mapping is empty. All created columns are stored in
         ``self.sparse_columns`` under their feature name.
     """
     # Initialised up front so an empty mapping returns None instead of
     # raising UnboundLocalError at the return statement.
     hashedFeature = None
     for col_name, bucket_size in hashed_columns_dict.items():
         hashedCol = feature_column.categorical_column_with_hash_bucket(
             col_name, hash_bucket_size=bucket_size)
         hashedFeature = feature_column.indicator_column(hashedCol)
         self.sparse_columns[col_name] = hashedFeature
     return hashedFeature
def practise():
    """Demo: hash a string feature into 5 buckets; print sparse ids and dense form."""
    # Five rows with two string values each.
    features = {'x': [['a', 'a'], ['b', 'c'], ['c', 'e'], ['d', ''], ['e', 'f']]}

    hashed_column = feature_column.categorical_column_with_hash_bucket('x', 5)
    indicator = feature_column.indicator_column(hashed_column)

    lazy_builder = _LazyBuilder(features)
    sparse_ids = hashed_column._get_sparse_tensors(lazy_builder).id_tensor
    dense_output = feature_column.input_layer(features, indicator)

    with tf.Session() as session:
        for tensor in (sparse_ids, dense_output):
            print(session.run(tensor))
Esempio n. 22
0
def create_feature_columns(note_emb_size=10, note_user_emb_size=6):
    """Build shared-embedding columns for notes/creators plus a duration bucket.

    Args:
        note_emb_size: embedding dimension shared by the note-id columns.
        note_user_emb_size: embedding dimension shared by the creator columns.

    Returns:
        list: note shared embeddings, creator shared embeddings, then the
        bucketized video-duration column. The list is also printed.
    """
    # Categorical columns for the viewing history.
    history_creators = fc.categorical_column_with_hash_bucket(
        "last_note_creators", hash_bucket_size=2000, dtype=tf.string)
    history_notes = fc.categorical_column_with_hash_bucket(
        "last_note_ids", 20000, dtype=tf.int64)

    # Categorical columns for the candidate note.
    candidate_creator = fc.categorical_column_with_hash_bucket(
        "note_open_id", 2000)
    candidate_note = fc.categorical_column_with_hash_bucket(
        "note_id", 20000, dtype=tf.int64)

    raw_duration = fc.numeric_column("note_video_duration")
    duration_bucket = fc.bucketized_column(
        source_column=raw_duration, boundaries=[5, 10, 30, 60])

    # History and candidate columns share one embedding per entity type.
    note_emb = fc.shared_embedding_columns(
        [history_notes, candidate_note], note_emb_size, combiner='sum')
    creator_emb = fc.shared_embedding_columns(
        [history_creators, candidate_creator], note_user_emb_size,
        combiner='sum')

    # NOTE: device/profile features (phoneBrand, phoneResolution, phoneOs,
    # gender, city, hour) are currently disabled and not part of the result.
    my_feature_columns = note_emb + creator_emb + [duration_bucket]

    separator = "*" * 100
    print(separator)
    print("feature columns:")
    for column in my_feature_columns:
        print(column)
    print(separator)
    return my_feature_columns
Esempio n. 23
0
 def embeddings_columns(self, coldim_dict):
     """Register one hashed embedding column per feature.

     Args:
         coldim_dict: mapping from feature name to embedding dimension. The
             hash bucket size of each feature is the square of its dimension.

     Returns:
         The embedding column created for the last entry of the mapping, or
         None when the mapping is empty. All created columns are stored in
         ``self.real_columns`` under their feature name.
     """
     # Initialised up front so an empty mapping returns None instead of
     # raising UnboundLocalError at the return statement.
     embedding = None
     for col_name, dimension in coldim_dict.items():
         bucket_size = dimension * dimension
         embCol = feature_column.categorical_column_with_hash_bucket(
             col_name, hash_bucket_size=bucket_size)
         embedding = feature_column.embedding_column(embCol,
                                                     dimension=dimension)
         self.real_columns[col_name] = embedding
     return embedding
Esempio n. 24
0
def build_census_emb_columns():
    """Build 32-dimensional embedding columns for the census features.

    Numeric features are bucketized over 1000 evenly spaced boundaries taken
    from ``get_census_numeric_feat_range()``; categorical features are hashed
    into 1000 buckets.

    Returns:
        tuple ``(feature_columns, feat_field_size)``: numeric embeddings
        first, then categorical embeddings, and the length of that list.
    """
    n_range = get_census_numeric_feat_range()

    numeric_names = (
        'age',
        'education_num',
        'capital_gain',
        'capital_loss',
        'hours_per_week',
    )
    categorical_names = (
        'gender',
        'education',
        'marital_status',
        'relationship',
        'workclass',
        'native_country',
        'occupation',
    )

    feature_columns = []

    # Numeric feature embeddings.
    for name in numeric_names:
        source = fc.numeric_column(name)
        edges = list(np.linspace(n_range[name][0], n_range[name][1], 1000))
        bucketized = fc.bucketized_column(source, boundaries=edges)
        feature_columns.append(fc.embedding_column(bucketized, dimension=32))

    # Category feature embeddings.
    for name in categorical_names:
        hashed = fc.categorical_column_with_hash_bucket(
            name, hash_bucket_size=1000)
        feature_columns.append(fc.embedding_column(hashed, dimension=32))

    return feature_columns, len(feature_columns)
Esempio n. 25
0
def create_linear_feature_columns():
  """Build the raw (non-embedded) feature columns for the linear model.

  Returns:
    list of categorical and numeric feature columns; the list is also
    printed.
  """

  def score(key):
    # Plain numeric feature defaulting to 0.0.
    return fc.numeric_column(key, default_value=0.0)

  def preference(key):
    # Numeric feature passed through `truncate` as its normalizer.
    return fc.numeric_column(key, default_value=0.0, normalizer_fn=truncate)

  def identity(key, num_buckets, default):
    return fc.categorical_column_with_identity(key, num_buckets, default_value=default)

  columns = [
    fc.categorical_column_with_hash_bucket("phoneBrand", 1000),
    fc.categorical_column_with_hash_bucket("phoneResolution", 500),
    fc.categorical_column_with_vocabulary_list("phoneOs", ["android", "ios"], default_value=0),
    score("matchScore"),
    score("popScore"),
    preference("brandPrefer"),
    preference("cate2Prefer"),
    preference("catePrefer"),
    preference("sellerPrefer"),
    identity("matchType", 9, 0),
    identity("position", 201, 200),
    identity("triggerRank", 51, 50),
    identity("triggerNum", 51, 50),
    identity("type", 2, 0),
    identity("hour", 24, 0),
  ]
  print("linear feature columns:", columns)
  return columns
Esempio n. 26
0
def create_embedding_feature_columns(shared_embedding_dim=64):
    """Build shared embedding columns for the category-id features.

    Used when several features should live in the same embedding space: the
    weighted behaviour-history category ids and the candidate category id
    share one embedding (the features themselves remain distinct).

    Args:
        shared_embedding_dim: dimension of the shared embedding.

    Returns:
        The result of ``fc.shared_embedding_columns`` for the two columns,
        combined with 'sum'.
    """

    def hashed_category(key):
        return fc.categorical_column_with_hash_bucket(key, 100, dtype=tf.int64)

    # Clicked category ids, weighted per id by the "c1idWeights" feature.
    weighted_history = fc.weighted_categorical_column(
        hashed_category("behaviorC1ids"), "c1idWeights")
    # Category id of the candidate.
    candidate = hashed_category("cate1Id")

    return fc.shared_embedding_columns(
        [weighted_history, candidate], shared_embedding_dim, combiner='sum')
def shared_embedding_column_with_hash_bucket():
    """Demo: print hashed ids and the shared-embedding output of two features.

    Builds two hash-bucket categorical columns (7 buckets, int32) over the
    in-memory features ``color`` and ``color2``, prints the columns and their
    evaluated id tensors, then combines both columns with
    ``shared_embedding_columns`` (dimension 3, ``combiner='sum'``) and prints
    the evaluated ``input_layer`` output. Returns nothing.
    """

    def _print_evaluated(banner, fetch_list):
        # Open a fresh session, run the variable and table initializers,
        # print the banner, then print each fetch's evaluated value in order.
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            sess.run(tf.tables_initializer())
            print(banner)
            for fetch in fetch_list:
                print(sess.run(fetch))

    samples = {
        'color': [[2, 2], [5, 5], [0, -1], [0, 0]],  # 4 samples, shape=[4, 2]
        'color2': [[2], [5], [-1], [0]],  # 4 samples, shape=[4, 1]
    }
    lazy_builder = _LazyBuilder(samples)

    # First categorical column and its sparse tensors.
    hashed_color = feature_column.categorical_column_with_hash_bucket(
        'color', 7, dtype=tf.int32)
    print(hashed_color)
    sparse_color = hashed_color._get_sparse_tensors(lazy_builder)

    # Second categorical column and its sparse tensors.
    hashed_color2 = feature_column.categorical_column_with_hash_bucket(
        'color2', 7, dtype=tf.int32)
    print(hashed_color2)
    sparse_color2 = hashed_color2._get_sparse_tensors(lazy_builder)

    _print_evaluated(
        'categorical_column_with_hash_bucket' + '_' * 40,
        [[sparse_color.id_tensor], [sparse_color2.id_tensor]])
    print('not use input_layer' + '_' * 40)

    # Both columns go through shared_embedding_columns (dimension 3).
    shared_embeddings = feature_column.shared_embedding_columns(
        [hashed_color2, hashed_color], 3, combiner='sum')
    print(type(shared_embeddings))
    print((shared_embeddings))

    dense_output = feature_column.input_layer(samples, shared_embeddings)

    _print_evaluated('shared_embedding_columns' + '_' * 40, [dense_output])
Esempio n. 28
0
    def __init__(self, name, params):
        """Assemble the feature layer and the dense stack for the named model.

        :param name: model variant. ``"MLP_EXP_DELAY"`` and ``"MLP_tn_dp"``
            get a 2-unit output layer, ``"MLP_SIG"`` and ``"MLP_FSIW"`` a
            1-unit output layer; ``"MLP_FSIW"`` additionally consumes the
            numeric ``"elapse"`` feature.
        :param params: mapping of hyper-parameters; ``params["l2_reg"]`` is
            passed to ``regularizers.l2`` for the three hidden layers.
        :raises ValueError: if ``name`` is not one of the four variants above.
        """
        super(MLP, self).__init__()
        self.model_name = name
        self.params = params

        def bin_boundaries(bin_count):
            # Boundaries 0, 1/(bin_count-1), ..., (bin_count-2)/(bin_count-1).
            steps = bin_count - 1
            return [k / steps for k in range(steps)]

        def hidden_layer(units):
            # Leaky-ReLU dense layer with an L2 kernel penalty.
            return layers.Dense(
                units,
                activation=tf.nn.leaky_relu,
                kernel_regularizer=regularizers.l2(params["l2_reg"]))

        # Features "0".."7": numeric inputs bucketized with per-feature bins.
        numeric_cols = []
        for idx in range(8):
            numeric_cols.append(
                feature_column.bucketized_column(
                    feature_column.numeric_column(str(idx)),
                    boundaries=bin_boundaries(num_bin_size[idx])))
        if name == "MLP_FSIW":
            print("using elapse feature")
            numeric_cols.append(feature_column.numeric_column("elapse"))

        # Features "8".."16": hashed categoricals embedded into 8 dimensions.
        embedded_cols = []
        for idx in range(8, 17):
            hashed = feature_column.categorical_column_with_hash_bucket(
                str(idx), hash_bucket_size=cate_bin_size[idx - 8])
            embedded_cols.append(
                feature_column.embedding_column(hashed, dimension=8))

        self.feature_layer = tf.keras.layers.DenseFeatures(
            numeric_cols + embedded_cols)

        # Hidden stack: each dense layer is paired with a batch-norm layer.
        self.fc1 = hidden_layer(256)
        self.bn1 = layers.BatchNormalization()
        self.fc2 = hidden_layer(256)
        self.bn2 = layers.BatchNormalization()
        self.fc3 = hidden_layer(128)
        self.bn3 = layers.BatchNormalization()
        print("build model {}".format(name))

        # The width of the output layer depends on the model variant.
        if name in ("MLP_EXP_DELAY", "MLP_tn_dp"):
            output_units = 2
        elif name in ("MLP_SIG", "MLP_FSIW"):
            output_units = 1
        else:
            raise ValueError("model name {} not exist".format(name))
        self.fc4 = layers.Dense(output_units)
Esempio n. 29
0
def create_user_feature_columns():
    """Return the user-side feature columns.

    The result is ten indicator columns over identity-categorical features
    (each with ``default_value=0``), followed by the columns produced by
    ``fc.shared_embedding_columns`` for the ``"city"`` feature (hashed into
    700 buckets, embedding dimension 16).
    """
    # (feature key, num_buckets) for every identity-categorical feature,
    # listed in the order the columns appear in the result.
    identity_specs = (
        ("gender", 3),
        ("age_class", 7),
        ("has_baby", 2),
        ("baby_gender", 3),
        ("baby_age", 7),
        ("grade", 7),
        ("bi_rfm_type", 12),
        ("cate1_price_prefer", 6),
        ("cate2_price_prefer", 6),
        ("cate3_price_prefer", 6),
    )
    indicator_cols = [
        fc.indicator_column(
            fc.categorical_column_with_identity(
                key, num_buckets=buckets, default_value=0))
        for key, buckets in identity_specs
    ]

    city_hashed = fc.categorical_column_with_hash_bucket("city", 700)
    city_embedded = fc.shared_embedding_columns([city_hashed], 16)
    return indicator_cols + city_embedded
Esempio n. 30
0
 def transform(self, output_tensors):
     """Create a hash-bucket categorical column and store it in ``output_tensors``.

     Reads ``input_tensor``, ``output_tensor`` and ``hash_bucket_size`` from
     ``self.parameters`` and the dtype from ``self.get_value_tf_type("dtype")``
     (falling back to ``tf.string`` when that returns ``None``). The column
     built by ``fc.categorical_column_with_hash_bucket`` is stored under the
     ``output_tensor`` name.

     :param output_tensors: mutable mapping that receives the new column,
         keyed by the configured output tensor name.
     :raises ParametersError: if ``hash_bucket_size`` is missing from
         ``self.parameters``.
     """
     input_tensor_name = self.parameters.get("input_tensor")
     output_tensor_name = self.parameters.get("output_tensor")

     # Resolve the configured dtype once; default to tf.string when unset.
     dtype = self.get_value_tf_type("dtype")
     if dtype is None:
         dtype = tf.string

     # dict.has_key() no longer exists in Python 3; the `in` operator works
     # on both Python 2 and Python 3 mappings.
     if "hash_bucket_size" not in self.parameters:
         msg = "parameters error, sparse_column_with_hash_bucket must need hash_bucket_size"
         logger.error(msg)
         raise ParametersError(msg)
     hash_bucket_size = self.parameters.get("hash_bucket_size")

     print("bucket output_tensor_name:",output_tensor_name)
     output_tensor = fc.categorical_column_with_hash_bucket(
         key=input_tensor_name,
         hash_bucket_size=hash_bucket_size,
         dtype=dtype
     )
     output_tensors[output_tensor_name] = output_tensor