コード例 #1
0
def get_xy_fd(hash_flag=False):
    """Build a toy (x, y) sample plus feature-column config for a DIN-style model.

    Returns the model-input dict, binary labels, the feature columns and the
    list of behavior (base) feature names. ``hash_flag`` is accepted for API
    compatibility but is not used here.
    """
    # Fixed-length sparse/dense features.
    columns = [
        SparseFeat('user', 3, embedding_dim=10),
        SparseFeat('gender', 2, embedding_dim=4),
        SparseFeat('item_id', 3 + 1, embedding_dim=8),
        SparseFeat('cate_id', 2 + 1, embedding_dim=4),
        DenseFeat('pay_score', 1),
    ]
    # Variable-length behavior sequences; ``embedding_name`` makes each share
    # the embedding table of its base feature.
    columns.append(
        VarLenSparseFeat(
            SparseFeat('hist_item_id', vocabulary_size=3 + 1, embedding_dim=8,
                       embedding_name='item_id'),
            maxlen=4, length_name="seq_length"))
    columns.append(
        VarLenSparseFeat(
            SparseFeat('hist_cate_id', 2 + 1, embedding_dim=4,
                       embedding_name='cate_id'),
            maxlen=4, length_name="seq_length"))
    # Notice: History behavior sequence feature name must start with "hist_".
    behavior_feature_list = ["item_id", "cate_id"]

    # Feature name -> raw input array (0 is the mask value in id inputs).
    data = {
        'user': np.array([0, 1, 2]),
        'gender': np.array([0, 1, 0]),
        'item_id': np.array([1, 2, 3]),  # 0 is mask value
        'cate_id': np.array([1, 2, 2]),  # 0 is mask value
        'pay_score': np.array([0.1, 0.2, 0.3]),
        'hist_item_id': np.array([[1, 2, 3, 0], [3, 2, 1, 0], [1, 2, 0, 0]]),
        'hist_cate_id': np.array([[1, 2, 2, 0], [2, 2, 1, 0], [1, 2, 0, 0]]),
        # the actual length of each behavior sequence
        'seq_length': np.array([3, 3, 2]),
    }

    x = {name: data[name] for name in get_feature_names(columns)}
    y = np.array([1, 0, 1])
    return x, y, columns, behavior_feature_list
コード例 #2
0
def get_xy_fd():
    """Build toy inputs, labels and feature columns for a DIN demo."""
    # Fixed-length features: four sparse ids plus one dense score.
    fixlen_columns = [
        SparseFeat('user', 3, embedding_dim=10),
        SparseFeat('gender', 2, embedding_dim=4),
        SparseFeat('item_id', 3 + 1, embedding_dim=8),
        SparseFeat('cate_id', 2 + 1, embedding_dim=4),
        DenseFeat('pay_score', 1),
    ]
    # Variable-length behavior sequences; ``embedding_name`` makes them share
    # the embedding table of the corresponding base feature.
    varlen_columns = [
        VarLenSparseFeat(
            SparseFeat('hist_item_id', vocabulary_size=3 + 1,
                       embedding_dim=8, embedding_name='item_id'),
            maxlen=4),
        VarLenSparseFeat(
            SparseFeat('hist_cate_id', 2 + 1, embedding_dim=4,
                       embedding_name='cate_id'),
            maxlen=4),
    ]
    feature_columns = fixlen_columns + varlen_columns

    # Base sparse features referenced by the variable-length (behavior) ones.
    behavior_feature_list = ["item_id", "cate_id"]

    # Feature name -> input data (0 is the mask value in the id inputs).
    feature_dict = {
        'user': np.array([0, 1, 2]),
        'gender': np.array([0, 1, 0]),
        'item_id': np.array([1, 2, 3]),  # 0 is mask value
        'cate_id': np.array([1, 2, 2]),  # 0 is mask value
        'hist_item_id': np.array([[1, 2, 3, 0], [3, 2, 1, 0], [1, 2, 0, 0]]),
        'hist_cate_id': np.array([[1, 2, 2, 0], [2, 2, 1, 0], [1, 2, 0, 0]]),
        'pay_score': np.array([0.1, 0.2, 0.3]),
    }
    x = {
        name: feature_dict[name]
        for name in get_feature_names(feature_columns)
    }
    y = np.array([1, 0, 1])
    return x, y, feature_columns, behavior_feature_list
コード例 #3
0
ファイル: feature_test.py プロジェクト: zzszmyf/DeepCTR
def test_feature_column_sparsefeat_vocabulary_path():
    """A SparseFeat's ``vocabulary_path`` must survive wrapping in VarLenSparseFeat."""
    path = "./dummy_test.csv"
    sparse = SparseFeat('user_id', 4, vocabulary_path=path)
    if sparse.vocabulary_path != path:
        raise ValueError("sf.vocabulary_path is invalid")
    varlen = VarLenSparseFeat(sparse, 6)
    if varlen.vocabulary_path != path:
        raise ValueError("vlsf.vocabulary_path is invalid")
コード例 #4
0
ファイル: DIN_test.py プロジェクト: xmh645214784/Kraken
def get_xy_fd(hash_flag=False):
    """Create toy DIN inputs: model input dict x, labels y, feature columns
    and the behavior (base) feature list.

    ``hash_flag`` is accepted for API compatibility but is not used here.
    """
    # Fixed-length features (default embedding_dim for the sparse ones).
    feature_columns = [
        SparseFeat('user', 3),
        SparseFeat('gender', 2),
        SparseFeat('item', 3 + 1),
        SparseFeat('item_gender', 2 + 1),
        DenseFeat('score', 1),
    ]
    # Behavior sequences share the base feature embeddings via embedding_name.
    feature_columns = feature_columns + [
        VarLenSparseFeat(
            SparseFeat('hist_item', vocabulary_size=3 + 1,
                       embedding_dim=8, embedding_name='item'),
            maxlen=4),
        VarLenSparseFeat(
            SparseFeat('hist_item_gender', 2 + 1,
                       embedding_dim=4, embedding_name='item_gender'),
            maxlen=4),
    ]

    behavior_feature_list = ["item", "item_gender"]

    # Feature name -> input array (0 is the mask value in id/sequence inputs).
    feature_dict = {
        'user': np.array([0, 1, 2]),
        'gender': np.array([0, 1, 0]),
        'item': np.array([1, 2, 3]),         # 0 is mask value
        'item_gender': np.array([1, 2, 1]),  # 0 is mask value
        'hist_item': np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]]),
        'hist_item_gender': np.array([[1, 1, 2, 0], [2, 1, 1, 0],
                                      [2, 1, 0, 0]]),
        'score': np.array([0.1, 0.2, 0.3]),
    }

    x = {name: feature_dict[name]
         for name in get_feature_names(feature_columns)}
    y = [1, 0, 1]  # plain list here (other variants use np.array)
    return x, y, feature_columns, behavior_feature_list
コード例 #5
0
def get_xy_fd():
    """Toy ride-hailing style sample (driver/passenger features) for a DIN-like model."""
    # NOTE(review): the hist_* features here do not set embedding_name, so
    # they get their own embedding tables rather than sharing the base
    # features' — verify this is intended.
    feature_columns = [
        SparseFeat('driver_age', 7, embedding_dim=32),
        SparseFeat('pax_age', 7, embedding_dim=32),
        SparseFeat('des_id', 10000, embedding_dim=32),
        SparseFeat('price_id', 20, embedding_dim=32),
        VarLenSparseFeat(SparseFeat('hist_price_id', vocabulary_size=5,
                                    embedding_dim=32), maxlen=3),
        VarLenSparseFeat(SparseFeat('hist_des_id', vocabulary_size=5,
                                    embedding_dim=32), maxlen=3),
    ]
    # Notice: History behavior sequence feature name must start with "hist_".
    behavior_feature_list = ["price_id", "des_id"]

    # Feature name -> input array (0 is the mask value in id inputs).
    feature_dict = {
        'driver_age': np.array([0, 1, 2]),
        'pax_age': np.array([0, 1, 0]),
        'des_id': np.array([1, 2, 3]),    # 0 is mask value
        'price_id': np.array([1, 2, 2]),  # 0 is mask value
        'hist_price_id': np.array([[1, 2, 3], [3, 2, 1], [1, 2, 0]]),
        'hist_des_id': np.array([[1, 2, 2], [2, 2, 1], [1, 2, 0]]),
    }
    x = {name: feature_dict[name]
         for name in get_feature_names(feature_columns)}
    y = np.array([1, 0, 1])
    return x, y, feature_columns, behavior_feature_list
コード例 #6
0
    def training_set_construct(self):
        """Load the reading-history data, encode features, and build the
        train/test model inputs plus the user/item feature-column configs.

        Side effects: sets ``self.user_feature_columns`` and
        ``self.item_feature_columns``.

        Returns:
            (train_model_input, train_label, test_model_input, test_label,
             train_set, test_set, user_info, item_info)
        """
        # Load the raw interaction data.
        data = pd.read_csv('./data/read_history.csv')
        # Number of negative samples (0 = no negative sampling).
        negsample = 0
        # Label-encode every categorical column; the +1 shift keeps index 0
        # free (used as padding/mask), and feature_max_idx records vocab size.
        feature_max_idx = {}
        for feat in ['user_id', 'item_id', 'gender', 'age', 'city']:
            encoder = LabelEncoder()
            data[feat] = encoder.fit_transform(data[feat]) + 1
            feature_max_idx[feat] = data[feat].max() + 1
        # De-duplicated user and item profile tables.
        user_info = data[["user_id", "gender", "age",
                          "city"]].drop_duplicates('user_id')
        item_info = data[["item_id"]].drop_duplicates('item_id')
        user_info.set_index("user_id", inplace=True)

        # Build the raw (sequence) samples, then convert them to model inputs.
        train_set, test_set = gen_data_set(data, negsample)
        train_model_input, train_label = gen_model_input(
            train_set, user_info, self.SEQ_LEN)
        test_model_input, test_label = gen_model_input(test_set, user_info,
                                                       self.SEQ_LEN)

        # User-tower input features (profile + mean-pooled click history).
        self.user_feature_columns = [
            SparseFeat('user_id', feature_max_idx['user_id'], 16),
            SparseFeat("gender", feature_max_idx['gender'], 16),
            SparseFeat("age", feature_max_idx['age'], 16),
            SparseFeat("city", feature_max_idx['city'], 16),
            VarLenSparseFeat(
                SparseFeat('hist_item_id',
                           feature_max_idx['item_id'],
                           self.embedding_dim,
                           embedding_name="item_id"), self.SEQ_LEN, 'mean',
                'hist_len'),
        ]
        # Item-tower input features.
        self.item_feature_columns = [
            SparseFeat('item_id', feature_max_idx['item_id'],
                       self.embedding_dim)
        ]
        return (train_model_input, train_label, test_model_input, test_label,
                train_set, test_set, user_info, item_info)
コード例 #7
0
def train_youtube_model(train_model_input, train_label, embedding_dim,
                        feature_max_idx, his_seq_maxlen, batch_size, epochs,
                        verbose, validation_split):
    """Build a YoutubeDNN two-tower model and fit it on the given inputs.

    Returns the trained model.
    """
    # User tower: id/profile sparse features, the click-history sequence
    # (mean-pooled, sharing the doc embedding table), and an "example age"
    # dense feature.
    user_feature_columns = [
        SparseFeat('user_id', feature_max_idx['user_id'], embedding_dim),
        VarLenSparseFeat(
            SparseFeat('hist_doc_ids',
                       feature_max_idx['article_id'],
                       embedding_dim,
                       embedding_name="click_doc_id"), his_seq_maxlen, 'mean',
            'hist_len'),
        SparseFeat('u_city', feature_max_idx['city'], embedding_dim),
        SparseFeat('u_age', feature_max_idx['age'], embedding_dim),
        SparseFeat('u_gender', feature_max_idx['gender'], embedding_dim),
        DenseFeat('u_example_age', 1),
    ]
    # Doc tower: the clicked doc id (article category features could be
    # added here later).
    doc_feature_columns = [
        SparseFeat('click_doc_id', feature_max_idx['article_id'],
                   embedding_dim)
    ]

    # Assemble the model and compile with sampled-softmax loss.
    model = YoutubeDNN(user_feature_columns,
                       doc_feature_columns,
                       num_sampled=5,
                       user_dnn_hidden_units=(64, embedding_dim))
    model.compile(optimizer="adam", loss=sampledsoftmaxloss)

    # Train; validation_split=0 trains on the full data with no hold-out.
    model.fit(train_model_input,
              train_label,
              batch_size=batch_size,
              epochs=epochs,
              verbose=verbose,
              validation_split=validation_split)

    return model
コード例 #8
0
    def _build_model(self):
        """Assemble the DeepFM feature columns (fixed + variable length) and
        instantiate the model on ``self.model``.

        Returns:
            (attrs_matrix, attrs_max_len): the padded category matrix and its
            maximum sequence length, for feeding the 'categories' input.
        """
        # Drop the keyword categories configured for this run's category.
        to_drop = config.Keywords_Categories[self.params['category']]
        self._build_category_dict(drop_categories=to_drop)
        attrs_matrix, attrs_max_len = self._get_category_matrix(self.data)

        # Fixed-length columns: one SparseFeat per sparse variable plus one
        # DenseFeat per dense variable.
        fixlen = [
            SparseFeat(var, self.data[var].nunique(), embedding_dim=4)
            for var in self.features_sparse
        ]
        fixlen.extend(DenseFeat(var, 1) for var in self.features_dense)
        # Variable-length 'categories' sequence, mean-pooled; attach the
        # per-token weight input only when the 'weight' param is set.
        varlen = [
            VarLenSparseFeat(
                SparseFeat('categories',
                           vocabulary_size=len(self.attr2index) + 1,
                           embedding_dim=4),
                maxlen=attrs_max_len, combiner='mean',
                weight_name='attrs_weight' if self.params['weight'] else None)
        ]

        # Linear and DNN parts use the same columns (separate list objects).
        self.features_linear = fixlen + varlen
        self.features_dnn = fixlen + varlen

        self.model = DeepFM(self.features_linear, self.features_dnn,
                            task='regression', **self.params_deepfm)
        return attrs_matrix, attrs_max_len
コード例 #9
0
def get_xy_fd(hash_flag=False):
    """Toy two-tower (user/item) sample for a matching-style model.

    ``hash_flag`` is accepted for API compatibility but is not used here.
    """
    # User tower: profile features plus the click-history sequence, which
    # shares the 'item' embedding table and carries its true length.
    user_feature_columns = [
        SparseFeat('user', 3),
        SparseFeat('gender', 2),
        VarLenSparseFeat(
            SparseFeat('hist_item', vocabulary_size=3 + 1,
                       embedding_dim=4, embedding_name='item'),
            maxlen=4, length_name="hist_len"),
    ]
    # Item tower: just the item id.
    item_feature_columns = [SparseFeat('item', 3 + 1, embedding_dim=4)]

    # Four samples; 0 is the mask value in the id/sequence inputs.
    x = {
        'user': np.array([0, 1, 2, 1]),
        'gender': np.array([0, 1, 0, 1]),
        'item': np.array([1, 2, 3, 1]),  # 0 is mask value
        'hist_item': np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0],
                               [3, 0, 0, 0]]),
        "hist_len": np.array([3, 3, 2, 1]),
    }
    y = np.array([1, 1, 1, 1])
    return x, y, user_feature_columns, item_feature_columns
コード例 #10
0
                dim = 8
            else:
                dim = 4

        if column == 'user_id':
            feature_columns += [SparseFeat(column, 212062 + 1, embedding_dim=dim)]
        elif column == 'merchant_id':
            feature_columns += [SparseFeat(column, 1993 + 1, embedding_dim=dim)]
        elif column == 'action_type':
            feature_columns += [SparseFeat(column, 4 + 1, embedding_dim=dim)]
        else:
            feature_columns += [DenseFeat(column, 1)]

# maxlen is the history length; vocabulary_size is the one-hot vocab size.
# NOTE(review): vocabulary sizes here (1993, 4) lack the +1 used for the base
# features above — confirm index 0 is unused, otherwise ids can overflow the
# embedding table.
feature_columns += [
    VarLenSparseFeat(sparsefeat=SparseFeat('hist_merchant_id', vocabulary_size=1993, embedding_dim=8,
                                           embedding_name='merchant_id'), maxlen=M),
    VarLenSparseFeat(sparsefeat=SparseFeat('hist_action_type', vocabulary_size=4, embedding_dim=4,
                                           embedding_name='action_type'), maxlen=M)]
history_features = ['merchant_id', 'action_type']
print(len(feature_columns))

# Build the DIN model over the feature columns and the behavior features.
model = DIN(feature_columns, history_features)
# Adam optimizer with binary cross-entropy (binary classification).
model.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy'])
# model.compile(loss=[binary_focal_loss(alpha=.25, gamma=2)], metrics=["accuracy"])

# Assemble train_model_input: map each feature name to its train_X column.
feature_names = list(train_X.columns)
train_model_input = {name: train_X[name].values for name in get_feature_names(feature_columns)}
print("########################################")
コード例 #11
0
    # 1. Use hashing encoding on the fly for sparse features, and process the
    # 'genres' sequence feature ('|'-separated multi-valued strings).
    genres_list = list(map(lambda x: x.split('|'), data['genres'].values))
    genres_length = np.array(list(map(len, genres_list)))
    max_len = max(genres_length)

    # Notice : padding=`post`
    genres_list = pad_sequences(genres_list, maxlen=max_len, padding='post', dtype=object, value=0).astype(str)
    # 2. Set the hashing space for each sparse field (5x nunique — presumably
    # to limit hash collisions) and the sequence-feature config.

    fixlen_feature_columns = [SparseFeat(feat, data[feat].nunique() * 5, embedding_dim=4, use_hash=True, dtype='string')
                              for feat in sparse_features]
    varlen_feature_columns = [
        VarLenSparseFeat(SparseFeat('genres', vocabulary_size=100, embedding_dim=4, use_hash=True, dtype="string"),
                         maxlen=max_len, combiner='mean',
                         )]  # Notice : value 0 is for padding for sequence input feature
    linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
    dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns
    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

    # 3. Generate the model input; the sequence feature is supplied as the
    # padded 2-D array built above.
    model_input = {name: data[name] for name in feature_names}
    model_input['genres'] = genres_list

    # 4. Define the model, compile, and train (regression on `target`).
    model = DeepFM(linear_feature_columns, dnn_feature_columns, task='regression')

    model.compile("adam", "mse", metrics=['mse'], )
    history = model.fit(model_input, data[target].values,
                        batch_size=256, epochs=10, verbose=2, validation_split=0.2, )
コード例 #12
0
    # Build raw SDM samples: a short-term and a long-term ("prefer") sequence
    # per user, then convert them into model-input dicts.
    train_set, test_set = gen_data_set_sdm(data, seq_short_len=SEQ_LEN_short, seq_prefer_len=SEQ_LEN_prefer)

    train_model_input, train_label = gen_model_input_sdm(train_set, user_profile, SEQ_LEN_short, SEQ_LEN_prefer)
    test_model_input, test_label = gen_model_input_sdm(test_set, user_profile, SEQ_LEN_short, SEQ_LEN_prefer)

    # 2. Count #unique values per sparse field and generate the feature
    # config for the sequence features.

    embedding_dim = 32
    # For SDM we must provide `VarLenSparseFeat`s named "prefer_xxx" and
    # "short_xxx", each with its length feature; movie/genre sequences share
    # the base embedding tables via ``embedding_name``.
    user_feature_columns = [SparseFeat('user_id', feature_max_idx['user_id'], 16),
                            SparseFeat("gender", feature_max_idx['gender'], 16),
                            SparseFeat("age", feature_max_idx['age'], 16),
                            SparseFeat("occupation", feature_max_idx['occupation'], 16),
                            SparseFeat("zip", feature_max_idx['zip'], 16),
                            VarLenSparseFeat(SparseFeat('short_movie_id', feature_max_idx['movie_id'], embedding_dim,
                                                        embedding_name="movie_id"), SEQ_LEN_short, 'mean',
                                             'short_sess_length'),
                            VarLenSparseFeat(SparseFeat('prefer_movie_id', feature_max_idx['movie_id'], embedding_dim,
                                                        embedding_name="movie_id"), SEQ_LEN_prefer, 'mean',
                                             'prefer_sess_length'),
                            VarLenSparseFeat(SparseFeat('short_genres', feature_max_idx['genres'], embedding_dim,
                                                        embedding_name="genres"), SEQ_LEN_short, 'mean',
                                             'short_sess_length'),
                            VarLenSparseFeat(SparseFeat('prefer_genres', feature_max_idx['genres'], embedding_dim,
                                                        embedding_name="genres"), SEQ_LEN_prefer, 'mean',
                                             'prefer_sess_length'),
                            ]

    item_feature_columns = [SparseFeat('movie_id', feature_max_idx['movie_id'], embedding_dim)]

    # NOTE(review): forces the Keras learning phase to training — presumably
    # required before building/fitting SDM; confirm against the caller.
    K.set_learning_phase(True)
コード例 #13
0
               embedding_dim=embedding_size),
    SparseFeat("movie_id",
               vocabulary_size=movie_num,
               embedding_dim=embedding_size),
    SparseFeat("current_label",
               vocabulary_size=genre_num,
               embedding_dim=embedding_size),
    SparseFeat("release_year",
               vocabulary_size=year_num,
               embedding_dim=embedding_size),
]

# Variable-length user-behavior features; each shares the embedding table of
# its base feature via ``embedding_name``.
feature_columns += [
    VarLenSparseFeat(SparseFeat("user_recent_click_movie_ids",
                                vocabulary_size=movie_num,
                                embedding_dim=embedding_size,
                                embedding_name='movie_id'),
                     maxlen=20),  # history window of 20 movie ids
    VarLenSparseFeat(SparseFeat("user_recent_click_labels",
                                vocabulary_size=genre_num,
                                embedding_dim=embedding_size,
                                embedding_name='current_label'),
                     maxlen=20),  # history window of 20 genre labels
    VarLenSparseFeat(SparseFeat("user_like_genres",
                                vocabulary_size=genre_num,
                                embedding_dim=embedding_size,
                                embedding_name='current_label'),
                     maxlen=2),  # up to 2 genres per user
]

# The DNN part uses all of the columns above.
dnn_feature_columns = feature_columns
コード例 #14
0
def get_xy_from_txt(file_path="data/movielens_sample_din.txt"):
    """Load a small DIN sample set from a comma-delimited text file.

    The sequence columns ('hist_item_id', 'hist_cate_id') are stored as
    '|'-separated integer lists, e.g. "1|2|3|0", where 0 is the padding/mask
    value. Expected columns: label, user, gender, item_id, cate_id,
    hist_item_id, hist_cate_id, pay_score.

    Args:
        file_path: path of the CSV-style sample file.

    Returns:
        x: dict mapping feature name -> np.ndarray model input.
        y: np.ndarray of binary labels (the 'label' column).
        feature_columns: DeepCTR feature-column config.
        behavior_feature_list: base feature names of the behavior sequences.
    """
    # Fixed-length sparse/dense features.
    feature_columns = [
        SparseFeat('user', 3, embedding_dim=10),
        SparseFeat('gender', 2, embedding_dim=4),
        SparseFeat('item_id', 3 + 1, embedding_dim=8),
        SparseFeat('cate_id', 2 + 1, embedding_dim=4),
        DenseFeat('pay_score', 1)
    ]
    # Behavior sequences share the base embeddings via ``embedding_name``.
    feature_columns += [
        VarLenSparseFeat(SparseFeat('hist_item_id',
                                    vocabulary_size=3 + 1,
                                    embedding_dim=8,
                                    embedding_name='item_id'),
                         maxlen=4),
        VarLenSparseFeat(SparseFeat('hist_cate_id',
                                    2 + 1,
                                    embedding_dim=4,
                                    embedding_name='cate_id'),
                         maxlen=4)
    ]

    behavior_feature_list = ["item_id", "cate_id"]

    data = pd.read_csv(file_path, delimiter=',')

    def to_int_array(value):
        # "1|2|3|0" -> np.array([1, 2, 3, 0]). Comprehension instead of the
        # original append loop, which also shadowed the builtin ``str``.
        return np.array([int(tok) for tok in value.split('|')])

    # Parse the '|'-separated sequence columns into per-row int arrays, then
    # stack them into 2-D arrays for the model.
    data['hist_item_id'] = data['hist_item_id'].apply(to_int_array)
    data['hist_cate_id'] = data['hist_cate_id'].apply(to_int_array)

    feature_dict = {
        'user': np.array(data['user']),
        'gender': np.array(data['gender']),
        'item_id': np.array(data['item_id']),  # 0 is mask value
        'cate_id': np.array(data['cate_id']),  # 0 is mask value
        'hist_item_id': np.array(data['hist_item_id'].tolist()),
        'hist_cate_id': np.array(data['hist_cate_id'].tolist()),
        'pay_score': np.array(data['pay_score'])
    }
    x = {
        name: feature_dict[name]
        for name in get_feature_names(feature_columns)
    }
    y = np.array(data.pop('label'))

    return x, y, feature_columns, behavior_feature_list
コード例 #15
0
# Fixed-length columns: one SparseFeat per sparse feature (embedding_dim taken
# from argv[5]) plus one DenseFeat per dense feature; the same columns feed
# both the linear and the DNN part.
fixlen_feature_columns = [SparseFeat(feat, train[feat].nunique(), embedding_dim=int(sys.argv[5])) for feat in sparse_features] + [DenseFeat(feat, 1,) for feat in dense_features]
linear_feature_columns = fixlen_feature_columns
dnn_feature_columns = fixlen_feature_columns
fixlen_feature_names = get_feature_names(fixlen_feature_columns)
train_model_input = {name: train[name] for name in fixlen_feature_names}
test_model_input = {name: test[name] for name in fixlen_feature_names}

# Sequence models (DIN/DIEN variants) additionally need positive ("hist_")
# and negative ("neg_hist_") behavior sequences plus their shared length.
if sys.argv[1] in ['DIEN', 'DIEN_UDG', 'DIN', 'DIN_UDG']:
    # NOTE(review): test inputs are built first so max_len can be reused for
    # the train set; and linear/dnn_feature_columns are NOT rebuilt here —
    # verify the sequence models consume fixlen_feature_columns directly.
    test_model_input, test_label, max_len = get_input(test, 0, 'test')
    train_model_input, train_label, _ = get_input(train, max_len, 'train')
    fixlen_feature_columns = [SparseFeat(feat, train[feat].nunique()+1, embedding_dim=int(sys.argv[5])) for feat in sparse_features]
    fixlen_feature_columns += [DenseFeat(feat, 1,) for feat in dense_features]
    fixlen_feature_columns += [
        VarLenSparseFeat(SparseFeat('hist_itemId', train['itemId'].nunique() + 1,
                         embedding_dim = int(sys.argv[5]), embedding_name='itemId'), maxlen=max_len,
                        length_name='seq_length'),
        VarLenSparseFeat(SparseFeat('hist_category', train['category'].nunique() + 1,
                         embedding_dim = int(sys.argv[5]), embedding_name='category'), maxlen=max_len,
                        length_name='seq_length'),
        VarLenSparseFeat(SparseFeat('neg_hist_itemId', train['itemId'].nunique() + 1,
                         embedding_dim = int(sys.argv[5]), embedding_name='itemId'), maxlen=max_len,
                        length_name='seq_length'),
        VarLenSparseFeat(SparseFeat('neg_hist_category', train['category'].nunique() + 1,
                         embedding_dim = int(sys.argv[5]), embedding_name='category'), maxlen=max_len,
                        length_name='seq_length')
    ]
behavior_feature_list = ['itemId', 'category']

if sys.argv[1] == 'DeepFM_UDG':
    model = DeepFM_UDG(linear_feature_columns, dnn_feature_columns, untrainable_features_columns, 
コード例 #16
0
ファイル: run_dien.py プロジェクト: xmh645214784/Kraken
def get_xy_fd(use_neg=False, hash_flag=False):
    """Build toy inputs, labels and feature columns for a DIEN demo.

    Args:
        use_neg: also add negative-sampled behavior sequences
            ('neg_hist_*'), as DIEN with negative sampling requires.
        hash_flag: enable on-the-fly feature hashing for the sparse features.
    """
    feature_columns = [
        SparseFeat('user', 3, embedding_dim=10, use_hash=hash_flag),
        SparseFeat('gender', 2, embedding_dim=4, use_hash=hash_flag),
        SparseFeat('item_id', 3 + 1, embedding_dim=8, use_hash=hash_flag),
        SparseFeat('cate_id', 2 + 1, embedding_dim=4, use_hash=hash_flag),
        DenseFeat('pay_score', 1),
        # Behavior sequences share embeddings with their base features and
        # carry a common true-length input ("seq_length").
        VarLenSparseFeat(
            SparseFeat('hist_item_id', vocabulary_size=3 + 1,
                       embedding_dim=8, embedding_name='item_id'),
            maxlen=4, length_name="seq_length"),
        VarLenSparseFeat(
            SparseFeat('hist_cate_id', 2 + 1, embedding_dim=4,
                       embedding_name='cate_id'),
            maxlen=4, length_name="seq_length"),
    ]

    behavior_feature_list = ["item_id", "cate_id"]

    # Feature name -> input array (0 is the mask value in id inputs).
    feature_dict = {
        'user': np.array([0, 1, 2]),
        'gender': np.array([0, 1, 0]),
        'item_id': np.array([1, 2, 3]),  # 0 is mask value
        'cate_id': np.array([1, 2, 2]),  # 0 is mask value
        'hist_item_id': np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]]),
        'hist_cate_id': np.array([[1, 2, 2, 0], [1, 2, 2, 0], [1, 2, 0, 0]]),
        'pay_score': np.array([0.1, 0.2, 0.3]),
        # true lengths of the behavior sequences
        "seq_length": np.array([3, 3, 2]),
    }

    if use_neg:
        # Negative behavior sequences (copies of the positives in this toy).
        feature_dict['neg_hist_item_id'] = np.array(
            [[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
        feature_dict['neg_hist_cate_id'] = np.array(
            [[1, 2, 2, 0], [1, 2, 2, 0], [1, 2, 0, 0]])
        feature_columns += [
            VarLenSparseFeat(
                SparseFeat('neg_hist_item_id', vocabulary_size=3 + 1,
                           embedding_dim=8, embedding_name='item_id'),
                maxlen=4, length_name="seq_length"),
            VarLenSparseFeat(
                SparseFeat('neg_hist_cate_id', 2 + 1, embedding_dim=4,
                           embedding_name='cate_id'),
                maxlen=4, length_name="seq_length"),
        ]

    x = {
        name: feature_dict[name]
        for name in get_feature_names(feature_columns)
    }
    y = np.array([1, 0, 1])
    return x, y, feature_columns, behavior_feature_list
コード例 #17
0
        padding='post',
    )

    # 2. Count #unique values per sparse field and generate the feature
    # config for the 'genres' sequence feature.

    fixlen_feature_columns = [
        SparseFeat(feat, data[feat].nunique(), embedding_dim=4)
        for feat in sparse_features
    ]

    # The +1 on vocabulary_size keeps index 0 free as the padding value.
    # Toggle use_weighted_sequence to attach per-token weights via
    # weight_name; both branches are otherwise identical.
    use_weighted_sequence = False
    if use_weighted_sequence:
        varlen_feature_columns = [
            VarLenSparseFeat(SparseFeat('genres',
                                        vocabulary_size=len(key2index) + 1,
                                        embedding_dim=4),
                             maxlen=max_len,
                             combiner='mean',
                             weight_name='genres_weight')
        ]  # Notice : value 0 is for padding for sequence input feature
    else:
        varlen_feature_columns = [
            VarLenSparseFeat(SparseFeat('genres',
                                        vocabulary_size=len(key2index) + 1,
                                        embedding_dim=4),
                             maxlen=max_len,
                             combiner='mean',
                             weight_name=None)
        ]  # Notice : value 0 is for padding for sequence input feature

    # The same columns feed both the linear part and the DNN part.
    linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
    dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns
コード例 #18
0
def main(data_path):
    """Prepare MovieLens-1M data and build SDM feature columns.

    Loads users/ratings/movies, label-encodes every sparse feature (shifted
    by +1 so 0 stays free as the sequence padding value), builds short/long
    behavior-sequence model inputs via the SDM helpers, and defines the
    user/item feature columns and optimizer for the SDM model.

    Args:
        data_path: directory that contains the "ml-1m/" data files.
    """
    unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
    # engine='python' is required for the multi-character "::" separator
    # (avoids the pandas ParserWarning / C-engine fallback).
    user = pd.read_csv(data_path + "ml-1m/users.dat",
                       sep="::",
                       header=None,
                       names=unames,
                       engine='python')
    rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
    ratings = pd.read_csv(data_path + "ml-1m/ratings.dat",
                          sep="::",
                          header=None,
                          names=rnames,
                          engine='python')
    mnames = ['movie_id', 'title', 'genres']
    movies = pd.read_csv(data_path + "ml-1m/movies.dat",
                         sep="::",
                         header=None,
                         names=mnames,
                         engine='python')

    data = pd.merge(pd.merge(ratings, movies), user)

    sparse_features = [
        "movie_id", "user_id", "gender", "age", "occupation", "zip", "genres"
    ]

    SEQ_LEN_short = 5
    SEQ_LEN_prefer = 50

    # 1.Label Encoding for sparse features,and process sequence features with `gen_date_set` and `gen_model_input`
    features = [
        'user_id', 'movie_id', 'gender', 'age', 'occupation', 'zip', 'genres'
    ]
    feature_max_idx = {}
    for feature in features:
        lbe = LabelEncoder()
        data[feature] = lbe.fit_transform(data[feature]) + 1  # +1 keeps 0 free for padding
        feature_max_idx[feature] = data[feature].max() + 1

    user_profile = data[[
        "user_id", "gender", "age", "occupation", "zip", "genres"
    ]].drop_duplicates('user_id')

    item_profile = data[["movie_id"]].drop_duplicates('movie_id')

    user_profile.set_index("user_id", inplace=True)

    train_set, test_set = gen_data_set_sdm(data,
                                           seq_short_len=SEQ_LEN_short,
                                           seq_prefer_len=SEQ_LEN_prefer)

    train_model_input, train_label = get_model_input_sdm(
        train_set, user_profile, SEQ_LEN_short, SEQ_LEN_prefer)
    test_model_input, test_label = get_model_input_sdm(test_set, user_profile,
                                                       SEQ_LEN_short,
                                                       SEQ_LEN_prefer)

    print(train_model_input)
    # 2.count #unique features for each sparse field and generate feature config for sequence feature
    embedding_dim = 32

    user_feature_columns = [
        SparseFeat('user_id', feature_max_idx['user_id'], 16),
        SparseFeat('gender', feature_max_idx['gender'], 16),
        SparseFeat('age', feature_max_idx['age'], 16),
        SparseFeat('occupation', feature_max_idx['occupation'], 16),
        SparseFeat('zip', feature_max_idx['zip'], 16),
        # Sequence columns share the base 'movie_id'/'genres' embedding tables.
        VarLenSparseFeat(SparseFeat('short_movie_id',
                                    feature_max_idx['movie_id'],
                                    embedding_dim,
                                    embedding_name="movie_id"),
                         SEQ_LEN_short,
                         combiner='mean',
                         length_name='short_sess_length'),
        VarLenSparseFeat(SparseFeat('prefer_movie_id',
                                    feature_max_idx['movie_id'],
                                    embedding_dim,
                                    embedding_name="movie_id"),
                         SEQ_LEN_prefer,
                         combiner='mean',
                         length_name='prefer_sess_length'),
        VarLenSparseFeat(SparseFeat('short_genres',
                                    feature_max_idx['genres'],
                                    embedding_dim,
                                    embedding_name='genres'),
                         SEQ_LEN_short,
                         combiner='mean',
                         length_name='short_sess_length'),
        VarLenSparseFeat(SparseFeat('prefer_genres',
                                    feature_max_idx['genres'],
                                    embedding_dim,
                                    embedding_name='genres'),
                         # BUGFIX: was SEQ_LEN_short — 'prefer_*' sequences are
                         # padded to SEQ_LEN_prefer (cf. prefer_movie_id above).
                         SEQ_LEN_prefer,
                         combiner='mean',
                         length_name='prefer_sess_length')
    ]

    item_feature_columns = [
        SparseFeat('movie_id', feature_max_idx['movie_id'], embedding_dim)
    ]

    K.set_learning_phase(True)

    model = SDM(
        user_feature_columns,
        item_feature_columns,
        history_feature_list=['movie_id', 'genres'],
        units=embedding_dim,
        num_sampled=100,
    )

    # Gradient clipping (clipnorm) to stabilize training.
    optimizer = tf.keras.optimizers.Adam(lr=0.001, clipnorm=5.0)
コード例 #19
0
                # Skip query-side features entirely.
                if re.search('query', feat_name) != None:
                    continue

                if re.search('sparse', feat_name) != None:  # sparse feature
                    if feat_name[-6:] == 'weight':
                        select_columns_name.append(feat_name)
                        continue
                    select_columns_name.append(feat_name)
                    # Resolve which vocabulary this feature belongs to by
                    # substring match against the vocabulary_size keys; the
                    # matched key is reused as the shared embedding name.
                    for key in vocabulary_size.keys():
                        if key in feat_name:
                            vocabulary_size_val = vocabulary_size[key]
                            embedding_name = key
                            break
                    # Each sparse feature becomes a length-1 weighted sequence
                    # column (weight field is "<name>_weight", not normalized).
                    varlen_feature_columns.append(VarLenSparseFeat(
                        SparseFeat(feat_name, vocabulary_size=vocabulary_size_val + 1, embedding_dim=4,
                                   use_hash=False, embedding_name=embedding_name),
                        maxlen=1,
                        combiner='mean', weight_name=feat_name + '_weight', weight_norm=False))
                else:  # dense feature
                    if feat_name[-6:] == 'weight':
                        select_columns_name.append(feat_name)
                        fixed_feature_columns.append(DenseFeat(feat_name, 1, ))  # dense feature
                    else:
                        continue
        # if use_hour_features:  # reproduce the best result
        #     for feat_name in all_columns:
        #         if feat_name[-6:] == 'weight' or feat_name in ['ctr_label', 'cvr_label']:
        #             select_columns_name.append(feat_name)
        #             continue
        #         for key in vocabulary_size.keys():
        #             if key in feat_name:
コード例 #20
0
    # 2.set hashing space for each sparse field and generate feature config for sequence feature

    fixlen_feature_columns = [
        SparseFeat(feat,
                   data[feat].nunique() * 5,
                   embedding_dim=4,
                   use_hash=True,
                   dtype='string') for feat in sparse_features
    ]
    varlen_feature_columns = [
        VarLenSparseFeat(
            SparseFeat('genres',
                       vocabulary_size=100,
                       embedding_dim=4,
                       use_hash=True,
                       dtype="string"),
            maxlen=max_len,
            combiner='mean',
        )
    ]  # Notice : value 0 is for padding for sequence input feature
    linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
    dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns
    feature_names = get_feature_names(linear_feature_columns +
                                      dnn_feature_columns)

    # 3.generate input data for model
    model_input = {name: data[name] for name in feature_names}
    model_input['genres'] = genres_list
    print("model_input:", model_input)
    # 4.Define Model,compile and train
コード例 #21
0
def get_xy_fd_sdm(hash_flag=False):
    """Build a tiny hand-crafted SDM fixture.

    Returns model inputs ``x``, labels ``y``, the user/item feature columns
    and the behavior feature list. Value 0 is the padding/mask value in every
    id array and sequence input.
    """

    def _session_feat(name, shared_emb, max_len, len_name):
        # All sequence columns share the same vocabulary size and embedding
        # width; only the name, shared table, maxlen and length field differ.
        return VarLenSparseFeat(SparseFeat(name,
                                           vocabulary_size=100,
                                           embedding_dim=8,
                                           embedding_name=shared_emb),
                                maxlen=max_len,
                                length_name=len_name)

    user_feature_columns = [
        SparseFeat('user', 3),
        SparseFeat('gender', 2),
        _session_feat('prefer_item', 'item', 6, "prefer_sess_length"),
        _session_feat('prefer_cate', 'cate', 6, "prefer_sess_length"),
        _session_feat('short_item', 'item', 4, "short_sess_length"),
        _session_feat('short_cate', 'cate', 4, "short_sess_length"),
    ]
    item_feature_columns = [SparseFeat('item', 100, embedding_dim=8)]

    # Four samples; the actual sequence lengths below match the number of
    # non-zero entries in each row.
    feature_dict = {
        'user': np.array([0, 1, 2, 1]),
        'gender': np.array([0, 1, 0, 1]),
        'item': np.array([1, 2, 3, 1]),  # 0 is mask value
        'prefer_item': np.array([[1, 2, 3, 4, 5, 6], [1, 2, 3, 4, 5, 0],
                                 [1, 2, 3, 3, 0, 0], [1, 2, 4, 0, 0, 0]]),
        "prefer_cate": np.array([[1, 2, 3, 4, 5, 6], [1, 2, 3, 4, 5, 0],
                                 [1, 2, 3, 3, 0, 0], [1, 2, 4, 0, 0, 0]]),
        'short_item': np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0],
                                [3, 0, 0, 0]]),
        'short_cate': np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0],
                                [3, 0, 0, 0]]),
        'prefer_sess_length': np.array([6, 5, 4, 3]),
        'short_sess_length': np.array([3, 3, 2, 1]),
    }

    # feature_names = get_feature_names(feature_columns)
    x = feature_dict
    y = np.array([1, 1, 1, 0])
    history_feature_list = ['item', 'cate']

    return x, y, user_feature_columns, item_feature_columns, history_feature_list
コード例 #22
0
ファイル: DSIN_test.py プロジェクト: zunairazaman2021/DeepCTR
def get_xy_fd(hash_flag=False):
    """Build a tiny hand-crafted DSIN fixture with two behavior sessions.

    Args:
        hash_flag: forwarded as ``use_hash`` to every SparseFeat.

    Returns:
        (x, y, feature_columns, behavior_feature_list). ``x`` additionally
        carries the per-sample session count under the extra key
        "sess_length".
    """
    # Base profile/candidate features; value 0 is reserved for masking.
    feature_columns = [
        SparseFeat('user', 3, use_hash=hash_flag),
        SparseFeat('gender', 2, use_hash=hash_flag),
        SparseFeat('item', 3 + 1, use_hash=hash_flag),
        SparseFeat('item_gender', 2 + 1, use_hash=hash_flag),
        DenseFeat('score', 1)
    ]
    # Session 0 behavior sequences; they share the base embedding tables
    # via embedding_name.
    feature_columns += [
        VarLenSparseFeat(SparseFeat('sess_0_item',
                                    3 + 1,
                                    embedding_dim=4,
                                    use_hash=hash_flag,
                                    embedding_name='item'),
                         maxlen=4),
        VarLenSparseFeat(SparseFeat('sess_0_item_gender',
                                    2 + 1,
                                    embedding_dim=4,
                                    use_hash=hash_flag,
                                    embedding_name='item_gender'),
                         maxlen=4)
    ]
    # Session 1 behavior sequences.
    feature_columns += [
        VarLenSparseFeat(SparseFeat('sess_1_item',
                                    3 + 1,
                                    embedding_dim=4,
                                    use_hash=hash_flag,
                                    embedding_name='item'),
                         maxlen=4),
        VarLenSparseFeat(SparseFeat('sess_1_item_gender',
                                    2 + 1,
                                    embedding_dim=4,
                                    use_hash=hash_flag,
                                    embedding_name='item_gender'),
                         maxlen=4)
    ]

    behavior_feature_list = ["item", "item_gender"]
    uid = np.array([0, 1, 2])
    ugender = np.array([0, 1, 0])
    iid = np.array([1, 2, 3])  # 0 is mask value
    igender = np.array([1, 2, 1])  # 0 is mask value
    score = np.array([0.1, 0.2, 0.3])

    sess1_iid = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [0, 0, 0, 0]])
    sess1_igender = np.array([[1, 1, 2, 0], [2, 1, 1, 0], [0, 0, 0, 0]])

    sess2_iid = np.array([[1, 2, 3, 0], [0, 0, 0, 0], [0, 0, 0, 0]])
    sess2_igender = np.array([[1, 1, 2, 0], [0, 0, 0, 0], [0, 0, 0, 0]])

    # Number of non-empty sessions per sample.
    sess_number = np.array([2, 1, 0])

    feature_dict = {
        'user': uid,
        'gender': ugender,
        'item': iid,
        'item_gender': igender,
        'sess_0_item': sess1_iid,
        'sess_0_item_gender': sess1_igender,
        'score': score,
        'sess_1_item': sess2_iid,
        'sess_1_item_gender': sess2_igender,
    }

    x = {
        name: feature_dict[name]
        for name in get_feature_names(feature_columns)
    }
    x["sess_length"] = sess_number

    # FIX(consistency): labels were a plain Python list; every sibling fixture
    # in this file returns an ndarray, so do the same here.
    y = np.array([1, 0, 1])
    return x, y, feature_columns, behavior_feature_list
コード例 #23
0
def get_test_data(sample_size=1000,
                  embedding_size=4,
                  sparse_feature_num=1,
                  dense_feature_num=1,
                  sequence_feature=['sum', 'mean', 'max', 'weight'],
                  classification=True,
                  include_length=False,
                  hash_flag=False,
                  prefix='',
                  use_group=False):
    """Generate random feature columns plus matching model input for tests.

    Args:
        sample_size: number of rows to generate.
        embedding_size: embedding dim for sparse/sequence features.
        sparse_feature_num: how many random sparse features to create.
        dense_feature_num: how many random dense features to create.
        sequence_feature: pooling modes; 'weight' adds a weighted sequence.
        classification: binary labels if True, else uniform floats.
        include_length: also emit explicit sequence-length inputs.
        hash_flag: forwarded as use_hash to SparseFeat.
        prefix: prepended to every feature name.
        use_group: cycle sparse features through 3 group names.

    Returns:
        (model_input dict, labels, feature_columns).
    """
    # BUGFIX: the original popped 'weight' directly from the *default* list,
    # mutating the shared mutable default so subsequent calls silently lost
    # the weighted-sequence feature. Work on a copy instead.
    sequence_feature = list(sequence_feature)

    feature_columns = []
    model_input = {}

    if 'weight' in sequence_feature:
        feature_columns.append(
            VarLenSparseFeat(SparseFeat(prefix + "weighted_seq",
                                        vocabulary_size=2,
                                        embedding_dim=embedding_size),
                             maxlen=3,
                             length_name=prefix + "weighted_seq" +
                             "_seq_length",
                             weight_name=prefix + "weight"))
        s_input, s_len_input = gen_sequence(2, 3, sample_size)

        model_input[prefix + "weighted_seq"] = s_input
        model_input[prefix + 'weight'] = np.random.randn(sample_size, 3, 1)
        model_input[prefix + "weighted_seq" + "_seq_length"] = s_len_input
        sequence_feature.remove('weight')

    for i in range(sparse_feature_num):
        group_name = str(i % 3) if use_group else DEFAULT_GROUP_NAME
        dim = np.random.randint(1, 10)
        feature_columns.append(
            SparseFeat(prefix + 'sparse_feature_' + str(i),
                       dim,
                       embedding_size,
                       use_hash=hash_flag,
                       dtype=tf.int32,
                       group_name=group_name))

    for i in range(dense_feature_num):
        feature_columns.append(
            DenseFeat(prefix + 'dense_feature_' + str(i), 1, dtype=tf.float32))
    for i, mode in enumerate(sequence_feature):
        dim = np.random.randint(1, 10)
        maxlen = np.random.randint(1, 10)
        feature_columns.append(
            VarLenSparseFeat(SparseFeat(prefix + 'sequence_' + mode,
                                        vocabulary_size=dim,
                                        embedding_dim=embedding_size),
                             maxlen=maxlen,
                             combiner=mode))

    # Generate random inputs matching each declared column.
    for fc in feature_columns:
        if isinstance(fc, SparseFeat):
            model_input[fc.name] = np.random.randint(0, fc.vocabulary_size,
                                                     sample_size)
        elif isinstance(fc, DenseFeat):
            model_input[fc.name] = np.random.random(sample_size)
        else:
            s_input, s_len_input = gen_sequence(fc.vocabulary_size, fc.maxlen,
                                                sample_size)
            model_input[fc.name] = s_input
            if include_length:
                # NOTE(review): `i` here is whatever index the enumerate loop
                # above ended on, and the attribute assignment assumes
                # VarLenSparseFeat instances are mutable — confirm both are
                # intended before relying on include_length=True.
                fc.length_name = prefix + "sequence_" + str(i) + '_seq_length'
                model_input[prefix + "sequence_" + str(i) +
                            '_seq_length'] = s_len_input

    if classification:
        y = np.random.randint(0, 2, sample_size)
    else:
        y = np.random.random(sample_size)

    return model_input, y, feature_columns
コード例 #24
0
ファイル: run_youtubednn.py プロジェクト: zlin-zou/DeepMatch
    # Build model inputs for the test split (sequences padded to SEQ_LEN).
    test_model_input, test_label = gen_model_input(test_set, user_profile,
                                                   SEQ_LEN)

    # 2.count #unique features for each sparse field and generate feature config for sequence feature

    embedding_dim = 16

    # User tower: fixed-length profile features plus the watch-history
    # sequence, which shares the 'movie_id' embedding table.
    user_feature_columns = [
        SparseFeat('user_id', feature_max_idx['user_id'], embedding_dim),
        SparseFeat("gender", feature_max_idx['gender'], embedding_dim),
        SparseFeat("age", feature_max_idx['age'], embedding_dim),
        SparseFeat("occupation", feature_max_idx['occupation'], embedding_dim),
        SparseFeat("zip", feature_max_idx['zip'], embedding_dim),
        # Positional args after the SparseFeat: maxlen=SEQ_LEN,
        # combiner='mean', length_name='hist_len'.
        VarLenSparseFeat(
            SparseFeat('hist_movie_id',
                       feature_max_idx['movie_id'],
                       embedding_dim,
                       embedding_name="movie_id"), SEQ_LEN, 'mean',
            'hist_len'),
    ]

    # Item tower: the movie id alone.
    item_feature_columns = [
        SparseFeat('movie_id', feature_max_idx['movie_id'], embedding_dim)
    ]

    # 3.Define Model and train

    K.set_learning_phase(True)
    import tensorflow as tf
    # NOTE(review): lexicographic version compare; eager execution is
    # disabled under TF 2.x — presumably required by the training setup,
    # confirm against the model's loss.
    if tf.__version__ >= '2.0.0':
        tf.compat.v1.disable_eager_execution()
コード例 #25
0
def train():
    """Train YoutubeDNN on MovieLens data and evaluate top-10 retrieval.

    Pipeline: label-encode sparse features, build padded sequence inputs,
    fit YoutubeDNN with a sampled-softmax loss, export the user/item
    embedding towers, retrieve 10 items per test user via faiss inner-product
    search, and print precision / recall / coverage / popularity.
    """
    data = load_data()
    item_set = set(data['movie_id'].unique())
    SEQ_LEN = 50

    # 1.Label Encoding for sparse features,and process sequence features with `gen_date_set` and `gen_model_input`
    features = ['user_id', 'movie_id', 'gender', 'age', 'occupation', 'zip']
    feature_max_idx = {}
    for feature in features:
        lbe = LabelEncoder()
        data[feature] = lbe.fit_transform(data[feature]) + 1  # +1 keeps 0 free for padding
        feature_max_idx[feature] = data[feature].max() + 1

    user_profile = data[["user_id", "gender", "age", "occupation", "zip"]].drop_duplicates('user_id')

    item_profile = data[["movie_id"]].drop_duplicates('movie_id')

    user_profile.set_index("user_id", inplace=True)

    train_set, test_set = gen_data_set(data, 0)

    train_model_input, train_label = gen_model_input(train_set, user_profile, SEQ_LEN)

    test_model_input, test_label = gen_model_input(test_set, user_profile, SEQ_LEN)

    # 2.count #unique features for each sparse field and generate feature config for sequence feature

    embedding_dim = 16

    user_feature_columns = [SparseFeat('user_id', feature_max_idx['user_id'], embedding_dim),
                            SparseFeat("gender", feature_max_idx['gender'], embedding_dim),
                            SparseFeat("age", feature_max_idx['age'], embedding_dim),
                            SparseFeat("occupation", feature_max_idx['occupation'], embedding_dim),
                            SparseFeat("zip", feature_max_idx['zip'], embedding_dim),
                            # hist_movie_id shares the movie_id embedding table;
                            # positional args: maxlen, combiner, length_name.
                            VarLenSparseFeat(SparseFeat('hist_movie_id', feature_max_idx['movie_id'], embedding_dim,
                                                        embedding_name="movie_id"), SEQ_LEN, 'mean', 'hist_len'),
                            ]

    item_feature_columns = [SparseFeat('movie_id', feature_max_idx['movie_id'], embedding_dim)]

    # 3.Define Model and train

    K.set_learning_phase(True)
    import tensorflow as tf
    if tf.__version__ >= '2.0.0':
        # NOTE(review): presumably required by the sampled-softmax setup
        # under TF 2.x — confirm.
        tf.compat.v1.disable_eager_execution()

    model = YoutubeDNN(user_feature_columns, item_feature_columns, num_sampled=5,
                       user_dnn_hidden_units=(64, embedding_dim))

    model.compile(optimizer="adam", loss=sampledsoftmaxloss)  # "binary_crossentropy")

    model.fit(train_model_input, train_label,
              batch_size=256, epochs=50, verbose=1, validation_split=0.0, )

    # 4. Generate user features for testing and full item features for retrieval
    test_user_model_input = test_model_input
    all_item_model_input = {"movie_id": item_profile['movie_id'].values}

    user_embedding_model = Model(inputs=model.user_input, outputs=model.user_embedding)
    item_embedding_model = Model(inputs=model.item_input, outputs=model.item_embedding)

    user_embs = user_embedding_model.predict(test_user_model_input, batch_size=2 ** 12)
    # user_embs = user_embs[:, i, :]  # i in [0,k_max) if MIND
    item_embs = item_embedding_model.predict(all_item_model_input, batch_size=2 ** 12)

    # 5. [Optional] ANN search by faiss  and evaluate the result

    index = faiss.IndexFlatIP(embedding_dim)
    # faiss.normalize_L2(item_embs)
    index.add(item_embs)
    # faiss.normalize_L2(user_embs)
    D, I = index.search(np.ascontiguousarray(user_embs), 10)

    # Map each retrieved row index back to a movie id per test user.
    recommed_dict = {}
    for i, uid in enumerate(test_user_model_input['user_id']):
        recommed_dict.setdefault(uid, [])
        try:
            pred = [item_profile['movie_id'].values[x] for x in I[i]]
            recommed_dict[uid] = pred
        except IndexError:
            # BUGFIX: was a bare `except:` that swallowed every exception;
            # only an out-of-range retrieved index is expected here.
            print(i)

    # Ground-truth items per test user.
    test_user_items = dict()
    for ts in test_set:
        if ts[0] not in test_user_items:
            test_user_items[ts[0]] = set(ts[1])
    # Item frequency over the training histories, used by the popularity metric.
    item_popularity = dict()
    for ts in train_set:
        for item in ts[1]:
            if item in item_popularity:
                item_popularity[item] += 1
            else:
                item_popularity.setdefault(item, 1)

    precision = metric.precision(recommed_dict, test_user_items)
    recall = metric.recall(recommed_dict, test_user_items)
    coverage = metric.coverage(recommed_dict, item_set)
    popularity = metric.popularity(item_popularity, recommed_dict)

    print("precision:{:.4f}, recall:{:.4f}, coverage:{:.4f}, popularity:{:.4f}".format(precision, recall, coverage,
                                                                                       popularity))
コード例 #26
0
    def _model_fn(features, labels, mode, config):
        """Estimator model_fn building a DIN-style attention network.

        Re-classifies the incoming tf feature columns into deepctr column
        types, attends the query embeddings over the history sequences,
        feeds the pooled result through a DNN, and returns the EstimatorSpec
        produced by ``deepctr_model_fn``.
        """
        # NOTE(review): train_flag is computed but never used below.
        train_flag = (mode == tf.estimator.ModeKeys.TRAIN)
        with variable_scope(DNN_SCOPE_NAME):
            # Buckets for the re-classified deepctr feature columns.
            sparse_feature_columns = []
            dense_feature_columns = []
            varlen_sparse_feature_columns = []

            for feat in dnn_feature_columns:

                new_feat_name = list(feat.parse_example_spec.keys())[0]
                if new_feat_name in ['hist_price_id', 'hist_des_id']:
                    # Hard-coded history sequences: fixed vocab size / maxlen.
                    varlen_sparse_feature_columns.append(
                        VarLenSparseFeat(SparseFeat(new_feat_name,
                                                    vocabulary_size=100,
                                                    embedding_dim=32,
                                                    use_hash=False),
                                         maxlen=3))
                elif is_embedding(feat):
                    # +1 reserves an extra slot beyond the tf column's buckets.
                    sparse_feature_columns.append(
                        SparseFeat(new_feat_name,
                                   vocabulary_size=feat[0]._num_buckets + 1,
                                   embedding_dim=feat.dimension))
                else:
                    dense_feature_columns.append(DenseFeat(new_feat_name))

            # Split sequence columns into attention keys ("hist_*") versus
            # plain pooled sequence features.
            history_feature_columns = []
            sparse_varlen_feature_columns = []
            history_fc_names = list(
                map(lambda x: "hist_" + x, history_feature_list))
            for fc in varlen_sparse_feature_columns:
                feature_name = fc.name
                if feature_name in history_fc_names:
                    history_feature_columns.append(fc)
                else:
                    sparse_varlen_feature_columns.append(fc)
            my_feature_columns = sparse_feature_columns + dense_feature_columns + varlen_sparse_feature_columns
            embedding_dict = create_embedding_matrix(my_feature_columns,
                                                     l2_reg_embedding,
                                                     seed,
                                                     prefix="")

            # Query-side embeddings (the candidate/behavior-list features).
            query_emb_list = embedding_lookup(embedding_dict,
                                              features,
                                              sparse_feature_columns,
                                              history_feature_list,
                                              history_feature_list,
                                              to_list=True)
            print('query_emb_list', query_emb_list)  # debug output
            print('embedding_dict', embedding_dict)  # debug output
            print('haha')
            print('history_feature_columns', history_feature_columns)
            print('haha')
            # Key-side embeddings (the user's behavior history sequences).
            keys_emb_list = embedding_lookup(embedding_dict,
                                             features,
                                             history_feature_columns,
                                             history_fc_names,
                                             history_fc_names,
                                             to_list=True)
            print('keys_emb_list', keys_emb_list)
            dnn_input_emb_list = embedding_lookup(
                embedding_dict,
                features,
                sparse_feature_columns,
                mask_feat_list=history_feature_list,
                to_list=True)
            print('dnn_input_emb_list', dnn_input_emb_list)
            dense_value_list = get_dense_input(features, dense_feature_columns)
            # Pool the non-history sequence features into fixed-size vectors.
            sequence_embed_dict = varlen_embedding_lookup(
                embedding_dict, features, sparse_varlen_feature_columns)
            sequence_embed_list = get_varlen_pooling_list(
                sequence_embed_dict,
                features,
                sparse_varlen_feature_columns,
                to_list=True)

            dnn_input_emb_list += sequence_embed_list

            keys_emb = concat_func(keys_emb_list, mask=True)
            deep_input_emb = concat_func(dnn_input_emb_list)
            query_emb = concat_func(query_emb_list, mask=True)
            # DIN-style attention: weight the history keys by the query.
            hist = AttentionSequencePoolingLayer(
                att_hidden_size,
                att_activation,
                weight_normalization=att_weight_normalization,
                supports_masking=True)([query_emb, keys_emb])

            # Concatenate attended history with the flat embeddings and run
            # the prediction DNN down to a single unscaled logit.
            deep_input_emb = tf.keras.layers.Concatenate()(
                [NoMask()(deep_input_emb), hist])
            deep_input_emb = tf.keras.layers.Flatten()(deep_input_emb)
            dnn_input = combined_dnn_input([deep_input_emb], dense_value_list)
            output = DNN(dnn_hidden_units,
                         dnn_activation,
                         l2_reg_dnn,
                         dnn_dropout,
                         dnn_use_bn,
                         seed=seed)(dnn_input)
            final_logit = tf.keras.layers.Dense(
                1,
                use_bias=False,
                kernel_initializer=tf.keras.initializers.glorot_normal(seed))(
                    output)
        #             logits_list.append(final_logit)
        #         logits = add_func(logits_list)
        #             print(labels)
        #             tf.summary.histogram(final_logit + '/final_logit', final_logit)
        return deepctr_model_fn(features,
                                mode,
                                final_logit,
                                labels,
                                task,
                                linear_optimizer,
                                dnn_optimizer,
                                training_chief_hooks=training_chief_hooks)
コード例 #27
0
def train_sdm_model(train_model_input, train_label, embedding_dim,
                    feature_max_idx, SEQ_LEN_short, SEQ_LEN_prefer, batch_size,
                    epochs, verbose, validation_split):
    """Build the SDM model from feature columns and fit it on the inputs."""
    # Fixed-length user profile features (16-dim embeddings).
    user_feature_columns = [
        SparseFeat('user_id', feature_max_idx['user_id'], 16),
        SparseFeat('gender', feature_max_idx['gender'], 16),
        SparseFeat('age', feature_max_idx['age'], 16),
        SparseFeat('city', feature_max_idx['city'], 16),
    ]

    # (feature name, vocab key, shared embedding name, maxlen, length field)
    sequence_specs = [
        ('short_doc_id', 'article_id', 'doc_id', SEQ_LEN_short,
         'short_sess_length'),
        ('prefer_doc_id', 'article_id', 'doc_id', SEQ_LEN_prefer,
         'prefer_sess_length'),
        ('short_cat1', 'cat_1', 'cat_1', SEQ_LEN_short, 'short_sess_length'),
        ('prefer_cat1', 'cat_1', 'cat_1', SEQ_LEN_prefer,
         'prefer_sess_length'),
        ('short_cat2', 'cat_2', 'cat_2', SEQ_LEN_short, 'short_sess_length'),
        ('prefer_cat2', 'cat_2', 'cat_2', SEQ_LEN_prefer,
         'prefer_sess_length'),
    ]
    # Short/long behavior sequences, mean-pooled, sharing base embedding
    # tables via embedding_name.
    for feat_name, vocab_key, shared_name, max_len, len_name in sequence_specs:
        user_feature_columns.append(
            VarLenSparseFeat(
                SparseFeat(feat_name,
                           feature_max_idx[vocab_key],
                           embedding_dim,
                           embedding_name=shared_name), max_len, 'mean',
                len_name))

    item_feature_columns = [
        SparseFeat('doc_id', feature_max_idx['article_id'], embedding_dim)
    ]

    # Define the model.
    model = Sdm(user_feature_columns,
                item_feature_columns,
                history_feature_list=['doc_id', 'cat1', 'cat2'])

    # Compile with the sampled-softmax loss.
    model.compile(optimizer="adam", loss=sampledsoftmaxloss)

    # Fit; a validation_split of 0 trains on the full data with no hold-out.
    model.fit(train_model_input,
              train_label,
              batch_size=batch_size,
              epochs=epochs,
              verbose=verbose,
              validation_split=validation_split)

    return model