Code example #1
def _preprocess_criteo(df, **kw):
    hash_feature = kw.get('hash_feature')
    sparse_col = ['C' + str(i) for i in range(1, 27)]
    dense_col = ['I' + str(i) for i in range(1, 14)]
    df[sparse_col] = df[sparse_col].fillna('-1', )
    df[dense_col] = df[dense_col].fillna(0, )
    target = ["label"]

    # set the hashing space for each sparse field, and record the dense feature field names
    if hash_feature:
        # Transformation for dense features
        mms = MinMaxScaler(feature_range=(0, 1))
        df[dense_col] = mms.fit_transform(df[dense_col])

        fixlen_cols = [SparseFeat(feat, vocabulary_size=1000, embedding_dim=4, use_hash=True, dtype='string')
                       # since the input is string
                       for feat in sparse_col] + [DenseFeat(feat, 1, ) for feat in dense_col]

    else:
        for feat in sparse_col:
            lbe = LabelEncoder()
            df[feat] = lbe.fit_transform(df[feat])
        mms = MinMaxScaler(feature_range=(0, 1))
        df[dense_col] = mms.fit_transform(df[dense_col])
        fixlen_cols = [SparseFeat(feat, vocabulary_size=df[feat].nunique(), embedding_dim=4)
                       for i, feat in enumerate(sparse_col)] + [DenseFeat(feat, 1, ) for feat in dense_col]

    linear_cols = fixlen_cols
    dnn_cols = fixlen_cols
    train, test = train_test_split(df, test_size=kw['test_size'])

    return df, linear_cols, dnn_cols, train, test, target, test[target].values
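
A minimal usage sketch for the helper above, assuming DeepCTR's DeepFM, a local criteo_sample.txt, and the imports the excerpt itself relies on (LabelEncoder, MinMaxScaler, SparseFeat, DenseFeat, train_test_split); the path and hyperparameters are illustrative, not part of the original snippet:

import pandas as pd
from deepctr.models import DeepFM
from deepctr.feature_column import get_feature_names  # lives in deepctr.inputs in older releases

df = pd.read_csv('./criteo_sample.txt')  # assumed sample path
(df, linear_cols, dnn_cols, train, test,
 target, y_test) = _preprocess_criteo(df, hash_feature=False, test_size=0.2)

feature_names = get_feature_names(linear_cols + dnn_cols)
train_input = {name: train[name] for name in feature_names}

model = DeepFM(linear_cols, dnn_cols, task='binary')
model.compile('adam', 'binary_crossentropy', metrics=['AUC'])
model.fit(train_input, train[target].values, batch_size=256, epochs=1)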
Code example #2
File: run_mind.py (project: wangjz1993/MIND)
def get_xy_fd():
    feature_columns = [
        SparseFeat('user', 3, embedding_dim=10),
        SparseFeat('gender', 2, embedding_dim=4),
        SparseFeat('item', 3 + 1, embedding_dim=8),
        SparseFeat('item_gender', 2 + 1, embedding_dim=4),
        DenseFeat('score', 1)
    ]
    feature_columns += [
        VarLenSparseFeat(SparseFeat('hist_item',
                                    vocabulary_size=3 + 1,
                                    embedding_dim=8,  # must match the shared 'item' embedding declared above
                                    embedding_name='item'),
                         maxlen=4),
        VarLenSparseFeat(SparseFeat('hist_item_gender',
                                    2 + 1,
                                    embedding_dim=4,
                                    embedding_name='item_gender'),
                         maxlen=4)
    ]

    feature_columns += [DenseFeat('hist_len', 1, dtype="int64")]

    behavior_feature_list = ["item"]
    uid = np.array([0, 1, 2])
    ugender = np.array([0, 1, 0])
    iid = np.array([1, 2, 3])  # 0 is mask value
    igender = np.array([1, 2, 1])  # 0 is mask value
    score = np.array([0.1, 0.2, 0.3])

    hist_iid = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
    hist_igender = np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]])
    hist_len = np.array([3, 3, 2])

    feature_dict = {
        'user': uid,
        'gender': ugender,
        'item': iid,
        'item_gender': igender,
        'hist_item': hist_iid,
        'hist_item_gender': hist_igender,
        'hist_len': hist_len,
        'score': score
    }
    x = {
        name: feature_dict[name]
        for name in get_feature_names(feature_columns)
    }
    y = [1, 1, 1]
    return x, y, feature_columns, behavior_feature_list
Code example #3
def get_xy_fd(hash_flag=False):

    feature_columns = [SparseFeat('user', 3, hash_flag),
                       SparseFeat('gender', 2, hash_flag),
                       SparseFeat('item', 3 + 1, hash_flag),
                       SparseFeat('item_gender', 2 + 1, hash_flag),
                       DenseFeat('score', 1)]
    feature_columns += [VarLenSparseFeat('sess_0_item', 3 + 1, 4, use_hash=hash_flag, embedding_name='item'),
                        VarLenSparseFeat('sess_0_item_gender', 2 + 1, 4, use_hash=hash_flag, embedding_name='item_gender')]
    feature_columns += [VarLenSparseFeat('sess_1_item', 3 + 1, 4, use_hash=hash_flag, embedding_name='item'),
                        VarLenSparseFeat('sess_1_item_gender', 2 + 1, 4, use_hash=hash_flag, embedding_name='item_gender')]

    behavior_feature_list = ["item", "item_gender"]
    uid = np.array([0, 1, 2])
    ugender = np.array([0, 1, 0])
    iid = np.array([1, 2, 3])  # 0 is mask value
    igender = np.array([1, 2, 1])  # 0 is mask value
    score = np.array([0.1, 0.2, 0.3])

    sess1_iid = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [0, 0, 0, 0]])
    sess1_igender = np.array([[1, 1, 2, 0], [2, 1, 1, 0], [0, 0, 0, 0]])

    sess2_iid = np.array([[1, 2, 3, 0], [0, 0, 0, 0], [0, 0, 0, 0]])
    sess2_igender = np.array([[1, 1, 2, 0], [0, 0, 0, 0], [0, 0, 0, 0]])

    sess_number = np.array([2, 1, 0])

    feature_dict = {'user': uid, 'gender': ugender, 'item': iid, 'item_gender': igender,
                    'sess_0_item': sess1_iid, 'sess_0_item_gender': sess1_igender, 'score': score,
                    'sess_1_item': sess2_iid, 'sess_1_item_gender': sess2_igender, }

    x = {name:feature_dict[name] for name in get_feature_names(feature_columns)}
    x["sess_length"] = sess_number
    y = [1, 0, 1]
    return x, y, feature_columns, behavior_feature_list
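
A hedged sketch of feeding this data to DeepCTR's DSIN (the sess_0_/sess_1_ keys above imply two sessions; the exact keyword arguments vary across DeepCTR versions):

import numpy as np
from deepctr.models import DSIN

x, y, feature_columns, behavior_feature_list = get_xy_fd(hash_flag=False)
model = DSIN(feature_columns, behavior_feature_list, sess_max_count=2)
model.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy'])
model.fit(x, np.array(y), verbose=1, epochs=1, validation_split=0.5)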
Code example #4
def generate_din_feature_columns(data, sparse_features, dense_features):
    feat_lbe_dict = get_glv('feat_lbe_dict')

    sparse_feature_columns = [
        SparseFeat(feat,
                   vocabulary_size=len(feat_lbe_dict[feat].classes_) + 1,
                   embedding_dim=EMBED_DIM)
        for i, feat in enumerate(sparse_features) if feat not in time_feat
    ]

    dense_feature_columns = [DenseFeat(
        feat,
        1,
    ) for feat in dense_features]

    var_feature_columns = [
        VarLenSparseFeat(SparseFeat(
            'hist_item_id',
            vocabulary_size=len(feat_lbe_dict['item_id'].classes_) + 1,
            embedding_dim=EMBED_DIM,
            embedding_name='item_id'),
                         maxlen=max_seq_len)
    ]

    # DNN side
    dnn_feature_columns = sparse_feature_columns + dense_feature_columns + var_feature_columns
    # FM side
    linear_feature_columns = sparse_feature_columns + dense_feature_columns + var_feature_columns
    # all feature names
    feature_names = get_feature_names(dnn_feature_columns +
                                      linear_feature_columns)

    return feature_names, linear_feature_columns, dnn_feature_columns
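
A hedged usage sketch for the generator above (data, sparse_features, dense_features and the project helpers get_glv, EMBED_DIM, time_feat, max_seq_len are assumed to exist, 'item_id' is assumed to be among the sparse features, and hist_item_id is assumed to be stored as padded lists):

import numpy as np
from deepctr.models import DIN

feature_names, linear_cols, dnn_cols = generate_din_feature_columns(
    data, sparse_features, dense_features)
# stack list-valued columns (the padded histories) into proper 2-D arrays
model_input = {name: np.array(data[name].tolist()) for name in feature_names}
model = DIN(dnn_cols, ['item_id'])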
Code example #5
File: feature_test.py (project: Puzz1eX/HFCN)
def test_long_dense_vector():

    feature_columns = [
        SparseFeat(
            'user_id',
            4,
        ),
        SparseFeat(
            'item_id',
            5,
        ),
        DenseFeat("pic_vec", 5)
    ]
    fixlen_feature_names = get_feature_names(feature_columns)

    user_id = np.array([[1], [0], [1]])
    item_id = np.array([[3], [2], [1]])
    pic_vec = np.array([[0.1, 0.5, 0.4, 0.3, 0.2], [0.1, 0.5, 0.4, 0.3, 0.2],
                        [0.1, 0.5, 0.4, 0.3, 0.2]])
    label = np.array([1, 0, 1])

    input_dict = {'user_id': user_id, 'item_id': item_id, 'pic_vec': pic_vec}
    model_input = [input_dict[name] for name in fixlen_feature_names]

    model = DeepFM(feature_columns, feature_columns[:-1])
    model.compile('adagrad', 'binary_crossentropy')
    model.fit(model_input, label)
Code example #6
File: svd_complete.py (project: leoninekev/DeepCTR)
    def prepare_data(cls, path, sparse_features, task='binary'):
        data_path = path
        dataframe = pd.read_csv(data_path, names= 'user_id,movie_id,rating,timestamp'.split(','))
        y= ['rating']
        
        for feat in sparse_features:
            lbe = LabelEncoder()
            dataframe[feat] = lbe.fit_transform(dataframe[feat])

        feature_columns = [DenseFeat(feat, dataframe[feat].nunique()) for feat in sparse_features]
        #feature_columns = [SparseFeat(feat, dataframe[feat].nunique()) for feat in sparse_features]

        trainset, testset = train_test_split(dataframe, test_size=0.2)

        train_model_input = [to_categorical(trainset[fc.name].values, num_classes= fc.dimension) for fc in feature_columns]#includes values from only data[user_id], data[movie_id]
        test_model_input = [to_categorical(testset[fc.name].values, num_classes= fc.dimension) for fc in feature_columns]#includes values from only data[user_id], data[movie_id]
        if task =='binary':
            train_lbl = trainset[y]
            test_lbl= testset[y]
        elif task == 'multiclass':
            train_lbl = to_categorical(trainset[y])[:,1:]#stripping 0th column as rating is (1,5)
            test_lbl= to_categorical(testset[y])[:,1:]#stripping 0th column as rating is (1,5)
        else:
            raise ValueError("Enter task either 'binary' or 'multiclass'")
        
        return cls(feature_columns), (train_model_input, train_lbl), (test_model_input, test_lbl) #try returning train_model_input from inside __init__()
Code example #7
def train_deepFM():
    k = featureengineer.k
    # fill missing values and encode the features
    data,appsnum, tags_nums = trainmodel.data,trainmodel.appsnum,trainmodel.tags_nums
    data[trainmodel.sparse_features] = data[trainmodel.sparse_features].fillna('-1', )
    for feat in trainmodel.dense_features:
        data[feat].fillna(data[feat].dropna().mean(), inplace=True)

    for feat in trainmodel.sparse_features:
        data[feat] = data[feat].apply(lambda x:str(x))
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
    mms = MinMaxScaler(feature_range=(0, 1))
    data[trainmodel.dense_features] = mms.fit_transform(data[trainmodel.dense_features])


    # convert the data into DeepCTR's input format
    fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=data[feat].max()+1, embedding_dim=8)
                              for i, feat in enumerate(trainmodel.sparse_features)] + \
                             [DenseFeat(feat, 1, ) for feat in trainmodel.dense_features]

    lgbOut_feature_columns = [SparseFeat(feat, vocabulary_size=data[feat].max()+1, embedding_dim=1)
                              for i, feat in enumerate(trainmodel.lgbOut_Features)]

    key2index_len = {'applist': appsnum+1, 'new_tag': tags_nums}
    varlen_features = [VarLenSparseFeat('%s' % i, vocabulary_size=key2index_len[i], maxlen=k, embedding_dim=8, combiner='mean',weight_name=None) for i
                       in trainmodel.var_features]

    dnn_feature_columns = fixlen_feature_columns + varlen_features
    linear_feature_columns = fixlen_feature_columns + varlen_features + lgbOut_feature_columns

    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

    sparse_dense_features = trainmodel.sparse_features + trainmodel.dense_features + trainmodel.lgbOut_Features

    train, test = train_test_split(data, test_size=0.2)


    train_model_input = {name: train[name] for name in sparse_dense_features}
    test_model_input = {name: test[name] for name in sparse_dense_features}
    for x in trainmodel.var_features:
        if x == 'applist':
            train_model_input[x] = np.array(train[x].tolist())
            test_model_input[x] = np.array(test[x].tolist())
        if x == 'new_tag':
            train_model_input[x] = np.array(train[x].tolist())-appsnum
            test_model_input[x] = np.array(test[x].tolist())-appsnum
    # model
    model = DeepFM(linear_feature_columns=linear_feature_columns, dnn_feature_columns=dnn_feature_columns,
                   dnn_hidden_units=(50, 30, 30), l2_reg_linear=0.001, l2_reg_embedding=0.001,
                   l2_reg_dnn=0, init_std=0.0001, seed=1024, dnn_dropout=0.1, dnn_activation='relu', dnn_use_bn=True,
                   task='binary')
    model.compile("adam", "binary_crossentropy",metrics=['AUC'], )

    history = model.fit(train_model_input, train['target'].values,
                        batch_size=256, epochs=1, verbose=2, validation_split=0.2, )

    pred_ans = model.predict(test_model_input, batch_size=256)
    print("test AUC", round(roc_auc_score(test['target'].values, pred_ans), 4))
Code example #8
    def load_stats(self):
        fixlen_feature_columns = [SparseFeat(feat, self.cat_meta[feat])
                           for feat in self.sparse_features] + [DenseFeat(feat, 1,)
                          for feat in self.dense_features]

        self.dnn_feature_columns = fixlen_feature_columns
        self.linear_feature_columns = fixlen_feature_columns

        self.fixlen_feature_names = get_fixlen_feature_names(self.linear_feature_columns + self.dnn_feature_columns)
Code example #9
File: utils.py (project: zzAlpha/DeepCTR)
def get_test_data(sample_size=1000,
                  sparse_feature_num=1,
                  dense_feature_num=1,
                  sequence_feature=('sum', 'mean', 'max'),
                  classification=True,
                  include_length=False,
                  hash_flag=False,
                  prefix=''):

    feature_columns = []

    for i in range(sparse_feature_num):
        dim = np.random.randint(1, 10)
        feature_columns.append(
            SparseFeat(prefix + 'sparse_feature_' + str(i), dim, hash_flag,
                       tf.int32))
    for i in range(dense_feature_num):
        feature_columns.append(
            DenseFeat(prefix + 'dense_feature_' + str(i), 1, tf.float32))
    for i, mode in enumerate(sequence_feature):
        dim = np.random.randint(1, 10)
        maxlen = np.random.randint(1, 10)
        feature_columns.append(
            VarLenSparseFeat(prefix + 'sequence_' + str(i), dim, maxlen, mode))

    model_input = []
    sequence_input = []
    sequence_len_input = []
    for fc in feature_columns:
        if isinstance(fc, SparseFeat):
            model_input.append(np.random.randint(0, fc.dimension, sample_size))
        elif isinstance(fc, DenseFeat):
            model_input.append(np.random.random(sample_size))
        else:
            s_input, s_len_input = gen_sequence(fc.dimension, fc.maxlen,
                                                sample_size)
            sequence_input.append(s_input)
            sequence_len_input.append(s_len_input)

    if classification:
        y = np.random.randint(0, 2, sample_size)
    else:
        y = np.random.random(sample_size)

    x = model_input + sequence_input
    if include_length:
        for i, mode in enumerate(sequence_feature):
            dim = np.random.randint(1, 10)
            maxlen = np.random.randint(1, 10)
            feature_columns.append(
                SparseFeat(prefix + 'sequence_' + str(i) + '_seq_length',
                           1,
                           embedding=False))

        x += sequence_len_input

    return x, y, feature_columns
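
gen_sequence is not defined in these excerpts; a plausible minimal implementation, consistent with how this example and the next one call it, returns random padded sequences together with their true lengths:

import numpy as np

def gen_sequence(dim, max_len, sample_size):
    # one random id sequence of length max_len per sample, ids drawn from [0, dim)
    seqs = np.array([np.random.randint(0, dim, max_len) for _ in range(sample_size)])
    # the "real" length of each sequence, in [1, max_len]
    lengths = np.random.randint(1, max_len + 1, sample_size)
    return seqs, lengths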
Code example #10
def get_test_data(sample_size=1000, embedding_size=4, sparse_feature_num=1, dense_feature_num=1,
                  sequence_feature=['sum', 'mean', 'max', 'weight'], classification=True, include_length=False,
                  hash_flag=False, prefix='', use_group=False):
    feature_columns = []
    model_input = {}
    # copy so that popping 'weight' below does not mutate the shared default list
    sequence_feature = list(sequence_feature)

    if 'weight' in sequence_feature:
        feature_columns.append(
            VarLenSparseFeat(SparseFeat(prefix + "weighted_seq", vocabulary_size=2, embedding_dim=embedding_size),
                             maxlen=3, length_name=prefix + "weighted_seq" + "_seq_length",
                             weight_name=prefix + "weight"))
        s_input, s_len_input = gen_sequence(
            2, 3, sample_size)

        model_input[prefix + "weighted_seq"] = s_input
        model_input[prefix + 'weight'] = np.random.randn(sample_size, 3, 1)
        model_input[prefix + "weighted_seq" + "_seq_length"] = s_len_input
        sequence_feature.pop(sequence_feature.index('weight'))

    for i in range(sparse_feature_num):
        if use_group:
            group_name = str(i%3)
        else:
            group_name = DEFAULT_GROUP_NAME
        dim = np.random.randint(1, 10)
        feature_columns.append(
            SparseFeat(prefix + 'sparse_feature_' + str(i), dim, embedding_size, use_hash=hash_flag, dtype=tf.int32,group_name=group_name))

    for i in range(dense_feature_num):
        feature_columns.append(DenseFeat(prefix + 'dense_feature_' + str(i), 1, dtype=tf.float32))
    for i, mode in enumerate(sequence_feature):
        dim = np.random.randint(1, 10)
        maxlen = np.random.randint(1, 10)
        feature_columns.append(
            VarLenSparseFeat(SparseFeat(prefix + 'sequence_' + mode, vocabulary_size=dim, embedding_dim=embedding_size),
                             maxlen=maxlen, combiner=mode))

    for fc in feature_columns:
        if isinstance(fc, SparseFeat):
            model_input[fc.name] = np.random.randint(0, fc.vocabulary_size, sample_size)
        elif isinstance(fc, DenseFeat):
            model_input[fc.name] = np.random.random(sample_size)
        else:
            s_input, s_len_input = gen_sequence(
                fc.vocabulary_size, fc.maxlen, sample_size)
            model_input[fc.name] = s_input
            if include_length:
                fc.length_name = prefix + "sequence_" + str(i) + '_seq_length'
                model_input[prefix + "sequence_" + str(i) + '_seq_length'] = s_len_input

    if classification:
        y = np.random.randint(0, 2, sample_size)
    else:
        y = np.random.random(sample_size)

    return model_input, y, feature_columns
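
A hedged sketch of how a model test would typically consume this helper (the DeepFM choice is illustrative; any DeepCTR model that takes linear and DNN feature columns would do):

from deepctr.models import DeepFM

x, y, feature_columns = get_test_data(sample_size=100, sparse_feature_num=2,
                                      dense_feature_num=2)
model = DeepFM(feature_columns, feature_columns, task='binary')
model.compile('adam', 'binary_crossentropy')
model.fit(x, y, batch_size=32, epochs=1)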
Code example #11
File: DIN_test.py (project: zhuxb/DeepCTR)
def get_xy_fd(hash_flag=False):
    # feature_dim_dict = {"sparse": [SingleFeat('user', 3, hash_flag), SingleFeat(
    #     'gender', 2, hash_flag), SingleFeat('item', 3 + 1, hash_flag), SingleFeat('item_gender', 2 + 1, hash_flag)],
    #                     "dense": [SingleFeat('score', 0)]}

    feature_columns = [
        SparseFeat('user', 3),
        SparseFeat('gender', 2),
        SparseFeat('item', 3 + 1),
        SparseFeat('item_gender', 2 + 1),
        DenseFeat('score', 1)  # scalar dense feature (the old SingleFeat API used 0 here)
    ]
    feature_columns += [
        VarLenSparseFeat('hist_item', 3 + 1, maxlen=4, embedding_name='item'),
        VarLenSparseFeat('hist_item_gender',
                         3 + 1,
                         maxlen=4,
                         embedding_name='item_gender')
    ]

    behavior_feature_list = ["item", "item_gender"]
    uid = np.array([0, 1, 2])
    ugender = np.array([0, 1, 0])
    iid = np.array([1, 2, 3])  # 0 is mask value
    igender = np.array([1, 2, 1])  # 0 is mask value
    score = np.array([0.1, 0.2, 0.3])

    hist_iid = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
    hist_igender = np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]])

    feature_dict = {
        'user': uid,
        'gender': ugender,
        'item': iid,
        'item_gender': igender,
        'hist_item': hist_iid,
        'hist_item_gender': hist_igender,
        'score': score
    }

    feature_names = get_fixlen_feature_names(feature_columns)
    varlen_feature_names = get_varlen_feature_names(feature_columns)
    x = [feature_dict[name] for name in feature_names
         ] + [feature_dict[name] for name in varlen_feature_names]

    # x = [feature_dict[feat.name] for feat in feature_dim_dict["sparse"]] + [feature_dict[feat.name] for feat in
    #                                                                         feature_dim_dict["dense"]] + [
    #         feature_dict['hist_' + feat] for feat in behavior_feature_list]

    y = [1, 0, 1]
    return x, y, feature_columns, behavior_feature_list
Code example #12
File: sample_din.py (project: zls0222/predict)
def get_xy_fd():
    # fixed-length sparse features
    feature_columns = [
        SparseFeat('user', 3, embedding_dim=10),
        SparseFeat('gender', 2, embedding_dim=4),
        SparseFeat('item_id', 3 + 1, embedding_dim=8),
        SparseFeat('cate_id', 2 + 1, embedding_dim=4),
        DenseFeat('pay_score', 1)
    ]
    # variable-length sparse features
    feature_columns += [
        VarLenSparseFeat(SparseFeat('hist_item_id',
                                    vocabulary_size=3 + 1,
                                    embedding_dim=8,
                                    embedding_name='item_id'),
                         maxlen=4),
        VarLenSparseFeat(SparseFeat('hist_cate_id',
                                    2 + 1,
                                    embedding_dim=4,
                                    embedding_name='cate_id'),
                         maxlen=4)
    ]

    behavior_feature_list = ["item", "cate_id"]
    uid = np.array([0, 1, 2])
    ugender = np.array([0, 1, 0])
    iid = np.array([1, 2, 3])  # 0 is mask value
    cate_id = np.array([1, 2, 1])  # 0 is mask value
    pay_score = np.array([0.1, 0.2, 0.3])

    hist_iid = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
    hist_cate_id = np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]])

    feature_dict = {
        'user': uid,
        'gender': ugender,
        'item_id': iid,
        'cate_id': cate_id,
        'hist_item_id': hist_iid,
        'hist_cate_id': hist_cate_id,
        'pay_score': pay_score
    }
    x = {
        name: feature_dict[name]
        for name in get_feature_names(feature_columns)
    }
    print('x=', x)
    y = [1, 0, 1]
    return x, y, feature_columns, behavior_feature_list
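
A training sketch using the returned values, following DeepCTR's documented DIN usage (the model import and hyperparameters are assumptions on top of the original excerpt):

import numpy as np
from deepctr.models import DIN

x, y, feature_columns, behavior_feature_list = get_xy_fd()
model = DIN(feature_columns, behavior_feature_list)
model.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy'])
history = model.fit(x, np.array(y), verbose=1, epochs=10, validation_split=0.5)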
Code example #13
File: DIEN_test.py (project: zhuxb/DeepCTR)
def get_xy_fd(use_neg=False, hash_flag=False):

    feature_columns = [SparseFeat('user', 3,hash_flag),
                       SparseFeat('gender', 2,hash_flag),
                       SparseFeat('item', 3+1,hash_flag),
                       SparseFeat('item_gender', 2+1,hash_flag),
                       DenseFeat('score', 1)]

    feature_columns += [VarLenSparseFeat('hist_item',3+1, maxlen=4, embedding_name='item'),
                        VarLenSparseFeat('hist_item_gender',3+1, maxlen=4, embedding_name='item_gender')]

    behavior_feature_list = ["item","item_gender"]
    uid = np.array([0, 1, 2])
    ugender = np.array([0, 1, 0])
    iid = np.array([1, 2, 3])#0 is mask value
    igender = np.array([1, 2, 1])# 0 is mask value
    score = np.array([0.1, 0.2, 0.3])

    hist_iid = np.array([[ 1, 2, 3,0], [ 1, 2, 3,0], [ 1, 2, 0,0]])
    hist_igender = np.array([[1, 1, 2,0 ], [2, 1, 1, 0], [2, 1, 0, 0]])

    behavior_length = np.array([3,3,2])

    feature_dict = {'user': uid, 'gender': ugender, 'item': iid, 'item_gender': igender,
                    'hist_item': hist_iid, 'hist_item_gender': hist_igender,
                    'score': score}

    #x = [feature_dict[feat.name] for feat in feature_dim_dict["sparse"]] + [feature_dict[feat.name] for feat in
    #                                                                        feature_dim_dict["dense"]] + [
    #        feature_dict['hist_' + feat] for feat in behavior_feature_list]


    if use_neg:
        feature_dict['neg_hist_item'] = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
        feature_dict['neg_hist_item_gender'] = np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]])
        feature_columns += [VarLenSparseFeat('neg_hist_item',3+1, maxlen=4, embedding_name='item'),
                        VarLenSparseFeat('neg_hist_item_gender',3+1, maxlen=4, embedding_name='item_gender')]
        #x += [feature_dict['neg_hist_'+feat] for feat in behavior_feature_list]


    feature_names = get_fixlen_feature_names(feature_columns)
    varlen_feature_names = get_varlen_feature_names(feature_columns)
    print(varlen_feature_names)
    x = [feature_dict[name] for name in feature_names] + [feature_dict[name] for name in varlen_feature_names]

    x += [behavior_length]
    y = [1, 0, 1]
    print(len(x))
    return x, y, feature_columns, behavior_feature_list
Code example #14
def main():

    Use_SF = False
    # sys.argv[0] is the script name, so the flag is the first real argument
    if len(sys.argv) > 1 and sys.argv[1] == 'SF':
        Use_SF = True

    train, vali, test = GetFeatures(Use_SF)

    feature_count = []
    for feat in sparse_features:
        print("Fitting {}".format(feat))
        labels = {}
        for x in train[feat]:
            if x not in labels:
                labels[x] = len(labels) + 1
        print("Transforming {}".format(feat))
        for df in [train, vali, test]:
            df[feat] = df[feat].map(lambda x: labels.get(x, 0))
        feature_count.append(len(labels) + 1)

    sparse_feature_columns = [
        SparseFeat(f, f_c) for f, f_c in zip(sparse_features, feature_count)
    ]
    dense_feature_columns = [DenseFeat(f, 1) for f in dense_features]
    fixlen_feature_columns = sparse_feature_columns + dense_feature_columns
    dnn_feature_columns = fixlen_feature_columns
    linear_feature_columns = fixlen_feature_columns

    fixlen_feature_names = get_fixlen_feature_names(linear_feature_columns +
                                                    dnn_feature_columns)

    train_model_input = [train[name] for name in fixlen_feature_names]
    vali_model_input = [vali[name] for name in fixlen_feature_names]
    test_model_input = [test[name] for name in fixlen_feature_names]

    def eval(target):
        model, history = model_generate(train_model_input, train[[target]],
                                        vali_model_input, vali[[target]],
                                        linear_feature_columns,
                                        dnn_feature_columns)
        pred_ans = model.predict(test_model_input, batch_size=256)
        print(target + " test LogLoss",
              round(log_loss(test[target].values, pred_ans), 4))
        print(target + " test AUC",
              round(roc_auc_score(test[target].values, pred_ans), 4))

    for target in targets:
        eval(target)
Code example #15
File: DIN_test.py (project: Puzz1eX/HFCN)
def get_xy_fd(hash_flag=False):

    feature_columns = [
        SparseFeat('user', 3),
        SparseFeat('gender', 2),
        SparseFeat('item', 3 + 1),
        SparseFeat('item_gender', 2 + 1),
        DenseFeat('score', 1)
    ]
    feature_columns += [
        VarLenSparseFeat(SparseFeat('hist_item',
                                    vocabulary_size=3 + 1,
                                    embedding_dim=8,
                                    embedding_name='item'),
                         maxlen=4),
        VarLenSparseFeat(SparseFeat('hist_item_gender',
                                    2 + 1,
                                    embedding_dim=4,
                                    embedding_name='item_gender'),
                         maxlen=4)
    ]

    behavior_feature_list = ["item", "item_gender"]
    uid = np.array([0, 1, 2])
    ugender = np.array([0, 1, 0])
    iid = np.array([1, 2, 3])  # 0 is mask value
    igender = np.array([1, 2, 1])  # 0 is mask value
    score = np.array([0.1, 0.2, 0.3])

    hist_iid = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
    hist_igender = np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]])

    feature_dict = {
        'user': uid,
        'gender': ugender,
        'item': iid,
        'item_gender': igender,
        'hist_item': hist_iid,
        'hist_item_gender': hist_igender,
        'score': score
    }

    feature_names = get_feature_names(feature_columns)
    x = {name: feature_dict[name] for name in feature_names}
    y = [1, 0, 1]
    return x, y, feature_columns, behavior_feature_list
Code example #16
File: run_din.py (project: liJay010/Ai_projects)
def get_xy_fd():

    feature_columns = [
        SparseFeat('user', 3),
        SparseFeat('gender', 2),
        SparseFeat('item', 3 + 1),
        SparseFeat('item_gender', 2 + 1),
        DenseFeat('score', 1)
    ]
    feature_columns += [
        VarLenSparseFeat('hist_item', 3 + 1, maxlen=4, embedding_name='item'),
        VarLenSparseFeat('hist_item_gender',
                         3 + 1,
                         maxlen=4,
                         embedding_name='item_gender')
    ]

    behavior_feature_list = ["item", "item_gender"]
    uid = np.array([0, 1, 2])
    ugender = np.array([0, 1, 0])
    iid = np.array([1, 2, 3])  # 0 is mask value
    igender = np.array([1, 2, 1])  # 0 is mask value
    score = np.array([0.1, 0.2, 0.3])

    hist_iid = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
    hist_igender = np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]])

    feature_dict = {
        'user': uid,
        'gender': ugender,
        'item': iid,
        'item_gender': igender,
        'hist_item': hist_iid,
        'hist_item_gender': hist_igender,
        'score': score
    }

    fixlen_feature_names = get_fixlen_feature_names(feature_columns)
    varlen_feature_names = get_varlen_feature_names(feature_columns)
    x = [feature_dict[name] for name in fixlen_feature_names
         ] + [feature_dict[name] for name in varlen_feature_names]

    y = [1, 0, 1]
    return x, y, feature_columns, behavior_feature_list
Code example #17
def get_xy_fd(use_neg=False, hash_flag=False):

    feature_columns = [SparseFeat('user', 3,hash_flag),
                       SparseFeat('gender', 2,hash_flag),
                       SparseFeat('item', 3+1,hash_flag),
                       SparseFeat('item_gender', 2+1,hash_flag),
                       DenseFeat('score', 1)]

    feature_columns += [VarLenSparseFeat('hist_item',3+1, maxlen=4, embedding_name='item'),
                        VarLenSparseFeat('hist_item_gender',3+1, maxlen=4, embedding_name='item_gender')]

    behavior_feature_list = ["item","item_gender"]
    uid = np.array([0, 1, 2])
    ugender = np.array([0, 1, 0])
    iid = np.array([1, 2, 3])#0 is mask value
    igender = np.array([1, 2, 1])# 0 is mask value
    score = np.array([0.1, 0.2, 0.3])

    hist_iid = np.array([[ 1, 2, 3,0], [ 1, 2, 3,0], [ 1, 2, 0,0]])
    hist_igender = np.array([[1, 1, 2,0 ], [2, 1, 1, 0], [2, 1, 0, 0]])

    behavior_length = np.array([3,3,2])

    feature_dict = {'user': uid, 'gender': ugender, 'item': iid, 'item_gender': igender,
                    'hist_item': hist_iid, 'hist_item_gender': hist_igender,
                    'score': score}

    if use_neg:
        feature_dict['neg_hist_item'] = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
        feature_dict['neg_hist_item_gender'] = np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]])
        feature_columns += [VarLenSparseFeat('neg_hist_item',3+1, maxlen=4, embedding_name='item'),
                        VarLenSparseFeat('neg_hist_item_gender',3+1, maxlen=4, embedding_name='item_gender')]

    x = {name:feature_dict[name] for name in get_feature_names(feature_columns)}
    x["seq_length"] = behavior_length
    y = [1, 0, 1]
    return x, y, feature_columns, behavior_feature_list
Code example #18
def client_restful_criteo():
    data = pd.read_csv('./data/criteo_sample.txt')

    sparse_features = ['C' + str(i) for i in range(1, 27)]
    dense_features = ['I' + str(i) for i in range(1, 14)]

    data[sparse_features] = data[sparse_features].fillna('-1', )
    data[dense_features] = data[dense_features].fillna(0, )
    target = ['label']

    # 1. Label-encode the sparse features, and apply a simple transformation to the dense features
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
    mms = MinMaxScaler(feature_range=(0, 1))
    data[dense_features] = mms.fit_transform(data[dense_features])

    # 2. Count the unique values of each sparse field, and record the dense feature field names

    fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=data[feat].nunique(),embedding_dim=4)
                           for i,feat in enumerate(sparse_features)] + [DenseFeat(feat, 1,)
                          for feat in dense_features]

    dnn_feature_columns = fixlen_feature_columns
    linear_feature_columns = fixlen_feature_columns

    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)
    
    # model_input = [data[name].iloc[0] for name in feature_names]
    # model_input = [{name:data[name].iloc[0]} for name in feature_names]
    model_input = [{name:data[name].iloc[0] for name in feature_names}]
    print(model_input)
    data = json.dumps({"signature_name": "serving_default", "instances": model_input}, cls=NpEncoder)
    headers = {"content-type": "application/json"}
    json_response = requests.post('http://localhost:8501/v1/models/criteo:predict', data=data, headers=headers)
    json_response = json.loads(json_response.text)
    print(json_response)
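
NpEncoder is referenced above but not shown; a common minimal implementation (an assumption, not necessarily this project's exact class) makes numpy types serializable for json.dumps:

import json
import numpy as np

class NpEncoder(json.JSONEncoder):
    def default(self, obj):
        # convert numpy scalars and arrays into plain Python types
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)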
Code example #19
    data[dense_features] = data[dense_features].fillna(0, )
    target = ['label']

    # 1. Label-encode the sparse features, and apply a simple transformation to the dense features
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
    mms = MinMaxScaler(feature_range=(0, 1))
    data[dense_features] = mms.fit_transform(data[dense_features])

    # 2. Count the unique values of each sparse field, and record the dense feature field names

    fixlen_feature_columns = [
        SparseFeat(feat, data[feat].nunique()) for feat in sparse_features
    ] + [DenseFeat(
        feat,
        1,
    ) for feat in dense_features]

    dnn_feature_columns = fixlen_feature_columns
    linear_feature_columns = fixlen_feature_columns

    feature_names = get_feature_names(linear_feature_columns +
                                      dnn_feature_columns)

    # 3. Generate the input data for the model

    train, test = train_test_split(data, test_size=0.2)

    train_model_input = {name: train[name] for name in feature_names}
    test_model_input = {name: test[name] for name in feature_names}
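
A plausible continuation of this excerpt, mirroring DeepCTR's standard Criteo walkthrough (the excerpt ends before the model is built, so the DeepFM choice and hyperparameters are assumptions):

from deepctr.models import DeepFM

model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary')
model.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy'])
history = model.fit(train_model_input, train[target].values,
                    batch_size=256, epochs=10, verbose=2, validation_split=0.2)
pred_ans = model.predict(test_model_input, batch_size=256)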
Code example #20
File: run_dien.py (project: Puzz1eX/HFCN)
def get_xy_fd(use_neg=False, hash_flag=False):
    feature_columns = [
        SparseFeat('user', 3, embedding_dim=10, use_hash=hash_flag),
        SparseFeat('gender', 2, embedding_dim=4, use_hash=hash_flag),
        SparseFeat('item_id', 3 + 1, embedding_dim=8, use_hash=hash_flag),
        SparseFeat('cate_id', 2 + 1, embedding_dim=4, use_hash=hash_flag),
        DenseFeat('pay_score', 1)
    ]

    feature_columns += [
        VarLenSparseFeat(SparseFeat('hist_item_id',
                                    vocabulary_size=3 + 1,
                                    embedding_dim=8,
                                    embedding_name='item_id'),
                         maxlen=4,
                         length_name="seq_length"),
        VarLenSparseFeat(SparseFeat('hist_cate_id',
                                    2 + 1,
                                    embedding_dim=4,
                                    embedding_name='cate_id'),
                         maxlen=4,
                         length_name="seq_length")
    ]

    behavior_feature_list = ["item_id", "cate_id"]
    uid = np.array([0, 1, 2])
    ugender = np.array([0, 1, 0])
    iid = np.array([1, 2, 3])  # 0 is mask value
    cate_id = np.array([1, 2, 2])  # 0 is mask value
    score = np.array([0.1, 0.2, 0.3])

    hist_iid = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
    hist_cate_id = np.array([[1, 2, 2, 0], [1, 2, 2, 0], [1, 2, 0, 0]])

    behavior_length = np.array([3, 3, 2])

    feature_dict = {
        'user': uid,
        'gender': ugender,
        'item_id': iid,
        'cate_id': cate_id,
        'hist_item_id': hist_iid,
        'hist_cate_id': hist_cate_id,
        'pay_score': score,
        "seq_length": behavior_length
    }

    if use_neg:
        feature_dict['neg_hist_item_id'] = np.array([[1, 2, 3,
                                                      0], [1, 2, 3, 0],
                                                     [1, 2, 0, 0]])
        feature_dict['neg_hist_cate_id'] = np.array([[1, 2, 2,
                                                      0], [1, 2, 2, 0],
                                                     [1, 2, 0, 0]])
        feature_columns += [
            VarLenSparseFeat(SparseFeat('neg_hist_item_id',
                                        vocabulary_size=3 + 1,
                                        embedding_dim=8,
                                        embedding_name='item_id'),
                             maxlen=4,
                             length_name="seq_length"),
            VarLenSparseFeat(SparseFeat('neg_hist_cate_id',
                                        2 + 1,
                                        embedding_dim=4,
                                        embedding_name='cate_id'),
                             maxlen=4,
                             length_name="seq_length")
        ]

    x = {
        name: feature_dict[name]
        for name in get_feature_names(feature_columns)
    }
    y = [1, 0, 1]
    return x, y, feature_columns, behavior_feature_list
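
A hedged training sketch for this data, following DeepCTR's documented DIEN usage (negative sampling enabled to exercise the neg_hist_* branch above):

import numpy as np
from deepctr.models import DIEN

x, y, feature_columns, behavior_feature_list = get_xy_fd(use_neg=True)
model = DIEN(feature_columns, behavior_feature_list,
             gru_type="AUGRU", use_negsampling=True)
model.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy'])
model.fit(x, np.array(y), verbose=1, epochs=1, validation_split=0.5)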
Code example #21
def main(args, local):
    
    if args.arch == 'xDeepFM' and args.mode == 'train':
        s = time.time()
        csv_file = os.path.join(DATASET_PATH, 'train', 'train_data', 'train_data')
        item = pd.read_csv(csv_file,
                    dtype={
                        'article_id': str,
                        'hh': int, 'gender': str,
                        'age_range': str,
                        'read_article_ids': str
                    }, sep='\t')
        label_data_path = os.path.join(DATASET_PATH, 'train',
                                os.path.basename(os.path.normpath(csv_file)).split('_')[0] + '_label')
        label = pd.read_csv(label_data_path,
                    dtype={'label': int},
                    sep='\t')
        item['label']  = label
        
        sparse_features = ['article_id', 'hh','gender','age_range','len_bin']
        dense_features = ['image_feature']
        target = ['label']
        
        
        len_lis = []

        read_article_ids_all = item['read_article_ids'].tolist()
        for i in range(len(item)):
            li = read_article_ids_all[i]
            if type(li) == float:
                len_lis.append(0)
                continue
            len_li = len(li.split(','))
            len_lis.append(len_li)
        
        
        item['len']  = len_lis
        item['len_bin']  = pd.qcut(item['len'],6,duplicates='drop')
    
        id_to_artic = dict()
        artics = item['article_id'].tolist()
        
        with open(os.path.join(DATASET_PATH, 'train', 'train_data', 'train_image_features.pkl'), 'rb') as handle:
            image_feature_dict = pickle.load(handle)
        for feat in sparse_features:
            lbe = LabelEncoder()
            item[feat] = lbe.fit_transform(item[feat])
        fixlen_feature_columns = [SparseFeat(feat, item[feat].nunique()) for feat in sparse_features]
        fixlen_feature_columns += [DenseFeat(feat,len(image_feature_dict[artics[0]])) for feat in dense_features]
        
        
        
        idx_artics_all = item['article_id'].tolist()
        
        for i in range(len(artics)):
            idx_artic = idx_artics_all[i]
            if idx_artic not in id_to_artic.keys():
                id_to_artic[idx_artic] = artics[i]
        
       
            # we can fetch this via image_feature_dict[article_id], so skip it for now
        linear_feature_columns = fixlen_feature_columns
        dnn_feature_columns = fixlen_feature_columns  
        fixlen_feature_names = get_fixlen_feature_names(linear_feature_columns + dnn_feature_columns)
        print(fixlen_feature_names)
        global fixlen_feature_names_global
        fixlen_feature_names_global = fixlen_feature_names
        model = xDeepFM(linear_feature_columns, dnn_feature_columns, task= 'binary')
        print('---model defined---')
        # TODO: also write code to save the generated files, since we can't rerun this every time
        print(time.time() - s ,'seconds')


    if use_nsml and args.mode == 'train':

        bind_nsml(model,[], args.task)
    
    
    if args.mode == 'test':
        print('_infer root - : ', DATASET_PATH)
        print('test')
        model, fixlen_feature_names_global, item, image_feature_dict, id_to_artic = get_item(DATASET_PATH)
        bind_nsml(model, [], args.task)
        checkpoint_session = ['401','team_62/airush2/176']
        nsml.load(checkpoint = str(checkpoint_session[0]), session = str(checkpoint_session[1])) 
        print('successfully loaded')

    if (args.mode == 'train'):
        if args.dry_run:
            print('start dry-running...!')
            args.num_epochs = 1
        else:
            print('start training...!')
        # build everything up front; no real need for a generator here

        nsml.save('infer')
        print('end')
    print('end_main')

    if args.pause:
        nsml.paused(scope=local)
Code example #22
def get_item(root):
    print('load')
    csv_file = os.path.join(root, 'test', 'test_data', 'test_data')
    item = pd.read_csv(csv_file,
                dtype={
                    'article_id': str,
                    'hh': int, 'gender': str,
                    'age_range': str,
                    'read_article_ids': str
                }, sep='\t')
    print('loaded!!')
    sparse_features = ['article_id', 'hh','gender','age_range','len_bin']
    dense_features = ['image_feature']
    target = ['label']

    len_lis = []

    read_article_ids_all = item['read_article_ids'].tolist()
    for i in range(len(item)):
        li = read_article_ids_all[i]
        if type(li) == float:
            len_lis.append(0)
            continue
        len_li = len(li.split(','))
        len_lis.append(len_li)
    
    
    item['len']  = len_lis
    item['len_bin']  = pd.qcut(item['len'],6,duplicates='drop')

    id_to_artic = dict()
    artics = item['article_id'].tolist()
    
    with open(os.path.join(DATASET_PATH, 'test', 'test_data', 'test_image_features.pkl'), 'rb') as handle:
        image_feature_dict = pickle.load(handle)

    print('image_feature_dict loaded..')
    for feat in sparse_features:
        lbe = LabelEncoder()
        item[feat] = lbe.fit_transform(item[feat])

    # could also build this from the test set, or from item..
    fixlen_feature_columns = []
    for feat in sparse_features:
        if feat == 'article_id':
            fixlen_feature_columns.append(SparseFeat(feat,1896))
        else:
            fixlen_feature_columns.append(SparseFeat(feat,item[feat].nunique()))
    #fixlen_feature_columns = [SparseFeat(feat, item[feat].nunique()) for feat in sparse_features]
    fixlen_feature_columns += [DenseFeat(feat,len(image_feature_dict[artics[0]])) for feat in dense_features]
    
    print(fixlen_feature_columns)
    
    
    idx_artics_all = item['article_id'].tolist()
    
    for i in range(len(artics)):
        idx_artic = idx_artics_all[i]
        if idx_artic not in id_to_artic.keys():
            id_to_artic[idx_artic] = artics[i]
    
    
       
    linear_feature_columns = fixlen_feature_columns
    dnn_feature_columns = fixlen_feature_columns  
    fixlen_feature_names = get_fixlen_feature_names(linear_feature_columns + dnn_feature_columns)
    
    fixlen_feature_names_global = fixlen_feature_names

    model = xDeepFM(linear_feature_columns, dnn_feature_columns, task= 'binary')
    #bind_nsml(model, list(), args.task)

    return model, fixlen_feature_names_global, item,image_feature_dict, id_to_artic
Code example #23
    # dense_feature_list = [SingleFeat(feat, 0)
    #                       for feat in dense_features]

    train = data[data['date'] <= 20190707]
    test = data[data['date'] == 20190708]

    train_labels = [train[target[0]].values, train[target[1]].values]
    test_labels = [test[target[0]].values, test[target[1]].values]

    sparse_feature_columns = [
        SparseFeat(feat, data[feat].nunique()) for feat in sparse_features
    ]
    dense_feature_columns = [DenseFeat(feat, 1) for feat in dense_features]

    # sparse_feature_columns = [SparseFeat(feat, dimension=int(1e6), use_hash=True) for feat in
    #                           sparse_features]  # The dimension can be set according to data
    # dense_feature_columns = [DenseFeat(feat, 1)
    #                          for feat in dense_features]

    dnn_feature_columns = sparse_feature_columns + dense_feature_columns
    linear_feature_columns = sparse_feature_columns + dense_feature_columns

    feature_names = get_fixlen_feature_names(linear_feature_columns +
                                             dnn_feature_columns)

    print(feature_names)
    train_model_input = [train[name] for name in feature_names]
Code example #24
def get_input(use_img=True, use_text=True, target='isclick'):

    sequence_feature_list = []
    sparse_feature_df = pd.read_csv(feat_columns_path +
                                    "cat_cols_selected.csv")
    dense_feature_df = pd.read_csv(feat_columns_path + "num_cols_selected.csv")
    emb_feat_list = pd.read_csv(
        feat_columns_path +
        "embeding_cols_selected.csv")["embeding_feature"].values.tolist()
    user_cols_list = pd.read_csv(
        feat_columns_path +
        "user_cols_selected.csv")["user_feature"].values.tolist()
    item_cols_list = pd.read_csv(
        feat_columns_path +
        "item_cols_selected.csv")["item_feature"].values.tolist()

    # user profile table
    train_user_feat_df = pd.read_csv(train_path_user)
    # item profile table
    train_item_feat_df = pd.read_csv(train_path_item, names=item_feat_cols)

    cat_feature_list = sparse_feature_df["cat_feature"].values.tolist()
    num_feature_list = list(
        set(dense_feature_df["num_feature"].values.tolist()))
    # category clustering has not been run yet; this list is hand-written and should be removed later
    cat_feature_list = ["user_id", "item_id"]

    data = pd.read_csv(feat_columns_path + "train.csv").iloc[:-1000]
    data["isclick"] = 1

    for missing_col in data.columns.tolist():
        if missing_col in num_feature_list:
            data[missing_col].fillna(data[missing_col].median(), inplace=True)
        elif missing_col in ['text_vd' + str(i) for i in range(128)
                             ] + ['img_vd' + str(i) for i in range(128)]:  # the original listed text_vd twice; img_vd was presumably intended
            data[missing_col].fillna(0, inplace=True)
            data[missing_col] = data[missing_col].apply(
                lambda x: 0 if x == "nan" or x == "null" else x)
    data[cat_feature_list] = data[cat_feature_list].apply(
        LabelEncoder().fit_transform)
    # data[sparse_feature_list].fillna(-1)
    feature_columns = []
    sparse_feature_list = [
        SparseFeat(cat_col, data[cat_col].nunique(), embedding_dim=10)
        for cat_col in cat_feature_list
    ]
    dense_feature_list = [
        DenseFeat(colname, 1) for colname in num_feature_list
    ]
    sequence_feature_list = []
    feature_columns = sparse_feature_list + dense_feature_list + sequence_feature_list

    test = data.iloc[-1000:]
    train = data.iloc[:-1000]
    train_size = len(train)


    train_model_input = [train[feat.name].values for feat in sparse_feature_list] + \
        [train[feat.name].values for feat in dense_feature_list]

    test_model_input = [test[feat.name].values for feat in sparse_feature_list] + \
        [test[feat.name].values for feat in dense_feature_list]

    if use_img:
        ad_cols = ['img_vd' + str(i) for i in range(128)]
        img_input = data[ad_cols].values
        train_model_input += [img_input[:train_size]]
        test_model_input += [img_input[train_size:]]
    if use_text:
        vd_cols = ['text_vd' + str(i) for i in range(128)]
        text_input = data[vd_cols].values
        train_model_input += [text_input[:train_size]]
        test_model_input += [text_input[train_size:]]

    train_labels, test_labels = train[target].values, test[target].values
    feature_dim_dict = {
        "sparse": sparse_feature_list,
        "dense": dense_feature_list,
        "sequence": sequence_feature_list
    }

    return feature_columns, train_model_input, train_labels, test_model_input, test_labels
Code example #25
target = ['finish', 'like']

for feat in sparse_features:
    lbe = LabelEncoder()
    data[feat] = lbe.fit_transform(data[feat])
mms = MinMaxScaler(feature_range=(0, 1))
data[dense_features] = mms.fit_transform(data[dense_features])

sparse_feature_columns = [
    SparseFeat(
        feat, data[feat].nunique()
    )  # (feature name, number of distinct values): builds a SparseFeat whose name is the feature name, dimension is the distinct-value count, and dtype is int32
    for feat in sparse_features
]
dense_feature_columns = [
    DenseFeat(feat, 1)  # (feature name, dimension == 1); data dtype == float32
    for feat in dense_features
]
dnn_feature_columns = sparse_feature_columns + dense_feature_columns
linear_feature_columns = sparse_feature_columns + dense_feature_columns

##['feature1','feature2',...]
feature_names = get_fixlen_feature_names(linear_feature_columns +
                                         dnn_feature_columns)

train, test = train_test_split(data, test_size=0.1)

train_model_input = [train[name] for name in feature_names]
test_model_input = [test[name] for name in feature_names]

features = build_input_features(linear_feature_columns + dnn_feature_columns)
Code example #26
        test[feat] = lbe.transform(test[feat])
    mms = MinMaxScaler(feature_range=(0, 1))
    mms.fit(train[dense_features])
    train[dense_features] = mms.transform(train[dense_features])

    # preprocess the sequence feature
    genres_key2index, train_genres_list, genres_maxlen = get_var_feature(
        train, 'genres')
    user_key2index, train_user_hist, user_maxlen = get_var_feature(
        train, 'user_hist')

    user_feature_columns = [
        SparseFeat(feat, data[feat].nunique(), embedding_dim=4)
        for i, feat in enumerate(user_sparse_features)
    ] + [DenseFeat(
        feat,
        1,
    ) for feat in user_dense_features]
    item_feature_columns = [
        SparseFeat(feat, data[feat].nunique(), embedding_dim=4, use_hash=True)
        for i, feat in enumerate(item_sparse_features)
    ] + [DenseFeat(
        feat,
        1,
    ) for feat in item_dense_features]

    item_varlen_feature_columns = [
        VarLenSparseFeat(SparseFeat('genres',
                                    vocabulary_size=1000,
                                    embedding_dim=4),
                         maxlen=genres_maxlen,
                         combiner='mean',
Code example #27
import numpy as np
from deepctr.models import DIN
from deepctr.inputs import SparseFeat,VarLenSparseFeat,DenseFeat,get_fixlen_feature_names,get_varlen_feature_names

feature_columns = [SparseFeat('user',3),SparseFeat(
    'gender', 2), SparseFeat('item', 3 + 1), SparseFeat('item_gender', 2 + 1),DenseFeat('score', 1)]
feature_columns += [VarLenSparseFeat('hist_item',3+1, maxlen=4, embedding_name='item'),
                    VarLenSparseFeat('hist_item_gender',3+1, maxlen=4, embedding_name='item_gender')]
behavior_feature_list = ["item", "item_gender"]
uid = np.array([0, 1, 2])
ugender = np.array([0, 1, 0])
iid = np.array([1, 2, 3])  # 0 is mask value
igender = np.array([1, 2, 1])  # 0 is mask value
score = np.array([0.1, 0.2, 0.3])

hist_iid = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
hist_igender = np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]])
feature_dict = {'user': uid, 'gender': ugender, 'item': iid, 'item_gender': igender,
                'hist_item': hist_iid, 'hist_item_gender': hist_igender, 'score': score}

fixlen_feature_names = get_fixlen_feature_names(feature_columns)
varlen_feature_names = get_varlen_feature_names(feature_columns)
x = [feature_dict[name] for name in fixlen_feature_names] + [feature_dict[name] for name in varlen_feature_names]

y = [1, 0, 1]

model = DIN(feature_columns, behavior_feature_list, hist_len_max=4, )
model.compile('adam', 'binary_crossentropy',
              metrics=['binary_crossentropy'])
history = model.fit(x, y, verbose=1, epochs=10, validation_split=0.5)
Code example #28
File: 2_gen_din_input.py (project: zhaoqingxin/cikm)
                    feature_list.append(
                        SparseFeat(feat, users_feature[feat].nunique() + 1))
                else:
                    feature_list.append(SparseFeat(feat, len(user_id_unique)))

            for feat in item_sparse_features:
                if feat != "item_id":
                    feature_list.append(
                        SparseFeat(feat, items_feature[feat].nunique() + 1))
                else:
                    feature_list.append(
                        SparseFeat(feat,
                                   len(item_id_unique) + 1))

            dense_feature_list = [
                DenseFeat(feat, 1) for feat in dense_features
            ]

            varLen_sparse_feature_list = [
                VarLenSparseFeat(feat, 11, maxlen=10)
                for feat in varLen_sparse_features
            ]

            sess_sparse_feature_list = [
                VarLenSparseFeat(feat,
                                 len(item_id_unique) + 1,
                                 maxlen=DIN_SESS_MAX_LEN,
                                 embedding_name='item_id')
                for feat in hist_feature
            ]
Code example #29
    train = data[data['date'] <= 20190707]
    test = data[data['date'] == 20190708]

    train_y_id = train['g_region_id'].values
    test_y_id = test['g_region_id'].values

    train_labels = [train[target[0]].values, train[target[1]].values]
    test_labels = [test[target[0]].values, test[target[1]].values]

    sparse_feature_columns = [SparseFeat(feat, data[feat].nunique())
                              for feat in sparse_features]
    dense_feature_columns = [DenseFeat(feat, 1)
                             for feat in dense_features]

    gate_feature_columns = [DenseFeat(feat, 1)
                             for feat in gate_features]

    # sparse_feature_columns = [SparseFeat(feat, dimension=int(1e6), use_hash=True) for feat in
    #                           sparse_features]  # The dimension can be set according to data
    # dense_feature_columns = [DenseFeat(feat, 1)
    #                          for feat in dense_features]

    dnn_feature_columns = sparse_feature_columns + dense_feature_columns
    linear_feature_columns = sparse_feature_columns + dense_feature_columns

    feature_names = get_fixlen_feature_names(linear_feature_columns + dnn_feature_columns + gate_feature_columns)
Code example #30
def run(data, ziel, line0, grid, loop):
    poi_feature_transfer = []
    print('++++', '\n', grid)
    for a in range(len(poi_feature)):
        poi_feature_transfer.append('poi_feature_%d' % a)
        data = data.rename(columns={poi_feature[a]: 'poi_feature_%d' % a})

    features = [
        'provname', 'prefname', 'cntyname', 'townname', 'villname', 'dispincm',
        'urbcode_1', 'hauslvl'
    ] + poi_feature_transfer  #
    sparse_features = []
    dense_features = []
    for f in features:
        if f not in x_category or x_category[f] == 1:
            dense_features.append(f)
        else:
            sparse_features.append(f)
    data[sparse_features] = data[sparse_features].fillna(-1)
    data[dense_features] = data[dense_features].fillna(0)

    y = []
    # ziel selects the target column (e.g. villmean or income)
    y_limit = [np.min(data[ziel]) - 1] + line0 + [np.max(data[ziel])]
    for index, row in data.iterrows():
        for i in range(1, len(y_limit)):
            if y_limit[i - 1] < row[ziel] <= y_limit[i]:
                y.append(i - 1)
                break
    data['income_0'] = y
    target = ['income_0']

    # 1. Label-encode the sparse features, and apply a simple transformation to the dense features
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
    mms = MinMaxScaler(feature_range=(0, 1))
    data[dense_features] = mms.fit_transform(data[dense_features])

    # 2. Count the unique values of each sparse field, and record the dense feature field names
    fixlen_feature_columns = [SparseFeat(feat, data[feat].nunique()) for feat in sparse_features] + \
                             [DenseFeat(feat, 1,)for feat in dense_features]
    dnn_feature_columns = fixlen_feature_columns
    linear_feature_columns = fixlen_feature_columns
    fixlen_feature_names = get_fixlen_feature_names(linear_feature_columns +
                                                    dnn_feature_columns)

    # 3. Generate the input data for the model
    train, test = train_test_split(data, test_size=0.2)

    # try to oversampling
    # (train_x,train_y)=over_sampling(train[features],train[ziel], 3)
    # train = (np.column_stack((train_x, train_y)))

    train_model_input = [train[name] for name in fixlen_feature_names]
    test_model_input = [test[name] for name in fixlen_feature_names]
    # 4. Define the model, then train, predict and evaluate ##############################################
    (models, model_names, xlabel) = model_gridsearch(linear_feature_columns,
                                                     dnn_feature_columns, grid)
    logloss, auc1, acc1, pre1, recall1, f11 = [], [], [], [], [], []
    print(ziel, line0, len(data))
    for name, model in zip(model_names, models):
        ll_avg, auc_avg = [], []
        for i in range(loop):
            model.compile("adam",
                          'binary_crossentropy',
                          metrics=['binary_crossentropy'])
            history = model.fit(
                train_model_input,
                train[target].values,
                batch_size=256,
                epochs=10,
                verbose=0,
                validation_split=0.2,
            )
            pred_ans = model.predict(test_model_input, batch_size=256)

            true = test[target].values
            '''
            f = open("pred.csv", 'a', encoding='utf_8_sig')
            f.write('%s\n'%(ziel))
            for i in range(len(pred_ans)):
                f.write('%s, %s\n' % (pred_ans[i],true[i] ))
            f.close()'''

            ll = round(log_loss(test[target].values, pred_ans), 4)
            auc = round(roc_auc_score(test[target].values, pred_ans), 4)
            #acc = round(accuracy_score(test[target].values, pred_ans.round()), 4)
            #pre = round(precision_score(test[target].values, pred_ans.round()), 4)
            #recall = round(recall_score(test[target].values, pred_ans.round()), 4)
            #f1 = round(f1_score(test[target].values, pred_ans.round(), average='weighted'),4)
            #spec = round(specificity_score(test[target].values, pred_ans.round(), average='weighted'),4)
            #sens = round(sensitivity_score(test[target].values, pred_ans.round(), average='weighted'),4)
            ll_avg.append(ll), auc_avg.append(auc)
        logloss.append(np.mean(ll_avg)), auc1.append(
            np.mean(auc_avg)
        )  #, acc1.append(acc), pre1.append(pre), recall1.append(recall), f11.append(f1)
        '''
        cm = confusion_matrix(test[target].values, pred_ans.round())
        cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        cm = []
        for m in range(len(line0)+1):
            cm.append([])
            for n in range(len(line0)+1):
                cm[m].append(round(cm_normalized[m][n],4))
        '''
        '''
        print(name)
        print("LogLoss", ll, end=' ')
        print("AUC", auc, end=' ')
        print("accuracy", acc, end=' ')
        #print("precision" , pre, end=' ')
        #print("recall", recall, end=' ')
        print("f1" , f1, end=' ')
        print("spec", spec, end=' ')
        print("sens" , sens, end=' ')
        print(cm)
        #f = open("DeepFM.csv", 'a', encoding='utf_8_sig')
        #f.write('%s,%s\n'%(ziel,line0))
        #f.write('%s, %s, %s, %s, %s, %s, %s,' % (name, ll, auc, acc, f1, spec, sens))
        #f.write('%s\n' % str(cm).replace(',',';'))
        #f.close()
        '''
    return (logloss, auc1, xlabel)