def generate_din_feature_columns(data, sparse_features, dense_features):
    feat_lbe_dict = get_glv('feat_lbe_dict')

    sparse_feature_columns = [
        SparseFeat(feat,
                   vocabulary_size=len(feat_lbe_dict[feat].classes_) + 1,
                   embedding_dim=EMBED_DIM)
        for feat in sparse_features if feat not in time_feat
    ]

    dense_feature_columns = [DenseFeat(
        feat,
        1,
    ) for feat in dense_features]

    var_feature_columns = [
        VarLenSparseFeat(SparseFeat(
            'hist_item_id',
            vocabulary_size=len(feat_lbe_dict['item_id'].classes_) + 1,
            embedding_dim=EMBED_DIM,
            embedding_name='item_id'),
                         maxlen=max_seq_len)
    ]

    # DNN side
    dnn_feature_columns = sparse_feature_columns + dense_feature_columns + var_feature_columns
    # FM side
    linear_feature_columns = sparse_feature_columns + dense_feature_columns + var_feature_columns
    # all feature names
    feature_names = get_feature_names(dnn_feature_columns +
                                      linear_feature_columns)

    return feature_names, linear_feature_columns, dnn_feature_columns
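The +1 on each vocabulary_size reserves one spare index, typically so 0 can serve as a padding/mask value for the hist_item_id sequence. get_glv, EMBED_DIM, max_seq_len and time_feat are module-level names defined elsewhere in the original project; the sketch below shows one plausible construction of the feat_lbe_dict global it reads (an assumption, not the original code):

# Plausible construction of the encoder dict returned by get_glv('feat_lbe_dict').
# `data` and `sparse_features` are hypothetical stand-ins here.
from sklearn.preprocessing import LabelEncoder

feat_lbe_dict = {}
for feat in sparse_features + ['item_id']:
    lbe = LabelEncoder()
    feat_lbe_dict[feat] = lbe.fit(data[feat])  # fitted encoders, keyed by feature name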
Example #2
def _preprocess_criteo(df, **kw):
    hash_feature = kw.get('hash_feature')
    sparse_col = ['C' + str(i) for i in range(1, 27)]
    dense_col = ['I' + str(i) for i in range(1, 14)]
    df[sparse_col] = df[sparse_col].fillna('-1', )
    df[dense_col] = df[dense_col].fillna(0, )
    target = ["label"]

    # set hashing space for each sparse field, and record dense feature field names
    if hash_feature:
        # Transformation for dense features
        mms = MinMaxScaler(feature_range=(0, 1))
        df[dense_col] = mms.fit_transform(df[dense_col])
        fixlen_cols = [SparseFeat(feat, vocabulary_size=1000, embedding_dim=4, use_hash=True,
                                  dtype='string')  # hashed lookup; dtype is string since raw inputs stay unencoded
                       for feat in sparse_col] + [DenseFeat(feat, 1, ) for feat in dense_col]

    else:
        for feat in sparse_col:
            lbe = LabelEncoder()
            df[feat] = lbe.fit_transform(df[feat])
        mms = MinMaxScaler(feature_range=(0, 1))
        df[dense_col] = mms.fit_transform(df[dense_col])
        fixlen_cols = [SparseFeat(feat, vocabulary_size=df[feat].nunique(), embedding_dim=4)
                       for feat in sparse_col] + [DenseFeat(feat, 1, ) for feat in dense_col]

    linear_cols = fixlen_cols
    dnn_cols = fixlen_cols
    train, test = train_test_split(df, test_size=kw['test_size'])

    return df, linear_cols, dnn_cols, train, test, target, test[target].values
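A hedged sketch of consuming the tuple returned by _preprocess_criteo; the DeepFM call and training arguments below are assumptions, not part of the original snippet:

# Usage sketch (assumes deepctr's DeepFM; `raw_df` is a hypothetical Criteo frame).
from deepctr.models import DeepFM
from deepctr.inputs import get_feature_names

df, linear_cols, dnn_cols, train, test, target, y_test = _preprocess_criteo(
    raw_df, hash_feature=False, test_size=0.2)

feature_names = get_feature_names(linear_cols + dnn_cols)
train_input = {name: train[name].values for name in feature_names}
test_input = {name: test[name].values for name in feature_names}

model = DeepFM(linear_cols, dnn_cols, task='binary')
model.compile('adam', 'binary_crossentropy', metrics=['AUC'])
model.fit(train_input, train[target].values, batch_size=256, epochs=1)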
Example #3
def test_long_dense_vector():

    feature_columns = [
        SparseFeat(
            'user_id',
            4,
        ),
        SparseFeat(
            'item_id',
            5,
        ),
        DenseFeat("pic_vec", 5)
    ]
    fixlen_feature_names = get_feature_names(feature_columns)

    user_id = np.array([[1], [0], [1]])
    item_id = np.array([[3], [2], [1]])
    pic_vec = np.array([[0.1, 0.5, 0.4, 0.3, 0.2], [0.1, 0.5, 0.4, 0.3, 0.2],
                        [0.1, 0.5, 0.4, 0.3, 0.2]])
    label = np.array([1, 0, 1])

    input_dict = {'user_id': user_id, 'item_id': item_id, 'pic_vec': pic_vec}
    model_input = [input_dict[name] for name in fixlen_feature_names]

    model = DeepFM(feature_columns, feature_columns[:-1])
    model.compile('adagrad', 'binary_crossentropy')
    model.fit(model_input, label)
Example #4
def get_xy_fd(hash_flag=False):

    feature_columns = [SparseFeat('user', 3, hash_flag),
                       SparseFeat('gender', 2, hash_flag),
                       SparseFeat('item', 3 + 1, hash_flag),
                       SparseFeat('item_gender', 2 + 1, hash_flag),
                       DenseFeat('score', 1)]
    feature_columns += [VarLenSparseFeat('sess_0_item', 3 + 1, 4, use_hash=hash_flag, embedding_name='item'),
                        VarLenSparseFeat('sess_0_item_gender', 2 + 1, 4, use_hash=hash_flag, embedding_name='item_gender')]
    feature_columns += [VarLenSparseFeat('sess_1_item', 3 + 1, 4, use_hash=hash_flag, embedding_name='item'),
                        VarLenSparseFeat('sess_1_item_gender', 2 + 1, 4, use_hash=hash_flag, embedding_name='item_gender')]

    behavior_feature_list = ["item", "item_gender"]
    uid = np.array([0, 1, 2])
    ugender = np.array([0, 1, 0])
    iid = np.array([1, 2, 3])  # 0 is mask value
    igender = np.array([1, 2, 1])  # 0 is mask value
    score = np.array([0.1, 0.2, 0.3])

    sess1_iid = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [0, 0, 0, 0]])
    sess1_igender = np.array([[1, 1, 2, 0], [2, 1, 1, 0], [0, 0, 0, 0]])

    sess2_iid = np.array([[1, 2, 3, 0], [0, 0, 0, 0], [0, 0, 0, 0]])
    sess2_igender = np.array([[1, 1, 2, 0], [0, 0, 0, 0], [0, 0, 0, 0]])

    sess_number = np.array([2, 1, 0])

    feature_dict = {'user': uid, 'gender': ugender, 'item': iid, 'item_gender': igender,
                    'sess_0_item': sess1_iid, 'sess_0_item_gender': sess1_igender, 'score': score,
                    'sess_1_item': sess2_iid, 'sess_1_item_gender': sess2_igender, }

    x = {name:feature_dict[name] for name in get_feature_names(feature_columns)}
    x["sess_length"] = sess_number
    y = [1, 0, 1]
    return x, y, feature_columns, behavior_feature_list
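These session features (sess_0_*, sess_1_*, plus the extra sess_length key) follow DSIN's input layout. A consumption sketch, mirroring the DSIN call that appears in Example #14 below; the exact keyword set varies across deepctr versions, so treat it as an assumption:

# Usage sketch (assumes an older deepctr where DSIN takes embedding_size directly).
import numpy as np
from deepctr.models import DSIN

x, y, feature_columns, behavior_feature_list = get_xy_fd(hash_flag=False)
model = DSIN(feature_columns, behavior_feature_list, sess_max_count=2,
             embedding_size=4, dnn_hidden_units=(8, 8))
model.compile('adam', 'binary_crossentropy')
model.fit(x, np.array(y), epochs=1, verbose=1)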
Example #5
def train_deepFM():
    k = featureengineer.k
    # Missing-value imputation and encoding
    data,appsnum, tags_nums = trainmodel.data,trainmodel.appsnum,trainmodel.tags_nums
    data[trainmodel.sparse_features] = data[trainmodel.sparse_features].fillna('-1', )
    for feat in trainmodel.dense_features:
        data[feat].fillna(data[feat].dropna().mean(), inplace=True)

    for feat in trainmodel.sparse_features:
        data[feat] = data[feat].astype(str)
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
    mms = MinMaxScaler(feature_range=(0, 1))
    data[trainmodel.dense_features] = mms.fit_transform(data[trainmodel.dense_features])


    # Convert data into feature-column format
    fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=data[feat].max() + 1, embedding_dim=8)
                              for feat in trainmodel.sparse_features] + \
                             [DenseFeat(feat, 1, ) for feat in trainmodel.dense_features]

    lgbOut_feature_columns = [SparseFeat(feat, vocabulary_size=data[feat].max() + 1, embedding_dim=1)
                              for feat in trainmodel.lgbOut_Features]

    key2index_len = {'applist': appsnum+1, 'new_tag': tags_nums}
    varlen_features = [VarLenSparseFeat(feat, vocabulary_size=key2index_len[feat], maxlen=k, embedding_dim=8,
                                        combiner='mean', weight_name=None)
                       for feat in trainmodel.var_features]

    dnn_feature_columns = fixlen_feature_columns + varlen_features
    linear_feature_columns = fixlen_feature_columns + varlen_features + lgbOut_feature_columns

    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

    sparse_dense_features = trainmodel.sparse_features + trainmodel.dense_features + trainmodel.lgbOut_Features

    train, test = train_test_split(data, test_size=0.2)


    train_model_input = {name: train[name] for name in sparse_dense_features}
    test_model_input = {name: test[name] for name in sparse_dense_features}
    for x in trainmodel.var_features:
        if x == 'applist':
            train_model_input[x] = np.array(train[x].tolist())
            test_model_input[x] = np.array(test[x].tolist())
        if x == 'new_tag':
            train_model_input[x] = np.array(train[x].tolist())-appsnum
            test_model_input[x] = np.array(test[x].tolist())-appsnum
    # Model
    model = DeepFM(linear_feature_columns=linear_feature_columns, dnn_feature_columns=dnn_feature_columns,
                   dnn_hidden_units=(50, 30, 30), l2_reg_linear=0.001, l2_reg_embedding=0.001,
                   l2_reg_dnn=0, init_std=0.0001, seed=1024, dnn_dropout=0.1, dnn_activation='relu', dnn_use_bn=True,
                   task='binary')
    model.compile("adam", "binary_crossentropy",metrics=['AUC'], )

    history = model.fit(train_model_input, train['target'].values,
                        batch_size=256, epochs=1, verbose=2, validation_split=0.2, )

    pred_ans = model.predict(test_model_input, batch_size=256)
    print("test AUC", round(roc_auc_score(test['target'].values, pred_ans), 4))
Example #6
def get_test_data(sample_size=1000,
                  sparse_feature_num=1,
                  dense_feature_num=1,
                  sequence_feature=('sum', 'mean', 'max'),
                  classification=True,
                  include_length=False,
                  hash_flag=False,
                  prefix=''):

    feature_columns = []

    for i in range(sparse_feature_num):
        dim = np.random.randint(1, 10)
        feature_columns.append(
            SparseFeat(prefix + 'sparse_feature_' + str(i), dim, hash_flag,
                       tf.int32))
    for i in range(dense_feature_num):
        feature_columns.append(
            DenseFeat(prefix + 'dense_feature_' + str(i), 1, tf.float32))
    for i, mode in enumerate(sequence_feature):
        dim = np.random.randint(1, 10)
        maxlen = np.random.randint(1, 10)
        feature_columns.append(
            VarLenSparseFeat(prefix + 'sequence_' + str(i), dim, maxlen, mode))

    model_input = []
    sequence_input = []
    sequence_len_input = []
    for fc in feature_columns:
        if isinstance(fc, SparseFeat):
            model_input.append(np.random.randint(0, fc.dimension, sample_size))
        elif isinstance(fc, DenseFeat):
            model_input.append(np.random.random(sample_size))
        else:
            s_input, s_len_input = gen_sequence(fc.dimension, fc.maxlen,
                                                sample_size)
            sequence_input.append(s_input)
            sequence_len_input.append(s_len_input)

    if classification:
        y = np.random.randint(0, 2, sample_size)
    else:
        y = np.random.random(sample_size)

    x = model_input + sequence_input
    if include_length:
        for i, mode in enumerate(sequence_feature):
            feature_columns.append(
                SparseFeat(prefix + 'sequence_' + str(i) + '_seq_length',
                           1,
                           embedding=False))

        x += sequence_len_input

    return x, y, feature_columns
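This get_test_data (like the variant in Example #7 below) calls a gen_sequence helper that is not shown. A plausible reconstruction, assuming it produces random padded sequences together with their valid lengths:

import numpy as np

def gen_sequence(dim, max_len, sample_size):
    # Assumed helper: one padded random sequence per sample, plus its true length.
    seqs = np.array([np.random.randint(0, dim, max_len) for _ in range(sample_size)])
    lengths = np.random.randint(1, max_len + 1, sample_size)
    return seqs, lengths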
Example #7
def get_test_data(sample_size=1000, embedding_size=4, sparse_feature_num=1, dense_feature_num=1,
                  sequence_feature=['sum', 'mean', 'max', 'weight'], classification=True, include_length=False,
                  hash_flag=False, prefix='', use_group=False):
    sequence_feature = list(sequence_feature)  # copy: the mutable default list is mutated below
    feature_columns = []
    model_input = {}

    if 'weight' in sequence_feature:
        feature_columns.append(
            VarLenSparseFeat(SparseFeat(prefix + "weighted_seq", vocabulary_size=2, embedding_dim=embedding_size),
                             maxlen=3, length_name=prefix + "weighted_seq" + "_seq_length",
                             weight_name=prefix + "weight"))
        s_input, s_len_input = gen_sequence(
            2, 3, sample_size)

        model_input[prefix + "weighted_seq"] = s_input
        model_input[prefix + 'weight'] = np.random.randn(sample_size, 3, 1)
        model_input[prefix + "weighted_seq" + "_seq_length"] = s_len_input
        sequence_feature.pop(sequence_feature.index('weight'))

    for i in range(sparse_feature_num):
        if use_group:
            group_name = str(i%3)
        else:
            group_name = DEFAULT_GROUP_NAME
        dim = np.random.randint(1, 10)
        feature_columns.append(
            SparseFeat(prefix + 'sparse_feature_' + str(i), dim, embedding_size, use_hash=hash_flag, dtype=tf.int32,group_name=group_name))

    for i in range(dense_feature_num):
        feature_columns.append(DenseFeat(prefix + 'dense_feature_' + str(i), 1, dtype=tf.float32))
    for i, mode in enumerate(sequence_feature):
        dim = np.random.randint(1, 10)
        maxlen = np.random.randint(1, 10)
        length_name = prefix + "sequence_" + str(i) + '_seq_length' if include_length else None
        feature_columns.append(
            VarLenSparseFeat(SparseFeat(prefix + 'sequence_' + mode, vocabulary_size=dim, embedding_dim=embedding_size),
                             maxlen=maxlen, combiner=mode, length_name=length_name))

    for fc in feature_columns:
        if isinstance(fc, SparseFeat):
            model_input[fc.name] = np.random.randint(0, fc.vocabulary_size, sample_size)
        elif isinstance(fc, DenseFeat):
            model_input[fc.name] = np.random.random(sample_size)
        else:
            # VarLenSparseFeat is immutable (a namedtuple), so length_name was set at construction
            s_input, s_len_input = gen_sequence(
                fc.vocabulary_size, fc.maxlen, sample_size)
            model_input[fc.name] = s_input
            if include_length and fc.length_name is not None:
                model_input[fc.length_name] = s_len_input

    if classification:
        y = np.random.randint(0, 2, sample_size)
    else:
        y = np.random.random(sample_size)

    return model_input, y, feature_columns
Example #8
def get_xy_fd(hash_flag=False):

    feature_columns = [
        SparseFeat('user', 3),
        SparseFeat('gender', 2),
        SparseFeat('item', 3 + 1),
        SparseFeat('item_gender', 2 + 1),
        DenseFeat('score', 1)
    ]
    feature_columns += [
        VarLenSparseFeat('hist_item', 3 + 1, maxlen=4, embedding_name='item'),
        VarLenSparseFeat('hist_item_gender',
                         2 + 1,
                         maxlen=4,
                         embedding_name='item_gender')
    ]

    behavior_feature_list = ["item", "item_gender"]
    uid = np.array([0, 1, 2])
    ugender = np.array([0, 1, 0])
    iid = np.array([1, 2, 3])  # 0 is mask value
    igender = np.array([1, 2, 1])  # 0 is mask value
    score = np.array([0.1, 0.2, 0.3])

    hist_iid = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
    hist_igender = np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]])

    feature_dict = {
        'user': uid,
        'gender': ugender,
        'item': iid,
        'item_gender': igender,
        'hist_item': hist_iid,
        'hist_item_gender': hist_igender,
        'score': score
    }

    feature_names = get_fixlen_feature_names(feature_columns)
    varlen_feature_names = get_varlen_feature_names(feature_columns)
    x = [feature_dict[name] for name in feature_names
         ] + [feature_dict[name] for name in varlen_feature_names]

    y = [1, 0, 1]
    return x, y, feature_columns, behavior_feature_list
Example #9
def get_xy_fd(use_neg=False, hash_flag=False):

    feature_columns = [SparseFeat('user', 3,hash_flag),
                       SparseFeat('gender', 2,hash_flag),
                       SparseFeat('item', 3+1,hash_flag),
                       SparseFeat('item_gender', 2+1,hash_flag),
                       DenseFeat('score', 1)]

    feature_columns += [VarLenSparseFeat('hist_item', 3 + 1, maxlen=4, embedding_name='item'),
                        VarLenSparseFeat('hist_item_gender', 2 + 1, maxlen=4, embedding_name='item_gender')]

    behavior_feature_list = ["item","item_gender"]
    uid = np.array([0, 1, 2])
    ugender = np.array([0, 1, 0])
    iid = np.array([1, 2, 3])#0 is mask value
    igender = np.array([1, 2, 1])# 0 is mask value
    score = np.array([0.1, 0.2, 0.3])

    hist_iid = np.array([[ 1, 2, 3,0], [ 1, 2, 3,0], [ 1, 2, 0,0]])
    hist_igender = np.array([[1, 1, 2,0 ], [2, 1, 1, 0], [2, 1, 0, 0]])

    behavior_length = np.array([3,3,2])

    feature_dict = {'user': uid, 'gender': ugender, 'item': iid, 'item_gender': igender,
                    'hist_item': hist_iid, 'hist_item_gender': hist_igender,
                    'score': score}

    if use_neg:
        feature_dict['neg_hist_item'] = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
        feature_dict['neg_hist_item_gender'] = np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]])
        feature_columns += [VarLenSparseFeat('neg_hist_item', 3 + 1, maxlen=4, embedding_name='item'),
                            VarLenSparseFeat('neg_hist_item_gender', 2 + 1, maxlen=4, embedding_name='item_gender')]

    feature_names = get_fixlen_feature_names(feature_columns)
    varlen_feature_names = get_varlen_feature_names(feature_columns)
    x = [feature_dict[name] for name in feature_names] + [feature_dict[name] for name in varlen_feature_names]
    x += [behavior_length]
    y = [1, 0, 1]
    return x, y, feature_columns, behavior_feature_list
Example #10
def simple_pre(df):

    # Label Encoding for sparse features, and normalization for dense numerical features
    for feat in config.sparse_features:
        lbe = LabelEncoder()
        df[feat] = lbe.fit_transform(df[feat])

    mms = MinMaxScaler(feature_range=(0, 1))
    df[config.dense_features] = mms.fit_transform(df[config.dense_features])

    # Generate feature columns:
    # for sparse features, we transform them into dense vectors by embedding techniques;
    # for dense numerical features, we concatenate them to the input tensors of the fully connected layer.

    # count #unique features for each sparse field
    fixlen_feature_columns = [
        SparseFeat(feat, df[feat].nunique(), embedding_dim=4)
        for feat in config.sparse_features
    ]

    linear_feature_columns = fixlen_feature_columns
    dnn_feature_columns = fixlen_feature_columns

    feature_names = get_feature_names(linear_feature_columns +
                                      dnn_feature_columns)
    return linear_feature_columns, dnn_feature_columns, feature_names
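A short sketch of turning these outputs into model input, following the same dict-input pattern used in Examples #26 and #27 below; the DeepFM call and the 'label' column are assumptions, not part of the snippet:

# Usage sketch (assumes deepctr's DeepFM and the preprocessed df from simple_pre).
from deepctr.models import DeepFM

linear_cols, dnn_cols, feature_names = simple_pre(df)
model_input = {name: df[name].values for name in feature_names}

model = DeepFM(linear_cols, dnn_cols, task='binary')
model.compile('adam', 'binary_crossentropy')
model.fit(model_input, df['label'].values, batch_size=256, epochs=1)  # 'label' column is an assumption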
Example #11
def get_xy_fd():

    feature_columns = [
        SparseFeat('user', 3, embedding_dim=10),
        SparseFeat('gender', 2, embedding_dim=4),
        SparseFeat('item', 3 + 1, embedding_dim=8),
        SparseFeat('item_gender', 2 + 1, embedding_dim=4),
        DenseFeat('score', 1)
    ]
    feature_columns += [
        VarLenSparseFeat('hist_item',
                         maxlen=4,
                         vocabulary_size=3 + 1,
                         embedding_dim=8,
                         embedding_name='item'),
        VarLenSparseFeat('hist_item_gender',
                         maxlen=4,
                         vocabulary_size=3 + 1,
                         embedding_dim=4,
                         embedding_name='item_gender')
    ]

    behavior_feature_list = ["item", "item_gender"]
    uid = np.array([0, 1, 2])
    ugender = np.array([0, 1, 0])
    iid = np.array([1, 2, 3])  # 0 is mask value
    igender = np.array([1, 2, 1])  # 0 is mask value
    score = np.array([0.1, 0.2, 0.3])

    hist_iid = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
    hist_igender = np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]])

    feature_dict = {
        'user': uid,
        'gender': ugender,
        'item': iid,
        'item_gender': igender,
        'hist_item': hist_iid,
        'hist_item_gender': hist_igender,
        'score': score
    }
    x = {
        name: feature_dict[name]
        for name in get_feature_names(feature_columns)
    }
    y = [1, 0, 1]
    return x, y, feature_columns, behavior_feature_list
Example #12
    def load_stats(self):
        fixlen_feature_columns = [SparseFeat(feat, self.cat_meta[feat])
                                  for feat in self.sparse_features] + \
                                 [DenseFeat(feat, 1, ) for feat in self.dense_features]

        self.dnn_feature_columns = fixlen_feature_columns
        self.linear_feature_columns = fixlen_feature_columns

        self.fixlen_feature_names = get_fixlen_feature_names(self.linear_feature_columns + self.dnn_feature_columns)
Example #13
def get_xy_fd():

    feature_columns = [
        SparseFeat('user', 3),
        SparseFeat('gender', 2),
        SparseFeat('item', 3 + 1),
        SparseFeat('item_gender', 2 + 1),
        DenseFeat('score', 1)
    ]
    feature_columns += [
        VarLenSparseFeat('hist_item', 3 + 1, maxlen=4, embedding_name='item'),
        VarLenSparseFeat('hist_item_gender',
                         2 + 1,
                         maxlen=4,
                         embedding_name='item_gender')
    ]

    behavior_feature_list = ["item", "item_gender"]
    uid = np.array([0, 1, 2])
    ugender = np.array([0, 1, 0])
    iid = np.array([1, 2, 3])  # 0 is mask value
    igender = np.array([1, 2, 1])  # 0 is mask value
    score = np.array([0.1, 0.2, 0.3])

    hist_iid = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
    hist_igender = np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]])

    feature_dict = {
        'user': uid,
        'gender': ugender,
        'item': iid,
        'item_gender': igender,
        'hist_item': hist_iid,
        'hist_item_gender': hist_igender,
        'score': score
    }

    fixlen_feature_names = get_fixlen_feature_names(feature_columns)
    varlen_feature_names = get_varlen_feature_names(feature_columns)
    x = [feature_dict[name] for name in fixlen_feature_names
         ] + [feature_dict[name] for name in varlen_feature_names]

    y = [1, 0, 1]
    return x, y, feature_columns, behavior_feature_list
Example #14
    def __init__(self, uNum, iNum, dim, maxlen):
        self.uNum = uNum
        self.iNum = iNum
        self.dim = dim
        self.maxlen = maxlen

        hash_flag = True
        self.feature_columns = [SparseFeat('user', self.uNum, hash_flag),
                                SparseFeat('item', self.iNum, hash_flag),
                                VarLenSparseFeat('sess_0_item', self.iNum, self.dim, use_hash=hash_flag,
                                                 embedding_name='item')]

        self.behavior_feature_list = ["item"]

        self.model = DSIN(self.feature_columns, self.behavior_feature_list, sess_max_count=1,
                          embedding_size=self.dim,
                          att_head_num=self.dim,
                          dnn_hidden_units=[self.dim, self.dim, self.dim], dnn_dropout=0.5)

        self.model.compile('adam', 'binary_crossentropy',
                           metrics=['acc'])
Example #15
def get_xy_fd(hash_flag=False):


    user_feature_columns = [SparseFeat('user', 3),
                            SparseFeat('gender', 2),
                            VarLenSparseFeat(SparseFeat('hist_item', vocabulary_size=3 + 1, embedding_dim=4,
                                                        embedding_name='item'),
                                             maxlen=4, length_name="hist_len")]
    item_feature_columns = [SparseFeat('item', 3 + 1, embedding_dim=4, )]


    uid = np.array([0, 1, 2, 1])
    ugender = np.array([0, 1, 0, 1])
    iid = np.array([1, 2, 3, 1])  # 0 is mask value

    hist_iid = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0], [3, 0, 0, 0]])
    hist_len = np.array([3, 3, 2, 1])

    feature_dict = {'user': uid, 'gender': ugender, 'item': iid,
                    'hist_item': hist_iid, "hist_len": hist_len}

    x = feature_dict
    y = np.array([1, 1, 1, 1])
    return x, y, user_feature_columns, item_feature_columns
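Unlike the ranking examples, this helper returns separate user-side and item-side columns, the layout used by matching models. A heavily hedged consumption sketch; the deepmatch import, model name, and argument set are assumptions that vary across deepmatch versions and are not confirmed by this snippet:

# Usage sketch only; verify against your installed deepmatch version.
from deepmatch.models import YoutubeDNN
from deepmatch.utils import sampledsoftmaxloss

x, y, user_feature_columns, item_feature_columns = get_xy_fd()
model = YoutubeDNN(user_feature_columns, item_feature_columns,
                   num_sampled=2, user_dnn_hidden_units=(16, 4))  # argument names are assumptions
model.compile(optimizer='adagrad', loss=sampledsoftmaxloss)
model.fit(x, y, batch_size=3, epochs=1, verbose=1)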
Example #16
def get_xy_fd(use_neg=False, hash_flag=False):

    feature_columns = [SparseFeat('user', 3,hash_flag),
                       SparseFeat('gender', 2,hash_flag),
                       SparseFeat('item', 3+1,hash_flag),
                       SparseFeat('item_gender', 2+1,hash_flag),
                       DenseFeat('score', 1)]

    feature_columns += [VarLenSparseFeat('hist_item', 3 + 1, maxlen=4, embedding_name='item'),
                        VarLenSparseFeat('hist_item_gender', 2 + 1, maxlen=4, embedding_name='item_gender')]

    behavior_feature_list = ["item","item_gender"]
    uid = np.array([0, 1, 2])
    ugender = np.array([0, 1, 0])
    iid = np.array([1, 2, 3])#0 is mask value
    igender = np.array([1, 2, 1])# 0 is mask value
    score = np.array([0.1, 0.2, 0.3])

    hist_iid = np.array([[ 1, 2, 3,0], [ 1, 2, 3,0], [ 1, 2, 0,0]])
    hist_igender = np.array([[1, 1, 2,0 ], [2, 1, 1, 0], [2, 1, 0, 0]])

    behavior_length = np.array([3,3,2])

    feature_dict = {'user': uid, 'gender': ugender, 'item': iid, 'item_gender': igender,
                    'hist_item': hist_iid, 'hist_item_gender': hist_igender,
                    'score': score}

    if use_neg:
        feature_dict['neg_hist_item'] = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
        feature_dict['neg_hist_item_gender'] = np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]])
        feature_columns += [VarLenSparseFeat('neg_hist_item', 3 + 1, maxlen=4, embedding_name='item'),
                            VarLenSparseFeat('neg_hist_item_gender', 2 + 1, maxlen=4, embedding_name='item_gender')]

    x = {name:feature_dict[name] for name in get_feature_names(feature_columns)}
    x["seq_length"] = behavior_length
    y = [1, 0, 1]
    return x, y, feature_columns, behavior_feature_list
Example #17
 def getFeatureColumns(self):
     feature_columns = []
     for feat, value in self.fixed_sparse_dict.items():
         feature_columns.append(
             SparseFeat(feat,
                        vocabulary_size=value[0],
                        embedding_dim=value[1]))
     for feat, value in self.var_sparse_dict.items():
         feature_columns.append(
             VarLenSparseFeat(feat,
                              maxlen=value[0],
                              vocabulary_size=value[1],
                              embedding_dim=value[2],
                              embedding_name=value[3]))
     return feature_columns
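The tuple indexing above implies a particular layout for the two instance dicts. A hypothetical illustration of the expected shapes (all names and numbers below are invented for illustration):

# Hypothetical configuration matching the tuple positions used in getFeatureColumns.
fixed_sparse_dict = {
    'user_id': (10000, 8),  # (vocabulary_size, embedding_dim)
    'gender': (3, 4),
}
var_sparse_dict = {
    # (maxlen, vocabulary_size, embedding_dim, embedding_name)
    'hist_item_id': (50, 20000, 8, 'item_id'),
}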
Example #18
def main():

    Use_SF = False
    # sys.argv[0] is the script path; a flag passed on the command line is sys.argv[1]
    if len(sys.argv) > 1 and sys.argv[1] == 'SF':
        Use_SF = True

    train, vali, test = GetFeatures(Use_SF)

    feature_count = []
    for feat in sparse_features:
        print("Fitting {}".format(feat))
        labels = {}
        for x in train[feat]:
            if x not in labels:
                labels[x] = len(labels) + 1
        print("Transforming {}".format(feat))
        for df in [train, vali, test]:
            df[feat] = df[feat].map(lambda x: labels.get(x, 0))
        feature_count.append(len(labels) + 1)

    sparse_feature_columns = [
        SparseFeat(f, f_c) for f, f_c in zip(sparse_features, feature_count)
    ]
    dense_feature_columns = [DenseFeat(f, 1) for f in dense_features]
    fixlen_feature_columns = sparse_feature_columns + dense_feature_columns
    dnn_feature_columns = fixlen_feature_columns
    linear_feature_columns = fixlen_feature_columns

    fixlen_feature_names = get_fixlen_feature_names(linear_feature_columns +
                                                    dnn_feature_columns)

    train_model_input = [train[name] for name in fixlen_feature_names]
    vali_model_input = [vali[name] for name in fixlen_feature_names]
    test_model_input = [test[name] for name in fixlen_feature_names]

    def eval(target):
        model, history = model_generate(train_model_input, train[[target]],
                                        vali_model_input, vali[[target]],
                                        linear_feature_columns,
                                        dnn_feature_columns)
        pred_ans = model.predict(test_model_input, batch_size=256)
        print(target + " test LogLoss",
              round(log_loss(test[target].values, pred_ans), 4))
        print(target + " test AUC",
              round(roc_auc_score(test[target].values, pred_ans), 4))

    for target in targets:
        eval(target)
Example #19
def read(data, lbe_store):
    # data['time'] = data['time'].apply(lambda x: timestamp(x), convert_dtype='int32')
    sparse_features = ["user_id", "item_id", "item_category", "time"]
    # 1. Label Encoding for sparse features, and simple Transformation for dense features
    for feat in sparse_features:
        data[feat] = lbe_store[feat].transform(data[feat])
    # 2. Count the number of unique values for each sparse field
    fixlen_feature_columns = [
        SparseFeat(feat, data[feat].nunique()) for feat in sparse_features
    ]
    linear_feature_columns = fixlen_feature_columns
    dnn_feature_columns = fixlen_feature_columns
    fixlen_feature_names = get_fixlen_feature_names(linear_feature_columns +
                                                    dnn_feature_columns)
    data_model_input = [data[name].values for name in fixlen_feature_names]
    return data, data_model_input
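read() calls transform(), not fit_transform(), so lbe_store must hold LabelEncoders already fitted elsewhere on the full data. A minimal construction sketch; the fitting code is not shown in the original, and full_data is a hypothetical name:

from sklearn.preprocessing import LabelEncoder

lbe_store = {}
for feat in ["user_id", "item_id", "item_category", "time"]:
    lbe = LabelEncoder()
    lbe.fit(full_data[feat])  # fit once on all data so transform() never sees unseen labels
    lbe_store[feat] = lbe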
Example #20
def get_xy_fd():
    # Fixed-length sparse features
    feature_columns = [
        SparseFeat('user', 3, embedding_dim=10),
        SparseFeat('gender', 2, embedding_dim=4),
        SparseFeat('item_id', 3 + 1, embedding_dim=8),
        SparseFeat('cate_id', 2 + 1, embedding_dim=4),
        DenseFeat('pay_score', 1)
    ]
    # Variable-length sparse features
    feature_columns += [
        VarLenSparseFeat(SparseFeat('hist_item_id',
                                    vocabulary_size=3 + 1,
                                    embedding_dim=8,
                                    embedding_name='item_id'),
                         maxlen=4),
        VarLenSparseFeat(SparseFeat('hist_cate_id',
                                    2 + 1,
                                    embedding_dim=4,
                                    embedding_name='cate_id'),
                         maxlen=4)
    ]

    behavior_feature_list = ["item", "cate_id"]
    uid = np.array([0, 1, 2])
    ugender = np.array([0, 1, 0])
    iid = np.array([1, 2, 3])  # 0 is mask value
    cate_id = np.array([1, 2, 1])  # 0 is mask value
    pay_score = np.array([0.1, 0.2, 0.3])

    hist_iid = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
    hist_cate_id = np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]])

    feature_dict = {
        'user': uid,
        'gender': ugender,
        'item_id': iid,
        'cate_id': cate_id,
        'hist_item_id': hist_iid,
        'hist_cate_id': hist_cate_id,
        'pay_score': pay_score
    }
    x = {
        name: feature_dict[name]
        for name in get_feature_names(feature_columns)
    }
    y = [1, 0, 1]
    return x, y, feature_columns, behavior_feature_list
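A short sketch of feeding this fixture to DIN; the model call follows deepctr's documented DIN pattern but is not part of the snippet:

# Usage sketch (assumes deepctr's DIN).
import numpy as np
from deepctr.models import DIN

x, y, feature_columns, behavior_feature_list = get_xy_fd()
model = DIN(feature_columns, behavior_feature_list)
model.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy'])
history = model.fit(x, np.array(y), verbose=1, epochs=1, validation_split=0.5)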
Example #21
    def prepare_data(cls, path, sparse_features, task='binary'):
        data_path = path
        dataframe = pd.read_csv(
            data_path, names='user_id,movie_id,rating,timestamp'.split(','))
        # e.g. sparse_features = ["movie_id", "user_id"]
        y = ['rating']

        for feat in sparse_features:
            lbe = LabelEncoder()
            dataframe[feat] = lbe.fit_transform(dataframe[feat])

        feature_columns = [
            SparseFeat(feat, dataframe[feat].nunique())
            for feat in sparse_features
        ]

        trainset, testset = train_test_split(dataframe, test_size=0.2)

        #train_model_input = [to_categorical(trainset[fc.name].values, num_classes= fc.dimension) for fc in feature_columns]#includes values from only data[user_id], data[movie_id]
        #test_model_input = [to_categorical(testset[fc.name].values, num_classes= fc.dimension) for fc in feature_columns]#includes values from only data[user_id], data[movie_id]
        train_model_input = [
            trainset[name].values for name in sparse_features
        ]  #includes values from only data[user_id], data[movie_id]
        test_model_input = [
            testset[name].values for name in sparse_features
        ]  #includes values from only data[user_id], data[movie_id]

        if task == 'binary':
            train_lbl = trainset[y].values
            test_lbl = testset[y].values
        # elif task == 'multiclass':
        #     train_lbl = to_categorical(trainset[y])[:, 1:]  # strip column 0, as rating is in (1, 5)
        #     test_lbl = to_categorical(testset[y])[:, 1:]
        else:
            raise ValueError("Enter task either 'binary' or 'multiclass'")

        return cls(feature_columns), (train_model_input, train_lbl), (
            test_model_input, test_lbl
        )  #try returning train_model_input from inside __init__()
Example #22
    def prepare(self):
        if self.data_format == "deepctr":
            # 1.Label Encoding for sparse features, and
            # simple Transformation for dense features
            for feat in self.sparse_features:
                lbe = LabelEncoder()
                self.input[feat] = lbe.fit_transform(self.input[feat])
                self.encoders[feat] = lbe

            # 2.count #unique features for each sparse field
            fixlen_feature_columns = [
                SparseFeat(feat, self.input[feat].nunique(), embedding_dim=4)
                for feat in self.sparse_features
            ]

            self.linear_feature_columns = fixlen_feature_columns
            self.dnn_feature_columns = fixlen_feature_columns

            self.feature_names = get_feature_names(
                self.linear_feature_columns + self.dnn_feature_columns)

            # 3.generate input data for model
            train, test = train_test_split(self.input,
                                           test_size=self.test_size)

            self.X_train = {
                name: train[name].values
                for name in self.feature_names
            }
            self.y_train = train[self.target].values

            self.X_test = {
                name: test[name].values
                for name in self.feature_names
            }
            self.y_test = test[self.target].values
        else:
            raise ValueError("Not supported dataset: " + self.data_format)
Example #23
def client_restful_criteo():
    data = pd.read_csv('./data/criteo_sample.txt')

    sparse_features = ['C' + str(i) for i in range(1, 27)]
    dense_features = ['I' + str(i) for i in range(1, 14)]

    data[sparse_features] = data[sparse_features].fillna('-1', )
    data[dense_features] = data[dense_features].fillna(0, )
    target = ['label']

    # 1. Label Encoding for sparse features, and simple Transformation for dense features
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
    mms = MinMaxScaler(feature_range=(0, 1))
    data[dense_features] = mms.fit_transform(data[dense_features])

    # 2. Count the number of unique values for each sparse field, and record dense feature field names

    fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=data[feat].nunique(), embedding_dim=4)
                              for feat in sparse_features] + \
                             [DenseFeat(feat, 1, ) for feat in dense_features]

    dnn_feature_columns = fixlen_feature_columns
    linear_feature_columns = fixlen_feature_columns

    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)
    
    # Build a single-record payload in the {feature_name: value} form TF Serving expects
    model_input = [{name: data[name].iloc[0] for name in feature_names}]
    print(model_input)
    data = json.dumps({"signature_name": "serving_default", "instances": model_input}, cls=NpEncoder)
    headers = {"content-type": "application/json"}
    json_response = requests.post('http://localhost:8501/v1/models/criteo:predict', data=data, headers=headers)
    json_response = json.loads(json_response.text)
    print(json_response)
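The REST request above assumes a TensorFlow Serving instance already hosting the model under models/criteo. A hedged export sketch; deepctr models are tf.keras models, so the standard SavedModel export applies (the paths and version directory below are assumptions):

# Export sketch (assumption: the model is trained elsewhere with the same columns).
from deepctr.models import DeepFM

model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary')
model.compile('adam', 'binary_crossentropy')
# ... model.fit(...) ...
model.save('./models/criteo/1', save_format='tf')  # numeric version dir for TF Serving
# Serve with e.g.:
#   tensorflow_model_server --rest_api_port=8501 --model_name=criteo \
#       --model_base_path=/abs/path/models/criteo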
Example #24
def get_xy_fd_sdm(hash_flag=False):

    user_feature_columns = [SparseFeat('user',3),
                            SparseFeat('gender', 2),
                            VarLenSparseFeat(SparseFeat('prefer_item', vocabulary_size=100,embedding_dim=8,
                                                        embedding_name='item'), maxlen=6, length_name="prefer_sess_length"),
                            VarLenSparseFeat(SparseFeat('prefer_cate', vocabulary_size=100, embedding_dim=8,
                                                        embedding_name='cate'), maxlen=6, length_name="prefer_sess_length"),
                            VarLenSparseFeat(SparseFeat('short_item', vocabulary_size=100,embedding_dim=8,
                                                        embedding_name='item'), maxlen=4, length_name="short_sess_length"),
                            VarLenSparseFeat(SparseFeat('short_cate', vocabulary_size=100, embedding_dim=8,
                                                        embedding_name='cate'), maxlen=4, length_name="short_sess_length"),
                            ]
    item_feature_columns = [SparseFeat('item', 100, embedding_dim=8,)]


    uid = np.array([0, 1, 2, 1])
    ugender = np.array([0, 1, 0, 1])
    iid = np.array([1, 2, 3, 1])  # 0 is mask value

    prefer_iid = np.array([[1, 2, 3, 4, 5, 6], [1, 2, 3, 4, 5, 0], [1, 2, 3, 3, 0, 0], [1, 2, 4, 0, 0, 0]])
    prefer_cate = np.array([[1, 2, 3, 4, 5, 6], [1, 2, 3, 4, 5, 0], [1, 2, 3, 3, 0, 0], [1, 2, 4, 0, 0, 0]])
    short_iid = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0], [3, 0, 0, 0]])
    short_cate = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0], [3, 0, 0, 0]])
    prefer_len = np.array([6, 5, 4, 3])
    short_len = np.array([3, 3, 2, 1])

    feature_dict = {'user': uid, 'gender': ugender, 'item': iid, 'prefer_item': prefer_iid, "prefer_cate":prefer_cate,
                    'short_item': short_iid, 'short_cate': short_cate, 'prefer_sess_length': prefer_len, 'short_sess_length':short_len}

    x = feature_dict
    y = np.array([1, 1, 1, 0])
    history_feature_list = ['item', 'cate']

    return x, y, user_feature_columns, item_feature_columns, history_feature_list
Example #25
    data[sparse_features] = data[sparse_features].fillna('-1', )
    data[dense_features] = data[dense_features].fillna(0, )
    target = ['label']

    # 1. Label Encoding for sparse features, and simple Transformation for dense features
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
    mms = MinMaxScaler(feature_range=(0, 1))
    data[dense_features] = mms.fit_transform(data[dense_features])

    # 2. Count the number of unique values for each sparse field, and record dense feature field names

    fixlen_feature_columns = [
        SparseFeat(feat, data[feat].nunique()) for feat in sparse_features
    ] + [DenseFeat(
        feat,
        1,
    ) for feat in dense_features]

    dnn_feature_columns = fixlen_feature_columns
    linear_feature_columns = fixlen_feature_columns

    feature_names = get_feature_names(linear_feature_columns +
                                      dnn_feature_columns)

    # 3.generate input data for model

    train, test = train_test_split(data, test_size=0.2)
Example #26
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from deepctr.models import DeepFM
from deepctr.inputs import SparseFeat, get_feature_names

data = pd.read_csv("ratings.csv")
sparse_features = ["userId", "movieId", 'timestamp']
target = ['rating']

# Label-encode the sparse features
for feature in sparse_features:
    lbe = LabelEncoder()
    data[feature] = lbe.fit_transform(data[feature])
# Count the number of distinct values in each sparse feature
fixlen_feature_columns = [
    SparseFeat(feature, data[feature].nunique()) for feature in sparse_features
]
linear_feature_columns = fixlen_feature_columns
dnn_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

# Split the dataset into training and test sets
train, test = train_test_split(data, test_size=0.2)
train_model_input = {name: train[name].values for name in feature_names}
test_model_input = {name: test[name].values for name in feature_names}

# Train with DeepFM
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='regression')
model.compile(
    "adam",
    "mse",
    metrics=['mse'],
)
Example #27
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from deepctr.models import WDL
from deepctr.inputs import SparseFeat,get_feature_names
import matplotlib.pyplot as plt

data = pd.read_csv("movielens_sample.txt")
sparse_features = ["movie_id", "user_id", "gender", "age", "occupation", "zip"]
target = ['rating']

# Label-encode the sparse features
for feature in sparse_features:
    lbe = LabelEncoder()
    data[feature] = lbe.fit_transform(data[feature])
# Count the number of distinct values in each sparse feature
fixlen_feature_columns = [SparseFeat(feature, data[feature].nunique()) for feature in sparse_features]
linear_feature_columns = fixlen_feature_columns
dnn_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

# Split the dataset into training and test sets
train, test = train_test_split(data, test_size=0.2)
train_model_input = {name:train[name].values for name in feature_names}
test_model_input = {name:test[name].values for name in feature_names}

# Train with WDL
model = WDL(linear_feature_columns, dnn_feature_columns, task='regression')
model.compile("adam", "mse", metrics=['mse'], )
history = model.fit(train_model_input, train[target].values, batch_size=256, epochs=100, verbose=True, validation_split=0.2, )
plt.figure()
x = range(len(history.history['loss']))
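The snippet is cut off at this point; a typical continuation plotting the training curves might look like the following (an assumption, not the original code):

plt.plot(x, history.history['loss'], label='train loss')
plt.plot(x, history.history['val_loss'], label='validation loss')  # present because validation_split=0.2
plt.xlabel('epoch')
plt.ylabel('mse')
plt.legend()
plt.show()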
Example #28
def get_xy_fd(use_neg=False, hash_flag=False):
    feature_columns = [
        SparseFeat('user', 3, embedding_dim=10, use_hash=hash_flag),
        SparseFeat('gender', 2, embedding_dim=4, use_hash=hash_flag),
        SparseFeat('item_id', 3 + 1, embedding_dim=8, use_hash=hash_flag),
        SparseFeat('cate_id', 2 + 1, embedding_dim=4, use_hash=hash_flag),
        DenseFeat('pay_score', 1)
    ]

    feature_columns += [
        VarLenSparseFeat(SparseFeat('hist_item_id',
                                    vocabulary_size=3 + 1,
                                    embedding_dim=8,
                                    embedding_name='item_id'),
                         maxlen=4,
                         length_name="seq_length"),
        VarLenSparseFeat(SparseFeat('hist_cate_id',
                                    2 + 1,
                                    embedding_dim=4,
                                    embedding_name='cate_id'),
                         maxlen=4,
                         length_name="seq_length")
    ]

    behavior_feature_list = ["item_id", "cate_id"]
    uid = np.array([0, 1, 2])
    ugender = np.array([0, 1, 0])
    iid = np.array([1, 2, 3])  # 0 is mask value
    cate_id = np.array([1, 2, 2])  # 0 is mask value
    score = np.array([0.1, 0.2, 0.3])

    hist_iid = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
    hist_cate_id = np.array([[1, 2, 2, 0], [1, 2, 2, 0], [1, 2, 0, 0]])

    behavior_length = np.array([3, 3, 2])

    feature_dict = {
        'user': uid,
        'gender': ugender,
        'item_id': iid,
        'cate_id': cate_id,
        'hist_item_id': hist_iid,
        'hist_cate_id': hist_cate_id,
        'pay_score': score,
        "seq_length": behavior_length
    }

    if use_neg:
        feature_dict['neg_hist_item_id'] = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
        feature_dict['neg_hist_cate_id'] = np.array([[1, 2, 2, 0], [1, 2, 2, 0], [1, 2, 0, 0]])
        feature_columns += [
            VarLenSparseFeat(SparseFeat('neg_hist_item_id',
                                        vocabulary_size=3 + 1,
                                        embedding_dim=8,
                                        embedding_name='item_id'),
                             maxlen=4,
                             length_name="seq_length"),
            VarLenSparseFeat(SparseFeat('neg_hist_cate_id',
                                        2 + 1,
                                        embedding_dim=4,
                                        embedding_name='cate_id'),
                             maxlen=4,
                             length_name="seq_length")
        ]

    x = {
        name: feature_dict[name]
        for name in get_feature_names(feature_columns)
    }
    y = [1, 0, 1]
    return x, y, feature_columns, behavior_feature_list
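This fixture (behavior sequences sharing a seq_length input, plus optional neg_hist_* columns) matches DIEN's negative-sampling interface. A consumption sketch, assuming deepctr's DIEN:

# Usage sketch (assumes deepctr's DIEN(dnn_feature_columns, history_feature_list, ...)).
import numpy as np
from deepctr.models import DIEN

x, y, feature_columns, behavior_feature_list = get_xy_fd(use_neg=True)
model = DIEN(feature_columns, behavior_feature_list,
             gru_type='AUGRU', use_negsampling=True)
model.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy'])
history = model.fit(x, np.array(y), verbose=1, epochs=1, validation_split=0.5)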
Example #29
def get_item(root):
    print('load')
    csv_file = os.path.join(root, 'test', 'test_data', 'test_data')
    item = pd.read_csv(csv_file,
                dtype={
                    'article_id': str,
                    'hh': int, 'gender': str,
                    'age_range': str,
                    'read_article_ids': str
                }, sep='\t')
    print('loaded!!')
    sparse_features = ['article_id', 'hh','gender','age_range','len_bin']
    dense_features = ['image_feature']
    target = ['label']

    len_lis = []

    read_article_ids_all = item['read_article_ids'].tolist()
    for i in range(len(item)):
        li = read_article_ids_all[i]
        if type(li) == float:
            len_lis.append(0)
            continue
        len_li = len(li.split(','))
        len_lis.append(len_li)
    
    
    item['len'] = len_lis
    item['len_bin'] = pd.qcut(item['len'], 6, duplicates='drop')

    id_to_artic = dict()
    artics = item['article_id'].tolist()
    
    with open(os.path.join(DATASET_PATH, 'test', 'test_data', 'test_image_features.pkl'), 'rb') as handle:
        image_feature_dict = pickle.load(handle)

    print('image_feature_dict loaded..')
    for feat in sparse_features:
        lbe = LabelEncoder()
        item[feat] = lbe.fit_transform(item[feat])

    # building these from the test set would also work; here we use item
    fixlen_feature_columns = []
    for feat in sparse_features:
        if feat == 'article_id':
            fixlen_feature_columns.append(SparseFeat(feat, 1896))
        else:
            fixlen_feature_columns.append(SparseFeat(feat, item[feat].nunique()))
    fixlen_feature_columns += [DenseFeat(feat, len(image_feature_dict[artics[0]])) for feat in dense_features]
    
    print(fixlen_feature_columns)

    idx_artics_all = item['article_id'].tolist()
    for i in range(len(artics)):
        idx_artic = idx_artics_all[i]
        if idx_artic not in id_to_artic.keys():
            id_to_artic[idx_artic] = artics[i]

    linear_feature_columns = fixlen_feature_columns
    dnn_feature_columns = fixlen_feature_columns
    fixlen_feature_names = get_fixlen_feature_names(linear_feature_columns + dnn_feature_columns)

    fixlen_feature_names_global = fixlen_feature_names

    model = xDeepFM(linear_feature_columns, dnn_feature_columns, task='binary')
    # bind_nsml(model, list(), args.task)

    return model, fixlen_feature_names_global, item, image_feature_dict, id_to_artic
Example #30
def main(args, local):
    
    if args.arch == 'xDeepFM' and args.mode == 'train':
        s = time.time()
        csv_file = os.path.join(DATASET_PATH, 'train', 'train_data', 'train_data')
        item = pd.read_csv(csv_file,
                    dtype={
                        'article_id': str,
                        'hh': int, 'gender': str,
                        'age_range': str,
                        'read_article_ids': str
                    }, sep='\t')
        label_data_path = os.path.join(DATASET_PATH, 'train',
                                os.path.basename(os.path.normpath(csv_file)).split('_')[0] + '_label')
        label = pd.read_csv(label_data_path,
                    dtype={'label': int},
                    sep='\t')
        item['label']  = label
        
        sparse_features = ['article_id', 'hh','gender','age_range','len_bin']
        dense_features = ['image_feature']
        target = ['label']
        
        
        len_lis = []

        read_article_ids_all = item['read_article_ids'].tolist()
        for i in range(len(item)):
            li = read_article_ids_all[i]
            if type(li) == float:
                len_lis.append(0)
                continue
            len_li = len(li.split(','))
            len_lis.append(len_li)
        
        
        item['len'] = len_lis
        item['len_bin'] = pd.qcut(item['len'], 6, duplicates='drop')
    
        id_to_artic = dict()
        artics = item['article_id'].tolist()
        
        with open(os.path.join(DATASET_PATH, 'train', 'train_data', 'train_image_features.pkl'), 'rb') as handle:
            image_feature_dict = pickle.load(handle)
        for feat in sparse_features:
            lbe = LabelEncoder()
            item[feat] = lbe.fit_transform(item[feat])
        fixlen_feature_columns = [SparseFeat(feat, item[feat].nunique()) for feat in sparse_features]
        fixlen_feature_columns += [DenseFeat(feat,len(image_feature_dict[artics[0]])) for feat in dense_features]
        
        
        
        idx_artics_all = item['article_id'].tolist()
        
        for i in range(len(artics)):
            idx_artic = idx_artics_all[i]
            if idx_artic not in id_to_artic.keys():
                id_to_artic[idx_artic] = artics[i]
        
       
        # image features can be fetched via image_feature_dict[article_id], so skip that for now
        linear_feature_columns = fixlen_feature_columns
        dnn_feature_columns = fixlen_feature_columns  
        fixlen_feature_names = get_fixlen_feature_names(linear_feature_columns + dnn_feature_columns)
        print(fixlen_feature_names)
        global fixlen_feature_names_global
        fixlen_feature_names_global = fixlen_feature_names
        model = xDeepFM(linear_feature_columns, dnn_feature_columns, task='binary')
        print('---model defined---')
        # TODO: also write code to save the generated files, since we can't rerun this every time
        print(time.time() - s ,'seconds')


    if use_nsml and args.mode == 'train':

        bind_nsml(model,[], args.task)
    
    
    if args.mode == 'test':
        print('_infer root - : ', DATASET_PATH)
        print('test')
        model, fixlen_feature_names_global, item, image_feature_dict, id_to_artic = get_item(DATASET_PATH)
        bind_nsml(model, [], args.task)
        checkpoint_session = ['401','team_62/airush2/176']
        nsml.load(checkpoint = str(checkpoint_session[0]), session = str(checkpoint_session[1])) 
        print('successfully loaded')

    if (args.mode == 'train'):
        if args.dry_run:
            print('start dry-running...!')
            args.num_epochs = 1
        else:
            print('start training...!')
        # build everything up front; no real need for a generator here

        nsml.save('infer')
        print('end')
    print('end_main')

    if args.pause:
        nsml.paused(scope=local)