Example #1
def get_test_data(sample_size=1000,
                  sparse_feature_num=1,
                  dense_feature_num=1,
                  sequence_feature=('sum', 'mean', 'max'),
                  classification=True,
                  include_length=False,
                  hash_flag=False,
                  prefix=''):

    feature_columns = []

    for i in range(sparse_feature_num):
        dim = np.random.randint(1, 10)
        feature_columns.append(
            SparseFeat(prefix + 'sparse_feature_' + str(i), dim, hash_flag,
                       torch.int32))
    for i in range(dense_feature_num):
        feature_columns.append(
            DenseFeat(prefix + 'dense_feature_' + str(i), 1, torch.float32))
    for i, mode in enumerate(sequence_feature):
        dim = np.random.randint(1, 10)
        maxlen = np.random.randint(1, 10)
        feature_columns.append(
            VarLenSparseFeat(prefix + 'sequence_' + str(i), dim, maxlen, mode))

    model_input = []
    sequence_input = []
    sequence_len_input = []
    for fc in feature_columns:
        if isinstance(fc, SparseFeat):
            model_input.append(np.random.randint(0, fc.dimension, sample_size))
        elif isinstance(fc, DenseFeat):
            model_input.append(np.random.random(sample_size))
        else:
            s_input, s_len_input = gen_sequence(fc.dimension, fc.maxlen,
                                                sample_size)
            sequence_input.append(s_input)
            sequence_len_input.append(s_len_input)

    if classification:
        # resample until at least 30% of the labels are positive
        y = np.random.randint(0, 2, sample_size)
        while sum(y) < 0.3 * sample_size:
            y = np.random.randint(0, 2, sample_size)
    else:
        y = np.random.random(sample_size)

    x = model_input + sequence_input
    if include_length:
        for i, mode in enumerate(sequence_feature):
            # register a plain (non-embedded) feature holding each sequence's true length
            feature_columns.append(
                SparseFeat(prefix + 'sequence_' + str(i) + '_seq_length',
                           1,
                           embedding=False))

        x += sequence_len_input

    return x, y, feature_columns
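The generator above and the one in Example #2 below both call a gen_sequence helper that the snippets omit. A minimal sketch consistent with the call sites (the exact name, shapes, and length range are assumptions inferred from how the results are used):

import numpy as np

def gen_sequence(dim, max_len, sample_size):
    # one padded row of max_len random indices in [0, dim) per sample,
    # plus a random "true" length in [1, max_len] for each row
    seqs = np.array([np.random.randint(0, dim, max_len) for _ in range(sample_size)])
    lengths = np.random.randint(1, max_len + 1, sample_size)
    return seqs, lengths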
Example #2
def get_test_data(sample_size=1000, embedding_size=4, sparse_feature_num=1, dense_feature_num=1,
                  sequence_feature=('sum', 'mean', 'max'), classification=True, include_length=False,
                  hash_flag=False, prefix=''):
    # copy the sequence_feature argument: the 'weight' branch below mutates it,
    # which would corrupt a shared mutable default
    sequence_feature = list(sequence_feature)


    feature_columns = []
    model_input = {}


    if 'weight' in sequence_feature:
        feature_columns.append(
            VarLenSparseFeat(SparseFeat(prefix + "weighted_seq", vocabulary_size=2, embedding_dim=embedding_size),
                             maxlen=3, length_name=prefix + "weighted_seq" + "_seq_length",
                             weight_name=prefix + "weight"))
        s_input, s_len_input = gen_sequence(2, 3, sample_size)

        model_input[prefix + "weighted_seq"] = s_input
        model_input[prefix + 'weight'] = np.random.randn(sample_size, 3, 1)
        model_input[prefix + "weighted_seq" + "_seq_length"] = s_len_input
        sequence_feature.remove('weight')


    for i in range(sparse_feature_num):
        dim = np.random.randint(1, 10)
        feature_columns.append(
            SparseFeat(prefix + 'sparse_feature_' + str(i), dim, embedding_size, dtype=torch.int32))
    for i in range(dense_feature_num):
        feature_columns.append(
            DenseFeat(prefix + 'dense_feature_' + str(i), 1, dtype=torch.float32))
    for i, mode in enumerate(sequence_feature):
        dim = np.random.randint(1, 10)
        maxlen = np.random.randint(1, 10)
        feature_columns.append(
            VarLenSparseFeat(SparseFeat(prefix + 'sequence_' + mode, vocabulary_size=dim, embedding_dim=embedding_size),
                             maxlen=maxlen, combiner=mode))

    for fc in feature_columns:
        if isinstance(fc, SparseFeat):
            model_input[fc.name] = np.random.randint(0, fc.vocabulary_size, sample_size)
        elif isinstance(fc, DenseFeat):
            model_input[fc.name] = np.random.random(sample_size)
        else:
            s_input, s_len_input = gen_sequence(
                fc.vocabulary_size, fc.maxlen, sample_size)
            model_input[fc.name] = s_input
            if include_length:
                # name the length input after the feature itself instead of
                # reusing the stale index `i` left over from the loop above
                length_name = fc.name + '_seq_length'
                fc.length_name = length_name
                model_input[length_name] = s_len_input

    if classification:
        y = np.random.randint(0, 2, sample_size)
    else:
        y = np.random.random(sample_size)

    return model_input, y, feature_columns
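A hedged usage sketch for these generators, assuming deepctr_torch's DeepFM (the hyperparameters are illustrative, not taken from the original tests):

x, y, feature_columns = get_test_data(sample_size=100, sparse_feature_num=2, dense_feature_num=2)
model = DeepFM(feature_columns, feature_columns, task='binary', device='cpu')
model.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy'])
model.fit(x, y, batch_size=32, epochs=1, verbose=1, validation_split=0.2)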
Example #3
    def get_feature_columns(self):
        '''
        Build and return the feature columns.
        '''
        file_name = "{stage}_{action}_{day}_concate_sample.csv".format(stage=self.stage, action=self.action,
                                                                       day=STAGE_END_DAY[self.stage])
        stage_dir = os.path.join(FLAGS.root_path, self.stage, file_name)
        self.df = pd.read_csv(stage_dir)
        sparse_features = ["userid", "feedid", "authorid", "bgm_singer_id", "bgm_song_id"]
        self.df[sparse_features] = self.df[sparse_features].fillna('-1', )
        for feat in sparse_features:
            lbe = LabelEncoder()
            self.df[feat] = lbe.fit_transform(self.df[feat])
        # mms = MinMaxScaler(feature_range=(0, 1))
        # data[dense_features] = mms.fit_transform(data[dense_features])

        # df[dense_features] = df[dense_features].fillna(0, )
        linear_feature_columns = list()
        dnn_feature_columns = [SparseFeat(feat, self.df[feat].nunique(), FLAGS.embed_dim, dtype=str) for feat in sparse_features]

        video_seconds = DenseFeat(name='videoplayseconds')
        device = DenseFeat(name='device')
        linear_feature_columns.append(video_seconds)
        linear_feature_columns.append(device)
        # behavior statistics features
        for b in FEA_COLUMN_LIST:
            feed_b = DenseFeat(b + "sum")
            linear_feature_columns.append(feed_b)
            user_b = DenseFeat(b + "sum_user")
            linear_feature_columns.append(user_b)
        return dnn_feature_columns, linear_feature_columns
Example #4
def data_preprocess(csv_file):
    data = pd.read_csv(csv_file)
    sparse_features = ["movie_id", "gender", "age"]
    movie_genres = [
        'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy', 'Crime',
        'Documentary', 'Drama', 'Fantasy', 'Film_Noir', 'Horror', 'Musical',
        'Mystery', 'Romance', 'Sci_Fi', 'Thriller', 'War', 'Western'
    ]
    target = ['rating']

    # 1.Label Encoding for sparse features, and do simple Transformation for dense features
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])

    sparse_features.extend(movie_genres)
    # 2.count #unique features for each sparse field
    fixlen_feature_columns = [
        SparseFeat(feat, data[feat].nunique()) for feat in sparse_features
    ]

    # The library's own data type looks like:
    # SparseFeat(name='movie_id', vocabulary_size=187, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='movie_id', group_name='default_group')
    linear_feature_columns = fixlen_feature_columns
    dnn_feature_columns = fixlen_feature_columns
    feature_names = get_feature_names(
        linear_feature_columns + dnn_feature_columns)  # movie_id, gender, age.

    test_model_input = {name: data[name]
                        for name in feature_names
                        }  # dict of movie_id, gender, age value

    return test_model_input, linear_feature_columns, dnn_feature_columns
Example #5
def data_preprocess(csv_file):
    data = pd.read_csv(csv_file)
    sparse_features = ["movie_id", "gender", "age"]
    target = ['rating']

    # 1.Label Encoding for sparse features, and do simple Transformation for dense features
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
    # 2.count #unique features for each sparse field
    fixlen_feature_columns = [
        SparseFeat(feat, data[feat].nunique()) for feat in sparse_features
    ]

    # The library's own data type looks like:
    # SparseFeat(name='movie_id', vocabulary_size=187, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='movie_id', group_name='default_group')
    linear_feature_columns = fixlen_feature_columns
    dnn_feature_columns = fixlen_feature_columns
    feature_names = get_feature_names(
        linear_feature_columns + dnn_feature_columns)  # movie_id, gender, age.

    test_model_input = {name: data[name]
                        for name in feature_names
                        }  # dict of movie_id, gender, age value

    return test_model_input, linear_feature_columns, dnn_feature_columns
Example #6
    def fit_test(self, train_X, train_Y, val_X, val_Y, test_X, test_Y,
                 cat_cols):
        sparse_features = cat_cols
        dense_features = [
            idx for idx in range(train_X.shape[1]) if idx not in cat_cols
        ]
        sparse_feature_columns = [
            SparseFeat(str(feat),
                       vocabulary_size=len(set(train_X[:, feat])) + 1,
                       embedding_dim=4)
            for feat in sparse_features
        ]
        dense_feature_columns = [
            DenseFeat(str(feat), 1) for feat in dense_features
        ]
        dnn_feature_columns = sparse_feature_columns + dense_feature_columns
        linear_feature_columns = sparse_feature_columns + dense_feature_columns
        feature_names = get_feature_names(linear_feature_columns +
                                          dnn_feature_columns)
        train_model_input = {
            name: train_X[:, int(name)]
            for name in feature_names
        }
        val_model_input = {name: val_X[:, int(name)] for name in feature_names}
        test_model_input = {
            name: test_X[:, int(name)]
            for name in feature_names
        }
        self.device = 'cpu'  # default so self.device exists even without CUDA
        use_cuda = True
        if use_cuda and torch.cuda.is_available():
            print('cuda ready...')
            self.device = 'cuda:0'
        self.model = xDeepFM(linear_feature_columns,
                             dnn_feature_columns,
                             task='binary',
                             device=self.device)
        self.model.compile(
            Adam(self.model.parameters(), 0.0001),
            "binary_crossentropy",
            metrics=['binary_crossentropy'],
        )
        es = EarlyStopping(monitor='val_binary_crossentropy',
                           min_delta=0,
                           verbose=1,
                           patience=30,
                           mode='min')
        lbe = LabelEncoder()
        self.model.fit(train_model_input,
                       lbe.fit_transform(train_Y),
                       batch_size=512,
                       epochs=21,
                       verbose=2,
                       validation_data=(val_model_input, lbe.transform(val_Y)),
                       # actually pass the callback so early stopping runs
                       # (assumes deepctr-torch >= 0.2.2, which accepts callbacks)
                       callbacks=[es])
        pred_ans = self.model.predict(test_model_input, batch_size=256)
        print(f'{log_loss(test_Y, pred_ans):.5f}')
Example #7
    def build_model(
        self,
        embedding_dim=4,
        task='binary',
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy'],
        device='cpu',
    ):
        fixlen_feature_columns = [
            SparseFeat(
                feat,
                vocabulary_size=self.vocabulary_size_dict[feat],
                embedding_dim=embedding_dim,
            ) for feat in self.sparse_features
        ]

        if self.variable_length_features:
            varlen_feature_columns = [
                VarLenSparseFeat(
                    SparseFeat(
                        feat,
                        vocabulary_size=self.vocabulary_size_dict[feat],
                        embedding_dim=embedding_dim,
                    ),
                    maxlen=self.variable_length_features_max_len[feat],
                    combiner='mean',
                ) for feat in self.variable_length_features
            ]
        else:
            varlen_feature_columns = []

        linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
        dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns

        model = DeepFM(linear_feature_columns,
                       dnn_feature_columns,
                       task=task,
                       device=device)
        model.compile(optimizer, loss, metrics)
        return model
Example #8
def get_train_fixlen_feature_columns(csv_file):
    data = pd.read_csv(csv_file)
    sparse_features = ["movie_id", "gender", "age"]
    movie_genres = [
        'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy', 'Crime',
        'Documentary', 'Drama', 'Fantasy', 'Film_Noir', 'Horror', 'Musical',
        'Mystery', 'Romance', 'Sci_Fi', 'Thriller', 'War', 'Western'
    ]
    sparse_features.extend(movie_genres)
    fixlen_feature_columns = [
        SparseFeat(feat, data[feat].nunique()) for feat in sparse_features
    ]
    return fixlen_feature_columns
Example #9
def get_xy_fd(hash_flag=False):
    feature_columns = [
        SparseFeat('user', 4, embedding_dim=4, use_hash=hash_flag),
        SparseFeat('gender', 2, embedding_dim=4, use_hash=hash_flag),
        SparseFeat('item_id', 3 + 1, embedding_dim=8, use_hash=hash_flag),
        SparseFeat('cate_id', 2 + 1, embedding_dim=4, use_hash=hash_flag),
        DenseFeat('pay_score', 1)
    ]

    feature_columns += [
        VarLenSparseFeat(SparseFeat('hist_item_id',
                                    vocabulary_size=3 + 1,
                                    embedding_dim=8,
                                    embedding_name='item_id'),
                         maxlen=4,
                         length_name="seq_length"),
        VarLenSparseFeat(SparseFeat('hist_cate_id',
                                    vocabulary_size=2 + 1,
                                    embedding_dim=4,
                                    embedding_name='cate_id'),
                         maxlen=4,
                         length_name="seq_length")
    ]

    behavior_feature_list = ["item_id", "cate_id"]
    uid = np.array([0, 1, 2, 3])
    gender = np.array([0, 1, 0, 1])
    item_id = np.array([1, 2, 3, 2])  # 0 is mask value
    cate_id = np.array([1, 2, 1, 2])  # 0 is mask value
    score = np.array([0.1, 0.2, 0.3, 0.2])

    hist_item_id = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0],
                             [1, 2, 0, 0]])
    hist_cate_id = np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0],
                             [1, 2, 0, 0]])

    behavior_length = np.array([3, 3, 2, 2])

    feature_dict = {
        'user': uid,
        'gender': gender,
        'item_id': item_id,
        'cate_id': cate_id,
        'hist_item_id': hist_item_id,
        'hist_cate_id': hist_cate_id,
        'pay_score': score,
        "seq_length": behavior_length
    }

    x = {
        name: feature_dict[name]
        for name in get_feature_names(feature_columns)
    }
    y = np.array([1, 0, 1, 0])
    return x, y, feature_columns, behavior_feature_list
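These fixtures are typically fed straight into an attention-based model. A hedged usage sketch, assuming deepctr_torch's DIN (the optimizer and epoch count are illustrative):

x, y, feature_columns, behavior_feature_list = get_xy_fd()
model = DIN(feature_columns, behavior_feature_list, device='cpu')
model.compile('adagrad', 'binary_crossentropy', metrics=['binary_crossentropy'])
history = model.fit(x, y, batch_size=2, epochs=10, verbose=1)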
Example #10
def get_xy_fd():
    feature_columns = [
        SparseFeat('user', 3, embedding_dim=8),
        SparseFeat('gender', 2, embedding_dim=8),
        SparseFeat('item', 3 + 1, embedding_dim=8),
        SparseFeat('item_gender', 2 + 1, embedding_dim=8),
        DenseFeat('score', 1)
    ]

    feature_columns += [
        VarLenSparseFeat(SparseFeat('hist_item', 3 + 1, embedding_dim=8),
                         4,
                         length_name="seq_length"),
        VarLenSparseFeat(SparseFeat('hist_item_gender', 2 + 1,
                                    embedding_dim=8),
                         4,
                         length_name="seq_length")
    ]
    behavior_feature_list = ["item", "item_gender"]
    uid = np.array([0, 1, 2])
    ugender = np.array([0, 1, 0])
    iid = np.array([1, 2, 3])  # 0 is mask value
    igender = np.array([1, 2, 1])  # 0 is mask value
    score = np.array([0.1, 0.2, 0.3])

    hist_iid = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
    hist_igender = np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]])
    behavior_length = np.array([3, 3, 2])

    feature_dict = {
        'user': uid,
        'gender': ugender,
        'item': iid,
        'item_gender': igender,
        'hist_item': hist_iid,
        'hist_item_gender': hist_igender,
        'score': score,
        "seq_length": behavior_length
    }
    x = {
        name: feature_dict[name]
        for name in get_feature_names(feature_columns)
    }
    y = np.array([1, 0, 1])

    return x, y, feature_columns, behavior_feature_list
Example #11
def task(action):
    print('-----------action-----------', action)
    USE_FEAT = [action] + SELECT_FRTS
    train = pd.read_csv(ROOT_PATH + f'/train_data_for_{action}.csv')[USE_FEAT]
    train = train.sample(frac=1, random_state=42).reset_index(drop=True)
    print("posi prop:")
    print(sum((train[action] == 1) * 1) / train.shape[0])
    test = pd.read_csv(ROOT_PATH + '/test_data.csv')[SELECT_FRTS]
    target = [action]
    test[target[0]] = 0
    test = test[USE_FEAT]
    data = pd.concat((train, test)).reset_index(drop=True)
    print(train.shape, test.shape, data.shape)
    dense_features = DENSE_FEATURE
    sparse_features = [
        i for i in USE_FEAT if i not in dense_features and i not in target
    ]

    data[sparse_features] = data[sparse_features].fillna(0)
    data[dense_features] = data[dense_features].fillna(0)

    # 1.Label Encoding for sparse features, and do simple Transformation for dense features
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
    mms = MinMaxScaler(feature_range=(0, 1))
    data[dense_features] = mms.fit_transform(data[dense_features])

    # 2.count #unique features for each sparse field, and record dense feature field name
    fixlen_feature_columns = [
        SparseFeat(feat, data[feat].nunique()) for feat in sparse_features
    ] + [DenseFeat(
        feat,
        1,
    ) for feat in dense_features]
    #
    dnn_feature_columns = fixlen_feature_columns
    #linear_feature_columns = [SparseFeat(feat, data[feat].nunique())
    #                         for feat in sparse_features]
    linear_feature_columns = fixlen_feature_columns

    feature_names = get_feature_names(linear_feature_columns +
                                      dnn_feature_columns)

    # 3.generate input data for model
    train, test = data.iloc[:train.shape[0]].reset_index(
        drop=True), data.iloc[train.shape[0]:].reset_index(drop=True)
    train_model_input = {name: train[name] for name in feature_names}
    test_model_input = {name: test[name] for name in feature_names}
    #-------
    eval_ratio = 0.  # 0 means no hold-out rows, so eval_df below is empty
    eval_df = train[int((1 - eval_ratio) *
                        train.shape[0]):].reset_index(drop=True)
    userid_list = eval_df['userid'].astype(str).tolist()
    print('val len:', len(userid_list))

    # 4.Define Model, train, predict and evaluate
    device = 'cpu'
    use_cuda = True
    if use_cuda and torch.cuda.is_available():
        print('cuda ready...')
        device = 'cuda:0'

    model = MyDeepFM(linear_feature_columns=linear_feature_columns,
                     dnn_feature_columns=dnn_feature_columns,
                     use_fm=True,
                     dnn_hidden_units=(256, 128),
                     l2_reg_linear=1e-1,
                     l2_reg_embedding=0.00001,
                     l2_reg_dnn=0,
                     init_std=0.0001,
                     seed=1024,
                     dnn_dropout=0.,
                     dnn_activation='relu',
                     dnn_use_bn=False,
                     task='binary',
                     device=device)

    model.compile("adagrad", "binary_crossentropy", metrics=["auc"])

    history = model.fit(train_model_input,
                        train[target].values,
                        batch_size=1024,
                        epochs=NUM_EPOCH_DICT[action],
                        verbose=1,
                        validation_split=eval_ratio,
                        userid_list=userid_list)
    pred_ans = model.predict(test_model_input, 128)
    #submit[action] = pred_ans
    torch.cuda.empty_cache()
    return pred_ans
Example #12
def train_recommend_movies(csv_file, DEVICE):
    """
        Description:
            Train recommend system on: 
                Model: "xDeepFM", 
                Target: "rating",
                Input features: ["movie_id", "gender", "age"],
                Save model to: "save_model/xDeepFM_MSE{}.h5"

        Parameters: 
            csv_file: "path to *.csv"
            DEVICE: "cuda:0"
    """
    data = pd.read_csv(csv_file)
    sparse_features = ["movie_id", "gender", "age"]
    target = ['rating']

    # 1.Label Encoding for sparse features, and do simple Transformation for dense features
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])

    # 2.count #unique features for each sparse field
    fixlen_feature_columns = [
        SparseFeat(feat, data[feat].nunique()) for feat in sparse_features
    ]

    # The library's own data type looks like:
    # SparseFeat(name='movie_id', vocabulary_size=187, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='movie_id', group_name='default_group')
    linear_feature_columns = fixlen_feature_columns
    dnn_feature_columns = fixlen_feature_columns
    feature_names = get_feature_names(
        linear_feature_columns + dnn_feature_columns
    )  # movie_id, gender, age.

    # 3.generate input data for model
    train, test = train_test_split(data, test_size=0.2)
    train_model_input = {name: train[name] for name in feature_names}
    test_model_input = {
        name: test[name]
        for name in feature_names
    }  # dict of movie_id, gender, age values

    # 4.Define Model, train, predict and evaluate
    device = 'cpu'
    use_cuda = True
    if use_cuda and torch.cuda.is_available():
        print('cuda ready...')
        device = DEVICE

    # model = DeepFM(linear_feature_columns, dnn_feature_columns, task='regression', device=device)
    # model = FiBiNET(linear_feature_columns, dnn_feature_columns, task='regression', device=device)
    model = xDeepFM(linear_feature_columns,
                    dnn_feature_columns,
                    task='regression',
                    device=device)
    model.compile(
        "adam",
        "mse",
        metrics=['mse'],
    )

    history = model.fit(
        train_model_input,
        train[target].values,
        batch_size=256,
        epochs=10,
        verbose=2,
        validation_split=0.2,
    )
    pred_ans = model.predict(test_model_input, batch_size=256)

    print("test MSE",
          round(mean_squared_error(test[target].values, pred_ans), 4))

    torch.save(
        model.state_dict(), 'save_model/xDeepFM_MSE{}.h5'.format(
            round(mean_squared_error(test[target].values, pred_ans), 4)))
Example #13
data[sparse_features] = data[sparse_features].fillna('-1', )
data[dense_features] = data[dense_features].fillna(0, )
target = ['label']

# LabelEncoder maps each sparse feature into [0, n_unique), e.g. C11 into [0, 141] and C12 into [0, 172]
for feat in sparse_features:
    lbe = LabelEncoder()
    data[feat] = lbe.fit_transform(data[feat])

# min-max normalize the dense features
mms = MinMaxScaler(feature_range=(0, 1))
data[dense_features] = mms.fit_transform(data[dense_features])
print(data.head())

fixlen_feature_columns = [
    SparseFeat(feat, vocabulary_size=data[feat].nunique(), embedding_dim=4)
    for i, feat in enumerate(sparse_features)
] + [DenseFeat(
    feat,
    1,
) for feat in dense_features]

dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns

feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)
train, test = train_test_split(data, test_size=0.2)
train_model_input = {name: train[name] for name in feature_names}

test_model_input = {name: test[name] for name in feature_names}
Example #14
    key2index = {}
    genres_list = list(map(split, data['genres'].values))
    genres_length = np.array(list(map(len, genres_list)))
    max_len = max(genres_length)
    # Notice : padding=`post`
    genres_list = pad_sequences(
        genres_list,
        maxlen=max_len,
        padding='post',
    )

    # 2.count #unique features for each sparse field and generate feature config for sequence feature

    fixlen_feature_columns = [
        SparseFeat(feat, data[feat].nunique(), embedding_dim=4)
        for feat in sparse_features
    ]

    varlen_feature_columns = [
        VarLenSparseFeat(SparseFeat('genres',
                                    vocabulary_size=len(key2index) + 1,
                                    embedding_dim=4),
                         maxlen=max_len,
                         combiner='mean',
                         weight_name=None)
    ]  # Notice : value 0 is for padding for sequence input feature

    linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
    dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns
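This snippet and Example #20 below both call a split helper that neither shows. In the canonical deepctr multi-value MovieLens example it tokenizes the pipe-separated genre string and fills key2index as a side effect, reserving index 0 for padding; a sketch:

    def split(x):
        key_ans = x.split('|')
        for key in key_ans:
            if key not in key2index:
                # index 0 is reserved for sequence padding, so start at 1
                key2index[key] = len(key2index) + 1
        return list(map(lambda k: key2index[k], key_ans))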
Example #15
    dense_features = ['I' + str(i) for i in range(1, 14)]

    data[sparse_features] = data[sparse_features].fillna('-1', )
    data[dense_features] = data[dense_features].fillna(0, )
    target = ['label']

    # 1.Label Encoding for sparse features, and do simple Transformation for dense features
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
    mms = MinMaxScaler(feature_range=(0, 1))
    data[dense_features] = mms.fit_transform(data[dense_features])

    # 2.count #unique features for each sparse field, and record dense feature field name

    fixlen_feature_columns = [
        SparseFeat(feat, data[feat].nunique()) for feat in sparse_features
    ] + [DenseFeat(feat, 1) for feat in dense_features]



    dnn_feature_columns = fixlen_feature_columns
    linear_feature_columns = fixlen_feature_columns

    # get_fixlen_feature_names is the pre-0.2 deepctr-torch API; newer releases expose get_feature_names instead
    fixlen_feature_names = get_fixlen_feature_names(linear_feature_columns + dnn_feature_columns)

    # 3.generate input data for model

    train, test = train_test_split(data, test_size=0.2)
    train_model_input = [train[name] for name in fixlen_feature_names]
    test_model_input = [test[name] for name in fixlen_feature_names]
Example #16
INNER_DIM = args.inner_dim
if INNER_DIM <= 0:
    INNER_DIM = None
BATCH = args.batch
OUTER_DIM = args.embd_dim
#data = pd.read_csv('../../preprocessed/criteo_train.csv')
with open('../preprocessed/preprocessed_avazu.pkl', 'rb') as f:
    data = pickle.load(f)
header_names = ['id', 'click', 'hour', 'C1', 'banner_pos', 'site_id', 'site_domain',
       'site_category', 'app_id', 'app_domain', 'app_category', 'device_id',
       'device_ip', 'device_model', 'device_type', 'device_conn_type', 'C14',
       'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21']
sparse_features = header_names[3:]
dense_features = ['hour']
target = ['click']
# 2.count #unique features for each sparse field, and record dense feature field name
fixlen_feature_columns = [SparseFeat(feat, data[feat].nunique(),embedding_dim=OUTER_DIM) for feat in sparse_features] + [DenseFeat(feat, 1, ) for feat in dense_features]
dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)
# 3.generate input data for model
train, test = train_test_split(data, test_size=0.1,random_state=42)
train_model_input = {name: train[name] for name in feature_names}
test_model_input = {name: test[name] for name in feature_names}
train_model_labels = train[target].values
test_model_labels = test[target].values
# memory optimization
import gc
del data
data = None
gc.collect()
# 4.Define Model, train, predict and evaluate
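The snippet stops at the step-4 comment. A hedged completion using the variables prepared above, assuming deepctr_torch's DeepFM (the model choice and hyperparameters are illustrative):

model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary', device='cpu')
model.compile('adagrad', 'binary_crossentropy', metrics=['binary_crossentropy', 'auc'])
history = model.fit(train_model_input, train_model_labels,
                    batch_size=BATCH, epochs=1, verbose=2, validation_split=0.1)
pred_ans = model.predict(test_model_input, batch_size=BATCH)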
Example #17
def train_recommend_movies(csv_file, DEVICE):
    """
        Description:
            Train recommend system on: 
                Model: "xDeepFM", 
                Target: "rating",
                Input features: ["movie_id", "gender", "age"],
                Save model to: "save_model/xDeepFM_MSE{}.h5"

        Parameters: 
            csv_file: "path to *.csv"
            DEVICE: "cuda:0"
    """
    data = pd.read_csv(csv_file)
    # sparse_features = ["movie_id", "user_id",
    #                    "gender", "age", "occupation", "zip"]
    sparse_features = ["movie_id", "gender", "age"]
    movie_genres = [
        'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy', 'Crime',
        'Documentary', 'Drama', 'Fantasy', 'Film_Noir', 'Horror', 'Musical',
        'Mystery', 'Romance', 'Sci_Fi', 'Thriller', 'War', 'Western'
    ]
    target = ['rating']

    # 1.Label Encoding for sparse features, and do simple Transformation for dense features
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])

    sparse_features.extend(movie_genres)
    # 2.count #unique features for each sparse field
    fixlen_feature_columns = [
        SparseFeat(feat, data[feat].nunique()) for feat in sparse_features
    ]

    # The library's own data type looks like:
    # SparseFeat(name='movie_id', vocabulary_size=187, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='movie_id', group_name='default_group')
    linear_feature_columns = fixlen_feature_columns
    dnn_feature_columns = fixlen_feature_columns
    feature_names = get_feature_names(
        linear_feature_columns + dnn_feature_columns
    )  # movie_id, gender, age, and the genre columns.

    # 3.generate input data for model
    train, test = train_test_split(data, test_size=0.2)
    train_model_input = {name: train[name] for name in feature_names}
    test_model_input = {
        name: test[name]
        for name in feature_names
    }  # dict of movie_id, gender, age and genre values

    # 4.Define Model, train, predict and evaluate
    # model = DeepFM(linear_feature_columns, dnn_feature_columns, task='regression', device=device)
    model = FiBiNET(linear_feature_columns,
                    dnn_feature_columns,
                    task='regression',
                    device=DEVICE)
    # model = xDeepFM(linear_feature_columns, dnn_feature_columns, task='regression', device=device)
    model.compile(
        "adam",
        "mse",
        metrics=['mse'],
    )

    history = model.fit(
        train_model_input,
        train[target].values,
        batch_size=256,
        epochs=10,
        verbose=2,
        validation_split=0.2,
    )
    pred_ans = model.predict(test_model_input, batch_size=256)

    print("test MSE",
          round(mean_squared_error(test[target].values, pred_ans), 4))
    print("test MAE",
          round(mean_absolute_error(test[target].values, pred_ans), 4))

    # torch.save(model.state_dict(), './recommend_system/save_model/xDeepFM_MSE{}.h5' .format(round(mean_squared_error(test[target].values, pred_ans), 4)))
    torch.save(
        model.state_dict(),
        './recommend_system/save_model/FiBiNET_MSE{}.h5'.format(
            round(mean_squared_error(test[target].values, pred_ans), 4)))
Example #18
    data[sparse_features] = data[sparse_features].fillna('-1', )
    data[dense_features] = data[dense_features].fillna(0, )
    target = ['Label']
    logging.debug("02: Preprocess done.")
    
    # 1.Label Encoding for sparse features, and do simple Transformation for dense features
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
    mms = MinMaxScaler(feature_range=(0, 1))
    data[dense_features] = mms.fit_transform(data[dense_features])
    logging.debug("03: LabelEncoding done.")

    # 2.count #unique features for each sparse field, and record dense feature field name
    fixlen_feature_columns = [SparseFeat(feat, data[feat].nunique()) for feat in sparse_features] + \
                             [DenseFeat(feat, 1, ) for feat in dense_features]

    dnn_feature_columns = fixlen_feature_columns
    linear_feature_columns = fixlen_feature_columns

    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)
    logging.debug("04: Fixlen done.")

    # 3.generate input data for model
    split_num = int(len(data) * 0.8)
    train = data[:split_num]
    test = data[split_num:]

    train_model_input = {name: train[name] for name in feature_names}
    test_model_input = {name: test[name] for name in feature_names}
Example #19
BATCH = args.batch
OUTER_DIM = args.embd_dim
#data = pd.read_csv('../../preprocessed/criteo_train.csv')
with open('../preprocessed/preprocessed_avazu.pkl', 'rb') as f:
    data = pickle.load(f)
header_names = [
    'id', 'click', 'hour', 'C1', 'banner_pos', 'site_id', 'site_domain',
    'site_category', 'app_id', 'app_domain', 'app_category', 'device_id',
    'device_ip', 'device_model', 'device_type', 'device_conn_type', 'C14',
    'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21'
]
sparse_features = header_names[3:]
dense_features = ['hour']
target = ['click']
# 2.count #unique features for each sparse field, and record dense feature field name
fixlen_feature_columns = [
    SparseFeat(feat, data[feat].nunique(), embedding_dim=OUTER_DIM)
    for feat in sparse_features
] + [DenseFeat(
    feat,
    1,
) for feat in dense_features]
dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)
# 3.generate input data for model
train, test = train_test_split(data, test_size=0.1, random_state=42)
train_model_input = {name: train[name] for name in feature_names}
test_model_input = {name: test[name] for name in feature_names}
train_model_labels = train[target].values
test_model_labels = test[target].values
# memory optimization
Example #20
    # 1.Label Encoding for sparse features, and process sequence features
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
    # preprocess the sequence feature

    key2index = {}
    genres_list = list(map(split, data['Genres'].values))
    genres_length = np.array(list(map(len, genres_list)))
    max_len = max(genres_length)
    # Notice : padding=`post`
    genres_list = pad_sequences(genres_list, maxlen=max_len, padding='post', )

    # 2.count #unique features for each sparse field and generate feature config for sequence feature

    fixlen_feature_columns = [SparseFeat(feat, data[feat].nunique(), embedding_dim=4)
                              for feat in sparse_features]

    varlen_feature_columns = [VarLenSparseFeat(SparseFeat('Genres', vocabulary_size=len(
        key2index) + 1, embedding_dim=4), maxlen=max_len, combiner='mean')]  # Notice : value 0 is for padding for sequence input feature

    linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
    dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns

    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

    # 3.generate input data for model
    model_input = {name: data[name] for name in sparse_features}
    model_input["Genres"] = genres_list

    # 4.Define Model, compile and train
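A hedged completion for step 4, assuming deepctr_torch's DeepFM and a 'rating' target as in the MovieLens multi-value example (both are assumptions, since the snippet omits them):

    model = DeepFM(linear_feature_columns, dnn_feature_columns, task='regression', device='cpu')
    model.compile('adam', 'mse', metrics=['mse'])
    history = model.fit(model_input, data['rating'].values,
                        batch_size=256, epochs=10, verbose=2, validation_split=0.2)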
Example #21
df = df.fillna(-1)

for i in tqdm(list(dense_feature)):  # iterate over a copy: the loop body mutates dense_feature
    try:
        df[i] = MinMaxScaler().fit_transform(df[i].values.reshape(-1, 1))
    except Exception:
        feature_name.remove(i)
        dense_feature.remove(i)
        print("Remove", i)

train = df[df['pt_d'].isin([1, 2, 3, 4, 5, 6])]
valid = df[df['pt_d'] == 7]
test = df[df['pt_d'] == 8]

fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=df[feat].nunique(), embedding_dim=8) for feat in sparse_feature] +\
                         [DenseFeat(feat, 1, ) for feat in dense_feature]

dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns

feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

X_train = model_feed_dict(train[feature_name])
X_valid = model_feed_dict(valid[feature_name])
X_test = model_feed_dict(test[feature_name])

Y = train['label'].values
valid_Y = valid['label'].values

torch.cuda.empty_cache()
Example #22
        sys.stdout = out_handle
    else:
        out_handle = sys.stdout

    data = load_data_in_df(args, config)

    sparse_features = ['C' + str(i) for i in range(1, 27)]
    dense_features = ['I' + str(i) for i in range(1, 14)]
    target = ['label']

    # 2.count #unique features for each sparse field, and record dense feature field name
    embedding_params = config["embedding"]
    embedding_dim = embedding_params["size"]
    if embedding_params["etype"] == "full":
        fixlen_feature_columns = [
            SparseFeat(feat, data[feat].nunique(), embedding_dim)
            for feat in sparse_features
        ] + [DenseFeat(
            feat,
            1,
        ) for feat in dense_features]
    elif embedding_params["etype"] == "rma":
        print("FIGURE OUT THE INITIALIZATION")
        hashed_weight = nn.Parameter(
            torch.from_numpy(
                np.random.uniform(
                    low=-np.sqrt(1 / embedding_dim),
                    high=np.sqrt(1 / embedding_dim),
                    size=((embedding_params["rma"]["memory"], ))).astype(
                        np.float32)))
        fixlen_feature_columns = [
Example #23
    data[sparse_features] = data[sparse_features].fillna('-1', )
    data[dense_features] = data[dense_features].fillna(0, )
    target = ['label']

    # 1.Label Encoding for sparse features, and do simple Transformation for dense features
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
    mms = MinMaxScaler(feature_range=(0, 1))
    data[dense_features] = mms.fit_transform(data[dense_features])

    # 2.count #unique features for each sparse field, and record dense feature field name

    fixlen_feature_columns = [
        SparseFeat(feat, data[feat].nunique()) for feat in sparse_features
    ] + [DenseFeat(
        feat,
        1,
    ) for feat in dense_features]
    dnn_feature_columns = fixlen_feature_columns
    linear_feature_columns = fixlen_feature_columns

    feature_names = get_feature_names(linear_feature_columns +
                                      dnn_feature_columns)

    # 3.generate input data for model

    train, test = train_test_split(data,
                                   test_size=0.2,
                                   random_state=2020,
Example #24
    return pd.concat([x, y], axis=1)


if __name__ == "__main__":
    knn_metric = "dot"
    pkl_file = open('../data/features_num.pkl', 'rb')
    features_num = pickle.load(pkl_file)

    dense_features = ['I' + str(i) for i in range(1, 14)]
    sparse_features = ['C' + str(i) for i in range(1, 27)]
    features = dense_features + sparse_features

    target = ['label']

    fixlen_feature_columns = [
        SparseFeat(feat, features_num[feat] + 1) for feat in sparse_features
    ] + [DenseFeat(
        feat,
        1,
    ) for feat in dense_features]
    dnn_feature_columns = fixlen_feature_columns
    linear_feature_columns = fixlen_feature_columns

    feature_names = get_feature_names(linear_feature_columns +
                                      dnn_feature_columns)

    device = 'cpu'
    use_cuda = True
    if use_cuda and torch.cuda.is_available():
        print('cuda ready...')
        device = 'cuda:1'
Example #25
    target = ['click']

    # 1.Label Encoding for sparse features, and do simple Transformation for dense features

    mms = MinMaxScaler(feature_range=(0, 1))
    if dense_features != []:
        data[dense_features] = mms.fit_transform(data[dense_features])

    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])

    # 2.count #unique features for each sparse field, and record dense feature field name

    fixlen_feature_columns = [
        SparseFeat(
            feat, vocabulary_size=data[feat].nunique() + 10, embedding_dim=4)
        for i, feat in enumerate(sparse_features)
    ] + [DenseFeat(
        feat,
        1,
    ) for feat in dense_features]

    dnn_feature_columns = fixlen_feature_columns
    linear_feature_columns = fixlen_feature_columns

    feature_names = get_feature_names(linear_feature_columns +
                                      dnn_feature_columns)

    # 3.generate input data for model

    train, test = train_test_split(data, test_size=0.1)
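The snippet truncates right after the split. A hedged continuation for step 3, following the model-input pattern used throughout these examples:

    train_model_input = {name: train[name] for name in feature_names}
    test_model_input = {name: test[name] for name in feature_names}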
Example #26
                                   use_threads=True)

sparse_features = [
    "product_id",
    "user_id",
    "gender",
    "age",
    "skin_type",
    "idThirdCategory",
]
target = ['rating']

# 2.count #unique features for each sparse field and generate feature config for sequence feature

fixlen_feature_columns = [
    SparseFeat(feat, glowpick[feat].nunique(), embedding_dim=4)
    for feat in sparse_features
]

linear_feature_columns = fixlen_feature_columns
dnn_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

# load the linear_feature_columns and dnn_feature_columns lists
with open(FILE_PATH + 'linear_feature_columns_list.pickle', 'rb') as fp:
    linear_feature_columns = pickle.load(fp)

with open(FILE_PATH + 'dnn_feature_columns_list.pickle', 'rb') as fp:
    dnn_feature_columns = pickle.load(fp)

feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)
Example #27
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from deepctr_torch.models import *
from deepctr_torch.inputs import SparseFeat, DenseFeat, VarLenSparseFeat, get_feature_names
import torch
import torch.nn.functional as F

feature_columns = [
    SparseFeat('user', 3),
    SparseFeat('gender', 2),
    SparseFeat('item', 3 + 1),
    SparseFeat('item_gender', 2 + 1),
    DenseFeat('score', 1)
]
# note: this uses the pre-0.2 deepctr-torch VarLenSparseFeat signature (name, vocabulary_size, ...)
feature_columns += [
    VarLenSparseFeat('hist_item', 3 + 1, maxlen=4, embedding_name='item'),
    VarLenSparseFeat('hist_item_gender',
                     2 + 1,  # must match the 'item_gender' vocabulary it shares an embedding with
                     maxlen=4,
                     embedding_name='item_gender')
]

behavior_feature_list = ["item", "item_gender"]
uid = np.array([0, 1, 2])
ugender = np.array([0, 1, 0])
iid = np.array([1, 2, 3])  # 0 is mask value
igender = np.array([1, 2, 1])  # 0 is mask value