def data_preprocess(csv_file):
    """Load a ratings CSV and build DeepCTR-style model inputs.

    Args:
        csv_file: Path to a CSV containing at least the columns
            ``movie_id``, ``gender``, ``age`` and ``rating``.

    Returns:
        Tuple of ``(test_model_input, linear_feature_columns,
        dnn_feature_columns)``; ``test_model_input`` maps each sparse
        feature name to its label-encoded column.
    """
    df = pd.read_csv(csv_file)
    sparse_features = ["movie_id", "gender", "age"]
    target = ['rating']  # kept for parity with the training pipeline

    # 1. Label-encode every sparse feature in place.
    for column in sparse_features:
        df[column] = LabelEncoder().fit_transform(df[column])

    # 2. One SparseFeat per sparse field, sized by its cardinality.
    # Example instance:
    # SparseFeat(name='movie_id', vocabulary_size=187, embedding_dim=4,
    #            use_hash=False, dtype='int32', embedding_name='movie_id',
    #            group_name='default_group')
    fixlen_feature_columns = [
        SparseFeat(column, df[column].nunique()) for column in sparse_features
    ]

    linear_feature_columns = fixlen_feature_columns
    dnn_feature_columns = fixlen_feature_columns

    # movie_id, gender, age
    feature_names = get_feature_names(
        linear_feature_columns + dnn_feature_columns)

    # Dict of feature name -> encoded pandas Series.
    test_model_input = {name: df[name] for name in feature_names}

    return test_model_input, linear_feature_columns, dnn_feature_columns
Example #2
0
def data_preprocess(csv_file):
    """Load a ratings CSV and build DeepCTR inference inputs using the
    base sparse features plus the movie-genre flag columns.

    Args:
        csv_file: Path to a CSV with ``movie_id``, ``gender``, ``age``,
            ``rating`` and one column per genre flag.

    Returns:
        Tuple of ``(test_model_input, linear_feature_columns,
        dnn_feature_columns)``.
    """
    df = pd.read_csv(csv_file)
    base_sparse = ["movie_id", "gender", "age"]
    genre_columns = [
        'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy', 'Crime',
        'Documentary', 'Drama', 'Fantasy', 'Film_Noir', 'Horror', 'Musical',
        'Mystery', 'Romance', 'Sci_Fi', 'Thriller', 'War', 'Western'
    ]
    target = ['rating']  # kept for parity with the training pipeline

    # 1. Only the base features are label-encoded; the genre columns are
    # used as-is.
    for column in base_sparse:
        df[column] = LabelEncoder().fit_transform(df[column])

    # 2. One SparseFeat per field (base + genres), sized by cardinality.
    # Example instance:
    # SparseFeat(name='movie_id', vocabulary_size=187, embedding_dim=4,
    #            use_hash=False, dtype='int32', embedding_name='movie_id',
    #            group_name='default_group')
    sparse_features = base_sparse + genre_columns
    fixlen_feature_columns = [
        SparseFeat(column, df[column].nunique()) for column in sparse_features
    ]

    linear_feature_columns = fixlen_feature_columns
    dnn_feature_columns = fixlen_feature_columns
    feature_names = get_feature_names(
        linear_feature_columns + dnn_feature_columns)

    # Dict of feature name -> encoded pandas Series.
    test_model_input = {name: df[name] for name in feature_names}

    return test_model_input, linear_feature_columns, dnn_feature_columns
Example #3
0
 def fit_test(self, train_X, train_Y, val_X, val_Y, test_X, test_Y,
              cat_cols):
     """Train an xDeepFM binary classifier and print the test log-loss.

     Args:
         train_X, val_X, test_X: 2-D arrays with one column per feature;
             columns listed in ``cat_cols`` are categorical, the rest are
             treated as dense.
         train_Y, val_Y, test_Y: binary targets for each split.
         cat_cols: indices of the categorical columns.
     """
     sparse_features = cat_cols
     dense_features = [
         idx for idx in range(train_X.shape[1]) if idx not in cat_cols
     ]
     # +1 reserves an extra embedding row for unseen/padding values.
     sparse_feature_columns = [
         SparseFeat(str(feat),
                    vocabulary_size=len(set(train_X[:, feat])) + 1,
                    embedding_dim=4)
         for feat in sparse_features  # was enumerate(); index was unused
     ]
     dense_feature_columns = [
         DenseFeat(
             str(feat),
             1,
         ) for feat in dense_features
     ]
     dnn_feature_columns = sparse_feature_columns + dense_feature_columns
     linear_feature_columns = sparse_feature_columns + dense_feature_columns
     feature_names = get_feature_names(linear_feature_columns +
                                       dnn_feature_columns)
     # Feature names are stringified column indices, so int(name) maps a
     # model input back to its source column.
     train_model_input = {
         name: train_X[:, int(name)]
         for name in feature_names
     }
     val_model_input = {name: val_X[:, int(name)] for name in feature_names}
     test_model_input = {
         name: test_X[:, int(name)]
         for name in feature_names
     }
     use_cuda = True
     if use_cuda and torch.cuda.is_available():
         print('cuda ready...')
         self.device = 'cuda:0'
     elif not hasattr(self, 'device'):
         # Fix: previously self.device was left unset on CPU-only hosts,
         # which raised AttributeError below unless the constructor had
         # already assigned it.
         self.device = 'cpu'
     self.model = xDeepFM(linear_feature_columns,
                          dnn_feature_columns,
                          task='binary',
                          device=self.device)
     self.model.compile(
         Adam(self.model.parameters(), 0.0001),
         "binary_crossentropy",
         metrics=['binary_crossentropy'],
     )
     # NOTE(review): `es` is never passed to fit(), so early stopping is
     # inactive — likely intended as callbacks=[es]. Confirm against the
     # surrounding framework before wiring it in.
     es = EarlyStopping(monitor='val_binary_crossentropy',
                        min_delta=0,
                        verbose=1,
                        patience=30,
                        mode='min')
     # Targets are label-encoded on train and reused on val; transform
     # raises if val_Y contains a label unseen in train_Y.
     lbe = LabelEncoder()
     self.model.fit(train_model_input,
                    lbe.fit_transform(train_Y),
                    batch_size=512,
                    epochs=21,
                    verbose=2,
                    validation_data=(val_model_input, lbe.transform(val_Y)))
     pred_ans = self.model.predict(test_model_input, batch_size=256)
     print(f'{log_loss(test_Y, pred_ans):.5f}')
Example #4
0
def get_xy_fd(hash_flag=False):
    """Build a toy (x, y) batch plus feature columns for a DIN/DIEN-style
    model with two behavior sequences.

    Args:
        hash_flag: Forwarded to each SparseFeat's ``use_hash``.

    Returns:
        x: dict of feature name -> np.ndarray model input.
        y: binary labels, shape (4,).
        feature_columns: sparse/dense/var-len feature definitions.
        behavior_feature_list: names of the target-attention features.
    """
    # Fixed-length features; item/cate vocabularies reserve index 0 as mask.
    feature_columns = [
        SparseFeat('user', 4, embedding_dim=4, use_hash=hash_flag),
        SparseFeat('gender', 2, embedding_dim=4, use_hash=hash_flag),
        SparseFeat('item_id', 3 + 1, embedding_dim=8, use_hash=hash_flag),
        SparseFeat('cate_id', 2 + 1, embedding_dim=4, use_hash=hash_flag),
        DenseFeat('pay_score', 1),
    ]

    # Variable-length history features share embeddings with their target
    # counterparts via embedding_name and share one sequence-length field.
    history_specs = [('hist_item_id', 3 + 1, 8, 'item_id'),
                     ('hist_cate_id', 2 + 1, 4, 'cate_id')]
    feature_columns += [
        VarLenSparseFeat(SparseFeat(name,
                                    vocabulary_size=size,
                                    embedding_dim=dim,
                                    embedding_name=shared),
                         maxlen=4,
                         length_name="seq_length")
        for name, size, dim, shared in history_specs
    ]

    behavior_feature_list = ["item_id", "cate_id"]

    feature_dict = {
        'user': np.array([0, 1, 2, 3]),
        'gender': np.array([0, 1, 0, 1]),
        'item_id': np.array([1, 2, 3, 2]),  # 0 is the mask value
        'cate_id': np.array([1, 2, 1, 2]),  # 0 is the mask value
        'hist_item_id': np.array([[1, 2, 3, 0], [1, 2, 3, 0],
                                  [1, 2, 0, 0], [1, 2, 0, 0]]),
        'hist_cate_id': np.array([[1, 1, 2, 0], [2, 1, 1, 0],
                                  [2, 1, 0, 0], [1, 2, 0, 0]]),
        'pay_score': np.array([0.1, 0.2, 0.3, 0.2]),
        "seq_length": np.array([3, 3, 2, 2]),  # true history lengths
    }

    x = {
        name: feature_dict[name]
        for name in get_feature_names(feature_columns)
    }
    y = np.array([1, 0, 1, 0])
    return x, y, feature_columns, behavior_feature_list
Example #5
0
 def __init__(self, stage, action):
     """
     Initialize the WideAndDeep wrapper for one task stage/action pair.

     :param stage: String. Including "online_train"/"offline_train"/"evaluate"/"submit"
     :param action: String. Including "read_comment"/"like"/"click_avatar"/"favorite"/"forward"/"comment"/"follow"
     """
     super(WideAndDeep, self).__init__()
     # One training epoch per action; adjust individual entries to tune.
     self.num_epochs_dict = {"read_comment": 1, "like": 1, "click_avatar": 1, "favorite": 1, "forward": 1,
                             "comment": 1, "follow": 1}
     self.estimator = None  # built lazily elsewhere, not in the constructor
     self.stage = stage
     self.action = action
     # NOTE(review): assumes get_feature_columns() returns the pair in
     # (dnn, linear) order — confirm against its definition.
     self.dnn_feature_columns, self.linear_feature_columns = self.get_feature_columns()
     self.feature_names = get_feature_names(self.dnn_feature_columns + self.linear_feature_columns)
     tf.logging.set_verbosity(tf.logging.INFO)
Example #6
0
def get_xy_fd():
    """Build a toy (x, y) batch plus feature columns for a DIN-style
    model with item and item-gender behavior sequences.

    Returns:
        x: dict of feature name -> np.ndarray model input.
        y: binary labels, shape (3,).
        feature_columns: sparse/dense/var-len feature definitions.
        behavior_feature_list: names of the target-attention features.
    """
    # Fixed-length features; item vocabularies reserve index 0 as mask.
    feature_columns = [
        SparseFeat('user', 3, embedding_dim=8),
        SparseFeat('gender', 2, embedding_dim=8),
        SparseFeat('item', 3 + 1, embedding_dim=8),
        SparseFeat('item_gender', 2 + 1, embedding_dim=8),
        DenseFeat('score', 1),
    ]

    # History features, padded to maxlen=4, sharing one length field.
    feature_columns += [
        VarLenSparseFeat(SparseFeat(name, size, embedding_dim=8),
                         4,
                         length_name="seq_length")
        for name, size in [('hist_item', 3 + 1), ('hist_item_gender', 2 + 1)]
    ]
    behavior_feature_list = ["item", "item_gender"]

    feature_dict = {
        'user': np.array([0, 1, 2]),
        'gender': np.array([0, 1, 0]),
        'item': np.array([1, 2, 3]),  # 0 is the mask value
        'item_gender': np.array([1, 2, 1]),  # 0 is the mask value
        'hist_item': np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]]),
        'hist_item_gender': np.array([[1, 1, 2, 0], [2, 1, 1, 0],
                                      [2, 1, 0, 0]]),
        'score': np.array([0.1, 0.2, 0.3]),
        "seq_length": np.array([3, 3, 2]),  # true history lengths
    }
    x = {
        name: feature_dict[name]
        for name in get_feature_names(feature_columns)
    }
    y = np.array([1, 0, 1])

    return x, y, feature_columns, behavior_feature_list
Example #7
0
def data_preprocess(data_df):
    """Encode a ratings DataFrame with the training-time label encoders
    and build DeepCTR inference inputs.

    Args:
        data_df: DataFrame with at least ``movie_id``, ``gender`` and
            ``age`` columns.

    Returns:
        Tuple of ``(test_model_input, linear_feature_columns,
        dnn_feature_columns)``.
    """
    # Deep-copy so the caller's DataFrame is not mutated.
    data = data_df.copy(deep=True)
    sparse_features = ["movie_id", "gender", "age"]
    # movie_genres = [
    #     'Action','Adventure','Animation','Childrens','Comedy','Crime',
    #     'Documentary','Drama','Fantasy','Film_Noir','Horror','Musical',
    #     'Mystery','Romance','Sci_Fi','Thriller','War','Western'
    #     ]
    target = ['rating']

    # 1. Encode sparse features with the encoders fitted on the training CSV.
    # TODO: Add arguments
    encoder_list = get_train_LabelEncoder_info('./recommend_data/data.csv')
    for feat in sparse_features:

        # classes_ is overridden so inference accepts the full value range
        # even for values never seen in the training CSV.
        # NOTE(review): sklearn's LabelEncoder.transform assumes classes_
        # is sorted; ['M', 'F'] and the string range '1'..'100' are not in
        # lexicographic order — confirm these overrides produce the same
        # codes the model was trained with.
        # The range of age is from 1-100
        if feat == 'age':
            age_encoder = np.array([str(i) for i in range(1, 101)])
            encoder_list[feat].classes_ = age_encoder

        elif feat == 'gender':
            gender_encoder = np.array(['M', 'F'])
            encoder_list[feat].classes_ = gender_encoder

        data[feat] = encoder_list[feat].transform(data[feat])

    # 2. Load the SparseFeat columns computed from the training data.
    # TODO: Add arguments
    fixlen_feature_columns = get_train_fixlen_feature_columns(
        './recommend_data/data.csv')

    # Example element:
    # SparseFeat(name='movie_id', vocabulary_size=187, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='movie_id', group_name='default_group')
    linear_feature_columns = fixlen_feature_columns
    dnn_feature_columns = fixlen_feature_columns
    feature_names = get_feature_names(
        linear_feature_columns + dnn_feature_columns)  # movie_id, gender, age.

    test_model_input = {name: data[name]
                        for name in feature_names
                        }  # dict of movie_id, gender, age value

    return test_model_input, linear_feature_columns, dnn_feature_columns
    "age",
    "skin_type",
    "idThirdCategory",
]
target = ['rating']  # label column; unused in the lines below

# 2. Count unique values per sparse field and build embedding configs.
fixlen_feature_columns = [
    SparseFeat(feat, glowpick[feat].nunique(), embedding_dim=4)
    for feat in sparse_features
]

linear_feature_columns = fixlen_feature_columns
dnn_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

# Load the linear_feature_columns and dnn_feature_columns lists saved at
# training time.
# NOTE(review): these loads overwrite the columns computed just above,
# making fixlen_feature_columns and the first feature_names dead —
# confirm the in-memory computation is intentional (e.g. a sanity check).
with open(FILE_PATH + 'linear_feature_columns_list.pickle', 'rb') as fp:
    linear_feature_columns = pickle.load(fp)

with open(FILE_PATH + 'dnn_feature_columns_list.pickle', 'rb') as fp:
    dnn_feature_columns = pickle.load(fp)

feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

# 3. Generate model inputs: feature name -> column, for each split.
train, test = train_test_split(glowpick, test_size=0.2)
train_model_input = {name: train[name] for name in feature_names}
test_model_input = {name: test[name] for name in feature_names}
def train_recommend_movies(csv_file, DEVICE):
    """Train an xDeepFM rating regressor and save its weights.

    Model: xDeepFM; target: "rating"; input features:
    ["movie_id", "gender", "age"]. Weights are saved to
    "save_model/xDeepFM_MSE{test_mse}.h5".

    Args:
        csv_file: Path to the ratings CSV.
        DEVICE: Torch device string used when CUDA is available,
            e.g. "cuda:0"; training falls back to CPU otherwise.
    """
    data = pd.read_csv(csv_file)
    sparse_features = ["movie_id", "gender", "age"]
    target = ['rating']

    # 1. Label-encode each sparse feature.
    for feat in sparse_features:
        data[feat] = LabelEncoder().fit_transform(data[feat])

    # 2. One SparseFeat per field, sized by its number of unique values.
    # Example instance:
    # SparseFeat(name='movie_id', vocabulary_size=187, embedding_dim=4,
    #            use_hash=False, dtype='int32', embedding_name='movie_id',
    #            group_name='default_group')
    fixlen_feature_columns = [
        SparseFeat(feat, data[feat].nunique()) for feat in sparse_features
    ]

    linear_feature_columns = fixlen_feature_columns
    dnn_feature_columns = fixlen_feature_columns
    feature_names = get_feature_names(
        linear_feature_columns + dnn_feature_columns)

    # 3. Split and build name -> column input dicts.
    train, test = train_test_split(data, test_size=0.2)
    train_model_input = {name: train[name] for name in feature_names}
    test_model_input = {name: test[name] for name in feature_names}

    # 4. Define, train and evaluate the model.
    device = 'cpu'
    use_cuda = True
    if use_cuda and torch.cuda.is_available():
        print('cuda ready...')
        device = DEVICE

    model = xDeepFM(linear_feature_columns,
                    dnn_feature_columns,
                    task='regression',
                    device=device)
    model.compile(
        "adam",
        "mse",
        metrics=['mse'],
    )

    # Fix: the return value of fit() was previously bound to an unused
    # `history` local; drop the binding.
    model.fit(
        train_model_input,
        train[target].values,
        batch_size=256,
        epochs=10,
        verbose=2,
        validation_split=0.2,
    )
    pred_ans = model.predict(test_model_input, batch_size=256)

    # Compute the test MSE once and reuse it for both the report and the
    # output filename (previously computed twice).
    test_mse = round(mean_squared_error(test[target].values, pred_ans), 4)
    print("test MSE", test_mse)

    # NOTE(review): this is a torch state_dict, not an HDF5 file; the
    # ".h5" extension is kept only for backward compatibility.
    torch.save(model.state_dict(),
               'save_model/xDeepFM_MSE{}.h5'.format(test_mse))
def task(action):
    """Train MyDeepFM for one engagement ``action`` and return the raw
    test-set prediction scores.

    Reads the per-action train CSV and shared test CSV under ROOT_PATH,
    fills/encodes features on the concatenated data, trains with the
    per-action epoch count from NUM_EPOCH_DICT, and returns predictions
    for the test rows.
    """
    print('-----------action-----------', action)
    USE_FEAT = [action] + SELECT_FRTS
    train = pd.read_csv(ROOT_PATH + f'/train_data_for_{action}.csv')[USE_FEAT]
    # Shuffle with a fixed seed for reproducibility.
    train = train.sample(frac=1, random_state=42).reset_index(drop=True)
    print("posi prop:")
    print(sum((train[action] == 1) * 1) / train.shape[0])
    test = pd.read_csv(ROOT_PATH + '/test_data.csv')[SELECT_FRTS]
    target = [action]
    # Give test rows a dummy label so train/test share the same columns.
    test[target[0]] = 0
    test = test[USE_FEAT]
    # Concatenate so encoders/scalers see both splits' value ranges.
    data = pd.concat((train, test)).reset_index(drop=True)
    print(train.shape, test.shape, data.shape)
    dense_features = DENSE_FEATURE
    sparse_features = [
        i for i in USE_FEAT if i not in dense_features and i not in target
    ]

    data[sparse_features] = data[sparse_features].fillna(0)
    data[dense_features] = data[dense_features].fillna(0)

    # 1. Label-encode sparse features; min-max scale dense features to [0, 1].
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
    mms = MinMaxScaler(feature_range=(0, 1))
    data[dense_features] = mms.fit_transform(data[dense_features])

    # 2. Count unique values per sparse field and record dense fields.
    fixlen_feature_columns = [
        SparseFeat(feat, data[feat].nunique()) for feat in sparse_features
    ] + [DenseFeat(
        feat,
        1,
    ) for feat in dense_features]
    #
    dnn_feature_columns = fixlen_feature_columns
    #linear_feature_columns = [SparseFeat(feat, data[feat].nunique())
    #                         for feat in sparse_features]
    linear_feature_columns = fixlen_feature_columns

    feature_names = get_feature_names(linear_feature_columns +
                                      dnn_feature_columns)

    # 3. Generate input data for the model.
    # Undo the concat: the first train.shape[0] rows are the train split.
    train, test = data.iloc[:train.shape[0]].reset_index(
        drop=True), data.iloc[train.shape[0]:].reset_index(drop=True)
    train_model_input = {name: train[name] for name in feature_names}
    test_model_input = {name: test[name] for name in feature_names}
    #-------
    # NOTE(review): with eval_ratio = 0. the slice below is empty, so
    # userid_list is empty and validation_split=0 disables validation.
    eval_ratio = 0.
    eval_df = train[int((1 - eval_ratio) *
                        train.shape[0]):].reset_index(drop=True)
    userid_list = eval_df['userid'].astype(str).tolist()
    print('val len:', len(userid_list))

    # 4. Define the model, train, predict.
    device = 'cpu'
    use_cuda = True
    if use_cuda and torch.cuda.is_available():
        print('cuda ready...')
        device = 'cuda:0'

    model = MyDeepFM(linear_feature_columns=linear_feature_columns,
                     dnn_feature_columns=dnn_feature_columns,
                     use_fm=True,
                     dnn_hidden_units=(256, 128),
                     l2_reg_linear=1e-1,
                     l2_reg_embedding=0.00001,
                     l2_reg_dnn=0,
                     init_std=0.0001,
                     seed=1024,
                     dnn_dropout=0.,
                     dnn_activation='relu',
                     dnn_use_bn=False,
                     task='binary',
                     device=device)

    model.compile("adagrad", "binary_crossentropy", metrics=["auc"])

    history = model.fit(train_model_input,
                        train[target].values,
                        batch_size=1024,
                        epochs=NUM_EPOCH_DICT[action],
                        verbose=1,
                        validation_split=eval_ratio,
                        userid_list=userid_list)
    pred_ans = model.predict(test_model_input, 128)
    #submit[action] = pred_ans
    # Release cached GPU memory between per-action training runs.
    torch.cuda.empty_cache()
    return pred_ans
Example #11
0
def train_recommend_movies(csv_file, DEVICE):
    """Train a FiBiNET rating regressor with genre flags as extra sparse
    inputs and save its weights.

    Target: "rating"; base features: movie_id / gender / age, plus 18
    movie-genre columns. Weights are saved to
    "./recommend_system/save_model/FiBiNET_MSE{test_mse}.h5".

    Args:
        csv_file: Path to the ratings CSV.
        DEVICE: Torch device string passed straight to the model,
            e.g. "cuda:0".
    """
    data = pd.read_csv(csv_file)
    base_sparse = ["movie_id", "gender", "age"]
    genre_columns = [
        'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy', 'Crime',
        'Documentary', 'Drama', 'Fantasy', 'Film_Noir', 'Horror', 'Musical',
        'Mystery', 'Romance', 'Sci_Fi', 'Thriller', 'War', 'Western'
    ]
    target = ['rating']

    # 1. Only the base features are label-encoded; the genre columns are
    # used as-is.
    for column in base_sparse:
        data[column] = LabelEncoder().fit_transform(data[column])

    # 2. Genres join the sparse set; one SparseFeat per field, sized by
    # its number of unique values.
    sparse_features = base_sparse + genre_columns
    fixlen_feature_columns = [
        SparseFeat(column, data[column].nunique())
        for column in sparse_features
    ]

    linear_feature_columns = fixlen_feature_columns
    dnn_feature_columns = fixlen_feature_columns
    feature_names = get_feature_names(
        linear_feature_columns + dnn_feature_columns)

    # 3. Split and build name -> column input dicts for each split.
    train, test = train_test_split(data, test_size=0.2)
    train_model_input = {name: train[name] for name in feature_names}
    test_model_input = {name: test[name] for name in feature_names}

    # 4. Define, train, predict and evaluate.
    model = FiBiNET(linear_feature_columns,
                    dnn_feature_columns,
                    task='regression',
                    device=DEVICE)
    model.compile(
        "adam",
        "mse",
        metrics=['mse'],
    )

    model.fit(
        train_model_input,
        train[target].values,
        batch_size=256,
        epochs=10,
        verbose=2,
        validation_split=0.2,
    )
    pred_ans = model.predict(test_model_input, batch_size=256)

    test_mse = round(mean_squared_error(test[target].values, pred_ans), 4)
    print("test MSE", test_mse)
    print("test MAE",
          round(mean_absolute_error(test[target].values, pred_ans), 4))

    torch.save(
        model.state_dict(),
        './recommend_system/save_model/FiBiNET_MSE{}.h5'.format(test_mse))
Example #12
0
# Toy inputs for a DIN model.
# NOTE(review): `uid`, `feature_columns` and `behavior_feature_list` are
# defined earlier in the original script and are not visible in this
# fragment.
ugender = np.array([0, 1, 0])
iid = np.array([1, 2, 3])  # 0 is mask value
igender = np.array([1, 2, 1])  # 0 is mask value
score = np.array([0.1, 0.2, 0.3])

# Behavior histories, zero-padded to length 4 (0 is the mask value).
hist_iid = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
hist_igender = np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]])

feature_dict = {
    'user': uid,
    'gender': ugender,
    'item': iid,
    'item_gender': igender,
    'hist_item': hist_iid,
    'hist_item_gender': hist_igender,
    'score': score
}
# Model inputs keyed by feature name; one binary label per sample.
x = {name: feature_dict[name] for name in get_feature_names(feature_columns)}
y = np.array([[1], [0], [1]])

# DIN with Dice activations on CPU.
# NOTE(review): first positional argument is an empty list — presumably a
# feature-column list; verify against DIN's signature.
model = DIN([],
            feature_columns,
            behavior_feature_list,
            hist_len_max=4,
            device='cpu',
            dnn_activation='dice',
            att_activation='dice')
model.compile('adagrad',
              'binary_crossentropy',
              metrics=['binary_crossentropy'])
model.fit(x, y, batch_size=32, epochs=3, validation_split=0.0, verbose=1)