Example No. 1
def load_data(path):
    """
    Load the data and merge the user file and the item file into the behavior log.
    :param path:
    :return:
    """
    user = reduce_mem_usage(pd.read_csv(path + 'user.csv', header=None))
    item = reduce_mem_usage(pd.read_csv(path + 'df_item.csv'))
    user.columns = ['userID', 'sex', 'age', 'ability']
    # Load the user behavior log for days 1 to 16
    data = pd.read_csv(path + 'df_behavior.csv')
    # Convert itemID to int64
    data['itemID'] = data['itemID'].astype('int64')

    data = reduce_mem_usage(data)

    # Merge the item and user tables into data
    data = pd.merge(left=data, right=item, on='itemID', how='left')
    data = pd.merge(left=data, right=user, on='userID', how='left')
    return user, item, data
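reduce_mem_usage is imported from code_file.utils throughout these examples, but its body is never shown. Below is a minimal sketch of what such a helper typically does (downcasting numeric columns to smaller dtypes); the implementation is an assumption, not the project's actual code.

import numpy as np
import pandas as pd

def reduce_mem_usage(df):
    # Assumed behaviour: downcast every numeric column to the smallest dtype that holds its values
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        if pd.api.types.is_integer_dtype(df[col]):
            df[col] = pd.to_numeric(df[col], downcast='integer')
        elif pd.api.types.is_float_dtype(df[col]):
            df[col] = pd.to_numeric(df[col], downcast='float')
    end_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage: {:.2f} MB -> {:.2f} MB'.format(start_mem, end_mem))
    return df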
Example No. 2
def get_underline_labels():
    """
    Get the labels for the offline training set.
    These are the day-15 labels, which correspond to day 21 in this file.
    Format: ['userID', 'itemID', 'behavior']
    :return:
    """
    df = pd.read_csv("../data/df_behavior_train.csv")
    df = df[df['day'] == 21]

    train_labels = df[['userID', 'itemID', 'behavior']]
    train_labels = reduce_mem_usage(train_labels)

    return train_labels
Example No. 3
def get_users_log_function():
    """
    Load the user behavior logs for days 1 to 16.
    :return:
    """
    filename = "../data/df_behavior.csv"
    df_behavior = pd.read_csv(filename)
    df_behavior = reduce_mem_usage(df_behavior)

    matrix = df_behavior[['userID', 'itemID', 'behavior', 'day']].values  # .values casts every value to float
    users_log = dict()

    for row in matrix:
        users_log.setdefault(int(row[0]), [])
        users_log[int(row[0])].append((int(row[1]), int(row[2]), int(row[3])))

    return users_log
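The same per-user grouping can also be built with collections.defaultdict and itertuples, which avoids the float round-trip that .values introduces. A small alternative sketch under the same assumptions (same file path, numeric behavior column) follows.

from collections import defaultdict

import pandas as pd

def get_users_log_defaultdict(filename="../data/df_behavior.csv"):
    # Equivalent to get_users_log_function above; itertuples keeps the original integer dtypes
    df_behavior = pd.read_csv(filename)
    users_log = defaultdict(list)
    for row in df_behavior[['userID', 'itemID', 'behavior', 'day']].itertuples(index=False):
        users_log[int(row.userID)].append((int(row.itemID), int(row.behavior), int(row.day)))
    return dict(users_log)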
Example No. 4
def generate_online_train_user_sets():
    """
    Generate the user set for the online training set, i.e. the users to recommend items to;
    it covers the users of days 1 to 15.
    Format: {user: [(i1, i2, i3), ...]} where i1 is the item ID, i2 the behavior level and i3 the day of the behavior.
    :return:
    """
    filename = "../data/df_behavior_train.csv"
    train_data_df = pd.read_csv(filename)  # load the dataset as a DataFrame
    train_data_df = reduce_mem_usage(train_data_df)
    matrix = train_data_df[['userID', 'itemID', 'behavior',
                            'day']].values  # returns a 2-D array with every value cast to float
    user_logs = dict()
    for row in matrix:
        user_logs.setdefault(int(row[0]), [])
        user_logs[int(row[0])].append((int(row[1]), int(row[2]), int(row[3])))

    return user_logs
Example No. 5
def get_label_function():
    """
    Fetch the day-16 user behavior log so that it can be merged with the day 1-15 recall list to generate the labels.
    Returned format: DataFrame with (userID, itemID, behavior).
    :return:
    """
    filename = "../data/df_behavior_test.csv"
    right_data = pd.read_csv(filename)
    right_datas = right_data[['userID', 'itemID', 'behavior']]
    # Convert itemID to int64
    right_datas = right_datas.astype({'itemID': 'int64'})
    # Map the behavior strings to ordinal levels; loc[rows, col] selects the matching rows of that column
    right_datas.loc[right_datas['behavior'] == 'pv', 'behavior'] = 1
    right_datas.loc[right_datas['behavior'] == 'fav', 'behavior'] = 2
    right_datas.loc[right_datas['behavior'] == 'cart', 'behavior'] = 3
    right_datas.loc[right_datas['behavior'] == 'buy', 'behavior'] = 4

    right_datas = reduce_mem_usage(right_datas)
    return right_datas
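The four .loc assignments above can also be written as a single dictionary lookup with Series.map. A minimal equivalent sketch, assuming the same pv/fav/cart/buy encoding (values outside the mapping would become NaN):

behavior_map = {'pv': 1, 'fav': 2, 'cart': 3, 'buy': 4}
right_datas['behavior'] = right_datas['behavior'].map(behavior_map)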
Example No. 6
I think the split should not be dynamic vs. static but offline vs. online, because in
general the feature values should differ between the different behavior datasets!
"""
from code_file.utils import reduce_mem_usage
import numpy as np
import pandas as pd

# Load the raw dataset, i.e. days 7 to 21
data = pd.read_csv("../data/df_behavior_train.csv")  # userID,behavior,timestap,itemID,date,day
"""
现在为了时间的,选择19到21号即可
"""
data = data[data['day'] >= 19]

data = reduce_mem_usage(data)

# Load the user data
user = pd.read_csv("../data/user.csv", header=None)
user.columns = ['userID', 'sex', 'age', 'ability']
user = reduce_mem_usage(user)

# Load the item data
item = pd.read_csv("../data/df_item.csv")  # categoryID,shopID,brandID,itemID
item = reduce_mem_usage(item)

# Merge in the item and user tables before computing the statistics
data = pd.merge(left=data, right=user, on=['userID'], how='left')
data = pd.merge(left=data, right=item, on=['itemID'], how='left')

# First come the user-item cross features, e.g. the behavior intensity after grouping by userID and shopID (see the sketch below)
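A minimal sketch of one such user-shop cross feature, using the interaction count per (userID, shopID) pair as a rough proxy for behavior intensity; the column name user_shop_count is made up for illustration.

user_shop_count = (data.groupby(['userID', 'shopID'])
                       .size()
                       .reset_index(name='user_shop_count'))
data = pd.merge(left=data, right=user_shop_count, on=['userID', 'shopID'], how='left')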
Example No. 7
from sklearn import preprocessing
from code_file.utils import reduce_mem_usage

# Hyperparameters
FEATURE_SIZE = 601221  # total number of distinct feature values after encoding
FIELD_SIZE = 9  # number of feature fields
EMBEDDING_SIZE = 8  # dimensionality of each embedding vector
BATCH_SIZE = 1024

"""获取训练集的部分"""
"""
之后的总的特征就是['userID', 'itemID', 'sim', 'label', 'behavior', 'day', 'sex', 'age',
       'ability', 'categoryID', 'shopID', 'brandID'],
"""
data_index, data_value = deal_underline_train_data()
data_index = reduce_mem_usage(data_index)
data_value = reduce_mem_usage(data_value)

"""漏了一部分就是数据预处理,对于连续值进行归一化操作!!!!!"""
min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0.1, 1))
# The continuous features are 'sim' and 'age'
tmp1 = np.array(data_value['sim'])
tmp1 = np.reshape(tmp1, (-1, 1))

tmp2 = np.array(data_value['age'])
tmp2 = np.reshape(tmp2, (-1, 1))

data_value['sim'] = min_max_scaler.fit_transform(tmp1)
data_value['age'] = min_max_scaler.fit_transform(tmp2)

# Get the label values (the snippet is cut off here; see the sketch below)
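The snippet stops at the label step. A hedged sketch of what presumably follows, mirroring the feature selection used in the prediction function of Example No. 10; the label column comes from the feature list quoted above, and the names train_index and train_value are made up for illustration.

label = np.array(data_index['label']).reshape(-1, 1)
feature = [x for x in data_index.columns if x not in ['userID', 'itemID', 'label']]
train_index = np.array(data_index[feature])
train_value = np.array(data_value[feature])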
Example No. 8
"""
df_item = pd.read_csv(filename_item, header=None)
df_item.columns = ['itemID', 'categoryID', 'shopID', 'brandID']

le = preprocessing.LabelEncoder()
df_item['itemID_Encoding'] = le.fit_transform(df_item['itemID'])

# print(df_item.head(10))
print(df_item.shape)  # (4318202, 5)
print(df_item['itemID_Encoding'].min())  # 0
print(df_item['itemID_Encoding'].max())  # 4318202

df_behavior = pd.read_csv(filename_behavior, header=None)

# Reduce memory usage here
df_behavior = reduce_mem_usage(df_behavior)
df_behavior = df_behavior.iloc[:, 1:5]
df_behavior.columns = ['userID', 'itemID', 'behavior', 'timestap']

df_behavior = df_behavior.merge(df_item, on='itemID', how='left')
df_behavior.drop(df_behavior[np.isnan(df_behavior['itemID_Encoding'])].index, inplace=True)  # drop rows whose itemID_Encoding is NaN
# Drop the now-redundant columns
df_behavior = df_behavior.drop(['itemID', 'categoryID', 'shopID', 'brandID'], axis=1)
df_behavior['itemID'] = df_behavior['itemID_Encoding']
df_behavior = df_behavior.drop(['itemID_Encoding'], axis=1)
print(df_behavior.head())  #
print(df_behavior['itemID'].min())  # 11
print(df_behavior['itemID'].max())  # 4318196
print(df_behavior.shape)  # (8047545, 4)

# Next, process df_item itself (the snippet is cut off here; see the sketch below)
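The snippet ends before df_item itself is reworked. A hedged sketch of the analogous step, assuming the goal is to keep df_item joinable on the re-encoded itemID, just as df_behavior was rewritten above.

df_item = df_item.drop(['itemID'], axis=1)
df_item = df_item.rename(columns={'itemID_Encoding': 'itemID'})
print(df_item.columns.tolist())  # ['categoryID', 'shopID', 'brandID', 'itemID']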
Example No. 9
def generate_all_feature():
    """
    Build the full feature set for the online training set,
    i.e. the features for days 1 to 15.
    :return:
    """
    # First load the recall result for days 1 to 15
    train_data = pd.read_csv(filename_recall)
    train_data = reduce_mem_usage(train_data)
    train_data = train_data.fillna(0)  # fill NaN labels with 0
    """
    At this point train_data has the format [userID, itemID, sim, label]:
    userID is the user, itemID the candidate item, sim the user's estimated interest in the item,
    and label says whether the user actually interacted with the item on day 16 (1 to 3 means yes, 0 means no).
    """
    # Negative down-sampling
    recall_train = down_sample(train_data, 10)
    recall_train['label'] = recall_train['label'].apply(transfer_label)

    # Left-join with the raw user and item files
    user_data = pd.read_csv("../data/user.csv", header=None)
    user_data.columns = ['userID', 'sex', 'age', 'ability']

    item_data = pd.read_csv(
        "../data/df_item.csv")  # 其中它的列名为categoryID,shopID,brandID,itemID

    recall_train = pd.merge(left=recall_train,
                            right=user_data,
                            on=['userID'],
                            how='left',
                            sort=False)
    recall_train = pd.merge(left=recall_train,
                            right=item_data,
                            on=['itemID'],
                            how='left',
                            sort=False)
    """
    After these merges the features are:
    userID, itemID, sim, label, sex, age, ability, categoryID, shopID, brandID
    """
    """
    Next join the previously generated statistical features,
    namely the category_higher and item.higher statistics.
    """
    # Format: ['categoryID', 'category_median', 'category_std']
    category_feature = pd.read_csv('../statistics_feature/category_higher.csv')

    # Format: ['itemID', 'item_median', 'item_std']
    item_feature = pd.read_csv('../statistics_feature/item.higher.csv')

    recall_train = pd.merge(left=recall_train,
                            right=category_feature,
                            on=['categoryID'],
                            how='left')
    recall_train = pd.merge(left=recall_train,
                            right=item_feature,
                            on=['itemID'],
                            how='left')

    # Finally merge in the four count features
    item_ID_feature = pd.read_csv('../statistics_feature/itemID_count.csv')
    category_ID_feature = pd.read_csv(
        "../statistics_feature/categoryID_count.csv")
    shop_ID_feature = pd.read_csv("../statistics_feature/shopID_count.csv")
    brand_ID_feature = pd.read_csv("../statistics_feature/brandID_count.csv")

    recall_train = pd.merge(left=recall_train,
                            right=item_ID_feature,
                            on=["itemID"],
                            how="left")
    recall_train = pd.merge(left=recall_train,
                            right=category_ID_feature,
                            on=["categoryID"],
                            how="left")
    recall_train = pd.merge(left=recall_train,
                            right=shop_ID_feature,
                            on=["shopID"],
                            how="left")
    recall_train = pd.merge(left=recall_train,
                            right=brand_ID_feature,
                            on=["brandID"],
                            how="left")
    """
    The final feature list is:
    userID, itemID, sim, label, sex, age, ability, categoryID, shopID, brandID, category_median,
    category_std, item_median, item_std, itemID_sum, categoryID_sum, shopID_sum, brandID_sum
    """
    return recall_train
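down_sample and transfer_label are not defined in this example. Hedged sketches of plausible implementations follow, given that the docstring treats labels 1 to 3 as positive and 0 as negative and a ratio of 10 is passed in; both bodies are assumptions, not the project's actual code.

import pandas as pd

def down_sample(df, ratio):
    # Assumed behaviour: keep every positive row and sample at most `ratio` negatives per positive
    pos = df[df['label'] > 0]
    neg = df[df['label'] == 0]
    neg = neg.sample(n=min(len(neg), len(pos) * ratio), random_state=2021)
    return pd.concat([pos, neg]).sample(frac=1, random_state=2021).reset_index(drop=True)

def transfer_label(x):
    # Assumed behaviour: collapse the 0-3 label into a binary target
    return 1 if x > 0 else 0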
Example No. 10
def predict_function(data_index, data_value):
    """
    Prediction function: feed the preprocessed test set to the saved model.
    :param data_value:
    :param data_index:
    :return:
    """
    STEPS = len(data_index) // BATCH_SIZE

    """开始处理测试集"""
    data_index = reduce_mem_usage(data_index)
    data_value = reduce_mem_usage(data_value)

    """漏了一部分就是数据预处理,对于连续值进行归一化操作!!!!!"""
    min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0.1, 1))
    # The continuous features are 'sim' and 'age'
    tmp1 = np.array(data_value['sim'])
    tmp1 = np.reshape(tmp1, (-1, 1))

    tmp2 = np.array(data_value['age'])
    tmp2 = np.reshape(tmp2, (-1, 1))

    data_value['sim'] = min_max_scaler.fit_transform(tmp1)
    data_value['age'] = min_max_scaler.fit_transform(tmp2)

    feature = [x for x in data_index.columns if x not in ['userID', 'itemID', 'label']]
    # Select the feature columns used for prediction
    data_index = data_index[feature]
    data_index = np.array(data_index)

    data_value = data_value[feature]
    data_value = np.array(data_value)

    # Path to the saved DeepFM checkpoint
    deepfm_model = '../deepfm_save_model/deepfm_model_saver.ckpt'
    # deepfm_model = '../deepfm_real_embeddings_save/deepfm_model_saver.ckpt'

    y_pred = None
    with tf.Session() as sess:
        saver = tf.train.Saver()
        saver.restore(sess, deepfm_model)

        # Run inference batch by batch
        for step in range(STEPS):
            pred_temp = sess.run(out, feed_dict={
                feature_index: data_index[step * BATCH_SIZE:(step + 1) * BATCH_SIZE],
                feature_value: data_value[step * BATCH_SIZE:(step + 1) * BATCH_SIZE],
                iS_training: False})
            """合并好数据集"""
            pred_temp = np.reshape(pred_temp, (-1, 1))
            if y_pred is None:
                y_pred = pred_temp
            else:
                y_pred = np.concatenate((y_pred, pred_temp), axis=0)
        """处理剩余的部分"""
        print(step)  # 80221, while the total number of steps is 80222
        pred_last = sess.run(out, feed_dict={
            feature_index: data_index[(step + 1) * BATCH_SIZE:],
            feature_value: data_value[(step + 1) * BATCH_SIZE:],
            iS_training: False})

        pred_last = np.reshape(pred_last, (-1, 1))

        y_pred = np.concatenate((y_pred, pred_last), axis=0)

        return y_pred  # shape (-1, 1)
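A hedged usage sketch for predict_function; deal_underline_test_data is a hypothetical helper name mirroring deal_underline_train_data from Example No. 7, and the submission layout is only illustrative. Note that the function refits the MinMaxScaler on the test set; reusing the scaler fitted on the training data would be the more conventional choice.

test_index, test_value = deal_underline_test_data()  # hypothetical helper returning the test frames
y_pred = predict_function(test_index, test_value)    # shape (-1, 1), aligned with the test rows
submission = test_index[['userID', 'itemID']].copy()
submission['score'] = y_pred.ravel()
submission = submission.sort_values(['userID', 'score'], ascending=[True, False])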