def data_fix_day():
    """Generate, split and save the data sets for ``age`` and then ``gender``.

    Each pass reloads the raw data with ``load_original_data``, transforms it
    with ``generate_fix_data``, splits it with ``split_data`` and writes the
    result with ``save_data_set``. The original comments describe the split as
    3:1 train/test, with ``age`` being a multi-class target and ``gender`` a
    binary one; no ratio is passed from here.

    Intermediate arrays are deleted as soon as the next stage has consumed
    them, presumably to limit peak memory on large inputs.

    NOTE(review): the original body assigned ``file_suffix = 'fix_day'``,
    ``period_length = 7`` and ``period_days = 1`` but never passed them to any
    call, so they had no effect and were removed. Confirm the helpers obtain
    these settings elsewhere.

    NOTE(review): both passes hand column 0 of ``y_data`` to ``split_data``,
    including the ``gender`` pass. Verify that column 0 really holds the
    gender labels in that pass; ``data_fix`` uses column 1 for gender.
    """
    for label_name in ('age', 'gender'):
        # Load the raw data afresh for every label, as the original code did.
        x_csv, y_csv = load_original_data()

        # Clean the raw data and build the arrays to be split.
        x_data, y_data = generate_fix_data(x_csv, y_csv)
        del x_csv, y_csv

        # Split into training and test sets for the current label.
        x_train, y_train, x_test, y_test = split_data(
            x_data, y_data[:, 0], label_name)
        del x_data, y_data

        save_data_set(x_train, y_train, x_test, y_test)
        del x_train, y_train, x_test, y_test
# Example #2
# 0
def export_data_set():
    """Export a stratified train/test split for the ``gender`` label.

    Loads the raw data, transforms it with
    ``generate_data_no_interval_with_repeat``, picks the ``gender`` column out
    of ``y_data`` and splits with scikit-learn's ``train_test_split``. The
    split is stratified on the label and seeded with ``seed``, a name taken
    from the enclosing scope. No ``test_size`` is given, so scikit-learn's
    default applies. The four resulting arrays are written with
    ``save_data_set`` and a completion message is printed.

    NOTE(review): the original body also built ``field_list``,
    ``load_file_path``, ``save_file_path`` and a ``creative_id_*`` window that
    were never used; they were removed. Confirm that the helpers do not expect
    these values through some other channel.
    """
    from src.无用代码.generate_data import generate_data_no_interval_with_repeat
    from src.data.load_data import load_original_data
    from src.无用代码.save_data import save_data_set
    from sklearn.model_selection import train_test_split

    label_list = ['age', 'gender']
    label_name = 'gender'

    x_csv, y_csv = load_original_data()
    x_data, y_data = generate_data_no_interval_with_repeat(x_csv, y_csv)

    # Pick the label column by its position in label_list (index 1 here);
    # assumes the columns of y_data follow that order - TODO confirm.
    label_data = y_data[:, label_list.index(label_name)]
    x_train, x_test, y_train, y_test = train_test_split(
        x_data, label_data, random_state=seed, stratify=label_data)
    del x_data, y_data

    save_data_set(x_train, y_train, x_test, y_test)
    del x_train, y_train, x_test, y_test

    print("\n数据清洗完成!")
def export_day_list_data():
    """Load the raw data, build the per-user day lists and persist them.

    The list data and the label data returned by ``generate_day_list_data``
    are each written with ``save_model_data`` under ``data_file_path``, using
    the file names and data type held in ``config``.

    Returns:
        The ``(lst_data, y_data)`` pair produced by ``generate_day_list_data``.
    """
    # Stage 1: read the original data.
    from src.data.load_data import load_original_data
    show_title("加载原始数据")
    raw_x, raw_y = load_original_data()

    # Stage 2: build the untruncated per-user, per-day lists.
    from src.data.generate_data import generate_day_list_data
    show_title("导出每个用户每天访问数据的不截断列表")
    day_lists, labels = generate_day_list_data(raw_x, raw_y)

    # Stage 3: persist both results, list data first.
    lst_target = data_file_path + config.lst_data_file_name
    save_model_data(day_lists, lst_target, config.base_data_type)
    label_target = data_file_path + config.y_data_file_name
    save_model_data(labels, label_target, config.base_data_type)

    return day_lists, labels
def data_no_time():
    """Generate, split and save the "no time" data sets for both labels.

    The raw data is loaded once. ``generate_no_time_data`` is then run twice:
    first with ``repeat_creative_id=False`` (per the original comments,
    non-repeating data intended for MaxPooling() training) and then with
    ``repeat_creative_id=True`` (repeating data intended for Word2Vec
    training). After each run the result is split and saved once for ``age``
    (column 0 of ``y_data``) and once for ``gender`` (column 1). The original
    comments describe the split as 3:1 train/test; no ratio is passed from
    here.

    NOTE(review): the original body assigned ``file_suffix`` to
    ``'no_time_no_repeat'`` and ``'no_time_with_repeat'`` but never passed it
    on, so both runs reach ``save_data_set`` through identical calls. Confirm
    that the second run does not overwrite the output of the first.
    """
    # Selected input columns; only the length of this list is used below.
    field_list = [
        "user_id_inc",  # 0
        "creative_id_inc",  # 1
        "time_id",  # 2
        "click_times",  # 3, a value rather than an id, so never decrement it by 1
    ]
    label_list = ['age', 'gender']

    x_csv, y_csv = load_original_data()

    for repeat_creative_id in (False, True):
        # The two positional sizes evaluate to 2 and 2; their exact meaning
        # is defined by generate_no_time_data.
        x_data, y_data = generate_no_time_data(
            x_csv,
            y_csv,
            len(field_list) - 2,
            len(label_list),
            repeat_creative_id=repeat_creative_id)

        # One split/save per label; the label's position selects its column.
        for column, label_name in enumerate(label_list):
            x_train, y_train, x_test, y_test = split_data(
                x_data, y_data[:, column], label_name)
            save_data_set(x_train, y_train, x_test, y_test)
            del x_train, y_train, x_test, y_test

        del x_data, y_data
def data_fix():
    """Generate, split and save the "fix" data sets for both labels, three times.

    The raw data is loaded once. For each planned variant the data is rebuilt
    with ``generate_fix_data`` and then split and saved once for ``age``
    (column 0 of ``y_data``) and once for ``gender`` (column 1). The original
    comments describe the generated data as the number of creatives each user
    visited per day, and the split as 3:1 train/test; no ratio is passed from
    here.

    NOTE(review): the per-variant settings (``file_suffix``,
    ``period_length``, ``period_days``) were assigned in the original body but
    never passed to ``generate_fix_data`` or ``save_data_set``. All three
    passes therefore issue identical calls, and later passes may overwrite the
    output of earlier ones. The settings are kept in ``planned_variants`` so
    the intent stays visible; wire them into the helpers once their signatures
    are confirmed.
    """
    label_list = ['age', 'gender']

    # (file_suffix, period_length, period_days) for each intended variant.
    planned_variants = (
        ('fix_day', 7, 1),
        ('fix_three_days', 15, 3),
        ('fix_week', 21, 7),
    )

    x_csv, y_csv = load_original_data()

    # The loop variables are deliberately unused (see the NOTE above); three
    # passes are kept so the sequence of helper calls matches the original.
    for _file_suffix, _period_length, _period_days in planned_variants:
        x_data, y_data = generate_fix_data(x_csv, y_csv)

        # One split/save per label; the label's position selects its column.
        for column, label_name in enumerate(label_list):
            x_train, y_train, x_test, y_test = split_data(
                x_data, y_data[:, column], label_name)
            save_data_set(x_train, y_train, x_test, y_test)
            del x_train, y_train, x_test, y_test

        del x_data, y_data