def export_data_set():
    """Build the stratified ``gender`` train/test split and save it.

    The function loads the original data, expands it with
    ``generate_data_no_interval_with_repeat``, selects the ``gender`` column
    of the label matrix, splits features and labels with
    ``train_test_split`` (stratified on the label) and passes the four
    resulting arrays to ``save_data_set``.

    ``seed`` is not defined in this function; it is read from the enclosing
    module scope.

    NOTE(review): the previous version also assigned ``field_list``
    (``user_id``, ``creative_id_inc_tf_idf``, ``time_id``, ``click_times``),
    ``load_file_path`` (``'../../data/tf_idf/'``), ``save_file_path``
    (``'../../data/tf_idf/no_interval/with_repeat/'``) and a
    ``creative_id_begin`` / ``creative_id_end`` window derived from
    ``creative_id_step_size``. None of them was read afterwards or passed to
    a helper, so they were removed. If the helpers are meant to receive
    them, pass them explicitly.
    """
    # Function-scope imports, kept exactly as in the original.
    from src.无用代码.generate_data import generate_data_no_interval_with_repeat
    from src.data.load_data import load_original_data
    from src.无用代码.save_data import save_data_set
    from sklearn.model_selection import train_test_split

    # Column order of the label matrix; used to locate the target label.
    label_list = ['age', 'gender']
    label_name = 'gender'

    x_csv, y_csv = load_original_data()
    x_data, y_data = generate_data_no_interval_with_repeat(x_csv, y_csv)

    label_data = y_data[:, label_list.index(label_name)]
    x_train, x_test, y_train, y_test = train_test_split(
        x_data, label_data, random_state=seed, stratify=label_data)
    # Drop the references to the full arrays before saving.
    del x_data, y_data

    save_data_set(x_train, y_train, x_test, y_test)
    del x_train, y_train, x_test, y_test

    print("\n数据清洗完成!")
def data_fix_day():
    """Generate, split and save the fixed-period data set once per label.

    For ``'age'`` and then ``'gender'`` the function reloads the original
    data, cleans it with ``generate_fix_data``, splits it with
    ``split_data`` and writes the result with ``save_data_set``.
    Intermediate arrays are released with ``del`` as soon as they are no
    longer needed.

    NOTE(review): both passes slice column 0 of ``y_data``. Neither
    ``load_original_data()`` nor ``generate_fix_data()`` receives the label
    name, so it cannot be confirmed from this function that column 0 holds
    ``gender`` on the second pass -- verify against those helpers.

    NOTE(review): the previous version assigned ``file_name``
    (``'./data/train_data_all_min_complete_v.csv'``), ``field_list``,
    ``file_suffix`` (``'fix_day'``), ``period_length`` (7) and
    ``period_days`` (1) but never read them or passed them to a helper, so
    they were removed.
    """
    for label_name in ('age', 'gender'):
        # Load the raw data afresh for every label, as the original did.
        x_csv, y_csv = load_original_data()

        # Clean the data set and build the arrays needed for training.
        x_data, y_data = generate_fix_data(x_csv, y_csv)
        del x_csv, y_csv

        # Split into training and test sets (the original comment states a
        # 3:1 ratio; that is decided inside split_data).
        x_train, y_train, x_test, y_test = split_data(
            x_data, y_data[:, 0], label_name)
        del x_data, y_data

        save_data_set(x_train, y_train, x_test, y_test)
        del x_train, y_train, x_test, y_test
def data_no_time():
    """Generate, split and save the time-free data sets for both labels.

    The original data is loaded once. It is then cleaned twice with
    ``generate_no_time_data``: first with ``repeat_creative_id=False``
    (original comment: de-duplicated data for MaxPooling() training), then
    with ``repeat_creative_id=True`` (original comment: repeated data for
    Word2Vec training). Each cleaned data set is split and saved once per
    entry of ``label_list``, using the matching column of ``y_data``.

    NOTE(review): the ``file_suffix`` values below are never passed to
    ``split_data`` or ``save_data_set``; they are kept only to record the
    original intent. Confirm how ``save_data_set`` chooses its output
    location. The previously assigned but unused ``file_name``
    (``'./data/train_data_all_min_complete_v.csv'``) was removed.
    """
    field_list = [  # input preprocessing: columns to select
        "user_id_inc",  # 0
        "creative_id_inc",  # 1
        "time_id",  # 2
        "click_times",  # 3, a value rather than an id, so never decrement it by 1
    ]
    label_list = ['age', 'gender']

    # Load the raw data once; it is reused by both variants.
    x_csv, y_csv = load_original_data()

    # (file_suffix, repeat_creative_id) for each cleaning variant.
    variants = (
        ('no_time_no_repeat', False),
        ('no_time_with_repeat', True),
    )
    for _file_suffix, repeat_creative_id in variants:
        x_data, y_data = generate_no_time_data(
            x_csv, y_csv, len(field_list) - 2, len(label_list),
            repeat_creative_id=repeat_creative_id)

        # Split into training and test sets per label (the original comment
        # states a 3:1 ratio; that is decided inside split_data).
        for column, label_name in enumerate(label_list):
            x_train, y_train, x_test, y_test = split_data(
                x_data, y_data[:, column], label_name)
            save_data_set(x_train, y_train, x_test, y_test)
            del x_train, y_train, x_test, y_test

        # Release this variant's arrays before building the next one.
        del x_data, y_data
def data_fix():
    """Generate, split and save the fixed-period data sets for both labels.

    The original data is loaded once. For each entry of the period table
    below the data is cleaned with ``generate_fix_data`` and then split and
    saved once per entry of ``label_list``, using the matching column of
    ``y_data``.

    NOTE(review): ``file_suffix``, ``period_length`` and ``period_days`` are
    never passed to ``generate_fix_data``, ``split_data`` or
    ``save_data_set``, so as written all three passes make identical calls.
    The table is kept, and the number of passes preserved, to record the
    original intent (per-day, per-three-days and per-week aggregation).
    Confirm how the helpers are supposed to receive these settings.

    The previously assigned but unused ``file_name``
    (``'./data/train_data_all_min_complete_v.csv'``) and ``field_list`` were
    removed.
    """
    label_list = ['age', 'gender']

    # Load the raw data once; it is reused by every pass.
    x_csv, y_csv = load_original_data()

    # (file_suffix, period_length, period_days) for each pass.
    period_configs = (
        ('fix_day', 7, 1),
        ('fix_three_days', 15, 3),
        ('fix_week', 21, 7),
    )
    for _file_suffix, _period_length, _period_days in period_configs:
        # Clean the data set and build the arrays needed for training.
        x_data, y_data = generate_fix_data(x_csv, y_csv)

        # Split into training and test sets per label (the original comment
        # states a 3:1 ratio; that is decided inside split_data).
        for column, label_name in enumerate(label_list):
            x_train, y_train, x_test, y_test = split_data(
                x_data, y_data[:, column], label_name)
            save_data_set(x_train, y_train, x_test, y_test)
            del x_train, y_train, x_test, y_test

        # Release this pass's arrays before building the next one.
        del x_data, y_data