def export_day_statistical_sequence(lst_data):
    """Generate the day-level statistical sequence and persist it.

    Args:
        lst_data: Input handed unchanged to
            ``generate_day_statistical_sequence``.

    Returns:
        The value produced by ``generate_day_statistical_sequence``; the same
        object is written out through ``save_model_data``.
    """
    from src.data.generate_data import generate_day_statistical_sequence

    show_title("加工数据为 91 天的序列数据,每天为6个特征(最大值、最小值、平均值)96维数据")
    sequence = generate_day_statistical_sequence(lst_data)

    from src.base.config import x_data_file_name, base_data_type

    # The output location is the shared data directory prefix plus the
    # configured file name (plain string concatenation, as before).
    target_path = data_file_path + x_data_file_name
    save_model_data(sequence, target_path, base_data_type)
    return sequence
def export_train_balance(x_train, y_train):
    """Balance the training data and save the balanced features and labels.

    Args:
        x_train: Training features, forwarded to ``generate_balance_data``.
        y_train: Training labels, forwarded to ``generate_balance_data``.

    Returns:
        None. The balanced data is only written via ``save_model_data``.
    """
    # NOTE(review): the title shows config.train_data_type while the files are
    # saved with train_balance_data_type -- confirm this is intended.
    show_title(f"对类别 :{label_name}实施平衡{config.train_data_type}")
    balanced_x, balanced_y = generate_balance_data(x_train, y_train)

    from src.base.config import (
        train_balance_data_type,
        x_train_balance_file_name,
        y_train_balance_file_name,
    )

    # Features first, then labels -- same order as the two explicit saves.
    outputs = (
        (balanced_x, x_train_balance_file_name),
        (balanced_y, y_train_balance_file_name),
    )
    for payload, file_name in outputs:
        save_model_data(payload, data_file_path + file_name, train_balance_data_type)
def export_day_list_data():
    """Load the original data, build the per-day list data, and save it.

    Returns:
        A ``(lst_data, y_data)`` tuple exactly as returned by
        ``generate_day_list_data``; both items are also written out through
        ``save_model_data`` using ``config.base_data_type``.
    """
    from src.data.load_data import load_original_data

    show_title("加载原始数据")
    raw_x, raw_y = load_original_data()

    from src.data.generate_data import generate_day_list_data

    show_title("导出每个用户每天访问数据的不截断列表")
    day_lists, labels = generate_day_list_data(raw_x, raw_y)

    # Persist the list data first, then the labels.
    lst_path = data_file_path + config.lst_data_file_name
    save_model_data(day_lists, lst_path, config.base_data_type)

    label_path = data_file_path + config.y_data_file_name
    save_model_data(labels, label_path, config.base_data_type)

    return day_lists, labels
def export_train_test_data(x_data, y_data):
    """Split the data into train/test sets, save all four parts, return train.

    Both ``x_data`` and ``y_data`` are truncated to the first
    ``config.user_id_max`` entries before splitting. Previously only
    ``x_data`` was truncated, so a longer ``y_data`` made ``train_test_split``
    fail on inconsistent sample counts. When ``y_data`` already has at most
    ``config.user_id_max`` entries the extra slice changes nothing.

    Args:
        x_data: Sliceable collection of samples.
        y_data: Sliceable collection of labels aligned with ``x_data``; also
            used as the stratification target.

    Returns:
        ``(x_train, y_train)``. The test split is saved but not returned.
    """
    # Keep features and labels aligned: apply the same cut-off to both.
    sample_limit = config.user_id_max
    x_data = x_data[0:sample_limit]
    y_data = y_data[0:sample_limit]

    show_title("拆分训练数据集和测试数据集")
    # No test_size is passed, so the library's default proportions apply;
    # the seed keeps the split reproducible.
    x_train, x_test, y_train, y_test = train_test_split(
        x_data, y_data, random_state=config.seed, stratify=y_data)

    from src.base.config import train_data_type
    from src.base.config import x_train_file_name, y_train_file_name

    save_model_data(x_train, data_file_path + x_train_file_name, train_data_type)
    save_model_data(y_train, data_file_path + y_train_file_name, train_data_type)

    from src.base.config import test_data_type
    from src.base.config import x_test_file_name, y_test_file_name

    save_model_data(x_test, data_file_path + x_test_file_name, test_data_type)
    save_model_data(y_test, data_file_path + y_test_file_name, test_data_type)

    return x_train, y_train
def export_val_data(x_train, y_train):
    """Split the training data into train/validation parts and save them.

    Args:
        x_train: Training features to split.
        y_train: Training labels to split; also the stratification target.

    Returns:
        ``(x_train_val, y_train_val)``, the training portion of the split.
        The validation portion is saved but not returned.
    """
    show_title("拆分训练数据集和验证数据集")
    fit_x, holdout_x, fit_y, holdout_y = train_test_split(
        x_train, y_train, random_state=config.seed, stratify=y_train)

    from src.base.config import (
        train_val_data_type,
        x_train_val_file_name,
        y_train_val_file_name,
    )

    # Training portion: features, then labels.
    for payload, file_name in (
        (fit_x, x_train_val_file_name),
        (fit_y, y_train_val_file_name),
    ):
        save_model_data(payload, data_file_path + file_name, train_val_data_type)

    from src.base.config import (
        val_data_type,
        x_val_file_name,
        y_val_file_name,
    )

    # Validation portion: features, then labels.
    for payload, file_name in (
        (holdout_x, x_val_file_name),
        (holdout_y, y_val_file_name),
    ):
        save_model_data(payload, data_file_path + file_name, val_data_type)

    return fit_x, fit_y
def export_w2v_data(lst_data):
    """Generate the Word2Vec training data from ``lst_data`` and save it.

    Args:
        lst_data: Input handed unchanged to ``generate_w2v_data``.

    Returns:
        None. The generated data is only written via ``save_model_data``.
    """
    show_title("导出用于Word2Vec训练的数据")

    from src.data.generate_data import generate_w2v_data

    corpus = generate_w2v_data(lst_data)

    from src.base.config import data_w2v_path, w2v_file_name, w2v_data_type

    # This export uses its own directory prefix (data_w2v_path) rather than
    # the shared data_file_path used by the other exporters.
    output_path = data_w2v_path + w2v_file_name
    save_model_data(corpus, output_path, w2v_data_type)