def preprocess(self, data_dir, re_seg=True, to_file=False,
               mid_data_paths=None, split_train_test=True, test_ratio=0.2,
               vec_method="count", feature_select=True, is_percent=True,
               feature_keep_percent=90, feature_keep_num=10, min_df=3):
    """Build feature vectors for the data under ``data_dir``.

    A ``Preprocessor`` is configured with this instance's
    ``feature_label_gen`` callback plus the vectorization / feature-selection
    options, and its ``gen_data_vec`` method is then run against ``data_dir``
    and ``self.feature_id_path``.

    Args:
        data_dir: Passed to ``gen_data_vec`` as the data location.
        re_seg, to_file, split_train_test, feature_select: Forwarded
            unchanged to ``gen_data_vec`` under the same keyword names.
        mid_data_paths: Forwarded to ``gen_data_vec`` as
            ``process_file_path``.
        test_ratio, vec_method, is_percent, feature_keep_percent,
        feature_keep_num, min_df: Forwarded unchanged to the
            ``Preprocessor`` constructor.

    Returns:
        None. The result is exposed through ``self.train_data_vec``
        (described by the original author as the data set's feature matrix).
    """
    vectorizer = Preprocessor(
        feature_gen_func=self.feature_label_gen,
        vec_method=vec_method,
        feature_keep_percent=feature_keep_percent,
        feature_keep_num=feature_keep_num,
        is_percent=is_percent,
        test_ratio=test_ratio,
        min_df=min_df,
    )

    # Switches that control how the vectors are generated from the data.
    generation_options = {
        "split_train_test": split_train_test,
        "feature_select": feature_select,
        "to_file": to_file,
        "re_seg": re_seg,
        "process_file_path": mid_data_paths,
    }
    generated = vectorizer.gen_data_vec(
        data_dir, self.feature_id_path, **generation_options)

    # gen_data_vec yields five values; only the second is kept here.
    _, self.train_data_vec, _, _, _ = generated
def preprocess(self, data_dir, re_seg=True, to_file=False,
               mid_data_paths=None, split_train_test=True, test_ratio=0.2,
               vec_method="count", feature_select=True, is_percent=True,
               feature_keep_percent=90, feature_keep_num=10, min_df=3):
    """Run ``Preprocessor.gen_data_vec`` on ``data_dir`` and unpack its result.

    A ``Preprocessor`` is built from this instance's ``feature_label_gen``
    callback and the vectorization / feature-selection options, then
    ``gen_data_vec`` is called with ``data_dir`` and ``self.feature_id_path``.

    Args:
        data_dir: Passed to ``gen_data_vec`` as the data location.
        re_seg, to_file, split_train_test, feature_select: Forwarded
            unchanged to ``gen_data_vec`` under the same keyword names.
        mid_data_paths: Forwarded to ``gen_data_vec`` as
            ``process_file_path``.
        test_ratio, vec_method, is_percent, feature_keep_percent,
        feature_keep_num, min_df: Forwarded unchanged to the
            ``Preprocessor`` constructor.
    """
    preprocessor = Preprocessor(
        feature_gen_func=self.feature_label_gen,
        vec_method=vec_method,
        feature_keep_percent=feature_keep_percent,
        feature_keep_num=feature_keep_num,
        is_percent=is_percent,
        test_ratio=test_ratio,
        min_df=min_df)
    # gen_data_vec yields five values; the first is discarded and the other
    # four are bound to locals named for train/validation data and labels.
    # NOTE(review): none of these locals is used in the visible code and
    # nothing is returned or stored on self -- the method presumably
    # continues beyond this excerpt; confirm against the full file.
    _, train_data, train_label, val_data, val_label = preprocessor.gen_data_vec(
        data_dir, self.feature_id_path,
        split_train_test=split_train_test,
        feature_select=feature_select,
        to_file=to_file,
        re_seg=re_seg,
        process_file_path=mid_data_paths)