def get_train_test(feat_path_list, base=None, target='target'):
    """Load feature files, join them onto `base`, and split into train/test.

    Args:
        feat_path_list: paths of serialized feature files to load in parallel.
        base: base DataFrame (id/target columns) to prepend column-wise.
            The original default was a mutable ``[]``, which both leaked
            state between calls and crashed immediately on ``.shape``;
            it is now ``None`` and explicitly required.
        target: name of the target column; rows with a null target are test.

    Returns:
        (train, test) DataFrames, each with a reset index.

    Raises:
        ValueError: if `base` is not supplied.
    """
    if base is None:
        # Fail fast with a clear message instead of AttributeError on [].shape.
        raise ValueError("base DataFrame is required")
    print(base.shape)  # debug: confirm base shape before the concat
    feature_list = utils.parallel_load_data(path_list=feat_path_list)
    df_feat = pd.concat(feature_list, axis=1)
    df_feat = pd.concat([base, df_feat], axis=1)
    # Rows with a target value are train; a null target marks test rows.
    train = df_feat[~df_feat[target].isnull()].reset_index(drop=True)
    test = df_feat[df_feat[target].isnull()].reset_index(drop=True)
    return train, test
def get_dataset(base, model_no):
    """Assemble train/test DataFrames from winner + tmp/exp feature files.

    Args:
        base: base DataFrame (id/target columns) to prepend column-wise.
        model_no: index selecting one model feature directory from the
            candidate list below.

    Returns:
        (train, test) DataFrames split on null `target`, with missing
        values imputed column by column.

    Note:
        `target`, `debug`, and `ignore_list` are module-level globals
        defined elsewhere in this file.
    """
    win_path = f'../features/4_winner/*.gz'
    # BUG FIX: the original indexed the list literal with [model_no] AND
    # then indexed the selected string again with [model_no], which
    # produced a single character instead of a glob pattern. Select the
    # model path exactly once.
    model_path_list = [
        f'../model/LB3670_70leaves_colsam0322/*.gz',
        '../model/E2_lift_set/*.gz',
        '../model/E3_PCA_set/*.gz',
        '../model/E4_mix_set/*.gz',
        '../model/LB3669LB_70leaves/*.gz',
    ]
    model_path = model_path_list[model_no]

    tmp_path_list = glob.glob(f'../features/5_tmp/*.gz') + glob.glob(
        f'../features/0_exp/*.gz')
    # NOTE(review): `model_path` is currently unused — the original
    # overwrote its first win_path_list assignment, and the effective
    # value below (winner features + tmp features) is preserved here.
    win_path_list = glob.glob(win_path) + tmp_path_list

    feature_list = utils.parallel_load_data(path_list=win_path_list)
    df_feat = pd.concat(feature_list, axis=1)
    base = pd.concat([base, df_feat], axis=1)

    # Rows with a target value are train; a null target marks test rows.
    train = base[~base[target].isnull()]
    test = base[base[target].isnull()]
    if debug:
        train = train.head(10000)
        test = test.head(1000)

    # Impute every feature column except those explicitly ignored.
    for col in train.columns:
        if col in ignore_list:
            continue
        train[col] = utils.impute_feature(df=train, col=col)
        test[col] = utils.impute_feature(df=test, col=col)
    return train, test
# Module-level data preparation: load the base frame, route per-split
# (train_*/test_*) feature files, and assemble the train/test matrices.
base = utils.read_df_pkl('../input/base*')
win_path_list = glob.glob(win_path)  # `win_path` is defined elsewhere in this file
train_path_list = []
test_path_list = []
# Feature file names carry a 'train'/'test' marker; route each path to
# the matching list.
for path in win_path_list:
    if path.count('train'):
        train_path_list.append(path)
    elif path.count('test'):
        test_path_list.append(path)
# train_path_list = sorted(train_path_list)[:20]
# test_path_list = sorted(test_path_list)[:20]
# Rows with a target value are train; a null target marks test rows.
base_train = base[~base[target].isnull()].reset_index(drop=True)
base_test = base[base[target].isnull()].reset_index(drop=True)
train_feature_list = utils.parallel_load_data(path_list=train_path_list)
test_feature_list = utils.parallel_load_data(path_list=test_path_list)
train = pd.concat(train_feature_list, axis=1)
train = pd.concat([base_train, train], axis=1)
test = pd.concat(test_feature_list, axis=1)
test = pd.concat([base_test, test], axis=1)
# `key` is a module-level global defined elsewhere in this file.
train.set_index(key, inplace=True)
test.set_index(key, inplace=True)
# Median imputation for the remaining nulls.
train.fillna(train.median(), inplace=True)
test.fillna(test.median(), inplace=True)
# Numeric feature columns, excluding the target and 'amount' columns.
# NOTE(review): this list comprehension continues past this chunk — its
# closing bracket lies outside the visible source.
num_list = [
    col for col in train.columns
    if (str(train[col].dtype).count('int') or str(train[col].dtype).count(
        'float')) and col != target and not (col.count('amount'))
# Ensemble 3 set3 = f'../model/E3_set/*.gz' # Ensemble 4 set4 = f'../model/E4_set/*.gz' set_list = [set1, set2, set3, set4] win_path = set_list[int(sys.argv[2])] win_path_list = glob.glob(win_path) base = utils.read_df_pkl('../input/base_term*0*')[[ key, target, 'first_active_month' ]] base_train = base[~base[target].isnull()].reset_index(drop=True) base_test = base[base[target].isnull()].reset_index(drop=True) feature_list = utils.parallel_load_data(path_list=win_path_list) df = pd.concat(feature_list, axis=1) train = pd.concat([base_train, df.iloc[:len(base_train), :]], axis=1) test = pd.concat( [base_test, df.iloc[len(base_train):, :].reset_index(drop=True)], axis=1) train.reset_index(inplace=True, drop=True) test.reset_index(inplace=True, drop=True) if out_part == 'no_out': train = train[train[target] > -30] #======================================================================== #======================================================================== # 正規化の前処理(Null埋め, inf, -infの処理) for col in train.columns:
def feature_matrix(self,
                   feat_key_list=None,
                   is_reduce=False,
                   feim_path='',
                   rank=50000,
                   gain=0,
                   limit=3000):
    '''
    Explain:
        Build train and test DataFrames from self.feature_path_list.
        When the path list has not been prepared yet, every feature
        under the features directory is collected first.
    Args:
        feat_key_list: keys used to collect feature paths when the path
            list is empty. Default fixed from a shared mutable ``[]``
            to ``None``.
        is_reduce: when True, downcast dtypes to reduce memory usage.
        feim_path: feature-importance file; when given, only features
            selected by rank/gain are kept.
        rank: keep the top-`rank` features by importance.
        gain: when non-zero, select train features by gain instead of
            rank (the test side always selects by rank — see note below).
        limit: cap on the number of test feature files loaded.
    Return:
        train, test (DF)
    '''
    if feat_key_list is None:
        # Avoid the mutable-default-argument pitfall of the original.
        feat_key_list = []
    if len(self.feature_path_list) == 0:
        self.get_feature_path_list(feat_key_list=feat_key_list)
    self.feature_path_list += glob.glob('../features/raw_feature/*.gz')

    train_path_list = []
    test_path_list = []
    for path in self.feature_path_list:
        filename = utils.get_filename(path)
        if filename[:3] == 'tra':
            if len(feim_path):
                if gain:
                    select_list = self.select_feature(feim_path, gain=gain)
                else:
                    select_list = self.select_feature(feim_path, rank=rank)
                trn_name = filename[6:]  # strip the 'train_' prefix
                if trn_name in select_list:
                    train_path_list.append(path)
            else:
                train_path_list.append(path)
        elif filename[:3] == 'tes':
            if len(feim_path):
                # NOTE(review): unlike the train branch, `gain` is
                # ignored here and rank is passed positionally — confirm
                # select_feature's second positional argument is `rank`.
                select_list = self.select_feature(feim_path, rank)
                tes_name = filename[5:]  # strip the 'test_' prefix
                if tes_name in select_list:
                    test_path_list.append(path)
            else:
                test_path_list.append(path)

    #========================================================================
    # Valid Feature
    valid_list = glob.glob('../features/valid_features/*.gz')
    for path in valid_list:
        filename = utils.get_filename(path)
        if filename[:3] == 'tra':
            train_path_list.append(path)
        elif filename[:3] == 'tes':
            test_path_list.append(path)
    #========================================================================

    # Cap the number of test files, then drop any train file whose test
    # counterpart was cut so both sides stay column-aligned.
    test_path_list = sorted(test_path_list)[:limit]
    remove_list = []
    for path in train_path_list:
        if path.replace('train_', 'test_') not in test_path_list:
            remove_list.append(path)
    train_path_list = list(set(train_path_list) - set(remove_list))

    train_list = utils.parallel_load_data(path_list=train_path_list)
    test_list = utils.parallel_load_data(path_list=test_path_list)
    train = pd.concat(train_list, axis=1)
    test = pd.concat(test_list, axis=1)
    train = pd.concat([self.base_train, train], axis=1)
    test = pd.concat([self.base_test, test], axis=1)

    if is_reduce:
        train = utils.reduce_mem_usage(train)
        test = utils.reduce_mem_usage(test)
    return train, test