del df[f'{PREF}_object_id'] utils.to_pkl_gzip(df, f'../data/train_aug_{PREF}.pkl') os.system(f'rm ../data/tmp_{PREF}*') # test if utils.GENERATE_TEST: imp = pd.read_csv(utils.IMP_FILE).head(utils.GENERATE_FEATURE_SIZE) usecols = imp[imp.feature.str.startswith(f'{PREF}')][ imp.gain > 0].feature.tolist() # usecols = [c.replace(f'{PREF}_', '') for c in usecols] # usecols += ['object_id'] os.system(f'rm ../data/tmp_{PREF}*') argss = [] for i, file in enumerate(utils.TEST_LOGS): argss.append([file, f'../data/tmp_{PREF}{i}.pkl']) pool = Pool(cpu_count()) pool.map(multi, argss) pool.close() df = pd.concat( [pd.read_pickle(f) for f in glob(f'../data/tmp_{PREF}*')], ignore_index=True) df.sort_values(f'{PREF}_object_id', inplace=True) df.reset_index(drop=True, inplace=True) del df[f'{PREF}_object_id'] utils.to_pkl_gzip(df, f'../data/test_{PREF}.pkl') utils.save_test_features(df[usecols]) os.system(f'rm ../data/tmp_{PREF}*') utils.end(__file__)
def multi(pref):
    """Save the generated test features stored under one feature prefix.

    Reads ``../data/test_{pref}.pkl.gz``, keeps the columns that are also
    listed in the module-level ``gen_features``, prints the selection and
    passes that sub-frame to ``utils.save_test_features``.

    Parameters
    ----------
    pref : str
        Feature prefix used to build the pickle path.

    Returns
    -------
    None
    """
    df = pd.read_pickle(f'../data/test_{pref}.pkl.gz')

    wanted = set(gen_features)
    # Select in the frame's own column order.  Iterating a set intersection
    # yields an arbitrary order that can differ between interpreter runs
    # (string hash randomisation), which made the printed list and the saved
    # column order non-reproducible.  dict.fromkeys drops repeated labels
    # while keeping first-seen order, matching the old set-based dedup.
    col = [c for c in dict.fromkeys(df.columns) if c in wanted]

    print(pref, col)
    utils.save_test_features(df[col])
# Remove files left under this script's prefix before they are rebuilt:
# the ../data pickles are rewritten in the main block below.
# NOTE(review): the ../feature entries are presumably recreated by
# utils.save_test_features -- confirm.
for cleanup_cmd in (
    f'rm ../data/t*_{PREF}*',
    f'rm ../feature/t*_{PREF}*',
):
    os.system(cleanup_cmd)

# Disabled redshift-ratio features, kept for reference.
#def mk_feats(df):
#    df['hostgal_specz-m-hostgal_photoz'] = df['hostgal_specz'] - df['hostgal_photoz']
#    df['hostgal_specz-d-hostgal_photoz'] = df['hostgal_specz'] / df['hostgal_photoz']
#    df['hostgal_photoz-d-hostgal_photoz_err'] = df['hostgal_photoz'] / df['hostgal_photoz_err']
#    df['hostgal_specz-d-hostgal_photoz_err'] = df['hostgal_specz'] / df['hostgal_photoz_err']
#    return

# =============================================================================
# main
# =============================================================================
if __name__ == "__main__":
    utils.start(__file__)

    # Every output column is renamed to "<PREF>_<original name>".
    col_prefix = PREF + '_'

    # train: drop the id/label columns, prefix the rest, write the pickle.
    train = utils.load_train()
    train = train.drop(columns=['object_id', 'target'])
    train.add_prefix(col_prefix).to_pickle(f'../data/train_{PREF}.pkl')

    # augmented train: same treatment, plus the backup id column is dropped.
    train_aug = pd.read_pickle('../data/train_aug.pkl')
    train_aug = train_aug.drop(columns=['object_id', 'object_id_bk', 'target'])
    train_aug.add_prefix(col_prefix).to_pickle(f'../data/train_aug_{PREF}.pkl')

    # test: rows whose hostgal_photoz is exactly 0 get hostgal_specz set to 0
    # before the columns are prefixed and saved.
    test = utils.load_test()
    test = test.drop(columns=['object_id'])
    zero_photoz = test['hostgal_photoz'] == 0
    test.loc[zero_photoz, 'hostgal_specz'] = 0
    test = test.add_prefix(col_prefix)
    test.to_pickle(f'../data/test_{PREF}.pkl')
    utils.save_test_features(test)

    utils.end(__file__)
if __name__ == "__main__":
    utils.start(__file__)

    # None for the train pass; replaced by the selected feature names before
    # the test pass.  Deliberately left at module level.
    # NOTE(review): presumably read as a global by aggregate()/multi() -- confirm.
    usecols = None

    aggregate(pd.read_pickle('../data/train_log.pkl'), f'../data/train_{PREF}.pkl')

    # test
    if is_test:
        # Keep only the top GENERATE_FEATURE_SIZE rows of the importance file.
        imp = pd.read_csv(utils.IMP_FILE).head(utils.GENERATE_FEATURE_SIZE)

        # One combined mask instead of imp[mask_a][mask_b]: the chained form
        # applied a full-length boolean Series to an already filtered frame,
        # which relies on implicit reindexing and triggers a pandas
        # UserWarning.  The selected rows are the same.
        selected = imp.feature.str.startswith(f'{PREF}') & (imp.gain > 0)
        usecols = imp[selected].feature.tolist()
        # Strip this script's prefix from the names and always keep object_id.
        usecols = [c.replace(f'{PREF}_', '') for c in usecols]
        usecols += ['object_id']

        # Start from a clean set of per-chunk temporary pickles.
        os.system(f'rm ../data/tmp_{PREF}*')

        # One [input log, output pickle] pair per test log file.
        argss = [
            [file, f'../data/tmp_{PREF}{i}.pkl']
            for i, file in enumerate(utils.TEST_LOGS)
        ]

        # The context manager tears the pool down even if a worker raises;
        # the old close()-only path leaked the workers in that case.
        with Pool(cpu_count()) as pool:
            pool.map(multi, argss)

        # Stitch the chunks back together and order rows by object id.
        df = pd.concat(
            [pd.read_pickle(f) for f in glob(f'../data/tmp_{PREF}*')],
            ignore_index=True,
        )
        df.sort_values(f'{PREF}_object_id', inplace=True)
        df.reset_index(drop=True, inplace=True)
        # The id was only needed for ordering; it is not saved as a feature.
        del df[f'{PREF}_object_id']

        df.to_pickle(f'../data/test_{PREF}.pkl')
        utils.save_test_features(df)

        os.system(f'rm ../data/tmp_{PREF}*')

    utils.end(__file__)