        del df[f'{PREF}_object_id']
        utils.to_pkl_gzip(df, f'../data/train_aug_{PREF}.pkl')
        os.system(f'rm ../data/tmp_{PREF}*')

    # test
    if utils.GENERATE_TEST:
        # keep only this prefix's features that earned positive gain in the
        # importance file (top GENERATE_FEATURE_SIZE rows)
        imp = pd.read_csv(utils.IMP_FILE).head(utils.GENERATE_FEATURE_SIZE)
        mask = imp.feature.str.startswith(f'{PREF}') & (imp.gain > 0)
        usecols = imp[mask].feature.tolist()
        #        usecols = [c.replace(f'{PREF}_', '') for c in usecols]
        #        usecols += ['object_id']

        # aggregate each raw test-log chunk in its own worker process,
        # one temporary pickle per chunk
        os.system(f'rm ../data/tmp_{PREF}*')
        argss = []
        for i, file in enumerate(utils.TEST_LOGS):
            argss.append([file, f'../data/tmp_{PREF}{i}.pkl'])
        pool = Pool(cpu_count())
        pool.map(multi, argss)
        pool.close()
        pool.join()
        # concatenate the chunk outputs, restore object order, then drop the
        # join key before saving
        df = pd.concat(
            [pd.read_pickle(f) for f in glob(f'../data/tmp_{PREF}*')],
            ignore_index=True)
        df.sort_values(f'{PREF}_object_id', inplace=True)
        df.reset_index(drop=True, inplace=True)
        del df[f'{PREF}_object_id']
        utils.to_pkl_gzip(df, f'../data/test_{PREF}.pkl')
        utils.save_test_features(df[usecols])
        os.system(f'rm ../data/tmp_{PREF}*')

    utils.end(__file__)
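
# The fragment above begins mid-script; the module header it assumes would
# look roughly like this sketch. The PREF value 'f000' is hypothetical, and
# `utils` and the `multi` worker are defined elsewhere in the original script.
import os
from glob import glob
from multiprocessing import Pool, cpu_count

import pandas as pd

import utils

PREF = 'f000'  # hypothetical feature prefix
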
Example #2
def multi(pref):
    # re-export the previously generated features for one prefix: load its
    # gzip-pickled test frame, keep only the columns listed in the
    # module-level `gen_features`, and hand them to the feature store
    df = pd.read_pickle(f'../data/test_{pref}.pkl.gz')
    col = list(set(df.columns) & set(gen_features))
    print(pref, col)
    utils.save_test_features(df[col])
    return
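
# A usage sketch for the worker above: map it over the feature prefixes whose
# per-prefix test pickles exist under ../data/. The prefix names here are
# hypothetical, and `gen_features` must be defined at module level.
if __name__ == "__main__":
    from multiprocessing import Pool, cpu_count
    prefs = ['f001', 'f002', 'f003']  # hypothetical prefixes
    with Pool(cpu_count()) as pool:
        pool.map(multi, prefs)
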
Example #3
# remove any stale train/test artifacts for this prefix before regenerating
os.system(f'rm ../data/t*_{PREF}*')
os.system(f'rm ../feature/t*_{PREF}*')

#def mk_feats(df):
#    df['hostgal_specz-m-hostgal_photoz'] = df['hostgal_specz'] - df['hostgal_photoz']
#    df['hostgal_specz-d-hostgal_photoz'] = df['hostgal_specz'] / df['hostgal_photoz']
#    df['hostgal_photoz-d-hostgal_photoz_err'] = df['hostgal_photoz'] / df['hostgal_photoz_err']
#    df['hostgal_specz-d-hostgal_photoz_err'] = df['hostgal_specz'] / df['hostgal_photoz_err']
#    return
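
# A hedged, active version of the feature block commented out above: the same
# four redshift interaction columns, with the extra (assumed, not original)
# guard of mapping infinities from zero denominators to NaN.
import numpy as np


def mk_feats(df):
    df['hostgal_specz-m-hostgal_photoz'] = df['hostgal_specz'] - df['hostgal_photoz']
    df['hostgal_specz-d-hostgal_photoz'] = df['hostgal_specz'] / df['hostgal_photoz']
    df['hostgal_photoz-d-hostgal_photoz_err'] = df['hostgal_photoz'] / df['hostgal_photoz_err']
    df['hostgal_specz-d-hostgal_photoz_err'] = df['hostgal_specz'] / df['hostgal_photoz_err']
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    return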

# =============================================================================
# main
# =============================================================================
if __name__ == "__main__":
    utils.start(__file__)

    # prefix the metadata columns and persist the train / augmented-train frames
    train = utils.load_train().drop(['object_id', 'target'], axis=1)
    train.add_prefix(PREF + '_').to_pickle(f'../data/train_{PREF}.pkl')
    train_aug = pd.read_pickle('../data/train_aug.pkl').drop(
        ['object_id', 'object_id_bk', 'target'], axis=1)
    train_aug.add_prefix(PREF + '_').to_pickle(f'../data/train_aug_{PREF}.pkl')

    # for rows with hostgal_photoz == 0 (galactic objects) force the
    # spectroscopic redshift to 0 as well before saving the test features
    test = utils.load_test().drop(['object_id'], axis=1)
    test.loc[test.hostgal_photoz == 0, 'hostgal_specz'] = 0
    test = test.add_prefix(PREF + '_')
    test.to_pickle(f'../data/test_{PREF}.pkl')
    utils.save_test_features(test)

    utils.end(__file__)
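
# The scripts in this listing lean on a project-local `utils` module. The
# stand-ins below are my assumption, inferred only from how the helpers are
# called here: `to_pkl_gzip` appends '.gz' and writes a gzip-compressed pickle
# (example #2 reads '../data/test_{pref}.pkl.gz'), and `save_test_features`
# drops each prefixed column into ../feature/ as its own pickle (consistent
# with the `rm ../feature/t*_{PREF}*` cleanup above). The real helpers may differ.
def to_pkl_gzip(df, path):
    # callers pass the path without the '.gz' suffix
    df.to_pickle(path + '.gz', compression='gzip')
    return


def save_test_features(df):
    # one pickle per feature column, named after the (already prefixed) column
    for c in df.columns:
        df[[c]].to_pickle(f'../feature/test_{c}.pkl')
    return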

if __name__ == "__main__":
    utils.start(__file__)

    # train: aggregate the full training log in one pass
    usecols = None
    aggregate(pd.read_pickle('../data/train_log.pkl'), f'../data/train_{PREF}.pkl')

    # test
    if is_test:  # flag defined elsewhere in the module
        # features kept for the test set: this prefix's positive-gain entries
        # (prefix stripped) plus the object_id key
        imp = pd.read_csv(utils.IMP_FILE).head(utils.GENERATE_FEATURE_SIZE)
        mask = imp.feature.str.startswith(f'{PREF}') & (imp.gain > 0)
        usecols = imp[mask].feature.tolist()
        usecols = [c.replace(f'{PREF}_', '') for c in usecols]
        usecols += ['object_id']

        # aggregate each raw test-log chunk in its own worker process
        os.system(f'rm ../data/tmp_{PREF}*')
        argss = []
        for i, file in enumerate(utils.TEST_LOGS):
            argss.append([file, f'../data/tmp_{PREF}{i}.pkl'])
        pool = Pool(cpu_count())
        pool.map(multi, argss)
        pool.close()
        pool.join()

        # concatenate the chunk outputs, restore object order, drop the key
        df = pd.concat([pd.read_pickle(f) for f in glob(f'../data/tmp_{PREF}*')],
                       ignore_index=True)
        df.sort_values(f'{PREF}_object_id', inplace=True)
        df.reset_index(drop=True, inplace=True)
        del df[f'{PREF}_object_id']
        df.to_pickle(f'../data/test_{PREF}.pkl')
        utils.save_test_features(df)
        os.system(f'rm ../data/tmp_{PREF}*')

    utils.end(__file__)
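
# Neither `aggregate` nor the `multi` worker this script maps over
# [input, output] pairs is shown in the listing (the `multi` in example #2 is
# a different helper). The sketch below is an assumption about their shape,
# inferred from how they are called above; the chosen flux statistics are
# illustrative only, not the project's real feature set.
def aggregate(df, output_path, drop_oid=True):
    # per-object flux statistics, prefixed like the rest of this script's output
    pt = df.groupby('object_id').agg({'flux': ['min', 'max', 'mean', 'std']})
    pt.columns = [f'{a}_{b}' for a, b in pt.columns]
    pt = pt.add_prefix(f'{PREF}_').reset_index()
    pt.rename(columns={'object_id': f'{PREF}_object_id'}, inplace=True)
    if drop_oid:
        del pt[f'{PREF}_object_id']
    pt.to_pickle(output_path)
    return


def multi(args):
    # Pool.map worker: args == [raw log chunk path, tmp output path]; the
    # chunk output keeps the prefixed object_id so the caller can sort on it
    input_path, output_path = args
    aggregate(pd.read_pickle(input_path), output_path, drop_oid=False)
    return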