def _make_df(input_feature_list: List[str],
             remove_galactic_test_data: bool = True,
             drop_features: List[str] = None):
    """Build the train/test frames used to regress ``hostgal_specz``.

    Metadata from ``common.load_metadata()`` is left-joined on ``object_id``
    with every feature table named in ``input_feature_list``.  Rows with
    ``hostgal_specz > 0`` become the training set; rows whose
    ``hostgal_specz`` is null become the test set.

    Args:
        input_feature_list: Feature names passed to ``common.load_feature``.
        remove_galactic_test_data: When true, test rows are additionally
            restricted to ``hostgal_photoz > 0``.
        drop_features: Optional column names removed after merging.

    Returns:
        ``(x_train, x_test, y_train)``.  Both frames are indexed by
        ``object_id`` and contain neither ``target`` nor ``hostgal_specz``;
        ``y_train`` is the ``hostgal_specz`` column of the training rows.
    """
    df = common.load_metadata()
    for f in tqdm(input_feature_list):
        df = pd.merge(df, common.load_feature(f), on='object_id', how='left')

    # Non-mutating calls on purpose: with an empty feature list ``df`` is
    # still the object handed out by common.load_metadata(), and in-place
    # edits would leak back into whatever that loader keeps around.
    if drop_features is not None:
        df = df.drop(drop_features, axis=1)
    df = df.set_index('object_id')

    is_test = df.hostgal_specz.isnull()
    if remove_galactic_test_data:
        is_test = is_test & (df.hostgal_photoz > 0.0)

    train = df[df.hostgal_specz > 0.0]
    test = df[is_test]

    # Take the label before it is removed from the design matrix.
    y_train = train.hostgal_specz

    # drop() returns new frames, so the filtered slices are never modified
    # in place (avoids pandas' SettingWithCopy behaviour).
    label_columns = ['target', 'hostgal_specz']
    x_train = train.drop(label_columns, axis=1)
    x_test = test.drop(label_columns, axis=1)

    return x_train, x_test, y_train
def save_v4():
    """Export the v4 feature set (f513, f515, f517) joined to object_id/target.

    Each feature table is left-joined on ``object_id`` and the result is
    handed to ``_save`` with the name template 'features_nyanp_all_v4_{}'.
    """
    feature_ids = ('f513', 'f515', 'f517')
    merged = common.load_metadata()[['object_id', 'target']]
    for feature_id in feature_ids:
        merged = pd.merge(merged,
                          common.load_feature(feature_id),
                          on='object_id',
                          how='left')
    _save(merged, 'features_nyanp_all_v4_{}')
def f701_redshift_difference():
    """Save feature f701: ``hostgal_photoz`` minus ``hostgal_z_predicted``.

    Runs ``f601_estimate_redshift()`` first, then reads feature "f601" (which
    must provide ``hostgal_z_predicted``), left-joins it onto the metadata by
    ``object_id`` and stores the difference as
    ``hostgal_photoz_predicted_diff``.
    """
    # Regenerate the estimate before reading it back.
    f601_estimate_redshift()

    predicted = common.load_feature("f601")
    photoz = common.load_metadata()[['object_id', 'hostgal_photoz']]

    joined = pd.merge(photoz, predicted, on='object_id', how='left')
    diff = joined['hostgal_photoz'] - joined['hostgal_z_predicted']
    joined['hostgal_photoz_predicted_diff'] = diff

    out_columns = ['object_id', 'hostgal_photoz_predicted_diff']
    common.save_feature(joined[out_columns], "f701")
def f517_blending_salts():
    """Save feature f517: error-weighted blend of three SALT2-style fits.

    Features f500, f515 and f516 are left-joined onto the metadata by
    ``object_id``.  For every parameter in (x0, t0, z, c, x1) the three
    per-model estimates are averaged with weights ``1 / err**2``, where
    ``err`` is the matching ``<prefix><param>_err`` column.  The result is
    written to ``salt2-<param>-weighted-avg``.
    """
    meta = common.load_metadata()
    fit_tables = [common.load_feature(name)
                  for name in ('f500', 'f515', 'f516')]

    blended = meta[['object_id', 'target', 'hostgal_photoz', 'ddf']]
    for table in fit_tables:
        blended = pd.merge(blended, table, on='object_id', how='left')

    model_prefixes = (
        'sn_salt2_',
        'salt2-extended_p_sn3_salt2-extended_',
        'salt2_p_sn3_salt2_',
    )
    params = ('x0', 't0', 'z', 'c', 'x1')

    averaged_columns = []
    for param in params:
        print('param: {}'.format(param))

        # weighted average based on error
        weights = []
        weighted_values = []
        for model in model_prefixes:
            err = blended['{}{}_err'.format(model, param)]
            weight = 1 / (err * err)
            weights.append(weight)
            weighted_values.append(weight * blended[model + param])

        # Row-wise sums over the three models, in prefix order.
        numerator = pd.concat(weighted_values, axis=1).sum(axis=1)
        denominator = pd.concat(weights, axis=1).sum(axis=1)

        out_column = 'salt2-{}-weighted-avg'.format(param)
        blended[out_column] = numerator / denominator
        averaged_columns.append(out_column)

    common.save_feature(blended[['object_id'] + averaged_columns], 'f517')
def f1010_redshift_difference_perch():
    """Save feature f1010: per-channel flux range scaled by distance squared.

    Metadata is left-joined with features f603 (supplies
    ``hostgal_z_predicted``) and f000 (supplies ``max(flux)_ch*`` /
    ``min(flux)_ch*``).  For channels 0-5 the flux range is multiplied by
    ``Gpc * Gpc`` and stored as ``luminosity_diff_ch<N>``.
    """
    table = common.load_metadata()
    for feature_id in ('f603', 'f000'):
        table = pd.merge(table,
                         common.load_feature(feature_id),
                         on='object_id',
                         how='left')

    # z2pc is defined elsewhere; presumably it maps redshift to a distance
    # in Mpc, as the column name suggests -- verify against its definition.
    table['Mpc'] = table['hostgal_z_predicted'].apply(z2pc)
    table['Gpc'] = table['Mpc'] / 1000.0

    luminosity_columns = []
    for ch in range(6):
        flux_column = 'flux_diff_ch{}'.format(ch)
        luminosity_column = 'luminosity_diff_ch{}'.format(ch)

        flux_max = table['max(flux)_ch{}'.format(ch)]
        flux_min = table['min(flux)_ch{}'.format(ch)]
        table[flux_column] = flux_max - flux_min
        table[luminosity_column] = (
            table[flux_column] * table['Gpc'] * table['Gpc'])

        luminosity_columns.append(luminosity_column)

    common.save_feature(table[['object_id'] + luminosity_columns], "f1010")
# Feature-generation script: runs the f400 (Lomb-Scargle) extractor on the
# full light curve, then the f300-f305 extractors on a slimmed-down copy.
import gc

import numpy as np

from features.f3xx_tsfresh import *
from features.f40x_astropy import *
import common
import config

# gc and np are imported explicitly above: the script uses both and should
# not depend on the star imports happening to re-export them.

debug = config.TRAINING_ONLY
output = config.FEATURE_SAVE_DIR
cv_only = config.TRAINING_ONLY

meta = common.load_metadata()
lc = common.load_lightcurve()
pb = common.load_passband_metadata()

# f400 runs first, while the light curve still has every column.
# The variable is named feature_input so the builtin input() is not shadowed.
feature_input = Input(meta, pb, lc)
f400_lombscargle(input=feature_input, debug=debug, target_dir=output)

# Shrink the light curve before the f30x extractors run
# (presumably they do not need these columns -- confirm in f3xx_tsfresh).
del lc['detected']
del lc['flux_err']
lc['passband'] = lc['passband'].astype(np.uint8)
gc.collect()

# Composite key combining object and passband; assumes passband is a single
# digit so the encoding stays unique -- TODO confirm.
lc['id_passband'] = lc['object_id'] * 10 + lc['passband']

feature_input = Input(meta, pb, lc)

f300_num_peaks(input=feature_input, debug=debug, target_dir=output)
f301_quantile2(input=feature_input, debug=debug, target_dir=output)
f302_quantile8(input=feature_input, debug=debug, target_dir=output)
f303_c3(input=feature_input, debug=debug, target_dir=output)
f304_autocorr1(input=feature_input, debug=debug, target_dir=output)
f305_autocorr2(input=feature_input, debug=debug, target_dir=output)
def __init__(self,
             features_inner: List[str],
             features_extra: List[str],
             model_inner: Model,
             model_extra: Model,
             submit_filename: str = 'submission.csv',
             logdir: str = 'default',
             drop_feat_inner=None,
             drop_feat_extra=None,
             logging_level=logging.DEBUG,
             postproc_version=1,
             mode='both',
             pseudo_n_loop=0,
             pseudo_th=0.97,
             pseudo_classes=None,
             save_pseudo_label=True,
             cache_path_inner=None,
             cache_path_extra=None,
             pl_labels: Dict[str, str] = None,
             use_cache=False):
    """Load metadata, split it into inner/extra subsets and set up logging.

    Rows with ``hostgal_photoz > 0`` go to ``self.df_extra``; all others go
    to ``self.df_inner``.  Each subset that is in use (per
    ``self._use_inner`` / ``self._use_extra``, defined elsewhere on the
    class) is passed through ``self._setup`` together with its feature
    list, drop list and cache path.

    Args:
        features_inner: Feature names forwarded to ``_setup`` for the
            inner subset.
        features_extra: Feature names forwarded to ``_setup`` for the
            extra subset.
        model_inner: Model stored as ``self.model_inner``.
        model_extra: Model stored as ``self.model_extra``.
        submit_filename: Stored as ``self.submit_filename``.  When ``None``
            only rows with a non-null ``target`` are kept.
        logdir: Directory for ``log.txt``; created if missing.  Also used
            as the logger name.
        drop_feat_inner: Forwarded to ``_setup`` for the inner subset.
        drop_feat_extra: Forwarded to ``_setup`` for the extra subset.
        logging_level: Level applied to the logger and its file handler.
        postproc_version: Stored as ``self.postproc_version``.
        mode: Stored as ``self.mode``.
        pseudo_n_loop: Stored as ``self.pseudo_n_loop``.
        pseudo_th: Stored as ``self.pseudo_th``.
        pseudo_classes: Stored as ``self.pseudo_classes``; ``None`` (the
            default) means ``[90]``.
        save_pseudo_label: Stored as ``self.save_pseudo_label``.
        cache_path_inner: Forwarded to ``_setup`` for the inner subset.
        cache_path_extra: Forwarded to ``_setup`` for the extra subset.
        pl_labels: Stored as ``self.pl_labels``.
        use_cache: Forwarded to ``_setup``.

    Raises:
        OSError: If ``logdir`` cannot be created.
    """
    # Only "already exists" is tolerated; other failures surface here
    # instead of at the FileHandler below.
    os.makedirs(logdir, exist_ok=True)

    df = common.load_metadata()

    self.mode = mode
    self.logdir = logdir

    if submit_filename is None:
        self.submit_filename = None
        # use training data only
        # NOTE(review): reset_index() without drop=True keeps the old index
        # as an 'index' column -- confirm that is intended.
        df = df[~df.target.isnull()].reset_index()
    else:
        self.submit_filename = submit_filename

    df['extra'] = (df['hostgal_photoz'] > 0.0).astype(np.int32)

    self.df_inner = df[df.extra == 0].reset_index(drop=True)
    self.df_extra = df[df.extra == 1].reset_index(drop=True)

    self.model_inner = model_inner
    self.model_extra = model_extra

    self.logger = logging.getLogger(logdir)
    self.logger.setLevel(logging_level)
    self.fh = logging.FileHandler(os.path.join(self.logdir, 'log.txt'))
    self.fh.setLevel(logging_level)

    # Attach the handler only once per logger name so repeated construction
    # with the same logdir does not duplicate log lines.
    if len(self.logger.handlers) == 0:
        self.logger.addHandler(self.fh)

    self.logger.info('load features...')

    if self._use_inner:
        self.df_inner = self._setup(self.df_inner, features_inner,
                                    drop_feat_inner, cache_path_inner,
                                    use_cache)
        if config.MODELING_MODE == 'small':
            self.df_inner = self.df_inner[galactic_top16 +
                                          ['object_id', 'target']]
        gc.collect()

    if self._use_extra:
        self.df_extra = self._setup(self.df_extra, features_extra,
                                    drop_feat_extra, cache_path_extra,
                                    use_cache)
        if config.MODELING_MODE == 'small':
            self.df_extra = self.df_extra[extragalactic_top16 +
                                          ['object_id', 'target']]
        gc.collect()

    # Kept outside the branch so the attribute exists in every mode.
    self.df_extra_pseudo = self.df_extra.copy()

    self.postproc_version = postproc_version
    self.pseudo_n_loop = pseudo_n_loop
    # A fresh list per instance: a list literal in the signature would be
    # shared by every object constructed with the default.
    self.pseudo_classes = [90] if pseudo_classes is None else pseudo_classes
    self.pseudo_th = pseudo_th
    self.save_pseudo_label = save_pseudo_label
    self.pl_labels = pl_labels
def save_v1():
    """Export the v1 feature sets.

    All listed feature tables are left-joined onto a metadata base by
    ``object_id``.  Column names must be unique across tables; a clash is
    printed and then trips an assertion.  Three exports are written through
    ``_save``: the ``best_subset_v1`` columns, the ``best16_v1`` columns, and
    the full table with selected columns prefixed by ``xxx_``.
    """
    feature_ids = [
        'f000', 'f001', 'f002', 'f010', 'f026', 'f050', 'f051', 'f052',
        'f053', 'f054', 'f061', 'f063', 'f100', 'f1000', 'f1001', 'f1002',
        'f1003', 'f1004', 'f1005', 'f1006', 'f101', 'f1010', 'f102', 'f103',
        'f104', 'f106', 'f107', 'f108', 'f1080', 'f1081', 'f1082', 'f1083',
        'f1085', 'f1086', 'f1087', 'f1088', 'f1089', 'f109', 'f110', 'f140',
        'f141', 'f142', 'f143', 'f144', 'f150', 'f151', 'f152', 'f153',
        'f200', 'f201', 'f202', 'f203', 'f204', 'f205', 'f300', 'f301',
        'f302', 'f303', 'f304', 'f305', 'f306', 'f307', 'f308', 'f309',
        'f310', 'f311', 'f330', 'f340', 'f350', 'f370', 'f400', 'f500',
        'f505', 'f506', 'f507', 'f600', 'f701',
    ]

    best_subset_v1 = [
        'object_id', 'hostgal_photoz_err', 'distmod', 'mwebv',
        'mean(flux)_ch0', 'mean(flux)_ch1', 'mean(flux)_ch2',
        'mean(flux)_ch3', 'mean(flux)_ch4', 'mean(flux)_ch5',
        'max(flux)_ch0', 'max(flux)_ch1',
        'min(flux)_ch0', 'min(flux)_ch1', 'min(flux)_ch2',
        'min(flux)_ch3', 'min(flux)_ch4', 'min(flux)_ch5',
        'median(flux)_ch0', 'median(flux)_ch1', 'median(flux)_ch2',
        'median(flux)_ch3', 'median(flux)_ch4', 'median(flux)_ch5',
        'std(flux)_ch0', 'std(flux)_ch1', 'std(flux)_ch4', 'std(flux)_ch5',
        'timescale_th0.35_max_ch0', 'timescale_th0.35_max_ch1',
        'timescale_th0.35_max_ch2', 'timescale_th0.35_max_ch3',
        'timescale_th0.35_max_ch4', 'timescale_th0.35_max_ch5',
        'diff(max(flux))_0_1', 'diff(max(flux))_1_2', 'diff(max(flux))_2_3',
        'diff(max(flux))_3_4', 'diff(max(flux))_4_5',
        'mean(detected)_ch1', 'mean(detected)_ch2', 'mean(detected)_ch3',
        'mean(detected)_ch5',
        'std(detected)_ch3', 'std(detected)_ch5',
        'diff(max(flux))_0_3', 'diff(max(flux))_1_4', 'diff(max(flux))_2_5',
        'timescale_th0.5_min_ch0', 'timescale_th0.5_min_ch1',
        'timescale_th0.5_min_ch2', 'timescale_th0.5_min_ch4',
        'timescale_th0.5_min_ch5',
        'mean(flux)', 'max(flux)', 'min(flux)',
        'timescale_th0.35_min_ch0', 'timescale_th0.35_min_ch1',
        'timescale_th0.35_min_ch2', 'timescale_th0.35_min_ch4',
        'timescale_th0.35_min_ch5',
        'timescale_th0.15_max_ch0', 'timescale_th0.15_max_ch1',
        'timescale_th0.15_max_ch2', 'timescale_th0.15_max_ch3',
        'timescale_th0.15_max_ch4', 'timescale_th0.15_max_ch5',
        'max(flux_slope)_ch0', 'max(flux_slope)_ch1', 'max(flux_slope)_ch2',
        'max(flux_slope)_ch3', 'max(flux_slope)_ch4', 'max(flux_slope)_ch5',
        'min(flux_slope)_ch0', 'min(flux_slope)_ch1', 'min(flux_slope)_ch2',
        'min(flux_slope)_ch3', 'min(flux_slope)_ch4', 'min(flux_slope)_ch5',
        'flux__c3__lag_1_ch0', 'flux__c3__lag_1_ch1', 'flux__c3__lag_1_ch2',
        'flux__c3__lag_1_ch3', 'flux__c3__lag_1_ch4', 'flux__c3__lag_1_ch5',
        'flux__autocorrelation__lag_1_ch0',
        'flux__autocorrelation__lag_1_ch1',
        'flux__autocorrelation__lag_1_ch2',
        'flux__autocorrelation__lag_1_ch3',
        'flux__autocorrelation__lag_1_ch4',
        'flux__autocorrelation__lag_1_ch5',
        'delta',
        'max(astropy.lombscargle.power)_ch0',
        'max(astropy.lombscargle.power)_ch1',
        'max(astropy.lombscargle.power)_ch2',
        'max(astropy.lombscargle.power)_ch3',
        'max(astropy.lombscargle.power)_ch4',
        'max(astropy.lombscargle.power)_ch5',
        'astropy.lombscargle.timescale_ch0',
        'astropy.lombscargle.timescale_ch1',
        'astropy.lombscargle.timescale_ch2',
        'astropy.lombscargle.timescale_ch3',
        'astropy.lombscargle.timescale_ch4',
        'astropy.lombscargle.timescale_ch5',
        'diff(max(flux))_0_4', 'diff(max(flux))_1_5', 'diff(max(flux))_0_5',
        'diff(min(flux))_0_1', 'diff(min(flux))_2_3', 'diff(min(flux))_4_5',
        'amp(flux)_ch0/ch1', 'amp(flux)_ch1/ch2', 'amp(flux)_ch2/ch3',
        'amp(flux)_ch3/ch4', 'amp(flux)_ch4/ch5', 'amp(flux)_ch0/ch2',
        'amp(flux)_ch1/ch3', 'amp(flux)_ch2/ch4', 'amp(flux)_ch3/ch5',
        'amp(flux)_ch0/ch3', 'amp(flux)_ch1/ch4', 'amp(flux)_ch2/ch5',
        'amp(flux)_ch0/ch4', 'amp(flux)_ch1/ch5', 'amp(flux)_ch0/ch5',
        'delta(max(flux), last(detected))',
        'delta(first(detected), max(flux))',
        'delta(max(flux), last(detected))_ch1',
        'delta(max(flux), last(detected))_ch2',
        'delta(max(flux), last(detected))_ch3',
        'delta(max(flux), last(detected))_ch4',
        'delta(max(flux), last(detected))_ch5',
        'delta(first(detected), max(flux))_ch1',
        'delta(first(detected), max(flux))_ch2',
        'delta(first(detected), max(flux))_ch3',
        'delta(first(detected), max(flux))_ch4',
        'delta(first(detected), max(flux))_ch5',
        'detected_median(flux)_ch1', 'detected_median(flux)_ch2',
        'detected_median(flux)_ch3', 'detected_median(flux)_ch4',
        'detected_median(flux)_ch5',
        'detected_diff(median(flux))_0_1', 'detected_diff(median(flux))_1_2',
        'detected_diff(median(flux))_2_3', 'detected_diff(median(flux))_3_4',
        'detected_diff(median(flux))_4_5',
        '0__fft_coefficient__coeff_0__attr_"abs"',
        '0__fft_coefficient__coeff_1__attr_"abs"',
        '0__kurtosis', '0__skewness',
        '1__fft_coefficient__coeff_0__attr_"abs"',
        '1__fft_coefficient__coeff_1__attr_"abs"',
        '1__kurtosis', '1__skewness',
        '2__fft_coefficient__coeff_1__attr_"abs"',
        '2__kurtosis', '2__skewness',
        '3__fft_coefficient__coeff_0__attr_"abs"',
        '3__kurtosis', '3__skewness',
        '4__fft_coefficient__coeff_0__attr_"abs"',
        '4__fft_coefficient__coeff_1__attr_"abs"',
        '4__kurtosis', '4__skewness',
        '5__fft_coefficient__coeff_0__attr_"abs"',
        '5__fft_coefficient__coeff_1__attr_"abs"',
        '5__kurtosis', '5__skewness',
        'hostgal_z_predicted',
        'sn_salt2_chisq', 'sn_salt2_z', 'sn_salt2_t0', 'sn_salt2_x0',
        'sn_salt2_x1', 'sn_salt2_c',
        'sn_salt2_z_err', 'sn_salt2_t0_err', 'sn_salt2_x0_err',
        'sn_salt2_x1_err', 'sn_salt2_c_err',
        'luminosity_est_diff_ch0', 'luminosity_est_diff_ch1',
        'luminosity_est_diff_ch2', 'luminosity_est_diff_ch3',
        'luminosity_est_diff_ch4', 'luminosity_est_diff_ch5',
    ]

    best16_v1 = [
        'object_id', 'sn_salt2_c', 'delta', 'sn_salt2_x1', 'distmod',
        'luminosity_est_diff_ch4', 'luminosity_est_diff_ch5',
        'luminosity_est_diff_ch3', 'luminosity_est_diff_ch2',
        'hostgal_photoz_err', 'hostgal_z_predicted',
        'luminosity_est_diff_ch0', 'luminosity_est_diff_ch1',
        'sn_salt2_chisq', 'sn_salt2_z', 'amp(flux)_ch3/ch5', '0__skewness',
    ]

    metadata_columns = ['object_id', 'distmod', 'hostgal_photoz_err',
                        'mwebv', 'target']
    merged = common.load_metadata()[metadata_columns]

    for feature_id in tqdm(feature_ids):
        table = common.load_feature(feature_id)

        # Two tables get their columns renamed before the merge.
        if feature_id == 'f1080':
            table.columns = ['object_id', 'delta_SNR3']
        elif feature_id == 'f1010':
            table.columns = ['object_id'] + [
                name + '_estimated' for name in table.columns.tolist()[1:]]

        # Every non-key column must be new to the merged table.
        for column in table:
            if column != 'object_id':
                if column in merged:
                    print('{} is already in base(f): {}, {}'.format(
                        column, feature_id, merged.columns.tolist()))
                assert column not in merged

        table['object_id'] = table['object_id'].astype(np.int32)
        merged = pd.merge(merged, table, on='object_id', how='left')

    # -> yuval
    _save(merged[best_subset_v1 + ['target']], 'nyanp_feat_v1_{}')
    _save(merged[best16_v1 + ['target']], 'nyanp_feat_v1_{}_top16')

    # add prefix to oof features
    oof_columns = [
        'hostgal_z_predicted',
        'hostgal_photoz_predicted_diff',
        'luminosity_est_diff_ch0',
        'luminosity_est_diff_ch1',
        'luminosity_est_diff_ch2',
        'luminosity_est_diff_ch3',
        'luminosity_est_diff_ch4',
        'luminosity_est_diff_ch5',
        'luminosity_diff_ch0_estimated',
        'luminosity_diff_ch1_estimated',
        'luminosity_diff_ch2_estimated',
        'luminosity_diff_ch3_estimated',
        'luminosity_diff_ch4_estimated',
        'luminosity_diff_ch5_estimated',
    ]
    prefixed = {name: 'xxx_' + name for name in oof_columns}
    merged = merged.rename(columns=prefixed)

    # -> mamas
    _save(merged.drop(['distmod', 'hostgal_photoz_err', 'mwebv'], axis=1),
          'features_nyanp_all_v1_{}')
def save_v2():
    """Export the v2 feature set: object_id/target inner-joined with f509.

    The join is an inner join, so only objects present in feature f509 are
    kept.  The result is handed to ``_save`` with the name template
    'features_nyanp_all_v2_{}'.
    """
    labels = common.load_metadata()[['object_id', 'target']]
    f509 = common.load_feature('f509')
    joined = pd.merge(labels, f509, on='object_id', how='inner')
    _save(joined, 'features_nyanp_all_v2_{}')