def bin(self, var):
    """Bin the ``var`` field of ``self.data`` station by station.

    For each station label found in the ``'station'`` level of the column
    index, that station's columns are extracted, rows that are entirely NaN
    are dropped, and ``binning.bin`` is applied.  The per-station results are
    concatenated column-wise and stored on ``self.binned``.
    """
    import binning
    field_data = self.data.xs(var, 1, 'field', False)
    stations = self.data.columns.get_level_values('station').unique()
    per_station = []
    for station in stations:
        station_cols = field_data.xs(station, 1, 'station', False)
        per_station.append(binning.bin(station_cols.dropna(0, 'all')))
    self.binned = pd.concat(per_station, 1)
def bin_pca(df_pca, bool_df, df_cont, b_pca):
    """Replace each continuous column of ``df_pca`` with binned labels.

    For every column of ``df_cont``, the column is paired with the binary
    ``'ABANDONED'`` flag from ``bool_df``; the project-level ``bin`` helper
    is consulted for the number of bins, and ``pd.cut`` writes the labelled
    equal-width binning of that column into ``df_pca``.

    Fixes vs. original: removes Python-2-only syntax (``print`` statements,
    backtick repr) to match the Python 3 code elsewhere in this file, stops
    shadowing the ``bool`` and ``dict`` builtins, and returns the bin-count
    mapping that the original built but discarded (backward compatible —
    the original implicitly returned ``None``, so callers ignoring the
    return are unaffected).

    Parameters
    ----------
    df_pca : pandas.DataFrame
        Receives the binned categorical columns, keyed like ``df_cont``.
    bool_df : pandas.DataFrame
        Must contain the binary ``'ABANDONED'`` column.
    df_cont : pandas.DataFrame
        Continuous-valued columns to bin.
    b_pca : str
        Binning runs only when this equals the string ``'True'``
        (NOTE(review): callers appear to pass a string flag — confirm).

    Returns
    -------
    dict or None
        Column name -> number of bins when binning ran, else ``None``.
    """
    if b_pca != 'True':
        return None
    bin_counts = {}
    print(df_cont.dtypes)
    columns = list(df_cont)
    print(columns)
    for col in columns:
        # Pair the continuous column with the abandonment flag; the
        # project-level bin() helper (not the builtin) derives the bin
        # layout from this two-column frame.
        abandoned = pd.DataFrame(bool_df['ABANDONED'], columns=['ABANDONED'])
        paired = pd.concat([df_cont[col], abandoned['ABANDONED']], axis=1)
        layout = bin(paired)
        n_bins = len(layout)
        labels = ['pcl_' + str(r) for r in range(n_bins)]
        df_pca[col] = pd.cut(df_cont[col], bins=n_bins, labels=labels,
                             include_lowest=True)
        print(df_cont[col])
        bin_counts[col] = n_bins
    return bin_counts
def main():
    """TKS prioritization code.

    Reads a TOI csv and an Exofop csv, merges and cleans them, derives
    stellar and planetary quantities, computes TSM values, bins the sample,
    and writes the TOI IDs of the requested priority bin to a text file.

    Example command line call:
        python prioritize.py toi+-2019-11-14.csv exofop_search_2019-11-14_combined.csv
    """
    # Handle the command line input
    args = handle_args()
    toi_fname = os.path.join("data/toi", args.toi_fname)
    exo_fname = os.path.join("data/exofop", args.exofop_fname)
    planet_df = load_and_merge(toi_fname, exo_fname)
    # Mapping from logical names (e.g. "rp_key") to the TOI csv column names.
    toi_col_dict = load_toi_col_names(args.toi_col_dict)
    # Clean up the resulting df a little bit: drop follow-up bookkeeping
    # columns and de-duplicate / sort on the TOI ID.
    planet_df = planet_df.drop(columns=['TFOP SG1a','TFOP SG1b','TFOP SG2',
                                        'TFOP SG3','TFOP SG4','TFOP SG5','TFOP Master',
                                        'TOI Disposition'])
    planet_df = planet_df.drop_duplicates(subset="Full TOI ID").sort_values("Full TOI ID")
    # Add a column for stellar mass calculated from surface gravity
    # N.B. the Exofop data contains stellar mass, but for rows that are not matched
    # with Exofop data, give them a stellar mass.
    # NOTE(review): M = g R^2 / G, but the stellar radius enters only to the
    # first power below — confirm the intended formula/units.
    planet_df[toi_col_dict["ms_key"]] = (10**planet_df["Surface Gravity Value"] * (planet_df[toi_col_dict["rs_key"]] * const.R_sun)\
        / const.G) / const.M_sun
    # Remove rows from the df that don't meet these three criteria:
    # positive planet radius, positive orbital period, SPOC-pipeline source.
    planet_df = planet_df[np.logical_and.reduce((planet_df[toi_col_dict["rp_key"]] > 0,
                                                 planet_df[toi_col_dict["pp_key"]] > 0,
                                                 planet_df["Source Pipeline"] == "spoc"))]
    # Add a column for the ratio of the planet's semi-major orbital axis and
    # the radius of the host star. This column is needed for calculating the
    # TSM (but don't need it if calculating equilibrium temperature via
    # insolation flux).
    planet_df["ar_ratio"] = ar_ratio(planet_df[toi_col_dict["pp_key"]],
                                     planet_df[toi_col_dict["ms_key"]],
                                     planet_df[toi_col_dict["rs_key"]])
    # Reset the indices because some rows might've been removed
    planet_df = planet_df.reset_index(drop = True)
    # Estimate planet masses given radii
    planet_df[toi_col_dict["mp_key"]] = chen_kipping_louie_mass(planet_df[toi_col_dict["rp_key"]])
    # Estimate K amplitude of RV observation
    planet_df["K_amp"] = k_amp(planet_df[toi_col_dict["pp_key"]],
                               planet_df[toi_col_dict["mp_key"]],
                               planet_df[toi_col_dict["ms_key"]])
    # Cull the sample for systems that are observable by Keck in both dec and
    # RV resolution (above -20 degrees dec, > 2 m/s k_amp)
    desirable_inds = np.logical_and(planet_df[toi_col_dict["dec_key"]] > -20,
                                    planet_df['K_amp'] > 2)
    planet_df = planet_df[desirable_inds]
    planet_df = planet_df.reset_index(drop = True)
    # Calculate TSM values — two alternative formulations: the "natalie"
    # variant takes insolation flux (Fp) where the other takes the a/R* ratio.
    if args.use_TSM_natalie:
        planet_df["TSM"] = calculate_TSM_natalie(planet_df[toi_col_dict["rp_key"]],
                                                 planet_df[toi_col_dict["rs_key"]],
                                                 planet_df[toi_col_dict["Ts_key"]],
                                                 planet_df[toi_col_dict["Jmag_key"]],
                                                 planet_df[toi_col_dict["mp_key"]],
                                                 planet_df[toi_col_dict["Fp_key"]])
    else:
        planet_df["TSM"] = calculate_TSM(planet_df[toi_col_dict["rp_key"]],
                                         planet_df[toi_col_dict["rs_key"]],
                                         planet_df[toi_col_dict["Ts_key"]],
                                         planet_df[toi_col_dict["Jmag_key"]],
                                         planet_df[toi_col_dict["mp_key"]],
                                         planet_df[toi_col_dict["ar_key"]])
    ##### Copied from Nicholas' notebook #####
    ##########################################
    # Log-spaced radius and insolation-flux bin edges plus effective
    # temperature edges, used to grid the sample before prioritization.
    rad_bins = 10**(np.linspace(0,1,6))
    fpl_bins = 10**(np.linspace(-1,4,6))
    tef_bins = np.array([2500,3900,5200,6500])
    all_bins = [rad_bins, fpl_bins, tef_bins]
    id_key = "Full TOI ID"
    # NOTE: this is the module-level bin() helper, not the builtin bin().
    binned = bin(toi_col_dict, planet_df, all_bins, id_key, "TSM")
    priority = int(args.priority)
    my_tois = binned[binned["priority"]==priority].reset_index(drop=True).sort_values(id_key)[id_key].values
    ##########################################
    ##########################################
    # Report and persist the selected TOI IDs.
    # (Typo "sucessful" below left as-is: runtime string.)
    print("Priority {} targets: \n{}".format(priority, my_tois))
    output_fname = args.output_fname + "_priority_{}.txt".format(priority)
    np.savetxt(output_fname, my_tois, fmt='%.2f')
    print("Binning sucessful, output stored in {}".format(output_fname))
r = pd.concat([self.regression(b, y) for b in self.blocks]).sort_index() else: r = self.regression(pd.concat(self.blocks, 1).sort_index(), y) r = pd.DataFrame(r) if isinstance(y, pd.Series): r.columns = pd.MultiIndex.from_tuples([y.name], names=x.columns.names) else: r.columns = y.columns return r if __name__ == "__main__": import binning, data D = data.Data() D.open('r', 's_raw.h5') X = binning.bin(D.r).xs('avg', 1, 'aggr') X = X - X.mean() y = X.xs('3', 1, 'station', False).iloc[:, 0].dropna() x = X.drop(y.name, 1).loc[y.index] # b = tree_blocks(x) # a = block_predictors(x, b) # a0 = pd.concat(a, 1).fillna(0) # r1 = np.linalg.lstsq(a0, y) # r2 = pd.concat([regression(c, y) for c in a], 0).sort_index() # r3 = affinity_regression(x, y)
X = df.values X_cont = X[:, :] # print X_cont pca = PCA() pca.fit(X_cont) x3 = pca.transform(X_cont) string = "pca_" pca_column_name = [string + ` i ` for i in range(x3.shape[1])] # print pca_column_name pca_df = pd.DataFrame(x3, columns=pca_column_name) pca_array = [] for i in pca_df.columns: y = 'ABANDONED_FLAG' df_bin = pd.concat((pca_df[i], df_bool[y]), axis=1) dict = bin(df_bin, y) leng = len(dict) if leng > 2: pca_level = 'pcl_' labels = [pca_level + ` r ` for r in range(leng)] pca_df[i] = pd.cut(pca_df[i], bins=leng, labels=labels, include_lowest=True) elif leng <= 2: pca_df.drop([i], axis=1, inplace=True) pca_dummies = pd.get_dummies(pca_df) df_char = pd.read_csv( "cs_loyal_sample.csv", sep='\t',