def main():
    flux_arr, exp_arr, ivar_arr, mask_arr, wavelengths = \
        ICAize.load_all_in_dir('.', pattern="stacked*exp??????.csv")
    temp_flux_arr, temp_exp_arr, temp_ivar_arr, temp_mask_arr, temp_wavelengths = \
        ICAize.load_all_in_dir('.', pattern="stacked*exp??????.fits")

    if len(flux_arr) > 0:
        if len(temp_flux_arr) > 0:
            flux_arr = np.concatenate((flux_arr, temp_flux_arr))
            exp_arr = np.concatenate((exp_arr, temp_exp_arr))
            ivar_arr = np.concatenate((ivar_arr, temp_ivar_arr))
            wavelengths = np.concatenate((wavelengths, temp_wavelengths))
    elif len(temp_flux_arr) > 0:
        flux_arr = temp_flux_arr
        exp_arr = temp_exp_arr
        ivar_arr = temp_ivar_arr
        wavelengths = temp_wavelengths
    else:
        return

    np.savez("compacted_flux_data.npz", flux=flux_arr, exp=exp_arr,
             ivar=ivar_arr, wavelengths=wavelengths)
def MSE(Y, y, multioutput='uniform_average', Y_full=None, flux_arr=None,
        source_model=None, ss=None, source_model_args=None, method=None):
    if Y_full is not None and flux_arr is not None and source_model is not None and ss is not None:
        inds = get_inds_(Y, Y_full)
        back_trans_flux = ICAize.inverse_transform(y, source_model, ss, method, source_model_args)
        try:
            return mean_squared_error(flux_arr[inds], back_trans_flux, multioutput=multioutput)
        except:
            # Older scikit-learn versions do not accept the multioutput keyword
            return mean_squared_error(flux_arr[inds], back_trans_flux)
    else:
        try:
            yss = pp.MaxAbsScaler()
            Y = yss.fit_transform(Y)
            y = yss.transform(y)
        except:
            # Fall back to manual max-abs scaling if MaxAbsScaler is unavailable
            scalefactor = np.amax(np.abs(Y), axis=0)
            Y = Y / scalefactor
            y = y / scalefactor
        try:
            return mean_squared_error(Y, y, multioutput=multioutput)
        except:
            return mean_squared_error(Y, y)
def MAPED(Y, y, multioutput='uniform_average', power=4, cutoff=0.1, Y_full=None,
          flux_arr=None, source_model=None, ss=None, source_model_args=None, method=None):
    # Mean Absolute Power Error Difference: take the sum of (absolute) diffs,
    # then subtract the power-norm of those diffs from it
    if Y_full is not None and flux_arr is not None and source_model is not None and ss is not None:
        inds = get_inds_(Y, Y_full)
        back_trans_flux = ICAize.inverse_transform(y, source_model, ss, method, source_model_args)

        diffs = np.abs(flux_arr[inds] - back_trans_flux)
        diffs[diffs < cutoff] = 0
        sums = np.sum(diffs, axis=1)
        diffs = np.sum(np.power(diffs, power), axis=1)

        return float(np.mean(np.abs(sums - np.power(diffs, 1.0 / power))) / flux_arr.shape[1])
    else:
        diffs = np.abs(Y - y)
        diffs[diffs < cutoff] = 0
        sums = np.sum(diffs, axis=1)
        diffs = np.sum(np.power(diffs, power), axis=1)

        return float(np.mean(np.abs(sums - np.power(diffs, 1.0 / power))) / Y.shape[1])
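# Illustrative sketch (editor addition, not part of the original pipeline): a toy
# call of MAPED on made-up arrays, showing that per-element differences below
# `cutoff` are zeroed and that the score compares the plain sum of the surviving
# diffs against their L`power` norm, normalized by the number of wavelength bins.
def _maped_toy_example():
    Y_true = np.array([[1.0, 2.0, 3.0, 4.0]])
    Y_pred = np.array([[1.2, 2.3, 3.0, 4.0]])
    # Surviving diffs are 0.2 and 0.3: their sum (0.5) exceeds their 4-norm
    # (~0.31), giving roughly (0.5 - 0.31) / 4 ~ 0.05 for the default power=4.
    return MAPED(Y_true, Y_pred)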
def EXP_VAR(Y, y, multioutput='uniform_average', Y_full=None, flux_arr=None,
            source_model=None, ss=None, source_model_args=None, method=None):
    if Y_full is not None and flux_arr is not None and source_model is not None and ss is not None:
        inds = get_inds_(Y, Y_full)
        back_trans_flux = ICAize.inverse_transform(y, source_model, ss, method, source_model_args)
        try:
            return explained_variance_score(flux_arr[inds], back_trans_flux, multioutput=multioutput)
        except:
            # Fallback for older scikit-learn; note this returns the mean fraction of
            # *unexplained* variance rather than the explained variance score
            return float(np.mean(np.var(flux_arr[inds] - back_trans_flux, axis=1) /
                                 np.var(flux_arr[inds], axis=1)))
    else:
        try:
            return explained_variance_score(Y, y, multioutput=multioutput)
        except:
            return float(np.mean(np.var(Y - y, axis=1) / np.var(Y, axis=1)))
def MAE(Y, y, multioutput='uniform_average', Y_full=None, flux_arr=None,
        source_model=None, ss=None, source_model_args=None, method=None):
    # Note: despite the name, this is the mean over spectra of the per-spectrum
    # *median* absolute error
    if Y_full is not None and flux_arr is not None and source_model is not None and ss is not None:
        inds = get_inds_(Y, Y_full)
        back_trans_flux = ICAize.inverse_transform(y, source_model, ss, method, source_model_args)
        return float(np.mean(np.median(np.abs(flux_arr[inds] - back_trans_flux), axis=1)))
    else:
        return float(np.mean(np.median(np.abs(Y - y), axis=1)))
def animate_sky_spectra_for_coord(obs_time_start, obs_time_end, point_coord,
                                  lunar_metadata_file, solar_metadata_file,
                                  sunspot_metadata_file, model_path, dm_path, dm_method):
    metadata_tups, dates, lunar_data, solar_data, sunspot_data = get_sky_for_coord(
        obs_time_start, obs_time_end, point_coord, lunar_metadata_file,
        solar_metadata_file, sunspot_metadata_file)
    model = rfs.load_model(model_path)
    dm, ss, model_args = iz.unpickle_model(path=dm_path, method=dm_method)

    inv_spec = []
    labels = []
    for i, metadata in enumerate(metadata_tups):
        #print(metadata)
        np_metadata = np.array(metadata)
        pred = model.predict(np_metadata.reshape(1, -1))
        inv_spec.append(iz.inverse_transform(pred, dm, ss, dm_method, model_args)[0, :])
        labels.append(dates[i] + "(ALT,AZ): (" + str(metadata[3]) + ", " + str(metadata[2]) + ")")

    return inv_spec, labels
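# Hypothetical usage sketch (editor addition): the time range, coordinate, and
# file paths below are placeholders, and the wavelength grid is assumed to be
# stack.skyexp_wlen_out as in the component-plotting scripts elsewhere in this repo.
def _plot_predicted_sky_example():
    inv_spec, labels = animate_sky_spectra_for_coord(
        "2015-01-01", "2015-01-02", (210.0, 5.0),
        "lunar_metadata.csv", "solar_metadata.csv", "sunspot_metadata.csv",
        "model.pkl", ".", "ICA")
    for spec, label in zip(inv_spec, labels):
        plt.plot(stack.skyexp_wlen_out, spec, label=label, alpha=0.5)
    plt.legend(loc='upper right', fontsize='x-small')
    plt.show()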
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description='Build and test models based on dim reductions and provided spectra')
    parser.add_argument('--spectra_path', type=str, default='.', metavar='PATH',
                        help="Spectra path to work from, if not '.'")
    parser.add_argument('--method', type=str, default='ICA', metavar='METHOD',
                        help='Dim reduction method to load data for')
    parser.add_argument('--file_path', type=str, default=None, metavar='FILE_PATH',
                        help='COMPLETE path from which to load a dim reduction')
    args = parser.parse_args()

    data_model = None
    scaler = None
    if args.file_path is not None:
        data_model, scaler = ize.unpickle_model(filename=args.file_path)
    else:
        data_model, scaler = ize.unpickle_model(path=args.spectra_path, method=args.method)

    components = ize.get_components(args.method, data_model)

    offset = 0
    for i, comp_i in enumerate(components):
        if i > 0:
            offset += np.max(np.abs(comp_i[comp_i < 0])) * 1.2
        plt.plot(stack.skyexp_wlen_out, comp_i + offset)
        offset += np.max(comp_i[comp_i > 0]) * 1.2
    plt.show()
    plt.close()
def _iter_scorer(train_inds, test_inds, flux_arr, model__and__model_flux_mean,
                 method, score_methods, include_mle):
    model = model__and__model_flux_mean[0]
    model_flux_mean = model__and__model_flux_mean[1]

    flux_test = flux_arr[test_inds]
    flux_conv_test = None

    if score_methods != ['LL']:
        for pca_model in model:
            if flux_conv_test is None:
                flux_conv_test = iz.transform_inverse_transform(flux_test, pca_model,
                                                                model_flux_mean, method)
                flux_test -= flux_conv_test
            else:
                residual = iz.transform_inverse_transform(flux_test, pca_model,
                                                          model_flux_mean, method)
                flux_conv_test += residual
                flux_test -= residual

    scores = {}
    for score_method in score_methods:
        #print("Calculating score:" + score_method)
        score_func = iz.get_score_func(score_method)
        if score_func is not None:
            if score_method != 'MAE':
                scores[score_method] = score_func(flux_test, flux_conv_test,
                                                  multioutput='uniform_average')
            else:
                scores[score_method] = np.mean(np.median(np.abs(flux_test - flux_conv_test), axis=1))

        if (include_mle or score_method == 'LL') and method in ['FA', 'PCA']:
            try:
                scores['mle'] = model.score(flux_test)
            except np.linalg.linalg.LinAlgError:
                scores['mle'] = 0  #-2**10 #float("-inf")
            except ValueError:
                scores['mle'] = 0  #-2**10 #float("-inf")

    #print("Scores: " + str(scores))
    return scores
def _iter_modeler(train_inds, test_inds, flux_arr, model, method):
    model_list = []
    flux_train = flux_arr[train_inds]
    flux_avg = np.mean(flux_train, axis=0)

    # Fit the model twice: each pass is trained on the residual left over after
    # subtracting the previous pass's reconstruction
    for i in range(2):
        new_model = est_clone(model)
        new_model.fit(flux_train)
        back_train = iz.transform_inverse_transform(flux_train, new_model, flux_avg, method)
        flux_train -= back_train
        model_list.append(new_model)

    return model_list, flux_avg
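# Minimal sketch (editor addition) of the residual-fitting idea used by
# _iter_modeler/_iter_scorer above: fit a model, reconstruct, then fit a second
# model on whatever the first one failed to capture. Plain scikit-learn PCA is
# used here purely for illustration; the real pipeline goes through
# iz.transform_inverse_transform and supports several reduction methods.
def _residual_fit_demo(flux_train, n_components=5):
    from sklearn.decomposition import PCA

    models, residual = [], flux_train.copy()
    for _ in range(2):
        pca = PCA(n_components=n_components)
        pca.fit(residual)
        reconstruction = pca.inverse_transform(pca.transform(residual))
        residual = residual - reconstruction
        models.append(pca)
    return models, residual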
def main():
    path = "."
    metadata_path = ".."

    rfr = Pipeline([
        ('ica', FastICA(random_state=random_state, max_iter=ica_max_iter)),
        ('rfr', ensemble.RandomForestRegressor(random_state=random_state, n_jobs=-1))
    ])
    param_grid = {
        "ica__n_components": sp_randint(15, 200),
        "rfr__n_estimators": sp_randint(25, 400),
        "rfr__min_samples_split": sp_randint(1, 10)  #,
        #"rfr__max_features": [None, "log2", "sqrt"]
    }
    randsearch = RandomizedSearchCV(rfr, param_grid, n_iter=n_iter_search)

    flux_arr, exp_arr, wavelengths = ICAize.load_all_in_dir(path=path, use_con_flux=True,
                                                            recombine_flux=False)
    obs_metadata = random_forest_spectra.trim_observation_metadata(
        random_forest_spectra.load_observation_metadata(metadata_path))

    reduced_obs_metadata = obs_metadata[np.in1d(obs_metadata['EXP_ID'], exp_arr)]
    reduced_obs_metadata.sort('EXP_ID')
    sorted_inds = np.argsort(exp_arr)

    reduced_obs_metadata.remove_column('EXP_ID')
    md_len = len(reduced_obs_metadata)
    X_arr = np.array(reduced_obs_metadata).view('f8').reshape((md_len, -1))

    randsearch.fit(flux_arr[sorted_inds], X_arr)

    top_scores = sorted(randsearch.grid_scores_, key=itemgetter(1), reverse=True)[:5]
    for i, score in enumerate(top_scores):
        print("Model with rank: {}".format(i))
        print("Mean validation score/std: {} {}".format(score.mean_validation_score,
                                                        np.std(score.cv_validation_scores)))
        print("Parameters: {}".format(score.parameters))
        print("")
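# Brief illustration (editor addition) of the "step__parameter" convention used in
# param_grid above: scikit-learn routes "ica__n_components" to the FastICA step and
# "rfr__n_estimators" to the RandomForestRegressor step of the Pipeline via
# set_params. The values below are arbitrary examples, not tuned settings.
def _pipeline_param_demo(rfr_pipeline):
    rfr_pipeline.set_params(ica__n_components=40, rfr__n_estimators=100)
    return rfr_pipeline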
def main(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, description= 'Build and test models based on dim reductions and provided spectra') subparsers = parser.add_subparsers(dest='subparser_name') parser.add_argument('--metadata_path', type=str, default='.', metavar='PATH', help='Metadata path to work from, if not ' '.' '') parser.add_argument('--spectra_path', type=str, default='.', metavar='PATH', help='Spectra path to work from, if not ' '.' '') parser.add_argument('--method', type=str, default='ICA', metavar='METHOD', help='Dim reduction method to load data for') parser.add_argument('--n_jobs', type=int, default=1, metavar='N_JOBS', help='N_JOBS') parser.add_argument( '--model', type=str, choices=['ET', 'RF', 'GP', 'KNN', 'SVR'], default='ET', help= 'Which model type to use: ET (Extra Trees), RF (Random Forest), GP (Gaussian Process), KNN, or SVR (Support Vector Regression)' ) parser.add_argument( '--load_model', action='store_true', help='Whether or not to load the model from --model_path') parser.add_argument('--model_path', type=str, default='model.pkl', metavar='MODEL_PATH', help='COMPLETE path from which to load a model') parser.add_argument( '--metadata_flags', type=str, default='', metavar='METADATA_FLAGS', help='Flags specifying observational metadata pre-processing, e.g. LUNAR_MAG which takes the '\ 'magnitude and linearizes it (ignoring that it is an area magnitude)' ) parser.add_argument( '--compacted_path', type=str, default=None, metavar='COMPATED_PATH', help= 'Path to find compacted/arrayized data; setting this will cause --path, --pattern to be ignored' ) parser_compare = subparsers.add_parser('compare') parser_compare.add_argument( '--folds', type=int, default=3, metavar='TEST_FOLDS', help= 'Do k-fold cross validation with specified number of folds. Defaults to 3.' ) parser_compare.add_argument( '--iters', type=int, default=50, metavar='HYPER_FIT_ITERS', help='Number of iterations when fitting hyper-params') parser_compare.add_argument( '--outputfbk', action='store_true', help='If set, outputs \'grid_scores_\' data from RandomizedSearchCV') parser_compare.add_argument( '--save_best', action='store_true', help= 'Whether or not to save the (last/best) model built for e.g. --hyper_fit' ) parser_compare.add_argument( '--scorer', type=str, choices=['R2', 'MAE', 'MSE', 'LL', 'EXP_VAR', 'MAPED', 'MSEMV'], default='R2', help= 'Which scoring method to use to determine ranking of model instances.') parser_compare.add_argument( '--use_spectra', action='store_true', help= 'Whether scoring is done against the DM components or the predicted spectra' ) parser_compare.add_argument( '--ivar_cutoff', type=float, default=0.001, metavar='IVAR_CUTOFF', help='data with inverse variace below cutoff is masked as if ivar==0') parser_compare.add_argument( '--plot_final_errors', action='store_true', help='If set, will plot the errors from the final/best model, for the whole dataset, from ' + \ 'the best model re-trained on CV folds used for testing.' + \ 'Plots all errors on top of each other with low-ish alpha, to give a kind of visual ' + \ 'density map of errors.' 
) args = parser.parse_args() obs_metadata = trim_observation_metadata( load_observation_metadata(args.metadata_path, flags=args.metadata_flags)) sources, components, exposures, wavelengths = ICAize.deserialize_data( args.spectra_path, args.method) source_model, ss, model_args = ICAize.unpickle_model( args.spectra_path, args.method) comb_flux_arr, comb_exposure_arr, comb_wavelengths = None, None, None if args.use_spectra: comb_flux_arr, comb_exposure_arr, comb_ivar_arr, comb_masks, comb_wavelengths = ICAize.load_data( args) filter_arr = np.in1d(comb_exposure_arr, exposures) comb_flux_arr = comb_flux_arr[filter_arr] comb_exposure_arr = comb_exposure_arr[filter_arr] sorted_inds = np.argsort(comb_exposure_arr) comb_flux_arr = comb_flux_arr[sorted_inds] comb_exposure_arr = comb_exposure_arr[sorted_inds] del comb_ivar_arr del comb_masks reduced_obs_metadata = obs_metadata[np.in1d(obs_metadata['EXP_ID'], exposures)] reduced_obs_metadata.sort('EXP_ID') sorted_inds = np.argsort(exposures) reduced_obs_metadata.remove_column('EXP_ID') md_len = len(reduced_obs_metadata) var_count = len(reduced_obs_metadata.columns) X_arr = np.array(reduced_obs_metadata).view('f8').reshape((md_len, -1)) Y_arr = sources[sorted_inds] if args.load_model: predictive_model = load_model(args.model_path) else: predictive_model = get_model(args.model) if args.subparser_name == 'compare': pdist = get_param_distribution_for_model(args.model, args.iters) scorer = None if args.scorer == 'R2': scorer = make_scorer(R2) elif args.scorer == 'MAE': if args.use_spectra: p_MAE_ = partial(MAE, Y_full=Y_arr, flux_arr=comb_flux_arr, source_model=source_model, ss=ss, source_model_args=model_args, method=args.method) scorer = make_scorer(p_MAE_, greater_is_better=False) else: scorer = make_scorer(MAE, greater_is_better=False) elif args.scorer == 'MSE': if args.use_spectra: p_MSE_ = partial(MSE, Y_full=Y_arr, flux_arr=comb_flux_arr, source_model=source_model, ss=ss, source_model_args=model_args, method=args.method) scorer = make_scorer(p_MSE_, greater_is_better=False) else: scorer = make_scorer(MSE, greater_is_better=False) elif args.scorer == 'MSEMV': if args.use_spectra: p_MSEMV_ = partial(MSEMV, Y_full=Y_arr, flux_arr=comb_flux_arr, source_model=source_model, ss=ss, source_model_args=model_args, method=args.method) scorer = make_scorer(p_MSEMV_, greater_is_better=False) else: scorer = make_scorer(MSEMV, greater_is_better=False) elif args.scorer == 'EXP_VAR': if args.use_spectra: p_EXP_VAR_ = partial(EXP_VAR, Y_full=Y_arr, flux_arr=comb_flux_arr, source_model=source_model, ss=ss, source_model_args=model_args, method=args.method) scorer = make_scorer(p_EXP_VAR_) else: scorer = make_scorer(EXP_VAR) elif args.scorer == 'MAPED': if args.use_spectra: p_MAPED_ = partial(MAPED, Y_full=Y_arr, flux_arr=comb_flux_arr, source_model=source_model, ss=ss, source_model_args=model_args, method=args.method) scorer = make_scorer(p_MAPED_, greater_is_better=False) else: scorer = make_scorer(MAPED, greater_is_better=False) elif args.scorer == 'LL': scorer = None folder = ShuffleSplit(exposures.shape[0], n_iter=args.folds, test_size=1.0 / args.folds, random_state=12345) if args.model == 'GP': predictive_model.random_start = args.folds rcv = GridSearchCV(predictive_model, param_grid=pdist, error_score=0, cv=3, n_jobs=args.n_jobs, scoring=scorer) #random_state=RANDOM_STATE, #n_iter=args.iters, else: rcv = RandomizedSearchCV(predictive_model, param_distributions=pdist, n_iter=args.iters, cv=folder, n_jobs=args.n_jobs, scoring=scorer) # This is going to fit X 
(metdata) to Y (DM'ed sources). But there are # really two tests here: how well hyperparams fit/predict the sources # and how well they fit/predict the actual source spectra. Until I know # better, I 'm going to need to build a way to test both. rcv.fit(X_arr, Y_arr) print(rcv.best_score_) print(rcv.best_params_) print(rcv.best_estimator_) if args.outputfbk: print("=+" * 10 + "=") for val in rcv.grid_scores_: print(val) print("=+" * 10 + "=") if args.save_best: save_model(rcv.best_estimator_, args.model_path) if args.plot_final_errors: for train_inds, test_inds in folder: rcv.best_estimator_.fit(X_arr[train_inds], Y_arr[train_inds]) predicted = rcv.best_estimator_.predict(X_arr[test_inds]) back_trans_flux = ICAize.inverse_transform( predicted, source_model, ss, args.method, model_args) diffs = np.abs(comb_flux_arr[test_inds] - back_trans_flux) #Is there not 'trick' to getting matplotlib to do this without a loop? for i in range(diffs.shape[0]): plt.plot(comb_wavelengths, diffs[i, :], 'b-', alpha=0.01) plt.show()
def main():
    flux_arr, exp_arr, ivar_arr, mask_arr, wavelengths = \
        ICAize.load_all_in_dir('.', use_con_flux=False, recombine_flux=False,
                               pattern="stacked*exp??????.csv")
    np.savez("compacted_flux_data.npz", flux=flux_arr, exp=exp_arr, ivar=ivar_arr,
             wavelengths=wavelengths)
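# Sketch (editor addition) of reading the compacted file written above; the key
# names mirror the np.savez call, everything else is illustrative.
def _load_compacted(path="compacted_flux_data.npz"):
    data = np.load(path)
    return data['flux'], data['exp'], data['ivar'], data['wavelengths']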
def main(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, description='Compute PCA/ICA/NMF/etc. components over set of stacked spectra, save those out, and pickle model' ) parser.add_argument( '--pattern', type=str, default='stacked*exp??????.*', metavar='PATTERN', help='File pattern for stacked sky fibers.' ) parser.add_argument( '--path', type=str, default='.', metavar='PATH', help='Path to work from, if not ''.''' ) parser.add_argument( '--compacted_path', type=str, default=None, metavar='COMPATED_PATH', help='Path to find compacted/arrayized data; setting this will cause --path, --pattern to be ignored' ) parser.add_argument( '--n_components', type=int, default=40, metavar='N_COMPONENTS', help='Number of ICA/PCA/etc. components' ) parser.add_argument( '--method', type=str, default='ICA', metavar='METHOD', choices=['ICA', 'PCA', 'SPCA', 'NMF', 'ISO', 'KPCA', 'FA', 'DL'], help='Which dim. reduction method to use' ) parser.add_argument( '--scale', action='store_true', help='Should inputs variance be scaled? Defaults to mean subtract and value scale, but w/out this does not scale variance.' ) parser.add_argument( '--no_scale', action='store_true', help='Suppresses all scaling' ) parser.add_argument( '--ivar_cutoff', type=float, default=0.001, metavar='IVAR_CUTOFF', help='data with inverse variace below cutoff is masked as if ivar==0' ) parser.add_argument( '--n_iter', type=int, default=1200, metavar='MAX_ITER', help='Maximum number of iterations to allow for convergence. For SDSS data 1000 is a safe number of ICA, while SPCA requires larger values e.g. ~2000 to ~2500' ) parser.add_argument( '--n_jobs', type=int, default=None, metavar='N_JOBS', help='N_JOBS' ) args = parser.parse_args() comb_flux_arr, comb_exposure_arr, comb_ivar_arr, comb_masks, comb_wavelengths = iz.load_data(args) model = iz.get_model(args.method, n=args.n_components, n_neighbors=None, max_iter=args.n_iter, random_state=iz.random_state, n_jobs=args.n_jobs) ss = None if args.no_scale: scaled_flux_arr = comb_flux_arr else: ss = skpp.StandardScaler(with_std=False) if args.scale: ss = skpp.StandardScaler(with_std=True) scaled_flux_arr = ss.fit_transform(comb_flux_arr) #Heavily copied from J. Vanderplas/astroML bayesian_blocks.py N = comb_wavelengths.size step = args.n_components * 4 edges = np.concatenate([comb_wavelengths[:1:step], 0.5 * (comb_wavelengths[1::step] + comb_wavelengths[:-1:step]), comb_wavelengths[-1::step]]) block_length = comb_wavelengths[-1::step] - edges # arrays to store the best configuration nn_vec = np.ones(N/step) * step best = np.zeros(N, dtype=float) last = np.zeros(N, dtype=int) for R in range(N/step): print("R: " + str(R)) width = block_length[:R + 1] - block_length[R + 1] count_vec = np.cumsum(nn_vec[:R + 1][::-1])[::-1] #width = nn_vec[:R + 1] - nn_vec[R + 1] #count_vec = np.cumsum(nn_vec[:R + 1][::-1])[::-1] #print(width) #print(count_vec) #raw_input("Pausing... ") fit_vec = map(lambda n: iz.score_via_CV(['LL'], scaled_flux_arr[:, :n], model, ss, args.method, folds=3, n_jobs=args.n_jobs), count_vec) fit_vec = [d["mle"] for d in fit_vec] #print(fit_vec) fit_vec[1:] += best[:R] #print(fit_vec) i_max = np.argmax(fit_vec) last[R] = i_max best[R] = fit_vec[i_max] #print(best) change_points = np.zeros(N/step, dtype=int) i_cp = N/step ind = N/step while True: i_cp -= 1 change_points[i_cp] = ind if ind == 0: break ind = last[ind - 1] change_points = change_points[i_cp:] print(edges[change_points]) '''
def main(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, description='Compute PCA/ICA/NMF/etc. components over set of stacked spectra, save those out, and pickle model' ) subparsers = parser.add_subparsers(dest='subparser_name') parser.add_argument( '--pattern', type=str, default='stacked*exp??????.*', metavar='PATTERN', help='File pattern for stacked sky fibers.' ) parser.add_argument( '--path', type=str, default='.', metavar='PATH', help='Path to work from, if not ''.''' ) parser.add_argument( '--compacted_path', type=str, default=None, metavar='COMPATED_PATH', help='Path to find compacted/arrayized data; setting this will cause --path, --pattern to be ignored' ) parser.add_argument( '--method', type=str, default=['ICA'], metavar='METHOD', choices=['ICA', 'PCA', 'SPCA', 'NMF', 'ISO', 'KPCA', 'FA', 'DL'], nargs='+', help='Which dim. reduction method to use' ) parser.add_argument( '--scale', action='store_true', help='Should inputs be scaled? Will mean subtract and value scale, but does not scale variace.' ) parser.add_argument( '--ivar_cutoff', type=float, default=0.001, metavar='IVAR_CUTOFF', help='data with inverse variace below cutoff is masked as if ivar==0' ) parser.add_argument( '--n_iter', type=int, default=1200, metavar='MAX_ITER', help='Maximum number of iterations to allow for convergence. For SDSS data 1000 is a safe number of ICA, while SPCA requires larger values e.g. ~2000 to ~2500' ) parser.add_argument( '--n_jobs', type=int, default=None, metavar='N_JOBS', help='N_JOBS' ) parser_compare = subparsers.add_parser('compare') parser_compare.add_argument( '--max_components', type=int, default=50, metavar='COMP_MAX', help='Max number of components to use/test' ) parser_compare.add_argument( '--min_components', type=int, default=0, metavar='COMP_MIN', help='Min number of compoenents to use/test' ) parser_compare.add_argument( '--step_size', type=int, default=5, metavar='COMP_STEP', help='Step size from comp_min to comp_max' ) parser_compare.add_argument( '--comparison', choices=['EXP_VAR', 'R2', 'MSE', 'MAE'], nargs='*', default=['EXP_VAR'], help='Comparison methods: Explained variance (score), R2 (score), mean sq. error (loss), MEDIAN absolute error (loss)' ) parser_compare.add_argument( '--mle_if_avail', action='store_true', help='In additon to --comparison, include MLE if PCA or FA methods specified' ) parser_compare.add_argument( '--plot_example_reconstruction', action='store_true', help='Pick a random spectrum, plot its actual and reconstructed versions' ) parser_build = subparsers.add_parser('build') parser_build.add_argument( '--n_components', type=int, default=40, metavar='N_COMPONENTS', help='Number of ICA/PCA/etc. components' ) parser_build.add_argument( '--n_neighbors', type=int, default=10, metavar='N_NEIGHBORS', help='Number of neighbots for e.g. 
IsoMap' ) args = parser.parse_args() comb_flux_arr, comb_exposure_arr, comb_ivar_arr, comb_masks, comb_wavelengths = iz.load_data(args) if 'DL' in args.method: flux_arr = comb_flux_arr.astype(dtype=np.float64) else: flux_arr = comb_flux_arr scaled_flux_arr = None ss = None if args.scale: ss = skpp.StandardScaler(with_std=False) scaled_flux_arr = ss.fit_transform(flux_arr) else: scaled_flux_arr = flux_arr if args.subparser_name == 'compare': fig, ax1 = plt.subplots() ax2 = ax1.twinx() for method in args.method: model = iz.get_model(method, max_iter=args.n_iter, random_state=iz.random_state, n_jobs=args.n_jobs) scores = {} mles_and_covs = args.mle_if_avail and (method == 'FA' or method == 'PCA') n_components = np.arange(args.min_components, args.max_components+1, args.step_size) for n in n_components: print("Cross validating for n=" + str(n) + " on method " + method) model.n_components = n comparisons = iz.score_via_CV(args.comparison, flux_arr if method == 'NMF' else scaled_flux_arr, model, method, n_jobs=args.n_jobs, include_mle=mles_and_covs, modeler=_iter_modeler, scorer=_iter_scorer) for key, val in comparisons.items(): if key in scores: scores[key].append(val) else: scores[key] = [val] if mles_and_covs: #ax2.axhline(cov_mcd_score(scaled_flux_arr, args.scale), color='violet', label='MCD Cov', linestyle='--') ax2.axhline(cov_lw_score(scaled_flux_arr, args.scale), color='orange', label='LW Cov', linestyle='--') for key, score_list in scores.items(): if key != 'mle': ax1.plot(n_components, score_list, label=method + ':' + key + ' scores') else: ax2.plot(n_components, score_list, '-.', label=method + ' mle scores') ax1.set_xlabel('nb of components') ax1.set_ylabel('CV scores', figure=fig) ax1.legend(loc='lower left') ax2.legend(loc='lower right') plt.show()
def load_plot_etc_target_type(metadata_path, spectra_path, test_inds, target_type, no_plot=False, save_out=False, restrict_delta=False, use_spca=False, use_pca=False): obs_metadata = trim_observation_metadata(load_observation_metadata(metadata_path)) if use_filter_split: c_sources, c_mixing, c_exposures, c_wavelengths, c_filter_split_arr = load_spectra_data(spectra_path, target_type=target_type, filter_str='nonem', use_spca=use_spca, use_pca=use_pca) c_sources_e, c_mixing_e, c_exposures_e, c_wavelengths_e, c_filter_split_arr_e = load_spectra_data(spectra_path, target_type=target_type, filter_str='em', use_spca=use_spca, use_pca=use_pca) else: c_sources, c_mixing, c_exposures, c_wavelengths, c_filter_split_arr = load_spectra_data(spectra_path, target_type=target_type, filter_str='both', use_spca=use_spca, use_pca=use_pca) reduced_obs_metadata = obs_metadata[np.in1d(obs_metadata['EXP_ID'], c_exposures)] reduced_obs_metadata.sort('EXP_ID') sorted_inds = np.argsort(c_exposures) if use_filter_split: sorted_e_inds = np.argsort(c_exposures_e) if not linear_only: if reg_type == 'etr': rfr = ensemble.ExtraTreesRegressor(n_estimators=n_estimators, min_samples_split=min_samples_split, random_state=rfr_random_state, n_jobs=-1, verbose=False, bootstrap=bootstrap) if use_filter_split: rfr_e = ensemble.ExtraTreesRegressor(n_estimators=n_estimators, min_samples_split=min_samples_split, random_state=rfr_random_state, n_jobs=-1, verbose=False, bootstrap=bootstrap) else: rfr = ensemble.RandomForestRegressor(n_estimators=n_estimators, min_samples_split=min_samples_split, random_state=rfr_random_state, n_jobs=-1, verbose=False, bootstrap=bootstrap) if use_filter_split: rfr_e = ensemble.RandomForestRegressor(n_estimators=n_estimators, min_samples_split=min_samples_split, random_state=rfr_random_state, n_jobs=-1, verbose=False, bootstrap=bootstrap) if include_knn: knn = neighbors.KNeighborsRegressor(weights='distance', n_neighbors=10, p=64) if use_filter_split: knn_e = neighbors.KNeighborsRegressor(weights='distance', n_neighbors=10, p=64) if include_linear: linear = Linear(fit_intercept=True, copy_X=True, n_jobs=-1) poly_2_linear = Pipeline([('poly', PolynomialFeatures(degree=2)), ('linear', Linear(fit_intercept=True, copy_X=True, n_jobs=-1))]) poly_3_linear = Pipeline([('poly', PolynomialFeatures(degree=3)), ('linear', Linear(fit_intercept=True, copy_X=True, n_jobs=-1))]) poly_4_linear = Pipeline([('poly', PolynomialFeatures(degree=4)), ('linear', Linear(fit_intercept=True, copy_X=True, n_jobs=-1))]) if use_filter_split: linear_e = Linear(fit_intercept=True, copy_X=True, n_jobs=-1) poly_2_linear_e = Pipeline([('poly', PolynomialFeatures(degree=2)), ('linear', Linear(fit_intercept=True, copy_X=True, n_jobs=-1))]) poly_3_linear_e = Pipeline([('poly', PolynomialFeatures(degree=3)), ('linear', Linear(fit_intercept=True, copy_X=True, n_jobs=-1))]) poly_4_linear_e = Pipeline([('poly', PolynomialFeatures(degree=4)), ('linear', Linear(fit_intercept=True, copy_X=True, n_jobs=-1))]) reduced_obs_metadata.remove_column('EXP_ID') md_len = len(reduced_obs_metadata) var_count = len(reduced_obs_metadata.columns) X_arr = np.array(reduced_obs_metadata).view('f8').reshape((md_len,-1)) ica = None if not use_spca and not use_pca: if use_filter_split: ica = ICAize.unpickle_FastICA(path=spectra_path, target_type=target_type, filter_str='nonem') ica_e = ICAize.unpickle_FastICA(path=spectra_path, target_type=target_type, filter_str='em') else: ica = ICAize.unpickle_FastICA(path=spectra_path, target_type=target_type, filter_str='both') 
elif use_spca: ica = ICAize.unpickle_SPCA(path=spectra_path, target_type=target_type) else: if use_filter_split: ica = ICAize.unpickle_PCA(path=spectra_path, target_type=target_type, filter_str='nonem') ica_e = ICAize.unpickle_PCA(path=spectra_path, target_type=target_type, filter_str='em') else: ica = ICAize.unpickle_PCA(path=spectra_path, target_type=target_type, filter_str='both') spectra_dir_list = os.listdir(spectra_path) ################################################################ results = None for test_ind in test_inds: test_X = X_arr[test_ind] train_X = np.vstack( [X_arr[:test_ind], X_arr[test_ind+1:]] ) test_y = (c_sources[sorted_inds])[test_ind] train_y = np.vstack( [(c_sources[sorted_inds])[:test_ind], (c_sources[sorted_inds])[test_ind+1:]] ) if use_filter_split: test_y_e = (c_sources_e[sorted_e_inds])[test_ind] train_y_e = np.vstack( [(c_sources_e[sorted_e_inds])[:test_ind], (c_sources_e[sorted_e_inds])[test_ind+1:]] ) if scale: scaler = StandardScaler(with_std=scale_std) train_X = scaler.fit_transform(train_X) test_X = scaler.transform(test_X) title_str = "exp{}, {}".format(c_exposures[sorted_inds[test_ind]], target_type) if not linear_only: rfr.fit(X=train_X, y=train_y) if use_filter_split: rfr_e.fit(X=train_X, y=train_y_e) if include_knn: knn.fit(X=train_X, y=train_y) if user_filter_split: knn_e.fit(X=train_X, y=train_y_e) if include_linear: linear.fit(train_X, train_y) poly_2_linear.fit(train_X, train_y) if order_3: poly_3_linear.fit(train_X, train_y) if order_4: poly_4_linear.fit(train_X, train_y) if use_filter_split and include_linear: linear_e.fit(train_X, train_y_e) poly_2_linear_e.fit(train_X, train_y_e) if order_3: poly_3_linear_e.fit(train_X, train_y_e) if order_4: poly_4_linear_e.fit(train_X, train_y_e) print test_ind, c_exposures[sorted_inds[test_ind]], data = None actual = None mask = None delta_mask = None ivar = None for file in spectra_dir_list: if fnmatch.fnmatch(file, "stacked_sky_*exp{}.csv".format(c_exposures[sorted_inds[test_ind]])): data = Table.read(os.path.join(spectra_path, file), format="ascii.csv") ivar = data['ivar'] mask = (data['ivar'] == 0) delta_mask = mask.copy() if restrict_delta: if restrict_color == 'blue': delta_mask[2700:] = True else: delta_mask[:2700] = True actual = data['flux'] break if actual is None: continue if not linear_only: rfr_prediction = rfr.predict(test_X) if not use_spca and not use_pca: rfr_predicted = ica.inverse_transform(rfr_prediction, copy=True) else: rfr_predicted = np.zeros( (1, ica.components_.shape[1]) ) rfr_predicted[0,:] = np.sum(rfr_prediction.T * ica.components_, 0) if use_filter_split: rfr_e_prediction = rfr_e.predict(test_X) if not use_spca and not use_pca: rfr_e_predicted = ica_e.inverse_transform(rfr_e_prediction, copy=True) else: rfr_e_predicted = np.zeros( (1, ica_e.components_.shape[1]) ) rfr_e_predicted[0,:] = np.sum(rfr_e_prediction.T * ica_e.components_, 0) rfr_predicted = rfr_predicted + rfr_e_predicted rfr_delta = rfr_predicted[0] - actual if not no_plot: plt.plot(c_wavelengths[~mask], rfr_predicted[0][~mask]) plt.plot(c_wavelengths[~mask], actual[~mask]) plt.plot(c_wavelengths[~mask], rfr_delta[~mask]) if not no_plot: plt.plot(c_wavelengths, [0]*len(c_wavelengths)) err_term = np.sum(np.power(rfr_delta[~delta_mask], 2))/len(c_wavelengths[~delta_mask]) err_sum = np.sum(rfr_delta[~delta_mask])/len(rfr_delta[~delta_mask]) red_chi = np.sum(np.power(rfr_delta[~delta_mask], 2)*ivar[~delta_mask])/(len(c_wavelengths[~delta_mask])-var_count-1) if not no_plot: plt.legend(['Predicted', 'Actual', 'Delta 
{:0.5f}'.format(err_term)]) plt.tight_layout() plt.title("Random Forest Regressor: {}".format(title_str)) plt.show() plt.close() print err_term, red_chi, err_sum, if include_knn: knn_prediction = knn.predict(test_X) if not use_spca and not use_pca: knn_predicted = ica.inverse_transform(knn_prediction, copy=True) else: knn_predicted = np.zeros( (1, ica.components_.shape[1]) ) knn_predicted[0,:] = np.sum(knn_prediction.T * ica.components_, 0) if use_filter_split: knn_e_prediction = knn_e.predict(test_X) if not use_spca and not use_pca: knn_e_predicted = ica_e.inverse_transform(knn_e_prediction, copy=True) else: knn_e_predicted = np.zeros( (1, ica_e.components_.shape[1]) ) knn_e_predicted[0,:] = np.sum(knn_e_prediction.T * ica_e.components_, 0) knn_predicted = knn_predicted + knn_e_predicted if not no_plot: plt.plot(c_wavelengths[~mask], knn_predicted[0][~mask]) plt.plot(c_wavelengths[~mask], actual[~mask]) knn_delta = knn_predicted[0] - actual err_term = np.sum(np.power(knn_delta[~delta_mask], 2))/len(c_wavelengths[~delta_mask]) err_sum = np.sum(knn_delta[~delta_mask])/len(knn_delta[~delta_mask]) red_chi = np.sum(np.power(knn_delta[~delta_mask], 2)*ivar[~delta_mask])/(len(c_wavelengths[~delta_mask])-var_count-1) if not no_plot: plt.plot(c_wavelengths[~mask], knn_delta[~mask]) plt.plot(c_wavelengths, [0]*len(c_wavelengths)) plt.legend(['Predicted', 'Actual', 'Delta {:0.5f}'.format(err_term)]) plt.tight_layout() plt.title("Good 'ol K-NN: {}".format(title_str)) plt.show() plt.close() print err_term, red_chi, err_sum, if include_linear: poly_1_prediction = linear.predict(test_X) if not use_spca and not use_pca: poly_1_predicted = ica.inverse_transform(poly_1_prediction, copy=True) else: poly_1_predicted = np.zeros( (1, ica.components_.shape[1]) ) poly_1_predicted[0,:] = np.sum(poly_1_prediction.T * ica.components_, 0) if use_filter_split: poly_1_e_prediction = linear.predict(test_X) if not use_spca and not use_pca: poly_1_e_predicted = ica_e.inverse_transform(poly_1_e_prediction, copy=True) else: poly_1_e_predicted = np.zeros( (1, ica_e.components_.shape[1]) ) poly_1_e_predicted[0,:] = np.sum(poly_1_e_prediction.T * ica_e.components_, 0) poly_1_predicted = poly_1_predicted + poly_1_e_predicted poly_1_delta = poly_1_predicted[0] - actual if not no_plot: plt.plot(c_wavelengths[~mask], poly_1_predicted[0][~mask]) plt.plot(c_wavelengths[~mask], actual[~mask]) err_term = np.sum(np.power(poly_1_delta[~delta_mask], 2))/len(c_wavelengths[~delta_mask]) err_sum = np.sum(poly_1_delta[~delta_mask])/len(poly_1_delta[~delta_mask]) red_chi = np.sum(np.power(poly_1_delta[~delta_mask], 2)*ivar[~delta_mask])/(len(c_wavelengths[~delta_mask])-var_count-1) if not no_plot: plt.plot(c_wavelengths[~mask], poly_1_delta[~mask]) plt.plot(c_wavelengths, [0]*len(c_wavelengths)) plt.legend(['Predicted', 'Actual', 'Delta {:0.5f}'.format(err_term)]) plt.tight_layout() plt.title("Poly 1: {}".format(title_str)) plt.show() plt.close() print err_term, red_chi, err_sum, poly_2_prediction = poly_2_linear.predict(test_X) if not use_spca and not use_pca: poly_2_predicted = ica.inverse_transform(poly_2_prediction, copy=True) else: poly_2_predicted = np.zeros( (1, ica.components_.shape[1]) ) poly_2_predicted[0,:] = np.sum(poly_2_prediction.T * ica.components_, 0) poly_2_delta = poly_2_predicted[0] - actual if not no_plot: plt.plot(c_wavelengths[~mask], poly_2_predicted[0][~mask]) plt.plot(c_wavelengths[~mask], actual[~mask]) err_term = np.sum(np.power(poly_2_delta[~delta_mask], 2))/len(c_wavelengths[~delta_mask]) err_sum = 
np.sum(poly_2_delta[~delta_mask])/len(poly_2_delta[~delta_mask]) red_chi = np.sum(np.power(poly_2_delta[~delta_mask], 2)*ivar[~delta_mask])/(len(c_wavelengths[~delta_mask])-var_count-1) if not no_plot: plt.plot(c_wavelengths[~mask], poly_2_delta[~mask]) plt.plot(c_wavelengths, [0]*len(c_wavelengths)) plt.legend(['Predicted', 'Actual', 'Delta {:0.5f}'.format(err_term)]) plt.tight_layout() plt.title("Poly 2: {}".format(title_str)) plt.show() plt.close() print err_term, red_chi, err_sum, err_ind =+ 1 if order_3: poly_3_prediction = poly_3_linear.predict(test_X) if not use_spca and not use_pca: poly_3_predicted = ica.inverse_transform(poly_3_prediction, copy=True) else: poly_3_predicted = np.zeros( (1, ica.components_.shape[1]) ) poly_3_predicted[0,:] = np.sum(poly_3_prediction.T * ica.components_, 0) poly_3_delta = poly_3_predicted[0] - actual if not no_plot: plt.plot(c_wavelengths[~mask], poly_3_predicted[0][~mask]) plt.plot(c_wavelengths[~mask], actual[~mask]) err_term = np.sum(np.power(poly_3_delta[~delta_mask], 2))/len(c_wavelengths[~delta_mask]) err_sum = np.sum(poly_3_delta[~delta_mask])/len(poly_3_delta[~delta_mask]) red_chi = np.sum(np.power(poly_3_delta[~delta_mask], 2)*ivar[~delta_mask])/(len(c_wavelengths[~delta_mask])-var_count-1) if not no_plot: plt.plot(c_wavelengths[~mask], poly_3_delta[~mask]) plt.plot(c_wavelengths, [0]*len(c_wavelengths)) plt.legend(['Predicted', 'Actual', 'Delta {:0.5f}'.format(err_term)]) plt.tight_layout() plt.title("Poly 3: {}".format(title_str)) plt.show() plt.close() print err_term, red_chi, err_sum, err_ind =+ 1 if order_4: poly_4_prediction = poly_4_linear.predict(test_X) if not use_spca and not use_pca: poly_4_predicted = ica.inverse_transform(poly_4_prediction, copy=True) else: poly_4_predicted = np.zeros( (1, ica.components_.shape[1]) ) poly_4_predicted[0,:] = np.sum(poly_4_prediction.T * ica.components_, 0) poly_4_delta = poly_4_predicted[0] - actual if not no_plot: plt.plot(c_wavelengths[~mask], poly_4_predicted[0][~mask]) plt.plot(c_wavelengths[~mask], actual[~mask]) err_term = np.sum(np.power(poly_4_delta[~delta_mask], 2))/len(c_wavelengths[~delta_mask]) err_sum = np.sum(poly_4_delta[~delta_mask])/len(poly_4_delta[~delta_mask]) red_chi = np.sum(np.power(poly_4_delta[~delta_mask], 2)*ivar[~delta_mask])/(len(c_wavelengths[~delta_mask])-var_count-1) if not no_plot: plt.plot(c_wavelengths[~mask], poly_4_delta[~mask]) plt.plot(c_wavelengths, [0]*len(c_wavelengths)) plt.legend(['Predicted', 'Actual', 'Delta {:0.5f}'.format(err_term)]) plt.tight_layout() plt.title("Poly 4: {}".format(title_str)) plt.show() plt.close() print err_term, red_chi, err_sum, err_ind =+ 1 print if save_out: out_table = Table() wavelength_col = Column(c_wavelengths, name="wavelength", dtype=float) out_table.add_columns([wavelength_col]) if not linear_only: rf_col = Column(rfr_predicted[0], name="rf_flux", dtype=float) out_table.add_columns([rf_col]) if include_knn: knn_col = Column(knn_predicted[0], name="knn_flux", dtype=float) avg_col = Column(avg_predicted[0], name="avg_flux", dtype=float) out_table.add_columns([knn_col, avg_col]) if include_linear: poly_1_col = Column(poly_1_predicted[0], name="poly_1_flux", dtype=float) poly_2_col = Column(poly_2_predicted[0], name="poly_2_flux", dtype=float) out_table.add_columns([poly_1_col, poly_2_col]) if order_3: poly_3_col = Column(poly_3_predicted[0], name="poly_3_flux", dtype=float) out_table.add_columns([poly_3_col]) if order_4: poly_4_col = Column(poly_4_predicted[0], name="poly_4_flux", dtype=float) 
out_table.add_columns([poly_4_col]) mask_col = Column(~mask, name="mask_col", dtype=bool) out_table.add_columns([mask_col]) out_table.write("predicted_sky_exp{}.csv".format(c_exposures[sorted_inds[test_ind]]), format="ascii.csv")
import ICAize
import stack

import matplotlib.pyplot as plt
import numpy as np
import random_forest_spectra as rfs
import sklearn.metrics as sm

import sys
import os.path
import pickle

path = '.'
if len(sys.argv) == 2:
    path = sys.argv[1]

fastica = ICAize.unpickle_FastICA(target_type="combined", filter_str="both")
for comp_i in range(min(fastica.components_.shape[0], 25)):
    scale_factor = 2.4 / np.max(np.abs(fastica.components_[comp_i]))
    plt.plot(stack.skyexp_wlen_out, (fastica.components_[comp_i] * scale_factor) + (5 * comp_i))
plt.show()
plt.close()

fastica = ICAize.unpickle_FastICA(target_type="combined", filter_str="em")
for comp_i in range(min(fastica.components_.shape[0], 25)):
    scale_factor = 2.4 / np.max(np.abs(fastica.components_[comp_i]))
    plt.plot(stack.skyexp_wlen_out, (fastica.components_[comp_i] * scale_factor) + (5 * comp_i))
plt.show()
plt.close()

fastica = ICAize.unpickle_FastICA(target_type="combined", filter_str="nonem")
for comp_i in range(min(fastica.components_.shape[0], 25)):