Example #1
0
def main():
    """Bundle the stacked spectra in the current directory into one ``.npz`` file.

    Spectra are loaded separately from ``stacked*exp??????.csv`` and
    ``stacked*exp??????.fits`` files via ``ICAize.load_all_in_dir``.  When
    both loads return data the two sets are concatenated; when only one does,
    that set is used as-is; when neither does, nothing is written.

    The mask arrays returned by the loader are not stored in the output.
    """
    csv_flux, csv_exp, csv_ivar, _csv_mask, csv_wlen = ICAize.load_all_in_dir(
        '.', pattern="stacked*exp??????.csv")
    fits_flux, fits_exp, fits_ivar, _fits_mask, fits_wlen = ICAize.load_all_in_dir(
        '.', pattern="stacked*exp??????.fits")

    have_csv = len(csv_flux) > 0
    have_fits = len(fits_flux) > 0

    # Nothing was found in either format: leave without creating a file.
    if not (have_csv or have_fits):
        return

    if have_csv and have_fits:
        flux = np.concatenate((csv_flux, fits_flux))
        exposures = np.concatenate((csv_exp, fits_exp))
        ivar = np.concatenate((csv_ivar, fits_ivar))
        wlen = np.concatenate((csv_wlen, fits_wlen))
    elif have_csv:
        flux, exposures, ivar, wlen = csv_flux, csv_exp, csv_ivar, csv_wlen
    else:
        flux, exposures, ivar, wlen = fits_flux, fits_exp, fits_ivar, fits_wlen

    np.savez("compacted_flux_data.npz",
             flux=flux,
             exp=exposures,
             ivar=ivar,
             wavelengths=wlen)
Example #2
0
def MSE(Y,
        y,
        multioutput='uniform_average',
        Y_full=None,
        flux_arr=None,
        source_model=None,
        ss=None,
        source_model_args=None,
        method=None):
    """Mean squared error between targets ``Y`` and predictions ``y``.

    Two modes are supported:

    * Spectra mode, used when ``Y_full``, ``flux_arr``, ``source_model`` and
      ``ss`` are all given: ``y`` is passed through
      ``ICAize.inverse_transform`` and compared against ``flux_arr[inds]``,
      where ``inds = get_inds_(Y, Y_full)``.
    * Component mode, used otherwise: ``Y`` and ``y`` are both rescaled by
      the per-column maximum absolute value of ``Y`` and compared directly.

    ``multioutput`` is forwarded to ``mean_squared_error``; if that call
    raises, the call is retried without the keyword (presumably to support
    scikit-learn versions that lack it -- TODO confirm).

    Returns whatever ``mean_squared_error`` returns.
    """
    # ``except Exception`` replaces the former bare ``except`` so that
    # KeyboardInterrupt / SystemExit are no longer swallowed by the fallbacks.
    if Y_full is not None and flux_arr is not None and source_model is not None and ss is not None:
        inds = get_inds_(Y, Y_full)
        back_trans_flux = ICAize.inverse_transform(y, source_model, ss, method,
                                                   source_model_args)

        try:
            return mean_squared_error(flux_arr[inds],
                                      back_trans_flux,
                                      multioutput=multioutput)
        except Exception:
            return mean_squared_error(flux_arr[inds], back_trans_flux)
    else:
        try:
            yss = pp.MaxAbsScaler()
            Y = yss.fit_transform(Y)
            y = yss.transform(y)
        except Exception:
            # Manual equivalent of the scaler: divide by the per-column max |Y|.
            # NOTE(review): an all-zero column gives a zero scale factor here.
            scalefactor = np.amax(np.abs(Y), axis=0)
            Y = Y / scalefactor
            y = y / scalefactor

        try:
            return mean_squared_error(Y, y, multioutput=multioutput)
        except Exception:
            return mean_squared_error(Y, y)
Example #3
0
def MAPED(Y,
          y,
          multioutput='uniform_average',
          power=4,
          cutoff=0.1,
          Y_full=None,
          flux_arr=None,
          source_model=None,
          ss=None,
          source_model_args=None,
          method=None):
    """Mean Absolute Power Error Difference.

    Absolute differences below ``cutoff`` are zeroed.  For each row the score
    is ``|sum(d) - (sum(d ** power)) ** (1 / power)|``; the mean over rows is
    then divided by the number of columns.

    Two modes are supported:

    * Spectra mode, used when ``Y_full``, ``flux_arr``, ``source_model`` and
      ``ss`` are all given: ``y`` is passed through
      ``ICAize.inverse_transform`` and compared against ``flux_arr[inds]``,
      where ``inds = get_inds_(Y, Y_full)``.
    * Component mode, used otherwise: ``Y`` and ``y`` are compared directly.

    ``multioutput`` is accepted for signature compatibility and is not used.

    Returns the score as a Python ``float``.
    """
    if Y_full is not None and flux_arr is not None and source_model is not None and ss is not None:
        inds = get_inds_(Y, Y_full)
        back_trans_flux = ICAize.inverse_transform(y, source_model, ss, method,
                                                   source_model_args)
        diffs = np.abs(flux_arr[inds] - back_trans_flux)
        n_columns = flux_arr.shape[1]
    else:
        diffs = np.abs(Y - y)
        n_columns = Y.shape[1]

    # The component branch used to index with an undefined name (``diff``)
    # and raised NameError; both modes now share this thresholding step.
    diffs[diffs < cutoff] = 0

    sums = np.sum(diffs, axis=1)
    power_sums = np.sum(np.power(diffs, power), axis=1)

    return float(
        np.mean(np.abs(sums - np.power(power_sums, 1.0 / power))) / n_columns)
Example #4
0
def EXP_VAR(Y,
            y,
            multioutput='uniform_average',
            Y_full=None,
            flux_arr=None,
            source_model=None,
            ss=None,
            source_model_args=None,
            method=None):
    """Explained-variance score between targets ``Y`` and predictions ``y``.

    Two modes are supported:

    * Spectra mode, used when ``Y_full``, ``flux_arr``, ``source_model`` and
      ``ss`` are all given: ``y`` is passed through
      ``ICAize.inverse_transform`` and compared against ``flux_arr[inds]``,
      where ``inds = get_inds_(Y, Y_full)``.
    * Component mode, used otherwise: ``Y`` and ``y`` are compared directly.

    In both modes ``explained_variance_score`` is tried first.  If it raises,
    a hand-computed value is returned instead: the mean over rows of
    ``var(residual) / var(truth)``.

    NOTE(review): the fallback is the residual-variance fraction taken along
    axis 1, not ``1 - fraction``, so it does not appear to be on the same
    scale as ``explained_variance_score`` -- confirm whether that is intended.
    """
    # ``except Exception`` replaces the former bare ``except`` so that
    # KeyboardInterrupt / SystemExit are no longer swallowed by the fallbacks.
    if Y_full is not None and flux_arr is not None and source_model is not None and ss is not None:
        inds = get_inds_(Y, Y_full)
        back_trans_flux = ICAize.inverse_transform(y, source_model, ss, method,
                                                   source_model_args)
        try:
            return explained_variance_score(flux_arr[inds],
                                            back_trans_flux,
                                            multioutput=multioutput)
        except Exception:
            return float(
                np.mean(
                    np.var(flux_arr[inds] - back_trans_flux, axis=1) /
                    np.var(flux_arr[inds], axis=1)))
    else:
        try:
            return explained_variance_score(Y, y, multioutput=multioutput)
        except Exception:
            return float(np.mean(np.var(Y - y, axis=1) / np.var(Y, axis=1)))
def MAE(Y, y, multioutput='uniform_average', Y_full=None, flux_arr=None, source_model=None,
        ss=None, source_model_args=None, method=None):
    """Mean over rows of the per-row median absolute error.

    When ``Y_full``, ``flux_arr``, ``source_model`` and ``ss`` are all given,
    ``y`` is passed through ``ICAize.inverse_transform`` and compared against
    ``flux_arr[get_inds_(Y, Y_full)]``; otherwise ``Y`` and ``y`` are compared
    directly.  ``multioutput`` is accepted but not used.

    Returns a Python ``float``.
    """
    missing_spectra_input = (Y_full is None or flux_arr is None
                             or source_model is None or ss is None)

    if missing_spectra_input:
        residuals = Y - y
    else:
        inds = get_inds_(Y, Y_full)
        reconstructed = ICAize.inverse_transform(y, source_model, ss, method, source_model_args)
        residuals = flux_arr[inds] - reconstructed

    row_medians = np.median(np.abs(residuals), axis=1)
    return float(np.mean(row_medians))
Example #6
0
def animate_sky_spectra_for_coord(obs_time_start, obs_time_end, point_coord, lunar_metadata_file,
                    solar_metadata_file, sunspot_metadata_file, model_path, dm_path, dm_method):
    """Predict one spectrum per metadata tuple returned by ``get_sky_for_coord``.

    The predictive model is loaded with ``rfs.load_model(model_path)`` and the
    dimensionality-reduction model with ``iz.unpickle_model``.  Each metadata
    tuple is fed to ``model.predict`` and the prediction is mapped back with
    ``iz.inverse_transform``; the first row of that result is kept.

    Returns ``(inv_spec, labels)``: the list of spectra and a parallel list of
    label strings built from ``dates[i]`` and elements 3 and 2 of the tuple.
    """
    # The lunar/solar/sunspot tables are returned by the helper but not needed here.
    metadata_tups, dates, _lunar, _solar, _sunspot = get_sky_for_coord(
        obs_time_start, obs_time_end, point_coord,
        lunar_metadata_file, solar_metadata_file, sunspot_metadata_file)

    model = rfs.load_model(model_path)
    dm, ss, model_args = iz.unpickle_model(path=dm_path, method=dm_method)

    inv_spec, labels = [], []
    for idx, row in enumerate(metadata_tups):
        # predict() is given a single sample as a (1, n_features) array.
        features = np.array(row).reshape(1, -1)
        prediction = model.predict(features)
        spectrum = iz.inverse_transform(prediction, dm, ss, dm_method, model_args)
        inv_spec.append(spectrum[0, :])

        label = dates[idx] + "(ALT,AZ): (" + str(row[3]) + ", " + str(row[2]) + ")"
        labels.append(label)

    return inv_spec, labels
Example #7
0
def main():
    """Plot the components of a pickled dimensionality-reduction model.

    The model is loaded either from ``--file_path`` or, when that is not
    given, from ``--spectra_path`` / ``--method``.  Each component is drawn
    against ``stack.skyexp_wlen_out`` and shifted upwards so that successive
    components do not overlap: the shift grows by 1.2x the largest negative
    excursion of the component about to be drawn (except for the first one)
    and by 1.2x the largest positive excursion of the one just drawn.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        'Build and test models based on dim reductions and provided spectra')
    parser.add_argument('--spectra_path',
                        type=str,
                        default='.',
                        metavar='PATH',
                        help='Spectra path to work from, if not '
                        '.'
                        '')
    parser.add_argument('--method',
                        type=str,
                        default='ICA',
                        metavar='METHOD',
                        help='Dim reduction method to load data for')
    parser.add_argument(
        '--file_path',
        type=str,
        default=None,
        metavar='FILE_PATH',
        help='COMPLETE path from which to load a dim reduction')

    args = parser.parse_args()

    # The scaler is returned alongside the model but is not needed for plotting.
    if args.file_path is not None:
        data_model, scaler = ize.unpickle_model(filename=args.file_path)
    else:
        data_model, scaler = ize.unpickle_model(path=args.spectra_path,
                                                method=args.method)
    components = ize.get_components(args.method, data_model)

    offset = 0
    for i, comp_i in enumerate(components):
        negatives = comp_i[comp_i < 0]
        positives = comp_i[comp_i > 0]

        # np.max raises ValueError on an empty selection, which happens for a
        # component with no negative (or no positive) values; in that case
        # there is no excursion on that side and no extra shift is needed.
        if i > 0 and negatives.size > 0:
            offset += np.max(np.abs(negatives)) * 1.2
        plt.plot(stack.skyexp_wlen_out, comp_i + offset)
        if positives.size > 0:
            offset += np.max(positives) * 1.2
    plt.show()
    plt.close()
Example #8
0
def main():
    """Collect stacked CSV and FITS spectra from '.' and save them as one archive.

    Both file patterns are loaded through ``ICAize.load_all_in_dir``.  If both
    loads produce data the arrays are concatenated (CSV first); if only one
    does, its arrays are saved unchanged; if neither does, the function
    returns without writing.  Mask arrays are loaded but not saved.
    """
    from_csv = ICAize.load_all_in_dir('.', pattern="stacked*exp??????.csv")
    from_fits = ICAize.load_all_in_dir('.', pattern="stacked*exp??????.fits")

    flux_a, exp_a, ivar_a, _mask_a, wlen_a = from_csv
    flux_b, exp_b, ivar_b, _mask_b, wlen_b = from_fits

    csv_present = len(flux_a) > 0
    fits_present = len(flux_b) > 0

    if csv_present:
        if fits_present:
            flux_a = np.concatenate((flux_a, flux_b))
            exp_a = np.concatenate((exp_a, exp_b))
            ivar_a = np.concatenate((ivar_a, ivar_b))
            wlen_a = np.concatenate((wlen_a, wlen_b))
        to_save = (flux_a, exp_a, ivar_a, wlen_a)
    elif fits_present:
        to_save = (flux_b, exp_b, ivar_b, wlen_b)
    else:
        # No spectra in either format: nothing to write.
        return

    out_flux, out_exp, out_ivar, out_wlen = to_save
    np.savez("compacted_flux_data.npz", flux=out_flux, exp=out_exp, ivar=out_ivar, wavelengths=out_wlen)
Example #9
0
def _iter_scorer(train_inds, test_inds, flux_arr, model__and__model_flux_mean, method, score_methods, include_mle):
    """Score one train/test split and return a dict of scores.

    ``model__and__model_flux_mean`` is indexed as ``[0]`` (the model, which is
    iterated over below) and ``[1]`` (the flux mean handed to
    ``iz.transform_inverse_transform``).  ``train_inds`` is not used here.

    Unless ``score_methods`` is exactly ``['LL']``, every entry of the model
    is applied in turn: its reconstruction is accumulated into
    ``flux_conv_test`` and subtracted in place from ``flux_test``.

    Returns a dict keyed by score-method name, plus ``'mle'`` when requested
    and ``method`` is ``'FA'`` or ``'PCA'``.
    """
    model = model__and__model_flux_mean[0]
    model_flux_mean = model__and__model_flux_mean[1]

    # NOTE(review): flux_test is modified in place below; whether that also
    # touches flux_arr depends on whether this indexing yields a view or a
    # copy -- confirm against the type of test_inds passed by the caller.
    flux_test = flux_arr[test_inds]
    flux_conv_test = None

    if score_methods != ['LL']:
        for pca_model in model:
            if flux_conv_test is None:
                # First stage: its reconstruction seeds the running total.
                flux_conv_test = iz.transform_inverse_transform(flux_test, pca_model, model_flux_mean, method)
                flux_test -= flux_conv_test
            else:
                # Later stages reconstruct what is left in flux_test and add
                # that on top of the running total.
                residual = iz.transform_inverse_transform(flux_test, pca_model, model_flux_mean, method)
                flux_conv_test += residual
                flux_test -= residual

    scores = {}

    for score_method in score_methods:
        #print("Calculating score:" + score_method)

        score_func = iz.get_score_func(score_method)

        # Methods for which no score function is returned are skipped.
        if score_func is not None:
            if score_method != 'MAE':
                scores[score_method] = score_func(flux_test, flux_conv_test, multioutput='uniform_average')
            else:
                # MAE is computed inline: mean over rows of the per-row
                # median absolute difference.
                scores[score_method] = np.mean(np.median(np.abs(flux_test - flux_conv_test), axis=1))

    # NOTE(review): score_method here is the leftover loop variable, i.e. the
    # LAST entry of score_methods (and undefined if score_methods is empty).
    # NOTE(review): model.score is called on the same object that is iterated
    # over above -- confirm it exposes a score() method.
    if (include_mle or score_method == 'LL') and method in ['FA', 'PCA']:
        try:
            scores['mle'] = model.score(flux_test)
        except np.linalg.linalg.LinAlgError:
            scores['mle'] = 0 #-2**10 #float("-inf")
        except ValueError:
            scores['mle'] = 0 #-2**10 #float("-inf")

    #print("Scores: " + str(scores))
    return scores
def EXP_VAR(Y, y, multioutput='uniform_average', Y_full=None, flux_arr=None, source_model=None,
        ss=None, source_model_args=None, method=None):
    """Explained-variance score between targets ``Y`` and predictions ``y``.

    When ``Y_full``, ``flux_arr``, ``source_model`` and ``ss`` are all given,
    ``y`` is passed through ``ICAize.inverse_transform`` and compared against
    ``flux_arr[get_inds_(Y, Y_full)]``; otherwise ``Y`` and ``y`` are compared
    directly.

    ``explained_variance_score`` is tried first.  If it raises, the mean over
    rows of ``var(residual) / var(truth)`` is returned instead.

    NOTE(review): that fallback is the residual-variance fraction along
    axis 1, not ``1 - fraction``, so it does not appear to be on the same
    scale as ``explained_variance_score`` -- confirm whether that is intended.
    """
    # ``except Exception`` replaces the former bare ``except`` so that
    # KeyboardInterrupt / SystemExit are no longer swallowed by the fallbacks.
    if Y_full is not None and flux_arr is not None and source_model is not None and ss is not None:
        inds = get_inds_(Y, Y_full)
        back_trans_flux = ICAize.inverse_transform(y, source_model, ss, method, source_model_args)
        try:
            return explained_variance_score(flux_arr[inds], back_trans_flux, multioutput=multioutput)
        except Exception:
            return float(np.mean(np.var(flux_arr[inds] - back_trans_flux, axis=1) / np.var(flux_arr[inds], axis=1)))
    else:
        try:
            return explained_variance_score(Y, y, multioutput=multioutput)
        except Exception:
            return float(np.mean(np.var(Y - y, axis=1) / np.var(Y, axis=1)))
Example #11
0
def _iter_modeler(train_inds, test_inds, flux_arr, model, method):
    """Fit two clones of ``model`` in sequence on the training rows.

    The first clone is fitted on ``flux_arr[train_inds]``; its reconstruction
    (from ``iz.transform_inverse_transform``) is subtracted in place, and the
    second clone is fitted on what remains.  ``test_inds`` is not used.

    Returns ``(model_list, flux_avg)`` where ``flux_avg`` is the column-wise
    mean of the training rows taken before any subtraction.
    """
    remaining = flux_arr[train_inds]
    flux_avg = np.mean(remaining, axis=0)

    fitted_stages = []
    for _ in range(2):
        stage = est_clone(model)
        stage.fit(remaining)
        # Strip this stage's reconstruction so the next stage sees the residual.
        remaining -= iz.transform_inverse_transform(remaining, stage, flux_avg, method)
        fitted_stages.append(stage)

    return fitted_stages, flux_avg
Example #12
0
def main():
    """Plot the components of a pickled dimensionality-reduction model.

    The model is loaded either from ``--file_path`` or, when that is not
    given, from ``--spectra_path`` / ``--method``.  Each component is drawn
    against ``stack.skyexp_wlen_out`` with a growing vertical offset so that
    successive components are stacked rather than overlaid.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Build and test models based on dim reductions and provided spectra",
    )
    parser.add_argument(
        "--spectra_path", type=str, default=".", metavar="PATH", help="Spectra path to work from, if not " "." ""
    )
    parser.add_argument(
        "--method", type=str, default="ICA", metavar="METHOD", help="Dim reduction method to load data for"
    )
    parser.add_argument(
        "--file_path",
        type=str,
        default=None,
        metavar="FILE_PATH",
        help="COMPLETE path from which to load a dim reduction",
    )

    args = parser.parse_args()

    # The scaler is returned alongside the model but is not needed for plotting.
    if args.file_path is not None:
        data_model, scaler = ize.unpickle_model(filename=args.file_path)
    else:
        data_model, scaler = ize.unpickle_model(path=args.spectra_path, method=args.method)
    components = ize.get_components(args.method, data_model)

    offset = 0
    for i, comp_i in enumerate(components):
        below_zero = comp_i[comp_i < 0]
        above_zero = comp_i[comp_i > 0]

        # Guard the reductions: np.max raises ValueError on an empty
        # selection, which occurs for a component with values on only one
        # side of zero.  No excursion on a side means no shift for it.
        if i > 0 and below_zero.size > 0:
            offset += np.max(np.abs(below_zero)) * 1.2
        plt.plot(stack.skyexp_wlen_out, comp_i + offset)
        if above_zero.size > 0:
            offset += np.max(above_zero) * 1.2
    plt.show()
    plt.close()
Example #13
0
def MAE(Y,
        y,
        multioutput='uniform_average',
        Y_full=None,
        flux_arr=None,
        source_model=None,
        ss=None,
        source_model_args=None,
        method=None):
    """Mean over rows of the per-row median absolute error.

    Spectra mode (``Y_full``, ``flux_arr``, ``source_model`` and ``ss`` all
    given) compares ``flux_arr[get_inds_(Y, Y_full)]`` with ``y`` mapped back
    through ``ICAize.inverse_transform``.  Otherwise ``Y`` and ``y`` are
    compared directly.  ``multioutput`` is accepted but not used.

    Returns a Python ``float``.
    """
    use_spectra = all(
        item is not None for item in (Y_full, flux_arr, source_model, ss))

    if not use_spectra:
        abs_err = np.abs(Y - y)
    else:
        inds = get_inds_(Y, Y_full)
        back_trans_flux = ICAize.inverse_transform(y, source_model, ss, method,
                                                   source_model_args)
        abs_err = np.abs(flux_arr[inds] - back_trans_flux)

    per_row = np.median(abs_err, axis=1)
    return float(np.mean(per_row))
def MAPED(Y, y, multioutput='uniform_average', power=4, cutoff=0.1, Y_full=None, flux_arr=None, source_model=None,
        ss=None, source_model_args=None, method=None):
    """Mean Absolute Power Error Difference.

    Absolute differences below ``cutoff`` are zeroed.  For each row the score
    is ``|sum(d) - (sum(d ** power)) ** (1 / power)|``; the mean over rows is
    then divided by the number of columns.

    When ``Y_full``, ``flux_arr``, ``source_model`` and ``ss`` are all given,
    ``y`` is passed through ``ICAize.inverse_transform`` and compared against
    ``flux_arr[get_inds_(Y, Y_full)]``; otherwise ``Y`` and ``y`` are compared
    directly.  ``multioutput`` is accepted but not used.

    Returns the score as a Python ``float``.
    """
    if Y_full is not None and flux_arr is not None and source_model is not None and ss is not None:
        inds = get_inds_(Y, Y_full)
        back_trans_flux = ICAize.inverse_transform(y, source_model, ss, method, source_model_args)

        diffs = np.abs(flux_arr[inds] - back_trans_flux)
        width = flux_arr.shape[1]
    else:
        diffs = np.abs(Y - y)
        width = Y.shape[1]

    # This mask previously referenced an undefined name (``diff``) in the
    # direct-comparison branch, raising NameError on every such call.
    diffs[diffs < cutoff] = 0

    sums = np.sum(diffs, axis=1)
    power_sums = np.sum(np.power(diffs, power), axis=1)

    return float(np.mean(np.abs(sums - np.power(power_sums, 1.0/power))) / width)
Example #15
0
def main():
    """Randomized hyper-parameter search over a FastICA + random-forest pipeline.

    Flux data is loaded from ``path`` and observation metadata from
    ``metadata_path``.  The metadata is restricted to exposures present in the
    flux data, sorted by ``EXP_ID`` and converted to a 2-D float array; the
    flux rows are reordered by exposure id to match.  The search is then
    fitted with the flux as input and the metadata as target, and the five
    best-scoring parameter sets are printed.

    ``random_state``, ``ica_max_iter`` and ``n_iter_search`` are read from
    module scope.
    """
    path = "."
    metadata_path = ".."

    rfr = Pipeline([ ('ica', FastICA(random_state=random_state, max_iter=ica_max_iter)),
                     ('rfr', ensemble.RandomForestRegressor(random_state=random_state, n_jobs=-1))
                ])

    param_grid = {
        "ica__n_components": sp_randint(15, 200),
        "rfr__n_estimators": sp_randint(25, 400),
        "rfr__min_samples_split": sp_randint(1, 10)
    }

    randsearch = RandomizedSearchCV(rfr, param_grid, n_iter = n_iter_search)

    flux_arr, exp_arr, wavelengths = ICAize.load_all_in_dir(path=path, use_con_flux=True, recombine_flux=False)

    obs_metadata = random_forest_spectra.trim_observation_metadata(random_forest_spectra.load_observation_metadata(metadata_path))
    reduced_obs_metadata = obs_metadata[np.in1d(obs_metadata['EXP_ID'], exp_arr)]
    # Sort both sides by exposure id so metadata rows line up with flux rows.
    reduced_obs_metadata.sort('EXP_ID')
    sorted_inds = np.argsort(exp_arr)
    # EXP_ID was only needed for alignment; it is dropped before building X_arr.
    reduced_obs_metadata.remove_column('EXP_ID')
    md_len = len(reduced_obs_metadata)
    # assumes every remaining column is float64, so the 'f8' view is valid -- TODO confirm
    X_arr = np.array(reduced_obs_metadata).view('f8').reshape((md_len,-1))

    randsearch.fit(flux_arr[sorted_inds], X_arr)

    top_scores = sorted(randsearch.grid_scores_, key=itemgetter(1), reverse=True)[:5]
    # Single-argument print(...) calls produce the same output under the
    # Python 2 print statement and the Python 3 print function; the former
    # `print "...", x` statements were a syntax error on Python 3.
    for i, score in enumerate(top_scores):
        print("Model with rank: %s" % (i,))
        print("Mean validation score/std: %s %s" % (
            score.mean_validation_score, np.std(score.cv_validation_scores)))
        print("Parameters: %s" % (score.parameters,))
        print("")
def MSE(Y, y, multioutput='uniform_average', Y_full=None, flux_arr=None, source_model=None,
        ss=None, source_model_args=None, method=None):
    """Mean squared error between targets ``Y`` and predictions ``y``.

    When ``Y_full``, ``flux_arr``, ``source_model`` and ``ss`` are all given,
    ``y`` is passed through ``ICAize.inverse_transform`` and compared against
    ``flux_arr[get_inds_(Y, Y_full)]``.  Otherwise ``Y`` and ``y`` are first
    rescaled by the per-column maximum absolute value of ``Y`` and then
    compared directly.

    ``multioutput`` is forwarded to ``mean_squared_error``; if that call
    raises, it is retried without the keyword (presumably to support
    scikit-learn versions that lack it -- TODO confirm).

    Returns whatever ``mean_squared_error`` returns.
    """
    # ``except Exception`` replaces the former bare ``except`` so that
    # KeyboardInterrupt / SystemExit are no longer swallowed by the fallbacks.
    if Y_full is not None and flux_arr is not None and source_model is not None and ss is not None:
        inds = get_inds_(Y, Y_full)
        back_trans_flux = ICAize.inverse_transform(y, source_model, ss, method, source_model_args)

        try:
            return mean_squared_error(flux_arr[inds], back_trans_flux, multioutput=multioutput)
        except Exception:
            return mean_squared_error(flux_arr[inds], back_trans_flux)
    else:
        try:
            yss = pp.MaxAbsScaler()
            Y = yss.fit_transform(Y)
            y = yss.transform(y)
        except Exception:
            # Manual equivalent of the scaler: divide by the per-column max |Y|.
            # NOTE(review): an all-zero column gives a zero scale factor here.
            scalefactor = np.amax(np.abs(Y), axis=0)
            Y = Y / scalefactor
            y = y / scalefactor

        try:
            return mean_squared_error(Y, y, multioutput=multioutput)
        except Exception:
            return mean_squared_error(Y, y)
Example #17
0
def main():
    """Command-line entry point: fit a predictive model from metadata to DM sources.

    Steps, in order:

    1. Parse top-level options plus the ``compare`` sub-command options.
    2. Load observation metadata, the deserialized dimensionality-reduction
       data and the pickled reduction model.
    3. With ``use_spectra``, also load the flux data, keep only rows whose
       exposure appears in ``exposures`` and sort them by exposure id.
    4. Build ``X_arr`` (metadata, ``EXP_ID`` removed) and ``Y_arr`` (sources
       ordered by exposure id).
    5. Load or construct the predictive model.
    6. For ``compare``: choose a scorer, run a hyper-parameter search, print
       the results and optionally save the best estimator and/or plot the
       per-fold absolute errors.

    NOTE(review): ``--use_spectra`` is registered on the ``compare``
    sub-parser only, yet ``args.use_spectra`` is read unconditionally -- check
    what happens when no sub-command is given.
    NOTE(review): the ``--plot_final_errors`` path indexes ``comb_flux_arr``,
    which stays ``None`` unless ``--use_spectra`` is also set.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        'Build and test models based on dim reductions and provided spectra')
    subparsers = parser.add_subparsers(dest='subparser_name')

    # ---- options shared by every sub-command ----
    parser.add_argument('--metadata_path',
                        type=str,
                        default='.',
                        metavar='PATH',
                        help='Metadata path to work from, if not '
                        '.'
                        '')
    parser.add_argument('--spectra_path',
                        type=str,
                        default='.',
                        metavar='PATH',
                        help='Spectra path to work from, if not '
                        '.'
                        '')
    parser.add_argument('--method',
                        type=str,
                        default='ICA',
                        metavar='METHOD',
                        help='Dim reduction method to load data for')
    parser.add_argument('--n_jobs',
                        type=int,
                        default=1,
                        metavar='N_JOBS',
                        help='N_JOBS')
    parser.add_argument(
        '--model',
        type=str,
        choices=['ET', 'RF', 'GP', 'KNN', 'SVR'],
        default='ET',
        help=
        'Which model type to use: ET (Extra Trees), RF (Random Forest), GP (Gaussian Process), KNN, or SVR (Support Vector Regression)'
    )
    parser.add_argument(
        '--load_model',
        action='store_true',
        help='Whether or not to load the model from --model_path')
    parser.add_argument('--model_path',
                        type=str,
                        default='model.pkl',
                        metavar='MODEL_PATH',
                        help='COMPLETE path from which to load a model')
    parser.add_argument(
        '--metadata_flags', type=str, default='', metavar='METADATA_FLAGS',
        help='Flags specifying observational metadata pre-processing, e.g. LUNAR_MAG which takes the '\
            'magnitude and linearizes it (ignoring that it is an area magnitude)'
    )
    parser.add_argument(
        '--compacted_path',
        type=str,
        default=None,
        metavar='COMPATED_PATH',
        help=
        'Path to find compacted/arrayized data; setting this will cause --path, --pattern to be ignored'
    )

    # ---- options of the 'compare' sub-command ----
    parser_compare = subparsers.add_parser('compare')
    parser_compare.add_argument(
        '--folds',
        type=int,
        default=3,
        metavar='TEST_FOLDS',
        help=
        'Do k-fold cross validation with specified number of folds.  Defaults to 3.'
    )
    parser_compare.add_argument(
        '--iters',
        type=int,
        default=50,
        metavar='HYPER_FIT_ITERS',
        help='Number of iterations when fitting hyper-params')
    parser_compare.add_argument(
        '--outputfbk',
        action='store_true',
        help='If set, outputs \'grid_scores_\' data from RandomizedSearchCV')
    parser_compare.add_argument(
        '--save_best',
        action='store_true',
        help=
        'Whether or not to save the (last/best) model built for e.g. --hyper_fit'
    )
    parser_compare.add_argument(
        '--scorer',
        type=str,
        choices=['R2', 'MAE', 'MSE', 'LL', 'EXP_VAR', 'MAPED', 'MSEMV'],
        default='R2',
        help=
        'Which scoring method to use to determine ranking of model instances.')
    parser_compare.add_argument(
        '--use_spectra',
        action='store_true',
        help=
        'Whether scoring is done against the DM components or the predicted spectra'
    )
    parser_compare.add_argument(
        '--ivar_cutoff',
        type=float,
        default=0.001,
        metavar='IVAR_CUTOFF',
        help='data with inverse variace below cutoff is masked as if ivar==0')
    parser_compare.add_argument(
        '--plot_final_errors', action='store_true',
        help='If set, will plot the errors from the final/best model, for the whole dataset, from ' + \
            'the best model re-trained on CV folds used for testing.' + \
            'Plots all errors on top of each other with low-ish alpha, to give a kind of visual ' + \
            'density map of errors.'
    )

    args = parser.parse_args()

    # ---- load metadata, reduced sources and the reduction model ----
    obs_metadata = trim_observation_metadata(
        load_observation_metadata(args.metadata_path,
                                  flags=args.metadata_flags))
    sources, components, exposures, wavelengths = ICAize.deserialize_data(
        args.spectra_path, args.method)
    source_model, ss, model_args = ICAize.unpickle_model(
        args.spectra_path, args.method)

    comb_flux_arr, comb_exposure_arr, comb_wavelengths = None, None, None
    if args.use_spectra:
        comb_flux_arr, comb_exposure_arr, comb_ivar_arr, comb_masks, comb_wavelengths = ICAize.load_data(
            args)

        # Keep only spectra whose exposure also appears in the reduced data.
        filter_arr = np.in1d(comb_exposure_arr, exposures)
        comb_flux_arr = comb_flux_arr[filter_arr]
        comb_exposure_arr = comb_exposure_arr[filter_arr]

        # Order the spectra by exposure id.
        sorted_inds = np.argsort(comb_exposure_arr)
        comb_flux_arr = comb_flux_arr[sorted_inds]
        comb_exposure_arr = comb_exposure_arr[sorted_inds]

        # The inverse-variance and mask arrays are not used below.
        del comb_ivar_arr
        del comb_masks

    # ---- build the design matrix X (metadata) and target Y (sources) ----
    reduced_obs_metadata = obs_metadata[np.in1d(obs_metadata['EXP_ID'],
                                                exposures)]
    reduced_obs_metadata.sort('EXP_ID')
    # Reassigned here: from now on sorted_inds orders `sources` by exposure id.
    sorted_inds = np.argsort(exposures)

    reduced_obs_metadata.remove_column('EXP_ID')
    md_len = len(reduced_obs_metadata)
    # NOTE(review): var_count is not read anywhere later in this function.
    var_count = len(reduced_obs_metadata.columns)
    # assumes every remaining column is float64, so the 'f8' view is valid -- TODO confirm
    X_arr = np.array(reduced_obs_metadata).view('f8').reshape((md_len, -1))
    Y_arr = sources[sorted_inds]

    if args.load_model:
        predictive_model = load_model(args.model_path)
    else:
        predictive_model = get_model(args.model)

    if args.subparser_name == 'compare':
        pdist = get_param_distribution_for_model(args.model, args.iters)

        # ---- pick the scorer ----
        # With --use_spectra each metric is wrapped in a partial carrying the
        # flux data and reduction model; otherwise the metric is handed to
        # make_scorer as-is.  Every metric except R2 and EXP_VAR is registered
        # with greater_is_better=False.
        scorer = None
        if args.scorer == 'R2':
            scorer = make_scorer(R2)
        elif args.scorer == 'MAE':
            if args.use_spectra:
                p_MAE_ = partial(MAE,
                                 Y_full=Y_arr,
                                 flux_arr=comb_flux_arr,
                                 source_model=source_model,
                                 ss=ss,
                                 source_model_args=model_args,
                                 method=args.method)
                scorer = make_scorer(p_MAE_, greater_is_better=False)
            else:
                scorer = make_scorer(MAE, greater_is_better=False)
        elif args.scorer == 'MSE':
            if args.use_spectra:
                p_MSE_ = partial(MSE,
                                 Y_full=Y_arr,
                                 flux_arr=comb_flux_arr,
                                 source_model=source_model,
                                 ss=ss,
                                 source_model_args=model_args,
                                 method=args.method)
                scorer = make_scorer(p_MSE_, greater_is_better=False)
            else:
                scorer = make_scorer(MSE, greater_is_better=False)
        elif args.scorer == 'MSEMV':
            if args.use_spectra:
                p_MSEMV_ = partial(MSEMV,
                                   Y_full=Y_arr,
                                   flux_arr=comb_flux_arr,
                                   source_model=source_model,
                                   ss=ss,
                                   source_model_args=model_args,
                                   method=args.method)
                scorer = make_scorer(p_MSEMV_, greater_is_better=False)
            else:
                scorer = make_scorer(MSEMV, greater_is_better=False)
        elif args.scorer == 'EXP_VAR':
            if args.use_spectra:
                p_EXP_VAR_ = partial(EXP_VAR,
                                     Y_full=Y_arr,
                                     flux_arr=comb_flux_arr,
                                     source_model=source_model,
                                     ss=ss,
                                     source_model_args=model_args,
                                     method=args.method)
                scorer = make_scorer(p_EXP_VAR_)
            else:
                scorer = make_scorer(EXP_VAR)
        elif args.scorer == 'MAPED':
            if args.use_spectra:
                p_MAPED_ = partial(MAPED,
                                   Y_full=Y_arr,
                                   flux_arr=comb_flux_arr,
                                   source_model=source_model,
                                   ss=ss,
                                   source_model_args=model_args,
                                   method=args.method)
                scorer = make_scorer(p_MAPED_, greater_is_better=False)
            else:
                scorer = make_scorer(MAPED, greater_is_better=False)
        elif args.scorer == 'LL':
            # No explicit scorer is passed to the search for 'LL'.
            scorer = None

        # ---- cross-validation splitter and hyper-parameter search ----
        folder = ShuffleSplit(exposures.shape[0],
                              n_iter=args.folds,
                              test_size=1.0 / args.folds,
                              random_state=12345)

        if args.model == 'GP':
            # The GP model uses an exhaustive grid with a fixed cv=3 rather
            # than the ShuffleSplit built above.
            predictive_model.random_start = args.folds
            rcv = GridSearchCV(predictive_model,
                               param_grid=pdist,
                               error_score=0,
                               cv=3,
                               n_jobs=args.n_jobs,
                               scoring=scorer)
            #random_state=RANDOM_STATE,
            #n_iter=args.iters,
        else:
            rcv = RandomizedSearchCV(predictive_model,
                                     param_distributions=pdist,
                                     n_iter=args.iters,
                                     cv=folder,
                                     n_jobs=args.n_jobs,
                                     scoring=scorer)

        # This fits X (metadata) to Y (DM'ed sources).  But there are really
        # two tests here: how well the hyper-params fit/predict the sources,
        # and how well they fit/predict the actual source spectra.  Until I
        # know better, I'm going to need to build a way to test both.
        rcv.fit(X_arr, Y_arr)

        print(rcv.best_score_)
        print(rcv.best_params_)
        print(rcv.best_estimator_)
        if args.outputfbk:
            print("=+" * 10 + "=")
            for val in rcv.grid_scores_:
                print(val)
            print("=+" * 10 + "=")

        if args.save_best:
            save_model(rcv.best_estimator_, args.model_path)

        if args.plot_final_errors:
            # Refit the best estimator on each training split and overlay the
            # absolute reconstruction errors of the matching test split.
            for train_inds, test_inds in folder:
                rcv.best_estimator_.fit(X_arr[train_inds], Y_arr[train_inds])
                predicted = rcv.best_estimator_.predict(X_arr[test_inds])
                back_trans_flux = ICAize.inverse_transform(
                    predicted, source_model, ss, args.method, model_args)
                diffs = np.abs(comb_flux_arr[test_inds] - back_trans_flux)
                # Is there no 'trick' to get matplotlib to do this without a loop?
                for i in range(diffs.shape[0]):
                    plt.plot(comb_wavelengths, diffs[i, :], 'b-', alpha=0.01)
            plt.show()
Example #18
0
def main():
    """Load the stacked CSV spectra from '.' and save them as ``compacted_flux_data.npz``.

    Flux, exposure ids, inverse variance and wavelengths are written; the mask
    array returned by ``ICAize.load_all_in_dir`` is not.
    """
    loaded = ICAize.load_all_in_dir(
        '.',
        use_con_flux=False,
        recombine_flux=False,
        pattern="stacked*exp??????.csv",
    )
    flux, exposures, ivar, _masks, wlen = loaded

    np.savez(
        "compacted_flux_data.npz",
        flux=flux,
        exp=exposures,
        ivar=ivar,
        wavelengths=wlen,
    )
def main():
    """Search for wavelength change points with a Bayesian-blocks style recursion.

    The flux data is loaded via ``iz.load_data(args)`` and optionally centred
    and scaled.  The wavelength axis is cut into chunks of
    ``4 * n_components`` samples.  For each chunk count ``R`` the candidate
    prefixes of the (scaled) flux are scored with
    ``iz.score_via_CV(['LL'], ...)`` and the best split is recorded; the
    recorded splits are then walked backwards to recover the change points,
    whose edge wavelengths are printed.

    Fixes relative to the previous version:

    * ``scaled_flux_arr`` is now assigned for the default (mean-subtract only)
      scaling as well as for ``--scale``; it used to be set only under
      ``--scale`` and raised NameError otherwise.
    * Chunk counts use integer division, so array sizes and ``range`` bounds
      stay integers on Python 3.
    * The per-candidate scores are held in a NumPy array, so
      ``fit_vec[1:] += best[:R]`` adds elementwise instead of extending a list.
    * Column counts are cast to ``int`` before being used as slice bounds.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description='Compute PCA/ICA/NMF/etc. components over set of stacked spectra, save those out, and pickle model'
    )
    parser.add_argument(
        '--pattern', type=str, default='stacked*exp??????.*', metavar='PATTERN',
        help='File pattern for stacked sky fibers.'
    )
    parser.add_argument(
        '--path', type=str, default='.', metavar='PATH',
        help='Path to work from, if not ''.'''
    )
    parser.add_argument(
        '--compacted_path', type=str, default=None, metavar='COMPATED_PATH',
        help='Path to find compacted/arrayized data; setting this will cause --path, --pattern to be ignored'
    )
    parser.add_argument(
        '--n_components', type=int, default=40, metavar='N_COMPONENTS',
        help='Number of ICA/PCA/etc. components'
    )
    parser.add_argument(
        '--method', type=str, default='ICA', metavar='METHOD',
        choices=['ICA', 'PCA', 'SPCA', 'NMF', 'ISO', 'KPCA', 'FA', 'DL'],
        help='Which dim. reduction method to use'
    )
    parser.add_argument(
        '--scale', action='store_true',
        help='Should inputs variance be scaled?  Defaults to mean subtract and value scale, but w/out this does not scale variance.'
    )
    parser.add_argument(
        '--no_scale', action='store_true',
        help='Suppresses all scaling'
    )
    parser.add_argument(
        '--ivar_cutoff', type=float, default=0.001, metavar='IVAR_CUTOFF',
        help='data with inverse variace below cutoff is masked as if ivar==0'
    )
    parser.add_argument(
        '--n_iter', type=int, default=1200, metavar='MAX_ITER',
        help='Maximum number of iterations to allow for convergence.  For SDSS data 1000 is a safe number of ICA, while SPCA requires larger values e.g. ~2000 to ~2500'
    )
    parser.add_argument(
        '--n_jobs', type=int, default=None, metavar='N_JOBS',
        help='N_JOBS'
    )
    args = parser.parse_args()

    comb_flux_arr, comb_exposure_arr, comb_ivar_arr, comb_masks, comb_wavelengths = iz.load_data(args)
    model = iz.get_model(args.method, n=args.n_components, n_neighbors=None, max_iter=args.n_iter, random_state=iz.random_state, n_jobs=args.n_jobs)

    ss = None
    if args.no_scale:
        scaled_flux_arr = comb_flux_arr
    else:
        # Mean subtraction always happens here; variance scaling only with --scale.
        ss = skpp.StandardScaler(with_std=args.scale)
        scaled_flux_arr = ss.fit_transform(comb_flux_arr)

    #Heavily copied from J. Vanderplas/astroML bayesian_blocks.py
    N = comb_wavelengths.size
    step = args.n_components * 4
    n_blocks = N // step

    edges = np.concatenate([comb_wavelengths[:1:step],
                            0.5 * (comb_wavelengths[1::step] + comb_wavelengths[:-1:step]),
                            comb_wavelengths[-1::step]])

    # arrays to store the best configuration
    nn_vec = np.ones(n_blocks) * step
    best = np.zeros(N, dtype=float)
    last = np.zeros(N, dtype=int)

    for R in range(n_blocks):
        print("R: " + str(R))

        # Number of wavelength samples covered by each candidate prefix.
        count_vec = np.cumsum(nn_vec[:R + 1][::-1])[::-1]

        fit_vec = np.array(
            [iz.score_via_CV(['LL'], scaled_flux_arr[:, :int(n)], model, ss, args.method,
                             folds=3, n_jobs=args.n_jobs)["mle"]
             for n in count_vec],
            dtype=float)

        # Add the best score of the configuration preceding each candidate.
        fit_vec[1:] += best[:R]

        i_max = np.argmax(fit_vec)
        last[R] = i_max
        best[R] = fit_vec[i_max]

    # Walk the recorded split indices backwards to recover the change points.
    change_points = np.zeros(n_blocks, dtype=int)
    i_cp = n_blocks
    ind = n_blocks
    while True:
        i_cp -= 1
        change_points[i_cp] = ind
        if ind == 0:
            break
        ind = last[ind - 1]
    change_points = change_points[i_cp:]

    print(edges[change_points])


    '''
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        'Compute PCA/ICA/NMF/etc. components over set of stacked spectra, save those out, and pickle model'
    )
    parser.add_argument('--pattern',
                        type=str,
                        default='stacked*exp??????.*',
                        metavar='PATTERN',
                        help='File pattern for stacked sky fibers.')
    parser.add_argument('--path',
                        type=str,
                        default='.',
                        metavar='PATH',
                        help='Path to work from, if not '
                        '.'
                        '')
    parser.add_argument(
        '--compacted_path',
        type=str,
        default=None,
        metavar='COMPATED_PATH',
        help=
        'Path to find compacted/arrayized data; setting this will cause --path, --pattern to be ignored'
    )
    parser.add_argument('--n_components',
                        type=int,
                        default=40,
                        metavar='N_COMPONENTS',
                        help='Number of ICA/PCA/etc. components')
    parser.add_argument(
        '--method',
        type=str,
        default='ICA',
        metavar='METHOD',
        choices=['ICA', 'PCA', 'SPCA', 'NMF', 'ISO', 'KPCA', 'FA', 'DL'],
        help='Which dim. reduction method to use')
    parser.add_argument(
        '--scale',
        action='store_true',
        help=
        'Should inputs variance be scaled?  Defaults to mean subtract and value scale, but w/out this does not scale variance.'
    )
    parser.add_argument('--no_scale',
                        action='store_true',
                        help='Suppresses all scaling')
    parser.add_argument(
        '--ivar_cutoff',
        type=float,
        default=0.001,
        metavar='IVAR_CUTOFF',
        help='data with inverse variace below cutoff is masked as if ivar==0')
    parser.add_argument(
        '--n_iter',
        type=int,
        default=1200,
        metavar='MAX_ITER',
        help=
        'Maximum number of iterations to allow for convergence.  For SDSS data 1000 is a safe number of ICA, while SPCA requires larger values e.g. ~2000 to ~2500'
    )
    parser.add_argument('--n_jobs',
                        type=int,
                        default=None,
                        metavar='N_JOBS',
                        help='N_JOBS')
    args = parser.parse_args()

    comb_flux_arr, comb_exposure_arr, comb_ivar_arr, comb_masks, comb_wavelengths = iz.load_data(
        args)
    model = iz.get_model(args.method,
                         n=args.n_components,
                         n_neighbors=None,
                         max_iter=args.n_iter,
                         random_state=iz.random_state,
                         n_jobs=args.n_jobs)

    ss = None
    if args.no_scale:
        scaled_flux_arr = comb_flux_arr
    else:
        ss = skpp.StandardScaler(with_std=False)
        if args.scale:
            ss = skpp.StandardScaler(with_std=True)
            scaled_flux_arr = ss.fit_transform(comb_flux_arr)

    #Heavily copied from J. Vanderplas/astroML bayesian_blocks.py
    N = comb_wavelengths.size
    step = args.n_components * 4

    edges = np.concatenate([
        comb_wavelengths[:1:step],
        0.5 * (comb_wavelengths[1::step] + comb_wavelengths[:-1:step]),
        comb_wavelengths[-1::step]
    ])
    block_length = comb_wavelengths[-1::step] - edges

    # arrays to store the best configuration
    nn_vec = np.ones(N / step) * step
    best = np.zeros(N, dtype=float)
    last = np.zeros(N, dtype=int)

    for R in range(N / step):
        print("R: " + str(R))

        width = block_length[:R + 1] - block_length[R + 1]
        count_vec = np.cumsum(nn_vec[:R + 1][::-1])[::-1]

        #width = nn_vec[:R + 1] - nn_vec[R + 1]
        #count_vec = np.cumsum(nn_vec[:R + 1][::-1])[::-1]

        #print(width)
        #print(count_vec)
        #raw_input("Pausing... ")

        fit_vec = map(
            lambda n: iz.score_via_CV(['LL'],
                                      scaled_flux_arr[:, :n],
                                      model,
                                      ss,
                                      args.method,
                                      folds=3,
                                      n_jobs=args.n_jobs), count_vec)
        fit_vec = [d["mle"] for d in fit_vec]

        #print(fit_vec)
        fit_vec[1:] += best[:R]
        #print(fit_vec)

        i_max = np.argmax(fit_vec)
        last[R] = i_max
        best[R] = fit_vec[i_max]

        #print(best)

    change_points = np.zeros(N / step, dtype=int)
    i_cp = N / step
    ind = N / step
    while True:
        i_cp -= 1
        change_points[i_cp] = ind
        if ind == 0:
            break
        ind = last[ind - 1]
    change_points = change_points[i_cp:]

    print(edges[change_points])
    '''
def main():
    """Tune and score a model predicting spectra components from metadata.

    Parses the command line, loads the trimmed observation metadata and the
    serialized dimensionality-reduction output for ``--method``, and puts
    both in exposure-id order so rows of ``X_arr`` (metadata) line up with
    rows of ``Y_arr`` (reduced sources).

    For the ``compare`` sub-command it runs a hyper-parameter search
    (``GridSearchCV`` for ``GP``, ``RandomizedSearchCV`` otherwise), prints
    the best score, parameters and estimator, and optionally:

    * ``--outputfbk``: prints ``grid_scores_`` from the search,
    * ``--save_best``: saves the best estimator to ``--model_path``,
    * ``--plot_final_errors``: refits the best estimator on every CV
      training split and plots the absolute difference between the actual
      spectra and the back-transformed predictions of the test split.

    The raw spectra are loaded when either ``--use_spectra`` or
    ``--plot_final_errors`` is given, because both need them.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description='Build and test models based on dim reductions and provided spectra'
    )
    subparsers = parser.add_subparsers(dest='subparser_name')
    # Sub-commands are optional by default on Python 3.  Without one, the
    # compare-only options (e.g. --use_spectra) never land on ``args`` and
    # the code below would fail with AttributeError instead of a usage error.
    subparsers.required = True

    parser.add_argument(
        '--metadata_path', type=str, default='.', metavar='PATH',
        help='Metadata path to work from, if not ''.'''
    )
    parser.add_argument(
        '--spectra_path', type=str, default='.', metavar='PATH',
        help='Spectra path to work from, if not ''.'''
    )
    parser.add_argument(
        '--method', type=str, default='ICA', metavar='METHOD',
        help='Dim reduction method to load data for'
    )
    parser.add_argument(
        '--n_jobs', type=int, default=1, metavar='N_JOBS',
        help='N_JOBS'
    )
    parser.add_argument(
        '--model', type=str, choices=['ET', 'RF', 'GP', 'KNN', 'SVR'], default='ET',
        help='Which model type to use: ET (Extra Trees), RF (Random Forest), GP (Gaussian Process), KNN, or SVR (Support Vector Regression)'
    )
    parser.add_argument(
        '--load_model', action='store_true',
        help='Whether or not to load the model from --model_path'
    )
    parser.add_argument(
        '--model_path', type=str, default='model.pkl', metavar='MODEL_PATH',
        help='COMPLETE path from which to load a model'
    )
    parser.add_argument(
        '--metadata_flags', type=str, default='', metavar='METADATA_FLAGS',
        help='Flags specifying observational metadata pre-processing, e.g. LUNAR_MAG which takes the '\
            'magnitude and linearizes it (ignoring that it is an area magnitude)'
    )
    parser.add_argument(
        '--compacted_path', type=str, default=None, metavar='COMPATED_PATH',
        help='Path to find compacted/arrayized data; setting this will cause --path, --pattern to be ignored'
    )

    parser_compare = subparsers.add_parser('compare')
    parser_compare.add_argument(
        '--folds', type=int, default=3, metavar='TEST_FOLDS',
        help='Do k-fold cross validation with specified number of folds.  Defaults to 3.'
    )
    parser_compare.add_argument(
        '--iters', type=int, default=50, metavar='HYPER_FIT_ITERS',
        help='Number of iterations when fitting hyper-params'
    )
    parser_compare.add_argument(
        '--outputfbk', action='store_true',
        help='If set, outputs \'grid_scores_\' data from RandomizedSearchCV'
    )
    parser_compare.add_argument(
        '--save_best', action='store_true',
        help='Whether or not to save the (last/best) model built for e.g. --hyper_fit'
    )
    parser_compare.add_argument(
        '--scorer', type=str, choices=['R2', 'MAE', 'MSE', 'LL', 'EXP_VAR', 'MAPED', 'MSEMV'], default='R2',
        help='Which scoring method to use to determine ranking of model instances.'
    )
    parser_compare.add_argument(
        '--use_spectra', action='store_true',
        help='Whether scoring is done against the DM components or the predicted spectra'
    )
    parser_compare.add_argument(
        '--ivar_cutoff', type=float, default=0.001, metavar='IVAR_CUTOFF',
        help='data with inverse variace below cutoff is masked as if ivar==0'
    )
    parser_compare.add_argument(
        '--plot_final_errors', action='store_true',
        help='If set, will plot the errors from the final/best model, for the whole dataset, from ' + \
            'the best model re-trained on CV folds used for testing.' + \
            'Plots all errors on top of each other with low-ish alpha, to give a kind of visual ' + \
            'density map of errors.'
    )

    args = parser.parse_args()

    obs_metadata = trim_observation_metadata(load_observation_metadata(args.metadata_path, flags=args.metadata_flags))
    sources, components, exposures, wavelengths = ICAize.deserialize_data(args.spectra_path, args.method)
    source_model, ss, model_args = ICAize.unpickle_model(args.spectra_path, args.method)

    # The raw spectra feed the spectra-based scorers AND the final error
    # plot.  Loading them only for --use_spectra left the plot step indexing
    # into None after the whole search had already run.
    need_spectra = args.use_spectra or args.plot_final_errors

    comb_flux_arr, comb_exposure_arr, comb_wavelengths = None, None, None
    if need_spectra:
        comb_flux_arr, comb_exposure_arr, comb_ivar_arr, comb_masks, comb_wavelengths = ICAize.load_data(args)

        # Keep only exposures that have reduced sources ...
        filter_arr = np.in1d(comb_exposure_arr, exposures)
        comb_flux_arr = comb_flux_arr[filter_arr]
        comb_exposure_arr = comb_exposure_arr[filter_arr]

        # ... and order them by exposure id, matching X_arr / Y_arr below.
        sorted_inds = np.argsort(comb_exposure_arr)
        comb_flux_arr = comb_flux_arr[sorted_inds]
        comb_exposure_arr = comb_exposure_arr[sorted_inds]

        # Not used past this point; drop the references early.
        del comb_ivar_arr
        del comb_masks

    reduced_obs_metadata = obs_metadata[np.in1d(obs_metadata['EXP_ID'], exposures)]
    reduced_obs_metadata.sort('EXP_ID')
    sorted_inds = np.argsort(exposures)

    # EXP_ID was only needed for alignment; it is not a predictor.
    reduced_obs_metadata.remove_column('EXP_ID')
    md_len = len(reduced_obs_metadata)
    # NOTE(review): the 'f8' view assumes every remaining column is float64.
    X_arr = np.array(reduced_obs_metadata).view('f8').reshape((md_len,-1))
    Y_arr = sources[sorted_inds]

    if args.load_model:
        predictive_model = load_model(args.model_path)
    else:
        predictive_model = get_model(args.model)

    if args.subparser_name == 'compare':
        pdist = get_param_distribution_for_model(args.model, args.iters)

        def build_scorer(metric, greater_is_better=False):
            # With --use_spectra the metric is bound to the data it needs to
            # compare against the actual spectra rather than the components.
            if args.use_spectra:
                metric = partial(metric, Y_full=Y_arr, flux_arr=comb_flux_arr,
                            source_model=source_model, ss=ss,
                            source_model_args=model_args, method=args.method)
            return make_scorer(metric, greater_is_better=greater_is_better)

        # 'LL' (and anything unexpected) keeps scorer=None, i.e. the search
        # falls back to the estimator's own score method.
        scorer = None
        if args.scorer == 'R2':
            # R2 is always scored on the components, even with --use_spectra.
            scorer = make_scorer(R2)
        elif args.scorer == 'MAE':
            scorer = build_scorer(MAE)
        elif args.scorer == 'MSE':
            scorer = build_scorer(MSE)
        elif args.scorer == 'MSEMV':
            scorer = build_scorer(MSEMV)
        elif args.scorer == 'EXP_VAR':
            scorer = build_scorer(EXP_VAR, greater_is_better=True)
        elif args.scorer == 'MAPED':
            scorer = build_scorer(MAPED)

        folder = ShuffleSplit(exposures.shape[0], n_iter=args.folds, test_size=1.0/args.folds,
                            random_state=12345)

        if args.model == 'GP':
            predictive_model.random_start = args.folds
            # NOTE(review): this branch uses cv=3 rather than `folder`, so
            # --folds does not control the CV splits for GP -- confirm intent.
            rcv = GridSearchCV(predictive_model, param_grid=pdist,
                            error_score=0, cv=3, n_jobs=args.n_jobs,
                            scoring=scorer)
        else:
            rcv = RandomizedSearchCV(predictive_model, param_distributions=pdist,
                            n_iter=args.iters, cv=folder, n_jobs=args.n_jobs,
                            scoring=scorer)

        # This is going to fit X (metdata) to Y (DM'ed sources).  But there are
        # really two tests here:  how well hyperparams fit/predict the sources
        # and how well they fit/predict the actual source spectra.  Until I know
        # better, I 'm going to need to build a way to test both.
        rcv.fit(X_arr, Y_arr)

        print(rcv.best_score_)
        print(rcv.best_params_)
        print(rcv.best_estimator_)
        if args.outputfbk:
            print("=+"*10 + "=")
            for val in rcv.grid_scores_:
                print(val)
            print("=+"*10 + "=")

        if args.save_best:
            save_model(rcv.best_estimator_, args.model_path)

        if args.plot_final_errors:
            for train_inds, test_inds in folder:
                rcv.best_estimator_.fit(X_arr[train_inds], Y_arr[train_inds])
                predicted = rcv.best_estimator_.predict(X_arr[test_inds])
                back_trans_flux = ICAize.inverse_transform(predicted, source_model, ss, args.method, model_args)
                diffs = np.abs(comb_flux_arr[test_inds] - back_trans_flux)
                # plot() draws one line per column of a 2-D y array, so the
                # transposed error matrix gives one faint line per spectrum.
                plt.plot(comb_wavelengths, diffs.T, 'b-', alpha=0.01)
            plt.show()
Example #22
0
def main():
    """Cross-validate dimensionality-reduction methods over component counts.

    Builds the command-line interface (common options plus the ``compare``
    and ``build`` sub-commands), loads the stacked spectra via
    ``iz.load_data`` and optionally mean-centres them.  Only ``compare`` does
    any work in this function: for each requested method it cross-validates
    the model at every component count from ``--min_components`` to
    ``--max_components`` in steps of ``--step_size`` and plots the resulting
    scores.  'mle' scores go on a secondary y-axis, together with a
    Ledoit-Wolf covariance reference line when ``--mle_if_avail`` applies.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description='Compute PCA/ICA/NMF/etc. components over set of stacked spectra, save those out, and pickle model'
    )
    commands = parser.add_subparsers(dest='subparser_name')

    # ---- options shared by every sub-command ----
    parser.add_argument(
        '--pattern',
        type=str,
        default='stacked*exp??????.*',
        metavar='PATTERN',
        help='File pattern for stacked sky fibers.'
    )
    parser.add_argument(
        '--path',
        type=str,
        default='.',
        metavar='PATH',
        help='Path to work from, if not ''.'''
    )
    parser.add_argument(
        '--compacted_path',
        type=str,
        default=None,
        metavar='COMPATED_PATH',
        help='Path to find compacted/arrayized data; setting this will cause --path, --pattern to be ignored'
    )
    parser.add_argument(
        '--method',
        type=str,
        default=['ICA'],
        metavar='METHOD',
        choices=['ICA', 'PCA', 'SPCA', 'NMF', 'ISO', 'KPCA', 'FA', 'DL'],
        nargs='+',
        help='Which dim. reduction method to use'
    )
    parser.add_argument(
        '--scale',
        action='store_true',
        help='Should inputs be scaled?  Will mean subtract and value scale, but does not scale variace.'
    )
    parser.add_argument(
        '--ivar_cutoff',
        type=float,
        default=0.001,
        metavar='IVAR_CUTOFF',
        help='data with inverse variace below cutoff is masked as if ivar==0'
    )
    parser.add_argument(
        '--n_iter',
        type=int,
        default=1200,
        metavar='MAX_ITER',
        help='Maximum number of iterations to allow for convergence.  For SDSS data 1000 is a safe number of ICA, while SPCA requires larger values e.g. ~2000 to ~2500'
    )
    parser.add_argument(
        '--n_jobs',
        type=int,
        default=None,
        metavar='N_JOBS',
        help='N_JOBS'
    )

    # ---- "compare": sweep the number of components ----
    compare_cmd = commands.add_parser('compare')
    compare_cmd.add_argument(
        '--max_components',
        type=int,
        default=50,
        metavar='COMP_MAX',
        help='Max number of components to use/test'
    )
    compare_cmd.add_argument(
        '--min_components',
        type=int,
        default=0,
        metavar='COMP_MIN',
        help='Min number of compoenents to use/test'
    )
    compare_cmd.add_argument(
        '--step_size',
        type=int,
        default=5,
        metavar='COMP_STEP',
        help='Step size from comp_min to comp_max'
    )
    compare_cmd.add_argument(
        '--comparison',
        choices=['EXP_VAR', 'R2', 'MSE', 'MAE'],
        nargs='*',
        default=['EXP_VAR'],
        help='Comparison methods: Explained variance (score), R2 (score), mean sq. error (loss), MEDIAN absolute error (loss)'
    )
    compare_cmd.add_argument(
        '--mle_if_avail',
        action='store_true',
        help='In additon to --comparison, include MLE if PCA or FA methods specified'
    )
    compare_cmd.add_argument(
        '--plot_example_reconstruction',
        action='store_true',
        help='Pick a random spectrum, plot its actual and reconstructed versions'
    )

    # ---- "build": options are parsed, but nothing below acts on them ----
    build_cmd = commands.add_parser('build')
    build_cmd.add_argument(
        '--n_components',
        type=int,
        default=40,
        metavar='N_COMPONENTS',
        help='Number of ICA/PCA/etc. components'
    )
    build_cmd.add_argument(
        '--n_neighbors',
        type=int,
        default=10,
        metavar='N_NEIGHBORS',
        help='Number of neighbots for e.g. IsoMap'
    )

    args = parser.parse_args()

    # Only the flux array is used below; the other four results are unpacked
    # to keep the five-element contract with iz.load_data explicit.
    comb_flux_arr, comb_exposure_arr, comb_ivar_arr, comb_masks, comb_wavelengths = iz.load_data(args)

    # Dictionary learning ('DL') gets an explicit float64 copy of the fluxes.
    flux_arr = comb_flux_arr.astype(dtype=np.float64) if 'DL' in args.method else comb_flux_arr

    # --scale mean-centres only (with_std=False); otherwise use the data as is.
    ss = None
    scaled_flux_arr = flux_arr
    if args.scale:
        ss = skpp.StandardScaler(with_std=False)
        scaled_flux_arr = ss.fit_transform(flux_arr)

    if args.subparser_name != 'compare':
        return

    fig, ax1 = plt.subplots()
    ax2 = ax1.twinx()

    for method in args.method:
        model = iz.get_model(method, max_iter=args.n_iter, random_state=iz.random_state, n_jobs=args.n_jobs)
        mles_and_covs = args.mle_if_avail and method in ('FA', 'PCA')
        # NMF is always fed the unscaled fluxes.
        cv_input = flux_arr if method == 'NMF' else scaled_flux_arr

        scores = {}
        n_components = np.arange(args.min_components, args.max_components+1, args.step_size)
        for n in n_components:
            print("Cross validating for n=" + str(n) + " on method " + method)

            model.n_components = n

            comparisons = iz.score_via_CV(args.comparison,
                                cv_input,
                                model, method, n_jobs=args.n_jobs, include_mle=mles_and_covs,
                                modeler=_iter_modeler, scorer=_iter_scorer)
            # Collect one score list per metric, ordered like n_components.
            for key, val in comparisons.items():
                scores.setdefault(key, []).append(val)

        if mles_and_covs:
            #ax2.axhline(cov_mcd_score(scaled_flux_arr, args.scale), color='violet', label='MCD Cov', linestyle='--')
            ax2.axhline(cov_lw_score(scaled_flux_arr, args.scale), color='orange', label='LW Cov', linestyle='--')

        for key, score_list in scores.items():
            if key == 'mle':
                # 'mle' scores go on the secondary axis.
                ax2.plot(n_components, score_list, '-.', label=method + ' mle scores')
            else:
                ax1.plot(n_components, score_list, label=method + ':' + key + ' scores')

    ax1.set_xlabel('nb of components')
    ax1.set_ylabel('CV scores', figure=fig)

    ax1.legend(loc='lower left')
    ax2.legend(loc='lower right')

    plt.show()
def load_plot_etc_target_type(metadata_path, spectra_path, test_inds, target_type, no_plot=False,
				save_out=False, restrict_delta=False, use_spca=False, use_pca=False):
    obs_metadata = trim_observation_metadata(load_observation_metadata(metadata_path))
    if use_filter_split:
        c_sources, c_mixing, c_exposures, c_wavelengths, c_filter_split_arr = load_spectra_data(spectra_path,
						target_type=target_type, filter_str='nonem', use_spca=use_spca, use_pca=use_pca)
        c_sources_e, c_mixing_e, c_exposures_e, c_wavelengths_e, c_filter_split_arr_e = load_spectra_data(spectra_path,
						target_type=target_type, filter_str='em', use_spca=use_spca, use_pca=use_pca)
    else:
        c_sources, c_mixing, c_exposures, c_wavelengths, c_filter_split_arr = load_spectra_data(spectra_path,
						target_type=target_type, filter_str='both', use_spca=use_spca, use_pca=use_pca)

    reduced_obs_metadata = obs_metadata[np.in1d(obs_metadata['EXP_ID'], c_exposures)]
    reduced_obs_metadata.sort('EXP_ID')
    sorted_inds = np.argsort(c_exposures)
    if use_filter_split:
        sorted_e_inds = np.argsort(c_exposures_e)

    if not linear_only:
        if reg_type == 'etr':
            rfr = ensemble.ExtraTreesRegressor(n_estimators=n_estimators, min_samples_split=min_samples_split,
                        random_state=rfr_random_state, n_jobs=-1, verbose=False, bootstrap=bootstrap)
            if use_filter_split:
                rfr_e = ensemble.ExtraTreesRegressor(n_estimators=n_estimators, min_samples_split=min_samples_split,
                        random_state=rfr_random_state, n_jobs=-1, verbose=False, bootstrap=bootstrap)
        else:
            rfr = ensemble.RandomForestRegressor(n_estimators=n_estimators, min_samples_split=min_samples_split,
                        random_state=rfr_random_state, n_jobs=-1, verbose=False, bootstrap=bootstrap)
            if use_filter_split:
                rfr_e = ensemble.RandomForestRegressor(n_estimators=n_estimators, min_samples_split=min_samples_split,
                        random_state=rfr_random_state, n_jobs=-1, verbose=False, bootstrap=bootstrap)
        if include_knn:
            knn = neighbors.KNeighborsRegressor(weights='distance', n_neighbors=10, p=64)
            if use_filter_split:
                knn_e = neighbors.KNeighborsRegressor(weights='distance', n_neighbors=10, p=64)

    if include_linear:
        linear = Linear(fit_intercept=True, copy_X=True, n_jobs=-1)
        poly_2_linear = Pipeline([('poly', PolynomialFeatures(degree=2)),
                            ('linear', Linear(fit_intercept=True, copy_X=True, n_jobs=-1))])
        poly_3_linear = Pipeline([('poly', PolynomialFeatures(degree=3)),
                        ('linear', Linear(fit_intercept=True, copy_X=True, n_jobs=-1))])
        poly_4_linear = Pipeline([('poly', PolynomialFeatures(degree=4)),
                        ('linear', Linear(fit_intercept=True, copy_X=True, n_jobs=-1))])
        if use_filter_split:
            linear_e = Linear(fit_intercept=True, copy_X=True, n_jobs=-1)
            poly_2_linear_e = Pipeline([('poly', PolynomialFeatures(degree=2)),
                            ('linear', Linear(fit_intercept=True, copy_X=True, n_jobs=-1))])
            poly_3_linear_e = Pipeline([('poly', PolynomialFeatures(degree=3)),
                        ('linear', Linear(fit_intercept=True, copy_X=True, n_jobs=-1))])
            poly_4_linear_e = Pipeline([('poly', PolynomialFeatures(degree=4)),
                        ('linear', Linear(fit_intercept=True, copy_X=True, n_jobs=-1))])

    reduced_obs_metadata.remove_column('EXP_ID')
    md_len = len(reduced_obs_metadata)
    var_count = len(reduced_obs_metadata.columns)
    X_arr = np.array(reduced_obs_metadata).view('f8').reshape((md_len,-1))

    ica = None
    if not use_spca and not use_pca:
        if use_filter_split:
            ica = ICAize.unpickle_FastICA(path=spectra_path, target_type=target_type, filter_str='nonem')
            ica_e = ICAize.unpickle_FastICA(path=spectra_path, target_type=target_type, filter_str='em')
        else:
            ica = ICAize.unpickle_FastICA(path=spectra_path, target_type=target_type, filter_str='both')
    elif use_spca:
        ica = ICAize.unpickle_SPCA(path=spectra_path, target_type=target_type)
    else:
        if use_filter_split:
            ica = ICAize.unpickle_PCA(path=spectra_path, target_type=target_type, filter_str='nonem')
            ica_e = ICAize.unpickle_PCA(path=spectra_path, target_type=target_type, filter_str='em')
        else:
            ica = ICAize.unpickle_PCA(path=spectra_path, target_type=target_type, filter_str='both')

    spectra_dir_list = os.listdir(spectra_path)

    ################################################################
    results = None
    for test_ind in test_inds:
        test_X = X_arr[test_ind]
        train_X = np.vstack( [X_arr[:test_ind], X_arr[test_ind+1:]] )
        test_y =  (c_sources[sorted_inds])[test_ind]
        train_y = np.vstack( [(c_sources[sorted_inds])[:test_ind], (c_sources[sorted_inds])[test_ind+1:]] )
        if use_filter_split:
            test_y_e =  (c_sources_e[sorted_e_inds])[test_ind]
            train_y_e = np.vstack( [(c_sources_e[sorted_e_inds])[:test_ind], (c_sources_e[sorted_e_inds])[test_ind+1:]] )

        if scale:
            scaler = StandardScaler(with_std=scale_std)
            train_X = scaler.fit_transform(train_X)
            test_X = scaler.transform(test_X)

        title_str = "exp{}, {}".format(c_exposures[sorted_inds[test_ind]], target_type)

        if not linear_only:
            rfr.fit(X=train_X, y=train_y)
            if use_filter_split:
                rfr_e.fit(X=train_X, y=train_y_e)
            if include_knn:
                knn.fit(X=train_X, y=train_y)
                if user_filter_split:
                    knn_e.fit(X=train_X, y=train_y_e)

        if include_linear:
            linear.fit(train_X, train_y)
            poly_2_linear.fit(train_X, train_y)
            if order_3:
                poly_3_linear.fit(train_X, train_y)
            if order_4:
                poly_4_linear.fit(train_X, train_y)
        if use_filter_split and include_linear:
            linear_e.fit(train_X, train_y_e)
            poly_2_linear_e.fit(train_X, train_y_e)
            if order_3:
                poly_3_linear_e.fit(train_X, train_y_e)
            if order_4:
                poly_4_linear_e.fit(train_X, train_y_e)

        print test_ind, c_exposures[sorted_inds[test_ind]],

        data = None
        actual = None
        mask = None
        delta_mask = None
        ivar = None

        for file in spectra_dir_list:
            if fnmatch.fnmatch(file, "stacked_sky_*exp{}.csv".format(c_exposures[sorted_inds[test_ind]])):
                data = Table.read(os.path.join(spectra_path, file), format="ascii.csv")
                ivar = data['ivar']
                mask = (data['ivar'] == 0)
                delta_mask = mask.copy()
                if restrict_delta:
                    if restrict_color == 'blue':
                        delta_mask[2700:] = True
                    else:
                        delta_mask[:2700] = True

                actual = data['flux']
                break
        if actual is None:
            continue

        if not linear_only:
            rfr_prediction = rfr.predict(test_X)
            if not use_spca and not use_pca:
                rfr_predicted = ica.inverse_transform(rfr_prediction, copy=True)
            else:
                rfr_predicted = np.zeros( (1, ica.components_.shape[1]) )
                rfr_predicted[0,:] = np.sum(rfr_prediction.T * ica.components_, 0)

            if use_filter_split:
                rfr_e_prediction = rfr_e.predict(test_X)
                if not use_spca and not use_pca:
                    rfr_e_predicted = ica_e.inverse_transform(rfr_e_prediction, copy=True)
                else:
                    rfr_e_predicted = np.zeros( (1, ica_e.components_.shape[1]) )
                    rfr_e_predicted[0,:] = np.sum(rfr_e_prediction.T * ica_e.components_, 0)
                rfr_predicted = rfr_predicted + rfr_e_predicted

            rfr_delta = rfr_predicted[0] - actual
            if not no_plot:
                plt.plot(c_wavelengths[~mask], rfr_predicted[0][~mask])
                plt.plot(c_wavelengths[~mask], actual[~mask])
                plt.plot(c_wavelengths[~mask], rfr_delta[~mask])
            if not no_plot:
                plt.plot(c_wavelengths, [0]*len(c_wavelengths))
            err_term = np.sum(np.power(rfr_delta[~delta_mask], 2))/len(c_wavelengths[~delta_mask])
            err_sum = np.sum(rfr_delta[~delta_mask])/len(rfr_delta[~delta_mask])
            red_chi = np.sum(np.power(rfr_delta[~delta_mask], 2)*ivar[~delta_mask])/(len(c_wavelengths[~delta_mask])-var_count-1)
            if not no_plot:
                plt.legend(['Predicted', 'Actual', 'Delta {:0.5f}'.format(err_term)])
                plt.tight_layout()
                plt.title("Random Forest Regressor: {}".format(title_str))
                plt.show()
                plt.close()
            print err_term, red_chi, err_sum,

            if include_knn:
                knn_prediction = knn.predict(test_X)
                if not use_spca and not use_pca:
                    knn_predicted = ica.inverse_transform(knn_prediction, copy=True)
                else:
                    knn_predicted = np.zeros( (1, ica.components_.shape[1]) )
                    knn_predicted[0,:] = np.sum(knn_prediction.T * ica.components_, 0)

                if use_filter_split:
                    knn_e_prediction = knn_e.predict(test_X)
                    if not use_spca and not use_pca:
                        knn_e_predicted = ica_e.inverse_transform(knn_e_prediction, copy=True)
                    else:
                        knn_e_predicted = np.zeros( (1, ica_e.components_.shape[1]) )
                        knn_e_predicted[0,:] = np.sum(knn_e_prediction.T * ica_e.components_, 0)
                    knn_predicted = knn_predicted + knn_e_predicted

                if not no_plot:
                    plt.plot(c_wavelengths[~mask], knn_predicted[0][~mask])
                    plt.plot(c_wavelengths[~mask], actual[~mask])
                knn_delta = knn_predicted[0] - actual
                err_term = np.sum(np.power(knn_delta[~delta_mask], 2))/len(c_wavelengths[~delta_mask])
                err_sum = np.sum(knn_delta[~delta_mask])/len(knn_delta[~delta_mask])
                red_chi = np.sum(np.power(knn_delta[~delta_mask], 2)*ivar[~delta_mask])/(len(c_wavelengths[~delta_mask])-var_count-1)

                if not no_plot:
                    plt.plot(c_wavelengths[~mask], knn_delta[~mask])
                    plt.plot(c_wavelengths, [0]*len(c_wavelengths))
                    plt.legend(['Predicted', 'Actual', 'Delta {:0.5f}'.format(err_term)])
                    plt.tight_layout()
                    plt.title("Good 'ol K-NN: {}".format(title_str))
                    plt.show()
                    plt.close()
                print err_term, red_chi, err_sum,

        if include_linear:
            poly_1_prediction = linear.predict(test_X)
            if not use_spca and not use_pca:
                poly_1_predicted = ica.inverse_transform(poly_1_prediction, copy=True)
            else:
                poly_1_predicted = np.zeros( (1, ica.components_.shape[1]) )
                poly_1_predicted[0,:] = np.sum(poly_1_prediction.T * ica.components_, 0)

            if use_filter_split:
                poly_1_e_prediction = linear.predict(test_X)
                if not use_spca and not use_pca:
                    poly_1_e_predicted = ica_e.inverse_transform(poly_1_e_prediction, copy=True)
                else:
                    poly_1_e_predicted = np.zeros( (1, ica_e.components_.shape[1]) )
                    poly_1_e_predicted[0,:] = np.sum(poly_1_e_prediction.T * ica_e.components_, 0)
                poly_1_predicted = poly_1_predicted + poly_1_e_predicted

            poly_1_delta = poly_1_predicted[0] - actual

            if not no_plot:
                plt.plot(c_wavelengths[~mask], poly_1_predicted[0][~mask])
                plt.plot(c_wavelengths[~mask], actual[~mask])
            err_term = np.sum(np.power(poly_1_delta[~delta_mask], 2))/len(c_wavelengths[~delta_mask])
            err_sum = np.sum(poly_1_delta[~delta_mask])/len(poly_1_delta[~delta_mask])
            red_chi = np.sum(np.power(poly_1_delta[~delta_mask], 2)*ivar[~delta_mask])/(len(c_wavelengths[~delta_mask])-var_count-1)

            if not no_plot:
                plt.plot(c_wavelengths[~mask], poly_1_delta[~mask])
                plt.plot(c_wavelengths, [0]*len(c_wavelengths))
                plt.legend(['Predicted', 'Actual', 'Delta {:0.5f}'.format(err_term)])
                plt.tight_layout()
                plt.title("Poly 1: {}".format(title_str))
                plt.show()
                plt.close()

            print err_term, red_chi, err_sum,

            poly_2_prediction = poly_2_linear.predict(test_X)
            if not use_spca and not use_pca:
                poly_2_predicted = ica.inverse_transform(poly_2_prediction, copy=True)
            else:
                poly_2_predicted = np.zeros( (1, ica.components_.shape[1]) )
                poly_2_predicted[0,:] = np.sum(poly_2_prediction.T * ica.components_, 0)

            poly_2_delta = poly_2_predicted[0] - actual

            if not no_plot:
                plt.plot(c_wavelengths[~mask], poly_2_predicted[0][~mask])
                plt.plot(c_wavelengths[~mask], actual[~mask])
            err_term = np.sum(np.power(poly_2_delta[~delta_mask], 2))/len(c_wavelengths[~delta_mask])
            err_sum = np.sum(poly_2_delta[~delta_mask])/len(poly_2_delta[~delta_mask])
            red_chi = np.sum(np.power(poly_2_delta[~delta_mask], 2)*ivar[~delta_mask])/(len(c_wavelengths[~delta_mask])-var_count-1)

            if not no_plot:
                plt.plot(c_wavelengths[~mask], poly_2_delta[~mask])
                plt.plot(c_wavelengths, [0]*len(c_wavelengths))
                plt.legend(['Predicted', 'Actual', 'Delta {:0.5f}'.format(err_term)])
                plt.tight_layout()
                plt.title("Poly 2: {}".format(title_str))
                plt.show()
                plt.close()

            print err_term, red_chi, err_sum,
            err_ind =+ 1

            if order_3:
                poly_3_prediction = poly_3_linear.predict(test_X)
                if not use_spca and not use_pca:
                    poly_3_predicted = ica.inverse_transform(poly_3_prediction, copy=True)
                else:
                    poly_3_predicted = np.zeros( (1, ica.components_.shape[1]) )
                    poly_3_predicted[0,:] = np.sum(poly_3_prediction.T * ica.components_, 0)

                poly_3_delta = poly_3_predicted[0] - actual

                if not no_plot:
                    plt.plot(c_wavelengths[~mask], poly_3_predicted[0][~mask])
                    plt.plot(c_wavelengths[~mask], actual[~mask])
                err_term = np.sum(np.power(poly_3_delta[~delta_mask], 2))/len(c_wavelengths[~delta_mask])
                err_sum = np.sum(poly_3_delta[~delta_mask])/len(poly_3_delta[~delta_mask])
                red_chi = np.sum(np.power(poly_3_delta[~delta_mask], 2)*ivar[~delta_mask])/(len(c_wavelengths[~delta_mask])-var_count-1)

                if not no_plot:
                    plt.plot(c_wavelengths[~mask], poly_3_delta[~mask])
                    plt.plot(c_wavelengths, [0]*len(c_wavelengths))
                    plt.legend(['Predicted', 'Actual', 'Delta {:0.5f}'.format(err_term)])
                    plt.tight_layout()
                    plt.title("Poly 3: {}".format(title_str))
                    plt.show()
                    plt.close()

                print err_term, red_chi, err_sum,
                err_ind =+ 1

            if order_4:
                poly_4_prediction = poly_4_linear.predict(test_X)
                if not use_spca and not use_pca:
                    poly_4_predicted = ica.inverse_transform(poly_4_prediction, copy=True)
                else:
                    poly_4_predicted = np.zeros( (1, ica.components_.shape[1]) )
                    poly_4_predicted[0,:] = np.sum(poly_4_prediction.T * ica.components_, 0)

                poly_4_delta = poly_4_predicted[0] - actual

                if not no_plot:
                    plt.plot(c_wavelengths[~mask], poly_4_predicted[0][~mask])
                    plt.plot(c_wavelengths[~mask], actual[~mask])
                err_term = np.sum(np.power(poly_4_delta[~delta_mask], 2))/len(c_wavelengths[~delta_mask])
                err_sum = np.sum(poly_4_delta[~delta_mask])/len(poly_4_delta[~delta_mask])
                red_chi = np.sum(np.power(poly_4_delta[~delta_mask], 2)*ivar[~delta_mask])/(len(c_wavelengths[~delta_mask])-var_count-1)

                if not no_plot:
                    plt.plot(c_wavelengths[~mask], poly_4_delta[~mask])
                    plt.plot(c_wavelengths, [0]*len(c_wavelengths))
                    plt.legend(['Predicted', 'Actual', 'Delta {:0.5f}'.format(err_term)])
                    plt.tight_layout()
                    plt.title("Poly 4: {}".format(title_str))
                    plt.show()
                    plt.close()

                print err_term, red_chi, err_sum,
                err_ind =+ 1

        print

        if save_out:
            out_table = Table()
            wavelength_col = Column(c_wavelengths, name="wavelength", dtype=float)
            out_table.add_columns([wavelength_col])

            if not linear_only:
                rf_col = Column(rfr_predicted[0], name="rf_flux", dtype=float)
                out_table.add_columns([rf_col])

                if include_knn:
                    knn_col = Column(knn_predicted[0], name="knn_flux", dtype=float)
                    avg_col = Column(avg_predicted[0], name="avg_flux", dtype=float)
                    out_table.add_columns([knn_col, avg_col])

            if include_linear:
                poly_1_col = Column(poly_1_predicted[0], name="poly_1_flux", dtype=float)
                poly_2_col = Column(poly_2_predicted[0], name="poly_2_flux", dtype=float)
                out_table.add_columns([poly_1_col, poly_2_col])
                if order_3:
                    poly_3_col = Column(poly_3_predicted[0], name="poly_3_flux", dtype=float)
                    out_table.add_columns([poly_3_col])
                if order_4:
                    poly_4_col = Column(poly_4_predicted[0], name="poly_4_flux", dtype=float)
                    out_table.add_columns([poly_4_col])

            mask_col = Column(~mask, name="mask_col", dtype=bool)
            out_table.add_columns([mask_col])

            out_table.write("predicted_sky_exp{}.csv".format(c_exposures[sorted_inds[test_ind]]), format="ascii.csv")
Example #24
0
import ICAize
import stack
import matplotlib.pyplot as plt
import numpy as np
import random_forest_spectra as rfs
import sklearn.metrics as sm
import sys
import os.path
import pickle

# Use the single command-line argument as `path` when exactly one is given;
# otherwise fall back to the current directory.
# NOTE(review): `path` is not read anywhere in this part of the script --
# presumably it is consumed further down; confirm.
path = sys.argv[1] if len(sys.argv) == 2 else '.'

# For each filter selection, load the pickled "combined" FastICA model and
# draw at most its first 25 components in a single figure.
for filter_name in ("both", "em"):
    fastica = ICAize.unpickle_FastICA(target_type="combined", filter_str=filter_name)
    n_shown = min(fastica.components_.shape[0], 25)
    for comp_i in range(n_shown):
        component = fastica.components_[comp_i]
        # Rescale so the largest absolute value becomes 2.4, then shift each
        # trace up by 5 per component index so the curves are stacked
        # vertically instead of being drawn on top of each other.
        scale_factor = 2.4/np.max(np.abs(component))
        plt.plot(stack.skyexp_wlen_out, (component*scale_factor)+(5*comp_i))
    plt.show()
    plt.close()

fastica = ICAize.unpickle_FastICA(target_type="combined", filter_str="nonem")
for comp_i in range(min(fastica.components_.shape[0], 25)):