예제 #1
0
def main():
    """Plot the components of a stored dimensionality-reduction model.

    The model is loaded either from a complete pickle path (``--file_path``)
    or from a spectra directory plus method name (``--spectra_path`` and
    ``--method``).  Each component returned by ``ize.get_components`` is
    plotted against ``stack.skyexp_wlen_out`` with a growing vertical offset
    so the traces are stacked instead of overlapping.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        'Build and test models based on dim reductions and provided spectra')
    parser.add_argument('--spectra_path',
                        type=str,
                        default='.',
                        metavar='PATH',
                        help="Spectra path to work from, if not '.'")
    parser.add_argument('--method',
                        type=str,
                        default='ICA',
                        metavar='METHOD',
                        help='Dim reduction method to load data for')
    parser.add_argument(
        '--file_path',
        type=str,
        default=None,
        metavar='FILE_PATH',
        help='COMPLETE path from which to load a dim reduction')

    args = parser.parse_args()

    # The scaler is returned alongside the model but is not needed for plotting.
    if args.file_path is not None:
        data_model, _scaler = ize.unpickle_model(filename=args.file_path)
    else:
        data_model, _scaler = ize.unpickle_model(path=args.spectra_path,
                                                 method=args.method)
    components = ize.get_components(args.method, data_model)

    offset = 0
    for i, comp_i in enumerate(components):
        negatives = comp_i[comp_i < 0]
        positives = comp_i[comp_i > 0]
        # Leave room below this trace for its negative excursion.  A component
        # with no negative values contributes nothing (np.max would raise on
        # the empty selection).
        if i > 0 and negatives.size:
            offset += np.max(np.abs(negatives)) * 1.2
        plt.plot(stack.skyexp_wlen_out, comp_i + offset)
        # Leave room above this trace for its positive excursion.
        if positives.size:
            offset += np.max(positives) * 1.2
    plt.show()
    plt.close()
예제 #2
0
def main():
    """Plot the components of a stored dimensionality-reduction model.

    The model is loaded either from a complete pickle path (``--file_path``)
    or from a spectra directory plus method name (``--spectra_path`` and
    ``--method``).  Each component returned by ``ize.get_components`` is
    plotted against ``stack.skyexp_wlen_out`` with a growing vertical offset
    so the traces are stacked instead of overlapping.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Build and test models based on dim reductions and provided spectra",
    )
    parser.add_argument(
        "--spectra_path", type=str, default=".", metavar="PATH", help="Spectra path to work from, if not '.'"
    )
    parser.add_argument(
        "--method", type=str, default="ICA", metavar="METHOD", help="Dim reduction method to load data for"
    )
    parser.add_argument(
        "--file_path",
        type=str,
        default=None,
        metavar="FILE_PATH",
        help="COMPLETE path from which to load a dim reduction",
    )

    args = parser.parse_args()

    # The scaler is returned alongside the model but is not needed for plotting.
    if args.file_path is not None:
        data_model, _scaler = ize.unpickle_model(filename=args.file_path)
    else:
        data_model, _scaler = ize.unpickle_model(path=args.spectra_path, method=args.method)
    components = ize.get_components(args.method, data_model)

    offset = 0
    for i, comp_i in enumerate(components):
        negatives = comp_i[comp_i < 0]
        positives = comp_i[comp_i > 0]
        # Leave room below this trace for its negative excursion.  A component
        # with no negative values contributes nothing (np.max would raise on
        # the empty selection).
        if i > 0 and negatives.size:
            offset += np.max(np.abs(negatives)) * 1.2
        plt.plot(stack.skyexp_wlen_out, comp_i + offset)
        # Leave room above this trace for its positive excursion.
        if positives.size:
            offset += np.max(positives) * 1.2
    plt.show()
    plt.close()
예제 #3
0
def animate_sky_spectra_for_coord(obs_time_start, obs_time_end, point_coord, lunar_metadata_file,
                    solar_metadata_file, sunspot_metadata_file, model_path, dm_path, dm_method):
    """Predict one sky spectrum per metadata tuple and build a label for each.

    The metadata tuples and dates come from ``get_sky_for_coord``.  Every
    tuple is fed to the predictive model loaded from ``model_path``; the
    prediction is mapped back through the dimensionality-reduction model
    loaded from ``dm_path``/``dm_method`` with ``iz.inverse_transform``.

    Returns:
        A pair ``(inv_spec, labels)``: the first row of each inverse-transformed
        prediction, and a label made of the matching date followed by the
        tuple's entries 3 and 2 (tagged "ALT" and "AZ" in the label text).
    """
    sky = get_sky_for_coord(
        obs_time_start, obs_time_end, point_coord,
        lunar_metadata_file, solar_metadata_file, sunspot_metadata_file)
    # Only the metadata tuples and their dates are used below.
    metadata_tups, dates, _lunar_data, _solar_data, _sunspot_data = sky

    model = rfs.load_model(model_path)
    dm, ss, model_args = iz.unpickle_model(path=dm_path, method=dm_method)

    inv_spec, labels = [], []
    for idx, row in enumerate(metadata_tups):
        # The model is given a single sample, hence the (1, n_features) shape.
        features = np.array(row).reshape(1, -1)
        prediction = model.predict(features)
        spectrum = iz.inverse_transform(prediction, dm, ss, dm_method, model_args)
        inv_spec.append(spectrum[0, :])

        stamp = dates[idx]
        alt_text = str(row[3])
        az_text = str(row[2])
        labels.append(stamp + "(ALT,AZ): (" + alt_text + ", " + az_text + ")")

    return inv_spec, labels
def _build_scorer(scorer_name, use_spectra, spectra_kwargs):
    """Return the scoring callable for the hyper-parameter search, or None.

    Args:
        scorer_name: Value of the ``--scorer`` option.
        use_spectra: When true, the metric is bound to ``spectra_kwargs`` with
            ``functools.partial`` before being wrapped.
        spectra_kwargs: Keyword arguments handed to the metric in that case.

    Returns:
        A ``make_scorer`` result, or ``None`` for 'LL' (or any other name
        without a metric), which leaves scoring to the search object.
    """
    if scorer_name == 'R2':
        # R2 is never bound to the spectra arguments.
        return make_scorer(R2)

    if scorer_name == 'MAE':
        metric, greater_is_better = MAE, False
    elif scorer_name == 'MSE':
        metric, greater_is_better = MSE, False
    elif scorer_name == 'MSEMV':
        metric, greater_is_better = MSEMV, False
    elif scorer_name == 'EXP_VAR':
        metric, greater_is_better = EXP_VAR, True
    elif scorer_name == 'MAPED':
        metric, greater_is_better = MAPED, False
    else:
        return None

    if use_spectra:
        metric = partial(metric, **spectra_kwargs)
    return make_scorer(metric, greater_is_better=greater_is_better)


def main():
    """Fit a predictive model from observation metadata to reduced sources.

    Loads the observation metadata and the stored dimensionality reduction,
    aligns both on exposure id, and obtains a predictive model (loaded from
    ``--model_path`` or created for ``--model``).  With the ``compare``
    sub-command it runs a hyper-parameter search, prints the best result and
    optionally saves the best estimator and plots per-fold absolute errors.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description='Build and test models based on dim reductions and provided spectra'
    )
    subparsers = parser.add_subparsers(dest='subparser_name')

    parser.add_argument(
        '--metadata_path', type=str, default='.', metavar='PATH',
        help="Metadata path to work from, if not '.'"
    )
    parser.add_argument(
        '--spectra_path', type=str, default='.', metavar='PATH',
        help="Spectra path to work from, if not '.'"
    )
    parser.add_argument(
        '--method', type=str, default='ICA', metavar='METHOD',
        help='Dim reduction method to load data for'
    )
    parser.add_argument(
        '--n_jobs', type=int, default=1, metavar='N_JOBS',
        help='N_JOBS'
    )
    parser.add_argument(
        '--model', type=str, choices=['ET', 'RF', 'GP', 'KNN', 'SVR'], default='ET',
        help='Which model type to use: ET (Extra Trees), RF (Random Forest), GP (Gaussian Process), KNN, or SVR (Support Vector Regression)'
    )
    parser.add_argument(
        '--load_model', action='store_true',
        help='Whether or not to load the model from --model_path'
    )
    parser.add_argument(
        '--model_path', type=str, default='model.pkl', metavar='MODEL_PATH',
        help='COMPLETE path from which to load a model'
    )
    parser.add_argument(
        '--metadata_flags', type=str, default='', metavar='METADATA_FLAGS',
        help='Flags specifying observational metadata pre-processing, e.g. LUNAR_MAG which takes the '
             'magnitude and linearizes it (ignoring that it is an area magnitude)'
    )
    parser.add_argument(
        '--compacted_path', type=str, default=None, metavar='COMPATED_PATH',
        help='Path to find compacted/arrayized data; setting this will cause --path, --pattern to be ignored'
    )

    parser_compare = subparsers.add_parser('compare')
    parser_compare.add_argument(
        '--folds', type=int, default=3, metavar='TEST_FOLDS',
        help='Do k-fold cross validation with specified number of folds.  Defaults to 3.'
    )
    parser_compare.add_argument(
        '--iters', type=int, default=50, metavar='HYPER_FIT_ITERS',
        help='Number of iterations when fitting hyper-params'
    )
    parser_compare.add_argument(
        '--outputfbk', action='store_true',
        help='If set, outputs \'grid_scores_\' data from RandomizedSearchCV'
    )
    parser_compare.add_argument(
        '--save_best', action='store_true',
        help='Whether or not to save the (last/best) model built for e.g. --hyper_fit'
    )
    parser_compare.add_argument(
        '--scorer', type=str, choices=['R2', 'MAE', 'MSE', 'LL', 'EXP_VAR', 'MAPED', 'MSEMV'], default='R2',
        help='Which scoring method to use to determine ranking of model instances.'
    )
    parser_compare.add_argument(
        '--use_spectra', action='store_true',
        help='Whether scoring is done against the DM components or the predicted spectra'
    )
    parser_compare.add_argument(
        '--ivar_cutoff', type=float, default=0.001, metavar='IVAR_CUTOFF',
        help='data with inverse variace below cutoff is masked as if ivar==0'
    )
    parser_compare.add_argument(
        '--plot_final_errors', action='store_true',
        help='If set, will plot the errors from the final/best model, for the whole dataset, from '
             'the best model re-trained on CV folds used for testing. '
             'Plots all errors on top of each other with low-ish alpha, to give a kind of visual '
             'density map of errors.'
    )

    args = parser.parse_args()

    # These options live on the 'compare' sub-parser only, so they are missing
    # from the namespace when the script runs without a sub-command.
    use_spectra = getattr(args, 'use_spectra', False)
    plot_final_errors = getattr(args, 'plot_final_errors', False)

    obs_metadata = trim_observation_metadata(load_observation_metadata(args.metadata_path, flags=args.metadata_flags))
    sources, components, exposures, wavelengths = ICAize.deserialize_data(args.spectra_path, args.method)
    source_model, ss, model_args = ICAize.unpickle_model(args.spectra_path, args.method)

    comb_flux_arr, comb_wavelengths = None, None
    # The error plot compares against the real flux, so it needs the spectra
    # even when scoring itself is done on the reduced components.
    if use_spectra or plot_final_errors:
        comb_flux_arr, comb_exposure_arr, comb_ivar_arr, comb_masks, comb_wavelengths = ICAize.load_data(args)
        # Only the flux and wavelengths are used; drop the rest right away.
        del comb_ivar_arr
        del comb_masks

        # Keep the exposures present in the reduction, ordered by exposure id.
        keep = np.in1d(comb_exposure_arr, exposures)
        comb_flux_arr = comb_flux_arr[keep]
        comb_flux_arr = comb_flux_arr[np.argsort(comb_exposure_arr[keep])]

    reduced_obs_metadata = obs_metadata[np.in1d(obs_metadata['EXP_ID'], exposures)]
    reduced_obs_metadata.sort('EXP_ID')
    reduced_obs_metadata.remove_column('EXP_ID')

    # NOTE(review): the 'f8' view assumes every remaining column is float64 - confirm.
    X_arr = np.array(reduced_obs_metadata).view('f8').reshape((len(reduced_obs_metadata), -1))
    # Order the sources by exposure id, matching the sorted metadata rows.
    Y_arr = sources[np.argsort(exposures)]

    if args.load_model:
        predictive_model = load_model(args.model_path)
    else:
        predictive_model = get_model(args.model)

    if args.subparser_name == 'compare':
        pdist = get_param_distribution_for_model(args.model, args.iters)

        scorer = _build_scorer(args.scorer, use_spectra, {
            'Y_full': Y_arr,
            'flux_arr': comb_flux_arr,
            'source_model': source_model,
            'ss': ss,
            'source_model_args': model_args,
            'method': args.method,
        })

        # NOTE(review): `n_iter`, iterating the splitter directly and
        # `grid_scores_` below look like the legacy scikit-learn
        # cross-validation API - confirm against the installed version.
        folder = ShuffleSplit(exposures.shape[0], n_iter=args.folds, test_size=1.0/args.folds,
                            random_state=12345)

        if args.model == 'GP':
            predictive_model.random_start = args.folds
            rcv = GridSearchCV(predictive_model, param_grid=pdist,
                            error_score=0, cv=3, n_jobs=args.n_jobs,
                            scoring=scorer)
        else:
            rcv = RandomizedSearchCV(predictive_model, param_distributions=pdist,
                            n_iter=args.iters, cv=folder, n_jobs=args.n_jobs,
                            scoring=scorer)

        # This fits X (metadata) to Y (dimensionality-reduced sources).  There
        # are really two tests here: how well the hyper-parameters predict the
        # sources, and how well they predict the actual source spectra.
        rcv.fit(X_arr, Y_arr)

        print(rcv.best_score_)
        print(rcv.best_params_)
        print(rcv.best_estimator_)
        if args.outputfbk:
            print("=+"*10 + "=")
            for val in rcv.grid_scores_:
                print(val)
            print("=+"*10 + "=")

        if args.save_best:
            save_model(rcv.best_estimator_, args.model_path)

        if plot_final_errors:
            for train_inds, test_inds in folder:
                rcv.best_estimator_.fit(X_arr[train_inds], Y_arr[train_inds])
                predicted = rcv.best_estimator_.predict(X_arr[test_inds])
                back_trans_flux = ICAize.inverse_transform(predicted, source_model, ss, args.method, model_args)
                diffs = np.abs(comb_flux_arr[test_inds] - back_trans_flux)
                # One faint line per test spectrum; the overlap acts as a density map.
                for i in range(diffs.shape[0]):
                    plt.plot(comb_wavelengths, diffs[i, :], 'b-', alpha=0.01)
            plt.show()
예제 #5
0
def _build_scorer(scorer_name, use_spectra, spectra_kwargs):
    """Return the scoring callable for the hyper-parameter search, or None.

    Args:
        scorer_name: Value of the ``--scorer`` option.
        use_spectra: When true, the metric is bound to ``spectra_kwargs`` with
            ``functools.partial`` before being wrapped.
        spectra_kwargs: Keyword arguments handed to the metric in that case.

    Returns:
        A ``make_scorer`` result, or ``None`` for 'LL' (or any other name
        without a metric), which leaves scoring to the search object.
    """
    if scorer_name == 'R2':
        # R2 is never bound to the spectra arguments.
        return make_scorer(R2)

    if scorer_name == 'MAE':
        metric, greater_is_better = MAE, False
    elif scorer_name == 'MSE':
        metric, greater_is_better = MSE, False
    elif scorer_name == 'MSEMV':
        metric, greater_is_better = MSEMV, False
    elif scorer_name == 'EXP_VAR':
        metric, greater_is_better = EXP_VAR, True
    elif scorer_name == 'MAPED':
        metric, greater_is_better = MAPED, False
    else:
        return None

    if use_spectra:
        metric = partial(metric, **spectra_kwargs)
    return make_scorer(metric, greater_is_better=greater_is_better)


def main():
    """Fit a predictive model from observation metadata to reduced sources.

    Loads the observation metadata and the stored dimensionality reduction,
    aligns both on exposure id, and obtains a predictive model (loaded from
    ``--model_path`` or created for ``--model``).  With the ``compare``
    sub-command it runs a hyper-parameter search, prints the best result and
    optionally saves the best estimator and plots per-fold absolute errors.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        'Build and test models based on dim reductions and provided spectra')
    subparsers = parser.add_subparsers(dest='subparser_name')

    parser.add_argument('--metadata_path',
                        type=str,
                        default='.',
                        metavar='PATH',
                        help="Metadata path to work from, if not '.'")
    parser.add_argument('--spectra_path',
                        type=str,
                        default='.',
                        metavar='PATH',
                        help="Spectra path to work from, if not '.'")
    parser.add_argument('--method',
                        type=str,
                        default='ICA',
                        metavar='METHOD',
                        help='Dim reduction method to load data for')
    parser.add_argument('--n_jobs',
                        type=int,
                        default=1,
                        metavar='N_JOBS',
                        help='N_JOBS')
    parser.add_argument(
        '--model',
        type=str,
        choices=['ET', 'RF', 'GP', 'KNN', 'SVR'],
        default='ET',
        help=
        'Which model type to use: ET (Extra Trees), RF (Random Forest), GP (Gaussian Process), KNN, or SVR (Support Vector Regression)'
    )
    parser.add_argument(
        '--load_model',
        action='store_true',
        help='Whether or not to load the model from --model_path')
    parser.add_argument('--model_path',
                        type=str,
                        default='model.pkl',
                        metavar='MODEL_PATH',
                        help='COMPLETE path from which to load a model')
    parser.add_argument(
        '--metadata_flags',
        type=str,
        default='',
        metavar='METADATA_FLAGS',
        help='Flags specifying observational metadata pre-processing, e.g. LUNAR_MAG which takes the '
        'magnitude and linearizes it (ignoring that it is an area magnitude)')
    parser.add_argument(
        '--compacted_path',
        type=str,
        default=None,
        metavar='COMPATED_PATH',
        help=
        'Path to find compacted/arrayized data; setting this will cause --path, --pattern to be ignored'
    )

    parser_compare = subparsers.add_parser('compare')
    parser_compare.add_argument(
        '--folds',
        type=int,
        default=3,
        metavar='TEST_FOLDS',
        help=
        'Do k-fold cross validation with specified number of folds.  Defaults to 3.'
    )
    parser_compare.add_argument(
        '--iters',
        type=int,
        default=50,
        metavar='HYPER_FIT_ITERS',
        help='Number of iterations when fitting hyper-params')
    parser_compare.add_argument(
        '--outputfbk',
        action='store_true',
        help='If set, outputs \'grid_scores_\' data from RandomizedSearchCV')
    parser_compare.add_argument(
        '--save_best',
        action='store_true',
        help=
        'Whether or not to save the (last/best) model built for e.g. --hyper_fit'
    )
    parser_compare.add_argument(
        '--scorer',
        type=str,
        choices=['R2', 'MAE', 'MSE', 'LL', 'EXP_VAR', 'MAPED', 'MSEMV'],
        default='R2',
        help=
        'Which scoring method to use to determine ranking of model instances.')
    parser_compare.add_argument(
        '--use_spectra',
        action='store_true',
        help=
        'Whether scoring is done against the DM components or the predicted spectra'
    )
    parser_compare.add_argument(
        '--ivar_cutoff',
        type=float,
        default=0.001,
        metavar='IVAR_CUTOFF',
        help='data with inverse variace below cutoff is masked as if ivar==0')
    parser_compare.add_argument(
        '--plot_final_errors',
        action='store_true',
        help='If set, will plot the errors from the final/best model, for the whole dataset, from '
        'the best model re-trained on CV folds used for testing. '
        'Plots all errors on top of each other with low-ish alpha, to give a kind of visual '
        'density map of errors.')

    args = parser.parse_args()

    # These options live on the 'compare' sub-parser only, so they are missing
    # from the namespace when the script runs without a sub-command.
    use_spectra = getattr(args, 'use_spectra', False)
    plot_final_errors = getattr(args, 'plot_final_errors', False)

    obs_metadata = trim_observation_metadata(
        load_observation_metadata(args.metadata_path,
                                  flags=args.metadata_flags))
    sources, components, exposures, wavelengths = ICAize.deserialize_data(
        args.spectra_path, args.method)
    source_model, ss, model_args = ICAize.unpickle_model(
        args.spectra_path, args.method)

    comb_flux_arr, comb_wavelengths = None, None
    # The error plot compares against the real flux, so it needs the spectra
    # even when scoring itself is done on the reduced components.
    if use_spectra or plot_final_errors:
        comb_flux_arr, comb_exposure_arr, comb_ivar_arr, comb_masks, comb_wavelengths = ICAize.load_data(
            args)
        # Only the flux and wavelengths are used; drop the rest right away.
        del comb_ivar_arr
        del comb_masks

        # Keep the exposures present in the reduction, ordered by exposure id.
        keep = np.in1d(comb_exposure_arr, exposures)
        comb_flux_arr = comb_flux_arr[keep]
        comb_flux_arr = comb_flux_arr[np.argsort(comb_exposure_arr[keep])]

    reduced_obs_metadata = obs_metadata[np.in1d(obs_metadata['EXP_ID'],
                                                exposures)]
    reduced_obs_metadata.sort('EXP_ID')
    reduced_obs_metadata.remove_column('EXP_ID')

    # NOTE(review): the 'f8' view assumes every remaining column is float64 - confirm.
    X_arr = np.array(reduced_obs_metadata).view('f8').reshape(
        (len(reduced_obs_metadata), -1))
    # Order the sources by exposure id, matching the sorted metadata rows.
    Y_arr = sources[np.argsort(exposures)]

    if args.load_model:
        predictive_model = load_model(args.model_path)
    else:
        predictive_model = get_model(args.model)

    if args.subparser_name == 'compare':
        pdist = get_param_distribution_for_model(args.model, args.iters)

        scorer = _build_scorer(
            args.scorer, use_spectra, {
                'Y_full': Y_arr,
                'flux_arr': comb_flux_arr,
                'source_model': source_model,
                'ss': ss,
                'source_model_args': model_args,
                'method': args.method,
            })

        # NOTE(review): `n_iter`, iterating the splitter directly and
        # `grid_scores_` below look like the legacy scikit-learn
        # cross-validation API - confirm against the installed version.
        folder = ShuffleSplit(exposures.shape[0],
                              n_iter=args.folds,
                              test_size=1.0 / args.folds,
                              random_state=12345)

        if args.model == 'GP':
            predictive_model.random_start = args.folds
            rcv = GridSearchCV(predictive_model,
                               param_grid=pdist,
                               error_score=0,
                               cv=3,
                               n_jobs=args.n_jobs,
                               scoring=scorer)
        else:
            rcv = RandomizedSearchCV(predictive_model,
                                     param_distributions=pdist,
                                     n_iter=args.iters,
                                     cv=folder,
                                     n_jobs=args.n_jobs,
                                     scoring=scorer)

        # This fits X (metadata) to Y (dimensionality-reduced sources).  There
        # are really two tests here: how well the hyper-parameters predict the
        # sources, and how well they predict the actual source spectra.
        rcv.fit(X_arr, Y_arr)

        print(rcv.best_score_)
        print(rcv.best_params_)
        print(rcv.best_estimator_)
        if args.outputfbk:
            print("=+" * 10 + "=")
            for val in rcv.grid_scores_:
                print(val)
            print("=+" * 10 + "=")

        if args.save_best:
            save_model(rcv.best_estimator_, args.model_path)

        if plot_final_errors:
            for train_inds, test_inds in folder:
                rcv.best_estimator_.fit(X_arr[train_inds], Y_arr[train_inds])
                predicted = rcv.best_estimator_.predict(X_arr[test_inds])
                back_trans_flux = ICAize.inverse_transform(
                    predicted, source_model, ss, args.method, model_args)
                diffs = np.abs(comb_flux_arr[test_inds] - back_trans_flux)
                # One faint line per test spectrum; the overlap acts as a density map.
                for i in range(diffs.shape[0]):
                    plt.plot(comb_wavelengths, diffs[i, :], 'b-', alpha=0.01)
            plt.show()