Example #1
import ICAize
import stack
import matplotlib.pyplot as plt
import numpy as np
import random_forest_spectra as rfs
import sklearn.metrics as sm
import sys
import os.path
import pickle

path = '.'

if len(sys.argv) == 2:
    path = sys.argv[1]

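# Plot up to 25 FastICA components for the 'combined' target (filter 'both'),
# each scaled to a common amplitude and vertically offset for readability.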
fastica = ICAize.unpickle_FastICA(path=path, target_type="combined", filter_str="both")
for comp_i in range(min(fastica.components_.shape[0], 25)):
    scale_factor = 2.4/np.max(np.abs(fastica.components_[comp_i]))
    plt.plot(stack.skyexp_wlen_out, (fastica.components_[comp_i]*scale_factor)+(5*comp_i) )
plt.show()
plt.close()

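# Repeat for the emission-line ('em') components.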
fastica = ICAize.unpickle_FastICA(path=path, target_type="combined", filter_str="em")
for comp_i in range(min(fastica.components_.shape[0], 25)):
    scale_factor = 2.4/np.max(np.abs(fastica.components_[comp_i]))
    plt.plot(stack.skyexp_wlen_out, (fastica.components_[comp_i]*scale_factor)+(5*comp_i) )
plt.show()
plt.close()

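# ...and for the non-emission ('nonem') components.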
fastica = ICAize.unpickle_FastICA(path=path, target_type="combined", filter_str="nonem")
for comp_i in range(min(fastica.components_.shape[0], 25)):
    scale_factor = 2.4/np.max(np.abs(fastica.components_[comp_i]))
    plt.plot(stack.skyexp_wlen_out, (fastica.components_[comp_i]*scale_factor)+(5*comp_i) )
plt.show()
plt.close()

def load_plot_etc_target_type(metadata_path, spectra_path, test_inds, target_type, no_plot=False,
				save_out=False, restrict_delta=False, use_spca=False, use_pca=False):
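    # NOTE (assumption): this excerpt relies on module-level configuration flags and
    # helpers defined elsewhere in random_forest_spectra.py, e.g. use_filter_split,
    # linear_only, reg_type, include_knn, include_linear, order_3, order_4, scale,
    # restrict_color, and the sklearn ensemble/neighbors/linear_model imports.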
    obs_metadata = trim_observation_metadata(load_observation_metadata(metadata_path))
    if use_filter_split:
        c_sources, c_mixing, c_exposures, c_wavelengths, c_filter_split_arr = load_spectra_data(spectra_path,
						target_type=target_type, filter_str='nonem', use_spca=use_spca, use_pca=use_pca)
        c_sources_e, c_mixing_e, c_exposures_e, c_wavelengths_e, c_filter_split_arr_e = load_spectra_data(spectra_path,
						target_type=target_type, filter_str='em', use_spca=use_spca, use_pca=use_pca)
    else:
        c_sources, c_mixing, c_exposures, c_wavelengths, c_filter_split_arr = load_spectra_data(spectra_path,
						target_type=target_type, filter_str='both', use_spca=use_spca, use_pca=use_pca)

    reduced_obs_metadata = obs_metadata[np.in1d(obs_metadata['EXP_ID'], c_exposures)]
    reduced_obs_metadata.sort('EXP_ID')
    sorted_inds = np.argsort(c_exposures)
    if use_filter_split:
        sorted_e_inds = np.argsort(c_exposures_e)

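    # Tree-ensemble regressors (ExtraTrees or RandomForest, per reg_type); a second
    # "_e" model handles the emission-line components when use_filter_split is set.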
    if not linear_only:
        if reg_type == 'etr':
            rfr = ensemble.ExtraTreesRegressor(n_estimators=n_estimators, min_samples_split=min_samples_split,
                        random_state=rfr_random_state, n_jobs=-1, verbose=False, bootstrap=bootstrap)
            if use_filter_split:
                rfr_e = ensemble.ExtraTreesRegressor(n_estimators=n_estimators, min_samples_split=min_samples_split,
                        random_state=rfr_random_state, n_jobs=-1, verbose=False, bootstrap=bootstrap)
        else:
            rfr = ensemble.RandomForestRegressor(n_estimators=n_estimators, min_samples_split=min_samples_split,
                        random_state=rfr_random_state, n_jobs=-1, verbose=False, bootstrap=bootstrap)
            if use_filter_split:
                rfr_e = ensemble.RandomForestRegressor(n_estimators=n_estimators, min_samples_split=min_samples_split,
                        random_state=rfr_random_state, n_jobs=-1, verbose=False, bootstrap=bootstrap)
        if include_knn:
            knn = neighbors.KNeighborsRegressor(weights='distance', n_neighbors=10, p=64)
            if use_filter_split:
                knn_e = neighbors.KNeighborsRegressor(weights='distance', n_neighbors=10, p=64)

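    # Plain linear regression plus degree 2-4 polynomial-feature pipelines.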
    if include_linear:
        linear = Linear(fit_intercept=True, copy_X=True, n_jobs=-1)
        poly_2_linear = Pipeline([('poly', PolynomialFeatures(degree=2)),
                            ('linear', Linear(fit_intercept=True, copy_X=True, n_jobs=-1))])
        poly_3_linear = Pipeline([('poly', PolynomialFeatures(degree=3)),
                        ('linear', Linear(fit_intercept=True, copy_X=True, n_jobs=-1))])
        poly_4_linear = Pipeline([('poly', PolynomialFeatures(degree=4)),
                        ('linear', Linear(fit_intercept=True, copy_X=True, n_jobs=-1))])
        if use_filter_split:
            linear_e = Linear(fit_intercept=True, copy_X=True, n_jobs=-1)
            poly_2_linear_e = Pipeline([('poly', PolynomialFeatures(degree=2)),
                            ('linear', Linear(fit_intercept=True, copy_X=True, n_jobs=-1))])
            poly_3_linear_e = Pipeline([('poly', PolynomialFeatures(degree=3)),
                        ('linear', Linear(fit_intercept=True, copy_X=True, n_jobs=-1))])
            poly_4_linear_e = Pipeline([('poly', PolynomialFeatures(degree=4)),
                        ('linear', Linear(fit_intercept=True, copy_X=True, n_jobs=-1))])

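    # Build the feature matrix X from the remaining observation-metadata columns.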
    reduced_obs_metadata.remove_column('EXP_ID')
    md_len = len(reduced_obs_metadata)
    var_count = len(reduced_obs_metadata.columns)
    X_arr = np.array(reduced_obs_metadata).view('f8').reshape((md_len,-1))

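    # Load the previously pickled FastICA / SPCA / PCA model(s) used to map
    # predicted coefficients back into spectra.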
    ica = None
    if not use_spca and not use_pca:
        if use_filter_split:
            ica = ICAize.unpickle_FastICA(path=spectra_path, target_type=target_type, filter_str='nonem')
            ica_e = ICAize.unpickle_FastICA(path=spectra_path, target_type=target_type, filter_str='em')
        else:
            ica = ICAize.unpickle_FastICA(path=spectra_path, target_type=target_type, filter_str='both')
    elif use_spca:
        ica = ICAize.unpickle_SPCA(path=spectra_path, target_type=target_type)
    else:
        if use_filter_split:
            ica = ICAize.unpickle_PCA(path=spectra_path, target_type=target_type, filter_str='nonem')
            ica_e = ICAize.unpickle_PCA(path=spectra_path, target_type=target_type, filter_str='em')
        else:
            ica = ICAize.unpickle_PCA(path=spectra_path, target_type=target_type, filter_str='both')

    spectra_dir_list = os.listdir(spectra_path)

    ################################################################
    results = None
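    # Leave-one-out loop: hold out each exposure in test_inds, train on the rest,
    # predict its sky spectrum, and report error metrics for each model.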
    for test_ind in test_inds:
        test_X = X_arr[test_ind]
        train_X = np.vstack( [X_arr[:test_ind], X_arr[test_ind+1:]] )
        test_y =  (c_sources[sorted_inds])[test_ind]
        train_y = np.vstack( [(c_sources[sorted_inds])[:test_ind], (c_sources[sorted_inds])[test_ind+1:]] )
        if use_filter_split:
            test_y_e =  (c_sources_e[sorted_e_inds])[test_ind]
            train_y_e = np.vstack( [(c_sources_e[sorted_e_inds])[:test_ind], (c_sources_e[sorted_e_inds])[test_ind+1:]] )

        if scale:
            scaler = StandardScaler(with_std=scale_std)
            train_X = scaler.fit_transform(train_X)
            test_X = scaler.transform(test_X)

        title_str = "exp{}, {}".format(c_exposures[sorted_inds[test_ind]], target_type)

        if not linear_only:
            rfr.fit(X=train_X, y=train_y)
            if use_filter_split:
                rfr_e.fit(X=train_X, y=train_y_e)
            if include_knn:
                knn.fit(X=train_X, y=train_y)
                if use_filter_split:
                    knn_e.fit(X=train_X, y=train_y_e)

        if include_linear:
            linear.fit(train_X, train_y)
            poly_2_linear.fit(train_X, train_y)
            if order_3:
                poly_3_linear.fit(train_X, train_y)
            if order_4:
                poly_4_linear.fit(train_X, train_y)
        if use_filter_split and include_linear:
            linear_e.fit(train_X, train_y_e)
            poly_2_linear_e.fit(train_X, train_y_e)
            if order_3:
                poly_3_linear_e.fit(train_X, train_y_e)
            if order_4:
                poly_4_linear_e.fit(train_X, train_y_e)

        print test_ind, c_exposures[sorted_inds[test_ind]],

        data = None
        actual = None
        mask = None
        delta_mask = None
        ivar = None

        for file in spectra_dir_list:
            if fnmatch.fnmatch(file, "stacked_sky_*exp{}.csv".format(c_exposures[sorted_inds[test_ind]])):
                data = Table.read(os.path.join(spectra_path, file), format="ascii.csv")
                ivar = data['ivar']
                mask = (data['ivar'] == 0)
                delta_mask = mask.copy()
                if restrict_delta:
                    if restrict_color == 'blue':
                        delta_mask[2700:] = True
                    else:
                        delta_mask[:2700] = True

                actual = data['flux']
                break
        if actual is None:
            continue

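        # Tree-ensemble prediction: predict component coefficients, then reconstruct
        # the full spectrum through the ICA/PCA model (or a manual linear combination).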
        if not linear_only:
            rfr_prediction = rfr.predict(test_X)
            if not use_spca and not use_pca:
                rfr_predicted = ica.inverse_transform(rfr_prediction, copy=True)
            else:
                rfr_predicted = np.zeros( (1, ica.components_.shape[1]) )
                rfr_predicted[0,:] = np.sum(rfr_prediction.T * ica.components_, 0)

            if use_filter_split:
                rfr_e_prediction = rfr_e.predict(test_X)
                if not use_spca and not use_pca:
                    rfr_e_predicted = ica_e.inverse_transform(rfr_e_prediction, copy=True)
                else:
                    rfr_e_predicted = np.zeros( (1, ica_e.components_.shape[1]) )
                    rfr_e_predicted[0,:] = np.sum(rfr_e_prediction.T * ica_e.components_, 0)
                rfr_predicted = rfr_predicted + rfr_e_predicted

            rfr_delta = rfr_predicted[0] - actual
            if not no_plot:
                plt.plot(c_wavelengths[~mask], rfr_predicted[0][~mask])
                plt.plot(c_wavelengths[~mask], actual[~mask])
                plt.plot(c_wavelengths[~mask], rfr_delta[~mask])
            if not no_plot:
                plt.plot(c_wavelengths, [0]*len(c_wavelengths))
            err_term = np.sum(np.power(rfr_delta[~delta_mask], 2))/len(c_wavelengths[~delta_mask])
            err_sum = np.sum(rfr_delta[~delta_mask])/len(rfr_delta[~delta_mask])
            red_chi = np.sum(np.power(rfr_delta[~delta_mask], 2)*ivar[~delta_mask])/(len(c_wavelengths[~delta_mask])-var_count-1)
            if not no_plot:
                plt.legend(['Predicted', 'Actual', 'Delta {:0.5f}'.format(err_term)])
                plt.tight_layout()
                plt.title("Random Forest Regressor: {}".format(title_str))
                plt.show()
                plt.close()
            print err_term, red_chi, err_sum,

            if include_knn:
                knn_prediction = knn.predict(test_X)
                if not use_spca and not use_pca:
                    knn_predicted = ica.inverse_transform(knn_prediction, copy=True)
                else:
                    knn_predicted = np.zeros( (1, ica.components_.shape[1]) )
                    knn_predicted[0,:] = np.sum(knn_prediction.T * ica.components_, 0)

                if use_filter_split:
                    knn_e_prediction = knn_e.predict(test_X)
                    if not use_spca and not use_pca:
                        knn_e_predicted = ica_e.inverse_transform(knn_e_prediction, copy=True)
                    else:
                        knn_e_predicted = np.zeros( (1, ica_e.components_.shape[1]) )
                        knn_e_predicted[0,:] = np.sum(knn_e_prediction.T * ica_e.components_, 0)
                    knn_predicted = knn_predicted + knn_e_predicted

                if not no_plot:
                    plt.plot(c_wavelengths[~mask], knn_predicted[0][~mask])
                    plt.plot(c_wavelengths[~mask], actual[~mask])
                knn_delta = knn_predicted[0] - actual
                err_term = np.sum(np.power(knn_delta[~delta_mask], 2))/len(c_wavelengths[~delta_mask])
                err_sum = np.sum(knn_delta[~delta_mask])/len(knn_delta[~delta_mask])
                red_chi = np.sum(np.power(knn_delta[~delta_mask], 2)*ivar[~delta_mask])/(len(c_wavelengths[~delta_mask])-var_count-1)

                if not no_plot:
                    plt.plot(c_wavelengths[~mask], knn_delta[~mask])
                    plt.plot(c_wavelengths, [0]*len(c_wavelengths))
                    plt.legend(['Predicted', 'Actual', 'Delta {:0.5f}'.format(err_term)])
                    plt.tight_layout()
                    plt.title("Good 'ol K-NN: {}".format(title_str))
                    plt.show()
                    plt.close()
                print err_term, red_chi, err_sum,

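        # Linear and polynomial predictions, reconstructed and scored the same way.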
        if include_linear:
            poly_1_prediction = linear.predict(test_X)
            if not use_spca and not use_pca:
                poly_1_predicted = ica.inverse_transform(poly_1_prediction, copy=True)
            else:
                poly_1_predicted = np.zeros( (1, ica.components_.shape[1]) )
                poly_1_predicted[0,:] = np.sum(poly_1_prediction.T * ica.components_, 0)

            if use_filter_split:
                poly_1_e_prediction = linear_e.predict(test_X)
                if not use_spca and not use_pca:
                    poly_1_e_predicted = ica_e.inverse_transform(poly_1_e_prediction, copy=True)
                else:
                    poly_1_e_predicted = np.zeros( (1, ica_e.components_.shape[1]) )
                    poly_1_e_predicted[0,:] = np.sum(poly_1_e_prediction.T * ica_e.components_, 0)
                poly_1_predicted = poly_1_predicted + poly_1_e_predicted

            poly_1_delta = poly_1_predicted[0] - actual

            if not no_plot:
                plt.plot(c_wavelengths[~mask], poly_1_predicted[0][~mask])
                plt.plot(c_wavelengths[~mask], actual[~mask])
            err_term = np.sum(np.power(poly_1_delta[~delta_mask], 2))/len(c_wavelengths[~delta_mask])
            err_sum = np.sum(poly_1_delta[~delta_mask])/len(poly_1_delta[~delta_mask])
            red_chi = np.sum(np.power(poly_1_delta[~delta_mask], 2)*ivar[~delta_mask])/(len(c_wavelengths[~delta_mask])-var_count-1)

            if not no_plot:
                plt.plot(c_wavelengths[~mask], poly_1_delta[~mask])
                plt.plot(c_wavelengths, [0]*len(c_wavelengths))
                plt.legend(['Predicted', 'Actual', 'Delta {:0.5f}'.format(err_term)])
                plt.tight_layout()
                plt.title("Poly 1: {}".format(title_str))
                plt.show()
                plt.close()

            print err_term, red_chi, err_sum,

            poly_2_prediction = poly_2_linear.predict(test_X)
            if not use_spca and not use_pca:
                poly_2_predicted = ica.inverse_transform(poly_2_prediction, copy=True)
            else:
                poly_2_predicted = np.zeros( (1, ica.components_.shape[1]) )
                poly_2_predicted[0,:] = np.sum(poly_2_prediction.T * ica.components_, 0)

            poly_2_delta = poly_2_predicted[0] - actual

            if not no_plot:
                plt.plot(c_wavelengths[~mask], poly_2_predicted[0][~mask])
                plt.plot(c_wavelengths[~mask], actual[~mask])
            err_term = np.sum(np.power(poly_2_delta[~delta_mask], 2))/len(c_wavelengths[~delta_mask])
            err_sum = np.sum(poly_2_delta[~delta_mask])/len(poly_2_delta[~delta_mask])
            red_chi = np.sum(np.power(poly_2_delta[~delta_mask], 2)*ivar[~delta_mask])/(len(c_wavelengths[~delta_mask])-var_count-1)

            if not no_plot:
                plt.plot(c_wavelengths[~mask], poly_2_delta[~mask])
                plt.plot(c_wavelengths, [0]*len(c_wavelengths))
                plt.legend(['Predicted', 'Actual', 'Delta {:0.5f}'.format(err_term)])
                plt.tight_layout()
                plt.title("Poly 2: {}".format(title_str))
                plt.show()
                plt.close()

            print err_term, red_chi, err_sum,
            err_ind += 1

            if order_3:
                poly_3_prediction = poly_3_linear.predict(test_X)
                if not use_spca and not use_pca:
                    poly_3_predicted = ica.inverse_transform(poly_3_prediction, copy=True)
                else:
                    poly_3_predicted = np.zeros( (1, ica.components_.shape[1]) )
                    poly_3_predicted[0,:] = np.sum(poly_3_prediction.T * ica.components_, 0)

                poly_3_delta = poly_3_predicted[0] - actual

                if not no_plot:
                    plt.plot(c_wavelengths[~mask], poly_3_predicted[0][~mask])
                    plt.plot(c_wavelengths[~mask], actual[~mask])
                err_term = np.sum(np.power(poly_3_delta[~delta_mask], 2))/len(c_wavelengths[~delta_mask])
                err_sum = np.sum(poly_3_delta[~delta_mask])/len(poly_3_delta[~delta_mask])
                red_chi = np.sum(np.power(poly_3_delta[~delta_mask], 2)*ivar[~delta_mask])/(len(c_wavelengths[~delta_mask])-var_count-1)

                if not no_plot:
                    plt.plot(c_wavelengths[~mask], poly_3_delta[~mask])
                    plt.plot(c_wavelengths, [0]*len(c_wavelengths))
                    plt.legend(['Predicted', 'Actual', 'Delta {:0.5f}'.format(err_term)])
                    plt.tight_layout()
                    plt.title("Poly 3: {}".format(title_str))
                    plt.show()
                    plt.close()

                print err_term, red_chi, err_sum,
                err_ind += 1

            if order_4:
                poly_4_prediction = poly_4_linear.predict(test_X)
                if not use_spca and not use_pca:
                    poly_4_predicted = ica.inverse_transform(poly_4_prediction, copy=True)
                else:
                    poly_4_predicted = np.zeros( (1, ica.components_.shape[1]) )
                    poly_4_predicted[0,:] = np.sum(poly_4_prediction.T * ica.components_, 0)

                poly_4_delta = poly_4_predicted[0] - actual

                if not no_plot:
                    plt.plot(c_wavelengths[~mask], poly_4_predicted[0][~mask])
                    plt.plot(c_wavelengths[~mask], actual[~mask])
                err_term = np.sum(np.power(poly_4_delta[~delta_mask], 2))/len(c_wavelengths[~delta_mask])
                err_sum = np.sum(poly_4_delta[~delta_mask])/len(poly_4_delta[~delta_mask])
                red_chi = np.sum(np.power(poly_4_delta[~delta_mask], 2)*ivar[~delta_mask])/(len(c_wavelengths[~delta_mask])-var_count-1)

                if not no_plot:
                    plt.plot(c_wavelengths[~mask], poly_4_delta[~mask])
                    plt.plot(c_wavelengths, [0]*len(c_wavelengths))
                    plt.legend(['Predicted', 'Actual', 'Delta {:0.5f}'.format(err_term)])
                    plt.tight_layout()
                    plt.title("Poly 4: {}".format(title_str))
                    plt.show()
                    plt.close()

                print err_term, red_chi, err_sum,
                err_ind += 1

        print

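        # Optionally write the predicted spectra (plus the pixel mask) to a CSV table.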
        if save_out:
            out_table = Table()
            wavelength_col = Column(c_wavelengths, name="wavelength", dtype=float)
            out_table.add_columns([wavelength_col])

            if not linear_only:
                rf_col = Column(rfr_predicted[0], name="rf_flux", dtype=float)
                out_table.add_columns([rf_col])

                if include_knn:
                    knn_col = Column(knn_predicted[0], name="knn_flux", dtype=float)
                    out_table.add_columns([knn_col])

            if include_linear:
                poly_1_col = Column(poly_1_predicted[0], name="poly_1_flux", dtype=float)
                poly_2_col = Column(poly_2_predicted[0], name="poly_2_flux", dtype=float)
                out_table.add_columns([poly_1_col, poly_2_col])
                if order_3:
                    poly_3_col = Column(poly_3_predicted[0], name="poly_3_flux", dtype=float)
                    out_table.add_columns([poly_3_col])
                if order_4:
                    poly_4_col = Column(poly_4_predicted[0], name="poly_4_flux", dtype=float)
                    out_table.add_columns([poly_4_col])

            mask_col = Column(~mask, name="mask_col", dtype=bool)
            out_table.add_columns([mask_col])

            out_table.write("predicted_sky_exp{}.csv".format(c_exposures[sorted_inds[test_ind]]), format="ascii.csv")