Example #1
import numpy
from matplotlib.pyplot import figure, scatter, colorbar, axes
from sklearn.decomposition import PCA

def plot(val_fn, pts_fn, output_fn):
    points = []
    with open(pts_fn) as fp:
        for line in fp:  # file objects are directly iterable in Python 3
            points.append([float(x) for x in line.split()])

    values = []
    with open(val_fn) as fp:
        for line in fp:
            values.append(float(line.split()[1]))

    xx = [pt[0] for pt in points]
    yy = [pt[1] for pt in points]
    print "X:", min(xx), max(xx)
    print "Y:", min(yy), max(yy)

    m = min(values)
    values = [(v - m) % 1. for v in values]
    print "V:", min(values), max(values)
    # hsv()
    myData = numpy.array(points)
    #results = PCA(myData,2)
    pca = PCA(n_components=2)
    results = pca.fit_transform(myData)
    fig = figure()
    scatter(results[:, 0], results[:, 1], s=10, c=values, cmap="Spectral")
    colorbar()

    # ax = fig.add_axes([-.05,-.1,1.1,1.1])
    ax = axes()
    ax.set_axis_off()
    ax.set_aspect('equal', 'box')
    # adjust(0,0,1,1,0,0)

    fig.savefig(output_fn)
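
A minimal call sketch for the function above, assuming whitespace-separated input files in the format the two read loops expect; the file names here are hypothetical:

# points.txt holds one "x y" pair per line; values.txt holds "<label> <value>" per line.
plot("values.txt", "points.txt", "embedding.png")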
avg_sulphates = df.sulphates / np.mean(df.sulphates)
avg_alcohol = df.alcohol / np.mean(df.alcohol)

# dataset normalized using the column-maximum method
max_df = pd.DataFrame({
    'fixed_acidity': max_fixed_acidity, 'volatile_acidity': max_volatile_acidity,
    'citric_acid': max_citric_acid, 'residual_sugar': max_residual_sugar,
    'chlorides': max_chlorides, 'free_sulfur_dioxide': max_free_sulfur_dioxide,
    'total_sulfur_dioxide': max_total_sulfur_dioxide, 'density': max_density,
    'pH': max_pH, 'sulphates': max_sulphates, 'alcohol': max_alcohol})

# dataset normalized using the column-average method
avg_df = pd.DataFrame({
    'fixed_acidity': avg_fixed_acidity, 'volatile_acidity': avg_volatile_acidity,
    'citric_acid': avg_citric_acid, 'residual_sugar': avg_residual_sugar,
    'chlorides': avg_chlorides, 'free_sulfur_dioxide': avg_free_sulfur_dioxide,
    'total_sulfur_dioxide': avg_total_sulfur_dioxide, 'density': avg_density,
    'pH': avg_pH, 'sulphates': avg_sulphates, 'alcohol': avg_alcohol})

# principal component analysis of all three data frames
pca = PCA()
pca_df = pd.DataFrame(pca.fit_transform(df), columns=column_names)
pca_avg_df = pd.DataFrame(pca.fit_transform(avg_df), columns=column_names)
pca_max_df = pd.DataFrame(pca.fit_transform(max_df), columns=column_names)
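
The per-column series (max_fixed_acidity, avg_fixed_acidity, and so on) are not defined in this snippet; judging from the two sulphates/alcohol lines above, each is presumably the raw column divided by its maximum or mean. A compact equivalent sketch, under the assumption that df holds exactly the eleven feature columns named above:

# Column-wise normalization equivalent to the hand-built frames above
# (assumption: df contains only the eleven physico-chemical columns).
max_df = df / df.max()    # divide each column by its maximum
avg_df = df / df.mean()   # divide each column by its mean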

######################### VIZ - SCREE CHARTS & SCATTER ###########################

PALETTE = [
    "#e6194b", "#3cb44b", "#ffe119", "#0082c8", "#f58231", "#911eb4",
    "#46f0f0", "#f032e6", "#d2f53c", "#008080", "#e6beff"
]

if option == 1:
    plt.hist(quality, bins=10, color=PALETTE[7])
    plt.show()

def featurize(df, meta):
    df['flux_ratio_sq'] = np.power(df['flux'] / df['flux_err'], 2.0)
    df['flux_by_flux_ratio_sq'] = df['flux'] * df['flux_ratio_sq']
    # train[detected==1, mjd_diff:=max(mjd)-min(mjd), by=object_id]
    aggs = {
        'flux': ['min', 'max', 'mean', 'median', 'std', 'skew'],
        'flux_err': ['min', 'max', 'mean', 'median', 'std', 'skew'],
        'detected': ['mean'],
        'flux_ratio_sq': ['sum', 'skew'],
        'flux_by_flux_ratio_sq': ['sum', 'skew']
    }
    # Features to compute with the tsfresh library. The FFT coefficients are meant to capture periodicity
    fcp = {
        'flux': {
            'longest_strike_above_mean': None,
            'longest_strike_below_mean': None,
            'mean_change': None,
            'mean_abs_change': None,
            'length': None,
        },
        'flux_by_flux_ratio_sq': {
            'longest_strike_above_mean': None,
            'longest_strike_below_mean': None,
        },
        'flux_passband': {
            'fft_coefficient': [{
                'coeff': 0,
                'attr': 'abs'
            }, {
                'coeff': 1,
                'attr': 'abs'
            }],
            'kurtosis': None,
            'skewness': None,
        },
        'mjd': {
            'maximum': None,
            'minimum': None,
            'mean_change': None,
            'mean_abs_change': None,
        },
    }
    agg_df = df.groupby(['object_id']).agg(aggs)
    new_columns = [k + '_' + agg for k in aggs.keys() for agg in aggs[k]]
    agg_df.columns = new_columns
    agg_df = agg_df.reset_index()
    agg_df['flux_diff'] = agg_df['flux_max'] - agg_df['flux_min']
    agg_df['flux_dif2'] = (agg_df['flux_max'] - agg_df['flux_min']) / agg_df['flux_mean']
    agg_df['flux_w_mean'] = agg_df['flux_by_flux_ratio_sq_sum'] / agg_df['flux_ratio_sq_sum']
    agg_df['flux_dif3'] = (agg_df['flux_max'] - agg_df['flux_min']) / agg_df['flux_w_mean']
    # Added DWT features (ignore warnings from pywt.wavedec):
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        wavelet_df = wavelet(df)
    agg_df = agg_df.merge(right=wavelet_df, how='left', on='object_id')
    print('MERGED WAVELET DF')
    # Run PCA
    X = agg_df.iloc[:, 22:]
    X_std = StandardScaler().fit_transform(X)
    pca = PCA(n_components=200)
    pca_results = pd.DataFrame(pca.fit_transform(X_std))
    pca_results.columns = ['PCA' + str(x + 1) for x in range(200)]
    agg_df = agg_df.iloc[:, :22]
    agg_df = pd.concat([agg_df, pca_results], axis=1)
    # Add more tsfresh features with passband, flux, flux_ratio_sq:
    agg_df_ts_flux_passband = extract_features(
        df,
        column_id='object_id',
        column_sort='mjd',
        column_kind='passband',
        column_value='flux',
        default_fc_parameters=fcp['flux_passband'],
        n_jobs=4)
    agg_df_ts_flux = extract_features(df,
                                      column_id='object_id',
                                      column_value='flux',
                                      default_fc_parameters=fcp['flux'],
                                      n_jobs=4)
    agg_df_ts_flux_by_flux_ratio_sq = extract_features(
        df,
        column_id='object_id',
        column_value='flux_by_flux_ratio_sq',
        default_fc_parameters=fcp['flux_by_flux_ratio_sq'],
        n_jobs=4)
    # Add smart feature that is suggested here https://www.kaggle.com/c/PLAsTiCC-2018/discussion/69696#410538
    # dt[detected==1, mjd_diff:=max(mjd)-min(mjd), by=object_id]
    df_det = df[df['detected'] == 1].copy()
    agg_df_mjd = extract_features(df_det,
                                  column_id='object_id',
                                  column_value='mjd',
                                  default_fc_parameters=fcp['mjd'],
                                  n_jobs=4)
    agg_df_mjd['mjd_diff_det'] = agg_df_mjd['mjd__maximum'] - agg_df_mjd[
        'mjd__minimum']
    del agg_df_mjd['mjd__maximum'], agg_df_mjd['mjd__minimum']
    agg_df_ts_flux_passband = agg_df_ts_flux_passband.reset_index()
    agg_df_ts_flux_passband.rename(columns={'id': 'object_id'}, inplace=True)
    agg_df_ts_flux = agg_df_ts_flux.reset_index()
    agg_df_ts_flux.rename(columns={'id': 'object_id'}, inplace=True)
    agg_df_ts_flux_by_flux_ratio_sq = agg_df_ts_flux_by_flux_ratio_sq.reset_index()
    agg_df_ts_flux_by_flux_ratio_sq.rename(columns={'id': 'object_id'}, inplace=True)
    agg_df_mjd = agg_df_mjd.reset_index()
    agg_df_mjd.rename(columns={'id': 'object_id'}, inplace=True)
    agg_df_ts = pd.concat([
        agg_df,
        agg_df_ts_flux_passband.drop(labels=['object_id'], axis=1),
        agg_df_ts_flux.drop(labels=['object_id'], axis=1),
        agg_df_ts_flux_by_flux_ratio_sq.drop(labels=['object_id'], axis=1),
        agg_df_mjd.drop(labels=['object_id'], axis=1)
    ], axis=1).reset_index()
    if 'index' in agg_df_ts:
        del agg_df_ts['index']
    result = agg_df_ts.merge(right=meta, how='outer', on=['object_id'])
    return result
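
A hedged usage sketch for featurize(): the light-curve table needs the columns the function reads (object_id, mjd, passband, flux, flux_err, detected), and meta needs per-object metadata keyed by object_id. The PLAsTiCC-style file names are an assumption:

import pandas as pd

# Hypothetical inputs in the PLAsTiCC layout assumed above.
df = pd.read_csv('training_set.csv')
meta = pd.read_csv('training_set_metadata.csv')
full_train = featurize(df, meta)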
Example #4
print('Extracting PCA data from index files')

PCA_data, lons, lats = get_PCA_data(var_name, rcp, years, data_dir)
num_lons = lons.shape[0]
num_lats = lats.shape[0]
print('DATA MATRIX ' + str(PCA_data.shape))

# Scale PCA data
# Gives zero/NaN/infinity error
#PCA_data = scale_linear_bycolumn(PCA_data, high=1.0, low=0.0)

print('Computing component matrix')
num_comps = 6
comp_indices = []
pca = PCA(n_components=num_comps)
X_pca = pca.fit_transform(PCA_data)  # scores: (n_samples, num_comps)
# Projection of original data onto component space
corr_array = pca.inverse_transform(X_pca)  # back-projection: (n_samples, n_features)
components = pca.components_.transpose()  # loadings: (n_features, num_comps)
print('VARIANCE EXPLAINED: ')
print(pca.explained_variance_ratio_)
rotated_components = varimax(components).transpose()  # (num_comps, n_features)
dates_dt = []
dates_ts = []
for year in years:
    # 90-day season starting December 1 of each year
    dates_dt.append(dt.datetime(year, 12, 1))
    dates_ts.append(datetime_to_date(dates_dt[-1], '-'))
    for doy_idx in range(1, 90):
        dates_dt.append(advance_date(dates_dt[-1], 1, 'forward'))
        dates_ts.append(datetime_to_date(dates_dt[-1], '-'))
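
varimax() is not defined in any of these snippets. If it refers to the standard varimax rotation of a loadings matrix, a common NumPy implementation looks like the sketch below; this is an assumption, not necessarily the author's version:

import numpy as np

def varimax(Phi, gamma=1.0, q=20, tol=1e-6):
    # Rotate a loadings matrix Phi of shape (n_features, n_components)
    # to maximize the varimax criterion, via the standard SVD iteration.
    p, k = Phi.shape
    R = np.eye(k)
    d = 0.0
    for _ in range(q):
        d_old = d
        Lam = Phi @ R
        u, s, vt = np.linalg.svd(
            Phi.T @ (Lam ** 3 - (gamma / p) * Lam @ np.diag(np.diag(Lam.T @ Lam))))
        R = u @ vt
        d = np.sum(s)
        if d_old != 0 and d / d_old < 1 + tol:
            break  # converged
    return Phi @ R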
Example #5
def perform_PCA(data, number_of_components):
    #print("Data", data.shape)
    pca = PCA(n_components=number_of_components)
    #print(pca.fit_transform(data).shape)
    return pca.fit_transform(data).transpose()
Example #6
def perform_PCA(data, number_of_components):
    pca = PCA(n_components=number_of_components)
    return pca.fit_transform(data)
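
This version differs from Example #5 only by the missing transpose: it returns the scores as (n_samples, n_components) instead of (n_components, n_samples). A quick shape check with made-up data:

import numpy as np

X = np.random.rand(100, 10)      # made-up data: 100 samples, 10 features
print(perform_PCA(X, 3).shape)   # (100, 3); Example #5 would yield (3, 100)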
print('PROCESSING VARIABLE ' + var_name)
print('Extracting PCA data from index files')

LIVNEH_PCA_data, LIVNEH_lons, LIVNEH_lats = get_LIVNEH_PCA_data(var_name, livneh_years, livneh_data_dir)
LOCA_PCA_data, LOCA_lons, LOCA_lats = get_LOCA_PCA_data(rcp, var_name, loca_years, loca_data_dir)
num_lons = LIVNEH_lons.shape[0]
num_lats = LIVNEH_lats.shape[0]
print('LIVNEH DATA MATRIX ' + str(LIVNEH_PCA_data.shape))
print('LOCA DATA MATRIX ' + str(LOCA_PCA_data.shape))

#print('Minutes elapsed: ' + str((dt.datetime.now() - start_time).total_seconds() / 60.0))
print('Computing component matrix')
num_comps = 6
comp_indices = []
pca = PCA(n_components=num_comps)
X_pca = pca.fit_transform(LIVNEH_PCA_data)  # scores: (n_samples, num_comps)
# Projection of original data onto component space
corr_array = pca.inverse_transform(X_pca)  # back-projection: (n_samples, n_features)
components = pca.components_.transpose()  # loadings: (n_features, num_comps)
print('VARIANCE EXPLAINED: ')
print(pca.explained_variance_ratio_)
rotated_components = varimax(components).transpose()  # (num_comps, n_features)
dates_dt = []
dates_ts = []
for year in loca_years:
    # 90-day season starting December 1 of each year
    dates_dt.append(dt.datetime(year, 12, 1))
    dates_ts.append(datetime_to_date(dates_dt[-1], '-'))
    for doy_idx in range(1, 90):
        dates_dt.append(advance_date(dates_dt[-1], 1, 'forward'))
        dates_ts.append(datetime_to_date(dates_dt[-1], '-'))

print(ddd.shape)

ddd_scaled = scale(ddd)

print(ddd_scaled.shape)

# a= np.array(data[])
pca = PCA(n_components=300)
a = pca.fit_transform(ddd_scaled)  # fit once; reuse both the model and the scores

print(pca.explained_variance_ratio_)

# First seven principal-component scores as new columns PCA1..PCA7
for i in range(7):
    data["PCA" + str(i + 1)] = a[:, i]
Example #9
PCA_data, lons, lats = get_PCA_data(var_name, years, data_dir)
num_lons = lons.shape[0]
num_lats = lats.shape[0]
print('DATA MATRIX ' + str(PCA_data.shape))

# Scale PCA data
# Gives zero/NaN/infinity error
#PCA_data = scale_linear_bycolumn(PCA_data, high=1.0, low=0.0)

#print('Minutes elapsed: ' + str((dt.datetime.now() - start_time).total_seconds() / 60.0))
print('Computing component matrix')
num_comps = 6
comp_indices = []
pca = PCA(n_components=num_comps)
X_pca = pca.fit_transform(PCA_data)  # scores: (n_samples, num_comps)
# Projection of original data onto component space
corr_array = pca.inverse_transform(X_pca)  # back-projection: (n_samples, n_features)
components = pca.components_.transpose()  # loadings: (n_features, num_comps)
print('VARIANCE EXPLAINED: ')
print(pca.explained_variance_ratio_)
rotated_components = varimax(components).transpose()  # (num_comps, n_features)
dates_dt = []
dates_ts = []
for year in years:
    # 90-day season starting December 1 of each year
    dates_dt.append(dt.datetime(year, 12, 1))
    dates_ts.append(datetime_to_date(dates_dt[-1], '-'))
    for doy_idx in range(1, 90):
        dates_dt.append(advance_date(dates_dt[-1], 1, 'forward'))
        dates_ts.append(datetime_to_date(dates_dt[-1], '-'))