import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA


def plot(val_fn, pts_fn, output_fn):
    # Read one whitespace-separated point per line.
    points = []
    with open(pts_fn) as fp:
        for line in fp:
            points.append([float(x) for x in line.split()])

    # Read the value from the second column of each line.
    values = []
    with open(val_fn) as fp:
        for line in fp:
            values.append(float(line.split()[1]))

    xx = [pt[0] for pt in points]
    yy = [pt[1] for pt in points]
    print("X:", min(xx), max(xx))
    print("Y:", min(yy), max(yy))

    # Shift values so the minimum is zero, then wrap into [0, 1).
    m = min(values)
    values = [(v - m) % 1. for v in values]
    print("V:", min(values), max(values))

    # Project the points onto their first two principal components.
    data = np.array(points)
    pca = PCA(n_components=2)
    results = pca.fit_transform(data)

    fig = plt.figure()
    plt.scatter(results[:, 0], results[:, 1], s=10, c=values, cmap="Spectral")
    ax = plt.gca()
    plt.colorbar()
    ax.set_axis_off()
    ax.set_aspect('equal', 'box')
    fig.savefig(output_fn)
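# Hypothetical usage of plot(); the file names are illustrative and the
# formats are assumptions inferred from the parsing above: pts_fn holds
# whitespace-separated coordinates, one point per line, and val_fn holds
# two columns of which only the second is read as the value.
plot("values.txt", "points.txt", "embedding.png")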
avg_sulphates = df.sulphates / np.mean(df.sulphates)
avg_alcohol = df.alcohol / np.mean(df.alcohol)

# Normalized dataset using the max-of-column method
max_df = pd.DataFrame({'fixed_acidity': max_fixed_acidity, 'volatile_acidity': max_volatile_acidity,
                       'citric_acid': max_citric_acid, 'residual_sugar': max_residual_sugar,
                       'chlorides': max_chlorides, 'free_sulfur_dioxide': max_free_sulfur_dioxide,
                       'total_sulfur_dioxide': max_total_sulfur_dioxide, 'density': max_density,
                       'pH': max_pH, 'sulphates': max_sulphates, 'alcohol': max_alcohol})

# Normalized dataset using the average-of-column method
avg_df = pd.DataFrame({'fixed_acidity': avg_fixed_acidity, 'volatile_acidity': avg_volatile_acidity,
                       'citric_acid': avg_citric_acid, 'residual_sugar': avg_residual_sugar,
                       'chlorides': avg_chlorides, 'free_sulfur_dioxide': avg_free_sulfur_dioxide,
                       'total_sulfur_dioxide': avg_total_sulfur_dioxide, 'density': avg_density,
                       'pH': avg_pH, 'sulphates': avg_sulphates, 'alcohol': avg_alcohol})

# Principal component analysis of all three data frames
pca = PCA()
pca_df = pd.DataFrame(pca.fit_transform(df), columns=column_names)
pca_avg_df = pd.DataFrame(pca.fit_transform(avg_df), columns=column_names)
pca_max_df = pd.DataFrame(pca.fit_transform(max_df), columns=column_names)

######################### VIZ - SCREE CHARTS & SCATTER ###########################
PALETTE = [
    "#e6194b", "#3cb44b", "#ffe119", "#0082c8", "#f58231", "#911eb4",
    "#46f0f0", "#f032e6", "#d2f53c", "#008080", "#e6beff"
]

if option == 1:
    plt.hist(quality, bins=10, color=PALETTE[7])
    plt.show()
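# The remaining max_*/avg_* series aren't shown in this snippet; assuming they
# follow the same per-column pattern as avg_sulphates and avg_alcohol above,
# a compact vectorized equivalent (a sketch, not the original code) would be:
max_df_alt = df / df.max()    # divide every column by its maximum
avg_df_alt = df / df.mean()   # divide every column by its mean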
import warnings

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from tsfresh import extract_features


def featurize(df, meta):
    df['flux_ratio_sq'] = np.power(df['flux'] / df['flux_err'], 2.0)
    df['flux_by_flux_ratio_sq'] = df['flux'] * df['flux_ratio_sq']

    # Simple per-object aggregates.
    aggs = {
        'flux': ['min', 'max', 'mean', 'median', 'std', 'skew'],
        'flux_err': ['min', 'max', 'mean', 'median', 'std', 'skew'],
        'detected': ['mean'],
        'flux_ratio_sq': ['sum', 'skew'],
        'flux_by_flux_ratio_sq': ['sum', 'skew'],
    }

    # Features to compute with the tsfresh library. The FFT coefficients are
    # meant to capture periodicity.
    fcp = {
        'flux': {
            'longest_strike_above_mean': None,
            'longest_strike_below_mean': None,
            'mean_change': None,
            'mean_abs_change': None,
            'length': None,
        },
        'flux_by_flux_ratio_sq': {
            'longest_strike_above_mean': None,
            'longest_strike_below_mean': None,
        },
        'flux_passband': {
            'fft_coefficient': [
                {'coeff': 0, 'attr': 'abs'},
                {'coeff': 1, 'attr': 'abs'},
            ],
            'kurtosis': None,
            'skewness': None,
        },
        'mjd': {
            'maximum': None,
            'minimum': None,
            'mean_change': None,
            'mean_abs_change': None,
        },
    }

    agg_df = df.groupby(['object_id']).agg(aggs)
    new_columns = [k + '_' + agg for k in aggs.keys() for agg in aggs[k]]
    agg_df.columns = new_columns
    agg_df = agg_df.reset_index()

    agg_df['flux_diff'] = agg_df['flux_max'] - agg_df['flux_min']
    agg_df['flux_dif2'] = (agg_df['flux_max'] - agg_df['flux_min']) / agg_df['flux_mean']
    agg_df['flux_w_mean'] = agg_df['flux_by_flux_ratio_sq_sum'] / agg_df['flux_ratio_sq_sum']
    agg_df['flux_dif3'] = (agg_df['flux_max'] - agg_df['flux_min']) / agg_df['flux_w_mean']

    # Add DWT features; `wavelet` is a project helper defined elsewhere
    # (ignore warnings from pywt.wavedec).
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        wavelet_df = wavelet(df)
    agg_df = agg_df.merge(right=wavelet_df, how='left', on='object_id')
    print('MERGED WAVELET DF')

    # Run PCA on the wavelet columns and keep the first 200 components.
    X = agg_df.iloc[:, 22:]
    X_std = StandardScaler().fit_transform(X)
    pca = PCA(n_components=200)
    pca_results = pd.DataFrame(pca.fit_transform(X_std))
    pca_results.columns = ['PCA' + str(x + 1) for x in range(200)]
    agg_df = pd.concat([agg_df.iloc[:, :22], pca_results], axis=1)

    # Add more tsfresh features from passband, flux, and flux_by_flux_ratio_sq.
    agg_df_ts_flux_passband = extract_features(
        df,
        column_id='object_id',
        column_sort='mjd',
        column_kind='passband',
        column_value='flux',
        default_fc_parameters=fcp['flux_passband'],
        n_jobs=4)
    agg_df_ts_flux = extract_features(
        df,
        column_id='object_id',
        column_value='flux',
        default_fc_parameters=fcp['flux'],
        n_jobs=4)
    agg_df_ts_flux_by_flux_ratio_sq = extract_features(
        df,
        column_id='object_id',
        column_value='flux_by_flux_ratio_sq',
        default_fc_parameters=fcp['flux_by_flux_ratio_sq'],
        n_jobs=4)

    # Time span between first and last detection, as suggested in
    # https://www.kaggle.com/c/PLAsTiCC-2018/discussion/69696#410538
    # (data.table equivalent: dt[detected==1, mjd_diff:=max(mjd)-min(mjd), by=object_id]).
    df_det = df[df['detected'] == 1].copy()
    agg_df_mjd = extract_features(
        df_det,
        column_id='object_id',
        column_value='mjd',
        default_fc_parameters=fcp['mjd'],
        n_jobs=4)
    agg_df_mjd['mjd_diff_det'] = agg_df_mjd['mjd__maximum'] - agg_df_mjd['mjd__minimum']
    del agg_df_mjd['mjd__maximum'], agg_df_mjd['mjd__minimum']

    # tsfresh indexes its output by id; restore object_id as a column.
    agg_df_ts_flux_passband = agg_df_ts_flux_passband.reset_index()
    agg_df_ts_flux_passband.rename(columns={'id': 'object_id'}, inplace=True)
    agg_df_ts_flux = agg_df_ts_flux.reset_index()
    agg_df_ts_flux.rename(columns={'id': 'object_id'}, inplace=True)
    agg_df_ts_flux_by_flux_ratio_sq = agg_df_ts_flux_by_flux_ratio_sq.reset_index()
    agg_df_ts_flux_by_flux_ratio_sq.rename(columns={'id': 'object_id'}, inplace=True)
    agg_df_mjd = agg_df_mjd.reset_index()
    agg_df_mjd.rename(columns={'id': 'object_id'}, inplace=True)

    agg_df_ts = pd.concat([
        agg_df,
        agg_df_ts_flux_passband.drop(labels=['object_id'], axis=1),
        agg_df_ts_flux.drop(labels=['object_id'], axis=1),
        agg_df_ts_flux_by_flux_ratio_sq.drop(labels=['object_id'], axis=1),
        agg_df_mjd.drop(labels=['object_id'], axis=1),
    ], axis=1).reset_index()
    if 'index' in agg_df_ts:
        del agg_df_ts['index']

    result = agg_df_ts.merge(right=meta, how='outer', on=['object_id'])
    return result
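# The wavelet() helper used in featurize() isn't shown. A minimal sketch of
# what it might do, assuming it summarizes pywt.wavedec coefficients of each
# object's flux series; the wavelet family, level, and summary statistics are
# assumptions, and the real helper likely emits far more columns, since the
# PCA step above expects several hundred.
import numpy as np
import pandas as pd
import pywt


def wavelet(df, wavelet_name='db1', level=2):
    rows = []
    for object_id, grp in df.groupby('object_id'):
        flux = grp.sort_values('mjd')['flux'].to_numpy()
        # Multilevel discrete wavelet transform of the flux light curve.
        coeffs = pywt.wavedec(flux, wavelet_name, level=level)
        row = {'object_id': object_id}
        for i, band in enumerate(coeffs):
            row['dwt_%d_mean' % i] = np.mean(band)
            row['dwt_%d_std' % i] = np.std(band)
        rows.append(row)
    return pd.DataFrame(rows)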
print('Extracting PCA data from index files')
PCA_data, lons, lats = get_PCA_data(var_name, rcp, years, data_dir)
num_lons = lons.shape[0]
num_lats = lats.shape[0]
print('DATA MATRIX ' + str(PCA_data.shape))

# Scaling the PCA data gives a zero/NaN/infinity error, so it stays disabled:
#PCA_data = scale_linear_bycolumn(PCA_data, high=1.0, low=0.0)

print('Computing component matrix')
num_comps = 6
comp_indices = []
pca = PCA(n_components=num_comps)
X_pca = pca.fit_transform(PCA_data)                    # (n_samples, num_comps)
# Projection of the original data back onto the component space
corr_array = pca.inverse_transform(X_pca)              # (n_samples, n_grid_cells)
components = pca.components_.transpose()               # (n_grid_cells, num_comps)
print('VARIANCE EXPLAINED:')
print(pca.explained_variance_ratio_)
rotated_components = varimax(components).transpose()   # (num_comps, n_grid_cells)

# Dec 1 plus the following 89 days for each year (a 90-day season).
dates_dt = []
dates_ts = []
for year in years:
    dates_dt.append(dt.datetime(year, 12, 1))
    dates_ts.append(datetime_to_date(dates_dt[-1], '-'))
    for doy_idx in range(1, 90):
        dates_dt.append(advance_date(dates_dt[-1], 1, 'forward'))
        dates_ts.append(datetime_to_date(dates_dt[-1], '-'))
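# varimax() isn't defined in this snippet. Below is the standard textbook
# varimax rotation for a (n_features, n_components) loadings matrix -- a
# common formulation, not necessarily the author's exact implementation.
import numpy as np


def varimax(Phi, gamma=1.0, q=20, tol=1e-6):
    p, k = Phi.shape
    R = np.eye(k)
    d = 0.0
    for _ in range(q):
        d_old = d
        Lambda = Phi @ R
        # SVD of the varimax criterion gradient.
        u, s, vh = np.linalg.svd(
            Phi.T @ (Lambda ** 3 - (gamma / p) * Lambda @ np.diag(np.diag(Lambda.T @ Lambda))))
        R = u @ vh
        d = np.sum(s)
        if d_old != 0 and d / d_old < 1 + tol:
            break
    return Phi @ R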
from sklearn.decomposition import PCA


def perform_PCA(data, number_of_components):
    pca = PCA(n_components=number_of_components)
    # Transpose so the result is (number_of_components, n_samples).
    return pca.fit_transform(data).transpose()
def perform_PCA(data, number_of_components):
    pca = PCA(n_components=number_of_components)
    # Returns (n_samples, number_of_components), the untransposed
    # counterpart of the variant above.
    return pca.fit_transform(data)
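# Quick shape check contrasting the two perform_PCA variants with synthetic
# data (the array sizes here are arbitrary, for illustration only):
import numpy as np
from sklearn.decomposition import PCA

X = np.random.rand(100, 10)  # 100 samples, 10 features
proj = PCA(n_components=3).fit_transform(X)
print(proj.shape)              # (100, 3): samples x components
print(proj.transpose().shape)  # (3, 100): components x samples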
print('PROCESSING VARIABLE ' + var_name)
print('Extracting PCA data from index files')
LIVNEH_PCA_data, LIVNEH_lons, LIVNEH_lats = get_LIVNEH_PCA_data(var_name, livneh_years, livneh_data_dir)
LOCA_PCA_data, LOCA_lons, LOCA_lats = get_LOCA_PCA_data(rcp, var_name, loca_years, loca_data_dir)
num_lons = LIVNEH_lons.shape[0]
num_lats = LIVNEH_lats.shape[0]
print('LIVNEH DATA MATRIX ' + str(LIVNEH_PCA_data.shape))
print('LOCA DATA MATRIX ' + str(LOCA_PCA_data.shape))
#print('Minutes elapsed: ' + str((dt.datetime.now() - start_time).total_seconds() / 60.0))

print('Computing component matrix')
num_comps = 6
comp_indices = []
pca = PCA(n_components=num_comps)
X_pca = pca.fit_transform(LIVNEH_PCA_data)             # (n_samples, num_comps)
# Projection of the original data back onto the component space
corr_array = pca.inverse_transform(X_pca)              # (n_samples, n_grid_cells)
components = pca.components_.transpose()               # (n_grid_cells, num_comps)
print('VARIANCE EXPLAINED:')
print(pca.explained_variance_ratio_)
rotated_components = varimax(components).transpose()   # (num_comps, n_grid_cells)

# Dec 1 plus the following 89 days for each year (a 90-day season).
dates_dt = []
dates_ts = []
for year in loca_years:
    dates_dt.append(dt.datetime(year, 12, 1))
    dates_ts.append(datetime_to_date(dates_dt[-1], '-'))
    for doy_idx in range(1, 90):
        dates_dt.append(advance_date(dates_dt[-1], 1, 'forward'))
        dates_ts.append(datetime_to_date(dates_dt[-1], '-'))
print(ddd.shape)
ddd_scaled = scale(ddd)
print(ddd_scaled.shape)

pca = PCA(n_components=300)
a = pca.fit_transform(ddd_scaled)  # fit and project in one pass
print(pca.explained_variance_ratio_)

# Keep the first seven components as new columns.
for i in range(7):
    data["PCA" + str(i + 1)] = a[:, i]
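# A common follow-up (a sketch, not part of the original script): a scree-style
# plot of the cumulative explained-variance ratio, to judge how many of the
# 300 fitted components carry most of the signal.
import matplotlib.pyplot as plt

plt.plot(range(1, len(pca.explained_variance_ratio_) + 1),
         pca.explained_variance_ratio_.cumsum(), marker='.')
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')
plt.show()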
PCA_data, lons, lats = get_PCA_data(var_name, years, data_dir)
num_lons = lons.shape[0]
num_lats = lats.shape[0]
print('DATA MATRIX ' + str(PCA_data.shape))

# Scaling the PCA data gives a zero/NaN/infinity error, so it stays disabled:
#PCA_data = scale_linear_bycolumn(PCA_data, high=1.0, low=0.0)
#print('Minutes elapsed: ' + str((dt.datetime.now() - start_time).total_seconds() / 60.0))

print('Computing component matrix')
num_comps = 6
comp_indices = []
pca = PCA(n_components=num_comps)
X_pca = pca.fit_transform(PCA_data)                    # (n_samples, num_comps)
# Projection of the original data back onto the component space
corr_array = pca.inverse_transform(X_pca)              # (n_samples, n_grid_cells)
components = pca.components_.transpose()               # (n_grid_cells, num_comps)
print('VARIANCE EXPLAINED:')
print(pca.explained_variance_ratio_)
rotated_components = varimax(components).transpose()   # (num_comps, n_grid_cells)

# Dec 1 plus the following 89 days for each year (a 90-day season).
dates_dt = []
dates_ts = []
for year in years:
    dates_dt.append(dt.datetime(year, 12, 1))
    dates_ts.append(datetime_to_date(dates_dt[-1], '-'))
    for doy_idx in range(1, 90):
        dates_dt.append(advance_date(dates_dt[-1], 1, 'forward'))
        dates_ts.append(datetime_to_date(dates_dt[-1], '-'))