/
feature_gen.py
279 lines (210 loc) · 8.78 KB
/
feature_gen.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
# Script to do generate features from SN light curve data
import os
import pywt
import random
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.gaussian_process.kernels import RBF, WhiteKernel, ExpSineSquared
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.decomposition import PCA
# ignore UserWarning messages during wavelet decomposition
warnings.filterwarnings("ignore", category=UserWarning)
# Method to do GPR fitting of lightcurves, in one band
# Take in photometric time-series data:
# - A list of observations in one filter
#
# Return:
# - [x,y,yerr], where x is the phase
# and y is the GPR fit of the light curve, and
# yerr is the error in y
def gaussian_regfit(lc_tab, flt):
# list to hold GPR fit variables
lcfit = []
lc = lc_tab
# Use only data with SNR > 1
lc = lc.loc[lc['SNR'] > 1]
# Load in values from observation
t = lc['NORM_T']
flux = lc['FLUXCAL']
flux_err = lc['FLUXCALERR']
#flt = lc['FLT'].unique()[0]
x_in = t
y_in = flux
y_err = flux_err
x_min = min(t) - 10
x_max = max(t) + 10
# Mesh the input space for evaluations of the real function, the prediction and
# its MSE
x_space = np.atleast_2d(np.linspace(x_min, x_max, 100)).T
x_fit = np.atleast_2d(x_in).T
# Define RBF kernel
k_rbf = RBF(length_scale=10, length_scale_bounds=(5., 1e2))
# Define sine kernel
k_sine = ExpSineSquared(length_scale=1e2, length_scale_bounds=(5., 1e2), periodicity=1e2, periodicity_bounds=(1e2, 1e4))
# Define white noise kernel
k_noise = WhiteKernel(noise_level=0.1, noise_level_bounds=(1e-3, 1e3))
kernel = 1.0*k_rbf + 1.0*(k_rbf*k_sine) + k_noise
'''
# Things used for plotting
kernel_label = 'RBF + RBF*Sine + noise'
mean_colors = ['#000099', '#b30000', '#006600']
var_colors = ['#9999ff', '#ff8080', '#66ff66']
'''
gpr = GaussianProcessRegressor(kernel=kernel, alpha=y_err, n_restarts_optimizer=10)
# Fit to data using Maximum Likelihood Estimation of the parameters
gpr.fit(x_fit, y_in)
# Make the prediction on the meshed x-axis (ask for MSE as well)
y_pred, y_pred_sigma = gpr.predict(x_space, return_std=True)
# Store fitted lightcurve into holding list
lcfit.append([x_space.flatten(), y_pred, y_pred_sigma])
'''
# Get log likelihood and hyperparameters
log_likelihood = np.round(gpr.log_marginal_likelihood(),2)
hyper_params = gpr.kernel_
params = hyper_params.get_params()
'''
return lcfit
# Method to write GPR fit data to .csv file
def tabulate_gpr(lc_tab, obs_filter, sn_id, sn_type, save_directory):
#print(lc_tab[0])
# Extract information
t = lc_tab[0]
flux = lc_tab[1]
flux_err = lc_tab[2]
# Make column for SN type
sn_typecol = [sn_type] * len(t)
# Create pandas dataframe for GPR fit values
gp_data = {'phase' : t, 'flux_pred' : flux, 'flux_pred_sigma' : flux_err, 'type' : sn_typecol}
gp_df = pd.DataFrame(data=gp_data)
filename = sn_id + '_gprfit_{0}.csv'.format(obs_filter)
gp_df.to_csv(save_directory + filename, header=True, index=False)
# Methods to do wavelet decompostion on lightcurves
# Method to do 2-level wavelet decomposition on 2d data (epoch, mag),
# returns wavelet coefficients, a 2D array of wavelet coefficients
def get_wavedec_2d(x_in, y_in, wavelet):
coeffs = pywt.wavedec2([x_in, y_in], wavelet, level=2)
#print('Decomposition level: {0}'.format(len(coeffs)-1))
cA2 = coeffs[0]
cH2, cV2, cD2 = coeffs[1][0], coeffs[1][1], coeffs[1][2]
cH1, cV1, cD1 = coeffs[2][0], coeffs[2][1], coeffs[2][2]
coeffs_vector = [cA2, cH2, cV2, cD2, cH1, cV1, cD1]
return coeffs_vector
# Method to do 2-level stationary wavelet decomposition on 1d data (flux as a function of time),
# returns wavelet coefficients
def get_stwavedec_1d(y_in, wavelet):
coeffs = pywt.swt(y_in, wavelet, level=2)
return coeffs
# Method to flatten vector of coefficients
def flatten_coeffs(coeff_vector):
#flattened_arr = [coeff.flatten() for coeff in coeff_vector]
flattened_arr = []
for coeff in coeff_vector:
flat_coeffs = coeff.flatten()
flattened_arr.extend(flat_coeffs)
return flattened_arr
# Method to create a dictionary of coeff labels and values and type
def mk_coeff_dict(coeff_vector, sn_id, sn_type):
len_cv = len(coeff_vector)
coeff_labels = ['coeff{0}'.format(i) for i in range(len_cv)]
coeff_dict = {}
for j, label in enumerate(coeff_labels):
coeff_dict[label] = coeff_vector[j]
coeff_dict['type'] = sn_type
coeff_dict['id'] = sn_id
return coeff_dict
# Path to SPCC lightcurves
lc_path = '/local/php18ufb/backed_up_on_astro3/SPCC/lightcurves_withkeys/'
# Path to Training set lightcurves (not needed?)
trainset_lcpath = '/local/php18ufb/backed_up_on_astro3/SPCC/trainset_lightcurves/'
# Path to training set gpr fits
trainset_gprpath = '/local/php18ufb/backed_up_on_astro3/SPCC/trainset_gprfit/'
# Test save directory for GPR fits
gpr_path = '/local/php18ufb/backed_up_on_astro3/SPCC/test_gprfit_350/'
##############################################################################
# get list of all lightcurves
lc_files = sorted(os.listdir(lc_path))
# Get a random sample of lightcurves for testing
#lc_tsample = random.sample(lc_files, 350)
# Load full SPCC SN catalog as a dataframe
sn_cat = pd.read_csv('spcc_sn_keycat.csv')
# Load SPCC SN catalog for the training set
train_sn_cat = pd.read_csv('SPCC_spectypes.csv')
test_sn_cat = pd.read_csv('SPCC_no_spectypes.csv')
# Get filenames of training set SNe
train_snid = train_sn_cat['SNID'].tolist()
train_lcs = [train_id + '.csv' for train_id in train_snid]
# Get filenames of test set SNe
test_snid = test_sn_cat['SNID'].tolist()
test_lcs = [test_id + '.csv' for test_id in test_snid]
# --------------------
# Testing with 10 training set supernovae
#lc_tsample = random.sample(train_lcs, 10)
# Select wavelet family
wavelet = input('Wavelet family: ')
# List of wavelet coefficients of each SN
sn_features = []
n_rej = 0 # Track no. of rejected lightcurves
for i, sn in enumerate(train_lcs):
# Display progress in terminal
print('Getting features from supernova light curves ... {0}/{1}'.format((i+1), len(train_lcs)), end='\r')
# Load photometric data as pandas dataframe
lc_df = pd.read_csv(lc_path + sn)
sn_id = sn.strip('.csv')
# Get SN type
sn_data = sn_cat.loc[sn_cat['SNID'] == sn_id]
sn_type = sn_data.iloc[0]['TYPE']
# Get list of filters
flts = lc_df['FLT'].unique()
# For each filter, do:
# - Gaussian Regression fit, save fit to a .csv file
# - Wavelet decomposition, store wavelet coefficients
# in an array outside of per filter loop
wt_coeffs = []
# If no_obs = True in any filter, then exit the loop
no_obs = False
for f in flts:
# Select all observations in a single filter
lc = lc_df.loc[lc_df['FLT'] == f]
# Exit loop if there is no observations (Catch exception)
try:
# Do Gaussian Regression fit
gpr_fit = gaussian_regfit(lc, f)
# Save GPR fit to .csv file
tabulate_gpr(gpr_fit[0], f, sn_id, sn_type, trainset_gprpath)
# Get information needed for wavelet decomposition:
# - phase
# - predicted flux
phase = gpr_fit[0][0]
flux_pred = gpr_fit[0][1]
# Do wavelet decomposition
#wvlets = get_wavedec_2d(phase, flux_pred, wavelet) - OLD
stationary_coeffs = get_stwavedec_1d(flux_pred, wavelet)
sn_coeff_vector = np.array(stationary_coeffs).flatten()
wt_coeffs.append(sn_coeff_vector)
#wvlets_vec = flatten_coeffs(wvlets) - OLD
#wt_coeffs.append(wvlets_vec) - OLD
except ValueError:
no_obs = True
n_rej += 1
break
# Do nothing if any of the filters contain no observations
if no_obs == True:
print('\nNo observations in {0} band for {1}'.format(f, sn_id), end = '\n')
else:
# Combine wavelet coefficients for different filters into one list
wt_coeffs = np.array(wt_coeffs).flatten()
# Convert NaN to zero
wt_coeffs = np.nan_to_num(wt_coeffs)
features_dict = mk_coeff_dict(wt_coeffs, sn_id, sn_type)
sn_features.append(features_dict)
print()
print('No. of rejected SNe: {0}'.format(n_rej))
# Create dataframe containing the wavelet coefficients of all SNe
#print(len(sn_wvlets), len(sn_typescol))
print('Shape of wavelet coefficients table: ', np.shape(sn_features))
sn_features_df = pd.DataFrame(sn_features)
#print(sn_features)
# Save all wavelet coefficients to a .csv file
sn_features_df.to_csv('sn_wvlet_coeffs_trainset.csv')