def __getitem__(self, index):
    """Return one batch: input tensor ``x`` (and target ``y`` when a truth file is configured).

    ``x`` stacks each configured variable along a new trailing channel axis,
    with NaNs replaced by 0 and optional per-variable standardization, then
    is cropped to ``(len(dates), *self.in_size, len(self.varnames))``.

    Returns:
        ``x`` alone, or ``(x, y)`` when ``self.truth_filename`` is set.
    """
    dates = self.get_dates(index)

    def _channel(var):
        # Select this variable for the batch dates; optionally standardize,
        # zero-fill missing values, and add a trailing channel axis.
        data = var.sel(time=dates)
        if self.standardize:
            data = standardize(data, var.name)
        return np.expand_dims(data.fillna(0).values, -1)

    x = np.concatenate([_channel(v) for v in self.variables], axis=-1)
    # Some variables carry an extra singleton dimension (presumably a level
    # axis -- TODO confirm); squeeze it so x is always 4-D.
    if x.ndim == 5:
        x = x.squeeze(axis=1)
    x = crop_center(crop_boundaries(x),
                    (len(dates), *self.in_size, len(self.varnames)))

    if self.truth_filename:
        # The truth grid's second axis is flipped to match x's orientation.
        y = self.truth_variable.sel(time=dates)[:, ::-1, :].values
        # Decide the encoding once instead of the original duplicated
        # `if self.onehot` checks: transform and channel count together.
        if self.onehot:
            y = to_categorical(y, 5)
            n_channels = 5
        else:
            y = np.expand_dims(y, axis=-1)
            n_channels = 1
        y = crop_center(crop_boundaries(y),
                        (len(dates), *self.in_size, n_channels))
        return x, y
    return x
def PCA(encodings): """ Performs PCA on the encodings. First, it calculates the mean for the data set along the dimensions. Then it centers the data by subtracting the mean from each data point. Then, it scales the data into a unit vector. This process is called standardization. Then it calculates the eigenvalues/eigenvectors using eigenvalue decomposition on the covariance matrix """ print "Calculated normalization" encodings = normalize.standardize(encodings) np.save(prefix + str(capacity) + "normalized", encodings) print "calculating cov" cov_mat = np.cov(encodings) np.save(prefix + str(capacity) + "cov_matrix", cov_mat) print "calculating eig" eig_vals, eig_vecs = np.linalg.eig(cov_mat) idx = eig_vals.argsort()[::-1] eig_vals = eig_vals[idx] eig_vecs = eig_vecs[:,idx] np.save(prefix + str(capacity) + "eig_vals", eig_vals) np.save(prefix + str(capacity) + "eig_vecs", eig_vecs) return eig_val, eig_vec
# Imports grouped stdlib / third-party / local.
import time
from pprint import pprint

import numpy as np  # added: np is used below but was never imported here
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
from sklearn import metrics
from sklearn.metrics import roc_curve, auc

from graph_pca_info import plot_pca_info
from normalize import norm, standardize

# import training and testing data
filePath = "X.dat"
# np.loadtxt accepts a path directly; the previous explicit open() shadowed
# the `file` builtin and leaked the handle (it was never closed).
allData = np.loadtxt(filePath, delimiter=',')

# first half of the rows is the training split
n_train = int(len(allData) / 2)
# features are every column but the last; standardize them
Xtrain = standardize(allData[0:n_train, 0:-1])
# the labels are in the last column
ytrain = allData[0:n_train, -1]

# n_components for dimensionality reduction, chosen from graph_pca_info plots
n_components = 40
# n_neighbors for knn; sklearn's default value is 5
knn_neighbors = 5

# dimensionality reduction; change the method for whichever method you choose
# lle(n_components, neighbors=(n_components * (n_components + 3) / 2) + 1, hessian=True)
# lle(n_components, neighbors=(n_components * (n_components + 3) / 2) + 1)
# componentsPca(n_components)
# kpca(n_components)
# NOTE(review): time.clock() is deprecated (removed in Python 3.8); kept
# because the matching end-of-timer call lies outside this chunk.
start_time = time.clock()
k_eig_vals = eig_vals[:top_k] k_eig_vecs = eig_vecs[:top_k] # for i in xrange(len(k_eig_vals)): # print i, k_eig_vals[i] normalized = encodings final = np.dot(k_eig_vecs, normalized) # Need to pass in each encoding row-wise final_t = np.transpose(final) clf = svm.SVC() clf.fit(final_t, stars) # Save our model joblib.dump(clf, "pca_clf.pkl") print "Training score: ", clf.score(final_t, stars) # Test it on our training set test_encodings, test_stars = normalize.read_data("test.json", dictionary, 0, 10000) # Normalize test_encodings = normalize.standardize(test_encodings) final_test = np.dot(k_eig_vecs, test_encodings) final_test_t = np.transpose(final_test) print "Testing score: ", clf.score(final_test_t, test_stars)