/
pca_module.py
60 lines (49 loc) · 1.93 KB
/
pca_module.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
# Name: Shipra Shivaji Shinde
# ID: 800974877
#
# Principal Component Analysis (PCA)
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
def perform_pca(file_path):
    """Run PCA on a CSV dataset and scatter-plot the first two components.

    The file is parsed as comma-separated numeric data, its first row is
    discarded, and the remaining rows are standardized with
    ``StandardScaler``.  The covariance matrix of the standardized data is
    eigen-decomposed, the two eigenvectors with the largest absolute
    eigenvalues form the projection matrix ``W``, and the projected data
    ``Y = X_std . W`` is drawn as a scatter plot with the axes crossing at
    the origin.  The covariance matrix, eigenvectors, eigenvalues, ``W``
    and ``Y`` are printed along the way.

    Args:
        file_path: Location of the CSV file, passed straight to
            ``np.genfromtxt``.  The data must have at least two columns,
            since two components are selected.

    Returns:
        None.  Results are reported through ``print`` and the figure.
    """
    # read the dataset from csv file
    X = np.genfromtxt(file_path, delimiter=",")
    # delete the first row (presumably the header line -- confirm against
    # the input files actually used)
    X = np.delete(X, 0, axis=0)
    X_std = StandardScaler().fit_transform(X)
    n_samples, n_features = X_std.shape

    # Step 1: calculate mean center for all the columns
    mean_vec = np.mean(X_std, axis=0)
    centered = X_std - mean_vec

    # Step 2: calculate cov(x), using the unbiased (n - 1) normalization
    cov_mat = centered.T.dot(centered) / (n_samples - 1)
    print('Covariance matrix \n%s' % cov_mat)

    # Step 3: calculate eigen values and eigen vectors of cov(x)
    eigen_values, eigen_vectors = np.linalg.eig(cov_mat)
    print('Eigenvectors \n%s' % eigen_vectors)
    print('\nEigenvalues \n%s' % eigen_values)

    # Make a list of (|eigenvalue|, eigenvector) tuples; eigenvector i is
    # column i of the matrix returned by np.linalg.eig.
    eig_pairs = [
        (np.abs(value), eigen_vectors[:, i])
        for i, value in enumerate(eigen_values)
    ]
    # Sort from high to low on the eigenvalue magnitude ONLY.  Sorting the
    # bare tuples would compare the eigenvector arrays whenever two
    # magnitudes tie, which raises ValueError (ambiguous array truth value).
    eig_pairs.sort(key=lambda pair: pair[0], reverse=True)

    # Step 4: Projection onto the two leading eigenvectors
    matrix_w = np.hstack((eig_pairs[0][1].reshape(n_features, 1),
                          eig_pairs[1][1].reshape(n_features, 1)))
    print('Matrix W:\n', matrix_w)
    Y = X_std.dot(matrix_w)
    print('Matrix Y:\n', Y)

    # Step 5: Plot the projections for the first and second principal
    # components
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    ax.scatter(Y[:, 0], Y[:, 1])
    # move the left spine (the y-axis line) to x = 0
    ax.spines['left'].set_position('zero')
    # hide the right spine and keep y ticks on the left only
    ax.spines['right'].set_color('none')
    ax.yaxis.tick_left()
    # move the bottom spine (the x-axis line) to y = 0
    ax.spines['bottom'].set_position('zero')
    # hide the top spine and keep x ticks on the bottom only
    ax.spines['top'].set_color('none')
    ax.xaxis.tick_bottom()
    fig.show()