# main.py
# Projects a subset of MNIST to 2-D with t-SNE and PCA, clusters each
# projection with DBSCAN, and plots the results.
from readMNIST import readMNIST_CSV
import numpy as np
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from DR_Plot import DR_Plot
from DR_Plot import DR_Plot_black
from sklearn.cluster import DBSCAN
import xlwt
# Import the data from the csv file
label, data = readMNIST_CSV('train.csv')

# Select the first SAMPLES_PER_BLOCK samples out of every BLOCK_SIZE rows.
# NOTE(review): this yields "the first 300 samples of each digit" only if the
# CSV stores each digit as one consecutive run of 6000 rows -- confirm against
# the layout produced for train.csv.
BLOCK_SIZE = 6000
SAMPLES_PER_BLOCK = 300

sel_data = []
sel_label = []
# Guard against label/data length mismatch so indexing never runs past either.
n_rows = min(len(label), len(data))
for start in range(0, n_rows, BLOCK_SIZE):
    # Clamp the window so a short trailing block cannot raise IndexError.
    stop = min(start + SAMPLES_PER_BLOCK, n_rows)
    for row in range(start, stop):
        sel_data.append(data[row])
        # Store the sample's own label rather than the label of the block's
        # first row, so labels stay correct even if a block is not homogeneous.
        sel_label.append(label[row])

# from list to array
sel_label = np.array(sel_label)
sel_data = np.array(sel_data)
# t-SNE (2-D)
# NOTE(review): no random_state is passed, so the embedding may differ from
# run to run; the DBSCAN eps below was presumably tuned by eye -- confirm.
tsne = TSNE(n_components=2, perplexity=30).fit_transform(sel_data)

# PCA (2-D)
# PCA is unsupervised: scikit-learn's fit_transform ignores `y`, so the labels
# are not passed. `pca` is bound directly to the projected array instead of
# first holding the estimator and then being rebound.
pca = PCA(n_components=2).fit_transform(sel_data)

# Clustering
# DBSCAN is fitted separately on each 2-D projection, with its own
# eps / min_samples per projection.
clustering = DBSCAN(eps=4, min_samples=20).fit(tsne)
clustering_pca = DBSCAN(eps=60, min_samples=8).fit(pca)
# Plot
# Each entry is (per-point labels, 2-D points, name tag). The first two use the
# selected sample labels, the last two use the DBSCAN cluster ids.
# NOTE(review): the name tag is presumably the figure title / output file name
# -- confirm in DR_Plot.
labelled_plots = [
    (sel_label, tsne, 'tsne'),
    (sel_label, pca, 'pca'),
    (clustering.labels_, tsne, 'clustering_tsne'),
    (clustering_pca.labels_, pca, 'clustering_pca'),
]
for point_labels, embedding, tag in labelled_plots:
    DR_Plot(point_labels, embedding, tag)

# Same projections again through DR_Plot_black, which takes no labels.
for embedding, tag in [(tsne, 'tsne_black'), (pca, 'pca_black')]:
    DR_Plot_black(embedding, tag)