Generation_Stage.py
import Dataset_Generator as dg
import Evaluation as evaluation
import MyLLE as lle
import numpy as np
import pickle as pk
import time
def run():
    # The following code is for Stage 1: dataset creation and dimensionality reduction.
    # Please note that Stage 1 must be completed before Stage 2
    # (a minimal entry point that runs this stage appears at the end of this file).
    localtime = time.asctime(time.localtime(time.time()))
    print("Local current time :", localtime)
    print("Stage 1: creating the five artificial datasets and reading the MNIST natural dataset, then generating datasets with reduced dimensionality using LLE and PCA\n")
    # Note: running 5000 samples may take a very long time, while 1000 samples takes only around 30 minutes.
    print("Now generating the five artificial datasets and reading the MNIST dataset")
    swiss_roll_dataset, swiss_roll_labels = dg.get_swiss_roll_dataset_with_labels2(5000)
    helix_dataset, helix_labels = dg.get_helix_dataset_with_label2(5000)
    twin_peaks_dataset, twin_peak_labels = dg.get_twin_peaks_with_label2(5000)
    broken_swiss_dataset, broken_swiss_labels = dg.get_broken_swiss_roll_dataset_with_label2(5000)
    hd_dataset, hd_labels = dg.get_hd_dataset_with_label2(5000)
    MNIST_images, MNIST_labels = evaluation.get_natural_dataset_samples(5000)
    original_datasets = {"swiss_roll": swiss_roll_dataset, "helix": helix_dataset, "twin_peaks": twin_peaks_dataset,
                         "broken_swiss_roll": broken_swiss_dataset, "hd": hd_dataset, "MNIST": MNIST_images}
    pk.dump(original_datasets, open('original_datasets.p', 'wb'))
    print("Finished! \n")
    print("Now getting labels for all datasets")
    datasets_labels = {"swiss_roll": swiss_roll_labels, "helix": helix_labels, "twin_peaks": twin_peak_labels,
                       "broken_swiss_roll": broken_swiss_labels, "hd": hd_labels, "MNIST": MNIST_labels}
    pk.dump(datasets_labels, open('datasets_labels.p', 'wb'))
    print("Finished! \n")
    # The following code reduces dimensionality using PCA and LLE.
    print("Now using PCA to reduce dimensionality of each dataset")
    pca_reduced_swiss_roll = evaluation.pca_dim_reduction(swiss_roll_dataset, 2)
    pca_reduced_helix = evaluation.pca_dim_reduction(helix_dataset, 1)
    pca_reduced_twin_peaks = evaluation.pca_dim_reduction(twin_peaks_dataset, 2)
    pca_reduced_broken_swiss = evaluation.pca_dim_reduction(broken_swiss_dataset, 2)
    pca_reduced_hd = evaluation.pca_dim_reduction(hd_dataset, 2)
    pca_reduced_MNIST_images = evaluation.pca_dim_reduction(MNIST_images, 20)
    pca_reduced_datasets = {"swiss_roll": pca_reduced_swiss_roll, "helix": pca_reduced_helix, "twin_peaks": pca_reduced_twin_peaks,
                            "broken_swiss_roll": pca_reduced_broken_swiss, "hd": pca_reduced_hd, "MNIST": pca_reduced_MNIST_images}
    pk.dump(pca_reduced_datasets, open('pca_reduced_datasets.p', 'wb'))
    print("Finished! \n")
    # This list collects the LLE results for each value of the parameter k:
    # index i holds the result for k = i + 5 (see the lookup helper sketched after this function).
    lle_reduced_datasets_under_diff_k = []
    print("Now using LLE to reduce dimensionality of each dataset. Note that the parameter k ranges from 5 to 15, so this step will take a while")
    for k in range(5, 16):
        lle_reduced_swiss_roll = lle.locally_linear_embedding(np.array(swiss_roll_dataset, np.float64), k, 2)[0].tolist()
        lle_reduced_helix = lle.locally_linear_embedding(np.array(helix_dataset, np.float64), k, 1)[0].tolist()
        lle_reduced_twin_peaks = lle.locally_linear_embedding(np.array(twin_peaks_dataset, np.float64), k, 2)[0].tolist()
        lle_reduced_broken_swiss = lle.locally_linear_embedding(np.array(broken_swiss_dataset, np.float64), k, 2)[0].tolist()
        lle_reduced_hd = lle.locally_linear_embedding(np.array(hd_dataset, np.float64), k, 5)[0].tolist()
        lle_reduced_MNIST_images = lle.locally_linear_embedding(np.array(MNIST_images, np.float64), k, 20)[0].tolist()
        curr_k_results = {"swiss_roll": lle_reduced_swiss_roll, "helix": lle_reduced_helix,
                          "twin_peaks": lle_reduced_twin_peaks,
                          "broken_swiss_roll": lle_reduced_broken_swiss, "hd": lle_reduced_hd,
                          "MNIST": lle_reduced_MNIST_images}
        lle_reduced_datasets_under_diff_k.append(curr_k_results)
    pk.dump(lle_reduced_datasets_under_diff_k, open('lle_reduced_datasets_under_diff_k.p', 'wb'))
    print("Finished! \n")
    localtime = time.asctime(time.localtime(time.time()))
    print("Local current time :", localtime)
    # ************************ End of Stage 1
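# Illustration only (not part of the original pipeline): the helper below shows how to use the
# k-indexing convention from run(), where index i of the pickled list holds the result for k = i + 5.
# The file name matches the dump above; the helper name load_lle_result_for_k is ours, not the author's.
def load_lle_result_for_k(k):
    """Return the dict of LLE-reduced datasets computed with neighbourhood size k (5 <= k <= 15)."""
    results = pk.load(open('lle_reduced_datasets_under_diff_k.p', 'rb'))
    return results[k - 5]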
def generate_original_datasets():
    # Generates the five artificial datasets, reads the MNIST samples, and pickles them all.
    print("Now generating the five artificial datasets and reading the MNIST dataset")
    swiss_roll_dataset, swiss_roll_labels = dg.get_swiss_roll_dataset_with_labels2(5000)
    helix_dataset, helix_labels = dg.get_helix_dataset_with_label2(5000)
    twin_peaks_dataset, twin_peak_labels = dg.get_twin_peaks_with_label2(5000)
    broken_swiss_dataset, broken_swiss_labels = dg.get_broken_swiss_roll_dataset_with_label2(5000)
    hd_dataset, hd_labels = dg.get_hd_dataset_with_label2(5000)
    MNIST_images, MNIST_labels = evaluation.get_natural_dataset_samples(5000)
    original_datasets = {"swiss_roll": swiss_roll_dataset, "helix": helix_dataset, "twin_peaks": twin_peaks_dataset,
                         "broken_swiss_roll": broken_swiss_dataset, "hd": hd_dataset, "MNIST": MNIST_images}
    pk.dump(original_datasets, open('original_datasets.p', 'wb'))
    print("Finished! \n")
def perform_pca_to_original_datasets():
    # Loads the pickled original datasets and reduces each one with PCA to its target dimensionality
    # (a small shape-check sketch follows this function).
    print("Now using PCA to reduce dimensionality of each dataset")
    original_datasets = pk.load(open('original_datasets.p', 'rb'))
    swiss_roll_dataset = original_datasets["swiss_roll"]
    helix_dataset = original_datasets["helix"]
    twin_peaks_dataset = original_datasets["twin_peaks"]
    broken_swiss_dataset = original_datasets["broken_swiss_roll"]
    hd_dataset = original_datasets["hd"]
    MNIST_images = original_datasets["MNIST"]
    pca_reduced_swiss_roll = evaluation.pca_dim_reduction(swiss_roll_dataset, 2)
    pca_reduced_helix = evaluation.pca_dim_reduction(helix_dataset, 1)
    pca_reduced_twin_peaks = evaluation.pca_dim_reduction(twin_peaks_dataset, 2)
    pca_reduced_broken_swiss = evaluation.pca_dim_reduction(broken_swiss_dataset, 2)
    pca_reduced_hd = evaluation.pca_dim_reduction(hd_dataset, 2)
    pca_reduced_MNIST_images = evaluation.pca_dim_reduction(MNIST_images, 20)
    pca_reduced_datasets = {"swiss_roll": pca_reduced_swiss_roll, "helix": pca_reduced_helix, "twin_peaks": pca_reduced_twin_peaks,
                            "broken_swiss_roll": pca_reduced_broken_swiss, "hd": pca_reduced_hd, "MNIST": pca_reduced_MNIST_images}
    pk.dump(pca_reduced_datasets, open('pca_reduced_datasets.p', 'wb'))
    print("Finished! \n")
def perform_lle_to_orginal_datasets():
    # Loads the pickled original datasets and reduces each one with LLE for every neighbourhood size k in [5, 15].
    # Index i of the resulting list holds the result for k = i + 5 (see the lookup helper sketched after run()).
    lle_reduced_datasets_under_diff_k = []
    original_datasets = pk.load(open('original_datasets.p', 'rb'))
    swiss_roll_dataset = original_datasets["swiss_roll"]
    helix_dataset = original_datasets["helix"]
    twin_peaks_dataset = original_datasets["twin_peaks"]
    broken_swiss_dataset = original_datasets["broken_swiss_roll"]
    hd_dataset = original_datasets["hd"]
    MNIST_images = original_datasets["MNIST"]
    print("Now using LLE to reduce dimensionality of each dataset. Note that the parameter k ranges from 5 to 15, so this step will take a while")
    for k in range(5, 16):
        lle_reduced_swiss_roll = lle.locally_linear_embedding(np.array(swiss_roll_dataset, np.float64), k, 2)[0].tolist()
        lle_reduced_helix = lle.locally_linear_embedding(np.array(helix_dataset, np.float64), k, 1)[0].tolist()
        lle_reduced_twin_peaks = lle.locally_linear_embedding(np.array(twin_peaks_dataset, np.float64), k, 2)[0].tolist()
        lle_reduced_broken_swiss = lle.locally_linear_embedding(np.array(broken_swiss_dataset, np.float64), k, 2)[0].tolist()
        lle_reduced_hd = lle.locally_linear_embedding(np.array(hd_dataset, np.float64), k, 5)[0].tolist()
        lle_reduced_MNIST_images = lle.locally_linear_embedding(np.array(MNIST_images, np.float64), k, 20)[0].tolist()
        curr_k_results = {"swiss_roll": lle_reduced_swiss_roll, "helix": lle_reduced_helix,
                          "twin_peaks": lle_reduced_twin_peaks,
                          "broken_swiss_roll": lle_reduced_broken_swiss, "hd": lle_reduced_hd,
                          "MNIST": lle_reduced_MNIST_images}
        lle_reduced_datasets_under_diff_k.append(curr_k_results)
    pk.dump(lle_reduced_datasets_under_diff_k, open('lle_reduced_datasets_under_diff_k.p', 'wb'))
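# A minimal entry point, assuming this script is meant to be run directly (an assumption; the original
# shows no guard here): run() performs the whole of Stage 1, while generate_original_datasets(),
# perform_pca_to_original_datasets() and perform_lle_to_orginal_datasets() can redo individual steps.
if __name__ == "__main__":
    run()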