-
Notifications
You must be signed in to change notification settings - Fork 0
/
simulation.py
109 lines (88 loc) · 3.54 KB
/
simulation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
from sklearn.cluster.k_means_ import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.datasets import make_blobs
import numpy as np
import pandas as pd
from multiprocessing import Pool
import re
import readline
def print_progress_bar(iteration, total, prefix='', suffix='', decimals=2, length=50, fill='█'):
    '''
    Auxiliary function. Draws a one-line text progress bar on stdout; call it once per loop iteration.

    The line is written between two carriage returns so that successive calls
    overwrite each other in a terminal. When ``iteration == total`` a newline
    is printed afterwards so later output starts on a fresh line.

    :param iteration: current iteration
    :param total: total number of iterations; 0 is rendered as a finished bar
    :param prefix: string printed before the bar
    :param suffix: string printed after the percentage
    :param decimals: number of decimal places shown in the percentage
    :param length: width of the bar in characters
    :param fill: character used for the completed part of the bar
    :return: None
    '''
    if total:
        fraction = iteration / float(total)
        filled_length = int(length * iteration // total)
    else:
        # No work to do counts as "done"; dividing by total here would raise
        # ZeroDivisionError.
        fraction = 1.0
        filled_length = length

    percent = "{0:.{1}f}".format(100 * fraction, decimals)
    bar = fill * filled_length + '-' * (length - filled_length)
    print('\r%s |%s| %s%% %s' % (prefix, bar, percent, suffix), end='\r')

    # Print a new line on completion so the finished bar stays visible.
    if iteration == total:
        print()
def simulation(n, n_clusters, k_range, dim, runs=100):
    '''
    Score k-means clusterings of synthetic blob data with the silhouette coefficient.

    For each of ``runs`` repetitions a fresh data set is drawn with
    ``make_blobs`` (``cluster_std=0.1``, ``center_box=(-1.0, 1.0)``). For every
    k in the inclusive range ``k_range`` a ``KMeans(n_clusters=k,
    random_state=0)`` model is fitted and the average silhouette score of its
    labelling is recorded.

    :param n: number of samples in each generated data set
    :param n_clusters: number of blob centres used to generate the data
    :param k_range: pair ``(k_low, k_hi)``; both ends are included
    :param dim: number of features of the generated data
    :param runs: number of data sets to generate
    :return: DataFrame with columns ``n``, ``n_clusters``, ``k``, ``dim``,
        ``avg_score`` and one row per (run, k). ``avg_score`` is -1 when
        fitting or scoring raised ``ValueError``.
    '''
    k_low, k_hi = k_range
    rows = []
    for _ in range(runs):
        # Only the points are clustered; the generating labels are not used.
        data, _true_labels = make_blobs(n_samples=n, n_features=dim, centers=n_clusters,
                                        cluster_std=0.1, center_box=(-1.0, 1.0))
        for k in range(k_low, k_hi + 1):
            model = KMeans(n_clusters=k, random_state=0)
            try:
                predicted = model.fit_predict(data)
                avg_score = silhouette_score(data, predicted)
            except ValueError:
                # Per the scikit-learn docs the silhouette score needs
                # 2 <= n_labels <= n_samples - 1, so e.g. k=1 cannot be scored.
                # Record the -1 sentinel instead of aborting the whole sweep.
                avg_score = -1.0
            rows.append([n, n_clusters, k, dim, avg_score])
    return pd.DataFrame(rows, columns=['n', 'n_clusters', 'k', 'dim', 'avg_score'])
def run_sim(n, n_clusters, k_range, dim_range, runs, file, processes=3):
    '''
    Run ``simulation`` once per dimension in a process pool and save the results as CSV.

    One task is created for every dimension in the inclusive range
    ``dim_range``. Results are collected in submission order while a progress
    bar is updated, then concatenated and written to ``data/<file>``.

    :param n: number of samples per generated data set (passed to ``simulation``)
    :param n_clusters: number of generated clusters (passed to ``simulation``)
    :param k_range: pair ``(k_low, k_hi)`` (passed to ``simulation``)
    :param dim_range: pair ``(dim_min, dim_max)``; both ends are included
    :param runs: number of runs per dimension (passed to ``simulation``)
    :param file: name of the CSV file to create inside the ``data`` directory
    :param processes: number of worker processes in the pool
    :return: None
    :raises ValueError: if ``dim_range`` contains no dimensions
    '''
    import os

    dim_min, dim_max = dim_range
    if dim_min > dim_max:
        # An empty range would otherwise crash later with an unrelated error.
        raise ValueError(
            "dim_range is empty: {} > {}".format(dim_min, dim_max))

    # Make sure the output location exists *before* the (potentially long)
    # simulation runs, so results are not lost to a missing directory.
    out_path = "data/{}".format(file)
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

    tasks = [(n, n_clusters, k_range, dim, runs)
             for dim in np.arange(dim_min, dim_max + 1)]
    total = len(tasks)

    # Progress bar stuff
    prefix = "Simulation"
    suffix = "Complete"
    print_progress_bar(0, total, prefix=prefix, suffix=suffix)

    # Send our tasks to the process pool; as they complete, append their results to data
    data = []
    with Pool(processes=processes) as pool:
        pending = [pool.apply_async(simulation, args=t) for t in tasks]
        for done, result in enumerate(pending, start=1):
            # get() blocks until this task finishes and re-raises worker errors.
            data.append(result.get())
            print_progress_bar(done, total, prefix=prefix, suffix=suffix)

    print("Writing data...")
    df = pd.concat(data)
    df.to_csv(out_path, sep=',', index=False)
if __name__ == "__main__":
    # Prompt for each simulation setting in turn, then launch the sweep.
    # Malformed input surfaces as ValueError/IndexError from the conversions.
    n = int(input("Give value for n\n>>>> "))
    n_clusters = int(input("Give a fixed number of actual data clusters\n>>> "))

    # Ranges are typed as comma-separated integers; the first two are used.
    k_fields = input("Give a range for k values (comma separated)\n>>>> ").split(",")
    k_bounds = [int(field) for field in k_fields]
    k_range = (k_bounds[0], k_bounds[1])

    dim_fields = input("Give a range for dimensions (comma separated)\n>>>> ").split(",")
    dim_bounds = [int(field) for field in dim_fields]
    dim = (dim_bounds[0], dim_bounds[1])

    runs = int(input("Give numeber of runs\n>>>> "))
    file = input("Enter a file name for save\n>>>> ")

    run_sim(n, n_clusters, k_range, dim, runs, file)