# GridCluster.py
import itertools

from LoadFiles import load, dist
data = load("flaredata2.txt")

# Number of distinct values along each of the ten grid axes, in record order
# (record[0] .. record[9]).
GRID_SHAPE = (7, 6, 4, 2, 3, 3, 2, 2, 2, 2)
# Amount subtracted from each raw attribute to obtain its zero-based grid
# index: the first three attributes are used unchanged, the other seven have
# 1 subtracted.
GRID_OFFSETS = (0, 0, 0, 1, 1, 1, 1, 1, 1, 1)


def _zero_grid(shape):
    """Return a nested list of zeros whose nesting follows ``shape``."""
    if len(shape) == 1:
        return [0] * shape[0]
    return [_zero_grid(shape[1:]) for _ in range(shape[0])]


# 10 dimensional matrix, to be filled with the count of each potential data
# element.
gridcounts = _zero_grid(GRID_SHAPE)

# Populate the grid: walk down to the innermost list for each record and
# increment its counter.
for record in data:
    *path, last = (
        record[axis] - offset for axis, offset in enumerate(GRID_OFFSETS)
    )
    cell = gridcounts
    for index in path:
        cell = cell[index]
    cell[last] += 1

# Collapse the grid into a flat list of
# [count, coord_0, ..., coord_9] entries.  Axes with more than two values are
# divided by their largest index so that every coordinate lies in [0, 1].
# itertools.product iterates the last axis fastest, i.e. in the same order as
# ten nested loops would.
gridlist = [
    [gridcounts[a][b][c][d][e][f][g][h][j][k],
     a / 6, b / 5, c / 3, d, e / 2, f / 2, g, h, j, k]
    for a, b, c, d, e, f, g, h, j, k in itertools.product(*map(range, GRID_SHAPE))
]

# Sort by count, densest cells first (stable, so ties keep grid order).
gridlist.sort(key=lambda el: el[0], reverse=True)

# Find the noise cut-off: consume whole groups of equally dense cells, from
# the densest downwards, until at least 75% of the records are covered.
# noiseCutOff ends up as the count of the last group consumed;
# previousCutOff / previousCount describe the state one group earlier.
total = len(data)
count = 0
previousCount = 0
cutoff = int(0.75 * total)
noiseCutOff = gridlist[0][0]
previousCutOff = noiseCutOff
i = 0
while count < cutoff and i < len(gridlist):
    previousCutOff = noiseCutOff
    noiseCutOff = gridlist[i][0]
    previousCount = count
    # The bounds check keeps the scan from running off the end of the list
    # when the final group of equal counts reaches the last cell.
    while i < len(gridlist) and gridlist[i][0] == noiseCutOff:
        count += gridlist[i][0]
        i += 1
def NonNoise(data, cutoff):
    """Return the leading run of elements whose count exceeds ``cutoff``.

    ``data`` is consumed in order and iteration stops at the first element
    whose first item is not greater than ``cutoff``; elements after that
    point are never examined.  Consequently only an input ordered by
    descending count yields every element above the cut-off.

    Args:
        data: Iterable of indexable elements; ``ele[0]`` is the count.
        cutoff: Exclusive lower bound on the count.

    Returns:
        A lazy iterator over the qualifying leading elements.
    """
    return itertools.takewhile(lambda ele: ele[0] > cutoff, data)
def addToCluster(nx, clusters, max_dist, *, metric=None):
    """Try to attach grid cell ``nx`` to the nearest existing cluster.

    For every cluster the member closest to ``nx`` is located (coordinates
    are everything after index 0; index 0 is the cell's count).  The clusters
    whose closest member is at the overall smallest distance are the
    candidates.  If that smallest distance is at most ``max_dist``, ``nx`` is
    appended to the candidate whose closest member has the highest count
    (the first such candidate on ties).

    Args:
        nx: Cell to place, laid out as ``[count, coord, coord, ...]``.
        clusters: List of clusters, each a non-empty list of cells; the
            chosen cluster is mutated in place.
        max_dist: Largest distance at which ``nx`` may join a cluster.
        metric: Optional two-argument distance function called as
            ``metric(member_coords, nx_coords)``.  Defaults to the
            module-level ``dist``.

    Returns:
        ``True`` if ``nx`` was NOT added to any cluster (no cluster is within
        ``max_dist``, including when ``clusters`` is empty), ``False`` if it
        was appended to one.
    """
    if metric is None:
        metric = dist

    point = nx[1:]
    # Start just beyond the allowed radius so that only clusters strictly
    # closer than max_dist + 1 can become candidates.
    best_distance = max_dist + 1
    candidates = []  # (cluster, nearest member) pairs tied at best_distance
    for cluster in clusters:
        nearest = min(cluster, key=lambda member: metric(member[1:], point))
        gap = metric(nearest[1:], point)
        if gap < best_distance:
            candidates = [(cluster, nearest)]
            best_distance = gap
        elif gap == best_distance:
            candidates.append((cluster, nearest))

    if best_distance > max_dist:
        return True

    # Among equally close clusters prefer the one whose nearest member is
    # densest.
    target, _ = max(candidates, key=lambda pair: pair[1][0])
    target.append(nx)
    return False
# Single greedy pass over the cells above the noise cut-off, in gridlist
# order: a cell either joins an existing cluster that lies within distance 1
# or becomes the seed of a new cluster.
clusters = []
for cell in NonNoise(gridlist, noiseCutOff):
    rejected = addToCluster(cell, clusters, 1)
    if rejected:
        clusters.append([cell])
def dataEqauls(e, o):
    """Return whether raw record ``e`` falls into grid cell ``o``.

    The first ten attributes of ``e`` are converted with the same arithmetic
    that is used when the grid cells are built (shift to a zero-based index,
    then divide the multi-valued axes by their largest index) and compared
    with the cell's coordinates.

    Args:
        e: Raw record; only ``e[0]`` .. ``e[9]`` are read.
        o: Grid cell laid out as ``[count, coord_0, ..., coord_9]``; the
            count at ``o[0]`` is ignored.

    Returns:
        ``True`` when all ten coordinates match exactly, else ``False``.
    """
    # Exact float equality is intentional: both sides are produced by the
    # same division of the same small integers.
    return (
        e[0] / 6 == o[1]
        and e[1] / 5 == o[2]
        and e[2] / 3 == o[3]
        and e[3] - 1 == o[4]
        and (e[4] - 1) / 2 == o[5]
        and (e[5] - 1) / 2 == o[6]
        and e[6] - 1 == o[7]
        and e[7] - 1 == o[8]
        and e[8] - 1 == o[9]
        and e[9] - 1 == o[10]
    )
# One tally per cluster, mapping the classifier triple
# (record[10], record[11], record[12]) to the number of records that fall
# into one of that cluster's grid cells.
Csets = [dict() for _ in clusters]
for record in data:
    for tally, cluster in zip(Csets, clusters):
        if any(dataEqauls(record, cell) for cell in cluster):
            label = (record[10], record[11], record[12])
            tally[label] = tally.get(label, 0) + 1
            # Every grid cell is placed in exactly one cluster, so no other
            # cluster can match this record.
            break

# Report: each cluster's cells (count followed by ten coordinates) and then
# its classifier tallies in first-seen order.
CELL_FORMAT = "{:3d}, " + ", ".join(["{:.2f}"] * 10)

print(f"number of clusters: {len(clusters)}")
for tally, cluster in zip(Csets, clusters):
    print("CLUSTER:")
    for grid in cluster:
        print("\t" + CELL_FORMAT.format(*grid))
    print("Classifiers:")
    for label, seen in tally.items():
        print(f"\t{label}: {seen}")