def dbscan(self, dataSet):
    """Cluster *dataSet* with DBSCAN, print quality metrics, write a CSV.

    Uses self.e as the epsilon radius and self.minPts as the density
    threshold.  Points whose neighbourhood is too sparse become noise.
    Prints cluster count, noise count, purity and NMI, then writes the
    result to "DBSCAN_dataset3.csv" via DataPoints.writeToFile.
    """
    clusters = []
    visited = set()
    noise = set()
    # Iterate over data points, expanding a cluster from each unvisited
    # core point.
    for i in range(len(dataSet)):
        point = dataSet[i]
        if point in visited:
            continue
        visited.add(point)
        N = []  # epsilon-neighbourhood of `point`
        minPtsNeighbours = 0
        # Count neighbours within self.e, excluding the point itself.
        for j in range(len(dataSet)):
            if i == j:
                continue
            pt = dataSet[j]
            dist = self.getEuclideanDist(point.x, point.y, pt.x, pt.y)
            if dist <= self.e:
                minPtsNeighbours += 1
                N.append(pt)
        if minPtsNeighbours >= self.minPts:
            # `point` is a core point: grow a new cluster from it.
            cluster = set()
            cluster.add(point)
            point.isAssignedToCluster = True
            j = 0
            # N grows while density-reachable points are merged in.
            while j < len(N):
                point1 = N[j]
                minPtsNeighbours1 = 0
                N1 = []
                if point1 not in visited:
                    visited.add(point1)
                    for l in range(len(dataSet)):
                        pt = dataSet[l]
                        dist = self.getEuclideanDist(
                            point1.x, point1.y, pt.x, pt.y)
                        if dist <= self.e:
                            minPtsNeighbours1 += 1
                            N1.append(pt)
                    if minPtsNeighbours1 >= self.minPts:
                        # point1 is also core: merge its neighbourhood
                        # into N without duplicates.
                        self.removeDuplicates(N, N1)
                # Add point1 to this cluster only if no other cluster
                # has claimed it yet.
                if not point1.isAssignedToCluster:
                    cluster.add(point1)
                    point1.isAssignedToCluster = True
                j += 1
            # add cluster to the list of clusters
            clusters.append(cluster)
        else:
            noise.add(point)
        N = []
    # Report clustering results.
    print("Number of clusters formed :" + str(len(clusters)))
    print("Noise points :" + str(len(noise)))
    # Purity: fraction of points carrying their cluster's majority label.
    maxLabelCluster = []
    for j in range(len(clusters)):
        maxLabelCluster.append(KMeans.getMaxClusterLabel(clusters[j]))
    purity = 0.0
    for j in range(len(clusters)):
        purity += maxLabelCluster[j]
    purity /= len(dataSet)
    print("Purity is :" + str(purity))
    nmiMatrix = DataPoints.getNMIMatrix(clusters, self.noOfLabels)
    nmi = DataPoints.calcNMI(nmiMatrix)
    print("NMI :" + str(nmi))
    DataPoints.writeToFile(noise, clusters, "DBSCAN_dataset3.csv")
def GMM(self):
    """Fit a K-component Gaussian mixture with EM and report cluster quality.

    Initialises clusters round-robin, runs E/M steps until the relative
    log-likelihood change drops below 1e-6, assigns each point to its most
    probable component, prints purity and NMI, and writes the clustering
    to "GMM.csv".
    """
    clusters = []
    self.mean = [[0.0 for y in range(2)] for x in range(self.K)]
    self.stdDev = [[0.0 for y in range(2)] for x in range(self.K)]
    self.coVariance = [[[0.0 for z in range(2)] for y in range(2)]
                       for x in range(self.K)]
    for _ in range(self.K):
        clusters.append(set())
    # Initially assign points to clusters round-robin.
    for i, point in enumerate(self.dataSet):
        clusters[i % self.K].add(point)
    # Uniform component weights.
    for m in range(self.K):
        self.w[m] = 1.0 / self.K
    # Initial parameter estimates from the round-robin partition.
    DataPoints.getMean(clusters, self.mean)
    DataPoints.getStdDeviation(clusters, self.mean, self.stdDev)
    DataPoints.getCovariance(clusters, self.mean, self.stdDev, self.coVariance)
    length = 0
    while True:
        mle_old = self.Likelihood()
        self.Estep()
        self.Mstep(clusters)
        length += 1
        mle_new = self.Likelihood()
        # Converged once the relative likelihood change is tiny.
        if abs(mle_new - mle_old) / abs(mle_old) < 0.000001:
            break
    print("Number of Iterations = " + str(length))
    print("\nAfter Calculations")
    print("Final mean = ")
    self.printArray(self.mean)
    print("\nFinal covariance = ")
    self.print3D(self.coVariance)
    # Assign each point to the component with the highest responsibility
    # (self.W[i][j] is point i's membership weight for component j).
    for j in range(self.K):
        clusters[j] = set()
    for i, point in enumerate(self.dataSet):
        index = -1
        prob = 0.0
        for j in range(self.K):
            if self.W[i][j] > prob:
                index = j
                prob = self.W[i][j]
        clusters[index].add(point)
    # Calculate purity.
    maxLabelCluster = [0 for x in range(self.K)]
    for j in range(self.K):
        maxLabelCluster[j] = KMeans.getMaxClusterLabel(clusters[j])
    purity = 0.0
    for j in range(self.K):
        purity += maxLabelCluster[j]
    purity = purity / float(len(self.dataSet))
    print("Purity is :" + str(purity))
    noOfLabels = DataPoints.getNoOFLabels(self.dataSet)
    nmiMatrix = DataPoints.getNMIMatrix(clusters, noOfLabels)
    nmi = DataPoints.calcNMI(nmiMatrix)
    print("NMI :" + str(nmi))
    # Write clusters to file for plotting; `with` guarantees the file is
    # closed even if a write fails (the original leaked the handle).
    with open("GMM.csv", 'w') as f:
        for w in range(self.K):
            print("Cluster " + str(w) + " size :" + str(len(clusters[w])))
            for point in clusters[w]:
                f.write(
                    str(point.x) + "," + str(point.y) + "," + str(w) + "\n")
def dbscan(self, dataSet):
    """Cluster *dataSet* with DBSCAN, print metrics, and plot the clusters.

    Uses self.e (epsilon) and self.minPts.  Saves a scatter plot of the
    clusters as <self.fp without its 4-char extension>.png and writes the
    clustering to "DBSCAN_dataset3.csv".
    """
    clusters = []
    visited = set()
    noise = set()
    # Iterate over data points, expanding a cluster from each unvisited
    # core point.
    for i in range(len(dataSet)):
        point = dataSet[i]
        if point in visited:
            continue
        visited.add(point)
        N = []  # epsilon-neighbourhood of `point`
        minPtsNeighbours = 0
        # Count neighbours within self.e, excluding the point itself.
        for j in range(len(dataSet)):
            if i == j:
                continue
            pt = dataSet[j]
            dist = self.getEuclideanDist(point.x, point.y, pt.x, pt.y)
            if dist <= self.e:
                minPtsNeighbours += 1
                N.append(pt)
        if minPtsNeighbours >= self.minPts:
            # Core point: grow a new cluster from it.
            cluster = set()
            cluster.add(point)
            point.isAssignedToCluster = True
            j = 0
            # N grows while density-reachable points are merged in.
            while j < len(N):
                point1 = N[j]
                minPtsNeighbours1 = 0
                N1 = []
                if point1 not in visited:
                    visited.add(point1)
                    for l in range(len(dataSet)):
                        pt = dataSet[l]
                        dist = self.getEuclideanDist(
                            point1.x, point1.y, pt.x, pt.y)
                        if dist <= self.e:
                            minPtsNeighbours1 += 1
                            N1.append(pt)
                    if minPtsNeighbours1 >= self.minPts:
                        # point1 is also core: merge its neighbourhood
                        # into N without duplicates.
                        self.removeDuplicates(N, N1)
                # Claim point1 for this cluster if no cluster owns it yet.
                if not point1.isAssignedToCluster:
                    cluster.add(point1)
                    point1.isAssignedToCluster = True
                j += 1
            # add cluster to the list of clusters
            clusters.append(cluster)
        else:
            noise.add(point)
        N = []
    # Report clustering results.
    print("Number of clusters formed :" + str(len(clusters)))
    print("Noise points :" + str(len(noise)))
    # Purity: fraction of points carrying their cluster's majority label.
    maxLabelCluster = []
    for j in range(len(clusters)):
        maxLabelCluster.append(KMeans.getMaxClusterLabel(clusters[j]))
    purity = 0.0
    for j in range(len(clusters)):
        purity += maxLabelCluster[j]
    purity /= len(dataSet)
    print("Purity is :" + str(purity))
    nmiMatrix = DataPoints.getNMIMatrix(clusters, self.noOfLabels)
    nmi = DataPoints.calcNMI(nmiMatrix)
    print("NMI :" + str(nmi))
    # Scatter-plot, one colour per cluster (colours repeat after 8).
    color_idx = 0
    colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k', 'w']
    # Seed the axis bounds with +/-inf so they are correct for any
    # coordinate range (the old 100000/0 seeds broke for negative or
    # very large coordinates).
    minX = float('inf')
    minY = float('inf')
    maxX = float('-inf')
    maxY = float('-inf')
    for cluster in clusters:
        for point in cluster:
            minX = min(minX, point.x)
            minY = min(minY, point.y)
            maxX = max(maxX, point.x)
            maxY = max(maxY, point.y)
            plt.scatter(point.x, point.y, c=colors[color_idx % 8])
        color_idx += 1
    plt.axis([minX - 1, maxX + 1, minY - 1, maxY + 1])
    # plt.show()
    figname = self.fp[:-4] + ".png"
    plt.savefig(figname)
    plt.gcf().clear()
    DataPoints.writeToFile(noise, clusters, "DBSCAN_dataset3.csv")
class DBSCAN:
    """DBSCAN clustering driver for three toy datasets.

    Attributes:
        e: neighbourhood radius (epsilon); chosen per dataset in main().
        minPts: minimum neighbour count for a point to be a core point.
        noOfLabels: number of distinct ground-truth labels (used for NMI).
    """
    # -------------------------------------------------------------------
    def __init__(self):
        # Epsilon radius; overwritten per dataset before dbscan() runs.
        self.e = 0.0
        # Minimum number of neighbours required for a core point.
        self.minPts = 3
        # Count of ground-truth labels in the current dataset.
        self.noOfLabels = 0

    # -------------------------------------------------------------------
    def main(self, args):
        """Cluster dataset1..3: shuffle, inspect the k-distance curve, then
        run DBSCAN with a hand-picked epsilon for each dataset."""
        seed = 71
        print("For dataset1")
        dataSet = KMeans.readDataSet("dataset1.txt")
        random.Random(seed).shuffle(dataSet)
        self.noOfLabels = DataPoints.getNoOFLabels(dataSet)
        self.getEpsilonFromCurve(dataSet)
        # self.e = self.getEpsilon(dataSet)
        # set e manually according to the curve
        self.e = 0.49
        print("Esp :" + str(self.e))
        self.dbscan(dataSet, 1)

        print("\nFor dataset2")
        dataSet = KMeans.readDataSet("dataset2.txt")
        random.Random(seed).shuffle(dataSet)
        self.noOfLabels = DataPoints.getNoOFLabels(dataSet)
        self.getEpsilonFromCurve(dataSet)
        # self.e = self.getEpsilon(dataSet)
        # set e manually according to the curve
        self.e = 0.6
        print("Esp :" + str(self.e))
        self.dbscan(dataSet, 2)

        print("\nFor dataset3")
        dataSet = KMeans.readDataSet("dataset3.txt")
        random.Random(seed).shuffle(dataSet)
        self.noOfLabels = DataPoints.getNoOFLabels(dataSet)
        self.getEpsilonFromCurve(dataSet)
        # set e manually according to the curve
        # self.e = self.getEpsilon(dataSet)
        self.e = 0.2
        print("Esp :" + str(self.e))
        self.dbscan(dataSet, 3)

    # -------------------------------------------------------------------
    # ===== method 1: use the mean of the 4th-nearest distances as eps
    def getEpsilon(self, dataSet):
        """Return the mean distance to each point's 4th nearest neighbour."""
        sumOfDist = 0.0
        # distance between each point and its 4th nearest neighbour
        disNearestKth = self.find_Kth_nearestDis(dataSet, 4)
        sumOfDist = sum(disNearestKth)
        return sumOfDist / len(dataSet)

    # -------------------------------------------------------------------
    # ===== method 2: draw the sorted k-distance graph to pick eps by eye
    def getEpsilonFromCurve(self, dataSet):
        """Plot the sorted 4th-nearest-neighbour distances (k-distance curve).

        Blocks on plt.show(); the elbow of the curve suggests epsilon.
        """
        # distance between each point and its 4th nearest neighbour
        disNearestKth = self.find_Kth_nearestDis(dataSet, 4)
        disNearestKth_sorted = sorted(disNearestKth)
        # x-axis: index into disNearestKth_sorted,
        # y-axis: value of disNearestKth_sorted
        x = np.arange(0, len(dataSet), 1)
        plt.plot(x, disNearestKth_sorted)
        plt.show()

    # -------------------------------------------------------------------
    def find_Kth_nearestDis(self, dataSet, k):
        """Return, per point, the distance to its k-th nearest neighbour."""
        # k-th nearest distance for every point in the dataset
        disNearestKth = []
        # distances between point i and all other points
        distances = []
        for i in range(len(dataSet)):
            # compute point i's distance to every point, then pick the k-th
            for j in range(len(dataSet)):
                distances.append(self.getEuclideanDist(dataSet[i].x, dataSet[i].y, dataSet[j].x, dataSet[j].y))
            # the smallest distance is the point to itself (0), so index k
            # of the sorted list is the k-th nearest neighbour's distance
            distances = sorted(distances)
            disNearestKth.append(distances[k])
            distances = []
        return disNearestKth

    # -------------------------------------------------------------------
    def dbscan(self, dataSet, datasetID):
        """Cluster *dataSet* with DBSCAN, print metrics, write a CSV.

        Results are written to "DBSCAN_dataset<datasetID>.csv".
        """
        clusters = []
        visited = set()
        noise = set()
        # Iterate over data points
        for i in range(len(dataSet)):
            point = dataSet[i]
            if point in visited:
                continue
            # found an unvisited point: mark it visited
            visited.add(point)
            # neighbour points within epsilon
            N = []
            minPtsNeighbours = 0
            # check which points satisfy the minPts condition; traverse all
            # points except the point itself to build its neighbour set N
            for j in range(len(dataSet)):
                if i == j:
                    continue
                pt = dataSet[j]
                dist = self.getEuclideanDist(point.x, point.y, pt.x, pt.y)
                if dist <= self.e:
                    minPtsNeighbours += 1
                    N.append(pt)
            # if point i has enough neighbours it is a core point (not
            # noise), so start a new cluster from it
            if minPtsNeighbours >= self.minPts:
                cluster = set()
                cluster.add(point)
                point.isAssignedToCluster = True
                j = 0
                # traverse point i's neighbours; N grows each loop because
                # the neighbours' own neighbours (density-reachable points)
                # are merged into it
                while j < len(N):
                    # point1 is a neighbour of point i
                    point1 = N[j]
                    # actual number of point1's neighbours
                    minPtsNeighbours1 = 0
                    N1 = []
                    # only expand points not yet visited
                    if not point1 in visited:
                        visited.add(point1)
                        for l in range(len(dataSet)):
                            pt = dataSet[l]
                            dist = self.getEuclideanDist(point1.x, point1.y, pt.x, pt.y)
                            if dist <= self.e:
                                minPtsNeighbours1 += 1
                                N1.append(pt)
                        if minPtsNeighbours1 >= self.minPts:
                            # point1 is also a core point: merge its
                            # neighbours into N, skipping duplicates
                            self.removeDuplicates(N, N1)
                    else:
                        N1 = []
                    # if point1 is not yet a member of any other cluster,
                    # add it to this cluster
                    if not point1.isAssignedToCluster:
                        cluster.add(point1)
                        point1.isAssignedToCluster = True
                    j += 1
                # add cluster to the list of clusters
                clusters.append(cluster)
            else:
                noise.add(point)
            N = []
        # List clusters
        print("Number of clusters formed :" + str(len(clusters)))
        print("Noise points :" + str(len(noise)))
        # Calculate purity: fraction of points carrying their cluster's
        # majority label
        maxLabelCluster = []
        for j in range(len(clusters)):
            maxLabelCluster.append(KMeans.getMaxClusterLabel(clusters[j]))
        purity = 0.0
        for j in range(len(clusters)):
            purity += maxLabelCluster[j]
        purity /= len(dataSet)
        print("Purity is :" + str(purity))
        nmiMatrix = DataPoints.getNMIMatrix(clusters, self.noOfLabels)
        nmi = DataPoints.calcNMI(nmiMatrix)
        print("NMI :" + str(nmi))
        DataPoints.writeToFile(noise, clusters, "DBSCAN_dataset" + str(datasetID) + ".csv")
def GMM(self):
    """Fit a K-component Gaussian mixture with EM, plot, and save results.

    Skips EM entirely when the initial means and covariances are all zero
    (degenerate initialisation).  After convergence (relative log-likelihood
    change < 1e-6), each point goes to its most probable component; purity
    and NMI are printed, a scatter plot is saved as
    <self.fp without its 4-char extension>.png, and the assignments are
    written to self.fp.
    """
    clusters = []
    self.mean = [[0.0 for y in range(2)] for x in range(self.K)]
    self.stdDev = [[0.0 for y in range(2)] for x in range(self.K)]
    self.coVariance = [[[0.0 for z in range(2)] for y in range(2)]
                       for x in range(self.K)]
    for _ in range(self.K):
        clusters.append(set())
    # Initially assign points to clusters round-robin.
    for i, point in enumerate(self.dataSet):
        clusters[i % self.K].add(point)
    # Uniform component weights.
    for m in range(self.K):
        self.w[m] = 1.0 / self.K
    # Initial parameter estimates from the round-robin partition.
    DataPoints.getMean(clusters, self.mean)
    DataPoints.getStdDeviation(clusters, self.mean, self.stdDev)
    DataPoints.getCovariance(clusters, self.mean, self.stdDev, self.coVariance)
    length = 0
    # Guard against a fully degenerate initialisation (all-zero mean and
    # covariance), which would make the EM updates meaningless.
    mean_zero = [[0.0 for y in range(2)] for x in range(self.K)]
    coVariance_zero = [[[0.0 for z in range(2)] for y in range(2)]
                       for x in range(self.K)]
    all_zero = mean_zero == self.mean and coVariance_zero == self.coVariance
    while not all_zero:
        mle_old = self.Likelihood()
        self.Estep()
        self.Mstep(clusters)
        length += 1
        mle_new = self.Likelihood()
        # Converged once the relative likelihood change is tiny.
        if abs(mle_new - mle_old) / abs(mle_old) < 0.000001:
            break
    print("Number of Iterations = " + str(length))
    # Assign each point to the component with the highest responsibility
    # (self.W[i][j] is point i's membership weight for component j).
    for j in range(self.K):
        clusters[j] = set()
    for i, point in enumerate(self.dataSet):
        index = -1
        prob = 0.0
        for j in range(self.K):
            if self.W[i][j] > prob:
                index = j
                prob = self.W[i][j]
        clusters[index].add(point)
    # Calculate purity.
    maxLabelCluster = [0 for x in range(self.K)]
    for j in range(self.K):
        maxLabelCluster[j] = KMeans.getMaxClusterLabel(clusters[j])
    purity = 0.0
    for j in range(self.K):
        purity += maxLabelCluster[j]
    purity = purity / float(len(self.dataSet))
    print("Purity is :" + str(purity))
    noOfLabels = DataPoints.getNoOFLabels(self.dataSet)
    nmiMatrix = DataPoints.getNMIMatrix(clusters, noOfLabels)
    nmi = DataPoints.calcNMI(nmiMatrix)
    print("NMI :" + str(nmi))
    # Scatter-plot, one colour per cluster (colours repeat after 8).
    color_idx = 0
    colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k', 'w']
    # Seed the axis bounds with +/-inf so they are correct for any
    # coordinate range (the old 100000/0 seeds broke for negative or
    # very large coordinates).
    minX = float('inf')
    minY = float('inf')
    maxX = float('-inf')
    maxY = float('-inf')
    for cluster in clusters:
        for point in cluster:
            minX = min(minX, point.x)
            minY = min(minY, point.y)
            maxX = max(maxX, point.x)
            maxY = max(maxY, point.y)
            plt.scatter(point.x, point.y, c=colors[color_idx % 8])
        color_idx += 1
    plt.axis([minX - 1, maxX + 1, minY - 1, maxY + 1])
    # plt.show()
    figname = self.fp[:-4] + ".png"
    plt.savefig(figname)
    plt.gcf().clear()
    # Write clusters to file for plotting; `with` guarantees the file is
    # closed even if a write fails (the original leaked the handle).
    with open(self.fp, 'w') as f:
        for w in range(self.K):
            print("Cluster " + str(w) + " size :" + str(len(clusters[w])))
            for point in clusters[w]:
                f.write(
                    str(point.x) + "," + str(point.y) + "," + str(w) + "\n")
i = 0 for point in self.dataSet: index = -1 prob = 0.0 for j in range(self.K): if self.W[i][j] > prob: index = j prob = self.W[i][j] temp = clusters[index] temp.add(point) i += 1 # Calculate purity maxLabelCluster = [0 for x in range(self.K)] for j in range(self.K): maxLabelCluster[j] = KMeans.getMaxClusterLabel(clusters[j]) purity = 0.0 for j in range(self.K): purity += maxLabelCluster[j] purity = purity / float(len(self.dataSet)) print("Purity is :" + str(purity)) noOfLabels = DataPoints.getNoOFLabels(self.dataSet) nmiMatrix = DataPoints.getNMIMatrix(clusters, noOfLabels) nmi = DataPoints.calcNMI(nmiMatrix) print("NMI :" + str(nmi)) # write clusters to file for plotting f = open("GMM"+str(datasetID)+".csv", 'w') for w in range(self.K): print("Cluster " + str(w) + " size :" + str(len(clusters[w])))
def GMM(self):
    """Fit a K-component Gaussian mixture with EM and plot the clustering.

    Skips EM entirely when the initial means and covariances are all zero
    (degenerate initialisation).  After convergence (relative log-likelihood
    change < 1e-6), prints the final parameters, assigns each point to its
    most probable component, prints purity and NMI, and hands the clusters
    to Plotter.plot.
    """
    clusters = []
    self.mean = [[0.0 for y in range(2)] for x in range(self.K)]
    self.stdDev = [[0.0 for y in range(2)] for x in range(self.K)]
    self.coVariance = [[[0.0 for z in range(2)] for y in range(2)]
                       for x in range(self.K)]
    for _ in range(self.K):
        clusters.append(set())
    # Initially assign points to clusters round-robin.
    for i, point in enumerate(self.dataSet):
        clusters[i % self.K].add(point)
    # Uniform component weights.
    for m in range(self.K):
        self.w[m] = 1.0 / self.K
    # Initial parameter estimates from the round-robin partition.
    DataPoints.getMean(clusters, self.mean)
    DataPoints.getStdDeviation(clusters, self.mean, self.stdDev)
    DataPoints.getCovariance(clusters, self.mean, self.stdDev, self.coVariance)
    length = 0
    # Guard against a fully degenerate initialisation (all-zero mean and
    # covariance), which would make the EM updates meaningless.
    mean_zero = [[0.0 for y in range(2)] for x in range(self.K)]
    coVariance_zero = [[[0.0 for z in range(2)] for y in range(2)]
                       for x in range(self.K)]
    all_zero = mean_zero == self.mean and coVariance_zero == self.coVariance
    while not all_zero:
        mle_old = self.Likelihood()
        self.Estep()
        self.Mstep(clusters)
        length += 1
        mle_new = self.Likelihood()
        # Converged once the relative likelihood change is tiny.
        if abs(mle_new - mle_old) / abs(mle_old) < 0.000001:
            break
    print("Number of Iterations = " + str(length))
    print("\nAfter Calculations")
    print("Final mean = ")
    self.printArray(self.mean)
    print("\nFinal covariance = ")
    self.print3D(self.coVariance)
    # Assign each point to the component with the highest responsibility
    # (self.W[i][j] is point i's membership weight for component j).
    for j in range(self.K):
        clusters[j] = set()
    for i, point in enumerate(self.dataSet):
        index = -1
        prob = 0.0
        for j in range(self.K):
            if self.W[i][j] > prob:
                index = j
                prob = self.W[i][j]
        clusters[index].add(point)
    # Calculate purity.
    maxLabelCluster = [0 for x in range(self.K)]
    for j in range(self.K):
        maxLabelCluster[j] = KMeans.getMaxClusterLabel(clusters[j])
    purity = 0.0
    for j in range(self.K):
        purity += maxLabelCluster[j]
    purity = purity / float(len(self.dataSet))
    print("Purity is :" + str(purity))
    noOfLabels = DataPoints.getNoOFLabels(self.dataSet)
    nmiMatrix = DataPoints.getNMIMatrix(clusters, noOfLabels)
    nmi = DataPoints.calcNMI(nmiMatrix)
    print("NMI :" + str(nmi))
    # plot the result
    Plotter.plot(clusters)