Exemple #1
0
    def dbscan(self, dataSet, outputFile="DBSCAN_dataset3.csv"):
        """Cluster ``dataSet`` with DBSCAN, print quality metrics and save the result.

        Args:
            dataSet: sequence of hashable points exposing ``x``, ``y`` and a
                mutable ``isAssignedToCluster`` flag.
            outputFile: path handed to ``DataPoints.writeToFile``; defaults to
                the previously hard-coded ``DBSCAN_dataset3.csv``.

        Side effects:
            Sets ``isAssignedToCluster = True`` on every clustered point,
            prints the cluster count, noise count, purity and NMI, and passes
            the noise set and the clusters to ``DataPoints.writeToFile``.
        """
        clusters = []
        visited = set()
        noise = set()

        for i, point in enumerate(dataSet):
            if point in visited:
                continue
            visited.add(point)

            # Neighbourhood of the seed: every *other* entry within self.e.
            # NOTE(review): the seed is excluded from its own count here while
            # the expansion step below includes the point itself - confirm
            # that this asymmetry is intended.
            N = [
                pt for j, pt in enumerate(dataSet)
                if j != i
                and self.getEuclideanDist(point.x, point.y, pt.x, pt.y) <= self.e
            ]

            if len(N) < self.minPts:
                # Not dense enough to start a cluster; may still be picked up
                # later as a member of a cluster seeded elsewhere.
                noise.add(point)
                continue

            cluster = {point}
            point.isAssignedToCluster = True

            # N may grow while it is scanned (removeDuplicates is handed both
            # lists), so an index-based loop is required.
            j = 0
            while j < len(N):
                point1 = N[j]
                if point1 not in visited:
                    visited.add(point1)
                    N1 = [
                        pt for pt in dataSet
                        if self.getEuclideanDist(
                            point1.x, point1.y, pt.x, pt.y) <= self.e
                    ]
                    if len(N1) >= self.minPts:
                        self.removeDuplicates(N, N1)

                # Add point1 to this cluster unless another cluster owns it.
                if not point1.isAssignedToCluster:
                    cluster.add(point1)
                    point1.isAssignedToCluster = True
                    # A point provisionally marked as noise that turns out to
                    # be reachable from a core point is not noise any more.
                    noise.discard(point1)
                j += 1

            clusters.append(cluster)

        # List clusters
        print(("Number of clusters formed :" + str(len(clusters))))
        print(("Noise points :" + str(len(noise))))

        # Calculate purity
        purity = 0.0
        for cluster in clusters:
            purity += KMeans.getMaxClusterLabel(cluster)
        total = len(dataSet)
        if total:
            # Skip the division for an empty data set instead of raising
            # ZeroDivisionError; purity is reported as 0.0 in that case.
            purity /= total
        print(("Purity is :" + str(purity)))

        nmiMatrix = DataPoints.getNMIMatrix(clusters, self.noOfLabels)
        nmi = DataPoints.calcNMI(nmiMatrix)
        print(("NMI :" + str(nmi)))

        DataPoints.writeToFile(noise, clusters, outputFile)
Exemple #2
0
    def GMM(self):
        """Fit the mixture model, then report and save a hard clustering.

        Steps:
            1. Deal the points of ``self.dataSet`` round-robin into ``self.K``
               clusters and derive the initial mean / std-dev / covariance
               from them through the ``DataPoints`` helpers; every
               ``self.w[m]`` starts at ``1 / K``.
            2. Alternate ``self.Estep()`` and ``self.Mstep(clusters)`` until
               the relative change of ``self.Likelihood()`` drops below 1e-6.
            3. Put each point into the cluster with the largest
               ``self.W[i][j]``, print purity and NMI, and write
               ``x,y,cluster`` rows to ``GMM.csv``.
        """
        self.mean = [[0.0 for y in range(2)] for x in range(self.K)]
        self.stdDev = [[0.0 for y in range(2)] for x in range(self.K)]
        self.coVariance = [[[0.0 for z in range(2)] for y in range(2)]
                           for x in range(self.K)]
        clusters = [set() for _ in range(self.K)]

        # Initial assignment is deterministic: point i goes to cluster i % K.
        for i, point in enumerate(self.dataSet):
            clusters[i % self.K].add(point)

        for m in range(self.K):
            self.w[m] = 1.0 / self.K

        # Get initial parameters from the round-robin clusters
        DataPoints.getMean(clusters, self.mean)
        DataPoints.getStdDeviation(clusters, self.mean, self.stdDev)
        DataPoints.getCovariance(clusters, self.mean, self.stdDev,
                                 self.coVariance)

        length = 0
        while True:
            mle_old = self.Likelihood()
            self.Estep()
            self.Mstep(clusters)
            length += 1
            mle_new = self.Likelihood()

            # Convergence condition: relative change below 1e-6.  A zero
            # previous value cannot be used as a divisor, so in that case
            # the loop only stops when the new value is zero as well.
            if mle_old == 0:
                converged = mle_new == 0
            else:
                converged = abs(mle_new - mle_old) / abs(mle_old) < 0.000001
            if converged:
                break

        print(("Number of Iterations = " + str(length)))
        print("\nAfter Calculations")
        print("Final mean = ")
        self.printArray(self.mean)
        print("\nFinal covariance = ")
        self.print3D(self.coVariance)

        # Assign points to cluster depending on max prob.
        # The list object is kept and only its entries are replaced.
        for j in range(self.K):
            clusters[j] = set()

        for i, point in enumerate(self.dataSet):
            # NOTE(review): if no self.W[i][j] exceeds 0.0, index stays -1 and
            # the point lands in the last cluster via negative indexing.
            index = -1
            prob = 0.0
            for j in range(self.K):
                if self.W[i][j] > prob:
                    index = j
                    prob = self.W[i][j]
            clusters[index].add(point)

        # Calculate purity
        purity = 0.0
        for j in range(self.K):
            purity += KMeans.getMaxClusterLabel(clusters[j])
        purity = purity / float(len(self.dataSet))
        print(("Purity is :" + str(purity)))

        noOfLabels = DataPoints.getNoOFLabels(self.dataSet)
        nmiMatrix = DataPoints.getNMIMatrix(clusters, noOfLabels)
        nmi = DataPoints.calcNMI(nmiMatrix)
        print(("NMI :" + str(nmi)))

        # Write clusters to file for plotting; the context manager closes the
        # file even if a write fails part-way through.
        with open("GMM.csv", 'w') as f:
            for w in range(self.K):
                print(("Cluster " + str(w) + " size :" + str(len(clusters[w]))))
                for point in clusters[w]:
                    f.write(
                        str(point.x) + "," + str(point.y) + "," + str(w) + "\n")
Exemple #3
0
    def dbscan(self, dataSet, outputFile="DBSCAN_dataset3.csv"):
        """Cluster ``dataSet`` with DBSCAN, then print, plot and save the result.

        Args:
            dataSet: sequence of hashable points exposing ``x``, ``y`` and a
                mutable ``isAssignedToCluster`` flag.
            outputFile: path handed to ``DataPoints.writeToFile``; defaults to
                the previously hard-coded ``DBSCAN_dataset3.csv``.

        Side effects:
            Sets ``isAssignedToCluster = True`` on every clustered point,
            prints the cluster count, noise count, purity and NMI, saves a
            scatter plot of the clusters next to ``self.fp`` (last four
            characters replaced by ``.png``) and passes the noise set and the
            clusters to ``DataPoints.writeToFile``.
        """
        clusters = []
        visited = set()
        noise = set()

        for i, point in enumerate(dataSet):
            if point in visited:
                continue
            visited.add(point)

            # Neighbourhood of the seed: every *other* entry within self.e.
            # NOTE(review): the seed is excluded from its own count here while
            # the expansion step below includes the point itself - confirm
            # that this asymmetry is intended.
            N = [
                pt for j, pt in enumerate(dataSet)
                if j != i
                and self.getEuclideanDist(point.x, point.y, pt.x, pt.y) <= self.e
            ]

            if len(N) < self.minPts:
                # Not dense enough to start a cluster; may still be picked up
                # later as a member of a cluster seeded elsewhere.
                noise.add(point)
                continue

            cluster = {point}
            point.isAssignedToCluster = True

            # N may grow while it is scanned (removeDuplicates is handed both
            # lists), so an index-based loop is required.
            j = 0
            while j < len(N):
                point1 = N[j]
                if point1 not in visited:
                    visited.add(point1)
                    N1 = [
                        pt for pt in dataSet
                        if self.getEuclideanDist(
                            point1.x, point1.y, pt.x, pt.y) <= self.e
                    ]
                    if len(N1) >= self.minPts:
                        self.removeDuplicates(N, N1)

                # Add point1 to this cluster unless another cluster owns it.
                if not point1.isAssignedToCluster:
                    cluster.add(point1)
                    point1.isAssignedToCluster = True
                    # A point provisionally marked as noise that turns out to
                    # be reachable from a core point is not noise any more.
                    noise.discard(point1)
                j += 1

            clusters.append(cluster)

        # List clusters
        print("Number of clusters formed :" + str(len(clusters)))
        print("Noise points :" + str(len(noise)))

        # Calculate purity
        purity = 0.0
        for cluster in clusters:
            purity += KMeans.getMaxClusterLabel(cluster)
        total = len(dataSet)
        if total:
            # Skip the division for an empty data set instead of raising
            # ZeroDivisionError; purity is reported as 0.0 in that case.
            purity /= total
        print("Purity is :" + str(purity))

        nmiMatrix = DataPoints.getNMIMatrix(clusters, self.noOfLabels)
        nmi = DataPoints.calcNMI(nmiMatrix)
        print("NMI :" + str(nmi))

        # Plot one scatter series per cluster, cycling through the palette.
        colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k', 'w']
        allX = []
        allY = []
        for color_idx, cluster in enumerate(clusters):
            xs = [p.x for p in cluster]
            ys = [p.y for p in cluster]
            if xs:
                plt.scatter(xs, ys, c=colors[color_idx % 8])
                allX.extend(xs)
                allY.extend(ys)
        if allX:
            # Bounds come from the data itself, so negative or very large
            # coordinates are framed correctly (no sentinel start values).
            plt.axis([min(allX) - 1, max(allX) + 1,
                      min(allY) - 1, max(allY) + 1])
        # plt.show()
        figname = self.fp[:len(self.fp)-4] + ".png"
        plt.savefig(figname)
        plt.gcf().clear()

        DataPoints.writeToFile(noise, clusters, outputFile)
Exemple #4
0
class DBSCAN:
    """Driver that runs DBSCAN on the three text data sets and reports results.

    Attributes:
        e: neighbourhood radius used by ``dbscan``.
        minPts: neighbour count a point needs to start or extend a cluster.
        noOfLabels: label count forwarded to ``DataPoints.getNMIMatrix``.

    NOTE(review): ``getEuclideanDist`` and ``removeDuplicates`` are called on
    ``self`` but are not defined in this excerpt; they are assumed to be
    provided elsewhere.
    """
    # -------------------------------------------------------------------
    def __init__(self):
        """Start with a zero radius, ``minPts`` of 3 and no known labels."""
        self.e = 0.0
        self.minPts = 3
        self.noOfLabels = 0
    # -------------------------------------------------------------------
    def main(self, args):
        """Run DBSCAN on dataset1..3 with manually chosen radii.

        For each data set the file is read, shuffled with a fixed seed, the
        k-distance curve is shown (``getEpsilonFromCurve``), the radius is set
        and ``dbscan`` is run.

        Args:
            args: unused.
        """
        seed = 71
        # (datasetID, eps) - eps was set manually according to the curve;
        # self.getEpsilon(dataSet) is the automatic alternative.
        runs = ((1, 0.49), (2, 0.6), (3, 0.2))
        for datasetID, eps in runs:
            header = "For dataset" + str(datasetID)
            if datasetID != 1:
                header = "\n" + header
            print(header)
            dataSet = KMeans.readDataSet("dataset" + str(datasetID) + ".txt")
            random.Random(seed).shuffle(dataSet)
            self.noOfLabels = DataPoints.getNoOFLabels(dataSet)
            self.getEpsilonFromCurve(dataSet)
            self.e = eps
            print("Esp :" + str(self.e))
            self.dbscan(dataSet, datasetID)
    # -------------------------------------------------------------------
    #==============method 1: find the mean of disNearest4th as eps
    def getEpsilon(self, dataSet):
        """Return the mean of the ``find_Kth_nearestDis(dataSet, 4)`` values.

        Returns 0.0 for an empty data set instead of dividing by zero.
        """
        disNearestKth = self.find_Kth_nearestDis(dataSet, 4)
        if not disNearestKth:
            return 0.0
        return sum(disNearestKth) / len(dataSet)
    # -------------------------------------------------------------------
    #==============method 2: draw graph to find eps
    def getEpsilonFromCurve(self, dataSet):
        """Plot the sorted ``find_Kth_nearestDis(dataSet, 4)`` values and show them.

        The x-axis is the index into the sorted values, the y-axis the value.
        Calls ``plt.show()``.
        """
        disNearestKth_sorted = sorted(self.find_Kth_nearestDis(dataSet, 4))
        x = np.arange(0, len(dataSet), 1)
        plt.plot(x, disNearestKth_sorted)
        plt.show()

    # -------------------------------------------------------------------
    def find_Kth_nearestDis(self, dataSet, k):
        """Return, per point, the entry at index ``k`` of its sorted distances.

        Each point's distances to every entry of ``dataSet`` (itself
        included) are sorted ascending and the value at index ``k`` is kept.
        The self-distance is presumably the smallest entry, which would make
        index ``k`` the k-th nearest other point - verify against
        ``getEuclideanDist``.

        Raises:
            IndexError: if ``dataSet`` is non-empty but has ``k`` or fewer
                entries.
        """
        disNearestKth = []
        for p in dataSet:
            distances = sorted(
                self.getEuclideanDist(p.x, p.y, q.x, q.y) for q in dataSet)
            disNearestKth.append(distances[k])
        return disNearestKth
    # -------------------------------------------------------------------
    def _neighboursWithin(self, dataSet, center, skipIndex=None):
        """List the entries of ``dataSet`` within ``self.e`` of ``center``,
        leaving out the entry at position ``skipIndex`` (if given)."""
        return [
            pt for idx, pt in enumerate(dataSet)
            if idx != skipIndex
            and self.getEuclideanDist(center.x, center.y, pt.x, pt.y) <= self.e
        ]
    # -------------------------------------------------------------------
    def _findClusters(self, dataSet):
        """Run the DBSCAN scan and return ``(clusters, noise)``: a list of
        point sets and the set of points left without a cluster."""
        clusters = []
        visited = set()
        noise = set()

        for i, point in enumerate(dataSet):
            if point in visited:
                continue
            visited.add(point)

            # Neighbours of the seed, the seed itself excluded.
            # NOTE(review): the expansion step below counts the point itself
            # as its own neighbour - confirm that this asymmetry is intended.
            N = self._neighboursWithin(dataSet, point, skipIndex=i)

            if len(N) < self.minPts:
                # Not dense enough to start a cluster; may still be picked up
                # later as a member of a cluster seeded elsewhere.
                noise.add(point)
                continue

            cluster = {point}
            point.isAssignedToCluster = True

            # N may grow while it is scanned (removeDuplicates is handed both
            # lists), so an index-based loop is required.
            j = 0
            while j < len(N):
                point1 = N[j]
                if point1 not in visited:
                    visited.add(point1)
                    N1 = self._neighboursWithin(dataSet, point1)
                    if len(N1) >= self.minPts:
                        self.removeDuplicates(N, N1)

                # Add point1 to this cluster unless another cluster owns it.
                if not point1.isAssignedToCluster:
                    cluster.add(point1)
                    point1.isAssignedToCluster = True
                    # A point provisionally marked as noise that turns out to
                    # be reachable from a core point is not noise any more.
                    noise.discard(point1)
                j += 1

            clusters.append(cluster)

        return clusters, noise
    # -------------------------------------------------------------------
    def dbscan(self, dataSet, datasetID):
        """Cluster ``dataSet``, print quality metrics and save the result.

        Args:
            dataSet: sequence of hashable points exposing ``x``, ``y`` and a
                mutable ``isAssignedToCluster`` flag.
            datasetID: value embedded in the output file name
                ``DBSCAN_dataset<datasetID>.csv``.

        Side effects:
            Sets ``isAssignedToCluster = True`` on every clustered point,
            prints the cluster count, noise count, purity and NMI, and passes
            the noise set and the clusters to ``DataPoints.writeToFile``.
        """
        clusters, noise = self._findClusters(dataSet)

        # List clusters
        print("Number of clusters formed :" + str(len(clusters)))
        print("Noise points :" + str(len(noise)))

        # Calculate purity
        purity = 0.0
        for cluster in clusters:
            purity += KMeans.getMaxClusterLabel(cluster)
        total = len(dataSet)
        if total:
            # Skip the division for an empty data set instead of raising
            # ZeroDivisionError; purity is reported as 0.0 in that case.
            purity /= total
        print("Purity is :" + str(purity))

        nmiMatrix = DataPoints.getNMIMatrix(clusters, self.noOfLabels)
        nmi = DataPoints.calcNMI(nmiMatrix)
        print("NMI :" + str(nmi))

        DataPoints.writeToFile(noise, clusters, "DBSCAN_dataset"+str(datasetID)+".csv")
Exemple #5
0
    def GMM(self):
        """Fit the mixture model, then report, plot and save a hard clustering.

        Steps:
            1. Deal the points of ``self.dataSet`` round-robin into ``self.K``
               clusters and derive the initial mean / std-dev / covariance
               from them through the ``DataPoints`` helpers; every
               ``self.w[m]`` starts at ``1 / K``.
            2. Unless the initial mean and covariance are all zero, alternate
               ``self.Estep()`` and ``self.Mstep(clusters)`` until the
               relative change of ``self.Likelihood()`` drops below 1e-6.
            3. Put each point into the cluster with the largest
               ``self.W[i][j]`` and print purity and NMI.
            4. Save a scatter plot next to ``self.fp`` (last four characters
               replaced by ``.png``) and write ``x,y,cluster`` rows to
               ``self.fp``.
        """
        self.mean = [[0.0 for y in range(2)] for x in range(self.K)]
        self.stdDev = [[0.0 for y in range(2)] for x in range(self.K)]
        self.coVariance = [[[0.0 for z in range(2)] for y in range(2)]
                           for x in range(self.K)]
        clusters = [set() for _ in range(self.K)]

        # Initial assignment is deterministic: point i goes to cluster i % K.
        for i, point in enumerate(self.dataSet):
            clusters[i % self.K].add(point)

        for m in range(self.K):
            self.w[m] = 1.0 / self.K

        # Get initial parameters from the round-robin clusters
        DataPoints.getMean(clusters, self.mean)
        DataPoints.getStdDeviation(clusters, self.mean, self.stdDev)
        DataPoints.getCovariance(clusters, self.mean, self.stdDev,
                                 self.coVariance)

        # EM is skipped when the helpers left mean and covariance all zero.
        mean_zero = [[0.0 for y in range(2)] for x in range(self.K)]
        coVariance_zero = [[[0.0 for z in range(2)] for y in range(2)]
                           for x in range(self.K)]
        all_zero = (mean_zero == self.mean
                    and coVariance_zero == self.coVariance)

        length = 0
        while not all_zero:
            mle_old = self.Likelihood()
            self.Estep()
            self.Mstep(clusters)
            length += 1
            mle_new = self.Likelihood()

            # Convergence condition: relative change below 1e-6.  A zero
            # previous value cannot be used as a divisor, so in that case
            # the loop only stops when the new value is zero as well.
            if mle_old == 0:
                converged = mle_new == 0
            else:
                converged = abs(mle_new - mle_old) / abs(mle_old) < 0.000001
            if converged:
                break

        print("Number of Iterations = " + str(length))

        # Assign points to cluster depending on max prob.
        # The list object is kept and only its entries are replaced.
        for j in range(self.K):
            clusters[j] = set()

        for i, point in enumerate(self.dataSet):
            # NOTE(review): if no self.W[i][j] exceeds 0.0, index stays -1 and
            # the point lands in the last cluster via negative indexing.
            index = -1
            prob = 0.0
            for j in range(self.K):
                if self.W[i][j] > prob:
                    index = j
                    prob = self.W[i][j]
            clusters[index].add(point)

        # Calculate purity
        purity = 0.0
        for j in range(self.K):
            purity += KMeans.getMaxClusterLabel(clusters[j])
        purity = purity / float(len(self.dataSet))
        print("Purity is :" + str(purity))

        noOfLabels = DataPoints.getNoOFLabels(self.dataSet)
        nmiMatrix = DataPoints.getNMIMatrix(clusters, noOfLabels)
        nmi = DataPoints.calcNMI(nmiMatrix)
        print("NMI :" + str(nmi))

        # Plot one scatter series per non-empty cluster, cycling the palette.
        colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k', 'w']
        allX = []
        allY = []
        for color_idx, cluster in enumerate(clusters):
            xs = [p.x for p in cluster]
            ys = [p.y for p in cluster]
            if xs:
                plt.scatter(xs, ys, c=colors[color_idx % 8])
                allX.extend(xs)
                allY.extend(ys)
        if allX:
            # Bounds come from the data itself, so negative or very large
            # coordinates are framed correctly (no sentinel start values).
            plt.axis([min(allX) - 1, max(allX) + 1,
                      min(allY) - 1, max(allY) + 1])
        # plt.show()
        figname = self.fp[:len(self.fp) - 4] + ".png"
        plt.savefig(figname)
        plt.gcf().clear()

        # Write clusters to file for plotting; the context manager closes the
        # file even if a write fails part-way through.
        with open(self.fp, 'w') as f:
            for w in range(self.K):
                print("Cluster " + str(w) + " size :" + str(len(clusters[w])))
                for point in clusters[w]:
                    f.write(
                        str(point.x) + "," + str(point.y) + "," + str(w) + "\n")
        i = 0
        for point in self.dataSet:
            index = -1
            prob = 0.0
            for j in range(self.K):
                if self.W[i][j] > prob:
                    index = j
                    prob = self.W[i][j]
            temp = clusters[index]
            temp.add(point)
            i += 1

# Calculate purity
maxLabelCluster = [0 for x in range(self.K)]
    for j in range(self.K):
        maxLabelCluster[j] = KMeans.getMaxClusterLabel(clusters[j])
        purity = 0.0
        for j in range(self.K):
            purity += maxLabelCluster[j]
    purity = purity / float(len(self.dataSet))
        print("Purity is :" + str(purity))
        
        noOfLabels = DataPoints.getNoOFLabels(self.dataSet)
        nmiMatrix = DataPoints.getNMIMatrix(clusters, noOfLabels)
        nmi = DataPoints.calcNMI(nmiMatrix)
        print("NMI :" + str(nmi))
        
        # write clusters to file for plotting
        f = open("GMM"+str(datasetID)+".csv", 'w')
        for w in range(self.K):
            print("Cluster " + str(w) + " size :" + str(len(clusters[w])))
Exemple #7
0
    def GMM(self):
        """Fit the mixture model, then report and plot a hard clustering.

        Steps:
            1. Deal the points of ``self.dataSet`` round-robin into ``self.K``
               clusters and derive the initial mean / std-dev / covariance
               from them through the ``DataPoints`` helpers; every
               ``self.w[m]`` starts at ``1 / K``.
            2. Unless the initial mean and covariance are all zero, alternate
               ``self.Estep()`` and ``self.Mstep(clusters)`` until the
               relative change of ``self.Likelihood()`` drops below 1e-6.
            3. Put each point into the cluster with the largest
               ``self.W[i][j]``, print purity and NMI, and hand the clusters
               to ``Plotter.plot``.
        """
        self.mean = [[0.0 for y in range(2)] for x in range(self.K)]
        self.stdDev = [[0.0 for y in range(2)] for x in range(self.K)]
        self.coVariance = [[[0.0 for z in range(2)] for y in range(2)]
                           for x in range(self.K)]
        clusters = [set() for _ in range(self.K)]

        # Initial assignment is deterministic: point i goes to cluster i % K.
        for i, point in enumerate(self.dataSet):
            clusters[i % self.K].add(point)

        for m in range(self.K):
            self.w[m] = 1.0 / self.K

        # Get initial parameters from the round-robin clusters
        DataPoints.getMean(clusters, self.mean)
        DataPoints.getStdDeviation(clusters, self.mean, self.stdDev)
        DataPoints.getCovariance(clusters, self.mean, self.stdDev,
                                 self.coVariance)

        # EM is skipped when the helpers left mean and covariance all zero.
        mean_zero = [[0.0 for y in range(2)] for x in range(self.K)]
        coVariance_zero = [[[0.0 for z in range(2)] for y in range(2)]
                           for x in range(self.K)]
        all_zero = (mean_zero == self.mean
                    and coVariance_zero == self.coVariance)

        length = 0
        while not all_zero:
            mle_old = self.Likelihood()
            self.Estep()
            self.Mstep(clusters)
            length += 1
            mle_new = self.Likelihood()

            # Convergence condition: relative change below 1e-6.  A zero
            # previous value cannot be used as a divisor, so in that case
            # the loop only stops when the new value is zero as well.
            if mle_old == 0:
                converged = mle_new == 0
            else:
                converged = abs(mle_new - mle_old) / abs(mle_old) < 0.000001
            if converged:
                break

        print("Number of Iterations = " + str(length))
        print("\nAfter Calculations")
        print("Final mean = ")
        self.printArray(self.mean)
        print("\nFinal covariance = ")
        self.print3D(self.coVariance)

        # Assign points to cluster depending on max prob.
        # The list object is kept and only its entries are replaced.
        for j in range(self.K):
            clusters[j] = set()

        for i, point in enumerate(self.dataSet):
            # NOTE(review): if no self.W[i][j] exceeds 0.0, index stays -1 and
            # the point lands in the last cluster via negative indexing.
            index = -1
            prob = 0.0
            for j in range(self.K):
                if self.W[i][j] > prob:
                    index = j
                    prob = self.W[i][j]
            clusters[index].add(point)

        # Calculate purity
        purity = 0.0
        for j in range(self.K):
            purity += KMeans.getMaxClusterLabel(clusters[j])
        purity = purity / float(len(self.dataSet))
        print("Purity is :" + str(purity))

        noOfLabels = DataPoints.getNoOFLabels(self.dataSet)
        nmiMatrix = DataPoints.getNMIMatrix(clusters, noOfLabels)
        nmi = DataPoints.calcNMI(nmiMatrix)
        print("NMI :" + str(nmi))

        # plot the result
        Plotter.plot(clusters)