Beispiel #1
0
    def CAclustering(self, constraints, final_n_of_clusters, clusters=None):
        """
        Constrained agglomerative clustering.

        The method runs in three phases:

        1. Must-link closure - for every constraint that has a 'must-link'
           entry, the clusters holding the two linked points are replaced by
           a single merged cluster.
        2. Distance table - ``self.distances`` is rebuilt with one entry per
           unordered pair of clusters, keyed ``"<idA> <idB>"``, using the
           linkage named by ``self.linkage`` ("Ward" or "Average").
        3. Merging - the closest pair that ``self.check_cannot_link`` does
           not reject is merged until at most ``final_n_of_clusters``
           clusters remain. Every merge adds a row
           ``[idA, idB, dist, novCluster.n]`` to ``self.Z``.

        Args:
            constraints: sized iterable of dict-like constraints. Entries
                containing 'must-link' are read through ``x['point'][0]`` and
                ``x['must-link'][0]``; the whole collection is also handed to
                ``self.check_cannot_link``.
            final_n_of_clusters: number of clusters at which merging stops.
            clusters: mapping of cluster id to cluster object. It is modified
                in place and becomes ``self.clusters``. When omitted, the
                current ``self.clusters`` is used.

        Returns:
            ``self.clusters``. The result may hold more clusters than
            requested when every remaining pair is blocked by a cannot-link
            constraint or when no pair is left to merge.

        Raises:
            ValueError: Ward linkage is requested with a
                ``self.distance_type`` other than "Cosine" or "Euclidean".
            SystemExit: ``self.linkage`` is neither "Ward" nor "Average"
                (exit code 1, as before).
        """
        if clusters is None:
            # The signature advertises ``clusters`` as optional; without this
            # fallback the very first ``len(clusters)`` raised TypeError.
            clusters = self.clusters

        self.l.log("Creating transitive ML closure...")
        # New ids are handed out from len(clusters) upwards.
        # NOTE(review): this presumably relies on the incoming keys being
        # 0..len(clusters)-1 - confirm against the caller.
        next_id = len(clusters)

        for c in clusters:
            print(clusters[c].clusterId, clusters[c].points)

        for x in constraints:
            if 'must-link' not in x:
                continue
            print("omejitev: ", x)
            kluc1 = self.getClusterID(x['point'][0], clusters)
            kluc2 = self.getClusterID(x['must-link'][0], clusters)
            print(kluc1, " | ", kluc2, " | ", next_id, kluc1 == kluc2)
            if kluc1 == kluc2:
                continue  # both points already share a cluster
            tocke = [clusters[kluc1].points, clusters[kluc2].points]
            clusters.pop(kluc1)
            clusters.pop(kluc2)
            nov = Cluster(next_id)
            # NOTE(review): unlike the merges in the main loop, no centroid is
            # computed for this cluster - confirm whether that is intended.
            nov.update(kluc1, kluc2, 0, tocke)
            clusters.update({next_id: nov})
            next_id += 1
        m = next_id  # first id available for clusters created by the main loop

        self.l.log("Creating distance matrix....")
        self.distances = {}
        self.clusters = clusters
        n_points = 0
        for c in self.clusters:
            print(self.clusters[c].points, self.clusters[c].clusterId)
            n_points += sum(1 for _ in self.clusters[c].points)

        print(len(self.clusters), n_points)

        def point_distance(a, b):
            # Distance between two coordinate vectors under self.distance_type.
            if self.distance_type == "Cosine":
                return spatial.distance.cosine(a, b)
            if self.distance_type == "Euclidean":
                return spatial.distance.euclidean(a, b)
            # Previously an unknown type silently produced a distance of 0.
            raise ValueError(
                "Unsupported distance type: %r" % (self.distance_type,))

        def scatter(coords):
            # Sum of squared distances of the points to their own centroid.
            centroid = np.average(coords, axis=0)
            return sum(point_distance(centroid, p)**2 for p in coords)

        def ward_increase(id_a, id_b):
            # Growth of the within-cluster scatter caused by merging a and b.
            u = [p.coords for p in self.clusters[id_a].points]
            v = [p.coords for p in self.clusters[id_b].points]
            return scatter(u + v) - scatter(u) - scatter(v)

        # Only one orientation of each pair is ever stored, so walking the
        # upper triangle visits the same pairs, in the same insertion order,
        # as the former full n*n scan.
        ids = [self.clusters[c].clusterId for c in self.clusters]
        for pos, id_a in enumerate(ids):
            for id_b in ids[pos + 1:]:
                if id_a == id_b:
                    continue
                kljuc1 = str(id_a) + " " + str(id_b)
                kljuc2 = str(id_b) + " " + str(id_a)
                if kljuc1 in self.distances or kljuc2 in self.distances:
                    continue
                if self.linkage == "Ward":
                    dist = ward_increase(id_a, id_b)
                elif self.linkage == "Average":
                    u = [(a, b) for a in self.clusters[id_a].points
                         for b in self.clusters[id_b].points]
                    dist = self.average_linkage(u)
                else:
                    print("Error creating distance matrix...")
                    # Same effect as exit(1) without depending on the
                    # site-provided ``exit`` builtin.
                    raise SystemExit(1)
                self.distances[kljuc1] = dist

        self.l.log("Finding clusters...")
        print("st. omejitev: ", len(constraints))

        self.Z = np.array([])

        n = len(self.clusters)
        idZ = 0
        # ``>`` instead of ``!=``: when fewer clusters than requested remain
        # (for example after the must-link closure) the old condition kept
        # merging until min() failed on an empty distance table.
        while n > final_n_of_clusters:
            if not self.distances:
                self.l.log("No cluster pairs left to merge...")
                break

            # Take the closest pair; pairs rejected by a cannot-link
            # constraint are pushed to the end by giving them sys.maxsize.
            while True:
                kljuc = min(self.distances, key=self.distances.get)
                par = [int(i) for i in kljuc.split(' ')]
                dist = self.distances[kljuc]
                if not self.check_cannot_link(constraints,
                                              self.clusters[par[0]].points,
                                              self.clusters[par[1]].points):
                    break
                self.distances[kljuc] = sys.maxsize
                if dist == sys.maxsize:
                    # Even the best candidate was already blocked, so every
                    # remaining pair is blocked.
                    self.l.log(
                        "ABHC cannot find clusters under those constraints..."
                    )
                    return self.clusters
                print("   Cannot link:", par)

            self.distances.pop(kljuc, None)
            # presumably drops every stored distance involving that cluster
            self.izbrisi_razdalje(par[0])
            self.izbrisi_razdalje(par[1])

            tocke = [self.clusters[par[0]].points,
                     self.clusters[par[1]].points]
            new_id = m + idZ
            novCluster = Cluster(new_id)
            novCluster.update(par[0], par[1], dist, tocke)
            novCluster.centroid = novCluster.calculateCentroid()
            self.clusters.pop(par[0])
            self.clusters.pop(par[1])
            self.clusters.update({new_id: novCluster})

            # presumably adds distances between the new cluster and the rest
            self.dodaj_razdalje(new_id)

            print("par: ", par, "dist: ", '%.08f' % dist)
            newrow = [par[0], par[1], dist, novCluster.n]
            if idZ == 0:
                # Start as a 2-D array so Z has the same shape after one
                # merge as after many (it used to be a flat list here).
                self.Z = np.array([newrow])
            else:
                self.Z = np.vstack([self.Z, newrow])

            n = len(self.clusters)
            idZ += 1
        return self.clusters
Beispiel #2
0
    def ABHclustering(self, constraints, final_n_of_clusters, clusters=None):
        """
        Constrained agglomerative clustering that also tracks reassignments.

        The method runs in four phases:

        1. Must-link closure - for every constraint that has a 'must-link'
           entry, the clusters holding the two linked points are replaced by
           a single merged cluster.
        2. Distance table - ``self.distances`` is rebuilt with one entry per
           unordered pair of clusters, keyed ``"<idA> <idB>"``, using the
           linkage named by ``self.linkage`` ("Ward" or "Average").
        3. Merging - the closest pair that ``self.check_cannot_link`` does
           not reject is merged until at most ``final_n_of_clusters``
           clusters remain. Every merge adds a row
           ``[idA, idB, dist, novCluster.n]`` to ``self.Z``.
        4. Bookkeeping - ``self.diff`` receives the references of points
           whose ``self.prev_dict`` value disagrees with the rest of their
           new cluster, then ``self.prev_dict`` is refreshed from
           ``self.make_dict()``.

        Args:
            constraints: sized iterable of dict-like constraints. Entries
                containing 'must-link' are read through ``x['point'][0]`` and
                ``x['must-link'][0]``; the whole collection is also handed to
                ``self.check_cannot_link``.
            final_n_of_clusters: number of clusters at which merging stops.
            clusters: mapping of cluster id to cluster object. It is modified
                in place and becomes ``self.clusters``. When omitted, the
                current ``self.clusters`` is used.

        Returns:
            ``self.clusters``. The result may hold more clusters than
            requested when every remaining pair is blocked by a cannot-link
            constraint or when no pair is left to merge.

        Raises:
            ValueError: Ward linkage is requested with a
                ``self.distance_type`` other than "Cosine" or "Euclidean".
            SystemExit: ``self.linkage`` is neither "Ward" nor "Average"
                (exit code 1, as before).
        """
        if clusters is None:
            # The signature advertises ``clusters`` as optional; without this
            # fallback the very first ``len(clusters)`` raised TypeError.
            clusters = self.clusters

        self.l.log("Creating transitive ML closure...")
        # New ids are handed out from len(clusters) upwards.
        # NOTE(review): this presumably relies on the incoming keys being
        # 0..len(clusters)-1 - confirm against the caller.
        next_id = len(clusters)

        for x in constraints:
            if 'must-link' not in x:
                continue
            kluc1 = self.getClusterID(x['point'][0], clusters)
            kluc2 = self.getClusterID(x['must-link'][0], clusters)
            if kluc1 == kluc2:
                continue  # both points already share a cluster
            tocke = [clusters[kluc1].points, clusters[kluc2].points]
            clusters.pop(kluc1)
            clusters.pop(kluc2)
            nov = Cluster(next_id)
            # NOTE(review): unlike the merges in the main loop, no centroid is
            # computed for this cluster - confirm whether that is intended.
            nov.update(kluc1, kluc2, 0, tocke)
            clusters.update({next_id: nov})
            next_id += 1
        m = next_id  # first id available for clusters created by the main loop

        self.l.log("Creating distance matrix....")
        self.distances = {}
        self.clusters = clusters
        n_points = 0
        for c in self.clusters:
            print(self.clusters[c].points, self.clusters[c].clusterId)
            n_points += sum(1 for _ in self.clusters[c].points)

        print(len(self.clusters), n_points)

        def point_distance(a, b):
            # Distance between two coordinate vectors under self.distance_type.
            if self.distance_type == "Cosine":
                return spatial.distance.cosine(a, b)
            if self.distance_type == "Euclidean":
                return spatial.distance.euclidean(a, b)
            # Previously an unknown type silently produced a distance of 0.
            raise ValueError(
                "Unsupported distance type: %r" % (self.distance_type,))

        def scatter(coords):
            # Sum of squared distances of the points to their own centroid.
            centroid = np.average(coords, axis=0)
            return sum(point_distance(centroid, p)**2 for p in coords)

        def ward_increase(id_a, id_b):
            # Growth of the within-cluster scatter caused by merging a and b.
            u = [p.coords for p in self.clusters[id_a].points]
            v = [p.coords for p in self.clusters[id_b].points]
            return scatter(u + v) - scatter(u) - scatter(v)

        # Only one orientation of each pair is ever stored, so walking the
        # upper triangle visits the same pairs, in the same insertion order,
        # as the former full n*n scan.
        ids = [self.clusters[c].clusterId for c in self.clusters]
        for pos, id_a in enumerate(ids):
            for id_b in ids[pos + 1:]:
                if id_a == id_b:
                    continue
                kljuc1 = str(id_a) + " " + str(id_b)
                kljuc2 = str(id_b) + " " + str(id_a)
                if kljuc1 in self.distances or kljuc2 in self.distances:
                    continue
                if self.linkage == "Ward":
                    dist = ward_increase(id_a, id_b)
                elif self.linkage == "Average":
                    u = [(a, b) for a in self.clusters[id_a].points
                         for b in self.clusters[id_b].points]
                    dist = self.average_linkage(u)
                else:
                    print("Error creating distance matrix...")
                    # Same effect as exit(1) without depending on the
                    # site-provided ``exit`` builtin.
                    raise SystemExit(1)
                self.distances[kljuc1] = dist

        self.l.log("Finding clusters...")
        print("st. omejitev: ", len(constraints))

        self.Z = np.array([])

        n = len(self.clusters)
        idZ = 0
        # ``>`` instead of ``!=``: when fewer clusters than requested remain
        # (for example after the must-link closure) the old condition kept
        # merging until min() failed on an empty distance table.
        while n > final_n_of_clusters:
            if not self.distances:
                self.l.log("No cluster pairs left to merge...")
                break

            # Take the closest pair; pairs rejected by a cannot-link
            # constraint are pushed to the end by giving them sys.maxsize.
            while True:
                kljuc = min(self.distances, key=self.distances.get)
                par = [int(i) for i in kljuc.split(' ')]
                dist = self.distances[kljuc]
                if not self.check_cannot_link(constraints,
                                              self.clusters[par[0]].points,
                                              self.clusters[par[1]].points):
                    break
                self.distances[kljuc] = sys.maxsize
                if dist == sys.maxsize:
                    # Even the best candidate was already blocked, so every
                    # remaining pair is blocked.
                    # NOTE(review): this early return skips the self.diff /
                    # self.prev_dict bookkeeping below - confirm it is meant to.
                    self.l.log(
                        "ABHC cannot find clusters under those constraints..."
                    )
                    return self.clusters
                print("   Cannot link:", par)

            self.distances.pop(kljuc, None)
            # presumably drops every stored distance involving that cluster
            self.izbrisi_razdalje(par[0])
            self.izbrisi_razdalje(par[1])

            tocke = [self.clusters[par[0]].points,
                     self.clusters[par[1]].points]
            new_id = m + idZ
            novCluster = Cluster(new_id)
            novCluster.update(par[0], par[1], dist, tocke)
            novCluster.centroid = novCluster.calculateCentroid()
            self.clusters.pop(par[0])
            self.clusters.pop(par[1])
            self.clusters.update({new_id: novCluster})

            # presumably adds distances between the new cluster and the rest
            self.dodaj_razdalje(new_id)

            print("par: ", par, "dist: ", '%.08f' % dist)
            newrow = [par[0], par[1], dist, novCluster.n]
            if idZ == 0:
                # Start as a 2-D array so Z has the same shape after one
                # merge as after many (it used to be a flat list here).
                self.Z = np.array([newrow])
            else:
                self.Z = np.vstack([self.Z, newrow])

            n = len(self.clusters)
            idZ += 1

        # Remember the examples that ended up in a different group than in
        # the previous iteration.
        self.diff = []
        clusters_checked = set()
        hm = 0  # total number of points visited

        for cluster in self.clusters:
            # Previous value of the first point in this cluster; -1 means
            # "not set yet".
            # NOTE(review): assumes prev_dict values are never negative.
            val = -1
            for point in self.clusters[cluster].points:
                hm += 1
                previous = self.prev_dict[point.reference]
                if val < 0:
                    val = previous
                    # An earlier cluster already claimed this previous value.
                    if val in clusters_checked:
                        self.diff.append(point.reference)
                elif val != previous:
                    self.diff.append(point.reference)
            clusters_checked.add(val)
        self.prev_dict = self.make_dict()
        print(len(self.diff))
        print(sorted(self.diff))
        print("stevilo primerov: ", hm)
        return self.clusters
Beispiel #3
0
 def randomClusters(self, max_clusters):
     """Append ``max_clusters`` freshly created clusters to ``self.clusters``.

     Each new ``Cluster`` receives the value returned by
     ``self.randomPoint()`` as its centroid before it is appended.
     """
     for _ in range(max_clusters):
         seeded = Cluster()
         seeded.centroid = self.randomPoint()
         self.clusters.append(seeded)
Beispiel #4
0
    def hierarhicalClustering(self, clusters=None):
        """
        Unconstrained agglomerative clustering down to a single cluster.

        A distance is stored in ``self.distances`` for every unordered pair
        of initial clusters (keyed ``"<i> <j>"`` by position in
        ``self.clusters``), then the closest pair is merged repeatedly. Every
        merge adds a row ``[idA, idB, dist, novCluster.n]`` to ``self.Z``.

        ``self.distances`` is not reset here; pairs that already have an
        entry (in either orientation) keep their stored value.

        Args:
            clusters: accepted for signature compatibility but not used; the
                method always works on ``self.clusters``.

        Returns:
            ``self.clusters`` after merging.

        Raises:
            ValueError: ``self.distance_type`` is neither "Cosine" nor
                "Euclidean".
            SystemExit: ``self.linkage`` is neither "Ward" nor "Average"
                (exit code 1, as before).
        """
        self.l.log("Building distance matrix...")
        n = len(self.points)  # at the start every example is its own cluster

        # One row per cluster: the coordinates of the cluster's first point.
        # NOTE(review): the "<i> <j>" keys below are row positions, which
        # presumably coincide with the cluster ids 0..n-1 - confirm.
        data = []
        for c in self.clusters:
            p = [point.coords for point in self.clusters[c].points]
            data.append(p[0])
        df = pd.DataFrame(data, columns=np.array([a for a in self.attributes]))
        n_df = df.values
        size = n_df.shape[0]
        self.d_matrix = np.zeros((size, size))

        def point_distance(a, b):
            # Distance between two coordinate vectors under self.distance_type.
            if self.distance_type == "Cosine":
                return spatial.distance.cosine(a, b)
            if self.distance_type == "Euclidean":
                return spatial.distance.euclidean(a, b)
            # Previously an unknown type produced 0 (Ward) or reused a stale
            # or unbound ``dist`` (Average).
            raise ValueError(
                "Unsupported distance type: %r" % (self.distance_type,))

        # Only one orientation of each pair is ever stored, so walking the
        # upper triangle visits the same pairs, in the same insertion order,
        # as the former full n*n scan.
        for i in range(size):
            for j in range(i + 1, size):
                kljuc1 = str(i) + ' ' + str(j)
                kljuc2 = str(j) + ' ' + str(i)
                if kljuc1 in self.distances or kljuc2 in self.distances:
                    continue
                if self.linkage == "Ward":
                    centroid = np.average([n_df[i], n_df[j]], axis=0)
                    # The Euclidean branch used to measure n_df[i] twice; the
                    # second term belongs to n_df[j], as in the Cosine branch.
                    dist = (point_distance(centroid, n_df[i])**2 +
                            point_distance(centroid, n_df[j])**2)
                elif self.linkage == "Average":
                    dist = point_distance(n_df[i], n_df[j])
                else:
                    print("Error creating distance matrix...")
                    # Same effect as exit(1) without depending on the
                    # site-provided ``exit`` builtin.
                    raise SystemExit(1)
                self.distances[kljuc1] = dist

        idZ = 0
        m = len(self.points)  # ids of merged clusters continue after the points
        self.l.log("Finding clusters...")
        while n > 1:
            if not self.distances:
                # Nothing left to merge; min() on an empty table would raise.
                self.l.log("No cluster pairs left to merge...")
                break

            key = min(self.distances, key=self.distances.get)
            par = [int(i) for i in key.split(' ')]
            dist = self.distances[key]

            self.distances.pop(key, None)
            # presumably drops every stored distance involving that cluster
            self.izbrisi_razdalje(par[0])
            self.izbrisi_razdalje(par[1])

            tocke = [self.clusters[par[0]].points,
                     self.clusters[par[1]].points]
            new_id = m + idZ
            novCluster = Cluster(new_id)
            novCluster.update(par[0], par[1], dist, tocke)
            novCluster.centroid = novCluster.calculateCentroid()
            self.clusters.pop(par[0])
            self.clusters.pop(par[1])
            self.clusters.update({new_id: novCluster})
            # presumably adds distances between the new cluster and the rest
            self.dodaj_razdalje(new_id)

            newrow = [par[0], par[1], dist, novCluster.n]
            if idZ == 0:
                # Start as a 2-D array so Z has the same shape after one
                # merge as after many (it used to be a flat list here).
                self.Z = np.array([newrow])
            else:
                self.Z = np.vstack([self.Z, newrow])

            n = len(self.clusters)
            print("par: ", par, ", dist: ", '%.08f' % dist)
            idZ += 1

        self.l.log("Dendrogram created...")

        # TODO: should eventually return the matrix Z together with the
        # results of the methods that estimate the number of clusters.
        return self.clusters
Beispiel #5
0
# Command-line interface: a single required positional integer ``k``.
parser = argparse.ArgumentParser()
parser.add_argument('k', metavar='k', type=int,
                    help='The number of clusters to be used')
args = parser.parse_args()

k = args.k

clusters = []
# presumably the full data set to be clustered - verify against file_reader
plants = file_reader.readFile()
# k distinct elements of ``plants`` serve as the starting centroids;
# random.sample raises ValueError when k exceeds len(plants).
centroids = random.sample(plants, k)

# Initialise the clusters with random centroids
for i in range(0, k):
    cluster = Cluster(i)
    cluster.centroid = centroids[i]
    clusters.append(cluster)

# Control state for the iteration loop that follows: it runs while the
# iteration count is below the limit and isConverging is still False.
isConverging = False
iterationsLimit = 1000
currentIteration = 0

while (currentIteration < iterationsLimit and not isConverging):

    for cluster in clusters:
        cluster.plants = []

    for plant in plants:
        # O primeiro elemento representa o cluster mais próxima e o segundo
        # representa a distância até o centróide do mesmo
        closestCluster = [0, 99999999]