def test_equalsTest_sameInstance_2D(self):
        """Containers built from the same cluster instances compare equal.

        Also checks that a container is unequal to: a container with a
        different second cluster, a container holding a single other cluster,
        a bare ``Cluster`` and a container of another dimension.
        """
        d2 = 2
        d3 = 3
        s1 = Sample((0.0, 0.0))
        s2 = Sample((1.0, 0.0))
        s3 = Sample((2.0, 0.0))
        s4 = Sample((3.0, 0.0))
        s5 = Sample((3.0, 1.0))
        s6 = Sample((0.0, 0.0, 0.0))

        c1 = Cluster([s1, s2, s3], d2)
        c2 = Cluster([s4], d2)
        c3 = Cluster([s5], d2)
        c4 = Cluster([s6], d3)

        container1 = ClusterContainer([c1, c2], d2)
        container2 = ClusterContainer([c1, c2], d2)
        container3 = ClusterContainer([c1, c3], d2)
        container4 = ClusterContainer([c3], d2)
        container5 = c3  # deliberately not a ClusterContainer
        container6 = ClusterContainer([c4], d3)

        # assertEqual/assertNotEqual replace the deprecated
        # assertEquals/assertNotEquals aliases.
        self.assertEqual(container1, container2, "los container son iguales")

        unequal_cases = [
            (container3, "los container no son iguales"),
            (container4, "los container no son iguales"),
            (container5, "container 5 no es un container"),
            (container6, "container 6 es una dimension diferente"),
        ]
        for other, message in unequal_cases:
            self.assertNotEqual(container1, other, message)
    def test_createDistanceGraph_severalSamplesInClusters_4D(self):
        """Check every edge weight of the distance graph of four 4-D clusters.

        Each cluster holds four samples; the expected weight of each of the
        six cluster pairs is listed in ``expected_weights`` below.
        """
        d = 4
        c1 = Cluster([(1.0, 7.0, 0.0, 0.0), (3.0, 7.0, 0.0, 0.0),
                      (1.0, 5.0, 0.0, 0.0), (3.0, 5.0, 0.0, 0.0)], d)
        c3 = Cluster([(4.0, 7.0, 0.0, 0.0), (6.0, 5.0, 0.0, 0.0),
                      (6.0, 7.0, 0.0, 0.0), (4.0, 5.0, 0.0, 0.0)], d)
        c2 = Cluster([(1.0, 3.0, 0.0, 0.0), (3.0, 3.0, 0.0, 0.0),
                      (1.0, 1.0, 0.0, 0.0), (3.0, 1.0, 0.0, 0.0)], d)
        c4 = Cluster([(4.0, 3.0, 0.0, 0.0), (4.0, 1.0, 0.0, 0.0),
                      (6.0, 1.0, 0.0, 0.0), (6.0, 3.0, 0.0, 0.0)], d)
        clusters = ClusterContainer([c1, c2, c3, c4], d)
        g = createDistanceGraph(clusters.getClusters())

        # (endpoint, endpoint, expected weight, failure message)
        expected_weights = [
            (c1, c2, 4.0, "la distancia debe ser 4"),
            (c1, c3, 3.0, "la distancia debe ser 3"),
            (c1, c4, 5.0, "la distancia debe ser 5"),
            (c2, c3, 5.0, "la distancia debe ser 5"),
            (c2, c4, 3.0, "la distancia debe ser 3"),
            (c3, c4, 4.0, "la distancia debe ser 4"),
        ]
        for u, v, weight, message in expected_weights:
            # assertEqual replaces the deprecated assertEquals alias.
            self.assertEqual(
                g.get_edge_data(u, v)['weight'], weight, message)

        # Section marker kept from the original file (was a bare string):
        # minimumdistance: finds the lowest-weight edge in the distance graph.
    def test_createDistanceGraph_severalSamplesInClusters_3D(self):
        """Check every edge weight of the distance graph of four 3-D clusters.

        Each cluster holds four samples; the expected weight of each of the
        six cluster pairs is listed in ``expected_weights`` below.
        """
        d = 3
        c1 = Cluster([(1.0, 7.0, 0.0), (3.0, 7.0, 0.0), (1.0, 5.0, 0.0),
                      (3.0, 5.0, 0.0)], d)
        c3 = Cluster([(4.0, 7.0, 0.0), (6.0, 5.0, 0.0), (6.0, 7.0, 0.0),
                      (4.0, 5.0, 0.0)], d)
        c2 = Cluster([(1.0, 3.0, 0.0), (3.0, 3.0, 0.0), (1.0, 1.0, 0.0),
                      (3.0, 1.0, 0.0)], d)
        c4 = Cluster([(4.0, 3.0, 0.0), (4.0, 1.0, 0.0), (6.0, 1.0, 0.0),
                      (6.0, 3.0, 0.0)], d)
        clusters = ClusterContainer([c1, c2, c3, c4], d)
        g = createDistanceGraph(clusters.getClusters())

        # (endpoint, endpoint, expected weight, failure message)
        expected_weights = [
            (c1, c2, 4.0, "la distancia debe ser 4"),
            (c1, c3, 3.0, "la distancia debe ser 3"),
            (c1, c4, 5.0, "la distancia debe ser 5"),
            (c2, c3, 5.0, "la distancia debe ser 5"),
            (c2, c4, 3.0, "la distancia debe ser 3"),
            (c3, c4, 4.0, "la distancia debe ser 4"),
        ]
        for u, v, weight, message in expected_weights:
            # assertEqual replaces the deprecated assertEquals alias.
            self.assertEqual(
                g.get_edge_data(u, v)['weight'], weight, message)
    def test_minimumDistance_trivial_4d(self):
        """With two single-sample 4-D clusters the minimum edge weighs 1.0."""
        d = 4
        c1 = Cluster([(0.0, 0.0, 0.0, 0.0)], d)
        c2 = Cluster([(1.0, 0.0, 0.0, 0.0)], d)
        clusters = ClusterContainer([c1, c2], d)
        g = createDistanceGraph(clusters.getClusters())

        # minimumEdge returns the two endpoints of the selected edge.
        u, v = minimumEdge(g)
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(g[u][v]['weight'], 1.0,
                         "la minima arista tiene peso 1")
 def test_minimunDistanceOnlyOneSamplesForCluster_4D(self):
     """Four single-sample 4-D clusters: the minimum edge must weigh 3.0."""
     d = 4
     c1 = Cluster([(2.0, 6.0, 0.0, 0.0)], d)
     c2 = Cluster([(2.0, 2.0, 0.0, 0.0)], d)
     c3 = Cluster([(5.0, 6.0, 0.0, 0.0)], d)
     c4 = Cluster([(5.0, 2.0, 0.0, 0.0)], d)
     clusters = ClusterContainer([c1, c2, c3, c4], d)
     g = createDistanceGraph(clusters.getClusters())

     # minimumEdge returns the two endpoints of the selected edge.
     u, v = minimumEdge(g)
     # assertEqual replaces the deprecated assertEquals alias.
     self.assertEqual(g[u][v]['weight'], 3.0,
                      "la minima arista tiene peso 3")
    def test_minimumDistance_severalSamplesInClusters_2D(self):
        """Four 2-D clusters of four samples: the minimum edge must weigh 3.0."""
        d = 2
        c1 = Cluster([(1.0, 7.0), (3.0, 7.0), (1.0, 5.0), (3.0, 5.0)], d)
        c2 = Cluster([(1.0, 3.0), (3.0, 3.0), (1.0, 1.0), (3.0, 1.0)], d)
        c3 = Cluster([(4.0, 7.0), (6.0, 5.0), (6.0, 7.0), (4.0, 5.0)], d)
        c4 = Cluster([(4.0, 3.0), (4.0, 1.0), (6.0, 1.0), (6.0, 3.0)], d)
        clusters = ClusterContainer([c1, c2, c3, c4], d)
        g = createDistanceGraph(clusters.getClusters())

        # minimumEdge returns the two endpoints of the selected edge.
        u, v = minimumEdge(g)
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(g[u][v]['weight'], 3.0,
                         "la minima arista tiene peso 3")
    def test_equalsTest_differentInstancesOfTheSameData3D(self):
        """Containers built from equal data, but distinct instances, are equal.

        Also checks inequality against a container with one differing sample,
        a container with a single cluster, a bare ``Cluster`` and a 4-D
        container.
        """
        d = 3
        d4 = 4
        c1 = ClusterContainer([
            Cluster([(0.0, 1.0, 0.0), (1.0, 1.0, 0.0), (2.0, 1.0, 0.0)], d),
            Cluster([(2.6, 3.4, 0.0)], d),
        ], d)
        c2 = ClusterContainer([
            Cluster([(0.0, 1.0, 0.0), (1.0, 1.0, 0.0), (2.0, 1.0, 0.0)], d),
            Cluster([(2.6, 3.4, 0.0)], d),
        ], d)
        # Same as c1 except for the sample (2.7, 3.4, 0.0).
        c3 = ClusterContainer([
            Cluster([(0.0, 1.0, 0.0), (1.0, 1.0, 0.0), (2.0, 1.0, 0.0)], d),
            Cluster([(2.7, 3.4, 0.0)], d),
        ], d)
        c4 = ClusterContainer([Cluster([(2.6, 3.4, 0.0)], d)], d)
        c5 = Cluster([(2.6, 3.4, 0.0)], d)  # deliberately not a container
        # Both clusters are declared 4-D so they match their 4-component
        # samples (the first one used to be declared with d == 3).
        c6 = ClusterContainer([
            Cluster([(0.0, 1.0, 0.0, 0.0), (1.0, 1.0, 0.0, 0.0),
                     (2.0, 1.0, 0.0, 0.0)], d4),
            Cluster([(2.6, 3.4, 0.0, 0.0)], d4),
        ], d4)

        # assertEqual/assertNotEqual replace the deprecated
        # assertEquals/assertNotEquals aliases.
        self.assertEqual(c1, c2, "Los dos clusters container son iguales")
        self.assertNotEqual(c1, c3, "los clusters container no son iguales")
        self.assertNotEqual(c1, c4, "Los clusters container no son iguales")
        self.assertNotEqual(c1, c5, "c5 no es un cluster container")
        self.assertNotEqual(
            c1, c6, "los cluster container son de dimension diferente")
    def test_createDistanceGraph_4D(self):
        """Check every edge weight of the graph of four single-sample clusters.

        The expected weight of each of the six cluster pairs is listed in
        ``expected_weights`` below.
        """
        d = 4
        c1 = Cluster([(2.0, 6.0, 0.0, 0.0)], d)
        c2 = Cluster([(2.0, 2.0, 0.0, 0.0)], d)
        c3 = Cluster([(5.0, 6.0, 0.0, 0.0)], d)
        c4 = Cluster([(5.0, 2.0, 0.0, 0.0)], d)
        clusters = ClusterContainer([c1, c2, c3, c4], d)
        g = createDistanceGraph(clusters.getClusters())

        # (endpoint, endpoint, expected weight, failure message)
        expected_weights = [
            (c1, c2, 4.0, "la distancia debe ser 4"),
            (c1, c3, 3.0, "la distancia debe ser 3"),
            (c1, c4, 5.0, "la distancia debe ser 5"),
            (c2, c3, 5.0, "la distancia debe ser 5"),
            (c2, c4, 3.0, "la distancia debe ser 3"),
            (c3, c4, 4.0, "la distancia debe ser 4"),
        ]
        for u, v, weight, message in expected_weights:
            # assertEqual replaces the deprecated assertEquals alias.
            self.assertEqual(
                g.get_edge_data(u, v)['weight'], weight, message)
 def test_onlyOneOutlier_3D(self):
     """A single classB sample must split classA into two clusters.

     classA has four samples with y = 0, 1, 2, 3 and classB one sample at
     y = 1.5; the expected result groups {y=2, y=3} and {y=0, y=1}.
     """
     d = 3
     classA = SampleContainer([(0.0, 0.0, 0.0), (0.0, 1.0, 0.0),
                               (0.0, 2.0, 0.0), (0.0, 3.0, 0.0)], d)
     classB = SampleContainer([(0.0, 1.5, 0.0)], d)

     clusters = createClusters(classA, classB)

     upper = Cluster([(0.0, 2.0, 0.0), (0.0, 3.0, 0.0)], d)
     lower = Cluster([(0.0, 0.0, 0.0), (0.0, 1.0, 0.0)], d)
     clusters_test = ClusterContainer([upper, lower], d)
     # assertEqual replaces the deprecated assertEquals alias.
     self.assertEqual(
         clusters, clusters_test,
         "las muestras mergeables deben estar en el mismo cluster")
    def test_onlyOneSampleForCluster_2D(self):
        """Every classA sample must end up in its own cluster.

        classA samples sit at y = 1, 3, 5, 7 and classB samples at
        y = 2, 4, 6, 8 (all with x = 0), so the two classes alternate.
        """
        d = 2
        s0_1, s0_2, s0_3, s0_4 = (0.0, 2.0), (0.0, 4.0), (0.0, 6.0), (0.0, 8.0)
        s1_1, s1_2, s1_3, s1_4 = (0.0, 1.0), (0.0, 3.0), (0.0, 5.0), (0.0, 7.0)
        classA = SampleContainer([s1_1, s1_2, s1_3, s1_4], d)
        classB = SampleContainer([s0_1, s0_2, s0_3, s0_4], d)

        clusters = createClusters(classA, classB)

        # One single-sample cluster per classA sample, in the original order.
        clusters_test = ClusterContainer(
            [Cluster([sample], d) for sample in (s1_1, s1_2, s1_3, s1_4)], d)
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(clusters, clusters_test,
                         "debe generarce un cluster para cada muestra")
def createClusters(samplesA, samplesB):
    """Greedily merge the clusters of ``samplesA`` without enclosing outliers.

    If ``samplesA`` is not already a ``ClusterContainer`` it is first turned
    into one via ``createDefaultClusters``. The function then repeatedly
    takes the edge returned by ``minimumEdge`` on the distance graph and
    tentatively merges its two clusters:

    * if ``containsOutlier(merged, samplesB)`` is true, the merge is rejected
      and that edge's weight is set to infinity;
    * otherwise the container and the distance graph are updated with the
      merged cluster and the cluster count ``K`` shrinks by one.

    The counter ``k`` grows by one on every iteration and restarts after each
    accepted merge; the loop ends when ``k`` reaches ``K``.

    Args:
        samplesA: container (samples or clusters) to be clustered; must
            provide ``getSize()``.
        samplesB: samples handed to ``containsOutlier`` as the outlier set.

    Returns:
        The initial clusters unchanged when ``samplesA.getSize() == 1``;
        otherwise a new ``ClusterContainer`` holding only the clusters whose
        size is at least 1% of ``samplesA.getSize()``.
    """
    if isinstance(samplesA, ClusterContainer):
        print("si es sample")
        clusters = samplesA
    else:
        print("no es sample")
        clusters = createDefaultClusters(samplesA)
    samples = samplesB

    if samplesA.getSize() == 1:
        return clusters

    K = clusters.getSize()
    k = 0
    distances_graph = createDistanceGraph(clusters.getClusters())
    while k < K:
        (u, v) = minimumEdge(distances_graph)
        merged = mergeClusters(u, v)
        if containsOutlier(merged, samples):
            # Rejected: push this edge to the back so it is not the minimum
            # again while finite-weight edges remain.
            distances_graph[u][v]['weight'] = float('inf')
        else:
            clusters = updateClusterContainer(clusters, u, v, merged)
            distances_graph = updateDistanceGraph(distances_graph, u, v,
                                                  merged)
            K = K - 1
            k = 0
        k = k + 1

    # Build a real list (not a lazy filter object) so the container receives
    # the same kind of argument on Python 2 and Python 3.
    min_size = samplesA.getSize() * 0.01
    kept = [
        cluster for cluster in clusters.getClusters()
        if cluster.getSize() >= min_size
    ]
    return ClusterContainer(kept, clusters.getDimension())
    def test_createClusters_allSamplesInTheSameCluster_2D(self):
        """All six class1 samples must be merged into one single cluster."""
        d = 2
        s0_1 = Sample((3.0, 3.0))
        s0_2 = Sample((4.0, 4.0))
        s0_3 = Sample((3.0, 4.0))

        s1_1 = Sample((0.0, 1.0))
        s1_2 = Sample((0.0, 2.0))
        s1_3 = Sample((0.0, 3.0))
        s1_4 = Sample((1.0, 0.0))
        s1_5 = Sample((1.0, 1.0))
        s1_6 = Sample((1.0, 2.0))

        class0 = SampleContainer([s0_1, s0_2, s0_3], d)
        class1 = SampleContainer([s1_1, s1_2, s1_3, s1_4, s1_5, s1_6], d)

        clusters_test = ClusterContainer(
            [Cluster([s1_1, s1_2, s1_3, s1_4, s1_5, s1_6], d)], d)
        clusters = createClusters(class1, class0)
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(
            clusters_test, clusters,
            "todas las muestras deben estar en un unico cluster")
    def test_createRegions_trivial4D(self):
        """Check the hyperplane built for one group sample and one cluster.

        Takes a hyperplane popped from the first region returned by
        ``createRegions`` and checks its first two coefficients and its
        intercept.
        """
        d = 4
        groups = GroupContainer(d)
        groups.addSamples(1, [Sample((5.0, 4.0, 0.0, 0.0))])
        clusters = ClusterContainer([Cluster([(7.0, 4.0, 0.0, 0.0)], d)], d)
        regions = createRegions(groups, clusters)
        hyperplane = regions[0].getHyperplanes().pop()

        # Read each value once; the bounds are narrow open intervals because
        # the expected values are not exactly representable.
        first_coefficient = hyperplane.getCoefficient(0)
        intercept = hyperplane.getIntercept()

        self.assertTrue(
            0.999999950215 < first_coefficient < 0.999999950216,
            "0.99999995001 debe multiplicar la primer incognita")
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(0.0, hyperplane.getCoefficient(1),
                         "0.0 debe multiplicar la segunda incognita")
        self.assertTrue(
            5.9999997012 < intercept < 5.9999997013,
            "alfa debe ser 5.9999997013")
    def test_createClusters_severalOutliers2D(self):
        """Outliers between the samples must yield [s1,s2], [s3,s4], [s5]."""
        d = 2
        s1 = (3.0, 7.0)
        s2 = (3.0, 6.0)
        s3 = (10.0, 7.0)
        s4 = (10.0, 6.0)
        s5 = (6.5, 6.5)
        samplesA = SampleContainer([s1, s2, s3, s4, s5], d)
        samplesB = SampleContainer([(6.0, 7.0), (6.0, 6.0), (7.0, 6.0),
                                    (7.0, 7.0), (6.0, 6.5), (7.0, 6.5)], d)
        c1 = Cluster([s1, s2], d)
        c2 = Cluster([s4, s3], d)
        c3 = Cluster([s5], d)

        container_test = ClusterContainer([c1, c2, c3], d)
        container = createClusters(samplesA, samplesB)

        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(
            container, container_test,
            "Deben definirse los clusters: [s1,s2],[s3,s4],[s5]")
def createClusters2(samplesA, samplesB):
    """Merge clusters in passes over a pre-sorted edge list, then refine.

    Starts from ``createDefaultClusters(samplesA)`` and walks the edges of
    the distance graph in ascending weight order. An edge is considered only
    if neither endpoint has been merged since the last sort; its clusters are
    merged unless ``containsOutlier(merged, samplesB)`` is true. When the
    edge list runs out it is rebuilt from the current graph and the
    "already merged" map is reset. The counter ``k`` grows by one for each
    considered edge and restarts after each accepted merge; the loop ends
    when ``k`` reaches the current cluster count ``K``.

    The result is then passed through ``createClusters`` and finally
    filtered by size.

    Args:
        samplesA: sample container to be clustered.
        samplesB: samples handed to ``containsOutlier`` as the outlier set.

    Returns:
        The default clusters unchanged when ``samplesA.getSize() == 1``;
        otherwise a new ``ClusterContainer`` holding only the clusters whose
        size is at least 1% of ``samplesA.getSize()``.
    """
    clusters = createDefaultClusters(samplesA)
    samples = samplesB

    if samplesA.getSize() == 1:
        return clusters

    def edge_weight(edge):
        # Edges come as (u, v, data) triples because of edges(data=True).
        return edge[2]['weight']

    K = clusters.getSize()
    k = 0
    print("creando grafo de distancias...")
    distances_graph = createDistanceGraph(clusters.getClusters())
    sorted_edges = sorted(distances_graph.edges(data=True), key=edge_weight)
    has_already_been_merged = createMap(distances_graph.nodes)

    print("reduciendo clusters...")
    while k < K:
        if not sorted_edges:
            # Every edge of this pass was consumed: start a new pass over
            # the current (already updated) graph.
            print("re-ordenando...")
            sorted_edges = sorted(distances_graph.edges(data=True),
                                  key=edge_weight)
            has_already_been_merged = createMap(distances_graph.nodes)
            continue

        # Consume the lightest remaining edge; its data dict is not needed.
        u, v, _ = sorted_edges.pop(0)
        if has_already_been_merged[v] or has_already_been_merged[u]:
            # Stale edge: one endpoint no longer exists as a cluster.
            continue

        merged = mergeClusters(u, v)
        if not containsOutlier(merged, samples):
            clusters = updateClusterContainer(clusters, u, v, merged)
            distances_graph = updateDistanceGraph(distances_graph, u, v,
                                                  merged)
            has_already_been_merged[v] = True
            has_already_been_merged[u] = True
            K = K - 1
            k = 0
        k = k + 1

    # A list (not a lazy map object) so the sizes are actually printed on
    # Python 3 as well.
    print([cluster.getSize() for cluster in clusters.getClusters()])

    clusters = createClusters(clusters, samplesB)

    # Build a real list (not a lazy filter object) so the container receives
    # the same kind of argument on Python 2 and Python 3.
    min_size = samplesA.getSize() * 0.01
    kept = [
        cluster for cluster in clusters.getClusters()
        if cluster.getSize() >= min_size
    ]
    return ClusterContainer(kept, clusters.getDimension())
def createDefaultClusters(samples):
    """Return a ``ClusterContainer`` with one single-sample cluster per sample.

    Args:
        samples: container providing ``getDimension()`` and ``getSamples()``.

    Returns:
        A ``ClusterContainer`` of the same dimension in which every sample of
        ``samples`` is wrapped in its own ``Cluster``.
    """
    d = samples.getDimension()
    # A list comprehension (instead of map) hands the container a real list
    # on Python 3 too, where map() would be a one-shot iterator.
    return ClusterContainer(
        [Cluster([sample], d) for sample in samples.getSamples()], d)
# Example #17
# 0
def removeOutliers(clusters, outliers):
    """Return a copy of ``clusters`` with the outlier samples taken out.

    Each cluster is rebuilt from ``cluster.getSamples() -
    outliers.getSamples()`` with its original dimension.
    NOTE(review): this relies on ``getSamples()`` returning objects that
    support ``-`` (presumably sets) -- confirm against the container classes.

    Args:
        clusters: ``ClusterContainer`` whose clusters are to be cleaned.
        outliers: container whose ``getSamples()`` lists the samples to drop.

    Returns:
        A new ``ClusterContainer`` with the same dimension as ``clusters``.
    """
    # A list comprehension (instead of map) hands the container a real list
    # on Python 3 too, where map() would be a one-shot iterator.
    cleaned = [
        Cluster(cluster.getSamples() - outliers.getSamples(),
                cluster.getDimension())
        for cluster in clusters.getClusters()
    ]
    return ClusterContainer(cleaned, clusters.getDimension())