Beispiel #1
0
    def __init__(self, data):
        """Build a binary cluster tree over the rows of ``data``.

        The constructor runs the hierarchical clusterer on a copy of ``data``,
        converts its merge list (``children_``) into linked ``Cluster`` nodes,
        collapses every zero-range subtree into a single "true leaf", and
        finally assigns each node a level (true leaves are level 1, a parent
        is one more than its deepest child).

        Args:
            data: array indexed as ``data[i, :]`` whose first axis
                (``data.shape[0]``) enumerates the samples.

        Attributes set:
            root: top-most node reached by following ``parent`` links.
            max_level: level of ``root``.
            data: the input array, stored as given.
            clusters: dict mapping node id -> ``Cluster`` for surviving nodes.
            leaves: dict mapping sample index -> the true leaf that holds it.
        """
        drange = self.__diameter(data)
        dcount = data.shape[0]

        # run clustering
        # NOTE(review): hcluster is presumably sklearn's
        # AgglomerativeClustering (or compatible) - confirm at the import.
        clusterer = hcluster(compute_full_tree=True)
        clusterer.fit(data.copy())
        hc_children = clusterer.children_

        # setup leaf clusters: one singleton node per sample, keyed by index
        clusters = {
            i: Cluster(i, None, None, data=np.array([data[i, :]]), count=1, range=0)
            for i in range(dcount)
        }
        leaves = {}

        # setup tree
        if drange != 0:
            minrange = 1
            # Row idx of the merge list joins two existing nodes into a new
            # node whose id is idx + dcount.
            for idx, children in enumerate(hc_children):
                left_child = clusters[children[0]]
                right_child = clusters[children[1]]
                node_id = idx + dcount
                cluster = Cluster(node_id, left_child, right_child)
                cluster.data = np.vstack((left_child.data, right_child.data))
                # range is the node's diameter relative to the whole data set
                cluster.range = self.__diameter(cluster.data) / drange
                if cluster.range < minrange:
                    minrange = cluster.range
                cluster.count = left_child.count + right_child.count
                left_child.parent = cluster
                right_child.parent = cluster
                clusters[node_id] = cluster

            # Redirect every id to its highest zero-range ancestor so that
            # groups of identical samples collapse into one true leaf.
            # Only values of existing keys are replaced here, so iterating
            # the dict directly is safe.
            for node_id in clusters:
                cluster = clusters[node_id]
                leaf = (cluster.count == 1)
                while cluster.parent is not None and cluster.parent.range == 0:
                    cluster = cluster.parent
                clusters[node_id] = cluster
                if (cluster.range == 0 and cluster.parent is not None
                        and cluster.parent.range != 0):
                    if cluster.items is None:
                        cluster.items = set()
                    if leaf:
                        leaves[node_id] = cluster
                        cluster.items.add(node_id)

            # clear "clusters" from original leaves
            # set true leaves children to empty
            # Iterate over a snapshot of the keys: deleting from a dict while
            # iterating its live key view raises RuntimeError on Python 3.
            for node_id in list(clusters):
                cluster = clusters[node_id]
                if cluster.id != node_id:
                    del clusters[node_id]
                elif cluster.range == 0:
                    cluster.left, cluster.right = None, None

            # Replace zero ranges with a small value and turn counts into
            # fractions of the data set.
            # NOTE(review): minrange is taken over all merged nodes, including
            # zero-range ones, so it is 0 whenever duplicates were merged and
            # the zero ranges then stay 0 - confirm this is intended.
            for cluster in clusters.values():
                if cluster.range == 0:
                    cluster.range = minrange if minrange < 0.1 else 1e-4
                cluster.count = cluster.count * 1.0 / dcount
        else:
            # Degenerate data (diameter 0): a single node holds every sample.
            big_cluster = Cluster(dcount, None, None, data=np.array(data[0, :]),
                                  count=dcount, range=1e-4)
            big_cluster.items = set(range(dcount))
            for i in range(dcount):
                leaves[i] = big_cluster
            clusters = {dcount: big_cluster}

        # setup leaf levels
        for leaf_cluster in leaves.values():
            leaf_cluster.level = 1

        # compute all tree levels: climb from each leaf and stop at the first
        # ancestor that still has a child without a level; a later climb from
        # that child's side will finish it.
        computed = {leaf_cluster.id for leaf_cluster in leaves.values()}
        for leaf_cluster in leaves.values():
            cluster = leaf_cluster
            while (cluster is not None
                   and (cluster.right is None or cluster.right.id in computed)
                   and (cluster.left is None or cluster.left.id in computed)):
                if cluster.id not in computed:
                    cluster.level = max(cluster.right.level, cluster.left.level) + 1
                    computed.add(cluster.id)
                cluster = cluster.parent

        # set tree state variables: the root is found by walking up from the
        # leaf that holds sample 0
        root = leaves[0]
        while root.parent is not None:
            root = root.parent
        self.root = root
        self.max_level = root.level
        self.data = data
        self.clusters = clusters
        self.leaves = leaves
        self.__drange = drange
        self.__dcount = dcount
Beispiel #2
0
    def __init__(self, data):
        """Build a binary cluster tree over the rows of ``data``.

        Steps, in order: run the hierarchical clusterer on a copy of ``data``;
        link its merge list (``children_``) into ``Cluster`` nodes; fold each
        zero-range subtree into one "true leaf"; normalise ranges and counts;
        assign levels bottom-up (true leaves are level 1).

        Args:
            data: array indexed as ``data[i, :]`` whose first axis
                (``data.shape[0]``) enumerates the samples.

        Attributes set:
            root: top-most node reached by following ``parent`` links.
            max_level: level of ``root``.
            data: the input array, stored as given.
            clusters: dict mapping node id -> ``Cluster`` for surviving nodes.
            leaves: dict mapping sample index -> the true leaf that holds it.
        """
        drange = self.__diameter(data)
        dcount = data.shape[0]

        # run clustering
        # NOTE(review): hcluster is presumably sklearn's
        # AgglomerativeClustering (or compatible) - confirm at the import.
        clusterer = hcluster(compute_full_tree=True)
        clusterer.fit(data.copy())
        hc_children = clusterer.children_

        # setup leaf clusters: one singleton node per sample, keyed by index
        clusters = {
            i: Cluster(i,
                       None,
                       None,
                       data=np.array([data[i, :]]),
                       count=1,
                       range=0)
            for i in range(dcount)
        }
        leaves = {}

        # setup tree
        if drange != 0:
            minrange = 1
            # Merge row idx creates the node with id idx + dcount.
            for idx, children in enumerate(hc_children):
                left_child = clusters[children[0]]
                right_child = clusters[children[1]]
                node_id = idx + dcount
                cluster = Cluster(node_id, left_child, right_child)
                cluster.data = np.vstack((left_child.data, right_child.data))
                # range is the node's diameter relative to the whole data set
                cluster.range = self.__diameter(cluster.data) / drange
                if cluster.range < minrange:
                    minrange = cluster.range
                cluster.count = left_child.count + right_child.count
                left_child.parent = cluster
                right_child.parent = cluster
                clusters[node_id] = cluster

            # Point every id at its highest zero-range ancestor, so identical
            # samples end up sharing one true leaf. Only values of existing
            # keys change, which is safe while iterating the dict.
            for node_id in clusters:
                cluster = clusters[node_id]
                leaf = (cluster.count == 1)
                while (cluster.parent is not None
                       and cluster.parent.range == 0):
                    cluster = cluster.parent
                clusters[node_id] = cluster
                if (cluster.range == 0 and cluster.parent is not None
                        and cluster.parent.range != 0):
                    if cluster.items is None:
                        cluster.items = set()
                    if leaf:
                        leaves[node_id] = cluster
                        cluster.items.add(node_id)

            # clear "clusters" from original leaves
            # set true leaves children to empty
            # A key snapshot is required: deleting entries while iterating the
            # live keys() view raises RuntimeError on Python 3.
            for node_id in list(clusters):
                cluster = clusters[node_id]
                if cluster.id != node_id:
                    del clusters[node_id]
                elif cluster.range == 0:
                    cluster.left, cluster.right = None, None

            # Give zero-range nodes a small substitute range and convert
            # counts to fractions of the data set.
            # NOTE(review): minrange also covers zero-range merged nodes, so
            # it is 0 when duplicates exist and zero ranges then remain 0 -
            # confirm this is intended.
            for cluster in clusters.values():
                if cluster.range == 0:
                    if minrange < 0.1:
                        cluster.range = minrange
                    else:
                        cluster.range = 1e-4
                cluster.count = cluster.count * 1.0 / dcount
        else:
            # Degenerate data (diameter 0): a single node holds every sample.
            big_cluster = Cluster(dcount,
                                  None,
                                  None,
                                  data=np.array(data[0, :]),
                                  count=dcount,
                                  range=1e-4)
            big_cluster.items = set(range(dcount))
            for i in range(dcount):
                leaves[i] = big_cluster
            clusters = {dcount: big_cluster}

        # setup leaf levels
        for leaf_cluster in leaves.values():
            leaf_cluster.level = 1

        # compute all tree levels: climb from each leaf until an ancestor
        # still has a child without a level; the climb from that child's side
        # completes it later.
        computed = {leaf_cluster.id for leaf_cluster in leaves.values()}
        for leaf_cluster in leaves.values():
            cluster = leaf_cluster
            while (cluster is not None
                   and (cluster.right is None or cluster.right.id in computed)
                   and (cluster.left is None or cluster.left.id in computed)):
                if cluster.id not in computed:
                    cluster.level = max(cluster.right.level,
                                        cluster.left.level) + 1
                    computed.add(cluster.id)
                cluster = cluster.parent

        # set tree state variables: walk up from the leaf holding sample 0
        root = leaves[0]
        while root.parent is not None:
            root = root.parent
        self.root = root
        self.max_level = root.level
        self.data = data
        self.clusters = clusters
        self.leaves = leaves
        self.__drange = drange
        self.__dcount = dcount
Beispiel #3
0
    def __init__(self, data):
        """Build a binary cluster tree over a 1-d array of values.

        The values are sorted, clustered with complete linkage, and the merge
        list (``children_``) is converted into ``Cluster`` nodes whose ``data``
        is a ``(low, high)`` pair. Zero-range subtrees are collapsed into
        single "true leaves", ranges and counts are normalised, and levels are
        assigned bottom-up (true leaves are level 1).

        Args:
            data: 1-d numpy array (the code relies on ``argsort`` and
                ``reshape``); it is not modified, a sorted column copy is
                stored instead.

        Attributes set:
            data: the sorted values as an ``(n, 1)`` array.
            root: the surviving node with the largest id.
            max_level: largest level found among the surviving nodes.
            clusters: dict mapping node id -> ``Cluster`` for surviving nodes.
            leaves: dict mapping sorted position -> the true leaf holding it.
        """
        # NOTE: assumes data is 1-d numpy array
        pos = data.argsort()
        data = data[pos]
        data = data.reshape(data.shape[0], 1)
        drange = data[-1, 0] - data[0, 0]
        dcount = data.shape[0]

        # run clustering (complete linkage)
        # NOTE(review): hcluster is presumably sklearn's
        # AgglomerativeClustering (or compatible) - confirm at the import.
        clusterer = hcluster(compute_full_tree=True, linkage='complete')
        clusterer.fit(data)
        hc_children = clusterer.children_

        # setup leaf clusters: each sample starts as the interval (v, v)
        clusters = {
            i: Cluster(i, None, None, data=(data[i, 0], data[i, 0]), count=1, range=0)
            for i in range(dcount)
        }
        leaves = {}

        # setup tree
        if drange != 0:
            minrange = 1
            # Row idx of the merge list creates the node with id idx + dcount.
            for idx, children in enumerate(hc_children):
                lc, rc = clusters[children[0]], clusters[children[1]]
                # Order the pair so the child with the lower interval is left.
                check = 0 if lc.data[-1] < rc.data[0] else 1
                left_child = clusters[children[check]]
                right_child = clusters[children[1 - check]]
                node_id = idx + dcount
                cluster = Cluster(node_id, left_child, right_child)
                cluster.data = (min(left_child.data), max(right_child.data))
                # range is the interval width relative to the full data width
                cluster.range = (cluster.data[-1] - cluster.data[0]) / drange
                if cluster.range < minrange:
                    minrange = cluster.range
                cluster.count = left_child.count + right_child.count
                left_child.parent = cluster
                right_child.parent = cluster
                clusters[node_id] = cluster

            # Redirect every id to its highest zero-range ancestor so equal
            # values collapse into one true leaf. Only values of existing
            # keys are replaced, which is safe while iterating the dict.
            for node_id in clusters:
                cluster = clusters[node_id]
                leaf = (cluster.count == 1)
                while cluster.parent is not None and cluster.parent.range == 0:
                    cluster = cluster.parent
                clusters[node_id] = cluster
                if (cluster.range == 0 and cluster.parent is not None
                        and cluster.parent.range != 0):
                    if cluster.items is None:
                        cluster.items = set()
                    if leaf:
                        leaves[node_id] = cluster
                        cluster.items.add(node_id)

            # Drop redirected ids and detach the children of true leaves.
            # Iterate over a snapshot of the keys: deleting from a dict while
            # iterating its live key view raises RuntimeError on Python 3.
            for node_id in list(clusters):
                cluster = clusters[node_id]
                if cluster.id != node_id:
                    del clusters[node_id]
                elif cluster.range == 0:
                    cluster.left, cluster.right = None, None

            # Replace zero ranges with a small value and turn counts into
            # fractions of the data set.
            # NOTE(review): minrange is taken over all merged nodes, including
            # zero-range ones, so it is 0 whenever duplicates were merged and
            # the zero ranges then stay 0 - confirm this is intended.
            for cluster in clusters.values():
                if cluster.range == 0:
                    cluster.range = minrange if minrange < 0.1 else 1e-4
                cluster.count = cluster.count * 1.0 / dcount
        else:
            # Degenerate data (all values equal): one node holds everything.
            big_cluster = Cluster(dcount, None, None, data=(data[0, 0], data[0, 0]),
                                  count=dcount, range=1e-4)
            big_cluster.items = set(range(dcount))
            for i in range(dcount):
                leaves[i] = big_cluster
            clusters = {dcount: big_cluster}

        # setup leaf levels
        for leaf_cluster in leaves.values():
            leaf_cluster.level = 1

        # compute all tree levels: climb from each leaf; a node with equal
        # interval ends is passed through without inspecting its children,
        # otherwise both children must already have a level.
        computed = {leaf_cluster.id for leaf_cluster in leaves.values()}
        for leaf_cluster in leaves.values():
            cluster = leaf_cluster
            while (cluster is not None
                   and (cluster.data[0] == cluster.data[1]
                        or (cluster.right.id in computed
                            and cluster.left.id in computed))):
                if cluster.id not in computed:
                    cluster.level = max(cluster.right.level, cluster.left.level) + 1
                    computed.add(cluster.id)
                cluster = cluster.parent

        # set tree state variables: the root is the node with the largest id
        biggest, max_level = 0, 0
        for node_id, cluster in clusters.items():
            if node_id > biggest:
                biggest = node_id
            if cluster.level > max_level:
                max_level = cluster.level
        self.data = data
        self.root = clusters[biggest]
        self.max_level = max_level
        self.clusters = clusters
        self.leaves = leaves
        self.__dindex = pos               # sorted position -> original index
        self.__findex = np.argsort(pos)   # original index -> sorted position
        self.__drange = drange
        self.__dcount = dcount
Beispiel #4
0
    def __init__(self, data):
        """Build a binary cluster tree over a 1-d array of values.

        Steps, in order: sort the values; cluster them with complete linkage;
        link the merge list (``children_``) into ``Cluster`` nodes carrying a
        ``(low, high)`` interval; fold each zero-range subtree into one "true
        leaf"; normalise ranges and counts; assign levels bottom-up (true
        leaves are level 1).

        Args:
            data: 1-d numpy array (the code relies on ``argsort`` and
                ``reshape``); it is not modified, a sorted column copy is
                stored instead.

        Attributes set:
            data: the sorted values as an ``(n, 1)`` array.
            root: the surviving node with the largest id.
            max_level: largest level found among the surviving nodes.
            clusters: dict mapping node id -> ``Cluster`` for surviving nodes.
            leaves: dict mapping sorted position -> the true leaf holding it.
        """
        # NOTE: assumes data is 1-d numpy array
        pos = data.argsort()
        data = data[pos]
        data = data.reshape(data.shape[0], 1)
        drange = data[-1, 0] - data[0, 0]
        dcount = data.shape[0]

        # run clustering (complete linkage)
        # NOTE(review): hcluster is presumably sklearn's
        # AgglomerativeClustering (or compatible) - confirm at the import.
        clusterer = hcluster(compute_full_tree=True, linkage='complete')
        clusterer.fit(data)
        hc_children = clusterer.children_

        # setup leaf clusters: each sample starts as the interval (v, v)
        clusters = {
            i: Cluster(i,
                       None,
                       None,
                       data=(data[i, 0], data[i, 0]),
                       count=1,
                       range=0)
            for i in range(dcount)
        }
        leaves = {}

        # setup tree
        if drange != 0:
            minrange = 1
            # Merge row idx creates the node with id idx + dcount.
            for idx, children in enumerate(hc_children):
                lc, rc = clusters[children[0]], clusters[children[1]]
                # Order the pair so the child with the lower interval is left.
                check = 0 if lc.data[-1] < rc.data[0] else 1
                left_child = clusters[children[check]]
                right_child = clusters[children[1 - check]]
                node_id = idx + dcount
                cluster = Cluster(node_id, left_child, right_child)
                cluster.data = (min(left_child.data), max(right_child.data))
                # range is the interval width relative to the full data width
                cluster.range = (cluster.data[-1] - cluster.data[0]) / drange
                if cluster.range < minrange:
                    minrange = cluster.range
                cluster.count = left_child.count + right_child.count
                left_child.parent = cluster
                right_child.parent = cluster
                clusters[node_id] = cluster

            # Point every id at its highest zero-range ancestor, so equal
            # values end up sharing one true leaf. Only values of existing
            # keys change, which is safe while iterating the dict.
            for node_id in clusters:
                cluster = clusters[node_id]
                leaf = (cluster.count == 1)
                while (cluster.parent is not None
                       and cluster.parent.range == 0):
                    cluster = cluster.parent
                clusters[node_id] = cluster
                if (cluster.range == 0 and cluster.parent is not None
                        and cluster.parent.range != 0):
                    if cluster.items is None:
                        cluster.items = set()
                    if leaf:
                        leaves[node_id] = cluster
                        cluster.items.add(node_id)

            # Drop redirected ids and detach the children of true leaves.
            # A key snapshot is required: deleting entries while iterating the
            # live keys() view raises RuntimeError on Python 3.
            for node_id in list(clusters):
                cluster = clusters[node_id]
                if cluster.id != node_id:
                    del clusters[node_id]
                elif cluster.range == 0:
                    cluster.left, cluster.right = None, None

            # Give zero-range nodes a small substitute range and convert
            # counts to fractions of the data set.
            # NOTE(review): minrange also covers zero-range merged nodes, so
            # it is 0 when duplicates exist and zero ranges then remain 0 -
            # confirm this is intended.
            for cluster in clusters.values():
                if cluster.range == 0:
                    if minrange < 0.1:
                        cluster.range = minrange
                    else:
                        cluster.range = 1e-4
                cluster.count = cluster.count * 1.0 / dcount
        else:
            # Degenerate data (all values equal): one node holds everything.
            big_cluster = Cluster(dcount,
                                  None,
                                  None,
                                  data=(data[0, 0], data[0, 0]),
                                  count=dcount,
                                  range=1e-4)
            big_cluster.items = set(range(dcount))
            for i in range(dcount):
                leaves[i] = big_cluster
            clusters = {dcount: big_cluster}

        # setup leaf levels
        for leaf_cluster in leaves.values():
            leaf_cluster.level = 1

        # compute all tree levels: climb from each leaf; a node with equal
        # interval ends is passed through without inspecting its children,
        # otherwise both children must already have a level.
        computed = {leaf_cluster.id for leaf_cluster in leaves.values()}
        for leaf_cluster in leaves.values():
            cluster = leaf_cluster
            while (cluster is not None
                   and (cluster.data[0] == cluster.data[1]
                        or (cluster.right.id in computed
                            and cluster.left.id in computed))):
                if cluster.id not in computed:
                    cluster.level = max(cluster.right.level,
                                        cluster.left.level) + 1
                    computed.add(cluster.id)
                cluster = cluster.parent

        # set tree state variables: the root is the node with the largest id
        biggest, max_level = 0, 0
        for node_id, cluster in clusters.items():
            if node_id > biggest:
                biggest = node_id
            if cluster.level > max_level:
                max_level = cluster.level
        self.data = data
        self.root = clusters[biggest]
        self.max_level = max_level
        self.clusters = clusters
        self.leaves = leaves
        self.__dindex = pos               # sorted position -> original index
        self.__findex = np.argsort(pos)   # original index -> sorted position
        self.__drange = drange
        self.__dcount = dcount