Ejemplo n.º 1
0
 def __init__(self, size):
     """Initialise an empty ``size`` x ``size`` grid with its union-find index.

     Args:
         size: side length of the square grid.
     """
     self.size = size
     self.percolate = False
     self.open_sites = 0.0
     # Square array of zeros, one entry per site.
     self.grid = numpy.zeros((size, size))
     # The union-find is constructed with size * size, the number of sites.
     self.index = union_find.UnionFind(size * size)
Ejemplo n.º 2
0
def persistence(im):
    """Compute persistence groups for the 2-D array ``im``.

    Pixels are visited from the highest value to the lowest while a
    union-find structure tracks the components that have appeared so far.
    When a pixel touches several existing components, every component
    except the one with the highest birth value is recorded as "dying" at
    that pixel.

    Args:
        im: object exposing ``shape`` as ``(height, width)``; values are
            read through the module-level ``get(im, p)`` helper.

    Returns:
        A list of ``(birth_pixel, birth_level, persistence, death_pixel)``
        tuples sorted by persistence, largest first. The first pixel
        processed is stored with its own value as persistence and ``None``
        as death pixel.
    """
    h, w = im.shape

    # Pixel coordinates ordered by value, from high to low.
    indices = [(i, j) for i in range(h) for j in range(w)]
    indices.sort(key=lambda p: get(im, p), reverse=True)

    # Maintains the growing sets.
    uf = union_find.UnionFind()

    # birth pixel -> (birth level, persistence, death pixel)
    groups0 = {}

    def get_comp_birth(p):
        # Value at the representative returned by uf[p].
        return get(im, uf[p])

    # Process pixels from high to low.
    for i, p in enumerate(indices):
        v = get(im, p)
        neighbors = iter_neighbors(p, w, h)

        # Debug trace of the already-registered neighbours. A single
        # parenthesised argument prints identically on Python 2 and 3.
        print("------")
        ni = []
        for q in neighbors:
            if q in uf:
                print(q)
                ni.append(uf[q])
        print("------")

        # Distinct neighbouring components, highest birth value first.
        nc = sorted([(get_comp_birth(q), q) for q in set(ni)], reverse=True)

        if i == 0:
            # First (highest) pixel: recorded with no death pixel.
            groups0[p] = (v, v, None)

        # NOTE(review): presumably the -i weight keeps earlier (higher)
        # pixels as representatives -- confirm against union_find.
        uf.add(p, -i)

        if len(nc) > 0:
            oldp = nc[0][1]
            uf.union(oldp, p)

            # Merge all others with oldp; each one dies at pixel p.
            for bl, q in nc[1:]:
                if uf[q] not in groups0:
                    groups0[uf[q]] = (bl, bl - v, p)
                uf.union(oldp, q)

    groups1 = [
        (k, birth, pers, death)
        for k, (birth, pers, death) in groups0.items()
    ]
    groups1.sort(key=lambda g: g[2], reverse=True)

    print(groups1)
    return groups1
 def kruskalMST(self):
     """Fill ``self.mst`` with edges chosen by Kruskal's algorithm.

     ``self.graph`` is rebound to its edges sorted by ``edge[2]``, then
     edges are taken in that order and appended to ``self.mst`` unless
     ``uf.union`` reports ``False`` for their endpoints.
     """
     self.graph = sorted(self.graph, key=lambda edge: edge[2])
     print("sorted graph", self.graph)
     uf = union_find.UnionFind()
     accepted = 0
     for edge in self.graph:
         # Stop as soon as nodes - 1 edges have been accepted.
         if accepted == self.nodes - 1:
             break
         # Register any endpoint the union-find has not seen yet.
         for endpoint in (edge[0], edge[1]):
             if endpoint not in uf.sets:
                 uf.makeSet(endpoint)
         merged = uf.union(uf.sets[edge[0]], uf.sets[edge[1]])
         # A False result means the edge was rejected (it would form a cycle).
         if False == merged:
             continue
         self.mst.append(edge)
         accepted += 1
Ejemplo n.º 4
0
    def kruskal(self, graph):
        """
        Build a minimum spanning tree of ``graph`` with Kruskal's algorithm.

        Every node of ``graph`` is copied into a fresh ``Graph`` and placed
        in its own disjoint set. Edges are then taken from a min-heap in
        order of weight, and an edge is added (as an undirected edge) only
        when its endpoints are still in different sets, i.e. when it does
        not close a cycle.

        Note (kept from the original description): the graph does not have
        to be undirected.

        Returns the ``Graph`` holding the selected edges.
        """
        spanning_tree = Graph()
        disjoint_sets = union_find.UnionFind()
        wrappers = {}
        edge_queue = heap.BinHeap(heap.HeapMode.min)

        # One singleton set per node; remember the wrapper for each node.
        for node in graph.nodes():
            spanning_tree.add_node(node)
            wrapped = union_find.Node(node)
            wrappers[node] = wrapped
            disjoint_sets.make_set(wrapped)

        # Queue every edge keyed by its weight.
        for u, v in graph.edges():
            endpoints = (wrappers[u], wrappers[v])
            item = heap.HeapItem(graph.weight((u, v)), endpoints)
            edge_queue.insert(item)

        # Greedily take the lightest edge that joins two different sets.
        while edge_queue.size > 0:
            lightest = edge_queue.extract()
            u_node, v_node = lightest.datum
            if disjoint_sets.find(u_node) != disjoint_sets.find(v_node):
                spanning_tree.add_undirected_edge(
                    u_node.value, v_node.value, lightest.priority)
                disjoint_sets.union(u_node, v_node)

        return spanning_tree
Ejemplo n.º 5
0
Archivo: HW.py Proyecto: hzhaoc/utils
def C3W2_2():
    """Print the largest k such that a k-clustering has spacing at least 3.

    Nodes are 24-bit labels read from the clustering input file (first line
    skipped); the edge cost between two nodes is their Hamming distance.
    All nodes at distance 1 or 2 are unioned, and the number of unions left
    is printed.
    """
    # Input: skip the first line, parse every other line as a binary number.
    # Building a set also merges identical nodes (distance 0).
    with open('data\\clustering_big.txt') as file:
        next(file, None)
        nodes = {int(''.join(line.split()), 2) for line in file}

    # Every mask with exactly one bit set (distance = 1).
    mask1 = {1 << bit for bit in range(24)}
    # Every mask with exactly two bits set (distance = 2).
    mask2 = {
        (1 << high) | (1 << low)
        for low in range(24) for high in range(low + 1, 24)
    }

    # Clustering: flip one bit, then two bits, and union with any node found.
    union = UF.UnionFind(nodes)
    for node in nodes:
        for masks in (mask1, mask2):
            for mask in masks:
                neighbor = node ^ mask
                if neighbor in nodes and not union.inSameUnion(node, neighbor):
                    union.union(node, neighbor)

    # Everything at distance 1 and 2 is merged now; any further union would
    # join nodes at distance 3 and reduce K.
    print(f'current largest K with spacing at least 3 is {union.n_of_union}')
Ejemplo n.º 6
0
def groupTPL(TPL, distance=1):
    """Group points that are linked by a chain of close neighbours.

    Two points are linked when ``max(|dx|, |dy|) <= distance`` (Chebyshev
    distance); groups are the connected components of those links.

    Args:
        TPL: indexable sequence of hashable points whose items ``[0]`` and
            ``[1]`` are the coordinates.
        distance: largest per-axis gap for which two points are linked.

    Returns:
        A list of lists, one list of points per group.
    """
    # TO-DO:
    # Rethink ways to cluster points
    # K-d tree may be an alternative
    # Currently algorithm runs on O(n^2)

    # Single parenthesised argument: same output on Python 2 and Python 3.
    print('Inside groupTPL()')

    U = union_find.UnionFind()

    for (i, x) in enumerate(TPL):
        for j in range(i + 1, len(TPL)):
            y = TPL[j]
            if max(abs(x[0] - y[0]), abs(x[1] - y[1])) <= distance:
                U.union(x, y)

    # NOTE(review): points that were never unioned are first seen by U[x]
    # here -- this relies on UnionFind registering unknown keys on lookup.
    disjSets = {}
    for x in TPL:
        disjSets.setdefault(U[x], set()).add(x)

    return [list(members) for members in disjSets.values()]
Ejemplo n.º 7
0
def dump_unions(bug_list):
    """Group crash-dump paths of duplicate bugs and write them as JSON.

    For every id in ``bug_list`` the bug and the bug it duplicates
    (``dupe_of``) are fetched through ``b.get_bug``. Pairs whose duplicate
    target has a crash-dump location are united in a union-find, and the
    dump paths of each resulting group are written to
    ``json/dump_unions.json`` under the current working directory.

    Args:
        bug_list: iterable of bug ids.
    """
    path_map = dict()
    pair_list = []
    for bug_id in bug_list:
        bug = b.get_bug(bug_id)
        child = b.get_bug(bug.dupe_of)
        # Keep the pair only when the duplicate target has a dump location.
        if child.cf_crashdump_location:
            path_map[bug_id] = bug.cf_crashdump_location
            path_map[bug.dupe_of] = child.cf_crashdump_location
            pair_list.append([bug_id, bug.dupe_of])

    # Bugs that made it into a pair, each with a stable integer position.
    ex_bugs = list(path_map.keys())
    # Precomputed positions replace the repeated O(n) list.index() scans.
    position = {bug_id: idx for idx, bug_id in enumerate(ex_bugs)}

    # Apply union-find over the positions.
    uf = union_find.UnionFind(len(ex_bugs))
    for first, second in pair_list:
        uf.unite(position[first], position[second])
    uf.id = [uf.find(i) for i in uf.id]

    # Map each dump location to the group id of its bug.
    ex_dumps = [path_map[bug_id] for bug_id in ex_bugs]
    dump_dict = dict(zip(ex_dumps, uf.id))

    # Extract the dump paths of each group.
    result = []
    for group_id in set(uf.id):
        res = []
        for k, v in dump_dict.items():
            if v == group_id:
                # NOTE(review): this overwrites res, so only the last
                # matching dump of the group is kept -- confirm whether
                # accumulating (extend) was intended.
                res = split_paths(k)
        # Drop groups that end up with a single path.
        if len(res) > 1:
            result.append(res)

    tgt_path = os.path.join(os.getcwd(), "json", "dump_unions.json")
    with open(tgt_path, "w") as fp:
        json.dump(result, fp, indent=4, sort_keys=True)
 def setUp(self):
     """Create a new, empty ``UnionFind`` and store it on ``self.structure``."""
     self.structure = union_find.UnionFind()
Ejemplo n.º 9
0
    def __init__(self, fragments_fn, bcs_to_use, bam_fn):
        ''' Compute a sparse barcode x genome-bin coverage matrix.

        Every (barcode, bin) pair covered by at least one fragment is
        stored as 1. Bins whose barcode count exceeds the high-coverage
        percentile are then set to a small epsilon so they can be told
        apart from ordinary bins.

        NOTE(review): the previous description said each row is normalized
        to 1; no normalization is performed in this method.

        Args:
            fragments_fn: path of a fragments file (loaded through
                ``self.load_fragments_filtered``) or an already loaded
                pandas DataFrame with ``bc``, ``chrom`` and ``start_pos``
                columns.
            bcs_to_use: barcodes to keep; its length is the number of
                matrix rows and it is forwarded to the loader.
            bam_fn: BAM file whose ``references`` and ``lengths`` define
                the genome bins.

        Raises:
            Exception: if ``fragments_fn`` is neither a string nor a
                pandas DataFrame.

        Sets ``self.uf``, ``self.bcs_to_use``, ``self.effective_genome_bins``
        and ``self.mat``.
        '''

        self.uf = union_find.UnionFind()

        bam_in = tk_bam.create_bam_infile(bam_fn)

        # Choose the set of fragments to use.
        # The two checks are kept separate so that `unicode` (a Python 2
        # name, unless aliased elsewhere in this module) is only evaluated
        # when the argument is not a plain str.
        if isinstance(fragments_fn, str) or isinstance(fragments_fn, unicode):
            fragments = self.load_fragments_filtered(fragments_fn, bcs_to_use)
        elif isinstance(fragments_fn, p.DataFrame):
            fragments = fragments_fn
        else:
            raise Exception(
                "unrecognized fragments_fn argument type: %s, must be filename or pandas.DataFrame"
                % str(type(fragments_fn)))

        # Setup genome bins
        genome_length = sum(bam_in.lengths)
        # NOTE(review): `/` floors under Python 2 integer division; confirm
        # the intended semantics before running this on Python 3.
        bin_size = max(1, genome_length / GENOME_BINS)
        chrom_bins = np.ceil(
            np.array([float(l) / bin_size for l in bam_in.lengths]))
        total_bins = chrom_bins.sum()
        # Index of the first bin of each chromosome in the flat bin space.
        start_bin = np.concatenate([[0], np.cumsum(chrom_bins)[:-1]])
        chrom_map = {c: idx for (idx, c) in enumerate(bam_in.references)}

        npartitions = len(bcs_to_use)

        # Number the selected barcodes -- the assigned number is their row in the BC-bin matrix
        bcs = fragments.bc.values
        bc_ids = {}
        self.bcs_to_use = []
        for bc in bcs:
            # `in` replaces dict.has_key(), which no longer exists on Python 3.
            if bc not in bc_ids:
                # Row id = number of distinct barcodes seen so far.
                bc_ids[bc] = len(self.bcs_to_use)
                self.bcs_to_use.append(bc)

        martian.log_info("making sparse matrix")

        # Row 0 holds barcode rows, row 1 holds genome-bin columns.
        indexes = np.empty((2, len(fragments)), dtype=np.int32)
        data = np.ones((len(fragments), ), dtype=np.float32)

        chroms = fragments.chrom.values
        # NOTE(review): same Python 2 division caveat as bin_size above.
        pos_bin = fragments.start_pos.values / bin_size

        for fidx in range(len(fragments)):
            chrom_id = chrom_map[chroms[fidx]]
            which_bin = start_bin[chrom_id] + pos_bin[fidx]
            which_bc = bc_ids[bcs[fidx]]
            indexes[0, fidx] = which_bc
            indexes[1, fidx] = which_bin

        mat = scipy.sparse.csr_matrix((data, indexes),
                                      shape=(npartitions, total_bins),
                                      dtype=np.float32)
        # If there are multiple fragments for the same BC in the same bin, the csr_matrix constructor above will sum them up, leading
        # to entries greater than 1.  Cap everything at 1.
        mat.data = np.ones(mat.data.shape, dtype=mat.dtype)

        eps = 0.0001
        # One constant drives both the threshold and its log label so the
        # message cannot drift from the computation again.
        high_cov_percentile = 99.5

        # Get the genome bin occupancy
        genome_bin_counts = np.array(
            (mat > np.float32(0)).sum(axis=0)).flatten().astype(
                'float')  # total BC counts per bin
        high_cov_threshold = np.percentile(genome_bin_counts,
                                           high_cov_percentile)

        # switch off high-coverage bins -- set them to eps (a small nonzero number so we can distinguish them)
        high_cov_bins = np.where(genome_bin_counts > high_cov_threshold)[0]
        # Column index of every stored entry; assumed to line up with
        # mat.data -- TODO confirm for the scipy version in use.
        _, cols = mat.nonzero()
        martian.log_info("removing %d bins" % len(high_cov_bins))
        for hc_bin in high_cov_bins:
            mat.data[cols == hc_bin] = eps

        # Recalculate the genome bins distribution (eps entries no longer count)
        genome_bin_counts = np.array(
            (mat > (2 * eps)).sum(axis=0)).flatten().astype('float')
        martian.log_info(
            "Genome Bin Coverage  mean: %f  %gth percentile: %f" %
            (genome_bin_counts.mean(), high_cov_percentile,
             high_cov_threshold))

        # Adjust for 'effective genome size' based on the distribution over bins
        # i.e. more skewed distribution -> fewer effective bins
        effective_bins_factor = ((genome_bin_counts /
                                  genome_bin_counts.sum())**2).sum()
        self.effective_genome_bins = 1.0 / effective_bins_factor
        martian.log_info("Effective Number of Genome Bins = %f" %
                         self.effective_genome_bins)

        self.mat = mat
        martian.log_info("done __init__")
Ejemplo n.º 10
0
class Node:
    """Labelled element; its string form is the string form of its label."""

    def __init__(self, label):
        # The label is stored untouched and only stringified on demand.
        self.label = label

    def __str__(self):
        """Return ``str`` of the stored label."""
        text = str(self.label)
        return text

def print_sets(nodes):
    """Print the set representative of every node and the number of sets.

    Representatives come from the module-level ``union_find.find`` and are
    compared by their string form.

    Args:
        nodes: iterable of elements known to ``union_find``.
    """
    sets = [str(union_find.find(x)) for x in nodes]
    print('set representatives: %s' % (sets,))
    # Count distinct representatives. itertools.groupby only merges
    # *adjacent* equal items, so on this unsorted list it over-counted
    # whenever members of one set were not next to each other.
    print('number of disjoint sets: %s' % len(set(sets)))
    print()

# Demo: put seven labelled nodes into singleton sets, merge two of them and
# print the representatives before and after the merge.
# NOTE: from here on the name ``union_find`` refers to this instance, not to
# the module it was created from; print_sets reads this global.
union_find = union_find.UnionFind()

nodes = list(map(Node, 'abcdefg'))

labels = list(map(str, nodes))
print('labels: %s' % (labels,))
for node in nodes:
    union_find.make_set(node)

print_sets(nodes)

# 'a' and 'c' start in different sets and share one after the union.
first, third = nodes[0], nodes[2]
assert union_find.find(first) != union_find.find(third)
union_find.union(first, third)
assert union_find.find(first) == union_find.find(third)

print_sets(nodes)