Beispiel #1
0
def single_linkage(points, max_dist=Nmax, min_cluster_size=N):
    """
    points are (x-index, y-index, cscore) per chromosome pair.
    """
    # Core single-linkage clustering. Near-linear in practice: we scan the
    # sorted pairs once and, for each pair, walk backwards over adjacent
    # pairs to establish links.

    points.sort()
    grouped = Grouper()
    npoints = len(points)
    for cur in xrange(npoints):
        prev = cur - 1
        while prev >= 0:
            # gap along the x-axis; sorted order lets us stop early
            gap_x = points[cur][0] - points[prev][0]
            if gap_x > max_dist:
                break
            # combined (Manhattan-style) gap must stay within max_dist
            gap_y = points[cur][1] - points[prev][1]
            if gap_x + abs(gap_y) <= max_dist:
                grouped.join(points[cur], points[prev])
            prev -= 1
    # keep only sufficiently large clusters
    return [c for c in list(grouped) if len(c) >= min_cluster_size]
Beispiel #2
0
def mergeable(group1, group2, all_ranks, quota):
    """
    Decide whether two gene groups may be merged:
    rule no.1 - respect the per-species quota;
    rule no.2 - closely located genes only count once.
    """
    candidate = group1 + group2  # attempted merge
    total = len(candidate)

    # collapse closely located genes into one unit (rule no.2)
    # TODO: silly implementation, not efficient
    close_genes = Grouper()
    for i in xrange(total):
        genei = candidate[i]
        speciesi, chri, posi = all_ranks[genei]
        close_genes.join(genei)
        for j in xrange(i + 1, total):
            genej = candidate[j]
            speciesj, chrj, posj = all_ranks[genej]
            same_locus = (speciesi == speciesj and chri == chrj
                          and abs(posi - posj) <= Tandem_Nmax / 2)
            if same_locus:
                close_genes.join(genei, genej)

    # count one representative per collapsed group, per species (rule no.1)
    species_count = collections.defaultdict(int)
    for gene_group in close_genes:
        species_count[all_ranks[gene_group[0]][0]] += 1

    # mergeable only if no species exceeds its quota
    return all(count <= quota[species]
               for species, count in species_count.items())
Beispiel #3
0
def get_2D_overlap(chain, eclusters):
    """
    Sweep-line detection of 2D-overlapping blocks; better running time
    than the naive O(n^2). Each block has x_ends and y_ends bounds:

    1. sort all x end-points, then sweep a line across them
    2. at a left end-point, test y-axis intersection of the block against
       the `active` set, then insert the block into `active`
    3. at a right end-point, drop the block from `active`
    """
    mergeables = Grouper()
    active = set()

    LEFT, RIGHT = 0, 1
    endpoints = []
    for idx, (range_x, range_y, score) in enumerate(eclusters):
        chr, xmin, xmax = range_x
        endpoints.append((chr, xmin, LEFT, idx))
        endpoints.append((chr, xmax, RIGHT, idx))
    endpoints.sort()

    prev_chr = ""
    for chr, pos, side, idx in endpoints:
        # a new chromosome invalidates every currently active block
        if chr != prev_chr:
            active.clear()
        prev_chr = chr
        if side == LEFT:
            active.add(idx)
            for other in active:
                # check y-overlap (idx joins itself, registering singletons)
                if range_overlap(eclusters[other][1], eclusters[idx][1]):
                    mergeables.join(other, idx)
        else:
            active.remove(idx)

    return mergeables
def get_2D_overlap(chain, eclusters):
    """
    Implements a sweep line algorithm, that has better running time than naive O(n^2):
    assume block has x_ends, and y_ends for the bounds

    1. sort x_ends, and take a sweep line to scan the x_ends
    2. if left end, test y-axis intersection of current block with `active` set;
       also put this block in the `active` set
    3. if right end, remove block from the `active` set
    """
    # NOTE(review): `chain` is unused in this body -- presumably kept for
    # interface parity with callers; confirm before removing.
    mergeables = Grouper()  # disjoint sets of overlapping block indices
    active = set()  # indices of blocks whose x-span covers the sweep line

    x_ends = []
    for i, (range_x, range_y, score) in enumerate(eclusters):
        chr, left, right = range_x
        x_ends.append((chr, left, 0, i))  # 0/1 for left/right-ness
        x_ends.append((chr, right, 1, i))
    x_ends.sort()

    chr_last = ""
    for chr, pos, left_right, i in x_ends:
        # blocks on different chromosomes can never overlap
        if chr != chr_last: active.clear()
        if left_right==0: 
            active.add(i) 
            for x in active:
                # check y-overlap (x == i joins the block with itself,
                # registering it as a singleton group)
                if range_overlap(eclusters[x][1], eclusters[i][1]):
                    mergeables.join(x, i)
        else: # right end
            active.remove(i) 

        chr_last = chr

    return mergeables
Beispiel #5
0
def find_synteny_region(query, sbed, data, window, cutoff, colinear=True):
    # get all synteny blocks for a query, algorithm is single linkage
    # anchors are a window centered on query
    # two categories of syntenic regions depending on what query is:
    # (Syntelog): syntenic region is denoted by the syntelog
    # (Gray gene): syntenic region is marked by the closest flanker

    regions = []
    # sort anchors by y-position so neighbors become adjacent
    ysorted = sorted(data, key=lambda x: x[1])
    g = Grouper()

    # tee + izip walks consecutive (current, next) anchor pairs
    a, b = itertools.tee(ysorted)
    next(b, None)
    for ia, ib in itertools.izip(a, b):
        pos1, pos2 = ia[1], ib[1]
        # single linkage: join anchors within `window` on the y-axis that
        # lie on the same subject sequence
        if pos2 - pos1 < window and sbed[pos1].seqid == sbed[pos2].seqid:
            g.join(ia, ib)

    for group in sorted(g):
        (qflanker,
         syntelog), (far_flanker,
                     far_syntelog), flanked = get_flanker(group, query)

        # y-boundary of the block
        gs = [x[1] for x in group]
        left, right = min(gs), max(gs)

        # run a mini-dagchainer here, take the direction that gives us most anchors
        orientation = "+"
        if colinear:
            y_indexed_group = [(y, i) for i, (x, y) in enumerate(group)]
            lis = longest_increasing_subsequence(y_indexed_group)
            lds = longest_decreasing_subsequence(y_indexed_group)

            # keep whichever monotone chain is longer; a decreasing chain
            # implies the reverse orientation
            if len(lis) >= len(lds):
                track = lis
            else:
                track = lds
                orientation = "-"

            group = [group[i] for (y, i) in track]

        # score = number of distinct anchors on the sparser axis
        xpos, ypos = zip(*group)
        score = min(len(set(xpos)), len(set(ypos)))

        if qflanker == query:
            gray = "S"
        else:
            gray = "G" if not flanked else "F"
            score -= 1  # slight penalty for not finding syntelog

        if score < cutoff: continue

        # this characterizes a syntenic region (left, right). syntelog is -1 if it's a gray gene
        syn_region = (syntelog, left, right, gray, orientation, score)
        regions.append(syn_region)

    return sorted(regions, key=lambda x: -x[-1])  # decreasing synteny score
def find_synteny_region(query, sbed, data, window, cutoff, colinear=True):
    """
    Collect all synteny blocks for a query via single-linkage clustering;
    anchors form a window centered on the query.

    A region falls in one of two categories depending on the query:
    (Syntelog): the syntenic region is denoted by the syntelog
    (Gray gene): the syntenic region is marked by the closest flanker
    """
    regions = []
    g = Grouper()

    # single linkage: chain consecutive y-sorted anchors that sit close
    # together on the same subject sequence
    by_y = sorted(data, key=lambda anchor: anchor[1])
    stream_a, stream_b = itertools.tee(by_y)
    next(stream_b, None)
    for cur, nxt in itertools.izip(stream_a, stream_b):
        ypos_cur, ypos_nxt = cur[1], nxt[1]
        close_enough = (ypos_nxt - ypos_cur < window
                        and sbed[ypos_cur].seqid == sbed[ypos_nxt].seqid)
        if close_enough:
            g.join(cur, nxt)

    for group in sorted(g):
        flankers = get_flanker(group, query)
        (qflanker, syntelog), (far_flanker, far_syntelog), flanked = flankers

        # y-boundary of the block
        ys = [anchor[1] for anchor in group]
        left, right = min(ys), max(ys)

        # mini-dagchainer: keep the direction that yields the most anchors
        orientation = "+"
        if colinear:
            indexed = [(y, i) for i, (x, y) in enumerate(group)]
            rising = longest_increasing_subsequence(indexed)
            falling = longest_decreasing_subsequence(indexed)

            if len(rising) >= len(falling):
                track = rising
            else:
                track = falling
                orientation = "-"

            group = [group[i] for (y, i) in track]

        # score = number of distinct anchors on the sparser axis
        xpos, ypos = zip(*group)
        score = min(len(set(xpos)), len(set(ypos)))

        if qflanker == query:
            gray = "S"
        else:
            gray = "G" if not flanked else "F"
            score -= 1  # slight penalty for not finding syntelog

        if score < cutoff:
            continue

        # a syntenic region is (left, right); syntelog is -1 for a gray gene
        regions.append((syntelog, left, right, gray, orientation, score))

    # decreasing synteny score
    return sorted(regions, key=lambda region: -region[-1])
 def to_groups(self, distance):
     """Group anchor pairs whose query and subject gaps are both <= distance."""
     # not used.
     # NOTE(review): this dead code looks broken as written:
     #  - groupby's key callable receives ONE item, but the lambda takes
     #    two positional arguments (a, b) -- would raise TypeError;
     #  - `for ia, qa, sa in enumerate(...)` unpacks three names from the
     #    (index, item) 2-tuples that enumerate yields;
     #  - `anchors` is groupby's group iterator, which supports neither
     #    slicing (`[:-1]`) nor indexing (`[ia + 1]`).
     # Confirm the intended behavior before reviving.
     g = Grouper()
     for name, anchors in itertools.groupby(self,
                                            key=lambda a, b:
                                            (a.seqid, b.seqid)):
         for ia, qa, sa in enumerate(anchors[:-1]):
             qb, sb = anchors[ia + 1]
             if qb.start - qa.end <= distance and sb.start - sa.end <= distance:
                 g.join((qa, sa), (qb, sb))
     return g
Beispiel #8
0
def tandem_grouper(bed, blast_list, tandem_Nmax=10, flip=True):
    """
    Group tandemly arranged hits: two consecutive hits for the same anchor
    merge when they sit on one chromosome within tandem_Nmax ranks.
    """
    # keep only confident hits; `flip` selects which side anchors the group
    if flip:
        simple_blast = [(b.subject, (b.qseqid, b.qi))
                        for b in blast_list if b.evalue < 1e-10]
    else:
        simple_blast = [(b.query, (b.sseqid, b.si))
                        for b in blast_list if b.evalue < 1e-10]

    simple_blast.sort()

    standems = Grouper()
    for name, hits in itertools.groupby(simple_blast, key=lambda rec: rec[0]):
        hits = [rec[1] for rec in hits]  # already sorted within the group
        for a, b in zip(hits, hits[1:]):
            # same chromosome, rank difference no larger than tandem_Nmax
            if b[0] == a[0] and b[1] - a[1] <= tandem_Nmax:
                standems.join(a[1], b[1])

    return standems
Beispiel #9
0
def make_family(gene_pairs, all_ranks, quota):
    """
    Cluster genes into families: walk pairs in decreasing synteny-score
    order and merge the two genes' groups whenever the quota check passes.
    """
    print >>sys.stderr, "... gene family clustering started"

    g = Grouper()

    gene_pairs.sort(reverse=True)
    for synteny_score, gene1, gene2 in gene_pairs:
        # register both genes, then attempt to join their groups
        g.join(gene1)
        g.join(gene2)
        if mergeable(g[gene1], g[gene2], all_ranks, quota):
            g.join(gene1, gene2)

    return g
Beispiel #10
0
def tandem_grouper(bed, blast_list, tandem_Nmax=10, flip=True):
    # NOTE(review): `bed` is unused in this body -- presumably kept for
    # interface parity; confirm before removing.
    # keep only confident hits; `flip` selects which side anchors the group
    if not flip:
        simple_blast = [(b.query, (b.sseqid, b.si)) for b in blast_list if b.evalue < 1e-10] 
    else:
        simple_blast = [(b.subject, (b.qseqid, b.qi)) for b in blast_list if b.evalue < 1e-10] 

    simple_blast.sort()

    standems = Grouper()  # disjoint sets of tandem hit positions
    for name, hits in itertools.groupby(simple_blast, key=lambda x:x[0]):
        # these are already sorted.
        hits = [x[1] for x in hits]
        for ia, a in enumerate(hits[:-1]):
            b = hits[ia + 1]
            # on the same chromosome and rank difference no larger than tandem_Nmax
            if b[1] - a[1] <= tandem_Nmax and b[0] == a[0]: 
                standems.join(a[1], b[1])

    return standems
Beispiel #11
0
def merge_clusters(chain, clusters):
    """
    Merge clusters that lie close together on both axes.

    Breakpoints come in two kinds: those induced by inversions and those
    by translocations; the inversion-induced ones are excessive and are
    removed here by folding their clusters together.
    """
    chain_num = len(chain)
    mergeables = Grouper()  # disjoint sets of clusters that can be merged
    for j, cj in enumerate(chain):
        mergeables.join(cj, cj)  # register cj even if it never merges
        for i in xrange(j - 1, -1, -1):
            ci = chain[i]
            del_x = distance_x(clusters[ci], clusters[cj])
            if del_x > Nmax:
                continue
            del_y = distance_y(clusters[ci], clusters[cj])
            # close on both axes => same merge set
            if del_x + del_y <= Nmax:
                mergeables.join(ci, cj)

    # map every cluster to the smallest member of its set (its parent)
    to_merge = {}
    for merge_set in mergeables:
        for member in merge_set:
            to_merge[member] = min(mergeables[member])

    # parents (self-mapped entries) survive in the merged chain
    merged_chain = [c for c in chain if to_merge[c] == c]

    # refresh clusters list: fold each child into its parent
    for child, parent in to_merge.iteritems():
        if parent != child:
            clusters[parent].extend(clusters[child])

    # maintain the x-sort after extending
    for cluster in clusters:
        cluster.sort()

    # updated is True whenever at least one merge happened
    updated = (len(merged_chain) != chain_num)
    return merged_chain, updated
def merge_clusters(chain, clusters):

    # there are, in general, two kinds of breakpoints
    # those that are induced by inversions, and those by translocations
    # inversion-breakpoints are excessive breakpoints that I want to remove
    
    chain_num = len(chain)
    mergeables = Grouper() # disjoint sets of clusters that can be merged
    for j in xrange(chain_num):
        cj = chain[j]
        # self-join registers cj even if it never merges with anything
        mergeables.join(cj, cj)
        for i in xrange(j-1, -1, -1):
            ci = chain[i]
            del_x = distance_x(clusters[ci], clusters[cj])
            if del_x > Nmax: continue 

            del_y = distance_y(clusters[ci], clusters[cj])
            if del_x + del_y > Nmax: continue
            # close on both axes => same merge set
            mergeables.join(ci, cj)

    # map each cluster to the smallest member of its set (the parent)
    to_merge = {} 
    for mergeable in mergeables:
        for m in mergeable:
            to_merge[m] = min(mergeables[m])

    merged_chain = []
    for c in chain:
        if to_merge[c]==c: # i.e. parent of mergeables
            merged_chain.append(c)

    # refresh clusters list, merge chains
    for k, v in to_merge.iteritems():
        if to_merge[k]!=k: # i.e. not map to self
            clusters[v].extend(clusters[k])

    # maintain the x-sort
    [cluster.sort() for cluster in clusters]

    # nothing is merged
    updated = (len(merged_chain) != chain_num)
    return merged_chain, updated
def single_linkage(points, xdist, ydist, N):
    """
    Core single-linkage clustering: scan sorted pairs once and, for each
    pair, look back over adjacent pairs to find links (near O(n)).
    """
    points.sort()
    clusters = Grouper()
    for i, cur in enumerate(points):
        for j in xrange(i - 1, -1, -1):
            prev = points[j]
            # sorted on x, so once the x-gap exceeds xdist we can stop
            if cur[0] - prev[0] > xdist:
                break
            # y-gap check cannot short-circuit: an earlier j may qualify
            if abs(cur[1] - prev[1]) > ydist:
                continue
            clusters.join(cur, prev)
    # keep only clusters scoring at least N
    return [c for c in list(clusters) if score(c) >= N]
def mergedSpeakers(chat):
    """
    Renumber comment threads so that speakers linked by mentions share a
    single thread number; returns the mutated chat.
    """
    gps = Grouper()
    for comment in chat:
        if comment.thread > 0:
            # register the speaker, then link them to everyone mentioned
            gps.join(comment.name, comment.name)
            for mentioned in comment.mentioned:
                gps.join(comment.name, mentioned)

    # number the speaker groups 1..k in iteration order
    gpToNum = {}
    for num, gp in enumerate(gps, start=1):
        gpToNum[tuple(gp)] = num

    for comment in chat:
        if comment.thread > 0:
            group = gps.find(comment.name)
            comment.thread = gpToNum[tuple(group)]

    return chat
Beispiel #15
0
def load_geneorders(fp_gff):
    """
    Load gene orders before any filtering.

    Returns (ranks, tandem): ranks maps gene => (chr, rank position);
    tandem is a Grouper seeded with every gene as a singleton.
    """
    fp_gff.seek(0)
    tandem = Grouper()
    print >>sys.stderr, "Read .genes file"
    chr_ranks = collections.defaultdict(list)  # chromosome => gene list
    for row in fp_gff:
        chr, gene, start, stop = row.split()
        chr_ranks[chr].append((int(start), gene, chr))
        tandem.join(gene)

    ranks = {}  # gene => rank position
    for gene_list in chr_ranks.itervalues():
        # rank genes by start coordinate within each chromosome
        for gene_rank, (start, gene, chr) in enumerate(sorted(gene_list)):
            ranks[gene] = (chr, gene_rank)
    return ranks, tandem
def load_geneorders(fp_gff):

    # load gene orders before any filtering
    # returns (ranks, tandem): ranks maps gene => (chr, rank position);
    # tandem is a Grouper seeded with every gene as a singleton

    fp_gff.seek(0)
    tandem = Grouper()
    print >>sys.stderr, "Read .genes file"
    # chromosome => gene_list in that chromosome
    chr_ranks = collections.defaultdict(list)
    ranks = {}  # gene => rank postion
    for row in fp_gff:
        chr, gene, start, stop = row.split()
        start = int(start)
        chr_ranks[chr].append((start, gene, chr))
        tandem.join(gene)
    for v in chr_ranks.itervalues():
        # rank genes by start coordinate within each chromosome
        gene_rank = 0
        for start, gene, chr in sorted(v):
            ranks[gene] = (chr, gene_rank)
            gene_rank += 1
    return ranks, tandem
def single_linkage(points, max_dist=Nmax, min_cluster_size=N):
    """
    points are (x-index, y-index, cscore) per chromosome pair.
    """
    # This is the core single linkage algorithm
    # this behaves in O(n) complexity: we iterate through the pairs, for each pair
    # we look back on the adjacent pairs to find links

    clusters = Grouper()
    n = len(points)
    points.sort()
    for i in xrange(n):
        for j in xrange(i-1, -1, -1):
            # x-axis distance
            del_x = points[i][0]-points[j][0]
            # sorted on x, so once the x-gap exceeds max_dist no earlier
            # point can qualify either
            if del_x > max_dist: break
            # y-axis distance
            del_y = points[i][1]-points[j][1]
            # combined (Manhattan-style) distance must fit within max_dist
            if del_x + abs(del_y) > max_dist: continue
            #if abs(del_y) > Nmax: continue
            # otherwise join
            clusters.join(points[i], points[j])
    # keep only sufficiently large clusters
    clusters = [cluster for cluster in list(clusters) if len(cluster)>=min_cluster_size]
    return clusters
Beispiel #18
0
            right_wall.neighbours = (current_cell, cells[(x+1,y)])
            walls.append(right_wall)


# one entry per grid coordinate in the cells mapping
cell_list = [cells[key] for key in cells]

# disjoint sets of connected cells; every cell starts isolated
maze = Grouper(cell_list)

# randomized Kruskal-style carving: deactivate a wall whenever it
# separates two cells that are not yet connected
for _ in range(len(walls)):

    wall = popchoice(walls)
    cell_1, cell_2 = wall.neighbours

    if not maze.joined(cell_1, cell_2):
        wall.active = False
        maze.join(cell_1, cell_2)



# build the grid map of the maze
# presumably True marks a wall/border and False an open square -- TODO confirm
maze_map = []

# map dimensions: one row/column per cell plus walls, plus the outer border
x_max = (X*2)+1
y_max = (Y*2)+1

# solid top border row; every later row begins with the left border
maze_map.append([True for _ in range(x_max)])
for y in range(1, y_max):
    maze_map.append([True]+[False for _ in range(1, x_max)])


for coords, cell in cells.items():
    x, y = coords