def single_linkage(points, max_dist=Nmax, min_cluster_size=N):
    """
    points are (x-index, y-index, cscore) per chromosome pair.
    """
    # Core single-linkage clustering: after the x-sort, each point only has to
    # look back at nearby predecessors to discover links, keeping the scan cheap.
    clusters = Grouper()
    total = len(points)
    points.sort()
    for cur in xrange(total):
        xi, yi = points[cur][0], points[cur][1]
        prev = cur - 1
        while prev >= 0:
            gap_x = xi - points[prev][0]
            if gap_x > max_dist:
                break  # x-sorted: every earlier point is even farther away
            gap_y = yi - points[prev][1]
            # combined (city-block) distance must stay within max_dist
            if gap_x + abs(gap_y) <= max_dist:
                clusters.join(points[cur], points[prev])
            prev -= 1
    # discard groups too small to count as clusters
    return [c for c in list(clusters) if len(c) >= min_cluster_size]
def mergeable(group1, group2, all_ranks, quota):
    """
    Decide whether two gene groups may be merged:
      rule no.1 - respect the per-species quota
      rule no.2 - but only count close genes once
    """
    micro_grouper = Grouper()  # data structure to check rule no.2
    merged_group = group1 + group2  # attempted merge
    # all-vs-all scan for closely located genes
    # TODO: silly implementation, not efficient
    for i, genei in enumerate(merged_group):
        speciesi, chri, posi = all_ranks[genei]
        micro_grouper.join(genei)
        for genej in merged_group[i + 1:]:
            speciesj, chrj, posj = all_ranks[genej]
            close = (speciesi == speciesj and chri == chrj
                     and abs(posi - posj) <= Tandem_Nmax / 2)
            if close:
                micro_grouper.join(genei, genej)
    # rule no.1: count merged groups per species, compare against the quota
    species_count = collections.defaultdict(int)
    for gene_group in micro_grouper:
        species_count[all_ranks[gene_group[0]][0]] += 1
    return all(count <= quota[species]
               for species, count in species_count.items())
def get_2D_overlap(chain, eclusters):
    """
    Implements a sweep line algorithm, that has better running time than naive O(n^2):
    assume block has x_ends, and y_ends for the bounds

    1. sort x_ends, and take a sweep line to scan the x_ends
    2. if left end, test y-axis intersection of current block with `active` set;
       also put this block in the `active` set
    3. if right end, remove block from the `active` set
    """
    mergeables = Grouper()
    active = set()

    # build the event list: two x-axis endpoint events per block
    endpoints = []
    for idx, (range_x, range_y, score) in enumerate(eclusters):
        chr, start, stop = range_x
        endpoints.append((chr, start, 0, idx))  # 0 == left end
        endpoints.append((chr, stop, 1, idx))   # 1 == right end
    endpoints.sort()

    prev_chr = ""
    for chr, pos, end_kind, idx in endpoints:
        if chr != prev_chr:  # new chromosome: no block can span the boundary
            active.clear()
        if end_kind == 0:
            active.add(idx)
            # test y-overlap against everything currently under the sweep line
            for other in active:
                if range_overlap(eclusters[other][1], eclusters[idx][1]):
                    mergeables.join(other, idx)
        else:  # right end
            active.remove(idx)
        prev_chr = chr

    return mergeables
def get_2D_overlap(chain, eclusters):
    """
    Implements a sweep line algorithm, that has better running time than naive O(n^2):
    assume block has x_ends, and y_ends for the bounds

    1. sort x_ends, and take a sweep line to scan the x_ends
    2. if left end, test y-axis intersection of current block with `active` set;
       also put this block in the `active` set
    3. if right end, remove block from the `active` set
    """
    mergeables = Grouper()
    active = set()

    LEFT, RIGHT = 0, 1
    events = []
    for i, (range_x, range_y, score) in enumerate(eclusters):
        chromosome, lo, hi = range_x
        events.extend([(chromosome, lo, LEFT, i), (chromosome, hi, RIGHT, i)])
    events.sort()

    last_seen = ""
    for chromosome, pos, side, i in events:
        if chromosome != last_seen:
            active.clear()  # crossing into a new chromosome resets the sweep
        last_seen = chromosome
        if side == RIGHT:
            active.remove(i)
            continue
        active.add(i)
        # y-axis intersection check against all currently open blocks
        for j in active:
            if range_overlap(eclusters[j][1], eclusters[i][1]):
                mergeables.join(j, i)
    return mergeables
def find_synteny_region(query, sbed, data, window, cutoff, colinear=True):
    """
    Collect all synteny blocks for `query` via single linkage of anchors
    inside a window centered on the query.

    Two categories of syntenic regions depending on what the query is:
      (Syntelog): syntenic region is denoted by the syntelog
      (Gray gene): syntenic region is marked by the closest flanker
    """
    regions = []
    anchors = sorted(data, key=lambda t: t[1])  # order by subject position

    linkage = Grouper()
    # chain consecutive anchors that are close together on the same subject seq
    for cur, nxt in itertools.izip(anchors, anchors[1:]):
        lo, hi = cur[1], nxt[1]
        if hi - lo < window and sbed[lo].seqid == sbed[hi].seqid:
            linkage.join(cur, nxt)

    for block in sorted(linkage):
        (qflanker, syntelog), (far_flanker, far_syntelog), flanked = \
            get_flanker(block, query)

        # y-boundary of the block
        ys = [anchor[1] for anchor in block]
        left, right = min(ys), max(ys)

        # mini-dagchainer: keep the direction that retains the most anchors
        orientation = "+"
        if colinear:
            indexed = [(y, i) for i, (x, y) in enumerate(block)]
            rising = longest_increasing_subsequence(indexed)
            falling = longest_decreasing_subsequence(indexed)
            if len(rising) >= len(falling):
                track = rising
            else:
                track = falling
                orientation = "-"
            block = [block[i] for (y, i) in track]

        xpos, ypos = zip(*block)
        score = min(len(set(xpos)), len(set(ypos)))

        if qflanker == query:
            gray = "S"
        else:
            gray = "F" if flanked else "G"
            score -= 1  # slight penalty for not finding syntelog

        if score < cutoff:
            continue

        # a syntenic region is (left, right); syntelog is -1 for a gray gene
        regions.append((syntelog, left, right, gray, orientation, score))

    return sorted(regions, key=lambda region: -region[-1])  # best score first
def find_synteny_region(query, sbed, data, window, cutoff, colinear=True):
    """
    Return all synteny blocks around `query`, found by single-linkage
    chaining of anchors within a window centered on the query.

    Depending on the query, a region falls into one of two categories:
      Syntelog  -- the region is denoted by the syntelog itself
      Gray gene -- the region is marked by the closest flanker
    """
    found = []
    by_y = sorted(data, key=lambda item: item[1])

    chains = Grouper()
    first, second = itertools.tee(by_y)
    next(second, None)
    for left_anchor, right_anchor in itertools.izip(first, second):
        ya, yb = left_anchor[1], right_anchor[1]
        same_seq = sbed[ya].seqid == sbed[yb].seqid
        if yb - ya < window and same_seq:
            chains.join(left_anchor, right_anchor)

    for grp in sorted(chains):
        flanker_info = get_flanker(grp, query)
        (qflanker, syntelog), (far_flanker, far_syntelog), flanked = flanker_info

        # block boundary along the y-axis
        ycoords = [pt[1] for pt in grp]
        lo, hi = min(ycoords), max(ycoords)

        # mini-dagchainer: pick whichever direction keeps more anchors
        strand = "+"
        if colinear:
            tagged = [(y, idx) for idx, (x, y) in enumerate(grp)]
            inc = longest_increasing_subsequence(tagged)
            dec = longest_decreasing_subsequence(tagged)
            if len(inc) >= len(dec):
                kept = inc
            else:
                kept = dec
                strand = "-"
            grp = [grp[idx] for (y, idx) in kept]

        xs, ys = zip(*grp)
        strength = min(len(set(xs)), len(set(ys)))

        if qflanker == query:
            shade = "S"
        else:
            shade = "G" if not flanked else "F"
            strength -= 1  # small penalty: no syntelog was found

        if strength >= cutoff:
            # region is (lo, hi); syntelog is -1 when the query is a gray gene
            found.append((syntelog, lo, hi, shade, strand, strength))

    return sorted(found, key=lambda r: -r[-1])  # decreasing synteny score
def to_groups(self, distance):
    """
    Cluster consecutive anchor pairs whose query-side and subject-side gaps
    are both within `distance`, grouped per (query seqid, subject seqid).

    Returns a Grouper of joined (query, subject) anchor pairs.  (Not used.)
    """
    g = Grouper()
    # Each element of self is a (query, subject) anchor pair -- the original
    # two-argument groupby key lambda raised TypeError (groupby passes one
    # element), so take the pair as a single argument and unpack it.
    seqid_pair = lambda pair: (pair[0].seqid, pair[1].seqid)
    for name, anchors in itertools.groupby(self, key=seqid_pair):
        # groupby yields an iterator; materialize it so we can slice and
        # look one element ahead
        anchors = list(anchors)
        for ia, (qa, sa) in enumerate(anchors[:-1]):
            qb, sb = anchors[ia + 1]
            # join only when both the query-side and subject-side gaps are small
            if qb.start - qa.end <= distance and sb.start - sa.end <= distance:
                g.join((qa, sa), (qb, sb))
    return g
def tandem_grouper(bed, blast_list, tandem_Nmax=10, flip=True):
    """
    Group tandem hits: two significant hits for the same anchor that sit on
    the same chromosome with ranks at most tandem_Nmax apart share a group.
    """
    # project every significant hit to (anchor_name, (chromosome, rank))
    if flip:
        simple_blast = [(b.subject, (b.qseqid, b.qi))
                        for b in blast_list if b.evalue < 1e-10]
    else:
        simple_blast = [(b.query, (b.sseqid, b.si))
                        for b in blast_list if b.evalue < 1e-10]
    simple_blast.sort()

    standems = Grouper()
    for name, hits in itertools.groupby(simple_blast, key=lambda rec: rec[0]):
        hits = [rec[1] for rec in hits]  # these are already sorted
        for prev, cur in zip(hits, hits[1:]):
            # on the same chromosome and rank difference no larger than tandem_Nmax
            if cur[0] == prev[0] and cur[1] - prev[1] <= tandem_Nmax:
                standems.join(prev[1], cur[1])
    return standems
def make_family(gene_pairs, all_ranks, quota):
    """
    Single-linkage gene family clustering: walk the pairs from the highest
    synteny score downwards, merging two families only when `mergeable`
    says the quota rules allow it.
    """
    sys.stderr.write("... gene family clustering started\n")
    fam = Grouper()
    gene_pairs.sort(reverse=True)  # strongest links first
    for synteny_score, gene1, gene2 in gene_pairs:
        # register both genes so their groups exist before the lookup
        fam.join(gene1)
        fam.join(gene2)
        # attempt to join the two genes' families
        if mergeable(fam[gene1], fam[gene2], all_ranks, quota):
            fam.join(gene1, gene2)
    return fam
def merge_clusters(chain, clusters):
    """
    Merge clusters that sit close enough to belong together.

    There are, in general, two kinds of breakpoints: those induced by
    inversions and those by translocations.  The inversion breakpoints are
    excessive, so clusters separated only by them are merged away here.

    Returns (merged_chain, updated); `updated` is True iff anything merged.
    """
    original_len = len(chain)
    mergeables = Grouper()  # disjoint sets of clusters that can be merged
    for pos in xrange(original_len):
        current = chain[pos]
        mergeables.join(current, current)
        for back in xrange(pos - 1, -1, -1):
            earlier = chain[back]
            dx = distance_x(clusters[earlier], clusters[current])
            if dx > Nmax:
                continue
            if dx + distance_y(clusters[earlier], clusters[current]) > Nmax:
                continue
            mergeables.join(earlier, current)

    # map every cluster id to the smallest id in its merge set
    to_merge = {}
    for merge_set in mergeables:
        for member in merge_set:
            to_merge[member] = min(mergeables[member])

    # survivors are the representatives (ids that map to themselves)
    merged_chain = [c for c in chain if to_merge[c] == c]

    # fold each non-representative cluster into its representative,
    # then restore the x-sort of every cluster
    for source, target in to_merge.iteritems():
        if source != target:
            clusters[target].extend(clusters[source])
    for cluster in clusters:
        cluster.sort()

    updated = len(merged_chain) != original_len
    return merged_chain, updated
def merge_clusters(chain, clusters):
    """
    Remove excessive breakpoints by merging nearby clusters.

    Breakpoints come in two flavors -- inversion-induced and
    translocation-induced; the inversion ones are spurious, so clusters
    they separate get merged here.

    Returns a (merged_chain, updated) pair.
    """
    chain_num = len(chain)
    mergeables = Grouper()  # disjoint sets of clusters that can be merged
    for j, cj in enumerate(chain):
        mergeables.join(cj, cj)
        i = j - 1
        while i >= 0:
            ci = chain[i]
            i -= 1
            del_x = distance_x(clusters[ci], clusters[cj])
            if del_x > Nmax:
                continue
            del_y = distance_y(clusters[ci], clusters[cj])
            if del_x + del_y <= Nmax:
                mergeables.join(ci, cj)

    # representative of each merge set is its minimum member
    to_merge = {}
    for group in mergeables:
        for m in group:
            to_merge[m] = min(mergeables[m])

    merged_chain = []
    for c in chain:
        if to_merge[c] == c:  # c is its own representative
            merged_chain.append(c)

    # absorb merged-away clusters into their representatives
    for k, v in to_merge.iteritems():
        if k != v:
            clusters[v].extend(clusters[k])
    # keep every cluster sorted along x
    for cluster in clusters:
        cluster.sort()

    return merged_chain, (len(merged_chain) != chain_num)
def single_linkage(points, xdist, ydist, N):
    """
    Core single-linkage clustering: scan the x-sorted points and, for each
    one, look back at nearby predecessors to establish links.
    """
    grouper = Grouper()
    total = len(points)
    points.sort()
    for i in xrange(total):
        xi, yi = points[i][0], points[i][1]
        for j in xrange(i - 1, -1, -1):
            if xi - points[j][0] > xdist:
                break  # x-sorted: everything earlier is farther still
            if abs(yi - points[j][1]) > ydist:
                continue
            grouper.join(points[i], points[j])
    # keep only the groups that score well enough
    return [grp for grp in list(grouper) if score(grp) >= N]
def mergedSpeakers(chat):
    """
    Assign thread numbers by merging speakers: a speaker and everyone they
    mention end up in the same group, and each merged group receives a
    distinct 1-based number.

    Mutates `comment.thread` in place for threaded comments and returns the
    chat.  (Fix: dropped the dead `spkr = comment.name` assignment.)
    """
    gps = Grouper()
    for comment in chat:
        if comment.thread > 0:
            # register the speaker even if they mention nobody
            gps.join(comment.name, comment.name)
            for ment in comment.mentioned:
                gps.join(comment.name, ment)

    # number the merged groups 1..N
    gpToNum = {}
    for ctr, gp in enumerate(gps):
        gpToNum[tuple(gp)] = ctr + 1

    # rewrite each threaded comment with its group's number
    for comment in chat:
        if comment.thread > 0:
            gp = gps.find(comment.name)
            comment.thread = gpToNum[tuple(gp)]
    return chat
def load_geneorders(fp_gff):
    """
    Load gene orders before any filtering.

    Returns (ranks, tandem): ranks maps gene -> (chromosome, rank position);
    tandem is a Grouper seeded with every gene as a singleton.
    """
    fp_gff.seek(0)
    tandem = Grouper()
    sys.stderr.write("Read .genes file\n")
    per_chromosome = collections.defaultdict(list)  # chromosome => gene_list
    ranks = {}  # gene => rank position
    for line in fp_gff:
        chromosome, gene, start, stop = line.split()
        per_chromosome[chromosome].append((int(start), gene, chromosome))
        tandem.join(gene)
    for gene_list in per_chromosome.itervalues():
        # rank genes by start coordinate within each chromosome
        for rank, (start, gene, chromosome) in enumerate(sorted(gene_list)):
            ranks[gene] = (chromosome, rank)
    return ranks, tandem
def load_geneorders(fp_gff):
    """
    Read the .genes file (before any filtering) and compute per-chromosome
    gene ranks.

    Returns (ranks, tandem) where ranks[gene] == (chr, rank_position) and
    tandem is a Grouper containing every gene as its own group.
    """
    fp_gff.seek(0)
    tandem = Grouper()
    sys.stderr.write("Read .genes file\n")
    chr_ranks = collections.defaultdict(list)  # chromosome => gene_list
    ranks = {}  # gene => rank position
    for row in fp_gff:
        chr, gene, start, stop = row.split()
        chr_ranks[chr].append((int(start), gene, chr))
        tandem.join(gene)
    for genes_on_chr in chr_ranks.itervalues():
        genes_on_chr.sort()  # order by start coordinate
        gene_rank = 0
        for start, gene, chr in genes_on_chr:
            ranks[gene] = (chr, gene_rank)
            gene_rank += 1
    return ranks, tandem
def single_linkage(points, max_dist=Nmax, min_cluster_size=N): """ points are (x-index, y-index, cscore) per chromosome pair. """ # This is the core single linkage algorithm # this behaves in O(n) complexity: we iterate through the pairs, for each pair # we look back on the adjacent pairs to find links clusters = Grouper() n = len(points) points.sort() for i in xrange(n): for j in xrange(i-1, -1, -1): # x-axis distance del_x = points[i][0]-points[j][0] if del_x > max_dist: break # y-axis distance del_y = points[i][1]-points[j][1] if del_x + abs(del_y) > max_dist: continue #if abs(del_y) > Nmax: continue # otherwise join clusters.join(points[i], points[j]) clusters = [cluster for cluster in list(clusters) if len(cluster)>=min_cluster_size] return clusters
right_wall.neighbours = (current_cell, cells[(x+1,y)]) walls.append(right_wall) cell_list = [cells[key] for key in cells] maze = Grouper(cell_list) for _ in range(len(walls)): wall = popchoice(walls) cell_1, cell_2 = wall.neighbours if not maze.joined(cell_1, cell_2): wall.active = False maze.join(cell_1, cell_2) maze_map = [] x_max = (X*2)+1 y_max = (Y*2)+1 maze_map.append([True for _ in range(x_max)]) for y in range(1, y_max): maze_map.append([True]+[False for _ in range(1, x_max)]) for coords, cell in cells.items(): x, y = coords