def find_synteny_region(query, sbed, data, window, cutoff, colinear=False): """ Get all synteny blocks for a query, algorithm is single linkage anchors are a window centered on query Two categories of syntenic regions depending on what query is: (Syntelog): syntenic region is denoted by the syntelog (Gray gene): syntenic region is marked by the closest flanker """ regions = [] ysorted = sorted(data, key=lambda x: x[1]) g = Grouper() a, b = tee(ysorted) next(b, None) for ia, ib in zip(a, b): pos1, pos2 = ia[1], ib[1] if pos2 - pos1 < window and sbed[pos1].seqid == sbed[pos2].seqid: g.join(ia, ib) for group in sorted(g): (qflanker, syntelog), (far_flanker, far_syntelog), flanked = \ get_flanker(group, query) # run a mini-dagchainer here, take the direction that gives us most anchors if colinear: y_indexed_group = [(y, i) for i, (x, y) in enumerate(group)] lis = longest_increasing_subsequence(y_indexed_group) lds = longest_decreasing_subsequence(y_indexed_group) if len(lis) >= len(lds): track = lis orientation = "+" else: track = lds orientation = "-" group = [group[i] for (y, i) in track] xpos, ypos = zip(*group) score = min(len(set(xpos)), len(set(ypos))) if qflanker == query: gray = "S" else: gray = "G" if not flanked else "F" score -= 1 # slight penalty for not finding syntelog if score < cutoff: continue # y-boundary of the block left, right = group[0][1], group[-1][1] # this characterizes a syntenic region (left, right). # syntelog is -1 if it's a gray gene syn_region = (syntelog, far_syntelog, left, right, gray, orientation, score) regions.append(syn_region) return sorted(regions, key=lambda x: -x[-1]) # decreasing synteny score
def find_synteny_region(query, sbed, data, window, cutoff, colinear=False): """ Get all synteny blocks for a query, algorithm is single linkage anchors are a window centered on query Two categories of syntenic regions depending on what query is: (Syntelog): syntenic region is denoted by the syntelog (Gray gene): syntenic region is marked by the closest flanker """ regions = [] ysorted = sorted(data, key=lambda x: x[1]) g = Grouper() a, b = tee(ysorted) next(b, None) for ia, ib in izip(a, b): pos1, pos2 = ia[1], ib[1] if pos2 - pos1 < window and sbed[pos1].seqid == sbed[pos2].seqid: g.join(ia, ib) for group in sorted(g): (qflanker, syntelog), (far_flanker, far_syntelog), flanked = \ get_flanker(group, query) # run a mini-dagchainer here, take the direction that gives us most anchors if colinear: y_indexed_group = [(y, i) for i, (x, y) in enumerate(group)] lis = longest_increasing_subsequence(y_indexed_group) lds = longest_decreasing_subsequence(y_indexed_group) if len(lis) >= len(lds): track = lis orientation = "+" else: track = lds orientation = "-" group = [group[i] for (y, i) in track] xpos, ypos = zip(*group) score = min(len(set(xpos)), len(set(ypos))) if qflanker == query: gray = "S" else: gray = "G" if not flanked else "F" score -= 1 # slight penalty for not finding syntelog if score < cutoff: continue # y-boundary of the block left, right = group[0][1], group[-1][1] # this characterizes a syntenic region (left, right). # syntelog is -1 if it's a gray gene syn_region = (syntelog, far_syntelog, left, right, gray, orientation, score) regions.append(syn_region) return sorted(regions, key=lambda x: -x[-1]) # decreasing synteny score
def main(arg): fp = open(arg) N = int(fp.readline().strip()) a = [int(x) for x in fp.readline().split()] assert N == len(a) lis = longest_increasing_subsequence(a) lds = longest_decreasing_subsequence(a) print " ".join(str(x) for x in lis) print " ".join(str(x) for x in lds)
def renumber(args): """ %prog renumber Mt35.consolidated.bed > tagged.bed Renumber genes for annotation updates. """ from jcvi.algorithms.lis import longest_increasing_subsequence from jcvi.utils.grouper import Grouper p = OptionParser(renumber.__doc__) p.set_annot_reformat_opts() opts, args = p.parse_args(args) if len(args) != 1: sys.exit(not p.print_help()) bedfile, = args pf = bedfile.rsplit(".", 1)[0] abedfile = pf + ".a.bed" bbedfile = pf + ".b.bed" if need_update(bedfile, (abedfile, bbedfile)): prepare(bedfile) mbed = Bed(bbedfile) g = Grouper() for s in mbed: accn = s.accn g.join(*accn.split(";")) bed = Bed(abedfile) for chr, sbed in bed.sub_beds(): current_chr = chr_number(chr) if not current_chr: continue ranks = [] gg = set() for s in sbed: accn = s.accn achr, arank = atg_name(accn) if achr != current_chr: continue ranks.append(arank) gg.add(accn) lranks = longest_increasing_subsequence(ranks) print >> sys.stderr, current_chr, len(sbed), "==>", len(ranks), \ "==>", len(lranks) granks = set(gene_name(current_chr, x, prefix=opts.prefix, \ pad0=opts.pad0, uc=opts.uc) for x in lranks) | \ set(gene_name(current_chr, x, prefix=opts.prefix, \ pad0=opts.pad0, sep="te", uc=opts.uc) for x in lranks) tagstore = {} for s in sbed: achr, arank = atg_name(s.accn) accn = s.accn if accn in granks: tag = (accn, FRAME) elif accn in gg: tag = (accn, RETAIN) else: tag = (".", NEW) tagstore[accn] = tag # Find cases where genes overlap for s in sbed: accn = s.accn gaccn = g[accn] tags = [((tagstore[x][-1] if x in tagstore else NEW), x) for x in gaccn] group = [(PRIORITY.index(tag), x) for tag, x in tags] best = min(group)[-1] if accn != best: tag = (best, OVERLAP) else: tag = tagstore[accn] print "\t".join((str(s), "|".join(tag)))
def renumber(args): """ %prog renumber Mt35.consolidated.bed > tagged.bed Renumber genes for annotation updates. """ from jcvi.algorithms.lis import longest_increasing_subsequence from jcvi.utils.grouper import Grouper p = OptionParser(renumber.__doc__) p.set_annot_reformat_opts() opts, args = p.parse_args(args) if len(args) != 1: sys.exit(not p.print_help()) bedfile, = args pf = bedfile.rsplit(".", 1)[0] abedfile = pf + ".a.bed" bbedfile = pf + ".b.bed" if need_update(bedfile, (abedfile, bbedfile)): prepare(bedfile) mbed = Bed(bbedfile) g = Grouper() for s in mbed: accn = s.accn g.join(*accn.split(";")) bed = Bed(abedfile) for chr, sbed in bed.sub_beds(): current_chr = chr_number(chr) if not current_chr: continue ranks = [] gg = set() for s in sbed: accn = s.accn achr, arank = atg_name(accn) if achr != current_chr: continue ranks.append(arank) gg.add(accn) lranks = longest_increasing_subsequence(ranks) print >> sys.stderr, current_chr, len(sbed), "==>", len(ranks), \ "==>", len(lranks) granks = set(gene_name(current_chr, x, prefix=opts.prefix, \ pad0=opts.pad0, uc=opts.uc) for x in lranks) | \ set(gene_name(current_chr, x, prefix=opts.prefix, \ pad0=opts.pad0, sep="te", uc=opts.uc) for x in lranks) tagstore = {} for s in sbed: achr, arank = atg_name(s.accn) accn = s.accn if accn in granks: tag = (accn, FRAME) elif accn in gg: tag = (accn, RETAIN) else: tag = (".", NEW) tagstore[accn] = tag # Find cases where genes overlap for s in sbed: accn = s.accn gaccn = g[accn] tags = [((tagstore[x][-1] if x in tagstore else NEW), x) for x in gaccn] group = [(PRIORITY.index(tag), x) for tag, x in tags] best = min(group)[-1] if accn != best: tag = (best, OVERLAP) else: tag = tagstore[accn] print "\t".join((str(s), "|".join(tag)))
def test_longest_increasing_subsequence(input_array, expected): from jcvi.algorithms.lis import longest_increasing_subsequence assert longest_increasing_subsequence(input_array) == expected