def group(args):
    """
    %prog group anchorfiles

    Group the anchors into ortho-groups. Can input multiple anchor files.
    """
    p = OptionParser(group.__doc__)
    p.set_outfile()

    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    anchorfiles = args
    groups = Grouper()
    for anchorfile in anchorfiles:
        ac = AnchorFile(anchorfile)
        for a, b, idx in ac.iter_pairs():
            groups.join(a, b)

    logging.debug("Created {0} groups with {1} members.".\
                  format(len(groups), groups.num_members))

    outfile = opts.outfile
    fw = must_open(outfile, "w")
    for g in groups:
        print >> fw, ",".join(sorted(g))
    fw.close()

    return outfile
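# Every snippet in this collection leans on jcvi.utils.grouper.Grouper, a
# disjoint-set (union-find) container. The class below is a minimal,
# illustrative stand-in for the small interface these functions assume
# (join, iteration over groups, len, item lookup, num_members, keys, joined);
# it is NOT the library's actual implementation.
class Grouper(object):

    def __init__(self):
        # Each item maps to the (shared) set object holding its whole group.
        self._group_of = {}

    def join(self, a, *args):
        # join(a) just registers a; join(a, b, ...) unions all arguments.
        self._add(a)
        for b in args:
            self._add(b)
            ga, gb = self._group_of[a], self._group_of[b]
            if ga is gb:
                continue
            ga.update(gb)
            for item in gb:
                self._group_of[item] = ga

    def _add(self, a):
        if a not in self._group_of:
            self._group_of[a] = set([a])

    def joined(self, a, b):
        return (a in self._group_of and b in self._group_of and
                self._group_of[a] is self._group_of[b])

    def __getitem__(self, a):
        # All members of a's group.
        return sorted(self._group_of[a])

    def __iter__(self):
        # Yield each distinct group once.
        seen = set()
        for group in self._group_of.values():
            if id(group) not in seen:
                seen.add(id(group))
                yield sorted(group)

    def __len__(self):
        return sum(1 for _ in self)

    def keys(self):
        # Every item that has ever been joined.
        return list(self._group_of)

    @property
    def num_members(self):
        return len(self._group_of)

# Example: g = Grouper(); g.join("a", "b"); g.join("b", "c")
# list(g) -> [["a", "b", "c"]]; g.num_members -> 3; g["c"] -> ["a", "b", "c"]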
def get_2D_overlap(chain, eclusters):
    """
    Implements a sweep line algorithm, that has better running time than naive O(n^2):
    assume block has x_ends, and y_ends for the bounds

    1. sort x_ends, and take a sweep line to scan the x_ends
    2. if left end, test y-axis intersection of current block with `active` set;
       also put this block in the `active` set
    3. if right end, remove block from the `active` set
    """
    mergeables = Grouper()
    active = set()

    x_ends = []
    for i, (range_x, range_y, score) in enumerate(eclusters):
        chr, left, right = range_x
        x_ends.append((chr, left, 0, i))  # 0/1 for left/right-ness
        x_ends.append((chr, right, 1, i))
    x_ends.sort()

    chr_last = ""
    for chr, pos, left_right, i in x_ends:
        if chr != chr_last:
            active.clear()
        if left_right == 0:
            active.add(i)
            for x in active:
                # check y-overlap
                if range_overlap(eclusters[x][1], eclusters[i][1]):
                    mergeables.join(x, i)
        else:  # right end
            active.remove(i)

        chr_last = chr

    return mergeables
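# The sweep above only tests y-axis intersection through range_overlap().
# A plausible definition of that helper, assuming each range is a
# (seqid, start, end) tuple like the ones packed into eclusters, is sketched
# below -- an assumption for illustration, not necessarily the jcvi version.
def range_overlap(a, b):
    a_chr, a_min, a_max = a
    b_chr, b_min, b_max = b
    # Same chromosome and intersecting closed intervals.
    return a_chr == b_chr and a_min <= b_max and b_min <= a_max

# e.g. range_overlap(("chr1", 100, 200), ("chr1", 150, 300)) -> True
#      range_overlap(("chr1", 100, 200), ("chr2", 150, 300)) -> False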
def find_synteny_region(query, sbed, data, window, cutoff, colinear=False): """ Get all synteny blocks for a query, algorithm is single linkage anchors are a window centered on query Two categories of syntenic regions depending on what query is: (Syntelog): syntenic region is denoted by the syntelog (Gray gene): syntenic region is marked by the closest flanker """ regions = [] ysorted = sorted(data, key=lambda x: x[1]) g = Grouper() a, b = tee(ysorted) next(b, None) for ia, ib in izip(a, b): pos1, pos2 = ia[1], ib[1] if pos2 - pos1 < window and sbed[pos1].seqid == sbed[pos2].seqid: g.join(ia, ib) for group in sorted(g): (qflanker, syntelog), (far_flanker, far_syntelog), flanked = \ get_flanker(group, query) # run a mini-dagchainer here, take the direction that gives us most anchors if colinear: y_indexed_group = [(y, i) for i, (x, y) in enumerate(group)] lis = longest_increasing_subsequence(y_indexed_group) lds = longest_decreasing_subsequence(y_indexed_group) if len(lis) >= len(lds): track = lis orientation = "+" else: track = lds orientation = "-" group = [group[i] for (y, i) in track] xpos, ypos = zip(*group) score = min(len(set(xpos)), len(set(ypos))) if qflanker == query: gray = "S" else: gray = "G" if not flanked else "F" score -= 1 # slight penalty for not finding syntelog if score < cutoff: continue # y-boundary of the block left, right = group[0][1], group[-1][1] # this characterizes a syntenic region (left, right). # syntelog is -1 if it's a gray gene syn_region = (syntelog, far_syntelog, left, right, gray, orientation, score) regions.append(syn_region) return sorted(regions, key=lambda x: -x[-1]) # decreasing synteny score
def group(args):
    """
    %prog group anchorfiles

    Group the anchors into ortho-groups. Can input multiple anchor files.
    """
    p = OptionParser(group.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    anchorfiles = args
    groups = Grouper()
    for anchorfile in anchorfiles:
        ac = AnchorFile(anchorfile)
        for a, b in ac.iter_pairs():
            groups.join(a, b)

    ngroups = len(groups)
    nmembers = sum(len(x) for x in groups)
    logging.debug("Created {0} groups with {1} members.".\
                  format(ngroups, nmembers))

    for g in groups:
        print ",".join(sorted(g))
def synteny_scan(points, xdist, ydist, N):
    """
    This is the core single linkage algorithm which behaves in O(n):
    iterate through the pairs, foreach pair we look back on the
    adjacent pairs to find links
    """
    clusters = Grouper()
    n = len(points)
    points.sort()
    for i in xrange(n):
        for j in xrange(i - 1, -1, -1):
            # x-axis distance
            del_x = points[i][0] - points[j][0]
            if del_x > xdist:
                break
            # y-axis distance
            del_y = points[i][1] - points[j][1]
            if abs(del_y) > ydist:
                continue
            # otherwise join
            clusters.join(points[i], points[j])

    # select clusters that are at least >=N
    clusters = [sorted(cluster) for cluster in list(clusters) \
                if _score(cluster) >= N]

    return clusters
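# A toy run clarifies the look-back trick in synteny_scan(): because points
# are sorted by x, the inner loop can break as soon as the x-gap exceeds
# xdist, so each anchor only inspects nearby predecessors. The _score()
# helper is not shown in this collection; a stand-in that simply counts
# anchors is assumed here for illustration.
_score = lambda cluster: len(cluster)

points = [(1, 1), (2, 3), (3, 4), (500, 700)]
# With xdist=ydist=10 and N=3, the first three points chain into a single
# cluster of size 3; (500, 700) is never joined and is filtered out.
# synteny_scan(points, 10, 10, 3) -> [[(1, 1), (2, 3), (3, 4)]]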
def iter_partitions(self, cutoff=.3, gtr=True):
    from jcvi.utils.grouper import Grouper

    if gtr:
        names = self.gnames
        fp = open(self.gtrfile)
    else:
        names = self.anames
        fp = open(self.atrfile)

    reader = csv.reader(fp, delimiter="\t")
    grouper = Grouper()
    for g in map(GTRLine._make, reader):
        d = float(g.dist)
        if d < cutoff:
            continue

        grouper.join(g.parent, g.left_child, g.right_child)

    parents = {}
    for i, group in enumerate(grouper):
        for g in group:
            parents[g] = i

    partitions = [[parents.get(a, x), x] for a, x in names]
    for key, parts in groupby(partitions, key=lambda x: x[0]):
        yield list(x[1] for x in parts)
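# iter_partitions() above parses each row of the .gtr/.atr file into a
# GTRLine record via GTRLine._make(row). The snippet only relies on the
# parent, left_child, right_child and dist fields; a hypothetical namedtuple
# layout compatible with that call (the real record may carry more or
# differently ordered columns) would be:
from collections import namedtuple

GTRLine = namedtuple("GTRLine", "parent left_child right_child dist")

# Example tab-delimited row: "NODE5X\tNODE3X\tGENE12X\t0.85"
# Since 0.85 >= cutoff (default .3), the three ids are joined into one group.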
def pile(args):
    """
    %prog pile abedfile bbedfile > piles

    Call intersectBed on two bedfiles.
    """
    from jcvi.utils.grouper import Grouper

    p = OptionParser(pile.__doc__)
    p.add_option("--minOverlap", default=0, type="int",
                 help="Minimum overlap required [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    abedfile, bbedfile = args
    iw = intersectBed_wao(abedfile, bbedfile, minOverlap=opts.minOverlap)
    groups = Grouper()
    for a, b in iw:
        groups.join(a.accn, b.accn)

    ngroups = 0
    for group in groups:
        if len(group) > 1:
            ngroups += 1
            print "|".join(group)

    logging.debug("A total of {0} piles (>= 2 members)".format(ngroups))
def find_synteny_region(query, sbed, data, window, cutoff, colinear=False): """ Get all synteny blocks for a query, algorithm is single linkage anchors are a window centered on query Two categories of syntenic regions depending on what query is: (Syntelog): syntenic region is denoted by the syntelog (Gray gene): syntenic region is marked by the closest flanker """ regions = [] ysorted = sorted(data, key=lambda x: x[1]) g = Grouper() a, b = tee(ysorted) next(b, None) for ia, ib in zip(a, b): pos1, pos2 = ia[1], ib[1] if pos2 - pos1 < window and sbed[pos1].seqid == sbed[pos2].seqid: g.join(ia, ib) for group in sorted(g): (qflanker, syntelog), (far_flanker, far_syntelog), flanked = \ get_flanker(group, query) # run a mini-dagchainer here, take the direction that gives us most anchors if colinear: y_indexed_group = [(y, i) for i, (x, y) in enumerate(group)] lis = longest_increasing_subsequence(y_indexed_group) lds = longest_decreasing_subsequence(y_indexed_group) if len(lis) >= len(lds): track = lis orientation = "+" else: track = lds orientation = "-" group = [group[i] for (y, i) in track] xpos, ypos = zip(*group) score = min(len(set(xpos)), len(set(ypos))) if qflanker == query: gray = "S" else: gray = "G" if not flanked else "F" score -= 1 # slight penalty for not finding syntelog if score < cutoff: continue # y-boundary of the block left, right = group[0][1], group[-1][1] # this characterizes a syntenic region (left, right). # syntelog is -1 if it's a gray gene syn_region = (syntelog, far_syntelog, left, right, gray, orientation, score) regions.append(syn_region) return sorted(regions, key=lambda x: -x[-1]) # decreasing synteny score
def fuse(args):
    """
    %prog fuse *.bed *.anchors

    Fuse gene orders based on anchors file.
    """
    from jcvi.algorithms.graph import BiGraph

    p = OptionParser(fuse.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    bedfiles = [x for x in args if x.endswith(".bed")]
    anchorfiles = [x for x in args if x.endswith(".anchors")]

    # TODO: Use Markov clustering to sparsify the edges
    families = Grouper()
    for anchorfile in anchorfiles:
        af = AnchorFile(anchorfile)
        for a, b, block_id in af.iter_pairs():
            families.join(a, b)

    allowed = set(families.keys())
    logging.debug("Total families: {}, Gene members: {}"
                  .format(len(families), len(allowed)))

    # TODO: Use C++ implementation of BiGraph() when available
    # For now just serialize this to the disk
    G = BiGraph()
    for bedfile in bedfiles:
        bed = Bed(bedfile, include=allowed)
        #add_bed_to_graph(G, bed, families)
        print_edges(G, bed, families)
def fuse(args):
    """
    %prog fuse *.bed *.anchors

    Fuse gene orders based on anchors file.
    """
    from jcvi.algorithms.graph import BiGraph

    p = OptionParser(fuse.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    bedfiles = [x for x in args if x.endswith(".bed")]
    anchorfiles = [x for x in args if x.endswith(".anchors")]

    # TODO: Use Markov clustering to sparsify the edges
    families = Grouper()
    for anchorfile in anchorfiles:
        af = AnchorFile(anchorfile)
        for a, b, block_id in af.iter_pairs():
            families.join(a, b)

    allowed = set(families.keys())
    logging.debug("Total families: {}, Gene members: {}".format(
        len(families), len(allowed)))

    # TODO: Use C++ implementation of BiGraph() when available
    # For now just serialize this to the disk
    for bedfile in bedfiles:
        bed = Bed(bedfile, include=allowed)
        print_edges(bed, families)
def athalianatruth(args):
    """
    %prog athalianatruth J_a.txt J_bc.txt

    Prepare pairs data for At alpha/beta/gamma.
    """
    p = OptionParser(athalianatruth.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    atxt, bctxt = args
    g = Grouper()
    pairs = set()
    for txt in (atxt, bctxt):
        extract_groups(g, pairs, txt)

    fw = open("pairs", "w")
    for pair in sorted(pairs):
        print("\t".join(pair), file=fw)
    fw.close()

    fw = open("groups", "w")
    for group in list(g):
        print(",".join(group), file=fw)
    fw.close()
def main(blast_file, cds_file, bed_file, N=3): # get the sizes for the CDS first f = Fasta(cds_file) sizes = dict(f.itersizes()) # retrieve the locations bed = Bed(bed_file).order # filter the blast file g = Grouper() fp = open(blast_file) for row in fp: b = BlastLine(row) query_len = sizes[b.query] subject_len = sizes[b.subject] if b.hitlen < min(query_len, subject_len) / 2: continue query, subject = gene_name(b.query), gene_name(b.subject) qi, q = bed[query] si, s = bed[subject] if q.seqid == s.seqid and abs(qi - si) <= N: g.join(query, subject) # dump the grouper ngenes, nfamilies = 0, 0 families = [] for group in sorted(g): if len(group) >= 2: print ",".join(sorted(group)) ngenes += len(group) nfamilies += 1 families.append(sorted(group)) longest_family = max(families, key=lambda x: len(x)) # generate reports print >> sys.stderr, "Proximal paralogues (dist=%d):" % N print >> sys.stderr, "Total %d genes in %d families" % (ngenes, nfamilies) print >> sys.stderr, "Longest families (%d): %s" % ( len(longest_family), ",".join(longest_family))
def tandem_grouper(bed, blast_list, tandem_Nmax=10, flip=True):
    if not flip:
        simple_blast = [(b.query, (b.sseqid, b.si))
                        for b in blast_list if b.evalue < 1e-10]
    else:
        simple_blast = [(b.subject, (b.qseqid, b.qi))
                        for b in blast_list if b.evalue < 1e-10]

    simple_blast.sort()

    standems = Grouper()
    for name, hits in groupby(simple_blast, key=lambda x: x[0]):
        # these are already sorted.
        hits = [x[1] for x in hits]
        for ia, a in enumerate(hits[:-1]):
            b = hits[ia + 1]
            # on the same chr and rank difference no larger than tandem_Nmax
            if b[1] - a[1] <= tandem_Nmax and b[0] == a[0]:
                standems.join(a[1], b[1])

    return standems
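# Quick illustration of the rank-chaining in tandem_grouper(): hits for one
# query are reduced to (seqid, rank) pairs, and consecutive ranks on the same
# seqid within tandem_Nmax are unioned, so a run of nearby ranks collapses
# into a single tandem group. (Toy data; assumes the Grouper interface used
# throughout this collection.)
hits = [("chr1", 10), ("chr1", 12), ("chr1", 13), ("chr1", 40), ("chr2", 11)]

standems = Grouper()
for a, b in zip(hits, hits[1:]):
    if b[0] == a[0] and b[1] - a[1] <= 10:
        standems.join(a[1], b[1])

# list(standems) -> [[10, 12, 13]]; ranks 40 and 11 were never joined.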
def main(blast_file, cds_file, bed_file, N=3): # get the sizes for the CDS first f = Fasta(cds_file) sizes = dict(f.itersizes()) # retrieve the locations bed = Bed(bed_file).order # filter the blast file g = Grouper() fp = open(blast_file) for row in fp: b = BlastLine(row) query_len = sizes[b.query] subject_len = sizes[b.subject] if b.hitlen < min(query_len, subject_len) / 2: continue query, subject = gene_name(b.query), gene_name(b.subject) qi, q = bed[query] si, s = bed[subject] if q.seqid == s.seqid and abs(qi - si) <= N: g.join(query, subject) # dump the grouper ngenes, nfamilies = 0, 0 families = [] for group in sorted(g): if len(group) >= 2: print ",".join(sorted(group)) ngenes += len(group) nfamilies += 1 families.append(sorted(group)) longest_family = max(families, key=lambda x: len(x)) # generate reports print >>sys.stderr, "Proximal paralogues (dist=%d):" % N print >>sys.stderr, "Total %d genes in %d families" % (ngenes, nfamilies) print >>sys.stderr, "Longest families (%d): %s" % (len(longest_family), ",".join(longest_family))
def tandem_grouper(bed, blast_list, tandem_Nmax=10, flip=True):
    if not flip:
        simple_blast = [(b.query, (b.sseqid, b.si)) \
            for b in blast_list if b.evalue < 1e-10]
    else:
        simple_blast = [(b.subject, (b.qseqid, b.qi)) \
            for b in blast_list if b.evalue < 1e-10]

    simple_blast.sort()

    standems = Grouper()
    for name, hits in groupby(simple_blast, key=lambda x: x[0]):
        # these are already sorted.
        hits = [x[1] for x in hits]
        for ia, a in enumerate(hits[:-1]):
            b = hits[ia + 1]
            # on the same chr and rank difference no larger than tandem_Nmax
            if b[1] - a[1] <= tandem_Nmax and b[0] == a[0]:
                standems.join(a[1], b[1])

    return standems
def chain_HSPs(blastlines, xdist=100, ydist=100):
    """
    Take a list of BlastLines (or a BlastSlow instance), and returns a list of
    BlastLines.
    """
    key = lambda x: (x.query, x.subject)
    blastlines.sort(key=key)

    clusters = Grouper()
    for qs, points in groupby(blastlines, key=key):
        points = sorted(list(points), \
                        key=lambda x: (x.qstart, x.qstop, x.sstart, x.sstop))

        n = len(points)
        for i in xrange(n):
            a = points[i]
            clusters.join(a)
            for j in xrange(i + 1, n):
                b = points[j]
                if a.orientation != b.orientation:
                    continue

                # x-axis distance
                del_x = get_distance(a, b)
                if del_x > xdist:
                    continue
                # y-axis distance
                del_y = get_distance(a, b, xaxis=False)
                if del_y > ydist:
                    continue
                # otherwise join
                clusters.join(a, b)

    chained_hsps = []
    for c in clusters:
        chained_hsps.append(combine_HSPs(c))
    chained_hsps = sorted(chained_hsps, key=lambda x: -x.score)

    return chained_hsps
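# chain_HSPs() clusters HSPs of the same (query, subject) pair that lie close
# on both axes. get_distance() is not included in this collection; the sketch
# below shows one assumed behavior -- the gap between the two HSP intervals on
# the query axis (xaxis=True) or subject axis (xaxis=False), zero when they
# overlap. The real jcvi helper may differ (e.g. strand handling).
def get_distance(a, b, xaxis=True):
    if xaxis:
        amin, amax, bmin, bmax = a.qstart, a.qstop, b.qstart, b.qstop
    else:
        amin, amax, bmin, bmax = a.sstart, a.sstop, b.sstart, b.sstop
    # Gap between [amin, amax] and [bmin, bmax]; 0 if the intervals touch.
    return max(max(amin, bmin) - min(amax, bmax), 0)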
def fuse(args):
    """
    %prog fuse *.bed *.anchors

    Fuse gene orders based on anchors file.
    """
    p = OptionParser(fuse.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    bedfiles = [x for x in args if x.endswith(".bed")]
    anchorfiles = [x for x in args if x.endswith(".anchors")]
    aligned_genes = Grouper()
    for anchorfile in anchorfiles:
        af = AnchorFile(anchorfile)
        for a, b, block_id in af.iter_pairs():
            aligned_genes.join(a, b)

    print list(aligned_genes)
    logging.debug("Total aligned genes: {}".format(len(aligned_genes)))
def chain_HSPs(blast, xdist=100, ydist=100):
    """
    Take a list of BlastLines (or a BlastSlow instance), and returns a list of
    BlastLines.
    """
    key = lambda x: (x.query, x.subject)
    blast.sort(key=key)

    clusters = Grouper()
    for qs, points in groupby(blast, key=key):
        points = sorted(list(points), \
                        key=lambda x: (x.qstart, x.qstop, x.sstart, x.sstop))

        n = len(points)
        for i in xrange(n):
            a = points[i]
            clusters.join(a)
            for j in xrange(i + 1, n):
                b = points[j]

                # x-axis distance
                del_x = get_distance(a, b)
                if del_x > xdist:
                    break
                # y-axis distance
                del_y = get_distance(a, b, xaxis=False)
                if del_y > ydist:
                    continue
                # otherwise join
                clusters.join(a, b)

    chained_hsps = [combine_HSPs(x) for x in clusters]
    key = lambda x: (x.query, -x.score if x.has_score else 0)
    chained_hsps = sorted(chained_hsps, key=key)

    return chained_hsps
def chain_HSPs(blast, xdist=100, ydist=100):
    """
    Take a list of BlastLines (or a BlastSlow instance), and returns a list of
    BlastLines.
    """
    key = lambda x: (x.query, x.subject)
    blast.sort(key=key)

    clusters = Grouper()
    for qs, points in groupby(blast, key=key):
        points = sorted(
            list(points), key=lambda x: (x.qstart, x.qstop, x.sstart, x.sstop)
        )

        n = len(points)
        for i in range(n):
            a = points[i]
            clusters.join(a)
            for j in range(i + 1, n):
                b = points[j]

                # x-axis distance
                del_x = get_distance(a, b)
                if del_x > xdist:
                    break
                # y-axis distance
                del_y = get_distance(a, b, xaxis=False)
                if del_y > ydist:
                    continue
                # otherwise join
                clusters.join(a, b)

    chained_hsps = [combine_HSPs(x) for x in clusters]
    key = lambda x: (x.query, -x.score if x.has_score else 0)
    chained_hsps = sorted(chained_hsps, key=key)

    return chained_hsps
def napus(args): """ %prog napus napus.bed brapa.boleracea.i1.blocks diploid.napus.fractionation Extract napus gene loss vs diploid ancestors. We are looking specifically for anything that has the pattern: BR - BO or BR - BO | | AN CN Step 1: extract BR - BO syntenic pairs Step 2: get diploid gene retention patterns from BR or BO as query Step 3: look for if AN or CN is NS(non-syntenic) or NF(not found) and specifically with NS, the NS location is actually the homeologous site. Step 4: categorize gene losses into singleton, or segmental (defined as consecutive losses with a maximum skip of 1 """ from jcvi.utils.cbook import SummaryStats p = OptionParser(napus.__doc__) opts, args = p.parse_args(args) if len(args) != 3: sys.exit(not p.print_help()) napusbed, brbo, dpnp = args retention = {} fp = open(dpnp) for row in fp: seqid, query, hit = row.split() retention[query] = hit order = Bed(napusbed).order quartetsfile = "quartets" fp = open(brbo) fw = open(quartetsfile, "w") AL = "AN LOST" CL = "CN LOST" for row in fp: br, bo = row.split() if '.' in (br, bo): continue an, cn = retention[br], retention[bo] row = "\t".join((br, bo, an, cn)) if '.' in (an, cn): #print row continue # label loss candidates antag, anrange = get_tag(an, order) cntag, cnrange = get_tag(cn, order) if range_overlap(anrange, cnrange): if (antag, cntag) == ("NS", None): row = row + "\t{0}|{1}".format(AL, br) if (antag, cntag) == (None, "NS"): row = row + "\t{0}|{1}".format(CL, bo) print >> fw, row fw.close() logging.debug("Quartets and gene losses written to `{0}`.".\ format(quartetsfile)) # Parse the quartets file to extract singletons vs.segmental losses fp = open(quartetsfile) fw = open(quartetsfile + ".summary", "w") data = [x.rstrip().split("\t") for x in fp] skip = 1 # max distance between losses g = Grouper() losses = [(len(x) == 5) for x in data] for i, d in enumerate(losses): if not d: continue g.join(i, i) itag = data[i][-1].split("|")[0] for j in xrange(i + 1, i + skip + 1): jtag = data[j][-1].split("|")[0] if j < len(losses) and losses[j] and itag == jtag: g.join(i, j) losses = list(g) singletons = [x for x in losses if len(x) == 1] segments = [x for x in losses if len(x) > 1] ns, nm = len(singletons), len(segments) assert len(losses) == ns + nm grab_tag = lambda pool, tag: \ [x for x in pool if all(data[z][-1].startswith(tag) for z in x)] an_loss_singletons = grab_tag(singletons, AL) cn_loss_singletons = grab_tag(singletons, CL) als, cls = len(an_loss_singletons), len(cn_loss_singletons) an_loss_segments = grab_tag(segments, AL) cn_loss_segments = grab_tag(segments, CL) alm, clm = len(an_loss_segments), len(cn_loss_segments) mixed = len(segments) - alm - clm assert mixed == 0 logging.debug("Singletons: {0} (AN LOSS: {1}, CN LOSS: {2})".\ format(ns, als, cls)) logging.debug("Segments: {0} (AN LOSS: {1}, CN LOSS: {2})".\ format(nm, alm, clm)) print >> sys.stderr, SummaryStats([len(x) for x in losses]) for x in singletons + segments: print >> fw, "### LENGTH =", len(x) for i in x: print >> fw, "\t".join(data[i]) fw.close()
def segment(args): """ %prog segment loss.ids bedfile Merge adjacent gene loss into segmental loss. Then based on the segmental loss, estimate amount of DNA loss in base pairs. Two estimates can be given: - conservative: just within the start and end of a single gene - aggressive: extend the deletion track to the next gene The real deletion size is within these estimates. """ from jcvi.formats.base import SetFile p = OptionParser(segment.__doc__) p.add_option("--chain", default=1, type="int", help="Allow next N genes to be chained [default: %default]") opts, args = p.parse_args(args) if len(args) != 2: sys.exit(not p.print_help()) idsfile, bedfile = args bed = Bed(bedfile) order = bed.order ids = SetFile(idsfile) losses = Grouper() skip = opts.chain for i, a in enumerate(bed): a = a.accn for j in xrange(i + 1, i + 1 + skip): if j >= len(bed): break b = bed[j].accn if a in ids: losses.join(a, a) if a in ids and b in ids: losses.join(a, b) losses = list(losses) singletons = [x for x in losses if len(x) == 1] segments = [x for x in losses if len(x) > 1] ns, nm, nt = len(singletons), len(segments), len(losses) assert ns + nm == nt # Summary for all segments for x in sorted(singletons) + sorted(segments): print "\t".join(str(x) for x in ("|".join(sorted(x)), len(x), estimate_size(x, bed, order))) # Find longest segment stretch if segments: mx, maxsegment = max([(len(x), x) for x in segments]) print >> sys.stderr, "Longest stretch: run of {0} genes".format(mx) print >> sys.stderr, " {0}".format("|".join(sorted(maxsegment))) seg_asize = sum(estimate_size(x, bed, order) for x in segments) seg_bsize = sum(estimate_size(x, bed, order, conservative=False) \ for x in segments) else: seg_asize = seg_bsize = 0 sing_asize = sum(estimate_size(x, bed, order) for x in singletons) sing_bsize = sum(estimate_size(x, bed, order, conservative=False) \ for x in singletons) total_asize = sing_asize + seg_asize total_bsize = sing_bsize + seg_bsize print >> sys.stderr, "Singleton ({0}): {1} - {2} bp".\ format(ns, sing_asize, sing_bsize) print >> sys.stderr, "Segment ({0}): {1} - {2} bp".\ format(nm, seg_asize, seg_bsize) print >> sys.stderr, "Total ({0}): {1} - {2} bp".\ format(nt, total_asize, total_bsize) print >> sys.stderr, "Average ({0}): {1} bp".\ format(nt, (total_asize + total_bsize) / 2)
def path(args): """ %prog path input.bed scaffolds.fasta Construct golden path given a set of genetic maps. The respective weight for each map is given in file `weights.txt`. The map with the highest weight is considered the pivot map. The final output is an AGP file that contains ordered scaffolds. """ oargs = args p = OptionParser(path.__doc__) p.add_option("-w", "--weightsfile", default="weights.txt", help="Use weights from file") p.add_option("--distance", default="rank", choices=distance_choices, help="Distance function when building initial consensus") p.add_option("--linkage", default="double", choices=linkage_choices, help="Linkage function when building initial consensus") p.add_option("--gapsize", default=100, type="int", help="Insert gaps of size between scaffolds") p.add_option("--ngen", default=500, type="int", help="Iterations in GA, more ~ slower") p.add_option("--npop", default=100, type="int", help="Population size in GA, more ~ slower") p.add_option("--seqid", help="Only run partition with this seqid") p.add_option("--links", default=10, type="int", help="Only plot matchings more than") p.add_option("--noplot", default=False, action="store_true", help="Do not visualize the alignments") p.set_cpus(cpus=16) opts, args = p.parse_args(args) if len(args) != 2: sys.exit(not p.print_help()) inputbed, fastafile = args pf = inputbed.rsplit(".", 1)[0] bedfile = pf + ".bed" weightsfile = opts.weightsfile gapsize = opts.gapsize ngen = opts.ngen npop = opts.npop cpus = opts.cpus if sys.version_info[:2] < (2, 7): logging.debug("Python version: {0}. CPUs set to 1.".\ format(sys.version.splitlines()[0].strip())) cpus = 1 function = get_function(opts.distance) cc = Map(bedfile, function) mapnames = cc.mapnames allseqids = cc.seqids weights = Weights(weightsfile, mapnames) pivot = weights.pivot ref = weights.ref linkage = opts.linkage oseqid = opts.seqid logging.debug("Linkage function: {0}-linkage".format(linkage)) linkage = { "single": min, "double": double_linkage, "complete": max, "average": np.mean, "median": np.median }[linkage] # Partition the linkage groups into consensus clusters C = Grouper() # Initialize the partitions for mlg in cc.mlgs: C.join(mlg) logging.debug("Partition LGs based on {0}".format(ref)) for mapname in mapnames: if mapname == ref: continue # Compute co-occurrence between LG pairs G = defaultdict(int) for s in allseqids: s = Scaffold(s, cc) s.add_LG_pairs(G, (ref, mapname)) # Convert edge list to adj list nodes = defaultdict(list) for (a, b), w in G.items(): nodes[a].append((b, w)) # Find the best ref LG every non-ref LG matches to for n, neighbors in nodes.items(): if n.split("-")[0] == ref: continue neighbors = dict(neighbors) best_neighbor, best_value = best_no_ambiguous(neighbors, n) if best_neighbor is None: continue C.join(n, best_neighbor) partitions = defaultdict(list) # Partition the scaffolds and assign them to one consensus for s in allseqids: s = Scaffold(s, cc) seqid = s.seqid counts = {} for mlg, count in s.mlg_counts.items(): consensus = C[mlg] mapname = mlg.split("-")[0] mw = weights[mapname] if consensus not in counts: counts[consensus] = 0 counts[consensus] += count * mw best_consensus, best_value = best_no_ambiguous(counts, seqid) if best_consensus is None: continue partitions[best_consensus].append(seqid) # Perform OO within each partition agpfile = pf + ".chr.agp" tourfile = pf + ".tour" sizes = Sizes(fastafile).mapping fwagp = must_open(agpfile, "w") fwtour = must_open(tourfile, "w") solutions = [] for lgs, scaffolds in 
sorted(partitions.items()): if oseqid and oseqid not in lgs: continue tag = "|".join(lgs) lgs_maps = set(x.split("-")[0] for x in lgs) if pivot not in lgs_maps: logging.debug("Skipping {0} ...".format(tag)) continue logging.debug("Working on {0} ...".format(tag)) s = ScaffoldOO(lgs, scaffolds, cc, pivot, weights, sizes, function=function, linkage=linkage, ngen=ngen, npop=npop, cpus=cpus) for fw in (sys.stderr, fwtour): print >> fw, ">{0} ({1})".format(s.object, tag) print >> fw, " ".join("".join(x) for x in s.tour) solutions.append(s) fwtour.close() # meta-data about the run parameters command = "# COMMAND: python -m jcvi.assembly.allmaps path {0}".\ format(" ".join(oargs)) comment = "Generated by ALLMAPS v{0} ({1})\n{2}".\ format(version, get_today(), command) AGP.print_header(fwagp, comment=comment) for s in sorted(solutions, key=lambda x: x.object): order_to_agp(s.object, s.tour, sizes, fwagp, gapsize=gapsize, gaptype="map") fwagp.close() logging.debug("AGP file written to `{0}`.".format(agpfile)) logging.debug("Tour file written to `{0}`.".format(tourfile)) build([inputbed, fastafile]) summaryfile = pf + ".summary.txt" summary([inputbed, fastafile, "--outfile={0}".format(summaryfile)]) if not opts.noplot: plotall([inputbed, "--links={0}".format(opts.links)])
def tandem_main(blast_file, cds_file, bed_file, N=3, P=50, is_self=True, \ evalue=.01, strip_name=".", ofile=sys.stderr, genefam=False): if genefam: N = 1e5 # get the sizes for the CDS first f = Fasta(cds_file) sizes = dict(f.itersizes()) # retrieve the locations bed = Bed(bed_file) order = bed.order if is_self: # filter the blast file g = Grouper() fp = open(blast_file) for row in fp: b = BlastLine(row) query_len = sizes[b.query] subject_len = sizes[b.subject] if b.hitlen < min(query_len, subject_len) * P / 100.: continue query = gene_name(b.query, strip_name) subject = gene_name(b.subject, strip_name) qi, q = order[query] si, s = order[subject] if abs(qi - si) <= N and b.evalue <= evalue: if genefam: g.join(query, subject) elif q.seqid == s.seqid: g.join(query, subject) else: homologs = Grouper() fp = open(blast_file) for row in fp: b = BlastLine(row) query_len = sizes[b.query] subject_len = sizes[b.subject] if b.hitlen < min(query_len, subject_len) * P / 100.: continue if b.evalue > evalue: continue query = gene_name(b.query, strip_name) subject = gene_name(b.subject, strip_name) homologs.join(query, subject) if genefam: g = homologs else: g = Grouper() for i, atom in enumerate(bed): for x in range(1, N + 1): if all([i-x >= 0, bed[i-x].seqid == atom.seqid, \ homologs.joined(bed[i-x].accn, atom.accn)]): leni = sizes[bed[i].accn] lenx = sizes[bed[i - x].accn] if abs(leni - lenx) > max(leni, lenx) * (1 - P / 100.): continue g.join(bed[i - x].accn, atom.accn) # dump the grouper fw = must_open(ofile, "w") ngenes, nfamilies = 0, 0 families = [] for group in sorted(g): if len(group) >= 2: print >> fw, ",".join(sorted(group)) ngenes += len(group) nfamilies += 1 families.append(sorted(group)) longest_family = max(families, key=lambda x: len(x)) # generate reports print >> sys.stderr, "Proximal paralogues (dist=%d):" % N print >> sys.stderr, "Total %d genes in %d families" % (ngenes, nfamilies) print >> sys.stderr, "Longest families (%d): %s" % ( len(longest_family), ",".join(longest_family)) return families
def consolidate(args): """ %prog consolidate gffile1 gffile2 ... > consolidated.out Given 2 or more gff files generated by pasa annotation comparison, iterate through every gene locus and identify all cases of same and different isoforms across the different input datasets. """ from jcvi.formats.base import longest_unique_prefix from jcvi.formats.gff import make_index from jcvi.utils.cbook import AutoVivification from jcvi.utils.grouper import Grouper from itertools import combinations, product p = OptionParser(consolidate.__doc__) p.add_option("--slop", default=False, action="store_true", help="allow minor variation in terminal 5'/3' UTR" + \ " start/stop position [default: %default]") p.set_outfile() opts, args = p.parse_args(args) slop = opts.slop if len(args) < 2: sys.exit(not p.print_help()) gffdbx = {} gene_coords = {} mrna = AutoVivification() for gffile in args: dbn = longest_unique_prefix(gffile, args) gffdbx[dbn] = make_index(gffile) for gene in gffdbx[dbn].features_of_type('gene', order_by=('seqid', 'start')): if gene.id not in gene_coords: gene_coords[gene.id] = [] gene_coords[gene.id].extend([gene.start, gene.stop]) c = list(gffdbx[dbn].children(gene, featuretype='mRNA', order_by='start')) if len(c) > 0: mrna[gene.id][dbn] = c fw = must_open(opts.outfile, "w") print >> fw, "##gff-version 3" summary = ["id"] summary.extend(gffdbx.keys()) print >> sys.stderr, "\t".join(str(x) for x in summary) for gene in mrna: g = Grouper() dbns = list(combinations(mrna[gene], 2)) if len(dbns) > 0: for dbn1, dbn2 in dbns: for mrna1, mrna2 in product(mrna[gene][dbn1], mrna[gene][dbn2]): g.join((dbn1, mrna1.id)) g.join((dbn2, mrna2.id)) fUTR, tUTR = None, None if match_subfeats(mrna1, mrna2, gffdbx[dbn1], gffdbx[dbn2]): fUTR = match_subfeats(mrna1, mrna2, gffdbx[dbn1], gffdbx[dbn2], \ featuretype='five_prime_UTR', slop=slop) tUTR = match_subfeats(mrna1, mrna2, gffdbx[dbn1], gffdbx[dbn2], \ featuretype='three_prime_UTR', slop=slop) if fUTR and tUTR: g.join((dbn1, mrna1.id), (dbn2, mrna2.id)) else: for dbn1 in mrna[gene]: for mrna1 in mrna[gene][dbn1]: g.join((dbn1, mrna1.id)) dbn = mrna[gene].keys()[0] gene_coords[gene].sort() _gene = gffdbx[dbn][gene] _gene.start, _gene.stop = gene_coords[gene][0], gene_coords[gene][-1] print >> fw, _gene logging.debug(list(g)) for group in g: dbs, mrnas = [el[0] for el in group], [el[1] for el in group] d, m = dbs[0], mrnas[0] if slop: mlen = 0 for D, M in zip(dbs, mrnas): _mrna = gffdbx[D][M] _mlen = (_mrna.stop - _mrna.start) + 1 if _mlen > mlen: d, m, mlen = D, M, _mlen dbid, _mrnaid = "".join(str(x) for x in set(dbs)), [] _mrnaid = [x for x in mrnas if x not in _mrnaid] mrnaid = "{0}:{1}".format(dbid, "-".join(_mrnaid)) _mrna = gffdbx[d][m] _mrna.attributes['ID'] = [mrnaid] children = gffdbx[d].children(m, order_by='start') print >> fw, _mrna for child in children: child.attributes['ID'] = ["{0}:{1}".format(dbid, child.id)] child.attributes['Parent'] = [mrnaid] print >> fw, child summary = [mrnaid] summary.extend(['Y' if db in set(dbs) else 'N' for db in gffdbx]) print >> sys.stderr, "\t".join(str(x) for x in summary) fw.close()
def renumber(args): """ %prog renumber Mt35.consolidated.bed > tagged.bed Renumber genes for annotation updates. """ from jcvi.algorithms.lis import longest_increasing_subsequence from jcvi.utils.grouper import Grouper p = OptionParser(renumber.__doc__) p.set_annot_reformat_opts() opts, args = p.parse_args(args) if len(args) != 1: sys.exit(not p.print_help()) bedfile, = args pf = bedfile.rsplit(".", 1)[0] abedfile = pf + ".a.bed" bbedfile = pf + ".b.bed" if need_update(bedfile, (abedfile, bbedfile)): prepare(bedfile) mbed = Bed(bbedfile) g = Grouper() for s in mbed: accn = s.accn g.join(*accn.split(";")) bed = Bed(abedfile) for chr, sbed in bed.sub_beds(): current_chr = chr_number(chr) if not current_chr: continue ranks = [] gg = set() for s in sbed: accn = s.accn achr, arank = atg_name(accn) if achr != current_chr: continue ranks.append(arank) gg.add(accn) lranks = longest_increasing_subsequence(ranks) print >> sys.stderr, current_chr, len(sbed), "==>", len(ranks), \ "==>", len(lranks) granks = set(gene_name(current_chr, x, prefix=opts.prefix, \ pad0=opts.pad0, uc=opts.uc) for x in lranks) | \ set(gene_name(current_chr, x, prefix=opts.prefix, \ pad0=opts.pad0, sep="te", uc=opts.uc) for x in lranks) tagstore = {} for s in sbed: achr, arank = atg_name(s.accn) accn = s.accn if accn in granks: tag = (accn, FRAME) elif accn in gg: tag = (accn, RETAIN) else: tag = (".", NEW) tagstore[accn] = tag # Find cases where genes overlap for s in sbed: accn = s.accn gaccn = g[accn] tags = [((tagstore[x][-1] if x in tagstore else NEW), x) for x in gaccn] group = [(PRIORITY.index(tag), x) for tag, x in tags] best = min(group)[-1] if accn != best: tag = (best, OVERLAP) else: tag = tagstore[accn] print "\t".join((str(s), "|".join(tag)))
def annotate(args): """ %prog annotate new.bed old.bed 2> log Annotate the `new.bed` with features from `old.bed` for the purpose of gene numbering. Ambiguity in ID assignment can be resolved by either of the following 2 methods: - `alignment`: make use of global sequence alignment score (calculated by `needle`) - `overlap`: make use of overlap length (calculated by `intersectBed`) Transfer over as many identifiers as possible while following guidelines: http://www.arabidopsis.org/portals/nomenclature/guidelines.jsp#editing Note: Following RegExp pattern describes the structure of the identifier assigned to features in the `new.bed` file. new_id_pat = re.compile(r"^\d+\.[cemtx]+\S+") Examples: 23231.m312389, 23231.t004898, 23231.tRNA.144 Adjust the value of `new_id_pat` manually as per your ID naming conventions. """ from jcvi.utils.grouper import Grouper valid_resolve_choices = ["alignment", "overlap"] p = OptionParser(annotate.__doc__) p.add_option("--resolve", default="alignment", choices=valid_resolve_choices, help="Resolve ID assignment based on a certain metric" \ + " [default: %default]") p.add_option("--atg_name", default=False, action="store_true", help="Specify is locus IDs in `new.bed` file follow ATG nomenclature" \ + " [default: %default]") g1 = OptionGroup(p, "Optional parameters (alignment):\n" \ + "Use if resolving ambiguities based on sequence `alignment`") g1.add_option("--pid", dest="pid", default=35., type="float", help="Percent identity cutoff [default: %default]") g1.add_option("--score", dest="score", default=250., type="float", help="Alignment score cutoff [default: %default]") p.add_option_group(g1) g2 = OptionGroup(p, "Optional parameters (overlap):\n" \ + "Use if resolving ambiguities based on `overlap` length\n" \ + "Parameters equivalent to `intersectBed`") g2.add_option("-f", dest="f", default=0.5, type="float", help="Minimum overlap fraction (0.0 - 1.0) [default: %default]") g2.add_option("-r", dest="r", default=False, action="store_true", help="Require fraction overlap to be reciprocal [default: %default]") g2.add_option("-s", dest="s", default=True, action="store_true", help="Require same strandedness [default: %default]") p.add_option_group(g2) opts, args = p.parse_args(args) if len(args) != 2: sys.exit(not p.print_help()) nbedfile, obedfile = args npf, opf = nbedfile.rsplit(".", 1)[0], obedfile.rsplit(".", 1)[0] # Make consolidated.bed cbedfile = "consolidated.bed" if not os.path.isfile(cbedfile): consolidate(nbedfile, obedfile, cbedfile) else: logging.warning("`{0}` already exists. Skipping step".format(cbedfile)) logging.warning("Resolving ID assignment ambiguity based on `{0}`".\ format(opts.resolve)) if opts.resolve == "alignment": # Get pairs and prompt to run needle pairsfile = "nw.pairs" scoresfile = "nw.scores" if not os.path.isfile(pairsfile): get_pairs(cbedfile, pairsfile) else: logging.warning("`{0}` already exists. Checking for needle output".\ format(pairsfile)) # If needle scores do not exist, prompt user to run needle if not os.path.isfile(scoresfile): logging.error("`{0}` does not exist. Please process {1} using `needle`".\ format(scoresfile, pairsfile)) sys.exit() else: scoresfile = "ovl.scores" # Calculate overlap length using intersectBed calculate_ovl(nbedfile, obedfile, opts, scoresfile) logging.warning("`{0}' exists. 
Storing scores in memory".\ format(scoresfile)) scores = read_scores(scoresfile, opts) # Iterate through consolidated bed and # filter piles based on score abedline = {} cbed = Bed(cbedfile) g = Grouper() for c in cbed: accn = c.accn g.join(*accn.split(";")) nbedline = {} nbed = Bed(nbedfile) for line in nbed: nbedline[line.accn] = line splits = set() for chr, chrbed in nbed.sub_beds(): abedline, splits = annotate_chr(chr, chrbed, g, scores, nbedline, abedline, opts, splits) if splits is not None: abedline = process_splits(splits, scores, nbedline, abedline) abedfile = npf + ".annotated.bed" afh = open(abedfile, "w") for accn in abedline: print >> afh, abedline[accn] afh.close() sort([abedfile, "-i"])
def path(args): """ %prog path input.bed scaffolds.fasta Construct golden path given a set of genetic maps. The respective weight for each map is given in file `weights.txt`. The map with the highest weight is considered the pivot map. The final output is an AGP file that contains ordered scaffolds. """ oargs = args p = OptionParser(path.__doc__) p.add_option("-b", "--bedfile", help=SUPPRESS_HELP) p.add_option("-s", "--fastafile", help=SUPPRESS_HELP) p.add_option("-w", "--weightsfile", default="weights.txt", help="Use weights from file") p.add_option("--distance", default="rank", choices=distance_choices, help="Distance function when building initial consensus") p.add_option("--linkage", default="double", choices=linkage_choices, help="Linkage function when building initial consensus") p.add_option("--gapsize", default=100, type="int", help="Insert gaps of size between scaffolds") p.add_option("--ngen", default=500, type="int", help="Iterations in GA, more ~ slower") p.add_option("--npop", default=100, type="int", help="Population size in GA, more ~ slower") p.add_option("--seqid", help="Only run partition with this seqid") p.add_option("--links", default=10, type="int", help="Only plot matchings more than") p.add_option("--noplot", default=False, action="store_true", help="Do not visualize the alignments") p.set_cpus(cpus=16) opts, args = p.parse_args(args) if len(args) != 2: sys.exit(not p.print_help()) inputbed, fastafile = args inputbed = opts.bedfile or inputbed fastafile = opts.fastafile or fastafile pf = inputbed.rsplit(".", 1)[0] bedfile = pf + ".bed" weightsfile = opts.weightsfile gapsize = opts.gapsize ngen = opts.ngen npop = opts.npop cpus = opts.cpus if sys.version_info[:2] < (2, 7): logging.debug("Python version: {0}. CPUs set to 1.".\ format(sys.version.splitlines()[0].strip())) cpus = 1 function = get_function(opts.distance) cc = Map(bedfile, function) mapnames = cc.mapnames allseqids = cc.seqids weights = Weights(weightsfile, mapnames) pivot = weights.pivot ref = weights.ref linkage = opts.linkage oseqid = opts.seqid logging.debug("Linkage function: {0}-linkage".format(linkage)) linkage = {"single": min, "double": double_linkage, "complete": max, "average": np.mean, "median": np.median}[linkage] # Partition the linkage groups into consensus clusters C = Grouper() # Initialize the partitions for mlg in cc.mlgs: C.join(mlg) logging.debug("Partition LGs based on {0}".format(ref)) for mapname in mapnames: if mapname == ref: continue # Compute co-occurrence between LG pairs G = defaultdict(int) for s in allseqids: s = Scaffold(s, cc) s.add_LG_pairs(G, (ref, mapname)) # Convert edge list to adj list nodes = defaultdict(list) for (a, b), w in G.items(): nodes[a].append((b, w)) # Find the best ref LG every non-ref LG matches to for n, neighbors in nodes.items(): if n.split("-")[0] == ref: continue neighbors = dict(neighbors) best_neighbor, best_value = best_no_ambiguous(neighbors, n) if best_neighbor is None: continue C.join(n, best_neighbor) partitions = defaultdict(list) # Partition the scaffolds and assign them to one consensus for s in allseqids: s = Scaffold(s, cc) seqid = s.seqid counts = {} for mlg, count in s.mlg_counts.items(): consensus = C[mlg] mapname = mlg.split("-")[0] mw = weights[mapname] if consensus not in counts: counts[consensus] = 0 counts[consensus] += count * mw best_consensus, best_value = best_no_ambiguous(counts, seqid) if best_consensus is None: continue partitions[best_consensus].append(seqid) # Perform OO within each partition agpfile = pf + ".chr.agp" 
tourfile = pf + ".tour" sizes = Sizes(fastafile).mapping fwagp = must_open(agpfile, "w") fwtour = must_open(tourfile, "w") solutions = [] for lgs, scaffolds in sorted(partitions.items()): if oseqid and oseqid not in lgs: continue tag = "|".join(lgs) lgs_maps = set(x.split("-")[0] for x in lgs) if pivot not in lgs_maps: logging.debug("Skipping {0} ...".format(tag)) continue logging.debug("Working on {0} ...".format(tag)) s = ScaffoldOO(lgs, scaffolds, cc, pivot, weights, sizes, function=function, linkage=linkage, ngen=ngen, npop=npop, cpus=cpus) for fw in (sys.stderr, fwtour): print >> fw, ">{0} ({1})".format(s.object, tag) print >> fw, " ".join("".join(x) for x in s.tour) solutions.append(s) fwtour.close() # meta-data about the run parameters command = "# COMMAND: python -m jcvi.assembly.allmaps path {0}".\ format(" ".join(oargs)) comment = "Generated by ALLMAPS v{0} ({1})\n{2}".\ format(version, get_today(), command) AGP.print_header(fwagp, comment=comment) for s in sorted(solutions, key=lambda x: x.object): order_to_agp(s.object, s.tour, sizes, fwagp, gapsize=gapsize, gaptype="map") fwagp.close() logging.debug("AGP file written to `{0}`.".format(agpfile)) logging.debug("Tour file written to `{0}`.".format(tourfile)) build([inputbed, fastafile]) summaryfile = pf + ".summary.txt" summary([inputbed, fastafile, "--outfile={0}".format(summaryfile)]) if not opts.noplot: plotall([inputbed, "--links={0}".format(opts.links)])
def enrich(args): """ %prog enrich omgfile groups ntaxa > enriched.omg Enrich OMG output by pulling genes misses by OMG. """ p = OptionParser(enrich.__doc__) p.add_option("--ghost", default=False, action="store_true", help="Add ghost homologs already used [default: %default]") opts, args = p.parse_args(args) if len(args) != 3: sys.exit(not p.print_help()) omgfile, groupsfile, ntaxa = args ntaxa = int(ntaxa) ghost = opts.ghost # Get gene pair => weight mapping weights = get_edges() info = get_info() # Get gene => taxon mapping info = dict((k, v.split()[5]) for k, v in info.items()) groups = Grouper() fp = open(groupsfile) for row in fp: members = row.strip().split(",") groups.join(*members) logging.debug("Imported {0} families with {1} members.".\ format(len(groups), groups.num_members)) seen = set() omggroups = Grouper() fp = open(omgfile) for row in fp: genes, idxs = row.split() genes = genes.split(",") seen.update(genes) omggroups.join(*genes) nmembers = omggroups.num_members logging.debug("Imported {0} OMG families with {1} members.".\ format(len(omggroups), nmembers)) assert nmembers == len(seen) alltaxa = set(str(x) for x in range(ntaxa)) recruited = [] fp = open(omgfile) for row in fp: genes, idxs = row.split() genes = genes.split(",") a = genes[0] idxs = set(idxs.split(",")) missing_taxa = alltaxa - idxs if not missing_taxa: print row.rstrip() continue leftover = groups[a] if not ghost: leftover = set(leftover) - seen if not leftover: print row.rstrip() continue leftover_sorted_by_taxa = dict((k, \ [x for x in leftover if info[x] == k]) \ for k in missing_taxa) #print genes, leftover #print leftover_sorted_by_taxa solutions = [] for solution in product(*leftover_sorted_by_taxa.values()): score = sum(weights.get((a, b), 0) for a in solution for b in genes) if score == 0: continue score += sum(weights.get((a, b), 0) for a, b in combinations(solution, 2)) solutions.append((score, solution)) #print solution, score best_solution = max(solutions) if solutions else None if best_solution is None: print row.rstrip() continue #print "best ==>", best_solution best_score, best_addition = best_solution genes.extend(best_addition) recruited.extend(best_addition) genes = sorted([(info[x], x) for x in genes]) idxs, genes = zip(*genes) if ghost: # decorate additions so it's clear that they were added pgenes = [] for g in genes: if g in recruited and g in seen: pgenes.append("|{0}|".format(g)) else: pgenes.append(g) genes = pgenes print "\t".join((",".join(genes), ",".join(idxs))) if not ghost: seen.update(best_addition) logging.debug("Recruited {0} new genes.".format(len(recruited)))
def tandem_main(blast_file, cds_file, bed_file, N=3, P=50, is_self=True, \ evalue=.01, strip_name=".", ofile=sys.stderr, genefam=False): if genefam: N = 1e5 # get the sizes for the CDS first f = Fasta(cds_file) sizes = dict(f.itersizes()) # retrieve the locations bed = Bed(bed_file) order = bed.order if is_self: # filter the blast file g = Grouper() fp = open(blast_file) for row in fp: b = BlastLine(row) query_len = sizes[b.query] subject_len = sizes[b.subject] if b.hitlen < min(query_len, subject_len)*P/100.: continue query = gene_name(b.query, strip_name) subject = gene_name(b.subject, strip_name) qi, q = order[query] si, s = order[subject] if abs(qi - si) <= N and b.evalue <= evalue: if genefam: g.join(query, subject) elif q.seqid == s.seqid: g.join(query, subject) else: homologs = Grouper() fp = open(blast_file) for row in fp: b = BlastLine(row) query_len = sizes[b.query] subject_len = sizes[b.subject] if b.hitlen < min(query_len, subject_len)*P/100.: continue if b.evalue > evalue: continue query = gene_name(b.query, strip_name) subject = gene_name(b.subject, strip_name) homologs.join(query, subject) if genefam: g = homologs else: g = Grouper() for i, atom in enumerate(bed): for x in range(1, N+1): if all([i-x >= 0, bed[i-x].seqid == atom.seqid, \ homologs.joined(bed[i-x].accn, atom.accn)]): leni = sizes[bed[i].accn] lenx = sizes[bed[i-x].accn] if abs(leni - lenx) > max(leni, lenx)*(1-P/100.): continue g.join(bed[i-x].accn, atom.accn) # dump the grouper fw = must_open(ofile, "w") ngenes, nfamilies = 0, 0 families = [] for group in sorted(g): if len(group) >= 2: print >>fw, ",".join(sorted(group)) ngenes += len(group) nfamilies += 1 families.append(sorted(group)) longest_family = max(families, key=lambda x: len(x)) # generate reports print >>sys.stderr, "Proximal paralogues (dist=%d):" % N print >>sys.stderr, "Total %d genes in %d families" % (ngenes, nfamilies) print >>sys.stderr, "Longest families (%d): %s" % (len(longest_family), ",".join(longest_family)) return families
def enrich(args): """ %prog enrich omgfile groups ntaxa > enriched.omg Enrich OMG output by pulling genes misses by OMG. """ p = OptionParser(enrich.__doc__) p.add_option("--ghost", default=False, action="store_true", help="Add ghost homologs already used [default: %default]") opts, args = p.parse_args(args) if len(args) != 3: sys.exit(not p.print_help()) omgfile, groupsfile, ntaxa = args ntaxa = int(ntaxa) ghost = opts.ghost # Get gene pair => weight mapping weights = get_edges() info = get_info() # Get gene => taxon mapping info = dict((k, v.split()[5]) for k, v in info.items()) groups = Grouper() fp = open(groupsfile) for row in fp: members = row.strip().split(",") groups.join(*members) logging.debug("Imported {0} families with {1} members.".\ format(len(groups), groups.num_members)) seen = set() omggroups = Grouper() fp = open(omgfile) for row in fp: genes, idxs = row.split() genes = genes.split(",") seen.update(genes) omggroups.join(*genes) nmembers = omggroups.num_members logging.debug("Imported {0} OMG families with {1} members.".\ format(len(omggroups), nmembers)) assert nmembers == len(seen) alltaxa = set(str(x) for x in range(ntaxa)) recruited = [] fp = open(omgfile) for row in fp: genes, idxs = row.split() genes = genes.split(",") a = genes[0] idxs = set(idxs.split(",")) missing_taxa = alltaxa - idxs if not missing_taxa: print row.rstrip() continue leftover = groups[a] if not ghost: leftover = set(leftover) - seen if not leftover: print row.rstrip() continue leftover_sorted_by_taxa = dict((k, \ [x for x in leftover if info[x] == k]) \ for k in missing_taxa) #print genes, leftover #print leftover_sorted_by_taxa solutions = [] for solution in product(*leftover_sorted_by_taxa.values()): score = sum( weights.get((a, b), 0) for a in solution for b in genes) if score == 0: continue score += sum( weights.get((a, b), 0) for a, b in combinations(solution, 2)) solutions.append((score, solution)) #print solution, score best_solution = max(solutions) if solutions else None if best_solution is None: print row.rstrip() continue #print "best ==>", best_solution best_score, best_addition = best_solution genes.extend(best_addition) recruited.extend(best_addition) genes = sorted([(info[x], x) for x in genes]) idxs, genes = zip(*genes) if ghost: # decorate additions so it's clear that they were added pgenes = [] for g in genes: if g in recruited and g in seen: pgenes.append("|{0}|".format(g)) else: pgenes.append(g) genes = pgenes print "\t".join((",".join(genes), ",".join(idxs))) if not ghost: seen.update(best_addition) logging.debug("Recruited {0} new genes.".format(len(recruited)))
def annotate(args): """ %prog annotate new.bed old.bed 2> log Annotate the `new.bed` with features from `old.bed` for the purpose of gene numbering. Ambiguity in ID assignment can be resolved by either of the following 2 methods: - `alignment`: make use of global sequence alignment score (calculated by `needle`) - `overlap`: make use of overlap length (calculated by `intersectBed`) Transfer over as many identifiers as possible while following guidelines: http://www.arabidopsis.org/portals/nomenclature/guidelines.jsp#editing Note: Following RegExp pattern describes the structure of the identifier assigned to features in the `new.bed` file. new_id_pat = re.compile(r"^\d+\.[cemtx]+\S+") Examples: 23231.m312389, 23231.t004898, 23231.tRNA.144 Adjust the value of `new_id_pat` manually as per your ID naming conventions. """ from jcvi.utils.grouper import Grouper valid_resolve_choices = ["alignment", "overlap"] p = OptionParser(annotate.__doc__) p.add_option("--resolve", default="alignment", choices=valid_resolve_choices, help="Resolve ID assignment based on a certain metric" \ + " [default: %default]") p.add_option("--atg_name", default=False, action="store_true", help="Specify is locus IDs in `new.bed` file follow ATG nomenclature" \ + " [default: %default]") g1 = OptionGroup(p, "Optional parameters (alignment):\n" \ + "Use if resolving ambiguities based on sequence `alignment`") g1.add_option("--pid", dest="pid", default=35., type="float", help="Percent identity cutoff [default: %default]") g1.add_option("--score", dest="score", default=250., type="float", help="Alignment score cutoff [default: %default]") p.add_option_group(g1) g2 = OptionGroup(p, "Optional parameters (overlap):\n" \ + "Use if resolving ambiguities based on `overlap` length\n" \ + "Parameters equivalent to `intersectBed`") g2.add_option( "-f", dest="f", default=0.5, type="float", help="Minimum overlap fraction (0.0 - 1.0) [default: %default]") g2.add_option( "-r", dest="r", default=False, action="store_true", help="Require fraction overlap to be reciprocal [default: %default]") g2.add_option("-s", dest="s", default=True, action="store_true", help="Require same strandedness [default: %default]") p.add_option_group(g2) opts, args = p.parse_args(args) if len(args) != 2: sys.exit(not p.print_help()) nbedfile, obedfile = args npf, opf = nbedfile.rsplit(".", 1)[0], obedfile.rsplit(".", 1)[0] # Make consolidated.bed cbedfile = "consolidated.bed" if not os.path.isfile(cbedfile): consolidate(nbedfile, obedfile, cbedfile) else: logging.warning("`{0}` already exists. Skipping step".format(cbedfile)) logging.warning("Resolving ID assignment ambiguity based on `{0}`".\ format(opts.resolve)) if opts.resolve == "alignment": # Get pairs and prompt to run needle pairsfile = "nw.pairs" scoresfile = "nw.scores" if not os.path.isfile(pairsfile): get_pairs(cbedfile, pairsfile) else: logging.warning("`{0}` already exists. Checking for needle output".\ format(pairsfile)) # If needle scores do not exist, prompt user to run needle if not os.path.isfile(scoresfile): logging.error("`{0}` does not exist. Please process {1} using `needle`".\ format(scoresfile, pairsfile)) sys.exit() else: scoresfile = "ovl.scores" # Calculate overlap length using intersectBed calculate_ovl(nbedfile, obedfile, opts, scoresfile) logging.warning("`{0}' exists. 
Storing scores in memory".\ format(scoresfile)) scores = read_scores(scoresfile, opts) # Iterate through consolidated bed and # filter piles based on score abedline = {} cbed = Bed(cbedfile) g = Grouper() for c in cbed: accn = c.accn g.join(*accn.split(";")) nbedline = {} nbed = Bed(nbedfile) for line in nbed: nbedline[line.accn] = line splits = set() for chr, chrbed in nbed.sub_beds(): abedline, splits = annotate_chr(chr, chrbed, g, scores, nbedline, abedline, opts, splits) if splits is not None: abedline = process_splits(splits, scores, nbedline, abedline) abedfile = npf + ".annotated.bed" afh = open(abedfile, "w") for accn in abedline: print >> afh, abedline[accn] afh.close() sort([abedfile, "-i"])
def segment(args): """ %prog segment loss.ids bedfile Merge adjacent gene loss into segmental loss. Then based on the segmental loss, estimate amount of DNA loss in base pairs. Two estimates can be given: - conservative: just within the start and end of a single gene - aggressive: extend the deletion track to the next gene The real deletion size is within these estimates. """ from jcvi.formats.base import SetFile p = OptionParser(segment.__doc__) p.add_option("--chain", default=1, type="int", help="Allow next N genes to be chained [default: %default]") opts, args = p.parse_args(args) if len(args) != 2: sys.exit(not p.print_help()) idsfile, bedfile = args bed = Bed(bedfile) order = bed.order ids = SetFile(idsfile) losses = Grouper() skip = opts.chain for i, a in enumerate(bed): a = a.accn for j in xrange(i + 1, i + 1 + skip): if j >= len(bed): break b = bed[j].accn if a in ids: losses.join(a, a) if a in ids and b in ids: losses.join(a, b) losses = list(losses) singletons = [x for x in losses if len(x) == 1] segments = [x for x in losses if len(x) > 1] ns, nm, nt = len(singletons), len(segments), len(losses) assert ns + nm == nt # Summary for all segments for x in sorted(singletons) + sorted(segments): print "\t".join( str(x) for x in ("|".join(sorted(x)), len(x), estimate_size(x, bed, order))) # Find longest segment stretch if segments: mx, maxsegment = max([(len(x), x) for x in segments]) print >> sys.stderr, "Longest stretch: run of {0} genes".format(mx) print >> sys.stderr, " {0}".format("|".join(sorted(maxsegment))) seg_asize = sum(estimate_size(x, bed, order) for x in segments) seg_bsize = sum(estimate_size(x, bed, order, conservative=False) \ for x in segments) else: seg_asize = seg_bsize = 0 sing_asize = sum(estimate_size(x, bed, order) for x in singletons) sing_bsize = sum(estimate_size(x, bed, order, conservative=False) \ for x in singletons) total_asize = sing_asize + seg_asize total_bsize = sing_bsize + seg_bsize print >> sys.stderr, "Singleton ({0}): {1} - {2} bp".\ format(ns, sing_asize, sing_bsize) print >> sys.stderr, "Segment ({0}): {1} - {2} bp".\ format(nm, seg_asize, seg_bsize) print >> sys.stderr, "Total ({0}): {1} - {2} bp".\ format(nt, total_asize, total_bsize) print >> sys.stderr, "Average ({0}): {1} bp".\ format(nt, (total_asize + total_bsize) / 2)
def group(args): """ %prog group tabfile > tabfile.grouped Given a tab-delimited file, either group all elements within the file or group the elements in the value column(s) based on the key (groupby) column For example, convert this | into this --------------------------------------- a 2 3 4 | a,2,3,4,5,6 a 5 6 | b,7,8 b 7 8 | c,9,10,11 c 9 | c 10 11 | If grouping by a particular column, convert this | into this: --------------------------------------------- a 2 3 4 | a 2,5 3,6 4 a 5 6 | b 7 8 b 7 8 | c 9,10 11 c 9 | c 10 11 | By default, it uniqifies all the grouped elements """ from jcvi.utils.cbook import AutoVivification from jcvi.utils.grouper import Grouper p = OptionParser(group.__doc__) p.set_sep() p.add_option("--groupby", default=None, type="int", help="Default column to groupby") p.add_option("--groupsep", default=",", help="Separator to join the grouped elements") p.add_option( "--nouniq", default=False, action="store_true", help="Do not uniqify the grouped elements", ) opts, args = p.parse_args(args) if len(args) != 1: sys.exit(not p.print_help()) (tabfile, ) = args sep = opts.sep groupby = opts.groupby groupsep = opts.groupsep cols = [] grouper = AutoVivification() if groupby is not None else Grouper() fp = must_open(tabfile) for row in fp: row = row.rstrip() atoms = row.split(sep) if groupby is not None: if len(cols) < len(atoms): cols = [x for x in range(len(atoms))] if groupby not in cols: logging.error( "groupby col index `{0}` is out of range".format(groupby)) sys.exit() key = atoms[groupby] for col in cols: if col == groupby: continue if not grouper[key][col]: grouper[key][col] = [] if opts.nouniq else set() if col < len(atoms): if groupsep in atoms[col]: for atom in atoms[col].split(groupsep): if opts.nouniq: grouper[key][col].append(atom) else: grouper[key][col].add(atom) else: if opts.nouniq: grouper[key][col].append(atoms[col]) else: grouper[key][col].add(atoms[col]) else: grouper.join(*atoms) for key in grouper: if groupby is not None: line = [] for col in cols: if col == groupby: line.append(key) elif col in grouper[key].keys(): line.append(groupsep.join(grouper[key][col])) else: line.append("na") print(sep.join(line)) else: print(groupsep.join(key))
def consolidate(args): """ %prog consolidate gffile1 gffile2 ... > consolidated.out Given 2 or more gff files generated by pasa annotation comparison, iterate through each locus (shared locus name or overlapping CDS) and identify same/different isoforms (shared splicing structure) across the input datasets. If `slop` is enabled, consolidation will collapse any variation in terminal UTR lengths, keeping the longest as representative. """ from jcvi.formats.base import longest_unique_prefix from jcvi.formats.gff import make_index, match_subfeats from jcvi.utils.cbook import AutoVivification from jcvi.utils.grouper import Grouper from itertools import combinations, product supported_modes = ["name", "coords"] p = OptionParser(consolidate.__doc__) p.add_option("--slop", default=False, action="store_true", help="allow minor variation in terminal 5'/3' UTR" + \ " start/stop position [default: %default]") p.add_option("--inferUTR", default=False, action="store_true", help="infer presence of UTRs from exon coordinates") p.add_option("--mode", default="name", choices=supported_modes, help="method used to determine overlapping loci") p.add_option("--summary", default=False, action="store_true", help="Generate summary table of consolidation process") p.add_option("--clusters", default=False, action="store_true", help="Generate table of cluster members after consolidation") p.set_outfile() opts, args = p.parse_args(args) slop = opts.slop inferUTR = opts.inferUTR mode = opts.mode if len(args) < 2: sys.exit(not p.print_help()) gffdbx = {} for gffile in args: dbn = longest_unique_prefix(gffile, args) gffdbx[dbn] = make_index(gffile) loci = Grouper() for dbn in gffdbx: odbns = [odbn for odbn in gffdbx if dbn != odbn] for gene in gffdbx[dbn].features_of_type('gene', order_by=('seqid', 'start')): if mode == "name": loci.join(gene.id, (gene.id, dbn)) else: if (gene.id, dbn) not in loci: loci.join((gene.id, dbn)) gene_cds = list(gffdbx[dbn].children(gene, \ featuretype='CDS', order_by=('start'))) gene_cds_start, gene_cds_stop = gene_cds[0].start, \ gene_cds[-1].stop for odbn in odbns: for ogene_cds in gffdbx[odbn].region(seqid=gene.seqid, \ start=gene_cds_start, end=gene_cds_stop, \ strand=gene.strand, featuretype='CDS'): for ogene in gffdbx[odbn].parents(ogene_cds, featuretype='gene'): loci.join((gene.id, dbn), (ogene.id, odbn)) gfeats = {} mrna = AutoVivification() for i, locus in enumerate(loci): gene = "gene_{0:0{pad}}".format(i, pad=6) \ if mode == "coords" else None for elem in locus: if type(elem) == tuple: _gene, dbn = elem if gene is None: gene = _gene g = gffdbx[dbn][_gene] if gene not in gfeats: gfeats[gene] = g gfeats[gene].attributes['ID'] = [gene] else: if g.start < gfeats[gene].start: gfeats[gene].start = g.start if g.stop > gfeats[gene].stop: gfeats[gene].stop = g.stop c = list(gffdbx[dbn].children(_gene, featuretype='mRNA', order_by='start')) if len(c) > 0: mrna[gene][dbn] = c fw = must_open(opts.outfile, "w") print("##gff-version 3", file=fw) seen = {} if opts.summary: summaryfile = "{0}.summary.txt".format(opts.outfile.rsplit(".")[0]) sfw = must_open(summaryfile, "w") summary = ["id"] summary.extend(gffdbx.keys()) print("\t".join(str(x) for x in summary), file=sfw) if opts.clusters: clustersfile = "{0}.clusters.txt".format(opts.outfile.rsplit(".")[0]) cfw = must_open(clustersfile, "w") clusters = ["id", "dbns", "members", "trlens"] print("\t".join(str(x) for x in clusters), file=cfw) for gene in mrna: g = Grouper() dbns = list(combinations(mrna[gene], 2)) if len(dbns) > 0: for dbn1, dbn2 in 
dbns: dbx1, dbx2 = gffdbx[dbn1], gffdbx[dbn2] for mrna1, mrna2 in product(mrna[gene][dbn1], mrna[gene][dbn2]): mrna1s, mrna2s = mrna1.stop - mrna1.start + 1, \ mrna2.stop - mrna2.start + 1 g.join((dbn1, mrna1.id, mrna1s)) g.join((dbn2, mrna2.id, mrna2s)) if match_subfeats(mrna1, mrna2, dbx1, dbx2, featuretype='CDS'): res = [] ftypes = ['exon'] if inferUTR else ['five_prime_UTR', 'three_prime_UTR'] for ftype in ftypes: res.append(match_subfeats(mrna1, mrna2, dbx1, dbx2, featuretype=ftype, slop=slop)) if all(r == True for r in res): g.join((dbn1, mrna1.id, mrna1s), (dbn2, mrna2.id, mrna2s)) else: for dbn1 in mrna[gene]: for mrna1 in mrna[gene][dbn1]: g.join((dbn1, mrna1.id, mrna1.stop - mrna1.start + 1)) print(gfeats[gene], file=fw) for group in g: group.sort(key=lambda x: x[2], reverse=True) dbs, mrnas = [el[0] for el in group], [el[1] for el in group] d, m = dbs[0], mrnas[0] dbid, _mrnaid = "|".join(str(x) for x in set(dbs)), [] for x in mrnas: if x not in _mrnaid: _mrnaid.append(x) mrnaid = "{0}|{1}".format(dbid, "-".join(_mrnaid)) if mrnaid not in seen: seen[mrnaid] = 0 else: seen[mrnaid] += 1 mrnaid = "{0}-{1}".format(mrnaid, seen[mrnaid]) _mrna = gffdbx[d][m] _mrna.attributes['ID'] = [mrnaid] _mrna.attributes['Parent'] = [gene] children = gffdbx[d].children(m, order_by='start') print(_mrna, file=fw) for child in children: child.attributes['ID'] = ["{0}|{1}".format(dbid, child.id)] child.attributes['Parent'] = [mrnaid] print(child, file=fw) if opts.summary: summary = [mrnaid] summary.extend(['Y' if db in set(dbs) else 'N' for db in gffdbx]) print("\t".join(str(x) for x in summary), file=sfw) if opts.clusters: clusters = [mrnaid] clusters.append(",".join(str(el[0]) for el in group)) clusters.append(",".join(str(el[1]) for el in group)) clusters.append(",".join(str(el[2]) for el in group)) print("\t".join(str(x) for x in clusters), file=cfw) fw.close() if opts.summary: sfw.close() if opts.clusters: cfw.close()