def clr(args):
    """
    %prog blastfile fastafiles

    Calculate the vector clear range file based BLAST to the vectors.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so implementation notes are kept in comments instead.
    #
    # args: list of command-line tokens -- one BLAST file followed by one or
    #       more FASTA files. Exits with the usage message when fewer than two
    #       are given.
    # Output (stdout): one tab-separated line per sequence found in the FASTA
    #       files: name, clear-range start, clear-range end.
    # Raises KeyError when the BLAST file names a query absent from the FASTA
    #       files.
    p = OptionParser(clr.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    blastfile, fastafiles = args[0], args[1:]

    # Sequence name -> length, pooled over every FASTA file given.
    sizes = {}
    for fa in fastafiles:
        sizes.update(Fasta(fa).itersizes())

    for query, hits in Blast(blastfile).iter_hits():
        qsize = sizes[query]
        vectors = [(x.qstart, x.qstop) for x in hits]
        # presumably the overall (lowest start, highest stop) across all hits
        # -- confirm against range_minmax
        vmin, vmax = range_minmax(vectors)

        # Keep whichever flank outside the vector hits is longer.
        left_size = vmin - 1
        right_size = qsize - vmax
        if left_size > right_size:
            # NOTE(review): the left range ends at vmin itself while the right
            # range starts at vmax; confirm the coordinate convention expected
            # downstream (possible off-by-one on the left side).
            clr_start, clr_end = 0, vmin
        else:
            clr_start, clr_end = vmax, qsize

        print("\t".join(str(x) for x in (query, clr_start, clr_end)))
        # Reported sequences are dropped so that only hit-free ones remain.
        del sizes[query]

    # Sequences without any vector hit are clear over their full length.
    for q, size in sorted(sizes.items()):
        print("\t".join(str(x) for x in (q, 0, size)))
def _loss_pick_hsp(proxy, hsps):
    # Return (tag, hsp): the first HSP overlapping `proxy` tagged "S", or the
    # first HSP in the list tagged "NS" when none overlaps.
    for hsp in hsps:
        if range_overlap(proxy, hsp):
            return "S", hsp
    return "NS", hsps[0]


def loss(args):
    """
    %prog loss a.b.i1.blocks [a.b-genomic.blast]

    Extract likely gene loss candidates between genome a and b.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so implementation notes are kept in comments instead.
    #
    # args: list of command-line tokens -- the blocks file and, optionally, a
    #       genomic BLAST (or, with --bed, a bed file) of the same genes.
    # Output (stdout): one tab-separated line per feature of the query bed:
    #       seqid, gene name, target. The target is the partner recorded in the
    #       blocks file, or, for genes inside a '.' run flanked by nearby
    #       partners, a region string prefixed with [S], [NS] or [NF].
    p = OptionParser(loss.__doc__)
    p.add_option("--bed", default=False, action="store_true",
                 help="Genomic BLAST is in bed format [default: %default]")
    p.add_option("--gdist", default=20, type="int",
                 help="Gene distance [default: %default]")
    p.add_option("--bdist", default=20000, type="int",
                 help="Base pair distance [default: %default]")
    p.set_beds()
    opts, args = p.parse_args(args)

    if len(args) not in (1, 2):
        sys.exit(not p.print_help())

    blocksfile = args[0]
    emptyblast = (len(args) == 1)
    if emptyblast:
        # No genomic hits supplied: work from an empty placeholder file.
        genomicblast = "empty.blast"
        sh("touch {0}".format(genomicblast))
    else:
        genomicblast = args[1]

    try:
        gdist, bdist = opts.gdist, opts.bdist
        qbed, _sbed, _qorder, sorder, _is_self = \
            check_beds(blocksfile, p, opts)

        # Two-column rows: gene in a, its partner in b.
        blocks = []
        genetrack = {}
        with open(blocksfile) as fp:
            for row in fp:
                a, b = row.split()
                genetrack[a] = b
                blocks.append((a, b))

        # Consecutive rows sharing the same second column form one run.
        data = [(key, list(rows))
                for key, rows in groupby(blocks, key=lambda x: x[-1])]

        proxytrack = {}
        imax = len(data) - 1
        for i, (key, rows) in enumerate(data):
            # Only interior '.' runs have a flanking partner on both sides.
            if i == 0 or i == imax or key != '.':
                continue

            before = data[i - 1][0]
            after = data[i + 1][0]
            # sorder entries unpack as (rank, feature) for the flanking genes.
            bi, bx = sorder[before]
            ai, ax = sorder[after]
            if bx.seqid != ax.seqid or abs(bi - ai) > gdist:
                continue

            # Expected region: span of both flanks, padded by bdist.
            start, end = range_minmax(((bx.start, bx.end),
                                       (ax.start, ax.end)))
            start, end = max(start - bdist, 1), end + bdist
            proxy = (bx.seqid, start, end)
            for a, _ in rows:
                proxytrack[a] = proxy

        # Replace each proxy region by an actual hit where one is available.
        tags = {}
        if opts.bed:
            bed = Bed(genomicblast, sorted=False)
            by_gene = lambda x: gene_name(x.accn.rsplit(".", 1)[0])
            for query, bb in groupby(bed, key=by_gene):
                if query not in proxytrack:
                    continue
                hsps = [(b.seqid, b.start, b.end) for b in bb]
                tag, hsp = _loss_pick_hsp(proxytrack[query], hsps)
                proxytrack[query] = hsp
                tags[query] = tag
        else:
            for query, bb in Blast(genomicblast).iter_hits():
                query = gene_name(query)
                if query not in proxytrack:
                    continue
                hsps = [(b.subject, b.sstart, b.sstop) for b in bb]
                tag, hsp = _loss_pick_hsp(proxytrack[query], hsps)
                proxytrack[query] = hsp
                tags[query] = tag

        for b in qbed:
            accn = b.accn
            target_region = genetrack[accn]
            if accn in proxytrack:
                # [S]/[NS]: a hit was seen (overlapping the proxy or not);
                # [NF]: no hit at all for this gene.
                if accn in tags:
                    ptag = "[{0}]".format(tags[accn])
                else:
                    ptag = "[NF]"
                target_region = ptag + region_str(proxytrack[accn])

            print("\t".join((b.seqid, accn, target_region)))
    finally:
        # Remove the placeholder even when processing fails part-way.
        if emptyblast:
            sh("rm -f {0}".format(genomicblast))
def rnaseq(args):
    """
    %prog rnaseq blastfile ref.fasta

    Evaluate de-novo RNA-seq assembly against a reference gene set (same or
    closely related organism). Ideally blatfile needs to be supermap'd.

    Following metric is used (Martin et al. 2010, Rnnotator paper):
    Accuracy: % of contigs share >=95% identity with ref genome (TODO)
    Completeness: % of ref genes covered by contigs to >=80% of their lengths
    Contiguity: % of ref genes covered by a *single* contig >=80% of lengths
    Chimer: % of contigs that contain two or more annotated genes >= 50bp
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so implementation notes are kept in comments instead.
    #
    # args: list of command-line tokens -- the BLAST file and the reference
    #       FASTA. Anything else prints the usage and exits non-zero.
    # Output (stderr): summary counts; nothing is returned.
    p = OptionParser(rnaseq.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        # Non-zero exit status on a usage error, like the sibling commands.
        sys.exit(not p.print_help())

    # Imported only once the arguments are known to be usable.
    from jcvi.algorithms.supermap import supermap

    blastfile, reffasta = args
    sizes = Sizes(reffasta).mapping
    known_genes = len(sizes)

    # Filtered BLAST files are reused when already present on disk.
    querysupermap = blastfile + ".query.supermap"
    refsupermap = blastfile + ".ref.supermap"
    if not op.exists(querysupermap):
        supermap(blastfile, filter="query")
    if not op.exists(refsupermap):
        supermap(blastfile, filter="ref")

    # Per contig: how much of each reference it hits.
    chimers = 0
    goodctg80 = set()
    goodctg50 = set()
    for ctg, hits in Blast(querysupermap).iter_hits():
        bps = defaultdict(int)
        for x in hits:
            bps[x.subject] += abs(x.sstop - x.sstart) + 1

        for vh, length in bps.items():
            ratio = length * 100. / sizes[vh]
            if ratio >= 80:
                goodctg80.add(ctg)
            if ratio >= 50:
                goodctg50.add(ctg)

        # Chimer: the contig hits more than one reference.
        # NOTE(review): `chimers` is counted but never reported below.
        if len(bps) > 1:
            chimers += 1

    # Per reference: total bases hit across all contigs.
    bps = defaultdict(int)
    for x in Blast(refsupermap).iter_line():
        bps[x.subject] += abs(x.sstop - x.sstart) + 1

    goodref80 = set()
    # NOTE(review): `goodref50` is filled but never reported below.
    goodref50 = set()
    for vh, length in bps.items():
        ratio = length * 100. / sizes[vh]
        if ratio >= 80:
            goodref80.add(vh)
        if ratio >= 50:
            goodref50.add(vh)

    # An empty reference set would otherwise divide by zero.
    if known_genes:
        pct80 = len(goodref80) * 100. / known_genes
    else:
        pct80 = 0.

    messages = (
        "Reference set: `{0}`, # of transcripts {1}".
        format(reffasta, known_genes),
        "A total of {0} contigs map to 80% of a reference"
        " transcript".format(len(goodctg80)),
        "A total of {0} contigs map to 50% of a reference"
        " transcript".format(len(goodctg50)),
        "A total of {0} reference transcripts ({1:.1f}%) have 80% covered"
        .format(len(goodref80), pct80),
    )
    for msg in messages:
        sys.stderr.write(msg + "\n")