def prepare_synteny(tourfile, lastfile, odir, p, opts):
    """
    Prepare synteny plots for movie().

    Looks up the query/subject BED files that go with `lastfile`, creates
    `odir` (via `mkdir(..., overwrite=True)`), changes the working directory
    into it, writes an anchors file derived from the hits in `lastfile` and
    symlinks the subject BED file there.

    Returns a tuple `(anchorsfile, qbedfile, contig_to_beds)`: the anchors
    file name (relative to `odir`), the absolute path of the query BED file,
    and a dict built from `qbed.sub_beds()`.

    `tourfile` is not used by this function; it is kept so that the call
    signature stays unchanged. On return the process working directory is
    `odir`.
    """
    qbedfile, sbedfile = get_bed_filenames(lastfile, p, opts)
    qbedfile = op.abspath(qbedfile)
    sbedfile = op.abspath(sbedfile)
    # Resolve the BLAST path *before* changing directory, otherwise a relative
    # `lastfile` would no longer point at the same file once we are in `odir`.
    lastfile = op.abspath(lastfile)

    qbed = Bed(qbedfile, sorted=False)
    contig_to_beds = dict(qbed.sub_beds())

    # Create a separate directory for the subplots and movie
    mkdir(odir, overwrite=True)
    os.chdir(odir)
    logging.debug("Change into subdir `{}`".format(odir))

    # Make anchorsfile: named after the first two dot-separated fields of the
    # BLAST file's basename, one "query<TAB>subject<TAB>score" line per hit.
    anchorsfile = ".".join(op.basename(lastfile).split(".", 2)[:2]) \
        + ".anchors"
    with open(anchorsfile, "w") as fw:
        for b in Blast(lastfile):
            fields = (gene_name(b.query), gene_name(b.subject),
                      str(int(b.score)))
            fw.write("\t".join(fields) + "\n")

    # Symlink sbed
    symlink(sbedfile, op.basename(sbedfile))

    return anchorsfile, qbedfile, contig_to_beds
def clr(args):
    """
    %prog blastfile fastafiles

    Calculate the vector clear range file based BLAST to the vectors.
    """
    # NOTE: the docstring above doubles as the usage text given to
    # OptionParser, so its wording is part of the program's output.
    #
    # For every query with vector hits, prints "query<TAB>start<TAB>end" where
    # the range is the larger of the two flanks around the span covered by the
    # hits. Sequences listed in the FASTA files that have no hit at all are
    # printed afterwards (sorted by name) with the range 0..size.
    p = OptionParser(clr.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    blastfile = args[0]
    fastafiles = args[1:]

    # Sequence name -> size, merged over all FASTA files.
    sizes = {}
    for fa in fastafiles:
        f = Fasta(fa)
        sizes.update(f.itersizes())

    b = Blast(blastfile)
    for query, hits in b.iter_hits():
        qsize = sizes[query]
        vectors = [(x.qstart, x.qstop) for x in hits]
        vmin, vmax = range_minmax(vectors)
        left_size = vmin - 1
        right_size = qsize - vmax
        if left_size > right_size:
            clr_start, clr_end = 0, vmin
        else:
            clr_start, clr_end = vmax, qsize
        print("\t".join(str(x) for x in (query, clr_start, clr_end)))
        # Whatever remains in `sizes` after the loop had no vector hit.
        del sizes[query]

    for q, size in sorted(sizes.items()):
        print("\t".join(str(x) for x in (q, 0, size)))
def read_blast(blast_file, qorder, sorder, is_self=False, ostrip=True):
    """
    Load BLAST hits and attach gene-order coordinates to each of them.

    `qorder` / `sorder` are looked up by gene name and yield an
    `(index, feature)` pair; the feature's `seqid` is copied onto the hit.
    Hits are dropped when query and subject are identical, when either name
    is missing from its order table, or when the (query, subject) pair was
    already seen. With `ostrip`, names go through `gene_name()` first.

    With `is_self`, each pair is flipped so that the lower index comes first,
    and pairs on the same seqid that are fewer than 40 positions apart are
    discarded.

    Returns the list of retained hits, each updated in place with `query`,
    `subject`, `qi`, `si`, `qseqid` and `sseqid`.
    """
    kept = []
    visited = set()

    for hit in Blast(blast_file):
        qname, sname = hit.query, hit.subject
        if qname == sname:
            continue

        if ostrip:
            qname = gene_name(qname)
            sname = gene_name(sname)

        if not (qname in qorder and sname in sorder):
            continue

        qrank, qfeat = qorder[qname]
        srank, sfeat = sorder[sname]

        if is_self:
            # Fold a<->b duplicates of a self-self BLAST onto one side
            if qrank > srank:
                qname, sname = sname, qname
                qrank, srank = srank, qrank
                qfeat, sfeat = sfeat, qfeat

            # Hugging the diagonal: likely a tandem repeat, skip it
            if qfeat.seqid == sfeat.seqid and srank - qrank < 40:
                continue

        pair = (qname, sname)
        if pair in visited:
            continue
        visited.add(pair)

        hit.qseqid = qfeat.seqid
        hit.sseqid = sfeat.seqid
        hit.qi = qrank
        hit.si = srank
        hit.query = qname
        hit.subject = sname

        kept.append(hit)

    message = "A total of {0} BLAST imported from `{1}`."
    logging.debug(message.format(len(kept), blast_file))

    return kept
def _loss_tag_best_hit(query, hits, as_region, proxytrack, tags):
    # Tag `query` "S" if any hit overlaps its proxy region, else "NS", and
    # replace the proxy with the region of the chosen hit. The chosen hit is
    # the first overlapping one, or the first hit when none overlaps.
    # `as_region` turns a hit into a (seqid, start, end) tuple.
    proxy = proxytrack[query]
    tag = "NS"
    best = hits[0]
    for hit in hits:
        if range_overlap(proxy, as_region(hit)):
            tag = "S"
            best = hit
            break

    proxytrack[query] = as_region(best)
    tags[query] = tag


def loss(args):
    """
    %prog loss a.b.i1.blocks [a.b-genomic.blast]

    Extract likely gene loss candidates between genome a and b.
    """
    # NOTE: the docstring above doubles as the usage text given to
    # OptionParser, so its wording is part of the program's output.
    #
    # Output: one "seqid<TAB>gene<TAB>target" line per gene in the query BED.
    # For genes that received a proxy region, the target is that region,
    # prefixed with "[S]"/"[NS]" (a genomic hit does / does not overlap the
    # proxy) or "[NF]" (no genomic hit was recorded for the gene).
    p = OptionParser(loss.__doc__)
    p.add_option("--bed", default=False, action="store_true",
                 help="Genomic BLAST is in bed format [default: %default]")
    p.add_option("--gdist", default=20, type="int",
                 help="Gene distance [default: %default]")
    p.add_option("--bdist", default=20000, type="int",
                 help="Base pair distance [default: %default]")
    p.set_beds()
    opts, args = p.parse_args(args)

    if len(args) not in (1, 2):
        sys.exit(not p.print_help())

    blocksfile = args[0]
    emptyblast = (len(args) == 1)
    if emptyblast:
        # No genomic BLAST supplied: work against an empty placeholder file.
        genomicblast = "empty.blast"
        sh("touch {0}".format(genomicblast))
    else:
        genomicblast = args[1]

    try:
        gdist, bdist = opts.gdist, opts.bdist
        qbed, sbed, qorder, sorder, is_self = check_beds(blocksfile, p, opts)

        blocks = []
        genetrack = {}   # query gene -> second column of the blocks file
        proxytrack = {}  # query gene -> (seqid, start, end) on the subject
        with open(blocksfile) as fp:
            for row in fp:
                a, b = row.split()
                genetrack[a] = b
                blocks.append((a, b))

        # Runs of consecutive rows sharing the same second column.
        data = [(key, list(rows))
                for key, rows in groupby(blocks, key=lambda x: x[-1])]

        imax = len(data) - 1
        for i, (key, rows) in enumerate(data):
            # Only interior runs of '.' have both a left and right neighbour.
            if i == 0 or i == imax:
                continue
            if key != '.':
                continue

            before = data[i - 1][0]
            after = data[i + 1][0]
            bi, bx = sorder[before]
            ai, ax = sorder[after]
            dist = abs(bi - ai)
            if bx.seqid != ax.seqid or dist > gdist:
                continue

            # Proxy region: span of both flanking genes, padded by `bdist`
            # on each side and clamped to start at 1.
            start, end = range_minmax(((bx.start, bx.end),
                                       (ax.start, ax.end)))
            start, end = max(start - bdist, 1), end + bdist
            proxy = (bx.seqid, start, end)
            for a, b in rows:
                proxytrack[a] = proxy

        tags = {}
        if opts.bed:
            bed = Bed(genomicblast, sorted=False)
            key = lambda x: gene_name(x.accn.rsplit(".", 1)[0])
            as_region = lambda x: (x.seqid, x.start, x.end)
            for query, bb in groupby(bed, key=key):
                bb = list(bb)
                if query not in proxytrack:
                    continue
                _loss_tag_best_hit(query, bb, as_region, proxytrack, tags)
        else:
            blast = Blast(genomicblast)
            as_region = lambda x: (x.subject, x.sstart, x.sstop)
            for query, bb in blast.iter_hits():
                bb = list(bb)
                query = gene_name(query)
                if query not in proxytrack:
                    continue
                _loss_tag_best_hit(query, bb, as_region, proxytrack, tags)

        for b in qbed:
            accn = b.accn
            target_region = genetrack[accn]
            if accn in proxytrack:
                target_region = region_str(proxytrack[accn])
                if accn in tags:
                    ptag = "[{0}]".format(tags[accn])
                else:
                    ptag = "[NF]"
                target_region = ptag + target_region

            print("\t".join((b.seqid, accn, target_region)))
    finally:
        # Remove the placeholder even when something above failed.
        if emptyblast:
            sh("rm -f {0}".format(genomicblast))
def connect(args):
    """
    %prog connect assembly.fasta read_mapping.blast

    Connect contigs using long reads.
    """
    # NOTE: the docstring above doubles as the usage text given to
    # OptionParser, so its wording is part of the program's output.
    p = OptionParser(connect.__doc__)
    p.add_option(
        "--clip",
        default=2000,
        type="int",
        help="Only consider end of contigs",
    )
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, blastfile = args
    clip = opts.clip
    sizes = Sizes(fastafile).mapping

    def by_query(hit):
        return hit.query

    def by_qstart(hit):
        return hit.qstart

    def arrow(hit):
        return ">" if hit.orientation == "+" else "<"

    # Discard hits lying strictly inside a contig, i.e. starting after the
    # first `clip` bases and ending before the last `clip` bases.
    end_hits = []
    for hit in Blast(blastfile):
        ctg_size = sizes[hit.subject]
        head_limit = min(ctg_size, clip)
        tail_limit = max(0, ctg_size - clip)
        is_internal = hit.sstart > head_limit and hit.sstop < tail_limit
        if not is_internal:
            end_hits.append(hit)

    end_hits.sort(key=by_query)

    g = BiGraph()
    for query, group in groupby(end_hits, key=by_query):
        ordered = sorted(group, key=by_qstart)
        # A read hitting a single contig cannot link anything.
        if len(set(x.subject for x in ordered)) == 1:
            continue

        print("\n".join(str(x) for x in ordered))
        for left, right in pairwise(ordered):
            if left.subject == right.subject:
                continue

            left_span = (left.qstart, left.qstop)
            right_span = (right.qstart, right.qstop)
            shared = range_intersect(left_span, right_span)
            left_len = left.qstop - left.qstart + 1
            right_len = right.qstop - right.qstart + 1
            if shared:
                shared_start, shared_stop = shared
                shared = shared_stop - shared_start + 1

            print(shared, left_len, right_len)
            # Skip pairs where the overlap exceeds half of either hit.
            if shared and (shared > left_len / 2 or shared > right_len / 2):
                print("Too much overlap ({0})".format(shared))
                continue

            g.add_edge(left.subject, right.subject, arrow(left), arrow(right))

    graph_to_agp(g, blastfile, fastafile, verbose=False)
def blastfilter_main(blast_file, p, opts):
    """
    Filter the hits in `blast_file` and write them to `blast_file.filtered`.

    Steps, in order:
    1. Load all hits, best score first, and keep the first hit for each
       (query, subject) pair whose names are both present in the BED orders
       returned by `check_beds()`. Self hits are dropped; for a self-self
       comparison pairs are flipped so the lower index is the query.
    2. If `opts.cscore` is set, run `filter_cscore()`.
    3. If `opts.tandem_Nmax` is set, group local duplicates with
       `tandem_grouper()`, optionally write `.localdups` files and new BED
       files (`opts.tandems_only`), then run `filter_tandem()`.

    Reads `opts.tandem_Nmax`, `opts.cscore`, `opts.strip_names`,
    `opts.tandems_only`, `opts.qbed` and `opts.sbed`. Returns None.
    """
    qbed, sbed, qorder, sorder, is_self = check_beds(blast_file, p, opts)

    tandem_Nmax = opts.tandem_Nmax
    cscore = opts.cscore

    # Count non-comment lines for the log message only.
    with open(blast_file) as fp:
        total_lines = sum(1 for line in fp if line[0] != '#')
    logging.debug("Load BLAST file `%s` (total %d lines)" % \
                  (blast_file, total_lines))
    bl = Blast(blast_file)
    # Best score first, so de-duplication below keeps the strongest hit.
    blasts = sorted(bl, key=lambda b: b.score, reverse=True)

    filtered_blasts = []
    seen = set()
    ostrip = opts.strip_names
    nwarnings = 0
    for b in blasts:
        query, subject = b.query, b.subject
        if query == subject:
            continue

        if ostrip:
            query, subject = gene_name(query), gene_name(subject)
        if query not in qorder:
            # Report at most 100 missing names, then one "suppressed" notice.
            if nwarnings < 100:
                logging.warning("{0} not in {1}".format(query,
                                qbed.filename))
            elif nwarnings == 100:
                logging.warning("too many warnings.. suppressed")
            nwarnings += 1
            continue
        if subject not in sorder:
            if nwarnings < 100:
                logging.warning("{0} not in {1}".format(subject,
                                sbed.filename))
            elif nwarnings == 100:
                logging.warning("too many warnings.. suppressed")
            nwarnings += 1
            continue

        qi, q = qorder[query]
        si, s = sorder[subject]

        if is_self and qi > si:
            # move all hits to same side when doing self-self BLAST
            query, subject = subject, query
            qi, si = si, qi
            q, s = s, q

        key = query, subject
        if key in seen:
            continue
        seen.add(key)
        b.query, b.subject = key

        b.qi, b.si = qi, si
        b.qseqid, b.sseqid = q.seqid, s.seqid

        filtered_blasts.append(b)

    if cscore:
        before_filter = len(filtered_blasts)
        logging.debug("running the cscore filter (cscore>=%.2f) .." % cscore)
        filtered_blasts = list(filter_cscore(filtered_blasts, cscore=cscore))
        logging.debug("after filter (%d->%d) .." % (before_filter,
                      len(filtered_blasts)))

    if tandem_Nmax:
        logging.debug("running the local dups filter (tandem_Nmax=%d) .." % \
                      tandem_Nmax)

        qtandems = tandem_grouper(qbed, filtered_blasts,
                                  flip=True, tandem_Nmax=tandem_Nmax)
        standems = tandem_grouper(sbed, filtered_blasts,
                                  flip=False, tandem_Nmax=tandem_Nmax)

        qdups_fh = open(op.splitext(opts.qbed)[0] + ".localdups", "w") \
            if opts.tandems_only else None
        sdups_fh = None

        try:
            if is_self:
                # Self comparison: merge both sides into one grouping.
                for s in standems:
                    qtandems.join(*s)
                qdups_to_mother = write_localdups(qtandems, qbed, qdups_fh)
                sdups_to_mother = qdups_to_mother
            else:
                qdups_to_mother = write_localdups(qtandems, qbed, qdups_fh)
                sdups_fh = open(op.splitext(opts.sbed)[0] + ".localdups",
                                "w") if opts.tandems_only else None
                sdups_to_mother = write_localdups(standems, sbed, sdups_fh)
        finally:
            # Closing is a no-op if write_localdups() already closed them.
            for fh in (qdups_fh, sdups_fh):
                if fh is not None:
                    fh.close()

        if opts.tandems_only:
            # write out new .bed after tandem removal
            write_new_bed(qbed, qdups_to_mother)
            if not is_self:
                write_new_bed(sbed, sdups_to_mother)

        before_filter = len(filtered_blasts)
        filtered_blasts = list(filter_tandem(filtered_blasts, \
                               qdups_to_mother, sdups_to_mother))
        logging.debug("after filter (%d->%d) .." % \
                      (before_filter, len(filtered_blasts)))

    blastfilteredfile = blast_file + ".filtered"
    with open(blastfilteredfile, "w") as fw:
        write_new_blast(filtered_blasts, fh=fw)
def rnaseq(args):
    """
    %prog rnaseq blastfile ref.fasta

    Evaluate de-novo RNA-seq assembly against a reference gene set (same or
    closely related organism). Ideally blatfile needs to be supermap'd.

    Following metric is used (Martin et al. 2010, Rnnotator paper):
    Accuracy: % of contigs share >=95% identity with ref genome (TODO)
    Completeness: % of ref genes covered by contigs to >=80% of their lengths
    Contiguity: % of ref genes covered by a *single* contig >=80% of lengths
    Chimer: % of contigs that contain two or more annotated genes >= 50bp
    """
    # NOTE: the docstring above doubles as the usage text given to
    # OptionParser, so its wording is part of the program's output.
    from jcvi.algorithms.supermap import supermap

    p = OptionParser(rnaseq.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        # print_help() returns None; `not None` gives a non-zero exit status
        # for a usage error.
        sys.exit(not p.print_help())

    blastfile, reffasta = args
    sizes = Sizes(reffasta).mapping
    known_genes = len(sizes)

    querysupermap = blastfile + ".query.supermap"
    refsupermap = blastfile + ".ref.supermap"

    # Build the supermap files only when they are not already on disk.
    if not op.exists(querysupermap):
        supermap(blastfile, filter="query")
    if not op.exists(refsupermap):
        supermap(blastfile, filter="ref")

    # Pass 1, per contig: aligned bases against each reference it hits.
    blast = Blast(querysupermap)
    # NOTE(review): `chimers` is counted but never reported below.
    chimers = 0
    goodctg80 = set()
    goodctg50 = set()
    for ctg, hits in blast.iter_hits():
        bps = defaultdict(int)
        for x in hits:
            bps[x.subject] += abs(x.sstop - x.sstart) + 1

        valid_hits = bps.items()
        for vh, length in valid_hits:
            rsize = sizes[vh]
            ratio = length * 100. / rsize
            if ratio >= 80:
                goodctg80.add(ctg)
            if ratio >= 50:
                goodctg50.add(ctg)

        # Chimer
        if len(valid_hits) > 1:
            chimers += 1

    # Pass 2, per reference: aligned bases summed over all contigs.
    blast = Blast(refsupermap)
    goodref80 = set()
    # NOTE(review): `goodref50` is collected but never reported below.
    goodref50 = set()
    bps = defaultdict(int)
    for x in blast.iter_line():
        bps[x.subject] += abs(x.sstop - x.sstart) + 1

    for vh, length in bps.items():
        rsize = sizes[vh]
        ratio = length * 100. / rsize
        if ratio >= 80:
            goodref80.add(vh)
        if ratio >= 50:
            goodref50.add(vh)

    # An empty reference set must not crash the final report.
    pct_ref80 = len(goodref80) * 100. / known_genes if known_genes else 0.

    report = (
        "Reference set: `{0}`, # of transcripts {1}".\
            format(reffasta, known_genes),
        "A total of {0} contigs map to 80% of a reference"\
            " transcript".format(len(goodctg80)),
        "A total of {0} contigs map to 50% of a reference"\
            " transcript".format(len(goodctg50)),
        "A total of {0} reference transcripts ({1:.1f}%) have 80% covered" \
            .format(len(goodref80), pct_ref80),
    )
    for line in report:
        sys.stderr.write(line + "\n")
def expand(args):
    """
    %prog expand bes.fasta reads.fastq

    Expand sequences using short reads. Useful, for example for getting BAC-end
    sequences. The template to use, in `bes.fasta` may just contain the junction
    sequences, then align the reads to get the 'flanks' for such sequences.
    """
    # NOTE: the docstring above doubles as the usage text given to
    # OptionParser, so its wording is part of the program's output.
    #
    # Returns the name of the annotated FASTA file that was written.
    import math

    from jcvi.formats.fasta import Fasta, SeqIO
    from jcvi.formats.fastq import readlen, first, fasta
    from jcvi.formats.blast import Blast
    from jcvi.formats.base import FileShredder
    from jcvi.apps.bowtie import align, get_samfile
    from jcvi.apps.align import blast

    p = OptionParser(expand.__doc__)
    p.set_depth(depth=200)
    p.set_firstN()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bes, reads = args
    size = Fasta(bes).totalsize
    rl = readlen([reads])
    # Template plus one read length of flank on either side.
    expected_size = size + 2 * rl
    nreads = expected_size * opts.depth / rl
    # Round the read count up to the next multiple of 1000.
    nreads = int(math.ceil(nreads / 1000.)) * 1000

    # Attract reads
    samfile, logfile = align([bes, reads, "--reorder", "--mapped",
                              "--firstN={0}".format(opts.firstN)])

    samfile, mapped, _ = get_samfile(reads, bes, bowtie=True, mapped=True)
    logging.debug("Extract first {0} reads from `{1}`.".format(nreads, mapped))

    pf = mapped.split(".")[0]
    pf = pf.split("-")[0]
    bespf = bes.split(".")[0]
    reads = pf + ".expand.fastq"
    first([str(nreads), mapped, "-o", reads])

    # Perform mini-assembly
    fastafile = reads.rsplit(".", 1)[0] + ".fasta"
    qualfile = ""
    if need_update(reads, fastafile):
        fastafile, qualfile = fasta([reads])

    contigs = op.join(pf, "454LargeContigs.fna")
    if need_update(fastafile, contigs):
        cmd = "runAssembly -o {0} -cpu 8 {1}".format(pf, fastafile)
        sh(cmd)
    assert op.exists(contigs), \
        "Assembly output `{0}` not found".format(contigs)

    # Annotate contigs: best template hit for every contig
    blastfile = blast([bes, contigs])
    mapping = {}
    for query, b in Blast(blastfile).iter_best_hit():
        mapping[query] = b

    f = Fasta(contigs, lazy=True)
    annotatedfasta = ".".join((pf, bespf, "fasta"))

    # Rank of each template in `bes` (first occurrence wins), so that records
    # can be ordered like the templates without a linear scan per record.
    rank = {}
    for i, name in enumerate(Fasta(bes).iterkeys_ordered()):
        rank.setdefault(name, i)

    recs = []
    for key, v in f.iteritems_ordered():
        vid = v.id
        if vid not in mapping:
            continue
        b = mapping[vid]
        subject = b.subject
        # Orient every contig like the template it matches.
        rec = v.reverse_complement() if b.orientation == '-' else v
        rec.id = rid = "_".join((pf, vid, subject))
        rec.description = ""
        recs.append((rank[subject], rid, rec))

    recs = [x[-1] for x in sorted(recs)]
    # Open the output only once there is something to write, and make sure
    # the handle is closed even if writing fails.
    with open(annotatedfasta, "w") as fw:
        SeqIO.write(recs, fw, "fasta")

    FileShredder([samfile, logfile, mapped, reads, fastafile, qualfile,
                  blastfile, pf])
    logging.debug("Annotated seqs (n={0}) written to `{1}`.".\
                  format(len(recs), annotatedfasta))

    return annotatedfasta