def synfind(args):
    """
    %prog synfind all.last *.bed

    Prepare input for SynFind.
    """
    # NOTE: the docstring above doubles as the usage text handed to
    # OptionParser, so additional documentation is kept in comments.
    #
    # Outputs (both written to disk):
    #   <lastfile>.filtered  - the LAST hits with self-hits removed
    #   all.bed              - all BED records, seqids prefixed per input file
    p = OptionParser(synfind.__doc__)
    opts, args = p.parse_args(args)

    # One LAST file followed by at least one BED file is required.
    if len(args) < 2:
        sys.exit(not p.print_help())

    lastfile = args[0]
    bedfiles = args[1:]

    # Copy hits over, skipping those where query and subject are identical.
    filteredlast = lastfile + ".filtered"
    with open(lastfile) as fp, open(filteredlast, "w") as fw:
        for row in fp:
            b = BlastLine(row)
            if b.query == b.subject:
                continue
            print(b, file=fw)
    logging.debug("Filtered LAST file written to `{0}`".format(filteredlast))

    # Merge the BED files; the i-th file gets the i-th letter counted from 'A'
    # prepended to every seqid so that seqids from different inputs differ.
    allbed = "all.bed"
    with open(allbed, "w") as fw:
        for i, bedfile in enumerate(bedfiles):
            prefix = chr(ord('A') + i)
            for b in Bed(bedfile):
                b.seqid = prefix + b.seqid
                print(b, file=fw)
    logging.debug("Bed file written to `{0}`".format(allbed))
def synfind(args):
    """
    %prog synfind all.last *.bed

    Prepare input for SynFind.
    """
    # NOTE: the docstring is also the usage string given to OptionParser;
    # implementation notes therefore live in comments.
    p = OptionParser(synfind.__doc__)
    opts, args = p.parse_args(args)

    # Expect the LAST file plus one or more BED files.
    if len(args) < 2:
        sys.exit(not p.print_help())

    lastfile, bedfiles = args[0], args[1:]

    # Step 1: write `<lastfile>.filtered`, omitting self-hits.
    # `with` guarantees both handles are closed even if a row fails to parse.
    filteredlast = lastfile + ".filtered"
    with open(lastfile) as fp, open(filteredlast, "w") as fw:
        for row in fp:
            b = BlastLine(row)
            if b.query == b.subject:
                continue
            print(b, file=fw)
    logging.debug("Filtered LAST file written to `{0}`".format(filteredlast))

    # Step 2: write `all.bed`, prefixing each record's seqid with a letter
    # derived from the position of its BED file on the command line.
    allbed = "all.bed"
    with open(allbed, "w") as fw:
        for i, bedfile in enumerate(bedfiles):
            prefix = chr(ord('A') + i)
            for b in Bed(bedfile):
                b.seqid = prefix + b.seqid
                print(b, file=fw)
    logging.debug("Bed file written to `{0}`".format(allbed))
def read_blast(blast_file, qorder, sorder, is_self=False):
    """
    Read a BLAST file and annotate each retained hit with order coordinates.

    Args:
        blast_file: path of the BLAST file; each line is parsed by BlastLine.
        qorder: mapping from query name to an `(index, record)` pair, where
            the record exposes `.seqid`.
        sorder: same kind of mapping for subject names.
        is_self: when true, hits whose query index is greater than the
            subject index are flipped so every pair lies on the same side.

    Returns:
        A list of BlastLine objects, in file order, each with `qseqid`,
        `sseqid`, `qi` and `si` set. Hits whose query or subject is missing
        from the order mappings are skipped, and only the first hit for each
        (query, subject) pair is kept. `b.query` / `b.subject` are left as
        parsed.
    """
    filtered_blast = []
    seen = set()
    with open(blast_file) as fp:
        for row in fp:
            b = BlastLine(row)
            query, subject = b.query, b.subject
            if query not in qorder or subject not in sorder:
                continue

            qi, q = qorder[query]
            si, s = sorder[subject]

            if is_self and qi > si:
                # remove redundant a<->b to one side when doing self-self BLAST
                query, subject = subject, query
                qi, si = si, qi
                q, s = s, q

            # De-duplicate on the normalized pair: taking the key before the
            # swap would let both a->b and b->a through in self-self mode.
            key = query, subject
            if key in seen:
                continue
            seen.add(key)

            b.qseqid, b.sseqid = q.seqid, s.seqid
            b.qi, b.si = qi, si

            filtered_blast.append(b)

    return filtered_blast
def dedup(args):
    """
    %prog dedup assembly.assembly.blast assembly.fasta

    Remove duplicate contigs within assembly.
    """
    # NOTE: the docstring is passed to OptionParser as the usage text.
    #
    # Prints (one per line, sorted) the names of contigs to remove: a query
    # is removed when a hit spans at least `pctcov` percent of it and the
    # query sorts before the subject by (size, name).
    from jcvi.formats.blast import BlastLine

    p = OptionParser(dedup.__doc__)
    p.set_align(pctid=0, pctcov=98)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, fastafile = args
    cov = opts.pctcov / 100.0
    sizes = Sizes(fastafile).mapping

    removed = set()
    with open(blastfile) as fp:
        for row in fp:
            b = BlastLine(row)
            query, subject = b.query, b.subject
            if query == subject:
                continue

            qsize, ssize = sizes[query], sizes[subject]
            # NOTE(review): the span is stop - start with no "+ 1"; if the
            # coordinates are inclusive this undercounts by one base.
            # Left unchanged here - confirm the intended convention.
            qspan = abs(b.qstop - b.qstart)
            if qspan < qsize * cov:
                continue

            # Of a duplicated pair, drop the smaller contig; names break ties
            # so that exactly one member of an equal-sized pair is removed.
            if (qsize, query) < (ssize, subject):
                removed.add(query)

    print("\n".join(sorted(removed)))
def make_arrays(blastfile, qpadbed, spadbed, qpadnames, spadnames):
    """
    Score every (query pad, subject pad) cell by how surprising its hit count is.

    Two intermediate (m x n) matrices are built, `observed` (number of BLAST
    hits falling in each cell) and `expected` (total hits scaled by the
    product of the two pad lengths over the total area), and the function
    returns `logmp`, where each cell is
    ``max(-ln(max(poisson.pmf(observed, expected), 1e-250)), 0)``.

    Args:
        blastfile: path of the BLAST file, one hit per line.
        qpadbed, spadbed: bed-like objects providing `.order`, `.sub_beds()`
            and `len()`.
        qpadnames, spadnames: pad names defining the row / column order.

    Returns:
        numpy array of shape (len(qpadnames), len(spadnames)).

    Raises:
        AssertionError: if the pad lengths do not add up to the bed sizes or
            the matrices do not sum to the number of hits read.
        KeyError: if a hit refers to a name missing from the order mappings.
    """
    m, n = len(qpadnames), len(spadnames)
    qpadorder, spadorder = qpadbed.order, spadbed.order
    qpadid = dict((a, i) for i, a in enumerate(qpadnames))
    spadid = dict((a, i) for i, a in enumerate(spadnames))
    qpadlen = dict((a, len(b)) for a, b in qpadbed.sub_beds())
    spadlen = dict((a, len(b)) for a, b in spadbed.sub_beds())

    qsize, ssize = len(qpadbed), len(spadbed)

    assert sum(qpadlen.values()) == qsize
    assert sum(spadlen.values()) == ssize

    # Populate arrays of observed counts and expected counts
    logging.debug("Initialize array of size ({0} x {1})".format(m, n))
    observed = np.zeros((m, n))
    all_dots = 0
    with open(blastfile) as fp:
        for row in fp:
            b = BlastLine(row)
            qi, q = qpadorder[b.query]
            si, s = spadorder[b.subject]
            qsi, ssi = qpadid[q.seqid], spadid[s.seqid]
            observed[qsi, ssi] += 1
            all_dots += 1

    assert int(round(observed.sum())) == all_dots

    logging.debug("Total area: {0} x {1}".format(qsize, ssize))
    S = qsize * ssize
    expected = np.zeros((m, n))
    for i, a in enumerate(qpadnames):
        alen = qpadlen[a]
        for j, b in enumerate(spadnames):
            blen = spadlen[b]
            expected[i, j] = all_dots * alen * blen * 1.0 / S

    assert int(round(expected.sum())) == all_dots

    # Calculate the statistical significance for each cell.  A single
    # vectorized pmf call replaces m * n scalar scipy calls.
    from scipy.stats.distributions import poisson

    pois = np.maximum(poisson.pmf(observed, expected), 1e-250)  # Underflow
    logmp = np.maximum(-np.log(pois), 0)

    return logmp
def cyntenator(args):
    """
    %prog cyntenator athaliana.athaliana.last athaliana.bed

    Prepare input for Cyntenator.
    """
    # NOTE: the docstring is passed to OptionParser as the usage text.
    #
    # Steps: (1) write `<lastfile>.blast` with query, subject and score of
    # every non-self hit; (2) convert each BED file with write_txt();
    # (3) register one cyntenator command per converted file with MakeManager.
    p = OptionParser(cyntenator.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    lastfile = args[0]
    bedfiles = args[1:]
    filteredlastfile = lastfile + ".blast"

    # Single pass over the LAST file: remember the subject of the very first
    # line (used below to pick the db) while writing the filtered table.
    subject = None
    with open(lastfile) as fp, open(filteredlastfile, "w") as fw:
        for row in fp:
            b = BlastLine(row)
            if subject is None:
                subject = b.subject
            if b.query == b.subject:
                continue
            print("\t".join((b.query, b.subject, str(b.score))), file=fw)

    if subject is None:
        logging.error("No hits found in `{0}`".format(lastfile))
        sys.exit(1)

    db = None
    txtfiles = []
    for bedfile in bedfiles:
        order = Bed(bedfile).order
        if subject in order:
            # The db name is the BED basename up to the first dot, capped at
            # 20 characters.
            db = op.basename(bedfile).split(".")[0][:20]
            logging.debug("Found db: {0}".format(db))
        txtfile = write_txt(bedfile)
        txtfiles.append(txtfile)

    # Previously `db` was simply unbound here when no BED file contained the
    # subject; fail with an explicit message instead.
    if db is None:
        logging.error(
            "Subject `{0}` not found in any of the BED files".format(subject))
        sys.exit(1)

    db += ".txt"
    mm = MakeManager()
    for txtfile in txtfiles:
        outfile = txtfile + ".alignment"
        cmd = 'cyntenator -t "({0} {1})" -h blast {2} > {3}'\
                .format(txtfile, db, filteredlastfile, outfile)
        mm.add((txtfile, db, filteredlastfile), outfile, cmd)
    mm.write()
def read_blast(blast_file, qorder, sorder, is_self=False, ostrip=True):
    """
    Read a BLAST file and annotate each retained hit with order coordinates.

    Args:
        blast_file: path of the BLAST file; each line is parsed by BlastLine.
        qorder: mapping from query name to an `(index, record)` pair, where
            the record exposes `.seqid`.
        sorder: same kind of mapping for subject names.
        is_self: when true, hits whose query index is greater than the
            subject index are flipped so every pair lies on the same side.
        ostrip: when true, names are passed through gene_name() before the
            lookup.

    Returns:
        A list of BlastLine objects, in file order, with `qseqid`, `sseqid`,
        `qi`, `si`, `query` and `subject` updated. Self-hits, hits with names
        missing from the order mappings, and repeated (query, subject) pairs
        are dropped.
    """
    filtered_blast = []
    seen = set()
    with open(blast_file) as fp:
        for row in fp:
            b = BlastLine(row)
            query, subject = b.query, b.subject
            # The self-hit test is made on the raw names, before stripping.
            if query == subject:
                continue
            if ostrip:
                query, subject = gene_name(query), gene_name(subject)
            if query not in qorder or subject not in sorder:
                continue

            qi, q = qorder[query]
            si, s = sorder[subject]

            if is_self and qi > si:
                # remove redundant a<->b to one side when doing self-self BLAST
                query, subject = subject, query
                qi, si = si, qi
                q, s = s, q

            # Keyed after the swap so a->b and b->a collapse to one entry.
            key = query, subject
            if key in seen:
                continue
            seen.add(key)

            b.qseqid, b.sseqid = q.seqid, s.seqid
            b.qi, b.si = qi, si
            b.query, b.subject = query, subject

            filtered_blast.append(b)

    logging.debug("A total of {0} BLAST imported from `{1}`.".\
                  format(len(filtered_blast), blast_file))

    return filtered_blast
def bes(args):
    """
    %prog bes bacfasta clonename

    Use the clone name to download BES gss sequences from Genbank, map and then
    visualize.
    """
    # NOTE: the docstring is passed to OptionParser as the usage text.
    from jcvi.apps.align import run_blat

    p = OptionParser(bes.__doc__)
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bacfasta, clonename = args

    entrez([clonename, "--database=nucgss", "--skipcheck"])
    besfasta = clonename + ".fasta"
    blatfile = clonename + ".bes.blat"
    run_blat(
        infile=besfasta,
        outfile=blatfile,
        db=bacfasta,
        pctid=95,
        hitlen=100,
        cpus=opts.cpus,
    )

    # Only the first record of the BAC fasta is used as the reference.
    aid, asize = next(Fasta(bacfasta).itersizes())

    # Header: a ruler of `width` characters standing for the whole sequence.
    width = 50
    msg = "=" * width
    msg += " " + aid
    print(msg, file=sys.stderr)

    ratio = width * 1.0 / asize

    def scale(x):
        # Map a sequence coordinate onto a column of the text ruler.
        return int(round(x * ratio, 0))

    with open(blatfile) as fp:
        blasts = [BlastLine(x) for x in fp]

    for b in blasts:
        forward = b.orientation == "+"
        # Draw an arrow at the scaled hit position, pointing along the
        # orientation of the hit.
        if forward:
            msg = " " * scale(b.sstart) + "->"
        else:
            msg = " " * (scale(b.sstop) - 2) + "<-"
        # Pad so the labels line up to the right of the ruler.
        msg += " " * (width - len(msg) + 2)
        msg += b.query
        # "hang" is the distance from the hit to the sequence end it faces
        # away from: the start for "+" hits, the far end otherwise.
        if forward:
            msg += " (hang={0})".format(b.sstart - 1)
        else:
            msg += " (hang={0})".format(asize - b.sstop)

        print(msg, file=sys.stderr)
def iadhore(args):
    """
    %prog iadhore athaliana.athaliana.last athaliana.bed

    Wrap around iADHoRe.
    """
    # NOTE: the docstring is passed to OptionParser as the usage text.
    #
    # Writes two files in the current directory:
    #   blast_table.txt - unique, unordered gene pairs from the LAST file
    #   config.txt      - one genome stanza per BED file plus fixed settings
    p = OptionParser(iadhore.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    lastfile = args[0]
    bedfiles = args[1:]
    blast_table = "blast_table.txt"

    # Collect each pair once, regardless of direction (a, b) vs (b, a).
    seen = set()
    with open(lastfile) as fp:
        for row in fp:
            c = BlastLine(row)
            a, b = gene_name(c.query), gene_name(c.subject)
            if a > b:
                a, b = b, a
            seen.add((a, b))

    # Sorted so that the table is identical from run to run; plain set
    # iteration order is not stable across interpreter invocations.
    with open(blast_table, "w") as fw:
        for a, b in sorted(seen):
            print("\t".join((a, b)), file=fw)
    logging.debug("A total of {0} pairs written to `{1}`"\
                    .format(len(seen), blast_table))

    settings = (
        "blast_table={0}".format(blast_table),
        "cluster_type=colinear",
        "tandem_gap=10",
        "prob_cutoff=0.001",
        "gap_size=20",
        "cluster_gap=20",
        "q_value=0.9",
        "anchor_points=4",
        "alignment_method=gg2",
        "max_gaps_in_alignment=20",
        "output_path=i-adhore_out",
        "number_of_threads=4",
    )

    with open("config.txt", "w") as fw:
        for bedfile in bedfiles:
            pf, stanza = write_lst(bedfile)
            print("genome={0}".format(pf), file=fw)
            for seqid, fname in stanza:
                print(" ".join((seqid, fname)), file=fw)
            # Blank line terminates the genome stanza.
            print(file=fw)

        for line in settings:
            print(line, file=fw)
def blast(args):
    """
    %prog blast allfasta clonename

    Insert a component into agpfile by aligning to the best hit in pool and see
    if they have good overlaps.
    """
    # NOTE: the docstring is passed to OptionParser as the usage text.
    from jcvi.apps.command import run_megablast

    p = OptionParser(blast.__doc__)
    p.add_option("-n", type="int", default=2,
            help="Take best N hits [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    allfasta, clonename = args
    fastadir = "fasta"
    infile = op.join(fastadir, clonename + ".fasta")
    # Download the clone only if it is not already in the fasta directory.
    if not op.exists(infile):
        fetch([clonename, "--skipcheck", "--outdir=" + fastadir])

    outfile = "{0}.{1}.blast".format(clonename, allfasta.split(".")[0])
    run_megablast(infile=infile, outfile=outfile, db=allfasta,
            pctid=GoodPct, hitlen=GoodOverlap)

    # Stream the hits and stop as soon as N distinct subjects are collected,
    # rather than parsing the whole file up front.
    besthits = []
    with open(outfile) as fp:
        for row in fp:
            b = BlastLine(row)

            # For pipe-delimited identifiers with at least four fields, keep
            # the fourth field.
            if b.query.count("|") >= 3:
                b.query = b.query.split("|")[3]

            if b.subject.count("|") >= 3:
                b.subject = b.subject.split("|")[3]

            # Drop the part after the last dot.
            b.query = b.query.rsplit(".", 1)[0]
            b.subject = b.subject.rsplit(".", 1)[0]

            if b.query == b.subject:
                continue

            if b.subject not in besthits:
                besthits.append(b.subject)
            if len(besthits) == opts.n:
                break

    for b in besthits:
        overlap([clonename, b, "--dir=" + fastadir])
def blast(args):
    """
    %prog blast fastafile

    Run BLASTN against database (default is UniVec_Core). Output .bed format
    on the vector/contaminant ranges.
    """
    # NOTE: the docstring is passed to OptionParser as the usage text.
    # Returns the name of the BED file that was written.
    p = OptionParser(blast.__doc__)
    p.add_option("--dist", dest="dist", default=100, type="int",
            help="Merge adjacent HSPs separated by [default: %default]")
    p.add_option("--db", dest="db", default=None,
            help="Use a different database rather than UniVec_Core")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    fastaprefix = fastafile.split(".", 1)[0]

    univec = opts.db or download(
            "ftp://ftp.ncbi.nih.gov/pub/UniVec/UniVec_Core")
    uniprefix = univec.split(".", 1)[0]
    fastablast = fastaprefix + ".{0}.blast".format(uniprefix)

    # A user-supplied database is searched with megablast; the default
    # database goes through run_vecscreen.
    prog = run_megablast if opts.db else run_vecscreen
    prog(infile=fastafile, outfile=fastablast, db=univec, pctid=95, hitlen=50)

    with open(fastablast) as fp:
        ranges = []
        for row in fp:
            b = BlastLine(row)
            ranges.append((b.query, b.qstart, b.qstop))

    merged_ranges = range_merge(ranges, dist=opts.dist)
    bedfile = fastaprefix + ".{0}.bed".format(uniprefix)
    fw = must_open(bedfile, "w")
    try:
        for seqid, start, end in merged_ranges:
            # start - 1: the BED line is written with a start one below the
            # merged range's start.
            print("\t".join(
                str(x) for x in (seqid, start - 1, end, uniprefix)), file=fw)
    finally:
        # Close before returning so the caller sees a fully flushed file.
        fw.close()

    return bedfile
def BlastOrCoordsLine(filename, filter="ref", dialect="blast", clip=0):
    """
    Yield one Range per alignment line of a BLAST or coords file.

    Args:
        filename: path of the input file. Lines starting with '#' are skipped.
        filter: "ref" to report the subject/reference interval of each line,
            "query" to report the query interval.
        dialect: "blast" to parse lines with BlastLine, "coords" to parse with
            CoordsLine (lines on which CoordsLine raises AssertionError are
            skipped).
        clip: if non-zero, shrink each interval on both sides by
            `min(0.05 * length, clip)`.

    Yields:
        Range(name, start, end, score, i) with start <= end, where `i` is the
        zero-based line index in the file (skipped lines are counted).

    Raises:
        AssertionError: on first iteration, if `filter` or `dialect` is not
            one of the allowed values.
    """
    allowed_filters = ("ref", "query")
    allowed_dialects = ("blast", "coords")

    assert filter in allowed_filters
    assert dialect in allowed_dialects

    use_query = filter == "query"
    is_blast = dialect == "blast"

    # The `with` block closes the file when the generator is exhausted or
    # closed/garbage-collected early.
    with open(filename) as fp:
        for i, row in enumerate(fp):
            if row[0] == '#':
                continue

            if is_blast:
                b = BlastLine(row)
                if use_query:
                    query, start, end = b.query, b.qstart, b.qstop
                else:
                    query, start, end = b.subject, b.sstart, b.sstop
            else:
                try:
                    b = CoordsLine(row)
                except AssertionError:
                    continue

                if use_query:
                    query, start, end = b.query, b.start2, b.end2
                else:
                    query, start, end = b.ref, b.start1, b.end1

            # Normalize so that start <= end.
            if start > end:
                start, end = end, start

            if clip:
                # clip cannot be more than 5% of the range
                r = end - start + 1
                cc = min(.05 * r, clip)
                # NOTE(review): when the 5% bound wins, `cc` is a float and so
                # are the yielded coordinates - confirm this is intended.
                start = start + cc
                end = end - cc

            yield Range(query, start, end, b.score, i)
def main(blast_file, cds_file, bed_file, N=3):
    """
    Group genes connected by BLAST hits that lie close together on a sequence.

    Two genes are joined when a hit between them covers at least half
    (integer division) of the shorter CDS, both sit on the same seqid, and
    their positions in the bed order differ by at most `N`. Every group of
    two or more genes is printed to stdout as a comma-separated line, and a
    short summary is written to stderr.

    Args:
        blast_file: path of the BLAST file, one hit per line.
        cds_file: FASTA file providing the CDS sizes, keyed by the names used
            in the BLAST file.
        bed_file: BED file providing gene locations, keyed by gene_name().
        N: maximum distance, in bed-order positions, between joined genes.
    """
    # get the sizes for the CDS first
    f = Fasta(cds_file)
    sizes = dict(f.itersizes())

    # retrieve the locations
    bed = Bed(bed_file).order

    # filter the blast file
    g = Grouper()
    with open(blast_file) as fp:
        for row in fp:
            b = BlastLine(row)
            query_len = sizes[b.query]
            subject_len = sizes[b.subject]
            # `//` keeps the integer-division threshold of the original
            # Python 2 expression `min(...) / 2`.
            if b.hitlen < min(query_len, subject_len) // 2:
                continue

            query, subject = gene_name(b.query), gene_name(b.subject)
            qi, q = bed[query]
            si, s = bed[subject]

            if q.seqid == s.seqid and abs(qi - si) <= N:
                g.join(query, subject)

    # dump the grouper
    ngenes, nfamilies = 0, 0
    families = []
    for group in sorted(g):
        if len(group) >= 2:
            members = sorted(group)
            print(",".join(members))
            ngenes += len(members)
            nfamilies += 1
            families.append(members)

    # max() raises on an empty list; report an empty family instead.
    longest_family = max(families, key=len) if families else []

    # generate reports
    print("Proximal paralogues (dist=%d):" % N, file=sys.stderr)
    print("Total %d genes in %d families" % (ngenes, nfamilies),
          file=sys.stderr)
    print("Longest families (%d): %s" % (
        len(longest_family), ",".join(longest_family)), file=sys.stderr)
def bed(args):
    """
    %prog bed btabfile

    Convert btab to bed format.
    """
    # NOTE: the docstring is passed to OptionParser as the usage text.
    from jcvi.formats.blast import BlastLine

    p = OptionParser(bed.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    btabfile, = args
    # Each btab record is re-parsed as a BlastLine, whose `bedline` is then
    # printed to stdout.
    for b in Btab(btabfile):
        bline = BlastLine(b.blastline)
        print(bline.bedline)
def overlap(args):
    """
    %prog overlap <a|a.fasta> <b|b.fasta>

    Check overlaps between two fasta records. The arguments can be genBank IDs
    instead of FASTA files. In case of IDs, the sequences will be downloaded
    first.
    """
    # NOTE: the docstring is passed to OptionParser as the usage text.
    # Returns the Overlap object for the best HSP, or None if nothing matched.
    from jcvi.apps.command import BLPATH
    from jcvi.formats.blast import chain_HSPs

    p = OptionParser(overlap.__doc__)
    p.add_option("--dir", default=os.getcwd(),
            help="Download sequences to dir [default: %default]")
    p.add_option("--qreverse", default=False, action="store_true",
            help="Reverse seq a [default: %default]")
    p.add_option("--nochain", default=False, action="store_true",
            help="Do not chain adjacent HSPs [default: chain HSPs]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    afasta, bfasta = args
    outdir = opts.dir
    chain = not opts.nochain

    def resolve(name):
        # Check first whether it is file or accession name: an existing path
        # is used as is, anything else is looked up (and if needed fetched)
        # as `<dir>/<name>.fasta`.
        if op.exists(name):
            return name
        path = op.join(outdir, name + ".fasta")
        if not op.exists(path):  # Check to avoid redownload
            fetch([name, "--skipcheck", "--outdir=" + outdir])
        return path

    afasta = resolve(afasta)
    bfasta = resolve(bfasta)

    assert op.exists(afasta) and op.exists(bfasta)

    cmd = BLPATH("blastn")
    cmd += " -query {0} -subject {1}".format(afasta, bfasta)
    cmd += " -evalue 0.01 -outfmt 6 -perc_identity {0}".format(GoodPct)
    fp = popen(cmd)

    hsps = [BlastLine(x) for x in fp.readlines()]
    hsps = [x for x in hsps if x.hitlen >= GoodOverlap]
    dist = 2 * GoodOverlap  # Distance to chain the HSPs
    if chain:
        logging.debug("Chain HSPs in the Blast output.")
        hsps = chain_HSPs(hsps, xdist=dist, ydist=dist)

    if len(hsps) == 0:
        print("No match found.", file=sys.stderr)
        return None

    # The first HSP in the (possibly chained) list is taken as the best one.
    besthsp = hsps[0]

    # Sizes come from the first record of each FASTA file.
    aid, asize = next(Fasta(afasta).itersizes())
    bid, bsize = next(Fasta(bfasta).itersizes())
    o = Overlap(besthsp, asize, bsize)
    o.print_graphic(qreverse=opts.qreverse)
    print(str(o), file=sys.stderr)

    return o
def blastfilter_main(blast_file, p, opts):
    """
    Filter a BLAST file and write the survivors to `<blast_file>.filtered`.

    Hits are processed from highest to lowest score. Self-hits, hits whose
    names are missing from the bed orders, and repeated (query, subject)
    pairs are dropped; the optional cscore and local-duplicate (tandem)
    filters are then applied.

    Args:
        blast_file: path of the BLAST file, one hit per line.
        p: option parser, forwarded to check_beds().
        opts: parsed options; this function reads `tandem_Nmax`, `cscore`,
            `strip_names`, `tandems_only`, `qbed` and `sbed`.

    Side effects:
        Writes `<blast_file>.filtered`. When `opts.tandems_only` is set (and
        `tandem_Nmax` is non-zero), writes `.localdups` files and new bed
        files instead and terminates the process via sys.exit().
    """
    qbed, sbed, qorder, sorder, is_self = check_beds(blast_file, p, opts)

    tandem_Nmax = opts.tandem_Nmax
    cscore = opts.cscore

    # Parse in a single pass; the line count is simply the number of records.
    with open(blast_file) as fp:
        blasts = [BlastLine(line) for line in fp]
    total_lines = len(blasts)
    logging.debug("Load BLAST file `%s` (total %d lines)" % \
            (blast_file, total_lines))
    blasts.sort(key=lambda b: b.score, reverse=True)

    filtered_blasts = []
    seen = set()
    ostrip = opts.strip_names
    nwarnings = 0
    for b in blasts:
        query, subject = b.query, b.subject
        if query == subject:
            continue

        if ostrip:
            query, subject = gene_name(query), gene_name(subject)

        # Report names absent from the bed files; the query is checked first,
        # and at most 100 individual warnings are emitted.
        missing = None
        if query not in qorder:
            missing = (query, qbed.filename)
        elif subject not in sorder:
            missing = (subject, sbed.filename)
        if missing is not None:
            if nwarnings < 100:
                logging.warning("{0} not in {1}".format(*missing))
            elif nwarnings == 100:
                logging.warning("too many warnings.. suppressed")
            nwarnings += 1
            continue

        qi, q = qorder[query]
        si, s = sorder[subject]

        if is_self and qi > si:
            # move all hits to same side when doing self-self BLAST
            query, subject = subject, query
            qi, si = si, qi
            q, s = s, q

        # Because hits are sorted by score, the first occurrence of a pair is
        # its highest-scoring hit.
        key = query, subject
        if key in seen:
            continue
        seen.add(key)
        b.query, b.subject = key

        b.qi, b.si = qi, si
        b.qseqid, b.sseqid = q.seqid, s.seqid

        filtered_blasts.append(b)

    if cscore:
        before_filter = len(filtered_blasts)
        logging.debug("running the cscore filter (cscore>=%.2f) .." % cscore)
        filtered_blasts = list(filter_cscore(filtered_blasts, cscore=cscore))
        logging.debug("after filter (%d->%d) .." % (before_filter,
            len(filtered_blasts)))

    if tandem_Nmax:
        logging.debug("running the local dups filter (tandem_Nmax=%d) .." % \
                tandem_Nmax)

        qtandems = tandem_grouper(qbed, filtered_blasts,
                flip=True, tandem_Nmax=tandem_Nmax)
        standems = tandem_grouper(sbed, filtered_blasts,
                flip=False, tandem_Nmax=tandem_Nmax)

        # The .localdups files are only produced in tandems_only mode.
        qdups_fh = open(op.splitext(opts.qbed)[0] + ".localdups", "w") \
                if opts.tandems_only else None

        if is_self:
            # Self comparison: merge both groupings and share the mapping.
            for s in standems:
                qtandems.join(*s)
            qdups_to_mother = write_localdups(qtandems, qbed, qdups_fh)
            sdups_to_mother = qdups_to_mother
        else:
            qdups_to_mother = write_localdups(qtandems, qbed, qdups_fh)
            sdups_fh = open(op.splitext(opts.sbed)[0] + ".localdups", "w") \
                    if opts.tandems_only else None
            sdups_to_mother = write_localdups(standems, sbed, sdups_fh)

        if opts.tandems_only:
            # write out new .bed after tandem removal
            write_new_bed(qbed, qdups_to_mother)
            if not is_self:
                write_new_bed(sbed, sdups_to_mother)

            # just want to use this script as a tandem finder.
            sys.exit()

        before_filter = len(filtered_blasts)
        filtered_blasts = list(filter_tandem(filtered_blasts,
                qdups_to_mother, sdups_to_mother))
        logging.debug("after filter (%d->%d) .." % \
                (before_filter, len(filtered_blasts)))

    blastfilteredfile = blast_file + ".filtered"
    with open(blastfilteredfile, "w") as fw:
        write_new_blast(filtered_blasts, fh=fw)
def blastplot(
    ax,
    blastfile,
    qsizes,
    ssizes,
    qbed,
    sbed,
    style="dot",
    sampleN=None,
    baseticks=False,
    insetLabels=False,
    stripNames=False,
    highlights=None,
):
    """
    Draw a dot plot of the hits in `blastfile` onto `ax`.

    Args:
        ax: axes object receiving the plot.
        blastfile: path of the BLAST file, one hit per line.
        qsizes, ssizes: objects providing `get_position()`, `get_breaks()`,
            `totalsize` and `len()` for the x (query) and y (subject) axes.
        qbed, sbed: optional bed objects; when given, hit names are looked up
            in their `.order` and the bed record's seqid/start/end replace
            the BLAST coordinates. Hits with unknown names are skipped.
        style: one of DotStyles; "line", "circle" and "dot" are handled here.
        sampleN: if set and more hits than this were read, plot a random
            sample of that size.
        baseticks: draw denser, outward tick lines.
        insetLabels: put x labels inside the axes and omit y labels.
        stripNames: drop the part after the last "." of each name before the
            lookup.
        highlights: either a list of BedLine (highlighted along the subject
            axis) or a pair `(query_highlights, subject_highlights)`.

    Returns:
        None. If no hit could be placed, an error is logged and the function
        returns early without drawing.
    """
    # NOTE(review): `root` is used below but is neither a parameter nor
    # assigned in this function; it is presumably a module-level axes object -
    # confirm it is defined before this function is called.
    assert style in DotStyles

    qorder = qbed.order if qbed else None
    sorder = sbed.order if sbed else None

    data = []

    with open(blastfile) as fp:
        for row in fp:
            b = BlastLine(row)
            query, subject = b.query, b.subject

            if stripNames:
                query = query.rsplit(".", 1)[0]
                subject = subject.rsplit(".", 1)[0]

            if qorder:
                if query not in qorder:
                    continue
                qi, q = qorder[query]
                query = q.seqid
                qstart, qend = q.start, q.end
            else:
                qstart, qend = b.qstart, b.qstop

            if sorder:
                if subject not in sorder:
                    continue
                si, s = sorder[subject]
                subject = s.seqid
                sstart, send = s.start, s.end
            else:
                sstart, send = b.sstart, b.sstop

            # Translate (seqid, coordinate) into a position on each axis.
            qi = qsizes.get_position(query, qstart)
            qj = qsizes.get_position(query, qend)
            si = ssizes.get_position(subject, sstart)
            sj = ssizes.get_position(subject, send)

            if None in (qi, si):
                continue
            data.append(((qi, qj), (si, sj)))

    if sampleN:
        if len(data) > sampleN:
            data = sample(data, sampleN)

    if not data:
        return logging.error("no blast data imported")

    xsize, ysize = qsizes.totalsize, ssizes.totalsize
    logging.debug("xsize=%d ysize=%d" % (xsize, ysize))

    if style == "line":
        # One segment per hit, from its start to its end.
        for a, b in data:
            ax.plot(a, b, "ro-", mfc="w", mec="r", ms=3)
    else:
        # Point styles only use the start position of each hit.
        data = [(x[0], y[0]) for x, y in data]
        x, y = zip(*data)

        if style == "circle":
            ax.plot(x, y, "mo", mfc="w", mec="m", ms=3)
        elif style == "dot":
            ax.scatter(x, y, s=3, lw=0)

    xlim = (0, xsize)
    ylim = (ysize, 0)  # invert the y-axis

    xchr_labels, ychr_labels = [], []
    ignore = True  # tag to mark whether to plot chr name (skip small ones)
    ignore_size_x = ignore_size_y = 0

    # plot the chromosome breaks
    logging.debug("xbreaks={0} ybreaks={1}".format(len(qsizes), len(ssizes)))
    for (seqid, beg, end) in qsizes.get_breaks():
        ignore = abs(end - beg) < ignore_size_x
        if ignore:
            continue
        seqid = rename_seqid(seqid)

        xchr_labels.append((seqid, (beg + end) / 2, ignore))
        ax.plot([end, end], ylim, "-", lw=1, color="grey")

    for (seqid, beg, end) in ssizes.get_breaks():
        ignore = abs(end - beg) < ignore_size_y
        if ignore:
            continue
        seqid = rename_seqid(seqid)

        ychr_labels.append((seqid, (beg + end) / 2, ignore))
        ax.plot(xlim, [end, end], "-", lw=1, color="grey")

    # plot the chromosome labels
    for label, pos, ignore in xchr_labels:
        if not ignore:
            if insetLabels:
                ax.text(pos, 0, label, size=8, ha="center", va="top", color="grey")
            else:
                pos = 0.1 + pos * 0.8 / xsize
                root.text(
                    pos,
                    0.91,
                    label,
                    size=10,
                    ha="center",
                    va="bottom",
                    rotation=45,
                    color="grey",
                )

    # remember y labels are inverted
    for label, pos, ignore in ychr_labels:
        if not ignore:
            if insetLabels:
                continue
            pos = 0.9 - pos * 0.8 / ysize
            root.text(0.91, pos, label, size=10, va="center", color="grey")

    # Highlight regions based on a list of BedLine
    qhighlights = shighlights = None
    if highlights:
        if isinstance(highlights[0], BedLine):
            shighlights = highlights
        elif len(highlights) == 2:
            qhighlights, shighlights = highlights

    if qhighlights:
        for hl in qhighlights:
            hls = qsizes.get_position(hl.seqid, hl.start)
            ax.add_patch(Rectangle((hls, 0), hl.span, ysize, fc="r", alpha=0.2, lw=0))
    if shighlights:
        for hl in shighlights:
            hls = ssizes.get_position(hl.seqid, hl.start)
            ax.add_patch(Rectangle((0, hls), xsize, hl.span, fc="r", alpha=0.2, lw=0))

    if baseticks:

        def increaseDensity(a, ratio=4):
            # Rebuild the tick positions with a step `ratio` times smaller.
            assert len(a) > 1
            stepsize = a[1] - a[0]
            newstepsize = int(stepsize / ratio)
            return np.arange(0, a[-1], newstepsize)

        # Increase the density of the ticks
        xticks = ax.get_xticks()
        yticks = ax.get_yticks()
        xticks = increaseDensity(xticks, ratio=2)
        yticks = increaseDensity(yticks, ratio=2)
        ax.set_xticks(xticks)

        # Plot outward ticklines
        for pos in xticks[1:]:
            if pos > xsize:
                continue
            pos = 0.1 + pos * 0.8 / xsize
            root.plot((pos, pos), (0.08, 0.1), "-", color="grey", lw=2)

        for pos in yticks[1:]:
            if pos > ysize:
                continue
            pos = 0.9 - pos * 0.8 / ysize
            root.plot((0.09, 0.1), (pos, pos), "-", color="grey", lw=2)

    ax.set_xlim(xlim)
    ax.set_ylim(ylim)

    # beautify the numeric axis
    for tick in ax.get_xticklines() + ax.get_yticklines():
        tick.set_visible(False)

    set_human_base_axis(ax)

    plt.setp(ax.get_xticklabels() + ax.get_yticklabels(), color="gray", size=10)
    plt.setp(ax.get_yticklabels(), rotation=90)
def tandem_main(blast_file, cds_file, bed_file, N=3, P=50, is_self=True, \
    evalue=.01, strip_name=".", ofile=sys.stderr, genefam=False):
    """
    Group genes into families from BLAST hits and gene order.

    A hit is considered only if its length is at least `P` percent of the
    shorter of the two CDS. With `is_self`, two genes are joined when their
    bed-order positions differ by at most `N`, the hit e-value is at most
    `evalue`, and (unless `genefam`) they are on the same seqid. Without
    `is_self`, homolog groups are built from all hits passing the length and
    e-value filters; unless `genefam`, genes are then joined only if they are
    within `N` bed records on the same seqid, belong to the same homolog
    group, and their CDS lengths differ by no more than (100 - P) percent of
    the longer one.

    Args:
        blast_file: path of the BLAST file, one hit per line.
        cds_file: FASTA file providing CDS sizes.
        bed_file: BED file providing gene order.
        N: maximum distance in bed-order positions (overridden by 1e5 when
            `genefam` is set).
        P: percentage threshold used for the length filters.
        is_self: choose between the two grouping strategies above.
        evalue: maximum e-value of a usable hit.
        strip_name: forwarded to gene_name().
        ofile: destination handed to must_open() for the family lines.
        genefam: build gene families without the same-seqid restriction.

    Returns:
        List of families, each a sorted list of gene names with at least two
        members. A summary is also printed to stderr.
    """
    if genefam:
        N = 1e5

    # get the sizes for the CDS first
    f = Fasta(cds_file)
    sizes = dict(f.itersizes())

    # retrieve the locations
    bed = Bed(bed_file)
    order = bed.order

    def long_enough_hits():
        # Yield the hits that cover at least P percent of the shorter CDS;
        # the file is closed once the iteration ends.
        with open(blast_file) as fp:
            for row in fp:
                b = BlastLine(row)
                query_len = sizes[b.query]
                subject_len = sizes[b.subject]
                if b.hitlen < min(query_len, subject_len) * P / 100.:
                    continue
                yield b

    if is_self:
        # filter the blast file
        g = Grouper()
        for b in long_enough_hits():
            query = gene_name(b.query, strip_name)
            subject = gene_name(b.subject, strip_name)
            qi, q = order[query]
            si, s = order[subject]

            if abs(qi - si) <= N and b.evalue <= evalue:
                if genefam or q.seqid == s.seqid:
                    g.join(query, subject)

    else:
        homologs = Grouper()
        for b in long_enough_hits():
            if b.evalue > evalue:
                continue

            query = gene_name(b.query, strip_name)
            subject = gene_name(b.subject, strip_name)
            homologs.join(query, subject)

        if genefam:
            g = homologs
        else:
            g = Grouper()
            for i, atom in enumerate(bed):
                for x in range(1, N + 1):
                    j = i - x
                    # Stop at the start of the bed: a negative index would
                    # wrap around to records at the end of the list.
                    if j < 0:
                        break
                    neighbor = bed[j]
                    if neighbor.seqid != atom.seqid:
                        continue
                    if not homologs.joined(neighbor.accn, atom.accn):
                        continue

                    leni = sizes[atom.accn]
                    lenx = sizes[neighbor.accn]
                    if abs(leni - lenx) > max(leni, lenx) * (1 - P / 100.):
                        continue
                    g.join(neighbor.accn, atom.accn)

    # dump the grouper
    # NOTE(review): the handle is not closed here because `ofile` defaults to
    # sys.stderr; confirm how must_open() treats that default.
    fw = must_open(ofile, "w")
    ngenes, nfamilies = 0, 0
    families = []
    for group in sorted(g):
        if len(group) >= 2:
            members = sorted(group)
            print(",".join(members), file=fw)
            ngenes += len(members)
            nfamilies += 1
            families.append(members)

    # max() raises on an empty list; report an empty family instead.
    longest_family = max(families, key=len) if families else []

    # generate reports
    print("Proximal paralogues (dist=%d):" % N, file=sys.stderr)
    print("Total %d genes in %d families" % (ngenes, nfamilies),
          file=sys.stderr)
    print("Longest families (%d): %s" % (
        len(longest_family), ",".join(longest_family)), file=sys.stderr)

    return families
def overlap(args):
    """
    %prog overlap <a|a.fasta> <b|b.fasta>

    Check overlaps between two fasta records. The arguments can be genBank IDs
    instead of FASTA files. In case of IDs, the sequences will be downloaded
    first.
    """
    # NOTE: the docstring is passed to OptionParser as the usage text.
    # Returns the Overlap object for the best HSP, or None if nothing matched.
    from jcvi.formats.blast import chain_HSPs

    p = OptionParser(overlap.__doc__)
    p.add_option("--dir", default=os.getcwd(),
                 help="Download sequences to dir [default: %default]")
    p.add_option("--suffix", default="fasta",
                 help="Suffix of the sequence file in dir [default: %default]")
    p.add_option("--qreverse", default=False, action="store_true",
                 help="Reverse seq a [default: %default]")
    p.add_option("--nochain", default=False, action="store_true",
                 help="Do not chain adjacent HSPs [default: chain HSPs]")
    p.set_align(pctid=GoodPct, hitlen=GoodOverlap, evalue=.01)
    p.set_outfile(outfile=None)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    afasta, bfasta = args
    outdir = opts.dir
    chain = not opts.nochain
    suffix = opts.suffix
    evalue = opts.evalue
    pctid = opts.pctid
    hitlen = opts.hitlen
    cutoff = Cutoff(pctid, hitlen)

    def resolve(name):
        # Check first whether it is file or accession name: an existing path
        # is used as is, anything else is looked up (and if needed fetched)
        # as `<dir>/<name>.<suffix>`.
        if op.exists(name):
            return name
        path = op.join(outdir, ".".join((name, suffix)))
        if not op.exists(path):  # Check to avoid redownload
            entrez([name, "--skipcheck", "--outdir=" + outdir])
        return path

    afasta = resolve(afasta)
    bfasta = resolve(bfasta)

    assert op.exists(afasta) and op.exists(bfasta)

    cmd = "blastn -dust no"
    cmd += " -query {0} -subject {1}".format(afasta, bfasta)
    cmd += " -evalue {0} -outfmt 6 -perc_identity {1}".format(evalue, pctid)
    fp = popen(cmd)

    hsps = [BlastLine(x) for x in fp.readlines()]
    hsps = [x for x in hsps if x.hitlen >= hitlen]
    if chain:
        logging.debug("Chain HSPs in the Blast output.")
        dist = 2 * hitlen  # Distance to chain the HSPs
        hsps = chain_HSPs(hsps, xdist=dist, ydist=dist)

    if len(hsps) == 0:
        print("No match found.", file=sys.stderr)
        return None

    # The first HSP in the (possibly chained) list is taken as the best one.
    besthsp = hsps[0]

    # Sizes come from the first record of each FASTA file.
    aid, asize = next(Fasta(afasta).itersizes())
    bid, bsize = next(Fasta(bfasta).itersizes())
    o = Overlap(besthsp, asize, bsize, cutoff, qreverse=opts.qreverse)
    o.print_graphic()

    if opts.outfile:
        fw = must_open(opts.outfile, "w")
        try:
            print(str(o), file=fw)
        finally:
            fw.close()

    return o