def readlen(args):
    """
    %prog readlen fastqfile

    Calculate read length, will only try the first N reads. Output min, max,
    and avg for each file.
    """
    p = OptionParser(readlen.__doc__)
    p.set_firstN()
    p.add_option("--silent", default=False, action="store_true",
                 help="Do not print read length stats")
    opts, args = p.parse_args(args)
    if len(args) != 1:
        sys.exit(not p.print_help())

    f, = args
    if not is_fastq(f):
        logging.debug("File `{0}` does not end with .fastq or .fq".format(f))
        return 0

    s = calc_readlen(f, opts.firstN)
    if not opts.silent:
        print("\t".join(str(x) for x in (f, s.min, s.max, s.mean, s.median)))

    return int(s.max)
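# A minimal usage sketch (the file name is hypothetical; the --firstN flag is
# assumed to be added by p.set_firstN() and is not shown in this module):
#
#   maxlen = readlen(["sample.fastq"])             # prints "sample.fastq  min  max  mean  median"
#   maxlen = readlen(["sample.fastq", "--silent"]) # no stats printed, only int(max) returned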
def links(args):
    """
    %prog links url

    Extract all the links "<a href=''>" from web page.
    """
    p = OptionParser(links.__doc__)
    p.add_option("--img", default=False, action="store_true",
                 help="Extract <img> tags [default: %default]")
    opts, args = p.parse_args(args)
    if len(args) != 1:
        sys.exit(not p.print_help())

    url, = args
    img = opts.img
    htmlfile = download(url)
    page = open(htmlfile).read()
    soup = BeautifulSoup(page)

    tag = 'img' if img else 'a'
    src = 'src' if img else 'href'
    aa = soup.findAll(tag)
    for a in aa:
        link = a.get(src)
        link = urljoin(url, link)
        print(link)
def unitigs(args):
    """
    %prog unitigs best.edges

    Read Celera Assembler's "best.edges" and extract all unitigs.
    """
    p = OptionParser(unitigs.__doc__)
    p.add_option("--maxerr", default=2, type="int", help="Maximum error rate")
    opts, args = p.parse_args(args)
    if len(args) != 1:
        sys.exit(not p.print_help())

    bestedges, = args
    G = read_graph(bestedges, maxerr=opts.maxerr, directed=True)
    H = nx.Graph()
    intconv = lambda x: int(x.split("-")[0])
    for k, v in G.items():
        if k == G.get(v, None):
            H.add_edge(intconv(k), intconv(v))

    nunitigs = nreads = 0
    for h in nx.connected_component_subgraphs(H, copy=False):
        st = [x for x in h if h.degree(x) == 1]
        if len(st) != 2:
            continue
        src, target = st
        path = list(nx.all_simple_paths(h, src, target))
        assert len(path) == 1
        path, = path
        print("|".join(str(x) for x in path))
        nunitigs += 1
        nreads += len(path)

    logging.debug("A total of {0} unitigs built from {1} reads.".
                  format(nunitigs, nreads))
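# Illustration of the mutual-best test above, with made-up read ids (not from
# any real best.edges file): an undirected edge is kept only when each read's
# best overlap points back at the other read.
#
#   G = {"12-5p": "34-3p", "34-3p": "12-5p", "56-5p": "12-5p"}
#   # 12 <-> 34 are mutual best  -> edge (12, 34) is added to H
#   # 56 -> 12 is one-directional -> no edge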
def trim(args):
    """
    %prog trim fastqfile

    Wraps `fastx_trimmer` to trim from the beginning or end of reads.
    """
    p = OptionParser(trim.__doc__)
    p.add_option("-f", dest="first", default=0, type="int",
                 help="First base to keep. Default is 1.")
    p.add_option("-l", dest="last", default=0, type="int",
                 help="Last base to keep. Default is entire read.")
    opts, args = p.parse_args(args)
    if len(args) != 1:
        sys.exit(not p.print_help())

    fastqfile, = args
    obfastqfile = op.basename(fastqfile)
    fq = obfastqfile.rsplit(".", 1)[0] + ".ntrimmed.fastq"
    if fastqfile.endswith(".gz"):
        fq = obfastqfile.rsplit(".", 2)[0] + ".ntrimmed.fastq.gz"

    cmd = "fastx_trimmer -Q33 "
    if opts.first:
        cmd += "-f {0.first} ".format(opts)
    if opts.last:
        cmd += "-l {0.last} ".format(opts)

    sh(cmd, infile=fastqfile, outfile=fq)
def histogram(args):
    """
    %prog histogram *.gff

    Plot gene statistics based on output of stats. For each gff file, look to
    see if the metrics folder (i.e. Exon_Length) contains the data and plot
    them.
    """
    from jcvi.graphics.histogram import histogram_multiple

    p = OptionParser(histogram.__doc__)
    p.add_option("--bins", dest="bins", default=40, type="int",
                 help="number of bins to plot in the histogram [default: %default]")
    opts, args = p.parse_args(args)
    if len(args) < 1:
        sys.exit(not p.print_help())

    gff_files = args
    metrics = ("Exon_Length", "Intron_Length", "Gene_Length", "Exon_Count")
    colors = ("red", "green", "blue", "black")
    vmaxes = (1000, 1000, 4000, 20)
    xlabels = ("bp", "bp", "bp", "number")
    for metric, color, vmax, xlabel in zip(metrics, colors, vmaxes, xlabels):
        logging.debug("Parsing files in `{0}`..".format(metric))
        numberfiles = [op.join(metric, op.basename(x).split(".")[0] + ".txt")
                       for x in gff_files]
        histogram_multiple(numberfiles, 0, vmax, xlabel, metric,
                           bins=opts.bins, facet=True, fill=color,
                           prefix=metric + ".")
def filter(args):
    """
    %prog filter consensus.fasta

    Filter consensus sequence with min cluster size.
    """
    from jcvi.formats.fasta import Fasta, SeqIO

    p = OptionParser(filter.__doc__)
    p.add_option("--minsize", default=10, type="int",
                 help="Minimum cluster size")
    p.set_outfile()
    opts, args = p.parse_args(args)
    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    minsize = opts.minsize
    f = Fasta(fastafile, lazy=True)
    fw = must_open(opts.outfile, "w")
    for desc, rec in f.iterdescriptions_ordered():
        if desc.startswith("singleton"):
            continue
        # consensus_for_cluster_0 with 63 sequences
        name, w, size, seqs = desc.split()
        assert w == "with"
        size = int(size)
        if size < minsize:
            continue
        SeqIO.write(rec, fw, "fasta")
def ids(args):
    """
    %prog ids cdhit.clstr

    Get the representative ids from clstr file.
    """
    p = OptionParser(ids.__doc__)
    p.add_option("--prefix", type="int",
                 help="Find rep id for prefix of len [default: %default]")
    opts, args = p.parse_args(args)
    if len(args) != 1:
        sys.exit(not p.print_help())

    clstrfile, = args
    cf = ClstrFile(clstrfile)
    prefix = opts.prefix
    if prefix:
        reads = list(cf.iter_reps_prefix(prefix=prefix))
    else:
        reads = list(cf.iter_reps())

    nreads = len(reads)
    idsfile = clstrfile.replace(".clstr", ".ids")
    fw = open(idsfile, "w")
    for i, name in reads:
        print("\t".join(str(x) for x in (i, name)), file=fw)

    logging.debug("A total of {0} unique reads written to `{1}`.".
                  format(nreads, idsfile))
    fw.close()

    return idsfile
def contamination(args):
    """
    %prog contamination folder Ecoli.fasta

    Remove contaminated reads. The FASTQ files in the folder will automatically
    be paired and filtered against Ecoli.fasta to remove contaminants using
    BOWTIE2.
    """
    from jcvi.apps.bowtie import align

    p = OptionParser(contamination.__doc__)
    p.add_option("--mapped", default=False, action="store_true",
                 help="Retain contaminated reads instead [default: %default]")
    p.set_cutoff(cutoff=800)
    p.set_mateorientation(mateorientation="+-")
    opts, args = p.parse_args(args)
    if len(args) != 2:
        sys.exit(not p.print_help())

    folder, ecoli = args
    ecoli = get_abs_path(ecoli)
    tag = "--mapped" if opts.mapped else "--unmapped"
    for p, pf in iter_project(folder, 2):
        align_opts = [ecoli] + p + [tag]
        align_opts += ["--cutoff={0}".format(opts.cutoff), "--null"]
        if opts.mateorientation:
            align_opts += ["--mateorientation={0}".format(opts.mateorientation)]
        samfile, logfile = align(align_opts)
def blat(args):
    """
    %prog blat old.fasta new.fasta

    Generate psl file using blat.
    """
    p = OptionParser(blat.__doc__)
    p.add_option("--minscore", default=100, type="int",
                 help="Matches minus mismatches gap penalty [default: %default]")
    p.add_option("--minid", default=98, type="int",
                 help="Minimum sequence identity [default: %default]")
    p.set_cpus()
    opts, args = p.parse_args(args)
    if len(args) != 2:
        sys.exit(not p.print_help())

    oldfasta, newfasta = args
    twobitfiles = []
    for fastafile in args:
        tbfile = faToTwoBit(fastafile)
        twobitfiles.append(tbfile)
    oldtwobit, newtwobit = twobitfiles

    cmd = "pblat -threads={0}".format(opts.cpus) if which("pblat") else "blat"
    cmd += " {0} {1}".format(oldtwobit, newfasta)
    cmd += " -tileSize=12 -minScore={0} -minIdentity={1} ".\
        format(opts.minscore, opts.minid)
    pslfile = "{0}.{1}.psl".format(*(op.basename(x).split('.')[0]
                                     for x in (newfasta, oldfasta)))
    cmd += pslfile
    sh(cmd)
def pasteprepare(args):
    """
    %prog pasteprepare bacs.fasta

    Prepare sequences for paste.
    """
    p = OptionParser(pasteprepare.__doc__)
    p.add_option("--flank", default=5000, type="int",
                 help="Get the seq of size on two ends [default: %default]")
    opts, args = p.parse_args(args)
    if len(args) != 1:
        sys.exit(not p.print_help())

    goodfasta, = args
    flank = opts.flank
    pf = goodfasta.rsplit(".", 1)[0]
    extbed = pf + ".ext.bed"

    sizes = Sizes(goodfasta)
    fw = open(extbed, "w")
    for bac, size in sizes.iter_sizes():
        print("\t".join(str(x) for x in
                        (bac, 0, min(flank, size), bac + "L")), file=fw)
        print("\t".join(str(x) for x in
                        (bac, max(size - flank, 0), size, bac + "R")), file=fw)
    fw.close()

    fastaFromBed(extbed, goodfasta, name=True)
def paste(args):
    """
    %prog paste flanks.bed flanks_vs_assembly.blast backbone.fasta

    Paste in good sequences in the final assembly.
    """
    from jcvi.formats.bed import uniq

    p = OptionParser(paste.__doc__)
    p.add_option("--maxsize", default=300000, type="int",
                 help="Maximum size of patchers to be replaced [default: %default]")
    p.add_option("--prefix", help="Prefix of the new object [default: %default]")
    p.set_rclip(rclip=1)
    opts, args = p.parse_args(args)
    if len(args) != 3:
        sys.exit(not p.print_help())

    pbed, blastfile, bbfasta = args
    maxsize = opts.maxsize  # Max DNA size to replace gap
    order = Bed(pbed).order

    beforebed, afterbed = blast_to_twobeds(blastfile, order, log=True,
                                           rclip=opts.rclip, maxsize=maxsize,
                                           flipbeds=True)
    beforebed = uniq([beforebed])

    afbed = Bed(beforebed)
    bfbed = Bed(afterbed)
    shuffle_twobeds(afbed, bfbed, bbfasta, prefix=opts.prefix)
def batchcn(args):
    """
    %prog batchcn workdir samples.csv

    Run CNV segmentation caller in batch mode. Scans a workdir.
    """
    p = OptionParser(batchcn.__doc__)
    p.add_option("--upload", default="s3://hli-mv-data-science/htang/ccn",
                 help="Upload cn and seg results to s3")
    opts, args = p.parse_args(args)
    if len(args) != 2:
        sys.exit(not p.print_help())

    workdir, samples = args
    upload = opts.upload
    store = upload + "/{}/*.seg".format(workdir)
    computed = [op.basename(x).split(".")[0] for x in glob_s3(store)]
    computed = set(computed)

    # Generate a bunch of cn commands
    fp = open(samples)
    nskipped = ntotal = 0
    cmd = "python -m jcvi.variation.cnv cn --hmm --cleanup {}".format(workdir)
    for row in fp:
        samplekey, path = row.strip().split(",")
        ntotal += 1
        if samplekey in computed:
            nskipped += 1
            continue
        print(" ".join((cmd, samplekey, path)))

    logging.debug("Skipped: {}".format(percentage(nskipped, ntotal)))
def liftover(args):
    """
    %prog liftover lobstr_v3.0.2_hg38_ref.bed hg38.upper.fa

    LiftOver CODIS/Y-STR markers.
    """
    p = OptionParser(liftover.__doc__)
    p.add_option("--checkvalid", default=False, action="store_true",
                 help="Check minscore, period and length")
    opts, args = p.parse_args(args)
    if len(args) != 2:
        sys.exit(not p.print_help())

    refbed, fastafile = args
    genome = pyfasta.Fasta(fastafile)
    edits = []
    fp = open(refbed)
    for i, row in enumerate(fp):
        s = STRLine(row)
        seq = genome[s.seqid][s.start - 1: s.end].upper()
        s.motif = get_motif(seq, len(s.motif))
        s.fix_counts(seq)
        if opts.checkvalid and not s.is_valid():
            continue
        edits.append(s)
        if i % 10000 == 0:
            print(i, "lines read", file=sys.stderr)

    edits = natsorted(edits, key=lambda x: (x.seqid, x.start))
    for e in edits:
        print(str(e))
def getgenes(args):
    """
    %prog getgenes [--options]

    Read GenBank file, or retrieve from web. Output bed, cds files, and pep
    file (can turn off with --nopep). Either --gb_dir or --id/--simple should
    be provided.
    """
    p = OptionParser(getgenes.__doc__)
    p.add_option("--prefix", default="gbout",
                 help="prefix of output files [default: %default]")
    p.add_option("--nopep", default=False, action="store_true",
                 help="Only get cds and bed, no pep [default: %default]")
    filenames, accessions, idfile, opts, args = preparegb(p, args)
    prefix = opts.prefix

    GenBank(filenames=filenames, accessions=accessions, idfile=idfile).\
        write_genes(output=prefix, individual=opts.individual,
                    pep=(not opts.nopep))

    if opts.individual:
        logging.debug("Output written to dir {0}".format(prefix))
    elif opts.nopep:
        logging.debug("Output written to {0}.bed, {0}.cds".format(prefix))
    else:
        logging.debug("Output written to {0}.bed, {0}.cds, {0}.pep".format(prefix))
def cib(args):
    """
    %prog cib bamfile samplekey

    Convert BAM to CIB (a binary storage of int8 per base).
    """
    p = OptionParser(cib.__doc__)
    p.add_option("--prefix", help="Report seqids with this prefix only")
    p.set_cpus()
    opts, args = p.parse_args(args)
    if len(args) != 2:
        sys.exit(not p.print_help())

    bamfile, samplekey = args
    mkdir(samplekey)
    bam = pysam.AlignmentFile(bamfile, "rb")
    refs = [x for x in bam.header["SQ"]]
    prefix = opts.prefix
    if prefix:
        refs = [x for x in refs if x["SN"].startswith(prefix)]

    task_args = []
    for r in refs:
        task_args.append((bamfile, r, samplekey))
    cpus = min(opts.cpus, len(task_args))
    logging.debug("Use {} cpus".format(cpus))

    p = Pool(processes=cpus)
    for res in p.imap(bam_to_cib, task_args):
        continue
def count(args):
    """
    %prog count bamfile gtf

    Count the number of reads mapped using `htseq-count`.
    """
    p = OptionParser(count.__doc__)
    p.add_option("--type", default="exon", help="Only count feature type")
    p.set_cpus(cpus=8)
    opts, args = p.parse_args(args)
    if len(args) != 2:
        sys.exit(not p.print_help())

    bamfile, gtf = args
    cpus = opts.cpus
    pf = bamfile.split(".")[0]
    countfile = pf + ".count"
    if not need_update(bamfile, countfile):
        return

    nsorted = pf + "_nsorted"
    nsortedbam, nsortedsam = nsorted + ".bam", nsorted + ".sam"
    if need_update(bamfile, nsortedsam):
        cmd = "samtools sort -@ {0} -n {1} {2}".format(cpus, bamfile, nsorted)
        sh(cmd)
        cmd = "samtools view -@ {0} -h {1}".format(cpus, nsortedbam)
        sh(cmd, outfile=nsortedsam)

    if need_update(nsortedsam, countfile):
        cmd = "htseq-count --stranded=no --minaqual=10"
        cmd += " -t {0}".format(opts.type)
        cmd += " {0} {1}".format(nsortedsam, gtf)
        sh(cmd, outfile=countfile)
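# The shell commands assembled above, written out for a hypothetical input
# (sample.bam, genes.gtf); the flags come straight from the strings built in
# count(), only the file names are invented:
#
#   samtools sort -@ 8 -n sample.bam sample_nsorted
#   samtools view -@ 8 -h sample_nsorted.bam > sample_nsorted.sam
#   htseq-count --stranded=no --minaqual=10 -t exon sample_nsorted.sam genes.gtf > sample.count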
def consensus(args):
    """
    %prog consensus fastafile bamfile

    Convert bam alignments to consensus FASTQ/FASTA.
    """
    p = OptionParser(consensus.__doc__)
    p.add_option("--fasta", default=False, action="store_true",
                 help="Generate consensus FASTA sequences [default: %default]")
    p.add_option("--mask", default=0, type="int",
                 help="Mask bases with quality lower than")
    opts, args = p.parse_args(args)
    if len(args) < 2:
        sys.exit(not p.print_help())

    fastafile, bamfile = args
    fasta = opts.fasta
    suffix = "fasta" if fasta else "fastq"
    pf = bamfile.rsplit(".", 1)[0]
    cnsfile = pf + ".cns.{0}".format(suffix)
    vcfgzfile = pf + ".vcf.gz"
    vcf([fastafile, bamfile, "-o", vcfgzfile])
    cmd = "zcat {0} | vcfutils.pl vcf2fq".format(vcfgzfile)
    if fasta:
        cmd += " | seqtk seq -q {0} -A -".format(opts.mask)

    sh(cmd, outfile=cnsfile)
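# For a hypothetical aln.bam with --fasta --mask 20, the pipeline built above
# expands to (file names invented for illustration):
#
#   zcat aln.vcf.gz | vcfutils.pl vcf2fq | seqtk seq -q 20 -A - > aln.cns.fasta
#
# Without --fasta, the seqtk step is skipped and the FASTQ stream is written
# to aln.cns.fastq instead.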
def bed(args):
    """
    %prog bed xmlfile

    Print summary of optical map alignment in BED format.
    """
    from jcvi.formats.bed import sort

    p = OptionParser(bed.__doc__)
    p.add_option("--blockonly", default=False, action="store_true",
                 help="Only print out large blocks, not fragments [default: %default]")
    p.add_option("--point", default=False, action="store_true",
                 help="Print accession as single point instead of interval")
    p.add_option("--scale", type="float",
                 help="Scale the OM distance by factor")
    p.add_option("--switch", default=False, action="store_true",
                 help="Switch reference and aligned map elements [default: %default]")
    p.add_option("--nosort", default=False, action="store_true",
                 help="Do not sort bed [default: %default]")
    opts, args = p.parse_args(args)
    if len(args) != 1:
        sys.exit(not p.print_help())

    xmlfile, = args
    bedfile = xmlfile.rsplit(".", 1)[0] + ".bed"

    om = OpticalMap(xmlfile)
    om.write_bed(bedfile, point=opts.point, scale=opts.scale,
                 blockonly=opts.blockonly, switch=opts.switch)

    if not opts.nosort:
        sort([bedfile, "--inplace"])
def main(args):
    """
    %prog deltafile refidsfile query.fasta ref.fasta

    Plot one query. Extract the references that have major matches to this
    query. Control "major" by option --refcov.
    """
    p = OptionParser(main.__doc__)
    p.add_option("--refcov", default=.01, type="float",
                 help="Minimum reference coverage [default: %default]")
    p.add_option("--all", default=False, action="store_true",
                 help="Plot one pdf file per ref in refidsfile [default: %default]")
    p.set_align(pctid=96, hitlen=500)
    opts, args = p.parse_args(args)
    if len(args) != 4:
        sys.exit(not p.print_help())

    deltafile, refidsfile, queryfasta, reffasta = args
    qsizes = Sizes(queryfasta).mapping
    rsizes = Sizes(reffasta).mapping
    refs = SetFile(refidsfile)
    refcov = opts.refcov
    pctid = opts.pctid
    hitlen = opts.hitlen
    deltafile = filter([deltafile, "--pctid={0}".format(pctid),
                        "--hitlen={0}".format(hitlen)])

    if opts.all:
        for r in refs:
            pdffile = plot_some_queries([r], qsizes, rsizes, deltafile, refcov)
            if pdffile:
                sh("mv {0} {1}.pdf".format(pdffile, r))
    else:
        plot_some_queries(refs, qsizes, rsizes, deltafile, refcov)
def chain(args):
    """
    %prog chain blastfile

    Chain adjacent HSPs together to form larger HSPs. The adjacent HSPs have to
    share the same orientation.
    """
    p = OptionParser(chain.__doc__)
    p.add_option("--dist", dest="dist", default=100, type="int",
                 help="extent of flanking regions to search [default: %default]")
    opts, args = p.parse_args(args)
    if len(args) != 1:
        sys.exit(not p.print_help())

    blastfile, = args
    dist = opts.dist
    assert dist > 0

    blast = BlastSlow(blastfile)
    chained_hsps = chain_HSPs(blast, xdist=dist, ydist=dist)
    for b in chained_hsps:
        print(b)
def top10(args):
    """
    %prog top10 blastfile.best

    Count the most frequent 10 hits. Usually the BLASTFILE needs to be screened
    to get the best match. You can also provide an .ids file to query the ids.
    For example the ids file can contain the seqid to species mapping.

    The ids file is two-column, and can sometimes be generated by
    `jcvi.formats.fasta ids --description`.
    """
    from jcvi.formats.base import DictFile

    p = OptionParser(top10.__doc__)
    p.add_option("--top", default=10, type="int",
                 help="Top N taxa to extract [default: %default]")
    p.add_option("--ids", default=None,
                 help="Two column ids file to query seqid [default: %default]")
    opts, args = p.parse_args(args)
    if len(args) != 1:
        sys.exit(not p.print_help())

    blastfile, = args
    mapping = DictFile(opts.ids, delimiter="\t") if opts.ids else {}

    cmd = "cut -f2 {0}".format(blastfile)
    cmd += " | sort | uniq -c | sort -k1,1nr | head -n {0}".format(opts.top)
    fp = popen(cmd)
    for row in fp:
        count, seqid = row.split()
        nseqid = mapping.get(seqid, seqid)
        print("\t".join((count, nseqid)))
def annotation(args):
    """
    %prog annotation blastfile > annotations

    Create simple two column files from the first two columns in blastfile. Use
    --queryids and --subjectids to switch IDs or descriptions.
    """
    from jcvi.formats.base import DictFile

    p = OptionParser(annotation.__doc__)
    p.add_option("--queryids", help="Query IDS file to switch [default: %default]")
    p.add_option("--subjectids", help="Subject IDS file to switch [default: %default]")
    opts, args = p.parse_args(args)
    if len(args) != 1:
        sys.exit(not p.print_help())

    blastfile, = args

    d = "\t"
    qids = DictFile(opts.queryids, delimiter=d) if opts.queryids else None
    sids = DictFile(opts.subjectids, delimiter=d) if opts.subjectids else None
    blast = Blast(blastfile)
    for b in blast:
        query, subject = b.query, b.subject
        if qids:
            query = qids[query]
        if sids:
            subject = sids[subject]
        print("\t".join((query, subject)))
def annotate(args):
    """
    %prog annotate blastfile query.fasta subject.fasta

    Annotate overlap types (dovetail, contained, etc) in BLAST tabular file.
    """
    from jcvi.assembly.goldenpath import Cutoff, Overlap, Overlap_types

    p = OptionParser(annotate.__doc__)
    p.set_align(pctid=94, hitlen=500)
    p.add_option("--hang", default=500, type="int",
                 help="Maximum overhang length")
    opts, args = p.parse_args(args)
    if len(args) != 3:
        sys.exit(not p.print_help())

    blastfile, afasta, bfasta = args
    fp = open(blastfile)
    asizes = Sizes(afasta).mapping
    bsizes = Sizes(bfasta).mapping
    cutoff = Cutoff(opts.pctid, opts.hitlen, opts.hang)
    logging.debug(str(cutoff))
    for row in fp:
        b = BlastLine(row)
        asize = asizes[b.query]
        bsize = bsizes[b.subject]
        if b.query == b.subject:
            continue
        ov = Overlap(b, asize, bsize, cutoff)
        if ov.otype:
            ov.print_graphic()
            print("{0}\t{1}".format(b, Overlap_types[ov.otype]))
def phytozome(args):
    """
    %prog phytozome species

    Retrieve genomes and annotations from phytozome FTP. Available species
    listed below. Use comma to give a list of species to download. For example:

    $ %prog phytozome Athaliana,Vvinifera,Osativa,Sbicolor,Slycopersicum
    """
    p = OptionParser(phytozome.__doc__)
    p.add_option("--version", default="9.0",
                 help="Phytozome version [default: %default]")
    p.add_option("--assembly", default=False, action="store_true",
                 help="Download assembly [default: %default]")
    opts, args = p.parse_args(args)

    url = "ftp://ftp.jgi-psf.org/pub/compgen/phytozome/v{0}/".format(opts.version)
    valid_species = [x for x in ls_ftp(url) if "." not in x]

    doc = "\n".join((phytozome.__doc__, tile(valid_species)))
    p.set_usage(doc)

    if len(args) != 1:
        sys.exit(not p.print_help())

    species, = args
    species = species.split(",")

    for s in species:
        download_species_phytozome(s, valid_species, url, assembly=opts.assembly)
def bed(args):
    """
    %prog bed blastfile

    Print out bed file based on coordinates in BLAST report. By default, write
    out subject positions. Use --swap to write query positions.
    """
    p = OptionParser(bed.__doc__)
    p.add_option("--swap", default=False, action="store_true",
                 help="Write query positions [default: %default]")

    opts, args = p.parse_args(args)
    if len(args) != 1:
        sys.exit(p.print_help())

    blastfile, = args
    swap = opts.swap

    fp = must_open(blastfile)
    bedfile = blastfile.rsplit(".", 1)[0] + ".bed"
    fw = open(bedfile, "w")
    for row in fp:
        b = BlastLine(row)
        if swap:
            b = b.swapped
        print(b.bedline, file=fw)

    logging.debug("File written to `{0}`.".format(bedfile))

    return bedfile
def filterm4(args):
    """
    %prog filterm4 sample.m4 > filtered.m4

    Filter .m4 file after blasr is run. As blasr takes a long time to run,
    changing -bestn is undesirable. This screens the m4 file to retain top hits.
    """
    p = OptionParser(filterm4.__doc__)
    p.add_option("--best", default=1, type="int", help="Only retain best N hits")
    p.set_outfile()
    opts, args = p.parse_args(args)
    if len(args) != 1:
        sys.exit(not p.print_help())

    m4file, = args
    best = opts.best
    fp = open(m4file)
    fw = must_open(opts.outfile, "w")
    seen = defaultdict(int)
    retained = total = 0
    for row in fp:
        r = M4Line(row)
        total += 1
        if total % 100000 == 0:
            logging.debug("Retained {0} lines".
                          format(percentage(retained, total)))
        if seen.get(r.query, 0) < best:
            fw.write(row)
            seen[r.query] += 1
            retained += 1
    fw.close()
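# The filter above is a streaming "keep the first N rows per key" pass, relying
# on the .m4 file listing each query's hits in ranked order. A minimal sketch
# of the same idea on plain tuples (invented data, not blasr output):
def _keep_first_n_per_key(rows, n=1):
    """Yield at most `n` rows per key, preserving input order."""
    from collections import defaultdict

    seen = defaultdict(int)
    for key, payload in rows:
        if seen[key] < n:
            seen[key] += 1
            yield key, payload

# list(_keep_first_n_per_key([("q1", "hitA"), ("q1", "hitB"), ("q2", "hitC")], n=1))
# -> [("q1", "hitA"), ("q2", "hitC")]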
def ensembl(args):
    """
    %prog ensembl species

    Retrieve genomes and annotations from ensembl FTP. Available species listed
    below. Use comma to give a list of species to download. For example:

    $ %prog ensembl danio_rerio,gasterosteus_aculeatus
    """
    p = OptionParser(ensembl.__doc__)
    p.add_option("--version", default="75",
                 help="Ensembl version [default: %default]")
    opts, args = p.parse_args(args)

    version = opts.version
    url = "ftp://ftp.ensembl.org/pub/release-{0}/".format(version)
    fasta_url = url + "fasta/"

    valid_species = [x for x in ls_ftp(fasta_url) if "." not in x]
    doc = "\n".join((ensembl.__doc__, tile(valid_species)))
    p.set_usage(doc)

    if len(args) != 1:
        sys.exit(not p.print_help())

    species, = args
    species = species.split(",")

    for s in species:
        download_species_ensembl(s, valid_species, url)
def link(args):
    """
    %prog link metafile

    Link source to target based on a tabular file.
    """
    from jcvi.apps.base import mkdir

    p = OptionParser(link.__doc__)
    p.add_option("--dir",
                 help="Place links in a subdirectory [default: %default]")
    opts, args = p.parse_args(args)
    if len(args) != 1:
        sys.exit(not p.print_help())

    meta, = args
    d = opts.dir
    if d:
        mkdir(d)

    fp = open(meta)
    for row in fp:
        source, target = row.split()
        source = get_abs_path(source)
        if d:
            target = op.join(d, target)
        lnsf(source, target, log=True)
def rebuild(args):
    """
    %prog rebuild blocksfile blastfile

    Rebuild anchors file from pre-built blocks file.
    """
    p = OptionParser(rebuild.__doc__)
    p.add_option("--header", default=False, action="store_true",
                 help="First line is header [default: %default]")
    p.add_option("--write_blast", default=False, action="store_true",
                 help="Get blast records of rebuilt anchors [default: %default]")
    p.set_beds()
    opts, args = p.parse_args(args)
    if len(args) != 2:
        sys.exit(not p.print_help())

    blocksfile, blastfile = args
    bk = BlockFile(blocksfile, header=opts.header)
    fw = open("pairs", "w")
    for a, b, h in bk.iter_all_pairs():
        print("\t".join((a, b)), file=fw)
    fw.close()

    if opts.write_blast:
        AnchorFile("pairs").blast(blastfile, "pairs.blast")

    fw = open("tracks", "w")
    for g, col in bk.iter_gene_col():
        print("\t".join(str(x) for x in (g, col)), file=fw)
    fw.close()
def summary(args):
    """
    %prog summary anchorfile

    Provide statistics for pairwise blocks.
    """
    from jcvi.utils.cbook import SummaryStats

    p = OptionParser(summary.__doc__)
    p.add_option("--prefix", help="Generate per block stats [default: %default]")
    opts, args = p.parse_args(args)
    if len(args) != 1:
        sys.exit(not p.print_help())

    anchorfile, = args
    ac = AnchorFile(anchorfile)
    clusters = ac.blocks
    nclusters = len(clusters)
    nanchors = [len(c) for c in clusters]
    nranchors = [_score(c) for c in clusters]  # non-redundant anchors
    print("A total of {0} (NR:{1}) anchors found in {2} clusters.".
          format(sum(nanchors), sum(nranchors), nclusters), file=sys.stderr)
    print("Stats:", SummaryStats(nanchors), file=sys.stderr)
    print("NR stats:", SummaryStats(nranchors), file=sys.stderr)

    prefix = opts.prefix
    if prefix:
        pad = len(str(nclusters))
        for i, c in enumerate(clusters):
            block_id = "{0}{1:0{2}d}".format(prefix, i + 1, pad)
            print("\t".join((block_id, str(len(c)))))
def ld(args):
    """
    %prog ld map

    Calculate pairwise linkage disequilibrium given MSTmap.
    """
    import numpy as np
    from random import sample

    from jcvi.algorithms.matrix import symmetrize

    p = OptionParser(ld.__doc__)
    p.add_option("--subsample", default=1000, type="int",
                 help="Subsample markers to speed up")
    opts, args, iopts = p.set_image_options(args, figsize="8x8")

    if len(args) != 1:
        sys.exit(not p.print_help())

    (mstmap, ) = args
    subsample = opts.subsample

    data = MSTMap(mstmap)

    markerbedfile = mstmap + ".subsample.bed"
    ldmatrix = mstmap + ".subsample.matrix"
    # Take random subsample while keeping marker order
    if subsample < data.nmarkers:
        data = [data[x] for x in sorted(sample(range(len(data)), subsample))]
    else:
        logging.debug("Use all markers, --subsample ignored")

    nmarkers = len(data)
    if need_update(mstmap, (ldmatrix, markerbedfile)):
        fw = open(markerbedfile, "w")
        print("\n".join(x.bedline for x in data), file=fw)
        logging.debug("Write marker set of size {0} to file `{1}`.".format(
            nmarkers, markerbedfile))
        fw.close()

        M = np.zeros((nmarkers, nmarkers), dtype=float)
        for i, j in combinations(range(nmarkers), 2):
            a = data[i]
            b = data[j]
            M[i, j] = calc_ldscore(a.genotype, b.genotype)

        M = symmetrize(M)

        logging.debug("Write LD matrix to file `{0}`.".format(ldmatrix))
        M.tofile(ldmatrix)
    else:
        nmarkers = len(Bed(markerbedfile))
        M = np.fromfile(ldmatrix, dtype="float").reshape(nmarkers, nmarkers)
        logging.debug("LD matrix `{0}` exists ({1}x{1}).".format(ldmatrix, nmarkers))

    from jcvi.graphics.base import plt, savefig, Rectangle, draw_cmap

    plt.rcParams["axes.linewidth"] = 0

    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])
    ax = fig.add_axes([0.1, 0.1, 0.8, 0.8])  # the heatmap

    ax.matshow(M, cmap=iopts.cmap)

    # Plot chromosomes breaks
    bed = Bed(markerbedfile)
    xsize = len(bed)
    extent = (0, nmarkers)
    chr_labels = []
    ignore_size = 20

    for (seqid, beg, end) in bed.get_breaks():
        ignore = abs(end - beg) < ignore_size
        pos = (beg + end) / 2
        chr_labels.append((seqid, pos, ignore))
        if ignore:
            continue
        ax.plot((end, end), extent, "w-", lw=1)
        ax.plot(extent, (end, end), "w-", lw=1)

    # Plot chromosome labels
    for label, pos, ignore in chr_labels:
        pos = 0.1 + pos * 0.8 / xsize
        if not ignore:
            root.text(pos, 0.91, label,
                      ha="center", va="bottom", rotation=45, color="grey")
            root.text(0.09, pos, label, ha="right", va="center", color="grey")

    ax.set_xlim(extent)
    ax.set_ylim(extent)
    ax.set_axis_off()

    draw_cmap(root, "Pairwise LD (r2)", 0, 1, cmap=iopts.cmap)

    root.add_patch(Rectangle((0.1, 0.1), 0.8, 0.8, fill=False, ec="k", lw=2))
    m = mstmap.split(".")[0]
    root.text(0.5, 0.06,
              "Linkage Disequilibrium between {0} markers".format(m), ha="center")

    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()

    image_name = m + ".subsample" + "." + iopts.format
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
def filter(args):
    """
    %prog filter test.blast

    Produce a new blast file and filter based on:
    - score: >= cutoff
    - pctid: >= cutoff
    - hitlen: >= cutoff
    - evalue: <= cutoff
    - ids: valid ids

    Use --inverse to obtain the complementary records for the criteria above.

    - noself: remove self-self hits
    """
    p = OptionParser(filter.__doc__)
    p.add_option("--score", dest="score", default=0, type="int",
                 help="Score cutoff")
    p.set_align(pctid=95, hitlen=100, evalue=.01)
    p.add_option("--noself", default=False, action="store_true",
                 help="Remove self-self hits")
    p.add_option("--ids", help="Path to file with ids to retain")
    p.add_option("--inverse", default=False, action="store_true",
                 help="Similar to grep -v, inverse")
    p.set_outfile(outfile=None)

    opts, args = p.parse_args(args)
    if len(args) != 1:
        sys.exit(not p.print_help())

    if opts.ids:
        ids = set()
        for row in must_open(opts.ids):
            if row[0] == "#":
                continue
            row = row.replace(",", "\t")
            ids.update(row.split())
    else:
        ids = None

    blastfile, = args
    inverse = opts.inverse
    outfile = opts.outfile
    fp = must_open(blastfile)

    score, pctid, hitlen, evalue, noself = \
        opts.score, opts.pctid, opts.hitlen, opts.evalue, opts.noself
    newblastfile = blastfile + ".P{0}L{1}".format(int(pctid), hitlen) if \
        outfile is None else outfile
    if inverse:
        newblastfile += ".inverse"
    fw = must_open(newblastfile, "w")
    for row in fp:
        if row[0] == '#':
            continue
        c = BlastLine(row)

        if ids:
            if c.query in ids and c.subject in ids:
                noids = False
            else:
                noids = True
        else:
            noids = None

        remove = c.score < score or \
            c.pctid < pctid or \
            c.hitlen < hitlen or \
            c.evalue > evalue or \
            noids

        if inverse:
            remove = not remove

        remove = remove or (noself and c.query == c.subject)

        if not remove:
            print(row.rstrip(), file=fw)

    fw.close()

    return newblastfile
def covfilter(args):
    """
    %prog covfilter blastfile fastafile

    Fastafile is used to get the sizes of the queries. Two filters can be
    applied, the id% and cov%.
    """
    from jcvi.algorithms.supermap import supermap
    from jcvi.utils.range import range_union

    allowed_iterby = ("query", "query_sbjct")

    p = OptionParser(covfilter.__doc__)
    p.set_align(pctid=95, pctcov=50)
    p.add_option("--scov", default=False, action="store_true",
                 help="Subject coverage instead of query [default: %default]")
    p.add_option("--supermap", action="store_true",
                 help="Use supermap instead of union")
    p.add_option("--ids", dest="ids", default=None,
                 help="Print out the ids that satisfy [default: %default]")
    p.add_option("--list", dest="list", default=False, action="store_true",
                 help="List the id% and cov% per gene [default: %default]")
    p.add_option("--iterby", dest="iterby", default="query", choices=allowed_iterby,
                 help="Choose how to iterate through BLAST [default: %default]")
    p.set_outfile(outfile=None)

    opts, args = p.parse_args(args)
    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, fastafile = args
    pctid = opts.pctid
    pctcov = opts.pctcov
    union = not opts.supermap
    scov = opts.scov
    sz = Sizes(fastafile)
    sizes = sz.mapping
    iterby = opts.iterby
    qspair = iterby == "query_sbjct"

    if not union:
        querysupermap = blastfile + ".query.supermap"
        if not op.exists(querysupermap):
            supermap(blastfile, filter="query")
        blastfile = querysupermap

    assert op.exists(blastfile)

    covered = 0
    mismatches = 0
    gaps = 0
    alignlen = 0
    queries = set()
    valid = set()
    blast = BlastSlow(blastfile)
    iterator = blast.iter_hits_pair if qspair else blast.iter_hits

    covidstore = {}
    for query, blines in iterator():
        blines = list(blines)
        queries.add(query)

        # per gene report
        this_covered = 0
        this_alignlen = 0
        this_mismatches = 0
        this_gaps = 0
        this_identity = 0

        ranges = []
        for b in blines:
            if scov:
                s, start, stop = b.subject, b.sstart, b.sstop
            else:
                s, start, stop = b.query, b.qstart, b.qstop
            cov_id = s

            if b.pctid < pctid:
                continue

            if start > stop:
                start, stop = stop, start
            this_covered += stop - start + 1
            this_alignlen += b.hitlen
            this_mismatches += b.nmismatch
            this_gaps += b.ngaps
            ranges.append(("1", start, stop))

        if ranges:
            this_identity = 100. - (this_mismatches + this_gaps) * 100. / this_alignlen

        if union:
            this_covered = range_union(ranges)

        this_coverage = this_covered * 100. / sizes[cov_id]
        covidstore[query] = (this_identity, this_coverage)
        if this_identity >= pctid and this_coverage >= pctcov:
            valid.add(query)

        covered += this_covered
        mismatches += this_mismatches
        gaps += this_gaps
        alignlen += this_alignlen

    if opts.list:
        if qspair:
            allpairs = defaultdict(list)
            for (q, s) in covidstore:
                allpairs[q].append((q, s))
                allpairs[s].append((q, s))

            for id, size in sz.iter_sizes():
                if id not in allpairs:
                    print("\t".join((id, "na", "0", "0")))
                else:
                    for qs in allpairs[id]:
                        this_identity, this_coverage = covidstore[qs]
                        print("{0}\t{1:.1f}\t{2:.1f}".format(
                            "\t".join(qs), this_identity, this_coverage))
        else:
            for query, size in sz.iter_sizes():
                this_identity, this_coverage = covidstore.get(query, (0, 0))
                print("{0}\t{1:.1f}\t{2:.1f}".format(query, this_identity, this_coverage))

    mapped_count = len(queries)
    valid_count = len(valid)
    cutoff_message = "(id={0.pctid}% cov={0.pctcov}%)".format(opts)

    m = "Identity: {0} mismatches, {1} gaps, {2} alignlen\n".\
        format(mismatches, gaps, alignlen)
    total = len(sizes.keys())
    m += "Total mapped: {0} ({1:.1f}% of {2})\n".\
        format(mapped_count, mapped_count * 100. / total, total)
    m += "Total valid {0}: {1} ({2:.1f}% of {3})\n".\
        format(cutoff_message, valid_count, valid_count * 100. / total, total)
    m += "Average id = {0:.2f}%\n".\
        format(100 - (mismatches + gaps) * 100. / alignlen)

    queries_combined = sz.totalsize
    m += "Coverage: {0} covered, {1} total\n".\
        format(covered, queries_combined)
    m += "Average coverage = {0:.2f}%".\
        format(covered * 100. / queries_combined)

    logfile = blastfile + ".covfilter.log"
    fw = open(logfile, "w")
    for f in (sys.stderr, fw):
        print(m, file=f)
    fw.close()

    if opts.ids:
        filename = opts.ids
        fw = must_open(filename, "w")
        for id in valid:
            print(id, file=fw)
        logging.debug("Queries beyond cutoffs {0} written to `{1}`.".
                      format(cutoff_message, filename))

    outfile = opts.outfile
    if not outfile:
        return

    fw = must_open(outfile, "w")
    blast = Blast(blastfile)
    for b in blast:
        query = (b.query, b.subject) if qspair else b.query
        if query in valid:
            print(b, file=fw)
def cscore(args):
    """
    %prog cscore blastfile > cscoreOut

    See supplementary info for sea anemone genome paper, C-score formula:

        cscore(A, B) = score(A, B) / max(best score for A, best score for B)

    A C-score of one is the same as reciprocal best hit (RBH).

    Output file will be 3-column (query, subject, cscore). Use --cutoff to
    select a different cutoff.
    """
    from jcvi.utils.cbook import gene_name

    p = OptionParser(cscore.__doc__)
    p.add_option("--cutoff", default=.9999, type="float",
                 help="Minimum C-score to report [default: %default]")
    p.add_option("--pct", default=False, action="store_true",
                 help="Also include pct as last column [default: %default]")
    p.add_option("--writeblast", default=False, action="store_true",
                 help="Also write filtered blast file [default: %default]")
    p.set_stripnames()
    p.set_outfile()

    opts, args = p.parse_args(args)
    ostrip = opts.strip_names
    writeblast = opts.writeblast
    outfile = opts.outfile

    if len(args) != 1:
        sys.exit(not p.print_help())

    blastfile, = args

    blast = Blast(blastfile)
    logging.debug("Register best scores ..")
    best_score = defaultdict(float)
    for b in blast:
        query, subject = b.query, b.subject
        if ostrip:
            query, subject = gene_name(query), gene_name(subject)

        score = b.score
        if score > best_score[query]:
            best_score[query] = score
        if score > best_score[subject]:
            best_score[subject] = score

    blast = Blast(blastfile)
    pairs = {}
    cutoff = opts.cutoff
    for b in blast:
        query, subject = b.query, b.subject
        if ostrip:
            query, subject = gene_name(query), gene_name(subject)

        score = b.score
        pctid = b.pctid
        s = score / max(best_score[query], best_score[subject])
        if s > cutoff:
            pair = (query, subject)
            if pair not in pairs or s > pairs[pair][0]:
                pairs[pair] = (s, pctid, b)

    fw = must_open(outfile, "w")
    if writeblast:
        fwb = must_open(outfile + ".filtered.blast", "w")
    pct = opts.pct
    for (query, subject), (s, pctid, b) in sorted(pairs.items()):
        args = [query, subject, "{0:.2f}".format(s)]
        if pct:
            args.append("{0:.1f}".format(pctid))
        print("\t".join(args), file=fw)
        if writeblast:
            print(b, file=fwb)
    fw.close()
    if writeblast:
        fwb.close()
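# A worked sketch of the C-score formula from the docstring above, using
# invented bit scores (not taken from any real BLAST file):
def _cscore_example():
    """cscore(A, B) = score(A, B) / max(best score for A, best score for B)."""
    score_AB = 180.0
    best_for_A = 200.0  # A's best hit anywhere in the BLAST file
    best_for_B = 180.0  # B's best hit anywhere in the BLAST file
    cscore = score_AB / max(best_for_A, best_for_B)
    # 180 / 200 = 0.9; a value of exactly 1.0 would mean A and B are
    # reciprocal best hits (RBH), the default --cutoff of .9999 keeps only those.
    return cscore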
def completeness(args):
    """
    %prog completeness blastfile ref.fasta > outfile

    Print statistics for each gene, the coverage of the alignment onto the best
    hit, as an indicator for completeness of the gene model. For example, one
    might BLAST sugarcane ESTs against sorghum annotations as reference, to
    find full-length transcripts.
    """
    from jcvi.utils.range import range_minmax
    from jcvi.utils.cbook import SummaryStats

    p = OptionParser(completeness.__doc__)
    p.add_option("--ids",
                 help="Save ids that are over 50% complete [default: %default]")
    opts, args = p.parse_args(args)
    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, fastafile = args
    idsfile = opts.ids
    f = Sizes(fastafile).mapping

    b = BlastSlow(blastfile)
    valid = []
    data = []
    cutoff = 50
    for query, blines in groupby(b, key=lambda x: x.query):
        blines = list(blines)
        ranges = [(x.sstart, x.sstop) for x in blines]
        b = blines[0]
        query, subject = b.query, b.subject

        rmin, rmax = range_minmax(ranges)
        subject_len = f[subject]

        nterminal_dist = rmin - 1
        cterminal_dist = subject_len - rmax
        covered = (rmax - rmin + 1) * 100 / subject_len
        if covered > cutoff:
            valid.append(query)

        data.append((nterminal_dist, cterminal_dist, covered))
        print("\t".join(str(x) for x in
                        (query, subject, nterminal_dist, cterminal_dist, covered)))

    nd, cd, cv = zip(*data)
    m = "Total: {0}, Coverage > {1}%: {2}\n".format(len(data), cutoff, len(valid))
    m += "N-terminal: {0}\n".format(SummaryStats(nd))
    m += "C-terminal: {0}\n".format(SummaryStats(cd))
    m += "Coverage: {0}".format(SummaryStats(cv))
    print(m, file=sys.stderr)

    if idsfile:
        fw = open(idsfile, "w")
        print("\n".join(valid), file=fw)
        logging.debug("A total of {0} ids (cov > {1} %) written to `{2}`.".
                      format(len(valid), cutoff, idsfile))
        fw.close()
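# A small numeric sketch of the per-gene coverage computed above (coordinates
# are made up): all HSPs of one query against its best subject are pooled, and
# coverage is measured on the subject.
def _completeness_example():
    hsp_ranges = [(11, 400), (380, 900)]           # subject start/stop of each HSP
    subject_len = 1000
    rmin = min(start for start, _ in hsp_ranges)   # 11
    rmax = max(stop for _, stop in hsp_ranges)     # 900
    nterminal_dist = rmin - 1                      # 10 bp missing at the N-terminus
    cterminal_dist = subject_len - rmax            # 100 bp missing at the C-terminus
    covered = (rmax - rmin + 1) * 100 / subject_len  # 89.0 -> above the 50% cutoff
    return nterminal_dist, cterminal_dist, covered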
def astat(args):
    """
    %prog astat coverage.log

    Create coverage-rho scatter plot.
    """
    p = OptionParser(astat.__doc__)
    p.add_option("--cutoff", default=1000, type="int",
                 help="Length cutoff [default: %default]")
    p.add_option("--genome", default="",
                 help="Genome name [default: %default]")
    p.add_option("--arrDist", default=False, action="store_true",
                 help="Use arrDist instead [default: %default]")
    opts, args = p.parse_args(args)
    if len(args) != 1:
        sys.exit(not p.print_help())

    covfile, = args
    cutoff = opts.cutoff
    genome = opts.genome
    plot_arrDist = opts.arrDist

    suffix = ".{0}".format(cutoff)
    small_covfile = covfile + suffix
    update_covfile = need_update(covfile, small_covfile)
    if update_covfile:
        fw = open(small_covfile, "w")
    else:
        logging.debug("Found `{0}`, will use this one".format(small_covfile))
        covfile = small_covfile

    fp = open(covfile)
    header = next(fp)
    if update_covfile:
        fw.write(header)

    data = []
    msg = "{0} tigs scanned ..."
    for row in fp:
        tigID, rho, covStat, arrDist = row.split()
        tigID = int(tigID)
        if tigID % 1000000 == 0:
            sys.stderr.write(msg.format(tigID) + "\r")

        rho, covStat, arrDist = [float(x) for x in (rho, covStat, arrDist)]
        if rho < cutoff:
            continue

        if update_covfile:
            fw.write(row)
        data.append((tigID, rho, covStat, arrDist))

    print(msg.format(tigID), file=sys.stderr)

    from jcvi.graphics.base import plt, savefig

    logging.debug("Plotting {0} data points.".format(len(data)))
    tigID, rho, covStat, arrDist = zip(*data)

    y = arrDist if plot_arrDist else covStat
    ytag = "arrDist" if plot_arrDist else "covStat"

    fig = plt.figure(1, (7, 7))
    ax = fig.add_axes([.12, .1, .8, .8])
    ax.plot(rho, y, ".", color="lightslategrey")

    xtag = "rho"
    info = (genome, xtag, ytag)
    title = "{0} {1} vs. {2}".format(*info)
    ax.set_title(title)
    ax.set_xlabel(xtag)
    ax.set_ylabel(ytag)

    if plot_arrDist:
        ax.set_yscale('log')

    imagename = "{0}.png".format(".".join(info))
    savefig(imagename, dpi=150)
def cn(args):
    """
    %prog cn workdir 102340_NA12878 \
        s3://hli-bix-us-west-2/kubernetes/wf-root-test/102340_NA12878/lpierce-ccn_gcn-v2/

    Download CCN output folder and convert cib to copy number per 1Kb.
    """
    p = OptionParser(cn.__doc__)
    p.add_option("--binsize", default=1000, type="int",
                 help="Window size along chromosome")
    p.add_option("--cleanup", default=False, action="store_true",
                 help="Clean up downloaded s3 folder")
    p.add_option("--hmm", default=False, action="store_true",
                 help="Run HMM caller after computing CN")
    p.add_option("--upload", default="s3://hli-mv-data-science/htang/ccn",
                 help="Upload cn and seg results to s3")
    p.add_option("--rebuildgc",
                 help="Rebuild GC directory rather than pulling from S3")
    opts, args = p.parse_args(args)

    if len(args) == 2:
        workdir, sample_key = args
        s3dir = None
    elif len(args) == 3:
        workdir, sample_key, s3dir = args
    else:
        sys.exit(not p.print_help())

    n = opts.binsize
    rebuildgc = opts.rebuildgc
    mkdir(workdir)
    sampledir = op.join(workdir, sample_key)
    if s3dir:
        sync_from_s3(s3dir, target_dir=sampledir)

    assert op.exists(sampledir), "Directory {} doesn't exist!".format(sampledir)

    cndir = op.join(workdir, sample_key + "-cn")
    if op.exists(cndir):
        logging.debug("Directory {} exists. Skipped.".format(cndir))
        return

    gcdir = "gc"
    if rebuildgc:
        build_gc_array(fastafile=rebuildgc, n=n, gcdir=gcdir)
    if not op.exists(gcdir):
        sync_from_s3("s3://hli-mv-data-science/htang/ccn/gc", target_dir=gcdir)

    # Build GC correction table
    gc_bin = defaultdict(list)
    gc_med = {}
    coverage = []

    for seqid in allsomes:
        gcfile = op.join(gcdir, "{}.{}.gc".format(seqid, n))
        if not op.exists(gcfile):
            logging.error("File {} not found. Continue anyway.".format(gcfile))
            continue
        gc = np.fromfile(gcfile, dtype=np.uint8)
        cibfile = op.join(sampledir, "{}.{}.cib".format(sample_key, seqid))
        cib = load_cib(cibfile)
        print(seqid, gc.shape[0], cib.shape[0], file=sys.stderr)
        if seqid in autosomes:
            for gci, k in zip(gc, cib):
                gc_bin[gci].append(k)
        coverage.append((seqid, gc, cib))

    for gci, k in gc_bin.items():
        nonzero_k = [x for x in k if x]
        gc_med[gci] = med = np.median(nonzero_k) / 2
        print(gci, len(nonzero_k), med, file=sys.stderr)

    mkdir(cndir)
    apply_fun = np.vectorize(gc_med.get)

    # Apply the GC correction over coverage
    for seqid, gc, cib in coverage:
        nitems = cib.shape[0]
        beta = apply_fun(gc[:nitems])
        beta_cn = cib / beta
        cnfile = op.join(cndir, "{}.{}.cn".format(sample_key, seqid))
        beta_cn.tofile(cnfile)

    # Run HMM caller if asked
    segfile = hmm([workdir, sample_key]) if opts.hmm else None

    upload = opts.upload
    if upload:
        push_to_s3(upload, cndir)
        if segfile:
            push_to_s3(upload, segfile)

    if opts.cleanup:
        import shutil

        shutil.rmtree(sampledir)
        shutil.rmtree(cndir)
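# A compact sketch of the GC normalization performed above, on synthetic arrays
# (the per-bin GC values and raw coverage are invented): the median diploid
# coverage is computed per GC value, halved to estimate per-copy coverage, and
# the raw coverage is divided by that expectation to yield copy number.
def _gc_correction_example():
    import numpy as np
    from collections import defaultdict

    gc = np.array([40, 40, 41, 41, 40], dtype=np.uint8)   # GC value per bin
    cib = np.array([30.0, 32.0, 28.0, 30.0, 60.0])        # raw coverage per bin

    gc_bin = defaultdict(list)
    for gci, k in zip(gc, cib):
        gc_bin[gci].append(k)

    gc_med = {gci: np.median([x for x in k if x]) / 2 for gci, k in gc_bin.items()}
    beta = np.vectorize(gc_med.get)(gc)   # expected per-copy coverage per bin
    return cib / beta                     # roughly 2 for typical bins, elevated for the last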
def shred(args):
    """
    %prog shred fastafile

    Similar to the method of `shredContig` in runCA script. The contigs are
    shredded into pseudo-reads with certain length and depth.
    """
    p = OptionParser(shred.__doc__)
    p.set_depth(depth=2)
    p.add_option("--readlen", default=1000, type="int",
                 help="Desired length of the reads [default: %default]")
    p.add_option("--minctglen", default=0, type="int",
                 help="Ignore contig sequence less than [default: %default]")
    p.add_option("--shift", default=50, type="int",
                 help="Overlap between reads must be at least [default: %default]")
    p.add_option("--fasta", default=False, action="store_true",
                 help="Output shredded reads as FASTA sequences [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    libID = fastafile.split(".")[0]
    depth = opts.depth
    readlen = opts.readlen
    shift = opts.shift

    outfile = libID + ".depth{0}".format(depth)
    if opts.fasta:
        outfile += ".fasta"
    else:
        outfile += ".frg"
    f = Fasta(fastafile, lazy=True)

    fw = must_open(outfile, "w", checkexists=True)
    if not opts.fasta:
        print(headerTemplate.format(libID=libID), file=fw)

    """
    Taken from runCA:

                    |*********|
                    |###################|
    |--------------------------------------------------|
     ---------------1---------------
               ---------------2---------------
                         ---------------3---------------
    *** - center_increments
    ### - center_range_width
    """
    for ctgID, (name, rec) in enumerate(f.iteritems_ordered()):
        seq = rec.seq
        seqlen = len(seq)
        if seqlen < opts.minctglen:
            continue

        shredlen = min(seqlen - shift, readlen)
        numreads = max(seqlen * depth // shredlen, 1)
        center_range_width = seqlen - shredlen

        ranges = []
        if depth == 1:
            if seqlen < readlen:
                ranges.append((0, seqlen))
            else:
                for begin in range(0, seqlen, readlen - shift):
                    end = min(seqlen, begin + readlen)
                    ranges.append((begin, end))
        else:
            if numreads == 1:
                ranges.append((0, shredlen))
            else:
                prev_begin = -1
                center_increments = center_range_width * 1. / (numreads - 1)
                for i in range(numreads):
                    begin = center_increments * i
                    end = begin + shredlen
                    begin, end = int(begin), int(end)

                    if begin == prev_begin:
                        continue

                    ranges.append((begin, end))
                    prev_begin = begin

        for shredID, (begin, end) in enumerate(ranges):
            shredded_seq = seq[begin:end]
            fragID = "{0}.{1}.frag{2}.{3}-{4}".format(libID, ctgID, shredID,
                                                      begin, end)
            emitFragment(fw, fragID, libID, shredded_seq, fasta=opts.fasta)

    fw.close()
    logging.debug("Shredded reads are written to `{0}`.".format(outfile))
    return outfile
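# A numeric sketch of the evenly-spaced shredding above (contig size invented):
# for a 2,500 bp contig at depth 2 with 1,000 bp reads, read starts are spread
# uniformly over the region where a full-length read still fits.
def _shred_ranges_example():
    seqlen, depth, readlen, shift = 2500, 2, 1000, 50
    shredlen = min(seqlen - shift, readlen)                   # 1000
    numreads = max(seqlen * depth // shredlen, 1)             # 5
    center_range_width = seqlen - shredlen                    # 1500
    center_increments = center_range_width / (numreads - 1)   # 375.0
    return [(int(center_increments * i), int(center_increments * i) + shredlen)
            for i in range(numreads)]
    # -> [(0, 1000), (375, 1375), (750, 1750), (1125, 2125), (1500, 2500)]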
def entrez(args):
    """
    %prog entrez <filename|term>

    `filename` contains a list of terms to search. Or just one term. If the
    results are small in size, e.g. "--format=acc", use "--batchsize=100" to
    speed the download.
    """
    p = OptionParser(entrez.__doc__)

    allowed_databases = {
        "fasta": ["genome", "nuccore", "nucgss", "protein", "nucest"],
        "asn.1": ["genome", "nuccore", "nucgss", "protein"],
        "gb": ["genome", "nuccore", "nucgss"],
        "est": ["nucest"],
        "gss": ["nucgss"],
        "acc": ["nuccore"],
    }

    valid_formats = tuple(allowed_databases.keys())
    valid_databases = ("genome", "nuccore", "nucest", "nucgss", "protein")

    p.add_option("--noversion", dest="noversion", default=False, action="store_true",
                 help="Remove trailing accession versions")
    p.add_option("--format", default="fasta", choices=valid_formats,
                 help="download format [default: %default]")
    p.add_option("--database", default="nuccore", choices=valid_databases,
                 help="search database [default: %default]")
    p.add_option("--retmax", default=1000000, type="int",
                 help="how many results to return [default: %default]")
    p.add_option("--skipcheck", default=False, action="store_true",
                 help="turn off prompt to check file existence [default: %default]")
    p.add_option("--batchsize", default=500, type="int",
                 help="download the results in batch for speed-up [default: %default]")
    p.add_option("--outdir", default=None,
                 help="output directory, with accession number as filename")
    p.add_option("--outprefix", default="out",
                 help="output file name prefix [default: %default]")
    p.set_email()

    opts, args = p.parse_args(args)
    if len(args) != 1:
        sys.exit(p.print_help())

    filename, = args
    if op.exists(filename):
        pf = filename.rsplit(".", 1)[0]
        list_of_terms = [row.strip() for row in open(filename)]
        if opts.noversion:
            list_of_terms = [x.rsplit(".", 1)[0] for x in list_of_terms]
    else:
        pf = filename
        # the filename is the search term
        list_of_terms = [filename.strip()]

    fmt = opts.format
    database = opts.database
    batchsize = opts.batchsize

    assert database in allowed_databases[fmt], \
        "For output format '{0}', allowed databases are: {1}".\
        format(fmt, allowed_databases[fmt])
    assert batchsize >= 1, "batchsize must >= 1"

    if " " in pf:
        pf = opts.outprefix

    outfile = "{0}.{1}".format(pf, fmt)

    outdir = opts.outdir
    if outdir:
        mkdir(outdir)

    # If noprompt, will not check file existence
    if not outdir:
        fw = must_open(outfile, "w", checkexists=True,
                       skipcheck=opts.skipcheck)
        if fw is None:
            return

    seen = set()
    totalsize = 0
    for id, size, term, handle in batch_entrez(list_of_terms, retmax=opts.retmax,
                                               rettype=fmt, db=database,
                                               batchsize=batchsize,
                                               email=opts.email):
        if outdir:
            outfile = urljoin(outdir, "{0}.{1}".format(term, fmt))
            fw = must_open(outfile, "w", checkexists=True,
                           skipcheck=opts.skipcheck)
            if fw is None:
                continue

        rec = handle.read()
        if id in seen:
            logging.error("Duplicate key ({0}) found".format(rec))
            continue

        totalsize += size
        print(rec, file=fw)
        print(file=fw)

        seen.add(id)

    if seen:
        print("A total of {0} {1} records downloaded.".
              format(totalsize, fmt.upper()), file=sys.stderr)

    return outfile
def ace(args):
    """
    %prog ace bamfile fastafile

    Convert BAM format to ACE format. This often allows the remapping to be
    assessed as a denovo assembly format. The BAM file needs to be indexed.
    Also creates a .mates file to be used in amos/bambus, and a .astat file to
    mark whether the contig is unique or repetitive based on A-statistics in
    Celera assembler.
    """
    p = OptionParser(ace.__doc__)
    p.add_option("--splitdir", dest="splitdir", default="outRoot",
                 help="split the ace per contig to dir")
    p.add_option("--unpaired", dest="unpaired", default=False,
                 help="remove read pairs on the same contig")
    p.add_option("--minreadno", dest="minreadno", default=3, type="int",
                 help="minimum read numbers per contig")
    p.add_option("--minctgsize", dest="minctgsize", default=100, type="int",
                 help="minimum contig size per contig")
    p.add_option("--astat", default=False, action="store_true",
                 help="create .astat to list repetitiveness")
    p.add_option("--readids", default=False, action="store_true",
                 help="create file of mapped and unmapped ids")

    from pysam import Samfile

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bamfile, fastafile = args
    astat = opts.astat
    readids = opts.readids

    f = Fasta(fastafile)
    prefix = bamfile.split(".")[0]
    acefile = prefix + ".ace"
    readsfile = prefix + ".reads"
    astatfile = prefix + ".astat"

    logging.debug("Load {0}".format(bamfile))
    s = Samfile(bamfile, "rb")

    ncontigs = s.nreferences
    genomesize = sum(x for a, x in f.itersizes())
    logging.debug("Total {0} contigs with size {1} base".format(ncontigs, genomesize))
    qual = "20"  # default qual

    totalreads = sum(s.count(x) for x in s.references)
    logging.debug("Total {0} reads mapped".format(totalreads))

    fw = open(acefile, "w")
    if astat:
        astatfw = open(astatfile, "w")
    if readids:
        readsfw = open(readsfile, "w")

    print("AS {0} {1}".format(ncontigs, totalreads), file=fw)
    print(file=fw)

    for i, contig in enumerate(s.references):
        cseq = f[contig]
        nbases = len(cseq)

        mapped_reads = [x for x in s.fetch(contig) if not x.is_unmapped]
        nreads = len(mapped_reads)

        nsegments = 0
        print("CO {0} {1} {2} {3} U".format(contig, nbases, nreads, nsegments),
              file=fw)
        print(fill(str(cseq.seq)), file=fw)
        print(file=fw)

        if astat:
            astat = Astat(nbases, nreads, genomesize, totalreads)
            print("{0}\t{1:.1f}".format(contig, astat), file=astatfw)

        text = fill([qual] * nbases, delimiter=" ", width=30)
        print("BQ\n{0}".format(text), file=fw)
        print(file=fw)

        rnames = []
        for a in mapped_reads:
            readname = a.qname
            rname = readname

            if readids:
                print(readname, file=readsfw)
            rnames.append(rname)

            strand = "C" if a.is_reverse else "U"
            paddedstart = a.pos + 1  # 0-based to 1-based
            af = "AF {0} {1} {2}".format(rname, strand, paddedstart)
            print(af, file=fw)

        print(file=fw)

        for a, rname in zip(mapped_reads, rnames):
            aseq, npadded = cigar_to_seq(a)
            if aseq is None:
                continue

            ninfos = 0
            ntags = 0
            alen = len(aseq)
            rd = "RD {0} {1} {2} {3}\n{4}".format(rname, alen, ninfos, ntags,
                                                  fill(aseq))
            qs = "QA 1 {0} 1 {0}".format(alen)

            print(rd, file=fw)
            print(file=fw)
            print(qs, file=fw)
            print(file=fw)
def fasta(args):
    """
    %prog fasta fastafile

    Convert reads in FASTA format to a CA frg file. If a .qual file is found,
    then use it, otherwise just make a fake qual file. Mates are assumed as
    adjacent sequence records (i.e. /1, /2, /1, /2 ...) unless a matefile is
    given.
    """
    from jcvi.formats.fasta import clean, make_qual

    p = OptionParser(fasta.__doc__)
    p.add_option("-m", dest="matefile", default=None, help="matepairs file")
    p.add_option("--maxreadlen", default=32000, type="int",
                 help="Maximum read length allowed [default: %default]")
    p.set_size()
    opts, args = p.parse_args(args)
    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    maxreadlen = opts.maxreadlen

    f = Fasta(fastafile, lazy=True)
    if maxreadlen > 0:
        split = False
        for id, size in f.itersizes_ordered():
            if size > maxreadlen:
                logging.debug("Sequence {0} (size={1}) longer than max read len {2}".
                              format(id, size, maxreadlen))
                split = True
                break

        if split:
            for f in split_fastafile(fastafile, maxreadlen=maxreadlen):
                fasta([f, "--maxreadlen=0"])
            return

    plate = op.basename(fastafile).split(".")[0]

    mated = (opts.size != 0)
    mean, sv = get_mean_sv(opts.size)

    if mated:
        libname = "Sanger{0}Kb-".format(opts.size // 1000) + plate
    else:
        libname = "SangerFrags-" + plate

    frgfile = libname + ".frg"

    cleanfasta = fastafile.rsplit(".", 1)[0] + ".clean.fasta"
    if need_update(fastafile, cleanfasta):
        clean([fastafile, "--canonical", "-o", cleanfasta])
    fastafile = cleanfasta

    qualfile = make_qual(fastafile, score=21)
    if mated:
        if opts.matefile:
            matefile = opts.matefile
            assert op.exists(matefile)
        else:
            matefile = make_matepairs(fastafile)

    cmd = "convert-fasta-to-v2.pl"
    cmd += " -l {0} -s {1} -q {2} ".format(libname, fastafile, qualfile)
    if mated:
        cmd += "-mean {0} -stddev {1} -m {2} ".format(mean, sv, matefile)

    sh(cmd, outfile=frgfile)
def simulate(args):
    """
    %prog simulate

    Run simulation on female restitution.
    """
    import seaborn as sns

    sns.set_style("darkgrid")

    p = OptionParser(simulate.__doc__)
    p.add_option("--verbose", default=False, action="store_true",
                 help="Verbose logging during simulation")
    opts, args, iopts = p.set_image_options(args, figsize="7x10")
    if len(args) != 0:
        sys.exit(not p.print_help())

    # Construct a composite figure with 6 tracks
    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])
    rows = 7
    ypad = 0.05
    yinterval = (1 - 2 * ypad) / (rows + 1)
    yy = 1 - ypad
    xpad = 0.18
    xwidth = 0.6

    # Axes are vertically stacked, and share x-axis
    axes = []
    yy_positions = []  # Save yy positions so we can show details to the right later
    for idx in range(rows):
        yy_positions.append(yy)
        yy -= yinterval
        ax = fig.add_axes([xpad, yy, xwidth, yinterval * 0.85])
        if idx != rows - 1:
            plt.setp(ax.get_xticklabels(), visible=False)
        axes.append(ax)
    ax1, ax2, ax3, ax4, ax5, ax6, ax7 = axes

    # Prepare the simulated data
    # Simulate two parents
    SS = Genome("SS", "SS", 10, 8)
    SO = Genome("SO", "SO", 8, 10)

    verbose = opts.verbose
    all_F1s = [simulate_F1(SO, SS, verbose=verbose) for _ in range(1000)]
    all_F2s = [simulate_F2(SO, SS, verbose=verbose) for _ in range(1000)]
    all_F1intercrosses = [simulate_F1intercross(SO, SS, verbose) for _ in range(1000)]
    all_BC1s = [simulate_BCn(1, SO, SS, verbose=verbose) for _ in range(1000)]
    all_BC2s = [simulate_BCn(2, SO, SS, verbose=verbose) for _ in range(1000)]
    all_BC3s = [simulate_BCn(3, SO, SS, verbose=verbose) for _ in range(1000)]
    all_BC4s = [simulate_BCn(4, SO, SS, verbose=verbose) for _ in range(1000)]

    # Plotting
    f1s = plot_summary(ax1, all_F1s)
    f2s = plot_summary(ax2, all_F2s)
    f1is = plot_summary(ax3, all_F1intercrosses)
    bc1s = plot_summary(ax4, all_BC1s)
    bc2s = plot_summary(ax5, all_BC2s)
    bc3s = plot_summary(ax6, all_BC3s)
    bc4s = plot_summary(ax7, all_BC4s)

    # Show title to the left
    xx = xpad / 2
    for (title, subtitle), yy in zip(
        (
            ("F1", None),
            ("F2", "via selfing"),
            ("F2", "via intercross"),
            ("BC1", None),
            ("BC2", None),
            ("BC3", None),
            ("BC4", None),
        ),
        yy_positions,
    ):
        if subtitle:
            yy -= 0.06
        else:
            yy -= 0.07
        root.text(xx, yy, title, color="darkslategray", ha="center", va="center")
        if subtitle:
            yy -= 0.02
            root.text(xx, yy, subtitle, color="lightslategray",
                      ha="center", va="center")

    # Show summary stats to the right
    xx = 1 - (1 - xpad - xwidth) / 2
    for summary, yy in zip((f1s, f2s, f1is, bc1s, bc2s, bc3s, bc4s), yy_positions):
        yy -= 0.04
        root.text(xx, yy, summary.SO_summary, color=SoColor,
                  ha="center", va="center")
        yy -= 0.02
        root.text(xx, yy, summary.SS_summary, color=SsColor,
                  ha="center", va="center")
        yy -= 0.02
        root.text(xx, yy, summary.percent_SO_summary, color=SoColor,
                  ha="center", va="center")

    ax7.set_xlabel("Number of unique chromosomes")
    adjust_spines(ax7, ["bottom"], outward=True)
    normalize_axes(root)

    savefig("plotter.pdf", dpi=120)

    outdir = "simulations"
    mkdir(outdir)
    # Write chromosomes to disk
    for genomes, filename in (
        (all_F1s, "all_F1s"),
        (all_F2s, "all_F2s"),
        (all_F1intercrosses, "all_F1intercrosses"),
        (all_BC1s, "all_BC1s"),
        (all_BC2s, "all_BC2s"),
        (all_BC3s, "all_BC3s"),
        (all_BC4s, "all_BC4s"),
    ):
        write_chromosomes(genomes, op.join(outdir, filename))
def htg(args):
    """
    %prog htg fastafile template.sbt

    Prepare sqnfiles for Genbank HTG submission to update existing records.

    `fastafile` contains the records to update; multiple records are allowed
    (each one generating a separate sqn file in the sqn/ folder). The record
    defline has the accession ID. For example,
    >AC148290.3

    Internally, this generates two additional files (phasefile and namesfile)
    and downloads records from Genbank. Below are the implementation details:

    `phasefile` contains, for each accession, phase information. For example:
    AC148290.3      3       HTG     2       mth2-45h12

    which means this is a Phase-3 BAC. Records with only a single contig will be
    labeled as Phase-3 regardless of the info in the `phasefile`. The template
    file is the Genbank sbt template. See jcvi.formats.sbt for generation of
    such files.

    Another problem is that Genbank requires the name of the sequence to stay
    the same when updating and will kick back with a table of name conflicts.
    For example:

    We are unable to process the updates for these entries
    for the following reason:

    Seqname has changed

    Accession  Old seq_name  New seq_name
    ---------  ------------  ------------
    AC239792   mtg2_29457    AC239792.1

    To prepare a submission, this script downloads the genbank and asn.1
    formats, and generates the phase file and the names file (using
    formats.agp.phase() and apps.gbsubmit.asn(), respectively). These get run
    automatically. However, use --phases if the genbank files contain outdated
    information, for example when the clone name changes or the phase upgrades.
    In this case, run formats.agp.phase() manually, modify the phasefile and
    use --phases to override.
    """
    from jcvi.formats.fasta import sequin, ids
    from jcvi.formats.agp import phase
    from jcvi.apps.fetch import entrez

    p = OptionParser(htg.__doc__)
    p.add_option("--phases", default=None,
                 help="Use another phasefile to override [default: %default]")
    p.add_option("--comment", default="",
                 help="Comments for this update [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, sbtfile = args
    pf = fastafile.rsplit(".", 1)[0]

    idsfile = pf + ".ids"
    phasefile = pf + ".phases"
    namesfile = pf + ".names"

    ids([fastafile, "--outfile={0}".format(idsfile)])

    asndir = "asn.1"
    mkdir(asndir)
    entrez([idsfile, "--format=asn.1", "--outdir={0}".format(asndir)])
    asn(glob("{0}/*".format(asndir)) +
        ["--outfile={0}".format(namesfile)])

    if opts.phases is None:
        gbdir = "gb"
        mkdir(gbdir)
        entrez([idsfile, "--format=gb", "--outdir={0}".format(gbdir)])
        phase(glob("{0}/*".format(gbdir)) +
              ["--outfile={0}".format(phasefile)])
    else:
        phasefile = opts.phases

    assert op.exists(namesfile) and op.exists(phasefile)

    newphasefile = phasefile + ".new"
    newphasefw = open(newphasefile, "w")
    comment = opts.comment

    fastadir = "fasta"
    sqndir = "sqn"
    mkdir(fastadir)
    mkdir(sqndir)

    from jcvi.graphics.histogram import stem_leaf_plot

    names = DictFile(namesfile)
    assert len(set(names.keys())) == len(set(names.values()))

    phases = DictFile(phasefile)
    ph = [int(x) for x in phases.values()]
    # vmin 1, vmax 4, bins 3
    stem_leaf_plot(ph, 1, 4, 3, title="Counts of phases before updates")
    logging.debug("Information loaded for {0} records.".format(len(phases)))
    assert len(names) == len(phases)

    newph = []

    cmd = "faSplit byname {0} {1}/".format(fastafile, fastadir)
    sh(cmd, outfile="/dev/null", errfile="/dev/null")

    acmd = 'tbl2asn -a z -p fasta -r {sqndir}'
    acmd += ' -i {splitfile} -t {sbtfile} -C tigr'
    acmd += ' -j "{qualifiers}"'
    acmd += ' -A {accession_nv} -o {sqndir}/{accession_nv}.sqn -V Vbr'
    acmd += ' -y "{comment}" -W T -T T'

    qq = "[tech=htgs {phase}] [organism=Medicago truncatula] [strain=A17]"

    nupdated = 0
    for row in open(phasefile):
        atoms = row.rstrip().split("\t")
        # see formats.agp.phase() for column contents
        accession, phase, clone = atoms[0], atoms[1], atoms[-1]
        fafile = op.join(fastadir, accession + ".fa")
        accession_nv = accession.split(".", 1)[0]

        newid = names[accession_nv]
        newidopt = "--newid={0}".format(newid)
        cloneopt = "--clone={0}".format(clone)
        splitfile, gaps = sequin([fafile, newidopt, cloneopt])
        splitfile = op.basename(splitfile)
        phase = int(phase)
        assert phase in (1, 2, 3)

        oldphase = phase
        if gaps == 0 and phase != 3:
            phase = 3

        if gaps != 0 and phase == 3:
            phase = 2

        print >> newphasefw, "{0}\t{1}\t{2}".format(accession_nv, oldphase, phase)
        newph.append(phase)

        qualifiers = qq.format(phase=phase)
        if ";" in clone:
            qualifiers += " [keyword=HTGS_POOLED_MULTICLONE]"

        cmd = acmd.format(accession=accession, accession_nv=accession_nv,
                          sqndir=sqndir, sbtfile=sbtfile, splitfile=splitfile,
                          qualifiers=qualifiers, comment=comment)
        sh(cmd)
        verify_sqn(sqndir, accession)
        nupdated += 1

    stem_leaf_plot(newph, 1, 4, 3, title="Counts of phases after updates")
    print >> sys.stderr, "A total of {0} records updated.".format(nupdated)
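# Hedged sketch of two pieces of bookkeeping described in the htg() docstring: parsing
# one tab-separated `phasefile` row (column layout as produced by formats.agp.phase(),
# e.g. "AC148290.3  3  HTG  2  mth2-45h12") and the per-record phase adjustment applied
# above. Both helpers are illustrative only, not part of the module's API.
def _parse_phase_row(row):
    atoms = row.rstrip().split("\t")
    accession, phase, clone = atoms[0], int(atoms[1]), atoms[-1]
    return accession, phase, clone


def _adjust_phase(phase, gaps):
    # A gap-free record is promoted to Phase 3; a gapped record claiming Phase 3
    # is demoted to Phase 2. Otherwise the recorded phase stands.
    if gaps == 0 and phase != 3:
        return 3
    if gaps != 0 and phase == 3:
        return 2
    return phase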
def summary(args): """ %prog summary diploid.napus.fractionation gmap.status Provide summary of fractionation. `fractionation` file is generated with loss(). `gmap.status` is generated with genestatus(). """ from jcvi.formats.base import DictFile from jcvi.utils.cbook import percentage, Registry p = OptionParser(summary.__doc__) p.add_option("--extra", help="Cross with extra tsv file") opts, args = p.parse_args(args) if len(args) != 2: sys.exit(not p.print_help()) frfile, statusfile = args status = DictFile(statusfile) fp = open(frfile) registry = Registry() # keeps all the tags for any given gene for row in fp: seqid, gene, tag = row.split() if tag == ".": registry[gene].append("outside") else: registry[gene].append("inside") if tag[0] == "[": registry[gene].append("no_syntenic_model") if tag.startswith("[S]"): registry[gene].append("[S]") gstatus = status.get(gene, None) if gstatus == "complete": registry[gene].append("complete") elif gstatus == "pseudogene": registry[gene].append("pseudogene") elif gstatus == "partial": registry[gene].append("partial") else: registry[gene].append("gmap_fail") elif tag.startswith("[NS]"): registry[gene].append("[NS]") if "random" in tag or "Scaffold" in tag: registry[gene].append("random") else: registry[gene].append("real_ns") elif tag.startswith("[NF]"): registry[gene].append("[NF]") else: registry[gene].append("syntenic_model") inside = registry.count("inside") outside = registry.count("outside") syntenic = registry.count("syntenic_model") non_syntenic = registry.count("no_syntenic_model") s = registry.count("[S]") ns = registry.count("[NS]") nf = registry.count("[NF]") complete = registry.count("complete") pseudogene = registry.count("pseudogene") partial = registry.count("partial") gmap_fail = registry.count("gmap_fail") random = registry.count("random") real_ns = registry.count("real_ns") complete_models = registry.get_tag("complete") pseudogenes = registry.get_tag("pseudogene") partial_deletions = registry.get_tag("partial") m = "{0} inside synteny blocks\n".format(inside) m += "{0} outside synteny blocks\n".format(outside) m += "{0} has syntenic gene\n".format(syntenic) m += "{0} lack syntenic gene\n".format(non_syntenic) m += "{0} has sequence match in syntenic location\n".format(s) m += "{0} has sequence match in non-syntenic location\n".format(ns) m += "{0} has sequence match in un-ordered scaffolds\n".format(random) m += "{0} has sequence match in real non-syntenic location\n".format( real_ns) m += "{0} has no sequence match\n".format(nf) m += "{0} syntenic sequence - complete model\n".format( percentage(complete, s)) m += "{0} syntenic sequence - partial model\n".format( percentage(partial, s)) m += "{0} syntenic sequence - pseudogene\n".format( percentage(pseudogene, s)) m += "{0} syntenic sequence - gmap fail\n".format(percentage(gmap_fail, s)) print(m, file=sys.stderr) aa = ["complete_models", "partial_deletions", "pseudogenes"] bb = [complete_models, partial_deletions, pseudogenes] for a, b in zip(aa, bb): fw = open(a, "w") print("\n".join(b), file=fw) fw.close() extra = opts.extra if extra: registry.update_from(extra) fp.seek(0) fw = open("registry", "w") for row in fp: seqid, gene, tag = row.split() ts = registry[gene] print("\t".join((seqid, gene, tag, "-".join(ts))), file=fw) fw.close() logging.debug("Registry written.")
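# Hedged restatement of the tag logic inside summary(), pulled out as a small helper for
# readability (illustrative only; summary() itself records these categories in a Registry
# and additionally folds in the gmap status for "[S]" genes).
def _classify_fractionation_tag(tag):
    if tag == ".":
        return ["outside"]
    cats = ["inside"]
    if tag[0] == "[":
        cats.append("no_syntenic_model")
        if tag.startswith("[S]"):
            cats.append("[S]")
        elif tag.startswith("[NS]"):
            cats.append("[NS]")
            cats.append("random" if ("random" in tag or "Scaffold" in tag) else "real_ns")
        elif tag.startswith("[NF]"):
            cats.append("[NF]")
    else:
        cats.append("syntenic_model")
    return cats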
def index(args): """ %prog index samfile/bamfile If SAM file, convert to BAM, sort and then index, using SAMTOOLS """ p = OptionParser(index.__doc__) p.add_option( "--fasta", dest="fasta", default=None, help="add @SQ header to the BAM file" ) p.add_option( "--unique", default=False, action="store_true", help="only retain uniquely mapped reads", ) p.set_cpus() opts, args = p.parse_args(args) if len(args) != 1: sys.exit(p.print_help()) (samfile,) = args cpus = opts.cpus fastafile = opts.fasta if fastafile: assert op.exists(fastafile) bamfile = samfile.replace(".sam", ".bam") if fastafile: faifile = fastafile + ".fai" if need_update(fastafile, faifile): sh("samtools faidx {0}".format(fastafile)) cmd = "samtools view -bt {0} {1} -o {2}".format(faifile, samfile, bamfile) else: cmd = "samtools view -bS {0} -o {1}".format(samfile, bamfile) cmd += " -@ {0}".format(cpus) if opts.unique: cmd += " -q 1" if samfile.endswith(".sam") and need_update(samfile, bamfile): sh(cmd) # Already sorted? if bamfile.endswith(".sorted.bam"): sortedbamfile = bamfile else: prefix = bamfile.replace(".bam", "") sortedbamfile = prefix + ".sorted.bam" if need_update(bamfile, sortedbamfile): cmd = "samtools sort {0} -o {1}".format(bamfile, sortedbamfile) cmd += " -@ {0}".format(cpus) sh(cmd) baifile = sortedbamfile + ".bai" if need_update(sortedbamfile, baifile): sh("samtools index {0}".format(sortedbamfile)) return sortedbamfile
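# Hedged sketch of the file-naming convention index() relies on (filenames illustrative):
# foo.sam -> foo.bam -> foo.sorted.bam -> foo.sorted.bam.bai, with inputs that are already
# *.sorted.bam kept under their original name.
def _index_filenames(samfile):
    bamfile = samfile.replace(".sam", ".bam")
    if bamfile.endswith(".sorted.bam"):
        sortedbamfile = bamfile
    else:
        sortedbamfile = bamfile.replace(".bam", "") + ".sorted.bam"
    return bamfile, sortedbamfile, sortedbamfile + ".bai"

# _index_filenames("aln.sam") -> ("aln.bam", "aln.sorted.bam", "aln.sorted.bam.bai")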
def align(args): """ %prog align database.fasta read1.fq [read2.fq] Wrapper for `bowtie2` single-end or paired-end, depending on the number of args. """ from jcvi.formats.fastq import guessoffset p = OptionParser(align.__doc__) p.set_firstN(firstN=0) p.add_option("--full", default=False, action="store_true", help="Enforce end-to-end alignment [default: local]") p.add_option("--reorder", default=False, action="store_true", help="Keep the input read order [default: %default]") p.add_option("--null", default=False, action="store_true", help="Do not write to SAM/BAM output") p.add_option("--fasta", default=False, action="store_true", help="Query reads are FASTA") p.set_cutoff(cutoff=800) p.set_mateorientation(mateorientation="+-") p.set_sam_options(bowtie=True) opts, args = p.parse_args(args) extra = opts.extra mo = opts.mateorientation if mo == '+-': extra += "" elif mo == '-+': extra += "--rf" else: extra += "--ff" PE = True if len(args) == 2: logging.debug("Single-end alignment") PE = False elif len(args) == 3: logging.debug("Paired-end alignment") else: sys.exit(not p.print_help()) firstN = opts.firstN mapped = opts.mapped unmapped = opts.unmapped fasta = opts.fasta gl = "--end-to-end" if opts.full else "--local" dbfile, readfile = args[0:2] dbfile = check_index(dbfile) prefix = get_prefix(readfile, dbfile) samfile, mapped, unmapped = get_samfile(readfile, dbfile, bowtie=True, mapped=mapped, unmapped=unmapped, bam=opts.bam) logfile = prefix + ".log" if not fasta: offset = guessoffset([readfile]) if not need_update(dbfile, samfile): logging.error("`{0}` exists. `bowtie2` already run.".format(samfile)) return samfile, logfile cmd = "bowtie2 -x {0}".format(dbfile) if PE: r1, r2 = args[1:3] cmd += " -1 {0} -2 {1}".format(r1, r2) cmd += " --maxins {0}".format(opts.cutoff) mtag, utag = "--al-conc", "--un-conc" else: cmd += " -U {0}".format(readfile) mtag, utag = "--al", "--un" if mapped: cmd += " {0} {1}".format(mtag, mapped) if unmapped: cmd += " {0} {1}".format(utag, unmapped) if firstN: cmd += " --upto {0}".format(firstN) cmd += " -p {0}".format(opts.cpus) if fasta: cmd += " -f" else: cmd += " --phred{0}".format(offset) cmd += " {0}".format(gl) if opts.reorder: cmd += " --reorder" cmd += " {0}".format(extra) # Finally the log cmd += " 2> {0}".format(logfile) if opts.null: samfile = "/dev/null" cmd = output_bam(cmd, samfile) sh(cmd) print(open(logfile).read(), file=sys.stderr) return samfile, logfile
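# Hedged sketch of how align() turns the mate orientation option into a bowtie2 flag:
# '+-' is bowtie2's default (--fr), so nothing is appended; '-+' maps to --rf, and any
# other value falls through to --ff, mirroring the if/elif/else above.
def _mate_orientation_flag(mateorientation):
    if mateorientation == "+-":
        return ""
    if mateorientation == "-+":
        return "--rf"
    return "--ff"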
def loss(args): """ %prog loss a.b.i1.blocks [a.b-genomic.blast] Extract likely gene loss candidates between genome a and b. """ p = OptionParser(loss.__doc__) p.add_option( "--bed", default=False, action="store_true", help="Genomic BLAST is in bed format", ) p.add_option("--gdist", default=20, type="int", help="Gene distance") p.add_option( "--bdist", default=20000, type="int", help="Base pair distance", ) p.set_beds() opts, args = p.parse_args(args) if len(args) not in (1, 2): sys.exit(not p.print_help()) blocksfile = args[0] emptyblast = len(args) == 1 if emptyblast: genomicblast = "empty.blast" sh("touch {0}".format(genomicblast)) else: genomicblast = args[1] gdist, bdist = opts.gdist, opts.bdist qbed, sbed, qorder, sorder, is_self = check_beds(blocksfile, p, opts) blocks = [] fp = open(blocksfile) genetrack = {} proxytrack = {} for row in fp: a, b = row.split() genetrack[a] = b blocks.append((a, b)) data = [] for key, rows in groupby(blocks, key=lambda x: x[-1]): rows = list(rows) data.append((key, rows)) imax = len(data) - 1 for i, (key, rows) in enumerate(data): if i == 0 or i == imax: continue if key != ".": continue before, br = data[i - 1] after, ar = data[i + 1] bi, bx = sorder[before] ai, ax = sorder[after] dist = abs(bi - ai) if bx.seqid != ax.seqid or dist > gdist: continue start, end = range_minmax(((bx.start, bx.end), (ax.start, ax.end))) start, end = max(start - bdist, 1), end + bdist proxy = (bx.seqid, start, end) for a, b in rows: proxytrack[a] = proxy tags = {} if opts.bed: bed = Bed(genomicblast, sorted=False) key = lambda x: gene_name(x.accn.rsplit(".", 1)[0]) for query, bb in groupby(bed, key=key): bb = list(bb) if query not in proxytrack: continue proxy = proxytrack[query] tag = "NS" best_b = bb[0] for b in bb: hsp = (b.seqid, b.start, b.end) if range_overlap(proxy, hsp): tag = "S" best_b = b break hsp = (best_b.seqid, best_b.start, best_b.end) proxytrack[query] = hsp tags[query] = tag else: blast = Blast(genomicblast) for query, bb in blast.iter_hits(): bb = list(bb) query = gene_name(query) if query not in proxytrack: continue proxy = proxytrack[query] tag = "NS" best_b = bb[0] for b in bb: hsp = (b.subject, b.sstart, b.sstop) if range_overlap(proxy, hsp): tag = "S" best_b = b break hsp = (best_b.subject, best_b.sstart, best_b.sstop) proxytrack[query] = hsp tags[query] = tag for b in qbed: accn = b.accn target_region = genetrack[accn] if accn in proxytrack: target_region = region_str(proxytrack[accn]) if accn in tags: ptag = "[{0}]".format(tags[accn]) else: ptag = "[NF]" target_region = ptag + target_region print("\t".join((b.seqid, accn, target_region))) if emptyblast: sh("rm -f {0}".format(genomicblast))
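# Hedged summary of the tags emitted by loss(), as a standalone helper (illustrative;
# loss() also rewrites the proxy region to the chosen hit, and brackets the tag when
# printing, e.g. "[NF]"). `proxy` and each hsp are (seqid, start, end) tuples, and
# range_overlap is the same utility used above.
def _classify_loss_hit(proxy, hsps):
    if not hsps:
        return "NF", None               # no sequence match at all
    for hsp in hsps:
        if range_overlap(proxy, hsp):
            return "S", hsp             # match inside the expected syntenic window
    return "NS", hsps[0]                # matches exist, but only elsewhere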
def lobstr(args): """ %prog lobstr lobstr_index1 lobstr_index2 ... Run lobSTR on a big BAM file. There can be multiple lobSTR indices. In addition, bamfile can be S3 location and --lobstr_home can be S3 location (e.g. s3://hli-mv-data-science/htang/str-build/lobSTR/) """ p = OptionParser(lobstr.__doc__) p.add_option("--haploid", default="chrY,chrM", help="Use haploid model for these chromosomes") p.add_option("--chr", help="Run only this chromosome") p.add_option("--simulation", default=False, action="store_true", help="Simulation mode") p.set_home("lobstr", default="s3://hli-mv-data-science/htang/str-build/lobSTR/") p.set_cpus() p.set_aws_opts(store="hli-mv-data-science/htang/str-data") opts, args = p.parse_args(args) bamfile = opts.input_bam_path if len(args) < 1 or bamfile is None: sys.exit(not p.print_help()) lbindices = args if opts.simulation: # Simulation mode cmd, vcf_file = allelotype_on_chr(bamfile, "chr4", "/mnt/software/lobSTR/", "TREDs", haploid=opts.haploid) stats_file = vcf_file.rsplit(".", 1)[0] + ".allelotype.stats" results_dir = "lobstr_results" mkdir(results_dir) sh(cmd) sh("mv {} {}/ && rm {}".format(vcf_file, results_dir, stats_file)) return s3mode = bamfile.startswith("s3") store = opts.output_path cleanup = not opts.nocleanup workdir = opts.workdir mkdir(workdir) os.chdir(workdir) lhome = opts.lobstr_home if lhome.startswith("s3://"): lhome = pull_from_s3(lhome, overwrite=False) exec_id, sample_id = opts.workflow_execution_id, opts.sample_id prefix = [x for x in (exec_id, sample_id) if x] if prefix: pf = "_".join(prefix) else: pf = bamfile.split("/")[-1].split(".")[0] if s3mode: gzfile = pf + ".{0}.vcf.gz".format(lbindices[-1]) remotegzfile = "{0}/{1}".format(store, gzfile) if check_exists_s3(remotegzfile): logging.debug("Object `{0}` exists. Computation skipped."\ .format(remotegzfile)) return localbamfile = pf + ".bam" localbaifile = localbamfile + ".bai" if op.exists(localbamfile): logging.debug("BAM file already downloaded.") else: pull_from_s3(bamfile, localbamfile) if op.exists(localbaifile): logging.debug("BAM index file already downloaded.") else: remotebaifile = bamfile + ".bai" if check_exists_s3(remotebaifile): pull_from_s3(remotebaifile, localbaifile) else: remotebaifile = bamfile.rsplit(".")[0] + ".bai" if check_exists_s3(remotebaifile): pull_from_s3(remotebaifile, localbaifile) else: logging.debug("BAM index cannot be found in S3!") sh("samtools index {0}".format(localbamfile)) bamfile = localbamfile chrs = [opts.chr] if opts.chr else (range(1, 23) + ["X", "Y"]) for lbidx in lbindices: makefile = "makefile.{0}".format(lbidx) mm = MakeManager(filename=makefile) vcffiles = [] for chr in chrs: cmd, vcffile = allelotype_on_chr(bamfile, chr, lhome, lbidx, haploid=opts.haploid) mm.add(bamfile, vcffile, cmd) filteredvcffile = vcffile.replace(".vcf", ".filtered.vcf") cmd = "python -m jcvi.variation.str filtervcf {}".format(vcffile) cmd += " --lobstr_home {}".format(lhome) mm.add(vcffile, filteredvcffile, cmd) vcffiles.append(filteredvcffile) gzfile = bamfile.split(".")[0] + ".{0}.vcf.gz".format(lbidx) cmd = "vcf-concat {0} | vcf-sort".format(" ".join(vcffiles)) cmd += " | bgzip -c > {0}".format(gzfile) mm.add(vcffiles, gzfile, cmd) mm.run(cpus=opts.cpus) if s3mode: push_to_s3(store, gzfile) if cleanup: mm.clean() sh("rm -f {} {} *.bai *.stats".format(bamfile, mm.makefile))
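# Hedged sketch of the S3 convention used throughout lobstr(): any argument that looks
# like an s3:// URI is first materialized locally via pull_from_s3() (called above with
# the same signature); plain local paths pass through untouched.
def _localize(path, overwrite=False):
    return pull_from_s3(path, overwrite=overwrite) if path.startswith("s3://") else path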
def segment(args): """ %prog segment loss.ids bedfile Merge adjacent gene loss into segmental loss. Then based on the segmental loss, estimate amount of DNA loss in base pairs. Two estimates can be given: - conservative: just within the start and end of a single gene - aggressive: extend the deletion track to the next gene The real deletion size is within these estimates. """ from jcvi.formats.base import SetFile p = OptionParser(segment.__doc__) p.add_option( "--chain", default=1, type="int", help="Allow next N genes to be chained", ) opts, args = p.parse_args(args) if len(args) != 2: sys.exit(not p.print_help()) idsfile, bedfile = args bed = Bed(bedfile) order = bed.order ids = SetFile(idsfile) losses = Grouper() skip = opts.chain for i, a in enumerate(bed): a = a.accn for j in range(i + 1, i + 1 + skip): if j >= len(bed): break b = bed[j].accn if a in ids: losses.join(a, a) if a in ids and b in ids: losses.join(a, b) losses = list(losses) singletons = [x for x in losses if len(x) == 1] segments = [x for x in losses if len(x) > 1] ns, nm, nt = len(singletons), len(segments), len(losses) assert ns + nm == nt # Summary for all segments for x in sorted(singletons) + sorted(segments): print("\t".join( str(x) for x in ("|".join(sorted(x)), len(x), estimate_size(x, bed, order)))) # Find longest segment stretch if segments: mx, maxsegment = max([(len(x), x) for x in segments]) print("Longest stretch: run of {0} genes".format(mx), file=sys.stderr) print(" {0}".format("|".join(sorted(maxsegment))), file=sys.stderr) seg_asize = sum(estimate_size(x, bed, order) for x in segments) seg_bsize = sum( estimate_size(x, bed, order, conservative=False) for x in segments) else: seg_asize = seg_bsize = 0 sing_asize = sum(estimate_size(x, bed, order) for x in singletons) sing_bsize = sum( estimate_size(x, bed, order, conservative=False) for x in singletons) total_asize = sing_asize + seg_asize total_bsize = sing_bsize + seg_bsize print( "Singleton ({0}): {1} - {2} bp".format(ns, sing_asize, sing_bsize), file=sys.stderr, ) print("Segment ({0}): {1} - {2} bp".format(nm, seg_asize, seg_bsize), file=sys.stderr) print( "Total ({0}): {1} - {2} bp".format(nt, total_asize, total_bsize), file=sys.stderr, ) print( "Average ({0}): {1} bp".format(nt, (total_asize + total_bsize) / 2), file=sys.stderr, )
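# Worked toy example of the two estimates described in the segment() docstring
# (coordinates are made up). With four consecutive genes on one scaffold and the middle
# two lost, the conservative estimate spans only the lost genes, while the aggressive
# estimate extends out to the flanking retained genes.
def _deletion_size_example():
    coords = {"g1": (100, 200), "g2": (300, 400), "g3": (500, 600), "g4": (700, 800)}
    conservative = coords["g3"][1] - coords["g2"][0]   # 600 - 300 = 300 bp
    aggressive = coords["g4"][0] - coords["g1"][1]     # 700 - 200 = 500 bp
    return conservative, aggressive                    # true loss lies between the two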
def split(args): """ %prog split split.bed evidences.bed predictor1.gff predictor2.gff fastafile Split MAKER models by checking against predictors (such as AUGUSTUS and FGENESH). For each region covered by a working model. Find out the combination of predictors that gives the best accuracy against evidences (such as PASA). `split.bed` can be generated by pulling out subset from a list of ids $ python -m jcvi.formats.base join split.ids working.bed --column=0,3 --noheader | cut -f2-7 > split.bed """ from jcvi.formats.bed import Bed p = OptionParser(split.__doc__) p.add_option( "--key", default="Name", help= "Key in the attributes to extract predictor.gff [default: %default]") p.add_option( "--parents", default="match", help="list of features to extract, use comma to separate (e.g." "'gene,mRNA') [default: %default]") p.add_option( "--children", default="match_part", help="list of features to extract, use comma to separate (e.g." "'five_prime_UTR,CDS,three_prime_UTR') [default: %default]") opts, args = p.parse_args(args) if len(args) != 5: sys.exit(not p.print_help()) split_bed, evidences_bed, p1_gff, p2_gff, fastafile = args parents = opts.parents children = opts.children key = opts.key bed = Bed(split_bed) s1 = get_splits(split_bed, p1_gff, parents, key) s2 = get_splits(split_bed, p2_gff, parents, key) for b in bed: query = "{0}:{1}-{2}".format(b.seqid, b.start, b.end) b1 = get_accuracy(query, p1_gff, evidences_bed, fastafile, children, key) b2 = get_accuracy(query, p2_gff, evidences_bed, fastafile, children, key) accn = b.accn c1 = "|".join(s1[accn]) c2 = "|".join(s2[accn]) ac1 = b1.accuracy ac2 = b2.accuracy tag = p1_gff if ac1 >= ac2 else p2_gff tag = tag.split(".")[0] ac1 = "{0:.3f}".format(ac1) ac2 = "{0:.3f}".format(ac2) print "\t".join((accn, tag, ac1, ac2, c1, c2))
def meta(args):
    """
    %prog meta data.bin samples STR.ids STR-exons.wo.bed

    Compute allele frequencies and prune sites based on missingness.

    Filter subset of loci that satisfy:
    1. no redundancy (unique chr:pos)
    2. variable (n_alleles > 1)
    3. low level of missing data (>= 50% autosomal + X, > 25% for Y)

    Write meta file with the following info:
    1. id
    2. title
    3. gene_name
    4. variant_type
    5. motif
    6. allele_frequency

    `STR-exons.wo.bed` can be generated like this:
    $ tail -n 694105 /mnt/software/lobSTR/hg38/index.tab | cut -f1-3 > all-STR.bed
    $ intersectBed -a all-STR.bed -b all-exons.bed -wo > STR-exons.wo.bed
    """
    p = OptionParser(meta.__doc__)
    p.add_option("--cutoff", default=.5, type="float",
                 help="Percent observed required (chrY half cutoff)")
    p.set_cpus()
    opts, args = p.parse_args(args)
    if len(args) != 4:
        sys.exit(not p.print_help())

    binfile, sampleids, strids, wobed = args
    cutoff = opts.cutoff

    af_file = "allele_freq"
    if need_update(binfile, af_file):
        df, m, samples, loci = read_binfile(binfile, sampleids, strids)
        nalleles = len(samples)
        fw = must_open(af_file, "w")
        for i, locus in enumerate(loci):
            a = m[:, i]
            counts = alleles_to_counts(a)
            af = counts_to_af(counts)
            seqid = locus.split("_")[0]
            remove = counts_filter(counts, nalleles, seqid, cutoff=cutoff)
            print("\t".join((locus, af, remove)), file=fw)
        fw.close()

    logging.debug("Load gene intersections from `{}`".format(wobed))
    fp = open(wobed)
    gene_map = defaultdict(set)
    for row in fp:
        chr1, start1, end1, chr2, start2, end2, name, ov = row.split()
        gene_map[(chr1, start1)] |= set(name.split(","))
    for k, v in gene_map.items():
        non_enst = sorted(x for x in v if not x.startswith("ENST"))
        #enst = sorted(x.rsplit(".", 1)[0] for x in v if x.startswith("ENST"))
        gene_map[k] = ",".join(non_enst)

    TREDS, df = read_treds()

    metafile = "STRs_{}_SEARCH.meta.tsv".format(timestamp())
    write_meta(af_file, gene_map, TREDS, filename=metafile)
    logging.debug("File `{}` written.".format(metafile))
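# Hedged sketch of the missingness rule summarized in the meta() docstring: a locus is
# kept when the fraction of observed (non-missing) alleles meets --cutoff, and chrY is
# held to half the cutoff. The real bookkeeping lives in counts_filter(); this helper
# only illustrates the rule under that assumption.
def _passes_missingness(n_observed, n_total, seqid, cutoff=0.5):
    required = cutoff / 2.0 if seqid in ("chrY", "Y") else cutoff
    return n_total > 0 and float(n_observed) / n_total >= required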
def scaffold(args): """ %prog scaffold ctgfasta agpfile Build scaffolds based on ordering in the AGP file. """ from jcvi.formats.agp import bed, order_to_agp, build from jcvi.formats.bed import Bed p = OptionParser(scaffold.__doc__) p.add_option("--prefix", default=False, action="store_true", help="Keep IDs with same prefix together [default: %default]") opts, args = p.parse_args(args) if len(args) != 2: sys.exit(not p.print_help()) ctgfasta, agpfile = args sizes = Sizes(ctgfasta).mapping pf = ctgfasta.rsplit(".", 1)[0] phasefile = pf + ".phases" fwphase = open(phasefile, "w") newagpfile = pf + ".new.agp" fwagp = open(newagpfile, "w") scaffoldbuckets = defaultdict(list) bedfile = bed([agpfile, "--nogaps", "--outfile=tmp"]) bb = Bed(bedfile) for s, partialorder in bb.sub_beds(): name = partialorder[0].accn bname = name.rsplit("_", 1)[0] if opts.prefix else s scaffoldbuckets[bname].append([(b.accn, b.strand) for b in partialorder]) # Now the buckets contain a mixture of singletons and partially resolved # scaffolds. Print the scaffolds first then remaining singletons. for bname, scaffolds in sorted(scaffoldbuckets.items()): ctgorder = [] singletons = set() for scaf in sorted(scaffolds): for node, orientation in scaf: ctgorder.append((node, orientation)) if len(scaf) == 1: singletons.add(node) nscaffolds = len(scaffolds) nsingletons = len(singletons) if nsingletons == 1 and nscaffolds == 0: phase = 3 elif nsingletons == 0 and nscaffolds == 1: phase = 2 else: phase = 1 msg = "{0}: Scaffolds={1} Singletons={2} Phase={3}".\ format(bname, nscaffolds, nsingletons, phase) print >> sys.stderr, msg print >> fwphase, "\t".join((bname, str(phase))) order_to_agp(bname, ctgorder, sizes, fwagp) fwagp.close() os.remove(bedfile) fastafile = "final.fasta" build([newagpfile, ctgfasta, fastafile]) tidy([fastafile])
def trf(args):
    """
    %prog trf outdir

    Run TRF on FASTA files.
    """
    from jcvi.apps.base import iglob

    cparams = "1 1 2 80 5 200 2000"

    p = OptionParser(trf.__doc__)
    p.add_option("--mismatch", default=31, type="int",
                 help="Mismatch and gap penalty")
    p.add_option("--minscore", default=MINSCORE, type="int",
                 help="Minimum score to report")
    p.add_option("--period", default=6, type="int",
                 help="Maximum period to report")
    p.add_option("--lobstr", default=False, action="store_true",
                 help="Generate output for lobSTR")
    p.add_option("--telomeres", default=False, action="store_true",
                 help="Run telomere search: minscore=140 period=7")
    p.add_option("--centromeres", default=False, action="store_true",
                 help="Run centromere search: {}".format(cparams))
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    outdir, = args
    minlength = opts.minscore / 2
    mm = MakeManager()
    if opts.telomeres:
        opts.minscore, opts.period = 140, 7

    params = "2 {0} {0} 80 10 {1} {2}".\
        format(opts.mismatch, opts.minscore, opts.period).split()
    if opts.centromeres:
        params = cparams.split()

    bedfiles = []
    for fastafile in natsorted(iglob(outdir, "*.fa,*.fasta")):
        pf = op.basename(fastafile).rsplit(".", 1)[0]
        # The leading "-" tells make to ignore the nonzero exit status that trf returns
        cmd1 = "-trf {0} {1} -d -h".format(fastafile, " ".join(params))
        datfile = op.basename(fastafile) + "." + ".".join(params) + ".dat"
        bedfile = "{0}.trf.bed".format(pf)
        cmd2 = "cat {} | grep -v ^Parameters".format(datfile)
        if opts.lobstr:
            cmd2 += " | awk '($8 >= {} && $8 <= {})'".\
                format(minlength, READLEN - minlength)
        else:
            cmd2 += " | awk '($8 >= 0)'"
        cmd2 += " | sed 's/ /\\t/g'"
        cmd2 += " | awk '{{print \"{0}\\t\" $0}}' > {1}".format(pf, bedfile)

        mm.add(fastafile, datfile, cmd1)
        mm.add(datfile, bedfile, cmd2)
        bedfiles.append(bedfile)

    bedfile = "trf.bed"
    cmd = "cat {0} > {1}".format(" ".join(natsorted(bedfiles)), bedfile)
    mm.add(bedfiles, bedfile, cmd)
    mm.write()
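# Hedged restatement of the awk filter built above for --lobstr output: column 8 of the
# TRF .dat line (taken here to be the repeat tract length) must be at least half the
# minimum score and short enough to be spanned by a read. READLEN and MINSCORE are the
# module-level constants referenced above; the helper itself is illustrative only.
def _keep_for_lobstr(tract_length, minscore=MINSCORE, readlen=READLEN):
    minlength = minscore / 2
    return minlength <= tract_length <= readlen - minlength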