def fromaligns(args):
    """
    %prog fromaligns out.aligns

    Convert aligns file (old MCscan output) to anchors file.
    """
    p = OptionParser(fromaligns.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    alignsfile, = args
    fp = must_open(alignsfile)
    fw = must_open(opts.outfile, "w")
    for row in fp:
        if row.startswith("## Alignment"):
            print >> fw, "###"
            continue
        if row[0] == '#' or not row.strip():
            continue
        atoms = row.split(':')[-1].split()
        print >> fw, "\t".join(atoms[:2])
    fw.close()
def silicosoma(args):
    """
    %prog silicosoma in.silico > out.soma

    Convert .silico to .soma file.

    Format of .silico
        A text file containing in-silico digested contigs. This file contains
        pairs of lines. The first line in each pair contains an identifier,
        the contig length in bp, and the number of restriction sites,
        separated by white space. The second line contains a white space
        delimited list of the restriction site positions.

    Format of .soma
        Each line of the text file contains two decimal numbers: The size of
        the fragment and the standard deviation (both in kb), separated by
        white space. The standard deviation is ignored.
    """
    p = OptionParser(silicosoma.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    silicofile, = args
    fp = must_open(silicofile)
    fw = must_open(opts.outfile, "w")
    fp.next()
    positions = [int(x) for x in fp.next().split()]
    for a, b in pairwise(positions):
        assert a <= b
        fragsize = int(round((b - a) / 1000.))  # kb
        if fragsize:
            print >> fw, fragsize, 0
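# The fragment-size arithmetic above is easy to sanity-check by hand. A
# minimal sketch with made-up restriction positions (not from any real
# .silico file):
positions = [0, 12000, 12400, 40000]        # hypothetical sites in bp
for a, b in zip(positions, positions[1:]):  # same pairing as pairwise()
    fragsize = int(round((b - a) / 1000.))  # bp span -> kb
    if fragsize:                            # sub-kb fragments are dropped
        print(fragsize, 0)                  # .soma line: size_kb stddev
# prints "12 0" and "28 0"; the 400 bp fragment rounds to 0 kb and is skipped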
def __init__(self, filename, source="JCVI"):
    super(MultiGenBank, self).__init__(filename)
    assert op.exists(filename)

    pf = filename.rsplit(".", 1)[0]
    fastafile, gfffile = pf + ".fasta", pf + ".gff"
    fasta_fw = must_open(fastafile, "w")
    gff_fw = must_open(gfffile, "w")

    self.source = source
    self.counter = defaultdict(list)

    nrecs, nfeats = 0, 0
    for rec in SeqIO.parse(filename, "gb"):
        seqid = rec.name
        rec.id = seqid
        SeqIO.write([rec], fasta_fw, "fasta")
        rf = rec.features
        for f in rf:
            self.print_gffline(gff_fw, f, seqid)
            nfeats += 1
            for sf in f.sub_features:
                self.print_gffline(gff_fw, sf, seqid, parent=f)
                nfeats += 1
        nrecs += 1

    logging.debug("A total of {0} records written to `{1}`.".\
                  format(nrecs, fastafile))
    fasta_fw.close()

    logging.debug("A total of {0} features written to `{1}`.".\
                  format(nfeats, gfffile))
    gff_fw.close()
def bed_to_bedpe(bedfile, bedpefile, pairsbedfile=None, matesfile=None, ca=False):
    """
    This converts the bedfile to bedpefile, assuming the reads are from CA.
    """
    fp = must_open(bedfile)
    fw = must_open(bedpefile, "w")
    if pairsbedfile:
        fwpairs = must_open(pairsbedfile, "w")

    clones = defaultdict(list)
    for row in fp:
        b = BedLine(row)
        name = b.accn
        clonename = clone_name(name, ca=ca)
        clones[clonename].append(b)

    if matesfile:
        fp = open(matesfile)
        libraryline = fp.next()  # 'library bes 37896 126916'
        lib, name, smin, smax = libraryline.split()
        assert lib == "library"
        smin, smax = int(smin), int(smax)
        logging.debug("Happy mates for lib {0} fall between {1} - {2}".\
                      format(name, smin, smax))

    nbedpe = 0
    nspan = 0
    for clonename, blines in clones.items():
        if len(blines) == 2:
            a, b = blines
            aseqid, astart, aend = a.seqid, a.start, a.end
            bseqid, bstart, bend = b.seqid, b.start, b.end
            print >> fw, "\t".join(str(x) for x in (aseqid, astart - 1, aend,
                                   bseqid, bstart - 1, bend, clonename))
            nbedpe += 1
        else:
            a, = blines
            aseqid, astart, aend = a.seqid, a.start, a.end
            bseqid, bstart, bend = 0, 0, 0

        if pairsbedfile:
            start = min(astart, bstart) if bstart > 0 else astart
            end = max(aend, bend) if bend > 0 else aend
            if aseqid != bseqid:
                continue

            span = end - start + 1
            if (not matesfile) or (smin <= span <= smax):
                print >> fwpairs, "\t".join(str(x) for x in \
                        (aseqid, start - 1, end, clonename))
                nspan += 1

    fw.close()
    logging.debug("A total of {0} bedpe written to `{1}`.".\
                  format(nbedpe, bedpefile))
    if pairsbedfile:
        fwpairs.close()
        logging.debug("A total of {0} spans written to `{1}`.".\
                      format(nspan, pairsbedfile))
def prepare(args):
    """
    %prog prepare pairsfile cdsfile [pepfile] -o paired.cds.fasta

    Pick sequences from cdsfile to form pairs, ready to be calculated. The
    pairsfile can be generated from formats.blast.cscore(). The first two
    columns contain the pair.
    """
    from jcvi.formats.fasta import Fasta

    p = OptionParser(prepare.__doc__)
    p.set_outfile()

    opts, args = p.parse_args(args)
    outfile = opts.outfile

    if len(args) == 2:
        pairsfile, cdsfile = args
        pepfile = None
    elif len(args) == 3:
        pairsfile, cdsfile, pepfile = args
    else:
        sys.exit(not p.print_help())

    f = Fasta(cdsfile)
    fp = open(pairsfile)
    fw = must_open(outfile, "w")
    if pepfile:
        assert outfile != "stdout", "Please specify outfile name."
        f2 = Fasta(pepfile)
        fw2 = must_open(outfile + ".pep", "w")

    for row in fp:
        if row[0] == '#':
            continue
        a, b = row.split()[:2]
        if a == b:
            logging.debug("Self pairs found: {0} - {1}. Ignored".format(a, b))
            continue

        if a not in f:
            a = find_first_isoform(a, f)
            assert a, a
        if b not in f:
            b = find_first_isoform(b, f)
            assert b, b

        acds = f[a]
        bcds = f[b]
        SeqIO.write((acds, bcds), fw, "fasta")
        if pepfile:
            apep = f2[a]
            bpep = f2[b]
            SeqIO.write((apep, bpep), fw2, "fasta")
    fw.close()
    if pepfile:
        fw2.close()
def write_fasta(self, output="gbfasta", individual=False):
    if not individual:
        fw = must_open(output + ".fasta", "w")

    for recid, rec in self.iteritems():
        if individual:
            mkdir(output)
            fw = must_open(op.join(output, recid + ".fasta"), "w")

        seqid = rec.id.split(".")[0]
        if not seqid:
            seqid = rec.name.split(".")[0]
        seq = rec.seq
        fw.write(">{0}\n{1}\n".format(seqid, seq))
def main(blastfile, p, opts):
    sqlite = opts.sqlite
    qbed, sbed, qorder, sorder, is_self = check_beds(blastfile, p, opts)
    filtered_blast = read_blast(blastfile, qorder, sorder,
                                is_self=is_self, ostrip=opts.strip_names)
    all_data = [(b.qi, b.si) for b in filtered_blast]

    c = None
    if sqlite:
        conn = sqlite3.connect(sqlite)
        c = conn.cursor()
        c.execute("drop table if exists synteny")
        c.execute("create table synteny (query text, anchor text, "
                  "gray varchar(1), score integer, dr integer, "
                  "orientation varchar(1), qnote text, snote text)")
        fw = None
    else:
        fw = must_open(opts.outfile, "w")

    batch_query(qbed, sbed, all_data, opts, fw=fw, c=c, transpose=False)
    if qbed.filename == sbed.filename:
        logging.debug("Self comparisons, mirror ignored")
    else:
        batch_query(qbed, sbed, all_data, opts, fw=fw, c=c, transpose=True)

    if sqlite:
        c.execute("create index q on synteny (query)")
        conn.commit()
        c.close()
    else:
        fw.close()
def annotate(args):
    """
    %prog annotate blastfile query.fasta subject.fasta

    Annotate overlap types (dovetail, contained, etc) in BLAST tabular file.
    """
    from jcvi.assembly.goldenpath import Cutoff, Overlap, Overlap_types

    p = OptionParser(annotate.__doc__)
    p.set_align(pctid=94, hitlen=500)
    p.add_option("--hang", default=500, type="int",
                 help="Maximum overhang length")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    blastfile, afasta, bfasta = args
    fp = must_open(blastfile)
    asizes = Sizes(afasta).mapping
    bsizes = Sizes(bfasta).mapping
    cutoff = Cutoff(opts.pctid, opts.hitlen, opts.hang)
    logging.debug(str(cutoff))
    for row in fp:
        b = BlastLine(row)
        asize = asizes[b.query]
        bsize = bsizes[b.subject]
        if b.query == b.subject:
            continue
        ov = Overlap(b, asize, bsize, cutoff)
        if ov.otype:
            ov.print_graphic()
            print("{0}\t{1}".format(b, Overlap_types[ov.otype]))
def group(args):
    """
    %prog group anchorfiles

    Group the anchors into ortho-groups. Can input multiple anchor files.
    """
    p = OptionParser(group.__doc__)
    p.set_outfile()

    opts, args = p.parse_args(args)
    if len(args) < 1:
        sys.exit(not p.print_help())

    anchorfiles = args
    groups = Grouper()
    for anchorfile in anchorfiles:
        ac = AnchorFile(anchorfile)
        for a, b, idx in ac.iter_pairs():
            groups.join(a, b)

    logging.debug("Created {0} groups with {1} members.".\
                  format(len(groups), groups.num_members))

    outfile = opts.outfile
    fw = must_open(outfile, "w")
    for g in groups:
        print >> fw, ",".join(sorted(g))
    fw.close()

    return outfile
def get_info():
    infofiles = glob("*.info")
    info = {}
    for row in must_open(infofiles):
        a = row.split()[0]
        info[a] = row.rstrip()
    return info
def uniq(args):
    """
    %prog uniq vcffile

    Retain only the first entry in vcf file.
    """
    from urlparse import parse_qs

    p = OptionParser(uniq.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    vcffile, = args
    fp = must_open(vcffile)
    data = []
    for row in fp:
        if row[0] == '#':
            print row.strip()
            continue
        v = VcfLine(row)
        data.append(v)

    # groupby only merges adjacent records, so this relies on the VCF
    # being position-sorted
    for pos, vv in groupby(data, lambda x: x.pos):
        vv = list(vv)
        if len(vv) == 1:
            print vv[0]
            continue
        bestv = max(vv, key=lambda x: float(parse_qs(x.info)["R2"][0]))
        print bestv
def filter(args):
    """
    %prog filter consensus.fasta

    Filter consensus sequence with min cluster size.
    """
    from jcvi.formats.fasta import Fasta, SeqIO

    p = OptionParser(filter.__doc__)
    p.add_option("--minsize", default=10, type="int",
                 help="Minimum cluster size")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    minsize = opts.minsize
    f = Fasta(fastafile, lazy=True)
    fw = must_open(opts.outfile, "w")
    for desc, rec in f.iterdescriptions_ordered():
        if desc.startswith("singleton"):
            continue
        # consensus_for_cluster_0 with 63 sequences
        name, w, size, seqs = desc.split()
        assert w == "with"
        size = int(size)
        if size < minsize:
            continue
        SeqIO.write(rec, fw, "fasta")
def filtervcf(args):
    """
    %prog filtervcf NA12878.hg38.vcf.gz

    Filter lobSTR VCF using script shipped in lobSTR. Input file can be a list
    of vcf files.
    """
    p = OptionParser(filtervcf.__doc__)
    p.set_home("lobstr", default="/mnt/software/lobSTR")
    p.set_aws_opts(store="hli-mv-data-science/htang/str")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    samples, = args
    lhome = opts.lobstr_home
    store = opts.output_path

    if samples.endswith((".vcf", ".vcf.gz")):
        vcffiles = [samples]
    else:
        vcffiles = [x.strip() for x in must_open(samples)]

    vcffiles = [x for x in vcffiles if ".filtered." not in x]

    run_args = [(x, lhome, x.startswith("s3://") and store) for x in vcffiles]
    cpus = min(opts.cpus, len(run_args))
    p = Pool(processes=cpus)
    for res in p.map_async(run_filter, run_args).get():
        continue
def __init__(self, filename):
    super(OVL, self).__init__(filename)
    fp = must_open(filename)
    contained = set()
    for row in fp:
        o = OVLLine(row)
        self.append(o)
        if o.tag == "a in b":
            contained.add(o.a)
        elif o.tag == "b in a":
            contained.add(o.b)
    logging.debug("Imported {} links. Contained tigs: {}".\
                  format(len(self), len(contained)))
    self.contained = contained

    self.graph = BiGraph()
    for o in self:
        if o.tag == "a->b":
            a, b = o.a, o.b
        elif o.tag == "b->a":
            a, b = o.b, o.a
        else:
            # Containment rows ("a in b", "b in a") carry no dovetail edge;
            # skip them so a and b never hold stale values from a prior row
            continue
        if a in contained or b in contained:
            continue
        bstrand = '<' if o.bstrand == '-' else '>'
        self.graph.add_edge(a, b, '>', bstrand, length=o.score)
def read_scores(scoresfile, opts=None, sort=False, trimsuffix=True):
    scores = {}
    _pid, _score, resolve = (0.0, 0.0, 'alignment') if opts is None \
            else (opts.pid, opts.score, opts.resolve)

    fp = must_open(scoresfile)
    logging.debug("Load scores file `{0}`".format(scoresfile))
    for row in fp:
        (new, old, identity, score) = row.strip().split("\t")
        if trimsuffix:
            old = re.sub(r'\.\d+$', '', old)
        if resolve == "alignment":
            match = re.search(r"\d+\/\d+\s+\(\s*(\d+\.\d+)%\)", identity)
            pid = match.group(1)
            if float(pid) < _pid or float(score) < _score:
                continue
        else:
            pid = identity

        if new not in scores:
            scores[new] = []

        scores[new].append((new, old, float(pid), float(score)))

    if sort:
        for new in scores:
            scores[new].sort(key=lambda k: (-k[2], -k[3]))

    return scores
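# For readers unfamiliar with the "alignment" resolve format, a small
# illustration of what the identity regex extracts (hypothetical field value,
# not taken from any real scores file):
import re

identity = "480/520 (92.31%)"
match = re.search(r"\d+\/\d+\s+\(\s*(\d+\.\d+)%\)", identity)
print(match.group(1))  # "92.31" -- the percent identity pulled out above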
def sizes(args):
    """
    %prog sizes bedfile

    Infer the sizes for each seqid. Useful before dot plots.
    """
    p = OptionParser(sizes.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bedfile, = args
    assert op.exists(bedfile)

    sizesfile = bedfile.rsplit(".", 1)[0] + ".sizes"

    fw = must_open(sizesfile, "w", checkexists=True, skipcheck=True)
    if fw:
        b = Bed(bedfile)
        for s, sbeds in b.sub_beds():
            print >> fw, "{0}\t{1}".format(\
                         s, max(x.end for x in sbeds))
        logging.debug("Sizes file written to `{0}`.".format(sizesfile))

    return sizesfile
def write_csv(header, contents, sep=",", filename="stdout", thousands=False,
              tee=False):
    """
    Write a csv that is aligned with the column headers.

    >>> header = ["x_value", "y_value"]
    >>> contents = [(1, 100), (2, 200)]
    >>> write_csv(header, contents)
    x_value, y_value
          1,     100
          2,     200
    """
    from jcvi.formats.base import must_open, is_number
    from jcvi.utils.cbook import thousands as th

    fw = must_open(filename, "w")
    allcontents = [header] + contents if header else contents
    cols = len(allcontents[0])
    for content in allcontents:
        assert len(content) == cols

    # Stringify the contents
    for i, content in enumerate(allcontents):
        if thousands:
            content = [int(x) if is_number(x, cast=int) else x
                       for x in content]
            content = [th(x) if (is_number(x, cast=int) and x >= 1000) else x
                       for x in content]
        allcontents[i] = [str(x) for x in content]

    colwidths = [max(len(x[i]) for x in allcontents) for i in xrange(cols)]
    sep += " "
    for content in allcontents:
        rjusted = [x.rjust(cw) for x, cw in zip(content, colwidths)]
        formatted = sep.join(rjusted)
        print >> fw, formatted
        if tee and filename != "stdout":
            print formatted
def get_mixture(data, components):
    """
    probs = [.476, .509]
    mus = [.69069, -.15038]
    variances = [.468982e-1, .959052e-1]
    """
    from jcvi.apps.base import popen

    probs, mus, sigmas = [], [], []
    fw = must_open("tmp", "w")
    log_data = [log(x) for x in data if x > .05]
    data = "\n".join(["%.4f" % x for x in log_data]).replace("inf\n", "")
    fw.write(data)
    fw.close()

    cmd = "gmm-bic {0} {1} {2}".format(components, len(log_data), fw.name)
    pipe = popen(cmd)

    for row in pipe:
        if row[0] != '#':
            continue

        atoms = row.split(",")
        a, b, c = atoms[1:4]
        a = float(a)
        b = float(b)
        c = float(c)

        mus.append(a)
        sigmas.append(b)
        probs.append(c)

    os.remove(fw.name)
    return probs, mus, sigmas
def bed(args):
    """
    %prog bed blastfile

    Print out bed file based on coordinates in BLAST report. By default,
    write out subject positions. Use --swap to write query positions.
    """
    p = OptionParser(bed.__doc__)
    p.add_option("--swap", default=False, action="store_true",
                 help="Write query positions [default: %default]")

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(p.print_help())

    blastfile, = args
    swap = opts.swap

    fp = must_open(blastfile)
    bedfile = blastfile.rsplit(".", 1)[0] + ".bed"
    fw = open(bedfile, "w")
    for row in fp:
        b = BlastLine(row)
        if swap:
            b = b.swapped
        print >> fw, b.bedline

    logging.debug("File written to `{0}`.".format(bedfile))

    return bedfile
def digest(args):
    """
    %prog digest fastafile NspI,BfuCI

    Digest fasta sequences to map restriction site positions.
    """
    p = OptionParser(digest.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, enzymes = args
    enzymes = enzymes.split(",")
    enzymes = [x for x in AllEnzymes if str(x) in enzymes]
    f = Fasta(fastafile, lazy=True)
    fw = must_open(opts.outfile, "w")

    header = ["Contig", "Length"] + [str(x) for x in enzymes]
    print("\t".join(header), file=fw)
    for name, rec in f.iteritems_ordered():
        row = [name, len(rec)]
        for e in enzymes:
            pos = e.search(rec.seq)
            pos = "na" if not pos else "|".join(str(x) for x in pos)
            row.append(pos)
        print("\t".join(str(x) for x in row), file=fw)
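# The per-enzyme positions come straight from Biopython's Bio.Restriction
# search. A minimal sketch with a short hypothetical sequence:
from Bio.Seq import Seq
from Bio.Restriction import NspI

pos = NspI.search(Seq("CCATGGGCATGCATG"))
print(pos)  # list of cut positions, or [] when no site; joined with "|" above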
def blast(self, blastfile=None, outfile=None):
    """
    convert anchor file to 12 col blast file
    """
    from jcvi.formats.blast import BlastSlow, BlastLineByConversion

    if not outfile:
        outfile = self.filename + ".blast"

    if blastfile is not None:
        blasts = BlastSlow(blastfile).to_dict()
    else:
        blasts = None

    fw = must_open(outfile, "w", checkexists=True)
    nlines = 0
    for a, b, id in self.iter_pairs():
        if blasts and (a, b) in blasts:
            bline = blasts[(a, b)]
        elif blasts and (b, a) in blasts:
            bline = blasts[(b, a)]
        else:
            line = "\t".join((a, b))
            bline = BlastLineByConversion(line, mode="110000000000")
        print >> fw, bline
        nlines += 1
    fw.close()

    logging.debug("A total of {0} BLAST lines written to `{1}`."\
                  .format(nlines, outfile))

    return outfile
def filterm4(args):
    """
    %prog filterm4 sample.m4 > filtered.m4

    Filter .m4 file after blasr is run. As blasr takes a long time to run,
    changing -bestn is undesirable. This screens the m4 file to retain top hits.
    """
    p = OptionParser(filterm4.__doc__)
    p.add_option("--best", default=1, type="int",
                 help="Only retain best N hits")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    m4file, = args
    best = opts.best
    fp = open(m4file)
    fw = must_open(opts.outfile, "w")
    seen = defaultdict(int)
    retained = total = 0
    for row in fp:
        r = M4Line(row)
        total += 1
        if total % 100000 == 0:
            logging.debug("Retained {0} lines".\
                          format(percentage(retained, total)))
        if seen.get(r.query, 0) < best:
            fw.write(row)
            seen[r.query] += 1
            retained += 1
    fw.close()
def __init__(self, filename, defaultcolor='#fb8072', header=False):
    super(BlockFile, self).__init__(filename)
    fp = must_open(filename)
    hd = fp.next().rstrip().split("\t")
    ncols = len(hd)
    if header:
        self.header = hd
    else:
        fp.seek(0)
        self.header = range(ncols)

    data = []
    highlight = []
    for row in fp:
        hl = ("*" in row)
        # r* highlights the block in red color
        if hl:
            hl, row = row.split("*", 1)
            hl = hl or defaultcolor
        atoms = row.rstrip().split("\t")
        atoms = [x.strip() for x in atoms]
        atoms = ["." if x == "" else x for x in atoms]
        if len(atoms) > ncols:
            atoms = atoms[:ncols]
        elif len(atoms) < ncols:
            atoms = atoms + ["."] * (ncols - len(atoms))
        data.append(atoms)
        highlight.append(hl)

    self.data = data
    self.highlight = highlight
    self.columns = zip(*data)
    self.ncols = ncols
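# A sketch of the highlight parsing above, on a hypothetical three-column
# block row (the "r*" prefix marks the block for highlighting):
row = "r*geneA\t\tgeneC\n"
hl, row = row.split("*", 1)   # hl == "r"; the rest is the data columns
hl = hl or '#fb8072'          # an empty prefix falls back to defaultcolor
atoms = [x.strip() for x in row.rstrip().split("\t")]
atoms = ["." if x == "" else x for x in atoms]
print(hl, atoms)              # r ['geneA', '.', 'geneC']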
def smart_reroot(treefile, outgroupfile, outfile, format=0):
    """
    simple function to reroot Newick format tree using ete2

    Tree reading format options see here:
    http://packages.python.org/ete2/tutorial/tutorial_trees.html#reading-newick-trees
    """
    tree = Tree(treefile, format=format)
    leaves = [t.name for t in tree.get_leaves()][::-1]
    outgroup = []
    for o in must_open(outgroupfile):
        o = o.strip()
        for leaf in leaves:
            if leaf[:len(o)] == o:
                outgroup.append(leaf)
        if outgroup:
            break

    if not outgroup:
        print("Outgroup not found. Tree {0} cannot be rerooted.".format(treefile),
              file=sys.stderr)
        return treefile

    try:
        tree.set_outgroup(tree.get_common_ancestor(*outgroup))
    except ValueError:
        assert type(outgroup) == list
        outgroup = outgroup[0]
        tree.set_outgroup(outgroup)
    tree.write(outfile=outfile, format=format)

    logging.debug("Rerooted tree printed to {0}".format(outfile))
    return outfile
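# A hypothetical invocation (filenames are made up): `gene.nwk` is a Newick
# tree and `outgroup.txt` lists one leaf name (or name prefix) per line, in
# order of preference.
rooted = smart_reroot("gene.nwk", "outgroup.txt", "gene.rooted.nwk")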
def SH_raxml(reftree, querytree, phy_file, shout="SH_out.txt"):
    """
    SH test using RAxML

    querytree can be a single tree or a bunch of trees (eg. from bootstrapping)
    """
    assert op.isfile(reftree)
    shout = must_open(shout, "a")

    raxml_work = op.abspath(op.join(op.dirname(phy_file), "raxml_work"))
    mkdir(raxml_work)
    raxml_cl = RaxmlCommandline(cmd=RAXML_BIN("raxmlHPC"), \
        sequences=phy_file, algorithm="h", model="GTRGAMMA", \
        name="SH", starting_tree=reftree, bipartition_filename=querytree, \
        working_dir=raxml_work)

    logging.debug("Running SH test in RAxML: %s" % raxml_cl)
    o, stderr = raxml_cl()
    # hard coded
    try:
        pval = re.search('(Significantly.*:.*)', o).group(0)
    except:
        print("SH test failed.", file=sys.stderr)
    else:
        pval = pval.strip().replace("\t", " ").replace("%", "\%")
        print("{0}\t{1}".format(op.basename(querytree), pval), file=shout)
        logging.debug("SH p-value appended to %s" % shout.name)

    shout.close()
    return shout.name
def bincount(args):
    """
    %prog bincount fastafile binfile

    Count K-mers in the bin.
    """
    from bitarray import bitarray
    from jcvi.formats.sizes import Sizes

    p = OptionParser(bincount.__doc__)
    p.add_option("-K", default=23, type="int",
                 help="K-mer size [default: %default]")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, binfile = args
    K = opts.K

    fp = open(binfile)
    a = bitarray()
    a.fromfile(fp)
    f = Sizes(fastafile)
    tsize = 0
    fw = must_open(opts.outfile, "w")
    for name, seqlen in f.iter_sizes():
        ksize = seqlen - K + 1
        b = a[tsize : tsize + ksize]
        bcount = b.count()
        print >> fw, "\t".join(str(x) for x in (name, bcount))
        tsize += ksize
def __init__(self, filename, aat_dialect=False):
    super(Btab, self).__init__(filename)

    for line in must_open(filename):
        if line[0] == "#":
            continue
        self.append(BtabLine(line, aat_dialect=aat_dialect))
def augustus(args):
    """
    %prog augustus augustus.gff3 > reformatted.gff3

    AUGUSTUS does generate a gff3 (--gff3=on) but it needs some refinement.
    """
    from jcvi.formats.gff import Gff

    p = OptionParser(augustus.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    ingff3, = args
    gff = Gff(ingff3)
    fw = must_open(opts.outfile, "w")
    seen = defaultdict(int)
    for g in gff:
        if g.type not in ("gene", "transcript", "CDS"):
            continue

        if g.type == "transcript":
            g.type = "mRNA"

        prefix = g.seqid + "_"
        pid = prefix + g.id
        newid = "{0}-{1}".format(pid, seen[pid]) if pid in seen else pid
        seen[pid] += 1
        g.attributes["ID"] = [newid]
        g.attributes["Parent"] = [(prefix + x) for x in g.attributes["Parent"]]
        g.update_attributes()
        print >> fw, g
    fw.close()
def print_to_bed(self, filename="stdout"):
    fw = must_open(filename, "w")
    for lg, markers in sorted(self.items()):
        for marker, pos in markers:
            print >> fw, "\t".join(str(x) for x in \
                    (lg, pos, pos + 1, marker))
    fw.close()
def __init__(self, filename, sorted=False):
    super(BlastSlow, self).__init__(filename)
    fp = must_open(filename)
    for row in fp:
        self.append(BlastLine(row))
    if not sorted:
        self.sort(key=lambda x: x.query)
def entrez(args):
    """
    %prog entrez <filename|term>

    `filename` contains a list of terms to search. Or just one term. If the
    results are small in size, e.g. "--format=acc", use "--batchsize=100" to
    speed the download.
    """
    p = OptionParser(entrez.__doc__)

    allowed_databases = {
        "fasta": ["genome", "nuccore", "nucgss", "protein", "nucest"],
        "asn.1": ["genome", "nuccore", "nucgss", "protein"],
        "gb": ["genome", "nuccore", "nucgss"],
        "est": ["nucest"],
        "gss": ["nucgss"],
        "acc": ["nuccore"],
    }

    valid_formats = tuple(allowed_databases.keys())
    valid_databases = ("genome", "nuccore", "nucest", "nucgss", "protein")

    p.add_option("--noversion", dest="noversion",
                 default=False, action="store_true",
                 help="Remove trailing accession versions")
    p.add_option("--format", default="fasta", choices=valid_formats,
                 help="download format [default: %default]")
    p.add_option("--database", default="nuccore", choices=valid_databases,
                 help="search database [default: %default]")
    p.add_option("--retmax", default=1000000, type="int",
                 help="how many results to return [default: %default]")
    p.add_option("--skipcheck", default=False, action="store_true",
                 help="turn off prompt to check file existence [default: %default]")
    p.add_option("--batchsize", default=500, type="int",
                 help="download the results in batch for speed-up [default: %default]")
    p.add_option("--outdir", default=None,
                 help="output directory, with accession number as filename")
    p.add_option("--outprefix", default="out",
                 help="output file name prefix [default: %default]")
    p.set_email()

    opts, args = p.parse_args(args)
    if len(args) != 1:
        sys.exit(p.print_help())

    filename, = args
    if op.exists(filename):
        pf = filename.rsplit(".", 1)[0]
        list_of_terms = [row.strip() for row in open(filename)]
        if opts.noversion:
            list_of_terms = [x.rsplit(".", 1)[0] for x in list_of_terms]
    else:
        pf = filename
        # the filename is the search term
        list_of_terms = [filename.strip()]

    fmt = opts.format
    database = opts.database
    batchsize = opts.batchsize

    assert database in allowed_databases[fmt], \
        "For output format '{0}', allowed databases are: {1}".\
        format(fmt, allowed_databases[fmt])
    assert batchsize >= 1, "batchsize must be >= 1"

    if " " in pf:
        pf = opts.outprefix

    outfile = "{0}.{1}".format(pf, fmt)

    outdir = opts.outdir
    if outdir:
        mkdir(outdir)

    # If noprompt, will not check file existence
    if not outdir:
        fw = must_open(outfile, "w", checkexists=True,
                       skipcheck=opts.skipcheck)
        if fw is None:
            return

    seen = set()
    totalsize = 0
    for id, size, term, handle in batch_entrez(list_of_terms,
                                               retmax=opts.retmax,
                                               rettype=fmt, db=database,
                                               batchsize=batchsize,
                                               email=opts.email):
        if outdir:
            outfile = urljoin(outdir, "{0}.{1}".format(term, fmt))
            fw = must_open(outfile, "w", checkexists=True,
                           skipcheck=opts.skipcheck)
            if fw is None:
                continue

        rec = handle.read()
        if id in seen:
            logging.error("Duplicate key ({0}) found".format(id))
            continue

        totalsize += size
        print >> fw, rec
        print >> fw

        seen.add(id)

    if seen:
        print >> sys.stderr, "A total of {0} {1} records downloaded.".\
            format(totalsize, fmt.upper())

    return outfile
def covfilter(args):
    """
    %prog covfilter blastfile fastafile

    Fastafile is used to get the sizes of the queries. Two filters can be
    applied, the id% and cov%.
    """
    from jcvi.algorithms.supermap import supermap
    from jcvi.utils.range import range_union

    allowed_iterby = ("query", "query_sbjct")

    p = OptionParser(covfilter.__doc__)
    p.set_align(pctid=95, pctcov=50)
    p.add_option("--scov", default=False, action="store_true",
                 help="Subject coverage instead of query [default: %default]")
    p.add_option("--supermap", action="store_true",
                 help="Use supermap instead of union")
    p.add_option("--ids", dest="ids", default=None,
                 help="Print out the ids that satisfy [default: %default]")
    p.add_option("--list", dest="list", default=False, action="store_true",
                 help="List the id% and cov% per gene [default: %default]")
    p.add_option("--iterby", dest="iterby", default="query",
                 choices=allowed_iterby,
                 help="Choose how to iterate through BLAST [default: %default]")
    p.set_outfile(outfile=None)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, fastafile = args
    pctid = opts.pctid
    pctcov = opts.pctcov
    union = not opts.supermap
    scov = opts.scov
    sz = Sizes(fastafile)
    sizes = sz.mapping
    iterby = opts.iterby
    qspair = iterby == "query_sbjct"

    if not union:
        querysupermap = blastfile + ".query.supermap"
        if not op.exists(querysupermap):
            supermap(blastfile, filter="query")
        blastfile = querysupermap

    assert op.exists(blastfile)

    covered = 0
    mismatches = 0
    gaps = 0
    alignlen = 0
    queries = set()
    valid = set()
    blast = BlastSlow(blastfile)
    iterator = blast.iter_hits_pair if qspair else blast.iter_hits

    covidstore = {}
    for query, blines in iterator():
        blines = list(blines)
        queries.add(query)

        # per gene report
        this_covered = 0
        this_alignlen = 0
        this_mismatches = 0
        this_gaps = 0
        this_identity = 0

        ranges = []
        for b in blines:
            if scov:
                s, start, stop = b.subject, b.sstart, b.sstop
            else:
                s, start, stop = b.query, b.qstart, b.qstop
            cov_id = s

            if b.pctid < pctid:
                continue

            if start > stop:
                start, stop = stop, start
            this_covered += stop - start + 1
            this_alignlen += b.hitlen
            this_mismatches += b.nmismatch
            this_gaps += b.ngaps
            ranges.append(("1", start, stop))

        if ranges:
            this_identity = 100. - (this_mismatches + this_gaps) * 100. / this_alignlen

        if union:
            this_covered = range_union(ranges)

        this_coverage = this_covered * 100. / sizes[cov_id]
        covidstore[query] = (this_identity, this_coverage)
        if this_identity >= pctid and this_coverage >= pctcov:
            valid.add(query)

        covered += this_covered
        mismatches += this_mismatches
        gaps += this_gaps
        alignlen += this_alignlen

    if opts.list:
        if qspair:
            allpairs = defaultdict(list)
            for (q, s) in covidstore:
                allpairs[q].append((q, s))
                allpairs[s].append((q, s))

            for id, size in sz.iter_sizes():
                if id not in allpairs:
                    print "\t".join((id, "na", "0", "0"))
                else:
                    for qs in allpairs[id]:
                        this_identity, this_coverage = covidstore[qs]
                        print "{0}\t{1:.1f}\t{2:.1f}".format(
                            "\t".join(qs), this_identity, this_coverage)
        else:
            for query, size in sz.iter_sizes():
                this_identity, this_coverage = covidstore.get(query, (0, 0))
                print "{0}\t{1:.1f}\t{2:.1f}".format(query, this_identity, this_coverage)

    mapped_count = len(queries)
    valid_count = len(valid)
    cutoff_message = "(id={0.pctid}% cov={0.pctcov}%)".format(opts)

    m = "Identity: {0} mismatches, {1} gaps, {2} alignlen\n".\
        format(mismatches, gaps, alignlen)
    total = len(sizes.keys())
    m += "Total mapped: {0} ({1:.1f}% of {2})\n".\
        format(mapped_count, mapped_count * 100. / total, total)
    m += "Total valid {0}: {1} ({2:.1f}% of {3})\n".\
        format(cutoff_message, valid_count, valid_count * 100. / total, total)
    m += "Average id = {0:.2f}%\n".\
        format(100 - (mismatches + gaps) * 100. / alignlen)

    queries_combined = sz.totalsize
    m += "Coverage: {0} covered, {1} total\n".\
        format(covered, queries_combined)
    m += "Average coverage = {0:.2f}%".\
        format(covered * 100. / queries_combined)

    logfile = blastfile + ".covfilter.log"
    fw = open(logfile, "w")
    for f in (sys.stderr, fw):
        print >> f, m
    fw.close()

    if opts.ids:
        filename = opts.ids
        fw = must_open(filename, "w")
        for id in valid:
            print >> fw, id
        logging.debug("Queries beyond cutoffs {0} written to `{1}`.".\
                      format(cutoff_message, filename))

    outfile = opts.outfile
    if not outfile:
        return

    fw = must_open(outfile, "w")
    blast = Blast(blastfile)
    for b in blast:
        query = (b.query, b.subject) if qspair else b.query
        if query in valid:
            print >> fw, b
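# When --supermap is not used, coverage is the union of HSP intervals, so
# overlapping hits are not double-counted. A toy illustration, assuming
# jcvi.utils.range.range_union behaves as it is used above:
from jcvi.utils.range import range_union

ranges = [("1", 1, 100), ("1", 51, 150)]  # two overlapping HSPs on one query
print(range_union(ranges))                # expected 150, not 200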
def calibrate(args):
    """
    %prog calibrate calibrate.JPG boxsize

    Calibrate pixel-inch ratio and color adjustment.
    - `calibrate.JPG` is the photo containing a colorchecker
    - `boxsize` is the measured size for the boxes on printed colorchecker, in
      squared centimeter (cm2) units
    """
    xargs = args[2:]
    p = OptionParser(calibrate.__doc__)
    opts, args, iopts = add_seeds_options(p, args)
    if len(args) != 2:
        sys.exit(not p.print_help())

    imagefile, boxsize = args
    boxsize = float(boxsize)

    # Read in color checker
    colorcheckerfile = op.join(datadir, "colorchecker.txt")
    colorchecker = []
    expected = 0
    for row in open(colorcheckerfile):
        boxes = row.split()
        colorchecker.append(boxes)
        expected += len(boxes)

    folder = op.split(imagefile)[0]
    objects = seeds([imagefile, "--outdir={0}".format(folder)] + xargs)
    nseeds = len(objects)
    logging.debug("Found {0} boxes (expected={1})".format(nseeds, expected))
    assert (
        expected - 4 <= nseeds <= expected + 4
    ), "Number of boxes drastically different from {0}".format(expected)

    # Calculate pixel-cm ratio
    boxes = [t.area for t in objects]
    reject = reject_outliers(boxes)
    retained_boxes = [b for r, b in zip(reject, boxes) if not r]
    mbox = np.median(retained_boxes)  # in pixels
    pixel_cm_ratio = (mbox / boxsize) ** 0.5
    logging.debug(
        "Median box size: {0} pixels. Measured box size: {1} cm2".format(mbox, boxsize)
    )
    logging.debug("Pixel-cm ratio: {0}".format(pixel_cm_ratio))

    xs = [t.x for t in objects]
    ys = [t.y for t in objects]
    idx_xs = get_kmeans(xs, 6)
    idx_ys = get_kmeans(ys, 4)
    for xi, yi, s in zip(idx_xs, idx_ys, objects):
        s.rank = (yi, xi)

    objects.sort(key=lambda x: x.rank)

    colormap = []
    for s in objects:
        x, y = s.rank
        observed, expected = s.rgb, rgb_to_triplet(colorchecker[x][y])
        colormap.append((np.array(observed), np.array(expected)))

    # Color transfer
    tr0 = np.eye(3).flatten()
    print("Initial distance:", total_error(tr0, colormap), file=sys.stderr)
    tr = fmin(total_error, tr0, args=(colormap,))
    tr.resize((3, 3))
    print("RGB linear transform:\n", tr, file=sys.stderr)
    calib = {"PixelCMratio": pixel_cm_ratio, "RGBtransform": tr.tolist()}

    jsonfile = op.join(folder, "calibrate.json")
    fw = must_open(jsonfile, "w")
    print(json.dumps(calib, indent=4), file=fw)
    fw.close()
    logging.debug("Calibration specs written to `{0}`.".format(jsonfile))

    return jsonfile
def ystr(args):
    """
    %prog ystr chrY.vcf

    Print out Y-STR info given VCF. Marker name extracted from tabfile.
    """
    from jcvi.utils.table import write_csv

    p = OptionParser(ystr.__doc__)
    p.set_home("lobstr")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    vcffile, = args
    si = STRFile(opts.lobstr_home, db="hg38-named")
    register = si.register

    header = "Marker|Reads|Ref|Genotype|Motif".split("|")
    contents = []

    fp = must_open(vcffile)
    reader = vcf.Reader(fp)
    simple_register = {}
    for record in reader:
        name = register[(record.CHROM, record.POS)]
        info = record.INFO
        ref = int(float(info["REF"]))
        rpa = info.get("RPA", ref)
        if isinstance(rpa, list):
            rpa = "|".join(str(int(float(x))) for x in rpa)
        ru = info["RU"]
        simple_register[name] = rpa
        for sample in record.samples:
            contents.append((name, sample["ALLREADS"], ref, rpa, ru))

    # Multi-part markers
    a, b, c = "DYS389I", "DYS389B.1", "DYS389B"
    if a in simple_register and b in simple_register:
        simple_register[c] = int(simple_register[a]) + int(simple_register[b])

    # Multi-copy markers
    mm = ["DYS385", "DYS413", "YCAII"]
    for m in mm:
        ma, mb = m + 'a', m + 'b'
        if ma not in simple_register or mb not in simple_register:
            simple_register[ma] = simple_register[mb] = None
            del simple_register[ma]
            del simple_register[mb]
            continue
        if simple_register[ma] > simple_register[mb]:
            simple_register[ma], simple_register[mb] = \
                simple_register[mb], simple_register[ma]

    write_csv(header, contents, sep=" ")

    print "[YSEARCH]"
    build_ysearch_link(simple_register)
    print "[YFILER]"
    build_yhrd_link(simple_register, panel=YHRD_YFILER)
    print "[YFILERPLUS]"
    build_yhrd_link(simple_register, panel=YHRD_YFILERPLUS)
    print "[YSTR-ALL]"
    build_yhrd_link(simple_register, panel=USYSTR_ALL)
def assemble(args):
    """
    %prog assemble pasa_db_name genome.fasta transcripts-dn.fasta [transcript-gg.fasta]

    Run the PASA alignment assembly pipeline

    If two transcript fasta files (Trinity denovo and genome guided) are provided
    and the `--compreh` param is enabled, the PASA Comprehensive Transcriptome DB
    protocol is followed <http://pasa.sourceforge.net/#A_ComprehensiveTranscriptome>

    Using the `--prepare` option creates a shell script with the run commands
    without executing the pipeline
    """
    p = OptionParser(assemble.__doc__)
    p.set_pasa_opts()
    p.add_option("--prepare", default=False, action="store_true",
                 help="Prepare PASA run script with commands [default: %default]")
    p.set_grid()
    p.set_grid_opts()
    opts, args = p.parse_args(args)

    if len(args) not in (3, 4):
        sys.exit(not p.print_help())

    pasa_db, genome, dnfasta, = args[:3]
    ggfasta = args[3] if len(args) == 4 else None

    PASA_HOME = opts.pasa_home
    if not op.isdir(PASA_HOME):
        logging.error("PASA_HOME={0} directory does not exist".format(PASA_HOME))
        sys.exit()

    aligners = opts.aligners.split(",")
    for aligner in aligners:
        if aligner not in ALLOWED_ALIGNERS:
            logging.error("Error: Unknown aligner `{0}`".format(aligner))
            logging.error("Can be any of {0}, ".format("|".join(ALLOWED_ALIGNERS)) + \
                          "combine multiple aligners in list separated by comma")
            sys.exit()

    clean = opts.clean
    seqclean = op.join(opts.tgi_home, "seqclean")

    accn_extract = which(op.join(PASA_HOME, "misc_utilities", \
                         "accession_extractor.pl"))
    launch_pasa = which(op.join(PASA_HOME, "scripts", \
                        "Launch_PASA_pipeline.pl"))
    build_compreh_trans = which(op.join(PASA_HOME, "scripts", \
                                "build_comprehensive_transcriptome.dbi"))

    fl_accs = opts.fl_accs
    cpus = opts.cpus
    grid = opts.grid
    prepare, runfile = opts.prepare, "run.sh"
    pctcov, pctid = opts.pctcov, opts.pctid
    compreh_pctid = opts.compreh_pctid
    compreh_pctcov, bpsplice = opts.compreh_pctcov, opts.bpsplice

    cmds = []
    if ggfasta:
        transcripts = FileMerger([dnfasta, ggfasta], tfasta).merge()
        accn_extract_cmd = "cat {0} | {1} > {2}".format(dnfasta, accn_extract, tdn)
        cmds.append(accn_extract_cmd)
        if not prepare:
            sh(accn_extract_cmd)
    else:
        symlink(dnfasta, tfasta)
        transcripts = tfasta

    if opts.grid and not opts.threaded:
        opts.threaded = opts.cpus

    prjobid = None
    if clean:
        cleancmd = "{0} {1} -c {2} -l 60".format(seqclean, transcripts, cpus)
        if prepare:
            cmds.append(cleancmd)
        else:
            prjobid = sh(cleancmd, grid=grid, grid_opts=opts)

    aafw = must_open(aaconf, "w")
    print >> aafw, alignAssembly_conf.format("{0}_pasa".format(pasa_db), \
            pctcov, pctid, bpsplice)
    aafw.close()

    symlink(genome, gfasta)

    aacmd = "{0} -c {1} -C -R -g {2}".format(launch_pasa, aaconf, gfasta)
    aacmd += " -t {0}.clean -T -u {0}".format(transcripts) if clean else \
             " -t {0}".format(transcripts)
    if fl_accs:
        symlink(fl_accs, flaccs)
        aacmd += " -f {0}".format(flaccs)
    if ggfasta:
        aacmd += " --TDN {0}".format(tdn)
    aacmd += " --ALIGNERS {0} -I {1} --CPU {2}".format(",".join(aligners), \
            opts.intron, cpus)

    if prepare:
        cmds.append(aacmd)
    else:
        opts.hold_jid = prjobid
        prjobid = sh(aacmd, grid=grid, grid_opts=opts)

    if opts.compreh and ggfasta:
        comprehcmd = "{0} -c {1} -t {2}".format(build_compreh_trans, aaconf, transcripts)
        comprehcmd += " --min_per_ID {0} --min_per_aligned {1}".format(compreh_pctid, compreh_pctcov)

        if prepare:
            cmds.append(comprehcmd)
        else:
            opts.hold_jid = prjobid
            prjobid = sh(comprehcmd, grid=grid, grid_opts=opts)

    if prepare:
        write_file(runfile, "\n".join(cmds))  # initialize run script
def consolidate(args):
    """
    %prog consolidate gffile1 gffile2 ... > consolidated.out

    Given 2 or more gff files generated by pasa annotation comparison,
    iterate through each locus (shared locus name or overlapping CDS)
    and identify same/different isoforms (shared splicing structure)
    across the input datasets.

    If `slop` is enabled, consolidation will collapse any variation
    in terminal UTR lengths, keeping the longest as representative.
    """
    from jcvi.formats.base import longest_unique_prefix
    from jcvi.formats.gff import make_index, match_subfeats
    from jcvi.utils.cbook import AutoVivification
    from jcvi.utils.grouper import Grouper
    from itertools import combinations, product

    supported_modes = ["name", "coords"]
    p = OptionParser(consolidate.__doc__)
    p.add_option("--slop", default=False, action="store_true",
                 help="allow minor variation in terminal 5'/3' UTR" + \
                      " start/stop position [default: %default]")
    p.add_option("--inferUTR", default=False, action="store_true",
                 help="infer presence of UTRs from exon coordinates")
    p.add_option("--mode", default="name", choices=supported_modes,
                 help="method used to determine overlapping loci")
    p.add_option("--summary", default=False, action="store_true",
                 help="Generate summary table of consolidation process")
    p.add_option("--clusters", default=False, action="store_true",
                 help="Generate table of cluster members after consolidation")
    p.set_outfile()

    opts, args = p.parse_args(args)
    slop = opts.slop
    inferUTR = opts.inferUTR
    mode = opts.mode

    if len(args) < 2:
        sys.exit(not p.print_help())

    gffdbx = {}
    for gffile in args:
        dbn = longest_unique_prefix(gffile, args)
        gffdbx[dbn] = make_index(gffile)

    loci = Grouper()
    for dbn in gffdbx:
        odbns = [odbn for odbn in gffdbx if dbn != odbn]
        for gene in gffdbx[dbn].features_of_type('gene', order_by=('seqid', 'start')):
            if mode == "name":
                loci.join(gene.id, (gene.id, dbn))
            else:
                if (gene.id, dbn) not in loci:
                    loci.join((gene.id, dbn))
                gene_cds = list(gffdbx[dbn].children(gene, \
                    featuretype='CDS', order_by=('start')))
                gene_cds_start, gene_cds_stop = gene_cds[0].start, \
                    gene_cds[-1].stop
                for odbn in odbns:
                    for ogene_cds in gffdbx[odbn].region(seqid=gene.seqid, \
                            start=gene_cds_start, end=gene_cds_stop, \
                            strand=gene.strand, featuretype='CDS'):
                        for ogene in gffdbx[odbn].parents(ogene_cds, featuretype='gene'):
                            loci.join((gene.id, dbn), (ogene.id, odbn))

    gfeats = {}
    mrna = AutoVivification()
    for i, locus in enumerate(loci):
        gene = "gene_{0:0{pad}}".format(i, pad=6) \
                if mode == "coords" else None

        for elem in locus:
            if type(elem) == tuple:
                _gene, dbn = elem
                if gene is None:
                    gene = _gene

                g = gffdbx[dbn][_gene]
                if gene not in gfeats:
                    gfeats[gene] = g
                    gfeats[gene].attributes['ID'] = [gene]
                else:
                    if g.start < gfeats[gene].start:
                        gfeats[gene].start = g.start
                    if g.stop > gfeats[gene].stop:
                        gfeats[gene].stop = g.stop

                c = list(gffdbx[dbn].children(_gene, featuretype='mRNA', order_by='start'))
                if len(c) > 0:
                    mrna[gene][dbn] = c

    fw = must_open(opts.outfile, "w")
    print >> fw, "##gff-version 3"
    seen = {}
    if opts.summary:
        summaryfile = "{0}.summary.txt".format(opts.outfile.rsplit(".")[0])
        sfw = must_open(summaryfile, "w")
        summary = ["id"]
        summary.extend(gffdbx.keys())
        print >> sfw, "\t".join(str(x) for x in summary)
    if opts.clusters:
        clustersfile = "{0}.clusters.txt".format(opts.outfile.rsplit(".")[0])
        cfw = must_open(clustersfile, "w")
        clusters = ["id", "dbns", "members", "trlens"]
        print >> cfw, "\t".join(str(x) for x in clusters)

    for gene in mrna:
        g = Grouper()
        dbns = list(combinations(mrna[gene], 2))
        if len(dbns) > 0:
            for dbn1, dbn2 in dbns:
                dbx1, dbx2 = gffdbx[dbn1], gffdbx[dbn2]
                for mrna1, mrna2 in product(mrna[gene][dbn1], mrna[gene][dbn2]):
                    mrna1s, mrna2s = mrna1.stop - mrna1.start + 1, \
                            mrna2.stop - mrna2.start + 1
                    g.join((dbn1, mrna1.id, mrna1s))
                    g.join((dbn2, mrna2.id, mrna2s))

                    if match_subfeats(mrna1, mrna2, dbx1, dbx2, featuretype='CDS'):
                        res = []
                        ftypes = ['exon'] if inferUTR else ['five_prime_UTR', 'three_prime_UTR']
                        for ftype in ftypes:
                            res.append(match_subfeats(mrna1, mrna2, dbx1, dbx2, featuretype=ftype, slop=slop))

                        if all(r == True for r in res):
                            g.join((dbn1, mrna1.id, mrna1s), (dbn2, mrna2.id, mrna2s))
        else:
            for dbn1 in mrna[gene]:
                for mrna1 in mrna[gene][dbn1]:
                    g.join((dbn1, mrna1.id, mrna1.stop - mrna1.start + 1))

        print >> fw, gfeats[gene]

        for group in g:
            group.sort(key=lambda x: x[2], reverse=True)
            dbs, mrnas = [el[0] for el in group], [el[1] for el in group]
            d, m = dbs[0], mrnas[0]

            dbid, _mrnaid = "|".join(str(x) for x in set(dbs)), []
            for x in mrnas:
                if x not in _mrnaid:
                    _mrnaid.append(x)
            mrnaid = "{0}|{1}".format(dbid, "-".join(_mrnaid))
            if mrnaid not in seen:
                seen[mrnaid] = 0
            else:
                seen[mrnaid] += 1
                mrnaid = "{0}-{1}".format(mrnaid, seen[mrnaid])

            _mrna = gffdbx[d][m]
            _mrna.attributes['ID'] = [mrnaid]
            _mrna.attributes['Parent'] = [gene]
            children = gffdbx[d].children(m, order_by='start')
            print >> fw, _mrna
            for child in children:
                child.attributes['ID'] = ["{0}|{1}".format(dbid, child.id)]
                child.attributes['Parent'] = [mrnaid]
                print >> fw, child

            if opts.summary:
                summary = [mrnaid]
                summary.extend(['Y' if db in set(dbs) else 'N' for db in gffdbx])
                print >> sfw, "\t".join(str(x) for x in summary)

            if opts.clusters:
                clusters = [mrnaid]
                clusters.append(",".join(str(el[0]) for el in group))
                clusters.append(",".join(str(el[1]) for el in group))
                clusters.append(",".join(str(el[2]) for el in group))
                print >> cfw, "\t".join(str(x) for x in clusters)

    fw.close()
    if opts.summary:
        sfw.close()
    if opts.clusters:
        cfw.close()
def compare(args):
    """
    %prog compare pasa_db_name [--annots_gff3=annotation.gff3]

    Run the PASA annotation comparison pipeline

    This assumes that the PASA alignment assembly has already been completed
    and that the run directory contains `genome.fasta` and `transcript.fasta`
    files.

    If `--annots_gff3` is specified, the PASA database is loaded with the
    annotations first before starting annotation comparison. Otherwise, it
    uses previously loaded annotation data.

    Using the `--prepare` option creates a shell script with the run commands
    without executing the pipeline
    """
    p = OptionParser(compare.__doc__)
    p.set_pasa_opts(action="compare")
    p.add_option("--prepare", default=False, action="store_true",
                 help="Prepare PASA run script with commands [default: %default]")
    p.set_grid()
    p.set_grid_opts()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    pasa_db, = args

    PASA_HOME = opts.pasa_home
    if not op.isdir(PASA_HOME):
        logging.error("PASA_HOME={0} directory does not exist".format(PASA_HOME))
        sys.exit()

    launch_pasa = which(op.join(PASA_HOME, "scripts", \
                        "Launch_PASA_pipeline.pl"))

    annots_gff3 = opts.annots_gff3
    grid = opts.grid
    prepare, runfile = opts.prepare, "run.sh"

    os.chdir(pasa_db)

    if prepare:
        write_file(runfile, "", append=True, skipcheck=True)  # initialize run script

    acfw = must_open(acconf, "w")
    print >> acfw, annotCompare_conf.format("{0}_pasa".format(pasa_db), \
            opts.pctovl, opts.pct_coding, opts.pctid_prot, opts.pctlen_FL, \
            opts.pctlen_nonFL, opts.orf_size, opts.pct_aln, opts.pctovl_gene, \
            opts.stompovl, opts.trust_FL, opts.utr_exons)
    acfw.close()

    if not op.exists(gfasta):
        sys.exit("Genome fasta file `{0}` does not exist".format(gfasta))

    transcripts = tfasta
    if not op.exists(transcripts):
        sys.exit("Transcript fasta file `{0}` does not exist".format(transcripts))

    if op.exists("{0}.clean".format(transcripts)):
        transcripts = "{0}.clean".format(transcripts)

    accmd = "{0} -c {1} -A -g {2} -t {3} --GENETIC_CODE {4}".format(launch_pasa, \
            acconf, gfasta, transcripts, opts.genetic_code)
    if annots_gff3:
        if not op.exists(annots_gff3):
            sys.exit("Annotation gff3 file `{0}` does not exist".format(annots_gff3))
        symlink(annots_gff3, annotation)
        accmd += " -L --annots_gff3 {0}".format(annotation)

    if prepare:
        write_file(runfile, accmd, append=True)
    else:
        sh(accmd, grid=grid, grid_opts=opts)
def mcscan(args):
    """
    %prog mcscan bedfile anchorfile [options]

    Stack synteny blocks on a reference bed, MCSCAN style. The first column in
    the output is the reference order, given in the bedfile. Then each column
    next to it are separate 'tracks'.

    If --mergetandem=tandem_file is specified, tandem_file should have each
    tandem cluster as one line, tab separated.
    """
    p = OptionParser(mcscan.__doc__)
    p.add_option("--iter", default=100, type="int",
                 help="Max number of chains to output [default: %default]")
    p.add_option("--ascii", default=False, action="store_true",
                 help="Output symbols rather than gene names [default: %default]")
    p.add_option("--Nm", default=10, type="int",
                 help="Clip block ends to allow slight overlaps [default: %default]")
    p.add_option("--trackids", action="store_true",
                 help="Track block IDs in separate file [default: %default]")
    p.add_option("--mergetandem", default=None,
                 help="merge tandem genes in output according to PATH-TO-TANDEM_FILE, "\
                      "cannot be used with --ascii")
    p.set_outfile()

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bedfile, anchorfile = args
    ascii = opts.ascii
    clip = opts.Nm
    trackids = opts.trackids
    ofile = opts.outfile
    mergetandem = opts.mergetandem
    bed = Bed(bedfile)
    order = bed.order

    if trackids:
        olog = ofile + ".tracks"
        fwlog = must_open(olog, "w")

    if mergetandem:
        assert not ascii
        tandems = {}
        for row in file(mergetandem):
            row = row.split()
            s = ";".join(row)
            for atom in row:
                tandems[atom] = s

    ac = AnchorFile(anchorfile)
    ranges = []
    block_pairs = defaultdict(dict)
    blocks = ac.blocks
    for i, ib in enumerate(blocks):
        q, s, t = zip(*ib)
        if q[0] not in order:
            q, s = s, q

        r = get_range(q, s, t, i, order, block_pairs, clip=clip)
        ranges.append(r)

        assert q[0] in order
        if s[0] not in order:
            continue

        # is_self comparison
        q, s = s, q
        r = get_range(q, s, t, i, order, block_pairs, clip=clip)
        ranges.append(r)

    fw = must_open(ofile, "w")

    tracks = []
    print >> sys.stderr, "Chain started: {0} blocks".format(len(ranges))
    iteration = 0
    while ranges:
        if iteration >= opts.iter:
            break

        selected, score = range_chain(ranges)
        tracks.append(selected)
        selected = set(x.id for x in selected)
        if trackids:
            print >> fwlog, ",".join(str(x) for x in sorted(selected))

        ranges = [x for x in ranges if x.id not in selected]
        msg = "Chain {0}: score={1}".format(iteration, score)
        if ranges:
            msg += " {0} blocks remained..".format(len(ranges))
        else:
            msg += " done!"

        print >> sys.stderr, msg
        iteration += 1

    mbed = []
    for b in bed:
        id = b.accn
        atoms = []
        for track in tracks:
            track_ids = [x.id for x in track]
            for tid in track_ids:
                pairs = block_pairs[tid]
                anchor = pairs.get(id, ".")
                if anchor != ".":
                    break
            if ascii and anchor != ".":
                anchor = "x"
            atoms.append(anchor)
        mbed.append((id, atoms))

    for id, atoms in mbed:
        sep = "" if ascii else "\t"
        if mergetandem:
            for i, atom in enumerate(atoms):
                atoms[i] = tandems.get(atom, atom)
        print >> fw, "\t".join((id, sep.join(atoms)))

    logging.debug("MCscan blocks written to `{0}`.".format(ofile))
    if trackids:
        logging.debug("Block IDs written to `{0}`.".format(olog))
def filter(args):
    """
    %prog filter test.blast

    Produce a new blast file and filter based on:
    - score: >= cutoff
    - pctid: >= cutoff
    - hitlen: >= cutoff
    - evalue: <= cutoff
    - ids: valid ids

    Use --inverse to obtain the complementary records for the criteria above.

    - noself: remove self-self hits
    """
    p = OptionParser(filter.__doc__)
    p.add_option("--score", dest="score", default=0, type="int",
                 help="Score cutoff")
    p.set_align(pctid=95, hitlen=100, evalue=.01)
    p.add_option("--noself", default=False, action="store_true",
                 help="Remove self-self hits")
    p.add_option("--ids", help="Path to file with ids to retain")
    p.add_option("--inverse", default=False, action="store_true",
                 help="Similar to grep -v, inverse")
    p.set_outfile(outfile=None)

    opts, args = p.parse_args(args)
    if len(args) != 1:
        sys.exit(not p.print_help())

    if opts.ids:
        ids = set()
        for row in must_open(opts.ids):
            if row[0] == "#":
                continue
            row = row.replace(",", "\t")
            ids.update(row.split())
    else:
        ids = None

    blastfile, = args
    inverse = opts.inverse
    outfile = opts.outfile
    fp = must_open(blastfile)

    score, pctid, hitlen, evalue, noself = \
        opts.score, opts.pctid, opts.hitlen, opts.evalue, opts.noself
    newblastfile = blastfile + ".P{0}L{1}".format(int(pctid), hitlen) if \
        outfile is None else outfile
    if inverse:
        newblastfile += ".inverse"
    fw = must_open(newblastfile, "w")
    for row in fp:
        if row[0] == '#':
            continue
        c = BlastLine(row)

        if ids:
            if c.query in ids and c.subject in ids:
                noids = False
            else:
                noids = True
        else:
            noids = None

        remove = c.score < score or \
            c.pctid < pctid or \
            c.hitlen < hitlen or \
            c.evalue > evalue or \
            noids

        if inverse:
            remove = not remove

        remove = remove or (noself and c.query == c.subject)

        if not remove:
            print >> fw, row.rstrip()

    fw.close()

    return newblastfile
def cscore(args):
    """
    %prog cscore blastfile > cscoreOut

    See supplementary info for sea anemone genome paper, C-score formula:

        cscore(A,B) = score(A,B) / max(best score for A, best score for B)

    A C-score of one is the same as reciprocal best hit (RBH).

    Output file will be 3-column (query, subject, cscore). Use --cutoff to
    select a different cutoff.
    """
    from jcvi.utils.cbook import gene_name

    p = OptionParser(cscore.__doc__)
    p.add_option("--cutoff", default=.9999, type="float",
                 help="Minimum C-score to report [default: %default]")
    p.add_option("--pct", default=False, action="store_true",
                 help="Also include pct as last column [default: %default]")
    p.add_option("--writeblast", default=False, action="store_true",
                 help="Also write filtered blast file [default: %default]")
    p.set_stripnames()
    p.set_outfile()

    opts, args = p.parse_args(args)
    ostrip = opts.strip_names
    writeblast = opts.writeblast
    outfile = opts.outfile

    if len(args) != 1:
        sys.exit(not p.print_help())

    blastfile, = args

    blast = Blast(blastfile)
    logging.debug("Register best scores ..")
    best_score = defaultdict(float)
    for b in blast:
        query, subject = b.query, b.subject
        if ostrip:
            query, subject = gene_name(query), gene_name(subject)

        score = b.score
        if score > best_score[query]:
            best_score[query] = score
        if score > best_score[subject]:
            best_score[subject] = score

    blast = Blast(blastfile)
    pairs = {}
    cutoff = opts.cutoff
    for b in blast:
        query, subject = b.query, b.subject
        if ostrip:
            query, subject = gene_name(query), gene_name(subject)

        score = b.score
        pctid = b.pctid
        s = score / max(best_score[query], best_score[subject])
        if s > cutoff:
            pair = (query, subject)
            if pair not in pairs or s > pairs[pair][0]:
                pairs[pair] = (s, pctid, b)

    fw = must_open(outfile, "w")
    if writeblast:
        fwb = must_open(outfile + ".filtered.blast", "w")
    pct = opts.pct
    for (query, subject), (s, pctid, b) in sorted(pairs.items()):
        args = [query, subject, "{0:.2f}".format(s)]
        if pct:
            args.append("{0:.1f}".format(pctid))
        print >> fw, "\t".join(args)
        if writeblast:
            print >> fwb, b
    fw.close()
    if writeblast:
        fwb.close()
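# A worked instance of the C-score formula in the docstring, with made-up
# scores: the A-B hit scores 180, A's best hit anywhere scores 200, and B's
# best hit is the A-B hit itself.
score_ab, best_a, best_b = 180.0, 200.0, 180.0
cscore_ab = score_ab / max(best_a, best_b)
print(cscore_ab)  # 0.9 -- below the default cutoff of .9999, so not reported
# cscore_ab == 1.0 would mean A and B are each other's best hit (RBH)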
def write(self, filename="stdout"):
    fw = must_open(filename, "w")
    for e in self.edges.values():
        print >> fw, e
    logging.debug("Graph written to `{0}`.".format(filename))
def __init__(self, filename):
    super(Blast, self).__init__(filename)
    self.fp = must_open(filename)
def write_AGP(self, filename, orientationguide={}):
    """
    For each component, we have two overlaps: North and South.

    =======
       ||||             South
       ====(=================)  Current BAC
       North             ||||
                         ===============

    For the case that says "Non-terminal", the overlap will not be
    considered. North-South would suggest a '+' orientation, South-North
    would suggest a '-' orientation. In most cases, unless the overlap
    involves phase1 BAC, the selected range will be shown as the brackets
    above - exclude North overlap, and include South overlap (aka the
    "left-greedy" rule).
    """
    fw = must_open(filename, "w")
    for aid, bb in groupby(self.lines, key=lambda x: x.aid):
        bb = list(bb)
        north, south = bb
        aid = north.aid
        assert aid == south.aid

        aphase = north.aphase
        chr = north.chr
        size = north.asize
        ar = [chr, 0, 0, 0]

        northline = southline = None
        northrange = southrange = None

        # Warn if adjacent components do not have valid overlaps
        if south.is_no_overlap:
            print >> sys.stderr, south

        # Most gaps, except telomeres occur twice, so only do the "North"
        if north.is_gap:
            bar = ar + self.get_agp_gap(north.bid)
            northline = "\t".join(str(x) for x in bar)
        else:
            if north.isTerminal:
                northrange = north.astart, north.astop

        if south.is_gap:
            if south.bid == "telomere":
                bar = ar + self.get_agp_gap(south.bid)
                southline = "\t".join(str(x) for x in bar)
        else:
            if south.isTerminal:
                southrange = south.astart, south.astop
            else:
                bar = ar + self.get_agp_gap("fragment")
                southline = "\t".join(str(x) for x in bar)

        # Determine the orientation and clear range for the current BAC
        clr = [1, size]
        orientation = sorientation = None
        if northrange:
            start, stop = northrange
            Lhang = start - 1
            Rhang = size - stop

            orientation = '+' if Lhang < Rhang else '-'
            if north.bphase == 1 and north.bphase < aphase:
                if Lhang < Rhang:  # North overlap at 5`
                    clr[0] = start
                else:
                    clr[1] = stop
            # Override left-greedy (also see below)
            else:
                if Lhang < Rhang:
                    clr[0] = stop + 1
                else:
                    clr[1] = start - 1

        if southrange:
            start, stop = southrange
            Lhang = start - 1
            Rhang = size - stop

            sorientation = '+' if Lhang > Rhang else '-'
            # Override left-greedy (also see above)
            if aphase == 1 and aphase < south.bphase:
                if Lhang < Rhang:  # South overlap at 5`
                    clr[0] = stop + 1
                else:
                    clr[1] = start - 1
            else:
                if Lhang < Rhang:
                    clr[0] = start
                else:
                    clr[1] = stop

        if orientation:
            if sorientation:
                try:
                    assert orientation == sorientation, \
                        "Orientation conflicts:\n{0}\n{1}".format(north, south)
                except AssertionError as e:
                    logging.debug(e)
        else:
            if sorientation:
                orientation = sorientation
            else:  # Both overlaps fail to define orientation
                orientation = orientationguide.get(aid, "+")

        component_type = "D" if aphase in (1, 2) else "F"
        bar = ar + [component_type, aid, clr[0], clr[1], orientation]
        cline = "\t".join(str(x) for x in bar)

        if northline:
            print >> fw, northline
        print >> fw, cline
        if southline:
            print >> fw, southline

    fw.close()

    reindex([filename, "--inplace"])
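# The hang arithmetic is easier to follow with numbers. A toy example of the
# orientation call (hypothetical sizes, not tied to any real certificate):
size = 100000                                 # component length in bp
start, stop = 500, 20000                      # a North overlap near the 5' end
Lhang, Rhang = start - 1, size - stop         # 499 vs 80000
orientation = '+' if Lhang < Rhang else '-'   # '+': overlap sits on the left
clr = [stop + 1, size]                        # left-greedy: exclude North overlap
print(orientation, clr)                       # + [20001, 100000]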
def dotplot(args):
    """
    %prog dotplot map.csv ref.fasta

    Make dotplot between chromosomes and linkage maps.
    The input map is csv formatted, for example:

    ScaffoldID,ScaffoldPosition,LinkageGroup,GeneticPosition
    scaffold_2707,11508,1,0
    scaffold_2707,11525,1,1.2
    """
    from jcvi.assembly.allmaps import CSVMapLine
    from jcvi.formats.sizes import Sizes
    from jcvi.utils.natsort import natsorted
    from jcvi.graphics.base import shorten
    from jcvi.graphics.dotplot import (
        plt,
        savefig,
        markup,
        normalize_axes,
        downsample,
        plot_breaks_and_labels,
        thousands,
    )

    p = OptionParser(dotplot.__doc__)
    p.set_outfile(outfile=None)
    opts, args, iopts = p.set_image_options(args, figsize="8x8",
                                            style="dark", dpi=90, cmap="copper")

    if len(args) != 2:
        sys.exit(not p.print_help())

    csvfile, fastafile = args
    sizes = natsorted(Sizes(fastafile).mapping.items())
    seen = set()
    raw_data = []

    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])  # the whole canvas
    ax = fig.add_axes([0.1, 0.1, 0.8, 0.8])  # the dot plot

    fp = must_open(csvfile)
    for row in fp:
        m = CSVMapLine(row)
        seen.add(m.seqid)
        raw_data.append(m)

    # X-axis is the genome assembly
    ctgs, ctg_sizes = zip(*sizes)
    xsize = sum(ctg_sizes)
    qb = list(np.cumsum(ctg_sizes))
    qbreaks = list(zip(ctgs, [0] + qb, qb))
    qstarts = dict(zip(ctgs, [0] + qb))

    # Y-axis is the map
    key = lambda x: x.lg
    raw_data.sort(key=key)
    ssizes = {}
    for lg, d in groupby(raw_data, key=key):
        ssizes[lg] = max([x.cm for x in d])
    ssizes = natsorted(ssizes.items())
    lgs, lg_sizes = zip(*ssizes)
    ysize = sum(lg_sizes)
    sb = list(np.cumsum(lg_sizes))
    sbreaks = list(zip([("LG" + x) for x in lgs], [0] + sb, sb))
    sstarts = dict(zip(lgs, [0] + sb))

    # Re-code all the scatter dots
    data = [(qstarts[x.seqid] + x.pos, sstarts[x.lg] + x.cm, "g")
            for x in raw_data if (x.seqid in qstarts)]
    npairs = downsample(data)

    x, y, c = zip(*data)
    ax.scatter(x, y, c=c, edgecolors="none", s=2, lw=0)

    # Flip X-Y label
    gy, gx = op.basename(csvfile).split(".")[:2]
    gx, gy = shorten(gx, maxchar=30), shorten(gy, maxchar=30)
    xlim, ylim = plot_breaks_and_labels(fig, root, ax, gx, gy, xsize, ysize,
                                        qbreaks, sbreaks)
    ax.set_xlim(xlim)
    ax.set_ylim(ylim)

    title = "Alignment: {} vs {}".format(gx, gy)
    title += " ({} markers)".format(thousands(npairs))
    root.set_title(markup(title), x=0.5, y=0.96, color="k")
    logging.debug(title)
    normalize_axes(root)

    image_name = opts.outfile or (csvfile.rsplit(".", 1)[0] + "." + iopts.format)
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
    fig.clear()
def print_to_anchors(self, outfile):
    fw = must_open(outfile, "w")
    for row in self:
        print(row.anchorline, file=fw)
    fw.close()
def fasta(args):
    """
    %prog fasta fastafile

    Convert reads in FASTA format to a CA frg file. If a .qual file is found,
    then use it, otherwise just make a fake qual file. Mates are assumed as
    adjacent sequence records (i.e. /1, /2, /1, /2 ...) unless a matefile is
    given.
    """
    from jcvi.formats.fasta import clean, make_qual

    p = OptionParser(fasta.__doc__)
    p.add_option(
        "--clean",
        default=False,
        action="store_true",
        help="Clean up irregular chars in seq",
    )
    p.add_option("--matefile", help="Matepairs file")
    p.add_option("--maxreadlen", default=262143, type="int",
                 help="Maximum read length allowed")
    p.add_option("--minreadlen", default=1000, type="int",
                 help="Minimum read length allowed")
    p.add_option(
        "--sequential",
        default=False,
        action="store_true",
        help="Overwrite read name (e.g. long Pacbio name)",
    )
    p.set_size()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (fastafile, ) = args
    maxreadlen = opts.maxreadlen
    minreadlen = opts.minreadlen
    if maxreadlen > 0:
        split = False
        f = Fasta(fastafile, lazy=True)
        for id, size in f.itersizes_ordered():
            if size > maxreadlen:
                logging.debug(
                    "Sequence {0} (size={1}) longer than max read len {2}".
                    format(id, size, maxreadlen))
                split = True
                break

        if split:
            for f in split_fastafile(fastafile, maxreadlen=maxreadlen):
                fasta([f, "--maxreadlen=0"])
            return

    plate = op.basename(fastafile).split(".")[0]

    mated = opts.size != 0
    mean, sv = get_mean_sv(opts.size)

    if mated:
        libname = "Sanger{0}Kb-".format(opts.size / 1000) + plate
    else:
        libname = plate

    frgfile = libname + ".frg"

    if opts.clean:
        cleanfasta = fastafile.rsplit(".", 1)[0] + ".clean.fasta"
        if need_update(fastafile, cleanfasta):
            clean([fastafile, "--canonical", "-o", cleanfasta])
        fastafile = cleanfasta

    if mated:
        qualfile = make_qual(fastafile, score=21)
        if opts.matefile:
            matefile = opts.matefile
            assert op.exists(matefile)
        else:
            matefile = make_matepairs(fastafile)

        cmd = "convert-fasta-to-v2.pl"
        cmd += " -l {0} -s {1} -q {2} ".format(libname, fastafile, qualfile)
        if mated:
            cmd += "-mean {0} -stddev {1} -m {2} ".format(mean, sv, matefile)

        sh(cmd, outfile=frgfile)
        return

    fw = must_open(frgfile, "w")
    print(headerTemplate.format(libID=libname), file=fw)

    sequential = opts.sequential
    i = j = 0
    for fragID, seq in parse_fasta(fastafile):
        if len(seq) < minreadlen:
            j += 1
            continue
        i += 1
        if sequential:
            fragID = libname + str(100000000 + i)
        emitFragment(fw, fragID, libname, seq)
    fw.close()

    logging.debug(
        "A total of {0} fragments written to `{1}` ({2} discarded).".format(
            i, frgfile, j))
def tandem_main(blast_file, cds_file, bed_file, N=3, P=50, is_self=True, \
        evalue=.01, strip_name=".", ofile=sys.stderr, genefam=False):

    if genefam:
        N = 1e5

    # get the sizes for the CDS first
    f = Fasta(cds_file)
    sizes = dict(f.itersizes())

    # retrieve the locations
    bed = Bed(bed_file)
    order = bed.order

    if is_self:
        # filter the blast file
        g = Grouper()
        fp = open(blast_file)
        for row in fp:
            b = BlastLine(row)
            query_len = sizes[b.query]
            subject_len = sizes[b.subject]
            if b.hitlen < min(query_len, subject_len) * P / 100.:
                continue

            query = gene_name(b.query, strip_name)
            subject = gene_name(b.subject, strip_name)
            qi, q = order[query]
            si, s = order[subject]

            if abs(qi - si) <= N and b.evalue <= evalue:
                if genefam:
                    g.join(query, subject)
                elif q.seqid == s.seqid:
                    g.join(query, subject)

    else:
        homologs = Grouper()
        fp = open(blast_file)
        for row in fp:
            b = BlastLine(row)
            query_len = sizes[b.query]
            subject_len = sizes[b.subject]
            if b.hitlen < min(query_len, subject_len) * P / 100.:
                continue
            if b.evalue > evalue:
                continue

            query = gene_name(b.query, strip_name)
            subject = gene_name(b.subject, strip_name)
            homologs.join(query, subject)

        if genefam:
            g = homologs
        else:
            g = Grouper()
            for i, atom in enumerate(bed):
                for x in range(1, N + 1):
                    if all([i - x >= 0, bed[i - x].seqid == atom.seqid, \
                            homologs.joined(bed[i - x].accn, atom.accn)]):
                        leni = sizes[bed[i].accn]
                        lenx = sizes[bed[i - x].accn]
                        if abs(leni - lenx) > max(leni, lenx) * (1 - P / 100.):
                            continue
                        g.join(bed[i - x].accn, atom.accn)

    # dump the grouper
    fw = must_open(ofile, "w")
    ngenes, nfamilies = 0, 0
    families = []
    for group in sorted(g):
        if len(group) >= 2:
            print >> fw, ",".join(sorted(group))
            ngenes += len(group)
            nfamilies += 1
            families.append(sorted(group))

    # Guard against the degenerate case of no multi-gene families
    longest_family = max(families, key=lambda x: len(x)) if families else []

    # generate reports
    print >> sys.stderr, "Proximal paralogues (dist=%d):" % N
    print >> sys.stderr, "Total %d genes in %d families" % (ngenes, nfamilies)
    print >> sys.stderr, "Longest family (%d): %s" % (len(longest_family),
            ",".join(longest_family))

    return families
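# tandem_main() leans on jcvi.utils.grouper.Grouper, a disjoint-set
# (union-find) container. Below is a minimal stand-in for illustration only;
# the real Grouper also supports iteration over the resulting groups, which
# the "dump the grouper" loop above relies on.
class MiniGrouper(object):
    def __init__(self):
        self.parent = {}

    def find(self, x):
        self.parent.setdefault(x, x)
        while self.parent[x] != x:
            self.parent[x] = self.parent[self.parent[x]]  # path halving
            x = self.parent[x]
        return x

    def join(self, a, b):
        # Merge the sets containing a and b
        self.parent[self.find(a)] = self.find(b)

    def joined(self, a, b):
        return self.find(a) == self.find(b)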
def trim(args):
    """
    %prog trim fastqfiles

    Trim reads using TRIMMOMATIC. If two fastqfiles are given, then it invokes
    the paired reads mode. See manual:

    <http://www.usadellab.org/cms/index.php?page=trimmomatic>
    """
    tv = "0.32"
    TrimJar = "trimmomatic-{0}.jar".format(tv)
    p = OptionParser(trim.__doc__)
    p.add_option("--path", default=op.join("~/bin", TrimJar),
                 help="Path to trimmomatic jar file [default: %default]")
    p.set_phred()
    p.add_option("--nofrags", default=False, action="store_true",
                 help="Discard frags file in PE mode [default: %default]")
    p.add_option("--minqv", default=15, type="int",
                 help="Average qv after trimming [default: %default]")
    p.add_option("--minlen", default=36, type="int",
                 help="Minimum length after trimming [default: %default]")
    p.add_option("--adapteronly", default=False, action="store_true",
                 help="Only trim adapters with no qv trimming [default: %default]")
    p.add_option("--nogz", default=False, action="store_true",
                 help="Do not write to gzipped files [default: %default]")
    p.add_option("--log", default=None, dest="trimlog",
                 help="Specify a `trimlog` file [default: %default]")
    p.set_cpus(cpus=4)
    opts, args = p.parse_args(args)

    if len(args) not in (1, 2):
        sys.exit(not p.print_help())

    path = op.expanduser(opts.path)
    url = \
        "http://www.usadellab.org/cms/uploads/supplementary/Trimmomatic/Trimmomatic-{0}.zip"\
        .format(tv)

    if not op.exists(path):
        path = download(url)
        TrimUnzipped = "Trimmomatic-" + tv
        if not op.exists(TrimUnzipped):
            sh("unzip " + path)
        os.remove(path)

        path = op.join(TrimUnzipped, TrimJar)

    assert op.exists(path), \
        "Couldn't find Trimmomatic jar file at `{0}`".\
        format(path)

    adaptersfile = "adapters.fasta"
    Adapters = must_open(op.join(datadir, adaptersfile)).read()
    write_file(adaptersfile, Adapters, skipcheck=True)

    assert op.exists(adaptersfile), \
        "Please place the illumina adapter sequence in `{0}`".\
        format(adaptersfile)

    if opts.phred is None:
        offset = guessoffset([args[0]])
    else:
        offset = int(opts.phred)

    phredflag = " -phred{0}".format(offset)
    threadsflag = " -threads {0}".format(opts.cpus)
    if opts.trimlog:
        trimlog = " -trimlog {0}".format(opts.trimlog)

    cmd = "java -Xmx4g -jar {0}".format(path)
    frags = ".frags.fastq"
    pairs = ".pairs.fastq"
    if not opts.nogz:
        frags += ".gz"
        pairs += ".gz"

    get_prefix = lambda x: op.basename(x).replace(".gz", "").rsplit(".", 1)[0]
    get_dirname = lambda x: "{0}/".format(op.dirname(x)) if op.dirname(x) else ''
    if len(args) == 1:
        cmd += " SE"
        cmd += phredflag
        cmd += threadsflag
        if opts.trimlog:
            cmd += trimlog
        fastqfile, = args
        prefix = get_prefix(fastqfile)
        dirname = get_dirname(fastqfile)
        frags1 = dirname + prefix + frags
        cmd += " {0}".format(" ".join((fastqfile, frags1)))
    else:
        cmd += " PE"
        cmd += phredflag
        cmd += threadsflag
        if opts.trimlog:
            cmd += trimlog
        fastqfile1, fastqfile2 = args
        prefix1 = get_prefix(fastqfile1)
        dirname1 = get_dirname(fastqfile1)
        prefix2 = get_prefix(fastqfile2)
        dirname2 = get_dirname(fastqfile2)
        pairs1 = dirname1 + prefix1 + pairs
        pairs2 = dirname2 + prefix2 + pairs
        frags1 = dirname1 + prefix1 + frags
        frags2 = dirname2 + prefix2 + frags
        if opts.nofrags:
            frags1 = "/dev/null"
            frags2 = "/dev/null"
        cmd += " {0}".format(" ".join((fastqfile1, fastqfile2, \
                pairs1, frags1, pairs2, frags2)))

    cmd += " ILLUMINACLIP:{0}:2:30:10".format(adaptersfile)

    if not opts.adapteronly:
        cmd += " LEADING:3 TRAILING:3"
        cmd += " SLIDINGWINDOW:4:{0}".format(opts.minqv)

    cmd += " MINLEN:{0}".format(opts.minlen)

    if offset != 33:
        cmd += " TOPHRED33"
    sh(cmd)
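# For reference, a paired-end invocation assembled by trim() with the default
# options ends up looking like the command below (file names illustrative,
# assuming phred33 input):
#
# java -Xmx4g -jar Trimmomatic-0.32/trimmomatic-0.32.jar PE -phred33 \
#     -threads 4 a_1.fastq a_2.fastq \
#     a_1.pairs.fastq.gz a_1.frags.fastq.gz \
#     a_2.pairs.fastq.gz a_2.frags.fastq.gz \
#     ILLUMINACLIP:adapters.fasta:2:30:10 LEADING:3 TRAILING:3 \
#     SLIDINGWINDOW:4:15 MINLEN:36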
def shred(args):
    """
    %prog shred fastafile

    Similar to the method of `shredContig` in runCA script. The contigs are
    shredded into pseudo-reads with certain length and depth.
    """
    p = OptionParser(shred.__doc__)
    p.set_depth(depth=2)
    p.add_option("--readlen", default=1000, type="int",
                 help="Desired length of the reads [default: %default]")
    p.add_option("--minctglen", default=0, type="int",
                 help="Ignore contig sequence less than [default: %default]")
    p.add_option(
        "--shift", default=50, type="int",
        help="Overlap between reads must be at least [default: %default]")
    p.add_option(
        "--fasta", default=False, action="store_true",
        help="Output shredded reads as FASTA sequences [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    libID = fastafile.split(".")[0]
    depth = opts.depth
    readlen = opts.readlen
    shift = opts.shift

    outfile = libID + ".depth{0}".format(depth)
    if opts.fasta:
        outfile += ".fasta"
    else:
        outfile += ".frg"
    f = Fasta(fastafile, lazy=True)

    fw = must_open(outfile, "w", checkexists=True)
    if not opts.fasta:
        print >> fw, headerTemplate.format(libID=libID)

    """
    Taken from runCA:

                    |*********|
                            |###################|
    |--------------------------------------------------|
     ---------------1---------------
               ---------------2---------------
                         ---------------3---------------
    *** - center_increments
    ### - center_range_width
    """
    for ctgID, (name, rec) in enumerate(f.iteritems_ordered()):
        seq = rec.seq
        seqlen = len(seq)
        if seqlen < opts.minctglen:
            continue

        shredlen = min(seqlen - shift, readlen)
        numreads = max(seqlen * depth / shredlen, 1)
        center_range_width = seqlen - shredlen

        ranges = []
        if depth == 1:
            if seqlen < readlen:
                ranges.append((0, seqlen))
            else:
                for begin in xrange(0, seqlen, readlen - shift):
                    end = min(seqlen, begin + readlen)
                    ranges.append((begin, end))
        else:
            if numreads == 1:
                ranges.append((0, shredlen))
            else:
                prev_begin = -1
                center_increments = center_range_width * 1. / (numreads - 1)
                for i in xrange(numreads):
                    begin = center_increments * i
                    end = begin + shredlen
                    begin, end = int(begin), int(end)

                    if begin == prev_begin:
                        continue

                    ranges.append((begin, end))
                    prev_begin = begin

        for shredID, (begin, end) in enumerate(ranges):
            shredded_seq = seq[begin:end]
            fragID = "{0}.{1}.frag{2}.{3}-{4}".format(libID, ctgID, shredID,
                                                      begin, end)
            emitFragment(fw, fragID, libID, shredded_seq, fasta=opts.fasta)

    fw.close()
    logging.debug("Shredded reads are written to `{0}`.".format(outfile))
    return outfile
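# The window math in shred() for depth > 1, extracted into a sketch:
# `numreads` evenly spaced windows of length `shredlen`, whose start
# positions tile [0, seqlen - shredlen]. Illustrative only; `shred_ranges`
# is not part of the module and skips the depth == 1 special case above.
def shred_ranges(seqlen, readlen=1000, depth=2, shift=50):
    shredlen = min(seqlen - shift, readlen)
    numreads = max(seqlen * depth // shredlen, 1)
    if numreads == 1:
        return [(0, shredlen)]
    step = (seqlen - shredlen) * 1. / (numreads - 1)
    ranges, prev_begin = [], -1
    for i in range(numreads):
        begin = int(step * i)
        if begin != prev_begin:  # drop duplicate windows
            ranges.append((begin, begin + shredlen))
            prev_begin = begin
    return ranges

# A 2 kb contig at depth 2 yields four 1 kb pseudo-reads, ~333 bp apart:
assert shred_ranges(2000) == [(0, 1000), (333, 1333), (666, 1666), (1000, 2000)]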
def certificate(args):
    """
    %prog certificate tpffile certificatefile

    Generate certificate file for all overlaps in tpffile. tpffile can be
    generated by jcvi.formats.agp.tpf().

    North  chr1  2  0  AC229737.8  telomere     58443
    South  chr1  2  1  AC229737.8  AC202463.29  58443  37835  58443  +  Non-terminal

    Each line describes a relationship between the current BAC and the
    north/south BAC. First, "North/South" tag, then the chromosome, phases of
    the two BACs, ids of the two BACs, the size and the overlap start-stop of
    the CURRENT BAC, and orientation. Each BAC will have two lines in the
    certificate file.
    """
    p = OptionParser(certificate.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    tpffile, certificatefile = args
    fastadir = "fasta"

    tpf = TPF(tpffile)

    data = check_certificate(certificatefile)
    fw = must_open(certificatefile, "w")
    for i, a in enumerate(tpf):
        if a.is_gap:
            continue

        aid = a.component_id

        af = op.join(fastadir, aid + ".fasta")
        if not op.exists(af):  # Check to avoid redownload
            entrez([aid, "--skipcheck", "--outdir=" + fastadir])

        north, south = tpf.getNorthSouthClone(i)
        aphase, asize = phase(aid)

        for tag, p in (("North", north), ("South", south)):
            if not p:  # end of the chromosome
                bphase = "0"
                ov = "telomere\t{0}".format(asize)
            elif p.isCloneGap:
                bphase = "0"
                ov = "{0}\t{1}".format(p.gap_type, asize)
            else:
                bid = p.component_id
                bphase, bsize = phase(bid)
                key = (tag, aid, bid)
                if key in data:
                    print >> fw, data[key]
                    continue

                ar = [aid, bid, "--dir=" + fastadir]
                o = overlap(ar)
                ov = o.certificateline if o \
                        else "{0}\t{1}\tNone".format(bid, asize)

            print >> fw, "\t".join(str(x) for x in \
                    (tag, a.object, aphase, bphase, aid, ov))
            fw.flush()
def subset(args):
    """
    %prog subset pairsfile ksfile1 ksfile2 ... -o pairs.ks

    Subset some pre-calculated ks ka values (in ksfile) according to pairs
    in tab delimited pairsfile/anchorfile.
    """
    p = OptionParser(subset.__doc__)
    p.add_option("--noheader", action="store_true",
                 help="don't write ksfile header line [default: %default]")
    p.add_option("--block", action="store_true",
                 help="preserve block structure in input [default: %default]")
    p.set_stripnames()
    p.set_outfile()

    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    pairsfile, ksfiles = args[0], args[1:]
    noheader = opts.noheader
    block = opts.block
    if block:
        noheader = True
    outfile = opts.outfile

    ksvals = {}
    for ksfile in ksfiles:
        ksvals.update(dict((line.name, line) for line in \
                KsFile(ksfile, strip_names=opts.strip_names)))

    fp = open(pairsfile)
    fw = must_open(outfile, "w")

    if not noheader:
        print >>fw, fields

    i = j = 0
    for row in fp:
        if row[0] == '#':
            if block:
                print >>fw, row.strip()
            continue
        a, b = row.split()[:2]
        name = ";".join((a, b))
        if name not in ksvals:
            name = ";".join((b, a))
            if name not in ksvals:
                j += 1
                print >>fw, "\t".join((a, b, ".", "."))
                continue
        ksline = ksvals[name]
        if block:
            print >>fw, "\t".join(str(x) for x in (a, b, ksline.ks))
        else:
            ksline.name = ";".join((a, b))
            print >>fw, ksline
        i += 1
    fw.close()

    logging.debug("{0} pairs not found in ksfiles".format(j))
    logging.debug("{0} ks records written to `{1}`".format(i, outfile))
    return outfile
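# Pair keys in the ks files are order-sensitive strings ("a;b"), so the
# lookup above tries both orders before giving up. An equivalent helper,
# for illustration only:
def lookup_ks_pair(ksvals, a, b):
    for name in (";".join((a, b)), ";".join((b, a))):
        if name in ksvals:
            return ksvals[name]
    return None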
def main():
    """
    %prog database.fa query.fa [options]

    Run LASTZ similar to the BLAST interface, and generates -m8 tabular format
    """
    p = OptionParser(main.__doc__)

    supported_formats = tuple(x.strip() for x in \
        "lav, lav+text, axt, axt+, maf, maf+, maf-, sam, softsam, "\
        "sam-, softsam-, cigar, BLASTN, BLASTN-, differences, rdotplot, text".split(','))

    p.add_option("-a", "-A", dest="cpus", default=1, type="int",
                 help="parallelize job to multiple cpus [default: %default]")
    p.add_option("--format", default="BLASTN-", choices=supported_formats,
                 help="output format, one of {0} [default: %default]".\
                      format("|".join(supported_formats)))
    p.add_option("--path", dest="lastz_path", default=None,
                 help="specify LASTZ path")
    p.add_option(
        "--mask", dest="mask", default=False, action="store_true",
        help="treat lower-case letters as mask info [default: %default]")
    p.add_option(
        "--similar", default=False, action="store_true",
        help="Use options tuned for close comparison [default: %default]")

    set_params(p)
    set_outfile(p)
    set_grid(p)

    opts, args = p.parse_args()

    if len(args) != 2:
        sys.exit(not p.print_help())

    bfasta_fn, afasta_fn = args
    for fn in (afasta_fn, bfasta_fn):
        assert op.exists(fn)

    afasta_fn = op.abspath(afasta_fn)
    bfasta_fn = op.abspath(bfasta_fn)
    out_fh = must_open(opts.outfile, "w")

    grid = opts.grid
    if grid:
        print >> sys.stderr, "Running jobs on JCVI grid"

    extra = opts.extra
    if opts.similar:
        extra += similarOptions

    lastz_bin = opts.lastz_path or "lastz"
    assert lastz_bin.endswith("lastz"), \
        "You need to include lastz in your path"

    mask = opts.mask
    cpus = opts.cpus
    logging.debug("Dispatch job to %d cpus" % cpus)
    format = opts.format
    blastline = (format == "BLASTN-")

    # The axt, maf, etc. format can only be run on a split database (i.e. one
    # FASTA record per file). The split files are then parallelized for the
    # computation, as opposed to splitting queries through "subsample".
    outdir = "outdir"
    if not blastline:
        from jcvi.formats.fasta import Fasta
        from jcvi.formats.chain import faToTwoBit

        mkdir(outdir)
        bfasta_2bit = faToTwoBit(bfasta_fn)
        bids = list(Fasta(bfasta_fn, lazy=True).iterkeys_ordered())

        apf = op.basename(afasta_fn).split(".")[0]
        args = []
        # bfasta_fn, afasta_fn, outfile, lastz_bin, extra, mask, format
        for id in bids:
            bfasta = "/".join((bfasta_2bit, id))
            outfile = op.join(outdir, "{0}.{1}.{2}".format(apf, id, format))
            args.append((bfasta, afasta_fn, outfile, \
                         lastz_bin, extra, mask, format, grid))

        if grid:
            cmds = [lastz_2bit(x) for x in args]
            g = Grid(cmds)
            g.run()
            g.writestatus()
        else:
            p = Pool(cpus)
            p.map(lastz_2bit, args)
        return

    lock = Lock()

    if grid:
        cmds = [lastz(k + 1, cpus, bfasta_fn, afasta_fn, out_fh, \
                lock, lastz_bin, extra, mask, grid) for k in xrange(cpus)]
        mkdir(outdir)
        g = Grid(cmds, outfiles=[op.join(outdir, "out.{0}.lastz").\
                format(i) for i in range(len(cmds))])
        g.run()
        g.writestatus()
    else:
        args = [(k + 1, cpus, bfasta_fn, afasta_fn, out_fh, lock,
                 lastz_bin, extra, mask) for k in xrange(cpus)]
        g = Jobs(target=lastz, args=args)
        g.run()
def path(args):
    """
    %prog path input.bed scaffolds.fasta

    Construct golden path given a set of genetic maps. The respective weight
    for each map is given in file `weights.txt`. The map with the highest
    weight is considered the pivot map. The final output is an AGP file that
    contains ordered scaffolds.
    """
    oargs = args
    p = OptionParser(path.__doc__)
    p.add_option("-b", "--bedfile", help=SUPPRESS_HELP)
    p.add_option("-s", "--fastafile", help=SUPPRESS_HELP)
    p.add_option("-w", "--weightsfile", default="weights.txt",
                 help="Use weights from file")
    p.add_option("--distance", default="rank", choices=distance_choices,
                 help="Distance function when building initial consensus")
    p.add_option("--linkage", default="double", choices=linkage_choices,
                 help="Linkage function when building initial consensus")
    p.add_option("--gapsize", default=100, type="int",
                 help="Insert gaps of size between scaffolds")
    p.add_option("--ngen", default=500, type="int",
                 help="Iterations in GA, more ~ slower")
    p.add_option("--npop", default=100, type="int",
                 help="Population size in GA, more ~ slower")
    p.add_option("--seqid", help="Only run partition with this seqid")
    p.add_option("--links", default=10, type="int",
                 help="Only plot matchings more than")
    p.add_option("--noplot", default=False, action="store_true",
                 help="Do not visualize the alignments")
    p.set_cpus(cpus=16)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    inputbed, fastafile = args
    inputbed = opts.bedfile or inputbed
    fastafile = opts.fastafile or fastafile
    pf = inputbed.rsplit(".", 1)[0]
    bedfile = pf + ".bed"
    weightsfile = opts.weightsfile
    gapsize = opts.gapsize
    ngen = opts.ngen
    npop = opts.npop
    cpus = opts.cpus
    if sys.version_info[:2] < (2, 7):
        logging.debug("Python version: {0}. CPUs set to 1.".\
                      format(sys.version.splitlines()[0].strip()))
        cpus = 1

    function = get_function(opts.distance)
    cc = Map(bedfile, function)
    mapnames = cc.mapnames
    allseqids = cc.seqids
    weights = Weights(weightsfile, mapnames)
    pivot = weights.pivot
    ref = weights.ref
    linkage = opts.linkage
    oseqid = opts.seqid
    logging.debug("Linkage function: {0}-linkage".format(linkage))
    linkage = {"single": min, "double": double_linkage, "complete": max,
               "average": np.mean, "median": np.median}[linkage]

    # Partition the linkage groups into consensus clusters
    C = Grouper()
    # Initialize the partitions
    for mlg in cc.mlgs:
        C.join(mlg)

    logging.debug("Partition LGs based on {0}".format(ref))
    for mapname in mapnames:
        if mapname == ref:
            continue
        # Compute co-occurrence between LG pairs
        G = defaultdict(int)
        for s in allseqids:
            s = Scaffold(s, cc)
            s.add_LG_pairs(G, (ref, mapname))
        # Convert edge list to adj list
        nodes = defaultdict(list)
        for (a, b), w in G.items():
            nodes[a].append((b, w))
        # Find the best ref LG every non-ref LG matches to
        for n, neighbors in nodes.items():
            if n.split("-")[0] == ref:
                continue
            neighbors = dict(neighbors)
            best_neighbor, best_value = best_no_ambiguous(neighbors, n)
            if best_neighbor is None:
                continue
            C.join(n, best_neighbor)

    partitions = defaultdict(list)
    # Partition the scaffolds and assign them to one consensus
    for s in allseqids:
        s = Scaffold(s, cc)
        seqid = s.seqid
        counts = {}
        for mlg, count in s.mlg_counts.items():
            consensus = C[mlg]
            mapname = mlg.split("-")[0]
            mw = weights[mapname]
            if consensus not in counts:
                counts[consensus] = 0
            counts[consensus] += count * mw
        best_consensus, best_value = best_no_ambiguous(counts, seqid)
        if best_consensus is None:
            continue
        partitions[best_consensus].append(seqid)

    # Perform OO within each partition
    agpfile = pf + ".chr.agp"
    tourfile = pf + ".tour"
    sizes = Sizes(fastafile).mapping
    fwagp = must_open(agpfile, "w")
    fwtour = must_open(tourfile, "w")
    solutions = []
    for lgs, scaffolds in sorted(partitions.items()):
        if oseqid and oseqid not in lgs:
            continue
        tag = "|".join(lgs)
        lgs_maps = set(x.split("-")[0] for x in lgs)
        if pivot not in lgs_maps:
            logging.debug("Skipping {0} ...".format(tag))
            continue
        logging.debug("Working on {0} ...".format(tag))
        s = ScaffoldOO(lgs, scaffolds, cc, pivot, weights, sizes,
                       function=function, linkage=linkage,
                       ngen=ngen, npop=npop, cpus=cpus)

        for fw in (sys.stderr, fwtour):
            print >> fw, ">{0} ({1})".format(s.object, tag)
            print >> fw, " ".join("".join(x) for x in s.tour)
        solutions.append(s)
    fwtour.close()

    # meta-data about the run parameters
    command = "# COMMAND: python -m jcvi.assembly.allmaps path {0}".\
              format(" ".join(oargs))
    comment = "Generated by ALLMAPS v{0} ({1})\n{2}".\
              format(version, get_today(), command)
    AGP.print_header(fwagp, comment=comment)

    for s in sorted(solutions, key=lambda x: x.object):
        order_to_agp(s.object, s.tour, sizes, fwagp, gapsize=gapsize,
                     gaptype="map")
    fwagp.close()

    logging.debug("AGP file written to `{0}`.".format(agpfile))
    logging.debug("Tour file written to `{0}`.".format(tourfile))

    build([inputbed, fastafile])

    summaryfile = pf + ".summary.txt"
    summary([inputbed, fastafile, "--outfile={0}".format(summaryfile)])

    if not opts.noplot:
        plotall([inputbed, "--links={0}".format(opts.links)])
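# best_no_ambiguous() is called twice in path() to pick a winner from a dict
# of weighted counts. Its presumed contract, sketched below for illustration
# (this is not the actual implementation): return (None, None) when the top
# score is tied, so ambiguous assignments are skipped rather than guessed.
def best_no_ambiguous_sketch(counts, seqid):
    if not counts:
        return None, None
    best, best_value = max(counts.items(), key=lambda x: x[1])
    if list(counts.values()).count(best_value) > 1:  # tie: ambiguous
        return None, None
    return best, best_value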
def calc(args):
    """
    %prog calc [prot.fasta] cds.fasta > out.ks

    Protein file is optional. If only one file is given, it is assumed to be
    CDS sequences with correct frame (frame 0). Results will be written to
    stdout. Both protein file and nucleotide file are assumed to be Fasta
    format, with adjacent records as the pairs to compare.

    Author: Haibao Tang <*****@*****.**>, Brad Chapman, Jingping Li
    Calculate synonymous mutation rates for gene pairs

    This does the following:
    1. Fetches a protein pair.
    2. Aligns the protein pair with clustalw (default) or muscle.
    3. Converts the output to Fasta format.
    4. Uses this alignment info to align gene sequences using PAL2NAL.
    5. Runs PAML yn00 to calculate synonymous mutation rates.
    """
    from jcvi.formats.fasta import translate

    p = OptionParser(calc.__doc__)
    p.add_option("--longest", action="store_true",
                 help="Get longest ORF, only works if no pep file, "\
                      "e.g. ESTs [default: %default]")
    p.add_option("--msa", default="clustalw", choices=("clustalw", "muscle"),
                 help="software used to align the proteins [default: %default]")
    p.add_option("--workdir", default=os.getcwd(), help="Work directory")
    p.set_outfile()

    opts, args = p.parse_args(args)

    if len(args) == 1:
        protein_file, dna_file = None, args[0]
    elif len(args) == 2:
        protein_file, dna_file = args
    else:
        print >>sys.stderr, "Incorrect arguments"
        sys.exit(not p.print_help())

    output_h = must_open(opts.outfile, "w")
    print >> output_h, fields
    work_dir = op.join(opts.workdir, "syn_analysis")
    mkdir(work_dir)

    if not protein_file:
        protein_file = dna_file + ".pep"
        translate_args = [dna_file, "--outfile=" + protein_file]
        if opts.longest:
            translate_args += ["--longest"]
        dna_file, protein_file = translate(translate_args)

    prot_iterator = SeqIO.parse(open(protein_file), "fasta")
    dna_iterator = SeqIO.parse(open(dna_file), "fasta")
    for p_rec_1, p_rec_2, n_rec_1, n_rec_2 in \
            zip(prot_iterator, prot_iterator, dna_iterator, dna_iterator):

        print >>sys.stderr, "--------", p_rec_1.name, p_rec_2.name
        if opts.msa == "clustalw":
            align_fasta = clustal_align_protein((p_rec_1, p_rec_2), work_dir)
        elif opts.msa == "muscle":
            align_fasta = muscle_align_protein((p_rec_1, p_rec_2), work_dir)
        mrtrans_fasta = run_mrtrans(align_fasta, (n_rec_1, n_rec_2), work_dir)
        if mrtrans_fasta:
            ds_subs_yn, dn_subs_yn, ds_subs_ng, dn_subs_ng = \
                find_synonymous(mrtrans_fasta, work_dir)
            if ds_subs_yn is not None:
                pair_name = "%s;%s" % (p_rec_1.name, p_rec_2.name)
                output_h.write("%s\n" % (",".join(str(x) for x in (
                    pair_name, ds_subs_yn, dn_subs_yn,
                    ds_subs_ng, dn_subs_ng))))
                output_h.flush()

    # Clean-up
    sh("rm -rf 2YN.t 2YN.dN 2YN.dS rst rub rst1 syn_analysis")
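# The pairing idiom in calc() deserves a note: zipping an iterator with
# itself consumes records two at a time, which is how adjacent FASTA records
# become the pairs to compare.
it = iter(["geneA_sp1", "geneA_sp2", "geneB_sp1", "geneB_sp2"])
assert list(zip(it, it)) == [("geneA_sp1", "geneA_sp2"),
                             ("geneB_sp1", "geneB_sp2")]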
def seeds(args):
    """
    %prog seeds [pngfile|jpgfile]

    Extract seed metrics from [pngfile|jpgfile]. Use --rows and --cols to
    crop image.
    """
    p = OptionParser(seeds.__doc__)
    p.set_outfile()
    opts, args, iopts = add_seeds_options(p, args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (pngfile,) = args
    pf = opts.prefix or op.basename(pngfile).rsplit(".", 1)[0]
    sigma, kernel = opts.sigma, opts.kernel
    rows, cols = opts.rows, opts.cols
    labelrows, labelcols = opts.labelrows, opts.labelcols
    ff = opts.filter
    calib = opts.calibrate
    outdir = opts.outdir
    if outdir != ".":
        mkdir(outdir)
    if calib:
        calib = json.load(must_open(calib))
        pixel_cm_ratio, tr = calib["PixelCMratio"], calib["RGBtransform"]
        tr = np.array(tr)
    nbcolor = opts.changeBackground
    pngfile = convert_background(pngfile, nbcolor)
    resizefile, mainfile, labelfile, exif = convert_image(
        pngfile,
        pf,
        outdir=outdir,
        rotate=opts.rotate,
        rows=rows,
        cols=cols,
        labelrows=labelrows,
        labelcols=labelcols,
    )

    oimg = load_image(resizefile)
    img = load_image(mainfile)

    fig, (ax1, ax2, ax3, ax4) = plt.subplots(
        ncols=4, nrows=1, figsize=(iopts.w, iopts.h)
    )
    # Edge detection
    img_gray = rgb2gray(img)
    logging.debug("Running {0} edge detection ...".format(ff))
    if ff == "canny":
        edges = canny(img_gray, sigma=opts.sigma)
    elif ff == "roberts":
        edges = roberts(img_gray)
    elif ff == "sobel":
        edges = sobel(img_gray)
    edges = clear_border(edges, buffer_size=opts.border)
    selem = disk(kernel)
    closed = closing(edges, selem) if kernel else edges
    filled = binary_fill_holes(closed)

    # Watershed algorithm
    if opts.watershed:
        distance = distance_transform_edt(filled)
        local_maxi = peak_local_max(distance, threshold_rel=0.05, indices=False)
        coordinates = peak_local_max(distance, threshold_rel=0.05)
        markers, nmarkers = label(local_maxi, return_num=True)
        logging.debug("Identified {0} watershed markers".format(nmarkers))
        labels = watershed(closed, markers, mask=filled)
    else:
        labels = label(filled)

    # Object size filtering
    w, h = img_gray.shape
    canvas_size = w * h
    min_size = int(round(canvas_size * opts.minsize / 100))
    max_size = int(round(canvas_size * opts.maxsize / 100))
    logging.debug(
        "Find objects with pixels between {0} ({1}%) and {2} ({3}%)".format(
            min_size, opts.minsize, max_size, opts.maxsize
        )
    )

    # Plotting
    ax1.set_title("Original picture")
    ax1.imshow(oimg)

    params = "{0}, $\sigma$={1}, $k$={2}".format(ff, sigma, kernel)
    if opts.watershed:
        params += ", watershed"
    ax2.set_title("Edge detection\n({0})".format(params))
    closed = gray2rgb(closed)
    ax2_img = labels
    if opts.edges:
        ax2_img = closed
    elif opts.watershed:
        ax2.plot(coordinates[:, 1], coordinates[:, 0], "g.")
    ax2.imshow(ax2_img, cmap=iopts.cmap)

    ax3.set_title("Object detection")
    ax3.imshow(img)

    filename = op.basename(pngfile)
    if labelfile:
        accession = extract_label(labelfile)
    else:
        accession = pf

    # Calculate region properties
    rp = regionprops(labels)
    rp = [x for x in rp if min_size <= x.area <= max_size]
    nb_labels = len(rp)
    logging.debug("A total of {0} objects identified.".format(nb_labels))

    objects = []
    for i, props in enumerate(rp):
        i += 1
        if i > opts.count:
            break

        y0, x0 = props.centroid
        orientation = props.orientation
        major, minor = props.major_axis_length, props.minor_axis_length
        major_dx = cos(orientation) * major / 2
        major_dy = sin(orientation) * major / 2
        minor_dx = sin(orientation) * minor / 2
        minor_dy = cos(orientation) * minor / 2
        ax2.plot((x0 - major_dx, x0 + major_dx),
                 (y0 + major_dy, y0 - major_dy), "r-")
        ax2.plot((x0 - minor_dx, x0 + minor_dx),
                 (y0 - minor_dy, y0 + minor_dy), "r-")

        npixels = int(props.area)
        # Sample the center of the blob for color
        d = min(int(round(minor / 2 * 0.35)) + 1, 50)
        x0d, y0d = int(round(x0)), int(round(y0))
        square = img[(y0d - d):(y0d + d), (x0d - d):(x0d + d)]
        pixels = []
        for row in square:
            pixels.extend(row)
        logging.debug(
            "Seed #{0}: {1} pixels ({2} sampled) - {3:.2f}%".format(
                i, npixels, len(pixels), 100.0 * npixels / canvas_size
            )
        )

        rgb = pixel_stats(pixels)
        objects.append(Seed(filename, accession, i, rgb, props, exif))
        minr, minc, maxr, maxc = props.bbox
        rect = Rectangle(
            (minc, minr), maxc - minc, maxr - minr, fill=False, ec="w", lw=1
        )
        ax3.add_patch(rect)
        mc, mr = (minc + maxc) / 2, (minr + maxr) / 2
        ax3.text(mc, mr, "{0}".format(i), color="w",
                 ha="center", va="center", size=6)

    for ax in (ax2, ax3):
        ax.set_xlim(0, h)
        ax.set_ylim(w, 0)

    # Output identified seed stats
    ax4.text(0.1, 0.92, "File: {0}".format(latex(filename)), color="g")
    ax4.text(0.1, 0.86, "Label: {0}".format(latex(accession)), color="m")
    yy = 0.8
    fw = must_open(opts.outfile, "w")
    if not opts.noheader:
        print(Seed.header(calibrate=calib), file=fw)

    for o in objects:
        if calib:
            o.calibrate(pixel_cm_ratio, tr)
        print(o, file=fw)
        i = o.seedno
        if i > 7:
            continue
        ax4.text(0.01, yy, str(i), va="center", bbox=dict(fc="none", ec="k"))
        ax4.text(0.1, yy, o.pixeltag, va="center")
        yy -= 0.04
        ax4.add_patch(
            Rectangle((0.1, yy - 0.025), 0.12, 0.05, lw=0, fc=rgb_to_hex(o.rgb))
        )
        ax4.text(0.27, yy, o.hashtag, va="center")
        yy -= 0.06
    ax4.text(
        0.1,
        yy,
        "(A total of {0} objects displayed)".format(nb_labels),
        color="darkslategray",
    )
    normalize_axes(ax4)

    for ax in (ax1, ax2, ax3):
        xticklabels = [int(x) for x in ax.get_xticks()]
        yticklabels = [int(x) for x in ax.get_yticks()]
        ax.set_xticklabels(xticklabels, family="Helvetica", size=8)
        ax.set_yticklabels(yticklabels, family="Helvetica", size=8)

    image_name = op.join(outdir, pf + "." + iopts.format)
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
    return objects
def query(args):
    """
    %prog query "SELECT feat_name FROM asm_feature WHERE feat_type = \\"{0}\\"
    AND end5 <= \\"{1}\\" AND end3 >= \\"{2}\\"" ::: datafile1 ....

    Script takes the data from tab-delimited datafile(s) and replaces the
    placeholders in the query which is then executed. Depending upon the type
    of query, results are either printed out (when running `select`) or not
    (when running `insert`, `update` or `delete`).

    If the query contains quotes around field values, then these need to be
    escaped with \\
    """
    p = OptionParser(query.__doc__)
    p.set_db_opts()
    p.add_option(
        "--dryrun", default=False, action="store_true",
        help="Don't commit to database. Just print queries [default: %default]"
    )
    p.set_sep(help="Specify output field separator")
    p.set_verbose(help="Print out all the queries")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) == 0:
        sys.exit(not p.print_help())

    fieldsep = opts.sep

    sep = ":::"
    files = None
    if sep in args:
        sepidx = args.index(sep)
        files = args[sepidx + 1:]
        args = args[:sepidx]
        if not files:
            files = [""]

    qrys = []
    qry = " ".join(args)
    if ";" in qry:
        for q in qry.split(";"):
            if len(q.strip()) > 0:
                qrys.append(q)
    else:
        qrys.append(qry)

    queries = set()
    if files:
        for datafile in files:
            datafile = datafile.strip()
            fp = must_open(datafile)
            for row in fp:
                for qry in qrys:
                    qry = qry.strip()
                    m = re.findall(r"\{\d+\}", qry)
                    if m:
                        mi = [int(x.strip("{}")) for x in m]
                        atoms = row.strip().split("\t")
                        assert max(mi) < len(atoms), \
                            "Number of columns in `datafile`({0})".format(len(atoms)) + \
                            " must exceed the highest `placeholder` index({0})".format(max(mi))
                        natoms = [atoms[x] for x in mi]
                        for idx, (match, atom) in enumerate(zip(m, natoms)):
                            qry = qry.replace(match, atom)
                    queries.add(qry)
    else:
        for qry in qrys:
            if re.search(r"\{\d+\}", qry):
                logging.error(
                    "Query `{0}` contains placeholders, no datafile(s) specified"
                    .format(qry))
                sys.exit()
            queries.add(qry)

    if not opts.dryrun:
        fw = must_open(opts.outfile, "w")
        dbh, cur = connect(opts.dbname, connector=opts.dbconn,
                           hostname=opts.hostname, username=opts.username,
                           password=opts.password, port=opts.port)
    cflag = None
    for qry in queries:
        if opts.dryrun or opts.verbose:
            print(qry)
        if not opts.dryrun:
            if to_commit(qry):
                execute(cur, qry)
                cflag = True
            else:
                results = fetchall(cur, qry, connector=opts.dbconn)
                for result in results:
                    print(fieldsep.join([str(x) for x in result]), file=fw)
    if not opts.dryrun and cflag:
        commit(dbh)
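# Placeholder substitution in query(), isolated: {N} is replaced by the N-th
# tab-separated column of the current datafile row. Runnable illustration:
import re

qry = 'SELECT * FROM t WHERE a = "{0}" AND b = "{1}"'
atoms = "foo\tbar\tbaz".strip().split("\t")
for match in re.findall(r"\{\d+\}", qry):
    qry = qry.replace(match, atoms[int(match.strip("{}"))])
assert qry == 'SELECT * FROM t WHERE a = "foo" AND b = "bar"'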
def meta(args):
    """
    %prog meta data.bin samples STR.ids STR-exons.wo.bed

    Compute allele frequencies and prune sites based on missingness.

    Filter subset of loci that satisfy:
    1. no redundancy (unique chr:pos)
    2. variable (n_alleles > 1)
    3. low level of missing data (>= 50% autosomal + X, > 25% for Y)

    Write meta file with the following info:
    1. id
    2. title
    3. gene_name
    4. variant_type
    5. motif
    6. allele_frequency

    `STR-exons.wo.bed` can be generated like this:
    $ tail -n 694105 /mnt/software/lobSTR/hg38/index.tab | cut -f1-3 > all-STR.bed
    $ intersectBed -a all-STR.bed -b all-exons.bed -wo > STR-exons.wo.bed
    """
    p = OptionParser(meta.__doc__)
    p.add_option("--cutoff", default=.5, type="float",
                 help="Percent observed required (chrY half cutoff)")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    binfile, sampleids, strids, wobed = args
    cutoff = opts.cutoff

    af_file = "allele_freq"
    if need_update(binfile, af_file):
        df, m, samples, loci = read_binfile(binfile, sampleids, strids)
        nalleles = len(samples)
        fw = must_open(af_file, "w")
        for i, locus in enumerate(loci):
            a = m[:, i]
            counts = alleles_to_counts(a)
            af = counts_to_af(counts)
            seqid = locus.split("_")[0]
            remove = counts_filter(counts, nalleles, seqid, cutoff=cutoff)
            print >> fw, "\t".join((locus, af, remove))
        fw.close()

    logging.debug("Load gene intersections from `{}`".format(wobed))
    fp = open(wobed)
    gene_map = defaultdict(set)
    for row in fp:
        chr1, start1, end1, chr2, start2, end2, name, ov = row.split()
        gene_map[(chr1, start1)] |= set(name.split(","))
    for k, v in gene_map.items():
        non_enst = sorted(x for x in v if not x.startswith("ENST"))
        #enst = sorted(x.rsplit(".", 1)[0] for x in v if x.startswith("ENST"))
        gene_map[k] = ",".join(non_enst)

    TREDS, df = read_treds()

    metafile = "STRs_{}_SEARCH.meta.tsv".format(timestamp())
    write_meta(af_file, gene_map, TREDS, filename=metafile)
    logging.debug("File `{}` written.".format(metafile))
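# alleles_to_counts() and counts_to_af() are defined elsewhere in the module;
# their presumed behavior is sketched below for illustration only, with
# placeholder encodings assumed here (missing calls as -1, frequencies
# rendered as "allele:fraction" pairs).
from collections import Counter

def counts_to_af_sketch(alleles):
    counts = Counter(a for a in alleles if a != -1)  # drop missing calls
    total = float(sum(counts.values())) or 1.
    return ";".join("{0}:{1:.3f}".format(k, v / total)
                    for k, v in sorted(counts.items()))

# e.g. counts_to_af_sketch([12, 12, 13, -1]) -> "12:0.667;13:0.333"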
def estimategaps(args):
    """
    %prog estimategaps input.bed

    Estimate sizes of inter-scaffold gaps. The AGP file generated by path()
    command has unknown gap sizes with a generic number of Ns (often 100 Ns).
    The AGP file `input.chr.agp` will be modified in-place.
    """
    p = OptionParser(estimategaps.__doc__)
    p.add_option("--minsize", default=100, type="int",
                 help="Minimum gap size")
    p.add_option("--maxsize", default=500000, type="int",
                 help="Maximum gap size")
    p.add_option("--links", default=10, type="int",
                 help="Only use linkage groups with matchings more than")
    p.set_verbose(help="Print details for each gap calculation")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    inputbed, = args
    pf = inputbed.rsplit(".", 1)[0]
    agpfile = pf + ".chr.agp"
    bedfile = pf + ".lifted.bed"

    cc = Map(bedfile, scaffold_info=True)
    agp = AGP(agpfile)
    minsize, maxsize = opts.minsize, opts.maxsize
    links = opts.links
    verbose = opts.verbose

    outagpfile = pf + ".estimategaps.agp"
    fw = must_open(outagpfile, "w")

    for ob, components in agp.iter_object():
        components = list(components)
        s = Scaffold(ob, cc)
        mlg_counts = s.mlg_counts
        gaps = [x for x in components if x.is_gap]
        gapsizes = [None] * len(gaps)  # master
        for mlg, count in mlg_counts.items():
            if count < links:
                continue
            g = GapEstimator(cc, agp, ob, mlg)
            g.compute_all_gaps(minsize=minsize, maxsize=maxsize, \
                               verbose=verbose)
            # Merge evidence from this mlg into master, keeping the
            # smallest non-zero estimate per gap
            assert len(g.gapsizes) == len(gaps)
            for i, gs in enumerate(gapsizes):
                gg = g.gapsizes[i]
                if gs is None:
                    gapsizes[i] = gg
                elif gg:
                    gapsizes[i] = min(gs, gg)

        if verbose:
            print gapsizes

        # Modify AGP
        i = 0
        for x in components:
            if x.is_gap:
                x.gap_length = gapsizes[i] or minsize
                x.component_type = 'U' if x.gap_length == 100 else 'N'
                i += 1
            print >> fw, x

    fw.close()
    reindex([outagpfile, "--inplace"])