def batchoverlap(args):
    """
    %prog batchoverlap pairs.txt outdir

    Check overlaps between pairs of sequences.
    """
    # The docstring above doubles as the usage text given to OptionParser.
    #
    # args: command-line tokens, expected to be [pairsfile, outdir].
    # Prints one `goldenpath overlap` command per pair to stdout; the commands
    # are only printed here, not executed. Creates the `overlaps` directory
    # that the printed commands write into.
    p = OptionParser(batchoverlap.__doc__)
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    pairsfile, outdir = args
    mkdir("overlaps")

    cmds = []
    # Context manager so the pairs file is closed even if a row is malformed.
    with open(pairsfile) as fp:
        for row in fp:
            if not row.strip():
                # Tolerate blank lines (e.g. a trailing newline) instead of
                # failing on the two-way unpack below.
                continue
            # Only the first two columns (the two sequence ids) are used.
            a, b = row.split()[:2]
            oa = op.join(outdir, a + ".fa")
            ob = op.join(outdir, b + ".fa")
            cmd = "python -m jcvi.assembly.goldenpath overlap {0} {1}".format(oa, ob)
            cmd += " -o overlaps/{0}_{1}.ov".format(a, b)
            cmds.append(cmd)

    print("\n".join(cmds))
def mconsensus(args):
    """
    %prog mconsensus *.consensus

    Call consensus along the stacks from cross-sample clustering.
    """
    # The docstring above doubles as the usage text given to OptionParser.
    #
    # Outputs:
    #   <prefix>.P<pctid>.allele_counts     all allele counts, with taxon
    #   <acdir>/<taxon>.allele_counts       one file per taxon, without taxon
    p = OptionParser(mconsensus.__doc__)
    p.add_option("--allele_counts", default="allele_counts",
                 help="Directory to generate allele counts")
    add_consensus_options(p)
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    consensusfiles = args
    prefix = opts.prefix
    acdir = opts.allele_counts
    store = ClustStores(consensusfiles)
    pctid = find_pctid(consensusfiles)
    pf = prefix + ".P{0}".format(pctid)

    clustSfile = pf + ".clustS"
    AC = makeloci(clustSfile, store, prefix, minsamp=opts.minsamp)

    mkdir(acdir)
    acfile = pf + ".allele_counts"
    seen = DefaultOrderedDict(list)  # chr, pos => taxa
    with open(acfile, "w") as fw:
        print("# " + "\t".join(ACHEADER), file=fw)
        # Group allele counts by site while writing the combined file
        for ac in AC:
            chrpos = ac.chr, ac.pos
            seen[chrpos].append(ac)
            print(ac.tostring(taxon=True), file=fw)

    logging.debug("Populate all taxa and instantiate empty vector if missing")
    # Taxon name = file basename up to the first dot.
    all_taxa = set(op.basename(x).split(".")[0] for x in consensusfiles)
    taxon_to_ac = defaultdict(list)
    for chrpos, aclist in seen.items():
        included_taxa = set(x.taxon for x in aclist)
        missing_taxa = all_taxa - included_taxa
        # A cleared copy of the first record stands in for every taxon that
        # has no record at this site (the same object is shared among them).
        template = deepcopy(aclist[0])
        template.clear()
        for ac in aclist:
            taxon_to_ac[ac.taxon].append(ac)
        for tx in missing_taxa:
            taxon_to_ac[tx].append(template)

    logging.debug("Write allele counts for all taxa")
    for tx, aclist in sorted(taxon_to_ac.items()):
        tx_acfile = op.join(acdir, tx + ".allele_counts")
        with open(tx_acfile, "w") as fw:
            print("# " + "\t".join(ACHEADER_NO_TAXON), file=fw)
            for ac in aclist:
                print(ac.tostring(), file=fw)
        logging.debug("Written {0} sites in `{1}`".format(len(aclist), tx_acfile))
def cib(args):
    """
    %prog cib bamfile samplekey

    Convert BAM to CIB (a binary storage of int8 per base).
    """
    # The docstring above doubles as the usage text given to OptionParser.
    #
    # One `bam_to_cib` task is dispatched per reference sequence listed in the
    # BAM header (optionally restricted by --prefix); the task return values
    # are ignored. Creates a directory named after `samplekey`.
    p = OptionParser(cib.__doc__)
    p.add_option("--prefix", help="Report seqids with this prefix only")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bamfile, samplekey = args
    mkdir(samplekey)
    bam = pysam.AlignmentFile(bamfile, "rb")
    refs = [x for x in bam.header["SQ"]]
    prefix = opts.prefix
    if prefix:
        refs = [x for x in refs if x["SN"].startswith(prefix)]

    task_args = [(bamfile, r, samplekey) for r in refs]
    if not task_args:
        # Pool(processes=0) raises ValueError, so bail out explicitly.
        logging.debug("No reference sequences selected, nothing to convert")
        return

    cpus = min(opts.cpus, len(task_args))
    logging.debug("Use {} cpus".format(cpus))

    # Separate name from the option parser `p`, and always release the workers.
    pool = Pool(processes=cpus)
    try:
        # Draining the iterator is what waits for every task to finish.
        for _ in pool.imap(bam_to_cib, task_args):
            pass
    finally:
        # On success all tasks have already completed; on error this stops the
        # remaining workers instead of leaving them running.
        pool.terminate()
        pool.join()
def batch(args):
    """
    %prog batch splits output

    The arguments are two folders.
    Input FASTA sequences are in splits/.
    Output csv files are in output/.

    Must have folders swissprot/, tair/, trembl/ that contains the respective
    BLAST output. Once finished, you can run, for example:

    $ parallel java -Xmx2g -jar ~/code/AHRD/dist/ahrd.jar {} ::: output/*.yml
    """
    # The docstring above doubles as the usage text given to OptionParser.
    #
    # Writes one `<name>.yml` into `output` for every `*.fasta` in `splits`,
    # rendered from the module-level `Template`.
    p = OptionParser(batch.__doc__)
    p.add_option("--path", default="~/code/AHRD/",
                 help="Path where AHRD is installed [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    splits, output = args
    mkdir(output)

    # Loop-invariant; also avoids shadowing the builtin `dir`.
    path = op.expanduser(opts.path)
    resources = op.join(path, "test/resources")

    for f in glob("{0}/*.fasta".format(splits)):
        # Name stem = basename up to the first dot.
        fb = op.basename(f).split(".")[0]
        outfile = op.join(output, fb + ".csv")
        # Close each config file as soon as it is written.
        with open(op.join(output, fb + ".yml"), "w") as fw:
            print(Template.format(resources, fb, f, outfile), file=fw)
def batchccn(args):
    """
    %prog batchccn test.csv

    Run CCN script in batch. Write makefile.
    """
    # The docstring above doubles as the usage text given to OptionParser.
    #
    # The CSV is expected to have two columns (sample key, BAM path). One
    # makefile rule is registered per row; nothing is executed here.
    p = OptionParser(batchccn.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (csvfile,) = args
    mm = MakeManager()
    pf = op.basename(csvfile).split(".")[0]
    mkdir(pf)

    # Peek at the first line only, and close the handle right away.
    with open(csvfile) as fp:
        first_line = next(fp, "")
    # A first line ending in ".bam" is already data, i.e. there is no header.
    header = None if first_line.strip().endswith(".bam") else "infer"
    logging.debug("Header={}".format(header))
    df = pd.read_csv(csvfile, header=header)

    cmd = "perl /mnt/software/ccn_gcn_hg38_script/ccn_gcn_hg38.pl"
    cmd += " -n {} -b {}"
    # Only the -o placeholder is filled here; -n/-b are filled per sample.
    cmd += " -o {} -r hg38".format(pf)
    for i, (sample_key, bam) in df.iterrows():
        cmdi = cmd.format(sample_key, bam)
        outfile = "{}/{}/{}.ccn".format(pf, sample_key, sample_key)
        mm.add(csvfile, outfile, cmdi)
    mm.write()
def pairs(args):
    """
    %prog pairs folder reference.fasta

    Estimate insert size distribution. Compatible with a variety of aligners,
    including BOWTIE and BWA.
    """
    # The docstring above doubles as the usage text given to OptionParser.
    #
    # For every (read files, prefix) yielded by iter_project(): subsample both
    # read files, align the subsamples inside the work directory, derive a
    # library name from the median reported by the pairs routine, and collect
    # "source<TAB>link name" lines that are finally written to `f.meta`.
    p = OptionParser(pairs.__doc__)
    p.set_firstN()
    p.set_mates()
    p.set_aligner()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    startdir = os.getcwd()
    aligner = opts.aligner
    workdir = "-".join(["pairs", aligner])
    mkdir(workdir)

    from jcvi.formats.sam import pairs as ps

    # `align` stays unbound for any other aligner value.
    if aligner == "bowtie":
        from jcvi.apps.bowtie import align
    elif aligner == "bwa":
        from jcvi.apps.bwa import align

    folder, ref = args
    ref = get_abs_path(ref)
    lines = []
    for reads, prefix in iter_project(folder):
        subsets = []
        for idx in (0, 1):
            subset = op.join(workdir, prefix + "_{0}.first.fastq".format(idx + 1))
            subsets.append(subset)
            first([str(opts.firstN), reads[idx], "-o", subset])

        # The aligner is run from inside the work directory on basenames.
        os.chdir(workdir)
        outfile, logfile = align([ref] + [op.basename(fq) for fq in subsets])
        bedfile, stats = ps([outfile, "--rclip={0}".format(opts.rclip)])
        os.chdir(startdir)

        median = stats.median
        tag = "MP" if median > 1000 else "PE"
        # Round up to the first two effective digits, e.g. 3456 -> 3500.
        # NOTE(review): assumes str(median) is plain digits -- confirm.
        digits = str(median)
        head, tail = digits[:2], digits[2:]
        if tail and int(tail) != 0:
            head = str(int(head) + 1)
        lib = "{0}-{1}".format(tag, head + "0" * len(tail))

        for rank, readfile in enumerate(reads, 1):
            if readfile.endswith(".gz"):
                suffix = "fastq.gz"
            else:
                suffix = "fastq"
            link = "{0}-{1}.{2}.{3}".format(
                lib, prefix.replace("-", ""), rank, suffix
            )
            lines.append("\t".join((str(readfile), link)))

    write_file("f.meta", "\n".join(lines), tee=True)
def batch(args):
    """
    %prog batch all.cds *.anchors

    Compute Ks values for a set of anchors file. This will generate a bunch of
    work directories for each comparisons. The anchorsfile should be in the form
    of specie1.species2.anchors.
    """
    # The docstring above doubles as the usage text given to OptionParser.
    #
    # Registers two makefile rules per anchors file (`ks prepare`, then
    # `ks calc`); nothing is executed here.
    from jcvi.apps.grid import MakeManager

    p = OptionParser(batch.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    cdsfile, anchors = args[0], args[1:]

    # Work directory = first two dot-separated fields of the anchors basename.
    workdirs = []
    for anchorsfile in anchors:
        fields = op.basename(anchorsfile).split(".")
        workdirs.append(".".join(fields[:2]))

    for workdir in workdirs:
        mkdir(workdir)

    mm = MakeManager()
    for workdir, anchorsfile in zip(workdirs, anchors):
        pairscdsfile = workdir + ".cds.fasta"
        prepare_cmd = "python -m jcvi.apps.ks prepare {} {} -o {}".format(
            anchorsfile, cdsfile, pairscdsfile
        )
        mm.add((anchorsfile, cdsfile), pairscdsfile, prepare_cmd)

        ksfile = workdir + ".ks"
        calc_cmd = "python -m jcvi.apps.ks calc {} -o {} --workdir {}".format(
            pairscdsfile, ksfile, workdir
        )
        mm.add(pairscdsfile, ksfile, calc_cmd)
    mm.write()
def prepare_synteny(tourfile, lastfile, odir, p, opts):
    """
    Prepare synteny plots for movie().

    Resolves the query/subject BED files for `lastfile`, recreates `odir`,
    changes the working directory into it (and does not change back), writes
    an anchors file derived from the BLAST records in `lastfile`, and symlinks
    the subject BED there.

    Args:
        tourfile: not used by this function.
        lastfile: BLAST-format file; also used to derive the anchors filename.
        odir: output directory, created with `overwrite=True`.
        p: option parser, forwarded to get_bed_filenames().
        opts: parsed options, forwarded to get_bed_filenames().

    Returns:
        (anchorsfile, qbedfile, contig_to_beds) where `anchorsfile` is relative
        to `odir`, `qbedfile` is absolute, and `contig_to_beds` is a dict built
        from `qbed.sub_beds()`.
    """
    qbedfile, sbedfile = get_bed_filenames(lastfile, p, opts)
    # Absolute paths, because the working directory changes below.
    qbedfile = op.abspath(qbedfile)
    sbedfile = op.abspath(sbedfile)

    qbed = Bed(qbedfile, sorted=False)
    contig_to_beds = dict(qbed.sub_beds())

    # Create a separate directory for the subplots and movie
    mkdir(odir, overwrite=True)
    os.chdir(odir)
    logging.debug("Change into subdir `{}`".format(odir))

    # Make anchorsfile, named after the first two dot-separated fields
    anchorsfile = ".".join(op.basename(lastfile).split(".", 2)[:2]) + ".anchors"
    # NOTE(review): `lastfile` is read after the chdir, so presumably callers
    # pass an absolute path -- confirm.
    with open(anchorsfile, "w") as fw:
        for b in Blast(lastfile):
            fields = (gene_name(b.query), gene_name(b.subject), str(int(b.score)))
            print("\t".join(fields), file=fw)

    # Symlink sbed
    symlink(sbedfile, op.basename(sbedfile))

    return anchorsfile, qbedfile, contig_to_beds
def calc(args):
    """
    %prog calc [prot.fasta] cds.fasta > out.ks

    Protein file is optional. If only one file is given, it is assumed to
    be CDS sequences with correct frame (frame 0). Results will be written to
    stdout. Both protein file and nucleotide file are assumed to be Fasta format,
    with adjacent records as the pairs to compare.

    Author: Haibao Tang <*****@*****.**>, Brad Chapman
    Calculate synonymous mutation rates for gene pairs

    This does the following:
        1. Fetches a protein pair.
        2. Aligns the protein pair with clustalw
        3. Convert the output to Fasta format.
        4. Use this alignment info to align gene sequences using PAL2NAL
        5. Run PAML yn00 to calculate synonymous mutation rates.
    """
    # The docstring above doubles as the usage text given to OptionParser.
    p = OptionParser(calc.__doc__)
    set_outfile(p)
    opts, args = p.parse_args(args)

    if len(args) == 1:
        protein_file, dna_file = None, args[0]
    elif len(args) == 2:
        protein_file, dna_file = args
    else:
        print("Incorrect arguments", file=sys.stderr)
        sys.exit(not p.print_help())

    # Left open on purpose: must_open() is given the --outfile value, which
    # may stand for stdout according to the usage text.
    output_h = must_open(opts.outfile, "w")
    output_h.write("name,dS-yn,dN-yn,dS-ng,dN-ng\n")
    work_dir = op.join(os.getcwd(), "syn_analysis")
    mkdir(work_dir)

    if not protein_file:
        protein_file = translate_dna(dna_file)

    # Close both FASTA inputs deterministically once all pairs are consumed.
    with open(protein_file) as prot_fh, open(dna_file) as dna_fh:
        prot_iterator = SeqIO.parse(prot_fh, "fasta")
        dna_iterator = SeqIO.parse(dna_fh, "fasta")
        # Passing the same iterator twice makes zip() hand out adjacent
        # records as a pair.
        for p_rec_1, p_rec_2, n_rec_1, n_rec_2 in zip(
            prot_iterator, prot_iterator, dna_iterator, dna_iterator
        ):
            print("--------", p_rec_1.name, p_rec_2.name, file=sys.stderr)
            align_fasta = clustal_align_protein(p_rec_1, p_rec_2, work_dir)
            mrtrans_fasta = run_mrtrans(align_fasta, n_rec_1, n_rec_2, work_dir)
            if mrtrans_fasta:
                ds_subs_yn, dn_subs_yn, ds_subs_ng, dn_subs_ng = find_synonymous(
                    mrtrans_fasta, work_dir
                )
                if ds_subs_yn is not None:
                    pair_name = "%s;%s" % (p_rec_1.name, p_rec_2.name)
                    fields = (pair_name, ds_subs_yn, dn_subs_yn,
                              ds_subs_ng, dn_subs_ng)
                    output_h.write("%s\n" % (",".join(str(x) for x in fields)))
                    output_h.flush()

    # Clean-up
    sh("rm -rf 2YN.t 2YN.dN 2YN.dS rst rub rst1 syn_analysis")
def merge(args):
    """
    %prog merge folder1 ...

    Consolidate split contents in the folders. The folders can be generated by
    the split() process and several samples may be in separate fastq files. This
    program merges them.
    """
    # The docstring above doubles as the usage text given to OptionParser.
    #
    # All `*.*.fastq` files in the given folders are grouped by the part of
    # their basename before the first dot; each group is merged into
    # `<outdir>/<group>.fastq`.
    p = OptionParser(merge.__doc__)
    p.add_option("--outdir", default="outdir",
                 help="Output final reads in [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    folders = args
    outdir = opts.outdir
    mkdir(outdir)

    def sample_of(path):
        # Grouping key: basename up to the first dot.
        return op.basename(path).split(".")[0]

    candidates = flatten(glob("{0}/*.*.fastq".format(x)) for x in folders)
    # groupby() only merges adjacent items, hence the sort on the same key.
    ordered = sorted(candidates, key=sample_of)
    for sample, members in groupby(ordered, key=sample_of):
        members = list(members)
        merged = op.join(outdir, "{0}.fastq".format(sample))
        FileMerger(members, outfile=merged).merge(checkexists=True)
def lobstrindex(args):
    """
    %prog lobstrindex hg38.trf.bed hg38.upper.fa

    Make lobSTR index. Make sure the FASTA contain only upper case (so use
    fasta.format --upper to convert from UCSC fasta). The bed file is generated
    by str().
    """
    # The docstring above doubles as the usage text given to OptionParser.
    #
    # Registers three makefile rules (index build, index.tab, index.info) in a
    # directory named after the FASTA prefix; nothing is executed here.
    p = OptionParser(lobstrindex.__doc__)
    p.add_option(
        "--notreds",
        default=False,
        action="store_true",
        help="Remove TREDs from the bed file",
    )
    p.set_home("lobstr")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    trfbed, fastafile = args
    pf = fastafile.split(".")[0]
    lhome = opts.lobstr_home
    mkdir(pf)

    if opts.notreds:
        newbedfile = trfbed + ".new"
        retained = total = 0
        seen = set()
        # Both handles are closed on exit, including on a parse error.
        with open(trfbed) as fp, open(newbedfile, "w") as newbed:
            for row in fp:
                r = STRLine(row)
                total += 1
                name = r.longname
                # Keep only the first record for each `longname`.
                if name in seen:
                    continue
                seen.add(name)
                print(r, file=newbed)
                retained += 1
        logging.debug("Retained: {0}".format(percentage(retained, total)))
    else:
        newbedfile = trfbed

    mm = MakeManager()
    cmd = "python {0}/scripts/lobstr_index.py".format(lhome)
    cmd += " --str {0} --ref {1} --out {2}".format(newbedfile, fastafile, pf)
    mm.add((newbedfile, fastafile), op.join(pf, "lobSTR_ref.fasta.rsa"), cmd)

    tabfile = "{0}/index.tab".format(pf)
    cmd = "python {0}/scripts/GetSTRInfo.py".format(lhome)
    cmd += " {0} {1} > {2}".format(newbedfile, fastafile, tabfile)
    mm.add((newbedfile, fastafile), tabfile, cmd)

    infofile = "{0}/index.info".format(pf)
    cmd = "cp {0} {1}".format(newbedfile, infofile)
    # NOTE(review): the rule depends on `trfbed` while the command copies
    # `newbedfile` -- kept as in the original, confirm this is intended.
    mm.add(trfbed, infofile, cmd)
    mm.write()
def sra(args):
    """
    %prog sra [term|term.ids]

    Given an SRA run ID, fetch the corresponding .sra file from the sra-instant FTP.
    The term can also be a file containing list of SRR ids, one per line.
    """
    # The docstring above doubles as the usage text given to OptionParser.
    #
    # Each term is downloaded via download_srr_term() and dumped with
    # `fastq-dump --split-files` into a directory named after the part of the
    # downloaded filename before the first dot.
    p = OptionParser(sra.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (term,) = args
    if op.isfile(term):
        # Close the id list promptly and ignore blank lines, which would
        # otherwise be passed on as empty terms.
        with open(term) as fp:
            terms = [x.strip() for x in fp if x.strip()]
    else:
        terms = [term]

    for term in terms:
        srafile = download_srr_term(term)
        pf = srafile.split(".")[0]
        mkdir(pf)
        cmd = "fastq-dump --outdir {} --split-files {}".format(pf, srafile)
        sh(cmd)
def augustus(args):
    """
    %prog augustus species gffile fastafile

    Train AUGUSTUS model given gffile and fastafile. Whole procedure taken from:
    <http://www.molecularevolution.org/molevolfiles/exercises/augustus/training.html>
    """
    # The docstring above doubles as the usage text given to OptionParser.
    #
    # All training commands run inside a freshly created `augustus` directory;
    # `gffile` and `fastafile` are referenced as `../<name>`, so they are
    # expected to be relative to the starting directory.
    #
    # Use this function's own usage text (previously `snap.__doc__`).
    p = OptionParser(augustus.__doc__)
    p.set_home("augustus")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    species, gffile, fastafile = args
    mhome = opts.augustus_home
    augdir = "augustus"

    cwd = os.getcwd()
    mkdir(augdir)
    os.chdir(augdir)
    try:
        sh("{0}/scripts/new_species.pl --species={1}".format(mhome, species))
        sh("{0}/scripts/gff2gbSmallDNA.pl ../{1} ../{2} 1000 raw.gb".format(
            mhome, gffile, fastafile))
        sh("{0}/bin/etraining --species={1} raw.gb 2> train.err".format(
            mhome, species))
        # Raw string: `\S` belongs to the perl regex, not to Python.
        sh(r"cat train.err | perl -pe 's/.*in sequence (\S+): .*/$1/' > badgenes.lst")
        sh("{0}/scripts/filterGenes.pl badgenes.lst raw.gb > training.gb".format(
            mhome))
        sh("grep -c LOCUS raw.gb training.gb")
        sh("{0}/scripts/autoAugTrain.pl --trainingset=training.gb --species={1}".format(
            mhome, species))
    finally:
        # Return to the starting directory even if a step raises.
        os.chdir(cwd)

    sh("cp -r {0}/species/{1} augustus/".format(mhome, species))
def gcn(args):
    """
    %prog gcn gencode.v26.exonunion.bed data/*.vcf.gz

    Compile gene copy njumber based on CANVAS results.
    """
    # The docstring above doubles as the usage text given to OptionParser.
    #
    # Builds one data frame from all VCFs via vcf_to_df() and hands it to
    # df_to_tsv() twice, once per suffix.
    p = OptionParser(gcn.__doc__)
    p.set_cpus()
    p.set_tmpdir(tmpdir="tmp")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    exonbed, canvasvcfs = args[0], args[1:]
    tsvfile, scratch = opts.outfile, opts.tmpdir

    # The temp directory must exist before it is registered.
    mkdir(scratch)
    set_tempdir(scratch)

    table = vcf_to_df(canvasvcfs, exonbed, opts.cpus)
    df_to_tsv(table, tsvfile, ".avgcn")
    df_to_tsv(table, tsvfile, ".medcn")
def __init__(
    self,
    data: TSPDataModel,
    work_dir=Work_dir,
    clean=True,
    verbose=False,
    precision=0,
    seed=666,
):
    """Run concorde on a TSP instance and store the parsed tour.

    The instance is written to ``<work_dir>/data.tsp``, concorde is invoked
    on that file, and whatever ``parse_output`` returns for the concorde
    output file is stored in ``self.tour``.

    Args:
        data (TSPDataModel): TSP instance with edge weights
        work_dir (optional): Path to the work dir. Defaults to Work_dir.
        clean (bool, optional): Clean up intermediate results. Defaults to True.
        verbose (bool, optional): Show verbose messages. Defaults to False.
        precision (int, optional): Float precision of distance. Defaults to 0.
        seed (int, optional): Random seed. Defaults to 666.
    """
    self.data, self.work_dir = data, work_dir
    self.clean, self.verbose = clean, verbose

    mkdir(work_dir)
    tsp_path = op.join(work_dir, "data.tsp")
    self.print_to_tsplib(tsp_path, precision=precision)
    # Only the second item returned by run_concorde() is used.
    _, solution_path = self.run_concorde(tsp_path, seed=seed)
    self.tour = self.parse_output(solution_path)

    if not clean:
        return

    # NOTE(review): the residual-file removal is assumed to belong to the
    # `clean` branch together with the work-dir removal -- confirm.
    shutil.rmtree(work_dir)
    FileShredder(["data.sol", "data.res", "Odata.res"], verbose=False)
def split_old(args):
    """
    Split a FASTA file into `args.N` parts with `pyfasta split`.

    Reads `args.fi` (input FASTA), `args.outdir` (output directory) and
    `args.N` (number of parts). The output directory is created if missing,
    otherwise its contents are removed first. The parts are expected to be
    named `part.<i>.fas` with a zero-padded index; their size range is printed.
    """
    fi, dirw = op.realpath(args.fi), op.realpath(args.outdir)
    n = args.N
    if not op.exists(dirw):
        mkdir(dirw)
    else:
        sh("rm -rf %s/*" % dirw)

    cwd = os.getcwd()
    os.chdir(dirw)
    try:
        # pyfasta names its outputs after the input, hence the fixed link name.
        sh("ln -sf %s part.fas" % fi)
        sh("pyfasta split -n %d part.fas" % n)
        sh("rm part.fas part.fas.*")

        # Zero-padded part names; the pattern is the same for every part.
        fmt = "part.%%0%dd.fas" % ndigit(n)
        sizes = sorted(os.stat(fmt % i).st_size for i in range(n))
    finally:
        # The starting directory was saved but never restored before.
        os.chdir(cwd)

    print("size range: %s - %s" % (prettysize(sizes[0]), prettysize(sizes[n - 1])))
def batchccn(args):
    """
    %prog batchccn test.csv

    Run CCN script in batch. Write makefile.
    """
    # The docstring above doubles as the usage text given to OptionParser.
    #
    # The CSV is expected to have two columns (sample key, BAM path). One
    # makefile rule is registered per row; nothing is executed here.
    p = OptionParser(batchccn.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (csvfile,) = args
    mm = MakeManager()
    pf = op.basename(csvfile).split(".")[0]
    mkdir(pf)

    # File objects have no `.next()` method on Python 3; use the builtin
    # next() and close the handle right after peeking at the first line.
    with open(csvfile) as fp:
        first_line = next(fp, "")
    # A first line ending in ".bam" is already data, i.e. there is no header.
    header = None if first_line.strip().endswith(".bam") else "infer"
    logging.debug("Header={}".format(header))
    df = pd.read_csv(csvfile, header=header)

    cmd = "perl /mnt/software/ccn_gcn_hg38_script/ccn_gcn_hg38.pl"
    cmd += " -n {} -b {}"
    # Only the -o placeholder is filled here; -n/-b are filled per sample.
    cmd += " -o {} -r hg38".format(pf)
    for i, (sample_key, bam) in df.iterrows():
        cmdi = cmd.format(sample_key, bam)
        outfile = "{}/{}/{}.ccn".format(pf, sample_key, sample_key)
        mm.add(csvfile, outfile, cmdi)
    mm.write()
def genemark(args):
    """
    %prog genemark species fastafile

    Train GENEMARK model given fastafile. GENEMARK self-trains so no trainig
    model gff file is needed.
    """
    # The docstring above doubles as the usage text given to OptionParser.
    #
    # Runs `gm_es.pl` inside a `genemark` directory on a symlink to
    # `../<fastafile>`, after checking that `~/.gm_key` exists.
    p = OptionParser(genemark.__doc__)
    p.set_home("gmes")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    species, fastafile = args
    gmes_home = opts.gmes_home
    gmdir = "genemark"
    mkdir(gmdir)

    startdir = os.getcwd()
    os.chdir(gmdir)
    sh("ln -sf ../{0}".format(fastafile))

    keyfile = op.expanduser("~/.gm_key")
    assert op.exists(keyfile), "License key ({0}) not found!".format(keyfile)
    sh("{0}/gm_es.pl {1}".format(gmes_home, fastafile))

    os.chdir(startdir)
    message = "GENEMARK matrix written to `{0}/mod/{1}.mod`".format(gmdir, species)
    logging.debug(message)
def SH_raxml(reftree, querytree, phy_file, shout="SH_out.txt"):
    """
    SH test using RAxML

    querytree can be a single tree or a bunch of trees (eg. from bootstrapping)

    Args:
        reftree: path of the reference tree; must be an existing file.
        querytree: path of the tree(s) to test.
        phy_file: alignment file; RAxML works in a `raxml_work` directory
            created next to it.
        shout: file the result line is appended to.

    Returns:
        The name of the output file. A result line is only appended when the
        "Significantly..." line is found in RAxML's output; otherwise a
        failure message goes to stderr.
    """
    assert op.isfile(reftree)
    shout = must_open(shout, "a")
    try:
        raxml_work = op.abspath(op.join(op.dirname(phy_file), "raxml_work"))
        mkdir(raxml_work)
        raxml_cl = RaxmlCommandline(
            cmd=RAXML_BIN("raxmlHPC"),
            sequences=phy_file, algorithm="h", model="GTRGAMMA",
            name="SH", starting_tree=reftree, bipartition_filename=querytree,
            working_dir=raxml_work,
        )

        logging.debug("Running SH test in RAxML: %s" % raxml_cl)
        o, stderr = raxml_cl()
        # hard coded
        try:
            pval = re.search("(Significantly.*:.*)", o).group(0)
        except (AttributeError, TypeError):
            # No match (None has no .group) or no usable text output.
            print("SH test failed.", file=sys.stderr)
        else:
            # "\\%" spells the backslash explicitly instead of relying on the
            # invalid escape "\%"; the resulting text is unchanged.
            pval = pval.strip().replace("\t", " ").replace("%", "\\%")
            print("{0}\t{1}".format(op.basename(querytree), pval), file=shout)
            logging.debug("SH p-value appended to %s" % shout.name)
    finally:
        # Close the output even when RAxML itself raises.
        shout.close()

    return shout.name
def build_ml_raxml(alignment, outfile, work_dir=".", **kwargs):
    """
    build maximum likelihood tree of DNA seqs with RAxML

    Args:
        alignment: alignment object written out with AlignIO in
            "phylip-relaxed" format.
        outfile: destination the bipartitions tree is copied to.
        work_dir: parent of the `work` directory that receives `aln.phy`.
        **kwargs: extra keyword arguments forwarded to RaxmlCommandline.

    Returns:
        (outfile, phy_file) on success, or None when RAxML did not produce
        `RAxML_bipartitions.aln`. The RAxML scratch directory is removed in
        both cases.
    """
    work_dir = op.join(work_dir, "work")
    mkdir(work_dir)
    phy_file = op.join(work_dir, "aln.phy")
    # `file()` does not exist on Python 3; open() in a context manager also
    # guarantees the alignment is flushed before RAxML reads it.
    with open(phy_file, "w") as fw:
        AlignIO.write(alignment, fw, "phylip-relaxed")

    raxml_work = op.abspath(op.join(op.dirname(phy_file), "raxml_work"))
    mkdir(raxml_work)
    raxml_cl = RaxmlCommandline(
        cmd=RAXML_BIN("raxmlHPC"),
        sequences=phy_file, algorithm="a", model="GTRGAMMA",
        parsimony_seed=12345, rapid_bootstrap_seed=12345,
        num_replicates=100, name="aln",
        working_dir=raxml_work, **kwargs
    )

    logging.debug("Building ML tree using RAxML: %s" % raxml_cl)
    stdout, stderr = raxml_cl()

    tree_file = "{0}/RAxML_bipartitions.aln".format(raxml_work)
    if not op.exists(tree_file):
        print("***RAxML failed.", file=sys.stderr)
        sh("rm -rf %s" % raxml_work, log=False)
        return None
    sh("cp {0} {1}".format(tree_file, outfile), log=False)

    logging.debug("ML tree printed to %s" % outfile)
    sh("rm -rf %s" % raxml_work)

    return outfile, phy_file
def genemark(args):
    """
    %prog genemark species fastafile

    Train GENEMARK model given fastafile. GENEMARK self-trains so no trainig
    model gff file is needed.
    """
    # The docstring above doubles as the usage text given to OptionParser.
    #
    # Steps: create `genemark/`, enter it, symlink the FASTA from the parent
    # directory, verify the license key file, run gm_es.pl, then go back.
    p = OptionParser(genemark.__doc__)
    p.set_home("gmes")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    species, fastafile = args
    mhome = opts.gmes_home
    gmdir = "genemark"
    mkdir(gmdir)

    origin = os.getcwd()
    os.chdir(gmdir)

    link_cmd = "ln -sf ../{0}".format(fastafile)
    sh(link_cmd)

    gm_key = op.expanduser("~/.gm_key")
    assert op.exists(gm_key), "License key ({0}) not found!".format(gm_key)

    train_cmd = "{0}/gm_es.pl {1}".format(mhome, fastafile)
    sh(train_cmd)

    os.chdir(origin)
    logging.debug(
        "GENEMARK matrix written to `{0}/mod/{1}.mod`".format(gmdir, species)
    )
def batchoverlap(args):
    """
    %prog batchoverlap pairs.txt outdir

    Check overlaps between pairs of sequences.
    """
    # The docstring above doubles as the usage text given to OptionParser.
    #
    # For each pair (a, b) in the pairs file, prints a `goldenpath overlap`
    # command comparing `<outdir>/a.fa` with `<outdir>/b.fa`. The commands are
    # only printed, not run; the `overlaps` output directory is created here.
    p = OptionParser(batchoverlap.__doc__)
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    pairsfile, outdir = args
    mkdir("overlaps")

    cmds = []
    # Close the pairs file once read instead of leaking the handle.
    with open(pairsfile) as fp:
        for row in fp:
            atoms = row.split()
            if not atoms:
                # Skip blank lines rather than failing on the unpack below.
                continue
            # Columns beyond the first two are ignored.
            a, b = atoms[:2]
            oa = op.join(outdir, a + ".fa")
            ob = op.join(outdir, b + ".fa")
            cmd = "python -m jcvi.assembly.goldenpath overlap {0} {1}".format(oa, ob)
            cmd += " -o overlaps/{0}_{1}.ov".format(a, b)
            cmds.append(cmd)

    # print() call instead of the Python 2-only print statement.
    print("\n".join(cmds))
def sra(args):
    """
    %prog sra [term|term.ids]

    Given an SRA run ID, fetch the corresponding .sra file from the sra-instant FTP.
    The term can also be a file containing list of SRR ids, one per line.

    Once downloaded, the SRA file is processed through `fastq-dump` to produce
    FASTQ formatted sequence files, which are gzipped by default.
    """
    # The docstring above doubles as the usage text given to OptionParser.
    p = OptionParser(sra.__doc__)
    p.add_option("--nogzip", dest="nogzip",
                 default=False, action="store_true",
                 help="Do not gzip the FASTQ generated by fastq-dump")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (term,) = args
    if op.isfile(term):
        # Close the id list promptly and ignore blank lines, which would
        # otherwise be passed on as empty terms.
        with open(term) as fp:
            terms = [x.strip() for x in fp if x.strip()]
    else:
        terms = [term]

    for term in terms:
        srafile = download_srr_term(term)
        # Output directory = downloaded filename up to the first dot.
        pf = srafile.split(".")[0]
        mkdir(pf)
        _opts = [srafile, "--paired", "--outdir={0}".format(pf)]
        if not opts.nogzip:
            _opts.append("--compress=gzip")
        fromsra(_opts)
def prepare_synteny(tourfile, lastfile, odir, p, opts):
    """
    Prepare synteny plots for movie().

    Side effects: `odir` is recreated (`overwrite=True`), the process changes
    its working directory into `odir` and stays there, an anchors file is
    written in `odir`, and the subject BED is symlinked into `odir`.

    Args:
        tourfile: not used by this function.
        lastfile: BLAST-format file the anchors are derived from.
        odir: output directory.
        p: option parser, forwarded to get_bed_filenames().
        opts: parsed options, forwarded to get_bed_filenames().

    Returns:
        Tuple of (anchors filename relative to `odir`, absolute query BED
        path, dict built from `qbed.sub_beds()`).
    """
    qbedfile, sbedfile = get_bed_filenames(lastfile, p, opts)
    # Made absolute up front since the working directory changes below.
    qbedfile = op.abspath(qbedfile)
    sbedfile = op.abspath(sbedfile)

    qbed = Bed(qbedfile, sorted=False)
    contig_to_beds = dict(qbed.sub_beds())

    # Create a separate directory for the subplots and movie
    mkdir(odir, overwrite=True)
    os.chdir(odir)
    logging.debug("Change into subdir `{}`".format(odir))

    # Make anchorsfile: "<field1>.<field2>.anchors" from the lastfile basename
    anchorsfile = ".".join(op.basename(lastfile).split(".", 2)[:2]) + ".anchors"
    # NOTE(review): `lastfile` is opened after the chdir; presumably callers
    # supply an absolute path -- confirm.
    with open(anchorsfile, "w") as fw:
        for b in Blast(lastfile):
            row = "\t".join(
                (gene_name(b.query), gene_name(b.subject), str(int(b.score)))
            )
            print(row, file=fw)

    # Symlink sbed
    symlink(sbedfile, op.basename(sbedfile))

    return anchorsfile, qbedfile, contig_to_beds
def sra(args):
    """
    %prog sra [term|term.ids]

    Given an SRA run ID, fetch the corresponding .sra file from the sra-instant FTP.
    The term can also be a file containing list of SRR ids, one per line.

    Once downloaded, the SRA file is processed through `fastq-dump` to produce
    FASTQ formatted sequence files, which are gzipped by default.
    """
    # The docstring above doubles as the usage text given to OptionParser.
    p = OptionParser(sra.__doc__)
    # The option belongs on this function's own parser (`sp1` was undefined).
    p.add_option("--nogzip", dest="nogzip",
                 default=False, action="store_true",
                 help="Do not gzip the FASTQ generated by fastq-dump")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (term,) = args
    if op.isfile(term):
        # Close the id list promptly and ignore blank lines.
        with open(term) as fp:
            terms = [x.strip() for x in fp if x.strip()]
    else:
        terms = [term]

    for term in terms:
        srafile = download_srr_term(term)
        # Output directory = downloaded filename up to the first dot.
        pf = srafile.split(".")[0]
        mkdir(pf)
        _opts = [srafile, "--paired", "--outdir={0}".format(pf)]
        # The flag lives on the parsed options, not on the positional list,
        # and the argument list being built is `_opts` (`_args` was undefined).
        if not opts.nogzip:
            _opts.append("--compress=gzip")
        fromsra(_opts)
def merge(args):
    """
    %prog merge folder1 ...

    Consolidate split contents in the folders. The folders can be generated by
    the split() process and several samples may be in separate fastq files. This
    program merges them.
    """
    # The docstring above doubles as the usage text given to OptionParser.
    #
    # Files matching `*.*.fastq` across all folders are bucketed by the first
    # dot-separated field of their basename and each bucket is merged into
    # `<outdir>/<field>.fastq`.
    p = OptionParser(merge.__doc__)
    p.set_outdir(outdir="outdir")
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    folders = args
    outdir = opts.outdir
    mkdir(outdir)

    def leading_field(path):
        # Grouping key shared by the sort and the groupby below.
        return op.basename(path).split(".")[0]

    fastqs = list(flatten(glob("{0}/*.*.fastq".format(x)) for x in folders))
    # Sort first so that files with equal keys are adjacent for groupby().
    fastqs.sort(key=leading_field)

    for label, bucket in groupby(fastqs, key=leading_field):
        sources = list(bucket)
        target = op.join(outdir, "{0}.fastq".format(label))
        FileMerger(sources, outfile=target).merge(checkexists=True)
def link(args):
    """
    %prog link metafile

    Link source to target based on a tabular file.
    """
    # The docstring above doubles as the usage text given to OptionParser.
    #
    # Each metafile row holds two whitespace-separated fields: source and
    # target. Sources are resolved relative to the metafile's directory;
    # targets go into --dir when given.
    from jcvi.apps.base import mkdir

    p = OptionParser(link.__doc__)
    p.add_option("--dir", help="Place links in a subdirectory")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (meta,) = args
    d = opts.dir
    if d:
        mkdir(d)

    # Context manager so the metafile is closed even if a row is malformed.
    with open(meta) as fp:
        cwd = op.dirname(get_abs_path(meta))
        for row in fp:
            if not row.strip():
                # Tolerate blank lines instead of failing on the unpack below.
                continue
            source, target = row.split()
            source = op.join(cwd, source)
            if d:
                target = op.join(d, target)
            lnsf(source, target, log=True)
def pairs(args):
    """
    %prog pairs folder reference.fasta

    Estimate insert size distribution. Compatible with a variety of aligners,
    including CLC, BOWTIE and BWA.
    """
    # The docstring above doubles as the usage text given to OptionParser.
    #
    # For every (read files, prefix) yielded by iter_project(): subsample the
    # reads into one file, align it inside the work directory, derive a
    # library name from the median reported by the pairs routine, and collect
    # "source<TAB>link name" lines that are finally written to `f.meta`.
    p = OptionParser(pairs.__doc__)
    p.set_firstN()
    p.set_mates()
    p.set_aligner()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    startdir = os.getcwd()
    aligner = opts.aligner
    workdir = "-".join(["pairs", aligner])
    mkdir(workdir)

    # CLC has its own aligner and pairs routine; the others share the SAM one.
    # `align` stays unbound for any aligner value not listed here.
    if aligner == "clc":
        from jcvi.apps.clc import align
        from jcvi.formats.cas import pairs as ps
    else:
        from jcvi.formats.sam import pairs as ps

        if aligner == "bowtie":
            from jcvi.apps.bowtie import align
        elif aligner == "bwa":
            from jcvi.apps.bwa import align

    folder, ref = args
    ref = get_abs_path(ref)
    lines = []
    for reads, prefix in iter_project(folder, 2):
        subset = op.join(workdir, prefix + ".first.fastq")
        first([str(opts.firstN)] + reads + ["-o", subset])

        # The aligner is run from inside the work directory on the basename.
        os.chdir(workdir)
        outfile, logfile = align([ref, op.basename(subset)])
        bedfile, stats = ps([outfile, "--rclip={0}".format(opts.rclip)])
        os.chdir(startdir)

        median = stats.median
        tag = "MP" if median > 1000 else "PE"
        # Round up to the first two effective digits, e.g. 3456 -> 3500.
        # NOTE(review): assumes str(median) is plain digits -- confirm.
        digits = str(median)
        head, tail = digits[:2], digits[2:]
        if tail and int(tail) != 0:
            head = str(int(head) + 1)
        lib = "{0}-{1}".format(tag, head + "0" * len(tail))

        for rank, readfile in enumerate(reads, 1):
            if readfile.endswith(".gz"):
                suffix = "fastq.gz"
            else:
                suffix = "fastq"
            link = "{0}-{1}.{2}.{3}".format(
                lib, prefix.replace("-", ""), rank, suffix
            )
            lines.append("\t".join((str(readfile), link)))

    write_file("f.meta", "\n".join(lines), tee=True)
def build_ml_raxml(alignment, outfile, work_dir=".", **kwargs):
    """
    build maximum likelihood tree of DNA seqs with RAxML

    Args:
        alignment: alignment object written out with AlignIO in
            "phylip-relaxed" format.
        outfile: destination the bipartitions tree is copied to.
        work_dir: parent of the `work` directory that receives `aln.phy`.
        **kwargs: extra keyword arguments forwarded to RaxmlCommandline.

    Returns:
        (outfile, phy_file) on success, or None when RAxML did not produce
        `RAxML_bipartitions.aln`. The RAxML scratch directory is removed in
        both cases.
    """
    work_dir = op.join(work_dir, "work")
    mkdir(work_dir)
    phy_file = op.join(work_dir, "aln.phy")
    # `file()` is a Python 2-only builtin; open() in a context manager also
    # guarantees the alignment is flushed before RAxML reads it.
    with open(phy_file, "w") as fw:
        AlignIO.write(alignment, fw, "phylip-relaxed")

    raxml_work = op.abspath(op.join(op.dirname(phy_file), "raxml_work"))
    mkdir(raxml_work)
    raxml_cl = RaxmlCommandline(
        cmd=RAXML_BIN("raxmlHPC"),
        sequences=phy_file, algorithm="a", model="GTRGAMMA",
        parsimony_seed=12345, rapid_bootstrap_seed=12345,
        num_replicates=100, name="aln",
        working_dir=raxml_work, **kwargs
    )

    logging.debug("Building ML tree using RAxML: %s" % raxml_cl)
    stdout, stderr = raxml_cl()

    tree_file = "{0}/RAxML_bipartitions.aln".format(raxml_work)
    if not op.exists(tree_file):
        # print() call instead of the Python 2-only `print >>` statement.
        print("***RAxML failed.", file=sys.stderr)
        sh("rm -rf %s" % raxml_work, log=False)
        return None
    sh("cp {0} {1}".format(tree_file, outfile), log=False)

    logging.debug("ML tree printed to %s" % outfile)
    sh("rm -rf %s" % raxml_work)

    return outfile, phy_file
def SH_raxml(reftree, querytree, phy_file, shout="SH_out.txt"):
    """
    SH test using RAxML

    querytree can be a single tree or a bunch of trees (eg. from bootstrapping)

    Args:
        reftree: path of the reference tree; must be an existing file.
        querytree: path of the tree(s) to test.
        phy_file: alignment file; RAxML works in a `raxml_work` directory
            created next to it.
        shout: file the result line is appended to.

    Returns:
        The name of the output file. A result line is only appended when the
        "Significantly..." line is found in RAxML's output; otherwise a
        failure message goes to stderr.
    """
    assert op.isfile(reftree)
    shout = must_open(shout, "a")
    try:
        raxml_work = op.abspath(op.join(op.dirname(phy_file), "raxml_work"))
        mkdir(raxml_work)
        raxml_cl = RaxmlCommandline(
            cmd=RAXML_BIN("raxmlHPC"),
            sequences=phy_file, algorithm="h", model="GTRGAMMA",
            name="SH", starting_tree=reftree, bipartition_filename=querytree,
            working_dir=raxml_work,
        )

        logging.debug("Running SH test in RAxML: %s" % raxml_cl)
        o, stderr = raxml_cl()
        # hard coded
        try:
            pval = re.search("(Significantly.*:.*)", o).group(0)
        except (AttributeError, TypeError):
            # Only the two expected failures are handled: no match (None has
            # no .group) or no usable text output from RAxML.
            print("SH test failed.", file=sys.stderr)
        else:
            # "\\%" spells the backslash explicitly instead of relying on the
            # invalid escape "\%"; the resulting text is unchanged.
            pval = pval.strip().replace("\t", " ").replace("%", "\\%")
            print("{0}\t{1}".format(op.basename(querytree), pval), file=shout)
            logging.debug("SH p-value appended to %s" % shout.name)
    finally:
        # Close the output even when RAxML itself raises.
        shout.close()

    return shout.name
def link(args):
    """
    %prog link metafile

    Link source to target based on a tabular file.
    """
    # The docstring above doubles as the usage text given to OptionParser.
    #
    # Each metafile row holds two whitespace-separated fields: source and
    # target. Sources are passed through get_abs_path(); targets go into
    # --dir when given.
    from jcvi.apps.base import mkdir

    p = OptionParser(link.__doc__)
    p.add_option("--dir",
                 help="Place links in a subdirectory [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (meta,) = args
    d = opts.dir
    if d:
        mkdir(d)

    # Context manager so the metafile is closed even if a row is malformed.
    with open(meta) as fp:
        for row in fp:
            if not row.strip():
                # Tolerate blank lines instead of failing on the unpack below.
                continue
            source, target = row.split()
            source = get_abs_path(source)
            if d:
                target = op.join(d, target)
            lnsf(source, target, log=True)
def batch(args):
    """
    %prog batch splits output

    The arguments are two folders.
    Input FASTA sequences are in splits/.
    Output csv files are in output/.

    Must have folders swissprot/, tair/, trembl/ that contains the respective
    BLAST output. Once finished, you can run, for example:

    $ parallel java -Xmx2g -jar ~/code/AHRD/dist/ahrd.jar {} ::: output/*.yml
    """
    # The docstring above doubles as the usage text given to OptionParser.
    #
    # Writes one `<name>.yml` into `output` for every `*.fa*` file in `splits`,
    # rendered from the module-level `Template`. With --iprscan, two InterPro
    # support files are symlinked into the current directory if missing.
    p = OptionParser(batch.__doc__)

    # Score weights (bit score, database score, overlap score) per program.
    ahrd_weights = {"blastp": [0.5, 0.3, 0.2], "blastx": [0.6, 0.4, 0.0]}
    blast_progs = tuple(ahrd_weights.keys())

    p.add_option("--path", default="~/code/AHRD/",
                 help="Path where AHRD is installed [default: %default]")
    p.add_option("--blastprog", default="blastp", choices=blast_progs,
                 help="Specify the blast program being run. Based on this option,"
                 + " the AHRD parameters (score_weights) will be modified."
                 + " [default: %default]")
    p.add_option("--iprscan", default=None,
                 help="Specify path to InterProScan results file if available."
                 + " If specified, the yml conf file will be modified"
                 + " appropriately. [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    splits, output = args
    mkdir(output)

    bit_score, db_score, ovl_score = ahrd_weights[opts.blastprog]

    # Loop-invariant values; `resources` also avoids shadowing builtin `dir`.
    path = op.expanduser(opts.path)
    resources = op.join(path, "test/resources")
    interpro = iprscanTemplate.format(opts.iprscan) if opts.iprscan else ""

    for f in glob("{0}/*.fa*".format(splits)):
        # Name stem = basename without its last extension.
        fb = op.basename(f).rsplit(".", 1)[0]
        outfile = op.join(output, fb + ".csv")
        # Close each config file as soon as it is written.
        with open(op.join(output, fb + ".yml"), "w") as fw:
            print(Template.format(resources, fb, f, outfile,
                                  bit_score, db_score, ovl_score, interpro),
                  file=fw)

    if opts.iprscan:
        # lexists() so that an existing (even dangling) link is left alone.
        for support_file in ("interpro.xml", "interpro.dtd"):
            if not op.lexists(support_file):
                symlink(op.join(iprscan_datadir, support_file), support_file)
def calc(args):
    """
    %prog calc [prot.fasta] cds.fasta > out.ks

    Protein file is optional. If only one file is given, it is assumed to
    be CDS sequences with correct frame (frame 0). Results will be written to
    stdout. Both protein file and nucleotide file are assumed to be Fasta format,
    with adjacent records as the pairs to compare.

    Author: Haibao Tang <*****@*****.**>, Brad Chapman
    Calculate synonymous mutation rates for gene pairs

    This does the following:
        1. Fetches a protein pair.
        2. Aligns the protein pair with clustalw
        3. Convert the output to Fasta format.
        4. Use this alignment info to align gene sequences using PAL2NAL
        5. Run PAML yn00 to calculate synonymous mutation rates.
    """
    # The docstring above doubles as the usage text given to OptionParser.
    p = OptionParser(calc.__doc__)
    set_outfile(p)
    opts, args = p.parse_args(args)

    if len(args) == 1:
        protein_file, dna_file = None, args[0]
    elif len(args) == 2:
        protein_file, dna_file = args
    else:
        # print() call instead of the Python 2-only `print >>` statement.
        print("Incorrect arguments", file=sys.stderr)
        sys.exit(not p.print_help())

    # Left open on purpose: must_open() is given the --outfile value, which
    # may stand for stdout according to the usage text.
    output_h = must_open(opts.outfile, "w")
    output_h.write("name,dS-yn,dN-yn,dS-ng,dN-ng\n")
    work_dir = op.join(os.getcwd(), "syn_analysis")
    mkdir(work_dir)

    if not protein_file:
        protein_file = translate_dna(dna_file)

    # Close both FASTA inputs deterministically once all pairs are consumed.
    with open(protein_file) as prot_fh, open(dna_file) as dna_fh:
        prot_iterator = SeqIO.parse(prot_fh, "fasta")
        dna_iterator = SeqIO.parse(dna_fh, "fasta")
        # Repeating each iterator makes zip() deliver adjacent records as one
        # (protein pair, nucleotide pair) tuple.
        record_quads = zip(prot_iterator, prot_iterator,
                           dna_iterator, dna_iterator)
        for p_rec_1, p_rec_2, n_rec_1, n_rec_2 in record_quads:
            print("--------", p_rec_1.name, p_rec_2.name, file=sys.stderr)
            align_fasta = clustal_align_protein(p_rec_1, p_rec_2, work_dir)
            mrtrans_fasta = run_mrtrans(align_fasta, n_rec_1, n_rec_2, work_dir)
            if mrtrans_fasta:
                ds_subs_yn, dn_subs_yn, ds_subs_ng, dn_subs_ng = find_synonymous(
                    mrtrans_fasta, work_dir
                )
                if ds_subs_yn is not None:
                    pair_name = "%s;%s" % (p_rec_1.name, p_rec_2.name)
                    fields = (pair_name, ds_subs_yn, dn_subs_yn,
                              ds_subs_ng, dn_subs_ng)
                    output_h.write("%s\n" % (",".join(str(x) for x in fields)))
                    output_h.flush()

    # Clean-up
    sh("rm -rf 2YN.t 2YN.dN 2YN.dS rst rub rst1 syn_analysis")
def correct(args):
    """
    %prog correct *.fastq

    Correct the fastqfile and generated corrected fastqfiles. This calls
    assembly.allpaths.prepare() to generate input files for ALLPATHS-LG. The
    naming convention for your fastqfiles are important, and are listed below.

    By default, this will correct all PE reads, and remove duplicates of all MP
    reads, and results will be placed in `frag_reads.corr.{pairs,frags}.fastq`
    and `jump_reads.corr.{pairs,frags}.fastq`.
    """
    # The docstring is the usage text; implementation notes stay in comments.
    from jcvi.assembly.allpaths import prepare
    from jcvi.assembly.base import FastqNamings

    parser = OptionParser(correct.__doc__ + FastqNamings)
    parser.add_option(
        "--nofragsdedup", default=False, action="store_true",
        help="Don't deduplicate the fragment reads [default: %default]")
    parser.add_option(
        "--cpus", default=32, type="int",
        help="Number of threads to run [default: %default]")
    opts, args = parser.parse_args(args)

    if not args:
        sys.exit(not parser.print_help())

    fastq = args
    frag_tag, jump_tag = "frag_reads", "jump_reads"

    # Generate the ALLPATHS input description without launching the run
    prepare(["Unknown"] + fastq + ["--norun"])

    datadir = "data"
    mkdir(datadir)
    fullpath = op.join(os.getcwd(), datadir)
    nthreads = " NUM_THREADS={0}".format(opts.cpus)
    # Quality offset is guessed from the first fastq only
    is_phred64 = guessoffset([args[0]]) == 64

    frag_fastb = "{0}/{1}_orig".format(datadir, frag_tag) + ".fastb"
    if need_update(fastq, frag_fastb):
        prep_cmd = "PrepareAllPathsInputs.pl DATA_DIR={0} HOSTS='{1}'".format(
            fullpath, opts.cpus)
        if is_phred64:
            prep_cmd += " PHRED_64=True"
        sh(prep_cmd)

    if op.exists(frag_fastb):
        correct_frag(datadir, frag_tag, frag_fastb, nthreads,
                     dedup=not opts.nofragsdedup)

    jump_fastb = "{0}/{1}_orig".format(datadir, jump_tag) + ".fastb"
    if op.exists(jump_fastb):
        correct_jump(datadir, jump_tag, jump_fastb, nthreads)
def stats(args):
    """
    %prog stats infile.gff

    Collect gene statistics based on gff file. There are some terminology issues
    here and so normally we call "gene" are actually mRNA, and sometimes "exon"
    are actually CDS, but they are configurable.

    Thee numbers are written to text file in four separate folders,
    corresponding to the four metrics:

    Exon length, Intron length, Gene length, Exon count

    With data written to disk then you can run %prog histogram
    """
    # The docstring is the usage text; implementation notes stay in comments.
    parser = OptionParser(stats.__doc__)
    parser.add_option("--gene", default="mRNA",
                      help="The gene type [default: %default]")
    parser.add_option("--exon", default="CDS",
                      help="The exon type [default: %default]")
    opts, args = parser.parse_args(args)

    if len(args) != 1:
        sys.exit(not parser.print_help())

    (gff_file,) = args
    db = make_index(gff_file)

    exon_lengths, intron_lengths = [], []
    gene_lengths, exon_counts = [], []
    for feat in db.features_of_type(opts.gene):
        # Direct children of the requested exon type only
        exons = [(child.chrom, child.start, child.stop)
                 for child in db.children(feat.id, 1)
                 if child.featuretype == opts.exon]
        introns = range_interleave(exons)

        per_exon = [stop - start + 1 for (_, start, stop) in exons]
        per_intron = [stop - start + 1 for (_, start, stop) in introns]

        exon_lengths.extend(per_exon)
        intron_lengths.extend(per_intron)
        # "Gene length" here is the summed exon length of the feature
        gene_lengths.append(sum(per_exon))
        exon_counts.append(len(per_exon))

    summaries = (
        SummaryStats(exon_lengths),
        SummaryStats(intron_lengths),
        SummaryStats(gene_lengths),
        SummaryStats(exon_counts),
    )
    for summary, title in zip(summaries, metrics):
        summary.title = title
        print(summary, file=sys.stderr)

    prefix = gff_file.split(".")[0]
    for summary in summaries:
        # One folder per metric, one text file per input gff
        folder = summary.title
        mkdir(folder)
        summary.tofile(op.join(folder, prefix + ".txt"))
def batch(args):
    """
    %prog batch splits output

    The arguments are two folders.
    Input FASTA sequences are in splits/.
    Output csv files are in output/.

    Must have folders swissprot/, tair/, trembl/ that contains the respective
    BLAST output. Once finished, you can run, for example:

    $ parallel java -Xmx2g -jar ~/code/AHRD/dist/ahrd.jar {} ::: output/*.yml
    """
    # NOTE: the docstring above doubles as the command-line usage text, so
    # implementation notes are kept in comments.
    #
    # args: list of command-line tokens (two positionals: splits, output).
    # Writes one `<basename>.yml` per `splits/*.fasta` file into `output`.
    p = OptionParser(batch.__doc__)

    # Per-program weights, unpacked below as bit_score, db_score, ovl_score
    ahrd_weights = {
        "blastp": [0.5, 0.3, 0.2],
        "blastx": [0.6, 0.4, 0.0],
    }
    blast_progs = tuple(ahrd_weights.keys())

    p.add_option("--path", default="~/code/AHRD/",
                 help="Path where AHRD is installed [default: %default]")
    p.add_option("--blastprog", default="blastp", choices=blast_progs,
                 help="Specify the blast program being run. Based on this option,"
                      " the AHRD parameters (score_weights) will be modified."
                      " [default: %default]")
    p.add_option("--iprscan", default=None,
                 help="Specify path to InterProScan results file if available."
                      " If specified, the yml conf file will be modified"
                      " appropriately. [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    splits, output = args
    mkdir(output)

    bit_score, db_score, ovl_score = ahrd_weights[opts.blastprog]

    # Independent of the input file: compute once instead of per iteration
    resources_dir = op.join(op.expanduser(opts.path), "test/resources")
    interpro = iprscanTemplate.format(opts.iprscan) if opts.iprscan else ""

    for f in glob("{0}/*.fasta".format(splits)):
        fb = op.basename(f).rsplit(".", 1)[0]
        outfile = op.join(output, fb + ".csv")
        # print() as a function, matching the other commands in this file;
        # the context manager closes every yml handle.
        with open(op.join(output, fb + ".yml"), "w") as fw:
            print(Template.format(resources_dir, fb, f, outfile, bit_score,
                                  db_score, ovl_score, interpro), file=fw)

    if opts.iprscan:
        # Link the InterPro support files into the working directory once
        for name in ("interpro.xml", "interpro.dtd"):
            if not op.lexists(name):
                symlink(op.join(iprscan_datadir, name), name)
def make_link(self, firstN=0):
    """Expose `self.fastq` at `self.link`, creating `self.genome` first.

    With `firstN` > 0 the `first` helper is invoked with
    `--outfile=<self.link>` (presumably writing the first N records --
    confirm against `first`). Otherwise any existing symlink at `self.link`
    is replaced by one pointing at the absolute path of `self.fastq`.
    """
    mkdir(self.genome)

    if firstN > 0:
        outfile_opt = "--outfile={0}".format(self.link)
        first([str(firstN), self.fastq, outfile_opt])
    else:
        # Only a stale symlink is removed; regular files are left untouched
        if op.islink(self.link):
            os.unlink(self.link)
        source = get_abs_path(self.fastq)
        os.symlink(source, self.link)
def mergecn(args):
    """
    %prog mergecn FACE.csv

    Compile matrix of GC-corrected copy numbers. Place a bunch of folders in
    csv file. Each folder will be scanned, one chromosomes after another.
    """
    # args: list of command-line tokens (one positional: the csv of folders).
    # For every seqid in `allsomes` this writes `beta/<seqid>.beta`,
    # `beta/<seqid>.std` and `<seqid>.bin` in the current directory.
    p = OptionParser(mergecn.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (csvfile,) = args
    with open(csvfile) as fp:
        samples = [x.replace("-cn", "").strip().strip("/") for x in fp]
    betadir = "beta"
    mkdir(betadir)
    for seqid in allsomes:
        names = [
            op.join(s + "-cn", "{}.{}.cn".format(op.basename(s), seqid))
            for s in samples
        ]
        # `np.float` was only an alias of the builtin float (float64) and
        # has been removed from NumPy; name the dtype explicitly.
        arrays = [np.fromfile(name, dtype=np.float64) for name in names]
        shapes = [x.shape[0] for x in arrays]
        med_shape = np.median(shapes)
        # Keep only samples whose vector length equals the median length
        arrays = [x for x in arrays if x.shape[0] == med_shape]
        ploidy = 2 if seqid not in ("chrY", "chrM") else 1
        if seqid in sexsomes:
            chr_med = [np.median([x for x in a if x > 0]) for a in arrays]
            chr_med = np.array(chr_med)
            idx = get_kmeans(chr_med, k=2)
            zero_med = np.median(chr_med[idx == 0])
            one_med = np.median(chr_med[idx == 1])
            logging.debug("K-means with {} c0:{} c1:{}".format(
                seqid, zero_med, one_med))
            higher_idx = 1 if one_med > zero_med else 0
            # Use the higher mean coverage component
            arrays = np.array(arrays)[idx == higher_idx]
        # Wrap each vector as a 1-row matrix so concatenation stacks samples
        arrays = [[x] for x in arrays]
        ar = np.concatenate(arrays)
        print(seqid, ar.shape)
        rows, columns = ar.shape
        beta = []
        std = []
        for j in range(columns):
            a = ar[:, j]
            beta.append(np.median(a))
            std.append(np.std(a) / np.mean(a))
        beta = np.array(beta) / ploidy
        betafile = op.join(betadir, "{}.beta".format(seqid))
        beta.tofile(betafile)
        stdfile = op.join(betadir, "{}.std".format(seqid))
        std = np.array(std)
        std.tofile(stdfile)
        logging.debug("Written to `{}`".format(betafile))
        ar.tofile("{}.bin".format(seqid))
def minimap(args):
    """
    %prog minimap ref.fasta query.fasta

    Wrap minimap2 aligner using query against sequences. When query and ref
    is the same, we are in "self-scan" mode (e.g. useful for finding internal
    duplications resulted from mis-assemblies).
    """
    # The docstring is the usage text; implementation notes stay in comments.
    from jcvi.apps.grid import MakeManager
    from jcvi.formats.fasta import Fasta

    parser = OptionParser(minimap.__doc__)
    parser.add_option(
        "--chunks",
        type="int",
        default=2000000,
        help="Split ref.fasta into chunks of size in self-scan mode",
    )
    parser.set_outdir(outdir="outdir")
    parser.set_cpus()
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    ref, query = args
    chunks = opts.chunks
    outdir = opts.outdir
    if ref != query:
        raise NotImplementedError

    # "self-scan" mode
    # build faidx (otherwise, parallel make may complain)
    sh("samtools faidx {}".format(ref))
    fasta = Fasta(ref)
    mkdir(outdir)
    mm = MakeManager()
    for name, size in fasta.itersizes():
        # Chunk ends step through multiples of `chunks` below `size`
        for end in range(chunks, size, chunks):
            begin = end - chunks + 1
            stem = op.join(outdir, "{}_{}_{}".format(name, begin, end))
            fafile = stem + ".fa"
            paffile = stem + ".paf"
            epsfile = stem + ".eps"

            extract_cmd = "samtools faidx {} {}:{}-{} -o {}".format(
                ref, name, begin, end, fafile)
            mm.add(ref, fafile, extract_cmd)

            align_cmd = "minimap2 -P {} {} > {}".format(fafile, fafile, paffile)
            mm.add(fafile, paffile, align_cmd)

            plot_cmd = "minidot {} > {}".format(paffile, epsfile)
            mm.add(paffile, epsfile, plot_cmd)
    mm.write()
def lobstrindex(args):
    """
    %prog lobstrindex hg38.trf.bed hg38.upper.fa

    Make lobSTR index. Make sure the FASTA contain only upper case (so use
    fasta.format --upper to convert from UCSC fasta). The bed file is generated
    by str().
    """
    # args: list of command-line tokens (bed file, FASTA file).
    # Registers the index-building commands with a MakeManager and writes
    # the makefile; with --notreds a de-duplicated `<bed>.new` is made first.
    p = OptionParser(lobstrindex.__doc__)
    p.add_option("--notreds", default=False, action="store_true",
                 help="Remove TREDs from the bed file")
    p.set_home("lobstr")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    trfbed, fastafile = args
    pf = fastafile.split(".")[0]
    lhome = opts.lobstr_home
    mkdir(pf)

    if opts.notreds:
        newbedfile = trfbed + ".new"
        retained = total = 0
        seen = set()
        # Both handles are closed on exit, including on a parse error
        with open(trfbed) as fp, open(newbedfile, "w") as newbed:
            for row in fp:
                r = STRLine(row)
                total += 1
                name = r.longname
                # Keep only the first record seen for each long name
                if name in seen:
                    continue
                seen.add(name)
                print(r, file=newbed)
                retained += 1
        logging.debug("Retained: {0}".format(percentage(retained, total)))
    else:
        newbedfile = trfbed

    mm = MakeManager()
    cmd = "python {0}/scripts/lobstr_index.py".format(lhome)
    cmd += " --str {0} --ref {1} --out {2}".format(newbedfile, fastafile, pf)
    mm.add((newbedfile, fastafile), op.join(pf, "lobSTR_ref.fasta.rsa"), cmd)

    tabfile = "{0}/index.tab".format(pf)
    cmd = "python {0}/scripts/GetSTRInfo.py".format(lhome)
    cmd += " {0} {1} > {2}".format(newbedfile, fastafile, tabfile)
    mm.add((newbedfile, fastafile), tabfile, cmd)

    infofile = "{0}/index.info".format(pf)
    cmd = "cp {0} {1}".format(newbedfile, infofile)
    mm.add(trfbed, infofile, cmd)
    mm.write()
def augustus(args):
    """
    %prog augustus species gffile fastafile

    Train AUGUSTUS model given gffile and fastafile. Whole procedure taken from:
    <http://www.molecularevolution.org/molevolfiles/exercises/augustus/training.html>
    """
    # args: list of command-line tokens (species, gffile, fastafile).
    # Runs the training steps inside ./augustus and copies the resulting
    # species folder there once training is done.
    p = OptionParser(augustus.__doc__)
    p.add_option(
        "--autotrain",
        default=False,
        action="store_true",
        help="Run autoAugTrain.pl to iteratively train AUGUSTUS",
    )
    p.set_home("augustus")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    species, gffile, fastafile = args
    # Resolve inputs before changing directory so relative paths stay valid
    gffile = os.path.abspath(gffile)
    fastafile = os.path.abspath(fastafile)
    mhome = opts.augustus_home
    augdir = "augustus"

    cwd = os.getcwd()
    mkdir(augdir)
    os.chdir(augdir)
    target = "{0}/config/species/{1}".format(mhome, species)

    # try/finally so a failing step cannot leave the process stranded in
    # the scratch directory.
    try:
        if op.exists(target):
            logging.debug("Removing existing target `{0}`".format(target))
            sh("rm -rf {0}".format(target))

        config_path = "{0}/config".format(mhome)
        sh("{0}/scripts/new_species.pl --species={1} --AUGUSTUS_CONFIG_PATH={2}".
           format(mhome, species, config_path))
        sh("{0}/scripts/gff2gbSmallDNA.pl {1} {2} 1000 raw.gb".format(
            mhome, gffile, fastafile))
        sh("{0}/bin/etraining --species={1} raw.gb 2> train.err".format(
            mhome, species))
        sh(r"cat train.err | perl -pe 's/.*in sequence (\S+): .*/$1/' > badgenes.lst"
           )
        sh("{0}/scripts/filterGenes.pl badgenes.lst raw.gb > training.gb".format(
            mhome))
        sh("grep -c LOCUS raw.gb training.gb")

        # autoAugTrain failed to execute, disable for now
        if opts.autotrain:
            sh("rm -rf {0}".format(target))
            sh("{0}/scripts/autoAugTrain.pl --trainingset=training.gb --species={1}"
               .format(mhome, species))
    finally:
        os.chdir(cwd)

    sh("cp -r {0} augustus/".format(target))
def lobstrindex(args):
    """
    %prog lobstrindex hg38.trf.bed hg38.upper.fa hg38

    Make lobSTR index. Make sure the FASTA contain only upper case (so use
    fasta.format --upper to convert from UCSC fasta). The bed file is generated
    by str().
    """
    # args: list of command-line tokens (bed file, FASTA file, output prefix).
    # Registers the index-building commands with a MakeManager and writes
    # the makefile; with --fixseq a rewritten `<bed>.new` is produced first.
    p = OptionParser(lobstrindex.__doc__)
    p.add_option("--fixseq", action="store_true", default=False,
                 help="Scan sequences to extract perfect STRs")
    p.set_home("lobstr")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    trfbed, fastafile, pf = args
    lhome = opts.lobstr_home
    mkdir(pf)

    if opts.fixseq:
        genome = pyfasta.Fasta(fastafile)
        newbedfile = trfbed + ".new"
        retained = total = 0
        # Both handles are closed on exit, including on a parse error
        with open(trfbed) as fp, open(newbedfile, "w") as newbed:
            for row in fp:
                s = STRLine(row)
                total += 1
                # One input line may yield several exact STR records
                for ns in s.iter_exact_str(genome):
                    if not ns.is_valid():
                        continue
                    print(ns, file=newbed)
                    retained += 1
        logging.debug("Retained: {0}".format(percentage(retained, total)))
    else:
        newbedfile = trfbed

    mm = MakeManager()
    cmd = "python {0}/scripts/lobstr_index.py".format(lhome)
    cmd += " --str {0} --ref {1} --out {2}".format(newbedfile, fastafile, pf)
    mm.add((newbedfile, fastafile), op.join(pf, "lobSTR_ref.fasta.rsa"), cmd)

    tabfile = "{0}/index.tab".format(pf)
    cmd = "python {0}/scripts/GetSTRInfo.py".format(lhome)
    cmd += " {0} {1} > {2}".format(newbedfile, fastafile, tabfile)
    mm.add((newbedfile, fastafile), tabfile, cmd)

    infofile = "{0}/index.info".format(pf)
    cmd = "cp {0} {1}".format(trfbed, infofile)
    mm.add(trfbed, infofile, cmd)
    mm.write()
def correct(args):
    """
    %prog correct *.fastq

    Correct the fastqfile and generated corrected fastqfiles. This calls
    assembly.allpaths.prepare() to generate input files for ALLPATHS-LG. The
    naming convention for your fastqfiles are important, and are listed below.

    By default, this will correct all PE reads, and remove duplicates of all MP
    reads, and results will be placed in `frag_reads.corr.{pairs,frags}.fastq`
    and `jump_reads.corr.{pairs,frags}.fastq`.
    """
    # The docstring is the usage text; implementation notes stay in comments.
    from jcvi.assembly.allpaths import prepare
    from jcvi.assembly.base import FastqNamings

    parser = OptionParser(correct.__doc__ + FastqNamings)
    parser.add_option(
        "--nofragsdedup", default=False, action="store_true",
        help="Don't deduplicate the fragment reads [default: %default]")
    parser.add_option(
        "--cpus", default=32, type="int",
        help="Number of threads to run [default: %default]")
    parser.add_option(
        "--phred64", default=False, action="store_true",
        help="Reads are all phred 64 offset [default: %default]")
    opts, args = parser.parse_args(args)

    if not args:
        sys.exit(not parser.print_help())

    fastq = args
    frag_tag, jump_tag = "frag_reads", "jump_reads"

    # Generate the ALLPATHS input description without launching the run
    prepare(["Unknown"] + fastq + ["--norun"])

    datadir = "data"
    mkdir(datadir)
    fullpath = op.join(os.getcwd(), datadir)
    nthreads = " NUM_THREADS={0}".format(opts.cpus)

    frag_fastb = "{0}/{1}_orig".format(datadir, frag_tag) + ".fastb"
    if need_update(fastq, frag_fastb):
        prep_cmd = "PrepareAllPathsInputs.pl DATA_DIR={0} HOSTS='{1}'".format(
            fullpath, opts.cpus)
        if opts.phred64:
            prep_cmd += " PHRED_64=True"
        sh(prep_cmd)

    if op.exists(frag_fastb):
        correct_frag(datadir, frag_tag, frag_fastb, nthreads,
                     dedup=not opts.nofragsdedup)

    jump_fastb = "{0}/{1}_orig".format(datadir, jump_tag) + ".fastb"
    if op.exists(jump_fastb):
        correct_jump(datadir, jump_tag, jump_fastb, nthreads)
def compilevcf(args):
    """
    %prog compilevcf samples.csv

    Compile vcf results into master spreadsheet.
    """
    # args: list of command-line tokens (one positional: a .vcf/.vcf.gz file
    # or a file listing one vcf per line).
    # Changes into the work directory, writes `STR.ids` when missing, then
    # runs `run_compile` over every vcf with a process pool.
    p = OptionParser(compilevcf.__doc__)
    p.add_option("--db", default="hg38", help="Use these lobSTR db")
    p.add_option(
        "--nofilter",
        default=False,
        action="store_true",
        help="Do not filter the variants",
    )
    p.set_home("lobstr")
    p.set_cpus()
    p.set_aws_opts(store="hli-mv-data-science/htang/str-data")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (samples,) = args
    workdir = opts.workdir
    store = opts.output_path
    cleanup = not opts.nocleanup
    filtered = not opts.nofilter
    dbs = opts.db.split(",")
    cwd = os.getcwd()
    mkdir(workdir)
    os.chdir(workdir)
    # Anchor the samples path to the launch directory, since we just moved
    samples = op.join(cwd, samples)

    stridsfile = "STR.ids"
    if samples.endswith((".vcf", ".vcf.gz")):
        vcffiles = [samples]
    else:
        vcffiles = [x.strip() for x in must_open(samples)]
    if not op.exists(stridsfile):
        ids = []
        for db in dbs:
            ids.extend(STRFile(opts.lobstr_home, db=db).ids)
        uids = uniqify(ids)
        logging.debug("Combined: {} Unique: {}".format(len(ids), len(uids)))

        with open(stridsfile, "w") as fw:
            print("\n".join(uids), file=fw)

    run_args = [(x, filtered, cleanup, store) for x in vcffiles]
    # Pool() rejects processes=0, which an empty sample list would produce
    cpus = max(1, min(opts.cpus, len(run_args)))
    pool = Pool(processes=cpus)
    try:
        # .get() blocks until all jobs finish and re-raises worker errors
        pool.map_async(run_compile, run_args).get()
    finally:
        # Release the worker processes instead of leaking them
        pool.close()
        pool.join()
def mergecn(args):
    """
    %prog mergecn FACE.csv

    Compile matrix of GC-corrected copy numbers. Place a bunch of folders in
    csv file. Each folder will be scanned, one chromosomes after another.
    """
    # args: list of command-line tokens (one positional: the csv of folders).
    # For every seqid in `allsomes` this writes `beta/<seqid>.beta`,
    # `beta/<seqid>.std` and `<seqid>.bin` in the current directory.
    p = OptionParser(mergecn.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    csvfile, = args
    with open(csvfile) as fp:
        samples = [x.replace("-cn", "").strip().strip("/") for x in fp]
    betadir = "beta"
    mkdir(betadir)
    for seqid in allsomes:
        names = [op.join(s + "-cn", "{}.{}.cn".
                 format(op.basename(s), seqid)) for s in samples]
        # `np.float` was only an alias of the builtin float (float64) and
        # has been removed from NumPy; name the dtype explicitly.
        arrays = [np.fromfile(name, dtype=np.float64) for name in names]
        shapes = [x.shape[0] for x in arrays]
        med_shape = np.median(shapes)
        # Keep only samples whose vector length equals the median length
        arrays = [x for x in arrays if x.shape[0] == med_shape]
        ploidy = 2 if seqid not in ("chrY", "chrM") else 1
        if seqid in sexsomes:
            chr_med = [np.median([x for x in a if x > 0]) for a in arrays]
            chr_med = np.array(chr_med)
            idx = get_kmeans(chr_med, k=2)
            zero_med = np.median(chr_med[idx == 0])
            one_med = np.median(chr_med[idx == 1])
            logging.debug("K-means with {} c0:{} c1:{}"
                          .format(seqid, zero_med, one_med))
            higher_idx = 1 if one_med > zero_med else 0
            # Use the higher mean coverage component
            arrays = np.array(arrays)[idx == higher_idx]
        # Wrap each vector as a 1-row matrix so concatenation stacks samples
        arrays = [[x] for x in arrays]
        ar = np.concatenate(arrays)
        # print() as a function: the statement form cannot coexist with the
        # print(..., file=...) calls used elsewhere in this file
        print(seqid, ar.shape)
        rows, columns = ar.shape
        beta = []
        std = []
        for j in range(columns):
            a = ar[:, j]
            beta.append(np.median(a))
            std.append(np.std(a) / np.mean(a))
        beta = np.array(beta) / ploidy
        betafile = op.join(betadir, "{}.beta".format(seqid))
        beta.tofile(betafile)
        stdfile = op.join(betadir, "{}.std".format(seqid))
        std = np.array(std)
        std.tofile(stdfile)
        logging.debug("Written to `{}`".format(betafile))
        ar.tofile("{}.bin".format(seqid))
def prepare(args):
    """
    %prog prepare countfolder families

    Parse list of count files and group per family into families folder.
    """
    # args: list of command-line tokens (count folder, output folder).
    # Writes `<tissue>-<ind>` merged counts plus a `.groups` file into the
    # output folder for every F1 sample.
    p = OptionParser(prepare.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    counts, families = args
    countfiles = glob(op.join(counts, "*.count"))
    countsdb = defaultdict(list)
    for c in countfiles:
        rs = RiceSample(c)
        countsdb[(rs.tissue, rs.ind)].append(rs)

    # Merge duplicates - data sequenced in different batches
    def key(x):
        return (x.label, x.rep)

    for (tissue, ind), rs in sorted(countsdb.items()):
        rs.sort(key=key)
        for i, ri in enumerate(rs):
            if not ri.working:
                continue
            for rj in rs[i + 1:]:
                if key(ri) != key(rj):
                    continue
                # Fold the later duplicate into the first one and retire it
                ri.merge(rj)
                rj.working = False
        countsdb[(tissue, ind)] = [x for x in rs if x.working]

    # Group into families
    # Create the folder the outputs are written to (the literal "families"
    # was created before, which broke any other output folder name).
    mkdir(families)
    for (tissue, ind), r in sorted(countsdb.items()):
        r = list(r)
        if r[0].label != "F1":
            continue
        P1, P2 = r[0].P1, r[0].P2
        P1, P2 = countsdb[(tissue, P1)], countsdb[(tissue, P2)]
        rs = P1 + P2 + r
        # Group labels: 1 = first parent, 2 = second parent, 3 = F1
        groups = [1] * len(P1) + [2] * len(P2) + [3] * len(r)
        assert len(rs) == len(groups)

        outfile = "-".join((tissue, ind))
        merge_counts(rs, op.join(families, outfile))
        groupsfile = outfile + ".groups"
        with open(op.join(families, groupsfile), "w") as fw:
            print(",".join(str(x) for x in groups), file=fw)
def omg(args):
    """
    %prog omg weightsfile

    Run Sankoff's OMG algorithm to get orthologs. Download OMG code at:
    <http://137.122.149.195/IsbraSoftware/OMGMec.html>

    This script only writes the partitions, but not launch OMGMec. You may need to:

    $ parallel "java -cp ~/code/OMGMec TestOMGMec {} 4 > {}.out" ::: work/gf?????

    Then followed by omgparse() to get the gene lists.
    """
    # args: list of command-line tokens (one or more weights files).
    # Writes one partition file `work/gfNNNNN` per line of the groups file.
    p = OptionParser(omg.__doc__)

    opts, args = p.parse_args(args)
    if len(args) < 1:
        sys.exit(not p.print_help())

    weightsfiles = args
    groupfile = group(weightsfiles + ["--outfile=groups"])

    weights = get_weights(weightsfiles)
    info = get_info()

    work = "work"
    mkdir(work)
    with open(groupfile) as fp:
        for i, row in enumerate(fp):
            gf = op.join(work, "gf{0:05d}".format(i))
            genes = row.rstrip().split(",")
            # Set gives O(1) membership tests in the inner loop
            members = set(genes)

            parts = []
            npairs = 0
            for gene in genes:
                gene_pairs = weights[gene]
                for a, b, c in gene_pairs:
                    # Only keep pairs whose partner is in the same group
                    if b not in members:
                        continue

                    parts.append("weight {0}".format(c) + "\n")
                    parts.append(info[a] + "\n")
                    parts.append(info[b] + "\n\n")
                    npairs += 1

            header = "a group of genes :length ={0}".format(npairs)
            with open(gf, "w") as fw:
                print(header, file=fw)
                print("".join(parts), file=fw)
def _get_records(self):
    """Fetch GenBank records for `self.idfile` into a fresh `gb` folder.

    If `mkdir` reports the folder was not created, the existing `gb` is
    moved to `gb_old` (replacing any earlier `gb_old`) and `gb` is created
    again.

    Returns the folder name (`"gb"`).
    Raises AssertionError if the folder still cannot be created.
    """
    gbdir = "gb"
    dirmade = mkdir(gbdir)
    if not dirmade:
        sh("rm -rf {0}_old; mv -f {0} {0}_old".format(gbdir,))
        # Call mkdir outside the assert: `python -O` strips assert
        # statements, which would silently skip creating the folder.
        remade = mkdir(gbdir)
        assert remade, "Cannot create folder `{0}`".format(gbdir)

    entrez([self.idfile, "--format=gb", "--database=nuccore",
            "--outdir={0}".format(gbdir)])

    logging.debug('GenBank records written to {0}.'.format(gbdir))
    return gbdir
def write_lst(bedfile):
    """Write one `<seqid>.lst` file per sequence of `bedfile`.

    The files go into a folder named after the part of the bed file's
    basename before the first dot. Each line holds a feature's `accn` with
    spaces removed, immediately followed by its `strand`.

    Returns a tuple `(folder, stanza)` where `stanza` is a list of
    `(seqid, lst_path)` pairs in the order yielded by `Bed.sub_beds()`.
    """
    pf = op.basename(bedfile).split(".")[0]
    mkdir(pf)
    bed = Bed(bedfile)
    stanza = []
    for seqid, bs in bed.sub_beds():
        fname = op.join(pf, "{0}.lst".format(seqid))
        # Context manager closes the handle even if a record is malformed
        with open(fname, "w") as fw:
            for b in bs:
                print("{0}{1}".format(b.accn.replace(" ", ""), b.strand),
                      file=fw)
        stanza.append((seqid, fname))
    return pf, stanza
def error(args):
    """
    %prog error backup_folder

    Find all errors in ../5-consensus/*.err and pull the error unitigs into
    backup/ folder.
    """
    # args: list of command-line tokens (one positional: backup folder).
    # Logs every (partID, unitigID) hit to `errors.log`, pulls the unitig
    # with `pull` and moves the resulting file into the backup folder.
    p = OptionParser(error.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    backup_folder, = args
    mkdir(backup_folder)

    seen = set()
    # Case-insensitive match; upper-case the marker once, not per row
    failed_marker = failed.upper()
    with open("errors.log", "w") as fw:
        for g in sorted(glob("../5-consensus/*.err")):
            if "partitioned" in g:
                continue

            partID = op.basename(g).rsplit(".err", 1)[0]
            partID = int(partID.split("_")[-1])

            with open(g) as fp:
                for row in fp:
                    if row.startswith(working):
                        # Remember the unitig currently being processed
                        unitigID = row.split("(")[0].split()[-1]
                        continue

                    if failed_marker not in row.upper():
                        continue

                    # NOTE(review): assumes a `working` line always precedes
                    # the first failure line; otherwise unitigID is unbound.
                    uu = (partID, unitigID)
                    if uu in seen:
                        continue
                    seen.add(uu)

                    print("\t".join(str(x) for x in (partID, unitigID)),
                          file=fw)

                    cmd = "{0} {1}".format(*uu)
                    unitigfile = pull(cmd.split())
                    cmd = "mv {0} {1}".format(unitigfile, backup_folder)
                    sh(cmd)

    logging.debug("A total of {0} unitigs saved to {1}.".
                  format(len(seen), backup_folder))
def compile(args):
    """
    %prog compile samples.csv

    Compile vcf results into master spreadsheet.
    """
    # args: list of command-line tokens (one positional: file listing one
    # vcf per line).
    # Changes into the work directory, writes `STR.ids` and `header.ids`
    # when `STR.ids` is missing, then runs `run` over every vcf in a pool.
    from multiprocessing import Pool

    p = OptionParser(compile.__doc__)
    p.add_option("--db", default="hg38,hg38-named", help="Use these lobSTR db")
    p.set_home("lobstr")
    p.set_cpus()
    p.set_aws_opts(store="hli-mv-data-science/htang/str")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    samples, = args
    workdir = opts.workdir
    dbs = opts.db.split(",")
    mkdir(workdir)
    os.chdir(workdir)

    stridsfile = "STR.ids"
    # NOTE(review): `samples` is opened after the chdir above, so a relative
    # path is resolved against the work directory -- confirm this is intended.
    vcffiles = [x.strip() for x in must_open(samples)]
    if not op.exists(stridsfile):
        ids = []
        for db in dbs:
            ids.extend(STRFile(opts.lobstr_home, db=db).ids)
        uids = uniqify(ids)
        logging.debug("Combined: {} Unique: {}".format(len(ids), len(uids)))

        with open(stridsfile, "w") as fw:
            print("\n".join(uids), file=fw)

        # Generate two alleles
        dipuids = []
        for uid in uids:
            dipuids.extend([uid + ".1", uid + ".2"])
        with open("header.ids", "w") as fw:
            print(",".join(dipuids), file=fw)

    run_args = [(x, opts.store, opts.cleanup) for x in vcffiles]
    pool = Pool(processes=opts.cpus)
    try:
        # .get() blocks until all jobs finish and re-raises worker errors
        pool.map_async(run, run_args).get()
    finally:
        # Release the worker processes instead of leaking them
        pool.close()
        pool.join()