Example #1
def uclust(args):
    """
    %prog uclust fastafile

    Use `usearch` to remove duplicate reads.
    """
    p = OptionParser(uclust.__doc__)
    p.set_align(pctid=98)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    identity = opts.pctid / 100.

    pf, sf = fastafile.rsplit(".", 1)
    sortedfastafile = pf + ".sorted.fasta"
    if need_update(fastafile, sortedfastafile):
        cmd = "usearch -sortbylength {0} -fastaout {1}".\
                    format(fastafile, sortedfastafile)
        sh(cmd)

    pf = fastafile + ".P{0}.uclust".format(opts.pctid)
    clstrfile = pf + ".clstr"
    centroidsfastafile = pf + ".centroids.fasta"
    if need_update(sortedfastafile, centroidsfastafile):
        cmd = "usearch -cluster_smallmem {0}".format(sortedfastafile)
        cmd += " -id {0}".format(identity)
        cmd += " -uc {0} -centroids {1}".format(clstrfile, centroidsfastafile)
        sh(cmd)
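
Every example on this page gates a pipeline step behind `need_update(inputs, outputs)` and executes the command with `sh(cmd)`, both imported from `jcvi.apps.base`. The jcvi implementations are not shown here; below is only a minimal sketch of the semantics the examples rely on (make-style mtime comparison, where either argument may be a filename or a list/tuple of filenames). Treat it as an assumption, not jcvi's actual code.

# Minimal sketch of the two helpers assumed by every example on this page.
# NOT the jcvi implementation -- only the semantics the examples rely on.
import os.path as op
import subprocess


def need_update(inputs, outputs):
    """Return True if any output is missing, or the newest input is more
    recent than the oldest output (i.e. the step must be rerun)."""
    inputs = [inputs] if isinstance(inputs, str) else list(inputs)
    outputs = [outputs] if isinstance(outputs, str) else list(outputs)
    if any(not op.exists(x) for x in outputs):
        return True
    inputs = [x for x in inputs if op.exists(x)]
    if not inputs:
        return False  # nothing newer to compare against
    return (max(op.getmtime(x) for x in inputs) >
            min(op.getmtime(x) for x in outputs))


def sh(cmd, outfile=None, append=False):
    """Run cmd through the shell, optionally redirecting stdout to a file."""
    if outfile:
        cmd += (" >> " if append else " > ") + outfile
    return subprocess.call(cmd, shell=True)
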
Example #2
File: sam.py Project: rrane/jcvi
def count(args):
    """
    %prog count bamfile gtf

    Count the number of reads mapped using `htseq-count`.
    """
    p = OptionParser(count.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bamfile, gtf = args
    pf = bamfile.split(".")[0]
    countfile = pf + ".count"
    nsorted = pf + "_nsorted"
    nsortedbam, nsortedsam = nsorted + ".bam", nsorted + ".sam"
    if need_update(bamfile, nsortedsam):
        cmd = "samtools sort -n {0} {1}".format(bamfile, nsorted)
        sh(cmd)
        cmd = "samtools view -h {0}".format(nsortedbam)
        sh(cmd, outfile=nsortedsam)

    if need_update(nsortedsam, countfile):
        cmd = "htseq-count --stranded=no --minaqual=10"
        cmd += " {0} {1}".format(nsortedsam, gtf)
        sh(cmd, outfile=countfile)
Example #3
File: tgbs.py Project: fw1121/jcvi
def snp(args):
    """
    %prog snp input.gsnap

    Run SNP calling on GSNAP output after apps.gsnap.align().
    """
    p = OptionParser(snp.__doc__)
    p.set_home("eddyyeh")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    gsnapfile, = args
    EYHOME = opts.eddyyeh_home
    pf = gsnapfile.rsplit(".", 1)[0]
    nativefile = pf + ".native"
    if need_update(gsnapfile, nativefile):
        cmd = op.join(EYHOME, "convert2native.pl")
        cmd += " --gsnap {0} -o {1}".format(gsnapfile, nativefile)
        cmd += " -proc {0}".format(opts.cpus)
        sh(cmd)

    snpfile = pf + ".snp"
    if need_update(nativefile, snpfile):
        cmd = op.join(EYHOME, "SNPs/SNP_Discovery-short.pl")
        cmd += " --native {0} -o {1}".format(nativefile, snpfile)
        cmd += " -a 2 -ac 0.3 -c 0.8"
        sh(cmd)
Example #4
def correct_frag(datadir, tag, origfastb, nthreads,
                 dedup=False, haploidify=False, suffix=False):
    filt = datadir + "/{0}_filt".format(tag)
    filtfastb = filt + ".fastb"
    run_RemoveDodgyReads(infile=origfastb, outfile=filtfastb,
                         removeDuplicates=dedup, rc=False, nthreads=nthreads)

    filtpairs = filt + ".pairs"
    edit = datadir + "/{0}_edit".format(tag)
    editpairs = edit + ".pairs"
    if need_update(filtpairs, editpairs):
        cmd = "ln -sf {0} {1}.pairs".format(op.basename(filtpairs), edit)
        sh(cmd)

    editfastb = edit + ".fastb"
    if need_update(filtfastb, editfastb):
        cmd = "FindErrors HEAD_IN={0} HEAD_OUT={1}".format(filt, edit)
        cmd += " PLOIDY_FILE=data/ploidy"
        cmd += nthreads
        sh(cmd)

    corr = datadir + "/{0}_corr".format(tag)
    corrfastb = corr + ".fastb"
    if need_update(editfastb, corrfastb):
        cmd = "CleanCorrectedReads DELETE=True"
        cmd += " HEAD_IN={0} HEAD_OUT={1}".format(edit, corr)
        cmd += " PLOIDY_FILE={0}/ploidy".format(datadir)
        if haploidify:
            cmd += " HAPLOIDIFY=True"
        cmd += nthreads
        sh(cmd)

    export_fastq(datadir, corrfastb, suffix=suffix)
Example #5
File: bed.py Project: bennyyu/jcvi
def index(args):
    """
    %prog index bedfile

    Sort, compress and index the bedfile using `tabix`.
    """
    p = OptionParser(index.__doc__)
    p.add_option("--query", help="Chromosome location [default: %default]")
    set_outfile(p)

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bedfile, = args
    gzfile = bedfile + ".gz"

    if need_update(bedfile, gzfile):
        bedfile = sort([bedfile])
        cmd = "bgzip -c {0}".format(bedfile)
        sh(cmd, outfile=gzfile)

    tbifile = gzfile + ".tbi"

    if need_update(gzfile, tbifile):
        cmd = "tabix -p bed {0}".format(gzfile)
        sh(cmd)

    query = opts.query
    if not query:
        return

    cmd = "tabix {0} {1}".format(gzfile, query)
    sh(cmd, outfile=opts.outfile)
Example #6
File: sam.py Project: arvin580/jcvi
def count(args):
    """
    %prog count bamfile gtf

    Count the number of reads mapped using `htseq-count`.
    """
    p = OptionParser(count.__doc__)
    p.add_option("--type", default="exon",
                 help="Only count feature type")
    p.set_cpus(cpus=8)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bamfile, gtf = args
    cpus = opts.cpus
    pf = bamfile.split(".")[0]
    countfile = pf + ".count"
    if not need_update(bamfile, countfile):
        return

    nsorted = pf + "_nsorted"
    nsortedbam, nsortedsam = nsorted + ".bam", nsorted + ".sam"
    if need_update(bamfile, nsortedsam):
        cmd = "samtools sort -@ {0} -n {1} {2}".format(cpus, bamfile, nsorted)
        sh(cmd)
        cmd = "samtools view -@ {0} -h {1}".format(cpus, nsortedbam)
        sh(cmd, outfile=nsortedsam)

    if need_update(nsortedsam, countfile):
        cmd = "htseq-count --stranded=no --minaqual=10"
        cmd += " -t {0}".format(opts.type)
        cmd += " {0} {1}".format(nsortedsam, gtf)
        sh(cmd, outfile=countfile)
Example #7
def jellyfish(args):
    """
    %prog jellyfish [*.fastq|*.fasta]

    Run jellyfish to dump histogram to be used in kmer.histogram().
    """
    from jcvi.apps.base import getfilesize
    from jcvi.utils.cbook import human_size
    p = OptionParser(jellyfish.__doc__)
    p.add_option("-K", default=23, type="int",
                 help="K-mer size [default: %default]")
    p.add_option("--coverage", default=40, type="int",
                 help="Expected sequence coverage [default: %default]")
    p.add_option("--prefix", default="jf",
                 help="Database prefix [default: %default]")
    p.add_option("--nohist", default=False, action="store_true",
                 help="Do not print histogram [default: %default]")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    fastqfiles = args
    K = opts.K
    coverage = opts.coverage

    totalfilesize = sum(getfilesize(x) for x in fastqfiles)
    fq = fastqfiles[0]
    pf = opts.prefix
    gzip = fq.endswith(".gz")

    hashsize = totalfilesize / coverage
    logging.debug("Total file size: {0}, hashsize (-s): {1}".\
                    format(human_size(totalfilesize,
                           a_kilobyte_is_1024_bytes=True), hashsize))

    jfpf = "{0}-K{1}".format(pf, K)
    jfdb = jfpf
    fastqfiles = " ".join(fastqfiles)

    cmd = "jellyfish count -t {0} -C -o {1}".format(opts.cpus, jfpf)
    cmd += " -s {0} -m {1}".format(hashsize, K)
    if gzip:
        cmd = "gzip -dc {0} | ".format(fastqfiles) + cmd + " /dev/fd/0"
    else:
        cmd += " " + fastqfiles

    if need_update(fastqfiles, jfdb):
        sh(cmd)

    if opts.nohist:
        return

    jfhisto = jfpf + ".histogram"
    cmd = "jellyfish histo -t 64 {0} -o {1}".format(jfdb, jfhisto)

    if need_update(jfdb, jfhisto):
        sh(cmd)
Example #8
File: sam.py Project: arvin580/jcvi
def index(args):
    """
    %prog index samfile/bamfile

    If SAM file, convert to BAM, sort and then index, using SAMTOOLS
    """
    p = OptionParser(index.__doc__)
    p.add_option("--fasta", dest="fasta", default=None,
            help="add @SQ header to the BAM file [default: %default]")
    p.add_option("--unique", default=False, action="store_true",
            help="only retain uniquely mapped reads [default: %default]")
    p.set_cpus(cpus=8)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    samfile, = args
    cpus = opts.cpus
    fastafile = opts.fasta
    if fastafile:
        assert op.exists(fastafile)

    bamfile = samfile.replace(".sam", ".bam")
    if fastafile:
        faifile = fastafile + ".fai"
        if need_update(fastafile, faifile):
            sh("samtools faidx {0}".format(fastafile))
        cmd = "samtools view -bt {0} {1} -F 4 -o {2}".\
                format(faifile, samfile, bamfile)
    else:
        cmd = "samtools view -bS {0} -F 4 -o {1}".\
                format(samfile, bamfile)

    cmd += " -@ {0}".format(cpus)
    if opts.unique:
        cmd += " -q 1"

    if samfile.endswith(".sam") and need_update(samfile, bamfile):
        sh(cmd)

    # Already sorted? (assign prefix up front so it is always bound)
    prefix = bamfile.replace(".bam", "")
    if bamfile.endswith(".sorted.bam"):
        sortedbamfile = bamfile
    else:
        sortedbamfile = prefix + ".sorted.bam"

    if need_update(bamfile, sortedbamfile):
        cmd = "samtools sort {0} {1}.sorted".format(bamfile, prefix)
        cmd += " -@ {0}".format(cpus)
        sh(cmd)

    baifile = sortedbamfile + ".bai"
    if need_update(sortedbamfile, baifile):
        sh("samtools index {0}".format(sortedbamfile))

    return sortedbamfile
Example #9
def dedup(args):
    """
    %prog dedup scaffolds.fasta

    Remove redundant contigs with CD-HIT. This is run prior to
    assembly.sspace.embed().
    """
    from jcvi.formats.fasta import gaps
    from jcvi.apps.cdhit import deduplicate, ids

    p = OptionParser(dedup.__doc__)
    p.set_align(pctid=GoodPct)
    p.set_mingap(default=10)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    scaffolds, = args
    mingap = opts.mingap
    splitfile, oagpfile, cagpfile = gaps([scaffolds, "--split", "--mingap={0}".format(mingap)])

    dd = splitfile + ".cdhit"
    clstrfile = dd + ".clstr"
    idsfile = dd + ".ids"
    if need_update(splitfile, clstrfile):
        deduplicate([splitfile, "--pctid={0}".format(opts.pctid)])
    if need_update(clstrfile, idsfile):
        ids([clstrfile])

    agp = AGP(cagpfile)
    reps = set(x.split()[-1] for x in open(idsfile))
    pf = scaffolds.rsplit(".", 1)[0]
    dedupagp = pf + ".dedup.agp"
    fw = open(dedupagp, "w")

    ndropped = ndroppedbases = 0
    for a in agp:
        if not a.is_gap and a.component_id not in reps:
            span = a.component_span
            logging.debug("Drop component {0} ({1})".\
                          format(a.component_id, span))
            ndropped += 1
            ndroppedbases += span
            continue
        print >> fw, a
    fw.close()

    logging.debug("Dropped components: {0}, Dropped bases: {1}".\
                  format(ndropped, ndroppedbases))
    logging.debug("Deduplicated file written to `{0}`.".format(dedupagp))

    tidyagp = tidy([dedupagp, splitfile])
    dedupfasta = pf + ".dedup.fasta"
    build([tidyagp, dd, dedupfasta])

    return dedupfasta
Example #10
File: chain.py Project: rrane/jcvi
def frompsl(args):
    """
    %prog frompsl old.new.psl old.fasta new.fasta

    Generate chain file from psl file. The pipeline is described in:
    <http://genomewiki.ucsc.edu/index.php/Minimal_Steps_For_LiftOver>
    """
    from jcvi.formats.sizes import Sizes

    p = OptionParser(frompsl.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    pslfile, oldfasta, newfasta = args
    pf = oldfasta.split(".")[0]

    # Use liftUp to change the coordinate system. Requires .lft files
    # This step is skipped as the output psl is the same as input?

    # Chain together alignments from using axtChain
    chainfile = pf + ".chain"
    twobitfiles = []
    for fastafile in (oldfasta, newfasta):
        tbfile = faToTwoBit(fastafile)
        twobitfiles.append(tbfile)
    oldtwobit, newtwobit = twobitfiles

    if need_update(pslfile, chainfile):
        cmd = "axtChain -linearGap=medium -psl {0}".format(pslfile)
        cmd += " {0} {1} {2}".format(oldtwobit, newtwobit, chainfile)
        sh(cmd)

    # Sort chain files
    sortedchain = chainfile.rsplit(".", 1)[0] + ".sorted.chain"
    if need_update(chainfile, sortedchain):
        cmd = "chainSort {0} {1}".format(chainfile, sortedchain)
        sh(cmd)

    # Make alignment nets from chains
    netfile = pf + ".net"
    oldsizes = Sizes(oldfasta).filename
    newsizes = Sizes(newfasta).filename
    if need_update((sortedchain, oldsizes, newsizes), netfile):
        cmd = "chainNet {0} {1} {2}".format(sortedchain, oldsizes, newsizes)
        cmd += " {0} /dev/null".format(netfile)
        sh(cmd)

    # Create liftOver chain file
    liftoverfile = pf + ".liftover.chain"
    if need_update((netfile, sortedchain), liftoverfile):
        cmd = "netChainSubset {0} {1} {2}".\
                format(netfile, sortedchain, liftoverfile)
        sh(cmd)
Example #11
def deduplicate(args):
    """
    %prog deduplicate fastafile

    Wraps `cd-hit-est` to remove duplicate sequences.
    """
    p = OptionParser(deduplicate.__doc__)
    p.set_align(pctid=96, pctcov=0)
    p.add_option("--fast", default=False, action="store_true",
                 help="Place sequence in the first cluster")
    p.add_option("--consensus", default=False, action="store_true",
                 help="Compute consensus sequences")
    p.add_option("--reads", default=False, action="store_true",
                 help="Use `cd-hit-454` to deduplicate [default: %default]")
    p.add_option("--samestrand", default=False, action="store_true",
                 help="Enforce same strand alignment")
    p.set_home("cdhit")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    identity = opts.pctid / 100.
    fastafile, qualfile = fasta([fastafile, "--seqtk"])

    ocmd = "cd-hit-454" if opts.reads else "cd-hit-est"
    cmd = op.join(opts.cdhit_home, ocmd)
    cmd += " -c {0}".format(identity)
    if ocmd == "cd-hit-est":
        cmd += " -d 0"  # include complete defline
        if opts.samestrand:
            cmd += " -r 0"
    if not opts.fast:
        cmd += " -g 1"
    if opts.pctcov != 0:
        cmd += " -aL {0} -aS {0}".format(opts.pctcov / 100.)

    dd = fastafile + ".P{0}.cdhit".format(opts.pctid)
    clstr = dd + ".clstr"

    cmd += " -M 0 -T {0} -i {1} -o {2}".format(opts.cpus, fastafile, dd)
    if need_update(fastafile, (dd, clstr)):
        sh(cmd)

    if opts.consensus:
        cons = dd + ".consensus"
        cmd = op.join(opts.cdhit_home, "cdhit-cluster-consensus")
        cmd += " clustfile={0} fastafile={1} output={2} maxlen=1".\
                    format(clstr, fastafile, cons)
        if need_update((clstr, fastafile), cons):
            sh(cmd)

    return dd
Example #12
def pasa(args):
    """
    %prog ${pasadb}.assemblies.fasta ${pasadb}.pasa_assemblies.gff3

    Wraps `pasa_asmbls_to_training_set.dbi`.
    """
    from jcvi.formats.base import SetFile
    from jcvi.formats.gff import Gff

    p = OptionParser(pasa.__doc__)
    p.set_home("pasa")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, gffile = args
    transcodergff = fastafile + ".transdecoder.gff3"
    transcodergenomegff = fastafile + ".transdecoder.genome.gff3"
    if need_update((fastafile, gffile), (transcodergff, transcodergenomegff)):
        cmd = "{0}/scripts/pasa_asmbls_to_training_set.dbi".format(opts.pasa_home)
        cmd += " --pasa_transcripts_fasta {0} --pasa_transcripts_gff3 {1}".\
                format(fastafile, gffile)
        sh(cmd)

    completeids = fastafile.rsplit(".", 1)[0] + ".complete.ids"
    if need_update(transcodergff, completeids):
        cmd = "grep complete {0} | cut -f1 | sort -u".format(transcodergff)
        sh(cmd, outfile=completeids)

    complete = SetFile(completeids)
    seen = set()
    completegff = transcodergenomegff.rsplit(".", 1)[0] + ".complete.gff3"
    fw = open(completegff, "w")
    gff = Gff(transcodergenomegff)
    for g in gff:
        a = g.attributes
        if "Parent" in a:
            id = a["Parent"][0]
        else:
            id = a["ID"][0]
        asmbl_id = id.split("|")[0]
        if asmbl_id not in complete:
            continue
        print >> fw, g
        if g.type == "gene":
            seen.add(id)

    fw.close()
    logging.debug("A total of {0} complete models extracted to `{1}`.".\
                    format(len(seen), completegff))
Example #13
def data(args):
    """
    %prog data data.bin samples.ids STR.ids meta.tsv

    Make data.tsv based on meta.tsv.
    """
    p = OptionParser(data.__doc__)
    p.add_option("--notsv", default=False, action="store_true",
                 help="Do not write data.tsv")
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    databin, sampleids, strids, metafile = args
    final_columns, percentiles = read_meta(metafile)
    df, m, samples, loci = read_binfile(databin, sampleids, strids)

    # Clean the data
    m %= 1000  # Get the larger of the two alleles
    m[m == 999] = -1  # Missing data

    final = set(final_columns)
    remove = [locus for locus in loci if locus not in final]

    pf = "STRs_{}_SEARCH".format(timestamp())
    filteredstrids = "{}.STR.ids".format(pf)
    fw = open(filteredstrids, "w")
    print >> fw, "\n".join(final_columns)
    fw.close()
    logging.debug("Dropped {} columns; Retained {} columns (`{}`)".\
                    format(len(remove), len(final_columns), filteredstrids))

    # Remove low-quality columns!
    df.drop(remove, inplace=True, axis=1)
    df.columns = final_columns

    filtered_bin = "{}.data.bin".format(pf)
    if need_update(databin, filtered_bin):
        m = df.as_matrix()
        m.tofile(filtered_bin)
        logging.debug("Filtered binary matrix written to `{}`".format(filtered_bin))

    # Write data output
    filtered_tsv = "{}.data.tsv".format(pf)
    if not opts.notsv and need_update(databin, filtered_tsv):
        df.to_csv(filtered_tsv, sep="\t", index_label="SampleKey")
Example #14
def cluster(args):
    """
    %prog cluster prefix fastqfiles

    Use `vsearch` to remove duplicate reads. This routine is heavily influenced
    by PyRAD: <https://github.com/dereneaton/pyrad>.
    """
    p = OptionParser(cluster.__doc__)
    add_consensus_options(p)
    p.set_align(pctid=95)
    p.set_outdir()
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    prefix = args[0]
    fastqfiles = args[1:]
    cpus = opts.cpus
    pctid = opts.pctid
    mindepth = opts.mindepth
    minlength = opts.minlength
    fastafile, qualfile = fasta(fastqfiles + ["--seqtk",
                                "--outdir={0}".format(opts.outdir),
                                "--outfile={0}".format(prefix + ".fasta")])

    prefix = op.join(opts.outdir, prefix)
    pf = prefix + ".P{0}".format(pctid)
    derepfile = prefix + ".derep"
    if need_update(fastafile, derepfile):
        derep(fastafile, derepfile, minlength, cpus)

    userfile = pf + ".u"
    notmatchedfile = pf + ".notmatched"
    if need_update(derepfile, userfile):
        cluster_smallmem(derepfile, userfile, notmatchedfile,
                         minlength, pctid, cpus)

    clustfile = pf + ".clust"
    if need_update((derepfile, userfile, notmatchedfile), clustfile):
        makeclust(derepfile, userfile, notmatchedfile, clustfile,
                  mindepth=mindepth)

    clustSfile = pf + ".clustS"
    if need_update(clustfile, clustSfile):
        parallel_musclewrap(clustfile, cpus)

    statsfile = pf + ".stats"
    if need_update(clustSfile, statsfile):
        makestats(clustSfile, statsfile, mindepth=mindepth)
Example #15
def mcluster(args):
    """
    %prog mcluster *.consensus

    Cluster across samples using consensus sequences.
    """
    p = OptionParser(mcluster.__doc__)
    add_consensus_options(p)
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    consensusfiles = args
    minlength = opts.minlength
    cpus = opts.cpus
    pf = opts.prefix
    pctid = find_pctid(consensusfiles)

    pf += ".P{0}".format(pctid)
    consensusfile = pf + ".consensus.fasta"
    if need_update(consensusfiles, consensusfile):
        fw_cons = must_open(consensusfile, "w")
        totalseqs = 0
        for cf in consensusfiles:
            nseqs = 0
            s = op.basename(cf).split(".")[0]
            for name, seq in parse_fasta(cf):
                name = '.'.join((s, name))
                print >> fw_cons, ">{0}\n{1}".format(name, seq)
                nseqs += 1
            logging.debug("Read `{0}`: {1} seqs".format(cf, nseqs))
            totalseqs += nseqs
        logging.debug("Total: {0} seqs".format(totalseqs))
        fw_cons.close()

    userfile = pf + ".u"
    notmatchedfile = pf + ".notmatched"
    if need_update(consensusfile, userfile):
        cluster_smallmem(consensusfile, userfile, notmatchedfile,
                         minlength, pctid, cpus)

    clustfile = pf + ".clust"
    if need_update((consensusfile, userfile, notmatchedfile), clustfile):
        makeclust(consensusfile, userfile, notmatchedfile, clustfile)

    clustSfile = pf + ".clustS"
    if need_update(clustfile, clustSfile):
        parallel_musclewrap(clustfile, cpus, minsamp=opts.minsamp)
Example #16
File: tgbs.py Project: fw1121/jcvi
def bam(args):
    """
    %prog bam input.gsnap ref.fasta

    Convert GSNAP output to BAM.
    """
    from jcvi.formats.sizes import Sizes
    from jcvi.formats.sam import index

    p = OptionParser(bam.__doc__)
    p.set_home("eddyyeh")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    gsnapfile, fastafile = args
    EYHOME = opts.eddyyeh_home
    pf = gsnapfile.rsplit(".", 1)[0]
    uniqsam = pf + ".unique.sam"
    if need_update((gsnapfile, fastafile), uniqsam):
        cmd = op.join(EYHOME, "gsnap2gff3.pl")
        sizesfile = Sizes(fastafile).filename
        cmd += " --format sam -i {0} -o {1}".format(gsnapfile, uniqsam)
        cmd += " -u -l {0} -p {1}".format(sizesfile, opts.cpus)
        sh(cmd)

    index([uniqsam])
Example #17
def geneinfo(bed, order, genomeidx, ploidy):
    bedfile = bed.filename
    p = bedfile.split(".")[0]
    idx = genomeidx[p]
    pd = ploidy[p]
    infofile = p + ".info"

    if not need_update(bedfile, infofile):
        return infofile

    fwinfo = open(infofile, "w")

    for s in bed:
        chr = "".join(x for x in s.seqid if x in string.digits)
        try:
            chr = int(chr)
        except ValueError:
            chr = "0"

        print >> fwinfo, "\t".join(str(x) for x in \
                    (s.accn, chr, s.start, s.end, s.strand, idx, pd))
    fwinfo.close()

    logging.debug("Update info file `{0}`.".format(infofile))

    return infofile
Example #18
File: evm.py Project: radaniba/jcvi
def pasa(args):
    """
    %prog pasa pasa_db fastafile

    Run EVM in TIGR-only mode.
    """
    p = OptionParser(pasa.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    pasa_db, fastafile = args

    termexons = "pasa.terminal_exons.gff3"
    if need_update(fastafile, termexons):
        cmd = "$ANNOT_DEVEL/PASA2/scripts/pasa_asmbls_to_training_set.dbi"
        cmd += ' -M "{0}:mysql.tigr.org" -p "access:access"'.format(pasa_db)
        cmd += ' -g {0}'.format(fastafile)
        sh(cmd)

        cmd = "$EVM/PasaUtils/retrieve_terminal_CDS_exons.pl"
        cmd += " trainingSetCandidates.fasta trainingSetCandidates.gff"
        sh(cmd, outfile=termexons)

    return termexons
Example #19
File: ca.py Project: arvin580/jcvi
def parse_ctgs(bestedges, frgtoctg):
    cache = "frgtoctg.cache"
    if need_update(frgtoctg, cache):
        reads_to_ctgs = {}
        frgtodeg = frgtoctg.replace(".frgctg", ".frgdeg")
        iidtouid = frgtoctg.replace(".posmap.frgctg", ".iidtouid")
        fp = open(iidtouid)
        frgstore = {}
        for row in fp:
            tag, iid, uid = row.split()
            if tag == "FRG":
                frgstore[uid] = int(iid)

        for pf, f in zip(("ctg", "deg"), (frgtoctg, frgtodeg)):
            fp = open(f)
            logging.debug("Parse posmap file `{0}`".format(f))
            for row in fp:
                frg, ctg = row.split()[:2]
                frg = frgstore[frg]
                reads_to_ctgs[frg] = pf + ctg
            logging.debug("Loaded mapping: {0}".format(len(reads_to_ctgs)))

        fw = open(cache, "w")
        cPickle.dump(reads_to_ctgs, fw)
        fw.close()
        logging.debug("Contig mapping written to `{0}`".format(cache))

    reads_to_ctgs = cPickle.load(open(cache))
    logging.debug("Contig mapping loaded from `{0}`".format(cache))
    return reads_to_ctgs
Example #20
def uniq(args):
    """
    %prog uniq bedfile > newbedfile

    Remove overlapping features with higher scores.
    """
    p = OptionParser(uniq.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bedfile, = args
    uniqbedfile = bedfile.split(".")[0] + ".uniq.bed"
    bed = Bed(bedfile)
    if not need_update(bedfile, uniqbedfile):
        return uniqbedfile

    ranges = [Range(x.seqid, x.start, x.end, float(x.score), i) \
                for i, x in enumerate(bed)]
    selected, score = range_chain(ranges)
    selected = [bed[x.id] for x in selected]

    newbed = Bed()
    newbed.extend(selected)
    newbed.print_to_file(uniqbedfile, sorted=True)
    logging.debug("Imported: {0}, Exported: {1}".format(len(bed), len(newbed)))

    return uniqbedfile
Example #21
File: bwa.py Project: rrane/jcvi
def sampe(args, opts):
    """
    %prog sampe database.fasta read1.fq read2.fq

    Wrapper for `bwa sampe`. Output will be read1.sam.
    """
    dbfile, read1file, read2file = args
    safile = check_index(dbfile)
    sai1file = check_aln(dbfile, read1file, cpus=opts.cpus)
    sai2file = check_aln(dbfile, read2file, cpus=opts.cpus)

    samfile, _, unmapped = get_samfile(read1file, dbfile,
                                       bam=opts.bam, unmapped=opts.unmapped)
    if not need_update((safile, sai1file, sai2file), samfile):
        logging.error("`{0}` exists. `bwa samse` already run.".format(samfile))
        return "", samfile

    cmd = "bwa sampe " + " ".join((dbfile, sai1file, sai2file, \
                                   read1file, read2file))
    cmd += " " + opts.extra
    if opts.cutoff:
        cmd += " -a {0}".format(opts.cutoff)
    if opts.uniq:
        cmd += " -n 1"

    return cmd, samfile
Example #22
def pairs(args):
    """
    See __doc__ for set_options_pairs().
    """
    from jcvi.formats.blast import report_pairs, set_options_pairs
    p = set_options_pairs()

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bedfile, = args

    basename = bedfile.split(".")[0]
    insertsfile = ".".join((basename, "inserts"))

    sortedbedfile = op.basename(bedfile).rsplit(".", 1)[0] + ".sorted.bed"
    if need_update(bedfile, sortedbedfile):
        bedfile = sort([bedfile, "--accn"])
    else:
        bedfile = sortedbedfile

    fp = open(bedfile)
    data = [BedLine(row) for i, row in enumerate(fp) if i < opts.nrows]

    ascii = not opts.pdf
    return bedfile, report_pairs(data, opts.cutoff, opts.mateorientation,
           pairsfile=opts.pairsfile, insertsfile=insertsfile,
           rclip=opts.rclip, ascii=ascii, bins=opts.bins,
           distmode=opts.distmode)
Example #23
def first(args):
    """
    %prog first N fastqfile(s)

    Get first N reads from file.
    """
    from jcvi.apps.base import need_update

    p = OptionParser(first.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    N = int(args[0])
    nlines = N * 4
    fastqfiles = args[1:]
    fastqfile = fastqfiles[0]
    outfile = opts.outfile
    if not need_update(fastqfiles, outfile):
        logging.debug("File `{0}` exists. Will not overwrite.".format(outfile))
        return

    gz = fastqfile.endswith(".gz")
    for fastqfile in fastqfiles:
        if gz:
            cmd = "zcat {0} | head -n {1}".format(fastqfile, nlines)
        else:
            cmd = "head -n {0} {1}".format(nlines, fastqfile)

        sh(cmd, outfile=opts.outfile, append=True)
Example #24
File: bed.py Project: yangjl/jcvi
def mergeBed(bedfile, d=0, nms=False, s=False, scores=None):
    sort([bedfile, "-i"])
    cmd = "mergeBed -i {0}".format(bedfile)
    if d:
        cmd += " -d {0}".format(d)
    if nms:
        nargs = len(open(bedfile).readline().split())
        if nargs <= 3:
            logging.debug("Only {0} columns detected... set nms=True"\
                            .format(nargs))
        else:
            cmd += " -c 4 -o collapse"
    if s:
        cmd += " -s"
    if scores:
        valid_opts = ("sum", "min", "max", "mean", "median",
                "mode", "antimode", "collapse")
        if not scores in valid_opts:
            scores = "mean"
        cmd += " -scores {0}".format(scores)

    mergebedfile = op.basename(bedfile).rsplit(".", 1)[0] + ".merge.bed"

    if need_update(bedfile, mergebedfile):
        sh(cmd, outfile=mergebedfile)
    return mergebedfile
Example #25
def alignextend(args):
    """
    %prog alignextend ref.fasta read.1.fastq read.2.fastq

    Wrapper around AMOS alignextend.
    """
    p = OptionParser(alignextend.__doc__)
    p.add_option("--nosuffix", default=False, action="store_true",
                 help="Do not add /1/2 suffix to the read [default: %default]")
    p.set_home("amos")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    ref, r1, r2 = args
    pf = op.basename(r1).split(".")[0]
    cmd = op.join(opts.amos_home, "src/Experimental/alignextend.pl")
    if not opts.nosuffix:
        cmd += " -suffix"
    bwa_idx = "{0}.ref.fa.sa".format(pf)
    if not need_update(ref, bwa_idx):
        cmd += " -noindex"
    cmd += " -threads {0}".format(opts.cpus)
    offset = guessoffset([r1])
    if offset == 64:
        cmd += " -I"
    cmd += " ".join(("", pf, ref, r1, r2))
    sh(cmd)
Example #26
def dust(args):
    """
    %prog dust assembly.fasta

    Remove low-complexity contigs within assembly.
    """
    p = OptionParser(dust.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    dustfastafile = fastafile.rsplit(".", 1)[0] + ".dust.fasta"
    if need_update(fastafile, dustfastafile):
        cmd = "dustmasker -in {0}".format(fastafile)
        cmd += " -out {1} -outfmt fasta".format(dustfastafile)
        sh(cmd)

    for name, seq in parse_fasta(dustfastafile):
        nlow = sum(1 for x in seq if x in "acgtN")
        pctlow = nlow * 100. / len(seq)
        if pctlow < 98:
            continue
        #print "{0}\t{1:.1f}".format(name, pctlow)
        print name
Example #27
File: sam.py Project: arvin580/jcvi
def fpkm(args):
    """
    %prog fpkm fastafile *.bam

    Calculate FPKM values from BAM file.
    """
    p = OptionParser(fpkm.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    fastafile = args[0]
    bamfiles = args[1:]
    # Create a DUMMY gff file for cuffdiff
    gffile = fastafile.rsplit(".", 1)[0] + ".gff"
    if need_update(fastafile, gffile):
        fw = open(gffile, "w")
        f = Fasta(fastafile, lazy=True)
        for key, size in f.itersizes_ordered():
            print >> fw, "\t".join(str(x) for x in (key, "dummy", "transcript",\
                1, size, ".", ".", ".", "ID=" + key))
        fw.close()
        logging.debug("Dummy GFF created: {0}".format(gffile))

    cmd = "cuffdiff {0} {1}".format(gffile, " ".join(bamfiles))
    sh(cmd)
Example #28
def correct_pairs(p, pf, tag):
    """
    Take one pair of reads and correct to generate *.corr.fastq.
    """
    from jcvi.assembly.preprocess import correct as cr

    logging.debug("Work on {0} ({1})".format(pf, ','.join(p)))
    itag = tag[0]
    cm = ".".join((pf, itag))
    targets = (cm + ".1.corr.fastq", cm + ".2.corr.fastq", \
                pf + ".PE-0.corr.fastq")
    if not need_update(p, targets):
        logging.debug("Corrected reads found: {0}. Skipped.".format(targets))
        return

    slink(p, pf, tag)

    cwd = os.getcwd()
    os.chdir(pf)
    cr(sorted(glob("*.fastq") + glob("*.fastq.gz")) + ["--nofragsdedup"])
    sh("mv {0}.1.corr.fastq ../{1}".format(itag, targets[0]))
    sh("mv {0}.2.corr.fastq ../{1}".format(itag, targets[1]))
    sh("mv frag_reads_corr.corr.fastq ../{0}".format(targets[2]))

    logging.debug("Correction finished: {0}".format(targets))
    os.chdir(cwd)
Example #29
def prepare(args):
    """
    %prog prepare jira.txt

    Parse JIRA report and prepare input. Look for all FASTQ files in the report
    and get the prefix. Assign fastq to a folder and a new file name indicating
    the library type (e.g. PE-500, MP-5000, etc.).

    Note that JIRA report can also be a list of FASTQ files.
    """
    p = OptionParser(prepare.__doc__)
    p.add_option("--first", default=0, type="int", help="Use only first N reads [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    jfile, = args
    metafile = jfile + ".meta"

    if need_update(jfile, metafile):
        fp = open(jfile)
        fastqfiles = [x.strip() for x in fp if ".fastq" in x]
        metas = [Meta(x) for x in fastqfiles]

        fw = open(metafile, "w")
        print >> fw, "\n".join(str(x) for x in metas)
        print >> sys.stderr, "Now modify `{0}`, and restart this script.".format(metafile)
        print >> sys.stderr, "Each line is : genome library fastqfile"
        fw.close()
        return

    mf = MetaFile(metafile)
    for m in mf:
        m.make_link(firstN=opts.first)
Example #30
File: snp.py Project: Hensonmw/jcvi
def rmdup(args):
    """
    %prog rmdup *.bam > rmdup.cmds

    Remove PCR duplicates from BAM files, generate a list of commands.
    """
    p = OptionParser(rmdup.__doc__)
    p.add_option("-S", default=False, action="store_true",
                 help="Treat PE reads as SE in rmdup")
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    bams = args
    cmd = "samtools rmdup"
    if opts.S:
        cmd += " -S"
    for b in bams:
        if "rmdup" in b:
            continue
        rb = b.rsplit(".", 1)[0] + ".rmdup.bam"
        if not need_update(b, rb):
            continue
        print " ".join((cmd, b, rb))
Example #31
    def split(self, N, force=False):
        """
        There are two modes of splitting the records
        - batch: splitting is sequentially to records/N chunks
        - cycle: placing each record in the splitted files and cycles

        use `cycle` if the len of the record is not evenly distributed
        """
        mode = self.mode
        assert mode in ("batch", "cycle", "optimal")
        logging.debug("set split mode=%s" % mode)

        self.names = self.__class__.get_names(self.filename, N)
        if self.outputdir:
            self.names = [op.join(self.outputdir, x) for x in self.names]

        if not need_update(self.filename, self.names) and not force:
            logging.error("file %s already existed, skip file splitting" %
                          self.names[0])
            return

        filehandles = [open(x, "w") for x in self.names]

        if mode == "batch":
            for batch, fw in zip(self._batch_iterator(N), filehandles):
                count = self.write(fw, batch)
                logging.debug("write %d records to %s" % (count, fw.name))

        elif mode == "cycle":
            handle = self._open(self.filename)
            for record, fw in zip(handle, cycle(filehandles)):
                count = self.write(fw, [record])

        elif mode == "optimal":
            """
            This mode is based on Longest Processing Time (LPT) algorithm:

            A simple, often-used algorithm is the LPT algorithm (Longest
            Processing Time) which sorts the jobs by its processing time and
            then assigns them to the machine with the earliest end time so far.
            This algorithm achieves an upper bound of 4/3 - 1/(3m) OPT.

            Citation: <http://en.wikipedia.org/wiki/Multiprocessor_scheduling>
            """
            endtime = [0] * N
            handle = self._open(self.filename)
            for record in handle:
                mt, mi = min((x, i) for (i, x) in enumerate(endtime))
                fw = filehandles[mi]
                count = self.write(fw, [record])
                endtime[mi] += len(record)

        for fw in filehandles:
            fw.close()
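
The `optimal` branch above applies the LPT selection rule in streaming fashion: it skips the up-front sort by length that textbook LPT performs and simply sends each record to the file with the earliest end time so far. A tiny self-contained illustration of that assignment rule, using hypothetical record lengths and N = 3 output files:

# Toy illustration of the earliest-end-time rule used by `optimal` mode.
# The record lengths here are hypothetical, purely for demonstration.
def lpt_assign(lengths, N):
    endtime = [0] * N
    bins = [[] for _ in range(N)]
    for ln in lengths:
        # send the next record to the bin that currently finishes earliest
        mt, mi = min((x, i) for (i, x) in enumerate(endtime))
        bins[mi].append(ln)
        endtime[mi] += ln
    return bins, endtime

bins, endtime = lpt_assign([90, 70, 50, 40, 30, 20, 10], 3)
# bins    -> [[90, 20], [70, 30], [50, 40, 10]]
# endtime -> [110, 100, 100]
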
Example #32
    def __init__(self, bedfile, sizesfile):
        from jcvi.apps.command import BDPATH
        from jcvi.formats.bed import sort

        sortedbedfile = bedfile.rsplit(".", 1)[0] + ".sorted.bed"
        if need_update(bedfile, sortedbedfile):
            sort([bedfile])
        bedfile = sortedbedfile

        coveragefile = bedfile + ".coverage"
        if need_update(bedfile, coveragefile):
            cmd = BDPATH("genomeCoverageBed")
            cmd += " -bg -i {0} -g {1}".format(bedfile, sizesfile)
            sh(cmd, outfile=coveragefile)

        self.sizes = Sizes(sizesfile).mapping

        filename = coveragefile
        assert filename.endswith(".coverage")
        super(Coverage, self).__init__(filename)
Example #33
def check_index(dbfile):
    dbfile = get_abs_path(dbfile)
    safile = dbfile + ".1.bt2"
    if need_update(dbfile, safile):
        cmd = "bowtie2-build {0} {0}".format(dbfile)
        sh(cmd)
    else:
        logging.error(
            "`{0}` exists. `bowtie2-build` already run.".format(safile))

    return dbfile
Example #34
def validate(args):
    """
    %prog validate outdir genome.fasta

    Validate current folder after MAKER run and check for failures. Failed batch
    will be written to a directory for additional work.
    """
    p = OptionParser(validate.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    outdir, genome = args
    counter = Counter()

    fsnames, suffix = get_fsnames(outdir)
    dsfile = "{0}{1}/{0}.maker.output/{0}_master_datastore_index.log"
    dslogs = [dsfile.format(x, suffix) for x in fsnames]
    all_failed = []
    for f, d in zip(fsnames, dslogs):
        dslog = DatastoreIndexFile(d)
        counter.update(dslog.scaffold_status.values())
        all_failed.extend([(f, x) for x in dslog.failed])

    cmd = 'tail maker.*.out | grep -c "now finished"'
    n = int(popen(cmd).read())
    assert len(fsnames) == n
    print("ALL jobs have been finished", file=sys.stderr)

    nfailed = len(all_failed)
    if nfailed == 0:
        print("ALL scaffolds are completed with no errors", file=sys.stderr)
        return

    print("Scaffold status:", file=sys.stderr)
    print(counter, file=sys.stderr)
    failed = "FAILED"
    fw = open(failed, "w")
    print("\n".join(["\t".join((f, x)) for f, x in all_failed]), file=fw)
    fw.close()

    nlines = sum(1 for _ in open(failed))
    assert nlines == nfailed
    print("FAILED !! {0} instances.".format(nfailed), file=sys.stderr)

    # Rebuild the failed batch
    failed_ids = failed + ".ids"
    failed_fasta = failed + ".fasta"
    cmd = "cut -f2 {0}".format(failed)
    sh(cmd, outfile=failed_ids)
    if need_update((genome, failed_ids), failed_fasta):
        cmd = "faSomeRecords {} {} {}".format(genome, failed_ids, failed_fasta)
        sh(cmd)
Example #35
File: bed.py Project: radaniba/jcvi
def intersectBed(bedfile1, bedfile2):
    cmd = "intersectBed"
    cmd += " -a {0} -b {1}".format(bedfile1, bedfile2)
    suffix = ".intersect.bed"

    intersectbedfile = ".".join((op.basename(bedfile1).split(".")[0],
                                 op.basename(bedfile2).split(".")[0])) + suffix

    if need_update([bedfile1, bedfile2], intersectbedfile):
        sh(cmd, outfile=intersectbedfile)
    return intersectbedfile
Example #36
def get_bed_file(gff_file, stype, key):

    from jcvi.formats.gff import bed

    opr = stype.replace(",", "") + ".bed"
    bed_opts = ["--type=" + stype, "--key=" + key]
    bed_file = ".".join((gff_file.split(".")[0], opr))

    if need_update(gff_file, bed_file):
        bed([gff_file, "--outfile={0}".format(bed_file)] + bed_opts)

    return bed_file
Example #37
def mergeBed(bedfile, d=0, nms=False):
    cmd = "mergeBed -i {0}".format(bedfile)
    if d:
        cmd += " -d {0}".format(d)
    if nms:
        cmd += " -nms"

    mergebedfile = op.basename(bedfile).rsplit(".", 1)[0] + ".merge.bed"

    if need_update(bedfile, mergebedfile):
        sh(cmd, outfile=mergebedfile)
    return mergebedfile
Example #38
def merylhistogram(merylfile):
    """
    Run meryl to dump a histogram to be used in kmer.histogram(). The
    merylfile is a file ending in .mcidx or .mcdat.
    """
    pf, sf = op.splitext(merylfile)
    outfile = pf + ".histogram"
    if need_update(merylfile, outfile):
        cmd = "meryl -Dh -s {0}".format(pf)
        sh(cmd, outfile=outfile)

    return outfile
Example #39
def coverage(args):
    """
    %prog coverage fastafile bamfile

    Calculate coverage for BAM file. BAM file will be sorted unless with
    --nosort.
    """
    p = OptionParser(coverage.__doc__)
    p.add_option(
        "--format",
        default="bigwig",
        choices=("bedgraph", "bigwig", "coverage"),
        help="Output format",
    )
    p.add_option("--nosort", default=False, action="store_true", help="Do not sort BAM")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, bamfile = args
    format = opts.format
    if opts.nosort:
        logging.debug("BAM sorting skipped")
    else:
        bamfile = index([bamfile, "--fasta={0}".format(fastafile)])

    pf = bamfile.rsplit(".", 2)[0]
    sizesfile = Sizes(fastafile).filename
    cmd = "genomeCoverageBed -ibam {0} -g {1}".format(bamfile, sizesfile)
    if format in ("bedgraph", "bigwig"):
        cmd += " -bg"
        bedgraphfile = pf + ".bedgraph"
        sh(cmd, outfile=bedgraphfile)

        if format == "bedgraph":
            return bedgraphfile

        bigwigfile = pf + ".bigwig"
        cmd = "bedGraphToBigWig {0} {1} {2}".format(bedgraphfile, sizesfile, bigwigfile)
        sh(cmd)
        return bigwigfile

    coveragefile = pf + ".coverage"
    if need_update(fastafile, coveragefile):
        sh(cmd, outfile=coveragefile)

    gcf = GenomeCoverageFile(coveragefile)
    fw = must_open(opts.outfile, "w")
    for seqid, cov in gcf.iter_coverage_seqid():
        print("\t".join((seqid, "{0:.1f}".format(cov))), file=fw)
    fw.close()
Example #40
    def __init__(self, filename, index=False):
        super(Maf, self).__init__(filename)

        indexfile = filename + ".idx"
        if index:
            if need_update(filename, indexfile):
                self.build_index(filename, indexfile)

            self.index = maf.Index(filename, indexfile)

        fp = open(filename)
        self.reader = maf.Reader(fp)
Example #41
def fill(args):
    """
    %prog fill frag_reads_corr.fastb

    Run FillFragments on `frag_reads_corr.fastb`.
    """
    p = OptionParser(fill.__doc__)
    p.add_option("--stretch", default=3, type="int",
                 help="MAX_STRETCH to pass to FillFragments [default: %default]")
    p.add_option("--cpus", default=32, type="int",
                 help="Number of threads to run [default: %default]")

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastb, = args
    assert fastb == "frag_reads_corr.fastb"

    pcfile = "frag_reads_corr.k28.pc.info"
    nthreads = " NUM_THREADS={0}".format(opts.cpus)
    maxstretch = " MAX_STRETCH={0}".format(opts.stretch)
    if need_update(fastb, pcfile):
        cmd = "PathReads READS_IN=frag_reads_corr"
        cmd += nthreads
        sh(cmd)

    filledfastb = "filled_reads.fastb"
    if need_update(pcfile, filledfastb):
        cmd = "FillFragments PAIRS_OUT=frag_reads_corr_cpd"
        cmd += " PRECORRECT_LIBSTATS=True"
        cmd += maxstretch
        cmd += nthreads
        sh(cmd)

    filledfasta = "filled_reads.fasta"
    if need_update(filledfastb, filledfasta):
        cmd = "Fastb2Fasta IN=filled_reads.fastb OUT=filled_reads.fasta"
        sh(cmd)
Example #42
def correct_frag(datadir,
                 tag,
                 origfastb,
                 nthreads,
                 dedup=False,
                 haploidify=False):
    filt = datadir + "/{0}_filt".format(tag)
    filtfastb = filt + ".fastb"
    run_RemoveDodgyReads(infile=origfastb,
                         outfile=filtfastb,
                         removeDuplicates=dedup,
                         rc=False,
                         nthreads=nthreads)

    filtpairs = filt + ".pairs"
    edit = datadir + "/{0}_edit".format(tag)
    editpairs = edit + ".pairs"
    if need_update(filtpairs, editpairs):
        cmd = "ln -sf {0} {1}.pairs".format(op.basename(filtpairs), edit)
        sh(cmd)

    editfastb = edit + ".fastb"
    if need_update(filtfastb, editfastb):
        cmd = "FindErrors HEAD_IN={0} HEAD_OUT={1}".format(filt, edit)
        cmd += " PLOIDY_FILE=data/ploidy"
        cmd += nthreads
        sh(cmd)

    corr = datadir + "/{0}_corr".format(tag)
    corrfastb = corr + ".fastb"
    if need_update(editfastb, corrfastb):
        cmd = "CleanCorrectedReads DELETE=True"
        cmd += " HEAD_IN={0} HEAD_OUT={1}".format(edit, corr)
        cmd += " PLOIDY_FILE={0}/ploidy".format(datadir)
        if haploidify:
            cmd += " HAPLOIDIFY=True"
        cmd += nthreads
        sh(cmd)

    export_fastq(datadir, corrfastb)
Example #43
def fasta(args):
    """
    %prog fasta fastqfiles

    Convert fastq to fasta and qual file.
    """
    p = OptionParser(fasta.__doc__)
    p.add_option("--seqtk",
                 default=False,
                 action="store_true",
                 help="Use seqtk to convert")
    p.set_outdir()
    p.set_outfile(outfile=None)
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    fastqfiles = args
    outdir = opts.outdir
    if outdir and outdir != ".":
        mkdir(outdir)

    fastqfile = fastqfiles[0]
    pf = op.basename(fastqfile)
    gzinput = pf.endswith(".gz")
    if gzinput:
        pf = pf.rsplit(".", 1)[0]

    pf, sf = pf.rsplit(".", 1)
    if sf not in ("fq", "fastq"):
        logging.debug("Assumed FASTA: suffix not `fq` or `fastq`")
        return fastqfile, None

    fastafile, qualfile = pf + ".fasta", pf + ".qual"
    outfile = opts.outfile or fastafile
    outfile = op.join(outdir, outfile)
    if opts.seqtk:
        if need_update(fastqfiles, outfile):
            for i, fastqfile in enumerate(fastqfiles):
                cmd = "seqtk seq -A {0} -L 30 -l 70".format(fastqfile)
                # First one creates file, following ones append to it
                sh(cmd, outfile=outfile, append=i)
        else:
            logging.debug("Outfile `{0}` already exists.".format(outfile))
        return outfile, None

    for fastqfile in fastqfiles:
        SeqIO.convert(fastqfile, "fastq", fastafile, "fasta")
        SeqIO.convert(fastqfile, "fastq", qualfile, "qual")

    return fastafile, qualfile
Example #44
File: bed.py Project: radaniba/jcvi
def fastaFromBed(bedfile, fastafile, name=False, stranded=False):
    outfile = op.basename(bedfile).rsplit(".", 1)[0] + ".fasta"
    cmd = "fastaFromBed -fi {0} -bed {1} -fo {2}".\
            format(fastafile, bedfile, outfile)
    if name:
        cmd += " -name"
    if stranded:
        cmd += " -s"

    if need_update([bedfile, fastafile], outfile):
        sh(cmd, outfile=outfile)

    return outfile
Example #45
def make_index(gff_file):
    """
    Make a sqlite database for fast retrieval of features.
    """
    import GFFutils
    db_file = gff_file + ".db"

    if need_update(gff_file, db_file):
        if op.exists(db_file):
            os.remove(db_file)
        GFFutils.create_gffdb(gff_file, db_file)

    return GFFutils.GFFDB(db_file)
Example #46
def star(args):
    """
    %prog star folder reference

    Run star on a folder with reads.
    """
    p = OptionParser(star.__doc__)
    p.add_option("--single",
                 default=False,
                 action="store_true",
                 help="Single end mapping")
    p.set_fastq_names()
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    folder, reference = args
    cpus = opts.cpus
    mm = MakeManager()

    num = 1 if opts.single else 2
    gd = "GenomeDir"
    mkdir(gd)
    STAR = "STAR --runThreadN {0} --genomeDir {1}".format(cpus, gd)

    # Step 0: build genome index
    genomeidx = op.join(gd, "Genome")
    if need_update(reference, genomeidx):
        cmd = STAR + " --runMode genomeGenerate"
        cmd += " --genomeFastaFiles {0}".format(reference)
        mm.add(reference, genomeidx, cmd)

    # Step 1: align
    for p, prefix in iter_project(folder, opts.names, num):
        pf = "{0}_star".format(prefix)
        bamfile = pf + "Aligned.sortedByCoord.out.bam"
        cmd = STAR + " --readFilesIn {0}".format(" ".join(p))
        if p[0].endswith(".gz"):
            cmd += " --readFilesCommand zcat"
        cmd += " --outSAMtype BAM SortedByCoordinate"
        cmd += " --outFileNamePrefix {0}".format(pf)
        cmd += " --twopassMode Basic"
        # Compatibility for cufflinks
        cmd += " --outSAMstrandField intronMotif"
        cmd += " --outFilterIntronMotifs RemoveNoncanonical"
        mm.add(p, bamfile, cmd)

    mm.write()
Example #47
File: cas.py Project: radaniba/jcvi
def check_txt(casfile):
    """
    Check to see if the casfile is already converted to txtfile with txt().
    """
    if casfile.endswith(".cas"):
        castabfile = casfile.replace(".cas", ".txt")
        if need_update(casfile, castabfile):
            castabfile = txt([casfile])
        else:
            logging.debug("File `{0}` found.".format(castabfile))
    else:
        castabfile = casfile

    return castabfile
Example #48
    def __init__(self, bedfile, sizesfile):

        bedfile = sort([bedfile])
        coveragefile = bedfile + ".coverage"
        if need_update(bedfile, coveragefile):
            cmd = "genomeCoverageBed"
            cmd += " -bg -i {0} -g {1}".format(bedfile, sizesfile)
            sh(cmd, outfile=coveragefile)

        self.sizes = Sizes(sizesfile).mapping

        filename = coveragefile
        assert filename.endswith(".coverage")
        super(Coverage, self).__init__(filename)
Example #49
def split_fastafile(fastafile, maxreadlen=32000):
    pf = fastafile.split(".")[0]
    smallfastafile = pf + "-small.fasta"
    bigfastafile = pf + "-big.fasta"
    shredfastafile = pf + "-big.depth1.fasta"

    if need_update(fastafile, (smallfastafile, shredfastafile)):
        filter([fastafile, str(maxreadlen), "--less", "-o", smallfastafile])
        filter([fastafile, str(maxreadlen), "-o", bigfastafile])
        shred(["--depth=1", "--shift={0}".format(maxreadlen / 100), \
                "--readlen={0}".format(maxreadlen), \
                "--fasta", bigfastafile])

    return smallfastafile, shredfastafile
Example #50
def check_index(dbfile, supercat=False, go=True):
    if supercat:
        updated = False
        pf = dbfile.rsplit(".", 1)[0]
        supercatfile = pf + ".supercat"
        coordsfile = supercatfile + ".coords"
        if go and need_update(dbfile, supercatfile):
            cmd = "tGBS-Generate_Pseudo_Genome.pl"
            cmd += " -f {0} -o {1}".format(dbfile, supercatfile)
            sh(cmd)
            # Rename .coords file since gmap_build will overwrite it
            coordsbak = backup(coordsfile)
            updated = True
        dbfile = supercatfile + ".fasta"

    #dbfile = get_abs_path(dbfile)
    dbdir, filename = op.split(dbfile)
    if not dbdir:
        dbdir = "."
    dbname = filename.rsplit(".", 1)[0]
    safile = op.join(dbdir, "{0}/{0}.genomecomp".format(dbname))
    if dbname == filename:
        dbname = filename + ".db"

    if not go:
        return dbdir, dbname

    if need_update(dbfile, safile):
        cmd = "gmap_build -D {0} -d {1} {2}".format(dbdir, dbname, filename)
        sh(cmd)
    else:
        logging.error("`{0}` exists. `gmap_build` already run.".format(safile))

    if go and supercat and updated:
        sh("mv {0} {1}".format(coordsbak, coordsfile))

    return dbdir, dbname
Example #51
def check_aln(dbfile, readfile, cpus=32):
    from jcvi.formats.fastq import guessoffset

    saifile = readfile.rsplit(".", 1)[0] + ".sai"
    if need_update((dbfile, readfile), saifile):
        offset = guessoffset([readfile])
        cmd = "bwa aln " + " ".join((dbfile, readfile))
        cmd += " -t {0}".format(cpus)
        if offset == 64:
            cmd += " -I"
        sh(cmd, outfile=saifile)
    else:
        logging.error("`{0}` exists. `bwa aln` already run.".format(saifile))

    return saifile
Example #52
def run_compile(arg):
    filename, filtered, cleanup, store = arg
    csvfile = filename + ".csv"
    try:
        if filename.startswith("s3://"):
            if check_exists_s3(csvfile):
                logging.debug("{} exists. Skipped.".format(csvfile))
            else:
                write_csv_ev(filename, filtered, cleanup, store=store)
                logging.debug("{} written and uploaded.".format(csvfile))
        else:
            if need_update(filename, csvfile):
                write_csv_ev(filename, filtered, cleanup, store=None)
    except Exception as e:
        logging.debug("Thread failed! Error: {}".format(e))
Example #53
def run_filter(arg):
    vcffile, lhome, store = arg
    filteredvcf = vcffile.replace(".vcf", ".filtered.vcf")
    try:
        if vcffile.startswith("s3://"):
            if check_exists_s3(filteredvcf):
                logging.debug("{} exists. Skipped.".format(filteredvcf))
            else:
                write_filtered(vcffile, lhome, store=store)
                logging.debug("{} written and uploaded.".format(filteredvcf))
        else:
            if need_update(vcffile, filteredvcf):
                write_filtered(vcffile, lhome, store=None)
    except Exception as e:
        logging.debug("Thread failed! Error: {}".format(e))
Example #54
def check_index(dbfile):
    dbfile = get_abs_path(dbfile)
    dbdir, filename = op.split(dbfile)
    if not dbdir:
        dbdir = "."
    dbname = filename.rsplit(".", 1)[0]
    safile = op.join(dbdir, "{0}/{0}.salcpchilddc".format(dbname))
    if dbname == filename:
        dbname = filename + ".db"
    if need_update(dbfile, safile):
        cmd = "gmap_build -D {0} -d {1} {2}".format(dbdir, dbname, filename)
        sh(cmd)
    else:
        logging.error("`{0}` exists. `gmap_build` already run.".format(safile))

    return dbdir, dbname
Example #55
def split_fastafile(fastafile, maxreadlen=32000):
    from jcvi.formats.fasta import filter

    pf = fastafile.split(".")[0]
    smallfastafile = pf + "-small.fasta"
    bigfastafile = pf + "-big.fasta"
    shredfastafile = pf + "-big.depth1.fasta"

    maxreadlen = str(maxreadlen)
    if need_update(fastafile, (smallfastafile, shredfastafile)):
        filter([fastafile, maxreadlen, "--less", "-o", smallfastafile])
        filter([fastafile, maxreadlen, "-o", bigfastafile])
        shred(["--depth=1", "--readlen={0}".format(maxreadlen), \
                "--fasta", bigfastafile])

    return smallfastafile, shredfastafile
Example #56
0
def build(args):
    """
    %prog build input.bed scaffolds.fasta

    Build associated genome FASTA file and CHAIN file that can be used to lift
    old coordinates to new coordinates. The CHAIN file will be used to lift the
    original marker positions to new positions in the reconstructed genome. The
    new positions of the markers will be reported in *.lifted.bed.
    """
    p = OptionParser(build.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    inputbed, scaffolds = args
    pf = inputbed.rsplit(".", 1)[0]
    mapbed = pf + ".bed"
    chr_agp = pf + ".chr.agp"
    chr_fasta = pf + ".chr.fasta"
    if need_update((chr_agp, scaffolds), chr_fasta):
        agp_build([chr_agp, scaffolds, chr_fasta])

    unplaced_agp = pf + ".unplaced.agp"
    if need_update((chr_agp, scaffolds), unplaced_agp):
        write_unplaced_agp(chr_agp, scaffolds, unplaced_agp)

    unplaced_fasta = pf + ".unplaced.fasta"
    if need_update((unplaced_agp, scaffolds), unplaced_fasta):
        agp_build([unplaced_agp, scaffolds, unplaced_fasta])

    combined_agp = pf + ".agp"
    if need_update((chr_agp, unplaced_agp), combined_agp):
        FileMerger((chr_agp, unplaced_agp), combined_agp).merge()

    combined_fasta = pf + ".fasta"
    if need_update((chr_fasta, unplaced_fasta), combined_fasta):
        FileMerger((chr_fasta, unplaced_fasta), combined_fasta).merge()

    chainfile = pf + ".chain"
    if need_update((combined_agp, scaffolds, combined_fasta), chainfile):
        fromagp([combined_agp, scaffolds, combined_fasta])

    liftedbed = mapbed.rsplit(".", 1)[0] + ".lifted.bed"
    if need_update((mapbed, chainfile), liftedbed):
        cmd = "liftOver -minMatch=1 {0} {1} {2} unmapped".\
                format(mapbed, chainfile, liftedbed)
        sh(cmd)

    sort([liftedbed, "-i"])  # Sort bed in place
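
A hedged sketch of what write_unplaced_agp plausibly does (the real implementation lives elsewhere in jcvi; the import path is an assumption): every scaffold that the chromosome AGP never uses as a sequence component becomes a singleton AGP object spanning its full length.

from jcvi.formats.sizes import Sizes  # import path assumed

def write_unplaced_agp_sketch(agpfile, scaffolds, outfile):
    # Collect scaffolds already placed as W (sequence) components.
    placed = set()
    for row in open(agpfile):
        atoms = row.split()
        if not row.startswith("#") and len(atoms) > 5 and atoms[4] == "W":
            placed.add(atoms[5])
    sizes = Sizes(scaffolds).sizes_mapping  # name -> length, see Example #60
    fw = open(outfile, "w")
    for name, size in sorted(sizes.items()):
        if name not in placed:
            # Singleton AGP line: the scaffold maps to itself, forward strand.
            print("\t".join(str(x) for x in
                            (name, 1, size, 1, "W", name, 1, size, "+")), file=fw)
    fw.close()
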
Example #57
0
def alignextend(args):
    """
    %prog alignextend ref.fasta read.1.fastq read.2.fastq

    Wrapper around AMOS alignextend.
    """
    choices = "prepare,align,filter,rmdup,genreads".split(",")
    p = OptionParser(alignextend.__doc__)
    p.add_option("--nosuffix", default=False, action="store_true",
                 help="Do not add /1/2 suffix to the read [default: %default]")
    p.add_option("--rc", default=False, action="store_true",
                 help="Reverse complement the reads before alignment")
    p.add_option("--len", default=100, type="int",
                 help="Extend to this length")
    p.add_option("--stage", default="prepare", choices=choices,
                 help="Start from certain stage")
    p.add_option("--dup", default=10, type="int",
                 help="Filter duplicates with coordinates within this distance")
    p.add_option("--maxdiff", default=1, type="int",
                 help="Maximum number of differences")
    p.set_home("amos")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    ref, r1, r2 = args
    pf = op.basename(r1).split(".")[0]
    cmd = op.join(opts.amos_home, "src/Experimental/alignextend.pl")
    if not opts.nosuffix:
        cmd += " -suffix"
    bwa_idx = "{0}.ref.fa.sa".format(pf)
    if not need_update(ref, bwa_idx):
        cmd += " -noindex"  # the bwa index is current; skip re-indexing
    cmd += " -threads {0}".format(opts.cpus)
    offset = guessoffset([r1])
    if offset == 64:
        cmd += " -I"
    if opts.rc:
        cmd += " -rc"
    cmd += " -allow -len {0} -dup {1}".format(opts.len, opts.dup)
    cmd += " -min {0} -max {1}".format(2 * opts.len, 20 * opts.len)
    cmd += " -maxdiff {0}".format(opts.maxdiff)
    cmd += " -stage {0}".format(opts.stage)
    cmd += " ".join(("", pf, ref, r1, r2))
    sh(cmd)
Example #58
0
    def merge(self, checkexists=False):
        outfile = self.outfile
        if checkexists and not need_update(self.filelist, outfile):
            logging.debug("File `{0}` exists. Merge skipped.".format(outfile))
            return

        files = " ".join(self.filelist)
        ingz, outgz = self.ingz, self.outgz
        if ingz and outgz:  # can merge gz files directly
            cmd = "cat {0} > {1}".format(files, outfile)
            sh(cmd)
        else:
            cmd = "zcat" if self.ingz else "cat"
            cmd += " " + files
            sh(cmd, outfile=outfile)

        return outfile
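
Usage, as seen in build() (Example #56): pass the input file list and the output path, then call merge(); the second line below is illustrative, showing the optional checkexists flag that adds the same need_update gating used throughout these examples.

FileMerger((chr_agp, unplaced_agp), combined_agp).merge()
FileMerger((chr_fasta, unplaced_fasta), combined_fasta).merge(checkexists=True)
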
Example #59
0
def gmap(args):
    """
    %prog gmap database.fasta fastafile

    Wrapper for `gmap`.
    """
    p = OptionParser(gmap.__doc__)
    p.add_option("--cross",
                 default=False,
                 action="store_true",
                 help="Cross-species alignment")
    p.add_option(
        "--npaths",
        default=0,
        type="int",
        help="Maximum number of paths to show."
        " If set to 0, prints two paths if chimera"
        " detected, else one.",
    )
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    dbfile, fastafile = args
    assert op.exists(dbfile) and op.exists(fastafile)
    prefix = get_prefix(fastafile, dbfile)
    logfile = prefix + ".log"
    gmapfile = prefix + ".gmap.gff3"

    if not need_update((dbfile, fastafile), gmapfile):
        logging.error("`{0}` exists. `gmap` already run.".format(gmapfile))
    else:
        dbdir, dbname = check_index(dbfile)
        cmd = "gmap -D {0} -d {1}".format(dbdir, dbname)
        cmd += " -f 2 --intronlength=100000"  # Output format 2
        cmd += " -t {0}".format(opts.cpus)
        cmd += " --npaths {0}".format(opts.npaths)
        if opts.cross:
            cmd += " --cross-species"
        cmd += " " + fastafile

        sh(cmd, outfile=gmapfile, errfile=logfile)

    return gmapfile, logfile
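
A hypothetical invocation (file names assumed): check_index (Example #54) builds the GMAP database on demand, the GFF3 output lands in <prefix>.gmap.gff3, and the log in <prefix>.log.

gmapfile, logfile = gmap(["genome.fasta", "cdna.fasta", "--npaths=1"])
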
Example #60
0
    def __init__(self, filename, select=None):
        assert op.exists(filename), "File `{0}` not found".format(filename)

        # filename can be both .sizes file or FASTA formatted file
        sizesname = filename

        if not filename.endswith(".sizes"):
            sizesname = filename + ".sizes"
            filename = get_abs_path(filename)
            if need_update(filename, sizesname):
                cmd = "faSize"
                if which(cmd):
                    cmd += " -detailed {0}".format(filename)
                    sh(cmd, outfile=sizesname)
                else:
                    from jcvi.formats.fasta import Fasta

                    f = Fasta(filename)
                    fw = open(sizesname, "w")
                    for k, size in f.itersizes_ordered():
                        print("\t".join((k, str(size))), file=fw)
                    fw.close()

            filename = sizesname

        assert filename.endswith(".sizes")

        super(Sizes, self).__init__(filename)
        self.fp = open(filename)
        self.filename = filename

        # get sizes for individual contigs, both in list and dict
        # this is to preserve the input order in the sizes file
        sizes = list(self.iter_sizes())
        if select:
            assert select > 0
            sizes = [x for x in sizes if x[1] >= select]
        self.sizes_mapping = dict(sizes)

        # get cumulative sizes, both in list and dict
        ctgs, sizes = zip(*sizes)
        self.sizes = sizes
        cumsizes = np.cumsum([0] + list(sizes))
        self.ctgs = ctgs
        self.cumsizes = cumsizes
        self.cumsizes_mapping = dict(zip(ctgs, cumsizes))
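
Hypothetical usage of Sizes (file and contig names assumed): the .sizes file is built on demand, and the cumulative sizes give each contig's offset in a concatenated coordinate system.

sizes = Sizes("scaffolds.fasta")  # writes scaffolds.fasta.sizes if stale
length = sizes.sizes_mapping["scaffold_1"]     # size of one contig
offset = sizes.cumsizes_mapping["scaffold_1"]  # bases preceding it, in input order
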