Exemple #1
0
def count(args):
    """
    %prog count bamfile gtf

    Count the number of reads mapped using `htseq-count`.
    """
    # The docstring above doubles as the usage text given to OptionParser,
    # so it is kept verbatim.
    p = OptionParser(count.__doc__)
    # Options are registered on the parser `p` itself; parse_args() below
    # returns them as `opts`, separate from the positional `args` list.
    p.add_option("--type", default="exon",
                 help="Only count feature type")
    p.set_cpus(cpus=8)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bamfile, gtf = args
    cpus = opts.cpus
    pf = bamfile.split(".")[0]
    countfile = pf + ".count"
    # Nothing to do when the count file is already up to date.
    if not need_update(bamfile, countfile):
        return

    # Name-sort the BAM (`samtools sort -n`) and dump it, header included,
    # to a SAM file; that SAM file is what gets fed to htseq-count.
    nsorted = pf + "_nsorted"
    nsortedbam, nsortedsam = nsorted + ".bam", nsorted + ".sam"
    if need_update(bamfile, nsortedsam):
        cmd = "samtools sort -@ {0} -n {1} {2}".format(cpus, bamfile, nsorted)
        sh(cmd)
        cmd = "samtools view -@ {0} -h {1}".format(cpus, nsortedbam)
        sh(cmd, outfile=nsortedsam)

    if need_update(nsortedsam, countfile):
        cmd = "htseq-count --stranded=no --minaqual=10"
        cmd += " -t {0}".format(opts.type)
        cmd += " {0} {1}".format(nsortedsam, gtf)
        sh(cmd, outfile=countfile)
Exemple #2
0
def index(args):
    """
    %prog index samfile/bamfile

    If SAM file, convert to BAM, sort and then index, using SAMTOOLS
    """
    # The docstring above doubles as the usage text given to OptionParser,
    # so it is kept verbatim.
    p = OptionParser(index.__doc__)
    p.add_option("--fasta", dest="fasta", default=None,
            help="add @SQ header to the BAM file [default: %default]")
    p.add_option("--unique", default=False, action="store_true",
            help="only retain uniquely mapped reads [default: %default]")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        # `not None` is True, so a usage error exits with a non-zero status,
        # matching the other commands.
        sys.exit(not p.print_help())

    samfile, = args
    cpus = opts.cpus
    fastafile = opts.fasta
    if fastafile:
        assert op.exists(fastafile)

    # Step 1: SAM -> BAM (skipped below when the input is not a .sam file).
    bamfile = samfile.replace(".sam", ".bam")
    if fastafile:
        faifile = fastafile + ".fai"
        if need_update(fastafile, faifile):
            sh("samtools faidx {0}".format(fastafile))
        cmd = "samtools view -bt {0} {1} -o {2}".\
                format(faifile, samfile, bamfile)
    else:
        cmd = "samtools view -bS {0} -o {1}".\
                format(samfile, bamfile)

    cmd += " -@ {0}".format(cpus)
    if opts.unique:
        cmd += " -q 1"

    if samfile.endswith(".sam") and need_update(samfile, bamfile):
        sh(cmd)

    # Step 2: sort, unless the file name says it is already sorted.
    if bamfile.endswith(".sorted.bam"):
        sortedbamfile = bamfile
    else:
        prefix = bamfile.replace(".bam", "")
        sortedbamfile = prefix + ".sorted.bam"

    if need_update(bamfile, sortedbamfile):
        cmd = "samtools sort {0} -o {1}".format(bamfile, sortedbamfile)
        cmd += " -@ {0}".format(cpus)
        sh(cmd)

    # Step 3: build the .bai index next to the sorted BAM.
    baifile = sortedbamfile + ".bai"
    if need_update(sortedbamfile, baifile):
        sh("samtools index {0}".format(sortedbamfile))

    return sortedbamfile
Exemple #3
0
def frompsl(args):
    """
    %prog frompsl old.new.psl old.fasta new.fasta

    Generate chain file from psl file. The pipeline is describe in:
    <http://genomewiki.ucsc.edu/index.php/Minimal_Steps_For_LiftOver>
    """
    # The docstring above is also the usage text given to OptionParser, so it
    # is reproduced verbatim.
    from maize.formats.sizes import Sizes

    p = OptionParser(frompsl.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    pslfile, oldfasta, newfasta = args
    prefix = oldfasta.split(".")[0]

    # Every intermediate file is named after the old FASTA's prefix.
    chainfile = prefix + ".chain"
    sortedchain = chainfile.rsplit(".", 1)[0] + ".sorted.chain"
    netfile = prefix + ".net"
    liftoverfile = prefix + ".liftover.chain"

    # Step 1: chain the psl alignments with axtChain (needs .2bit genomes).
    oldtwobit, newtwobit = [faToTwoBit(x) for x in (oldfasta, newfasta)]
    if need_update(pslfile, chainfile):
        sh("axtChain -linearGap=medium -psl {0} {1} {2} {3}".format(
            pslfile, oldtwobit, newtwobit, chainfile))

    # Step 2: sort the chain file.
    if need_update(chainfile, sortedchain):
        sh("chainSort {0} {1}".format(chainfile, sortedchain))

    # Step 3: build the alignment net from the sorted chains.
    oldsizes = Sizes(oldfasta).filename
    newsizes = Sizes(newfasta).filename
    if need_update((sortedchain, oldsizes, newsizes), netfile):
        sh("chainNet {0} {1} {2} {3} /dev/null".format(
            sortedchain, oldsizes, newsizes, netfile))

    # Step 4: subset the chains by the net to get the liftOver chain file.
    if need_update((netfile, sortedchain), liftoverfile):
        sh("netChainSubset {0} {1} {2}".format(
            netfile, sortedchain, liftoverfile))
Exemple #4
0
def fpkm(args):
    """
    %prog fpkm fastafile *.bam

    Calculate FPKM values from BAM file.
    """
    # The docstring above doubles as the usage text given to OptionParser,
    # so it is kept verbatim.
    p = OptionParser(fpkm.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    fastafile = args[0]
    bamfiles = args[1:]
    # Create a DUMMY gff file for cuffdiff: one "transcript" record per
    # sequence, spanning 1..size.
    gffile = fastafile.rsplit(".", 1)[0] + ".gff"
    if need_update(fastafile, gffile):
        f = Fasta(fastafile, lazy=True)
        # `with` guarantees the handle is closed even if iteration fails;
        # fw.write() replaces the Python-2-only `print >> fw` statement.
        with open(gffile, "w") as fw:
            for key, size in f.itersizes_ordered():
                fields = (key, "dummy", "transcript",
                          1, size, ".", ".", ".", "ID=" + key)
                fw.write("\t".join(str(x) for x in fields) + "\n")
        logging.debug("Dummy GFF created: {0}".format(gffile))

    cmd = "cuffdiff {0} {1}".format(gffile, " ".join(bamfiles))
    sh(cmd)
Exemple #5
0
def first(args):
    """
    %prog first N fastqfile(s)

    Get first N reads from file.
    """
    # The docstring above doubles as the usage text given to OptionParser,
    # so it is kept verbatim.
    from maize.apps.base import need_update

    p = OptionParser(first.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    N = int(args[0])
    # A FASTQ record takes four lines.
    nlines = N * 4
    fastqfiles = args[1:]
    # Parsed options live on `opts`; `args` is the positional list.
    outfile = opts.outfile
    if not need_update(fastqfiles, outfile):
        logging.debug("File `{0}` exists. Will not overwrite.".format(outfile))
        return

    for fastqfile in fastqfiles:
        # Decide per file whether to decompress, so mixed plain/gzipped
        # inputs are each read with the right command.
        if fastqfile.endswith(".gz"):
            cmd = "zcat {0} | head -n {1}".format(fastqfile, nlines)
        else:
            cmd = "head -n {0} {1}".format(nlines, fastqfile)

        sh(cmd, outfile=outfile, append=True)
Exemple #6
0
def filter(args):
    """
    %prog filter <deltafile|coordsfile>

    Produce a new delta/coords file and filter based on id% or cov%.
    Use `delta-filter` for .delta file.
    """
    # The docstring above doubles as the usage text given to OptionParser,
    # so it is kept verbatim.
    p = OptionParser(filter.__doc__)
    p.set_align(pctid=0, hitlen=0)
    p.add_option("--overlap", default=False, action="store_true",
            help="Print overlap status (e.g. terminal, contained)")

    opts, args = p.parse_args(args)
    if len(args) != 1:
        sys.exit(not p.print_help())

    pctid = opts.pctid
    hitlen = opts.hitlen

    filename, = args
    # No thresholds requested: the input is already the answer.
    if pctid == 0 and hitlen == 0:
        return filename

    pf, suffix = filename.rsplit(".", 1)
    outfile = "".join((pf, ".P{0}L{1}.".format(int(pctid), int(hitlen)), suffix))
    if not need_update(filename, outfile):
        return outfile

    # .delta files are delegated to the external `delta-filter` tool.
    if suffix == "delta":
        cmd = "delta-filter -i {0} -l {1} {2}".format(pctid, hitlen, filename)
        sh(cmd, outfile=outfile)
        return outfile

    # Otherwise filter the coords file line by line.
    fw = must_open(outfile, "w")
    try:
        with open(filename) as fp:
            for row in fp:
                try:
                    c = CoordsLine(row)
                except AssertionError:
                    # Rows that CoordsLine rejects are skipped.
                    continue

                if c.identity < pctid:
                    continue
                if c.len2 < hitlen:
                    continue
                if opts.overlap and not c.overlap:
                    continue

                outrow = row.rstrip()
                if opts.overlap:
                    ov = Overlap_types[c.overlap]
                    outrow += "\t" + ov
                # Works on Python 2 and 3, unlike `print >> fw`.
                fw.write(outrow + "\n")
    finally:
        fw.close()

    return outfile
Exemple #7
0
    def __init__(self, filename, index=False):
        """
        Open a MAF file for reading.

        When `index` is true, an index file named `<filename>.idx` is
        (re)built if `need_update` reports it stale, and then loaded into
        `self.index`. A reader over the file is always stored in
        `self.reader`.
        """
        super(Maf, self).__init__(filename)

        if index:
            idxpath = filename + ".idx"
            if need_update(filename, idxpath):
                self.build_index(filename, idxpath)
            self.index = maf.Index(filename, idxpath)

        # The handle is handed to the reader and stays open with it.
        self.reader = maf.Reader(open(filename))
Exemple #8
0
    def __init__(self, filename, index=False):
        """
        Set up reading of the MAF file `filename`.

        With `index=True` the companion `<filename>.idx` file is rebuilt
        when `need_update` says so, and exposed as `self.index`;
        `self.reader` is created in every case.
        """
        super(Maf, self).__init__(filename)

        wants_index = bool(index)
        index_path = "{0}.idx".format(filename)
        if wants_index and need_update(filename, index_path):
            self.build_index(filename, index_path)
        if wants_index:
            self.index = maf.Index(filename, index_path)

        # The reader keeps using this open handle.
        handle = open(filename)
        self.reader = maf.Reader(handle)
Exemple #9
0
def fasta(args):
    """
    %prog fasta fastqfiles

    Convert fastq to fasta and qual file.
    """
    # The docstring above doubles as the usage text given to OptionParser,
    # so it is kept verbatim.
    p = OptionParser(fasta.__doc__)
    p.add_option("--seqtk",
                 default=False,
                 action="store_true",
                 help="Use seqtk to convert")
    p.set_outdir()
    p.set_outfile(outfile=None)
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    fastqfiles = args
    outdir = opts.outdir
    if outdir and outdir != ".":
        mkdir(outdir)

    # Output names are derived from the first input: strip a trailing .gz,
    # then split off the format suffix.
    fastqfile = fastqfiles[0]
    pf = op.basename(fastqfile)
    gzinput = pf.endswith(".gz")
    if gzinput:
        pf = pf.rsplit(".", 1)[0]

    pf, sf = pf.rsplit(".", 1)
    if sf not in ("fq", "fastq"):
        logging.debug("Assumed FASTA: suffix not `fq` or `fastq`")
        return fastqfile, None

    fastafile, qualfile = pf + ".fasta", pf + ".qual"
    outfile = opts.outfile or fastafile
    outfile = op.join(outdir, outfile)
    if opts.seqtk:
        if need_update(fastqfiles, outfile):
            for i, fastqfile in enumerate(fastqfiles):
                cmd = "seqtk seq -A {0} -L 30 -l 70".format(fastqfile)
                # First one creates file, following ones append to it
                sh(cmd, outfile=outfile, append=i)
        else:
            logging.debug("Outfile `{0}` already exists.".format(outfile))
        return outfile, None

    # NOTE(review): every input is converted into the same two output paths,
    # and outdir/outfile are not used on this branch -- confirm intended.
    for fastqfile in fastqfiles:
        SeqIO.convert(fastqfile, "fastq", fastafile, "fasta")
        SeqIO.convert(fastqfile, "fastq", qualfile, "qual")

    return fastafile, qualfile
Exemple #10
0
def coverage(args):
    """
    %prog coverage fastafile bamfile

    Calculate coverage for BAM file. BAM file will be sorted unless with
    --nosort.
    """
    # The docstring above doubles as the usage text given to OptionParser,
    # so it is kept verbatim.
    p = OptionParser(coverage.__doc__)
    p.add_option("--format", default="bigwig",
                 choices=("bedgraph", "bigwig", "coverage"),
                 help="Output format")
    p.add_option("--nosort", default=False, action="store_true",
                 help="Do not sort BAM")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, bamfile = args
    # Named `fmt` so the builtin `format` is not shadowed.
    fmt = opts.format
    if opts.nosort:
        logging.debug("BAM sorting skipped")
    else:
        bamfile = index([bamfile, "--fasta={0}".format(fastafile)])

    pf = bamfile.rsplit(".", 2)[0]
    sizesfile = Sizes(fastafile).filename
    cmd = "genomeCoverageBed -ibam {0} -g {1}".format(bamfile, sizesfile)
    if fmt in ("bedgraph", "bigwig"):
        cmd += " -bg"
        bedgraphfile = pf + ".bedgraph"
        sh(cmd, outfile=bedgraphfile)

        if fmt == "bedgraph":
            return bedgraphfile

        # bigwig is produced from the bedgraph just written.
        bigwigfile = pf + ".bigwig"
        cmd = "bedGraphToBigWig {0} {1} {2}".\
                    format(bedgraphfile, sizesfile, bigwigfile)
        sh(cmd)
        return bigwigfile

    # "coverage" format: write one line per seqid with its coverage value.
    coveragefile = pf + ".coverage"
    if need_update(fastafile, coveragefile):
        sh(cmd, outfile=coveragefile)

    gcf = GenomeCoverageFile(coveragefile)
    fw = must_open(opts.outfile, "w")
    for seqid, cov in gcf.iter_coverage_seqid():
        # Works on Python 2 and 3, unlike `print >> fw`.
        fw.write("\t".join((seqid, "{0:.1f}".format(cov))) + "\n")
    fw.close()
Exemple #11
0
    def __init__(self, filename, select=None):
        """
        Load sequence sizes from a `.sizes` file or a FASTA file.

        For a FASTA input, `<filename>.sizes` is generated when stale,
        using `faSize` if it is on the PATH and the Fasta reader otherwise.
        When `select` is given, only entries with size >= `select` are kept.
        Sets `filename`, `fp`, `sizes_mapping`, `ctgs`, `sizes`, `cumsizes`
        and `cumsizes_mapping` on the instance.
        """
        assert op.exists(filename), "File `{0}` not found".format(filename)

        # filename can be both .sizes file or FASTA formatted file
        sizesname = filename

        if not filename.endswith(".sizes"):
            sizesname = filename + ".sizes"
            filename = get_abs_path(filename)
            if need_update(filename, sizesname):
                cmd = "faSize"
                if which(cmd):
                    cmd += " -detailed {0}".format(filename)
                    sh(cmd, outfile=sizesname)
                else:
                    from jcvi.formats.fasta import Fasta

                    f = Fasta(filename)
                    # `with` closes the handle even if iteration raises.
                    with open(sizesname, "w") as fw:
                        for k, size in f.itersizes_ordered():
                            fw.write("\t".join((k, str(size))) + "\n")

            filename = sizesname

        assert filename.endswith(".sizes")

        super(Sizes, self).__init__(filename)
        self.fp = open(filename)
        self.filename = filename

        # get sizes for individual contigs, both in list and dict
        # this is to preserve the input order in the sizes file
        sizes = list(self.iter_sizes())
        if select:
            assert select > 0
            sizes = [x for x in sizes if x[1] >= select]
        self.sizes_mapping = dict(sizes)

        # get cumulative sizes, both in list and dict
        # zip(*[]) yields nothing to unpack, so an empty (or fully filtered)
        # list is handled explicitly instead of raising ValueError.
        ctgs, sizes = zip(*sizes) if sizes else ((), ())
        self.sizes = sizes
        cumsizes = np.cumsum([0] + list(sizes))
        self.ctgs = ctgs
        self.cumsizes = cumsizes
        self.cumsizes_mapping = dict(zip(ctgs, cumsizes))
Exemple #12
0
def blast(args):
    """
    %prog blast <deltafile|coordsfile>

    Covert delta or coordsfile to BLAST tabular output.
    """
    # The docstring above doubles as the usage text given to OptionParser,
    # so it is kept verbatim.
    p = OptionParser(blast.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    deltafile, = args
    blastfile = deltafile.rsplit(".", 1)[0] + ".blast"

    if need_update(deltafile, blastfile):
        coords = Coords(deltafile)
        # `with` flushes and closes the output; the explicit write works on
        # Python 2 and 3, unlike `print >> fw`.
        with open(blastfile, "w") as fw:
            for c in coords:
                fw.write("{0}\n".format(c.blastline))
Exemple #13
0
    def __init__(self, filename, sorted=False, header=False):
        """
        Load alignment records from a coords file.

        A `.delta` input is first converted through `fromdelta` (when the
        `.coords` file is stale) and the `.coords` file is read instead.
        With `header=True` the first line is stored in `self.cmd`. Lines
        that `CoordsLine` rejects with AssertionError are skipped. With
        `sorted=True`, `self.ref_sort()` is called after loading.
        """
        if filename.endswith(".delta"):
            coordsfile = filename.rsplit(".", 1)[0] + ".coords"
            if need_update(filename, coordsfile):
                fromdelta([filename])
            filename = coordsfile

        super(Coords, self).__init__(filename)

        # `with` closes the file once loaded; next(fp) works on Python 2
        # and 3, whereas fp.next() exists only on Python 2.
        with open(filename) as fp:
            if header:
                self.cmd = next(fp)

            for row in fp:
                try:
                    self.append(CoordsLine(row))
                except AssertionError:
                    # Deliberate best-effort: non-record lines are ignored.
                    pass

        if sorted:
            self.ref_sort()
Exemple #14
0
def pairs(args):
    """
    See __doc__ for OptionParser.set_pairs().
    """
    # The docstring above is also the usage text given to OptionParser, so it
    # is reproduced verbatim.
    import maize.formats.bed

    p = OptionParser(pairs.__doc__)
    p.set_pairs()
    positional = p.parse_args(args)[1]

    if len(positional) != 1:
        sys.exit(not p.print_help())

    samfile = positional[0]
    bedfile = "{0}.bed".format(samfile.rsplit(".", 1)[0])
    if need_update(samfile, bedfile):
        sh("bamToBed -i {0}".format(samfile), outfile=bedfile)

    # Swap the input for its BED conversion in the original argument list
    # (in place), then delegate to the BED implementation with all options.
    where = args.index(samfile)
    args[where] = bedfile

    return maize.formats.bed.pairs(args)
Exemple #15
0
    def wrapper(*args, **kwargs):
        """
        Call `func` only when its output needs to be (re)generated.

        Requires the `outfile` keyword. When the `infile` keyword is given,
        every input must exist and `need_update` decides whether to run.
        When it is not given, `func` runs only if some output is missing.
        Returns `func`'s result when it runs, otherwise the list of output
        file names.
        """
        assert outfile in kwargs, \
            "You need to specify `outfile=` on function call"
        outfilename = kwargs[outfile]

        if infile in kwargs:
            infilename = listify(kwargs[infile])
            for x in infilename:
                assert op.exists(x), \
                    "The specified infile `{0}` does not exist".format(x)
            stale = need_update(infilename, outfilename)
        else:
            # Without inputs there is nothing to compare against, so the
            # decision rests on whether the outputs exist. Previously this
            # path hit an unbound `infilename`.
            stale = not all(op.exists(x) for x in listify(outfilename))

        if stale:
            return func(*args, **kwargs)

        msg = "File `{0}` exists. Computation skipped." \
            .format(outfilename)
        logging.debug(msg)

        outfilename = listify(outfilename)

        for x in outfilename:
            assert op.exists(x), \
                    "Something went wrong, `{0}` not found".format(x)

        return outfilename
Exemple #16
0
def faToTwoBit(fastafile):
    """
    Convert `fastafile` to .2bit with the external `faToTwoBit` tool.

    The conversion is run only when `need_update` reports the .2bit file
    stale. Returns the .2bit file name (input name with its last suffix
    replaced).
    """
    stem = fastafile.rsplit(".", 1)[0]
    twobitfile = stem + ".2bit"
    if need_update(fastafile, twobitfile):
        sh("faToTwoBit {0} {1}".format(fastafile, twobitfile))
    return twobitfile
Exemple #17
0
def mstmap(args):
    """
    %prog mstmap bcffile/vcffile > matrixfile

    Convert bcf/vcf format to mstmap input.
    """
    # The docstring above doubles as the usage text given to OptionParser,
    # so it is kept verbatim.
    from maize.assembly.geneticmap import MSTMatrix

    p = OptionParser(mstmap.__doc__)
    p.add_option("--dh", default=False, action="store_true",
                 help="Double haploid population, no het [default: %default]")
    p.add_option("--freq", default=.2, type="float",
                 help="Allele must be above frequency [default: %default]")
    p.add_option("--mindepth", default=3, type="int",
                 help="Only trust genotype calls with depth [default: %default]")
    p.add_option("--missing_threshold", default=.25, type="float",
                 help="Fraction missing must be below")
    p.add_option("--noheader", default=False, action="store_true",
                 help="Do not print MSTmap run parameters [default: %default]")
    p.add_option("--pv4", default=False, action="store_true",
                 help="Enable filtering strand-bias, tail distance bias, etc. "
                 "[default: %default]")
    p.add_option("--freebayes", default=False, action="store_true",
                 help="VCF output from freebayes")
    p.set_sep(sep=".", help="Use separator to simplify individual names")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    vcffile, = args
    # A .bcf input is first converted to a filtered .vcf next to it.
    if vcffile.endswith(".bcf"):
        bcffile = vcffile
        vcffile = bcffile.rsplit(".", 1)[0] + ".vcf"
        cmd = "bcftools view {0}".format(bcffile)
        cmd += " | vcfutils.pl varFilter"
        if not opts.pv4:
            cmd += " -1 0 -2 0 -3 0 -4 0 -e 0"
        if need_update(bcffile, vcffile):
            sh(cmd, outfile=vcffile)

    freq = opts.freq
    sep = opts.sep
    depth_index = 1 if opts.freebayes else 2

    ptype = "DH" if opts.dh else "RIL6"
    nohet = ptype == "DH"
    genotypes = []
    # `with` makes sure the VCF handle is closed once parsing finishes or
    # fails.
    with open(vcffile) as fp:
        for row in fp:
            # "##" lines are skipped.
            if row[:2] == "##":
                continue
            atoms = row.split()
            # The single-"#" line carries the individual names from column
            # 10 onward; they become the matrix header.
            if row[0] == '#':
                ind = [x.split(sep)[0] for x in atoms[9:]]
                nind = len(ind)
                mh = ["locus_name"] + ind
                continue

            marker = "{0}.{1}".format(*atoms[:2])

            geno = atoms[9:]
            geno = [encode_genotype(x, mindepth=opts.mindepth,
                                    depth_index=depth_index,
                                    nohet=nohet) for x in geno]
            assert len(geno) == nind
            f = 1. / nind

            # Drop markers where either allele is too rare or too many
            # calls are missing.
            if geno.count("A") * f < freq:
                continue
            if geno.count("B") * f < freq:
                continue
            if geno.count("-") * f > opts.missing_threshold:
                continue

            genotype = [marker] + geno
            genotypes.append(genotype)

    mm = MSTMatrix(genotypes, mh, ptype, opts.missing_threshold)
    mm.write(opts.outfile, header=(not opts.noheader))
Exemple #18
0
def mstmap(args):
    """
    %prog mstmap bcffile/vcffile > matrixfile

    Convert bcf/vcf format to mstmap input.
    """
    # The docstring above doubles as the usage text given to OptionParser,
    # so it is kept verbatim.
    from maize.assembly.geneticmap import MSTMatrix

    p = OptionParser(mstmap.__doc__)
    p.add_option("--dh",
                 default=False,
                 action="store_true",
                 help="Double haploid population, no het [default: %default]")
    p.add_option("--freq",
                 default=.2,
                 type="float",
                 help="Allele must be above frequency [default: %default]")
    p.add_option(
        "--mindepth",
        default=3,
        type="int",
        help="Only trust genotype calls with depth [default: %default]")
    p.add_option("--missing_threshold",
                 default=.25,
                 type="float",
                 help="Fraction missing must be below")
    p.add_option("--noheader",
                 default=False,
                 action="store_true",
                 help="Do not print MSTmap run parameters [default: %default]")
    p.add_option("--pv4",
                 default=False,
                 action="store_true",
                 help="Enable filtering strand-bias, tail distance bias, etc. "
                 "[default: %default]")
    p.add_option("--freebayes",
                 default=False,
                 action="store_true",
                 help="VCF output from freebayes")
    p.set_sep(sep=".", help="Use separator to simplify individual names")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    vcffile, = args
    # For .bcf input, derive a filtered .vcf file and read that instead.
    if vcffile.endswith(".bcf"):
        bcffile = vcffile
        vcffile = bcffile.rsplit(".", 1)[0] + ".vcf"
        cmd = "bcftools view {0}".format(bcffile)
        cmd += " | vcfutils.pl varFilter"
        if not opts.pv4:
            cmd += " -1 0 -2 0 -3 0 -4 0 -e 0"
        if need_update(bcffile, vcffile):
            sh(cmd, outfile=vcffile)

    freq = opts.freq
    sep = opts.sep
    depth_index = 1 if opts.freebayes else 2

    ptype = "DH" if opts.dh else "RIL6"
    nohet = ptype == "DH"
    genotypes = []
    # The context manager closes the VCF handle when parsing ends, whether
    # normally or through an exception.
    with open(vcffile) as fp:
        for row in fp:
            if row[:2] == "##":
                continue
            atoms = row.split()
            if row[0] == '#':
                # Header row: columns from the 10th on are individual names.
                ind = [x.split(sep)[0] for x in atoms[9:]]
                nind = len(ind)
                mh = ["locus_name"] + ind
                continue

            marker = "{0}.{1}".format(*atoms[:2])

            geno = atoms[9:]
            geno = [
                encode_genotype(x,
                                mindepth=opts.mindepth,
                                depth_index=depth_index,
                                nohet=nohet) for x in geno
            ]
            assert len(geno) == nind
            f = 1. / nind

            # Keep the marker only if both alleles reach `freq` and the
            # missing fraction stays within the threshold.
            if geno.count("A") * f < freq:
                continue
            if geno.count("B") * f < freq:
                continue
            if geno.count("-") * f > opts.missing_threshold:
                continue

            genotype = [marker] + geno
            genotypes.append(genotype)

    mm = MSTMatrix(genotypes, mh, ptype, opts.missing_threshold)
    mm.write(opts.outfile, header=(not opts.noheader))