Ejemplo n.º 1
0
Archivo: fastq.py Proyecto: rrane/jcvi
def convert(args):
    """
    %prog convert in.fastq out.fastq

    illumina fastq quality encoding uses offset 64, and sanger uses 33. This
    script creates a new file with the correct encoding
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is part of the program's output and must stay verbatim.
    parser = OptionParser(convert.__doc__)
    parser.set_phred()
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    infastq, outfastq = args

    # Prefer the offset given on the command line; otherwise guess it from
    # the input file.
    phred = opts.phred or str(guessoffset([infastq]))

    # Offset "64" means illumina -> sanger; any other value is handled as
    # sanger -> illumina.
    if phred == "64":
        fin, fout = "illumina", "sanger"
    else:
        fin, fout = "sanger", "illumina"

    if infastq.endswith(".gz"):
        # Gzipped input is decompressed by zcat and piped into seqret's stdin.
        cmd = "zcat {0} | seqret fastq-{1}::stdin fastq-{2}::stdout".format(
            infastq, fin, fout
        )
    else:
        cmd = "seqret fastq-{0}::{1} fastq-{2}::stdout".format(fin, infastq, fout)

    # seqret writes to stdout; sh() is asked to send that to outfastq.
    sh(cmd, outfile=outfastq)

    return outfastq
Ejemplo n.º 2
0
Archivo: ca.py Proyecto: zjwang6/jcvi
def fastq(args):
    """
    %prog fastq fastqfile

    Convert reads formatted as FASTQ file, and convert to CA frg file.
    """
    # NOTE: the docstring above doubles as the OptionParser usage text, so it
    # is kept verbatim; explanatory notes live in comments instead.
    from jcvi.formats.fastq import guessoffset

    p = OptionParser(fastq.__doc__)
    p.add_option(
        "--outtie",
        dest="outtie",
        default=False,
        action="store_true",
        help="Are these outie reads?",
    )
    p.set_phred()
    p.set_size()

    opts, args = p.parse_args(args)

    if len(args) < 1:
        # print_help() returns None, and sys.exit(None) reports success.
        # Negate it so a usage error exits with a non-zero status.
        sys.exit(not p.print_help())

    fastqfiles = [get_abs_path(x) for x in args]
    size = opts.size
    outtie = opts.outtie
    if size > 1000 and (not outtie):
        logging.debug(
            "[warn] long insert size {0} but not outtie".format(size))

    # A non-zero insert size marks the library as mated.
    mated = size != 0
    # Library name: basename of the first input up to its first dot, with the
    # "_1_sequence" tag removed.
    libname = op.basename(args[0]).split(".")[0]
    libname = libname.replace("_1_sequence", "")

    frgfile = libname + ".frg"
    mean, sv = get_mean_sv(opts.size)

    cmd = "fastqToCA"
    cmd += " -libraryname {0} ".format(libname)
    # Unmated default: one "-reads" flag per input file.
    fastqs = " ".join("-reads {0}".format(x) for x in fastqfiles)
    if mated:
        assert len(args) in (
            1, 2), "you need one or two fastq files for mated library"
        # Mated libraries pass all files as one comma-separated "-mates" value.
        fastqs = "-mates {0}".format(",".join(fastqfiles))
        cmd += "-insertsize {0} {1} ".format(mean, sv)
    cmd += fastqs

    # Use the explicit offset when given, otherwise guess from the first file.
    offset = int(opts.phred) if opts.phred else guessoffset([fastqfiles[0]])
    illumina = offset == 64
    if illumina:
        cmd += " -type illumina"
    if outtie:
        cmd += " -outtie"

    # The command's stdout is captured as the .frg file.
    sh(cmd, outfile=frgfile)
Ejemplo n.º 3
0
def tophat(args):
    """
    %prog tophat folder reference

    Run tophat on a folder of reads.
    """
    # The docstring above is reused verbatim as the command-line usage text.
    from jcvi.apps.bowtie import check_index
    from jcvi.formats.fastq import guessoffset

    parser = OptionParser(tophat.__doc__)
    parser.add_option("--gtf", help="Reference annotation [default: %default]")
    parser.add_option(
        "--single", default=False, action="store_true", help="Single end mapping"
    )
    parser.add_option(
        "--intron", default=15000, type="int",
        help="Max intron size [default: %default]",
    )
    parser.add_option(
        "--dist", default=-50, type="int",
        help="Mate inner distance [default: %default]",
    )
    parser.add_option(
        "--stdev", default=50, type="int",
        help="Mate standard deviation [default: %default]",
    )
    parser.set_phred()
    parser.set_cpus()
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    # One read file per sample in single-end mode, two in paired-end mode.
    num = 1 if opts.single else 2
    folder, reference = args
    reference = check_index(reference)

    for reads, prefix in iter_project(folder, n=num):
        outdir = "{0}_tophat".format(prefix)
        outfile = op.join(outdir, "accepted_hits.bam")
        if op.exists(outfile):
            # An existing alignment means this sample was already processed.
            logging.debug("File `{0}` found. Skipping.".format(outfile))
            continue

        # Collect the command pieces and join them with single spaces.
        pieces = ["tophat -p {0}".format(opts.cpus)]
        if opts.gtf:
            pieces.append("-G {0}".format(opts.gtf))
        pieces.append("-o {0}".format(outdir))

        if num == 1:  # Single-end
            (first,) = reads
        else:  # Paired-end
            first, _second = reads
            pieces.append("--max-intron-length {0}".format(opts.intron))
            pieces.append("--mate-inner-dist {0}".format(opts.dist))
            pieces.append("--mate-std-dev {0}".format(opts.stdev))

        # The quality offset is guessed from the first read file unless given.
        phred = opts.phred or str(guessoffset([first]))
        if phred == "64":
            pieces.append("--phred64-quals")
        pieces.append("{0} {1}".format(reference, " ".join(reads)))

        sh(" ".join(pieces))
Ejemplo n.º 4
0
Archivo: ca.py Proyecto: arvin580/jcvi
def fastq(args):
    """
    %prog fastq fastqfile

    Convert reads formatted as FASTQ file, and convert to CA frg file.
    """
    # NOTE: the docstring above doubles as the OptionParser usage text, so it
    # is kept verbatim; explanatory notes live in comments instead.
    from jcvi.formats.fastq import guessoffset

    p = OptionParser(fastq.__doc__)
    p.add_option(
        "--outtie", dest="outtie", default=False, action="store_true", help="Are these outie reads? [default: %default]"
    )
    p.set_phred()
    p.set_size()

    opts, args = p.parse_args(args)

    if len(args) < 1:
        # print_help() returns None, and sys.exit(None) reports success.
        # Negate it so a usage error exits with a non-zero status.
        sys.exit(not p.print_help())

    fastqfiles = [get_abs_path(x) for x in args]
    size = opts.size
    outtie = opts.outtie
    if size > 1000 and (not outtie):
        logging.debug("[warn] long insert size {0} but not outtie".format(size))

    # A non-zero insert size marks the library as mated.
    mated = size != 0
    # Library name: basename of the first input up to its first dot, with the
    # "_1_sequence" tag removed.
    libname = op.basename(args[0]).split(".")[0]
    libname = libname.replace("_1_sequence", "")

    frgfile = libname + ".frg"
    mean, sv = get_mean_sv(opts.size)

    cmd = "fastqToCA"
    cmd += " -libraryname {0} ".format(libname)
    # Unmated default: one "-reads" flag per input file.
    fastqs = " ".join("-reads {0}".format(x) for x in fastqfiles)
    if mated:
        assert len(args) in (1, 2), "you need one or two fastq files for mated library"
        # Mated libraries pass all files as one comma-separated "-mates" value.
        fastqs = "-mates {0}".format(",".join(fastqfiles))
        cmd += "-insertsize {0} {1} ".format(mean, sv)
    cmd += fastqs

    # Use the explicit offset when given, otherwise guess from the first file.
    offset = int(opts.phred) if opts.phred else guessoffset([fastqfiles[0]])
    illumina = offset == 64
    if illumina:
        cmd += " -type illumina"
    if outtie:
        cmd += " -outtie"

    # The command's stdout is captured as the .frg file.
    sh(cmd, outfile=frgfile)
Ejemplo n.º 5
0
def convert(args):
    """
    %prog convert in.fastq

    illumina fastq quality encoding uses offset 64, and sanger uses 33. This
    script creates a new file with the correct encoding. Output gzipped file if
    input is also gzipped.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is part of the program's output and must stay verbatim.
    parser = OptionParser(convert.__doc__)
    parser.set_phred()
    opts, args = parser.parse_args(args)

    if len(args) != 1:
        sys.exit(not parser.print_help())

    (infastq,) = args
    phred = opts.phred or str(guessoffset([infastq]))
    # Target offset is the opposite of the source offset; any value other than
    # "33"/"64" raises KeyError here.
    ophred = {"64": "33", "33": "64"}[phred]

    # Output name: <stem>.q<target offset>.<ext>, plus ".gz" when the input is
    # gzipped. The ".gz" suffix is set aside before splitting off the extension.
    is_gz = infastq.endswith(".gz")
    base = infastq[:-3] if is_gz else infastq
    pf, sf = base.rsplit(".", 1)
    outfastq = "{0}.q{1}.{2}{3}".format(pf, ophred, sf, ".gz" if is_gz else "")

    if phred == "64":
        fin, fout = "illumina", "sanger"
    else:
        fin, fout = "sanger", "illumina"

    if is_gz:
        # Gzipped input is decompressed by zcat and piped into seqret's stdin.
        cmd = "zcat {0} | seqret fastq-{1}::stdin fastq-{2}::stdout".format(
            infastq, fin, fout
        )
    else:
        cmd = "seqret fastq-{0}::{1} fastq-{2}::stdout".format(fin, infastq, fout)

    sh(cmd, outfile=outfastq)

    return outfastq
Ejemplo n.º 6
0
Archivo: fastq.py Proyecto: yangjl/jcvi
def convert(args):
    """
    %prog convert in.fastq

    illumina fastq quality encoding uses offset 64, and sanger uses 33. This
    script creates a new file with the correct encoding. Output gzipped file if
    input is also gzipped.
    """
    # NOTE: the docstring above is the usage text given to OptionParser; it is
    # runtime output and therefore left untouched.
    parser = OptionParser(convert.__doc__)
    parser.set_phred()
    opts, args = parser.parse_args(args)

    if len(args) != 1:
        sys.exit(not parser.print_help())

    (infastq,) = args
    phred = opts.phred or str(guessoffset([infastq]))
    # Flip the offset; only "33" and "64" are accepted (KeyError otherwise).
    ophred = {"64": "33", "33": "64"}[phred]

    compressed = infastq.endswith(".gz")
    # Set the ".gz" suffix aside, then split "<stem>.<ext>".
    uncompressed_name = infastq[:-3] if compressed else infastq
    pf, sf = uncompressed_name.rsplit(".", 1)
    name_parts = [pf, "q" + ophred, sf]
    if compressed:
        name_parts.append("gz")
    outfastq = ".".join(name_parts)

    source_is_illumina = phred == "64"
    fin = "illumina" if source_is_illumina else "sanger"
    fout = "sanger" if source_is_illumina else "illumina"

    if compressed:
        # zcat feeds the decompressed reads to seqret through stdin.
        cmd = "zcat {0} | seqret fastq-{1}::stdin fastq-{2}::stdout".format(
            infastq, fin, fout
        )
    else:
        cmd = "seqret fastq-{0}::{1} fastq-{2}::stdout".format(fin, infastq, fout)

    sh(cmd, outfile=outfastq)

    return outfastq
Ejemplo n.º 7
0
Archivo: snp.py Proyecto: Hensonmw/jcvi
def gatk(args):
    """
    %prog gatk bamfile reference.fasta

    Call SNPs based on GATK best practices.
    """
    # The docstring above is passed to OptionParser as usage text; keep verbatim.
    parser = OptionParser(gatk.__doc__)
    parser.add_option(
        "--indelrealign", default=False, action="store_true",
        help="Perform indel realignment",
    )
    parser.set_home("gatk")
    parser.set_home("picard")
    parser.set_phred()
    parser.set_cpus(cpus=24)
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    bamfile, ref = args
    pf = bamfile.rsplit(".", 1)[0]
    mm = MakeManager()
    picard = "java -Xmx32g -jar {0}/picard.jar".format(opts.picard_home)
    tk = "java -Xmx32g -jar {0}/GenomeAnalysisTK.jar -R {1}".format(
        opts.gatk_home, ref
    )

    # Step 0 - build reference (sequence dictionary + faidx index)
    dictfile = ref.rsplit(".", 1)[0] + ".dict"
    dict_cmd = "{0} CreateSequenceDictionary R={1} O={2}".format(
        picard, ref, dictfile
    )
    faidx_cmd = "samtools faidx {0}".format(ref)
    mm.add(ref, dictfile, (dict_cmd, faidx_cmd))

    # Step 1 - sort bam
    sortedbamfile = pf + ".sorted.bam"
    sort_cmd = " ".join((
        picard,
        "SortSam",
        "INPUT={0} OUTPUT={1}".format(bamfile, sortedbamfile),
        "SORT_ORDER=coordinate CREATE_INDEX=true",
    ))
    mm.add(bamfile, sortedbamfile, sort_cmd)

    # Step 2 - mark duplicates
    dedupbamfile = pf + ".dedup.bam"
    dedup_cmd = " ".join((
        picard,
        "MarkDuplicates",
        "INPUT={0} OUTPUT={1}".format(sortedbamfile, dedupbamfile),
        "METRICS_FILE=dedup.log CREATE_INDEX=true",
    ))
    mm.add(sortedbamfile, dedupbamfile, dedup_cmd)

    # Without --indelrealign the deduplicated BAM goes straight to the caller.
    realignedbamfile = dedupbamfile
    if opts.indelrealign:
        # Step 3 - create indel realignment targets
        intervals = pf + ".intervals"
        target_cmd = " ".join((
            tk,
            "-T RealignerTargetCreator",
            "-I {0} -o {1}".format(dedupbamfile, intervals),
        ))
        mm.add(dedupbamfile, intervals, target_cmd)

        # Step 4 - indel realignment
        realignedbamfile = pf + ".realigned.bam"
        realign_cmd = " ".join((
            tk,
            "-T IndelRealigner",
            "-targetIntervals {0}".format(intervals),
            "-I {0} -o {1}".format(dedupbamfile, realignedbamfile),
        ))
        mm.add((dictfile, intervals), realignedbamfile, realign_cmd)

    # Step 5 - SNP calling
    vcf = pf + ".vcf"
    call_parts = [
        tk,
        "-T HaplotypeCaller",
        "-I {0}".format(realignedbamfile),
        "--genotyping_mode DISCOVERY",
        "-stand_emit_conf 10 -stand_call_conf 30",
        "-nct {0}".format(opts.cpus),
        "-o {0}".format(vcf),
    ]
    if opts.phred == "64":
        call_parts.append("--fix_misencoded_quality_scores")
    mm.add(realignedbamfile, vcf, " ".join(call_parts))

    # Step 6 - SNP filtering
    filtered_vcf = pf + ".filtered.vcf"
    filter_cmd = " ".join((
        tk,
        "-T VariantFiltration",
        "-V {0}".format(vcf),
        '--filterExpression "DP < 10 || DP > 300 || QD < 2.0 || FS > 60.0 || MQ < 40.0"',
        '--filterName "LOWQUAL"',
        '--genotypeFilterExpression "isHomVar == 1"',
        '--genotypeFilterName "HOMOVAR"',
        '--genotypeFilterExpression "isHet == 1"',
        '--genotypeFilterName "HET"',
        "-o {0}".format(filtered_vcf),
    ))
    mm.add(vcf, filtered_vcf, filter_cmd)

    mm.write()
Ejemplo n.º 8
0
def tophat(args):
    """
    %prog tophat folder reference

    Run tophat on a folder of reads.
    """
    # The docstring above is reused verbatim as the command-line usage text.
    from jcvi.apps.bowtie import check_index
    from jcvi.formats.fastq import guessoffset

    parser = OptionParser(tophat.__doc__)
    parser.add_option("--gtf", help="Reference annotation [default: %default]")
    parser.add_option("--single", default=False, action="store_true",
                      help="Single end mapping")
    parser.add_option("--intron", default=15000, type="int",
                      help="Max intron size [default: %default]")
    parser.add_option("--dist", default=-50, type="int",
                      help="Mate inner distance [default: %default]")
    parser.add_option("--stdev", default=50, type="int",
                      help="Mate standard deviation [default: %default]")
    parser.set_phred()
    parser.set_cpus()
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    # One read file per sample in single-end mode, two in paired-end mode.
    num = 1 if opts.single else 2
    folder, reference = args
    reference = check_index(reference)

    for readfiles, prefix in iter_project(folder, n=num):
        outdir = "{0}_tophat".format(prefix)
        outfile = op.join(outdir, "accepted_hits.bam")
        if op.exists(outfile):
            # Output already present: skip this sample.
            logging.debug("File `{0}` found. Skipping.".format(outfile))
            continue

        # Each flag group is appended as one piece; pieces are space-joined.
        flags = ["tophat -p {0}".format(opts.cpus)]
        if opts.gtf:
            flags.append("-G {0}".format(opts.gtf))
        flags.append("-o {0}".format(outdir))

        if num == 1:  # Single-end
            (first,) = readfiles
        else:  # Paired-end
            first, _mate = readfiles
            flags.extend((
                "--max-intron-length {0}".format(opts.intron),
                "--mate-inner-dist {0}".format(opts.dist),
                "--mate-std-dev {0}".format(opts.stdev),
            ))

        # The quality offset is guessed from the first read file unless given.
        phred = opts.phred or str(guessoffset([first]))
        if phred == "64":
            flags.append("--phred64-quals")
        flags.append("{0} {1}".format(reference, " ".join(readfiles)))

        sh(" ".join(flags))
Ejemplo n.º 9
0
def gatk(args):
    """
    %prog gatk bamfile reference.fasta

    Call SNPs based on GATK best practices.
    """
    # The docstring above is passed to OptionParser as usage text; keep verbatim.
    parser = OptionParser(gatk.__doc__)
    parser.add_option(
        "--indelrealign", default=False, action="store_true",
        help="Perform indel realignment",
    )
    parser.set_home("gatk")
    parser.set_home("picard")
    parser.set_phred()
    parser.set_cpus(cpus=24)
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    bamfile, ref = args
    pf = bamfile.rsplit(".", 1)[0]
    mm = MakeManager()
    picard = "java -Xmx32g -jar {0}/picard.jar".format(opts.picard_home)
    tk = "java -Xmx32g -jar {0}/GenomeAnalysisTK.jar -R {1}".format(
        opts.gatk_home, ref
    )

    # Step 0 - build reference (sequence dictionary + faidx index)
    dictfile = ref.rsplit(".", 1)[0] + ".dict"
    dict_cmd = "{0} CreateSequenceDictionary R={1} O={2}".format(
        picard, ref, dictfile
    )
    faidx_cmd = "samtools faidx {0}".format(ref)
    mm.add(ref, dictfile, (dict_cmd, faidx_cmd))

    # Step 1 - sort bam
    sortedbamfile = pf + ".sorted.bam"
    sort_cmd = " ".join((
        picard,
        "SortSam",
        "INPUT={0} OUTPUT={1}".format(bamfile, sortedbamfile),
        "SORT_ORDER=coordinate CREATE_INDEX=true",
    ))
    mm.add(bamfile, sortedbamfile, sort_cmd)

    # Step 2 - mark duplicates
    dedupbamfile = pf + ".dedup.bam"
    dedup_cmd = " ".join((
        picard,
        "MarkDuplicates",
        "INPUT={0} OUTPUT={1}".format(sortedbamfile, dedupbamfile),
        "METRICS_FILE=dedup.log CREATE_INDEX=true",
    ))
    mm.add(sortedbamfile, dedupbamfile, dedup_cmd)

    # Without --indelrealign the deduplicated BAM goes straight to the caller.
    realignedbamfile = dedupbamfile
    if opts.indelrealign:
        # Step 3 - create indel realignment targets
        intervals = pf + ".intervals"
        target_cmd = " ".join((
            tk,
            "-T RealignerTargetCreator",
            "-I {0} -o {1}".format(dedupbamfile, intervals),
        ))
        mm.add(dedupbamfile, intervals, target_cmd)

        # Step 4 - indel realignment
        realignedbamfile = pf + ".realigned.bam"
        realign_cmd = " ".join((
            tk,
            "-T IndelRealigner",
            "-targetIntervals {0}".format(intervals),
            "-I {0} -o {1}".format(dedupbamfile, realignedbamfile),
        ))
        mm.add((dictfile, intervals), realignedbamfile, realign_cmd)

    # Step 5 - SNP calling
    vcf = pf + ".vcf"
    call_parts = [
        tk,
        "-T HaplotypeCaller",
        "-I {0}".format(realignedbamfile),
        "--genotyping_mode DISCOVERY",
        "-stand_emit_conf 10 -stand_call_conf 30",
        "-nct {0}".format(opts.cpus),
        "-o {0}".format(vcf),
    ]
    if opts.phred == "64":
        call_parts.append("--fix_misencoded_quality_scores")
    mm.add(realignedbamfile, vcf, " ".join(call_parts))

    # Step 6 - SNP filtering
    filtered_vcf = pf + ".filtered.vcf"
    filter_cmd = " ".join((
        tk,
        "-T VariantFiltration",
        "-V {0}".format(vcf),
        '--filterExpression "DP < 10 || DP > 300 || QD < 2.0 || FS > 60.0 || MQ < 40.0"',
        '--filterName "LOWQUAL"',
        '--genotypeFilterExpression "isHomVar == 1"',
        '--genotypeFilterName "HOMOVAR"',
        '--genotypeFilterExpression "isHet == 1"',
        '--genotypeFilterName "HET"',
        "-o {0}".format(filtered_vcf),
    ))
    mm.add(vcf, filtered_vcf, filter_cmd)

    mm.write()
Ejemplo n.º 10
0
def trim(args):
    """
    %prog trim fastqfiles

    Trim reads using TRIMMOMATIC. If two fastqfiles are given, then it invokes
    the paired reads mode. See manual:

    <http://www.usadellab.org/cms/index.php?page=trimmomatic>
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is kept verbatim; explanatory notes live in comments instead.
    tv = "0.32"
    TrimJar = "trimmomatic-{0}.jar".format(tv)
    p = OptionParser(trim.__doc__)
    p.add_option("--path", default=op.join("~/bin", TrimJar),
            help="Path to trimmomatic jar file [default: %default]")
    p.set_phred()
    p.add_option("--nofrags", default=False, action="store_true",
            help="Discard frags file in PE mode [default: %default]")
    p.add_option("--minqv", default=15, type="int",
            help="Average qv after trimming [default: %default]")
    p.add_option("--minlen", default=36, type="int",
            help="Minimum length after trimming [default: %default]")
    p.add_option("--adapteronly", default=False, action="store_true",
            help="Only trim adapters with no qv trimming [default: %default]")
    p.add_option("--nogz", default=False, action="store_true",
            help="Do not write to gzipped files [default: %default]")
    p.add_option("--log", default=None, dest="trimlog",
            help="Specify a `trimlog` file [default: %default]")
    p.set_cpus(cpus=4)
    opts, args = p.parse_args(args)

    if len(args) not in (1, 2):
        sys.exit(not p.print_help())

    path = op.expanduser(opts.path)
    url = \
    "http://www.usadellab.org/cms/uploads/supplementary/Trimmomatic/Trimmomatic-{0}.zip"\
    .format(tv)

    # Fall back to downloading and unpacking the release when the jar is not
    # at the requested location.
    if not op.exists(path):
        path = download(url)
        TrimUnzipped = "Trimmomatic-" + tv
        if not op.exists(TrimUnzipped):
            sh("unzip " + path)
        os.remove(path)
        path = op.join(TrimUnzipped, TrimJar)

    assert op.exists(path), \
        "Couldn't find Trimmomatic jar file at `{0}`".\
        format(path)

    # Copy the bundled adapter sequences into the working directory, where the
    # ILLUMINACLIP step below refers to them by relative name.
    adaptersfile = "adapters.fasta"
    Adapters = must_open(op.join(datadir, adaptersfile)).read()
    write_file(adaptersfile, Adapters, skipcheck=True)

    assert op.exists(adaptersfile), \
        "Please place the illumina adapter sequence in `{0}`".\
        format(adaptersfile)

    # Use the explicit offset when given, otherwise guess from the first file.
    if opts.phred is None:
        offset = guessoffset([args[0]])
    else:
        offset = int(opts.phred)

    frags = ".frags.fastq"
    pairs = ".pairs.fastq"
    if not opts.nogz:
        frags += ".gz"
        pairs += ".gz"

    def get_prefix(x):
        # Basename without a trailing ".gz" and without the last extension.
        # Only a trailing ".gz" is removed: str.replace would also delete
        # ".gz" occurrences in the middle of the name and corrupt the prefix.
        name = op.basename(x)
        if name.endswith(".gz"):
            name = name[:-len(".gz")]
        return name.rsplit(".", 1)[0]

    def get_dirname(x):
        # Directory part with a trailing slash, or '' for a bare filename.
        dirname = op.dirname(x)
        return "{0}/".format(dirname) if dirname else ''

    # Flags shared by single-end (SE) and paired-end (PE) modes.
    cmd = "java -Xmx4g -jar {0}".format(path)
    cmd += " SE" if len(args) == 1 else " PE"
    cmd += " -phred{0}".format(offset)
    cmd += " -threads {0}".format(opts.cpus)
    if opts.trimlog:
        cmd += " -trimlog {0}".format(opts.trimlog)

    if len(args) == 1:
        fastqfile, = args
        prefix = get_prefix(fastqfile)
        dirname = get_dirname(fastqfile)
        frags1 = dirname + prefix + frags
        cmd += " {0}".format(" ".join((fastqfile, frags1)))
    else:
        fastqfile1, fastqfile2 = args
        prefix1 = get_prefix(fastqfile1)
        dirname1 = get_dirname(fastqfile1)
        prefix2 = get_prefix(fastqfile2)
        dirname2 = get_dirname(fastqfile2)
        pairs1 = dirname1 + prefix1 + pairs
        pairs2 = dirname2 + prefix2 + pairs
        frags1 = dirname1 + prefix1 + frags
        frags2 = dirname2 + prefix2 + frags
        if opts.nofrags:
            # Unpaired survivors are discarded.
            frags1 = "/dev/null"
            frags2 = "/dev/null"
        cmd += " {0}".format(" ".join((fastqfile1, fastqfile2, \
                pairs1, frags1, pairs2, frags2)))

    cmd += " ILLUMINACLIP:{0}:2:30:10".format(adaptersfile)

    if not opts.adapteronly:
        cmd += " LEADING:3 TRAILING:3"
        cmd += " SLIDINGWINDOW:4:{0}".format(opts.minqv)

    cmd += " MINLEN:{0}".format(opts.minlen)

    # Output is normalised to offset 33 whenever the input uses another offset.
    if offset != 33:
        cmd += " TOPHRED33"
    sh(cmd)