Beispiel #1
0
def _atropos_trim(fastq_files, adapters, out_dir, data):
    """Perform multicore trimming with atropos.

    Builds one ``atropos trim`` shell command and runs it through ``do.run``
    inside a file transaction. Nothing is run when the first trimmed output
    already exists.

    Args:
        fastq_files: one (single end) or two (paired end) input fastq files.
        adapters: adapter sequences, passed as ``-a`` and, for paired end
            input, additionally as ``-A``.
        out_dir: directory for the trimmed fastq files and the JSON report.
        data: sample dictionary, read through ``dd`` accessors and
            ``data["config"]``.

    Returns:
        Tuple of (list of trimmed fastq paths, JSON report path).
    """
    report_file = os.path.join(
        out_dir, "%s-report.json" %
        utils.splitext_plus(os.path.basename(fastq_files[0]))[0])
    out_files = [
        os.path.join(
            out_dir,
            "%s-trimmed.fq.gz" % utils.splitext_plus(os.path.basename(x))[0])
        for x in fastq_files
    ]
    if not utils.file_exists(out_files[0]):
        with file_transaction(data, *[report_file] + out_files) as tx_out:
            tx_report_file, tx_out1 = tx_out[:2]
            if len(tx_out) > 2:
                tx_out2 = tx_out[2]
            cores = dd.get_num_cores(data)
            sample_name = dd.get_sample_name(data)
            adapters_args = " ".join(["-a %s" % a for a in adapters])
            if len(fastq_files) == 1:
                input_args = "-se %s" % objectstore.cl_input(fastq_files[0])
                # The thread count used to be a "%s" that str.format never
                # filled in, so bgzip received a literal "%s".
                output_args = "-o >(bgzip --threads {cores} -c > {tx_out1})".format(
                    **locals())
            else:
                assert len(fastq_files) == 2, fastq_files
                adapters_args = adapters_args + " " + " ".join(
                    ["-A %s" % a for a in adapters])
                input_args = "-pe1 %s -pe2 %s" % tuple(
                    [objectstore.cl_input(x) for x in fastq_files])
                output_args = "-o >(bgzip -c > {tx_out1}) -p >(bgzip -c > {tx_out2})".format(
                    **locals())
            adapters_args += " --no-default-adapters"  # Prevent GitHub queries
            quality_base = "64" if dd.get_quality_format(
                data).lower() == "illumina" else "33"
            report_args = "--report-file %s --report-formats json --sample-id %s" % (
                tx_report_file, sample_name)
            # User supplied options from the "atropos" resources section
            ropts = " ".join(
                str(x) for x in config_utils.get_resources(
                    "atropos", data["config"]).get("options", []))
            # Add defaults only for options the user has not set, checking
            # both the long name and any short aliases
            extra_opts = []
            for k, alt_ks, v in [("--quality-cutoff", ["-q "], "5"),
                                 ("--minimum-length", ["-m "],
                                  str(dd.get_min_read_length(data))),
                                 ("--nextseq-trim", [], "25")]:
                if k not in ropts and not any(alt_k in ropts
                                              for alt_k in alt_ks):
                    extra_opts.append("%s=%s" % (k, v))
            extra_opts = " ".join(extra_opts)
            thread_args = ("--threads %s" % cores if cores > 1 else "")
            cmd = (
                "atropos trim {ropts} {thread_args} --quality-base {quality_base} --format fastq "
                "{adapters_args} {input_args} {output_args} {report_args} {extra_opts}"
            )
            do.run(cmd.format(**locals()),
                   "Trimming with atropos: %s" % sample_name)
    return out_files, report_file
Beispiel #2
0
def _atropos_trim(fastq_files, adapters, out_dir, data):
    """Perform multicore trimming with atropos.

    Builds one ``atropos trim`` shell command and runs it through ``do.run``
    inside a file transaction. Nothing is run when the first trimmed output
    already exists.

    Args:
        fastq_files: one (single end) or two (paired end) input fastq files.
        adapters: adapter sequences, passed as ``-a`` and, for paired end
            input, additionally as ``-A``.
        out_dir: directory for the trimmed fastq files and the JSON report.
        data: sample dictionary, read through ``dd`` accessors and
            ``data["config"]``.

    Returns:
        Tuple of (list of trimmed fastq paths, JSON report path).
    """
    report_file = os.path.join(
        out_dir, "%s-report.json" %
        utils.splitext_plus(os.path.basename(fastq_files[0]))[0])
    out_files = [
        os.path.join(
            out_dir,
            "%s-trimmed.fq.gz" % utils.splitext_plus(os.path.basename(x))[0])
        for x in fastq_files
    ]
    if not utils.file_exists(out_files[0]):
        with file_transaction(data, *[report_file] + out_files) as tx_out:
            tx_report_file, tx_out1 = tx_out[:2]
            if len(tx_out) > 2:
                tx_out2 = tx_out[2]
            cores = dd.get_num_cores(data)
            sample_name = dd.get_sample_name(data)
            adapters_args = " ".join(["-a %s" % a for a in adapters])
            aligner_args = "--aligner adapter"
            if len(fastq_files) == 1:
                input_args = "-se %s" % objectstore.cl_input(fastq_files[0])
                # The thread count used to be a "%s" that str.format never
                # filled in, so bgzip received a literal "%s".
                output_args = "-o >(bgzip --threads {cores} -c > {tx_out1})".format(
                    **locals())
            else:
                assert len(fastq_files) == 2, fastq_files
                # Paired end input with one or two adapters switches to the
                # insert aligner
                if adapters and len(adapters) <= 2:
                    aligner_args = "--aligner insert"
                adapters_args = adapters_args + " " + " ".join(
                    ["-A %s" % a for a in adapters])
                input_args = "-pe1 %s -pe2 %s" % tuple(
                    [objectstore.cl_input(x) for x in fastq_files])
                output_args = "-o >(bgzip -c > {tx_out1}) -p >(bgzip -c > {tx_out2})".format(
                    **locals())
            quality_base = "64" if dd.get_quality_format(
                data).lower() == "illumina" else "33"
            report_args = "--report-file %s --report-formats json --sample-id %s" % (
                tx_report_file, sample_name)
            # User supplied options from the "atropos" resources section
            ropts = " ".join(
                str(x) for x in config_utils.get_resources(
                    "atropos", data["config"]).get("options", []))
            thread_args = ("--threads %s" % cores if cores > 1 else "")
            cmd = (
                "atropos trim {ropts} {thread_args} --quality-base {quality_base} --format fastq "
                "{adapters_args} {aligner_args} {input_args} {output_args} {report_args}"
            )
            cmd += " --quality-cutoff=5 --minimum-length=%s" % dd.get_min_read_length(
                data)
            do.run(cmd.format(**locals()),
                   "Trimming with atropos: %s" % sample_name)
    return out_files, report_file
Beispiel #3
0
def _atropos_trim(fastq_files, adapters, out_dir, data):
    """Perform multicore trimming with atropos.

    Builds one ``atropos trim`` shell command and runs it through ``do.run``
    inside a file transaction. Nothing is run when the first trimmed output
    already exists.

    Args:
        fastq_files: one (single end) or two (paired end) input fastq files.
        adapters: adapter sequences, passed as ``-a`` and, for paired end
            input, additionally as ``-A``. The caller's list is not modified.
        out_dir: directory for the trimmed fastq files and the JSON report.
        data: sample dictionary, read through ``dd`` accessors and
            ``data["config"]``.

    Returns:
        Tuple of (list of trimmed fastq paths, JSON report path).
    """
    report_file = os.path.join(out_dir, "%s-report.json" % utils.splitext_plus(os.path.basename(fastq_files[0]))[0])
    out_files = [os.path.join(out_dir, "%s-trimmed.fq.gz" % utils.splitext_plus(os.path.basename(x))[0])
                 for x in fastq_files]
    if not utils.file_exists(out_files[0]):
        with file_transaction(data, *[report_file] + out_files) as tx_out:
            tx_report_file, tx_out1 = tx_out[:2]
            if len(tx_out) > 2:
                tx_out2 = tx_out[2]
            adapter_names = dd.get_adapters(data)
            sample_name = dd.get_sample_name(data)
            # polyX trimming, anchored to the 3' ends of reads.
            # Build a new list instead of "+=": in-place extension changed the
            # caller's list, so repeated calls kept appending polyX patterns.
            if "polyx" in adapter_names:
                adapters = list(adapters) + ["A{200}$", "C{200}$", "G{200}$", "T{200}$"]
            adapters_args = " ".join(["-a '%s'" % a for a in adapters])
            adapters_args += " --overlap 8"  # Avoid very short internal matches (default is 3)
            adapters_args += " --no-default-adapters --no-cache-adapters"  # Prevent GitHub queries and saving pickles
            if len(fastq_files) == 1:
                cores = dd.get_num_cores(data)
                input_args = "-se %s" % objectstore.cl_input(fastq_files[0])
                output_args = "-o >(bgzip --threads {cores} -c > {tx_out1})".format(**locals())
            else:
                assert len(fastq_files) == 2, fastq_files
                # Two bgzip writers run side by side, so each gets half the cores
                cores = max(1, dd.get_num_cores(data) // 2)
                adapters_args = adapters_args + " " + " ".join(["-A '%s'" % a for a in adapters])
                input_args = "-pe1 %s -pe2 %s" % tuple([objectstore.cl_input(x) for x in fastq_files])
                output_args = ("-o >(bgzip --threads {cores} -c > {tx_out1}) "
                               "-p >(bgzip --threads {cores} -c > {tx_out2})").format(**locals())
            quality_base = "64" if dd.get_quality_format(data).lower() == "illumina" else "33"
            report_args = "--report-file %s --report-formats json --sample-id %s" % (tx_report_file,
                                                                                     sample_name)
            # User supplied options from the "atropos" resources section
            ropts = " ".join(str(x) for x in
                             config_utils.get_resources("atropos", data["config"]).get("options", []))
            # Add defaults only when wanted and not already set by the user,
            # checking both the long name and any short aliases
            extra_opts = []
            for k, alt_ks, v, want in [("--quality-cutoff", ["-q "], "5", True),
                                       ("--minimum-length", ["-m "], str(dd.get_min_read_length(data)), True),
                                       ("--nextseq-trim", [], "25", ("polyx" in adapter_names or
                                                                     "polyg" in adapter_names))]:
                if k not in ropts and not any(alt_k in ropts for alt_k in alt_ks):
                    if want:
                        extra_opts.append("%s=%s" % (k, v))
            extra_opts = " ".join(extra_opts)
            thread_args = ("--threads %s" % cores if cores > 1 else "")
            cmd = ("atropos trim {ropts} {thread_args} --quality-base {quality_base} --format fastq "
                   "{adapters_args} {input_args} {output_args} {report_args} {extra_opts}")
            do.run(cmd.format(**locals()), "Trimming with atropos: %s" % sample_name)
    return out_files, report_file
Beispiel #4
0
def _atropos_trim(fastq_files, adapters, out_dir, data):
    """Perform multicore trimming with atropos.

    Builds one ``atropos trim`` shell command and runs it through ``do.run``
    inside a file transaction. Nothing is run when the first trimmed output
    already exists.

    Args:
        fastq_files: one (single end) or two (paired end) input fastq files.
        adapters: adapter sequences, passed as ``-a`` and, for paired end
            input, additionally as ``-A``. The caller's list is not modified.
        out_dir: directory for the trimmed fastq files and the JSON report.
        data: sample dictionary, read through ``dd`` accessors and
            ``data["config"]``.

    Returns:
        Tuple of (list of trimmed fastq paths, JSON report path).
    """
    report_file = os.path.join(out_dir, "%s-report.json" % utils.splitext_plus(os.path.basename(fastq_files[0]))[0])
    out_files = [os.path.join(out_dir, "%s-trimmed.fq.gz" % utils.splitext_plus(os.path.basename(x))[0])
                 for x in fastq_files]
    if not utils.file_exists(out_files[0]):
        with file_transaction(data, *[report_file] + out_files) as tx_out:
            tx_report_file, tx_out1 = tx_out[:2]
            if len(tx_out) > 2:
                tx_out2 = tx_out[2]
            adapter_names = dd.get_adapters(data)
            sample_name = dd.get_sample_name(data)
            # polyX trimming: homopolymer patterns are added as extra adapters.
            # Build a new list instead of "+=": in-place extension changed the
            # caller's list, so repeated calls kept appending polyX patterns.
            if "polyx" in adapter_names:
                adapters = list(adapters) + ["A{200}", "C{200}", "G{200}", "T{200}"]
            adapters_args = " ".join(["-a '%s'" % a for a in adapters])
            adapters_args += " --overlap 8"  # Avoid very short internal matches (default is 3)
            adapters_args += " --no-default-adapters --no-cache-adapters"  # Prevent GitHub queries and saving pickles
            if len(fastq_files) == 1:
                cores = dd.get_num_cores(data)
                input_args = "-se %s" % objectstore.cl_input(fastq_files[0])
                output_args = "-o >(bgzip --threads {cores} -c > {tx_out1})".format(**locals())
            else:
                assert len(fastq_files) == 2, fastq_files
                # Two bgzip writers run side by side, so each gets half the cores
                cores = max(1, dd.get_num_cores(data) // 2)
                adapters_args = adapters_args + " " + " ".join(["-A '%s'" % a for a in adapters])
                input_args = "-pe1 %s -pe2 %s" % tuple([objectstore.cl_input(x) for x in fastq_files])
                output_args = ("-o >(bgzip --threads {cores} -c > {tx_out1}) "
                               "-p >(bgzip --threads {cores} -c > {tx_out2})").format(**locals())
            quality_base = "64" if dd.get_quality_format(data).lower() == "illumina" else "33"
            report_args = "--report-file %s --report-formats json --sample-id %s" % (tx_report_file,
                                                                                     sample_name)
            # User supplied options from the "atropos" resources section
            ropts = " ".join(str(x) for x in
                             config_utils.get_resources("atropos", data["config"]).get("options", []))
            # Add defaults only when wanted and not already set by the user,
            # checking both the long name and any short aliases
            extra_opts = []
            for k, alt_ks, v, want in [("--quality-cutoff", ["-q "], "5", True),
                                       ("--minimum-length", ["-m "], str(dd.get_min_read_length(data)), True),
                                       ("--nextseq-trim", [], "25", ("polyx" in adapter_names or
                                                                     "polyg" in adapter_names))]:
                if k not in ropts and not any(alt_k in ropts for alt_k in alt_ks):
                    if want:
                        extra_opts.append("%s=%s" % (k, v))
            extra_opts = " ".join(extra_opts)
            thread_args = ("--threads %s" % cores if cores > 1 else "")
            cmd = ("atropos trim {ropts} {thread_args} --quality-base {quality_base} --format fastq "
                   "{adapters_args} {input_args} {output_args} {report_args} {extra_opts}")
            do.run(cmd.format(**locals()), "Trimming with atropos: %s" % sample_name)
    return out_files, report_file
Beispiel #5
0
def _can_use_mem(fastq_file, data):
    """bwa-mem handle longer (> 70bp) reads with improved piping.

    Randomly samples 5000 reads from the first two million (8,000,000 fastq
    lines) and tallies their lengths through a shell pipeline.
    Default to no piping if more than 75% of the sampled reads are small.

    Args:
        fastq_file: input fastq, optionally gzipped (``.gz`` extension).
        data: sample dictionary; ``data["config"]`` is used to locate seqtk.

    Returns:
        True when the short-read fraction is at or below the threshold.

    Raises:
        IOError: if the sampling pipeline produces no output.
    """
    min_size = 70
    thresh = 0.75
    head_count = 8000000
    tocheck = 5000
    seqtk = config_utils.get_program("seqtk", data["config"])
    fastq_file = objectstore.cl_input(fastq_file)
    gzip_cmd = "zcat {fastq_file}" if fastq_file.endswith(".gz") else "cat {fastq_file}"
    cmd = (
        gzip_cmd + " | head -n {head_count} | "
        "{seqtk} sample -s42 - {tocheck} | "
        "awk '{{if(NR%4==2) print length($1)}}' | sort | uniq -c"
    )
    full_cmd = cmd.format(**locals())
    # Close the /dev/null handle once the command finishes; it used to leak.
    with open("/dev/null", "w") as devnull:
        count_out = subprocess.check_output(
            full_cmd, shell=True, executable="/bin/bash", stderr=devnull
        )
    # check_output returns bytes on Python 3; decode before splitting on "\n".
    count_out = count_out.decode()
    if not count_out.strip():
        raise IOError("Failed to check fastq file sizes with: %s" % full_cmd)
    shorter = 0
    for count, size in (l.strip().split() for l in count_out.strip().split("\n")):
        if int(size) < min_size:
            shorter += int(count)
    # NOTE(review): the fraction is relative to the requested sample size, not
    # the number of reads actually sampled -- confirm this is intended for
    # inputs with fewer than ``tocheck`` reads.
    return (float(shorter) / float(tocheck)) <= thresh
Beispiel #6
0
def _cram_to_fastq_region(cram_file, work_dir, base_name, region, data):
    """Convert CRAM to fastq in a specified region.

    Runs ``bamtofastq`` on the CRAM input inside a file transaction, writing
    single (s1), paired (p1/p2) and orphan (o1/o2) gzipped fastq files.
    Nothing is run when the first paired output already exists.

    Args:
        cram_file: input CRAM file.
        work_dir: directory for the output fastq files.
        base_name: prefix for output file names.
        region: region string passed to ``ranges=``; falsy converts everything.
        data: sample dictionary providing the reference and configuration.

    Returns:
        A single-item list holding ``[paired 1, paired 2, single]`` paths.
    """
    ref_file = tz.get_in(["reference", "fasta", "base"], data)
    resources = config_utils.get_resources("bamtofastq", data["config"])
    cores = tz.get_in(["config", "algorithm", "num_cores"], data, 1)
    # Collation memory: per-core memory setting scaled by the core count
    max_mem = config_utils.convert_to_bytes(resources.get("memory",
                                                          "1G")) * cores
    # Region becomes a file name suffix with ":" and "-" replaced by "_"
    rext = "-%s" % region.replace(":", "_").replace("-",
                                                    "_") if region else "full"
    out_s, out_p1, out_p2, out_o1, out_o2 = [
        os.path.join(work_dir, "%s%s-%s.fq.gz" % (base_name, rext, fext))
        for fext in ["s1", "p1", "p2", "o1", "o2"]
    ]
    if not utils.file_exists(out_p1):
        with file_transaction(data, out_s, out_p1, out_p2, out_o1, out_o2) as \
             (tx_out_s, tx_out_p1, tx_out_p2, tx_out_o1, tx_out_o2):
            cram_file = objectstore.cl_input(cram_file)
            sortprefix = "%s-sort" % utils.splitext_plus(tx_out_s)[0]
            cmd = (
                "bamtofastq filename={cram_file} inputformat=cram T={sortprefix} "
                "gz=1 collate=1 colsbs={max_mem} exclude=SECONDARY,SUPPLEMENTARY "
                "F={tx_out_p1} F2={tx_out_p2} S={tx_out_s} O={tx_out_o1} O2={tx_out_o2} "
                "reference={ref_file}")
            if region:
                cmd += " ranges='{region}'"
            # Parenthesized so only the region part is conditional; the old
            # precedence made the whole description "" when region was unset.
            do.run(cmd.format(**locals()),
                   "CRAM to fastq %s" % (region if region else ""))
    return [[out_p1, out_p2, out_s]]
def is_paired(bam_file):
    """Report whether a BAM file contains paired reads.

    Pipes the first 300000 lines of ``samtools view -h`` output back through
    ``samtools view -f 1`` and checks whether at least one read comes out.

    Works around issues with head closing the samtools pipe using signal trick from:
    http://stackoverflow.com/a/12451083/252589

    Raises:
        ValueError: on an unexpected exit code or unexpected stderr output.
    """
    def _default_sigpipe():
        # Runs in the child before exec: restore default SIGPIPE handling
        signal.signal(signal.SIGPIPE, signal.SIG_DFL)

    bam_file = objectstore.cl_input(bam_file)
    template = ("set -o pipefail; "
                "samtools view -h {bam_file} | head -300000 | "
                "samtools view -S -f 1 /dev/stdin  | head -1 | wc -l")
    proc = subprocess.Popen(
        template.format(bam_file=bam_file),
        shell=True,
        executable=do.find_bash(),
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        preexec_fn=_default_sigpipe)
    raw_out, raw_err = proc.communicate()
    out_text = raw_out.decode()
    err_text = raw_err.decode().strip()
    # 141 is tolerated alongside 0 -- presumably the pipeline exiting on SIGPIPE
    exit_ok = proc.returncode in (0, 141)
    # Only empty stderr or a gof3r broken pipe message is treated as harmless
    err_ok = err_text == "" or (err_text.startswith("gof3r") and
                                err_text.endswith("broken pipe"))
    if not (exit_ok and err_ok):
        raise ValueError("Failed to check paired status of BAM file: %s" %
                         str(err_text))
    return int(out_text) > 0
Beispiel #8
0
def _can_use_mem(fastq_file, data, read_min_size=None):
    """bwa-mem handle longer (> 70bp) reads with improved piping.
    Randomly samples 5000 reads from the first two million.
    Default to no piping if more than 75% of the sampled reads are small.
    If we've previously calculated minimum read sizes (from rtg SDF output)
    we can skip the formal check.
    """
    min_size = 70
    if read_min_size and read_min_size >= min_size:
        return True
    thresh = 0.75
    head_count = 8000000
    tocheck = 5000
    seqtk = config_utils.get_program("seqtk", data["config"])
    fastq_file = objectstore.cl_input(fastq_file)
    gzip_cmd = "zcat {fastq_file}" if fastq_file.endswith(
        ".gz") else "cat {fastq_file}"
    cmd = (gzip_cmd + " | head -n {head_count} | "
           "{seqtk} sample -s42 - {tocheck} | "
           "awk '{{if(NR%4==2) print length($1)}}' | sort | uniq -c")
    count_out = subprocess.check_output(cmd.format(**locals()),
                                        shell=True,
                                        executable="/bin/bash")
    if not count_out.strip():
        raise IOError("Failed to check fastq file sizes with: %s" %
                      cmd.format(**locals()))
    shorter = 0
    for count, size in (l.strip().split()
                        for l in count_out.strip().split("\n")):
        if int(size) < min_size:
            shorter += int(count)
    return (float(shorter) / float(tocheck)) <= thresh
Beispiel #9
0
def _bgzip_file(in_file, config, work_dir, needs_bgzip, needs_gunzip, needs_convert):
    """Write a bgzipped copy of an input file into ``work_dir``.

    The output keeps the input base name, gaining a ``.gz`` extension when it
    lacks one. An existing output is returned without running anything.
    Depending on the flags the input is gunzipped and/or piped through the
    fastq quality conversion before compression; remote inputs that need no
    bgzip are copied across with ``cat``.

    Raises:
        ValueError: when the input needs no bgzip and is not remote.
    """
    gz_ext = "" if in_file.endswith(".gz") else ".gz"
    out_file = os.path.join(work_dir, os.path.basename(in_file) + gz_ext)
    if utils.file_exists(out_file):
        return out_file
    with file_transaction(config, out_file) as tx_out_file:
        bgzip_cmd = tools.get_bgzip_cmd(config)
        # Remote status is checked on the original path, before it is
        # replaced by its command line form
        from_remote = objectstore.is_remote(in_file)
        cl_file = objectstore.cl_input(in_file, unpack=needs_gunzip or needs_convert or needs_bgzip)
        if needs_convert:
            cl_file = fastq_convert_pipe_cl(cl_file, {"config": config})
        if needs_gunzip and not needs_convert:
            # Decompress up front and let bgzip read from the pipe
            unzip_prefix = "gunzip -c {in_file} |".format(in_file=cl_file)
            bgzip_source = "/dev/stdin"
        else:
            unzip_prefix = ""
            bgzip_source = cl_file
        if needs_bgzip:
            do.run("{gunzip_cmd} {bgzip} -c {bgzip_in} > {tx_out_file}".format(
                       gunzip_cmd=unzip_prefix, bgzip=bgzip_cmd,
                       bgzip_in=bgzip_source, tx_out_file=tx_out_file),
                   "bgzip input file")
        elif from_remote:
            convert_pipe = "| bgzip -c" if needs_convert else ""
            do.run("cat {in_file} {bgzip} > {tx_out_file}".format(
                       in_file=cl_file, bgzip=convert_pipe, tx_out_file=tx_out_file),
                   "Get remote input")
        else:
            raise ValueError("Unexpected inputs: %s %s %s %s" % (cl_file, needs_bgzip,
                                                                 needs_gunzip, needs_convert))
    return out_file
Beispiel #10
0
def fastq_size_output(fastq_file, tocheck):
    """Yield (count, read length) string pairs for a sample of fastq reads.

    Takes the first 8,000,000 lines of the (optionally gzipped) fastq, samples
    ``tocheck`` reads with seqtk using a fixed seed and tallies the sequence
    lengths with ``sort | uniq -c``.

    Raises:
        IOError: if the pipeline produces no output.
    """
    def _default_sigpipe():
        """Avoid spurious 'cat: write error: Broken pipe' message due to head command.

        Work around from:
        https://bitbucket.org/brodie/cram/issues/16/broken-pipe-when-heading-certain-output
        """
        signal.signal(signal.SIGPIPE, signal.SIG_DFL)

    head_count = 8000000
    fastq_file = objectstore.cl_input(fastq_file)
    if fastq_file.endswith(".gz"):
        gzip_cmd = "zcat {fastq_file}"
    else:
        gzip_cmd = "cat {fastq_file}"
    template = (utils.local_path_export() + gzip_cmd + " | head -n {head_count} | "
                "seqtk sample -s42 - {tocheck} | "
                "awk '{{if(NR%4==2) print length($1)}}' | sort | uniq -c")
    full_cmd = template.format(fastq_file=fastq_file, head_count=head_count,
                               tocheck=tocheck)
    raw_counts = subprocess.check_output(full_cmd,
                                         shell=True,
                                         executable="/bin/bash",
                                         preexec_fn=_default_sigpipe)
    count_out = raw_counts.decode()
    if not count_out.strip():
        raise IOError("Failed to check fastq file sizes with: %s" % full_cmd)
    # Each output line is "<count> <length>"
    for line in count_out.strip().split("\n"):
        count, size = line.strip().split()
        yield count, size
Beispiel #11
0
def download_prepped_genome(genome_build, data, name, need_remap, out_dir=None):
    """Get a pre-prepared genome from S3, unpacking it locally.

    Supports runs on AWS where we can retrieve the resources on demand. Upgrades
    GEMINI in place if installed inside a Docker container with the biological data.
    GEMINI install requires write permissions to standard data directories -- works
    on AWS but not generalizable elsewhere.

    Args:
        genome_build: genome build name, used for directory and file names.
        data: sample dictionary; may be updated with ``genome_resources``.
        name: index/target name, expanded through ``REMAP_NAMES``.
        need_remap: when true, return the base fasta instead of an index.
        out_dir: download directory; defaults to ``inputs/data/genomes``
            under the work directory.

    Returns:
        Path to the fasta file (``need_remap`` or ``name == "samtools"``),
        otherwise the common file prefix inside the final target directory.

    Raises:
        ValueError: if the remote reference file cannot be reached.
    """
    from bcbio.variation import population
    from bcbio import install
    if not out_dir:
        out_dir = utils.safe_makedir(os.path.join(tz.get_in(["dirs", "work"], data),
                                                  "inputs", "data", "genomes"))
    for target in REMAP_NAMES.get(name, [name]):
        ref_dir = os.path.join(out_dir, genome_build, target)
        if not os.path.exists(ref_dir):
            if target in INPLACE_INDEX:
                ref_file = glob.glob(os.path.normpath(os.path.join(ref_dir, os.pardir, "seq", "*.fa")))[0]
                # Need to add genome resources so we can retrieve GTF files for STAR
                data["genome_resources"] = get_resources(data["genome_build"], ref_file, data)
                INPLACE_INDEX[target](ref_file, ref_dir, data)
            else:
                # XXX Currently only supports genomes from S3 us-east-1 bucket.
                # Need to assess how slow this is from multiple regions and generalize to non-AWS.
                fname = objectstore.BIODATA_INFO["s3"].format(build=genome_build, target=target)
                try:
                    objectstore.connect(fname)
                # Was a bare except, which also turned KeyboardInterrupt and
                # SystemExit into a misleading ValueError
                except Exception:
                    raise ValueError("Could not find reference genome file %s %s" % (genome_build, name))
                with utils.chdir(out_dir):
                    cmd = objectstore.cl_input(fname, unpack=False, anonpipe=False) + " | pigz -d -c | tar -xvp"
                    do.run(cmd.format(**locals()), "Download pre-prepared genome data: %s" % genome_build)
    ref_file = glob.glob(os.path.normpath(os.path.join(ref_dir, os.pardir, "seq", "*.fa")))[0]
    if data.get("genome_build"):
        gresources = get_resources(data["genome_build"], ref_file, data)
        if data.get("files") and population.do_db_build([data], need_bam=False, gresources=gresources):
            # symlink base GEMINI directory to work directory, avoiding write/space issues
            out_gemini_dir = utils.safe_makedir(os.path.join(os.path.dirname(ref_dir), "gemini_data"))
            orig_gemini_dir = install.get_gemini_dir()
            # Remove empty initial directory created by installer
            if os.path.isdir(orig_gemini_dir) and len(os.listdir(orig_gemini_dir)) == 0:
                if os.path.islink(orig_gemini_dir):
                    os.remove(orig_gemini_dir)
                else:
                    os.rmdir(orig_gemini_dir)
            if not os.path.exists(orig_gemini_dir):
                os.symlink(out_gemini_dir, orig_gemini_dir)
            cmd = [os.path.join(os.path.dirname(sys.executable), "gemini"), "update", "--dataonly"]
            do.run(cmd, "Download GEMINI data")
    genome_dir = os.path.join(out_dir, genome_build)
    # Directory keeps the full build name; file names drop any "-test" suffix
    genome_build = genome_build.replace("-test", "")
    if need_remap or name == "samtools":
        return os.path.join(genome_dir, "seq", "%s.fa" % genome_build)
    else:
        ref_dir = os.path.join(genome_dir, REMAP_NAMES.get(name, [name])[-1])
        # Index base name: shared prefix of the index files without trailing dots
        base_name = os.path.commonprefix(os.listdir(ref_dir))
        while base_name.endswith("."):
            base_name = base_name[:-1]
        return os.path.join(ref_dir, base_name)
Beispiel #12
0
def download_prepped_genome(genome_build, data, name, need_remap, out_dir=None):
    """Get a pre-prepared genome from S3, unpacking it locally.

    Supports runs on AWS where we can retrieve the resources on demand. Upgrades
    GEMINI in place if installed inside a Docker container with the biological data.
    GEMINI install requires write permissions to standard data directories -- works
    on AWS but not generalizable elsewhere.

    Args:
        genome_build: genome build name, used for directory and file names.
        data: sample dictionary; may be updated with ``genome_resources``.
        name: index/target name, expanded through ``REMAP_NAMES``.
        need_remap: when true, return the base fasta instead of an index.
        out_dir: download directory; defaults to ``inputs/data/genomes``
            under the work directory.

    Returns:
        Path to the fasta file (``need_remap`` or ``name == "samtools"``),
        otherwise the common file prefix inside the final target directory.

    Raises:
        ValueError: if the remote reference file cannot be reached.
    """
    from bcbio.variation import population
    from bcbio import install
    if not out_dir:
        out_dir = utils.safe_makedir(os.path.join(tz.get_in(["dirs", "work"], data),
                                                  "inputs", "data", "genomes"))
    for target in REMAP_NAMES.get(name, [name]):
        ref_dir = os.path.join(out_dir, genome_build, target)
        if not os.path.exists(ref_dir):
            if target in INPLACE_INDEX:
                ref_file = glob.glob(os.path.normpath(os.path.join(ref_dir, os.pardir, "seq", "*.fa")))[0]
                # Need to add genome resources so we can retrieve GTF files for STAR
                data["genome_resources"] = get_resources(data["genome_build"], ref_file, data)
                INPLACE_INDEX[target](ref_file, ref_dir, data)
            else:
                # XXX Currently only supports genomes from S3 us-east-1 bucket.
                # Need to assess how slow this is from multiple regions and generalize to non-AWS.
                fname = objectstore.BIODATA_INFO["s3"].format(build=genome_build, target=target)
                try:
                    objectstore.connect(fname)
                # Was a bare except, which also turned KeyboardInterrupt and
                # SystemExit into a misleading ValueError
                except Exception:
                    raise ValueError("Could not find reference genome file %s %s" % (genome_build, name))
                with utils.chdir(out_dir):
                    cmd = objectstore.cl_input(fname, unpack=False, anonpipe=False) + " | pigz -d -c | tar -xvp"
                    do.run(cmd.format(**locals()), "Download pre-prepared genome data: %s" % genome_build)
    ref_file = glob.glob(os.path.normpath(os.path.join(ref_dir, os.pardir, "seq", "*.fa")))[0]
    if data.get("genome_build"):
        if (data.get("files") and population.do_db_build([data], need_bam=False)
              and population.support_gemini_orig(data)):
            # symlink base GEMINI directory to work directory, avoiding write/space issues
            out_gemini_dir = utils.safe_makedir(os.path.join(os.path.dirname(ref_dir), "gemini_data"))
            orig_gemini_dir = install.get_gemini_dir()
            # Remove empty initial directory created by installer
            if os.path.isdir(orig_gemini_dir) and len(os.listdir(orig_gemini_dir)) == 0:
                if os.path.islink(orig_gemini_dir):
                    os.remove(orig_gemini_dir)
                else:
                    os.rmdir(orig_gemini_dir)
            if not os.path.exists(orig_gemini_dir):
                os.symlink(out_gemini_dir, orig_gemini_dir)
            cmd = [os.path.join(os.path.dirname(sys.executable), "gemini"), "update", "--dataonly"]
            do.run(cmd, "Download GEMINI data")
    genome_dir = os.path.join(out_dir, genome_build)
    # Directory keeps the full build name; file names drop any "-test" suffix
    genome_build = genome_build.replace("-test", "")
    if need_remap or name == "samtools":
        return os.path.join(genome_dir, "seq", "%s.fa" % genome_build)
    else:
        ref_dir = os.path.join(genome_dir, REMAP_NAMES.get(name, [name])[-1])
        # Index base name: shared prefix of the index files without trailing dots
        base_name = os.path.commonprefix(os.listdir(ref_dir))
        while base_name.endswith("."):
            base_name = base_name[:-1]
        return os.path.join(ref_dir, base_name)
Beispiel #13
0
def fastq_convert_pipe_cl(in_file, data):
    """Build an anonymous pipe command converting Illumina 1.3-1.7 to Sanger.

    Returns a ``<(...)`` process substitution string that runs
    ``seqtk seq -Q64 -V`` on the input.

    Uses seqtk: https://github.com/lh3/seqtk
    """
    seqtk_cmd = config_utils.get_program("seqtk", data["config"])
    cl_file = objectstore.cl_input(in_file)
    return "<({seqtk} seq -Q64 -V {in_file})".format(seqtk=seqtk_cmd, in_file=cl_file)
Beispiel #14
0
def is_paired(bam_file):
    """Determine if a BAM file has paired reads.

    Pipes the first 50000 lines of ``sambamba view -h`` output back through
    sambamba with the ``paired`` filter and checks whether at least one read
    comes out. Standard error from the pipeline is discarded.
    """
    bam_file = objectstore.cl_input(bam_file)
    cmd = "sambamba view -h {bam_file} | head -50000 | " "sambamba view -S -F paired /dev/stdin  | head -1 | wc -l"
    # Close the /dev/null handle once the command finishes; it used to leak
    # one open file per call.
    with open("/dev/null", "w") as devnull:
        out = subprocess.check_output(
            cmd.format(bam_file=bam_file), shell=True, executable=do.find_bash(), stderr=devnull
        )
    return int(out) > 0
Beispiel #15
0
def _cutadapt_pe_cmd(fastq_files, out_files, quality_format, base_cmd, data):
    """Build the cutadapt command template for paired end mode.

    The returned string still holds ``{of1_tx}``, ``{of2_tx}`` and
    ``{log_tx}`` placeholders -- presumably filled in by the caller; confirm
    against the call site. ``quality_format`` is accepted but not used here.
    """
    in_fq1, in_fq2 = [objectstore.cl_input(fq) for fq in fastq_files]
    # Unpacked only as a check that exactly two outputs were supplied
    _out1, _out2 = out_files
    min_length_arg = " --minimum-length={min_length} ".format(
        min_length=dd.get_min_read_length(data))
    trim_cmd = base_cmd + min_length_arg
    trim_cmd = trim_cmd + " -o {of1_tx} -p {of2_tx} " + in_fq1 + " " + in_fq2
    return trim_cmd + "| tee > {log_tx};"
Beispiel #16
0
def _cutadapt_pe_cmd(fastq_files, out_files, quality_format, base_cmd, data):
    """Build the cutadapt command template for paired end mode.

    The returned string still holds ``{of1_tx}``, ``{of2_tx}`` and
    ``{log_tx}`` placeholders -- presumably filled in by the caller; confirm
    against the call site. ``quality_format`` is accepted but not used here.
    """
    in_fq1, in_fq2 = [objectstore.cl_input(fq) for fq in fastq_files]
    # Unpacked only as a check that exactly two outputs were supplied
    _out1, _out2 = out_files
    min_length_arg = " --minimum-length={min_length} ".format(
        min_length=dd.get_min_read_length(data))
    trim_cmd = base_cmd + min_length_arg
    trim_cmd = trim_cmd + " -o {of1_tx} -p {of2_tx} " + in_fq1 + " " + in_fq2
    return trim_cmd + "| tee > {log_tx};"
Beispiel #17
0
def _bgzip_from_bam(bam_file, dirs, config, is_retry=False, output_infix=''):
    """Create bgzipped fastq files from an input BAM file.

    Runs ``bamtofastq`` and streams its output through bgzip. A paired BAM
    yields two files (``-1.fq.gz`` / ``-2.fq.gz``), an unpaired BAM one. When
    the command fails with a "deflate failed" error the conversion is retried
    once with ``is_retry=True``.

    Args:
        bam_file: input BAM file.
        dirs: directory dictionary; output goes under ``dirs["work"]/align_prep``.
        config: configuration dictionary with ``algorithm`` and resources.
        is_retry: true on the second attempt; forces regeneration.
        output_infix: text inserted into the output file names.

    Returns:
        List of the output fastq files that exist.
    """
    # tools
    bamtofastq = config_utils.get_program("bamtofastq", config)
    resources = config_utils.get_resources("bamtofastq", config)
    cores = config["algorithm"].get("num_cores", 1)
    max_mem = config_utils.convert_to_bytes(resources.get("memory",
                                                          "1G")) * cores
    bgzip = tools.get_bgzip_cmd(config, is_retry)
    # files
    work_dir = utils.safe_makedir(os.path.join(dirs["work"], "align_prep"))
    out_file_1 = os.path.join(
        work_dir, "%s%s-1.fq.gz" %
        (os.path.splitext(os.path.basename(bam_file))[0], output_infix))
    out_file_2 = out_file_1.replace("-1.fq.gz", "-2.fq.gz")
    needs_retry = False
    if is_retry or not utils.file_exists(out_file_1):
        # Check pairing once; it used to be run twice on the same file
        paired = bam.is_paired(bam_file)
        if not paired:
            out_file_2 = None
        with file_transaction(config, out_file_1) as tx_out_file:
            # Clear leftovers from earlier or failed attempts
            for f in [tx_out_file, out_file_1, out_file_2]:
                if f and os.path.exists(f):
                    os.remove(f)
            fq1_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, tx_out_file)
            sortprefix = "%s-sort" % os.path.splitext(tx_out_file)[0]
            if paired:
                fq2_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, out_file_2)
                out_str = (
                    "F=>({fq1_bgzip_cmd}) F2=>({fq2_bgzip_cmd}) S=/dev/null O=/dev/null "
                    "O2=/dev/null collate=1 colsbs={max_mem}")
            else:
                out_str = "S=>({fq1_bgzip_cmd})"
            # Keep bam_file itself untouched so a retry starts from the
            # caller's original input rather than its command line form
            bam_input = objectstore.cl_input(bam_file)
            cmd = "{bamtofastq} filename={bam_input} T={sortprefix} " + out_str
            try:
                do.run(cmd.format(**locals()),
                       "BAM to bgzipped fastq",
                       checks=[do.file_reasonable_size(tx_out_file, bam_input)],
                       log_error=False)
            except subprocess.CalledProcessError as msg:
                if not is_retry and "deflate failed" in str(msg):
                    logger.info(
                        "bamtofastq deflate IO failure preparing %s. Retrying with single core."
                        % (bam_input))
                    needs_retry = True
                else:
                    # Explicit message so this also works with loggers whose
                    # exception() requires one
                    logger.exception("BAM to bgzipped fastq conversion failed")
                    raise
    if needs_retry:
        # Pass output_infix along; without it the retry wrote differently
        # named files than the first attempt was asked for
        return _bgzip_from_bam(bam_file, dirs, config, is_retry=True,
                               output_infix=output_infix)
    else:
        return [
            x for x in [out_file_1, out_file_2]
            if x is not None and utils.file_exists(x)
        ]
Beispiel #18
0
def is_paired(bam_file):
    """Determine if a BAM file has paired reads.

    Streams the first 50000 lines of ``sambamba view -h`` output back through
    ``sambamba view -S -F paired`` and counts whether at least one record
    passes the filter.

    Returns True when the pipeline reports a non-zero line count.
    Raises subprocess.CalledProcessError if the shell pipeline exits non-zero.
    """
    bam_file = objectstore.cl_input(bam_file)
    cmd = ("sambamba view -h {bam_file} | head -50000 | "
           "sambamba view -S -F paired /dev/stdin  | head -1 | wc -l")
    # Discard stderr, but close the /dev/null handle deterministically instead
    # of leaving it to the garbage collector.
    with open("/dev/null", "w") as null_handle:
        out = subprocess.check_output(cmd.format(bam_file=bam_file), shell=True,
                                      executable=do.find_bash(),
                                      stderr=null_handle)
    return int(out) > 0
Beispiel #19
0
def _bgzip_from_bam(bam_file, dirs, data, is_retry=False, output_infix=''):
    """Create bgzipped fastq files from an input BAM file.

    Runs bamtofastq and pipes its output through bgzip (optionally preceded by
    the seqtk prep command from `_seqtk_fastq_prep_cl`) into
    ``<work>/align_prep/<bam basename><output_infix>-1.fq.gz`` and, for paired
    BAMs, the matching ``-2.fq.gz``.

    bam_file -- input BAM
    dirs -- dictionary with a "work" directory entry
    data -- sample dictionary; ``data["config"]`` supplies programs/resources
    is_retry -- set on the second attempt after a "deflate failed" error; it is
        forwarded to tools.get_bgzip_cmd and forces regeneration of outputs
    output_infix -- text inserted before ``-1.fq.gz`` / ``-2.fq.gz``

    Returns the list of output fastq files that exist.
    Raises subprocess.CalledProcessError when bamtofastq fails for any reason
    other than a first-attempt "deflate failed" error.
    """
    # tools
    config = data["config"]
    bamtofastq = config_utils.get_program("bamtofastq", config)
    resources = config_utils.get_resources("bamtofastq", config)
    cores = config["algorithm"].get("num_cores", 1)
    max_mem = config_utils.convert_to_bytes(resources.get("memory", "1G")) * cores
    bgzip = tools.get_bgzip_cmd(config, is_retry)
    # files
    work_dir = utils.safe_makedir(os.path.join(dirs["work"], "align_prep"))
    out_file_1 = os.path.join(work_dir, "%s%s-1.fq.gz" % (os.path.splitext(os.path.basename(bam_file))[0], output_infix))
    out_file_2 = out_file_1.replace("-1.fq.gz", "-2.fq.gz")
    needs_retry = False
    if is_retry or not utils.file_exists(out_file_1):
        # Checking pairing spawns a shell pipeline, so do it once and reuse it.
        paired = bam.is_paired(bam_file)
        if not paired:
            out_file_2 = None
        with file_transaction(config, out_file_1) as tx_out_file:
            # Clear any partial outputs left from a previous attempt.
            for f in [tx_out_file, out_file_1, out_file_2]:
                if f and os.path.exists(f):
                    os.remove(f)
            fq1_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, tx_out_file)
            prep_cmd = _seqtk_fastq_prep_cl(data, read_num=0)
            if prep_cmd:
                fq1_bgzip_cmd = prep_cmd + " | " + fq1_bgzip_cmd
            sortprefix = "%s-sort" % os.path.splitext(tx_out_file)[0]
            if paired:
                prep_cmd = _seqtk_fastq_prep_cl(data, read_num=1)
                # The second read file is written straight to its final path;
                # only the first read file goes through the transaction.
                fq2_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, out_file_2)
                if prep_cmd:
                    fq2_bgzip_cmd = prep_cmd + " | " + fq2_bgzip_cmd
                out_str = ("F=>({fq1_bgzip_cmd}) F2=>({fq2_bgzip_cmd}) S=/dev/null O=/dev/null "
                           "O2=/dev/null collate=1 colsbs={max_mem}")
            else:
                out_str = "S=>({fq1_bgzip_cmd})"
            bam_file = objectstore.cl_input(bam_file)
            extra_opts = " ".join([str(x) for x in resources.get("options", [])])
            cmd = "{bamtofastq} filename={bam_file} T={sortprefix} {extra_opts} " + out_str
            try:
                do.run(cmd.format(**locals()), "BAM to bgzipped fastq",
                       checks=[do.file_reasonable_size(tx_out_file, bam_file)],
                       log_error=False)
            except subprocess.CalledProcessError as msg:
                if not is_retry and "deflate failed" in str(msg):
                    logger.info("bamtofastq deflate IO failure preparing %s. Retrying with single core."
                                % (bam_file))
                    needs_retry = True
                else:
                    # exception() requires a message; calling it bare raises
                    # TypeError and hides the real bamtofastq failure.
                    logger.exception("bamtofastq failed preparing %s" % (bam_file))
                    raise
    if needs_retry:
        # Carry output_infix over so the retry writes the same output names.
        return _bgzip_from_bam(bam_file, dirs, data, is_retry=True, output_infix=output_infix)
    else:
        return [x for x in [out_file_1, out_file_2] if x is not None and utils.file_exists(x)]
Beispiel #20
0
def _cutadapt_se_cmd(fastq_files, out_files, base_cmd):
    """Build the single-end cutadapt command template.

    The output is passed with cutadapt's -o option rather than redirected
    from stdout so that gzipping is honored. The returned string still
    contains the literal ``{of1}`` placeholder.
    """
    length_opt = " --minimum-length={min_length} ".format(min_length=MINIMUM_LENGTH)
    template = base_cmd + length_opt
    in_fq = objectstore.cl_input(fastq_files[0])
    # Looked up but not interpolated: the template keeps the {of1} placeholder.
    out_fq = out_files[0]
    return template + " -o {of1} " + str(in_fq)
Beispiel #21
0
def _cutadapt_se_cmd(fastq_files, out_files, base_cmd):
    """Assemble the cutadapt command template for a single fastq input.

    cutadapt is given -o instead of a stdout redirect so gzipped output is
    honored. ``{of1}`` is left in the result as an unfilled placeholder.
    """
    pieces = [base_cmd + " --minimum-length={min_length} ".format(min_length=MINIMUM_LENGTH)]
    input_fq = objectstore.cl_input(fastq_files[0])
    # The first output file is read here but the template only names {of1}.
    first_out = out_files[0]
    pieces.append(" -o {of1} ")
    pieces.append(str(input_fq))
    return "".join(pieces)
Beispiel #22
0
def _cutadapt_se_cmd(fastq_files, out_files, base_cmd, data):
    """Build the single-end cutadapt command template.

    Uses the -o option instead of redirecting stdout so that gzipping is
    supported. The result keeps the ``{of1_tx}`` and ``{log_tx}``
    placeholders unfilled.
    """
    length_opt = " --minimum-length={min_length} ".format(
        min_length=dd.get_min_read_length(data))
    trim_cmd = base_cmd + length_opt
    in_fq = objectstore.cl_input(fastq_files[0])
    # Looked up but not interpolated: the template names {of1_tx} instead.
    out_fq = out_files[0]
    trim_cmd = trim_cmd + " -o {of1_tx} " + str(in_fq)
    return "%s | tee > {log_tx}" % trim_cmd
Beispiel #23
0
def _cutadapt_pe_nosickle(fastq_files, out_files, quality_format, base_cmd):
    """Build a two-pass paired-end cutadapt command template without sickle.

    sickle has an issue with 0 length reads (open issue:
    https://github.com/najoshi/sickle/issues/32); until that is resolved this
    workaround avoids using sickle.

    The first pass writes to ``{tmp_fq1}``/``{tmp_fq2}``, the second pass reads
    them in swapped order and writes ``{of2_tx}``/``{of1_tx}``, and the
    temporary files are then removed. All of these placeholders are left
    unfilled in the returned string.
    """
    in_fq1, in_fq2 = [objectstore.cl_input(fq) for fq in fastq_files]
    # Unpacked only; the templates refer to {of1_tx} and {of2_tx}.
    out_fq1, out_fq2 = out_files
    trim_cmd = base_cmd + " --minimum-length={min_length} ".format(min_length=MINIMUM_LENGTH)
    first_pass = trim_cmd + " -o {tmp_fq1} -p {tmp_fq2} " + " ".join([in_fq1, in_fq2])
    second_pass = trim_cmd + " -o {of2_tx} -p {of1_tx} {tmp_fq2} {tmp_fq1}"
    cleanup = " rm {tmp_fq1} {tmp_fq2} "
    return ";".join([first_pass, second_pass, cleanup])
Beispiel #24
0
def _cutadapt_pe_nosickle(fastq_files, out_files, quality_format, base_cmd):
    """Return the paired-end cutadapt command template that avoids sickle.

    Workaround for sickle's problem with 0 length reads, see
    https://github.com/najoshi/sickle/issues/32

    Three shell steps are joined with ';': a first cutadapt pass into
    ``{tmp_fq1}``/``{tmp_fq2}``, a second pass over those files in swapped
    order into ``{of2_tx}``/``{of1_tx}``, and removal of the temporary files.
    The placeholders stay unfilled in the returned string.
    """
    read1, read2 = [objectstore.cl_input(x) for x in fastq_files]
    # Unpacked only; the output placeholders in the template are {of1_tx}/{of2_tx}.
    trimmed1, trimmed2 = out_files
    min_length_opt = " --minimum-length={min_length} ".format(min_length=MINIMUM_LENGTH)
    base_cmd = base_cmd + min_length_opt
    steps = [
        base_cmd + " -o {tmp_fq1} -p {tmp_fq2} " + read1 + " " + read2,
        base_cmd + " -o {of2_tx} -p {of1_tx} {tmp_fq2} {tmp_fq1}",
    ]
    return steps[0] + ";" + steps[1] + "; rm {tmp_fq1} {tmp_fq2} "
Beispiel #25
0
def _cutadapt_se_cmd(fastq_files, out_files, base_cmd, data):
    """Assemble the cutadapt command template for a single fastq input.

    cutadapt writes through -o rather than a stdout redirect so gzipping is
    supported. ``{of1_tx}`` and ``{log_tx}`` remain unfilled placeholders in
    the returned string.
    """
    min_read_length = dd.get_min_read_length(data)
    with_length = base_cmd + " --minimum-length={min_length} ".format(min_length=min_read_length)
    input_fq = objectstore.cl_input(fastq_files[0])
    # The first output file is read here but the template only names {of1_tx}.
    first_out = out_files[0]
    with_io = "".join([with_length, " -o {of1_tx} ", str(input_fq)])
    return "%s | tee > {log_tx}" % with_io
Beispiel #26
0
def _atropos_trim(fastq_files, adapters, out_dir, data):
    """Perform multicore trimming with atropos.

    fastq_files -- one (single-end) or two (paired-end) input fastq files
    adapters -- adapter sequences, passed as -a (and as -A for paired input)
    out_dir -- directory receiving the trimmed fastqs and the JSON report
    data -- sample dictionary providing cores, quality format, sample name,
        minimum read length and the "atropos" resource options

    Returns a tuple of (list of trimmed fastq paths, report file path). The
    trimming is skipped when the first trimmed output already exists.
    """
    report_file = os.path.join(out_dir, "%s-report.json" % utils.splitext_plus(os.path.basename(fastq_files[0]))[0])
    out_files = [os.path.join(out_dir, "%s-trimmed.fq.gz" % utils.splitext_plus(os.path.basename(x))[0])
                 for x in fastq_files]
    if not utils.file_exists(out_files[0]):
        with file_transaction(data, *[report_file] + out_files) as tx_out:
            tx_report_file, tx_out1 = tx_out[:2]
            if len(tx_out) > 2:
                tx_out2 = tx_out[2]
            cores = dd.get_num_cores(data)
            sample_name = dd.get_sample_name(data)
            adapters_args = " ".join(["-a %s" % a for a in adapters])
            if len(fastq_files) == 1:
                input_args = "-se %s" % objectstore.cl_input(fastq_files[0])
                # The thread count must be a format field: a "%s" here is never
                # interpolated and would reach bgzip as a literal argument.
                output_args = "-o >(bgzip --threads {cores} -c > {tx_out1})".format(
                    cores=cores, tx_out1=tx_out1)
            else:
                assert len(fastq_files) == 2, fastq_files
                adapters_args = adapters_args + " " + " ".join(["-A %s" % a for a in adapters])
                input_args = "-pe1 %s -pe2 %s" % tuple([objectstore.cl_input(x) for x in fastq_files])
                output_args = "-o >(bgzip -c > {tx_out1}) -p >(bgzip -c > {tx_out2})".format(
                    tx_out1=tx_out1, tx_out2=tx_out2)
            quality_base = "64" if dd.get_quality_format(data).lower() == "illumina" else "33"
            report_args = "--report-file %s --report-formats json --sample-id %s" % (tx_report_file,
                                                                                     sample_name)
            ropts = " ".join(str(x) for x in
                             config_utils.get_resources("atropos", data["config"]).get("options", []))
            # Add default quality and length cutoffs unless the user-supplied
            # resource options already set them (long or short form).
            extra_opts = []
            for k, alt_ks, v in [("--quality-cutoff", ["-q "], "5"),
                                 ("--minimum-length", ["-m "], str(dd.get_min_read_length(data)))]:
                if k not in ropts and not any(alt_k in ropts for alt_k in alt_ks):
                    extra_opts.append("%s=%s" % (k, v))
            extra_opts = " ".join(extra_opts)
            thread_args = ("--threads %s" % cores if cores > 1 else "")
            cmd = ("atropos trim {ropts} {thread_args} --quality-base {quality_base} --format fastq "
                   "{adapters_args} {input_args} {output_args} {report_args} {extra_opts}")
            do.run(cmd.format(**locals()), "Trimming with atropos: %s" % sample_name)
    return out_files, report_file
Beispiel #27
0
def _bgzip_file(finput, config, work_dir, needs_bgzip, needs_gunzip,
                needs_convert, data):
    """Handle bgzip of input file, potentially gunzipping an existing file.

    A non-string finput is treated as multiple files to concatenate and is
    delegated to `_bgzip_multiple_files`. For a single file the output is
    written into work_dir under the input's basename, with ".bz2" removed and
    ".gz" appended when the input does not already end in ".gz".

    Returns the path to the prepared output file.
    Raises ValueError when the file neither needs bgzipping nor is remote.
    """
    if not isinstance(finput, six.string_types):
        assert not needs_convert, "Do not yet handle quality conversion with multiple inputs"
        return _bgzip_multiple_files(finput, work_dir, data)
    in_file = finput
    gz_suffix = "" if in_file.endswith(".gz") else ".gz"
    out_file = os.path.join(
        work_dir, os.path.basename(in_file).replace(".bz2", "") + gz_suffix)
    if utils.file_exists(out_file):
        return out_file
    with file_transaction(config, out_file) as tx_out_file:
        bgzip = tools.get_bgzip_cmd(config)
        is_remote = objectstore.is_remote(in_file)
        in_file = objectstore.cl_input(
            in_file,
            unpack=needs_gunzip or needs_convert or needs_bgzip or dd.get_trim_ends(data))
        if needs_convert or dd.get_trim_ends(data):
            in_file = fastq_convert_pipe_cl(in_file, data)
        if needs_gunzip and not needs_convert and not dd.get_trim_ends(data):
            # Decompress on the fly and feed bgzip from stdin.
            decompress = "bunzip2" if in_file.endswith(".bz2") else "gunzip"
            gunzip_cmd = "{decompress} -c {in_file} |".format(
                decompress=decompress, in_file=in_file)
            bgzip_in = "/dev/stdin"
        else:
            gunzip_cmd = ""
            bgzip_in = in_file
        if needs_bgzip:
            bgzip_cl = "{gunzip_cmd} {bgzip} -c {bgzip_in} > {tx_out_file}".format(
                gunzip_cmd=gunzip_cmd, bgzip=bgzip, bgzip_in=bgzip_in,
                tx_out_file=tx_out_file)
            do.run(bgzip_cl, "bgzip input file")
        elif is_remote:
            if needs_convert or dd.get_trim_ends(data):
                bgzip = "| bgzip -c"
            else:
                bgzip = ""
            fetch_cl = "cat {in_file} {bgzip} > {tx_out_file}".format(
                in_file=in_file, bgzip=bgzip, tx_out_file=tx_out_file)
            do.run(fetch_cl, "Get remote input")
        else:
            raise ValueError(
                "Unexpected inputs: %s %s %s %s" %
                (in_file, needs_bgzip, needs_gunzip, needs_convert))
    return out_file
Beispiel #28
0
def is_empty(bam_file):
    """Determine if a BAM file is empty.

    Counts whether ``samtools view`` emits at least one record.

    Returns True when no record is produced.
    Raises ValueError when the pipeline exits with a code other than 0 or 141,
    or writes anything to stderr other than a message starting with "gof3r"
    and ending with "broken pipe".
    """
    bam_file = objectstore.cl_input(bam_file)
    cmd = ("set -o pipefail; "
           "samtools view {bam_file} | head -1 | wc -l")
    p = subprocess.Popen(cmd.format(bam_file=bam_file), shell=True,
                         executable=do.find_bash(),
                         stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                         preexec_fn=lambda: signal.signal(signal.SIGPIPE, signal.SIG_DFL))
    stdout, stderr = p.communicate()
    # Pipes return bytes on Python 3; decode so the text comparisons below can
    # match instead of always falling through to the error branch.
    if isinstance(stderr, bytes):
        stderr = stderr.decode("utf-8", "replace")
    stderr = stderr.strip()
    # 141 is 128 + SIGPIPE: with pipefail set, `head` closing the pipe early
    # surfaces as this exit code and is not a real failure.
    if ((p.returncode == 0 or p.returncode == 141) and
         (stderr == "" or (stderr.startswith("gof3r") and stderr.endswith("broken pipe")))):
        return int(stdout) == 0
    else:
        raise ValueError("Failed to check empty status of BAM file: %s" % stderr)
Beispiel #29
0
def is_empty(bam_file):
    """Determine if a BAM file is empty.

    Counts whether ``sambamba view`` emits at least one record.

    Returns True when no record is produced.
    Raises ValueError when the pipeline exits with a code other than 0 or 141,
    or writes anything to stderr other than a message starting with "gof3r"
    and ending with "broken pipe".
    """
    bam_file = objectstore.cl_input(bam_file)
    sambamba = config_utils.get_program("sambamba", {})
    cmd = ("set -o pipefail; "
           "{sambamba} view {bam_file} | head -1 | wc -l")
    p = subprocess.Popen(cmd.format(sambamba=sambamba, bam_file=bam_file), shell=True,
                         executable=do.find_bash(),
                         stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                         preexec_fn=lambda: signal.signal(signal.SIGPIPE, signal.SIG_DFL))
    stdout, stderr = p.communicate()
    # Pipes return bytes on Python 3; decode so the text comparisons below can
    # match instead of always falling through to the error branch.
    if isinstance(stderr, bytes):
        stderr = stderr.decode("utf-8", "replace")
    stderr = stderr.strip()
    # 141 is 128 + SIGPIPE: with pipefail set, `head` closing the pipe early
    # surfaces as this exit code and is not a real failure.
    if ((p.returncode == 0 or p.returncode == 141) and
         (stderr == "" or (stderr.startswith("gof3r") and stderr.endswith("broken pipe")))):
        return int(stdout) == 0
    else:
        raise ValueError("Failed to check empty status of BAM file: %s" % stderr)
Beispiel #30
0
def _bgzip_from_bam(bam_file, dirs, config, is_retry=False):
    """Create bgzipped fastq files from an input BAM file.

    Runs bamtofastq and pipes its output through bgzip into
    ``<work>/align_prep/<bam basename>-1.fq.gz`` and, for paired BAMs, the
    matching ``-2.fq.gz``.

    bam_file -- input BAM
    dirs -- dictionary with a "work" directory entry
    config -- configuration dictionary supplying programs and resources
    is_retry -- set on the second attempt after a "deflate failed" error; it is
        forwarded to tools.get_bgzip_cmd and forces regeneration of outputs

    Returns the list of output fastq files that exist.
    Raises subprocess.CalledProcessError when bamtofastq fails for any reason
    other than a first-attempt "deflate failed" error.
    """
    # tools
    bamtofastq = config_utils.get_program("bamtofastq", config)
    resources = config_utils.get_resources("bamtofastq", config)
    cores = config["algorithm"].get("num_cores", 1)
    max_mem = int(resources.get("memory", "1073741824")) * cores  # 1Gb/core default
    bgzip = tools.get_bgzip_cmd(config, is_retry)
    # files
    work_dir = utils.safe_makedir(os.path.join(dirs["work"], "align_prep"))
    out_file_1 = os.path.join(work_dir, "%s-1.fq.gz" % os.path.splitext(os.path.basename(bam_file))[0])
    # Checking pairing spawns a shell pipeline, so do it once and reuse it.
    paired = bam.is_paired(bam_file)
    if paired:
        out_file_2 = out_file_1.replace("-1.fq.gz", "-2.fq.gz")
    else:
        out_file_2 = None
    needs_retry = False
    if is_retry or not utils.file_exists(out_file_1):
        with file_transaction(config, out_file_1) as tx_out_file:
            # Clear any partial outputs left from a previous attempt.
            for f in [tx_out_file, out_file_1, out_file_2]:
                if f and os.path.exists(f):
                    os.remove(f)
            fq1_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, tx_out_file)
            sortprefix = "%s-sort" % os.path.splitext(tx_out_file)[0]
            if paired:
                # The second read file is written straight to its final path;
                # only the first read file goes through the transaction.
                fq2_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, out_file_2)
                out_str = ("F=>({fq1_bgzip_cmd}) F2=>({fq2_bgzip_cmd}) S=/dev/null O=/dev/null "
                           "O2=/dev/null collate=1 colsbs={max_mem}")
            else:
                out_str = "S=>({fq1_bgzip_cmd})"
            bam_file = objectstore.cl_input(bam_file)
            cmd = "{bamtofastq} filename={bam_file} T={sortprefix} " + out_str
            try:
                do.run(cmd.format(**locals()), "BAM to bgzipped fastq",
                       checks=[do.file_reasonable_size(tx_out_file, bam_file)],
                       log_error=False)
            # "as" form is valid on Python 2.6+ and Python 3; the comma form is
            # a syntax error on Python 3.
            except subprocess.CalledProcessError as msg:
                if not is_retry and "deflate failed" in str(msg):
                    logger.info("bamtofastq deflate IO failure preparing %s. Retrying with single core."
                                % (bam_file))
                    needs_retry = True
                else:
                    # exception() requires a message; calling it bare raises
                    # TypeError and hides the real bamtofastq failure.
                    logger.exception("bamtofastq failed preparing %s" % (bam_file))
                    raise
    # Act on the retry flag; without this the announced retry never happens
    # and the caller receives no output files.
    if needs_retry:
        return _bgzip_from_bam(bam_file, dirs, config, is_retry=True)
    return [x for x in [out_file_1, out_file_2] if x is not None and utils.file_exists(x)]
Beispiel #31
0
def is_paired(bam_file):
    """Determine if a BAM file has paired reads.

    Works around issues with head closing the samtools pipe using signal trick from:
    http://stackoverflow.com/a/12451083/252589

    Returns True when at least one of the first records passes the ``paired``
    filter.
    Raises ValueError when the pipeline exits with a code other than 0 or 141,
    or writes anything to stderr other than a message starting with "gof3r"
    and ending with "broken pipe".
    """
    bam_file = objectstore.cl_input(bam_file)
    cmd = ("set -o pipefail; "
           "sambamba view -h {bam_file} | head -50000 | "
           "sambamba view -S -F paired /dev/stdin  | head -1 | wc -l")
    p = subprocess.Popen(cmd.format(bam_file=bam_file), shell=True,
                         executable=do.find_bash(),
                         stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                         preexec_fn=lambda: signal.signal(signal.SIGPIPE, signal.SIG_DFL))
    stdout, stderr = p.communicate()
    # Pipes return bytes on Python 3; decode so the text comparisons below work.
    if isinstance(stderr, bytes):
        stderr = stderr.decode("utf-8", "replace")
    stderr = stderr.strip()
    # Group the exit-code test explicitly: `a or b and c` binds as
    # `a or (b and c)`, which skipped the stderr check on exit code 0.
    # 141 is 128 + SIGPIPE, produced when `head` closes the pipe early.
    if ((p.returncode == 0 or p.returncode == 141) and
         (stderr == "" or (stderr.startswith("gof3r") and stderr.endswith("broken pipe")))):
        return int(stdout) > 0
    else:
        raise ValueError("Failed to check paired status of BAM file: %s" % stderr)
Beispiel #32
0
def _can_use_mem(fastq_file, data, read_min_size=None):
    """bwa-mem handle longer (> 70bp) reads with improved piping.

    Randomly samples 5000 reads from the first two million (8,000,000 fastq
    lines). Default to no piping if more than 75% of the sampled reads are
    small. If we've previously calculated minimum read sizes (from rtg SDF
    output) we can skip the formal check.

    Returns True when bwa-mem piping can be used.
    Raises IOError when the sampling pipeline produces no output.
    """
    min_size = 70
    if read_min_size and read_min_size >= min_size:
        return True
    thresh = 0.75
    head_count = 8000000
    tocheck = 5000
    seqtk = config_utils.get_program("seqtk", data["config"])
    fastq_file = objectstore.cl_input(fastq_file)
    gzip_cmd = "zcat {fastq_file}" if fastq_file.endswith(
        ".gz") else "cat {fastq_file}"
    cmd = (gzip_cmd + " | head -n {head_count} | "
           "{seqtk} sample -s42 - {tocheck} | "
           "awk '{{if(NR%4==2) print length($1)}}' | sort | uniq -c")
    cmd = cmd.format(fastq_file=fastq_file, head_count=head_count,
                     seqtk=seqtk, tocheck=tocheck)

    def fix_signal():
        """Avoid spurious 'cat: write error: Broken pipe' message due to head command.

        Work around from:
        https://bitbucket.org/brodie/cram/issues/16/broken-pipe-when-heading-certain-output
        """
        signal.signal(signal.SIGPIPE, signal.SIG_DFL)

    # check_output returns bytes on Python 3; decode before splitting on a
    # text separator, as fastq_size_output does.
    count_out = subprocess.check_output(cmd,
                                        shell=True,
                                        executable="/bin/bash",
                                        preexec_fn=fix_signal).decode()
    if not count_out.strip():
        raise IOError("Failed to check fastq file sizes with: %s" % cmd)
    shorter = 0
    # Each line of `uniq -c` output is "<count> <read length>".
    for count, size in (l.strip().split()
                        for l in count_out.strip().split("\n")):
        if int(size) < min_size:
            shorter += int(count)
    return (float(shorter) / float(tocheck)) <= thresh
Beispiel #33
0
def fastq_size_output(fastq_file, tocheck):
    """Yield (count, size) string pairs for read lengths sampled from a fastq.

    Takes the first 8,000,000 lines of the file, samples `tocheck` reads with
    seqtk using a fixed seed, and tallies the sequence lengths with
    ``sort | uniq -c``. Each yielded pair is one line of that tally.

    Raises IOError when the pipeline produces no output.
    """
    def _restore_sigpipe():
        # Avoid spurious 'cat: write error: Broken pipe' message due to the
        # head command. Work around from:
        # https://bitbucket.org/brodie/cram/issues/16/broken-pipe-when-heading-certain-output
        signal.signal(signal.SIGPIPE, signal.SIG_DFL)

    head_count = 8000000
    fastq_file = objectstore.cl_input(fastq_file)
    if fastq_file.endswith(".gz"):
        reader = "zcat {fastq_file}"
    else:
        reader = "cat {fastq_file}"
    template = "".join([
        utils.local_path_export(), reader, " | head -n {head_count} | ",
        "seqtk sample -s42 - {tocheck} | ",
        "awk '{{if(NR%4==2) print length($1)}}' | sort | uniq -c",
    ])
    full_cmd = template.format(fastq_file=fastq_file, head_count=head_count,
                               tocheck=tocheck)
    raw = subprocess.check_output(full_cmd, shell=True, executable="/bin/bash",
                                  preexec_fn=_restore_sigpipe).decode()
    tally = raw.strip()
    if not tally:
        raise IOError("Failed to check fastq file sizes with: %s" % full_cmd)
    for line in tally.split("\n"):
        count, size = line.strip().split()
        yield count, size
Beispiel #34
0
def _bgzip_file(finput, config, work_dir, needs_bgzip, needs_gunzip, needs_convert, data):
    """Handle bgzip of input file, potentially gunzipping an existing file.

    Handles cases where finput might be multiple files and need to be
    concatenated by delegating to `_bgzip_multiple_files`. For a single file
    the output lands in work_dir under the input's basename, with ".bz2"
    stripped and ".gz" added unless the input already ends in ".gz".

    Returns the path to the prepared output file.
    Raises ValueError when the file neither needs bgzipping nor is remote.
    """
    if not isinstance(finput, six.string_types):
        assert not needs_convert, "Do not yet handle quality conversion with multiple inputs"
        return _bgzip_multiple_files(finput, work_dir, data)
    in_file = finput
    out_name = os.path.basename(in_file).replace(".bz2", "")
    if not in_file.endswith(".gz"):
        out_name = out_name + ".gz"
    out_file = os.path.join(work_dir, out_name)
    if utils.file_exists(out_file):
        return out_file
    with file_transaction(config, out_file) as tx_out_file:
        bgzip = tools.get_bgzip_cmd(config)
        is_remote = objectstore.is_remote(in_file)
        unpack = needs_gunzip or needs_convert or needs_bgzip or dd.get_trim_ends(data)
        in_file = objectstore.cl_input(in_file, unpack=unpack)
        if needs_convert or dd.get_trim_ends(data):
            in_file = fastq_convert_pipe_cl(in_file, data)
        gunzip_cmd = ""
        bgzip_in = in_file
        if needs_gunzip and not (needs_convert or dd.get_trim_ends(data)):
            # Decompress on the fly and hand the stream to bgzip via stdin.
            if in_file.endswith(".bz2"):
                gunzip_cmd = "bunzip2 -c {in_file} |".format(in_file=in_file)
            else:
                gunzip_cmd = "gunzip -c {in_file} |".format(in_file=in_file)
            bgzip_in = "/dev/stdin"
        if needs_bgzip:
            run_cl = "{gunzip_cmd} {bgzip} -c {bgzip_in} > {tx_out_file}".format(
                gunzip_cmd=gunzip_cmd, bgzip=bgzip, bgzip_in=bgzip_in, tx_out_file=tx_out_file)
            do.run(run_cl, "bgzip input file")
        elif is_remote:
            bgzip = "| bgzip -c" if (needs_convert or dd.get_trim_ends(data)) else ""
            run_cl = "cat {in_file} {bgzip} > {tx_out_file}".format(
                in_file=in_file, bgzip=bgzip, tx_out_file=tx_out_file)
            do.run(run_cl, "Get remote input")
        else:
            details = (in_file, needs_bgzip, needs_gunzip, needs_convert)
            raise ValueError("Unexpected inputs: %s %s %s %s" % details)
    return out_file
Beispiel #35
0
def _seqtk_fastq_prep_cl(data, in_file=None, read_num=0):
    """Provide a commandline for prep of fastq inputs with seqtk.

    Handles fast conversion of fastq quality scores and trimming. Conversion
    (``seqtk seq -Q64 -V``) is added when the quality format is "illumina";
    trimming (``seqtk trimfq``) is added when trim ends are configured and at
    least one of the selected pair is non-zero. The first two trim values are
    used when the read number is 0, otherwise the third and fourth.

    Returns the command string, which is empty when neither step applies.
    Input is read from /dev/stdin when in_file is not given.
    """
    needs_convert = dd.get_quality_format(data).lower() == "illumina"
    trim_ends = dd.get_trim_ends(data)
    seqtk = config_utils.get_program("seqtk", data["config"])
    source = objectstore.cl_input(in_file) if in_file else "/dev/stdin"
    steps = []
    if needs_convert:
        steps.append("{seqtk} seq -Q64 -V {in_file}".format(seqtk=seqtk, in_file=source))
    if trim_ends:
        if data.get("read_num", read_num) == 0:
            left_trim, right_trim = trim_ends[0:2]
        else:
            left_trim, right_trim = trim_ends[2:4]
        if left_trim or right_trim:
            # After a conversion step the trimmer reads the piped stream.
            trim_source = "/dev/stdin" if needs_convert else source
            steps.append("{seqtk} trimfq -b {left_trim} -e {right_trim} {trim_infile}".format(
                seqtk=seqtk, left_trim=left_trim, right_trim=right_trim,
                trim_infile=trim_source))
    return " | ".join(steps)
Beispiel #36
0
def _seqtk_fastq_prep_cl(data, in_file=None, read_num=0):
    """Provide a commandline for prep of fastq inputs with seqtk.

    Handles fast conversion of fastq quality scores and trimming. Returns an
    empty string when the quality format is not "illumina" and no non-zero
    trimming is configured. Input comes from /dev/stdin unless in_file is
    supplied.
    """
    needs_convert = dd.get_quality_format(data).lower() == "illumina"
    trim_ends = dd.get_trim_ends(data)
    seqtk = config_utils.get_program("seqtk", data["config"])
    if in_file:
        in_file = objectstore.cl_input(in_file)
    else:
        in_file = "/dev/stdin"
    convert_cl = ""
    if needs_convert:
        convert_cl = "%s seq -Q64 -V %s" % (seqtk, in_file)
    if not trim_ends:
        return convert_cl
    # Trim values 0-1 apply to read number 0, values 2-3 to the other read.
    is_first_read = data.get("read_num", read_num) == 0
    left_trim, right_trim = trim_ends[0:2] if is_first_read else trim_ends[2:4]
    if not (left_trim or right_trim):
        return convert_cl
    if needs_convert:
        # Chain onto the conversion step and read its piped output.
        prefix, trim_infile = convert_cl + " | ", "/dev/stdin"
    else:
        prefix, trim_infile = convert_cl, in_file
    return prefix + "%s trimfq -b %s -e %s %s" % (seqtk, left_trim, right_trim, trim_infile)
Beispiel #37
0
def _cram_to_fastq_region(cram_file, work_dir, base_name, region, data):
    """Convert CRAM to fastq in a specified region.

    cram_file -- input CRAM
    work_dir -- directory receiving the fastq outputs
    base_name -- prefix for the output file names
    region -- region string passed to bamtofastq as ``ranges``; when empty the
        whole file is converted and the outputs use a "full" suffix
    data -- sample dictionary supplying the reference fasta, core count and
        bamtofastq resources

    Returns a single-item list holding [paired 1, paired 2, single] fastq
    paths. Conversion is skipped when the first paired output already exists.
    """
    ref_file = tz.get_in(["reference", "fasta", "base"], data)
    resources = config_utils.get_resources("bamtofastq", data["config"])
    cores = tz.get_in(["config", "algorithm", "num_cores"], data, 1)
    max_mem = config_utils.convert_to_bytes(resources.get("memory", "1G")) * cores
    rext = "-%s" % region.replace(":", "_").replace("-", "_") if region else "full"
    out_s, out_p1, out_p2, out_o1, out_o2 = [os.path.join(work_dir, "%s%s-%s.fq.gz" %
                                                          (base_name, rext, fext))
                                             for fext in ["s1", "p1", "p2", "o1", "o2"]]
    if not utils.file_exists(out_p1):
        with file_transaction(data, out_s, out_p1, out_p2, out_o1, out_o2) as \
             (tx_out_s, tx_out_p1, tx_out_p2, tx_out_o1, tx_out_o2):
            cram_file = objectstore.cl_input(cram_file)
            sortprefix = "%s-sort" % utils.splitext_plus(tx_out_s)[0]
            cmd = ("bamtofastq filename={cram_file} inputformat=cram T={sortprefix} "
                   "gz=1 collate=1 colsbs={max_mem} exclude=SECONDARY,SUPPLEMENTARY "
                   "F={tx_out_p1} F2={tx_out_p2} S={tx_out_s} O={tx_out_o1} O2={tx_out_o2} "
                   "reference={ref_file}")
            if region:
                cmd += " ranges='{region}'"
            # Parenthesize the fallback: without it the conditional applies to
            # the whole formatted string and the description becomes empty
            # whenever no region is given.
            do.run(cmd.format(**locals()), "CRAM to fastq %s" % (region if region else ""))
    return [[out_p1, out_p2, out_s]]