Example #1
0
def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Perform piped alignment of fastq input files, generating sorted output BAM.
    """
    pair_file = pair_file if pair_file else ""
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    qual_format = data["config"]["algorithm"].get("quality_format", "").lower()
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file = alignprep.split_namedpipe_cl(fastq_file, data)
        if pair_file:
            pair_file = alignprep.split_namedpipe_cl(pair_file, data)
    else:
        final_file = None
        if qual_format == "illumina":
            fastq_file = alignprep.fastq_convert_pipe_cl(fastq_file, data)
            if pair_file:
                pair_file = alignprep.fastq_convert_pipe_cl(pair_file, data)
        else:
            fastq_file = utils.remote_cl_input(fastq_file)
            pair_file = utils.remote_cl_input(pair_file)
    rg_info = novoalign.get_rg_info(names)
    if not utils.file_exists(out_file) and (final_file is None or not utils.file_exists(final_file)):
        # If we cannot do piping, use older bwa aln approach
        if ("bwa-mem" in tz.get_in(["config", "algorithm", "tools_off"], data, [])
              or not _can_use_mem(fastq_file, data)):
            out_file = _align_backtrack(fastq_file, pair_file, ref_file, out_file,
                                        names, rg_info, data)
        else:
            out_file = _align_mem(fastq_file, pair_file, ref_file, out_file,
                                  names, rg_info, data)
    data["work_bam"] = out_file
    return data
Example #2
0
def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Perform piped alignment of fastq input files, generating sorted output BAM.
    """
    pair_file = pair_file if pair_file else ""
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    qual_format = data["config"]["algorithm"].get("quality_format", "").lower()
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file = alignprep.split_namedpipe_cl(fastq_file, data)
        if pair_file:
            pair_file = alignprep.split_namedpipe_cl(pair_file, data)
    else:
        final_file = None
        if qual_format == "illumina":
            fastq_file = alignprep.fastq_convert_pipe_cl(fastq_file, data)
            if pair_file:
                pair_file = alignprep.fastq_convert_pipe_cl(pair_file, data)
        else:
            fastq_file = utils.remote_cl_input(fastq_file)
            pair_file = utils.remote_cl_input(pair_file)
    rg_info = novoalign.get_rg_info(names)
    if not utils.file_exists(out_file) and (final_file is None or not utils.file_exists(final_file)):
        # If we cannot do piping, use older bwa aln approach
        if not _can_use_mem(fastq_file, data):
            out_file = _align_backtrack(fastq_file, pair_file, ref_file, out_file,
                                        names, rg_info, data)
        else:
            out_file = _align_mem(fastq_file, pair_file, ref_file, out_file,
                                  names, rg_info, data)
    data["work_bam"] = out_file
    return data
Example #3
0
def _can_use_mem(fastq_file, data):
    """bwa-mem handle longer (> 70bp) reads with improved piping.
    Randomly samples 5000 reads from the first two million.
    Default to no piping if more than 75% of the sampled reads are small.
    """
    min_size = 70
    thresh = 0.75
    head_count = 8000000
    tocheck = 5000
    seqtk = config_utils.get_program("seqtk", data["config"])
    fastq_file = utils.remote_cl_input(fastq_file)
    gzip_cmd = "zcat {fastq_file}" if fastq_file.endswith(
        ".gz") else "cat {fastq_file}"
    cmd = (gzip_cmd + " | head -n {head_count} | "
           "{seqtk} sample -s42 - {tocheck} | "
           "awk '{{if(NR%4==2) print length($1)}}' | sort | uniq -c")
    count_out = subprocess.check_output(cmd.format(**locals()),
                                        shell=True,
                                        executable="/bin/bash",
                                        stderr=open("/dev/null", "w"))
    if not count_out.strip():
        raise IOError("Failed to check fastq file sizes with: %s" %
                      cmd.format(**locals()))
    shorter = 0
    for count, size in (l.strip().split()
                        for l in count_out.strip().split("\n")):
        if int(size) < min_size:
            shorter += int(count)
    return (float(shorter) / float(tocheck)) <= thresh
Example #4
0
def _cram_to_fastq_region(cram_file, work_dir, base_name, region, data):
    """Convert CRAM to fastq in a specified region.
    """
    ref_file = tz.get_in(["reference", "fasta", "base"], data)
    resources = config_utils.get_resources("bamtofastq", data["config"])
    cores = tz.get_in(["config", "algorithm", "num_cores"], data, 1)
    max_mem = int(resources.get("memory",
                                "1073741824")) * cores  # 1Gb/core default
    rext = "-%s" % region.replace(":", "_").replace("-",
                                                    "_") if region else "full"
    out_s, out_p1, out_p2, out_o1, out_o2 = [
        os.path.join(work_dir, "%s%s-%s.fq.gz" % (base_name, rext, fext))
        for fext in ["s1", "p1", "p2", "o1", "o2"]
    ]
    if not utils.file_exists(out_p1):
        with file_transaction(data, out_s, out_p1, out_p2, out_o1, out_o2) as \
             (tx_out_s, tx_out_p1, tx_out_p2, tx_out_o1, tx_out_o2):
            cram_file = utils.remote_cl_input(cram_file)
            sortprefix = "%s-sort" % utils.splitext_plus(tx_out_s)[0]
            cmd = (
                "bamtofastq filename={cram_file} inputformat=cram T={sortprefix} "
                "gz=1 collate=1 colsbs={max_mem} exclude=SECONDARY,SUPPLEMENTARY "
                "F={tx_out_p1} F2={tx_out_p2} S={tx_out_s} O={tx_out_o1} O2={tx_out_o2} "
                "reference={ref_file}")
            if region:
                cmd += " ranges='{region}'"
            do.run(cmd.format(**locals()),
                   "CRAM to fastq %s" % region if region else "")
    return [[out_p1, out_p2, out_s]]
Example #5
0
def _bgzip_file(in_file, dirs, config, needs_bgzip, needs_gunzip, needs_convert):
    """Handle bgzip of input file, potentially gunzipping an existing file.
    """
    work_dir = utils.safe_makedir(os.path.join(dirs["work"], "align_prep"))
    out_file = os.path.join(work_dir, os.path.basename(in_file) +
                            (".gz" if not in_file.endswith(".gz") else ""))
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            bgzip = tools.get_bgzip_cmd(config)
            is_remote = in_file.startswith(utils.SUPPORTED_REMOTES)
            in_file = utils.remote_cl_input(in_file, unpack=needs_gunzip or needs_convert or needs_bgzip)
            if needs_convert:
                in_file = fastq_convert_pipe_cl(in_file, {"config": config})
            if needs_gunzip:
                gunzip_cmd = "gunzip -c {in_file} |".format(**locals())
                bgzip_in = "/dev/stdin"
            else:
                gunzip_cmd = ""
                bgzip_in = in_file
            if needs_bgzip:
                do.run("{gunzip_cmd} {bgzip} -c {bgzip_in} > {tx_out_file}".format(**locals()),
                       "bgzip input file")
            elif is_remote:
                do.run("cat {in_file} > {tx_out_file}".format(**locals()), "Get remote input")
            else:
                raise ValueError("Unexpected inputs: %s %s %s %s" % (in_file, needs_bgzip,
                                                                     needs_gunzip, needs_convert))
    return out_file
Example #6
0
def align_bam(in_bam, ref_file, names, align_dir, data):
    """Perform direct alignment of an input BAM file with BWA using pipes.

    This avoids disk IO by piping between processes:
     - samtools sort of input BAM to queryname
     - bedtools conversion to interleaved FASTQ
     - bwa-mem alignment
     - samtools conversion to BAM
     - samtools sort to coordinate
    """
    config = data["config"]
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    samtools = config_utils.get_program("samtools", config)
    bedtools = config_utils.get_program("bedtools", config)
    bwa = config_utils.get_program("bwa", config)
    resources = config_utils.get_resources("samtools", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    # adjust memory for samtools since used for input and output
    max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                         3, "decrease").upper()
    rg_info = novoalign.get_rg_info(names)
    if not utils.file_exists(out_file):
        with utils.curdir_tmpdir(data) as work_dir:
            with postalign.tobam_cl(data, out_file, bam.is_paired(in_bam)) as (tobam_cl, tx_out_file):
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                prefix1 = "%s-in1" % tx_out_prefix
                in_bam = utils.remote_cl_input(in_bam)
                cmd = ("{samtools} sort -n -o -l 1 -@ {num_cores} -m {max_mem} {in_bam} {prefix1} "
                       "| {bedtools} bamtofastq -i /dev/stdin -fq /dev/stdout -fq2 /dev/stdout "
                       "| {bwa} mem -p -M -t {num_cores} -R '{rg_info}' -v 1 {ref_file} - | ")
                cmd = cmd.format(**locals()) + tobam_cl
                do.run(cmd, "bwa mem alignment from BAM: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file), do.file_reasonable_size(tx_out_file, in_bam)])
    return out_file
Example #7
0
def fastq_convert_pipe_cl(in_file, data):
    """Create an anonymous pipe converting Illumina 1.3-1.7 to Sanger.

    Uses seqtk: https://github.com/lh3/seqt
    """
    seqtk = config_utils.get_program("seqtk", data["config"])
    in_file = utils.remote_cl_input(in_file)
    return "<({seqtk} seq -Q64 -V {in_file})".format(**locals())
Example #8
0
def fastq_convert_pipe_cl(in_file, data):
    """Create an anonymous pipe converting Illumina 1.3-1.7 to Sanger.

    Uses seqtk: https://github.com/lh3/seqt
    """
    seqtk = config_utils.get_program("seqtk", data["config"])
    in_file = utils.remote_cl_input(in_file)
    return "<({seqtk} seq -Q64 -V {in_file})".format(**locals())
Example #9
0
def is_paired(bam_file):
    """Determine if a BAM file has paired reads.
    """
    bam_file = utils.remote_cl_input(bam_file)
    cmd = ("sambamba view -h {bam_file} | head -50000 | "
           "sambamba view -S -F paired /dev/stdin  | head -1 | wc -l")
    out = subprocess.check_output(cmd.format(**locals()), shell=True,
                                  executable=do.find_bash(),
                                  stderr=open("/dev/null", "w"))
    return int(out) > 0
Example #10
0
def is_paired(bam_file):
    """Determine if a BAM file has paired reads.
    """
    bam_file = utils.remote_cl_input(bam_file)
    cmd = ("sambamba view -h {bam_file} | head -50000 | "
           "sambamba view -S -F paired /dev/stdin  | head -1 | wc -l")
    out = subprocess.check_output(cmd.format(**locals()),
                                  shell=True,
                                  executable=do.find_bash(),
                                  stderr=open("/dev/null", "w"))
    return int(out) > 0
Example #11
0
def _cutadapt_se_cmd(fastq_files, out_files, base_cmd):
    """
    this has to use the -o option, not redirect to stdout in order for gzipping to be
    honored
    """
    min_length = MINIMUM_LENGTH
    cmd = base_cmd + " --minimum-length={min_length} ".format(**locals())
    fq1 = utils.remote_cl_input(fastq_files[0])
    of1 = out_files[0]
    cmd += " -o {of1} " + str(fq1)
    return cmd
Example #12
0
def _cutadapt_se_cmd(fastq_files, out_files, base_cmd):
    """
    this has to use the -o option, not redirect to stdout in order for gzipping to be
    honored
    """
    min_length = MINIMUM_LENGTH
    cmd = base_cmd + " --minimum-length={min_length} ".format(**locals())
    fq1 = utils.remote_cl_input(fastq_files[0])
    of1 = out_files[0]
    cmd += " -o {of1} " + str(fq1)
    return cmd
Example #13
0
def _cutadapt_pe_nosickle(fastq_files, out_files, quality_format, base_cmd):
    """
    sickle has an issue with 0 length reads, here is the open issue for it:
    https://github.com/najoshi/sickle/issues/32
    until that is resolved, this is a workaround which avoids using sickle
    """
    fq1, fq2 = [utils.remote_cl_input(x) for x in fastq_files]
    of1, of2 = out_files
    base_cmd += " --minimum-length={min_length} ".format(min_length=MINIMUM_LENGTH)
    first_cmd = base_cmd + " -o {tmp_fq1} -p {tmp_fq2} " + fq1 + " " + fq2
    second_cmd = base_cmd + " -o {of2_tx} -p {of1_tx} {tmp_fq2} {tmp_fq1}"
    return first_cmd + ";" + second_cmd + "; rm {tmp_fq1} {tmp_fq2} "
Example #14
0
def _cutadapt_pe_nosickle(fastq_files, out_files, quality_format, base_cmd):
    """
    sickle has an issue with 0 length reads, here is the open issue for it:
    https://github.com/najoshi/sickle/issues/32
    until that is resolved, this is a workaround which avoids using sickle
    """
    fq1, fq2 = [utils.remote_cl_input(x) for x in fastq_files]
    of1, of2 = out_files
    base_cmd += " --minimum-length={min_length} ".format(min_length=MINIMUM_LENGTH)
    first_cmd = base_cmd + " -o {tmp_fq1} -p {tmp_fq2} " + fq1 + " " + fq2
    second_cmd = base_cmd + " -o {of2_tx} -p {of1_tx} {tmp_fq2} {tmp_fq1}"
    return first_cmd + ";" + second_cmd + "; rm {tmp_fq1} {tmp_fq2} "
Example #15
0
def _bgzip_from_bam(bam_file, dirs, config, is_retry=False):
    """Create bgzipped fastq files from an input BAM file.
    """
    # tools
    bamtofastq = config_utils.get_program("bamtofastq", config)
    resources = config_utils.get_resources("bamtofastq", config)
    cores = config["algorithm"].get("num_cores", 1)
    max_mem = int(resources.get("memory",
                                "1073741824")) * cores  # 1Gb/core default
    bgzip = tools.get_bgzip_cmd(config, is_retry)
    # files
    work_dir = utils.safe_makedir(os.path.join(dirs["work"], "align_prep"))
    out_file_1 = os.path.join(
        work_dir,
        "%s-1.fq.gz" % os.path.splitext(os.path.basename(bam_file))[0])
    if bam.is_paired(bam_file):
        out_file_2 = out_file_1.replace("-1.fq.gz", "-2.fq.gz")
    else:
        out_file_2 = None
    needs_retry = False
    if is_retry or not utils.file_exists(out_file_1):
        with file_transaction(config, out_file_1) as tx_out_file:
            for f in [tx_out_file, out_file_1, out_file_2]:
                if f and os.path.exists(f):
                    os.remove(f)
            fq1_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, tx_out_file)
            sortprefix = "%s-sort" % os.path.splitext(tx_out_file)[0]
            if bam.is_paired(bam_file):
                fq2_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, out_file_2)
                out_str = (
                    "F=>({fq1_bgzip_cmd}) F2=>({fq2_bgzip_cmd}) S=/dev/null O=/dev/null "
                    "O2=/dev/null collate=1 colsbs={max_mem}")
            else:
                out_str = "S=>({fq1_bgzip_cmd})"
            bam_file = utils.remote_cl_input(bam_file)
            cmd = "{bamtofastq} filename={bam_file} T={sortprefix} " + out_str
            try:
                do.run(cmd.format(**locals()),
                       "BAM to bgzipped fastq",
                       checks=[do.file_reasonable_size(tx_out_file, bam_file)],
                       log_error=False)
            except subprocess.CalledProcessError, msg:
                if not is_retry and "deflate failed" in str(msg):
                    logger.info(
                        "bamtofastq deflate IO failure preparing %s. Retrying with single core."
                        % (bam_file))
                    needs_retry = True
                else:
                    logger.exception()
                    raise
Example #16
0
def align_bam(in_bam, ref_file, names, align_dir, data):
    """Perform direct alignment of an input BAM file with BWA using pipes.

    This avoids disk IO by piping between processes:
     - samtools sort of input BAM to queryname
     - bedtools conversion to interleaved FASTQ
     - bwa-mem alignment
     - samtools conversion to BAM
     - samtools sort to coordinate
    """
    config = data["config"]
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    samtools = config_utils.get_program("samtools", config)
    bedtools = config_utils.get_program("bedtools", config)
    bwa = config_utils.get_program("bwa", config)
    resources = config_utils.get_resources("samtools", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    # adjust memory for samtools since used for input and output
    max_mem = config_utils.adjust_memory(resources.get("memory", "1G"), 3,
                                         "decrease").upper()
    bwa_resources = config_utils.get_resources("bwa", data["config"])
    bwa_params = (" ".join([str(x) for x in bwa_resources.get("options", [])])
                  if "options" in bwa_resources else "")
    rg_info = novoalign.get_rg_info(names)
    if not utils.file_exists(out_file):
        with tx_tmpdir(data) as work_dir:
            with postalign.tobam_cl(data, out_file,
                                    bam.is_paired(in_bam)) as (tobam_cl,
                                                               tx_out_file):
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                prefix1 = "%s-in1" % tx_out_prefix
                in_bam = utils.remote_cl_input(in_bam)
                cmd = (
                    "{samtools} sort -n -o -l 1 -@ {num_cores} -m {max_mem} {in_bam} {prefix1} "
                    "| {bedtools} bamtofastq -i /dev/stdin -fq /dev/stdout -fq2 /dev/stdout "
                    "| {bwa} mem -p -M -t {num_cores} {bwa_params} -R '{rg_info}' -v 1 {ref_file} - | "
                )
                cmd = cmd.format(**locals()) + tobam_cl
                do.run(cmd, "bwa mem alignment from BAM: %s" % names["sample"],
                       None, [
                           do.file_nonempty(tx_out_file),
                           do.file_reasonable_size(tx_out_file, in_bam)
                       ])
    return out_file
Example #17
0
def _bgzip_from_bam(bam_file, dirs, config, is_retry=False):
    """Create bgzipped fastq files from an input BAM file.
    """
    # tools
    bamtofastq = config_utils.get_program("bamtofastq", config)
    resources = config_utils.get_resources("bamtofastq", config)
    cores = config["algorithm"].get("num_cores", 1)
    max_mem = int(resources.get("memory", "1073741824")) * cores  # 1Gb/core default
    bgzip = tools.get_bgzip_cmd(config, is_retry)
    # files
    work_dir = utils.safe_makedir(os.path.join(dirs["work"], "align_prep"))
    out_file_1 = os.path.join(work_dir, "%s-1.fq.gz" % os.path.splitext(os.path.basename(bam_file))[0])
    if bam.is_paired(bam_file):
        out_file_2 = out_file_1.replace("-1.fq.gz", "-2.fq.gz")
    else:
        out_file_2 = None
    needs_retry = False
    if is_retry or not utils.file_exists(out_file_1):
        with file_transaction(config, out_file_1) as tx_out_file:
            for f in [tx_out_file, out_file_1, out_file_2]:
                if f and os.path.exists(f):
                    os.remove(f)
            fq1_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, tx_out_file)
            sortprefix = "%s-sort" % os.path.splitext(tx_out_file)[0]
            if bam.is_paired(bam_file):
                fq2_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, out_file_2)
                out_str = ("F=>({fq1_bgzip_cmd}) F2=>({fq2_bgzip_cmd}) S=/dev/null O=/dev/null "
                           "O2=/dev/null collate=1 colsbs={max_mem}")
            else:
                out_str = "S=>({fq1_bgzip_cmd})"
            bam_file = utils.remote_cl_input(bam_file)
            cmd = "{bamtofastq} filename={bam_file} T={sortprefix} " + out_str
            try:
                do.run(cmd.format(**locals()), "BAM to bgzipped fastq",
                       checks=[do.file_reasonable_size(tx_out_file, bam_file)],
                       log_error=False)
            except subprocess.CalledProcessError, msg:
                if not is_retry and "deflate failed" in str(msg):
                    logger.info("bamtofastq deflate IO failure preparing %s. Retrying with single core."
                                % (bam_file))
                    needs_retry = True
                else:
                    logger.exception()
                    raise
Example #18
0
def _bgzip_file(in_file, dirs, config, needs_bgzip, needs_gunzip, needs_convert):
    """Handle bgzip of input file, potentially gunzipping an existing file.
    """
    work_dir = utils.safe_makedir(os.path.join(dirs["work"], "align_prep"))
    out_file = os.path.join(work_dir, os.path.basename(in_file) +
                            (".gz" if not in_file.endswith(".gz") else ""))
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            assert needs_bgzip
            bgzip = tools.get_bgzip_cmd(config)
            in_file = utils.remote_cl_input(in_file)
            if needs_convert:
                in_file = fastq_convert_pipe_cl(in_file, {"config": config})
            if needs_gunzip:
                gunzip_cmd = "gunzip -c {in_file} |".format(**locals())
                bgzip_in = "/dev/stdin"
            else:
                gunzip_cmd = ""
                bgzip_in = in_file
            do.run("{gunzip_cmd} {bgzip} -c {bgzip_in} > {tx_out_file}".format(**locals()),
                   "bgzip input file")
    return out_file
Example #19
0
def _bgzip_file(in_file, dirs, config, needs_bgzip, needs_gunzip, needs_convert):
    """Handle bgzip of input file, potentially gunzipping an existing file.
    """
    work_dir = utils.safe_makedir(os.path.join(dirs["work"], "align_prep"))
    out_file = os.path.join(work_dir, os.path.basename(in_file) +
                            (".gz" if not in_file.endswith(".gz") else ""))
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            assert needs_bgzip
            bgzip = tools.get_bgzip_cmd(config)
            in_file = utils.remote_cl_input(in_file)
            if needs_convert:
                in_file = fastq_convert_pipe_cl(in_file, {"config": config})
            if needs_gunzip:
                gunzip_cmd = "gunzip -c {in_file} |".format(**locals())
                bgzip_in = "/dev/stdin"
            else:
                gunzip_cmd = ""
                bgzip_in = in_file
            do.run("{gunzip_cmd} {bgzip} -c {bgzip_in} > {tx_out_file}".format(**locals()),
                   "bgzip input file")
    return out_file
Example #20
0
def _bgzip_file(in_file, dirs, config, needs_bgzip, needs_gunzip,
                needs_convert):
    """Handle bgzip of input file, potentially gunzipping an existing file.
    """
    work_dir = utils.safe_makedir(os.path.join(dirs["work"], "align_prep"))
    out_file = os.path.join(
        work_dir,
        os.path.basename(in_file) +
        (".gz" if not in_file.endswith(".gz") else ""))
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            bgzip = tools.get_bgzip_cmd(config)
            is_remote = in_file.startswith(utils.SUPPORTED_REMOTES)
            in_file = utils.remote_cl_input(in_file,
                                            unpack=needs_gunzip
                                            or needs_convert or needs_bgzip)
            if needs_convert:
                in_file = fastq_convert_pipe_cl(in_file, {"config": config})
            if needs_gunzip and not needs_convert:
                gunzip_cmd = "gunzip -c {in_file} |".format(**locals())
                bgzip_in = "/dev/stdin"
            else:
                gunzip_cmd = ""
                bgzip_in = in_file
            if needs_bgzip:
                do.run(
                    "{gunzip_cmd} {bgzip} -c {bgzip_in} > {tx_out_file}".
                    format(**locals()), "bgzip input file")
            elif is_remote:
                bgzip = "| bgzip -c" if needs_convert else ""
                do.run(
                    "cat {in_file} {bgzip} > {tx_out_file}".format(**locals()),
                    "Get remote input")
            else:
                raise ValueError(
                    "Unexpected inputs: %s %s %s %s" %
                    (in_file, needs_bgzip, needs_gunzip, needs_convert))
    return out_file
Example #21
0
def _cram_to_fastq_region(cram_file, work_dir, base_name, region, data):
    """Convert CRAM to fastq in a specified region.
    """
    ref_file = tz.get_in(["reference", "fasta", "base"], data)
    resources = config_utils.get_resources("bamtofastq", data["config"])
    cores = tz.get_in(["config", "algorithm", "num_cores"], data, 1)
    max_mem = int(resources.get("memory", "1073741824")) * cores  # 1Gb/core default
    rext = "-%s" % region.replace(":", "_").replace("-", "_") if region else "full"
    out_s, out_p1, out_p2 = [os.path.join(work_dir, "%s%s-%s.fq.gz" %
                                          (base_name, rext, fext))
                             for fext in ["s1", "p1", "p2"]]
    if not utils.file_exists(out_p1):
        with file_transaction(data, out_s, out_p1, out_p2) as (tx_out_s, tx_out_p1, tx_out_p2):
            cram_file = utils.remote_cl_input(cram_file)
            sortprefix = "%s-sort" % utils.splitext_plus(tx_out_s)[0]
            cmd = ("bamtofastq filename={cram_file} inputformat=cram T={sortprefix} "
                   "gz=1 collate=1 colsbs={max_mem} "
                   "F={tx_out_p1} F2={tx_out_p2} S={tx_out_s} O=/dev/null O2=/dev/null "
                   "reference={ref_file}")
            if region:
                cmd += " ranges='{region}'"
            do.run(cmd.format(**locals()), "CRAM to fastq %s" % region if region else "")
    return [[out_p1, out_p2, out_s]]
Example #22
0
def _can_use_mem(fastq_file, data):
    """bwa-mem handle longer (> 70bp) reads with improved piping.
    Randomly samples 5000 reads from the first two million.
    Default to no piping if more than 75% of the sampled reads are small.
    """
    min_size = 70
    thresh = 0.75
    head_count = 8000000
    tocheck = 5000
    seqtk = config_utils.get_program("seqtk", data["config"])
    fastq_file = utils.remote_cl_input(fastq_file)
    gzip_cmd = "zcat {fastq_file}" if fastq_file.endswith(".gz") else "cat {fastq_file}"
    cmd = (gzip_cmd + " | head -n {head_count} | "
           "{seqtk} sample -s42 - {tocheck} | "
           "awk '{{if(NR%4==2) print length($1)}}' | sort | uniq -c")
    count_out = subprocess.check_output(cmd.format(**locals()), shell=True,
                                        executable="/bin/bash", stderr=open("/dev/null", "w"))
    if not count_out.strip():
        raise IOError("Failed to check fastq file sizes with: %s" % cmd.format(**locals()))
    shorter = 0
    for count, size in (l.strip().split() for l in count_out.strip().split("\n")):
        if int(size) < min_size:
            shorter += int(count)
    return (float(shorter) / float(tocheck)) <= thresh