Example #1
def _bgzip_from_bam(bam_file, dirs, config):
    """Create bgzipped fastq files from an input BAM file.
    """
    # tools
    bamtofastq = config_utils.get_program("bamtofastq", config)
    resources = config_utils.get_resources("bamtofastq", config)
    cores = config["algorithm"].get("num_cores", 1)
    max_mem = int(resources.get("memory", "1073741824")) * cores # 1Gb/core default
    bgzip = _get_bgzip_cmd(config)
    # files
    work_dir = utils.safe_makedir(os.path.join(dirs["work"], "align_prep"))
    out_file_1 = os.path.join(work_dir, "%s-1.fq.gz" % os.path.splitext(os.path.basename(bam_file))[0])
    if bam.is_paired(bam_file):
        out_file_2 = out_file_1.replace("-1.fq.gz", "-2.fq.gz")
    else:
        out_file_2 = None
    if not utils.file_exists(out_file_1):
        with file_transaction(out_file_1) as tx_out_file:
            fq1_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, tx_out_file)
            sortprefix = "%s-sort" % os.path.splitext(tx_out_file)[0]
            if bam.is_paired(bam_file):
                fq2_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, out_file_2)
                out_str = ("F=>({fq1_bgzip_cmd}) F2=>({fq2_bgzip_cmd}) S=/dev/null O=/dev/null "
                           "O2=/dev/null collate=1 colsbs={max_mem}")
            else:
                out_str = "S=>({fq1_bgzip_cmd})"
            cmd = "{bamtofastq} filename={bam_file} T={sortprefix} " + out_str
            do.run(cmd.format(**locals()), "BAM to bgzipped fastq",
                   checks=[do.file_reasonable_size(tx_out_file, bam_file)])
    return [x for x in [out_file_1, out_file_2] if x is not None]
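Every example on this page branches on bam.is_paired. As a point of reference, here is a minimal sketch of what that helper presumably does, assuming pysam is available (bcbio's actual implementation may inspect reads differently):

import pysam

def is_paired(bam_file):
    """Return True if the first read in the BAM has its paired flag set.

    A sketch for illustration, not bcbio's exact implementation.
    """
    with pysam.AlignmentFile(bam_file, "rb", check_sq=False) as in_bam:
        for read in in_bam:
            return read.is_paired
    return False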
Example #2
def _bgzip_from_bam(bam_file, dirs, config, is_retry=False, output_infix=''):
    """Create bgzipped fastq files from an input BAM file.
    """
    # tools
    bamtofastq = config_utils.get_program("bamtofastq", config)
    resources = config_utils.get_resources("bamtofastq", config)
    cores = config["algorithm"].get("num_cores", 1)
    max_mem = config_utils.convert_to_bytes(resources.get("memory",
                                                          "1G")) * cores
    bgzip = tools.get_bgzip_cmd(config, is_retry)
    # files
    work_dir = utils.safe_makedir(os.path.join(dirs["work"], "align_prep"))
    out_file_1 = os.path.join(
        work_dir, "%s%s-1.fq.gz" %
        (os.path.splitext(os.path.basename(bam_file))[0], output_infix))
    out_file_2 = out_file_1.replace("-1.fq.gz", "-2.fq.gz")
    needs_retry = False
    if is_retry or not utils.file_exists(out_file_1):
        if not bam.is_paired(bam_file):
            out_file_2 = None
        with file_transaction(config, out_file_1) as tx_out_file:
            for f in [tx_out_file, out_file_1, out_file_2]:
                if f and os.path.exists(f):
                    os.remove(f)
            fq1_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, tx_out_file)
            sortprefix = "%s-sort" % os.path.splitext(tx_out_file)[0]
            if bam.is_paired(bam_file):
                fq2_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, out_file_2)
                out_str = (
                    "F=>({fq1_bgzip_cmd}) F2=>({fq2_bgzip_cmd}) S=/dev/null O=/dev/null "
                    "O2=/dev/null collate=1 colsbs={max_mem}")
            else:
                out_str = "S=>({fq1_bgzip_cmd})"
            bam_file = objectstore.cl_input(bam_file)
            cmd = "{bamtofastq} filename={bam_file} T={sortprefix} " + out_str
            try:
                do.run(cmd.format(**locals()),
                       "BAM to bgzipped fastq",
                       checks=[do.file_reasonable_size(tx_out_file, bam_file)],
                       log_error=False)
            except subprocess.CalledProcessError as msg:
                if not is_retry and "deflate failed" in str(msg):
                    logger.info(
                        "bamtofastq deflate IO failure preparing %s. Retrying with single core."
                        % (bam_file))
                    needs_retry = True
                else:
                    logger.exception()
                    raise
    if needs_retry:
        return _bgzip_from_bam(bam_file, dirs, config, is_retry=True)
    else:
        return [
            x for x in [out_file_1, out_file_2]
            if x is not None and utils.file_exists(x)
        ]
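A note on the command string above: the F=>(...), F2=>(...) and S=>(...) redirections passed to bamtofastq are bash process substitutions, so the assembled command has to run under bash rather than /bin/sh (presumably do.run arranges that). A standalone sketch of the same pattern, assuming bgzip is on PATH:

import subprocess

# Write bgzipped output through a process substitution, as the examples do.
cmd = "cat reads.fq > >(bgzip -c /dev/stdin > reads.fq.gz)"
subprocess.check_call(cmd, shell=True, executable="/bin/bash")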
Example #3
def _bgzip_from_bam(bam_file, dirs, data, is_retry=False, output_infix=''):
    """Create bgzipped fastq files from an input BAM file.
    """
    # tools
    config = data["config"]
    bamtofastq = config_utils.get_program("bamtofastq", config)
    resources = config_utils.get_resources("bamtofastq", config)
    cores = config["algorithm"].get("num_cores", 1)
    max_mem = config_utils.convert_to_bytes(resources.get("memory", "1G")) * cores
    bgzip = tools.get_bgzip_cmd(config, is_retry)
    # files
    work_dir = utils.safe_makedir(os.path.join(dirs["work"], "align_prep"))
    out_file_1 = os.path.join(work_dir, "%s%s-1.fq.gz" % (os.path.splitext(os.path.basename(bam_file))[0], output_infix))
    out_file_2 = out_file_1.replace("-1.fq.gz", "-2.fq.gz")
    needs_retry = False
    if is_retry or not utils.file_exists(out_file_1):
        if not bam.is_paired(bam_file):
            out_file_2 = None
        with file_transaction(config, out_file_1) as tx_out_file:
            for f in [tx_out_file, out_file_1, out_file_2]:
                if f and os.path.exists(f):
                    os.remove(f)
            fq1_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, tx_out_file)
            prep_cmd = _seqtk_fastq_prep_cl(data, read_num=0)
            if prep_cmd:
                fq1_bgzip_cmd = prep_cmd + " | " + fq1_bgzip_cmd
            sortprefix = "%s-sort" % os.path.splitext(tx_out_file)[0]
            if bam.is_paired(bam_file):
                prep_cmd = _seqtk_fastq_prep_cl(data, read_num=1)
                fq2_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, out_file_2)
                if prep_cmd:
                    fq2_bgzip_cmd = prep_cmd + " | " + fq2_bgzip_cmd
                out_str = ("F=>({fq1_bgzip_cmd}) F2=>({fq2_bgzip_cmd}) S=/dev/null O=/dev/null "
                           "O2=/dev/null collate=1 colsbs={max_mem}")
            else:
                out_str = "S=>({fq1_bgzip_cmd})"
            bam_file = objectstore.cl_input(bam_file)
            extra_opts = " ".join([str(x) for x in resources.get("options", [])])
            cmd = "{bamtofastq} filename={bam_file} T={sortprefix} {extra_opts} " + out_str
            try:
                do.run(cmd.format(**locals()), "BAM to bgzipped fastq",
                       checks=[do.file_reasonable_size(tx_out_file, bam_file)],
                       log_error=False)
            except subprocess.CalledProcessError as msg:
                if not is_retry and "deflate failed" in str(msg):
                    logger.info("bamtofastq deflate IO failure preparing %s. Retrying with single core."
                                % (bam_file))
                    needs_retry = True
                else:
                    logger.exception()
                    raise
    if needs_retry:
        return _bgzip_from_bam(bam_file, dirs, data, is_retry=True)
    else:
        return [x for x in [out_file_1, out_file_2] if x is not None and utils.file_exists(x)]
Example #4
def run(bam_file, data, out_dir):
    config = data["config"]
    if "picard" not in dd.get_tools_on(data):
        return {}
    ref_file = dd.get_ref_file(data)
    sample = dd.get_sample_name(data)
    target_file = dd.get_variant_regions(data)
    broad_runner = broad.PicardCmdRunner("picard", config)
    bam_fname = os.path.abspath(bam_file)
    path = os.path.dirname(bam_fname)
    utils.safe_makedir(out_dir)
    hsmetric_file = os.path.join(out_dir, "%s-sort.hs_metrics" % sample)
    hsinsert_file = os.path.join(out_dir, "%s-sort.insert_metrics" % sample)
    if utils.file_exists(hsmetric_file):
        return hsmetric_file
    with utils.chdir(out_dir):
        with tx_tmpdir() as tmp_dir:
            cur_bam = os.path.basename(bam_fname)
            if not os.path.exists(cur_bam):
                os.symlink(bam_fname, cur_bam)
            gen_metrics = PicardMetrics(broad_runner, tmp_dir)
            gen_metrics.report(cur_bam, ref_file, bam.is_paired(bam_fname),
                               target_file, target_file, None, config)
    do.run("sed -i 's/-sort.bam//g' %s" % hsmetric_file, "")
    do.run("sed -i 's/-sort.bam//g' %s" % hsinsert_file, "")
    return hsmetric_file
Example #5
def run(name, chip_bam, input_bam, genome_build, out_dir, method, resources, data):
    """
    Run macs2 for chip and input samples, avoiding
    errors due to problematic samples.
    """
    # the output file name needs to include the caller name
    config = dd.get_config(data)
    out_file = os.path.join(out_dir, name + "_peaks_macs2.xls")
    macs2_file = os.path.join(out_dir, name + "_peaks.xls")
    if utils.file_exists(out_file):
        _compres_bdg_files(out_dir)
        return _get_output_files(out_dir)
    macs2 = config_utils.get_program("macs2", config)
    options = " ".join(resources.get("macs2", {}).get("options", []))
    genome_size = bam.fasta.total_sequence_length(dd.get_ref_file(data))
    genome_size = "" if options.find("-g") > -1 else "-g %s" % genome_size
    paired = "-f BAMPE" if bam.is_paired(chip_bam) else ""
    with utils.chdir(out_dir):
        cmd = _macs2_cmd(method)
        try:
            do.run(cmd.format(**locals()), "macs2 for %s" % name)
            utils.move_safe(macs2_file, out_file)
        except subprocess.CalledProcessError:
            raise RuntimeWarning("macs2 terminated with an error.\n"
                                 "Please, check the message and report "
                                 "error if it is related to bcbio.\n"
                                 "You can add specific options for the sample "
                                 "setting resources as explained in docs: "
                                 "https://bcbio-nextgen.readthedocs.org/en/latest/contents/configuration.html#sample-specific-resources")
    _compres_bdg_files(out_dir)
    return _get_output_files(out_dir)
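_macs2_cmd returns a command template that do.run later fills in from the caller's locals(). A hypothetical sketch of such a template for the default ChIP method (the brace names match variables defined in run above; the real helper also handles other methods):

def _macs2_cmd(method="chip"):
    # Hypothetical sketch: a template formatted with the caller's locals().
    return ("{macs2} callpeak -t {chip_bam} -c {input_bam} {paired} "
            "{genome_size} -n {name} --bdg")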
Example #6
def run(bam_file, data, out_dir):
    if "picard" not in dd.get_tools_on(data):
        return {}
    ref_file = dd.get_ref_file(data)
    sample = dd.get_sample_name(data)
    target_file = dd.get_variant_regions(data) or dd.get_sample_callable(data)
    broad_runner = broad.PicardCmdRunner("picard", data["config"])
    bam_fname = os.path.abspath(bam_file)
    path = os.path.dirname(bam_fname)
    utils.safe_makedir(out_dir)
    out_base = utils.splitext_plus(os.path.basename(bam_fname))[0]
    hsmetric_file = os.path.join(out_dir, "%s.hs_metrics" % out_base)
    hsinsert_file = os.path.join(out_dir, "%s.insert_metrics" % out_base)
    if not utils.file_exists(hsmetric_file) and not utils.file_exists(hsinsert_file):
        with utils.chdir(out_dir):
            with tx_tmpdir() as tmp_dir:
                cur_bam = os.path.basename(bam_fname)
                if not os.path.exists(cur_bam):
                    os.symlink(bam_fname, cur_bam)
                gen_metrics = PicardMetrics(broad_runner, tmp_dir)
                gen_metrics.report(cur_bam, ref_file,
                                bam.is_paired(bam_fname),
                                target_file, target_file, None, data["config"])
        if utils.file_exists(hsmetric_file):
            do.run("sed -i 's/%s.bam//g' %s" % (out_base.replace(sample, ""), hsmetric_file), "")
        if utils.file_exists(hsinsert_file):
            do.run("sed -i 's/%s.bam//g' %s" % (out_base.replace(sample, ""), hsinsert_file), "")
    return hsmetric_file
Example #7
def to_sdf(files, data):
    """Convert a fastq or BAM input into a SDF indexed file.
    """
    # BAM
    if len(files) == 1 and files[0].endswith(".bam"):
        qual = []
        format = ["-f", "sam-pe" if bam.is_paired(files[0]) else "sam-se"]
        inputs = [files[0]]
    # fastq
    else:
        qual = ["-q", "illumina" if dd.get_quality_format(data).lower() == "illumina" else "sanger"]
        format = ["-f", "fastq"]
        if len(files) == 2:
            inputs = ["-l", files[0], "-r", files[1]]
        else:
            assert len(files) == 1
            inputs = [files[0]]
    work_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "align_prep"))
    out_file = os.path.join(work_dir,
                            "%s.sdf" % utils.splitext_plus(os.path.basename(os.path.commonprefix(files)))[0])
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            cmd = _rtg_cmd(["rtg", "format", "-o", tx_out_file] + format + qual + inputs)
            do.run(cmd, "Format inputs to indexed SDF")
    return out_file
Example #8
def align(fastq_file, pair_file, index_dir, names, align_dir, data):
    """Perform piped alignment of fastq input files, generating sorted, deduplicated BAM.

    TODO: Use streaming with new development version of SNAP to feed into
    structural variation preparation de-duplication.
    """
    pair_file = pair_file if pair_file else ""
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    assert not data.get(
        "align_split"), "Split alignments not supported with SNAP"
    snap = config_utils.get_program("snap", data["config"])
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    resources = config_utils.get_resources("snap", data["config"])
    rg_info = novoalign.get_rg_info(names)
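    # paired if the BAM's flags say so, or if a separate pair fastq was
    # supplied; only the truthiness of this value is used downstream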
    is_paired = bam.is_paired(fastq_file) if fastq_file.endswith(
        ".bam") else pair_file
    if not utils.file_exists(out_file):
        with postalign.tobam_cl(data, out_file,
                                is_paired) as (tobam_cl, tx_out_file):
            cmd_name = "paired" if is_paired else "single"
            cmd = ("{snap} {cmd_name} {index_dir} {fastq_file} {pair_file} "
                   "-R '{rg_info}' -t {num_cores} -M -o -sam - | ")
            do.run(
                cmd.format(**locals()) + tobam_cl,
                "SNAP alignment: %s" % names["sample"])
    data["work_bam"] = out_file
    return data
Example #9
def align(fastq_file, pair_file, index_dir, names, align_dir, data):
    """Perform piped alignment of fastq input files, generating sorted, deduplicated BAM.

    TODO: Use streaming with new development version of SNAP to feed into
    structural variation preparation de-duplication.
    """
    pair_file = pair_file if pair_file else ""
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    assert not data.get("align_split"), "Split alignments not supported with SNAP"
    snap = config_utils.get_program("snap", data["config"])
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    resources = config_utils.get_resources("snap", data["config"])
    max_mem = resources.get("memory", "1G")
    rg_info = novoalign.get_rg_info(names)
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            with utils.curdir_tmpdir(data) as work_dir:
                if fastq_file.endswith(".bam"):
                    cmd_name = "paired" if bam.is_paired(fastq_file) else "single"
                else:
                    cmd_name = "single" if not pair_file else "paired"
                cmd = ("{snap} {cmd_name} {index_dir} {fastq_file} {pair_file} "
                       "-rg '{rg_info}' -t {num_cores} -sa -so -sm {max_mem} -o {tx_out_file}")
                do.run(cmd.format(**locals()), "SNAP alignment: %s" % names["sample"])
    data["work_bam"] = out_file
    return data
Example #10
def run_rnaseq(bam_file, data, out_dir):
    """
    Run qualimap for a rnaseq bam file and parse results
    """
    strandedness = {"firststrand": "strand-specific-reverse",
                    "secondstrand": "strand-specific-forward",
                    "unstranded": "non-strand-specific"}
    report_file = os.path.join(out_dir, "qualimapReport.html")
    raw_file = os.path.join(out_dir, "rnaseq_qc_results.txt")
    config = data["config"]
    gtf_file = dd.get_gtf_file(data)
    single_end = not bam.is_paired(bam_file)
    library = strandedness[dd.get_strandedness(data)]
    if not utils.file_exists(report_file):
        utils.safe_makedir(out_dir)
        bam.index(bam_file, config)
        cmd = _rnaseq_qualimap_cmd(data, bam_file, out_dir, gtf_file, single_end, library)
        do.run(cmd, "Qualimap for {}".format(dd.get_sample_name(data)))
        cmd = "sed -i 's/bam file = .*/bam file = %s.bam/' %s" % (dd.get_sample_name(data), raw_file)
        do.run(cmd, "Fix Name Qualimap for {}".format(dd.get_sample_name(data)))
    metrics = _parse_rnaseq_qualimap_metrics(report_file)
    metrics.update(_detect_duplicates(bam_file, out_dir, data))
    metrics.update(_detect_rRNA(data))
    metrics.update({"Average insert size": bam.estimate_fragment_size(bam_file)})
    metrics = _parse_metrics(metrics)
    return metrics
Example #11
def run(bam_file, data, out_dir):
    if "picard" not in dd.get_tools_on(data):
        return {}
    ref_file = dd.get_ref_file(data)
    sample = dd.get_sample_name(data)
    target_file = dd.get_variant_regions(data) or dd.get_sample_callable(data)
    broad_runner = broad.PicardCmdRunner("picard", data["config"])
    bam_fname = os.path.abspath(bam_file)
    path = os.path.dirname(bam_fname)
    utils.safe_makedir(out_dir)
    out_base = utils.splitext_plus(os.path.basename(bam_fname))[0]
    hsmetric_file = os.path.join(out_dir, "%s.hs_metrics" % out_base)
    hsinsert_file = os.path.join(out_dir, "%s.insert_metrics" % out_base)
    if not utils.file_exists(hsmetric_file) and not utils.file_exists(
            hsinsert_file):
        with utils.chdir(out_dir):
            with tx_tmpdir() as tmp_dir:
                cur_bam = os.path.basename(bam_fname)
                if not os.path.exists(cur_bam):
                    os.symlink(bam_fname, cur_bam)
                gen_metrics = PicardMetrics(broad_runner, tmp_dir)
                gen_metrics.report(cur_bam, ref_file, bam.is_paired(bam_fname),
                                   target_file, target_file, None,
                                   data["config"])
        if utils.file_exists(hsmetric_file):
            do.run(
                "sed -i 's/%s.bam//g' %s" %
                (out_base.replace(sample, ""), hsmetric_file), "")
        if utils.file_exists(hsinsert_file):
            do.run(
                "sed -i 's/%s.bam//g' %s" %
                (out_base.replace(sample, ""), hsinsert_file), "")
    return hsmetric_file
Example #12
def run(name, chip_bam, input_bam, genome_build, out_dir, method, config):
    """
    Run macs2 for chip and input samples, avoiding
    errors due to problematic samples.
    """
    # the output file name needs to include the caller name
    out_file = os.path.join(out_dir, name + "_peaks_macs2.xls")
    macs2_file = os.path.join(out_dir, name + "_peaks.xls")
    if utils.file_exists(out_file):
        return out_file
    macs2 = config_utils.get_program("macs2", config)
    options = " ".join(config_utils.get_resources("macs2", config).get("options", []))
    if genome_build not in HS and options.find("-g") == -1:
        raise ValueError("This %s genome doesn't have a pre-set value. "
                         "You can add specific values using the resources "
                         "option for macs2 in the YAML file (-g genome_size). "
                         "Check the ChIP-seq configuration in the "
                         "bcbio-nextgen documentation." % genome_build)

    genome_size = "" if options.find("-g") > -1 else "-g %s" % HS[genome_build]
    paired = "-f BAMPE" if bam.is_paired(chip_bam) else ""
    with utils.chdir(out_dir):
        cmd = _macs2_cmd(method)
        try:
            do.run(cmd.format(**locals()), "macs2 for %s" % name)
            utils.move_safe(macs2_file, out_file)
        except subprocess.CalledProcessError:
            raise RuntimeWarning("macs2 terminated with an error.\n"
                                 "Please, check the message and report "
                                 "error if it is related to bcbio.\n"
                                 "You can add specific options for the sample "
                                 "setting resources as explained in docs: "
                                 "https://bcbio-nextgen.readthedocs.org/en/latest/contents/configuration.html#sample-specific-resources")
    return out_file
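HS is a module-level lookup of macs2 effective genome sizes keyed by genome build. A sketch of the kind of mapping assumed here, using the shortcut codes the macs2 documentation defines (the real table may list explicit sizes instead):

# Assumed shape of the HS table; macs2 accepts hs/mm/ce/dm shortcuts for -g.
HS = {"hg19": "hs", "GRCh37": "hs", "hg38": "hs",
      "mm9": "mm", "mm10": "mm",
      "ce11": "ce", "dm3": "dm"}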
Example #13
def align_bam(in_bam, ref_file, names, align_dir, data):
    """Perform direct alignment of an input BAM file with BWA using pipes.

    This avoids disk IO by piping between processes:
     - samtools sort of input BAM to queryname
     - bedtools conversion to interleaved FASTQ
     - bwa-mem alignment
     - samtools conversion to BAM
     - samtools sort to coordinate
    """
    config = data["config"]
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    samtools = config_utils.get_program("samtools", config)
    bedtools = config_utils.get_program("bedtools", config)
    bwa = config_utils.get_program("bwa", config)
    resources = config_utils.get_resources("samtools", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    # adjust memory for samtools since used for input and output
    max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                         3, "decrease")
    rg_info = novoalign.get_rg_info(names)
    if not utils.file_exists(out_file):
        with utils.curdir_tmpdir() as work_dir:
            with postalign.tobam_cl(data, out_file, bam.is_paired(in_bam)) as (tobam_cl, tx_out_file):
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                prefix1 = "%s-in1" % tx_out_prefix
                cmd = ("{samtools} sort -n -o -l 0 -@ {num_cores} -m {max_mem} {in_bam} {prefix1} "
                       "| {bedtools} bamtofastq -i /dev/stdin -fq /dev/stdout -fq2 /dev/stdout "
                       "| {bwa} mem -p -M -t {num_cores} -R '{rg_info}' -v 1 {ref_file} - | ")
                cmd = cmd.format(**locals()) + tobam_cl
                do.run(cmd, "bwa mem alignment from BAM: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file), do.file_reasonable_size(tx_out_file, in_bam)])
    return out_file
Example #14
def rsem_calculate_expression(bam_file, rsem_genome_dir, samplename, build,
                              out_dir, cores=1):
    """
    works only in unstranded mode for now (--forward-prob 0.5)
    """
    if not which("rsem-calculate-expression"):
        logger.info("Skipping RSEM because rsem-calculate-expression could "
                    "not be found.")
        return None

    sentinel_file = os.path.join(out_dir, samplename + "Test.genes.results")
    if file_exists(sentinel_file):
        return out_dir

    paired_flag = "--paired" if bam.is_paired(bam_file) else ""
    core_flag = "-p {cores}".format(cores=cores)
    cmd = ("rsem-calculate-expression --bam {core_flag} {paired_flag} --no-bam-output "
           "--forward-prob 0.5 --estimate-rspd {bam_file} {rsem_genome_dir}/{build} "
           "{samplename}")
    message = "Calculating transcript expression of {bam_file} using RSEM."
    with file_transaction(out_dir) as tx_out_dir:
        safe_makedir(tx_out_dir)
        with chdir(tx_out_dir):
            do.run(cmd.format(**locals()), message.format(**locals()))
    return out_dir
Example #15
def create_peaktable(samples):
    """create a table of peak counts per sample to use with differential peak calling
    """
    data = dd.get_data_from_sample(samples[0])
    peakcounts = []
    out_dir = os.path.join(dd.get_work_dir(data), "consensus")
    out_file = os.path.join(out_dir, "consensus-counts.tsv")
    if dd.get_chip_method(data) == "chip":
        for data in dd.sample_data_iterator(samples):
            peakcounts.append(tz.get_in(("peak_counts",), data))
    elif dd.get_chip_method(data) == "atac":
        for data in dd.sample_data_iterator(samples):
            if bam.is_paired(dd.get_work_bam(data)):
                peakcounts.append(tz.get_in(("peak_counts", "NF"), data))
            else:
                logger.info(f"Creating peak table from full BAM file because "
                            f"{dd.get_work_bam(data)} is single-ended.")
                peakcounts.append(tz.get_in(("peak_counts", "full"), data))
    combined_peaks = count.combine_count_files(peakcounts,
                                               out_file,
                                               ext=".counts")
    new_data = []
    for data in dd.sample_data_iterator(samples):
        data = tz.assoc_in(data, ("peak_counts", "peaktable"), combined_peaks)
        new_data.append(data)
    new_samples = dd.get_samples_from_datalist(new_data)
    return new_samples
Example #16
def rsem_calculate_expression(bam_file, rsem_genome_dir, samplename,
                              build, out_dir, cores=1):
    """
    works only in unstranded mode for now (--forward-prob 0.5)
    """
    if not utils.which("rsem-calculate-expression"):
        logger.info("Skipping RSEM because rsem-calculate-expression could "
                    "not be found.")
        return None

    sentinel_file = os.path.join(out_dir, samplename + "Test.genes.results")
    if utils.file_exists(sentinel_file):
        return out_dir

    paired_flag = "--paired" if bam.is_paired(bam_file) else ""
    core_flag = "-p {cores}".format(cores=cores)
    command = CALCULATE_EXP.format(
        core_flag=core_flag, paired_flag=paired_flag, bam_file=bam_file,
        rsem_genome_dir=rsem_genome_dir, build=build, samplename=samplename)
    message = "Calculating transcript expression of {bam_file} using RSEM."

    with transaction.file_transaction(out_dir) as tx_out_dir:
        utils.safe_makedir(tx_out_dir)
        with utils.chdir(tx_out_dir):
            do.run(command, message.format(bam_file=bam_file))
    return out_dir
Example #17
def rsem_calculate_expression(bam_file,
                              rsem_genome_dir,
                              samplename,
                              build,
                              out_dir,
                              cores=1):
    """
    works only in unstranded mode for now (--forward-prob 0.5)
    """
    if not utils.which("rsem-calculate-expression"):
        logger.info("Skipping RSEM because rsem-calculate-expression could "
                    "not be found.")
        return None

    sentinel_file = os.path.join(out_dir, samplename + "Test.genes.results")
    if utils.file_exists(sentinel_file):
        return out_dir

    paired_flag = "--paired" if bam.is_paired(bam_file) else ""
    core_flag = "-p {cores}".format(cores=cores)
    command = CALCULATE_EXP.format(core_flag=core_flag,
                                   paired_flag=paired_flag,
                                   bam_file=bam_file,
                                   rsem_genome_dir=rsem_genome_dir,
                                   build=build,
                                   samplename=samplename)
    message = "Calculating transcript expression of {bam_file} using RSEM."

    with transaction.file_transaction(out_dir) as tx_out_dir:
        utils.safe_makedir(tx_out_dir)
        with utils.chdir(tx_out_dir):
            do.run(command, message.format(bam_file=bam_file))
    return out_dir
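Both of the preceding variants format a module-level CALCULATE_EXP template that is not shown on this page. Judging from the inline command in the earlier rsem_calculate_expression example, it presumably looks like:

CALCULATE_EXP = (
    "rsem-calculate-expression --bam {core_flag} {paired_flag} --no-bam-output "
    "--forward-prob 0.5 --estimate-rspd {bam_file} {rsem_genome_dir}/{build} "
    "{samplename}")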
Example #18
def filter_multimappers(align_file, data):
    """
    It does not seem like bowtie2 has an equivalent to the -m 1 flag in bowtie;
    there are some options that are close but don't do the same thing. Bowtie2
    sets the XS flag for reads mapping in more than one place, so we can just
    filter on that. This will not work for other aligners.
    """
    config = dd.get_config(data)
    type_flag = "" if bam.is_bam(align_file) else "S"
    base, ext = os.path.splitext(align_file)
    out_file = base + ".unique" + ext
    bed_file = dd.get_variant_regions(data)
    bed_cmd = '-L {0}'.format(bed_file) if bed_file else " "
    if utils.file_exists(out_file):
        return out_file
    base_filter = '-F "[XS] == null and not unmapped {paired_filter} and not duplicate" '
    if bam.is_paired(align_file):
        paired_filter = "and paired and proper_pair"
    else:
        paired_filter = ""
    filter_string = base_filter.format(paired_filter=paired_filter)
    sambamba = config_utils.get_program("sambamba", config)
    num_cores = dd.get_num_cores(data)
    with file_transaction(out_file) as tx_out_file:
        cmd = ('{sambamba} view -h{type_flag} '
               '--nthreads {num_cores} '
               '-f bam {bed_cmd} '
               '{filter_string} '
               '{align_file} '
               '> {tx_out_file}')
        message = "Removing multimapped reads from %s." % align_file
        do.run(cmd.format(**locals()), message)
    bam.index(out_file, config)
    return out_file
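bam.is_bam here only distinguishes BAM from SAM input so that sambamba view gets the -S flag for SAM; a plausible sketch:

def is_bam(align_file):
    # An extension check is enough for this use; sambamba needs -S for SAM input.
    return align_file.endswith(".bam")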
Example #19
def align_bam(in_bam, ref_file, names, align_dir, data):
    """Perform realignment of input BAM file; uses unix pipes for avoid IO.
    """
    config = data["config"]
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    novoalign = config_utils.get_program("novoalign", config)
    samtools = config_utils.get_program("samtools", config)
    resources = config_utils.get_resources("novoalign", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    max_mem = resources.get("memory", "4G").upper()
    extra_novo_args = " ".join(_novoalign_args_from_config(config, False))

    if not file_exists(out_file):
        with utils.curdir_tmpdir(data, base_dir=align_dir) as work_dir:
            with postalign.tobam_cl(data, out_file, bam.is_paired(in_bam)) as (tobam_cl, tx_out_file):
                rg_info = get_rg_info(names)
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                prefix1 = "%s-in1" % tx_out_prefix
                cmd = ("{samtools} sort -n -o -l 0 -@ {num_cores} -m {max_mem} {in_bam} {prefix1} "
                       "| {novoalign} -o SAM '{rg_info}' -d {ref_file} -f /dev/stdin "
                       "  -F BAMPE -c {num_cores} {extra_novo_args} | ")
                cmd = cmd.format(**locals()) + tobam_cl
                do.run(cmd, "Novoalign: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file), do.file_reasonable_size(tx_out_file, in_bam)])
    return out_file
Example #20
def _convert_bam_to_fastq(in_file, work_dir, data, dirs, config):
    """Convert BAM input file into FASTQ files.
    """
    out_dir = safe_makedir(os.path.join(work_dir, "fastq_convert"))

    qual_bin_method = config["algorithm"].get("quality_bin")
    if (qual_bin_method == "prealignment" or
         (isinstance(qual_bin_method, list) and "prealignment" in qual_bin_method)):
        out_bindir = safe_makedir(os.path.join(out_dir, "qualbin"))
        in_file = cram.illumina_qual_bin(in_file, data["sam_ref"], out_bindir, config)

    out_files = [os.path.join(out_dir, "{0}_{1}.fastq".format(
                 os.path.splitext(os.path.basename(in_file))[0], x))
                 for x in ["1", "2"]]
    if bam.is_paired(in_file):
        out1, out2 = out_files
    else:
        out1 = out_files[0]
        out2 = None
    if not file_exists(out1):
        broad_runner = broad.runner_from_path("picard", config)
        broad_runner.run_fn("picard_bam_to_fastq", in_file, out1, out2)
    if out2 and os.path.getsize(out2) == 0:
        out2 = None
    return [out1, out2]
Example #21
def align_bam(in_bam, ref_file, names, align_dir, data):
    """Perform direct alignment of an input BAM file with BWA using pipes.

    This avoids disk IO by piping between processes:
     - samtools sort of input BAM to queryname
     - bedtools conversion to interleaved FASTQ
     - bwa-mem alignment
     - samtools conversion to BAM
     - samtools sort to coordinate
    """
    config = data["config"]
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    samtools = config_utils.get_program("samtools", config)
    bedtools = config_utils.get_program("bedtools", config)
    resources = config_utils.get_resources("samtools", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    # adjust memory for samtools since used for input and output
    max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                         3, "decrease").upper()
    if not utils.file_exists(out_file):
        with tx_tmpdir(data) as work_dir:
            with postalign.tobam_cl(data, out_file, bam.is_paired(in_bam)) as (tobam_cl, tx_out_file):
                bwa_cmd = _get_bwa_mem_cmd(data, out_file, ref_file, "-")
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                prefix1 = "%s-in1" % tx_out_prefix
                cmd = ("{samtools} sort -n -o -l 1 -@ {num_cores} -m {max_mem} {in_bam} {prefix1} "
                       "| {bedtools} bamtofastq -i /dev/stdin -fq /dev/stdout -fq2 /dev/stdout "
                       "| {bwa_cmd} | ")
                cmd = cmd.format(**locals()) + tobam_cl
                do.run(cmd, "bwa mem alignment from BAM: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file), do.file_reasonable_size(tx_out_file, in_bam)])
    return out_file
Example #22
def run(name, chip_bam, input_bam, genome_build, out_dir, method, resources,
        data):
    """
    Run macs2 for chip and input samples, avoiding
    errors due to problematic samples.
    """
    # the output file name needs to include the caller name
    config = dd.get_config(data)
    out_file = os.path.join(out_dir, name + "_peaks_macs2.xls")
    macs2_file = os.path.join(out_dir, name + "_peaks.xls")
    if utils.file_exists(out_file):
        _compres_bdg_files(out_dir)
        return _get_output_files(out_dir)
    macs2 = config_utils.get_program("macs2", config)
    options = " ".join(resources.get("macs2", {}).get("options", []))
    genome_size = HS.get(
        genome_build, bam.fasta.total_sequence_length(dd.get_ref_file(data)))
    genome_size = "" if options.find("-g") > -1 else "-g %s" % genome_size
    paired = "-f BAMPE" if bam.is_paired(chip_bam) else ""
    with utils.chdir(out_dir):
        cmd = _macs2_cmd(method)
        try:
            do.run(cmd.format(**locals()), "macs2 for %s" % name)
            utils.move_safe(macs2_file, out_file)
        except subprocess.CalledProcessError:
            raise RuntimeWarning(
                "macs2 terminated with an error.\n"
                "Please, check the message and report "
                "error if it is related to bcbio.\n"
                "You can add specific options for the sample "
                "setting resources as explained in docs: "
                "https://bcbio-nextgen.readthedocs.org/en/latest/contents/configuration.html#sample-specific-resources"
            )
    _compres_bdg_files(out_dir)
    return _get_output_files(out_dir)
Example #23
def to_sdf(files, data):
    """Convert a fastq or BAM input into a SDF indexed file.
    """
    # BAM
    if len(files) == 1 and files[0].endswith(".bam"):
        qual = []
        format = ["-f", "sam-pe" if bam.is_paired(files[0]) else "sam-se"]
        inputs = [files[0]]
    # fastq
    else:
        qual = [
            "-q", "illumina"
            if dd.get_quality_format(data).lower() == "illumina" else "sanger"
        ]
        format = ["-f", "fastq"]
        if len(files) == 2:
            inputs = ["-l", files[0], "-r", files[1]]
        else:
            assert len(files) == 1
            inputs = [files[0]]
    work_dir = utils.safe_makedir(
        os.path.join(data["dirs"]["work"], "align_prep"))
    out_file = os.path.join(
        work_dir, "%s.sdf" %
        utils.splitext_plus(os.path.basename(os.path.commonprefix(files)))[0])
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            cmd = _rtg_cmd(["rtg", "format", "-o", tx_out_file] + format +
                           qual + inputs)
            do.run(cmd, "Format inputs to indexed SDF")
    return out_file
Example #24
def align_bam(in_bam, ref_file, names, align_dir, data):
    """Perform realignment of input BAM file; uses unix pipes for avoid IO.
    """
    config = data["config"]
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    novoalign = config_utils.get_program("novoalign", config)
    samtools = config_utils.get_program("samtools", config)
    resources = config_utils.get_resources("novoalign", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    max_mem = resources.get("memory", "4G").upper()
    extra_novo_args = " ".join(_novoalign_args_from_config(config, False))

    if not file_exists(out_file):
        with tx_tmpdir(data, base_dir=align_dir) as work_dir:
            with postalign.tobam_cl(data, out_file, bam.is_paired(in_bam)) as (tobam_cl, tx_out_file):
                rg_info = get_rg_info(names)
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                prefix1 = "%s-in1" % tx_out_prefix
                cmd = ("{samtools} sort -n -o -l 1 -@ {num_cores} -m {max_mem} {in_bam} {prefix1} "
                       "| {novoalign} -o SAM '{rg_info}' -d {ref_file} -f /dev/stdin "
                       "  -F BAMPE -c {num_cores} {extra_novo_args} | ")
                cmd = (cmd + tobam_cl).format(**locals())
                do.run(cmd, "Novoalign: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file), do.file_reasonable_size(tx_out_file, in_bam)])
    return out_file
Example #25
def filter_multimappers(align_file, data):
    """
    Filtering a BWA alignment file for uniquely mapped reads, from here:
    https://bioinformatics.stackexchange.com/questions/508/obtaining-uniquely-mapped-reads-from-bwa-mem-alignment
    """
    config = dd.get_config(data)
    type_flag = "" if bam.is_bam(align_file) else "S"
    base, ext = os.path.splitext(align_file)
    out_file = base + ".unique" + ext
    bed_file = dd.get_variant_regions(data) or dd.get_sample_callable(data)
    bed_cmd = '-L {0}'.format(bed_file) if bed_file else " "
    if utils.file_exists(out_file):
        return out_file
    base_filter = '-F "not unmapped {paired_filter} and not duplicate and [XA] == null and [SA] == null and not supplementary " '
    if bam.is_paired(align_file):
        paired_filter = "and paired and proper_pair"
    else:
        paired_filter = ""
    filter_string = base_filter.format(paired_filter=paired_filter)
    sambamba = config_utils.get_program("sambamba", config)
    num_cores = dd.get_num_cores(data)
    with file_transaction(out_file) as tx_out_file:
        cmd = ('{sambamba} view -h{type_flag} '
               '--nthreads {num_cores} '
               '-f bam {bed_cmd} '
               '{filter_string} '
               '{align_file} '
               '> {tx_out_file}')
        message = "Removing multimapped reads from %s." % align_file
        do.run(cmd.format(**locals()), message)
    bam.index(out_file, config)
    return out_file
Example #26
def align(fastq_file, pair_file, index_dir, names, align_dir, data):
    """Perform piped alignment of fastq input files, generating sorted, deduplicated BAM.

    TODO: Use streaming with new development version of SNAP to feed into
    structural variation preparation de-duplication.
    """
    pair_file = pair_file if pair_file else ""
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    assert not data.get(
        "align_split"), "Split alignments not supported with SNAP"
    snap = config_utils.get_program("snap", data["config"])
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    resources = config_utils.get_resources("snap", data["config"])
    max_mem = resources.get("memory", "1G")
    rg_info = novoalign.get_rg_info(names)
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            with utils.curdir_tmpdir(data) as work_dir:
                if fastq_file.endswith(".bam"):
                    cmd_name = "paired" if bam.is_paired(
                        fastq_file) else "single"
                else:
                    cmd_name = "single" if not pair_file else "paired"
                cmd = (
                    "{snap} {cmd_name} {index_dir} {fastq_file} {pair_file} "
                    "-rg '{rg_info}' -t {num_cores} -sa -so -sm {max_mem} -o {tx_out_file}"
                )
                do.run(cmd.format(**locals()),
                       "SNAP alignment: %s" % names["sample"])
    data["work_bam"] = out_file
    return data
Example #27
def _convert_bam_to_fastq(in_file, work_dir, data, dirs, config):
    """Convert BAM input file into FASTQ files.
    """
    out_dir = safe_makedir(os.path.join(work_dir, "fastq_convert"))

    qual_bin_method = config["algorithm"].get("quality_bin")
    if (qual_bin_method == "prealignment"
            or (isinstance(qual_bin_method, list)
                and "prealignment" in qual_bin_method)):
        out_bindir = safe_makedir(os.path.join(out_dir, "qualbin"))
        in_file = cram.illumina_qual_bin(in_file, data["sam_ref"], out_bindir,
                                         config)

    out_files = [
        os.path.join(
            out_dir, "{0}_{1}.fastq".format(
                os.path.splitext(os.path.basename(in_file))[0], x))
        for x in ["1", "2"]
    ]
    if bam.is_paired(in_file):
        out1, out2 = out_files
    else:
        out1 = out_files[0]
        out2 = None
    if not file_exists(out1):
        broad_runner = broad.runner_from_config(config)
        broad_runner.run_fn("picard_bam_to_fastq", in_file, out1, out2)
    if out2 and os.path.getsize(out2) == 0:
        out2 = None
    return [out1, out2]
Example #28
def calculate_complexity_metrics(work_bam, data):
    """
    the work_bam should have duplicates marked but not removed;
    mitochondrial reads should be removed
    """
    bedtools = config_utils.get_program("bedtools", dd.get_config(data))
    work_dir = dd.get_work_dir(data)
    metrics_dir = os.path.join(work_dir, "metrics", "atac")
    utils.safe_makedir(metrics_dir)
    metrics_file = os.path.join(
        metrics_dir, f"{dd.get_sample_name(data)}-atac-metrics.csv")
    # complexity metrics only make sense for paired-end reads
    if not bam.is_paired(work_bam):
        return data
    if utils.file_exists(metrics_file):
        data = tz.assoc_in(data, ['atac', 'complexity_metrics_file'],
                           metrics_file)
        return data
    # BAM file must be sorted by read name
    work_bam = bam.sort(work_bam, dd.get_config(data), order="queryname")
    with file_transaction(metrics_file) as tx_metrics_file:
        with open(tx_metrics_file, "w") as out_handle:
            out_handle.write("mt,m0,m1,m2\n")
        cmd = (
            f"{bedtools} bamtobed -bedpe -i {work_bam} | "
            "awk 'BEGIN{OFS=\"\\t\"}{print $1,$2,$4,$6,$9,$10}' | "
            "sort | "
            "uniq -c | "
            "awk 'BEGIN{mt=0;m0=0;m1=0;m2=0}($1==1){m1=m1+1} "
            "($1==2){m2=m2+1}{m0=m0+1}{mt=mt+$1}END{printf \"%d,%d,%d,%d\\n\", mt,m0,m1,m2}' >> "
            f"{tx_metrics_file}")
        message = f"Calculating ATAC-seq complexity metrics on {work_bam}, saving as {metrics_file}."
        do.run(cmd, message)
    data = tz.assoc_in(data, ['atac', 'complexity_metrics_file'], metrics_file)
    return data
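The counts written here (mt = total read pairs, m0 = distinct pairs, m1 = pairs observed once, m2 = pairs observed twice) are exactly the inputs for the ENCODE library-complexity scores. A sketch of how they would be combined downstream (illustrative, not part of this function; zero-denominator guards omitted for brevity):

def atac_complexity_scores(mt, m0, m1, m2):
    """ENCODE-style complexity scores from the mt,m0,m1,m2 counts above."""
    return {"NRF": m0 / mt,   # non-redundant fraction
            "PBC1": m1 / m0,  # PCR bottlenecking coefficient 1
            "PBC2": m1 / m2}  # PCR bottlenecking coefficient 2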
Example #29
def run(bam_file, data, out_dir):
    config = data["config"]
    if "picard" not in dd.get_tools_on(data):
        return {}
    ref_file = dd.get_ref_file(data)
    sample = dd.get_sample_name(data)
    target_file = dd.get_variant_regions(data)
    broad_runner = broad.PicardCmdRunner("picard", config)
    bam_fname = os.path.abspath(bam_file)
    path = os.path.dirname(bam_fname)
    utils.safe_makedir(out_dir)
    hsmetric_file = os.path.join(out_dir, "%s-sort.hs_metrics" % sample)
    if utils.file_exists(hsmetric_file):
        return hsmetric_file
    with utils.chdir(out_dir):
        with tx_tmpdir() as tmp_dir:
            cur_bam = os.path.basename(bam_fname)
            if not os.path.exists(cur_bam):
                os.symlink(bam_fname, cur_bam)
            gen_metrics = PicardMetrics(broad_runner, tmp_dir)
            gen_metrics.report(cur_bam, ref_file,
                               bam.is_paired(bam_fname),
                               target_file, target_file, None, config)
    do.run("sed -i 's/-sort.bam//g' %s" % hsmetric_file, "")
    return hsmetric_file
Example #30
def run_count(bam_file, dexseq_gff, stranded, out_file, data):
    """
    run dexseq_count on a BAM file
    """
    assert file_exists(bam_file), "%s does not exist." % bam_file
    sort_order = bam._get_sort_order(bam_file, {})
    assert sort_order, "Cannot determine sort order of %s." % bam_file
    strand_flag = _strand_flag(stranded)
    assert strand_flag, "%s is not a valid strandedness value." % stranded
    if not file_exists(dexseq_gff):
        logger.info("%s was not found, so exon-level counting is being "
                    "skipped." % dexseq_gff)
        return None

    dexseq_count = _dexseq_count_path()
    if not dexseq_count:
        logger.info("DEXseq is not installed, skipping exon-level counting.")
        return None

    sort_flag = "name" if sort_order == "queryname" else "pos"
    is_paired = bam.is_paired(bam_file)
    paired_flag = "yes" if is_paired else "no"
    bcbio_python = sys.executable

    if file_exists(out_file):
        return out_file
    cmd = (
        "{bcbio_python} {dexseq_count} -f bam -r {sort_flag} -p {paired_flag} "
        "-s {strand_flag} {dexseq_gff} {bam_file} {tx_out_file}")
    message = "Counting exon-level counts with %s and %s." % (bam_file,
                                                              dexseq_gff)
    with file_transaction(data, out_file) as tx_out_file:
        do.run(cmd.format(**locals()), message)
    return out_file
Example #31
def run_count(bam_file, dexseq_gff, stranded, out_file, data):
    """
    run dexseq_count on a BAM file
    """
    assert file_exists(bam_file), "%s does not exist." % bam_file
    sort_order = bam._get_sort_order(bam_file, {})
    assert sort_order, "Cannot determine sort order of %s." % bam_file
    strand_flag = _strand_flag(stranded)
    assert strand_flag, "%s is not a valid strandedness value." % stranded
    if not dexseq_gff:
        logger.info("No DEXSeq GFF file was found, skipping exon-level counting.")
        return None
    elif not file_exists(dexseq_gff):
        logger.info("%s was not found, so exon-level counting is being "
                    "skipped." % dexseq_gff)
        return None

    dexseq_count = _dexseq_count_path()
    if not dexseq_count:
        logger.info("DEXseq is not installed, skipping exon-level counting.")
        return None

    sort_flag = "name" if sort_order == "queryname" else "pos"
    is_paired = bam.is_paired(bam_file)
    paired_flag = "yes" if is_paired else "no"
    bcbio_python = sys.executable

    if file_exists(out_file):
        return out_file
    cmd = ("{bcbio_python} {dexseq_count} -f bam -r {sort_flag} -p {paired_flag} "
           "-s {strand_flag} {dexseq_gff} {bam_file} {tx_out_file}")
    message = "Counting exon-level counts with %s and %s." % (bam_file, dexseq_gff)
    with file_transaction(data, out_file) as tx_out_file:
        do.run(cmd.format(**locals()), message)
    return out_file
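_strand_flag maps bcbio's strandedness values onto dexseq_count's -s argument. A sketch of the assumed mapping, returning None for unknown values so the assertion above fires:

def _strand_flag(stranded):
    # Assumed mapping onto dexseq_count -s values (yes/no/reverse).
    return {"unstranded": "no",
            "firststrand": "reverse",
            "secondstrand": "yes"}.get(stranded)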
Example #32
def filter_multimappers(align_file, data):
    """
    It does not seem like bowtie2 has an equivalent to the -m 1 flag in bowtie;
    there are some options that are close but don't do the same thing. Bowtie2
    sets the XS flag for reads mapping in more than one place, so we can just
    filter on that. This will not work for other aligners.
    """
    config = dd.get_config(data)
    type_flag = "" if bam.is_bam(align_file) else "S"
    base, ext = os.path.splitext(align_file)
    out_file = base + ".unique" + ext
    if file_exists(out_file):
        return out_file
    base_filter = '-F "[XS] == null and not unmapped {paired_filter}"'
    if bam.is_paired(align_file):
        paired_filter = "and paired and proper_pair"
    else:
        paired_filter = ""
    filter_string = base_filter.format(paired_filter=paired_filter)
    sambamba = config_utils.get_program("sambamba", config)
    num_cores = dd.get_num_cores(data)
    with file_transaction(out_file) as tx_out_file:
        cmd = ('{sambamba} view -h{type_flag} '
               '--nthreads {num_cores} '
               '-f bam '
               '{filter_string} '
               '{align_file} '
               '> {tx_out_file}')
        message = "Removing multimapped reads from %s." % align_file
        do.run(cmd.format(**locals()), message)
    return out_file
Example #33
def filter_multimappers(align_file, data):
    """
    Filtering a BWA alignment file for uniquely mapped reads, from here:
    https://bioinformatics.stackexchange.com/questions/508/obtaining-uniquely-mapped-reads-from-bwa-mem-alignment
    """
    config = dd.get_config(data)
    type_flag = "" if bam.is_bam(align_file) else "S"
    base, ext = os.path.splitext(align_file)
    out_file = base + ".unique" + ext
    bed_file = dd.get_variant_regions(data) or dd.get_sample_callable(data)
    bed_cmd = '-L {0}'.format(bed_file) if bed_file else " "
    if utils.file_exists(out_file):
        return out_file
    base_filter = '-F "not unmapped {paired_filter} and [XA] == null and [SA] == null and not supplementary " '
    if bam.is_paired(align_file):
        paired_filter = "and paired and proper_pair"
    else:
        paired_filter = ""
    filter_string = base_filter.format(paired_filter=paired_filter)
    sambamba = config_utils.get_program("sambamba", config)
    num_cores = dd.get_num_cores(data)
    with file_transaction(out_file) as tx_out_file:
        cmd = ('{sambamba} view -h{type_flag} '
               '--nthreads {num_cores} '
               '-f bam {bed_cmd} '
               '{filter_string} '
               '{align_file} '
               '> {tx_out_file}')
        message = "Removing multimapped reads from %s." % align_file
        do.run(cmd.format(**locals()), message)
    bam.index(out_file, config)
    return out_file
Example #34
def run_rnaseq(bam_file, data, out_dir):
    """
    Run qualimap for a rnaseq bam file and parse results
    """
    strandedness = {"firststrand": "strand-specific-reverse",
                    "secondstrand": "strand-specific-forward",
                    "unstranded": "non-strand-specific"}

    # Qualimap results should be saved to a directory named after the sample.
    # MultiQC (for parsing additional data) infers the sample name from that
    # directory, as in:
    #   <sample name>/raw_data_qualimapReport/insert_size_histogram.txt
    results_dir = os.path.join(out_dir, dd.get_sample_name(data))
    report_file = os.path.join(results_dir, "qualimapReport.html")
    config = data["config"]
    gtf_file = dd.get_gtf_file(data)
    single_end = not bam.is_paired(bam_file)
    library = strandedness[dd.get_strandedness(data)]
    if not utils.file_exists(report_file):
        with file_transaction(data, results_dir) as tx_out_dir:
            utils.safe_makedir(tx_out_dir)
            raw_file = os.path.join(tx_out_dir, "rnaseq_qc_results.txt")
            bam.index(bam_file, config)
            cmd = _rnaseq_qualimap_cmd(data, bam_file, tx_out_dir, gtf_file, single_end, library)
            do.run(cmd, "Qualimap for {}".format(dd.get_sample_name(data)))
            cmd = "sed -i 's/bam file = .*/bam file = %s.bam/' %s" % (dd.get_sample_name(data), raw_file)
            do.run(cmd, "Fix Name Qualimap for {}".format(dd.get_sample_name(data)))
    metrics = _parse_rnaseq_qualimap_metrics(report_file)
    metrics.update(_detect_duplicates(bam_file, results_dir, data))
    metrics.update(_detect_rRNA(data))
    metrics.update({"Average_insert_size": bam.estimate_fragment_size(bam_file)})
    metrics = _parse_metrics(metrics)
    return metrics
Example #35
def split_ATAC(data, bam_file=None):
    """
    splits a BAM into nucleosome-free (NF) and mono/di/tri-nucleosome BAMs based
    on the estimated insert sizes.
    Uses the current working BAM file if no BAM file is supplied.
    """
    sambamba = config_utils.get_program("sambamba", data)
    num_cores = dd.get_num_cores(data)
    base_cmd = f'{sambamba} view --format bam --nthreads {num_cores} '
    bam_file = bam_file if bam_file else dd.get_work_bam(data)
    out_stem = os.path.splitext(bam_file)[0]
    split_files = {}
    # we can only split these fractions from paired runs
    if not bam.is_paired(bam_file):
        split_files["full"] = bam_file
        data = tz.assoc_in(data, ['atac', 'align'], split_files)
        return data
    for arange in ATACRanges.values():
        out_file = f"{out_stem}-{arange.label}.bam"
        if not utils.file_exists(out_file):
            with file_transaction(out_file) as tx_out_file:
                cmd = (base_cmd +
                       f'-F "template_length > {arange.min} and template_length < {arange.max}" ' +
                       f'{bam_file} > {tx_out_file}')
                message = f'Splitting {arange.label} regions from {bam_file}.'
                do.run(cmd, message)
            bam.index(out_file, dd.get_config(data))
        split_files[arange.label] = out_file
    split_files["full"] = bam_file
    data = tz.assoc_in(data, ['atac', 'align'], split_files)
    return data
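ATACRanges is a module-level table of named insert-size bands. A sketch of its likely shape; the labels follow the function above, but the cutoffs here are illustrative (nucleosome-free fragments are typically under ~125 bp, mono-nucleosome fragments roughly 150-300 bp):

from collections import namedtuple

ATACRange = namedtuple("ATACRange", ["label", "min", "max"])
# Illustrative bands only; bcbio's actual cutoffs may differ.
ATACRanges = {"NF": ATACRange("NF", -125, 125),
              "MN": ATACRange("MN", 150, 300)}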
Example #36
def _rnaseq_qualimap_cmd(data,
                         bam_file,
                         out_dir,
                         gtf_file=None,
                         library="non-strand-specific"):
    """
    Create command lines for qualimap
    """
    config = data["config"]
    qualimap = config_utils.get_program("qualimap", config)
    resources = config_utils.get_resources("qualimap", config)
    num_cores = resources.get("cores", dd.get_num_cores(data))
    max_mem = config_utils.adjust_memory(resources.get("memory", "2G"),
                                         num_cores)
    export = "%s%s export JAVA_OPTS='-Xms32m -Xmx%s -Djava.io.tmpdir=%s' && " % (
        utils.java_freetype_fix(), utils.local_path_export(), max_mem, out_dir)
    if library != "non-strand-specific":
        logger.info(
            "Qualimap can get the orientation wrong for stranded reads, so we "
            "run it in unstranded mode. This gives comparable results for "
            "RNA-seq data (see https://groups.google.com/forum/#!topic/qualimap/ZGo-k8LGmHQ "
            "for a further explanation).")
        library = "non-strand-specific"
    paired = " --paired" if bam.is_paired(bam_file) else ""
    cmd = ("unset DISPLAY && {export} {qualimap} rnaseq -outdir {out_dir} "
           "-a proportional -bam {bam_file} -p {library}{paired} "
           "-gtf {gtf_file}").format(**locals())
    return cmd
Example #37
def _prep_load_script(work_bams, names, chrom, items):
    pairmode = "paired" if bam.is_paired(work_bams[0]) else "unpaired"
    print(len(items), items[0].get("metadata"))
    if len(items) == 2 and vcfutils.get_paired_phenotype(items[0]):
        load_script = _paired_load_script
    else:
        load_script = _population_load_script
    return load_script(work_bams, names, chrom, pairmode, items)
Example #39
def _paired_flag(bam_file):
    """
    sets flags to handle paired-end BAM files
    """
    if is_paired(bam_file):
        return "-p -B -C"
    else:
        return ""
Example #41
def _libtype_string(bam_file, strandedness):
    # auto by default
    libtype = "-l "
    strand = "A"
    if strandedness != "auto":
        libtype = "-l I" if bam.is_paired(bam_file) else "-l "
        strand = sailfish._sailfish_strand_string(strandedness)
    return libtype + strand
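This composes salmon/sailfish library-type codes: "I" marks inward-facing paired ends and the suffix encodes strandedness. A minimal sketch of the helper it relies on, assuming the standard salmon codes (the real sailfish._sailfish_strand_string may differ):

def _sailfish_strand_string(strandedness):
    # "U" = unstranded, "SR" = read 1 from the reverse strand, "SF" = forward;
    # only called for explicit strandedness, never for "auto".
    return {"unstranded": "U",
            "firststrand": "SR",
            "secondstrand": "SF"}[strandedness]

# _libtype_string(paired_bam, "firststrand") -> "-l ISR"
# _libtype_string(any_bam, "auto")           -> "-l A"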
Example #42
def _prep_load_script(work_bams, names, chrom, items):
    pairmode = "paired" if bam.is_paired(work_bams[0]) else "unpaired"
    print(len(items), items[0].get("metadata"))
    if len(items) == 2 and items[0].get("metadata", {}).get("phenotype") in ["tumor", "normal"]:
        load_script = _paired_load_script
    else:
        load_script = _population_load_script
    return load_script(work_bams, names, chrom, pairmode, items)
Example #44
def _prep_load_script(work_bams, names, chrom, items):
    if not chrom: chrom = ""
    pairmode = "paired" if bam.is_paired(work_bams[0]) else "unpaired"
    if len(items) == 2 and vcfutils.get_paired_phenotype(items[0]):
        load_script = _paired_load_script
    else:
        load_script = _population_load_script
    return load_script(work_bams, names, chrom, pairmode, items)
Example #46
def _bgzip_from_bam(bam_file, dirs, config, is_retry=False):
    """Create bgzipped fastq files from an input BAM file.
    """
    # tools
    bamtofastq = config_utils.get_program("bamtofastq", config)
    resources = config_utils.get_resources("bamtofastq", config)
    cores = config["algorithm"].get("num_cores", 1)
    max_mem = int(resources.get("memory", "1073741824")) * cores  # 1Gb/core default
    bgzip = tools.get_bgzip_cmd(config, is_retry)
    # files
    work_dir = utils.safe_makedir(os.path.join(dirs["work"], "align_prep"))
    out_file_1 = os.path.join(work_dir, "%s-1.fq.gz" % os.path.splitext(os.path.basename(bam_file))[0])
    if bam.is_paired(bam_file):
        out_file_2 = out_file_1.replace("-1.fq.gz", "-2.fq.gz")
    else:
        out_file_2 = None
    needs_retry = False
    if is_retry or not utils.file_exists(out_file_1):
        with file_transaction(config, out_file_1) as tx_out_file:
            for f in [tx_out_file, out_file_1, out_file_2]:
                if f and os.path.exists(f):
                    os.remove(f)
            fq1_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, tx_out_file)
            sortprefix = "%s-sort" % os.path.splitext(tx_out_file)[0]
            if bam.is_paired(bam_file):
                fq2_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, out_file_2)
                out_str = ("F=>({fq1_bgzip_cmd}) F2=>({fq2_bgzip_cmd}) S=/dev/null O=/dev/null "
                           "O2=/dev/null collate=1 colsbs={max_mem}")
            else:
                out_str = "S=>({fq1_bgzip_cmd})"
            bam_file = objectstore.cl_input(bam_file)
            cmd = "{bamtofastq} filename={bam_file} T={sortprefix} " + out_str
            try:
                do.run(cmd.format(**locals()), "BAM to bgzipped fastq",
                       checks=[do.file_reasonable_size(tx_out_file, bam_file)],
                       log_error=False)
            except subprocess.CalledProcessError as msg:
                if not is_retry and "deflate failed" in str(msg):
                    logger.info("bamtofastq deflate IO failure preparing %s. Retrying with single core."
                                % (bam_file))
                    needs_retry = True
                else:
                    logger.exception("BAM to bgzipped fastq failed")
                    raise
    # Retry once with a single core on transient deflate failures.
    if needs_retry:
        return _bgzip_from_bam(bam_file, dirs, config, is_retry=True)
    return [x for x in [out_file_1, out_file_2] if x is not None]
Example #47
def run(name, chip_bam, input_bam, genome_build, out_dir, method, resources,
        data):
    """
    Run macs2 for chip and input samples avoiding
    errors due to samples.
    """
    # output file name need to have the caller name
    config = dd.get_config(data)
    out_file = os.path.join(out_dir, name + "_peaks_macs2.xls")
    macs2_file = os.path.join(out_dir, name + "_peaks.xls")
    if utils.file_exists(out_file):
        _compress_and_sort_bdg_files(out_dir, data)
        return _get_output_files(out_dir)
    macs2 = config_utils.get_program("macs2", config)
    antibody = dd.get_antibody(data)
    if antibody:
        antibody = antibody.lower()
        if antibody not in antibodies.SUPPORTED_ANTIBODIES:
            logger.error(
                f"{antibody} was specified but is not a supported antibody. Valid antibodies are "
                f"{antibodies.SUPPORTED_ANTIBODIES}. If you know your antibody should be called "
                f"with narrow or broad peaks, supply 'narrow' or 'broad' as the antibody. "
                f"Falling back to 'narrow' peak settings.")
            antibody = 'narrow'
        antibody = antibodies.ANTIBODIES[antibody]
        logger.info(
            f"{antibody.name} specified, using {antibody.peaktype} peak settings."
        )
        peaksettings = select_peak_parameters(antibody)
    elif method == "atac":
        logger.info(f"ATAC-seq specified, using narrow peak settings.")
        peaksettings = " "
    else:
        peaksettings = " "
    options = " ".join(resources.get("macs2", {}).get("options", ""))
    genome_size = bam.fasta.total_sequence_length(dd.get_ref_file(data))
    genome_size = "" if options.find("-g") > -1 else "-g %s" % genome_size
    paired = "-f BAMPE" if bam.is_paired(chip_bam) else ""
    with utils.chdir(out_dir):
        cmd = _macs2_cmd(data)
        cmd += peaksettings
        try:
            do.run(cmd.format(**locals()), "macs2 for %s" % name)
            utils.move_safe(macs2_file, out_file)
        except subprocess.CalledProcessError:
            raise RuntimeWarning(
                "macs2 terminated with an error. "
                "Please check the message and report the "
                "error if it is related to bcbio. "
                "You can add sample-specific options via "
                "the resources section as explained in the docs: "
                "https://bcbio-nextgen.readthedocs.org/en/latest/contents/configuration.html#sample-specific-resources"
            )
    _compress_and_sort_bdg_files(out_dir, data)
    return _get_output_files(out_dir)
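The command template comes from a _macs2_cmd helper that is not part of this snippet; a rough sketch of what it might build under the calling convention above (the exact options are assumptions, not the verified implementation):

# Sketch only; placeholders are filled by cmd.format(**locals()) in run().
def _macs2_cmd(data):
    method = dd.get_chip_method(data).lower()
    if method == "chip":
        return ("{macs2} callpeak -t {chip_bam} -c {input_bam} {paired} "
                "{genome_size} -n {name} --bdg {options} ")
    # ATAC-seq peak calling typically runs without an input control.
    return ("{macs2} callpeak -t {chip_bam} --nomodel {paired} "
            "{genome_size} -n {name} --bdg {options} --keep-dup all ")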
Example #48
def _run_bamtools_stats(bam_file, data, out_dir):
    """Run bamtools stats with reports on mapped reads, duplicates and insert sizes.
    """
    stats_file = os.path.join(out_dir, "bamtools_stats.txt")
    if not utils.file_exists(stats_file):
        utils.safe_makedir(out_dir)
        bamtools = config_utils.get_program("bamtools", data["config"])
        with file_transaction(stats_file) as tx_out_file:
            cmd = "{bamtools} stats -in {bam_file}"
            if bam.is_paired(bam_file):
                cmd += " -insert"
            cmd += " > {tx_out_file}"
            do.run(cmd.format(**locals()), "bamtools stats", data)
    return _parse_bamtools_stats(stats_file)
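A minimal sketch of the parsing step, assuming the usual "Key: value" layout of bamtools stats output (the real _parse_bamtools_stats may extract different fields):

def _parse_bamtools_stats(stats_file):
    # Turn lines like "Total reads:      12345" into a metrics dict.
    metrics = {}
    with open(stats_file) as in_handle:
        for line in in_handle:
            if ":" in line:
                key, _, value = line.partition(":")
                if value.strip():
                    metrics[key.strip()] = value.strip().split()[0]
    return metrics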
Example #50
def _set_stranded_flag(bam_file, data):
    strand_flag = {"unstranded": "",
                   "firststrand": "--rf-stranded",
                   "secondstrand": "--fr-stranded",
                   "firststrand-s": "--r-stranded",
                   "secondstrand-s": "--f-stranded"}
    stranded = dd.get_strandedness(data)
    assert stranded in strand_flag, ("%s is not a valid strandedness value. "
            "Valid values are 'firststrand', "
            "'secondstrand' and 'unstranded'." % (stranded))
    if stranded != "unstranded" and not is_paired(bam_file):
        stranded += "-s"
    flag = strand_flag[stranded]
    return flag
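Worked examples of the mapping (the BAM path is hypothetical):

# paired-end BAM + "firststrand"  -> "--rf-stranded"
# single-end BAM + "firststrand"  -> "--r-stranded" (via the "-s" suffix)
# any BAM        + "unstranded"   -> "" (no flag)
flag = _set_stranded_flag("sample1.bam", data)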
Example #51
def run_rnaseq(bam_file, data, out_dir):
    """
    Run qualimap for a rnaseq bam file and parse results
    """
    strandedness = {
        "firststrand": "strand-specific-reverse",
        "secondstrand": "strand-specific-forward",
        "unstranded": "non-strand-specific"
    }

    # Qualimap results should be saved to a directory named after sample.
    # MultiQC (for parsing additional data) picks the sample name after the dir as follows:
    #   <sample name>/raw_data_qualimapReport/insert_size_histogram.txt
    results_dir = os.path.join(out_dir, dd.get_sample_name(data))
    results_file = os.path.join(results_dir, "rnaseq_qc_results.txt")
    report_file = os.path.join(results_dir, "qualimapReport.html")
    config = data["config"]
    gtf_file = dd.get_gtf_file(data)
    library = strandedness[dd.get_strandedness(data)]
    if not utils.file_exists(results_file):
        with file_transaction(data, results_dir) as tx_results_dir:
            utils.safe_makedir(tx_results_dir)
            bam.index(bam_file, config)
            cmd = _rnaseq_qualimap_cmd(data, bam_file, tx_results_dir,
                                       gtf_file, library)
            do.run(cmd, "Qualimap for {}".format(dd.get_sample_name(data)))
            tx_results_file = os.path.join(tx_results_dir,
                                           "rnaseq_qc_results.txt")
            cmd = "sed -i 's/bam file = .*/bam file = %s.bam/' %s" % (
                dd.get_sample_name(data), tx_results_file)
            do.run(cmd,
                   "Fix Name Qualimap for {}".format(dd.get_sample_name(data)))
    metrics = _parse_rnaseq_qualimap_metrics(report_file)
    metrics.update(_detect_duplicates(bam_file, results_dir, data))
    metrics.update(_detect_rRNA(data))
    metrics.update(
        {"Average_insert_size": bam.estimate_fragment_size(bam_file)})
    metrics = _parse_metrics(metrics)
    # Qualimap output folder (results_dir) needs to be named after the sample (see comments above). However, in order
    # to keep its name after upload, we need to put the base QC file (results_file) into the root directory (out_dir):
    base_results_file = os.path.join(out_dir, os.path.basename(results_file))
    shutil.copyfile(results_file, base_results_file)
    return {
        "base": base_results_file,
        "secondary": _find_qualimap_secondary_files(results_dir,
                                                    base_results_file),
        "metrics": metrics
    }
Example #52
def run_count(bam_file, dexseq_gff, stranded, out_file, data):
    """
    run dexseq_count on a BAM file
    """
    assert file_exists(bam_file), "%s does not exist." % bam_file
    sort_order = bam._get_sort_order(bam_file, {})
    assert sort_order, "Cannot determine sort order of %s." % bam_file
    strand_flag = _strand_flag(stranded)
    assert strand_flag, "%s is not a valid strandedness value." % stranded
    if not dexseq_gff:
        logger.info(
            "No DEXSeq GFF file was found, skipping exon-level counting.")
        return None
    elif not file_exists(dexseq_gff):
        logger.info("%s was not found, so exon-level counting is being "
                    "skipped." % dexseq_gff)
        return None

    dexseq_count = _dexseq_count_path()
    if not dexseq_count:
        logger.info("DEXseq is not installed, skipping exon-level counting.")
        return None

    if dd.get_aligner(data) == "bwa":
        logger.info(
            "Can't use DEXSeq with bwa alignments, skipping exon-level counting."
        )
        return None

    sort_flag = "name" if sort_order == "queryname" else "pos"
    is_paired = bam.is_paired(bam_file)
    paired_flag = "yes" if is_paired else "no"

    anaconda = os.path.dirname(os.path.realpath(sys.executable))
    r36_python = os.path.join(anaconda, "..", "envs", "r36", "bin", "python")

    if file_exists(out_file):
        return out_file
    cmd = (
        "{r36_python} {dexseq_count} -f bam -r {sort_flag} -p {paired_flag} "
        "-s {strand_flag} {dexseq_gff} {bam_file} {tx_out_file}")
    message = "Counting exon-level counts with %s and %s." % (bam_file,
                                                              dexseq_gff)
    with file_transaction(data, out_file) as tx_out_file:
        do.run(cmd.format(**locals()), message)
    return out_file
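dexseq_count.py takes its -s option as yes/no/reverse, so a plausible sketch of the _strand_flag helper used above (the exact mapping is an assumption):

# Sketch; assumes dexseq_count's "-s yes|no|reverse" convention.
def _strand_flag(stranded):
    return {"unstranded": "no",
            "firststrand": "reverse",
            "secondstrand": "yes"}.get(stranded)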
Example #53
def call_consensus(samples):
    """
    call consensus peaks on the narrowPeak files from a set of
    ChiP/ATAC samples
    """
    data = samples[0][0]
    new_samples = []
    consensusdir = os.path.join(dd.get_work_dir(data), "consensus")
    utils.safe_makedir(consensusdir)
    peakfiles = []
    for data in dd.sample_data_iterator(samples):
        if dd.get_chip_method(data) == "chip":
            for fn in tz.get_in(("peaks_files", "macs2"), data, []):
                if "narrowPeak" in fn:
                    peakfiles.append(fn)
                elif "broadPeak" in fn:
                    peakfiles.append(fn)
        elif dd.get_chip_method(data) == "atac":
            if bam.is_paired(dd.get_work_bam(data)):
                for fn in tz.get_in(("peaks_files", "NF", "macs2"), data, []):
                    if "narrowPeak" in fn:
                        peakfiles.append(fn)
            else:
                logger.info(
                    f"Using peaks from full fraction since {dd.get_work_bam(data)} is single-ended."
                )
                for fn in tz.get_in(("peaks_files", "full", "macs2"), data,
                                    []):
                    if "narrowPeak" in fn:
                        peakfiles.append(fn)
    consensusfile = os.path.join(consensusdir, "consensus.bed")
    if not peakfiles:
        logger.info(
            "No suitable peak files found, skipping consensus peak calling.")
        return samples
    consensusfile = consensus(peakfiles, consensusfile, data)
    if not utils.file_exists(consensusfile):
        logger.warning("No consensus peaks found.")
        return samples
    saffile = consensus_to_saf(consensusfile,
                               os.path.splitext(consensusfile)[0] + ".saf")
    for data in dd.sample_data_iterator(samples):
        data = tz.assoc_in(data, ("peaks_files", "consensus"),
                           {"main": consensusfile, "saf": saffile})
        new_samples.append([data])
    return new_samples
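consensus_to_saf converts the consensus BED into SAF, the annotation format featureCounts reads; a minimal sketch under that assumption:

# Minimal sketch; assumes a >=3 column BED input and 1-based SAF coordinates.
def consensus_to_saf(bed_file, saf_file):
    with open(bed_file) as bed, open(saf_file, "w") as saf:
        saf.write("GeneID\tChr\tStart\tEnd\tStrand\n")
        for i, line in enumerate(bed):
            chrom, start, end = line.split()[:3]
            # BED starts are 0-based half-open; SAF is 1-based inclusive.
            saf.write("peak_%d\t%s\t%d\t%s\t.\n" % (i + 1, chrom, int(start) + 1, end))
    return saf_file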
Example #54
def sample_summary(bam_file, data, out_dir):
    """Run RNA-SeQC on a single RNAseq sample, writing to specified output directory.
    """
    metrics_file = os.path.join(out_dir, "metrics.tsv")
    if not file_exists(metrics_file):
        config = data["config"]
        ref_file = data["sam_ref"]
        genome_dir = os.path.dirname(os.path.dirname(ref_file))
        gtf_file = config_utils.get_transcript_gtf(genome_dir)
        rna_file = config_utils.get_rRNA_sequence(genome_dir)
        sample_file = os.path.join(safe_makedir(out_dir), "sample_file.txt")
        _write_sample_id_file(data, bam_file, sample_file)
        runner = rnaseqc_runner_from_config(config)
        bam.index(bam_file, config)
        single_end = not bam.is_paired(bam_file)
        runner.run(sample_file, ref_file, rna_file, gtf_file, out_dir, single_end)
    return _parse_rnaseqc_metrics(metrics_file, data["name"][-1])
Example #55
def _rnaseq_qualimap_cmd(data, bam_file, out_dir, gtf_file=None, library="non-strand-specific"):
    """
    Create command lines for qualimap
    """
    config = data["config"]
    qualimap = config_utils.get_program("qualimap", config)
    resources = config_utils.get_resources("qualimap", config)
    num_cores = resources.get("cores", dd.get_num_cores(data))
    max_mem = config_utils.adjust_memory(resources.get("memory", "2G"),
                                         num_cores)
    export = "%s%s" % (utils.java_freetype_fix(), utils.local_path_export())
    export = "%s%s export JAVA_OPTS='-Xms32m -Xmx%s -Djava.io.tmpdir=%s' && " % (
        utils.java_freetype_fix(), utils.local_path_export(), max_mem, out_dir)
    paired = " --paired" if bam.is_paired(bam_file) else ""
    cmd = ("unset DISPLAY && {export} {qualimap} rnaseq -outdir {out_dir} "
           "-a proportional -bam {bam_file} -p {library}{paired} "
           "-gtf {gtf_file}").format(**locals())
    return cmd
Example #56
def _convert_bam_to_fastq(in_file, work_dir, data, dirs, config):
    """Convert BAM input file into FASTQ files.
    """
    out_dir = safe_makedir(os.path.join(work_dir, "fastq_convert"))
    out_files = [os.path.join(out_dir, "{0}_{1}.fastq".format(
                 os.path.splitext(os.path.basename(in_file))[0], x))
                 for x in ["1", "2"]]
    if bam.is_paired(in_file):
        out1, out2 = out_files
    else:
        out1 = out_files[0]
        out2 = None
    if not file_exists(out1):
        broad_runner = broad.runner_from_path("picard", config)
        broad_runner.run_fn("picard_bam_to_fastq", in_file, out1, out2)
    if out2 and os.path.getsize(out2) == 0:
        out2 = None
    return [out1, out2]
Example #57
def sample_summary(bam_file, data, out_dir):
    """Run RNA-SeQC on a single RNAseq sample, writing to specified output directory.
    """
    metrics_file = os.path.join(out_dir, "metrics.tsv")
    if not file_exists(metrics_file):
        with file_transaction(out_dir) as tx_out_dir:
            config = data["config"]
            ref_file = data["sam_ref"]
            genome_dir = os.path.dirname(os.path.dirname(ref_file))
            gtf_file = config_utils.get_transcript_gtf(genome_dir)
            rna_file = config_utils.get_rRNA_sequence(genome_dir)
            sample_file = os.path.join(safe_makedir(tx_out_dir), "sample_file.txt")
            _write_sample_id_file(data, bam_file, sample_file)
            runner = rnaseqc_runner_from_config(config)
            bam.index(bam_file, config)
            single_end = not bam.is_paired(bam_file)
            runner.run(sample_file, ref_file, rna_file, gtf_file, tx_out_dir, single_end)
            # we don't need this large directory for just the report
            shutil.rmtree(os.path.join(tx_out_dir, data["description"]))
    return _parse_rnaseqc_metrics(metrics_file, data["name"][-1])
Example #58
def _rnaseq_qualimap(bam_file, data, out_dir):
    """
    Run qualimap for a rnaseq bam file and parse results
    """
    report_file = os.path.join(out_dir, "qualimapReport.html")
    config = data["config"]
    gtf_file = dd.get_gtf_file(data)
    ref_file = dd.get_ref_file(data)
    single_end = not bam.is_paired(bam_file)
    if not utils.file_exists(report_file):
        utils.safe_makedir(out_dir)
        bam.index(bam_file, config)
        cmd = _rnaseq_qualimap_cmd(config, bam_file, out_dir, gtf_file, single_end)
        do.run(cmd, "Qualimap for {}".format(data["name"][-1]))
    metrics = _parse_rnaseq_qualimap_metrics(report_file)
    metrics.update(_detect_duplicates(bam_file, out_dir, config))
    metrics.update(_detect_rRNA(config, bam_file, gtf_file, ref_file, out_dir, single_end))
    metrics.update({"Fragment Length Mean": bam.estimate_fragment_size(bam_file)})
    metrics = _parse_metrics(metrics)
    return metrics