Example #1
0
def samblaster_dedup_sort(data, tx_out_file, tx_sr_file, tx_disc_file):
    """Deduplicate and sort with samblaster, produces split read and discordant pair files.
    """
    samblaster = config_utils.get_program("samblaster", data["config"])
    samtools = config_utils.get_program("samtools", data["config"])
    tmp_prefix = "%s-sorttmp" % utils.splitext_plus(tx_out_file)[0]
    tobam_cmd = ("{samtools} sort {sort_opt} -@ {cores} -m {mem} -T {tmp_prefix}-{dext} {out_file} -")
    # full BAM -- allocate more memory and cores
    cores, mem = _get_cores_memory(data, downscale=2)
    # Potentially downsample to maximum coverage here if not splitting and this is a whole genome sample
    ds_cmd = None if data.get("align_split") else bam.get_maxcov_downsample_cl(data, "samtools")
    sort_opt = "-n" if data.get("align_split") and dd.get_mark_duplicates(data) else ""
    if ds_cmd:
        dedup_cmd = "%s %s > %s" % (tobam_cmd.format(out_file="", dext="full", **locals()), ds_cmd, tx_out_file)
    else:
        dedup_cmd = tobam_cmd.format(out_file="-o %s" % tx_out_file, dext="full", **locals())
    # split and discordant BAMs -- give less memory/cores since the files are smaller
    sort_opt = ""
    cores, mem = _get_cores_memory(data, downscale=4)
    splitter_cmd = tobam_cmd.format(out_file="-o %s" % tx_sr_file, dext="spl", **locals())
    discordant_cmd = tobam_cmd.format(out_file="-o %s" % tx_disc_file, dext="disc", **locals())
    # samblaster 0.1.22 and newer require the -M flag for compatibility with bwa-mem
    cmd = ("{samblaster} --addMateTags -M --splitterFile >({splitter_cmd}) --discordantFile >({discordant_cmd}) "
           "| {dedup_cmd}")
    return cmd.format(**locals())
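A minimal usage sketch, not from the source: the returned string is a complete shell pipeline that reads SAM from stdin, so a caller would typically append it to an aligner command line. aligner_cl and the tx_* arguments are hypothetical names for this illustration.

    from bcbio.provenance import do

    def _run_samblaster(aligner_cl, data, tx_out_file, tx_sr_file, tx_disc_file):
        # samblaster consumes the aligner's SAM output from the pipe
        dedup_cl = samblaster_dedup_sort(data, tx_out_file, tx_sr_file, tx_disc_file)
        do.run("%s | %s" % (aligner_cl, dedup_cl), "Dedup and sort with samblaster")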
Example #2
0
def align_to_sort_bam(fastq1, fastq2, aligner, data):
    """Align to the named genome build, returning a sorted BAM file.
    """
    names = data["rgnames"]
    align_dir_parts = [data["dirs"]["work"], "align", names["sample"]]
    if data.get("disambiguate"):
        align_dir_parts.append(data["disambiguate"]["genome_build"])
    aligner_index = _get_aligner_index(aligner, data)
    align_dir = utils.safe_makedir(os.path.join(*align_dir_parts))
    ref_file = tz.get_in(("reference", "fasta", "base"), data)
    if fastq1.endswith(".bam"):
        data = _align_from_bam(fastq1, aligner, aligner_index, ref_file, names,
                               align_dir, data)
    else:
        data = _align_from_fastq(fastq1, fastq2, aligner, aligner_index,
                                 ref_file, names, align_dir, data)
    if data["work_bam"] and utils.file_exists(data["work_bam"]):
        if data.get("align_split") and dd.get_mark_duplicates(data):
            # If merging later with bamsormadup we need query-sorted inputs,
            # but CWL requires a bai file. Create a fake one to make it happy.
            bam.fake_index(data["work_bam"], data)
        else:
            bam.index(data["work_bam"], data["config"])
        for extra in ["-sr", "-disc"]:
            extra_bam = utils.append_stem(data['work_bam'], extra)
            if utils.file_exists(extra_bam):
                bam.index(extra_bam, data["config"])
    return data
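A brief usage sketch with invented file names: the function mutates and returns the per-sample dictionary, recording the sorted BAM under "work_bam".

    data = align_to_sort_bam("sample_R1.fastq.gz", "sample_R2.fastq.gz", "bwa", data)
    sorted_bam = data["work_bam"]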
Example #3
0
def clean_chipseq_alignment(data):
    aligner = dd.get_aligner(data)
    data["align_bam"] = dd.get_work_bam(data)
    if dd.get_mark_duplicates(data):
        if aligner:
            if aligner == "bowtie2":
                filterer = bowtie2.filter_multimappers
            elif aligner == "bwa":
                filterer = bwa.filter_multimappers
            else:
                logger.error("ChIP-seq only supported for bowtie2 and bwa.")
                sys.exit(-1)
            unique_bam = filterer(dd.get_work_bam(data), data)
            data["work_bam"] = unique_bam
        else:
            logger.info(
                "Warning: When a BAM file is given as input, bcbio skips multimapper removal. "
                "If the BAM is not cleaned before peak calling, this can result in downstream errors."
            )
    # lcr_bed = utils.get_in(data, ("genome_resources", "variation", "lcr"))
    data["work_bam"] = _keep_assembled_chrom(dd.get_work_bam(data),
                                             dd.get_ref_file(data),
                                             data["config"])
    encode_bed = tz.get_in(
        ["genome_resources", "variation", "encode_blacklist"], data)
    if encode_bed:
        data["work_bam"] = _prepare_bam(dd.get_work_bam(data), encode_bed,
                                        data['config'])
        bam.index(data["work_bam"], data['config'])
    data["bigwig"] = _bam_coverage(dd.get_sample_name(data),
                                   dd.get_work_bam(data), data)
    return [[data]]
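The [[data]] return value follows the nested per-sample structure that bcbio's parallel runners consume; a hypothetical unpacking, for illustration only:

    processed = clean_chipseq_alignment(data)
    cleaned = processed[0][0]
    print(cleaned["work_bam"], cleaned["bigwig"])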
Example #4
0
def clean_chipseq_alignment(data):
    aligner = dd.get_aligner(data)
    data["align_bam"] = dd.get_work_bam(data)
    if dd.get_mark_duplicates(data):
        if aligner:
            if aligner == "bowtie2":
                filterer = bowtie2.filter_multimappers
            elif aligner == "bwa":
                filterer = bwa.filter_multimappers
            else:
                logger.error("ChIP-seq only supported for bowtie2 and bwa.")
                sys.exit(-1)
            unique_bam = filterer(dd.get_work_bam(data), data)
            data["work_bam"] = unique_bam
        else:
            logger.info("Warning: When BAM file is given as input, bcbio skips multimappers removal."
                        "If BAM is not cleaned for peak calling, can result in downstream errors.")
    # lcr_bed = utils.get_in(data, ("genome_resources", "variation", "lcr"))
    data["work_bam"] = _keep_assembled_chrom(dd.get_work_bam(data), dd.get_ref_file(data),
                                             data["config"])
    encode_bed = tz.get_in(["genome_resources", "variation", "encode_blacklist"], data)
    if encode_bed:
        data["work_bam"] = _prepare_bam(dd.get_work_bam(data), encode_bed, data['config'])
        bam.index(data["work_bam"], data['config'])
    data["bigwig"] = _bam_coverage(dd.get_sample_name(data), dd.get_work_bam(data), data)
    return [[data]]
Example #5
0
def samblaster_dedup_sort(data, tx_out_file, tx_sr_file, tx_disc_file):
    """Deduplicate and sort with samblaster, produces split read and discordant pair files.
    """
    samblaster = config_utils.get_program("samblaster", data["config"])
    samtools = config_utils.get_program("samtools", data["config"])
    tmp_prefix = "%s-sorttmp" % utils.splitext_plus(tx_out_file)[0]
    tobam_cmd = ("{samtools} sort {sort_opt} -@ {cores} -m {mem} -T {tmp_prefix}-{dext} {out_file} -")
    # full BAM -- allocate more memory and cores
    cores, mem = _get_cores_memory(data, downscale=2)
    # Potentially downsample to maximum coverage here if not splitting and this is a whole genome sample
    ds_cmd = None if data.get("align_split") else bam.get_maxcov_downsample_cl(data, "samtools")
    sort_opt = "-n" if data.get("align_split") and dd.get_mark_duplicates(data) else ""
    if ds_cmd:
        dedup_cmd = "%s %s > %s" % (tobam_cmd.format(out_file="", dext="full", **locals()), ds_cmd, tx_out_file)
    else:
        dedup_cmd = tobam_cmd.format(out_file="-o %s" % tx_out_file, dext="full", **locals())
    # split and discordant BAMs -- give less memory/cores since the files are smaller
    sort_opt = ""
    cores, mem = _get_cores_memory(data, downscale=4)
    splitter_cmd = tobam_cmd.format(out_file="-o %s" % tx_sr_file, dext="spl", **locals())
    discordant_cmd = tobam_cmd.format(out_file="-o %s" % tx_disc_file, dext="disc", **locals())
    # samblaster 0.1.22 and newer require the -M flag for compatibility with bwa-mem
    cmd = ("{samblaster} --addMateTags -M --splitterFile >({splitter_cmd}) --discordantFile >({discordant_cmd}) "
           "| {dedup_cmd}")
    return cmd.format(**locals())
Example #6
0
def align_to_sort_bam(fastq1, fastq2, aligner, data):
    """Align to the named genome build, returning a sorted BAM file.
    """
    names = data["rgnames"]
    align_dir_parts = [data["dirs"]["work"], "align", names["sample"]]
    if data.get("disambiguate"):
        align_dir_parts.append(data["disambiguate"]["genome_build"])
    aligner_index = _get_aligner_index(aligner, data)
    align_dir = utils.safe_makedir(os.path.join(*align_dir_parts))
    ref_file = tz.get_in(("reference", "fasta", "base"), data)
    if fastq1.endswith(".bam"):
        data = _align_from_bam(fastq1, aligner, aligner_index, ref_file,
                               names, align_dir, data)
    else:
        data = _align_from_fastq(fastq1, fastq2, aligner, aligner_index, ref_file,
                                 names, align_dir, data)
    if data["work_bam"] and utils.file_exists(data["work_bam"]):
        if data.get("align_split") and dd.get_mark_duplicates(data):
            # If merging later with bamsormadup we need query-sorted inputs,
            # but CWL requires a bai file. Create a fake one to make it happy.
            bam.fake_index(data["work_bam"], data)
        else:
            bam.index(data["work_bam"], data["config"])
        for extra in ["-sr", "-disc"]:
            extra_bam = utils.append_stem(data['work_bam'], extra)
            if utils.file_exists(extra_bam):
                bam.index(extra_bam, data["config"])
    return data
Example #7
0
def merge_bam_files(bam_files, work_dir, data, out_file=None, batch=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Checks system open file limit and merges in batches if necessary to avoid
    file handle limits.
    """
    out_file = _merge_outfile_fname(out_file, bam_files, work_dir, batch)
    if not utils.file_exists(out_file):
        if len(bam_files) == 1 and bam.bam_already_sorted(
                bam_files[0], data["config"], "coordinate"):
            with file_transaction(data, out_file) as tx_out_file:
                _create_merge_filelist(bam_files, tx_out_file, data["config"])
            out_file = bam_files[0]
            samtools = config_utils.get_program("samtools", data["config"])
            do.run('{} quickcheck -v {}'.format(samtools, out_file),
                   "Check for valid merged BAM after transfer")
        else:
            with tx_tmpdir(data) as tmpdir:
                with utils.chdir(tmpdir):
                    with file_transaction(data, out_file) as tx_out_file:
                        tx_bam_file_list = _create_merge_filelist(
                            bam_files, tx_out_file, data["config"])
                        sambamba = config_utils.get_program(
                            "sambamba", data["config"])
                        samtools = config_utils.get_program(
                            "samtools", data["config"])
                        resources = config_utils.get_resources(
                            "samtools", data["config"])
                        num_cores = dd.get_num_cores(data)
                        # Aim for 3.5Gb/core memory for BAM merging
                        num_cores = config_utils.adjust_cores_to_mb_target(
                            3500, resources.get("memory", "2G"), num_cores)
                        max_mem = config_utils.adjust_memory(
                            resources.get("memory", "1G"), 2,
                            "decrease").upper()
                        if dd.get_mark_duplicates(data):
                            cmd = _biobambam_merge_dedup_maxcov(data)
                        else:
                            cmd = _biobambam_merge_maxcov(data)
                        do.run(
                            cmd.format(**locals()), "Merge bam files to %s" %
                            os.path.basename(out_file), None)
                        do.run(
                            '{} quickcheck -v {}'.format(
                                samtools, tx_out_file),
                            "Check for valid merged BAM")
            do.run('{} quickcheck -v {}'.format(samtools, out_file),
                   "Check for valid merged BAM after transfer")
            _finalize_merge(out_file, bam_files, data["config"])
    bam.index(out_file, data["config"])
    return out_file
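A hypothetical call for illustration, assuming per-split BAMs collected for a single sample (file names are invented):

    split_bams = ["sample-split1.bam", "sample-split2.bam", "sample-split3.bam"]
    merged_bam = merge_bam_files(split_bams, work_dir, data)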
Example #8
0
def run(align_bams, items, ref_file, assoc_files, region, out_file):
    """Run platypus variant calling, germline whole genome or exome.
    """
    assert out_file.endswith(".vcf.gz")
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            for align_bam in align_bams:
                bam.index(align_bam, items[0]["config"])
            cmd = [
                "platypus", "callVariants",
                "--regions=%s" % _subset_regions(region, out_file, items),
                "--bamFiles=%s" % ",".join(align_bams),
                "--refFile=%s" % dd.get_ref_file(items[0]), "--output=-",
                "--logFileName", "/dev/null", "--verbosity=1"
            ]
            resources = config_utils.get_resources("platypus",
                                                   items[0]["config"])
            if resources.get("options"):
                # normalize options so we can set defaults without overwriting user-specified values
                for opt in resources["options"]:
                    if "=" in opt:
                        key, val = opt.split("=")
                        cmd.extend([key, val])
                    else:
                        cmd.append(opt)
            if any("gvcf" in dd.get_tools_on(d) for d in items):
                cmd += ["--outputRefCalls", "1", "--refCallBlockSize", "50000"]
            # Adjust default filter thresholds to achieve similar sensitivity/specificity to other callers.
            # Currently not used after further cross validation, as they increase false positives,
            # which seems to be a major disadvantage for Platypus users.
            # tuned_opts = ["--hapScoreThreshold", "10", "--scThreshold", "0.99", "--filteredReadsFrac", "0.9",
            #               "--rmsmqThreshold", "20", "--qdThreshold", "0", "--abThreshold", "0.0001",
            #               "--minVarFreq", "0.0", "--assemble", "1"]
            # for okey, oval in utils.partition_all(2, tuned_opts):
            #     if okey not in cmd:
            #         cmd.extend([okey, oval])

            # Avoid filtering duplicates on high depth targeted regions where we don't mark duplicates
            if any(not dd.get_mark_duplicates(data) for data in items):
                cmd += ["--filterDuplicates=0"]
            post_process_cmd = (
                " | %s | %s | %s | vcfallelicprimitives -t DECOMPOSED --keep-geno | vcffixup - | "
                "vcfstreamsort | bgzip -c > %s" %
                (vcfutils.fix_ambiguous_cl(), vcfutils.fix_ambiguous_cl(5),
                 vcfutils.add_contig_to_header_cl(dd.get_ref_file(items[0]),
                                                  tx_out_file), tx_out_file))
            do.run(" ".join(cmd) + post_process_cmd,
                   "platypus variant calling")
        out_file = vcfutils.bgzip_and_index(out_file, items[0]["config"])
    return out_file
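A hedged invocation sketch: the region argument is assumed here to be a (chrom, start, end) tuple, and all file names are invented.

    out = run(["sample-sort.bam"], [data], "GRCh37.fa", None,
              ("chr1", 0, 1000000), "sample-platypus.vcf.gz")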
Example #9
0
def _biobambam_dedup_sort(data, tx_out_file):
    """Perform streaming deduplication and sorting with biobambam's bamsormadup
    """
    samtools = config_utils.get_program("samtools", data["config"])
    cores, mem = _get_cores_memory(data, downscale=2)
    tmp_file = "%s-sorttmp" % utils.splitext_plus(tx_out_file)[0]
    if data.get("align_split"):
        sort_opt = "-n" if data.get("align_split") and dd.get_mark_duplicates(data) else ""
        cmd = "{samtools} sort %s -@ {cores} -m {mem} -O bam -T {tmp_file}-namesort -o {tx_out_file} -" % sort_opt
    else:
        ds_cmd = bam.get_maxcov_downsample_cl(data, "bamsormadup")
        cmd = ("bamsormadup inputformat=sam threads={cores} tmpfile={tmp_file}-markdup "
               "SO=coordinate %s > {tx_out_file}" % ds_cmd)
    return cmd.format(**locals())
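A minimal composition sketch, assuming the aligner writes SAM to stdout; bwa_cl and out_file are hypothetical names for this illustration:

    from bcbio.distributed.transaction import file_transaction
    from bcbio.provenance import do

    with file_transaction(data, out_file) as tx_out_file:
        sort_cl = _biobambam_dedup_sort(data, tx_out_file)
        do.run("%s | %s" % (bwa_cl, sort_cl), "Dedup and sort with bamsormadup")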
Example #10
0
def _biobambam_dedup_sort(data, tx_out_file):
    """Perform streaming deduplication and sorting with biobambam's bamsormadup
    """
    samtools = config_utils.get_program("samtools", data["config"])
    cores, mem = _get_cores_memory(data, downscale=2)
    tmp_file = "%s-sorttmp" % utils.splitext_plus(tx_out_file)[0]
    if data.get("align_split"):
        sort_opt = "-n" if data.get("align_split") and dd.get_mark_duplicates(data) else ""
        cmd = "{samtools} sort %s -@ {cores} -m {mem} -O bam -T {tmp_file}-namesort -o {tx_out_file} -" % sort_opt
    else:
        ds_cmd = bam.get_maxcov_downsample_cl(data, "bamsormadup")
        bamsormadup = config_utils.get_program("bamsormadup", data)
        cmd = ("{bamsormadup} inputformat=sam threads={cores} tmpfile={tmp_file}-markdup "
               "SO=coordinate %s > {tx_out_file}" % ds_cmd)
    return cmd.format(**locals())
Example #11
0
def run(align_bams, items, ref_file, assoc_files, region, out_file):
    """Run platypus variant calling, germline whole genome or exome.
    """
    assert out_file.endswith(".vcf.gz")
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            for align_bam in align_bams:
                bam.index(align_bam, items[0]["config"])
            cmd = ["platypus", "callVariants", "--regions=%s" % _subset_regions(region, out_file, items),
                   "--bamFiles=%s" % ",".join(align_bams),
                   "--refFile=%s" % dd.get_ref_file(items[0]), "--output=-",
                   "--logFileName", "/dev/null", "--verbosity=1"]
            resources = config_utils.get_resources("platypus", items[0]["config"])
            if resources.get("options"):
                # normalize options so we can set defaults without overwriting user-specified values
                for opt in resources["options"]:
                    if "=" in opt:
                        key, val = opt.split("=")
                        cmd.extend([key, val])
                    else:
                        cmd.append(opt)
            if any("gvcf" in dd.get_tools_on(d) for d in items):
                cmd += ["--outputRefCalls", "1", "--refCallBlockSize", "50000"]
            # Adjust default filter thresholds to achieve similar sensitivity/specificity to other callers.
            # Currently not used after further cross validation, as they increase false positives,
            # which seems to be a major disadvantage for Platypus users.
            # tuned_opts = ["--hapScoreThreshold", "10", "--scThreshold", "0.99", "--filteredReadsFrac", "0.9",
            #               "--rmsmqThreshold", "20", "--qdThreshold", "0", "--abThreshold", "0.0001",
            #               "--minVarFreq", "0.0", "--assemble", "1"]
            # for okey, oval in utils.partition_all(2, tuned_opts):
            #     if okey not in cmd:
            #         cmd.extend([okey, oval])

            # Avoid filtering duplicates on high depth targeted regions where we don't mark duplicates
            if any(not dd.get_mark_duplicates(data) for data in items):
                cmd += ["--filterDuplicates=0"]
            post_process_cmd = (" | %s | %s | %s | vcfallelicprimitives -t DECOMPOSED --keep-geno | vcffixup - | "
                                "vcfstreamsort | bgzip -c > %s" % (vcfutils.fix_ambiguous_cl(),
                                                                   vcfutils.fix_ambiguous_cl(5),
                                                                   vcfutils.add_contig_to_header_cl(items[0]),
                                                                   tx_out_file))
            do.run(" ".join(cmd) + post_process_cmd, "platypus variant calling")
        out_file = vcfutils.bgzip_and_index(out_file, items[0]["config"])
    return out_file
Example #12
0
def merge_bam_files(bam_files, work_dir, data, out_file=None, batch=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Checks system open file limit and merges in batches if necessary to avoid
    file handle limits.
    """
    out_file = _merge_outfile_fname(out_file, bam_files, work_dir, batch)
    if not utils.file_exists(out_file):
        if len(bam_files) == 1 and bam.bam_already_sorted(bam_files[0], data["config"], "coordinate"):
            with file_transaction(data, out_file) as tx_out_file:
                _create_merge_filelist(bam_files, tx_out_file, data["config"])
            out_file = bam_files[0]
            samtools = config_utils.get_program("samtools", data["config"])
            do.run('{} quickcheck -v {}'.format(samtools, out_file),
                   "Check for valid merged BAM after transfer")
        else:
            with tx_tmpdir(data) as tmpdir:
                with utils.chdir(tmpdir):
                    with file_transaction(data, out_file) as tx_out_file:
                        tx_bam_file_list = _create_merge_filelist(bam_files, tx_out_file, data["config"])
                        samtools = config_utils.get_program("samtools", data["config"])
                        resources = config_utils.get_resources("samtools", data["config"])
                        num_cores = dd.get_num_cores(data)
                        # Aim for 3.5Gb/core memory for BAM merging
                        num_cores = config_utils.adjust_cores_to_mb_target(
                            3500, resources.get("memory", "2G"), num_cores)
                        max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                                             2, "decrease").upper()
                        if dd.get_mark_duplicates(data):
                            cmd = _biobambam_merge_dedup_maxcov(data)
                        else:
                            cmd = _biobambam_merge_maxcov(data)
                        do.run(cmd.format(**locals()), "Merge bam files to %s" % os.path.basename(out_file),
                                None)
                        do.run('{} quickcheck -v {}'.format(samtools, tx_out_file),
                               "Check for valid merged BAM")
            do.run('{} quickcheck -v {}'.format(samtools, out_file),
                   "Check for valid merged BAM after transfer")
            _finalize_merge(out_file, bam_files, data["config"])
    bam.index(out_file, data["config"])
    return out_file
Example #13
0
def parallel_prep_region(samples, run_parallel):
    """Perform full pre-variant calling BAM prep work on regions.
    """
    file_key = "work_bam"
    split_fn = _split_by_regions("bamprep", "-prep.bam", file_key)
    # identify samples that do not need preparation -- no recalibration or realignment
    extras = []
    torun = []
    for data in [x[0] for x in samples]:
        if data.get("work_bam"):
            data["align_bam"] = data["work_bam"]
        if (not dd.get_realign(data) and not dd.get_variantcaller(data)):
            extras.append([data])
        elif not data.get(file_key):
            extras.append([data])
        else:
            # Do not want to re-run duplicate marking after realignment
            data["config"]["algorithm"]["orig_markduplicates"] = dd.get_mark_duplicates(data)
            data = dd.set_mark_duplicates(data, False)
            torun.append([data])
    return extras + parallel_split_combine(torun, split_fn, run_parallel,
                                           "piped_bamprep", _add_combine_info, file_key, ["config"])
Example #14
0
def parallel_prep_region(samples, run_parallel):
    """Perform full pre-variant calling BAM prep work on regions.
    """
    file_key = "work_bam"
    split_fn = _split_by_regions("bamprep", "-prep.bam", file_key)
    # identify samples that do not need preparation -- no recalibration or realignment
    extras = []
    torun = []
    for data in [x[0] for x in samples]:
        if data.get("work_bam"):
            data["align_bam"] = data["work_bam"]
        if (not dd.get_realign(data) and not dd.get_variantcaller(data)):
            extras.append([data])
        elif not data.get(file_key):
            extras.append([data])
        else:
            # Do not want to re-run duplicate marking after realignment
            data["config"]["algorithm"][
                "orig_markduplicates"] = dd.get_mark_duplicates(data)
            data = dd.set_mark_duplicates(data, False)
            torun.append([data])
    return extras + parallel_split_combine(torun, split_fn, run_parallel,
                                           "piped_bamprep", _add_combine_info,
                                           file_key, ["config"])
Example #15
0
def _skip_duplicates(data):
    return (dd.get_coverage_interval(data) == "amplicon"
            or (dd.get_aligner(data) and not dd.get_mark_duplicates(data)))
Example #16
0
def _skip_duplicates(data):
    return dd.get_coverage_interval(data) == "amplicon" or not dd.get_mark_duplicates(data)
Example #17
0
def _skip_duplicates(data):
    return (dd.get_coverage_interval(data) == "amplicon" or
            (dd.get_aligner(data) and not dd.get_mark_duplicates(data)))
Example #18
0
def _skip_duplicates(data):
    return dd.get_coverage_interval(
        data) == "amplicon" or not dd.get_mark_duplicates(data)