Example 1
def _convert_bam_to_fastq(in_file, work_dir, data, dirs, config):
    """Convert BAM input file into FASTQ files.
    """
    out_dir = safe_makedir(os.path.join(work_dir, "fastq_convert"))

    qual_bin_method = config["algorithm"].get("quality_bin")
    if (qual_bin_method == "prealignment"
            or (isinstance(qual_bin_method, list)
                and "prealignment" in qual_bin_method)):
        out_bindir = safe_makedir(os.path.join(out_dir, "qualbin"))
        in_file = cram.illumina_qual_bin(in_file, data["sam_ref"], out_bindir,
                                         config)

    out_files = [
        os.path.join(
            out_dir, "{0}_{1}.fastq".format(
                os.path.splitext(os.path.basename(in_file))[0], x))
        for x in ["1", "2"]
    ]
    if bam.is_paired(in_file):
        out1, out2 = out_files
    else:
        out1 = out_files[0]
        out2 = None
    if not file_exists(out1):
        broad_runner = broad.runner_from_path("picard", config)
        broad_runner.run_fn("picard_bam_to_fastq", in_file, out1, out2)
    if out2 and os.path.getsize(out2) == 0:
        out2 = None
    return [out1, out2]
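
As a hedged illustration of how this converter might be driven (the file paths are hypothetical, and it assumes bcbio-nextgen plus Picard are installed so the function and its helpers are importable):

# Minimal driver sketch; "sample.bam" and the reference path are placeholders.
config = {"algorithm": {}}                # no "quality_bin" requested
data = {"sam_ref": "/refs/GRCh37.fa"}     # only read when quality binning is enabled
fq1, fq2 = _convert_bam_to_fastq("sample.bam", "work", data, None, config)
# fq2 is None for single-end input, or when the second FASTQ comes back empty.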
Example 2
def _prep_inputs(align_bams, ref_file, items):
    """Ensure inputs to calling are indexed as expected.
    """
    broad_runner = broad.runner_from_path("picard", items[0]["config"])
    broad_runner.run_fn("picard_index_ref", ref_file)
    for x in align_bams:
        bam.index(x, items[0]["config"])
Example 3
def prep_recal(data):
    """Perform a GATK recalibration of the sorted aligned BAM, producing recalibrated BAM.
    """
    if dd.get_recalibrate(data) in [True, "gatk"]:
        logger.info("Recalibrating %s with GATK" % str(dd.get_sample_name(data)))
        ref_file = data["sam_ref"]
        config = data["config"]
        dbsnp_file = tz.get_in(("genome_resources", "variation", "dbsnp"), data)
        if not dbsnp_file:
            logger.info("Skipping GATK BaseRecalibrator because no VCF file of known variants was found.")
            return [[data]]
        platform = config["algorithm"].get("platform", "illumina")
        broad_runner = broad.runner_from_path("picard", config)
        broad_runner.run_fn("picard_index_ref", ref_file)
        if config["algorithm"].get("mark_duplicates", True):
            (dup_align_bam, _) = broad_runner.run_fn("picard_mark_duplicates", data["work_bam"])
        else:
            dup_align_bam = data["work_bam"]
        bam.index(dup_align_bam, config)
        intervals = config["algorithm"].get("variant_regions", None)
        data["work_bam"] = dup_align_bam
        broad_runner = broad.runner_from_config(config)  # switch from the Picard runner to the GATK runner for recalibration
        data["prep_recal"] = _gatk_base_recalibrator(broad_runner, dup_align_bam, ref_file,
                                                     platform, dbsnp_file, intervals, data)
    return [[data]]
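
prep_recal reads only a handful of fields from data. A minimal sketch of the expected structure, assuming dd.get_recalibrate and dd.get_sample_name resolve to the keys below (paths are placeholders; real pipeline data carries many more fields):

# Hypothetical minimal input for prep_recal.
data = {
    "description": "sample1",   # assumed source for dd.get_sample_name
    "sam_ref": "/refs/GRCh37.fa",
    "work_bam": "sample-sort.bam",
    "config": {"algorithm": {"recalibrate": "gatk",
                             "platform": "illumina",
                             "mark_duplicates": True}},
    "genome_resources": {"variation": {"dbsnp": "/refs/dbsnp.vcf.gz"}},
}
# prep_recal(data) returns [[data]] with data["prep_recal"] set; without the
# dbsnp entry it logs a skip message and returns the data unchanged.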
Example 4
def _shared_gatk_call_prep(align_bams, items, ref_file, region, out_file, num_cores=1):
    """Shared preparation work for GATK variant calling.
    """
    data = items[0]
    config = data["config"]
    broad_runner = broad.runner_from_config(config)
    gatk_type = broad_runner.gatk_type()
    for x in align_bams:
        bam.index(x, config)
    if _use_spark(num_cores, gatk_type):
        # GATK4 spark runs use 2bit reference index
        params = ["--reference", dd.get_ref_twobit(items[0])]
    else:
        picard_runner = broad.runner_from_path("picard", config)
        picard_runner.run_fn("picard_index_ref", ref_file)
        params = ["-R", ref_file]
    coverage_depth_min = tz.get_in(["algorithm", "coverage_depth_min"], config)
    if coverage_depth_min and coverage_depth_min < 4:
        confidence = "4.0"
        params += ["--standard_min_confidence_threshold_for_calling", confidence]
    for a in annotation.get_gatk_annotations(config):
        params += ["--annotation", a]
    for x in align_bams:
        params += ["-I", x]
    variant_regions = bedutils.population_variant_regions(items)
    region = subset_variant_regions(variant_regions, region, out_file, items)
    if region:
        if gatk_type == "gatk4":
            params += ["-L", bamprep.region_to_gatk(region), "--interval-set-rule", "INTERSECTION"]
        else:
            params += ["-L", bamprep.region_to_gatk(region), "--interval_set_rule", "INTERSECTION"]
    params += standard_cl_params(items)
    return broad_runner, params
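
The only GATK3/GATK4 difference in the region handling above is flag spelling. A small hedged helper isolating that choice (the helper name is mine, not bcbio's):

def _interval_params(region_str, gatk_type):
    # GATK4 spells its flags with hyphens, GATK3 with underscores.
    rule = "--interval-set-rule" if gatk_type == "gatk4" else "--interval_set_rule"
    return ["-L", region_str, rule, "INTERSECTION"]

# _interval_params("chr1:1-100000", "gatk4")
# -> ["-L", "chr1:1-100000", "--interval-set-rule", "INTERSECTION"]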
Example 5
def _shared_gatk_call_prep(align_bams, items, ref_file, dbsnp, region, out_file):
    """Shared preparation work for GATK variant calling.
    """
    data = items[0]
    config = data["config"]
    broad_runner = broad.runner_from_path("picard", config)
    broad_runner.run_fn("picard_index_ref", ref_file)
    for x in align_bams:
        bam.index(x, config)
    params = ["-R", ref_file]
    coverage_depth_min = tz.get_in(["algorithm", "coverage_depth_min"], config)
    if coverage_depth_min and coverage_depth_min < 4:
        confidence = "4.0"
        params += ["--standard_min_confidence_threshold_for_calling", confidence,
                   "--standard_min_confidence_threshold_for_emitting", confidence]
    for a in annotation.get_gatk_annotations(config):
        params += ["--annotation", a]
    for x in align_bams:
        params += ["-I", x]
    if dbsnp:
        params += ["--dbsnp", dbsnp]
    variant_regions = tz.get_in(["algorithm", "variant_regions"], config)
    region = subset_variant_regions(variant_regions, region, out_file, items)
    if region:
        params += ["-L", bamprep.region_to_gatk(region), "--interval_set_rule", "INTERSECTION"]
    broad_runner = broad.runner_from_config(config)  # switch from the Picard runner to the GATK runner used for calling
    return broad_runner, params
Example 6
def _shared_gatk_call_prep(align_bams, items, ref_file, dbsnp, region,
                           out_file):
    """Shared preparation work for GATK variant calling.
    """
    data = items[0]
    config = data["config"]
    broad_runner = broad.runner_from_path("picard", config)
    broad_runner.run_fn("picard_index_ref", ref_file)
    for x in align_bams:
        bam.index(x, config)
    params = ["-R", ref_file]
    coverage_depth_min = tz.get_in(["algorithm", "coverage_depth_min"], config)
    if coverage_depth_min and coverage_depth_min < 4:
        confidence = "4.0"
        params += [
            "--standard_min_confidence_threshold_for_calling", confidence
        ]
    for a in annotation.get_gatk_annotations(config):
        params += ["--annotation", a]
    for x in align_bams:
        params += ["-I", x]
    if dbsnp:
        params += ["--dbsnp", dbsnp]
    variant_regions = bedutils.population_variant_regions(items)
    region = subset_variant_regions(variant_regions, region, out_file, items)
    if region:
        params += [
            "-L",
            bamprep.region_to_gatk(region), "--interval_set_rule",
            "INTERSECTION"
        ]
    params += standard_cl_params(items)
    broad_runner = broad.runner_from_config(config)  # switch from the Picard runner to the GATK runner used for calling
    return broad_runner, params
Example 7
def _mutect_call_prep(align_bams, items, ref_file, assoc_files,
                       region=None, out_file=None):
    """Preparation work for MuTect.
    """
    base_config = items[0]["config"]
    broad_runner = broad.runner_from_path("picard", base_config)
    broad_runner.run_fn("picard_index_ref", ref_file)

    broad_runner = broad.runner_from_config(base_config, "mutect")
    _check_mutect_version(broad_runner)
    for x in align_bams:
        bam.index(x, base_config)

    paired = vcfutils.get_paired_bams(align_bams, items)
    if not paired:
        raise ValueError("Specified MuTect calling but 'tumor' phenotype not present in batch\n"
                         "https://bcbio-nextgen.readthedocs.org/en/latest/contents/"
                         "pipelines.html#cancer-variant-calling\n"
                         "for samples: %s" % ", " .join([dd.get_sample_name(x) for x in items]))
    params = ["-R", ref_file, "-T", "MuTect", "-U", "ALLOW_N_CIGAR_READS"]
    params += ["--read_filter", "NotPrimaryAlignment"]
    params += ["-I:tumor", paired.tumor_bam]
    params += ["--tumor_sample_name", paired.tumor_name]
    if paired.normal_bam is not None:
        params += ["-I:normal", paired.normal_bam]
        params += ["--normal_sample_name", paired.normal_name]
    if paired.normal_panel is not None:
        params += ["--normal_panel", paired.normal_panel]
    params += _config_params(base_config, assoc_files, region, out_file)
    return broad_runner, params
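
The paired object consumed above exposes five attributes. A hedged stand-in showing just the fields this code reads (the real object comes from bcbio's vcfutils and carries more information):

from collections import namedtuple

# Stub with only the attributes _mutect_call_prep touches.
PairedStub = namedtuple("PairedStub", ["tumor_bam", "tumor_name",
                                       "normal_bam", "normal_name",
                                       "normal_panel"])
paired = PairedStub("tumor.bam", "T1", "normal.bam", "N1", None)
# With normal_panel left as None, the --normal_panel arguments are omitted.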
Example 8
def collect_oxog_metrics(data):
    """ extracts 8-oxoguanine (OxoG) artifact metrics from CollectSequencingArtifacts
    output so we don't have to run CollectOxoGMetrics.
    """
    input_base = os.path.join(dd.get_work_dir(data), "metrics", "artifact", dd.get_sample_name(data),
                              dd.get_sample_name(data))
    if not utils.file_exists(input_base + ".pre_adapter_detail_metrics"):
        return None
    OUT_SUFFIXES = [".oxog_metrics"]
    picard = broad.runner_from_path("picard", dd.get_config(data))
    out_dir = os.path.join(dd.get_work_dir(data), "metrics", "oxog", dd.get_sample_name(data))
    utils.safe_makedir(out_dir)
    ref_file = dd.get_ref_file(data)
    out_base = os.path.join(out_dir, dd.get_sample_name(data))
    out_files = [out_base + x for x in OUT_SUFFIXES]
    if all([utils.file_exists(x) for x in out_files]):
        return out_files
    with file_transaction(data, out_dir) as tx_out_dir:
        utils.safe_makedir(tx_out_dir)
        out_base = os.path.join(tx_out_dir, dd.get_sample_name(data))
        params = [("--INPUT_BASE", input_base),
                  ("--OUTPUT_BASE", out_base),
                  ("--REFERENCE_SEQUENCE", ref_file)]
        picard.run("ConvertSequencingArtifactToOxoG", params)
    return out_files
Example 9
def collect_artifact_metrics(data):
    """Run CollectSequencingArtifacts to collect pre-adapter ligation artifact metrics
    https://gatk.broadinstitute.org/hc/en-us/articles/360037429491-CollectSequencingArtifactMetrics-Picard-
    use picard wrapper rather than gatk - works for gatk4 and gatk3 projects
    refactor - move to broad/picardrun
    """
    OUT_SUFFIXES = [".bait_bias_detail_metrics", ".error_summary_metrics",
                    ".pre_adapter_detail_metrics", ".pre_adapter_summary_metrics"]
    picard = broad.runner_from_path("picard", dd.get_config(data))
    ref_file = dd.get_ref_file(data)
    bam_file = dd.get_work_bam(data)
    if not bam_file:
        return None
    if "collectsequencingartifacts" in dd.get_tools_off(data):
        return None
    out_dir = os.path.join(dd.get_work_dir(data), "metrics", "artifact", dd.get_sample_name(data))
    utils.safe_makedir(out_dir)
    out_base = os.path.join(out_dir, dd.get_sample_name(data))
    out_files = [out_base + x for x in OUT_SUFFIXES]
    if all([utils.file_exists(x) for x in out_files]):
        return out_files
    with file_transaction(data, out_dir) as tx_out_dir:
        utils.safe_makedir(tx_out_dir)
        out_base = os.path.join(tx_out_dir, dd.get_sample_name(data))
        params = [("-REFERENCE_SEQUENCE", ref_file),
                  ("-INPUT", bam_file),
                  ("-OUTPUT", out_base)]
        # picard runner sets VALIDATION_STRINGENCY
        picard.run("CollectSequencingArtifactMetrics", params)
    return out_files
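
This function and the previous one share the same idempotency pattern: compute the final output names, return early when they all exist, otherwise write into a transactional directory so an interrupted run never leaves truncated metrics behind. A generic hedged sketch of that pattern, using the same helpers as above:

def run_once(data, out_dir, out_files, produce):
    # Skip the tool entirely when every expected output already exists.
    if all(utils.file_exists(x) for x in out_files):
        return out_files
    with file_transaction(data, out_dir) as tx_out_dir:
        utils.safe_makedir(tx_out_dir)
        produce(tx_out_dir)  # write outputs into the temporary directory
    # file_transaction promotes tx_out_dir to out_dir only on success.
    return out_files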
Example 10
def prep_recal(data):
    """Perform a GATK recalibration of the sorted aligned BAM, producing recalibrated BAM.
    """
    if data["config"]["algorithm"].get("recalibrate", True) in [True, "gatk"]:
        logger.info("Recalibrating %s with GATK" % str(data["name"]))
        ref_file = data["sam_ref"]
        config = data["config"]
        dbsnp_file = tz.get_in(("genome_resources", "variation", "dbsnp"),
                               data)
        if not dbsnp_file:
            logger.info(
                "Skipping GATK BaseRecalibrator because no VCF file of known variants was found."
            )
            return [[data]]
        platform = config["algorithm"].get("platform", "illumina")
        broad_runner = broad.runner_from_path("picard", config)
        broad_runner.run_fn("picard_index_ref", ref_file)
        if config["algorithm"].get("mark_duplicates", True):
            (dup_align_bam, _) = broad_runner.run_fn("picard_mark_duplicates",
                                                     data["work_bam"])
        else:
            dup_align_bam = data["work_bam"]
        bam.index(dup_align_bam, config)
        intervals = config["algorithm"].get("variant_regions", None)
        data["work_bam"] = dup_align_bam
        broad_runner = broad.runner_from_config(config)  # switch from the Picard runner to the GATK runner for recalibration
        data["prep_recal"] = _gatk_base_recalibrator(broad_runner,
                                                     dup_align_bam, ref_file,
                                                     platform, dbsnp_file,
                                                     intervals, data)
    return [[data]]
Example 11
def _shared_gatk_call_prep(align_bams, items, ref_file, region, out_file, num_cores=1):
    """Shared preparation work for GATK variant calling.
    """
    data = items[0]
    config = data["config"]
    broad_runner = broad.runner_from_config(config)
    gatk_type = broad_runner.gatk_type()
    for x in align_bams:
        bam.index(x, config)
    picard_runner = broad.runner_from_path("picard", config)
    picard_runner.run_fn("picard_index_ref", ref_file)
    params = ["-R", ref_file]
    coverage_depth_min = tz.get_in(["algorithm", "coverage_depth_min"], config)
    if coverage_depth_min and coverage_depth_min < 4:
        confidence = "4.0"
        params += ["--standard_min_confidence_threshold_for_calling", confidence]
    for a in annotation.get_gatk_annotations(config):
        params += ["--annotation", a]
    for x in align_bams:
        params += ["-I", x]
    variant_regions = bedutils.population_variant_regions(items)
    region = subset_variant_regions(variant_regions, region, out_file, items)
    if region:
        if gatk_type == "gatk4":
            params += ["-L", bamprep.region_to_gatk(region), "--interval-set-rule", "INTERSECTION"]
        else:
            params += ["-L", bamprep.region_to_gatk(region), "--interval_set_rule", "INTERSECTION"]
    params += standard_cl_params(items)
    return broad_runner, params
Example 12
def combine_bam(in_files, out_file, config):
    """Parallel target to combine multiple BAM files.
    """
    runner = broad.runner_from_path("picard", config)
    runner.run_fn("picard_merge", in_files, out_file)
    for in_file in in_files:
        save_diskspace(in_file, "Merged into {0}".format(out_file), config)
    bam.index(out_file, config)
    return out_file
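
A hedged usage sketch (file names hypothetical). Note that combine_bam indexes the merged BAM itself, and save_diskspace may reclaim the inputs depending on configuration, so callers should keep only the returned path:

merged = combine_bam(["lane1.bam", "lane2.bam"], "sample-merged.bam", config)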
Example 13
def process_alignment(data, alt_input=None):
    """Do an alignment of fastq files, preparing a sorted BAM output file.
    """
    data = utils.to_single_data(data)
    fastq1, fastq2 = dd.get_input_sequence_files(data)
    if alt_input:
        fastq1, fastq2 = alt_input
    config = data["config"]
    aligner = config["algorithm"].get("aligner", None)
    if fastq1 and objectstore.file_exists_or_remote(fastq1) and aligner:
        logger.info("Aligning lane %s with %s aligner" % (data["rgnames"]["lane"], aligner))
        data = align_to_sort_bam(fastq1, fastq2, aligner, data)
        data = _add_supplemental_bams(data)
    elif fastq1 and objectstore.file_exists_or_remote(fastq1) and fastq1.endswith(".bam"):
        sort_method = config["algorithm"].get("bam_sort")
        bamclean = config["algorithm"].get("bam_clean")
        if bamclean is True or bamclean == "picard":
            if sort_method and sort_method != "coordinate":
                raise ValueError("Cannot specify `bam_clean: picard` with `bam_sort` other than coordinate: %s"
                                 % sort_method)
            out_bam = cleanbam.picard_prep(fastq1, data["rgnames"], dd.get_ref_file(data), data["dirs"],
                                           data)
        elif bamclean == "fixrg":
            out_bam = cleanbam.fixrg(fastq1, data["rgnames"], dd.get_ref_file(data), data["dirs"], data)
        elif sort_method:
            runner = broad.runner_from_path("picard", config)
            out_file = os.path.join(data["dirs"]["work"], "{}-sort.bam".format(
                os.path.splitext(os.path.basename(fastq1))[0]))
            out_bam = runner.run_fn("picard_sort", fastq1, sort_method, out_file)
        else:
            out_bam = link_bam_file(fastq1, os.path.join(data["dirs"]["work"], "prealign",
                                                         data["rgnames"]["sample"]))
        bam.index(out_bam, data["config"])
        bam.check_header(out_bam, data["rgnames"], dd.get_ref_file(data), data["config"])
        dedup_bam = postalign.dedup_bam(out_bam, data)
        bam.index(dedup_bam, data["config"])
        data["work_bam"] = dedup_bam
    elif fastq1 and objectstore.file_exists_or_remote(fastq1) and fastq1.endswith(".cram"):
        data["work_bam"] = fastq1
    elif fastq1 is None and "vrn_file" in data:
        data["config"]["algorithm"]["variantcaller"] = False
        data["work_bam"] = None
    elif not fastq1:
        raise ValueError("No 'files' specified for input sample: %s" % dd.get_sample_name(data))
    else:
        raise ValueError("Could not process input file from sample configuration. \n" +
                         fastq1 +
                         "\nIs the path to the file correct or is empty?\n" +
                         "If it is a fastq file (not pre-aligned BAM or CRAM), "
                         "is an aligner specified in the input configuration?")
    if data.get("work_bam"):
        # Add stable 'align_bam' target to use for retrieving raw alignment
        data["align_bam"] = data["work_bam"]
        data = _add_hla_files(data)
    return [[data]]
Example 14
def get_ref_bedtool(ref_file, config, chrom=None):
    """Retrieve a pybedtool BedTool object with reference sizes from input reference.
    """
    broad_runner = broad.runner_from_path("picard", config)
    ref_dict = broad_runner.run_fn("picard_index_ref", ref_file)
    ref_lines = []
    with contextlib.closing(pysam.Samfile(ref_dict, "r")) as ref_sam:
        for sq in ref_sam.header["SQ"]:
            if not chrom or sq["SN"] == chrom:
                ref_lines.append("%s\t%s\t%s" % (sq["SN"], 0, sq["LN"]))
    return pybedtools.BedTool("\n".join(ref_lines), from_string=True)
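
A hedged usage sketch: the returned BedTool iterates as ordinary intervals, so whole-genome and single-contig BEDs fall out directly (the reference path is a placeholder):

ref_bed = get_ref_bedtool("/refs/GRCh37.fa", config)        # all contigs
chr1_bed = get_ref_bedtool("/refs/GRCh37.fa", config, "1")  # one contig only
for iv in chr1_bed:
    print(iv.chrom, iv.start, iv.end)  # e.g. "1 0 249250621" for GRCh37 chr1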
Example 15
def get_ref_bedtool(ref_file, config, chrom=None):
    """Retrieve a pybedtool BedTool object with reference sizes from input reference.
    """
    broad_runner = broad.runner_from_path("picard", config)
    ref_dict = broad_runner.run_fn("picard_index_ref", ref_file)
    ref_lines = []
    with pysam.Samfile(ref_dict, "r") as ref_sam:
        for sq in ref_sam.header["SQ"]:
            if not chrom or sq["SN"] == chrom:
                ref_lines.append("%s\t%s\t%s" % (sq["SN"], 0, sq["LN"]))
    return pybedtools.BedTool("\n".join(ref_lines), from_string=True)
Example 16
def picard_prep(in_bam, names, ref_file, dirs, data):
    """Prepare input BAM using Picard and GATK cleaning tools.

    - ReorderSam to reorder file to reference
    - AddOrReplaceReadGroups to add read group information and coordinate sort
    - PrintReads with filters to remove problem records:
      - filterMBQ to remove reads with mismatching bases and base qualities
    """
    runner = broad.runner_from_path("picard", data["config"])
    work_dir = utils.safe_makedir(os.path.join(dirs["work"], "bamclean", names["sample"]))
    runner.run_fn("picard_index_ref", ref_file)
    reorder_bam = os.path.join(work_dir, "%s-reorder.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    reorder_bam = runner.run_fn("picard_reorder", in_bam, ref_file, reorder_bam)
    rg_bam = runner.run_fn("picard_fix_rgs", reorder_bam, names)
    return _filter_bad_reads(rg_bam, ref_file, data)
Example 17
def _shared_gatk_call_prep(align_bams, items, ref_file, dbsnp, cosmic, region, out_file):
    """Shared preparation work for GATK variant calling.
    """
    data = items[0]
    config = data["config"]
    broad_runner = broad.runner_from_path("picard", config)
    broad_runner.run_fn("picard_index_ref", ref_file)
    for x in align_bams:
        bam.index(x, config)
    params = ["-R", ref_file]
    coverage_depth_min = tz.get_in(["algorithm", "coverage_depth_min"], config)
    if coverage_depth_min and coverage_depth_min < 4:
        confidence = "4.0"
        params += [
            "--standard_min_confidence_threshold_for_calling",
            confidence,
            "--standard_min_confidence_threshold_for_emitting",
            confidence,
        ]
    for a in annotation.get_gatk_annotations(config):
        params += ["--annotation", a]

    paired = vcfutils.get_paired_bams(align_bams, items)
    if not paired:
        raise ValueError(
            "Specified MuTect calling but 'tumor' phenotype not present in batch\n"
            "https://bcbio-nextgen.readthedocs.org/en/latest/contents/"
            "pipelines.html#cancer-variant-calling\n"
            "for samples: %s" % ", ".join([dd.get_sample_name(x) for x in items])
        )
    params += ["-I:tumor", paired.tumor_bam]
    if paired.normal_bam is not None:
        params += ["-I:normal", paired.normal_bam]
    if paired.normal_panel is not None:
        params += ["--normal_panel", paired.normal_panel]
    if dbsnp:
        params += ["--dbsnp", dbsnp]
    if cosmic:
        params += ["--cosmic", cosmic]
    variant_regions = tz.get_in(["algorithm", "variant_regions"], config)
    region = subset_variant_regions(variant_regions, region, out_file, items)
    if region:
        params += ["-L", bamprep.region_to_gatk(region), "--interval_set_rule", "INTERSECTION"]
    broad_runner = broad.runner_from_config(config)  # switch from the Picard runner to the GATK runner used for calling
    return broad_runner, params
Example 18
def ref_file_from_bam(bam_file, data):
    """Subset a fasta input file to only a fraction of input contigs.
    """
    new_ref = os.path.join(utils.safe_makedir(os.path.join(dd.get_work_dir(data), "inputs", "ref")),
                           "%s-subset.fa" % dd.get_genome_build(data))
    if not utils.file_exists(new_ref):
        with file_transaction(data, new_ref) as tx_out_file:
            contig_file = "%s-contigs.txt" % utils.splitext_plus(new_ref)[0]
            with open(contig_file, "w") as out_handle:
                for contig in [x.contig for x in idxstats(bam_file, data) if x.contig != "*"]:
                    out_handle.write("%s\n" % contig)
            cmd = "seqtk subseq -l 100 %s %s > %s" % (dd.get_ref_file(data), contig_file, tx_out_file)
            do.run(cmd, "Subset %s to BAM file contigs" % dd.get_genome_build(data))
    ref.fasta_idx(new_ref, data["config"])
    runner = broad.runner_from_path("picard", data["config"])
    runner.run_fn("picard_index_ref", new_ref)
    return {"base": new_ref}
Example 19
def _convert_bam_to_fastq(in_file, work_dir, data, dirs, config):
    """Convert BAM input file into FASTQ files.
    """
    out_dir = safe_makedir(os.path.join(work_dir, "fastq_convert"))
    out_files = [os.path.join(out_dir, "{0}_{1}.fastq".format(
                 os.path.splitext(os.path.basename(in_file))[0], x))
                 for x in ["1", "2"]]
    if bam.is_paired(in_file):
        out1, out2 = out_files
    else:
        out1 = out_files[0]
        out2 = None
    if not file_exists(out1):
        broad_runner = broad.runner_from_path("picard", config)
        broad_runner.run_fn("picard_bam_to_fastq", in_file, out1, out2)
    if out2 and os.path.getsize(out2) == 0:
        out2 = None
    return [out1, out2]
Example 20
def _fix_unmapped(unmapped_file, config, names):
    """
    the unmapped.bam file from Tophat 2.0.9 is missing some things
    1) the RG tag is missing from the reads
    2) MAPQ is set to 255 instead of 0
    3) for reads where both are unmapped, the mate_is_unmapped flag is not set correctly
    """
    out_file = os.path.splitext(unmapped_file)[0] + "_fixed.bam"
    if file_exists(out_file):
        return out_file
    picard = broad.runner_from_path("picard", config)
    rg_fixed = picard.run_fn("picard_fix_rgs", unmapped_file, names)
    fixed = bam.sort(rg_fixed, config, "queryname")
    with closing(pysam.Samfile(fixed)) as work_sam:
        with file_transaction(config, out_file) as tx_out_file:
            tx_out = pysam.Samfile(tx_out_file, "wb", template=work_sam)
            for read1 in work_sam:
                if not read1.is_paired:
                    if read1.is_unmapped:
                        read1.mapq = 0
                    tx_out.write(read1)
                    continue
                read2 = next(work_sam)  # use next(); the Python 2-only .next() method is gone in Python 3
                if read1.qname != read2.qname:
                    continue
                if read1.is_unmapped and not read2.is_unmapped:
                    read1.mapq = 0
                    read1.tid = read2.tid
                if not read1.is_unmapped and read2.is_unmapped:
                    read2.mapq = 0
                    read2.tid = read1.tid
                if read1.is_unmapped and read2.is_unmapped:
                    read1.mapq = 0
                    read2.mapq = 0
                    read1.mate_is_unmapped = True
                    read2.mate_is_unmapped = True
                tx_out.write(read1)
                tx_out.write(read2)
            tx_out.close()

    return out_file
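
The pair-walking loop above only works because bam.sort(rg_fixed, config, "queryname") places mates on adjacent records. A hedged standalone check of that invariant, using modern pysam attribute names:

def assert_mates_adjacent(reads):
    # After a queryname sort, each paired read is immediately followed by its mate.
    it = iter(reads)
    for read1 in it:
        if not read1.is_paired:
            continue
        read2 = next(it)
        assert read1.query_name == read2.query_name, "input is not queryname-sorted"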
Example 21
def _add_rg(unmapped_file, config, names):
    """Add the missing RG header."""
    picard = broad.runner_from_path("picard", config)
    rg_fixed = picard.run_fn("picard_fix_rgs", unmapped_file, names)
    return rg_fixed
Example 22
def tophat_align(fastq_file, pair_file, ref_file, out_base, align_dir, data,
                 names=None):
    """
    Run alignment using Tophat v2.
    """
    config = data["config"]
    options = get_in(config, ("resources", "tophat", "options"), {})
    options = _set_fusion_mode(options, config)
    options = _set_quality_flag(options, data)
    options = _set_transcriptome_option(options, data, ref_file)
    options = _set_cores(options, config)
    options = _set_rg_options(options, names)
    options = _set_stranded_flag(options, config)

    ref_file, runner = _determine_aligner_and_reference(ref_file, config)

    # fusion search does not work properly with Bowtie2
    if options.get("fusion-search", False):
        ref_file = ref_file.replace("/bowtie2", "/bowtie")

    if _tophat_major_version(config) == 1:
        raise NotImplementedError("Tophat versions < 2.0 are not supported, please "
                                  "download the newest version of Tophat here: "
                                  "http://tophat.cbcb.umd.edu")

    if _ref_version(ref_file) == 1 or options.get("fusion-search", False):
        options["bowtie1"] = True

    out_dir = os.path.join(align_dir, "%s_tophat" % out_base)
    final_out = os.path.join(out_dir, "{0}.bam".format(names["sample"]))
    if file_exists(final_out):
        return final_out

    out_file = os.path.join(out_dir, "accepted_hits.sam")
    unmapped = os.path.join(out_dir, "unmapped.bam")
    files = [ref_file, fastq_file]
    if not file_exists(out_file):
        with file_transaction(config, out_dir) as tx_out_dir:
            safe_makedir(tx_out_dir)
            if pair_file and not options.get("mate-inner-dist", None):
                d, d_stdev = _estimate_paired_innerdist(fastq_file, pair_file,
                                                        ref_file, out_base,
                                                        tx_out_dir, data)
                options["mate-inner-dist"] = d
                options["mate-std-dev"] = d_stdev
                files.append(pair_file)
            options["output-dir"] = tx_out_dir
            options["no-convert-bam"] = True
            options["no-coverage-search"] = True
            options["no-mixed"] = True
            tophat_runner = sh.Command(config_utils.get_program("tophat",
                                                                config))
            ready_options = {}
            for k, v in options.items():  # items() replaces the Python 2-only iteritems()
                ready_options[k.replace("-", "_")] = v
            # tophat requires options before arguments,
            # otherwise it silently ignores them
            tophat_ready = tophat_runner.bake(**ready_options)
            cmd = "%s %s" % (sys.executable, str(tophat_ready.bake(*files)))
            do.run(cmd, "Running Tophat on %s and %s." % (fastq_file, pair_file), None)
    if pair_file and _has_alignments(out_file):
        fixed = _fix_mates(out_file, os.path.join(out_dir, "%s-align.sam" % out_base),
                           ref_file, config)
    else:
        fixed = out_file
    fixed = merge_unmapped(fixed, unmapped, config)
    fixed = _fix_unmapped(fixed, config, names)
    fixed = bam.sort(fixed, config)
    picard = broad.runner_from_path("picard", config)
    # set the contig order to match the reference file so GATK works
    fixed = picard.run_fn("picard_reorder", fixed, data["sam_ref"],
                          os.path.splitext(fixed)[0] + ".picard.bam")
    fixed = fix_insert_size(fixed, config)
    if not file_exists(final_out):
        symlink_plus(fixed, final_out)
    return final_out
Example 23
def process_alignment(data, alt_input=None):
    """Do an alignment of fastq files, preparing a sorted BAM output file.
    """
    data = cwlutils.normalize_missing(utils.to_single_data(data))
    data = cwlutils.unpack_tarballs(data, data)
    fastq1, fastq2 = dd.get_input_sequence_files(data)
    if alt_input:
        fastq1, fastq2 = alt_input
    config = data["config"]
    aligner = config["algorithm"].get("aligner", None)
    if fastq1 and objectstore.file_exists_or_remote(fastq1) and aligner:
        logger.info("Aligning lane %s with %s aligner" %
                    (data["rgnames"]["lane"], aligner))
        data = align_to_sort_bam(fastq1, fastq2, aligner, data)
        if dd.get_correct_umis(data):
            data["work_bam"] = postalign.correct_umis(data)
        if dd.get_umi_consensus(data):
            data["umi_bam"] = dd.get_work_bam(data)
            if fastq2:
                f1, f2, avg_cov = postalign.umi_consensus(data)
                data["config"]["algorithm"]["rawumi_avg_cov"] = avg_cov
                del data["config"]["algorithm"]["umi_type"]
                data["config"]["algorithm"]["mark_duplicates"] = False
                data = align_to_sort_bam(f1, f2, aligner, data)
            else:
                raise ValueError(
                    "Single fastq input for UMI processing; fgbio needs paired reads: %s"
                    % dd.get_sample_name(data))
        data = _add_supplemental_bams(data)
    elif fastq1 and objectstore.file_exists_or_remote(
            fastq1) and fastq1.endswith(".bam"):
        sort_method = config["algorithm"].get("bam_sort")
        bamclean = config["algorithm"].get("bam_clean")
        if bamclean is True or bamclean == "picard":
            if sort_method and sort_method != "coordinate":
                raise ValueError(
                    "Cannot specify `bam_clean: picard` with `bam_sort` other than coordinate: %s"
                    % sort_method)
            ref_file = dd.get_ref_file(data)
            out_bam = cleanbam.picard_prep(fastq1, data["rgnames"], ref_file,
                                           data["dirs"], data)
        elif bamclean == "fixrg":
            out_bam = cleanbam.fixrg(fastq1, data["rgnames"],
                                     dd.get_ref_file(data), data["dirs"], data)
        elif bamclean == "remove_extracontigs":
            out_bam = cleanbam.remove_extracontigs(fastq1, data)
        elif sort_method:
            runner = broad.runner_from_path("picard", config)
            out_file = os.path.join(
                data["dirs"]["work"], "{}-sort.bam".format(
                    os.path.splitext(os.path.basename(fastq1))[0]))
            if not utils.file_exists(out_file):
                work_dir = utils.safe_makedir(
                    os.path.join(dd.get_work_dir(data), "bamclean",
                                 dd.get_sample_name(data)))
                out_file = os.path.join(
                    work_dir, "{}-sort.bam".format(dd.get_sample_name(data)))
            out_bam = runner.run_fn("picard_sort", fastq1, sort_method,
                                    out_file)
        else:
            out_bam = _link_bam_file(
                fastq1,
                os.path.join(dd.get_work_dir(data), "prealign",
                             dd.get_sample_name(data)), data)
        bam.index(out_bam, data["config"])
        bam.check_header(out_bam, data["rgnames"], dd.get_ref_file(data),
                         data["config"])
        dedup_bam = postalign.dedup_bam(out_bam, data)
        bam.index(dedup_bam, data["config"])
        data["work_bam"] = dedup_bam
    elif fastq1 and objectstore.file_exists_or_remote(
            fastq1) and fastq1.endswith(".cram"):
        data["work_bam"] = fastq1
    elif fastq1 is None and not dd.get_aligner(data):
        data["config"]["algorithm"]["variantcaller"] = False
        data["work_bam"] = None
    elif not fastq1:
        raise ValueError("No 'files' specified for input sample: %s" %
                         dd.get_sample_name(data))
    elif "kraken" in config["algorithm"]:  # kraken doesn's need bam
        pass
    else:
        raise ValueError(
            "Could not process input file from sample configuration. \n" +
            fastq1 + "\nIs the path to the file correct or is empty?\n" +
            "If it is a fastq file (not pre-aligned BAM or CRAM), "
            "is an aligner specified in the input configuration?")
    if data.get("work_bam"):
        # Add stable 'align_bam' target to use for retrieving raw alignment
        data["align_bam"] = data["work_bam"]
        data = _add_hla_files(data)
    return [[data]]
Example 24
def tophat_align(fastq_file,
                 pair_file,
                 ref_file,
                 out_base,
                 align_dir,
                 data,
                 names=None):
    """
    Run alignment using Tophat v2.
    """
    config = data["config"]
    options = get_in(config, ("resources", "tophat", "options"), {})
    options = _set_fusion_mode(options, config)
    options = _set_quality_flag(options, data)
    options = _set_transcriptome_option(options, data, ref_file)
    options = _set_cores(options, config)
    options = _set_rg_options(options, names)
    options = _set_stranded_flag(options, config)

    ref_file, runner = _determine_aligner_and_reference(ref_file, config)

    # fusion search does not work properly with Bowtie2
    if options.get("fusion-search", False):
        ref_file = ref_file.replace("/bowtie2", "/bowtie")

    if _tophat_major_version(config) == 1:
        raise NotImplementedError(
            "Tophat versions < 2.0 are not supported, please "
            "download the newest version of Tophat here: "
            "http://tophat.cbcb.umd.edu")

    if _ref_version(ref_file) == 1 or options.get("fusion-search", False):
        options["bowtie1"] = True

    out_dir = os.path.join(align_dir, "%s_tophat" % out_base)
    final_out = os.path.join(out_dir, "{0}.bam".format(names["sample"]))
    if file_exists(final_out):
        return final_out

    out_file = os.path.join(out_dir, "accepted_hits.bam")
    unmapped = os.path.join(out_dir, "unmapped.bam")
    files = [ref_file, fastq_file]
    if not file_exists(out_file):
        with file_transaction(config, out_dir) as tx_out_dir:
            safe_makedir(tx_out_dir)
            if pair_file and not options.get("mate-inner-dist", None):
                d, d_stdev = _estimate_paired_innerdist(
                    fastq_file, pair_file, ref_file, out_base, tx_out_dir,
                    data)
                options["mate-inner-dist"] = d
                options["mate-std-dev"] = d_stdev
                files.append(pair_file)
            options["output-dir"] = tx_out_dir
            options["no-coverage-search"] = True
            options["no-mixed"] = True
            cmd = [sys.executable, config_utils.get_program("tophat", config)]
            for k, v in options.items():
                if v is True:
                    cmd.append("--%s" % k)
                else:
                    assert not isinstance(v, bool)
                    cmd.append("--%s=%s" % (k, v))
            # tophat requires options before arguments, otherwise it silently ignores them
            cmd += files
            do.run(cmd,
                   "Running Tophat on %s and %s." % (fastq_file, pair_file))
    if pair_file and _has_alignments(out_file):
        fixed = _fix_mates(out_file,
                           os.path.join(out_dir, "%s-align.bam" % out_base),
                           ref_file, config)
    else:
        fixed = out_file
    fixed_unmapped = _fix_unmapped(fixed, unmapped, data)
    fixed = merge_unmapped(fixed, fixed_unmapped, config)
    fixed = _add_rg(fixed, config, names)
    fixed = bam.sort(fixed, config)
    picard = broad.runner_from_path("picard", config)
    # set the contig order to match the reference file so GATK works
    fixed = picard.run_fn("picard_reorder", fixed, data["sam_ref"],
                          os.path.splitext(fixed)[0] + ".picard.bam")
    fixed = fix_insert_size(fixed, config)
    if not file_exists(final_out):
        symlink_plus(fixed, final_out)
    return final_out
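
The option handling in this version flattens a dict into argv entries, with True meaning a bare flag. A hedged standalone version of that conversion (the function name is mine):

def options_to_argv(options):
    # True values become bare flags; all other values become --key=value pairs.
    argv = []
    for k, v in options.items():
        if v is True:
            argv.append("--%s" % k)
        else:
            assert not isinstance(v, bool), "omit False flags instead of passing them"
            argv.append("--%s=%s" % (k, v))
    return argv

# options_to_argv({"no-mixed": True, "mate-inner-dist": 200})
# -> ["--no-mixed", "--mate-inner-dist=200"]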