Example #1
def get_qc_tools(data):
    """Retrieve a list of QC tools to use based on configuration and analysis type.

    Uses previously set values if available, otherwise builds defaults.
    """
    if dd.get_algorithm_qc(data):
        return dd.get_algorithm_qc(data)
    analysis = data["analysis"].lower()
    to_run = []
    if "fastqc" not in dd.get_tools_off(data):
        to_run.append("fastqc")
    if any([tool in dd.get_tools_on(data)
            for tool in ["qualimap", "qualimap_full"]]):
        to_run.append("qualimap")
    if analysis.startswith("rna-seq"):
        if "qualimap" not in dd.get_tools_off(data):
            if gtf.is_qualimap_compatible(dd.get_gtf_file(data)):
                to_run.append("qualimap_rnaseq")
            else:
                logger.debug("GTF not compatible with Qualimap, skipping.")
    if analysis.startswith("smallrna-seq"):
        to_run.append("small-rna")
    if not analysis.startswith("smallrna-seq"):
        to_run.append("samtools")
        if tz.get_in(["config", "algorithm", "kraken"], data):
            to_run.append("kraken")
    if analysis.startswith(("standard", "variant", "variant2")):
        to_run += ["qsignature", "coverage", "variants", "picard"]
        if vcfutils.get_paired([data]):
            to_run += ["viral"]
        if damage.should_filter([data]):
            to_run += ["damage"]
    if dd.get_umi_consensus(data):
        to_run += ["umi"]
    return to_run
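
`get_qc_tools` gates each QC step on the `tools_off`/`tools_on` lists that bcbio's `dd` accessors read from the sample configuration. Below is a minimal, self-contained sketch of that toggle pattern; the `get_tools_off`/`get_tools_on` helpers are hypothetical stand-ins that assume the lists live under `config["algorithm"]`, not bcbio's actual implementation.

# Minimal sketch of the tools_on/tools_off gating used by get_qc_tools.
# get_tools_off/get_tools_on are hypothetical stand-ins for bcbio's dd
# accessors, assuming the lists live under config["algorithm"].
def get_tools_off(data):
    return data.get("config", {}).get("algorithm", {}).get("tools_off", [])

def get_tools_on(data):
    return data.get("config", {}).get("algorithm", {}).get("tools_on", [])

def pick_qc_tools(data):
    to_run = []
    if "fastqc" not in get_tools_off(data):      # on unless explicitly disabled
        to_run.append("fastqc")
    if any(tool in get_tools_on(data) for tool in ["qualimap", "qualimap_full"]):
        to_run.append("qualimap")                # off unless explicitly enabled
    return to_run

data = {"config": {"algorithm": {"tools_off": ["fastqc"], "tools_on": ["qualimap"]}}}
print(pick_qc_tools(data))  # ['qualimap']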
Example #2
def get_qc_tools(data):
    """Retrieve a list of QC tools to use based on configuration and analysis type.

    Uses previously set values if available, otherwise builds defaults.
    """
    if dd.get_algorithm_qc(data):
        return dd.get_algorithm_qc(data)
    analysis = data["analysis"].lower()
    to_run = []
    if tz.get_in(["config", "algorithm", "kraken"], data):
        to_run.append("kraken")
    if "fastqc" not in dd.get_tools_off(data):
        to_run.append("fastqc")
    if any([
            tool in dd.get_tools_on(data)
            for tool in ["qualimap", "qualimap_full"]
    ]):
        to_run.append("qualimap")
    if analysis.startswith("rna-seq") or analysis == "smallrna-seq":
        if "qualimap" not in dd.get_tools_off(data):
            if gtf.is_qualimap_compatible(dd.get_gtf_file(data)):
                to_run.append("qualimap_rnaseq")
            else:
                logger.debug("GTF not compatible with Qualimap, skipping.")
    if analysis.startswith("chip-seq"):
        to_run.append("chipqc")
        if dd.get_chip_method(data) == "atac":
            to_run.append("atac")
    if analysis.startswith("smallrna-seq"):
        to_run.append("small-rna")
        to_run.append("atropos")
    if "coverage_qc" not in dd.get_tools_off(data):
        to_run.append("samtools")
    if dd.has_variantcalls(data):
        if "coverage_qc" not in dd.get_tools_off(data):
            to_run += ["coverage", "picard"]
        to_run += ["qsignature", "variants"]
        if vcfanno.is_human(data):
            to_run += ["contamination", "peddy"]
        if vcfutils.get_paired_phenotype(data):
            to_run += ["viral"]
        if damage.should_filter([data]):
            to_run += ["damage"]
    if dd.get_umi_consensus(data):
        to_run += ["umi"]
    if tz.get_in(["config", "algorithm", "preseq"], data):
        to_run.append("preseq")
    to_run = [tool for tool in to_run if tool not in dd.get_tools_off(data)]
    to_run.sort()
    return to_run
Example #3
def get_qc_tools(data):
    """Retrieve a list of QC tools to use based on configuration and analysis type.

    Uses previously set values if available, otherwise builds defaults.
    """
    if dd.get_algorithm_qc(data):
        return dd.get_algorithm_qc(data)
    analysis = data["analysis"].lower()
    to_run = []
    if "fastqc" not in dd.get_tools_off(data):
        to_run.append("fastqc")
    if any([tool in dd.get_tools_on(data)
            for tool in ["qualimap", "qualimap_full"]]):
        to_run.append("qualimap")
    if analysis.startswith("rna-seq"):
        if gtf.is_qualimap_compatible(dd.get_gtf_file(data)):
            to_run.append("qualimap_rnaseq")
    if not analysis.startswith("smallrna-seq"):
        to_run.append("samtools")
        to_run.append("gemini")
        if tz.get_in(["config", "algorithm", "kraken"], data):
            to_run.append("kraken")
    if analysis.startswith(("standard", "variant", "variant2")):
        to_run += ["qsignature", "coverage", "variants", "picard"]
    return to_run
Example #4
def get_qc_tools(data):
    """Retrieve a list of QC tools to use based on configuration and analysis type.

    Uses previously set values if available, otherwise builds defaults.
    """
    if dd.get_algorithm_qc(data):
        return dd.get_algorithm_qc(data)
    analysis = data["analysis"].lower()
    to_run = []
    if "fastqc" not in dd.get_tools_off(data):
        to_run.append("fastqc")
    if any([
            tool in dd.get_tools_on(data)
            for tool in ["qualimap", "qualimap_full"]
    ]):
        to_run.append("qualimap")
    if analysis.startswith("rna-seq"):
        if gtf.is_qualimap_compatible(dd.get_gtf_file(data)):
            to_run.append("qualimap_rnaseq")
        else:
            logger.debug("GTF not compatible with Qualimap, skipping.")
    if analysis.startswith("smallrna-seq"):
        to_run.append("small-rna")
    if not analysis.startswith("smallrna-seq"):
        to_run.append("samtools")
        to_run.append("gemini")
        if tz.get_in(["config", "algorithm", "kraken"], data):
            to_run.append("kraken")
    if analysis.startswith(("standard", "variant", "variant2")):
        to_run += ["qsignature", "coverage", "variants", "picard"]
    return to_run
Example #5
def run_cluster(*data):
    """
    Run seqcluster cluster to detect smallRNA clusters
    """
    sample = data[0][0]
    tools_off = dd.get_tools_off(data[0][0])
    work_dir = dd.get_work_dir(sample)
    out_dir = op.join(work_dir, "seqcluster", "cluster")
    out_dir = op.abspath(safe_makedir(out_dir))
    prepare_dir = op.join(work_dir, "seqcluster", "prepare")
    bam_file = op.join(work_dir, "align", "seqs.bam")
    if "seqcluster" not in tools_off:
        cluster_dir = _cluster(bam_file, prepare_dir, out_dir, dd.get_ref_file(sample), dd.get_srna_gtf_file(sample))
        sample["report"] = _report(sample, dd.get_ref_file(sample))
        sample["seqcluster"] = out_dir

    out_mirna = _make_isomir_counts(data, out_dir=op.join(work_dir, "mirbase"))
    if out_mirna:
        sample = dd.set_mirna_counts(sample, out_mirna[0])
        sample = dd.set_isomir_counts(sample, out_mirna[1])

    out_novel = _make_isomir_counts(data, "seqbuster_novel", op.join(work_dir, "mirdeep2"), "_novel")
    novel_db = mirdeep.run(data)
    if out_novel:
        sample = dd.set_novel_mirna_counts(sample, out_novel[0])
        sample = dd.set_novel_isomir_counts(sample, out_novel[1])
    data[0][0] = sample
    return data
Example #6
def gatk_filter_rnaseq(vrn_file, data):
    """
    This incorporates the filters listed here, dropping clusters of variants
    within a 35-nucleotide window, high Fisher strand (FS) values and low
    quality by depth (QD):
    https://software.broadinstitute.org/gatk/guide/article?id=3891
    java -jar GenomeAnalysisTK.jar -T VariantFiltration -R hg_19.fasta -V
    input.vcf -window 35 -cluster 3 -filterName FS -filter "FS > 30.0"
    -filterName QD -filter "QD < 2.0" -o output.vcf
    """
    out_file = "%s-filter%s" % utils.splitext_plus(vrn_file)
    if not file_exists(out_file):
        ref_file = dd.get_ref_file(data)
        with file_transaction(data, out_file) as tx_out_file:
            params = ["VariantFiltration",
                      "-R", ref_file,
                      "-V", vrn_file,
                      "--cluster-window-size", "35",
                      "--cluster-size", "3",
                      "--filter-expression", "'FS > 30.0'",
                      "--filter-name", "FS",
                      "--filter-expression", "'QD < 2.0'",
                      "--filter-name", "QD",
                      "--output", tx_out_file]
            # Use GATK4 for filtering, tools_off is for variant calling
            config = utils.deepish_copy(dd.get_config(data))
            if "gatk4" in dd.get_tools_off({"config": config}):
                config["algorithm"]["tools_off"].remove("gatk4")
            jvm_opts = broad.get_gatk_opts(config, os.path.dirname(tx_out_file))
            do.run(broad.gatk_cmd("gatk", jvm_opts, params, config), "Filter RNA-seq variants.")
    return out_file
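
The config-copying idiom above deserves a note: `tools_off: [gatk4]` is meant to pin variant calling to GATK3, so steps that always want GATK4 copy the config and strip the flag locally rather than mutating shared state. Here is a minimal sketch of that idiom, with `copy.deepcopy` standing in for `utils.deepish_copy` and the plain-dict config layout assumed from the snippets.

# Sketch of the "re-enable gatk4 for this step" idiom from gatk_filter_rnaseq.
# copy.deepcopy stands in for utils.deepish_copy; the config layout
# ({"algorithm": {"tools_off": [...]}}) is assumed from the snippets above.
import copy

def config_with_gatk4(config):
    config = copy.deepcopy(config)  # never mutate the shared config
    tools_off = config.get("algorithm", {}).get("tools_off", [])
    if "gatk4" in tools_off:
        tools_off.remove("gatk4")
    return config

orig = {"algorithm": {"tools_off": ["gatk4", "seqcluster"]}}
local = config_with_gatk4(orig)
print(local["algorithm"]["tools_off"])  # ['seqcluster']
print(orig["algorithm"]["tools_off"])   # ['gatk4', 'seqcluster'] -- unchanged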
Example #7
def _prep_grabix_indexes(in_files, dirs, data):
    if _is_bam_input(in_files):
        out = _bgzip_from_bam(in_files[0], dirs, data["config"])
    elif _is_cram_input(in_files):
        out = _bgzip_from_cram(in_files[0], dirs, data)
    elif _ready_gzip_fastq(in_files, data):
        out = in_files
    else:
        inputs = [{
            "in_file": x,
            "dirs": dirs,
            "config": data["config"],
            "rgnames": data["rgnames"]
        } for x in in_files if x]
        if "pbgzip" not in dd.get_tools_off(data):
            out = [_bgzip_from_fastq(d) for d in inputs]
        else:
            out = run_multicore(_bgzip_from_fastq_parallel,
                                [[d] for d in inputs], data["config"])
    items = [[{
        "bgzip_file": x,
        "config": copy.deepcopy(data["config"])
    }] for x in out if x]
    run_multicore(_grabix_index, items, data["config"])
    return out
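
Note the dispatch in the fastq branch: when parallel bgzip is enabled (`pbgzip` not in `tools_off`), each file is compressed one at a time, presumably because the bgzip call is itself multi-threaded; when it is disabled, the single-threaded jobs are fanned out across worker processes via `run_multicore`. A self-contained sketch of that trade-off, using `multiprocessing.Pool` as a stand-in for `run_multicore` and a placeholder compression step:

# Sketch of the serial-vs-parallel dispatch in _prep_grabix_indexes. When
# multi-threaded bgzip is available, files are compressed one after another;
# otherwise single-threaded jobs are fanned out across processes.
# multiprocessing.Pool is a stand-in for bcbio's run_multicore helper and
# bgzip_one is a placeholder for the real compression call.
from multiprocessing import Pool

def bgzip_one(job):
    return "%s.gz" % job["in_file"]  # placeholder: would run bgzip here

def bgzip_all(jobs, pbgzip_enabled, cores=4):
    if pbgzip_enabled:
        return [bgzip_one(j) for j in jobs]  # each call can use many threads
    with Pool(cores) as pool:                # parallelize single-threaded jobs
        return pool.map(bgzip_one, jobs)

if __name__ == "__main__":
    jobs = [{"in_file": name} for name in ["a.fastq", "b.fastq"]]
    print(bgzip_all(jobs, pbgzip_enabled=True))  # ['a.fastq.gz', 'b.fastq.gz']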
Example #8
def run_cluster(*data):
    """
    Run seqcluster cluster to detect smallRNA clusters
    """
    sample = data[0][0]
    tools_off = dd.get_tools_off(data[0][0])
    work_dir = dd.get_work_dir(sample)
    out_dir = op.join(work_dir, "seqcluster", "cluster")
    out_dir = op.abspath(safe_makedir(out_dir))
    prepare_dir = op.join(work_dir, "seqcluster", "prepare")
    bam_file = op.join(work_dir, "align", "seqs.bam")
    if "seqcluster" not in tools_off:
        cluster_dir = _cluster(bam_file, prepare_dir, out_dir,
                               dd.get_ref_file(sample),
                               dd.get_srna_gtf_file(sample))
        sample["report"] = _report(sample, dd.get_ref_file(sample))
        sample["seqcluster"] = out_dir

    out_mirna = _make_isomir_counts(data, out_dir=op.join(work_dir, "mirbase"))
    if out_mirna:
        sample = dd.set_mirna_counts(sample, out_mirna[0])
        sample = dd.set_isomir_counts(sample, out_mirna[1])

    out_novel = _make_isomir_counts(data, "seqbuster_novel",
                                    op.join(work_dir, "mirdeep2"), "_novel")
    novel_db = mirdeep.run(data)
    if out_novel:
        sample = dd.set_novel_mirna_counts(sample, out_novel[0])
        sample = dd.set_novel_isomir_counts(sample, out_novel[1])
    data[0][0] = sample
    return data
Example #9
def gatk_filter_rnaseq(vrn_file, data):
    """
    This incorporates the filters listed here, dropping clusters of variants
    within a 35-nucleotide window, high Fisher strand (FS) values and low
    quality by depth (QD):
    https://software.broadinstitute.org/gatk/guide/article?id=3891
    java -jar GenomeAnalysisTK.jar -T VariantFiltration -R hg_19.fasta -V
    input.vcf -window 35 -cluster 3 -filterName FS -filter "FS > 30.0"
    -filterName QD -filter "QD < 2.0" -o output.vcf
    """
    out_file = "%s-filter%s" % utils.splitext_plus(vrn_file)
    if not file_exists(out_file):
        ref_file = dd.get_ref_file(data)
        with file_transaction(data, out_file) as tx_out_file:
            params = [
                "VariantFiltration", "-R", ref_file, "-V", vrn_file,
                "--cluster-window-size", "35", "--cluster-size", "3",
                "--filter-expression", "'FS > 30.0'", "--filter-name", "FS",
                "--filter-expression", "'QD < 2.0'", "--filter-name", "QD",
                "--output", tx_out_file
            ]
            # Use GATK4 for filtering, tools_off is for variant calling
            config = utils.deepish_copy(dd.get_config(data))
            if "gatk4" in dd.get_tools_off({"config": config}):
                config["algorithm"]["tools_off"].remove("gatk4")
            jvm_opts = broad.get_gatk_opts(config,
                                           os.path.dirname(tx_out_file))
            do.run(broad.gatk_cmd("gatk", jvm_opts, params, config),
                   "Filter RNA-seq variants.")
    return out_file
Example #10
def _get_gatk_opts(config,
                   names,
                   tmp_dir=None,
                   memscale=None,
                   include_gatk=True,
                   parallel_gc=False):
    """Retrieve GATK memory specifications, moving down a list of potential specifications.
    """
    if include_gatk and "gatk4" in dd.get_tools_off({"config": config}):
        opts = [
            "-U", "LENIENT_VCF_PROCESSING", "--read_filter", "BadCigar",
            "--read_filter", "NotPrimaryAlignment"
        ]
    else:
        opts = []
    jvm_opts = ["-Xms750m", "-Xmx2g"]
    for n in names:
        resources = config_utils.get_resources(n, config)
        if resources and resources.get("jvm_opts"):
            jvm_opts = resources.get("jvm_opts")
            break
    if memscale:
        jvm_opts = config_utils.adjust_opts(
            jvm_opts, {"algorithm": {"memory_adjust": memscale}})
    jvm_opts += get_default_jvm_opts(tmp_dir, parallel_gc=parallel_gc)
    return jvm_opts + opts
Example #11
def collect_artifact_metrics(data):
    """Run CollectSequencingArtifacts to collect pre-adapter ligation artifact metrics
    https://gatk.broadinstitute.org/hc/en-us/articles/360037429491-CollectSequencingArtifactMetrics-Picard-
    """
    OUT_SUFFIXES = [
        ".bait_bias_detail_metrics", ".error_summary_metrics",
        ".pre_adapter_detail_metrics", ".pre_adapter_summary_metrics"
    ]
    broad_runner = broad.runner_from_config(dd.get_config(data))
    gatk_type = broad_runner.gatk_type()
    ref_file = dd.get_ref_file(data)
    bam_file = dd.get_work_bam(data)
    if not bam_file:
        return None
    if "collectsequencingartifacts" in dd.get_tools_off(data):
        return None
    out_dir = os.path.join(dd.get_work_dir(data), "metrics", "artifact",
                           dd.get_sample_name(data))
    utils.safe_makedir(out_dir)
    out_base = os.path.join(out_dir, dd.get_sample_name(data))
    out_files = [out_base + x for x in OUT_SUFFIXES]
    if all([utils.file_exists(x) for x in out_files]):
        return out_files
    with file_transaction(data, out_dir) as tx_out_dir:
        utils.safe_makedir(tx_out_dir)
        out_base = os.path.join(tx_out_dir, dd.get_sample_name(data))
        params = ["-T", "CollectSequencingArtifactMetrics",
                  "--VALIDATION_STRINGENCY", "SILENT",
                  "-R", ref_file, "-I", bam_file, "-O", out_base]
        broad_runner.run_gatk(params, log_error=False, parallel_gc=True)
    return out_files
Example #12
    def get_gatk_version(self):
        """Retrieve GATK version, handling locally and config cached versions.
        Calling version can be expensive due to all the startup and shutdown
        of JVMs, so we prefer cached version information.
        """
        if self._gatk_version is None:
            self._set_default_versions(self._config)

        if "gatk4" not in dd.get_tools_off({"config": self._config}):
            # In cases where we don't have manifest versions; not possible to
            # get the version from the command line with GATK4 alpha versions
            if self._gatk4_version is None:
                self._gatk4_version = "4.0"
            return self._gatk4_version
        elif self._gatk_version is not None:
            return self._gatk_version
        else:
            if self._has_gatk_conda_wrapper():
                gatk_jar = None
            else:
                gatk_jar = self._get_jar("GenomeAnalysisTK",
                                         ["GenomeAnalysisTKLite"],
                                         allow_missing=True)
            self._gatk_version = get_gatk_version(gatk_jar,
                                                  config=self._config)
            return self._gatk_version
Example #13
def collect_artifact_metrics(data):
    """Run CollectSequencingArtifacts to collect pre-adapter ligation artifact metrics
    https://gatk.broadinstitute.org/hc/en-us/articles/360037429491-CollectSequencingArtifactMetrics-Picard-
    Uses the picard wrapper rather than gatk, so it works for both gatk4 and gatk3 projects.
    TODO: refactor - move to broad/picardrun
    """
    OUT_SUFFIXES = [".bait_bias_detail_metrics", ".error_summary_metrics",
                    ".pre_adapter_detail_metrics", ".pre_adapter_summary_metrics"]
    picard = broad.runner_from_path("picard", dd.get_config(data))
    ref_file = dd.get_ref_file(data)
    bam_file = dd.get_work_bam(data)
    if not bam_file:
        return None
    if "collectsequencingartifacts" in dd.get_tools_off(data):
        return None
    out_dir = os.path.join(dd.get_work_dir(data), "metrics", "artifact", dd.get_sample_name(data))
    utils.safe_makedir(out_dir)
    out_base = os.path.join(out_dir, dd.get_sample_name(data))
    out_files = [out_base + x for x in OUT_SUFFIXES]
    if all([utils.file_exists(x) for x in out_files]):
        return out_files
    with file_transaction(data, out_dir) as tx_out_dir:
        utils.safe_makedir(tx_out_dir)
        out_base = os.path.join(tx_out_dir, dd.get_sample_name(data))
        params = [("-REFERENCE_SEQUENCE", ref_file),
                  ("-INPUT", bam_file),
                  ("-OUTPUT", out_base)]
        # picard runner sets VALIDATION_STRINGENCY
        picard.run("CollectSequencingArtifactMetrics", params)
    return out_files
Example #14
def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Perform piped alignment of fastq input files, generating sorted output BAM.
    """
    pair_file = pair_file if pair_file else ""
    # back compatible -- older files were named with lane information, use sample name now
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    if not utils.file_exists(out_file):
        out_file = os.path.join(align_dir, "{0}-sort.bam".format(dd.get_sample_name(data)))
    qual_format = data["config"]["algorithm"].get("quality_format", "").lower()
    min_size = None
    if data.get("align_split") or fastq_file.endswith(".sdf"):
        if fastq_file.endswith(".sdf"):
            min_size = rtg.min_read_size(fastq_file)
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(fastq_file, pair_file, data)
    else:
        final_file = None
        if qual_format == "illumina":
            fastq_file = alignprep.fastq_convert_pipe_cl(fastq_file, data)
            if pair_file:
                pair_file = alignprep.fastq_convert_pipe_cl(pair_file, data)
    rg_info = novoalign.get_rg_info(names)
    if not utils.file_exists(out_file) and (final_file is None or not utils.file_exists(final_file)):
        # If we cannot do piping, use older bwa aln approach
        if ("bwa-mem" not in dd.get_tools_on(data) and
              ("bwa-mem" in dd.get_tools_off(data) or not _can_use_mem(fastq_file, data, min_size))):
            out_file = _align_backtrack(fastq_file, pair_file, ref_file, out_file,
                                        names, rg_info, data)
        else:
            out_file = _align_mem(fastq_file, pair_file, ref_file, out_file,
                                  names, rg_info, data)
    data["work_bam"] = out_file
    return data
Example #15
def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Perform piped alignment of fastq input files, generating sorted output BAM.
    """
    pair_file = pair_file if pair_file else ""
    # back compatible -- older files were named with lane information, use sample name now
    if names["lane"] != dd.get_sample_name(data):
        out_file = os.path.join(align_dir,
                                "{0}-sort.bam".format(names["lane"]))
    else:
        out_file = None
    if not out_file or not utils.file_exists(out_file):
        umi_ext = "-cumi" if "umi_bam" in data else ""
        out_file = os.path.join(
            align_dir, "{0}-sort{1}.bam".format(dd.get_sample_name(data),
                                                umi_ext))
    qual_format = data["config"]["algorithm"].get("quality_format", "").lower()
    min_size = None
    if data.get("align_split") or fastq_file.endswith(".sdf"):
        if fastq_file.endswith(".sdf"):
            min_size = rtg.min_read_size(fastq_file)
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(
            fastq_file, pair_file, data)
    else:
        final_file = None
        if qual_format == "illumina":
            fastq_file = alignprep.fastq_convert_pipe_cl(fastq_file, data)
            if pair_file:
                pair_file = alignprep.fastq_convert_pipe_cl(pair_file, data)
    rg_info = novoalign.get_rg_info(names)
    if not utils.file_exists(out_file) and (final_file is None or
                                            not utils.file_exists(final_file)):
        # If we cannot do piping, use older bwa aln approach
        if ("bwa-mem" not in dd.get_tools_on(data)
                and ("bwa-mem" in dd.get_tools_off(data)
                     or not _can_use_mem(fastq_file, data, min_size))):
            out_file = _align_backtrack(fastq_file, pair_file, ref_file,
                                        out_file, names, rg_info, data)
        else:
            if (is_precollapsed_bam(data) or not hla_on(data)
                    or needs_separate_hla(data)):
                out_file = _align_mem(fastq_file, pair_file, ref_file,
                                      out_file, names, rg_info, data)
            else:
                out_file = _align_mem_hla(fastq_file, pair_file, ref_file,
                                          out_file, names, rg_info, data)
    data["work_bam"] = out_file

    # bwakit will corrupt the non-HLA alignments in a UMI collapsed BAM file
    # (see https://github.com/bcbio/bcbio-nextgen/issues/3069)
    if needs_separate_hla(data):
        hla_file = os.path.join(os.path.dirname(out_file),
                                "HLA-" + os.path.basename(out_file))
        hla_file = _align_mem_hla(fastq_file, pair_file, ref_file, hla_file,
                                  names, rg_info, data)
        data["hla_bam"] = hla_file
    return data
Example #16
def run(items):
    """Perform detection of structural variations with lumpy, using bwa-mem alignment.
    """
    if not all(utils.get_in(data, ("config", "algorithm", "aligner"))
               in ["bwa", "sentieon-bwa", False, None] for data in items):
        raise ValueError("Require bwa-mem alignment input for lumpy structural variation detection")
    paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items)
    work_dir = _sv_workdir(paired.tumor_data if paired and paired.tumor_data else items[0])
    previous_evidence = {}
    full_bams, sr_bams, disc_bams = [], [], []
    for data in items:
        sr_bam, disc_bam = sshared.get_split_discordants(data, work_dir)
        full_bams.append(dd.get_align_bam(data))
        sr_bams.append(sr_bam)
        disc_bams.append(disc_bam)
        cur_dels, cur_dups = _bedpes_from_cnv_caller(data, work_dir)
        previous_evidence[dd.get_sample_name(data)] = {}
        if cur_dels and utils.file_exists(cur_dels):
            previous_evidence[dd.get_sample_name(data)]["dels"] = cur_dels
        if cur_dups and utils.file_exists(cur_dups):
            previous_evidence[dd.get_sample_name(data)]["dups"] = cur_dups
    lumpy_vcf, exclude_file = _run_lumpy(full_bams, sr_bams, disc_bams, previous_evidence,
                                         work_dir, items)
    gt_vcfs = {}
    for data in items:
        sample = dd.get_sample_name(data)
        sample_vcf = vcfutils.select_sample(lumpy_vcf, sample,
                                            utils.append_stem(lumpy_vcf, "-%s" % sample),
                                            data["config"])
        if "bnd-genotype" in dd.get_tools_on(data):
            gt_vcf = _run_svtyper(sample_vcf, dd.get_align_bam(data), exclude_file, data)
        elif "lumpy-genotype" in dd.get_tools_off(data):
            gt_vcf = sample_vcf
        else:
            std_vcf, bnd_vcf = _split_breakends(sample_vcf, data)
            std_gt_vcf = _run_svtyper(std_vcf, dd.get_align_bam(data), exclude_file, data)
            gt_vcf = vcfutils.concat_variant_files_bcftools(
                orig_files=[std_gt_vcf, bnd_vcf],
                out_file="%s-combined.vcf.gz" % utils.splitext_plus(std_gt_vcf)[0],
                config=data["config"])
        gt_vcfs[dd.get_sample_name(data)] = _filter_by_support(gt_vcf, data)
    if paired and paired.normal_name:
        gt_vcfs = _filter_by_background([paired.tumor_name], [paired.normal_name], gt_vcfs, paired.tumor_data)
    out = []
    for data in items:
        if "sv" not in data:
            data["sv"] = []
        vcf_file = gt_vcfs[dd.get_sample_name(data)]
        if dd.get_svprioritize(data):
            effects_vcf, _ = effects.add_to_vcf(vcf_file, data, "snpeff")
        else:
            effects_vcf = None
        data["sv"].append({"variantcaller": "lumpy",
                           "vrn_file": effects_vcf or vcf_file,
                           "exclude_file": exclude_file})
        out.append(data)
    return out
Example #17
def _run_with_memory_scaling(params, tx_out_file, data, ld_preload=False):
    num_cores = dd.get_num_cores(data)
    memscale = {"magnitude": 0.9 * num_cores, "direction": "increase"} if num_cores > 1 else None
    # Ignore tools_off: [gatk4], since it doesn't apply to GATK CNV calling
    config = utils.deepish_copy(data["config"])
    if "gatk4" in dd.get_tools_off({"config": config}):
        config["algorithm"]["tools_off"].remove("gatk4")
    broad_runner = broad.runner_from_config(config)
    broad_runner.run_gatk(params, os.path.dirname(tx_out_file), memscale=memscale, ld_preload=ld_preload)
Example #18
 def cl_gatk(self, params, tmp_dir, memscale=None, parallel_gc=False):
     support_nt = set()
     support_nct = set(["BaseRecalibrator"])
     if self._has_gatk_conda_wrapper():
         gatk_jar = None
     else:
         gatk_jar = self._get_jar("GenomeAnalysisTK", ["GenomeAnalysisTKLite"], allow_missing=True)
         if not gatk_jar:
             raise ValueError("GATK processing requested but gatk or older jar install not found: "
                              "http://bcbio-nextgen.readthedocs.io/en/latest/contents/"
                              "installation.html#gatk-and-mutect-mutect2")
     is_gatk4 = "gatk4" not in dd.get_tools_off({"config": self._config})
     cores = self._config["algorithm"].get("num_cores", 1)
     config = self._config
     atype_index = params.index("-T") if params.count("-T") > 0 \
                     else params.index("--analysis_type")
     prog = params[atype_index + 1]
     # GATK4 expects the subcommand first, so reorder params accordingly
     if is_gatk4:
         params = params[:]
         del params[atype_index + 1]
         del params[atype_index]
         params = [prog] + params
     if cores and int(cores) > 1:
         if prog in support_nt:
             params.extend(["-nt", str(cores)])
         elif prog in support_nct:
             params.extend(["-nct", str(cores)])
             memscale = config["algorithm"]["memory_adjust"] = {"direction": "increase",
                                                                "magnitude": max(1, int(cores) // 2)}
     # Filters and unsafe specifications not in GATK4
     if LooseVersion(self.gatk_major_version()) > LooseVersion("1.9") and not is_gatk4:
         if len([x for x in params if x.startswith(("-U", "--unsafe"))]) == 0:
             params.extend(["-U", "LENIENT_VCF_PROCESSING"])
         params.extend(["--read_filter", "BadCigar", "--read_filter", "NotPrimaryAlignment"])
     if memscale:
         jvm_opts = get_gatk_opts(config, tmp_dir=tmp_dir, memscale=memscale, include_gatk=False,
                                  parallel_gc=parallel_gc)
     else:
         # Decrease memory slightly from configuration to avoid memory allocation errors
         jvm_opts = config_utils.adjust_opts(self._jvm_opts,
                                             {"algorithm": {"memory_adjust":
                                                            {"magnitude": 1.1, "direction": "decrease"}}})
         jvm_opts += get_default_jvm_opts(tmp_dir, parallel_gc=parallel_gc)
     if "keyfile" in self._gatk_resources:
         params = ["-et", "NO_ET", "-K", self._gatk_resources["keyfile"]] + params
     if gatk_jar:
         return " ".join(["java"] + jvm_opts + ["-jar", gatk_jar] + [str(x) for x in params])
     else:
         cmd = gatk_cmd("gatk", jvm_opts, params, config=self._config)
         if cmd:
             return cmd
         else:
             raise ValueError("GATK processing requested but gatk or older jar install not found: "
                              "http://bcbio-nextgen.readthedocs.io/en/latest/contents/"
                              "installation.html#gatk-and-mutect-mutect2")
Example #19
def run_cufflinks(data):
    """Quantitate transcript expression with Cufflinks"""
    if "cufflinks" in dd.get_tools_off(data):
        return [[data]]
    work_bam = dd.get_work_bam(data)
    ref_file = dd.get_sam_ref(data)
    out_dir, fpkm_file, fpkm_isoform_file = cufflinks.run(work_bam, ref_file, data)
    data = dd.set_cufflinks_dir(data, out_dir)
    data = dd.set_fpkm(data, fpkm_file)
    data = dd.set_fpkm_isoform(data, fpkm_isoform_file)
    return [[data]]
Example #20
 def _has_gatk_conda_wrapper(self):
     cmd = gatk_cmd("gatk", [], ["--version"], config=self._config)
     if cmd:
         if "gatk4" not in dd.get_tools_off({"config": self._config}):
             return True
         else:
             try:
                 stdout = subprocess.check_output(
                     cmd, stderr=subprocess.STDOUT, shell=True).decode()  # decode bytes for find() below
                 return stdout.find("GATK jar file not found") == -1
             except subprocess.CalledProcessError:
                 return False
     return False  # no gatk wrapper found on the PATH
Example #21
 def _has_gatk_conda_wrapper(self):
     cmd = gatk_cmd("gatk", [], ["--version"], config=self._config)
     if cmd:
         if "gatk4" not in dd.get_tools_off({"config": self._config}):
             return True
         else:
             try:
                 stdout = subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=True, encoding="UTF-8")
                 return stdout.find("GATK jar file not found") == -1
             except subprocess.CalledProcessError:
                 return False
     return False  # no gatk wrapper found on the PATH
Example #22
def get_bgzip_cmd(config, is_retry=False):
    """Retrieve command to use for bgzip, trying to use bgzip parallel threads.

    By default, parallel bgzip is enabled in bcbio. If it causes problems
    please report them. You can turn parallel bgzip off with `tools_off: [pbgzip]`
    """
    num_cores = tz.get_in(["algorithm", "num_cores"], config, 1)
    cmd = config_utils.get_program("bgzip", config)
    if (not is_retry and num_cores > 1 and
          "pbgzip" not in dd.get_tools_off({"config": config})):
        cmd += " --threads %s" % num_cores
    return cmd
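
A self-contained sketch of the threading decision above; the bare `"bgzip"` string stands in for whatever path `config_utils.get_program` would resolve, and the config layout is the flat `{"algorithm": {...}}` form these helpers expect:

# Self-contained sketch of the threading decision in get_bgzip_cmd. The plain
# "bgzip" string stands in for the path config_utils.get_program would return.
def bgzip_cmd(config, is_retry=False):
    num_cores = config.get("algorithm", {}).get("num_cores", 1)
    tools_off = config.get("algorithm", {}).get("tools_off", [])
    cmd = "bgzip"
    if not is_retry and num_cores > 1 and "pbgzip" not in tools_off:
        cmd += " --threads %s" % num_cores
    return cmd

print(bgzip_cmd({"algorithm": {"num_cores": 8}}))                 # bgzip --threads 8
print(bgzip_cmd({"algorithm": {"num_cores": 8}}, is_retry=True))  # bgzip (retries run single-threaded)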
Example #23
def _gatk_extract_reads_cl(data, region, prep_params, tmp_dir):
    """Use GATK to extract reads from full BAM file.
    """
    args = ["PrintReads",
            "-L", region_to_gatk(region),
            "-R", dd.get_ref_file(data),
            "-I", data["work_bam"]]
    # GATK3 back compatibility, need to specify analysis type
    if "gatk4" in dd.get_tools_off(data):
        args = ["--analysis_type"] + args
    runner = broad.runner_from_config(data["config"])
    return runner.cl_gatk(args, tmp_dir)
Example #24
def _run_concat_variant_files_gatk4(input_file_list, out_file, config):
    """Use GATK4 GatherVcfs for concatenation of scattered VCFs.
    """
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            params = ["-T", "GatherVcfs", "-I", input_file_list, "-O", tx_out_file]
            # Use GATK4 for merging, tools_off: [gatk4] applies to variant calling
            config = utils.deepish_copy(config)
            if "gatk4" in dd.get_tools_off({"config": config}):
                config["algorithm"]["tools_off"].remove("gatk4")
            broad_runner = broad.runner_from_config(config)
            broad_runner.run_gatk(params)
    return out_file
Example #25
def _gatk_extract_reads_cl(data, region, prep_params, tmp_dir):
    """Use GATK to extract reads from full BAM file.
    """
    args = [
        "PrintReads", "-L",
        region_to_gatk(region), "-R",
        dd.get_ref_file(data), "-I", data["work_bam"]
    ]
    # GATK3 back compatibility, need to specify analysis type
    if "gatk4" in dd.get_tools_off(data):
        args = ["--analysis_type"] + args
    runner = broad.runner_from_config(data["config"])
    return runner.cl_gatk(args, tmp_dir)
Example #26
def do_db_build(samples, need_bam=True):
    """Confirm we should build a gemini database: need gemini and not in tools_off.
    """
    genomes = set()
    for data in samples:
        if not need_bam or data.get("align_bam") or _has_precalled(data):
            genomes.add(data["genome_build"])
        if "gemini" in dd.get_tools_off(data):
            return False
    if len(genomes) == 1:
        return _has_gemini(samples[0])
    else:
        return False
Example #27
def concat_variant_files(orig_files, out_file, regions, ref_file, config):
    """Concatenate multiple variant files from regions into a single output file.

    Uses bcftools concat --naive which only combines samples and does no parsing
    work, allowing scaling to large file sizes.
    """
    if not utils.file_exists(out_file):
        input_file_list = _get_file_list(orig_files, out_file, regions, ref_file, config)
        if "gatk4" not in dd.get_tools_off({"config": config}):
            _run_concat_variant_files_gatk4(input_file_list, out_file, config)
        else:
            out_file = _run_concat_variant_files_bcftools(input_file_list, out_file, config, naive=True)
    if out_file.endswith(".gz"):
        bgzip_and_index(out_file, config)
    return out_file
Example #28
def _do_prioritize(items):
    """Determine if we should perform prioritization.

    Currently done on tumor-only input samples.
    """
    if not any("tumoronly-prioritization" in dd.get_tools_off(d) for d in items):
        if vcfutils.get_paired_phenotype(items[0]):
            has_tumor = False
            has_normal = False
            for sub_data in items:
                if vcfutils.get_paired_phenotype(sub_data) == "tumor":
                    has_tumor = True
                elif vcfutils.get_paired_phenotype(sub_data) == "normal":
                    has_normal = True
            return has_tumor and not has_normal
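
The logic reduces to: prioritize only when a batch has a tumor sample and no matched normal. A minimal sketch with a hypothetical `phenotype` helper standing in for `vcfutils.get_paired_phenotype`, assuming the phenotype is stored under sample metadata:

# Minimal sketch of the tumor-only decision in _do_prioritize. phenotype()
# is a hypothetical stand-in for vcfutils.get_paired_phenotype.
def phenotype(item):
    return item.get("metadata", {}).get("phenotype")

def is_tumor_only(items):
    phenotypes = {phenotype(d) for d in items}
    return "tumor" in phenotypes and "normal" not in phenotypes

print(is_tumor_only([{"metadata": {"phenotype": "tumor"}}]))   # True
print(is_tumor_only([{"metadata": {"phenotype": "tumor"}},
                     {"metadata": {"phenotype": "normal"}}]))  # False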
Example #29
def _prep_grabix_indexes(in_files, dirs, data):
    if _is_bam_input(in_files):
        out = _bgzip_from_bam(in_files[0], dirs, data["config"])
    elif _is_cram_input(in_files):
        out = _bgzip_from_cram(in_files[0], dirs, data)
    else:
        inputs = [{"in_file": x, "dirs": dirs, "config": data["config"], "rgnames": data["rgnames"]}
                  for x in in_files if x]
        if "pbgzip" not in dd.get_tools_off(data):
            out = [_bgzip_from_fastq(d) for d in inputs]
        else:
            out = run_multicore(_bgzip_from_fastq_parallel, [[d] for d in inputs], data["config"])
    items = [[{"bgzip_file": x, "config": copy.deepcopy(data["config"])}] for x in out if x]
    run_multicore(_grabix_index, items, data["config"])
    return out
Example #30
def get_bgzip_cmd(config, is_retry=False):
    """Retrieve command to use for bgzip, trying to use parallel pbgzip if available.

    By default, pbgzip is enabled in bcbio. If it causes problems please report
    them. You can turn pbgzip off with `tools_off: [pbgzip]`
    """
    num_cores = tz.get_in(["algorithm", "num_cores"], config, 1)
    if (not is_retry and num_cores > 1 and
          "pbgzip" not in dd.get_tools_off({"config": config})):
        try:
            pbgzip = config_utils.get_program("pbgzip", config)
            return "%s -n %s " % (pbgzip, num_cores)
        except config_utils.CmdNotFound:
            pass
    return config_utils.get_program("bgzip", config)
Example #31
def do_db_build(samples, need_bam=True, gresources=None):
    """Confirm we should build a gemini database: need gemini + human samples +
    hg19/GRCh37 + not in tools_off.
    """
    genomes = set()
    for data in samples:
        if not need_bam or data.get("align_bam") or _has_precalled(data):
            genomes.add(data["genome_build"])
        if "gemini" in dd.get_tools_off(data):
            return False
    if len(genomes) == 1:
        if not gresources:
            gresources = samples[0]["genome_resources"]
        return (tz.get_in(["aliases", "human"], gresources, False)
                and genomes.issubset(
                    ("hg19", "GRCh37")) and _has_gemini(samples[0]))
    else:
        return False
Example #32
def _run_concat_variant_files_gatk4(input_file_list, out_file, config):
    """Use GATK4 GatherVcfs for concatenation of scattered VCFs.
    """
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            params = ["-T", "GatherVcfs", "-I", input_file_list, "-O", tx_out_file]
            # Use GATK4 for merging, tools_off: [gatk4] applies to variant calling
            config = utils.deepish_copy(config)
            if "gatk4" in dd.get_tools_off({"config": config}):
                config["algorithm"]["tools_off"].remove("gatk4")
            # Allow specification of verbosity in the unique style this tool uses
            resources = config_utils.get_resources("gatk", config)
            opts = [str(x) for x in resources.get("options", [])]
            if "--verbosity" in opts:
                params += ["--VERBOSITY:%s" % opts[opts.index("--verbosity") + 1]]
            broad_runner = broad.runner_from_config(config)
            broad_runner.run_gatk(params)
    return out_file
Example #33
def do_db_build(samples, need_bam=True, gresources=None):
    """Confirm we should build a gemini database: need gemini + human samples +
    hg19/GRCh37 + not in tools_off.
    """
    genomes = set()
    for data in samples:
        if not need_bam or data.get("align_bam") or _has_precalled(data):
            genomes.add(data["genome_build"])
        if "gemini" in dd.get_tools_off(data):
            return False
    if len(genomes) == 1:
        if not gresources:
            gresources = samples[0]["genome_resources"]
        return (tz.get_in(["aliases", "human"], gresources, False)
                and genomes.issubset(("hg19", "GRCh37"))
                and _has_gemini(samples[0]))
    else:
        return False
Example #34
def _get_gatk_opts(config, names, tmp_dir=None, memscale=None, include_gatk=True, parallel_gc=False):
    """Retrieve GATK memory specifications, moving down a list of potential specifications.
    """
    if include_gatk and "gatk4" in dd.get_tools_off({"config": config}):
        opts = ["-U", "LENIENT_VCF_PROCESSING", "--read_filter",
                "BadCigar", "--read_filter", "NotPrimaryAlignment"]
    else:
        opts = []
    jvm_opts = ["-Xms750m", "-Xmx2g"]
    for n in names:
        resources = config_utils.get_resources(n, config)
        if resources and resources.get("jvm_opts"):
            jvm_opts = resources.get("jvm_opts")
            break
    if memscale:
        jvm_opts = config_utils.adjust_opts(jvm_opts, {"algorithm": {"memory_adjust": memscale}})
    jvm_opts += get_default_jvm_opts(tmp_dir, parallel_gc=parallel_gc)
    return jvm_opts + opts
Example #35
def gatk_cmd(name, jvm_opts, params, config=None):
    """Retrieve PATH to gatk using locally installed java.
    """
    if name == "gatk":
        if isinstance(config, dict) and "config" not in config:
            data = {"config": config}
        else:
            data = config
        if not data or "gatk4" not in dd.get_tools_off(data):
            return _gatk4_cmd(jvm_opts, params, data)
    gatk_cmd = utils.which(
        os.path.join(os.path.dirname(os.path.realpath(sys.executable)), name))
    # if we can't find via the local executable, fallback to being in the path
    if not gatk_cmd:
        gatk_cmd = utils.which(name)
    if gatk_cmd:
        return "%s && export PATH=%s:$PATH && %s %s %s" % \
            (utils.clear_java_home(), utils.get_java_binpath(gatk_cmd), gatk_cmd,
             " ".join(jvm_opts), " ".join([str(x) for x in params]))
Example #36
def gatk_cmd(name, jvm_opts, params, config=None):
    """Retrieve PATH to gatk using locally installed java.
    """
    if name == "gatk":
        if isinstance(config, dict) and "config" not in config:
            data = {"config": config}
        else:
            data = config
        if not data or "gatk4" not in dd.get_tools_off(data):
            return _gatk4_cmd(jvm_opts, params, data)
        else:
            name = "gatk3"
    gatk_cmd = utils.which(os.path.join(os.path.dirname(os.path.realpath(sys.executable)), name))
    # if we can't find via the local executable, fallback to being in the path
    if not gatk_cmd:
        gatk_cmd = utils.which(name)
    if gatk_cmd:
        return "%s && export PATH=%s:\"$PATH\" && %s %s %s" % \
            (utils.clear_java_home(), utils.get_java_binpath(gatk_cmd), gatk_cmd,
             " ".join(jvm_opts), " ".join([str(x) for x in params]))
Example #37
def process_intervals(data):
    """Prepare intervals file"""
    bed_file = regions.get_sv_bed(data)
    if not bed_file:
        bed_file = bedutils.clean_file(dd.get_variant_regions(data), data)
    if not bed_file:
        return None

    basename = os.path.splitext(bed_file)[0]
    ready_file = basename + ".txt"
    if os.path.exists(ready_file):
        return ready_file
    optimized_bed = basename + ".optimized.bed"
    rscript = utils.Rscript_cmd("base")
    interval_file_r = utils.R_package_script("PureCN",
                                             "extdata/IntervalFile.R",
                                             env="base")
    ref_file = dd.get_ref_file(data)
    mappability_resource = dd.get_variation_resources(data)["purecn_mappability"]
    genome = dd.get_genome_build(data)
    tools_off = dd.get_tools_off(data)
    if tools_off and "purecn_offtarget" in tools_off:
        offtarget_flag = ""
    else:
        offtarget_flag = "--off-target"
    cmd = [
        rscript, interval_file_r, "--in-file", bed_file, "--fasta", ref_file,
        "--out-file", ready_file, offtarget_flag, "--genome", genome,
        "--export", optimized_bed, "--mappability", mappability_resource
    ]
    try:
        cmd_line = "export R_LIBS_USER=%s && %s && %s" % (
            utils.R_sitelib(env="base"), utils.get_R_exports(env="base"),
            " ".join([str(x) for x in cmd]))
        do.run(cmd_line, "PureCN intervals")
    except subprocess.CalledProcessError:
        logger.info("PureCN failed to prepare intervals")
        return None
    logger.debug("Saved PureCN interval file into " + ready_file)
    return ready_file
Example #38
    def get_gatk_version(self):
        """Retrieve GATK version, handling locally and config cached versions.
        Calling version can be expensive due to all the startup and shutdown
        of JVMs, so we prefer cached version information.
        """
        if self._gatk_version is None:
            self._set_default_versions(self._config)

        if "gatk4" not in dd.get_tools_off({"config": self._config}):
            # In cases where we don't have manifest versions; not possible to
            # get the version from the command line with GATK4 alpha versions
            if self._gatk4_version is None:
                self._gatk4_version = "4.0"
            return self._gatk4_version
        elif self._gatk_version is not None:
            return self._gatk_version
        else:
            if self._has_gatk_conda_wrapper():
                gatk_jar = None
            else:
                gatk_jar = self._get_jar("GenomeAnalysisTK", ["GenomeAnalysisTKLite"], allow_missing=True)
            self._gatk_version = get_gatk_version(gatk_jar, config=self._config)
            return self._gatk_version
Example #39
def _has_alignment_file(algorithm, sample):
    return (((algorithm.get("aligner") or algorithm.get("realign")
              or algorithm.get("recalibrate") or algorithm.get("bam_clean")
              or algorithm.get("mark_duplicates", algorithm.get("aligner"))))
            and sample.get("work_bam") is not None
            and "upload_alignment" not in dd.get_tools_off(sample))
Example #40
def _has_alignment_file(algorithm, sample):
    return (((algorithm.get("aligner") or algorithm.get("realign")
              or algorithm.get("recalibrate") or algorithm.get("bam_clean")
              or algorithm.get("mark_duplicates"))) and
              sample.get("work_bam") is not None and
              "upload_alignment" not in dd.get_tools_off(sample))
Example #41
 def _run(in_file, work_dir, data):
     if "lumpy-genotype" in dd.get_tools_off(data):
         return in_file
     else:
         return _run_svtyper(in_file, [dd.get_align_bam(data)],
                             call.get("exclude_file"), data)
Example #42
def run(items):
    """Perform detection of structural variations with lumpy, using bwa-mem alignment.
    """
    if not all(utils.get_in(data, ("config", "algorithm", "aligner"))
               in ["bwa", "sentieon-bwa", False, None] for data in items):
        raise ValueError("Require bwa-mem alignment input for lumpy structural variation detection")
    paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items)
    work_dir = _sv_workdir(paired.tumor_data if paired and paired.tumor_data else items[0])
    previous_evidence = {}
    full_bams, sr_bams, disc_bams = [], [], []
    for data in items:
        sr_bam, disc_bam = sshared.get_split_discordants(data, work_dir)
        full_bams.append(dd.get_align_bam(data))
        sr_bams.append(sr_bam)
        disc_bams.append(disc_bam)
        cur_dels, cur_dups = _bedpes_from_cnv_caller(data, work_dir)
        previous_evidence[dd.get_sample_name(data)] = {}
        if cur_dels and utils.file_exists(cur_dels):
            previous_evidence[dd.get_sample_name(data)]["dels"] = cur_dels
        if cur_dups and utils.file_exists(cur_dups):
            previous_evidence[dd.get_sample_name(data)]["dups"] = cur_dups
    lumpy_vcf, exclude_file = _run_lumpy(full_bams, sr_bams, disc_bams, previous_evidence,
                                         work_dir, items)
    gt_vcfs = {}
    # Retain paired samples with tumor/normal genotyped in one file
    if paired and paired.normal_name:
        batches = [[paired.tumor_data, paired.normal_data]]
    else:
        batches = [[x] for x in items]

    for batch_items in batches:
        data = batch_items[0]
        if len(batch_items) == 1:
            sample = dd.get_sample_name(data)
            sample_vcf = vcfutils.select_sample(lumpy_vcf, sample,
                                                utils.append_stem(lumpy_vcf, "-%s" % sample),
                                                data["config"])
        else:
            sample_vcf = lumpy_vcf
        align_bams = [dd.get_align_bam(x) for x in batch_items]
        if "bnd-genotype" in dd.get_tools_on(data):
            gt_vcf = _run_svtyper(sample_vcf, align_bams, exclude_file, data)
        elif "lumpy-genotype" in dd.get_tools_off(data):
            gt_vcf = sample_vcf
        else:
            std_vcf, bnd_vcf = _split_breakends(sample_vcf, data)
            std_gt_vcf = _run_svtyper(std_vcf, align_bams, exclude_file, data)
            gt_vcf = vcfutils.concat_variant_files_bcftools(
                orig_files=[std_gt_vcf, bnd_vcf],
                out_file="%s-combined.vcf.gz" % utils.splitext_plus(std_gt_vcf)[0],
                config=data["config"])
        gt_vcfs[dd.get_sample_name(data)] = _filter_by_support(gt_vcf, data)
    if paired and paired.normal_name:
        gt_vcfs = _filter_by_background(paired.tumor_name, [paired.normal_name], gt_vcfs, paired.tumor_data)
    out = []
    for data in items:
        if "sv" not in data:
            data["sv"] = []
        vcf_file = gt_vcfs.get(dd.get_sample_name(data))
        if vcf_file:
            if dd.get_svprioritize(data):
                effects_vcf, _ = effects.add_to_vcf(vcf_file, data, "snpeff")
            else:
                effects_vcf = None
            data["sv"].append({"variantcaller": "lumpy",
                               "vrn_file": effects_vcf or vcf_file,
                               "exclude_file": exclude_file})
        out.append(data)
    return out
Example #43
 def _run(in_file, work_dir, data):
     if "lumpy-genotype" in dd.get_tools_off(data):
         return in_file
     else:
         return _run_svtyper(in_file, [dd.get_align_bam(data)], call.get("exclude_file"), data)
Example #44
def run_peddy(samples, out_dir=None):
    data = samples[0]
    batch = dd.get_batch(data) or dd.get_sample_name(data)
    if isinstance(batch, (list, tuple)):
        batch = batch[0]
    if out_dir:
        peddy_dir = safe_makedir(out_dir)
    else:
        peddy_dir = safe_makedir(
            os.path.join(dd.get_work_dir(data), "qc", batch, "peddy"))
    peddy_prefix = os.path.join(peddy_dir, batch)
    peddy_report = peddy_prefix + ".html"

    vcf_file = None
    for d in samples:
        vcinfo = None
        if dd.get_phenotype(d) == "germline" or dd.get_phenotype(d) not in [
                "tumor"
        ]:
            vcinfo = variant.get_active_vcinfo(d, use_ensemble=False)
        if not vcinfo and dd.get_phenotype(d) in ["tumor"]:
            vcinfo = variant.extract_germline_vcinfo(d, peddy_dir)
        if vcinfo:
            for key in ["germline", "vrn_file"]:
                if vcinfo and vcinfo.get(key) and utils.file_exists(
                        vcinfo[key]):
                    if vcinfo[key] and dd.get_sample_name(
                            d) in vcfutils.get_samples(vcinfo[key]):
                        if vcinfo[
                                key] and vcfutils.vcf_has_nonfiltered_variants(
                                    vcinfo[key]):
                            vcf_file = vcinfo[key]
                            break
    peddy = config_utils.get_program("peddy",
                                     data) if config_utils.program_installed(
                                         "peddy", data) else None
    config_skips = any(["peddy" in dd.get_tools_off(d) for d in samples])
    if not peddy or not vcf_file or not vcfanno.is_human(data) or config_skips:
        if not peddy:
            reason = "peddy executable not found"
        elif config_skips:
            reason = "peddy in tools_off configuration"
        elif not vcfanno.is_human(data):
            reason = "sample is not human"
        else:
            assert not vcf_file
            reason = "no suitable VCF files found with the sample and non-filtered variants"
        msg = "Skipping peddy QC, %s: %s" % (
            reason, [dd.get_sample_name(d) for d in samples])
        with open(peddy_prefix + "-failed.log", "w") as out_handle:
            out_handle.write(msg)
        logger.info(msg)
        return samples
    if file_exists(peddy_prefix + "-failed.log"):
        return samples
    if not file_exists(peddy_report):
        ped_file = create_ped_file(samples, vcf_file, out_dir=out_dir)
        num_cores = dd.get_num_cores(data)
        with tx_tmpdir(data) as tx_dir:
            peddy_prefix_tx = os.path.join(tx_dir,
                                           os.path.basename(peddy_prefix))
            # Redirects stderr because incredibly noisy with no intervals found messages from cyvcf2
            stderr_log = os.path.join(tx_dir, "run-stderr.log")
            sites_str = "--sites hg38" if dd.get_genome_build(
                data) == "hg38" else ""
            locale = utils.locale_export()
            cmd = (
                "{locale} {peddy} -p {num_cores} {sites_str} --plot --prefix {peddy_prefix_tx} "
                "{vcf_file} {ped_file} 2> {stderr_log}")
            message = "Running peddy on {vcf_file} against {ped_file}."
            try:
                do.run(cmd.format(**locals()), message.format(**locals()))
            except:
                to_show = collections.deque(maxlen=100)
                with open(stderr_log) as in_handle:
                    for line in in_handle:
                        to_show.append(line)

                def allowed_errors(l):
                    return ((l.find("IndexError") >= 0 and l.find("is out of bounds for axis") >= 0)
                            or (l.find("n_components=") >= 0 and l.find("must be between 1 and n_features=") >= 0)
                            or (l.find("n_components=") >= 0 and l.find("must be between 1 and min") >= 0)
                            or l.find("Input contains NaN, infinity or a value too large for dtype") >= 0)

                def all_line_errors(l):
                    return (l.find("no intervals found for") >= 0)

                if (any(allowed_errors(l) for l in to_show)
                        or all(all_line_errors(l) for l in to_show)):
                    logger.info("Skipping peddy because no variants overlap with checks: %s" % batch)
                    with open(peddy_prefix + "-failed.log", "w") as out_handle:
                        out_handle.write("peddy did not find overlaps with 1kg sites in VCF, skipping")
                    return samples
                else:
                    logger.warning("".join(to_show))
                    raise
            for ext in PEDDY_OUT_EXTENSIONS:
                if os.path.exists(peddy_prefix_tx + ext):
                    shutil.move(peddy_prefix_tx + ext, peddy_prefix + ext)
    peddyfiles = expected_peddy_files(peddy_report, batch)
    return dd.set_in_samples(samples, dd.set_summary_qc, peddyfiles)
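
The error handling in the middle of `run_peddy` is a reusable pattern: capture only the tail of a noisy stderr log with a bounded `collections.deque`, then scan it to decide whether the failure is a known benign case to skip rather than re-raise. A self-contained sketch of that tail-scan (the file name and contents are illustrative):

# Sketch of run_peddy's log-triage pattern: keep the last N stderr lines in a
# bounded deque, then decide if the failure is benign. Paths are illustrative.
import collections
import os
import tempfile

def tail_lines(path, n=100):
    tail = collections.deque(maxlen=n)  # older lines drop off automatically
    with open(path) as in_handle:
        for line in in_handle:
            tail.append(line)
    return list(tail)

def only_benign_errors(lines):
    return all("no intervals found for" in l for l in lines)

tmp = tempfile.NamedTemporaryFile("w", suffix=".log", delete=False)
tmp.write("no intervals found for chr1\nno intervals found for chr2\n")
tmp.close()
print(only_benign_errors(tail_lines(tmp.name)))  # True -> safe to skip
os.unlink(tmp.name)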