Example #1
def _run_gridss(inputs, background, work_dir):
    out_file = os.path.join(work_dir, "%s-gridss.sv.vcf" % (dd.get_batch(inputs[0]) or
                                                            dd.get_sample_name(inputs[0])))
    if not utils.file_exists(out_file) and not utils.file_exists(out_file + ".gz"):
        with file_transaction(inputs[0], out_file) as tx_out_file:
            htsjdk_opts = ["-Dsamjdk.create_index=true", "-Dsamjdk.use_async_io_read_samtools=true",
                           "-Dsamjdk.use_async_io_write_samtools=true", "-Dsamjdk.use_async_io_write_tribble=true"]
            cores = dd.get_cores(inputs[0])
            resources = config_utils.get_resources("gridss", inputs[0]["config"])
            jvm_opts = resources.get("jvm_opts", ["-Xms750m", "-Xmx4g"])
            jvm_opts = config_utils.adjust_opts(jvm_opts, {"algorithm": {"memory_adjust":
                                                                         {"direction": "increase",
                                                                          "magnitude": cores}}})
            jvm_opts = _finalize_memory(jvm_opts)
            tx_ref_file = _setup_reference_files(inputs[0], os.path.dirname(tx_out_file))
            blacklist_bed = sshared.prepare_exclude_file(inputs + background, out_file)
            cmd = ["gridss"] + jvm_opts + htsjdk_opts + ["gridss.CallVariants"] + \
                  ["THREADS=%s" % cores,
                   "TMP_DIR=%s" % os.path.dirname(tx_out_file), "WORKING_DIR=%s" % os.path.dirname(tx_out_file),
                   "OUTPUT=%s" % tx_out_file,
                   "ASSEMBLY=%s" % tx_out_file.replace(".sv.vcf", ".gridss.assembly.bam"),
                   "REFERENCE_SEQUENCE=%s" % tx_ref_file, "BLACKLIST=%s" % blacklist_bed]
            for data in inputs + background:
                cmd += ["INPUT=%s" % dd.get_align_bam(data), "INPUT_LABEL=%s" % dd.get_sample_name(data)]
            exports = utils.local_path_export()
            cmd = exports + " ".join(cmd)
            do.run(cmd, "GRIDSS SV analysis")
    return vcfutils.bgzip_and_index(out_file, inputs[0]["config"])
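
Note on the memory_adjust block above: config_utils.adjust_opts is asked to scale the JVM heap with the core count. A minimal stand-alone sketch of that idea, using illustrative helper names that are not the bcbio API:

import re

def scale_xmx(jvm_opts, cores):
    """Illustrative only: multiply an -Xmx heap setting by the core count."""
    scaled = []
    for opt in jvm_opts:
        match = re.match(r"^-Xmx(\d+)([gm])$", opt)
        if match:
            value, unit = int(match.group(1)), match.group(2)
            opt = "-Xmx%d%s" % (value * cores, unit)
        scaled.append(opt)
    return scaled

# scale_xmx(["-Xms750m", "-Xmx4g"], 8) -> ["-Xms750m", "-Xmx32g"]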
Example #2
def _check_for_problem_somatic_batches(items, config):
    """Identify problem batch setups for somatic calling.

    We do not support multiple tumors in a single batch and VarDict(Java) does not
    handle pooled calling, only tumor/normal.
    """
    to_check = []
    for data in items:
        data = copy.deepcopy(data)
        data["config"] = config_utils.update_w_custom(config, data)
        to_check.append(data)
    data_by_batches = collections.defaultdict(list)
    for data in to_check:
        batches = dd.get_batches(data)
        if batches:
            for batch in batches:
                data_by_batches[batch].append(data)
    for batch, items in data_by_batches.items():
        if vcfutils.get_paired(items):
            vcfutils.check_paired_problems(items)
        elif len(items) > 1:
            vcs = list(set(tz.concat([dd.get_variantcaller(data) or [] for data in items])))
            if any(x.lower().startswith("vardict") for x in vcs):
                raise ValueError("VarDict does not support pooled non-tumor/normal calling, in batch %s: %s"
                                 % (batch, [dd.get_sample_name(data) for data in items]))
            elif any(x.lower() == "mutect" for x in vcs):
                raise ValueError("Mutect requires a 'phenotype: tumor' sample for calling, in batch %s: %s"
                                 % (batch, [dd.get_sample_name(data) for data in items]))
Example #3
def _merge_fastqc(samples):
    """
    merge all fastqc samples into one by module
    """
    fastqc_list = collections.defaultdict(list)
    seen = set()
    for data in samples:
        name = dd.get_sample_name(data)
        if name in seen:
            continue
        seen.add(name)
        fns = glob.glob(os.path.join(dd.get_work_dir(data), "qc", dd.get_sample_name(data), "fastqc") + "/*")
        for fn in fns:
            if fn.endswith("tsv"):
                metric = os.path.basename(fn)
                fastqc_list[metric].append([name, fn])

    for metric in fastqc_list:
        dt_by_sample = []
        for fn in fastqc_list[metric]:
            dt = pd.read_csv(fn[1], sep="\t")
            dt['sample'] = fn[0]
            dt_by_sample.append(dt)
        dt = utils.rbind(dt_by_sample)
        dt.to_csv(metric, sep="\t", index=False, mode="w")
    return samples
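
utils.rbind above presumably row-binds the per-sample data frames before the merged table is written. Assuming that behavior, an equivalent sketch using pandas directly:

import pandas as pd

def rbind(dfs):
    # Illustrative stand-in for utils.rbind: stack per-sample tables row-wise, renumbering the index.
    return pd.concat(dfs, ignore_index=True)

dt_by_sample = [pd.DataFrame({"metric": ["q30"], "value": [0.95], "sample": ["s1"]}),
                pd.DataFrame({"metric": ["q30"], "value": [0.91], "sample": ["s2"]})]
merged = rbind(dt_by_sample)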
Example #4
def plot_model_segments(seg_files, work_dir, data):
    """Diagnostic plots of segmentation and inputs.
    """
    from bcbio.heterogeneity import chromhacks
    out_file = os.path.join(work_dir, "%s.modeled.png" % dd.get_sample_name(data))
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            dict_file = utils.splitext_plus(dd.get_ref_file(data))[0] + ".dict"
            plot_dict = os.path.join(os.path.dirname(tx_out_file), os.path.basename(dict_file))
            with open(dict_file) as in_handle:
                with open(plot_dict, "w") as out_handle:
                    for line in in_handle:
                        if line.startswith("@SQ"):
                            cur_chrom = [x.split(":", 1)[1].strip()
                                         for x in line.split("\t") if x.startswith("SN:")][0]
                            if chromhacks.is_autosomal_or_sex(cur_chrom):
                                out_handle.write(line)
                        else:
                            out_handle.write(line)
            params = ["-T", "PlotModeledSegments",
                      "--denoised-copy-ratios", tz.get_in(["depth", "bins", "normalized"], data),
                      "--segments", seg_files["final_seg"],
                      "--allelic-counts", seg_files["tumor_hets"],
                      "--sequence-dictionary", plot_dict,
                      "--minimum-contig-length", "10",
                      "--output-prefix", dd.get_sample_name(data),
                      "-O", os.path.dirname(tx_out_file)]
            _run_with_memory_scaling(params, tx_out_file, data)
    return {"seg": out_file}
Example #5
def _evaluate_multi(calls, truth_svtypes, work_dir, data):
    base = os.path.join(work_dir, "%s-sv-validate" % (dd.get_sample_name(data)))
    out_file = base + ".csv"
    df_file = base + "-df.csv"
    if any((not utils.file_uptodate(out_file, x["vrn_file"])
            or not utils.file_uptodate(df_file, x["vrn_file"])) for x in calls):
        with file_transaction(data, out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                with open(df_file, "w") as df_out_handle:
                    writer = csv.writer(out_handle)
                    dfwriter = csv.writer(df_out_handle)
                    writer.writerow(["svtype", "size", "caller", "sensitivity", "precision"])
                    dfwriter.writerow(["svtype", "size", "caller", "metric", "value", "label"])
                    for svtype, truth in truth_svtypes.items():
                        for size in EVENT_SIZES:
                            str_size = "%s-%s" % size
                            for call in calls:
                                call_bed = convert.to_bed(call, dd.get_sample_name(data), work_dir, calls, data)
                                if utils.file_exists(call_bed):
                                    evalout = _evaluate_one(call["variantcaller"], svtype, size, call_bed,
                                                            truth, data)
                                    writer.writerow([svtype, str_size, call["variantcaller"],
                                                     evalout["sensitivity"]["label"], evalout["precision"]["label"]])
                                    for metric in ["sensitivity", "precision"]:
                                        dfwriter.writerow([svtype, str_size, call["variantcaller"], metric,
                                                           evalout[metric]["val"], evalout[metric]["label"]])
    return out_file, df_file
Example #6
def align(fastq_file, pair_file, index_dir, names, align_dir, data):
    """Perform piped alignment of fastq input files, generating sorted, deduplicated BAM.
    """
    umi_ext = "-cumi" if "umi_bam" in data else ""
    out_file = os.path.join(align_dir, "{0}-sort{1}.bam".format(dd.get_sample_name(data), umi_ext))
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    rg_info = novoalign.get_rg_info(names)
    preset = "sr"

    pair_file = pair_file if pair_file else ""
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(fastq_file, pair_file, data)
    else:
        final_file = None

    if not utils.file_exists(out_file) and (final_file is None or not utils.file_exists(final_file)):
        with postalign.tobam_cl(data, out_file, pair_file != "") as (tobam_cl, tx_out_file):
            index_file = None
            # Skip trying to use indices now as they provide only slight speed-ups
            # and give inconsistent outputs in BAM headers
            # If a single index present, index_dir points to that
            # if index_dir and os.path.isfile(index_dir):
            #     index_dir = os.path.dirname(index_dir)
            #     index_file = os.path.join(index_dir, "%s-%s.mmi" % (dd.get_genome_build(data), preset))
            if not index_file or not os.path.exists(index_file):
                index_file = dd.get_ref_file(data)
            cmd = ("minimap2 -a -x {preset} -R '{rg_info}' -t {num_cores} {index_file} "
                   "{fastq_file} {pair_file} | ")
            do.run(cmd.format(**locals()) + tobam_cl, "minimap2 alignment: %s" % dd.get_sample_name(data))
    data["work_bam"] = out_file
    return data
Example #7
def _run_amber(paired, work_dir, lenient=False):
    """AMBER: calculate allele frequencies at likely heterozygous sites.

    lenient flag allows amber runs on small test sets.
    """
    amber_dir = utils.safe_makedir(os.path.join(work_dir, "amber"))
    out_file = os.path.join(amber_dir, "%s.amber.baf" % dd.get_sample_name(paired.tumor_data))
    if not utils.file_exists(out_file) or not utils.file_exists(out_file + ".pcf"):
        with file_transaction(paired.tumor_data, out_file) as tx_out_file:
            key = "germline_het_pon"
            het_bed = tz.get_in(["genome_resources", "variation", key], paired.tumor_data)
            cmd = ["AMBER"] + _get_jvm_opts(tx_out_file, paired.tumor_data) + \
                  ["-threads", dd.get_num_cores(paired.tumor_data),
                   "-tumor", dd.get_sample_name(paired.tumor_data),
                   "-tumor_bam", dd.get_align_bam(paired.tumor_data),
                   "-reference", dd.get_sample_name(paired.normal_data),
                   "-reference_bam", dd.get_align_bam(paired.normal_data),
                   "-ref_genome", dd.get_ref_file(paired.tumor_data),
                   "-bed", het_bed,
                   "-output_dir", os.path.dirname(tx_out_file)]
            if lenient:
                cmd += ["-max_het_af_percent", "1.0"]
            try:
                do.run(cmd, "PURPLE: AMBER baf generation")
            except subprocess.CalledProcessError as msg:
                if not lenient and _amber_allowed_errors(str(msg)):
                    return _run_amber(paired, work_dir, True)
                # Re-raise unexpected failures rather than silently continuing
                raise
            for f in os.listdir(os.path.dirname(tx_out_file)):
                if f != os.path.basename(tx_out_file):
                    shutil.move(os.path.join(os.path.dirname(tx_out_file), f),
                                os.path.join(amber_dir, f))
    return out_file
Example #8
def _fastp_trim(fastq_files, adapters, out_dir, data):
    """Perform multicore trimming with fastp (https://github.com/OpenGene/fastp)
    """
    report_file = os.path.join(out_dir, "%s-report.json" % utils.splitext_plus(os.path.basename(fastq_files[0]))[0])
    out_files = [os.path.join(out_dir, "%s-trimmed.fq.gz" % utils.splitext_plus(os.path.basename(x))[0])
                 for x in fastq_files]
    if not utils.file_exists(out_files[0]):
        with file_transaction(data, *[report_file] + out_files) as tx_out:
            tx_report = tx_out[0]
            tx_out_files = tx_out[1:]
            cmd = ["fastp", "--thread", dd.get_num_cores(data)]
            if dd.get_quality_format(data).lower() == "illumina":
                cmd += ["--phred64"]
            for i, (inf, outf) in enumerate(zip(fastq_files, tx_out_files)):
                if i == 0:
                    cmd += ["-i", inf, "-o", outf]
                else:
                    cmd += ["-I", inf, "-O", outf]
            cmd += ["--cut_by_quality3", "--cut_mean_quality", "5",
                    "--length_required", str(dd.get_min_read_length(data)),
                    "--disable_quality_filtering"]
            if "polyx" in dd.get_adapters(data):
                cmd += ["--trim_poly_x", "--poly_x_min_len", "8"]
            if "polyx" in dd.get_adapters(data) or "polyg" in dd.get_adapters(data):
                cmd += ["--trim_poly_g", "--poly_g_min_len", "8"]
            for a in adapters:
                cmd += ["--adapter_sequence", a]
            if not adapters:
                cmd += ["--disable_adapter_trimming"]
            cmd += ["--json", report_file, "--report_title", dd.get_sample_name(data)]
            do.run(cmd, "Trimming with fastp: %s" % dd.get_sample_name(data))
    return out_files, report_file
Example #9
def run(items, background=None):
    """Detect copy number variations from batched set of samples using WHAM.
    """
    if not background: background = []
    background_bams = []
    paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items)
    if paired:
        inputs = [paired.tumor_data]
        if paired.normal_bam:
            background = [paired.normal_data]
            background_bams = [paired.normal_bam]
    else:
        assert not background
        inputs, background = shared.find_case_control(items)
        background_bams = [x["align_bam"] for x in background]
    orig_vcf = _run_wham(inputs, background_bams)
    out = []
    for data in inputs:
        if "sv" not in data:
            data["sv"] = []
        sample_vcf = "%s-%s.vcf.gz" % (utils.splitext_plus(orig_vcf)[0], dd.get_sample_name(data))
        sample_vcf = vcfutils.select_sample(orig_vcf, dd.get_sample_name(data), sample_vcf, data["config"])
        if background:
            sample_vcf = filter_by_background(sample_vcf, orig_vcf, background, data)
        data["sv"].append({"variantcaller": "wham",
                           "vrn_file": sample_vcf})
        out.append(data)
    return out
Example #10
def run(items):
    """Perform detection of structural variations with lumpy, using bwa-mem alignment.
    """
    if not all(utils.get_in(data, ("config", "algorithm", "aligner")) in ["bwa", False, None] for data in items):
        raise ValueError("Require bwa-mem alignment input for lumpy structural variation detection")
    paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items)
    work_dir = _sv_workdir(paired.tumor_data if paired and paired.tumor_data else items[0])
    full_bams, sr_bams, disc_bams = [], [], []
    for data in items:
        dedup_bam, sr_bam, disc_bam = sshared.get_split_discordants(data, work_dir)
        full_bams.append(dedup_bam)
        sr_bams.append(sr_bam)
        disc_bams.append(disc_bam)
    lumpy_vcf, exclude_file = _run_lumpy(full_bams, sr_bams, disc_bams, work_dir, items)
    gt_vcfs = {}
    for data in items:
        sample = dd.get_sample_name(data)
        dedup_bam, sr_bam, _ = sshared.get_split_discordants(data, work_dir)
        sample_vcf = vcfutils.select_sample(lumpy_vcf, sample,
                                            utils.append_stem(lumpy_vcf, "-%s" % sample),
                                            data["config"])
        gt_vcf = _run_svtyper(sample_vcf, dedup_bam, sr_bam, data)
        gt_vcfs[dd.get_sample_name(data)] = _filter_by_support(gt_vcf, data)
    if paired:
        gt_vcfs = _filter_by_background([paired.tumor_name], [paired.normal_name], gt_vcfs, paired.tumor_data)
    out = []
    for data in items:
        if "sv" not in data:
            data["sv"] = []
        data["sv"].append({"variantcaller": "lumpy",
                           "vrn_file": gt_vcfs[dd.get_sample_name(data)],
                           "exclude_file": exclude_file})
        out.append(data)
    return out
Example #11
def gatk_rnaseq_calling(data):
    """Use GATK to perform gVCF variant calling on RNA-seq data
    """
    from bcbio.bam import callable
    data = utils.deepish_copy(data)
    tools_on = dd.get_tools_on(data)
    if not tools_on:
        tools_on = []
    tools_on.append("gvcf")
    data = dd.set_tools_on(data, tools_on)
    data = dd.set_jointcaller(data, ["%s-joint" % v for v in dd.get_variantcaller(data)])
    out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data),
                                              "variation", "rnaseq", "gatk-haplotype"))
    data = _setup_variant_regions(data, out_dir)
    out_file = os.path.join(out_dir, "%s-gatk-haplotype.vcf.gz" % dd.get_sample_name(data))
    if not utils.file_exists(out_file):
        region_files = []
        regions = []
        for cur_region in callable.get_split_regions(dd.get_variant_regions(data), data):
            str_region = "_".join([str(x) for x in cur_region])
            region_file = os.path.join(utils.safe_makedir(os.path.join(dd.get_work_dir(data),
                                                                    "variation", "rnaseq", "gatk-haplotype",
                                                                    "regions")),
                                    "%s-%s-gatk-haplotype.vcf.gz" % (dd.get_sample_name(data), str_region))
            region_file = gatk.haplotype_caller([dd.get_split_bam(data)], [data], dd.get_ref_file(data), {},
                                                region=cur_region, out_file=region_file)
            region_files.append(region_file)
            regions.append(cur_region)
        out_file = vcfutils.concat_variant_files(region_files, out_file, regions,
                                                 dd.get_ref_file(data), data["config"])
    return dd.set_vrn_file(data, out_file)
Example #12
def _merge_hla_fastq_inputs(data):
    """Merge HLA inputs from a split initial alignment.
    """
    hla_key = ["hla", "fastq"]
    hla_sample_files = [x for x in tz.get_in(hla_key, data, []) if x and x != "None"]
    if hla_sample_files:
        out_files = collections.defaultdict(list)
        for hla_files in hla_sample_files:
            for hla_file in hla_files:
                rehla = re.search(r".hla.(?P<hlatype>[\w-]+).fq", hla_file)
                if rehla:
                    hlatype = rehla.group("hlatype")
                    out_files[hlatype].append(hla_file)
        merged_hlas = []
        if len(out_files) > 0:
            hla_outdir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align",
                                                         dd.get_sample_name(data), "hla"))
            for hlatype, files in out_files.items():
                out_file = os.path.join(hla_outdir, "%s-%s.fq" % (dd.get_sample_name(data), hlatype))
                optitype.combine_hla_fqs([(hlatype, f) for f in files], out_file, data)
                merged_hlas.append(out_file)
        # merged_hlas stays defined (as an empty list) when no HLA types were found,
        # avoiding a NameError in the update below
        data = tz.update_in(data, hla_key, lambda x: merged_hlas)
    else:
        data = tz.update_in(data, hla_key, lambda x: None)
    return data
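
The HLA type is recovered from each split file name by the regular expression above. A small self-contained check of that pattern (the file name is invented for illustration):

import re

rehla = re.search(r".hla.(?P<hlatype>[\w-]+).fq", "sample1-1.hla.HLA-A.fq")
if rehla:
    print(rehla.group("hlatype"))  # prints: HLA-A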
Example #13
def _run_rtg_eval(vrn_file, rm_file, rm_interval_file, base_dir, data):
    """Run evaluation of a caller against the truth set using rtg vcfeval.
    """
    out_dir = os.path.join(base_dir, "rtg")
    if not utils.file_exists(os.path.join(out_dir, "done")):
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        if not rm_file.endswith(".vcf.gz") or not os.path.exists(rm_file + ".tbi"):
            rm_file = vcfutils.bgzip_and_index(rm_file, data["config"], out_dir=base_dir)
        if len(vcfutils.get_samples(vrn_file)) > 1:
            # Use the base name so the per-sample VCF lands in base_dir
            base, ext = utils.splitext_plus(os.path.basename(vrn_file))
            sample_file = os.path.join(base_dir, "%s-%s%s" % (base, dd.get_sample_name(data), ext))
            vrn_file = vcfutils.select_sample(vrn_file, dd.get_sample_name(data), sample_file, data["config"])
        if not vrn_file.endswith(".vcf.gz") or not os.path.exists(vrn_file + ".tbi"):
            vrn_file = vcfutils.bgzip_and_index(vrn_file, data["config"], out_dir=base_dir)

        interval_bed = _get_merged_intervals(rm_interval_file, base_dir, data)
        ref_dir, ref_filebase = os.path.split(dd.get_ref_file(data))
        rtg_ref = os.path.normpath(os.path.join(ref_dir, os.path.pardir, "rtg",
                                                "%s.sdf" % (os.path.splitext(ref_filebase)[0])))
        assert os.path.exists(rtg_ref), ("Did not find rtg indexed reference file for validation:\n%s\n"
                                         "Run bcbio_nextgen.py upgrade --data --aligners rtg" % rtg_ref)
        cmd = ["rtg", "vcfeval", "-b", rm_file, "--bed-regions", interval_bed,
               "-c", vrn_file, "-t", rtg_ref, "-o", out_dir]
        do.run(cmd, "Validate calls using rtg vcfeval", data)
    return {"tp": os.path.join(out_dir, "tp.vcf.gz"),
            "fp": os.path.join(out_dir, "fp.vcf.gz"),
            "fn": os.path.join(out_dir, "fn.vcf.gz")}
Example #14
def calculate_sv_bins(*items):
    """Determine bin sizes and regions to use for samples.

    Unified approach to prepare regional bins for coverage calculations across
    multiple CNV callers. Splits into target and antitarget regions allowing
    callers to take advantage of both. Provides consistent target/anti-target
    bin sizes across batches.

    Uses callable_regions as the access BED file and mosdepth regions in
    variant_regions to estimate depth for bin sizes.
    """
    from bcbio.structural import cnvkit
    if all(not cnvkit.use_general_sv_bins(utils.to_single_data(x)) for x in items):
        return items
    items = [utils.to_single_data(x) for x in items]
    out = []
    for cnv_group in _group_by_cnv_method(multi.group_by_batch(items, False)):
        size_calc_fn = MemoizedSizes(cnv_group.region_file, cnv_group.items).get_target_antitarget_bin_sizes
        for data in cnv_group.items:
            target_bed, anti_bed = cnvkit.targets_w_bins(cnv_group.region_file, cnv_group.access_file, size_calc_fn,
                                                         cnv_group.work_dir, data)
            if not data.get("regions"):
                data["regions"] = {}
            data["regions"]["bins"] = {"target": target_bed, "antitarget": anti_bed}
            out.append([data])
    if not len(out) == len(items):
        raise AssertionError("Inconsistent samples in and out of SV bin calculation:\nout: %s\nin : %s" %
                             (sorted([dd.get_sample_name(utils.to_single_data(x)) for x in out]),
                              sorted([dd.get_sample_name(x) for x in items])))
    return out
Example #15
def run(samples, run_parallel, stage):
    """Run structural variation detection.

    The stage indicates which level of structural variant calling to run.
      - initial, run prior to other callers and variant calling
      - standard, regular batch calling
      - ensemble, post-calling, combine other callers
    """
    to_process = collections.OrderedDict()
    extras = []
    background = []
    for data in (xs[0] for xs in samples):
        ready_data = _handle_multiple_svcallers(data, stage)
        if len(ready_data) > 0:
            background.append(data)
            for x in ready_data:
                svcaller = x["config"]["algorithm"].get("svcaller_active")
                if stage == "ensemble":  # no batching for ensemble methods
                    batch = dd.get_sample_name(x)
                else:
                    batch = dd.get_batch(x) or dd.get_sample_name(x)
                batches = batch if isinstance(batch, (list, tuple)) else [batch]
                for b in batches:
                    try:
                        to_process[(svcaller, b)].append(x)
                    except KeyError:
                        to_process[(svcaller, b)] = [x]
        else:
            extras.append([data])
    processed = run_parallel("detect_sv", ([xs, background, xs[0]["config"], stage]
                                           for xs in to_process.values()))
    finalized = (run_parallel("finalize_sv", [([xs[0] for xs in processed], processed[0][0]["config"])])
                 if len(processed) > 0 else [])
    return extras + finalized
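
The try/except KeyError pattern above builds one insertion-ordered bucket per (svcaller, batch) key. An equivalent sketch using setdefault, with made-up caller and batch names:

import collections

to_process = collections.OrderedDict()
calls = [("lumpy", "batch1", "sampleA"), ("lumpy", "batch1", "sampleB"), ("manta", "batch2", "sampleC")]
for svcaller, batch, name in calls:
    to_process.setdefault((svcaller, batch), []).append(name)
# OrderedDict([(('lumpy', 'batch1'), ['sampleA', 'sampleB']), (('manta', 'batch2'), ['sampleC'])])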
Example #16
def batch(samples):
    """CWL: batch together per sample, joint and germline calls for ensemble combination.

    Sets up groups of same sample/batch variant calls for ensemble calling, as
    long as we have more than one caller per group.
    """
    samples = [utils.to_single_data(x) for x in samples]
    sample_order = [dd.get_sample_name(x) for x in samples]
    batch_groups = collections.defaultdict(list)
    for data in samples:
        batch_samples = tuple(data.get("batch_samples", [dd.get_sample_name(data)]))
        batch_groups[(batch_samples, dd.get_phenotype(data))].append(data)

    out = []
    for (batch_samples, phenotype), gsamples in batch_groups.items():
        if len(gsamples) > 1:
            batches = set([])
            for d in gsamples:
                batches |= set(dd.get_batches(d))
            cur = copy.deepcopy(gsamples[0])
            cur.update({"batch_id": sorted(list(batches))[0] if batches else "_".join(batch_samples),
                        "batch_samples": batch_samples,
                        "variants": {"variantcallers": [dd.get_variantcaller(d) for d in gsamples],
                                     "calls": [d.get("vrn_file") for d in gsamples]}})
            out.append(cur)

    def by_original_order(d):
        return min([sample_order.index(s) for s in d["batch_samples"] if s in sample_order])
    return sorted(out, key=by_original_order)
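
by_original_order re-sorts the combined batches back into the order the samples were originally supplied. A stand-alone illustration with made-up sample names:

sample_order = ["s1", "s2", "s3"]
out = [{"batch_samples": ("s3",)}, {"batch_samples": ("s1", "s2")}]

def by_original_order(d):
    return min(sample_order.index(s) for s in d["batch_samples"] if s in sample_order)

print([d["batch_samples"] for d in sorted(out, key=by_original_order)])  # [('s1', 's2'), ('s3',)]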
Example #17
def calculate_sv_coverage(data):
    """Calculate coverage within bins for downstream CNV calling.

    Creates corrected cnr files with log2 ratios and depths.
    """
    from bcbio.variation import coverage
    from bcbio.structural import annotate, cnvkit
    data = utils.to_single_data(data)
    if not cnvkit.use_general_sv_bins(data):
        return [[data]]
    work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "structural",
                                               dd.get_sample_name(data), "bins"))
    out_target_file = os.path.join(work_dir, "%s-target-coverage.cnn" % dd.get_sample_name(data))
    out_anti_file = os.path.join(work_dir, "%s-antitarget-coverage.cnn" % dd.get_sample_name(data))
    if ((not utils.file_exists(out_target_file) or not utils.file_exists(out_anti_file))
          and (dd.get_align_bam(data) or dd.get_work_bam(data))):
        # mosdepth
        target_cov = coverage.run_mosdepth(data, "target", tz.get_in(["regions", "bins", "target"], data))
        anti_cov = coverage.run_mosdepth(data, "antitarget", tz.get_in(["regions", "bins", "antitarget"], data))
        target_cov_genes = annotate.add_genes(target_cov.regions, data, max_distance=0)
        anti_cov_genes = annotate.add_genes(anti_cov.regions, data, max_distance=0)
        out_target_file = _add_log2_depth(target_cov_genes, out_target_file, data)
        out_anti_file = _add_log2_depth(anti_cov_genes, out_anti_file, data)
        # TODO: Correct for GC bias
    if os.path.exists(out_target_file):
        data["depth"]["bins"] = {"target": out_target_file, "antitarget": out_anti_file}
    return [[data]]
Example #18
def run(items):
    assert len(items) == 1, ("Expect one input to biological prioritization: %s" %
                             ", ".join([dd.get_sample_name(d) for d in items]))
    data = items[0]
    inputs = []
    for call in data.get("sv", []):
        vcf_file = call.get("vcf_file", call.get("vrn_file"))
        if vcf_file and vcf_file.endswith((".vcf", "vcf.gz")):
            pp_fn = POST_PRIOR_FNS.get(call["variantcaller"])
            if pp_fn:
                pp_fn = pp_fn(call)
            inputs.append((call["variantcaller"], vcf_file, pp_fn))
    if len(inputs) > 0:
        prioritize_by = tz.get_in(["config", "algorithm", "svprioritize"], data)
        if prioritize_by:
            work_dir = _sv_workdir(data)
            priority_files = [_prioritize_vcf(vcaller, vfile, prioritize_by, post_prior_fn, work_dir, data)
                              for vcaller, vfile, post_prior_fn in inputs]
            priority_tsv = _combine_files([xs[0] for xs in priority_files], work_dir, data)
            raw_files = {}
            for svcaller, fname in zip([xs[0] for xs in inputs], [xs[1] for xs in priority_files]):
                clean_fname = os.path.join(os.path.dirname(fname), "%s-%s-prioritized%s" %
                                           (dd.get_sample_name(data), svcaller, utils.splitext_plus(fname)[-1]))
                utils.symlink_plus(fname, clean_fname)
                raw_files[svcaller] = clean_fname
            data["sv"].append({"variantcaller": "sv-prioritize", "vrn_file": priority_tsv,
                               "raw_files": raw_files})
    # Disabled on move to CWL, not used and tested with CNVkit changes
    # data = _cnv_prioritize(data)
    return [data]
Example #19
def _segment_normalized_gatk(cnr_file, work_dir, paired):
    """Segmentation of normalized inputs using GATK4, converting into standard input formats.
    """
    work_dir = utils.safe_makedir(os.path.join(work_dir, "gatk-cnv"))
    seg_file = gatkcnv.model_segments(cnr_file, work_dir, paired)["seg"]
    std_seg_file = seg_file.replace(".cr.seg", ".seg")
    if not utils.file_uptodate(std_seg_file, seg_file):
        with file_transaction(std_seg_file) as tx_out_file:
            df = pd.read_csv(seg_file, sep="\t", comment="@", header=0,
                             names=["chrom", "loc.start", "loc.end", "num.mark", "seg.mean"])
            df.insert(0, "ID", [dd.get_sample_name(paired.tumor_data)] * len(df))
            df.to_csv(tx_out_file, sep="\t", header=True, index=False)
    std_cnr_file = os.path.join(work_dir, "%s.cnr" % dd.get_sample_name(paired.tumor_data))
    if not utils.file_uptodate(std_cnr_file, cnr_file):
        with file_transaction(std_cnr_file) as tx_out_file:
            logdf = pd.read_csv(cnr_file, sep="\t", comment="@", header=0,
                                names=["chrom", "start", "end", "log2"])
            covdf = pd.read_csv(tz.get_in(["depth", "bins", "antitarget"], paired.tumor_data),
                                sep="\t", header=None,
                                names=["chrom", "start", "end", "orig.name", "depth", "gene"])
            df = pd.merge(logdf, covdf, on=["chrom", "start", "end"])
            del df["orig.name"]
            df = df[["chrom", "start", "end", "gene", "log2", "depth"]]
            df.insert(6, "weight", [1.0] * len(df))
            df.to_csv(tx_out_file, sep="\t", header=True, index=False)
    return std_cnr_file, std_seg_file
Example #20
def resolve(items, run_parallel):
    """Combine aligned and split samples into final set of disambiguated reads.
    """
    out = []
    to_process = collections.defaultdict(list)
    for data in [x[0] for x in items]:
        if "disambiguate" in data:
            split_part = tuple(data["align_split"]) if data.get("combine") else None
            to_process[(dd.get_sample_name(data), split_part)].append(data)
        else:
            out.append([data])
    if len(to_process) > 0:
        dis1 = run_parallel("run_disambiguate",
                            [(xs, xs[0]["config"]) for xs in to_process.values()])
        disambigs_by_name = collections.defaultdict(list)
        print(len(dis1))
        for xs in dis1:
            assert len(xs) == 1
            data = xs[0]
            disambigs_by_name[dd.get_sample_name(data)].append(data)
        dis2 = run_parallel("disambiguate_merge_extras",
                            [(xs, xs[0]["config"]) for xs in disambigs_by_name.values()])
    else:
        dis2 = []
    return out + dis2
Example #21
def _get_vcf_samples(calls, items):
    have_full_file = False
    all_samples = set([])
    sample_matches = False
    for f in utils.flatten(calls):
        if have_full_file:
            cur = set(vcfutils.get_samples(f))
            if cur:
                if not all_samples:
                    all_samples = cur
                else:
                    all_samples &= set(cur)
        else:
            for data in items:
                for i, test_name in enumerate([dd.get_sample_name(data)] + dd.get_batches(data)):
                    # For tumor/normal batches, want to attach germline VCFs to normals
                    # Standard somatics go to tumors
                    if dd.get_phenotype(data) == "normal":
                        test_name += "-germline"
                    if os.path.basename(f).startswith(("%s-" % test_name,
                                                       "%s." % test_name)):
                        # Prefer matches to single samples (gVCF) over joint batches
                        if i == 0:
                            sample_matches = True
                        if sample_matches and i > 0:
                            continue
                        else:
                            all_samples.add(dd.get_sample_name(data))
    return list(all_samples)
Example #22
def _batch_split_by_sv(samples, stage):
    to_process = collections.OrderedDict()
    extras = []
    background = []
    for data in (utils.to_single_data(x) for x in samples):
        ready_data = _handle_multiple_svcallers(data, stage)
        if len(ready_data) > 0:
            background.append(data)
            for x in ready_data:
                svcaller = tz.get_in(["config", "algorithm", "svcaller"], x)
                batch = dd.get_batch(x) or dd.get_sample_name(x)
                if stage in ["ensemble"]:  # no batching for ensemble methods
                    if isinstance(batch, six.string_types) and batch != dd.get_sample_name(x):
                        batch += "_%s" % dd.get_sample_name(x)
                    else:
                        batch = dd.get_sample_name(x)
                    if dd.get_phenotype(x) == "germline":
                        batch += "_germline"
                elif svcaller in _GLOBAL_BATCHING:  # All samples batched together for analyses
                    batch = "all"
                batches = batch if isinstance(batch, (list, tuple)) else [batch]
                for b in batches:
                    try:
                        to_process[(svcaller, b)].append(x)
                    except KeyError:
                        to_process[(svcaller, b)] = [x]
        else:
            extras.append([data])
    return to_process, extras, background
Example #23
def _bcftools_stats(data, out_dir):
    """Run bcftools stats.
    """
    vcinfo = _get_active_vcinfo(data)
    if vcinfo:
        out_dir = utils.safe_makedir(os.path.join(out_dir, "bcftools_stats"))
        vcf_file = vcinfo["vrn_file"]
        if tz.get_in(("config", "algorithm", "jointcaller"), data):
            opts = ""
        else:
            opts = "-f PASS"
        name = dd.get_sample_name(data)
        out_file = os.path.join(out_dir, "%s.txt" % name)
        bcftools = config_utils.get_program("bcftools", data["config"])
        if not utils.file_exists(out_file):
            with file_transaction(data, out_file) as tx_out_file:
                orig_out_file = os.path.join(os.path.dirname(tx_out_file), "orig_%s" % os.path.basename(tx_out_file))
                cmd = ("{bcftools} stats -s {name} {opts} {vcf_file} > {orig_out_file}")
                do.run(cmd.format(**locals()), "bcftools stats %s" % dd.get_sample_name(data))
                with open(orig_out_file) as in_handle:
                    with open(tx_out_file, "w") as out_handle:
                        for line in in_handle:
                            if line.startswith("ID\t"):
                                parts = line.split("\t")
                                parts[-1] = "%s\n" % name
                                line = "\t".join(parts)
                            out_handle.write(line)
        return out_file
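
The rewrite loop above replaces the last tab-separated field of the "ID" line in the bcftools stats output with the sample name. A minimal illustration of that string edit (the input line is made up):

line = "ID\t0\tproject-joint.vcf.gz\n"
name = "sample1"
parts = line.split("\t")
parts[-1] = "%s\n" % name
print("\t".join(parts), end="")  # prints "ID\t0\tsample1"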
Example #24
def calculate_sv_bins(*items):
    """Determine bin sizes and regions to use for samples.

    Unified approach to prepare regional bins for coverage calculations across
    multiple CNV callers. Splits into target and antitarget regions allowing
    callers to take advantage of both. Provides consistent target/anti-target
    bin sizes across batches.

    Uses callable_regions as the access BED file and mosdepth regions in
    variant_regions to estimate depth for bin sizes.
    """
    calcfns = {"cnvkit": _calculate_sv_bins_cnvkit, "gatk-cnv": _calculate_sv_bins_gatk}
    from bcbio.structural import cnvkit
    items = [utils.to_single_data(x) for x in cwlutils.handle_combined_input(items)]
    if all(not cnvkit.use_general_sv_bins(x) for x in items):
        return [[d] for d in items]
    out = []
    for i, cnv_group in enumerate(_group_by_cnv_method(multi.group_by_batch(items, False))):
        size_calc_fn = MemoizedSizes(cnv_group.region_file, cnv_group.items).get_target_antitarget_bin_sizes
        for data in cnv_group.items:
            if cnvkit.use_general_sv_bins(data):
                target_bed, anti_bed, gcannotated_tsv = calcfns[cnvkit.bin_approach(data)](data, cnv_group,
                                                                                           size_calc_fn)
                if not data.get("regions"):
                    data["regions"] = {}
                data["regions"]["bins"] = {"target": target_bed, "antitarget": anti_bed, "group": str(i),
                                           "gcannotated": gcannotated_tsv}
            out.append([data])
    if not len(out) == len(items):
        raise AssertionError("Inconsistent samples in and out of SV bin calculation:\nout: %s\nin : %s" %
                             (sorted([dd.get_sample_name(utils.to_single_data(x)) for x in out]),
                              sorted([dd.get_sample_name(x) for x in items])))
    return out
Example #25
def run_rnaseq(bam_file, data, out_dir):
    """
    Run qualimap for a rnaseq bam file and parse results
    """
    strandedness = {"firststrand": "strand-specific-reverse",
                    "secondstrand": "strand-specific-forward",
                    "unstranded": "non-strand-specific"}

    # Qualimap results should be saved to a directory named after the sample.
    # MultiQC (for parsing additional data) picks up the sample name from that directory as follows:
    #   <sample name>/raw_data_qualimapReport/insert_size_histogram.txt
    results_dir = os.path.join(out_dir, dd.get_sample_name(data))
    report_file = os.path.join(results_dir, "qualimapReport.html")
    config = data["config"]
    gtf_file = dd.get_gtf_file(data)
    single_end = not bam.is_paired(bam_file)
    library = strandedness[dd.get_strandedness(data)]
    if not utils.file_exists(report_file):
        with file_transaction(data, results_dir) as tx_out_dir:
            utils.safe_makedir(tx_out_dir)
            raw_file = os.path.join(tx_out_dir, "rnaseq_qc_results.txt")
            bam.index(bam_file, config)
            cmd = _rnaseq_qualimap_cmd(data, bam_file, tx_out_dir, gtf_file, single_end, library)
            do.run(cmd, "Qualimap for {}".format(dd.get_sample_name(data)))
            cmd = "sed -i 's/bam file = .*/bam file = %s.bam/' %s" % (dd.get_sample_name(data), raw_file)
            do.run(cmd, "Fix Name Qualimap for {}".format(dd.get_sample_name(data)))
    metrics = _parse_rnaseq_qualimap_metrics(report_file)
    metrics.update(_detect_duplicates(bam_file, results_dir, data))
    metrics.update(_detect_rRNA(data))
    metrics.update({"Average_insert_size": bam.estimate_fragment_size(bam_file)})
    metrics = _parse_metrics(metrics)
    return metrics
Example #26
def summarize_vc(items):
    """CWL target: summarize variant calls and validation for multiple samples.
    """
    items = [utils.to_single_data(x) for x in validate.summarize_grading(items)]
    out = {"validate": items[0]["validate"],
           "variants": {"calls": [], "gvcf": []}}
    added = set([])
    for data in items:
        if data.get("vrn_file"):
            names = dd.get_batches(data)
            if not names:
                names = [dd.get_sample_name(data)]
            batch_name = names[0]
            if data.get("vrn_file_joint") is not None:
                to_add = [("vrn_file", "gvcf", dd.get_sample_name(data)),
                          ("vrn_file_joint", "calls", batch_name)]
            else:
                to_add = [("vrn_file", "calls", batch_name)]
            for vrn_key, out_key, name in to_add:
                cur_name = "%s-%s" % (name, dd.get_variantcaller(data))
                if cur_name not in added:
                    out_file = os.path.join(utils.safe_makedir(os.path.join(dd.get_work_dir(data),
                                                                            "variants", out_key)),
                                            "%s.vcf.gz" % cur_name)
                    added.add(cur_name)
                    # Ideally could symlink here but doesn't appear to work with
                    # Docker container runs on Toil where PATHs don't get remapped
                    utils.copy_plus(os.path.realpath(data[vrn_key]), out_file)
                    vcfutils.bgzip_and_index(out_file, data["config"])
                    out["variants"][out_key].append(out_file)
    return [out]
Example #27
def normalize_sv_coverage(*items):
    """Normalize CNV coverage, providing flexible point for multiple methods.
    """
    calcfns = {"cnvkit": _normalize_sv_coverage_cnvkit, "gatk-cnv": _normalize_sv_coverage_gatk}
    from bcbio.structural import cnvkit
    from bcbio.structural import shared as sshared
    items = [utils.to_single_data(x) for x in cwlutils.handle_combined_input(items)]
    if all(not cnvkit.use_general_sv_bins(x) for x in items):
        return [[d] for d in items]
    out_files = {}
    back_files = {}
    for group_id, gitems in itertools.groupby(items, lambda x: tz.get_in(["regions", "bins", "group"], x)):
        # No CNVkit calling for this particular set of samples
        if group_id is None:
            continue
        inputs, backgrounds = sshared.find_case_control(list(gitems))
        assert inputs, "Did not find inputs for sample batch: %s" % (" ".join(dd.get_sample_name(x) for x in items))
        work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(inputs[0]), "structural",
                                                    dd.get_sample_name(inputs[0]), "bins"))
        back_files, out_files = calcfns[cnvkit.bin_approach(inputs[0])](group_id, inputs, backgrounds, work_dir,
                                                                        back_files, out_files)
    out = []
    for data in items:
        if dd.get_sample_name(data) in out_files:
            data["depth"]["bins"]["background"] = back_files[dd.get_sample_name(data)]
            data["depth"]["bins"]["normalized"] = out_files[dd.get_sample_name(data)]
        out.append([data])
    return out
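
Note that itertools.groupby only merges consecutive items sharing a key, so the grouping above assumes the samples arrive already ordered by their bin group. A minimal illustration of the difference:

import itertools

items = [{"group": "1"}, {"group": "2"}, {"group": "1"}]
keyfn = lambda x: x["group"]
as_is = [k for k, _ in itertools.groupby(items, keyfn)]                          # ['1', '2', '1']
presorted = [k for k, _ in itertools.groupby(sorted(items, key=keyfn), keyfn)]   # ['1', '2']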
Example #28
def calculate(bam_file, data):
    """Calculate coverage in parallel using samtools depth through goleft.

    samtools depth removes duplicates and secondary reads from the counts:
    if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue;
    """
    params = {"window_size": 5000, "parallel_window_size": 1e5, "min": dd.get_coverage_depth_min(data),
              "high_multiplier": 20}
    prefix = os.path.join(
        utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align", dd.get_sample_name(data))),
        "%s-coverage" % (dd.get_sample_name(data)))
    depth_file = prefix + ".depth.bed"
    callable_file = prefix + ".callable.bed"
    variant_regions = dd.get_variant_regions_merged(data)
    variant_regions_avg_cov = get_average_coverage(data, bam_file, variant_regions, "variant_regions")
    if not utils.file_uptodate(callable_file, bam_file):
        cmd = ["goleft", "depth", "--q", "1", "--mincov", str(params["min"]),
               "--processes", str(dd.get_num_cores(data)), "--ordered"]
        max_depth = _get_max_depth(variant_regions_avg_cov, params, data)
        if max_depth:
            cmd += ["--maxmeandepth", str(int(max_depth))]
        with file_transaction(data, depth_file) as tx_depth_file:
            with utils.chdir(os.path.dirname(tx_depth_file)):
                tx_callable_file = tx_depth_file.replace(".depth.bed", ".callable.bed")
                prefix = tx_depth_file.replace(".depth.bed", "")
                bam_ref_file = "%s-bamref.fa" % utils.splitext_plus(bam_file)[0]
                bam.fai_from_bam(dd.get_ref_file(data), bam_file, bam_ref_file + ".fai", data)
                cmd += ["--reference", bam_ref_file]
                cmd += ["--prefix", prefix, bam_file]
                bcbio_env = utils.get_bcbio_env()
                msg = "Calculate coverage: %s" % dd.get_sample_name(data)
                do.run(cmd, msg, env=bcbio_env)
                shutil.move(tx_callable_file, callable_file)
    final_callable = _subset_to_variant_regions(callable_file, variant_regions, data)
    return depth_file, final_callable, _extract_highdepth(final_callable, data), variant_regions_avg_cov
Example #29
def merge_extras(items, config):
    """Merge extra disambiguated reads into a final BAM file.
    """
    final = {}
    for extra_name in items[0]["disambiguate"].keys():
        items_by_name = collections.defaultdict(list)
        for data in items:
            items_by_name[dd.get_sample_name(data)].append(data)
        for sname, name_items in items_by_name.items():
            if sname not in final:
                final[sname] = {}
            in_files = []
            for data in name_items:
                in_files.append(data["disambiguate"][extra_name])
            out_file = "%s-allmerged%s" % os.path.splitext(in_files[0])
            if in_files[0].endswith(".bam"):
                merged_file = merge.merge_bam_files(in_files, os.path.dirname(out_file), config,
                                                    out_file=out_file)
            else:
                assert extra_name == "summary", extra_name
                merged_file = _merge_summary(in_files, out_file, name_items[0])
            final[sname][extra_name] = merged_file
    out = []
    for data in items:
        data["disambiguate"] = final[dd.get_sample_name(data)]
        out.append([data])
    return out
Example #30
def count(data):
    """
    count reads mapping to genes using featureCounts
    http://subread.sourceforge.net
    """
    in_bam = dd.get_work_bam(data)
    sorted_bam = bam.sort(in_bam, dd.get_config(data), order="queryname")
    gtf_file = dd.get_gtf_file(data)
    work_dir = dd.get_work_dir(data)
    out_dir = os.path.join(work_dir, "htseq-count")
    safe_makedir(out_dir)
    count_file = os.path.join(out_dir, dd.get_sample_name(data)) + ".counts"
    summary_file = os.path.join(out_dir, dd.get_sample_name(data)) + ".counts.summary"
    if file_exists(count_file):
        return count_file

    featureCounts = config_utils.get_program("featureCounts", dd.get_config(data))
    paired_flag = _paired_flag(in_bam)
    strand_flag = _strand_flag(data)

    filtered_bam = bam.filter_primary(sorted_bam, data)

    cmd = "{featureCounts} -a {gtf_file} -o {tx_count_file} -s {strand_flag} " "{paired_flag} {filtered_bam}"

    message = "Count reads in {tx_count_file} mapping to {gtf_file} using " "featureCounts"
    with file_transaction(data, [count_file, summary_file]) as tx_files:
        tx_count_file, tx_summary_file = tx_files
        do.run(cmd.format(**locals()), message.format(**locals()))
    fixed_count_file = _format_count_file(count_file, data)
    fixed_summary_file = _change_sample_name(summary_file, dd.get_sample_name(data), data=data)
    shutil.move(fixed_count_file, count_file)
    shutil.move(fixed_summary_file, summary_file)

    return count_file
Example #31
def _run_vardict_caller(align_bams,
                        items,
                        ref_file,
                        assoc_files,
                        region=None,
                        out_file=None):
    """Detect SNPs and indels with VarDict.

    var2vcf_valid uses -A flag which reports all alleles and improves sensitivity:
    https://github.com/AstraZeneca-NGS/VarDict/issues/35#issuecomment-276738191
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            vrs = bedutils.population_variant_regions(items)
            target = shared.subset_variant_regions(vrs,
                                                   region,
                                                   out_file,
                                                   items=items,
                                                   do_merge=False)
            num_bams = len(align_bams)
            sample_vcf_names = [
            ]  # for individual sample names, given batch calling may be required
            for bamfile, item in zip(align_bams, items):
                # prepare commands
                sample = dd.get_sample_name(item)
                vardict = get_vardict_command(items[0])
                strandbias = "teststrandbias.R"
                var2vcf = "var2vcf_valid.pl"
                opts, var2vcf_opts = _vardict_options_from_config(
                    items, config, out_file, target)
                vcfstreamsort = config_utils.get_program(
                    "vcfstreamsort", config)
                compress_cmd = "| bgzip -c" if tx_out_file.endswith(
                    "gz") else ""
                freq = float(
                    utils.get_in(config, ("algorithm", "min_allele_fraction"),
                                 10)) / 100.0
                fix_ambig_ref = vcfutils.fix_ambiguous_cl()
                fix_ambig_alt = vcfutils.fix_ambiguous_cl(5)
                remove_dup = vcfutils.remove_dup_cl()
                py_cl = os.path.join(utils.get_bcbio_bin(), "py")
                jvm_opts = _get_jvm_opts(items[0], tx_out_file)
                setup = ("%s && unset JAVA_HOME &&" % utils.get_R_exports())
                contig_cl = vcfutils.add_contig_to_header_cl(
                    ref_file, tx_out_file)
                cmd = (
                    "{setup}{jvm_opts}{vardict} -G {ref_file} -f {freq} "
                    "-N {sample} -b {bamfile} {opts} "
                    "| {strandbias}"
                    "| {var2vcf} -A -N {sample} -E -f {freq} {var2vcf_opts} "
                    "| {contig_cl} | bcftools filter -i 'QUAL >= 0' "
                    "| {fix_ambig_ref} | {fix_ambig_alt} | {remove_dup} | {vcfstreamsort} {compress_cmd}"
                )
                if num_bams > 1:
                    temp_file_prefix = out_file.replace(".gz", "").replace(
                        ".vcf", "") + item["name"][1]
                    tmp_out = temp_file_prefix + ".temp.vcf"
                    tmp_out += ".gz" if out_file.endswith("gz") else ""
                    sample_vcf_names.append(tmp_out)
                    with file_transaction(item, tmp_out) as tx_tmp_file:
                        if not _is_bed_file(target):
                            vcfutils.write_empty_vcf(tx_tmp_file,
                                                     config,
                                                     samples=[sample])
                        else:
                            cmd += " > {tx_tmp_file}"
                            do.run(cmd.format(**locals()),
                                   "Genotyping with VarDict: Inference", {})
                else:
                    if not _is_bed_file(target):
                        vcfutils.write_empty_vcf(tx_out_file,
                                                 config,
                                                 samples=[sample])
                    else:
                        cmd += " > {tx_out_file}"
                        do.run(cmd.format(**locals()),
                               "Genotyping with VarDict: Inference", {})
            if num_bams > 1:
                # N.B. merge_variant_files wants region in 1-based end-inclusive
                # coordinates. Thus use bamprep.region_to_gatk
                vcfutils.merge_variant_files(
                    orig_files=sample_vcf_names,
                    out_file=tx_out_file,
                    ref_file=ref_file,
                    config=config,
                    region=bamprep.region_to_gatk(region))
    return out_file
Example #32
def compare_to_rm(data):
    """Compare final variant calls against reference materials of known calls.
    """
    if isinstance(data, (list, tuple)):
        data = _normalize_cwl_inputs(data)
    toval_data = _get_validate(data)
    toval_data = cwlutils.unpack_tarballs(toval_data, toval_data)
    if toval_data:
        caller = _get_caller(toval_data)
        sample = dd.get_sample_name(toval_data)
        base_dir = utils.safe_makedir(
            os.path.join(toval_data["dirs"]["work"], "validate", sample,
                         caller))

        if isinstance(toval_data["vrn_file"], (list, tuple)):
            raise NotImplementedError(
                "Multiple input files for validation: %s" %
                toval_data["vrn_file"])
        else:
            vrn_file = os.path.abspath(toval_data["vrn_file"])
        rm_file = normalize_input_path(
            toval_data["config"]["algorithm"]["validate"], toval_data)
        rm_interval_file = _gunzip(
            normalize_input_path(
                toval_data["config"]["algorithm"].get("validate_regions"),
                toval_data), toval_data)
        rm_interval_file = bedutils.clean_file(
            rm_interval_file,
            toval_data,
            prefix="validateregions-",
            bedprep_dir=utils.safe_makedir(os.path.join(base_dir, "bedprep")))
        rm_file = naming.handle_synonyms(rm_file, dd.get_ref_file(toval_data),
                                         data.get("genome_build"), base_dir,
                                         data)
        rm_interval_file = (naming.handle_synonyms(
            rm_interval_file, dd.get_ref_file(toval_data),
            data.get("genome_build"), base_dir, data)
                            if rm_interval_file else None)
        vmethod = tz.get_in(["config", "algorithm", "validate_method"], data,
                            "rtg")
        if not vcfutils.vcf_has_variants(vrn_file):
            # RTG can fail on totally empty files. Skip these since we have nothing.
            pass
        # empty validation file, every call is a false positive
        elif not vcfutils.vcf_has_variants(rm_file):
            eval_files = _setup_call_fps(vrn_file, rm_interval_file, base_dir,
                                         toval_data)
            data["validate"] = _rtg_add_summary_file(eval_files, base_dir,
                                                     toval_data)
        elif vmethod == "rtg":
            eval_files = _run_rtg_eval(vrn_file, rm_file, rm_interval_file,
                                       base_dir, toval_data)
            eval_files = _annotate_validations(eval_files, toval_data)
            data["validate"] = _rtg_add_summary_file(eval_files, base_dir,
                                                     toval_data)
        elif vmethod == "hap.py":
            data["validate"] = _run_happy_eval(vrn_file, rm_file,
                                               rm_interval_file, base_dir,
                                               toval_data)
        elif vmethod == "bcbio.variation":
            data["validate"] = _run_bcbio_variation(vrn_file, rm_file,
                                                    rm_interval_file, base_dir,
                                                    sample, caller, toval_data)
    return [[data]]
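
compare_to_rm only dispatches to rtg, hap.py or bcbio.variation when both the call set and the truth set actually contain variants; otherwise it either skips validation or treats every call as a false positive. A minimal standalone sketch of that guard, assuming pysam is available (the pipeline's own helper is vcfutils.vcf_has_variants):

# Hedged sketch of the empty-VCF guard used above; this standalone version
# assumes pysam is installed and the file is a readable VCF/VCF.gz.
import pysam

def vcf_has_variants(vcf_file):
    """Return True as soon as one record is found, False for an empty VCF."""
    with pysam.VariantFile(vcf_file) as vcf:
        for _ in vcf:
            return True
    return False
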
Ejemplo n.º 33
0
def run(items):
    """Perform detection of structural variations with lumpy, using bwa-mem alignment.
    """
    if not all(
            utils.get_in(data, ("config", "algorithm",
                                "aligner")) in ["bwa", False, None]
            for data in items):
        raise ValueError(
            "Require bwa-mem alignment input for lumpy structural variation detection"
        )
    paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items)
    work_dir = _sv_workdir(
        paired.tumor_data if paired and paired.tumor_data else items[0])
    previous_evidence = {}
    full_bams, sr_bams, disc_bams = [], [], []
    for data in items:
        sr_bam, disc_bam = sshared.get_split_discordants(data, work_dir)
        full_bams.append(dd.get_align_bam(data))
        sr_bams.append(sr_bam)
        disc_bams.append(disc_bam)
        cur_dels, cur_dups = _bedpes_from_cnv_caller(data, work_dir)
        previous_evidence[dd.get_sample_name(data)] = {}
        if cur_dels and utils.file_exists(cur_dels):
            previous_evidence[dd.get_sample_name(data)]["dels"] = cur_dels
        if cur_dups and utils.file_exists(cur_dups):
            previous_evidence[dd.get_sample_name(data)]["dups"] = cur_dups
    lumpy_vcf, exclude_file = _run_lumpy(full_bams, sr_bams, disc_bams,
                                         previous_evidence, work_dir, items)
    gt_vcfs = {}
    for data in items:
        sample = dd.get_sample_name(data)
        sr_bam, _ = sshared.get_split_discordants(data, work_dir)
        sample_vcf = vcfutils.select_sample(
            lumpy_vcf, sample, utils.append_stem(lumpy_vcf, "-%s" % sample),
            data["config"])
        if "bnd-genotype" in dd.get_tools_on(data):
            gt_vcf = _run_svtyper(sample_vcf, dd.get_align_bam(data), sr_bam,
                                  exclude_file, data)
        else:
            std_vcf, bnd_vcf = _split_breakends(sample_vcf, data)
            std_gt_vcf = _run_svtyper(std_vcf, dd.get_align_bam(data), sr_bam,
                                      exclude_file, data)
            gt_vcf = vcfutils.concat_variant_files_bcftools(
                orig_files=[std_gt_vcf, bnd_vcf],
                out_file="%s-combined.vcf.gz" %
                utils.splitext_plus(std_gt_vcf)[0],
                config=data["config"])
        gt_vcfs[dd.get_sample_name(data)] = _filter_by_support(gt_vcf, data)
    if paired and paired.normal_name:
        gt_vcfs = _filter_by_background([paired.tumor_name],
                                        [paired.normal_name], gt_vcfs,
                                        paired.tumor_data)
    out = []
    for data in items:
        if "sv" not in data:
            data["sv"] = []
        vcf_file = gt_vcfs[dd.get_sample_name(data)]
        if dd.get_svprioritize(data):
            effects_vcf, _ = effects.add_to_vcf(vcf_file, data, "snpeff")
        else:
            effects_vcf = None
        data["sv"].append({
            "variantcaller": "lumpy",
            "vrn_file": effects_vcf or vcf_file,
            "exclude_file": exclude_file
        })
        out.append(data)
    return out
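
When "bnd-genotype" is not enabled, the run above genotypes only the non-breakend records with svtyper and concatenates the untouched BND calls back in afterwards. A hedged, standalone sketch of that split, assuming pysam and plain VCF output paths (the pipeline uses its own _split_breakends helper):

# Illustrative only: separate BND records from other structural variant calls.
import pysam

def split_breakends(in_vcf, std_out="std.vcf", bnd_out="bnd.vcf"):
    """Write BND records to one VCF and everything else to another."""
    with pysam.VariantFile(in_vcf) as inp, \
         pysam.VariantFile(std_out, "w", header=inp.header) as std, \
         pysam.VariantFile(bnd_out, "w", header=inp.header) as bnd:
        for rec in inp:
            (bnd if rec.info.get("SVTYPE") == "BND" else std).write(rec)
    return std_out, bnd_out
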
Ejemplo n.º 34
0
def _sv_workdir(data):
    return utils.safe_makedir(
        os.path.join(data["dirs"]["work"], "structural",
                     dd.get_sample_name(data), "lumpy"))
Ejemplo n.º 35
0
def _gatk_base_recalibrator(broad_runner, dup_align_bam, ref_file, platform,
                            dbsnp_file, intervals, data):
    """Step 1 of GATK recalibration process, producing table of covariates.

    For GATK 4 we use local multicore spark runs:
    https://github.com/broadinstitute/gatk/issues/2345

    For GATK3, Large whole genome BAM files take an excessively long time to recalibrate and
    the extra inputs don't help much beyond a certain point. See the 'Downsampling analysis'
    plots in the GATK documentation:

    http://gatkforums.broadinstitute.org/discussion/44/base-quality-score-recalibrator#latest

    This identifies large files and calculates the fraction to downsample to.
    """
    target_counts = 1e8  # 100 million reads per read group, 20x the plotted max
    out_file = os.path.join(
        dd.get_work_dir(data), "align", dd.get_sample_name(data),
        "%s-recal.grp" %
        utils.splitext_plus(os.path.basename(dup_align_bam))[0])
    if not utils.file_exists(out_file):
        if has_aligned_reads(dup_align_bam, intervals):
            with file_transaction(data, out_file) as tx_out_file:
                gatk_type = broad_runner.gatk_type()
                assert gatk_type in ["restricted", "gatk4"], \
                    "Require full version of GATK 2.4+ or GATK4 for BQSR"
                params = ["-I", dup_align_bam]
                cores = dd.get_num_cores(data)
                if gatk_type == "gatk4":
                    params += [
                        "-T", "BaseRecalibratorSpark", "--sparkMaster",
                        "local[%s]" % cores, "--output", tx_out_file,
                        "--reference",
                        dd.get_ref_twobit(data)
                    ]
                else:
                    params += [
                        "-T", "BaseRecalibrator", "-o", tx_out_file, "-R",
                        ref_file
                    ]
                    downsample_pct = bam.get_downsample_pct(
                        dup_align_bam, target_counts, data)
                    if downsample_pct:
                        params += [
                            "--downsample_to_fraction",
                            str(downsample_pct), "--downsampling_type",
                            "ALL_READS"
                        ]
                    if platform.lower() == "solid":
                        params += [
                            "--solid_nocall_strategy", "PURGE_READ",
                            "--solid_recal_mode", "SET_Q_ZERO_BASE_N"
                        ]
                if dbsnp_file:
                    params += ["--knownSites", dbsnp_file]
                if intervals:
                    params += [
                        "-L", intervals, "--interval_set_rule", "INTERSECTION"
                    ]
                memscale = {
                    "magnitude": 0.9 * cores,
                    "direction": "increase"
                } if cores > 1 else None
                broad_runner.run_gatk(params,
                                      os.path.dirname(tx_out_file),
                                      memscale=memscale)
        else:
            with open(out_file, "w") as out_handle:
                out_handle.write("# No aligned reads")
    return out_file
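
For the GATK3 branch, the docstring's point is that very large BAMs get downsampled to roughly target_counts reads before recalibration. A simplified, standalone take on the calculation that bam.get_downsample_pct is used for above, assuming pysam and a coordinate-sorted, indexed BAM:

# Hedged sketch: fraction to downsample to, or None if the BAM is already small enough.
import pysam

def downsample_fraction(bam_file, target_counts=1e8):
    with pysam.AlignmentFile(bam_file, "rb") as bam:
        mapped = sum(stat.mapped for stat in bam.get_index_statistics())
    return target_counts / float(mapped) if mapped > target_counts else None
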
Ejemplo n.º 36
0
def by_original_order(xs):
    return min([sample_order.index(dd.get_sample_name(x)) for x in xs])
Ejemplo n.º 37
0
def run_peddy(samples, out_dir=None):
    vcf_file = None
    for d in samples:
        vcinfo = variant.get_active_vcinfo(d, use_ensemble=False)
        if vcinfo and vcinfo.get("vrn_file") and utils.file_exists(
                vcinfo["vrn_file"]):
            if vcinfo["vrn_file"] and dd.get_sample_name(
                    d) in vcfutils.get_samples(vcinfo["vrn_file"]):
                vcf_file = vcinfo["vrn_file"]
                break
    data = samples[0]
    peddy = config_utils.get_program("peddy",
                                     data) if config_utils.program_installed(
                                         "peddy", data) else None
    if not peddy or not vcf_file or not is_human(data):
        logger.info(
            "peddy is not installed, the genome is not human, or the sample VCFs "
            "don't match; skipping correspondence checking for %s." % vcf_file)
        return samples
    batch = dd.get_batch(data) or dd.get_sample_name(data)
    if out_dir:
        peddy_dir = safe_makedir(out_dir)
    else:
        peddy_dir = safe_makedir(
            os.path.join(dd.get_work_dir(data), "qc", batch, "peddy"))
    ped_file = create_ped_file(samples, vcf_file, out_dir=out_dir)
    peddy_prefix = os.path.join(peddy_dir, batch)
    peddy_report = peddy_prefix + ".html"
    peddyfiles = expected_peddy_files(peddy_report, batch)
    if file_exists(peddy_report):
        return dd.set_in_samples(samples, dd.set_summary_qc, peddyfiles)
    if file_exists(peddy_prefix + "-failed.log"):
        return samples
    num_cores = dd.get_num_cores(data)

    with tx_tmpdir(data) as tx_dir:
        peddy_prefix_tx = os.path.join(tx_dir, os.path.basename(peddy_prefix))
        # Redirect stderr because cyvcf2 is incredibly noisy with "no intervals found" messages
        stderr_log = os.path.join(tx_dir, "run-stderr.log")
        sites_str = "--sites hg38" if dd.get_genome_build(
            data) == "hg38" else ""
        cmd = (
            "{peddy} -p {num_cores} {sites_str} --plot --prefix {peddy_prefix_tx} "
            "{vcf_file} {ped_file} 2> {stderr_log}")
        message = "Running peddy on {vcf_file} against {ped_file}."
        try:
            do.run(cmd.format(**locals()), message.format(**locals()))
        except:
            to_show = collections.deque(maxlen=100)
            with open(stderr_log) as in_handle:
                for line in in_handle:
                    to_show.append(line)

            def allowed_errors(l):
                return ((l.find("IndexError") >= 0
                         and l.find("is out of bounds for axis") >= 0) or
                        (l.find("n_components=") >= 0
                         and l.find("must be between 1 and n_features=") >= 0))

            def all_line_errors(l):
                return (l.find("no intervals found for") >= 0)

            if any([allowed_errors(l) for l in to_show]) or all(
                [all_line_errors(l) for l in to_show]):
                logger.info(
                    "Skipping peddy because no variants overlap with checks: %s"
                    % batch)
                with open(peddy_prefix + "-failed.log", "w") as out_handle:
                    out_handle.write(
                        "peddy did not find overlaps with 1kg sites in VCF, skipping"
                    )
                return samples
            else:
                logger.warning("".join(to_show))
                raise
        for ext in PEDDY_OUT_EXTENSIONS:
            if os.path.exists(peddy_prefix_tx + ext):
                shutil.move(peddy_prefix_tx + ext, peddy_prefix + ext)
    return dd.set_in_samples(samples, dd.set_summary_qc, peddyfiles)
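
The error handling above keeps only the last 100 lines of peddy's stderr before deciding whether the failure is an allowed "no overlapping sites" case. That bounded-buffer pattern is just collections.deque with maxlen; a standalone sketch:

# Illustrative helper: keep only the tail of a potentially huge log file.
import collections

def tail_lines(path, n=100):
    """Return the last n lines of the file at path."""
    with open(path) as handle:
        return list(collections.deque(handle, maxlen=n))
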
Ejemplo n.º 38
0
def run(_, data, out_dir):
    stats_file = os.path.join(utils.safe_makedir(out_dir),
                              "%s_umi_stats.yaml" % dd.get_sample_name(data))
    if not utils.file_uptodate(stats_file, dd.get_align_bam(data)):
        out = {}
        total = 0
        mapped = 0
        duplicates = 0
        umi_reductions = []
        umi_counts = collections.defaultdict(int)
        with pysam.AlignmentFile(data["umi_bam"], "rb",
                                 check_sq=False) as bam_iter:
            cur_counts = collections.defaultdict(int)
            cur_key = None
            for rec in bam_iter:
                total += 1
                umi = rec.get_tag("RX")
                if umi and not rec.is_unmapped:
                    mapped += 1
                    if rec.is_duplicate:
                        duplicates += 1
                    chrom = bam_iter.getrname(rec.reference_id)
                    pos = rec.reference_start
                    key = (chrom, pos)
                    if key != cur_key:
                        # update counts
                        if cur_counts:
                            for c in cur_counts.values():
                                umi_counts[c] += 1
                            total_seqs = sum(cur_counts.values())
                            umi_count = len(cur_counts)
                            umi_reductions.append(
                                float(total_seqs) / umi_count)
                        # update current keys
                        cur_key = key
                        cur_counts = collections.defaultdict(int)
                    cur_counts[umi] += 1
            if cur_counts:
                for c in cur_counts.values():
                    umi_counts[c] += 1
                total_seqs = sum(cur_counts.values())
                umi_count = len(cur_counts)
                umi_reductions.append(float(total_seqs) / umi_count)
        consensus_count = sum(
            [x.aligned for x in bam.idxstats(dd.get_align_bam(data), data)])
        out["umi_baseline_all"] = total
        out["umi_baseline_mapped"] = mapped
        out["umi_baseline_duplicate_pct"] = float(duplicates) / float(
            mapped) * 100.0
        out["umi_consensus_mapped"] = consensus_count
        out["umi_consensus_pct"] = (
            100.0 - float(consensus_count) / float(mapped) * 100.0)
        out["umi_reduction_median"] = int(math.ceil(np.median(umi_reductions)))
        out["umi_reduction_max"] = int(max(umi_reductions))
        out["umi_counts"] = dict(umi_counts)
        with open(stats_file, "w") as out_handle:
            yaml.safe_dump({dd.get_sample_name(data): out},
                           out_handle,
                           default_flow_style=False,
                           allow_unicode=False)
    return stats_file
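
The per-position bookkeeping above can be hard to follow inline. A hedged re-statement of the reduction calculation on pre-extracted (chrom, pos, umi) tuples, assuming the input is position-sorted like a coordinate-sorted BAM:

# Illustrative only: reads-per-unique-UMI ratios, one per covered position.
import collections

def umi_reductions(records):
    """records: iterable of (chrom, pos, umi) tuples, sorted by position."""
    reductions = []
    counts = collections.defaultdict(int)
    cur_key = None
    for chrom, pos, umi in records:
        key = (chrom, pos)
        if key != cur_key:
            if counts:
                reductions.append(sum(counts.values()) / float(len(counts)))
                counts = collections.defaultdict(int)
            cur_key = key
        counts[umi] += 1
    if counts:
        reductions.append(sum(counts.values()) / float(len(counts)))
    return reductions
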
Ejemplo n.º 39
0
def _run_smoove(full_bams, sr_bams, disc_bams, work_dir, items):
    """Run lumpy-sv using smoove.
    """
    data = items[0]
    batch = sshared.get_cur_batch(items)
    ext = "-%s-svs" % batch if batch else "-svs"
    name = "%s%s" % (dd.get_sample_name(items[0]), ext)
    out_file = os.path.join(work_dir, "%s-smoove.genotyped.vcf.gz" % name)
    sv_exclude_bed = sshared.prepare_exclude_file(items, out_file)
    old_out_file = os.path.join(
        work_dir, "%s%s-prep.vcf.gz" %
        (os.path.splitext(os.path.basename(items[0]["align_bam"]))[0], ext))
    if utils.file_exists(old_out_file):
        return old_out_file, sv_exclude_bed
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            cores = dd.get_num_cores(items[0])
            out_dir = os.path.dirname(tx_out_file)
            ref_file = dd.get_ref_file(items[0])
            full_bams = " ".join(
                _prepare_smoove_bams(full_bams, sr_bams, disc_bams, items,
                                     os.path.dirname(tx_out_file)))
            std_excludes = [
                "~^GL", "~^HLA", "~_random", "~^chrUn", "~alt", "~decoy"
            ]

            def _is_std_exclude(n):
                clean_excludes = [
                    x.replace("~", "").replace("^", "") for x in std_excludes
                ]
                return any(
                    [n.startswith(x) or n.endswith(x) for x in clean_excludes])

            exclude_chrs = [
                c.name for c in ref.file_contigs(ref_file)
                if not chromhacks.is_nonalt(c.name)
                and not _is_std_exclude(c.name)
            ]
            exclude_chrs = "--excludechroms '%s'" % ",".join(std_excludes +
                                                             exclude_chrs)
            exclude_bed = (
                "--exclude %s" %
                sv_exclude_bed) if utils.file_exists(sv_exclude_bed) else ""
            tempdir = os.path.dirname(tx_out_file)
            smoove = config_utils.get_program("smoove", data)
            smoovepath = os.path.dirname(smoove)
            cmd = (
                "export TMPDIR={tempdir} && export PATH={smoovepath}:$PATH && "
                "{smoove} call --processes {cores} --genotype --removepr --fasta {ref_file} "
                "--name {name} --outdir {out_dir} "
                "{exclude_bed} {exclude_chrs} {full_bams}")
            with utils.chdir(tempdir):
                try:
                    do.run(cmd.format(**locals()), "smoove lumpy calling",
                           items[0])
                except subprocess.CalledProcessError as msg:
                    if _allowed_errors(msg):
                        vcfutils.write_empty_vcf(
                            tx_out_file,
                            config=items[0]["config"],
                            samples=[dd.get_sample_name(d) for d in items])
                    else:
                        logger.exception()
                        raise
    vcfutils.bgzip_and_index(out_file, items[0]["config"])
    return out_file, sv_exclude_bed
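
The --excludechroms string above combines the fixed patterns with every reference contig that is neither a primary chromosome nor already matched by a standard pattern. A rough standalone approximation, assuming contig names come from the reference index; the real primary-chromosome check is chromhacks.is_nonalt:

# Hedged sketch: contigs to exclude beyond the standard GL/HLA/alt/decoy patterns.
def extra_exclude_contigs(contig_names):
    std = ["GL", "HLA", "_random", "chrUn", "alt", "decoy"]
    primary = {"chr%s" % c for c in list(range(1, 23)) + ["X", "Y", "M", "MT"]}
    primary |= {str(c) for c in list(range(1, 23)) + ["X", "Y", "M", "MT"]}
    def matches_std(name):
        return any(name.startswith(p) or name.endswith(p) for p in std)
    return [c for c in contig_names if c not in primary and not matches_std(c)]
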
Ejemplo n.º 40
0
def _items_to_world(self, items):
    world = {}
    for item in items:
        assert len(item) == 1
        world[dd.get_sample_name(item[0])] = copy.deepcopy(item[0])
    return world
Ejemplo n.º 41
0
def combine_calls(*args):
    """Combine multiple callsets into a final set of merged calls.
    """
    if len(args) == 3:
        is_cwl = False
        batch_id, samples, data = args
        caller_names, vrn_files = _organize_variants(samples, batch_id)
    else:
        is_cwl = True
        samples = [utils.to_single_data(x) for x in args]
        samples = [cwlutils.unpack_tarballs(x, x) for x in samples]
        data = samples[0]
        batch_id = data["batch_id"]
        caller_names = data["variants"]["variantcallers"]
        vrn_files = data["variants"]["calls"]
    logger.info("Ensemble consensus calls for {0}: {1}".format(
        batch_id, ",".join(caller_names)))
    edata = copy.deepcopy(data)
    base_dir = utils.safe_makedir(
        os.path.join(edata["dirs"]["work"], "ensemble", batch_id))
    if any([vcfutils.vcf_has_variants(f) for f in vrn_files]):
        # Decompose multiallelic variants and normalize
        passonly = not tz.get_in(
            ["config", "algorithm", "ensemble", "use_filtered"], edata, False)
        vrn_files = [
            normalize.normalize(f,
                                data,
                                passonly=passonly,
                                rerun_effects=False,
                                remove_oldeffects=True,
                                work_dir=utils.safe_makedir(
                                    os.path.join(base_dir, c)))
            for c, f in zip(caller_names, vrn_files)
        ]
        if "classifiers" not in (dd.get_ensemble(edata) or {}):
            callinfo = _run_ensemble_intersection(batch_id, vrn_files,
                                                  caller_names, base_dir,
                                                  edata)
        else:
            config_file = _write_config_file(batch_id, caller_names, base_dir,
                                             edata)
            callinfo = _run_ensemble(batch_id, vrn_files, config_file,
                                     base_dir, dd.get_ref_file(edata), edata)
            callinfo["vrn_file"] = vcfutils.bgzip_and_index(
                callinfo["vrn_file"], data["config"])
        # After decomposing multiallelic variants and normalizing, re-evaluate effects
        ann_ma_file, _ = effects.add_to_vcf(callinfo["vrn_file"], data)
        if ann_ma_file:
            callinfo["vrn_file"] = ann_ma_file

        edata["config"]["algorithm"]["variantcaller"] = "ensemble"
        edata["vrn_file"] = callinfo["vrn_file"]
        edata["ensemble_bed"] = callinfo["bed_file"]
        callinfo["validate"] = validate.compare_to_rm(edata)[0][0].get(
            "validate")
    else:
        out_vcf_file = os.path.join(base_dir,
                                    "{0}-ensemble.vcf".format(batch_id))
        vcfutils.write_empty_vcf(
            out_vcf_file, samples=[dd.get_sample_name(d) for d in samples])
        callinfo = {
            "variantcaller": "ensemble",
            "vrn_file": vcfutils.bgzip_and_index(out_vcf_file, data["config"]),
            "bed_file": None
        }
    if is_cwl:
        callinfo["batch_samples"] = data["batch_samples"]
        callinfo["batch_id"] = batch_id
        return [{"ensemble": callinfo}]
    else:
        return [[batch_id, callinfo]]
Ejemplo n.º 42
0
def _run_freebayes_paired(align_bams,
                          items,
                          ref_file,
                          assoc_files,
                          region=None,
                          out_file=None):
    """Detect SNPs and indels with FreeBayes for paired tumor/normal samples.

    Sources of options for FreeBayes:
    mailing list: https://groups.google.com/d/msg/freebayes/dTWBtLyM4Vs/HAK_ZhJHguMJ
    mailing list: https://groups.google.com/forum/#!msg/freebayes/LLH7ZfZlVNs/63FdD31rrfEJ
    speedseq: https://github.com/cc2qe/speedseq/blob/e6729aa2589eca4e3a946f398c1a2bdc15a7300d/bin/speedseq#L916
    sga/freebayes: https://github.com/jts/sga-extra/blob/7e28caf71e8107b697f9be7162050e4fa259694b/
                   sga_generate_varcall_makefile.pl#L299
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(
            align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            paired = get_paired_bams(align_bams, items)
            assert paired.normal_bam, "Require normal BAM for FreeBayes paired calling and filtering"

            freebayes = config_utils.get_program("freebayes", config)
            opts, no_target_regions = _freebayes_options_from_config(
                items, config, out_file, region)
            if no_target_regions:
                vcfutils.write_empty_vcf(
                    tx_out_file,
                    config,
                    samples=[
                        x for x in [paired.tumor_name, paired.normal_name] if x
                    ])
            else:
                opts = " ".join(opts)
                opts += " --min-repeat-entropy 1"
                opts += " --no-partial-observations"
                opts = _add_somatic_opts(opts, paired)
                compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
                # For multi-sample outputs, ensure consistent order
                samples = (
                    "-s " +
                    ",".join([dd.get_sample_name(d)
                              for d in items])) if len(items) > 1 else ""
                fix_ambig = vcfutils.fix_ambiguous_cl()
                bcbio_py = sys.executable
                py_cl = os.path.join(os.path.dirname(sys.executable), "py")
                cl = (
                    "{freebayes} -f {ref_file} {opts} "
                    "{paired.tumor_bam} {paired.normal_bam} "
                    """| bcftools filter -i 'ALT="<*>" || QUAL > 5' """
                    """| {bcbio_py} -c 'from bcbio.variation import freebayes; """
                    """freebayes.call_somatic("{paired.tumor_name}", "{paired.normal_name}")' """
                    "| {fix_ambig} | bcftools view {samples} -a - | "
                    "{py_cl} -x 'bcbio.variation.freebayes.remove_missingalt(x)' | "
                    "vcfallelicprimitives -t DECOMPOSED --keep-geno | vcffixup - | vcfstreamsort | "
                    "vt normalize -n -r {ref_file} -q - | vcfuniqalleles | vt uniq - 2> /dev/null "
                    "{compress_cmd} > {tx_out_file}")
                do.run(cl.format(**locals()),
                       "Genotyping paired variants with FreeBayes", {})
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files.get("dbsnp"),
                                               ref_file, config)
    return ann_file
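
Stripped of the bcbio-specific somatic filtering and normalization pipeline, the core call is a single FreeBayes invocation over the tumor and normal BAMs with the two options added above. A minimal sketch with placeholder paths, assuming freebayes and bgzip are on PATH:

# Illustrative only: joint tumor/normal FreeBayes call, bgzipped, no post-filtering.
import subprocess

def freebayes_paired_minimal(ref_file, tumor_bam, normal_bam, out_vcf_gz):
    cmd = ("freebayes -f {ref} --min-repeat-entropy 1 --no-partial-observations "
           "{tumor} {normal} | bgzip -c > {out}").format(
               ref=ref_file, tumor=tumor_bam, normal=normal_bam, out=out_vcf_gz)
    subprocess.check_call(cmd, shell=True)
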
Ejemplo n.º 43
0
def merge_split_alignments(samples, run_parallel):
    """Manage merging split alignments back into a final working BAM file.

    Perform de-duplication on the final merged file.
    """
    ready = []
    file_key = "work_bam"
    to_merge = collections.defaultdict(list)
    for data in (xs[0] for xs in samples):
        if data.get("combine"):
            out_key = tz.get_in(["combine", file_key, "out"], data)
            if not out_key:
                out_key = data["rgnames"]["lane"]
            to_merge[out_key].append(data)
        else:
            ready.append([data])
    ready_merge = []
    hla_merges = []
    for mgroup in to_merge.values():
        cur_data = mgroup[0]
        del cur_data["align_split"]
        for x in mgroup[1:]:
            cur_data["combine"][file_key]["extras"].append(x[file_key])
        ready_merge.append([cur_data])
        cur_hla = None
        for d in mgroup:
            hla_files = tz.get_in(["hla", "fastq"], d)
            if hla_files:
                if not cur_hla:
                    cur_hla = {
                        "rgnames": {
                            "sample": dd.get_sample_name(cur_data)
                        },
                        "config": cur_data["config"],
                        "dirs": cur_data["dirs"],
                        "hla": {
                            "fastq": []
                        }
                    }
                cur_hla["hla"]["fastq"].append(hla_files)
        if cur_hla:
            hla_merges.append([cur_hla])
    if not tz.get_in(["config", "algorithm", "kraken"], data):
        # kraken requires fasta filenames from data['files'] as input.
        # We don't want to remove those files if kraken qc is required.
        _save_fastq_space(samples)
    merged = run_parallel("delayed_bam_merge", ready_merge)
    hla_merge_raw = run_parallel("merge_split_alignments", hla_merges)
    hla_merges = {}
    for hla_merge in [x[0] for x in hla_merge_raw]:
        hla_merges[dd.get_sample_name(hla_merge)] = tz.get_in(["hla", "fastq"],
                                                              hla_merge)

    # Add stable 'align_bam' target to use for retrieving raw alignment
    out = []
    for data in [x[0] for x in merged + ready]:
        if data.get("work_bam"):
            data["align_bam"] = data["work_bam"]
        if dd.get_sample_name(data) in hla_merges:
            data["hla"]["fastq"] = hla_merges[dd.get_sample_name(data)]
        else:
            hla_files = glob.glob(
                os.path.join(dd.get_work_dir(data), "align",
                             dd.get_sample_name(data), "hla", "*.fq"))
            if hla_files:
                data["hla"]["fastq"] = hla_files
        out.append([data])
    return out
Ejemplo n.º 44
0
def _sv_workdir(data):
    return utils.safe_makedir(
        os.path.join(data["dirs"]["work"], "heterogeneity",
                     dd.get_sample_name(data), "theta"))
Ejemplo n.º 45
0
def normalize_sv_coverage(*items):
    """Normalize CNV coverage depths by GC, repeats and background.

    Provides normalized output based on CNVkit approaches and offers a
    hook for adding additional methods in the future:

    - reference: calculates reference backgrounds from normals and pools
      including GC and repeat information
    - fix: Uses background to normalize coverage estimations
    http://cnvkit.readthedocs.io/en/stable/pipeline.html#fix
    """
    from bcbio.structural import cnvkit
    from bcbio.structural import shared as sshared
    items = [
        utils.to_single_data(x) for x in cwlutils.handle_combined_input(items)
    ]
    if all(not cnvkit.use_general_sv_bins(x) for x in items):
        return [[d] for d in items]
    out_files = {}
    back_files = {}
    for group_id, gitems in itertools.groupby(
            items, lambda x: tz.get_in(["regions", "bins", "group"], x)):
        # No CNVkit calling for this particular set of samples
        if group_id is None:
            continue
        inputs, backgrounds = sshared.find_case_control(list(gitems))
        cnns = reduce(operator.add, [[
            tz.get_in(["depth", "bins", "target"], x),
            tz.get_in(["depth", "bins", "antitarget"], x)
        ] for x in backgrounds], [])
        assert inputs, "Did not find inputs for sample batch: %s" % (" ".join(
            dd.get_sample_name(x) for x in items))
        for d in inputs:
            if tz.get_in(["depth", "bins", "target"], d):
                target_bed = tz.get_in(["depth", "bins", "target"], d)
                antitarget_bed = tz.get_in(["depth", "bins", "antitarget"], d)
        work_dir = utils.safe_makedir(
            os.path.join(dd.get_work_dir(inputs[0]), "structural",
                         dd.get_sample_name(inputs[0]), "bins"))
        input_backs = set(
            filter(lambda x: x is not None,
                   [dd.get_background_cnv_reference(d) for d in inputs]))
        if input_backs:
            assert len(
                input_backs
            ) == 1, "Multiple backgrounds in group: %s" % list(input_backs)
            back_file = list(input_backs)[0]
        else:
            back_file = cnvkit.cnvkit_background(
                cnns,
                os.path.join(work_dir,
                             "background-%s-cnvkit.cnn" % (group_id)),
                backgrounds or inputs, target_bed, antitarget_bed)
        fix_cmd_inputs = []
        for data in inputs:
            work_dir = utils.safe_makedir(
                os.path.join(dd.get_work_dir(data), "structural",
                             dd.get_sample_name(data), "bins"))
            if tz.get_in(["depth", "bins", "target"], data):
                fix_file = os.path.join(
                    work_dir, "%s-normalized.cnr" % (dd.get_sample_name(data)))
                fix_cmd_inputs.append(
                    (tz.get_in(["depth", "bins", "target"], data),
                     tz.get_in(["depth", "bins", "antitarget"],
                               data), back_file, fix_file, data))
                out_files[dd.get_sample_name(data)] = fix_file
                back_files[dd.get_sample_name(data)] = back_file
        parallel = {
            "type": "local",
            "cores": dd.get_cores(inputs[0]),
            "progs": ["cnvkit"]
        }
        run_multicore(cnvkit.run_fix_parallel, fix_cmd_inputs,
                      inputs[0]["config"], parallel)

    out = []
    for data in items:
        if dd.get_sample_name(data) in out_files:
            data["depth"]["bins"]["background"] = back_files[
                dd.get_sample_name(data)]
            data["depth"]["bins"]["normalized"] = out_files[dd.get_sample_name(
                data)]
        out.append([data])
    return out
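
normalize_sv_coverage batches samples by their shared bin group before computing one background per group. A standalone sketch of that grouping step; note that itertools.groupby only groups adjacent items, so this version filters ungrouped samples and sorts first (the pipeline above relies on incoming order and skips the None group):

# Illustrative only: yield (group_id, [items]) batches for samples sharing a bin group.
import itertools

def by_bin_group(items, key):
    keyed = [x for x in items if key(x) is not None]  # None means no CNVkit calling
    for group_id, gitems in itertools.groupby(sorted(keyed, key=key), key=key):
        yield group_id, list(gitems)
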
Ejemplo n.º 46
0
def _run_vardict_caller(align_bams,
                        items,
                        ref_file,
                        assoc_files,
                        region=None,
                        out_file=None):
    """Detect SNPs and indels with VarDict.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            vrs = bedutils.population_variant_regions(items)
            target = shared.subset_variant_regions(vrs,
                                                   region,
                                                   out_file,
                                                   do_merge=False)
            num_bams = len(align_bams)
            sample_vcf_names = []  # for individual sample names, given batch calling may be required
            for bamfile, item in itertools.izip(align_bams, items):
                # prepare commands
                sample = dd.get_sample_name(item)
                vardict = get_vardict_command(items[0])
                strandbias = "teststrandbias.R"
                var2vcf = "var2vcf_valid.pl"
                opts = (" ".join(
                    _vardict_options_from_config(items, config, out_file,
                                                 target))
                        if _is_bed_file(target) else "")
                vcfstreamsort = config_utils.get_program(
                    "vcfstreamsort", config)
                compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
                freq = float(
                    utils.get_in(config, ("algorithm", "min_allele_fraction"),
                                 10)) / 100.0
                coverage_interval = utils.get_in(
                    config, ("algorithm", "coverage_interval"), "exome")
                # for deep targeted panels, require 50 reads of supporting coverage
                var2vcf_opts = " -v 50 " if dd.get_avg_coverage(
                    items[0]) > 5000 else ""
                fix_ambig_ref = vcfutils.fix_ambiguous_cl()
                fix_ambig_alt = vcfutils.fix_ambiguous_cl(5)
                remove_dup = vcfutils.remove_dup_cl()
                jvm_opts = _get_jvm_opts(items[0], tx_out_file)
                r_setup = "unset R_HOME && export PATH=%s:$PATH && " % os.path.dirname(
                    utils.Rscript_cmd())
                cmd = (
                    "{r_setup}{jvm_opts}{vardict} -G {ref_file} -f {freq} "
                    "-N {sample} -b {bamfile} {opts} "
                    "| {strandbias}"
                    "| {var2vcf} -N {sample} -E -f {freq} {var2vcf_opts} "
                    "| {fix_ambig_ref} | {fix_ambig_alt} | {remove_dup} | {vcfstreamsort} {compress_cmd}"
                )
                if num_bams > 1:
                    temp_file_prefix = out_file.replace(".gz", "").replace(
                        ".vcf", "") + item["name"][1]
                    tmp_out = temp_file_prefix + ".temp.vcf"
                    tmp_out += ".gz" if out_file.endswith("gz") else ""
                    sample_vcf_names.append(tmp_out)
                    with file_transaction(item, tmp_out) as tx_tmp_file:
                        if not _is_bed_file(target):
                            vcfutils.write_empty_vcf(tx_tmp_file,
                                                     config,
                                                     samples=[sample])
                        else:
                            cmd += " > {tx_tmp_file}"
                            do.run(cmd.format(**locals()),
                                   "Genotyping with VarDict: Inference", {})
                else:
                    if not _is_bed_file(target):
                        vcfutils.write_empty_vcf(tx_out_file,
                                                 config,
                                                 samples=[sample])
                    else:
                        cmd += " > {tx_out_file}"
                        do.run(cmd.format(**locals()),
                               "Genotyping with VarDict: Inference", {})
            if num_bams > 1:
                # N.B. merge_variant_files wants region in 1-based end-inclusive
                # coordinates. Thus use bamprep.region_to_gatk
                vcfutils.merge_variant_files(
                    orig_files=sample_vcf_names,
                    out_file=tx_out_file,
                    ref_file=ref_file,
                    config=config,
                    region=bamprep.region_to_gatk(region))
    out_file = (annotation.add_dbsnp(out_file, assoc_files["dbsnp"], config)
                if assoc_files.get("dbsnp") else out_file)
    return out_file
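
One detail worth pulling out of the command construction above: bcbio stores min_allele_fraction as a percentage, while VarDict's -f option expects a fraction. A small illustration of that conversion (the dict access is simplified relative to utils.get_in):

def vardict_min_freq(config, default_pct=10):
    """Convert bcbio's percentage setting into VarDict's -f fraction."""
    pct = float(config.get("algorithm", {}).get("min_allele_fraction", default_pct))
    return pct / 100.0

# vardict_min_freq({"algorithm": {"min_allele_fraction": 5}})  -> 0.05
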
Ejemplo n.º 47
0
def _run_cnvkit_shared(inputs, backgrounds):
    """Shared functionality to run CNVkit, parallelizing over multiple BAM files.
    """
    work_dir = _sv_workdir(inputs[0])
    raw_work_dir = utils.safe_makedir(os.path.join(work_dir, "raw"))
    background_name = dd.get_sample_name(
        backgrounds[0]) if backgrounds else "flat"
    background_cnn = os.path.join(raw_work_dir,
                                  "%s_background.cnn" % (background_name))
    ckouts = []
    for cur_input in inputs:
        cur_raw_work_dir = utils.safe_makedir(
            os.path.join(_sv_workdir(cur_input), "raw"))
        out_base = _bam_to_outbase(dd.get_align_bam(cur_input),
                                   cur_raw_work_dir)
        ckouts.append({
            "cnr": "%s.cnr" % out_base,
            "cns": "%s.cns" % out_base,
            "back_cnn": background_cnn
        })
    if not utils.file_exists(ckouts[0]["cns"]):
        cov_interval = dd.get_coverage_interval(inputs[0])
        raw_target_bed, access_bed = _get_target_access_files(
            cov_interval, inputs[0], work_dir)
        # bail out if we ended up with no regions
        if not utils.file_exists(raw_target_bed):
            return {}
        raw_target_bed = annotate.add_genes(raw_target_bed, inputs[0])
        parallel = {
            "type": "local",
            "cores": dd.get_cores(inputs[0]),
            "progs": ["cnvkit"]
        }
        pct_coverage = (
            pybedtools.BedTool(raw_target_bed).total_coverage() /
            float(pybedtools.BedTool(access_bed).total_coverage())) * 100.0
        target_bed, antitarget_bed = _cnvkit_targets(raw_target_bed,
                                                     access_bed, cov_interval,
                                                     pct_coverage,
                                                     raw_work_dir, inputs[0])
        split_beds = _split_bed(target_bed, inputs[0]) + _split_bed(
            antitarget_bed, inputs[0])
        samples_to_run = zip(["background"] * len(backgrounds), backgrounds) + \
                         zip(["evaluate"] * len(inputs), inputs)
        split_cnns = run_multicore(_cnvkit_coverage,
                                   [(cdata, bed, itype)
                                    for itype, cdata in samples_to_run
                                    for bed in split_beds],
                                   inputs[0]["config"], parallel)
        raw_coverage_cnns = _merge_coverage(split_cnns, inputs[0])
        coverage_cnns = run_multicore(
            _cnvkit_metrics,
            [(cnns, target_bed, antitarget_bed, cov_interval,
              inputs + backgrounds)
             for cnns in tz.groupby("bam", raw_coverage_cnns).values()],
            inputs[0]["config"], parallel)
        background_cnn = _cnvkit_background(
            _select_background_cnns(coverage_cnns), background_cnn, target_bed,
            antitarget_bed, inputs[0])
        fixed_cnrs = run_multicore(
            _cnvkit_fix,
            [(cnns, background_cnn, inputs + backgrounds)
             for cnns in tz.groupby(
                 "bam", [x for x in coverage_cnns
                         if x["itype"] == "evaluate"]).values()],
            inputs[0]["config"], parallel)
        run_multicore(_cnvkit_segment,
                      [(cnr, cov_interval, data) for cnr, data in fixed_cnrs],
                      inputs[0]["config"], parallel)
    return ckouts
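
The targeted-versus-genome binning decision above hinges on what fraction of the accessible genome the gene-annotated target BED covers. A hedged standalone version of that calculation, using the same pybedtools total_coverage() call as the code above:

# Illustrative only: percentage of accessible bases covered by the target regions.
import pybedtools

def pct_of_access_covered(target_bed, access_bed):
    target_bp = pybedtools.BedTool(target_bed).total_coverage()
    access_bp = pybedtools.BedTool(access_bed).total_coverage()
    return 100.0 * target_bp / float(access_bp)
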
Ejemplo n.º 48
0
def _get_cache_file(data, target_name):
    prefix = os.path.join(
        utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align", dd.get_sample_name(data))),
        "%s-coverage" % (dd.get_sample_name(data)))
    cache_file = prefix + "-" + target_name + "-stats.yaml"
    return cache_file
Ejemplo n.º 49
0
def summary(*samples):
    """Summarize all quality metrics together"""
    samples = list(utils.flatten(samples))
    work_dir = dd.get_work_dir(samples[0])
    multiqc = config_utils.get_program("multiqc", samples[0]["config"])
    if not multiqc:
        logger.debug("multiqc not found. Update bcbio_nextgen.py tools to fix this issue.")
    out_dir = utils.safe_makedir(os.path.join(work_dir, "qc", "multiqc"))
    out_data = os.path.join(out_dir, "multiqc_data")
    out_file = os.path.join(out_dir, "multiqc_report.html")
    file_list = os.path.join(out_dir, "list_files.txt")
    work_samples = cwlutils.unpack_tarballs([utils.deepish_copy(x) for x in samples], samples[0])
    work_samples = _summarize_inputs(work_samples, out_dir)
    if not utils.file_exists(out_file):
        with tx_tmpdir(samples[0], work_dir) as tx_out:
            in_files = _get_input_files(work_samples, out_dir, tx_out)
            in_files += _merge_metrics(work_samples, out_dir)
            if _one_exists(in_files):
                with utils.chdir(out_dir):
                    _create_config_file(out_dir, work_samples)
                    input_list_file = _create_list_file(in_files, file_list)
                    if dd.get_tmp_dir(samples[0]):
                        export_tmp = "export TMPDIR=%s &&" % dd.get_tmp_dir(samples[0])
                    else:
                        export_tmp = ""
                    path_export = utils.local_path_export()
                    other_opts = config_utils.get_resources("multiqc", samples[0]["config"]).get("options", [])
                    other_opts = " ".join([str(x) for x in other_opts])
                    cmd = "{path_export}{export_tmp} {multiqc} -f -l {input_list_file} {other_opts} -o {tx_out}"
                    do.run(cmd.format(**locals()), "Run multiqc")
                    if utils.file_exists(os.path.join(tx_out, "multiqc_report.html")):
                        shutil.move(os.path.join(tx_out, "multiqc_report.html"), out_file)
                        shutil.move(os.path.join(tx_out, "multiqc_data"), out_data)
    samples = _group_by_sample_and_batch(samples)
    if utils.file_exists(out_file) and samples:
        data_files = set()
        for i, data in enumerate(samples):
            data_files.add(os.path.join(out_dir, "report", "metrics", dd.get_sample_name(data) + "_bcbio.txt"))
        data_files.add(os.path.join(out_dir, "report", "metrics", "target_info.yaml"))
        data_files.add(os.path.join(out_dir, "multiqc_config.yaml"))
        [data_files.add(f) for f in glob.glob(os.path.join(out_dir, "multiqc_data", "*"))]
        data_files = [f for f in data_files if f and utils.file_exists(f)]
        if "summary" not in samples[0]:
            samples[0]["summary"] = {}
        samples[0]["summary"]["multiqc"] = {"base": out_file, "secondary": data_files}

        data_json = os.path.join(out_dir, "multiqc_data", "multiqc_data.json")
        data_json_final = _save_uploaded_data_json(samples, data_json, os.path.join(out_dir, "multiqc_data"))
        if data_json_final:
            samples[0]["summary"]["multiqc"]["secondary"].append(data_json_final)

        # Prepare final file list and inputs for downstream usage
        file_list_final = _save_uploaded_file_list(samples, file_list, out_dir)
        if file_list_final:
            samples[0]["summary"]["multiqc"]["secondary"].append(file_list_final)
            if any([cwlutils.is_cwl_run(d) for d in samples]):
                for indir in ["inputs", "report"]:
                    tarball = os.path.join(out_dir, "multiqc-%s.tar.gz" % (indir))
                    if not utils.file_exists(tarball):
                        cmd = ["tar", "-czvpf", tarball, os.path.join(out_dir, indir)]
                        do.run(cmd, "Compress multiqc inputs: %s" % indir)
                    samples[0]["summary"]["multiqc"]["secondary"].append(tarball)

    if any([cwlutils.is_cwl_run(d) for d in samples]):
        samples = _add_versions(samples)

    return [[data] for data in samples]
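
The MultiQC invocation itself reduces to a forced run over an explicit file list written into a temporary output directory. A minimal sketch with placeholder paths, using only the flags already present in the command above:

# Illustrative only: run MultiQC over an explicit file list (-l), forcing overwrite (-f).
import subprocess

def run_multiqc(input_list_file, out_dir):
    subprocess.check_call(["multiqc", "-f", "-l", input_list_file, "-o", out_dir])
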
Ejemplo n.º 50
0
def _sv_workdir(data):
    return utils.safe_makedir(
        os.path.join(dd.get_work_dir(data), "structural",
                     dd.get_sample_name(data), "titancna"))
Ejemplo n.º 51
0
def _cur_workdir(data):
    return utils.safe_makedir(
        os.path.join(data["dirs"]["work"], "heterogeneity",
                     dd.get_sample_name(data), "bubbletree"))
Ejemplo n.º 52
0
def process_alignment(data, alt_input=None):
    """Do an alignment of fastq files, preparing a sorted BAM output file.
    """
    data = cwlutils.normalize_missing(utils.to_single_data(data))
    data = cwlutils.unpack_tarballs(data, data)
    fastq1, fastq2 = dd.get_input_sequence_files(data)
    if alt_input:
        fastq1, fastq2 = alt_input
    config = data["config"]
    aligner = config["algorithm"].get("aligner", None)
    if fastq1 and objectstore.file_exists_or_remote(fastq1) and aligner:
        logger.info("Aligning lane %s with %s aligner" %
                    (data["rgnames"]["lane"], aligner))
        data = align_to_sort_bam(fastq1, fastq2, aligner, data)
        if dd.get_umi_consensus(data):
            data["umi_bam"] = dd.get_work_bam(data)
            if fastq2:
                f1, f2 = postalign.umi_consensus(data)
                del data["config"]["algorithm"]["umi_type"]
                data["config"]["algorithm"]["mark_duplicates"] = False
                data = align_to_sort_bam(f1, f2, aligner, data)
        data = _add_supplemental_bams(data)
    elif fastq1 and objectstore.file_exists_or_remote(
            fastq1) and fastq1.endswith(".bam"):
        sort_method = config["algorithm"].get("bam_sort")
        bamclean = config["algorithm"].get("bam_clean")
        if bamclean is True or bamclean == "picard":
            if sort_method and sort_method != "coordinate":
                raise ValueError(
                    "Cannot specify `bam_clean: picard` with `bam_sort` other than coordinate: %s"
                    % sort_method)
            out_bam = cleanbam.picard_prep(fastq1, data["rgnames"],
                                           dd.get_ref_file(data), data["dirs"],
                                           data)
        elif bamclean == "fixrg":
            out_bam = cleanbam.fixrg(fastq1, data["rgnames"],
                                     dd.get_ref_file(data), data["dirs"], data)
        elif bamclean == "remove_extracontigs":
            out_bam = cleanbam.remove_extracontigs(fastq1, data)
            data["reference"]["fasta"] = bam.ref_file_from_bam(out_bam, data)
        elif sort_method:
            runner = broad.runner_from_path("picard", config)
            out_file = os.path.join(
                data["dirs"]["work"], "{}-sort.bam".format(
                    os.path.splitext(os.path.basename(fastq1))[0]))
            out_bam = runner.run_fn("picard_sort", fastq1, sort_method,
                                    out_file)
        else:
            out_bam = link_bam_file(
                fastq1,
                os.path.join(data["dirs"]["work"], "prealign",
                             data["rgnames"]["sample"]))
        bam.index(out_bam, data["config"])
        bam.check_header(out_bam, data["rgnames"], dd.get_ref_file(data),
                         data["config"])
        dedup_bam = postalign.dedup_bam(out_bam, data)
        bam.index(dedup_bam, data["config"])
        data["work_bam"] = dedup_bam
    elif fastq1 and objectstore.file_exists_or_remote(
            fastq1) and fastq1.endswith(".cram"):
        data["work_bam"] = fastq1
    elif fastq1 is None and "vrn_file" in data:
        data["config"]["algorithm"]["variantcaller"] = False
        data["work_bam"] = None
    elif not fastq1:
        raise ValueError("No 'files' specified for input sample: %s" %
                         dd.get_sample_name(data))
    else:
        raise ValueError(
            "Could not process input file from sample configuration. \n" +
            fastq1 + "\nIs the path to the file correct, or is the file empty?\n" +
            "If it is a fastq file (not pre-aligned BAM or CRAM), "
            "is an aligner specified in the input configuration?")
    if data.get("work_bam"):
        # Add stable 'align_bam' target to use for retrieving raw alignment
        data["align_bam"] = data["work_bam"]
        data = _add_hla_files(data)
    return [[data]]
Ejemplo n.º 53
0
def _sv_workdir(data):
    return os.path.join(data["dirs"]["work"], "structural",
                        dd.get_sample_name(data), "manta")
Ejemplo n.º 54
0
def _get_batches(data):
    batches = dd.get_batch(data) or dd.get_sample_name(data)
    if not isinstance(batches, (list, tuple)):
        batches = [batches]
    return batches
Ejemplo n.º 55
0
def summarize_grading(samples, vkey="validate"):
    """Provide summaries of grading results across all samples.

    Handles both traditional pipelines (validation part of variants) and CWL
    pipelines (validation at top level)
    """
    samples = list(utils.flatten(samples))
    if not _has_grading_info(samples, vkey):
        return [[d] for d in samples]
    validate_dir = utils.safe_makedir(
        os.path.join(samples[0]["dirs"]["work"], vkey))
    header = ["sample", "caller", "variant.type", "category", "value"]
    _summarize_combined(samples, vkey)
    validated, out = _group_validate_samples(
        samples, vkey,
        (["metadata", "validate_batch"], ["metadata", "batch"], ["description"]))
    for vname, vitems in validated.items():
        out_csv = os.path.join(validate_dir, "grading-summary-%s.csv" % vname)
        with open(out_csv, "w") as out_handle:
            writer = csv.writer(out_handle)
            writer.writerow(header)
            plot_data = []
            plot_files = []
            for data in sorted(
                    vitems,
                    key=lambda x: x.get("lane", dd.get_sample_name(x))):
                validations = [
                    variant.get(vkey) for variant in data.get("variants", [])
                    if isinstance(variant, dict)
                ]
                validations = [v for v in validations if v]
                if len(validations) == 0 and vkey in data:
                    validations = [data.get(vkey)]
                for validate in validations:
                    if validate:
                        validate["grading_summary"] = out_csv
                        if validate.get("grading"):
                            for row in _get_validate_plotdata_yaml(
                                    validate["grading"], data):
                                writer.writerow(row)
                                plot_data.append(row)
                        elif validate.get("summary") and not validate.get(
                                "summary") == "None":
                            if isinstance(validate["summary"], (list, tuple)):
                                plot_files.extend(
                                    list(set(validate["summary"])))
                            else:
                                plot_files.append(validate["summary"])
        if plot_files:
            plots = validateplot.classifyplot_from_plotfiles(
                plot_files, out_csv)
        elif plot_data:
            plots = validateplot.create(plot_data, header, 0, data["config"],
                                        os.path.splitext(out_csv)[0])
        else:
            plots = []
        for data in vitems:
            if data.get(vkey):
                data[vkey]["grading_plots"] = plots
            for variant in data.get("variants", []):
                if isinstance(variant, dict) and variant.get(vkey):
                    variant[vkey]["grading_plots"] = plots
            out.append([data])
    return out
Ejemplo n.º 56
0
def umi_transform(data):
    """
    Transform each read by identifying its barcode and UMI and putting
    the information in the read name.
    """
    fqfiles = data["files"]
    fqfiles.extend(list(repeat("", 4-len(fqfiles))))
    fq1, fq2, fq3, fq4 = fqfiles
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    safe_makedir(umi_dir)
    transform = dd.get_umi_type(data)
    if not transform:
        logger.info("No UMI transform specified, assuming pre-transformed data.")
        if is_transformed(fq1):
            logger.info("%s detected as pre-transformed, passing it on unchanged." % fq1)
            data["files"] = [fq1]
            return [[data]]
        else:
            logger.error("No UMI transform was specified, but %s does not look "
                         "pre-transformed." % fq1)
            sys.exit(1)

    if file_exists(transform):
        transform_file = transform
    else:
        transform_file = get_transform_file(transform)
        if not file_exists(transform_file):
            logger.error(
                "The UMI transform can be specified as either a file or a "
                "bcbio-supported transform. Either the file %s does not exist "
                "or the transform is not supported by bcbio. Supported "
                "transforms are %s."
                %(dd.get_umi_type(data), ", ".join(SUPPORTED_TRANSFORMS)))
            sys.exit(1)
    out_base = dd.get_sample_name(data) + ".umitransformed.fq.gz"
    out_file = os.path.join(umi_dir, out_base)
    if file_exists(out_file):
        data["files"] = [out_file]
        return [[data]]
    cellular_barcodes = get_cellular_barcodes(data)
    if len(cellular_barcodes) > 1:
        split_option = "--separate_cb"
    else:
        split_option = ""
    umis = config_utils.get_program("umis", data, default="umis")
    cores = dd.get_num_cores(data)
    # skip transformation if the file already looks transformed
    with open_fastq(fq1) as in_handle:
        read = in_handle.next()
        if "UMI_" in read:
            data["files"] = [out_file]
            return [[data]]

    cmd = ("{umis} fastqtransform {split_option} {transform_file} "
           "--cores {cores} "
           "{fq1} {fq2} {fq3} {fq4}"
           "| seqtk seq -L 20 - | gzip > {tx_out_file}")
    message = ("Inserting UMI and barcode information into the read name of %s"
               % fq1)
    with file_transaction(out_file) as tx_out_file:
        do.run(cmd.format(**locals()), message)
    data["files"] = [out_file]
    return [[data]]
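
umi_transform short-circuits when the first read name already carries inserted UMI information. A standalone sketch of that peek, assuming a gzipped fastq (the pipeline uses its open_fastq helper above instead):

# Illustrative only: check whether the first read name already contains UMI information.
import gzip

def looks_transformed(fastq_gz):
    with gzip.open(fastq_gz, "rt") as handle:
        first_name = handle.readline()
    return "UMI_" in first_name
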
Ejemplo n.º 57
0
def _get_sample_and_caller(data):
    return [
        tz.get_in(["metadata", "validate_sample"], data)
        or dd.get_sample_name(data),
        _get_caller_supplement(_get_caller(data), data)
    ]
Ejemplo n.º 58
0
def _run_cnvkit_shared(inputs, backgrounds):
    """Shared functionality to run CNVkit, parallelizing over multiple BAM files.
    """
    work_dir = _sv_workdir(inputs[0])
    raw_work_dir = utils.safe_makedir(os.path.join(work_dir, "raw"))
    background_name = dd.get_sample_name(
        backgrounds[0]) if backgrounds else "flat"
    background_cnn = os.path.join(raw_work_dir,
                                  "%s_background.cnn" % (background_name))
    ckouts = []
    for cur_input in inputs:
        cur_raw_work_dir = utils.safe_makedir(
            os.path.join(_sv_workdir(cur_input), "raw"))
        out_base, out_base_old = _bam_to_outbase(dd.get_align_bam(cur_input),
                                                 cur_raw_work_dir, cur_input)
        if utils.file_exists(out_base_old + ".cns"):
            out_base = out_base_old
        ckouts.append({
            "cnr": "%s.cnr" % out_base,
            "cns": "%s.cns" % out_base,
            "back_cnn": background_cnn
        })
    if not utils.file_exists(ckouts[0]["cns"]):
        cov_interval = dd.get_coverage_interval(inputs[0])
        samples_to_run = zip(["background"] * len(backgrounds), backgrounds) + \
                        zip(["evaluate"] * len(inputs), inputs)
        # New style shared SV bins
        if tz.get_in(["depth", "bins", "target"], inputs[0]):
            target_bed = tz.get_in(["depth", "bins", "target"], inputs[0])
            antitarget_bed = tz.get_in(["depth", "bins", "antitarget"],
                                       inputs[0])
            raw_coverage_cnns = reduce(operator.add, [
                _get_general_coverage(cdata, itype)
                for itype, cdata in samples_to_run
            ])
        # Back compatible with pre-existing runs
        else:
            target_bed, antitarget_bed = _get_original_targets(inputs[0])
            raw_coverage_cnns = reduce(operator.add, [
                _get_original_coverage(cdata, itype)
                for itype, cdata in samples_to_run
            ])
        # Currently metrics not calculated due to speed and needing re-evaluation
        # We could re-enable with larger truth sets to evaluate background noise
        # But want to reimplement in a more general fashion as part of normalization
        if False:
            coverage_cnns = reduce(operator.add, [
                _cnvkit_metrics(cnns, target_bed, antitarget_bed, cov_interval,
                                inputs + backgrounds)
                for cnns in tz.groupby("bam", raw_coverage_cnns).values()
            ])
            background_cnn = _cnvkit_background(
                _select_background_cnns(coverage_cnns), background_cnn,
                target_bed, antitarget_bed, inputs[0])
        else:
            coverage_cnns = raw_coverage_cnns
            background_cnn = _cnvkit_background([
                x["file"] for x in coverage_cnns if x["itype"] == "background"
            ], background_cnn, target_bed, antitarget_bed, inputs[0])
        parallel = {
            "type": "local",
            "cores": dd.get_cores(inputs[0]),
            "progs": ["cnvkit"]
        }
        fixed_cnrs = run_multicore(
            _cnvkit_fix,
            [(cnns, background_cnn, inputs, ckouts) for cnns in tz.groupby(
                "bam", [x for x in coverage_cnns
                        if x["itype"] == "evaluate"]).values()],
            inputs[0]["config"], parallel)
        for cnr, data in fixed_cnrs:
            _cnvkit_segment(cnr, cov_interval, data, inputs + backgrounds)
    return ckouts
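
_run_cnvkit_shared drives CNVkit through bcbio's multicore helpers, but the underlying flow is the standard CNVkit sequence of coverage, reference, fix and segment steps. The sketch below is a minimal stand-alone illustration assuming cnvkit.py is on the PATH and that target/antitarget BED files already exist; the file names are placeholders and none of bcbio's parallelization, flat-background handling or back-compatibility logic is reproduced.

import subprocess

def run(cmd):
    """Echo and run one CNVkit subcommand (illustrative helper)."""
    print(" ".join(cmd))
    subprocess.check_call(cmd)

# Per-sample read depth over pre-built target/antitarget bins
run(["cnvkit.py", "coverage", "tumor.bam", "targets.bed", "-o", "tumor.targetcoverage.cnn"])
run(["cnvkit.py", "coverage", "tumor.bam", "antitargets.bed", "-o", "tumor.antitargetcoverage.cnn"])

# Build a background reference from normals (or a flat reference when none exist),
# then normalize the sample against it and segment the copy-number ratios
run(["cnvkit.py", "reference", "normal.targetcoverage.cnn", "normal.antitargetcoverage.cnn",
     "-f", "genome.fa", "-o", "background.cnn"])
run(["cnvkit.py", "fix", "tumor.targetcoverage.cnn", "tumor.antitargetcoverage.cnn",
     "background.cnn", "-o", "tumor.cnr"])
run(["cnvkit.py", "segment", "tumor.cnr", "-o", "tumor.cns"])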
Example No. 59
def _maybe_add_salmon_files(algorithm, sample, out):
    salmon_dir = os.path.join(dd.get_work_dir(sample), "salmon",
                              dd.get_sample_name(sample), "quant")
    if os.path.exists(salmon_dir):
        out.append({"path": salmon_dir, "type": "directory", "ext": "salmon"})
    return out
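
The helper above only registers the Salmon quant directory for upload when it exists. A stand-alone equivalent, using plain os.path in place of bcbio's dd accessors (the directory layout is an assumption), looks like this:

import os

def maybe_add_salmon_files(work_dir, sample_name, out):
    """Append the Salmon quant directory to the upload list if present (sketch)."""
    salmon_dir = os.path.join(work_dir, "salmon", sample_name, "quant")
    if os.path.exists(salmon_dir):
        out.append({"path": salmon_dir, "type": "directory", "ext": "salmon"})
    return out

# Usage: start from whatever upload items have already been collected
uploads = maybe_add_salmon_files("/path/to/work", "sample1", [])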
Example No. 60
def _run_rtg_eval(vrn_file, rm_file, rm_interval_file, base_dir, data,
                  validate_method):
    """Run evaluation of a caller against the truth set using rtg vcfeval.
    """
    out_dir = os.path.join(base_dir, "rtg")
    if not utils.file_exists(os.path.join(out_dir, "done")):
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        vrn_file, rm_file, interval_bed = _prepare_inputs(
            vrn_file, rm_file, rm_interval_file, base_dir, data)

        rtg_ref = tz.get_in(["reference", "rtg"], data)
        if isinstance(rtg_ref, dict) and "base" in rtg_ref:
            rtg_ref = os.path.dirname(rtg_ref["base"])
        assert rtg_ref and os.path.exists(rtg_ref), (
            "Did not find rtg indexed reference file for validation:\n%s\n"
            "Run bcbio_nextgen.py upgrade --data --aligners rtg" % rtg_ref)
        # handle CWL where we have a reference to a single file in the RTG directory
        if os.path.isfile(rtg_ref):
            rtg_ref = os.path.dirname(rtg_ref)

        # get core and memory usage from standard configuration
        threads = min(dd.get_num_cores(data), 6)
        resources = config_utils.get_resources("rtg", data["config"])
        memory = config_utils.adjust_opts(
            resources.get("jvm_opts", ["-Xms500m", "-Xmx1500m"]), {
                "algorithm": {
                    "memory_adjust": {
                        "magnitude": threads,
                        "direction": "increase"
                    }
                }
            })
        jvm_stack = [x for x in memory if x.startswith("-Xms")]
        jvm_mem = [x for x in memory if x.startswith("-Xmx")]
        jvm_stack = jvm_stack[0] if len(jvm_stack) > 0 else "-Xms500m"
        jvm_mem = jvm_mem[0].replace("-Xmx", "") if len(jvm_mem) > 0 else "3g"
        cmd = [
            "rtg", "vcfeval", "--threads",
            str(threads), "-b", rm_file, "--bed-regions", interval_bed, "-c",
            vrn_file, "-t", rtg_ref, "-o", out_dir
        ]
        if validate_method == "rtg-squash-ploidy":
            cmd += ["--squash-ploidy"]
        rm_samples = vcfutils.get_samples(rm_file)
        if len(rm_samples) > 1 and dd.get_sample_name(data) in rm_samples:
            cmd += ["--sample=%s" % dd.get_sample_name(data)]
        cmd += [
            "--vcf-score-field='%s'" % (_pick_best_quality_score(vrn_file))
        ]
        mem_export = "%s export RTG_JAVA_OPTS='%s' && export RTG_MEM=%s" % (
            utils.local_path_export(), jvm_stack, jvm_mem)
        cmd = mem_export + " && " + " ".join(cmd)
        do.run(cmd, "Validate calls using rtg vcfeval", data)
    out = {
        "fp": os.path.join(out_dir, "fp.vcf.gz"),
        "fn": os.path.join(out_dir, "fn.vcf.gz")
    }
    tp_calls = os.path.join(out_dir, "tp.vcf.gz")
    tp_baseline = os.path.join(out_dir, "tp-baseline.vcf.gz")
    if os.path.exists(tp_baseline):
        out["tp"] = tp_baseline
        out["tp-calls"] = tp_calls
    else:
        out["tp"] = tp_calls
    return out
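
The returned dictionary points at the bgzipped true-positive, false-positive and false-negative VCFs written by rtg vcfeval (rtg also writes its own summary.txt with these figures). As a small follow-on sketch, precision and recall can be recovered by counting records in those files; the counting approach below is an illustration, not part of bcbio:

import gzip

def count_vcf_records(vcf_gz):
    """Count non-header records in a bgzipped VCF (BGZF is readable as gzip)."""
    with gzip.open(vcf_gz, "rt") as handle:
        return sum(1 for line in handle if line.strip() and not line.startswith("#"))

def precision_recall(eval_out):
    """Derive precision and recall from the tp/fp/fn files returned by _run_rtg_eval."""
    tp = count_vcf_records(eval_out["tp"])
    fp = count_vcf_records(eval_out["fp"])
    fn = count_vcf_records(eval_out["fn"])
    precision = tp / float(tp + fp) if tp + fp else 0.0
    recall = tp / float(tp + fn) if tp + fn else 0.0
    return precision, recall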