Example #1
def generate_transcript_counts(data):
    """Generate counts per transcript and per exon from an alignment"""
    data["count_file"] = featureCounts.count(data)

    if dd.get_fusion_mode(data, False):
        oncofuse_file = oncofuse.run(data)
        if oncofuse_file:
            data = dd.set_oncofuse_file(data, oncofuse_file)

    if dd.get_transcriptome_align(data):
        # To create a disambiguated transcriptome file, realign with bowtie2
        if dd.get_disambiguate(data):
            logger.info("Aligning to the transcriptome with bowtie2 using the "
                        "disambiguated reads.")
            bam_path = data["work_bam"]
            fastq_paths = alignprep._bgzip_from_bam(bam_path, data["dirs"], data["config"], is_retry=False, output_infix='-transcriptome')
            if len(fastq_paths) == 2:
                file1, file2 = fastq_paths
            else:
                file1, file2 = fastq_paths[0], None
            ref_file = dd.get_ref_file(data)
            data = bowtie2.align_transcriptome(file1, file2, ref_file, data)
        else:
            file1, file2 = dd.get_input_sequence_files(data)
        if not dd.get_transcriptome_bam(data):
            ref_file = dd.get_ref_file(data)
            logger.info("Transcriptome alignment was flagged to run, but the "
                        "transcriptome BAM file was not found. Aligning to the "
                        "transcriptome with bowtie2.")
            data = bowtie2.align_transcriptome(file1, file2, ref_file, data)
    return [[data]]
Example #2
def postprocess_variants(items):
    """Provide post-processing of variant calls: filtering and effects annotation.
    """
    data, items = _get_batch_representative(items, "vrn_file")
    cur_name = "%s, %s" % (dd.get_sample_name(data), get_variantcaller(data))
    logger.info("Finalizing variant calls: %s" % cur_name)
    orig_vrn_file = data.get("vrn_file")
    data = _symlink_to_workdir(data, ["vrn_file"])
    data = _symlink_to_workdir(data, ["config", "algorithm", "variant_regions"])
    if data.get("align_bam") and data.get("vrn_file"):
        logger.info("Calculating variation effects for %s" % cur_name)
        ann_vrn_file, vrn_stats = effects.add_to_vcf(data["vrn_file"], data)
        if ann_vrn_file:
            data["vrn_file"] = ann_vrn_file
        if vrn_stats:
            data["vrn_stats"] = vrn_stats
        orig_items = _get_orig_items(items)
        logger.info("Annotate VCF file: %s" % cur_name)
        data["vrn_file"] = annotation.finalize_vcf(data["vrn_file"], get_variantcaller(data), orig_items)
        logger.info("Filtering for %s" % cur_name)
        data["vrn_file"] = variant_filtration(data["vrn_file"], dd.get_ref_file(data),
                                              tz.get_in(("genome_resources", "variation"), data, {}),
                                              data, orig_items)
        logger.info("Prioritization for %s" % cur_name)
        data["vrn_file"] = prioritize.handle_vcf_calls(data["vrn_file"], data, orig_items)
        logger.info("Germline extraction for %s" % cur_name)
        data = germline.extract(data, orig_items)

        data = damage.run_filter(data["vrn_file"], dd.get_align_bam(data), dd.get_ref_file(data),
                                 data, orig_items)
    if orig_vrn_file and os.path.samefile(data["vrn_file"], orig_vrn_file):
        data["vrn_file"] = orig_vrn_file
    return [[data]]
Example #3
def gatk_rnaseq_calling(data):
    """Use GATK to perform gVCF variant calling on RNA-seq data
    """
    from bcbio.bam import callable
    data = utils.deepish_copy(data)
    tools_on = dd.get_tools_on(data)
    if not tools_on:
        tools_on = []
    tools_on.append("gvcf")
    data = dd.set_tools_on(data, tools_on)
    data = dd.set_jointcaller(data, ["%s-joint" % v for v in dd.get_variantcaller(data)])
    out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data),
                                              "variation", "rnaseq", "gatk-haplotype"))
    data = _setup_variant_regions(data, out_dir)
    out_file = os.path.join(out_dir, "%s-gatk-haplotype.vcf.gz" % dd.get_sample_name(data))
    if not utils.file_exists(out_file):
        region_files = []
        regions = []
        for cur_region in callable.get_split_regions(dd.get_variant_regions(data), data):
            str_region = "_".join([str(x) for x in cur_region])
            region_file = os.path.join(utils.safe_makedir(os.path.join(dd.get_work_dir(data),
                                                                    "variation", "rnaseq", "gatk-haplotype",
                                                                    "regions")),
                                    "%s-%s-gatk-haplotype.vcf.gz" % (dd.get_sample_name(data), str_region))
            region_file = gatk.haplotype_caller([dd.get_split_bam(data)], [data], dd.get_ref_file(data), {},
                                                region=cur_region, out_file=region_file)
            region_files.append(region_file)
            regions.append(cur_region)
        out_file = vcfutils.concat_variant_files(region_files, out_file, regions,
                                                 dd.get_ref_file(data), data["config"])
    return dd.set_vrn_file(data, out_file)
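
The per-region naming in the loop above is easier to see in isolation. A small standalone sketch with an illustrative sample name and region tuple (both made up here):

import os

sample = "Test1"                     # illustrative sample name
cur_region = ("chr1", 0, 2000000)    # (contig, start, end) region tuple
str_region = "_".join([str(x) for x in cur_region])
region_file = os.path.join("regions",
                           "%s-%s-gatk-haplotype.vcf.gz" % (sample, str_region))
print(region_file)   # regions/Test1-chr1_0_2000000-gatk-haplotype.vcf.gz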
Example #4
def get_noalt_contigs(data):
    """Retrieve contigs without alternatives as defined in bwa *.alts files.

    If no alt files present (when we're not aligning with bwa), work around
    with standard set of alts based on hg38 -- anything with HLA, _alt or
    _decoy in the name.
    """
    alts = set([])
    alt_files = [
        f for f in tz.get_in(["reference", "bwa", "indexes"], data, [])
        if f.endswith("alt")
    ]
    if alt_files:
        for alt_file in alt_files:
            with open(alt_file) as in_handle:
                for line in in_handle:
                    if not line.startswith("@"):
                        alts.add(line.split()[0].strip())
    else:
        for contig in ref.file_contigs(dd.get_ref_file(data)):
            if ("_alt" in contig.name or "_decoy" in contig.name
                    or contig.name.startswith("HLA-") or ":" in contig.name):
                alts.add(contig.name)
    return [
        c for c in ref.file_contigs(dd.get_ref_file(data))
        if c.name not in alts
    ]
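
The bwa *.alt files read above are SAM-formatted: header lines start with "@" and each remaining line names an alt contig in its first column. A minimal, self-contained sketch of that parsing against an in-memory record (the record itself is illustrative):

import io

def parse_alt_contigs(handle):
    """Collect alt contig names from the first column of non-header *.alt lines."""
    alts = set()
    for line in handle:
        if not line.startswith("@"):
            alts.add(line.split()[0].strip())
    return alts

example_alt = io.StringIO("@SQ\tSN:chr6\tLN:170805979\n"
                          "chr6_GL000250v2_alt\t0\tchr6\t1\t60\t100M\t*\t0\t0\t*\t*\n")
print(parse_alt_contigs(example_alt))   # {'chr6_GL000250v2_alt'}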
Example #5
def run_vcfanno(vcf, anno_type, data):
    """
    Annotate a VCF file using vcfanno, looking up the proper config/lua scripts
    under the `vcfanno` key in the algorithm section of the datadict and
    skipping if the files cannot be found.
    """
    UNSUPPORTED_TYPE_MESSAGE = (
        "{anno_type} is not a supported vcf annotation type with vcfanno. "
        "Supported types are {SUPPORTED_ANNOTATION_TYPES}")
    if anno_type not in SUPPORTED_ANNOTATION_TYPES:
        logger.warn(UNSUPPORTED_TYPE_MESSAGE.format(**locals()))
        return vcf
    build = dd.get_genome_build(data)
    annodir = os.path.dirname(dd.get_ref_file(data))
    annodir = os.path.abspath(os.path.join(annodir, os.pardir, "vcfanno"))
    annostem = os.path.join(annodir, build + "-")
    conffn = annostem + anno_type + ".conf"
    luafn = annostem + anno_type + ".lua"
    CONF_NOT_FOUND = (
        "The vcfanno configuration {conffn} was not found for {build}, skipping.")
    if not utils.file_exists(conffn):
        logger.warn(CONF_NOT_FOUND.format(**locals()))
        return vcf

    base = os.path.splitext(vcf)[0]
    out_file = base + anno_type + "-annotated.vcf.gz"
    if utils.file_exists(out_file):
        return out_file
    basepath = annodir

    out_file = vcfanno(vcf, out_file, conffn, data, basepath, luafn)
    return out_file
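
The config lookup above derives everything from the reference file location, assuming a vcfanno/ directory sits next to the reference's seq/ directory. A standalone sketch of the path construction with an illustrative reference path:

import os

ref_file = "/genomes/Hsapiens/hg38/seq/hg38.fa"   # illustrative reference location
build = "hg38"
anno_type = "gemini"

annodir = os.path.abspath(os.path.join(os.path.dirname(ref_file), os.pardir, "vcfanno"))
annostem = os.path.join(annodir, build + "-")
conffn = annostem + anno_type + ".conf"   # /genomes/Hsapiens/hg38/vcfanno/hg38-gemini.conf
luafn = annostem + anno_type + ".lua"     # /genomes/Hsapiens/hg38/vcfanno/hg38-gemini.lua
print(conffn, luafn)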
Example #6
def compare_to_rm(data):
    """Compare final variant calls against reference materials of known calls.
    """
    toval_data = _get_validate(data)
    if toval_data:
        if isinstance(toval_data["vrn_file"], (list, tuple)):
            raise NotImplementedError("Multiple input files for validation: %s" % toval_data["vrn_file"])
        else:
            vrn_file = os.path.abspath(toval_data["vrn_file"])
        rm_file = normalize_input_path(toval_data["config"]["algorithm"]["validate"], toval_data)
        rm_interval_file = _gunzip(normalize_input_path(toval_data["config"]["algorithm"].get("validate_regions"),
                                                        toval_data),
                                   toval_data)
        caller = _get_caller(toval_data)
        sample = dd.get_sample_name(toval_data)
        base_dir = utils.safe_makedir(os.path.join(toval_data["dirs"]["work"], "validate", sample, caller))
        rm_file = naming.handle_synonyms(rm_file, dd.get_ref_file(data), data["genome_build"], base_dir, data)
        rm_interval_file = (naming.handle_synonyms(rm_interval_file, dd.get_ref_file(data),
                                                   data["genome_build"], base_dir, data)
                            if rm_interval_file else None)
        vmethod = tz.get_in(["config", "algorithm", "validate_method"], data, "rtg")
        if vmethod == "rtg":
            eval_files = _run_rtg_eval(vrn_file, rm_file, rm_interval_file, base_dir, toval_data)
            data["validate"] = _rtg_add_summary_file(eval_files, base_dir, toval_data)
        elif vmethod == "bcbio.variation":
            data["validate"] = _run_bcbio_variation(vrn_file, rm_file, rm_interval_file, base_dir,
                                                    sample, caller, toval_data)
    return [[data]]
Example #7
def get_analysis_intervals(data, vrn_file, base_dir):
    """Retrieve analysis regions for the current variant calling pipeline.
    """
    from bcbio.bam import callable
    if vrn_file and vcfutils.is_gvcf_file(vrn_file):
        callable_bed = _callable_from_gvcf(data, vrn_file, base_dir)
        if callable_bed:
            return callable_bed

    if data.get("ensemble_bed"):
        return data["ensemble_bed"]
    elif dd.get_sample_callable(data):
        return dd.get_sample_callable(data)
    elif data.get("align_bam"):
        return callable.sample_callable_bed(data["align_bam"], dd.get_ref_file(data), data)[0]
    elif data.get("work_bam"):
        return callable.sample_callable_bed(data["work_bam"], dd.get_ref_file(data), data)[0]
    elif data.get("work_bam_callable"):
        data = utils.deepish_copy(data)
        data["work_bam"] = data.pop("work_bam_callable")
        return callable.sample_callable_bed(data["work_bam"], dd.get_ref_file(data), data)[0]
    elif tz.get_in(["config", "algorithm", "callable_regions"], data):
        return tz.get_in(["config", "algorithm", "callable_regions"], data)
    elif tz.get_in(["config", "algorithm", "variant_regions"], data):
        return tz.get_in(["config", "algorithm", "variant_regions"], data)
Example #8
def run_cluster(*data):
    """
    Run seqcluster cluster to detect smallRNA clusters
    """
    sample = data[0][0]
    work_dir = dd.get_work_dir(sample)
    out_dir = op.join(work_dir, "seqcluster", "cluster")
    out_dir = op.abspath(safe_makedir(out_dir))
    prepare_dir = op.join(work_dir, "seqcluster", "prepare")
    bam_file = op.join(work_dir, "align", "seqs.bam")
    cluster_dir = _cluster(bam_file, prepare_dir, out_dir, dd.get_ref_file(sample), dd.get_srna_gtf_file(sample))
    sample["report"] = _report(sample, dd.get_ref_file(sample))
    sample["seqcluster"] = out_dir

    out_mirna = _make_isomir_counts(data, out_dir=op.join(work_dir, "mirbase"))
    if out_mirna:
        sample = dd.set_mirna_counts(sample, out_mirna[0])
        sample = dd.set_isomir_counts(sample, out_mirna[1])

    out_novel = _make_isomir_counts(data, "seqbuster_novel", op.join(work_dir, "mirdeep2"), "_novel")
    novel_db = mirdeep.run(data)
    if out_novel:
        sample = dd.set_novel_mirna_counts(sample, out_novel[0])
        sample = dd.set_novel_isomir_counts(sample, out_novel[1])
    data[0][0] = sample
    return data
Example #9
def run_region(data, region, vrn_files, out_file):
    """Perform variant calling on gVCF inputs in a specific genomic region.
    """
    broad_runner = broad.runner_from_config(data["config"])
    if broad_runner.gatk_type() == "gatk4":
        genomics_db = _run_genomicsdb_import(vrn_files, region, out_file, data)
        return _run_genotype_gvcfs_genomicsdb(genomics_db, region, out_file, data)
    else:
        vrn_files = _batch_gvcfs(data, region, vrn_files, dd.get_ref_file(data), out_file)
        return _run_genotype_gvcfs_gatk3(data, region, vrn_files, dd.get_ref_file(data), out_file)
Example #10
def process_alignment(data, alt_input=None):
    """Do an alignment of fastq files, preparing a sorted BAM output file.
    """
    data = utils.to_single_data(data)
    fastq1, fastq2 = dd.get_input_sequence_files(data)
    if alt_input:
        fastq1, fastq2 = alt_input
    config = data["config"]
    aligner = config["algorithm"].get("aligner", None)
    if fastq1 and objectstore.file_exists_or_remote(fastq1) and aligner:
        logger.info("Aligning lane %s with %s aligner" % (data["rgnames"]["lane"], aligner))
        data = align_to_sort_bam(fastq1, fastq2, aligner, data)
        data = _add_supplemental_bams(data)
    elif fastq1 and objectstore.file_exists_or_remote(fastq1) and fastq1.endswith(".bam"):
        sort_method = config["algorithm"].get("bam_sort")
        bamclean = config["algorithm"].get("bam_clean")
        if bamclean is True or bamclean == "picard":
            if sort_method and sort_method != "coordinate":
                raise ValueError("Cannot specify `bam_clean: picard` with `bam_sort` other than coordinate: %s"
                                 % sort_method)
            out_bam = cleanbam.picard_prep(fastq1, data["rgnames"], dd.get_ref_file(data), data["dirs"],
                                           data)
        elif bamclean == "fixrg":
            out_bam = cleanbam.fixrg(fastq1, data["rgnames"], dd.get_ref_file(data), data["dirs"], data)
        elif sort_method:
            runner = broad.runner_from_path("picard", config)
            out_file = os.path.join(data["dirs"]["work"], "{}-sort.bam".format(
                os.path.splitext(os.path.basename(fastq1))[0]))
            out_bam = runner.run_fn("picard_sort", fastq1, sort_method, out_file)
        else:
            out_bam = link_bam_file(fastq1, os.path.join(data["dirs"]["work"], "prealign",
                                                         data["rgnames"]["sample"]))
        bam.index(out_bam, data["config"])
        bam.check_header(out_bam, data["rgnames"], dd.get_ref_file(data), data["config"])
        dedup_bam = postalign.dedup_bam(out_bam, data)
        bam.index(dedup_bam, data["config"])
        data["work_bam"] = dedup_bam
    elif fastq1 and objectstore.file_exists_or_remote(fastq1) and fastq1.endswith(".cram"):
        data["work_bam"] = fastq1
    elif fastq1 is None and "vrn_file" in data:
        data["config"]["algorithm"]["variantcaller"] = False
        data["work_bam"] = None
    elif not fastq1:
        raise ValueError("No 'files' specified for input sample: %s" % dd.get_sample_name(data))
    else:
        raise ValueError("Could not process input file from sample configuration. \n" +
                         fastq1 +
                         "\nIs the path to the file correct or is empty?\n" +
                         "If it is a fastq file (not pre-aligned BAM or CRAM), "
                         "is an aligner specified in the input configuration?")
    if data.get("work_bam"):
        # Add stable 'align_bam' target to use for retrieving raw alignment
        data["align_bam"] = data["work_bam"]
        data = _add_hla_files(data)
    return [[data]]
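
The branch order above decides how each input is handled. This is a condensed sketch of that dispatch, omitting the objectstore existence checks and the BAM preparation details; the classification labels are my own:

def classify_input(fastq1, aligner, has_vrn_file=False):
    """Mirror the branch order above: aligner first, then BAM/CRAM, then variants-only."""
    if fastq1 and aligner:
        return "align"
    if fastq1 and fastq1.endswith(".bam"):
        return "prepare-bam"
    if fastq1 and fastq1.endswith(".cram"):
        return "use-cram"
    if fastq1 is None and has_vrn_file:
        return "variants-only"
    raise ValueError("Could not process input file: %s" % fastq1)

print(classify_input("sample_R1.fastq.gz", "bwa"))   # align
print(classify_input("sample.bam", None))            # prepare-bam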
Example #11
def run_cluster(*data):
    """
    Run seqcluster cluster to detect smallRNA clusters
    """
    work_dir = dd.get_work_dir(data[0][0])
    out_dir = os.path.join(work_dir, "seqcluster", "cluster")
    out_dir = os.path.abspath(safe_makedir(out_dir))
    prepare_dir = op.join(work_dir, "seqcluster", "prepare")
    bam_file = op.join(work_dir, "align", "seqs.bam")
    cluster_dir = _cluster(bam_file, prepare_dir, out_dir, dd.get_ref_file(data[0][0]), dd.get_srna_gtf_file(data[0][0]))
    report_file = _report(data[0][0], dd.get_ref_file(data[0][0]))
    for sample in data:
        sample[0]["seqcluster"] = out_dir
    return data
Example #12
def postprocess_variants(items):
    """Provide post-processing of variant calls: filtering and effects annotation.
    """
    vrn_key = "vrn_file"
    if not isinstance(items, dict):
        items = [utils.to_single_data(x) for x in items]
        if "vrn_file_joint" in items[0]:
            vrn_key = "vrn_file_joint"
    data, items = _get_batch_representative(items, vrn_key)
    items = cwlutils.unpack_tarballs(items, data)
    data = cwlutils.unpack_tarballs(data, data)
    cur_name = "%s, %s" % (dd.get_sample_name(data), get_variantcaller(data, require_bam=False))
    logger.info("Finalizing variant calls: %s" % cur_name)
    orig_vrn_file = data.get(vrn_key)
    data = _symlink_to_workdir(data, [vrn_key])
    data = _symlink_to_workdir(data, ["config", "algorithm", "variant_regions"])
    if data.get(vrn_key):
        logger.info("Calculating variation effects for %s" % cur_name)
        ann_vrn_file, vrn_stats = effects.add_to_vcf(data[vrn_key], data)
        if ann_vrn_file:
            data[vrn_key] = ann_vrn_file
        if vrn_stats:
            data["vrn_stats"] = vrn_stats
        orig_items = _get_orig_items(items)
        logger.info("Annotate VCF file: %s" % cur_name)
        data[vrn_key] = annotation.finalize_vcf(data[vrn_key], get_variantcaller(data, require_bam=False),
                                                orig_items)
        if cwlutils.is_cwl_run(data):
            logger.info("Annotate with population level variation data")
            ann_file = population.run_vcfanno(data[vrn_key], data)
            if ann_file:
                data[vrn_key] = ann_file
        logger.info("Filtering for %s" % cur_name)
        data[vrn_key] = variant_filtration(data[vrn_key], dd.get_ref_file(data),
                                           tz.get_in(("genome_resources", "variation"), data, {}),
                                           data, orig_items)
        logger.info("Prioritization for %s" % cur_name)
        prio_vrn_file = prioritize.handle_vcf_calls(data[vrn_key], data, orig_items)
        if prio_vrn_file != data[vrn_key]:
            data[vrn_key] = prio_vrn_file
            logger.info("Germline extraction for %s" % cur_name)
            data = germline.extract(data, orig_items)

        if dd.get_align_bam(data):
            data = damage.run_filter(data[vrn_key], dd.get_align_bam(data), dd.get_ref_file(data),
                                     data, orig_items)
    if orig_vrn_file and os.path.samefile(data[vrn_key], orig_vrn_file):
        data[vrn_key] = orig_vrn_file
    return [[data]]
Example #13
def check_bed_contigs(in_file, data):
    """Ensure BED file contigs match the reference genome.
    """
    if not dd.get_ref_file(data):
        return
    contigs = set([])
    with utils.open_gzipsafe(in_file) as in_handle:
        for line in in_handle:
            if not line.startswith(("#", "track", "browser")) and line.strip():
                contigs.add(line.split()[0])
    ref_contigs = set([x.name for x in ref.file_contigs(dd.get_ref_file(data))])
    if len(contigs - ref_contigs) / float(len(contigs)) > 0.25:
        raise ValueError("Contigs in BED file %s not in reference genome:\n %s\n"
                         % (in_file, list(contigs - ref_contigs)) +
                         "This is typically due to chr1 versus 1 differences in BED file and reference.")
Example #14
def _run_rtg_eval(vrn_file, rm_file, rm_interval_file, base_dir, data):
    """Run evaluation of a caller against the truth set using rtg vcfeval.
    """
    out_dir = os.path.join(base_dir, "rtg")
    if not utils.file_exists(os.path.join(out_dir, "done")):
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        if not rm_file.endswith(".vcf.gz") or not os.path.exists(rm_file + ".tbi"):
            rm_file = vcfutils.bgzip_and_index(rm_file, data["config"], out_dir=base_dir)
        if len(vcfutils.get_samples(vrn_file)) > 1:
            base, ext = utils.splitext_plus(vrn_file)
            sample_file = os.path.join(base_dir, "%s-%s%s" % (base, dd.get_sample_name(data), ext))
            vrn_file = vcfutils.select_sample(vrn_file, dd.get_sample_name(data), sample_file, data["config"])
        if not vrn_file.endswith(".vcf.gz") or not os.path.exists(vrn_file + ".tbi"):
            vrn_file = vcfutils.bgzip_and_index(vrn_file, data["config"], out_dir=base_dir)

        interval_bed = _get_merged_intervals(rm_interval_file, base_dir, data)
        ref_dir, ref_filebase = os.path.split(dd.get_ref_file(data))
        rtg_ref = os.path.normpath(os.path.join(ref_dir, os.path.pardir, "rtg",
                                                "%s.sdf" % (os.path.splitext(ref_filebase)[0])))
        assert os.path.exists(rtg_ref), ("Did not find rtg indexed reference file for validation:\n%s\n"
                                         "Run bcbio_nextgen.py upgrade --data --aligners rtg" % rtg_ref)
        cmd = ["rtg", "vcfeval", "-b", rm_file, "--bed-regions", interval_bed,
               "-c", vrn_file, "-t", rtg_ref, "-o", out_dir]
        do.run(cmd, "Validate calls using rtg vcfeval", data)
    return {"tp": os.path.join(out_dir, "tp.vcf.gz"),
            "fp": os.path.join(out_dir, "fp.vcf.gz"),
            "fn": os.path.join(out_dir, "fn.vcf.gz")}
Example #15
def _fill_prioritization_targets(data):
    """Fill in globally installed files for prioritization.
    """
    ref_file = dd.get_ref_file(data)
    for target in [["svprioritize"]]:
        val = tz.get_in(["config", "algorithm"] + target, data)
        if val and not os.path.exists(val):
            installed_vals = glob.glob(os.path.normpath(os.path.join(os.path.dirname(ref_file), os.pardir,
                                                                     "coverage", "prioritize", val + "*.bed.gz")))
            if len(installed_vals) == 0:
                raise ValueError("Configuration problem. Prioritization file not found for %s: %s" %
                                 (target, val))
            elif len(installed_vals) == 1:
                installed_val = installed_vals[0]
            else:
                # check for partial matches
                installed_val = None
                for v in installed_vals:
                    if v.endswith(val + ".bed.gz"):
                        installed_val = v
                        break
                # handle date-stamped inputs
                if not installed_val:
                    installed_val = sorted(installed_vals, reverse=True)[0]
            data = tz.update_in(data, ["config", "algorithm"] + target, lambda x: installed_val)
    return data
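
When several installed files match, the code above prefers an exact "<name>.bed.gz" match and otherwise falls back to the newest date-stamped file via a reverse string sort. A standalone sketch of that selection over an illustrative candidate list:

def pick_prioritize_file(candidates, val):
    """Exact suffix match first, then the lexicographically newest candidate."""
    if len(candidates) == 1:
        return candidates[0]
    for c in candidates:
        if c.endswith(val + ".bed.gz"):
            return c
    return sorted(candidates, reverse=True)[0]

candidates = ["prioritize/cancer-20170405.bed.gz",
              "prioritize/cancer-20180215.bed.gz"]
print(pick_prioritize_file(candidates, "cancer"))   # prioritize/cancer-20180215.bed.gz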
Example #16
def run(calls, data):
    """Run MetaSV if we have enough supported callers, adding output to the set of calls.
    """
    work_dir = _sv_workdir(data)
    out_file = os.path.join(work_dir, "variants.vcf.gz")
    cmd = _get_cmd() + [
        "--sample",
        dd.get_sample_name(data),
        "--reference",
        dd.get_ref_file(data),
        "--bam",
        dd.get_align_bam(data),
        "--outdir",
        work_dir,
    ]
    available_callers = 0
    for call in calls:
        if call["variantcaller"] in SUPPORTED:
            available_callers += 1
            cmd += ["--%s_vcf" % call["variantcaller"], call.get("vcf_file", call["vrn_file"])]
    if available_callers >= MIN_CALLERS:
        if not utils.file_exists(out_file):
            tx_work_dir = utils.safe_makedir(os.path.join(work_dir, "raw"))
            ins_stats = shared.calc_paired_insert_stats_save(
                dd.get_align_bam(data), os.path.join(tx_work_dir, "insert-stats.yaml")
            )
            cmd += ["--workdir", tx_work_dir, "--num_threads", str(dd.get_num_cores(data))]
            cmd += ["--spades", utils.which("spades.py"), "--age", utils.which("age_align")]
            cmd += ["--boost_ins", "--isize_mean", ins_stats["mean"], "--isize_sd", ins_stats["std"]]
            do.run(cmd, "Combine variant calls with MetaSV")
        calls.append({"variantcaller": "metasv", "vrn_file": out_file})
    return calls
Example #17
def run(items):
    """Run MetaSV if we have enough supported callers, adding output to the set of calls.
    """
    assert len(items) == 1, "Expect one input to MetaSV ensemble calling"
    data = items[0]
    work_dir = _sv_workdir(data)
    out_file = os.path.join(work_dir, "variants.vcf.gz")
    cmd = _get_cmd() + ["--sample", dd.get_sample_name(data), "--reference", dd.get_ref_file(data),
                        "--bam", dd.get_align_bam(data), "--outdir", work_dir]
    methods = []
    for call in data.get("sv", []):
        if call["variantcaller"] in SUPPORTED and call["variantcaller"] not in methods:
            methods.append(call["variantcaller"])
            cmd += ["--%s_vcf" % call["variantcaller"], call.get("vcf_file", call["vrn_file"])]
    if len(methods) >= MIN_CALLERS:
        if not utils.file_exists(out_file):
            tx_work_dir = utils.safe_makedir(os.path.join(work_dir, "raw"))
            ins_stats = shared.calc_paired_insert_stats_save(dd.get_align_bam(data),
                                                             os.path.join(tx_work_dir, "insert-stats.yaml"))
            cmd += ["--workdir", tx_work_dir, "--num_threads", str(dd.get_num_cores(data))]
            cmd += ["--spades", utils.which("spades.py"), "--age", utils.which("age_align")]
            cmd += ["--assembly_max_tools=1", "--assembly_pad=500"]
            cmd += ["--boost_sc", "--isize_mean", ins_stats["mean"], "--isize_sd", ins_stats["std"]]
            do.run(cmd, "Combine variant calls with MetaSV")
        filters = ("(NUM_SVTOOLS = 1 && ABS(SVLEN)>50000) || "
                   "(NUM_SVTOOLS = 1 && ABS(SVLEN)<4000 && BA_FLANK_PERCENT>80) || "
                   "(NUM_SVTOOLS = 1 && ABS(SVLEN)<4000 && BA_NUM_GOOD_REC=0) || "
                   "(ABS(SVLEN)<4000 && BA_NUM_GOOD_REC>2)")
        filter_file = vfilter.hard_w_expression(out_file, filters,
                                                data, name="ReassemblyStats", limit_regions=None)
        effects_vcf, _ = effects.add_to_vcf(filter_file, data, "snpeff")
        data["sv"].append({"variantcaller": "metasv",
                           "vrn_file": effects_vcf or filter_file})
    return [data]
Example #18
def gatk_rnaseq_calling(data):
    """
    use GATK to perform variant calling on RNA-seq data
    """
    broad_runner = broad.runner_from_config(dd.get_config(data))
    ref_file = dd.get_ref_file(data)
    split_bam = dd.get_split_bam(data)
    out_file = os.path.splitext(split_bam)[0] + ".gvcf"
    num_cores = dd.get_num_cores(data)
    if file_exists(out_file):
        data = dd.set_vrn_file(data, out_file)
        return data
    with file_transaction(out_file) as tx_out_file:
        params = ["-T", "HaplotypeCaller",
                  "-R", ref_file,
                  "-I", split_bam,
                  "-o", tx_out_file,
                  "-nct", str(num_cores),
                  "--emitRefConfidence", "GVCF",
                  "--variant_index_type", "LINEAR",
                  "--variant_index_parameter", "128000",
                  "-dontUseSoftClippedBases",
                  "-stand_call_conf", "20.0",
                  "-stand_emit_conf", "20.0"]
        broad_runner.run_gatk(params)
    data = dd.set_vrn_file(data, out_file)
    return data
Example #19
def _create_combined_fasta(data, out_dir):
    """
    if there are genomes to be disambiguated, create a FASTA file of
    all of the transcripts for all genomes
    """
    items = disambiguate.split([data])
    fasta_files = []
    for i in items:
        odata = i[0]
        gtf_file = dd.get_gtf_file(odata)
        ref_file = dd.get_ref_file(odata)
        out_file = os.path.join(out_dir, dd.get_genome_build(odata) + ".fa")
        if file_exists(out_file):
            fasta_files.append(out_file)
        else:
            out_file = _gtf_to_fasta(gtf_file, ref_file, out_file)
            out_file = _clean_gtf_fa(out_file, out_file)
            fasta_files.append(out_file)
    out_stem = os.path.join(out_dir, dd.get_genome_build(data))
    if dd.get_disambiguate(data):
        out_stem = "-".join([out_stem] + dd.get_disambiguate(data))
    combined_file = out_stem + ".fa"
    if file_exists(combined_file):
        return combined_file

    fasta_file_string = " ".join(fasta_files)
    cmd = "cat {fasta_file_string} > {tx_out_file}"
    with file_transaction(combined_file) as tx_out_file:
        do.run(cmd.format(**locals()), "Combining transcriptome FASTA files.")
    return combined_file
Example #20
def gatk_splitreads(data):
    """
    use GATK to split reads with Ns in the CIGAR string, hard clipping regions
    that end up in introns
    """
    broad_runner = broad.runner_from_config(dd.get_config(data))
    ref_file = dd.get_ref_file(data)
    deduped_bam = dd.get_deduped_bam(data)
    base, ext = os.path.splitext(deduped_bam)
    split_bam = base + ".splitN" + ext
    if dd.get_quality_format(data) == "illumina":
        quality_flag = ["--fix_misencoded_quality_scores", "-fixMisencodedQuals"]
    else:
        quality_flag = []
    if file_exists(split_bam):
        data = dd.set_split_bam(data, split_bam)
        return data
    with file_transaction(split_bam) as tx_split_bam:
        params = ["-T", "SplitNCigarReads",
                  "-R", ref_file,
                  "-I", deduped_bam,
                  "-o", tx_split_bam,
                  "-rf", "ReassignOneMappingQuality",
                  "-RMQF", "255",
                  "-RMQT", "60",
                  "-rf", "UnmappedRead",
                  "-U", "ALLOW_N_CIGAR_READS"] + quality_flag
        broad_runner.run_gatk(params)
    bam.index(split_bam, dd.get_config(data))
    data = dd.set_split_bam(data, split_bam)
    return data
Example #21
def align(fastq_file, pair_file, index_dir, names, align_dir, data):
    """Perform piped alignment of fastq input files, generating sorted, deduplicated BAM.
    """
    umi_ext = "-cumi" if "umi_bam" in data else ""
    out_file = os.path.join(align_dir, "{0}-sort{1}.bam".format(dd.get_sample_name(data), umi_ext))
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    rg_info = novoalign.get_rg_info(names)
    preset = "sr"

    pair_file = pair_file if pair_file else ""
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(fastq_file, pair_file, data)
    else:
        final_file = None

    if not utils.file_exists(out_file) and (final_file is None or not utils.file_exists(final_file)):
        with postalign.tobam_cl(data, out_file, pair_file != "") as (tobam_cl, tx_out_file):
            index_file = None
            # Skip trying to use indices now as they provide only slight speed-ups
            # and give inconsistent outputs in BAM headers
            # If a single index present, index_dir points to that
            # if index_dir and os.path.isfile(index_dir):
            #     index_dir = os.path.dirname(index_dir)
            #     index_file = os.path.join(index_dir, "%s-%s.mmi" % (dd.get_genome_build(data), preset))
            if not index_file or not os.path.exists(index_file):
                index_file = dd.get_ref_file(data)
            cmd = ("minimap2 -a -x {preset} -R '{rg_info}' -t {num_cores} {index_file} "
                   "{fastq_file} {pair_file} | ")
            do.run(cmd.format(**locals()) + tobam_cl, "minimap2 alignment: %s" % dd.get_sample_name(data))
    data["work_bam"] = out_file
    return data
Example #22
def gatk_filter_rnaseq(vrn_file, data):
    """
    This applies the filters listed here, dropping clusters of variants
    within a 35 nucleotide window, high Fisher strand values and low
    quality by depth:
    https://software.broadinstitute.org/gatk/guide/article?id=3891
    java -jar GenomeAnalysisTK.jar -T VariantFiltration -R hg_19.fasta -V
    input.vcf -window 35 -cluster 3 -filterName FS -filter "FS > 30.0"
    -filterName QD -filter "QD < 2.0" -o output.vcf
    """
    out_file = "%s-filter%s" % utils.splitext_plus(vrn_file)
    if not file_exists(out_file):
        ref_file = dd.get_ref_file(data)
        with file_transaction(data, out_file) as tx_out_file:
            params = ["VariantFiltration",
                      "-R", ref_file,
                      "-V", vrn_file,
                      "--cluster-window-size", "35",
                      "--cluster-size", "3",
                      "--filter-expression", "'FS > 30.0'",
                      "--filter-name", "FS",
                      "--filter-expression", "'QD < 2.0'",
                      "--filter-name", "QD",
                      "--output", tx_out_file]
            # Use GATK4 for filtering, tools_off is for variant calling
            config = utils.deepish_copy(dd.get_config(data))
            if "gatk4" in dd.get_tools_off({"config": config}):
                config["algorithm"]["tools_off"].remove("gatk4")
            jvm_opts = broad.get_gatk_opts(config, os.path.dirname(tx_out_file))
            do.run(broad.gatk_cmd("gatk", jvm_opts, params, config), "Filter RNA-seq variants.")
    return out_file
Example #23
def run(data):
    config = data[0][0]['config']
    work_dir = dd.get_work_dir(data[0][0])
    genome = dd.get_ref_file(data[0][0])
    mirdeep2 = os.path.join(os.path.dirname(sys.executable), "miRDeep2.pl")
    perl_exports = get_perl_exports()
    mirbase = op.abspath(op.dirname(dd.get_mirbase_ref(data[0][0])))
    species = dd.get_species(data[0][0])
    hairpin = op.join(mirbase, "hairpin.fa")
    mature = op.join(mirbase, "mature.fa")
    rfam_file = op.join(mirbase, "Rfam_for_miRDeep.fa")
    bam_file = op.join(work_dir, "align", "seqs.bam")
    seqs_dir = op.join(work_dir, "seqcluster", "prepare")
    collapsed = op.join(seqs_dir, "seqs.ma")
    out_dir = op.join(work_dir, "mirdeep2")
    out_file = op.join(out_dir, "result_res.csv")
    safe_makedir(out_dir)
    with chdir(out_dir):
        collapsed, bam_file = _prepare_inputs(collapsed, bam_file, out_dir)
        cmd = ("{perl_exports} && {mirdeep2} {collapsed} {genome} {bam_file} {mature} none {hairpin} -f {rfam_file} -r simple -c -d -P -t {species} -z res").format(**locals())
        if file_exists(mirdeep2) and not file_exists(out_file) and file_exists(mature) and file_exists(rfam_file):
            do.run(cmd.format(**locals()), "Running mirdeep2.")
        if file_exists(out_file):
            novel_db = _parse_novel(out_file, dd.get_species(data[0][0]))
            return novel_db
Example #24
def _run_amber(paired, work_dir, lenient=False):
    """AMBER: calculate allele frequencies at likely heterozygous sites.

    lenient flag allows amber runs on small test sets.
    """
    amber_dir = utils.safe_makedir(os.path.join(work_dir, "amber"))
    out_file = os.path.join(amber_dir, "%s.amber.baf" % dd.get_sample_name(paired.tumor_data))
    if not utils.file_exists(out_file) or not utils.file_exists(out_file + ".pcf"):
        with file_transaction(paired.tumor_data, out_file) as tx_out_file:
            key = "germline_het_pon"
            het_bed = tz.get_in(["genome_resources", "variation", key], paired.tumor_data)
            cmd = ["AMBER"] + _get_jvm_opts(tx_out_file, paired.tumor_data) + \
                  ["-threads", dd.get_num_cores(paired.tumor_data),
                   "-tumor", dd.get_sample_name(paired.tumor_data),
                   "-tumor_bam", dd.get_align_bam(paired.tumor_data),
                   "-reference", dd.get_sample_name(paired.normal_data),
                   "-reference_bam", dd.get_align_bam(paired.normal_data),
                   "-ref_genome", dd.get_ref_file(paired.tumor_data),
                   "-bed", het_bed,
                   "-output_dir", os.path.dirname(tx_out_file)]
            if lenient:
                cmd += ["-max_het_af_percent", "1.0"]
            try:
                do.run(cmd, "PURPLE: AMBER baf generation")
            except subprocess.CalledProcessError as msg:
                if not lenient and _amber_allowed_errors(str(msg)):
                    return _run_amber(paired, work_dir, True)
                raise
            for f in os.listdir(os.path.dirname(tx_out_file)):
                if f != os.path.basename(tx_out_file):
                    shutil.move(os.path.join(os.path.dirname(tx_out_file), f),
                                os.path.join(amber_dir, f))
    return out_file
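
The except block above is a retry-once pattern: a known, tolerable AMBER failure triggers a single lenient re-run, anything else propagates. A generic sketch of the same pattern; the relaxed flag and the tolerated exit code are hypothetical placeholders:

import subprocess

def run_with_lenient_retry(cmd, lenient=False):
    """Retry once with a relaxed flag when a tolerable failure is seen, else re-raise."""
    try:
        subprocess.check_call(cmd)
    except subprocess.CalledProcessError as msg:
        if not lenient and msg.returncode == 3:   # hypothetical "tolerable" exit code
            return run_with_lenient_retry(cmd + ["--relaxed"], lenient=True)
        raise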
Example #25
def _run_wham(inputs, background_bams):
    """Run WHAM on a defined set of inputs and targets.
    """
    out_file = os.path.join(_sv_workdir(inputs[0]), "%s-wham.vcf.gz" % dd.get_sample_name(inputs[0]))
    if not utils.file_exists(out_file):
        with file_transaction(inputs[0], out_file) as tx_out_file:
            coords = chromhacks.autosomal_or_x_coords(dd.get_ref_file(inputs[0]))
            parallel = {"type": "local", "cores": dd.get_cores(inputs[0]), "progs": []}
            rs = run_multicore(_run_wham_coords,
                                [(inputs, background_bams, coord, out_file)
                                 for coord in coords],
                                inputs[0]["config"], parallel)
            rs = {coord: fname for (coord, fname) in rs}
            vcfutils.concat_variant_files([rs[c] for c in coords], tx_out_file, coords,
                                          dd.get_ref_file(inputs[0]), inputs[0]["config"])
    return out_file
Example #26
def rnaseq_vardict_variant_calling(data):
    sample = dd.get_sample_name(data)
    variation_dir = os.path.join(dd.get_work_dir(data), "variation")
    safe_makedir(variation_dir)
    out_file = os.path.join(variation_dir, sample + "-vardict.vcf.gz")
    if file_exists(out_file):
        data = dd.set_vrn_file(data, out_file)
        return data
    vardict_cmd = vardict.get_vardict_command(data)
    strandbias = "teststrandbias.R"
    var2vcf = "var2vcf_valid.pl"
    vcfstreamsort = config_utils.get_program("vcfstreamsort", data)
    compress_cmd = "| bgzip -c"
    freq = float(dd.get_min_allele_fraction(data, 20) / 100.0)
    var2vcf_opts = "-v 50"
    fix_ambig = vcfutils.fix_ambiguous_cl()
    remove_dup = vcfutils.remove_dup_cl()
    r_setup = ("unset R_HOME && export PATH=%s:$PATH && "
                % os.path.dirname(Rscript_cmd()))
    ref_file = dd.get_ref_file(data)
    bamfile = dd.get_work_bam(data)
    bed_file = gtf.gtf_to_bed(dd.get_gtf_file(data))
    opts = " -c 1 -S 2 -E 3 -g 4 "
    with file_transaction(out_file) as tx_out_file:
        jvm_opts = vardict._get_jvm_opts(data, tx_out_file)
        cmd = ("{r_setup}{jvm_opts}{vardict_cmd} -G {ref_file} -f {freq} "
                "-N {sample} -b {bamfile} {opts} {bed_file} "
                "| {strandbias}"
                "| {var2vcf} -N {sample} -E -f {freq} {var2vcf_opts} "
                "| {fix_ambig} | {remove_dup} | {vcfstreamsort} {compress_cmd} "
                "> {tx_out_file}")
        message = "Calling RNA-seq variants with VarDict"
        do.run(cmd.format(**locals()), message)
    data = dd.set_vrn_file(data, out_file)
    return data
Example #27
def run(align_bams, items, ref_file, assoc_files, region, out_file):
    """Run platypus variant calling, germline whole genome or exome.
    """
    assert out_file.endswith(".vcf.gz")
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            for align_bam in align_bams:
                bam.index(align_bam, items[0]["config"])
            cmd = ["platypus", "callVariants", "--regions=%s" % _bed_to_platypusin(region, out_file, items),
                   "--bamFiles=%s" % ",".join(align_bams),
                   "--refFile=%s" % dd.get_ref_file(items[0]), "--output=-",
                   "--logFileName", "/dev/null", "--verbosity=1"]
            cmd += ["--assemble=1"]
            # Adjust default filter thresholds to achieve similar sensitivity/specificity to other callers
            cmd += ["--hapScoreThreshold", "10", "--scThreshold", "0.99", "--filteredReadsFrac", "0.9",
                    "--rmsmqThreshold", "20", "--qdThreshold", "0", "--abThreshold", "0.0001",
                    "--minVarFreq", "0.0"]
            # Avoid filtering duplicates on high depth targeted regions where we don't mark duplicates
            if any(not tz.get_in(["config", "algorithm", "mark_duplicates"], data, True)
                   for data in items):
                cmd += ["--filterDuplicates=0"]
            post_process_cmd = " | %s | vcfallelicprimitives | vcfstreamsort | bgzip -c > %s" % (
                vcfutils.fix_ambiguous_cl(), tx_out_file)
            do.run(" ".join(cmd) + post_process_cmd, "platypus variant calling")
        out_file = vcfutils.bgzip_and_index(out_file, items[0]["config"])
    return out_file
Example #28
def run_cluster(*data):
    """
    Run seqcluster cluster to detect smallRNA clusters
    """
    sample = data[0][0]
    tools = dd.get_expression_caller(data[0][0])
    work_dir = dd.get_work_dir(sample)
    out_dir = op.join(work_dir, "seqcluster", "cluster")
    out_dir = op.abspath(safe_makedir(out_dir))
    prepare_dir = op.join(work_dir, "seqcluster", "prepare")
    bam_file = data[0][0]["work_bam"]
    if "seqcluster" in tools:
        sample["seqcluster"] = _cluster(bam_file, data[0][0]["seqcluster_prepare_ma"], out_dir, dd.get_ref_file(sample), dd.get_srna_gtf_file(sample))
        sample["report"] = _report(sample, dd.get_ref_file(sample))

    out_mirna = _make_isomir_counts(data, out_dir=op.join(work_dir, "mirbase"))
    if out_mirna:
        sample = dd.set_mirna_counts(sample, out_mirna[0])
        sample = dd.set_isomir_counts(sample, out_mirna[1])

    out_novel = _make_isomir_counts(data, "seqbuster_novel", op.join(work_dir, "mirdeep2"), "_novel")
    if out_novel:
        sample = dd.set_novel_mirna_counts(sample, out_novel[0])
        sample = dd.set_novel_isomir_counts(sample, out_novel[1])
    data[0][0] = sample
    return data
Example #29
def get_multisample_vcf(fnames, name, caller, data):
    """Retrieve a multiple sample VCF file in a standard location.

    Handles inputs with multiple repeated input files from batches.
    """
    unique_fnames = []
    for f in fnames:
        if f not in unique_fnames:
            unique_fnames.append(f)
    out_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "gemini"))
    if len(unique_fnames) > 1:
        gemini_vcf = os.path.join(out_dir, "%s-%s.vcf.gz" % (name, caller))
        vrn_file_batch = None
        for variant in data.get("variants", []):
            if variant["variantcaller"] == caller and variant.get("vrn_file_batch"):
                vrn_file_batch = variant["vrn_file_batch"]
        if vrn_file_batch:
            utils.symlink_plus(vrn_file_batch, gemini_vcf)
            return gemini_vcf
        else:
            return vcfutils.merge_variant_files(unique_fnames, gemini_vcf, dd.get_ref_file(data),
                                                data["config"])
    else:
        gemini_vcf = os.path.join(out_dir, "%s-%s%s" % (name, caller, utils.splitext_plus(unique_fnames[0])[1]))
        utils.symlink_plus(unique_fnames[0], gemini_vcf)
        return gemini_vcf
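
The loop at the top of this function is an order-preserving de-duplication of the batch inputs; an equivalent standalone sketch:

def unique_in_order(fnames):
    """Drop repeated inputs while keeping the original order."""
    seen = set()
    unique = []
    for f in fnames:
        if f not in seen:
            seen.add(f)
            unique.append(f)
    return unique

print(unique_in_order(["a.vcf.gz", "b.vcf.gz", "a.vcf.gz"]))   # ['a.vcf.gz', 'b.vcf.gz']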
Example #30
def get_analysis_intervals(data):
    """Retrieve analysis regions for the current variant calling pipeline.
    """
    if data.get("ensemble_bed"):
        return data["ensemble_bed"]
    elif data.get("align_bam"):
        return callable.sample_callable_bed(data["align_bam"], dd.get_ref_file(data), data)
    elif data.get("work_bam"):
        return callable.sample_callable_bed(data["work_bam"], dd.get_ref_file(data), data)
    elif data.get("work_bam_callable"):
        return callable.sample_callable_bed(data["work_bam_callable"], dd.get_ref_file(data), data)
    else:
        for key in ["callable_regions", "variant_regions"]:
            intervals = data["config"]["algorithm"].get(key)
            if intervals:
                return intervals
Example #31
def sort_by_ref(vcf_file, data):
    """Sort a VCF file by genome reference and position, adding contig information.
    """
    out_file = "%s-prep.vcf.gz" % utils.splitext_plus(vcf_file)[0]
    if not utils.file_uptodate(out_file, vcf_file):
        with file_transaction(data, out_file) as tx_out_file:
            header_file = "%s-header.txt" % utils.splitext_plus(tx_out_file)[0]
            with open(header_file, "w") as out_handle:
                for region in ref.file_contigs(dd.get_ref_file(data),
                                               data["config"]):
                    out_handle.write("##contig=<ID=%s,length=%s>\n" %
                                     (region.name, region.size))
            cat_cmd = "zcat" if vcf_file.endswith("vcf.gz") else "cat"
            cmd = (
                "{cat_cmd} {vcf_file} | grep -v ^##contig | bcftools annotate -h {header_file} | "
                "vt sort -m full -o {tx_out_file} -")
            with utils.chdir(os.path.dirname(tx_out_file)):
                do.run(cmd.format(**locals()), "Sort VCF by reference")
    return bgzip_and_index(out_file, data["config"])
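
The header file written above carries one ##contig line per reference contig so bcftools annotate can add contig metadata to the VCF. A small sketch of that header construction with two illustrative contigs:

contigs = [("chr1", 248956422), ("chr2", 242193529)]   # illustrative (name, length) pairs
with open("prep-header.txt", "w") as out_handle:
    for name, size in contigs:
        out_handle.write("##contig=<ID=%s,length=%s>\n" % (name, size))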
Example #32
def run_vcfanno(vcf, conf_files, data, data_basepath=None):
    """
    Annotate a VCF file using vcfanno, looking up the proper config/lua scripts
    under the `vcfanno` key in the algorithm section of the datadict and
    skipping if the files cannot be found.
    """
    if not isinstance(conf_files, (list, tuple)):
        conf_files = [conf_files]
    build = dd.get_genome_build(data)
    basepath = os.path.abspath(
        os.path.join(os.path.dirname(dd.get_ref_file(data)), os.pardir))
    annodir = os.path.abspath(os.path.join(basepath, "config", "vcfanno"))
    conf_fns = []
    lua_fns = []
    anno_type = None
    for conf_file in conf_files:
        if utils.file_exists(conf_file) and os.path.isfile(conf_file):
            conffn = conf_file
            luafn = "%s.lua" % utils.splitext_plus(conffn)[0]
        else:
            anno_type = os.path.basename(conf_file)
            conffn = os.path.join(annodir, anno_type + ".conf")
            luafn = os.path.join(annodir, anno_type + ".lua")
        if not utils.file_exists(conffn):
            CONF_NOT_FOUND = (
                "The vcfanno configuration {conffn} was not found for {build}, skipping."
            )
            logger.warn(CONF_NOT_FOUND.format(**locals()))
        else:
            conf_fns.append(conffn)
            lua_fns.append(luafn)
    if not conf_fns:
        return vcf
    if not anno_type:
        anno_type = "gemini"
    out_file = utils.splitext_plus(
        vcf)[0] + "-annotated-" + anno_type + ".vcf.gz"
    if utils.file_exists(out_file):
        return out_file

    out_file = vcfanno(vcf, out_file, conf_fns, data, data_basepath
                       or basepath, lua_fns)
    return out_file
Example #33
def run_sailfish(data):
    samplename = dd.get_sample_name(data)
    files = dd.get_input_sequence_files(data)
    work_dir = dd.get_work_dir(data)
    if len(files) == 2:
        fq1, fq2 = files
    else:
        fq1, fq2 = files[0], None
    sailfish_dir = os.path.join(work_dir, "sailfish", samplename)
    gtf_file = dd.get_gtf_file(data)
    assert file_exists(gtf_file), "%s was not found, exiting." % gtf_file
    fasta_file = dd.get_ref_file(data)
    assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file
    stranded = dd.get_strandedness(data).lower()
    out_file = sailfish(fq1, fq2, sailfish_dir, gtf_file, fasta_file, stranded,
                        data)
    data = dd.set_sailfish(data, out_file)
    data = dd.set_sailfish_dir(data, sailfish_dir)
    return [[data]]
Example #34
def prep_recal(data):
    """Do pre-BQSR recalibration, calculation of recalibration tables.
    """
    if dd.get_recalibrate(data) in [True, "gatk"]:
        logger.info("Prepare BQSR tables with GATK: %s " % str(dd.get_sample_name(data)))
        dbsnp_file = tz.get_in(("genome_resources", "variation", "dbsnp"), data)
        if not dbsnp_file:
            logger.info("Skipping GATK BaseRecalibrator because no VCF file of known variants was found.")
            return data
        broad_runner = broad.runner_from_config(data["config"])
        data["prep_recal"] = _gatk_base_recalibrator(broad_runner, dd.get_align_bam(data),
                                                     dd.get_ref_file(data), dd.get_platform(data),
                                                     dbsnp_file, dd.get_variant_regions(data), data)
    elif dd.get_recalibrate(data) == "sentieon":
        logger.info("Prepare BQSR tables with sentieon: %s " % str(dd.get_sample_name(data)))
        data["prep_recal"] = sentieon.bqsr_table(data)
    elif dd.get_recalibrate(data):
        raise NotImplementedError("Unsupported recalibration type: %s" % (dd.get_recalibrate(data)))
    return data
Example #35
def _gatk_apply_bqsr(data):
    """Parallel BQSR support for GATK4.
    """
    in_file = dd.get_align_bam(data) or dd.get_work_bam(data)
    out_file = os.path.join(
        dd.get_work_dir(data), "align", dd.get_sample_name(data),
        "%s-recal.bam" % utils.splitext_plus(os.path.basename(in_file))[0])
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(data, out_file) as tx_out_file:
            broad_runner = broad.runner_from_config(data["config"])
            gatk_type = broad_runner.gatk_type()
            cores = dd.get_num_cores(data)
            if gatk_type == "gatk4":
                params = [
                    "-T", "ApplyBQSRSpark", "--spark-master",
                    "local[%s]" % cores, "--input", in_file, "--output",
                    tx_out_file, "--bqsr-recal-file", data["prep_recal"],
                    "--conf",
                    "spark.local.dir=%s" % os.path.dirname(tx_out_file)
                ]
            else:
                params = [
                    "-T", "PrintReads", "-R",
                    dd.get_ref_file(data), "-I", in_file, "-BQSR",
                    data["prep_recal"], "-o", tx_out_file
                ]
            # Avoid problems with intel deflater for GATK 3.8 and GATK4
            # https://github.com/chapmanb/bcbio-nextgen/issues/2145#issuecomment-343095357
            if gatk_type == "gatk4":
                params += ["--jdk-deflater", "--jdk-inflater"]
            elif LooseVersion(
                    broad_runner.gatk_major_version()) > LooseVersion("3.7"):
                params += ["-jdk_deflater", "-jdk_inflater"]
            memscale = {
                "magnitude": 0.9 * cores,
                "direction": "increase"
            } if cores > 1 else None
            broad_runner.run_gatk(params,
                                  os.path.dirname(tx_out_file),
                                  memscale=memscale,
                                  parallel_gc=True)
    bam.index(out_file, data["config"])
    return out_file
Example #36
def calculate(bam_file, data):
    """Calculate coverage in parallel using samtools depth through goleft.

    samtools depth removes duplicates and secondary reads from the counts:
    if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue;
    """
    params = {"window_size": 5000, "parallel_window_size": 1e5, "min": dd.get_coverage_depth_min(data),
              "high_multiplier": 20}
    prefix = os.path.join(
        utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align", dd.get_sample_name(data))),
        "%s-coverage" % (dd.get_sample_name(data)))
    out_file = prefix + ".depth.bed"
    callable_file = prefix + ".callable.bed"
    variant_regions = dd.get_variant_regions_merged(data)
    variant_regions_avg_cov = get_average_coverage(data, bam_file, variant_regions,
                                                   "variant_regions", file_prefix=prefix)
    if not utils.file_uptodate(out_file, bam_file):
        ref_file = dd.get_ref_file(data)
        cmd = ["goleft", "depth", "--windowsize", str(params["window_size"]), "--q", "1",
               "--mincov", str(params["min"]), "--reference", ref_file,
               "--processes", str(dd.get_num_cores(data)), "--stats", "--ordered"]
        window_file = "%s-tocalculate-windows.bed" % utils.splitext_plus(out_file)[0]
        if not utils.file_uptodate(window_file, bam_file):
            with file_transaction(data, window_file) as tx_out_file:
                if not variant_regions:
                    variant_regions = "%s-genome.bed" % utils.splitext_plus(tx_out_file)[0]
                    with open(variant_regions, "w") as out_handle:
                        for c in shared.get_noalt_contigs(data):
                            out_handle.write("%s\t%s\t%s\n" % (c.name, 0, c.size))
                pybedtools.BedTool().window_maker(w=params["parallel_window_size"],
                                                  b=pybedtools.BedTool(variant_regions)).saveas(tx_out_file)
        cmd += ["--bed", window_file]
        max_depth = _get_max_depth(variant_regions_avg_cov, params, data)
        if max_depth:
            cmd += ["--maxmeandepth", str(int(max_depth))]
        with file_transaction(data, out_file) as tx_out_file:
            with utils.chdir(os.path.dirname(tx_out_file)):
                tx_callable_file = tx_out_file.replace(".depth.bed", ".callable.bed")
                prefix = tx_out_file.replace(".depth.bed", "")
                cmd += ["--prefix", prefix, bam_file]
                do.run(cmd, "Calculate coverage: %s" % dd.get_sample_name(data))
                shutil.move(tx_callable_file, callable_file)
    return out_file, callable_file, _extract_highdepth(callable_file, data), variant_regions_avg_cov
Example #37
def _normalize(in_file,
               data,
               passonly=False,
               normalize_indels=True,
               split_biallelic=True,
               remove_oldeffects=False):
    """Convert multi-allelic variants into single allelic.

    `vt normalize` has the -n flag passed (skipping reference checks) because
    of errors where the reference genome has non GATCN ambiguous bases. These
    are not supported in VCF, so you'll have a mismatch of N in VCF versus R
    (or other ambiguous bases) in the genome.
    """
    if remove_oldeffects:
        out_file = "%s-noeff-decompose%s" % utils.splitext_plus(in_file)
        old_effects = [a for a in ["CSQ", "ANN"] if a in cyvcf2.VCF(in_file)]
        if old_effects:
            clean_effects_cmd = " | bcftools annotate -x %s " % (",".join(
                ["INFO/%s" % x for x in old_effects]))
        else:
            clean_effects_cmd = ""
    else:
        clean_effects_cmd = ""
        out_file = "%s-decompose%s" % utils.splitext_plus(in_file)
    if not utils.file_exists(out_file):
        ref_file = dd.get_ref_file(data)
        assert out_file.endswith(".vcf.gz")
        with file_transaction(data, out_file) as tx_out_file:
            cmd = ("gunzip -c " + in_file +
                   (" | bcftools view -f 'PASS,.'" if passonly else "") +
                   clean_effects_cmd +
                   (" | vcfallelicprimitives -t DECOMPOSED --keep-geno"
                    if split_biallelic else "") +
                   " | sed 's/ID=AD,Number=./ID=AD,Number=R/'" +
                   " | vt decompose -s - " +
                   ((" | vt normalize -n -r " + ref_file +
                     " - ") if normalize_indels else "") +
                   " | awk '{ gsub(\"./-65\", \"./.\"); print $0 }'" +
                   " | sed -e 's/Number=A/Number=1/g'" + " | bgzip -c > " +
                   tx_out_file)
            do.run(cmd, "Multi-allelic to single allele")
    return vcfutils.bgzip_and_index(out_file, data["config"])
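
The shell command above is one concatenated pipeline whose stages are switched on by the keyword arguments. A simplified sketch that assembles the same kind of pipeline from a list, omitting the sed/awk fix-ups and using illustrative filenames:

passonly, split_biallelic, normalize_indels = True, True, True
ref_file = "hg38.fa"                      # illustrative reference
stages = ["gunzip -c input.vcf.gz"]
if passonly:
    stages.append("bcftools view -f 'PASS,.'")
if split_biallelic:
    stages.append("vcfallelicprimitives -t DECOMPOSED --keep-geno")
stages.append("vt decompose -s -")
if normalize_indels:
    stages.append("vt normalize -n -r %s -" % ref_file)
stages.append("bgzip -c > output.vcf.gz")
print(" | ".join(stages))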
Example #38
def run(align_bams, items, ref_file, assoc_files, region, out_file):
    """Run platypus variant calling, germline whole genome or exome.
    """
    assert out_file.endswith(".vcf.gz")
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            for align_bam in align_bams:
                bam.index(align_bam, items[0]["config"])
            cmd = ["platypus", "callVariants", "--regions=%s" % _subset_regions(region, out_file, items),
                   "--bamFiles=%s" % ",".join(align_bams),
                   "--refFile=%s" % dd.get_ref_file(items[0]), "--output=-",
                   "--logFileName", "/dev/null", "--verbosity=1"]
            resources = config_utils.get_resources("platypus", items[0]["config"])
            if resources.get("options"):
                # normalize options so we can set defaults without overwriting user specified
                for opt in resources["options"]:
                    if "=" in opt:
                        key, val = opt.split("=")
                        cmd.extend([key, val])
                    else:
                        cmd.append(opt)
            if any("gvcf" in dd.get_tools_on(d) for d in items):
                cmd += ["--outputRefCalls", "1", "--refCallBlockSize", "50000"]
            # Adjust default filter thresholds to achieve similar sensitivity/specificity to other callers.
            # Currently not used after doing more cross validation, as they increase false positives and
            # give up what seems to be a major advantage for Platypus users.
            # tuned_opts = ["--hapScoreThreshold", "10", "--scThreshold", "0.99", "--filteredReadsFrac", "0.9",
            #               "--rmsmqThreshold", "20", "--qdThreshold", "0", "--abThreshold", "0.0001",
            #               "--minVarFreq", "0.0", "--assemble", "1"]
            # for okey, oval in utils.partition_all(2, tuned_opts):
            #     if okey not in cmd:
            #         cmd.extend([okey, oval])

            # Avoid filtering duplicates on high depth targeted regions where we don't mark duplicates
            if any(not dd.get_mark_duplicates(data) for data in items):
                cmd += ["--filterDuplicates=0"]
            post_process_cmd = (" | %s | %s | vcfallelicprimitives -t DECOMPOSED --keep-geno | vcffixup - | "
                                "vcfstreamsort | bgzip -c > %s" % (vcfutils.fix_ambiguous_cl(),
                                                                   vcfutils.fix_ambiguous_cl(5), tx_out_file))
            do.run(" ".join(cmd) + post_process_cmd, "platypus variant calling")
        out_file = vcfutils.bgzip_and_index(out_file, items[0]["config"])
    return out_file
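Note: a small sketch of the option normalization done above for user-supplied platypus resources, so "key=value" strings become separate argv entries and defaults can still be appended without clobbering them. Unlike the code above, this splits only on the first '=' so values containing '=' survive.

def normalize_options(opts):
    """Split "key=value" strings into separate command line entries."""
    out = []
    for opt in opts:
        if "=" in opt:
            key, val = opt.split("=", 1)
            out.extend([key, val])
        else:
            out.append(opt)
    return out

print(normalize_options(["--minVarFreq=0.01", "--assemble", "1"]))
# ['--minVarFreq', '0.01', '--assemble', '1']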
Example #39
0
def _fill_prioritization_targets(data):
    """Fill in globally installed files for prioritization.
    """
    ref_file = dd.get_ref_file(data)
    for target in [["svprioritize"], ["coverage"]]:
        val = tz.get_in(["config", "algorithm"] + target, data)
        if val and not os.path.exists(val):
            installed_vals = []
            # Check prioritize directory
            for ext in [".bed", ".bed.gz"]:
                installed_vals += glob.glob(
                    os.path.normpath(
                        os.path.join(os.path.dirname(ref_file), os.pardir,
                                     "coverage", "prioritize",
                                     val + "*%s" % ext)))
            # Check sv-annotation directory for prioritize gene name lists
            if target[-1] == "svprioritize":
                installed_vals += glob.glob(
                    os.path.join(
                        os.path.dirname(
                            os.path.realpath(
                                utils.which("simple_sv_annotation.py"))),
                        "%s*" % os.path.basename(val)))
            if len(installed_vals) == 0:
                raise ValueError(
                    "Configuration problem. BED file not found for %s: %s" %
                    (target, val))
            elif len(installed_vals) == 1:
                installed_val = installed_vals[0]
            else:
                # check for partial matches
                installed_val = None
                for v in installed_vals:
                    if v.endswith(val + ".bed.gz") or v.endswith(val + ".bed"):
                        installed_val = v
                        break
                # handle date-stamped inputs
                if not installed_val:
                    installed_val = sorted(installed_vals, reverse=True)[0]
            data = tz.update_in(data, ["config", "algorithm"] + target,
                                lambda x: installed_val)
    return data
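Note: the candidate-selection logic above, pulled out as a standalone sketch: prefer an exact "<name>.bed"/"<name>.bed.gz" match among installed files, otherwise fall back to the newest date-stamped entry. The file names in the usage example are hypothetical.

def pick_installed_bed(candidates, name):
    """Choose one installed BED file among globbed candidates."""
    if not candidates:
        raise ValueError("Configuration problem. BED file not found for %s" % name)
    if len(candidates) == 1:
        return candidates[0]
    # check for exact matches first
    for cand in candidates:
        if cand.endswith(name + ".bed.gz") or cand.endswith(name + ".bed"):
            return cand
    # handle date-stamped inputs by taking the lexicographically newest
    return sorted(candidates, reverse=True)[0]

print(pick_installed_bed(["cancer_genes-20180215.bed", "cancer_genes-20190301.bed"],
                         "cancer_genes"))
# cancer_genes-20190301.bed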
Example #40
0
def rnaseq_vardict_variant_calling(data):
    sample = dd.get_sample_name(data)
    out_dir = utils.safe_makedir(
        os.path.join(dd.get_work_dir(data), "variation", "rnaseq", "vardict"))
    out_file = os.path.join(out_dir, sample + "-vardict.vcf.gz")
    if file_exists(out_file):
        data = dd.set_vrn_file(data, out_file)
        return data
    vardict_cmd = vardict.get_vardict_command(data)
    strandbias = "teststrandbias.R"
    var2vcf = "var2vcf_valid.pl"
    vcfstreamsort = config_utils.get_program("vcfstreamsort", data)
    compress_cmd = "| bgzip -c"
    freq = float(dd.get_min_allele_fraction(data, 20) / 100.0)
    var2vcf_opts = "-v 50"
    fix_ambig = vcfutils.fix_ambiguous_cl()
    remove_dup = vcfutils.remove_dup_cl()
    r_setup = get_R_exports()
    ref_file = dd.get_ref_file(data)
    bamfile = dd.get_work_bam(data)
    data = _setup_variant_regions(data, out_dir)
    bed_file = dd.get_variant_regions(data)
    opts = " -c 1 -S 2 -E 3 -g 4 "
    resources = config_utils.get_resources("vardict", data)
    if resources.get("options"):
        opts += " ".join([str(x) for x in resources["options"]])
    cores = dd.get_num_cores(data)
    if cores and cores > 1:
        opts += " -th %s" % str(cores)
    with file_transaction(data, out_file) as tx_out_file:
        jvm_opts = vardict._get_jvm_opts(data, tx_out_file)
        cmd = ("{r_setup} && {jvm_opts}{vardict_cmd} -G {ref_file} -f {freq} "
               "-N {sample} -b {bamfile} {opts} {bed_file} "
               "| {strandbias}"
               "| {var2vcf} -N {sample} -E -f {freq} {var2vcf_opts} "
               "| {fix_ambig} | {remove_dup} | {vcfstreamsort} {compress_cmd} "
               "> {tx_out_file}")
        message = "Calling RNA-seq variants with VarDict"
        do.run(cmd.format(**locals()), message)
    out_file = vcfutils.bgzip_and_index(out_file, data["config"])
    data = dd.set_vrn_file(data, out_file)
    return data
Example #41
0
def assign_interval(data):
    """Identify coverage based on percent of genome covered and relation to targets.

    Classifies coverage into 3 categories:
      - genome: Full genome coverage
      - regional: Regional coverage, like exome capture, with off-target reads
      - amplicon: Amplicon-based regional coverage without off-target reads
    """
    genome_cov_thresh = 0.40  # fraction of the genome that must be covered to classify as whole genome analysis
    offtarget_thresh = 0.05  # fraction of off-target reads above which coverage is capture (not amplicon) based
    if not dd.get_coverage_interval(data):
        vrs = dd.get_variant_regions_merged(data)
        callable_file = dd.get_sample_callable(data)
        if vrs:
            callable_size = pybedtools.BedTool(vrs).total_coverage()
        else:
            callable_size = pybedtools.BedTool(callable_file).total_coverage()
        total_size = sum([
            c.size
            for c in ref.file_contigs(dd.get_ref_file(data), data["config"])
        ])
        genome_cov_pct = callable_size / float(total_size)
        if genome_cov_pct > genome_cov_thresh:
            cov_interval = "genome"
            offtarget_pct = 0.0
        elif not vrs:
            cov_interval = "regional"
            offtarget_pct = 0.0
        else:
            offtarget_pct = _count_offtarget(data, data["work_bam"], vrs
                                             or callable_file,
                                             "variant_regions")
            if offtarget_pct > offtarget_thresh:
                cov_interval = "regional"
            else:
                cov_interval = "amplicon"
        logger.info(
            "%s: Assigned coverage as '%s' with %.1f%% genome coverage and %.1f%% offtarget coverage"
            % (dd.get_sample_name(data), cov_interval, genome_cov_pct * 100.0,
               offtarget_pct * 100.0))
        data["config"]["algorithm"]["coverage_interval"] = cov_interval
    return data
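Note: the classification rule used above, condensed into a pure function for clarity. Thresholds match the example; the inputs in the usage lines are illustrative.

def classify_coverage(genome_cov_pct, offtarget_pct, has_variant_regions=True,
                      genome_cov_thresh=0.40, offtarget_thresh=0.05):
    """Classify coverage as genome, regional (capture) or amplicon."""
    if genome_cov_pct > genome_cov_thresh:
        return "genome"
    if not has_variant_regions:
        return "regional"
    return "regional" if offtarget_pct > offtarget_thresh else "amplicon"

print(classify_coverage(0.02, 0.12))  # regional: plenty of off-target reads
print(classify_coverage(0.02, 0.01))  # amplicon: almost no off-target reads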
Example #42
0
def remove_extracontigs(in_bam, data):
    """Remove extra contigs (non chr1-22,X,Y) from an input BAM.

    These extra contigs can often be arranged in different ways, causing
    incompatibility issues with GATK and other tools. This also fixes the
    read group header as in fixrg.

    This does not yet handle remapping "1" -> "chr1" naming issues, since that
    requires extensive search/replace which slows down conversion.
    """
    work_dir = utils.safe_makedir(
        os.path.join(dd.get_work_dir(data), "bamclean",
                     dd.get_sample_name(data)))
    out_file = os.path.join(
        work_dir,
        "%s-noextras.bam" % utils.splitext_plus(os.path.basename(in_bam))[0])
    if not utils.file_exists(out_file):
        out_file = os.path.join(work_dir,
                                "%s-noextras.bam" % dd.get_sample_name(data))
    if not utils.file_uptodate(out_file, in_bam):
        with file_transaction(data, out_file) as tx_out_file:
            target_chroms = _target_chroms_and_header(in_bam, data)
            str_chroms = " ".join(target_chroms)
            rg_info = novoalign.get_rg_info(data["rgnames"])
            bcbio_py = sys.executable
            ref_file = dd.get_ref_file(data)
            local_bam = os.path.join(os.path.dirname(tx_out_file),
                                     os.path.basename(in_bam))
            cores = dd.get_cores(data)
            utils.symlink_plus(in_bam, local_bam)
            bam.index(local_bam, data["config"])
            cmd = (
                "samtools view -@ {cores} -h {local_bam} {str_chroms} | "
                """{bcbio_py} -c 'from bcbio.pipeline import cleanbam; """
                """cleanbam.fix_header("{ref_file}")' | """
                "samtools view -@ {cores} -u - | "
                "samtools addreplacerg -@ {cores} -r '{rg_info}' -m overwrite_all -O bam -o {tx_out_file} - "
            )
            do.run(
                cmd.format(**locals()),
                "bamprep, remove extra contigs: %s" % dd.get_sample_name(data))
    return out_file
Example #43
0
def assign_interval(data):
    """Identify coverage based on percent of genome covered and relation to targets.

    Classifies coverage into 3 categories:
      - genome: Full genome coverage
      - regional: Regional coverage, like exome capture, with off-target reads
      - amplicon: Amplicon-based regional coverage without off-target reads
    """
    genome_cov_thresh = 0.40  # fraction of the genome that must be covered to classify as whole genome analysis
    offtarget_thresh = 0.10  # fraction of off-target reads above which coverage is capture (not amplicon) based
    if not dd.get_coverage_interval(data):
        vrs = dd.get_variant_regions(data)
        callable_file = dd.get_sample_callable(data)
        if vrs:
            seq_size = pybedtools.BedTool(vrs).total_coverage()
        else:
            seq_size = pybedtools.BedTool(callable_file).total_coverage()
        total_size = sum([
            c.size
            for c in ref.file_contigs(dd.get_ref_file(data), data["config"])
        ])
        genome_cov_pct = seq_size / float(total_size)
        if genome_cov_pct > genome_cov_thresh:
            cov_interval = "genome"
            offtarget_pct = 0.0
        else:
            offtarget_stat_file = dd.get_offtarget_stats(data)
            if not offtarget_stat_file:
                offtarget_pct = 0.0
            else:
                with open(offtarget_stat_file) as in_handle:
                    stats = yaml.safe_load(in_handle)
                offtarget_pct = stats["offtarget"] / float(stats["mapped"])
            if offtarget_pct > offtarget_thresh:
                cov_interval = "regional"
            else:
                cov_interval = "amplicon"
        logger.info(
            "Assigned coverage as '%s' with %.1f%% genome coverage and %.1f%% offtarget coverage"
            % (cov_interval, genome_cov_pct * 100.0, offtarget_pct * 100.0))
        data["config"]["algorithm"]["coverage_interval"] = cov_interval
    return data
Example #44
0
def _bedpe_to_vcf(bedpe_file, sconfig_file, items):
    """Convert BEDPE output into a VCF file.
    """
    tovcf_script = do.find_cmd("bedpeToVcf")
    if tovcf_script:
        out_file = "%s.vcf.gz" % utils.splitext_plus(bedpe_file)[0]
        out_nogzip = out_file.replace(".vcf.gz", ".vcf")
        raw_file = "%s-raw.vcf" % utils.splitext_plus(bedpe_file)[0]
        if not utils.file_exists(out_file):
            if not utils.file_exists(raw_file):
                with file_transaction(items[0], raw_file) as tx_raw_file:
                    cmd = [sys.executable, tovcf_script, "-c", sconfig_file, "-f", dd.get_ref_file(items[0]),
                           "-t", "LUMPY", "-b", bedpe_file, "-o", tx_raw_file]
                    do.run(cmd, "Convert lumpy bedpe output to VCF")
            clean_file = _clean_lumpy_vcf(raw_file, items[0])
            prep_file = vcfutils.sort_by_ref(clean_file, items[0])
            if not utils.file_exists(out_nogzip):
                utils.symlink_plus(prep_file, out_nogzip)
        out_file = vcfutils.bgzip_and_index(out_nogzip, items[0]["config"])
        return out_file
Example #45
0
def _rnaseq_qualimap(bam_file, data, out_dir):
    """
    Run Qualimap on an RNA-seq BAM file and parse the results.
    """
    report_file = os.path.join(out_dir, "qualimapReport.html")
    config = data["config"]
    gtf_file = dd.get_gtf_file(data)
    ref_file = dd.get_ref_file(data)
    single_end = not bam.is_paired(bam_file)
    if not utils.file_exists(report_file):
        utils.safe_makedir(out_dir)
        bam.index(bam_file, config)
        cmd = _rnaseq_qualimap_cmd(config, bam_file, out_dir, gtf_file, single_end)
        do.run(cmd, "Qualimap for {}".format(data["name"][-1]))
    metrics = _parse_rnaseq_qualimap_metrics(report_file)
    metrics.update(_detect_duplicates(bam_file, out_dir, data))
    metrics.update(_detect_rRNA(data))
    metrics.update({"Fragment Length Mean": bam.estimate_fragment_size(bam_file)})
    metrics = _parse_metrics(metrics)
    return metrics
Example #46
0
def tobam_cl(data, out_file, is_paired=False):
    """Prepare command line for producing de-duplicated sorted output.

    - If no deduplication, sort and prepare a BAM file.
    - If paired, then use samblaster and prepare discordant outputs.
    - If unpaired, use biobambam's bammarkduplicates
    """
    do_dedup = _check_dedup(data)
    with file_transaction(data, out_file) as tx_out_file:
        if not do_dedup:
            yield (sam_to_sortbam_cl(data, tx_out_file), tx_out_file)
        elif is_paired and not _too_many_contigs(dd.get_ref_file(data)):
            sr_file = "%s-sr.bam" % os.path.splitext(out_file)[0]
            disc_file = "%s-disc.bam" % os.path.splitext(out_file)[0]
            with file_transaction(data, sr_file) as tx_sr_file:
                with file_transaction(data, disc_file) as tx_disc_file:
                    yield (samblaster_dedup_sort(data, tx_out_file, tx_sr_file, tx_disc_file),
                           tx_out_file)
        else:
            yield (_biobambam_dedup_sort(data, tx_out_file), tx_out_file)
Example #47
0
def _maybe_limit_chromosomes(data):
    """Potentially limit chromosomes to avoid problematically named HLA contigs.

    HLA contigs have ':' characters in their names, which confuse downstream
    processing. If there are no problematic chromosomes we don't limit anything.
    """
    std_chroms = []
    prob_chroms = []
    noalt_calling = "noalt_calling" in dd.get_tools_on(
        data) or "altcontigs" in dd.get_exclude_regions(data)
    for contig in ref.file_contigs(dd.get_ref_file(data)):
        if contig.name.find(":") > 0 or (
                noalt_calling and not chromhacks.is_nonalt(contig.name)):
            prob_chroms.append(contig.name)
        else:
            std_chroms.append(contig.name)
    if len(prob_chroms) > 0:
        return std_chroms
    else:
        return []
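Note: the same decision reduced to a sketch over plain contig names. Contigs with ':' in the name (HLA alleles) are treated as problematic, matching the check above; the contig lists in the usage lines are hypothetical.

def limit_chromosomes(contig_names):
    """Return standard contigs if any problematic ones exist, else an empty list."""
    std_chroms, prob_chroms = [], []
    for name in contig_names:
        if name.find(":") > 0:
            prob_chroms.append(name)
        else:
            std_chroms.append(name)
    return std_chroms if prob_chroms else []

print(limit_chromosomes(["chr1", "chr2", "HLA-A*01:01:01:01"]))  # ['chr1', 'chr2']
print(limit_chromosomes(["chr1", "chr2"]))                        # []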
Example #48
0
def run_kallisto_rnaseq(data):
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    kallisto_dir = os.path.join(work_dir, "kallisto", samplename)
    gtf_file = dd.get_gtf_file(data)
    files = dd.get_input_sequence_files(data)
    if len(files) == 2:
        fq1, fq2 = files
    else:
        fq1, fq2 = files[0], None
    assert file_exists(gtf_file), "%s was not found, exiting." % gtf_file
    fasta_file = dd.get_ref_file(data)
    assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file
    assert fq2, ("bcbio doesn't support kallisto for single-end reads, we can "
                 "add support for this if you open up an issue about it here: "
                 "https://github.com/bcbio/bcbio-nextgen/issues")
    out_file = kallisto_rnaseq(fq1, fq2, kallisto_dir, gtf_file, fasta_file,
                               data)
    data = dd.set_kallisto_quant(data, out_file)
    return [[data]]
Example #49
0
def remove_nonassembled_chrom(bam_file, data):
    """Remove non-assembled contigs from the BAM file"""
    ref_file = dd.get_ref_file(data)
    config = dd.get_config(data)
    fai = "%s.fai" % ref_file
    chrom = []
    with open(fai) as inh:
        for line in inh:
            c = line.split("\t")[0]
            if c.find("_") < 0:
                chrom.append(c)
    chroms = " ".join(chrom)
    out_file = utils.append_stem(bam_file, '_chrom')
    samtools = config_utils.get_program("samtools", config)
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out:
            cmd = "{samtools} view -b {bam_file} {chroms} > {tx_out}"
            do.run(cmd.format(**locals()), "Remove contigs from %s" % bam_file)
        bam.index(out_file, config)
    return out_file
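Note: a standalone sketch of the .fai parsing above, assuming a hypothetical genome.fa.fai path. A samtools faidx index is tab-separated with the contig name in the first column; names containing '_' are treated as non-assembled placeholder contigs.

def assembled_chroms(fai_file):
    """Read assembled chromosome names from a FASTA .fai index."""
    chroms = []
    with open(fai_file) as in_handle:
        for line in in_handle:
            name = line.split("\t")[0]
            if name.find("_") < 0:
                chroms.append(name)
    return chroms

# e.g. assembled_chroms("genome.fa.fai") -> ['chr1', ..., 'chrX', 'chrY', 'chrM']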
Example #50
0
def prepare_intervals(data, region_file, work_dir):
    """Prepare interval regions for targeted and gene based regions.
    """
    target_file = os.path.join(
        work_dir, "%s-target.interval_list" % dd.get_sample_name(data))
    if not utils.file_uptodate(target_file, region_file):
        with file_transaction(data, target_file) as tx_out_file:
            params = [
                "-T", "PreprocessIntervals", "-R",
                dd.get_ref_file(data), "--interval-merging-rule",
                "OVERLAPPING_ONLY", "-O", tx_out_file
            ]
            if dd.get_coverage_interval(data) == "genome":
                params += ["--bin-length", "1000", "--padding", "0"]
            else:
                params += [
                    "-L", region_file, "--bin-length", "0", "--padding", "250"
                ]
            _run_with_memory_scaling(params, tx_out_file, data)
    return target_file
Example #51
0
def add_dbsnp(orig_file, dbsnp_file, data, out_file=None):
    """Annotate a VCF file with dbSNP.
    """
    orig_file = vcfutils.bgzip_and_index(orig_file, data["config"])
    if out_file is None:
        out_file = "%s-wdbsnp.vcf.gz" % utils.splitext_plus(orig_file)[0]
    if not utils.file_uptodate(out_file, orig_file):
        with file_transaction(data, out_file) as tx_out_file:
            conf_file = os.path.join(os.path.dirname(tx_out_file), "dbsnp.conf")
            with open(conf_file, "w") as out_handle:
                out_handle.write('[[annotation]]\n')
                out_handle.write('file="%s"\n' % os.path.normpath(os.path.join(dd.get_work_dir(data), dbsnp_file)))
                out_handle.write('fields=["ID"]\n')
                out_handle.write('names=["rs_ids"]\n')
                out_handle.write('ops=["concat"]\n')
            ref_file = dd.get_ref_file(data)
            cmd = ("vcfanno {conf_file} {orig_file} | "
                   "bcftools annotate --set-id +'%INFO/rs_ids' -o {tx_out_file} -O z")
            do.run(cmd.format(**locals()), "Annotate with dbSNP")
    return vcfutils.bgzip_and_index(out_file, data["config"])
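Note: for reference, the dbsnp.conf written above is a single vcfanno TOML stanza; with a hypothetical dbSNP path it looks like this:

[[annotation]]
file="/path/to/work/dbsnp-151.vcf.gz"
fields=["ID"]
names=["rs_ids"]
ops=["concat"]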
Example #52
0
def run_rnaseq_joint_genotyping(*samples):
    data = samples[0][0]
    variantcaller = dd.get_variantcaller(data)
    if not variantcaller:
        return samples
    if "gatk" not in variantcaller:
        return samples
    ref_file = dd.get_ref_file(data)
    vrn_files = [
        dd.get_vrn_file(d) for d in dd.sample_data_iterator(samples)
    ]
    out_file = variation.gatk_joint_calling(data, vrn_files, ref_file)
    vrn_file = vcfanno.run_vcfanno(out_file, ["rnaedit"], data)
    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        data = dd.set_square_vcf(data, vrn_file)
        updated_samples.append([data])
    return updated_samples
Example #53
0
def variantcall_batch_region(items):
    """CWL entry point: variant call a batch of samples in a region.
    """
    items = [utils.to_single_data(x) for x in items]
    align_bams = [dd.get_align_bam(x) for x in items]
    variantcaller = _get_batch_variantcaller(items)
    region = list(set([x.get("region") for x in items if "region" in x]))
    assert len(region) == 1, region
    region = region[0]
    caller_fn = get_variantcallers()[variantcaller]
    assoc_files = tz.get_in(("genome_resources", "variation"), items[0], {})
    region = _region_to_coords(region)
    chrom, start, end = region
    region_str = "_".join(str(x) for x in region)
    batch_name = _get_batch_name(items)
    out_file = os.path.join(dd.get_work_dir(items[0]), variantcaller, chrom,
                            "%s-%s.vcf.gz" % (batch_name, region_str))
    utils.safe_makedir(os.path.dirname(out_file))
    call_file = caller_fn(align_bams, items, dd.get_ref_file(items[0]), assoc_files, region, out_file)
    return {"vrn_file_region": call_file, "region": "%s:%s-%s" % (chrom, start, end)}
Example #54
0
def _extract_split_and_discordants(in_bam, work_dir, data):
    """Retrieve split-read alignments from input BAM file.
    """
    sr_file = os.path.join(
        work_dir, "%s-sr.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    disc_file = os.path.join(
        work_dir,
        "%s-disc.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    if not utils.file_exists(sr_file) or not utils.file_exists(disc_file):
        with file_transaction(data, sr_file) as tx_sr_file:
            with file_transaction(data, disc_file) as tx_disc_file:
                cores = dd.get_num_cores(data)
                ref_file = dd.get_ref_file(data)
                cmd = ("extract-sv-reads -e --threads {cores} -T {ref_file} "
                       "-i {in_bam} -s {tx_sr_file} -d {tx_disc_file}")
                do.run(cmd.format(**locals()),
                       "extract split and discordant reads", data)
    for fname in [sr_file, disc_file]:
        bam.index(fname, data["config"])
    return sr_file, disc_file
Example #55
0
def cnvkit_background(background_cnns, out_file, items, target_bed=None, antitarget_bed=None):
    """Calculate background reference, handling flat case with no normal sample.
    """
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            cmd = [_get_cmd(), "reference", "-f", dd.get_ref_file(items[0]), "-o", tx_out_file]
            genders = set([population.get_gender(x) for x in items])
            genders.discard("unknown")
            if len(genders) == 1:
                gender = genders.pop()
                cmd += ["--gender", gender]
                if gender.lower() == "male":
                    cmd += ["--male-reference"]
            if len(background_cnns) == 0:
                assert target_bed and antitarget_bed, "Missing CNNs and target BEDs for flat background"
                cmd += ["-t", target_bed, "-a", antitarget_bed]
            else:
                cmd += background_cnns
            do.run(_prep_cmd(cmd, tx_out_file), "CNVkit background")
    return out_file
Example #56
0
def _bgzip_from_cram_sambamba(cram_file, dirs, data):
    """Use sambamba to extract from CRAM via regions.
    """
    raise NotImplementedError(
        "sambamba doesn't yet support retrieval from CRAM by BED file")
    # NOTE: the code below is currently unreachable and documents the intended
    # approach for when sambamba supports CRAM retrieval by BED file.
    region_file = (tz.get_in(["config", "algorithm", "variant_regions"], data)
                   if tz.get_in(["config", "algorithm", "coverage_interval"],
                                data) in ["regional", "exome"] else None)
    base_name = utils.splitext_plus(os.path.basename(cram_file))[0]
    work_dir = utils.safe_makedir(
        os.path.join(dirs["work"], "align_prep", "%s-parts" % base_name))
    f1, f2, o1, o2, si = [
        os.path.join(work_dir, "%s.fq" % x)
        for x in ["match1", "match2", "unmatch1", "unmatch2", "single"]
    ]
    ref_file = dd.get_ref_file(data)
    region = "-L %s" % region_file if region_file else ""
    cmd = ("sambamba view -f bam -l 0 -C {cram_file} -T {ref_file} {region} | "
           "bamtofastq F={f1} F2={f2} S={si} O={o1} O2={o2}")
    do.run(cmd.format(**locals()), "Convert CRAM to fastq in regions")
Example #57
0
def run_salmon_reads(data):
    data = utils.to_single_data(data)
    files = dd.get_input_sequence_files(data)
    if bam.is_bam(files[0]):
        files = fastq.convert_bam_to_fastq(files[0], data["dirs"]["work"],
                                           data, data["dirs"], data["config"])
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    salmon_dir = os.path.join(work_dir, "salmon", samplename)
    gtf_file = dd.get_gtf_file(data)
    if len(files) == 2:
        fq1, fq2 = files
    else:
        fq1, fq2 = files[0], None
    fasta_file = dd.get_ref_file(data)
    out_file = salmon_quant_reads(fq1, fq2, salmon_dir, gtf_file, fasta_file,
                                  data)
    data = dd.set_salmon(data, out_file)
    data = dd.set_salmon_dir(data, salmon_dir)
    return [[data]]
Example #58
0
def add_genes(in_file, data, max_distance=10000, work_dir=None):
    """Add gene annotations to a BED file from pre-prepared RNA-seq data.

    max_distance -- only keep annotations within this distance of event
    """
    gene_file = regions.get_sv_bed(data,
                                   "exons",
                                   out_dir=os.path.dirname(in_file))
    if gene_file and utils.file_exists(in_file):
        out_file = "%s-annotated.bed" % utils.splitext_plus(in_file)[0]
        if work_dir:
            out_file = os.path.join(work_dir, os.path.basename(out_file))
        if not utils.file_uptodate(out_file, in_file):
            fai_file = ref.fasta_idx(dd.get_ref_file(data))
            with file_transaction(data, out_file) as tx_out_file:
                add_genes_to_bed(in_file, gene_file, fai_file, tx_out_file,
                                 max_distance)
        return out_file
    else:
        return in_file
Example #59
0
def gatk_rnaseq_calling(data):
    """Use GATK to perform gVCF variant calling on RNA-seq data
    """
    data = utils.deepish_copy(data)
    tools_on = dd.get_tools_on(data)
    if not tools_on:
        tools_on = []
    tools_on.append("gvcf")
    data = dd.set_tools_on(data, tools_on)
    data = dd.set_jointcaller(
        data, ["%s-joint" % v for v in dd.get_variantcaller(data)])
    out_file = os.path.join(
        utils.safe_makedir(
            os.path.join(dd.get_work_dir(data), "variation", "rnaseq",
                         "gatk-haplotype")),
        "%s-gatk-haplotype.vcf.gz" % dd.get_sample_name(data))
    out_file = gatk.haplotype_caller([dd.get_split_bam(data)], [data],
                                     dd.get_ref_file(data), {},
                                     out_file=out_file)
    return dd.set_vrn_file(data, out_file)

def variants(data):
    """Annotate variants with GC content and depth for coverage reporting."""
    if "vrn_file" not in data:
        return data
    if not dd.get_coverage(data):
        return data

    in_vcf = data['vrn_file']
    work_dir = os.path.join(dd.get_work_dir(data), "report", "variants")
    with chdir(work_dir):
        in_bam = data['work_bam']
        ref_file = dd.get_ref_file(data)
        assert ref_file, "Need the reference genome fasta file."
        bed_file = dd.get_variant_regions(data)
        sample = dd.get_sample_name(data)
        cg_file = sample + "_with-gc.vcf.gz"
        parse_file = sample + "_gc-depth-parse.tsv"
        num_cores = dd.get_num_cores(data)
        broad_runner = broad.runner_from_config_safe(data["config"])
        if in_bam and broad_runner and broad_runner.has_gatk():
            if not file_exists(cg_file):
                with file_transaction(cg_file) as tx_out:
                    params = [
                        "-T", "VariantAnnotator", "-R", ref_file, "-L",
                        bed_file, "-I", in_bam, "-A", "GCContent", "-A",
                        "Coverage", "--variant", in_vcf, "--out", tx_out
                    ]
                    broad_runner.run_gatk(params)
            cg_file = vcfutils.bgzip_and_index(cg_file, data["config"])

            if not file_exists(parse_file):
                with file_transaction(parse_file) as out_tx:
                    with open(out_tx, 'w') as out_handle:
                        out_handle.write("CG\tdepth\tsample\n")
                    cmd = (
                        "bcftools query -s {sample} -f '[%GC][\\t%DP][\\t%SAMPLE]\\n' -R "
                        "{bed_file} {cg_file} >> {out_tx}")
                    do.run(cmd.format(**locals()),
                           "Calculating GC content and depth for %s" % in_vcf)
                    logger.debug('parsing coverage: %s' % sample)
        return data
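Note: with the bcftools query format above, the resulting <sample>_gc-depth-parse.tsv is a simple three-column table; the header is written by the code and one row is appended per variant per sample. The values below are illustrative only.

CG	depth	sample
0.54	87	SAMPLE1
0.41	102	SAMPLE1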