Esempio n. 1
0
def postprocess_variants(items):
    """Provide post-processing of variant calls: filtering and effects annotation.

    ``items`` is either a dict or an iterable of per-sample inputs. Non-dict
    input is normalized with ``utils.to_single_data`` and, when the first
    sample contains ``vrn_file_joint``, that key is processed instead of
    ``vrn_file``.

    When the representative sample has a file under the variant key, the
    following steps run in order, each storing its result back on the sample:
    effects annotation, VCF finalization, RNA editing annotation (RNA-seq
    analyses only), population annotation (CWL runs only), filtering,
    prioritization (followed by germline extraction if prioritization produced
    a different file) and damage filtering (only when an aligned BAM is
    available).

    Returns the processed representative sample wrapped as ``[[data]]``.
    """
    vrn_key = "vrn_file"
    if not isinstance(items, dict):
        items = [utils.to_single_data(x) for x in items]
        if "vrn_file_joint" in items[0]:
            vrn_key = "vrn_file_joint"
    # Helper returns the sample to process plus the (possibly adjusted) items.
    data, items = _get_batch_representative(items, vrn_key)
    items = cwlutils.unpack_tarballs(items, data)
    data = cwlutils.unpack_tarballs(data, data)
    cur_name = "%s, %s" % (dd.get_sample_name(data), get_variantcaller(data))
    logger.info("Finalizing variant calls: %s" % cur_name)
    # Remember the incoming path so it can be restored at the end if the
    # processed file turns out to be the very same file on disk.
    orig_vrn_file = data.get(vrn_key)
    data = _symlink_to_workdir(data, [vrn_key])
    data = _symlink_to_workdir(data,
                               ["config", "algorithm", "variant_regions"])
    if data.get(vrn_key):
        logger.info("Calculating variation effects for %s" % cur_name)
        ann_vrn_file, vrn_stats = effects.add_to_vcf(data[vrn_key], data)
        # Both outputs are optional: only replace/record what was produced.
        if ann_vrn_file:
            data[vrn_key] = ann_vrn_file
        if vrn_stats:
            data["vrn_stats"] = vrn_stats
        orig_items = _get_orig_items(items)
        logger.info("Annotate VCF file: %s" % cur_name)
        data[vrn_key] = annotation.finalize_vcf(data[vrn_key],
                                                get_variantcaller(data),
                                                orig_items)
        # NOTE(review): the two annotation steps below read the input through
        # dd.get_vrn_file(data) rather than data[vrn_key]; confirm these refer
        # to the same file when vrn_key is "vrn_file_joint".
        if dd.get_analysis(data).lower().find("rna-seq") >= 0:
            logger.info("Annotate RNA editing sites")
            ann_file = vcfanno.run_vcfanno(dd.get_vrn_file(data), ["rnaedit"],
                                           data)
            if ann_file:
                data[vrn_key] = ann_file
        if cwlutils.is_cwl_run(data):
            logger.info("Annotate with population level variation data")
            ann_file = population.run_vcfanno(dd.get_vrn_file(data), data,
                                              population.do_db_build([data]))
            if ann_file:
                data[vrn_key] = ann_file
        logger.info("Filtering for %s" % cur_name)
        data[vrn_key] = variant_filtration(
            data[vrn_key], dd.get_ref_file(data),
            tz.get_in(("genome_resources", "variation"), data, {}), data,
            orig_items)
        logger.info("Prioritization for %s" % cur_name)
        prio_vrn_file = prioritize.handle_vcf_calls(data[vrn_key], data,
                                                    orig_items)
        # Germline extraction only runs when prioritization returned a
        # different file than the one it was given.
        if prio_vrn_file != data[vrn_key]:
            data[vrn_key] = prio_vrn_file
            logger.info("Germline extraction for %s" % cur_name)
            data = germline.extract(data, orig_items)

        if dd.get_align_bam(data):
            data = damage.run_filter(data[vrn_key], dd.get_align_bam(data),
                                     dd.get_ref_file(data), data, orig_items)
    # If the final path still resolves to the original file (presumably only
    # the work directory symlink differs), report the original path instead.
    if orig_vrn_file and os.path.samefile(data[vrn_key], orig_vrn_file):
        data[vrn_key] = orig_vrn_file
    return [[data]]
Esempio n. 2
0
def run_rnaseq_variant_calling(data):
    """Call RNA-seq variants and annotate them, storing the file in `vrn_file`.

    Exits the process with status 1 if the configured variant caller is a list
    with more than one entry. Otherwise runs GATK calling when
    "gatk-haplotype" is configured and VarDict calling when a VarDict command
    is available, then annotates any resulting variant file with the
    ``rnaedit`` vcfanno configuration and with population data.

    Returns the updated sample wrapped as ``[[data]]``.
    """
    callers = dd.get_variantcaller(data)
    multiple_callers = isinstance(callers, list) and len(callers) > 1
    if multiple_callers:
        logger.error("Only one variantcaller can be run for RNA-seq at "
                     "this time. Post an issue here "
                     "(https://github.com/bcbio/bcbio-nextgen/issues) "
                     "if this is something you need to do.")
        sys.exit(1)

    # Both callers are only considered when a caller is configured at all.
    if callers and "gatk-haplotype" in callers:
        data = variation.rnaseq_gatk_variant_calling(data)
    if callers and vardict.get_vardict_command(data):
        data = variation.rnaseq_vardict_variant_calling(data)

    # Nothing to annotate without a variant file.
    if not dd.get_vrn_file(data):
        return [[data]]

    rnaedit_vcf = vcfanno.run_vcfanno(dd.get_vrn_file(data), ["rnaedit"],
                                      data)
    if rnaedit_vcf:
        data = dd.set_vrn_file(data, rnaedit_vcf)

    population_vcf = population.run_vcfanno(dd.get_vrn_file(data), data,
                                            population.do_db_build([data]))
    if population_vcf:
        data = dd.set_vrn_file(data, population_vcf)
    return [[data]]
Esempio n. 3
0
def run_rnaseq_ann_filter(data):
    """Run RNA-seq annotation and filtering.

    Annotates the sample's variant file with the ``rnaedit`` vcfanno
    configuration, then applies the GATK RNA-seq filter, recording each
    resulting file as the sample's variant file.

    Returns the updated sample wrapped as ``[[data]]``.
    """
    ann_file = vcfanno.run_vcfanno(dd.get_vrn_file(data), ["rnaedit"], data)
    if ann_file:
        # Rebind to the setter's return value: relying on in-place mutation
        # silently drops the update if the setter builds a new dict.
        data = dd.set_vrn_file(data, ann_file)
    filter_file = variation.gatk_filter_rnaseq(dd.get_vrn_file(data), data)
    data = dd.set_vrn_file(data, filter_file)
    return [[data]]
Esempio n. 4
0
def run_rnaseq_ann_filter(data):
    """Annotate RNA-seq variant calls with population data, then filter them.

    Population annotation runs only when the sample has a variant file; the
    GATK RNA-seq filter runs only when "gatk-haplotype" is the configured
    variant caller.

    Returns the updated sample wrapped as ``[[data]]``.
    """
    sample = to_single_data(data)

    if dd.get_vrn_file(sample):
        annotated_vcf = population.run_vcfanno(dd.get_vrn_file(sample), sample)
        if annotated_vcf:
            sample = dd.set_vrn_file(sample, annotated_vcf)

    callers = dd.get_variantcaller(sample)
    if not callers or "gatk-haplotype" not in callers:
        return [[sample]]

    filtered_vcf = variation.gatk_filter_rnaseq(dd.get_vrn_file(sample), sample)
    return [[dd.set_vrn_file(sample, filtered_vcf)]]
Esempio n. 5
0
def _run_gemini_stats(bam_file, data, out_dir):
    """Retrieve high level variant statistics from Gemini.

    Uses the first Gemini database found under ``data["variants"]`` when one
    is present; computed statistics are cached in a ``-stats.yaml`` file named
    after the database and re-read while that cache is up to date. Without a
    database, the total record count is taken from ``bcftools stats`` run on
    the sample's variant file (kept as the string read from that output).

    ``bam_file`` and ``out_dir`` are part of the signature but are not used.

    Returns a flat dict holding the top-level statistics plus the per-sample
    statistics stored under ``data["name"][-1]``.
    """
    out = {}
    gemini_dbs = [db for db in
                  (tz.get_in(["population", "db"], x) for x in data.get("variants", []))
                  if db]
    if gemini_dbs:
        gemini_db = gemini_dbs[0]
        gemini_stat_file = "%s-stats.yaml" % os.path.splitext(gemini_db)[0]
        if not utils.file_uptodate(gemini_stat_file, gemini_db):
            gemini = config_utils.get_program("gemini", data["config"])
            # universal_newlines=True returns text instead of bytes so the
            # string parsing below works on both Python 2 and Python 3.
            tstv = subprocess.check_output([gemini, "stats", "--tstv", gemini_db],
                                           universal_newlines=True)
            gt_counts = subprocess.check_output([gemini, "stats", "--gts-by-sample", gemini_db],
                                                universal_newlines=True)
            dbsnp_count = subprocess.check_output([gemini, "query", gemini_db, "-q",
                                                   "SELECT count(*) FROM variants WHERE in_dbsnp==1"],
                                                  universal_newlines=True)
            # Second line of the output, last column.
            out["Transition/Transversion"] = tstv.split("\n")[1].split()[-1]
            for line in gt_counts.split("\n"):
                parts = line.rstrip().split()
                # Skip blank lines and the header row.
                if parts and parts[0] != "sample":
                    name, _hom_ref, het, hom_var, _, total = parts
                    out[name] = {"Variations (heterozygous)": int(het),
                                 "Variations (homozygous)": int(hom_var)}
                    # same total variations for all samples, keep that top level as well.
                    out["Variations (total)"] = int(total)
            out["Variations (in dbSNP)"] = int(dbsnp_count.strip())
            # Default to 0 so a missing total is never compared as None.
            if out.get("Variations (total)", 0) > 0:
                out["Variations (in dbSNP) pct"] = "%.1f%%" % (out["Variations (in dbSNP)"] /
                                                               float(out["Variations (total)"]) * 100.0)
            with open(gemini_stat_file, "w") as out_handle:
                yaml.safe_dump(out, out_handle, default_flow_style=False, allow_unicode=False)
        else:
            with open(gemini_stat_file) as in_handle:
                # An empty cache file loads as None; treat it as no statistics.
                out = yaml.safe_load(in_handle) or {}
    else:
        vcf_file = dd.get_vrn_file(data)
        if isinstance(vcf_file, list):
            vcf_file = vcf_file[0]
        if vcf_file:
            out_file = "%s-bcfstats.tsv" % utils.splitext_plus(vcf_file)[0]
            bcftools = config_utils.get_program("bcftools", data["config"])
            if not utils.file_exists(out_file):
                cmd = "{bcftools} stats -f PASS {vcf_file} > {out_file}"
                do.run(cmd.format(bcftools=bcftools, vcf_file=vcf_file, out_file=out_file),
                       "basic vcf stats %s" % data["name"][-1])
            with open(out_file) as in_handle:
                for line in in_handle:
                    if line.startswith("SN") and "records" in line:
                        out["Variations (total)"] = line.split()[-1]

    # Flatten: keep scalar entries and merge in this sample's nested entries.
    res = {}
    for k, v in out.items():
        if not isinstance(v, dict):
            res[k] = v
        if k == data["name"][-1]:
            res.update(v)
    return res
Esempio n. 6
0
def _default_conf_files(data, retriever):
    """Collect the default annotation configuration names for a sample.

    Returns an empty list unless the sample has a variant caller or a variant
    file. Otherwise the list may contain, in this order, "gemini", "somatic"
    and "rnaedit" (the last one for analyses whose name contains "rna-seq").
    """
    has_variants = dd.get_variantcaller(data) or dd.get_vrn_file(data)
    if not has_variants:
        return []

    selected = []
    if annotate_gemini(data, retriever):
        selected.append("gemini")
    if _annotate_somatic(data, retriever):
        selected.append("somatic")
    if "rna-seq" in dd.get_analysis(data).lower():
        selected.append("rnaedit")
    return selected
Esempio n. 7
0
def _default_conf_files(data, retriever):
    """Return the default annotation configuration names that apply to `data`.

    Nothing applies when the sample has neither a variant caller nor a variant
    file. Otherwise each candidate is checked in order: "gemini", "somatic",
    then "rnaedit" for analyses whose name contains "rna-seq".
    """
    if not (dd.get_variantcaller(data) or dd.get_vrn_file(data)):
        return []

    # Checks are wrapped in callables so they run lazily and in order.
    candidates = (
        ("gemini", lambda: annotate_gemini(data, retriever)),
        ("somatic", lambda: _annotate_somatic(data, retriever)),
        ("rnaedit", lambda: "rna-seq" in dd.get_analysis(data).lower()),
    )
    return [conf_name for conf_name, applies in candidates if applies()]
Esempio n. 8
0
def run_rnaseq_ann_filter(data):
    """Annotate and filter RNA-seq variant calls.

    With a variant file present, adds effects and population annotations.
    Applies the GATK RNA-seq filter when "gatk-haplotype" is the configured
    caller, and always finishes by filtering variants near splice junctions.

    Returns the updated sample wrapped as ``[[data]]``.
    """
    sample = to_single_data(data)

    if dd.get_vrn_file(sample):
        effects_vcf = effects.add_to_vcf(dd.get_vrn_file(sample), sample)[0]
        if effects_vcf:
            sample = dd.set_vrn_file(sample, effects_vcf)
        population_vcf = population.run_vcfanno(dd.get_vrn_file(sample), sample)
        if population_vcf:
            sample = dd.set_vrn_file(sample, population_vcf)

    callers = dd.get_variantcaller(sample)
    if callers and "gatk-haplotype" in callers:
        gatk_filtered = variation.gatk_filter_rnaseq(dd.get_vrn_file(sample), sample)
        sample = dd.set_vrn_file(sample, gatk_filtered)

    # Drop variants close to splice junctions.
    junction_filtered = variation.filter_junction_variants(dd.get_vrn_file(sample), sample)
    return [[dd.set_vrn_file(sample, junction_filtered)]]
Esempio n. 9
0
def run_rnaseq_ann_filter(data):
    """Run the RNA-seq annotation and filtering steps on one sample.

    Effects and population annotation are applied when a variant file exists,
    the GATK RNA-seq filter when "gatk-haplotype" is configured, and the
    splice junction filter unconditionally.

    Returns the updated sample wrapped as ``[[data]]``.
    """
    def _swap_in(current, candidate):
        # Optional steps may yield nothing; keep the current sample then.
        return dd.set_vrn_file(current, candidate) if candidate else current

    record = to_single_data(data)
    if dd.get_vrn_file(record):
        record = _swap_in(
            record, effects.add_to_vcf(dd.get_vrn_file(record), record)[0])
        record = _swap_in(
            record, population.run_vcfanno(dd.get_vrn_file(record), record))

    caller_config = dd.get_variantcaller(record)
    gatk_configured = caller_config and ("gatk-haplotype" in caller_config)
    if gatk_configured:
        record = dd.set_vrn_file(
            record, variation.gatk_filter_rnaseq(dd.get_vrn_file(record), record))

    # remove variants close to splice junctions
    current_vcf = dd.get_vrn_file(record)
    record = dd.set_vrn_file(
        record, variation.filter_junction_variants(current_vcf, record))
    return [[record]]
Esempio n. 10
0
def run_rnaseq_variant_calling(data):
    """
    run RNA-seq variant calling, variation file is stored in `vrn_file`
    in the datadict

    Exits the process with status 1 when the configured variant caller is a
    list with more than one entry. GATK calling runs when "gatk" appears in
    the configured caller. VarDict calling runs when a VarDict command is
    available, and only that path annotates the calls with the ``rnaedit``
    vcfanno configuration here.

    Returns the updated sample wrapped as ``[[data]]``.
    """
    variantcaller = dd.get_variantcaller(data)
    if isinstance(variantcaller, list) and len(variantcaller) > 1:
        logger.error("Only one variantcaller can be run for RNA-seq at "
                     "this time. Post an issue here "
                     "(https://github.com/chapmanb/bcbio-nextgen/issues) "
                     "if this is something you need to do.")
        sys.exit(1)

    if variantcaller and "gatk" in variantcaller:
        data = variation.rnaseq_gatk_variant_calling(data)
    if vardict.get_vardict_command(data):
        data = variation.rnaseq_vardict_variant_calling(data)
        if dd.get_vrn_file(data):
            vrn_file = vcfanno.run_vcfanno(dd.get_vrn_file(data), ["rnaedit"], data)
            # Keep the existing calls when annotation produced no file instead
            # of overwriting vrn_file with an empty value.
            if vrn_file:
                data = dd.set_vrn_file(data, vrn_file)
    return [[data]]
Esempio n. 11
0
def run_rnaseq_variant_calling(data):
    """
    run RNA-seq variant calling, variation file is stored in `vrn_file`
    in the datadict

    Exits the process with status 1 when the configured variant caller is a
    list with more than one entry. GATK calling runs when "gatk" appears in
    the configured caller and VarDict calling runs when a VarDict command is
    available. Any resulting variant file is then annotated with the
    ``rnaedit`` vcfanno configuration.

    Returns the updated sample wrapped as ``[[data]]``.
    """
    variantcaller = dd.get_variantcaller(data)
    if isinstance(variantcaller, list) and len(variantcaller) > 1:
        logger.error("Only one variantcaller can be run for RNA-seq at "
                     "this time. Post an issue here "
                     "(https://github.com/chapmanb/bcbio-nextgen/issues) "
                     "if this is something you need to do.")
        sys.exit(1)

    if variantcaller and "gatk" in variantcaller:
        data = variation.rnaseq_gatk_variant_calling(data)
    if vardict.get_vardict_command(data):
        data = variation.rnaseq_vardict_variant_calling(data)
    # annotate RNA-editing events with vcfanno
    if dd.get_vrn_file(data):
        vrn_file = vcfanno.run_vcfanno(dd.get_vrn_file(data), "rnaedit", data)
        # Keep the existing calls when annotation produced no file instead of
        # overwriting vrn_file with an empty value.
        if vrn_file:
            data = dd.set_vrn_file(data, vrn_file)
    return [[data]]
Esempio n. 12
0
def run_rnaseq_joint_genotyping(*samples):
    """Jointly genotype RNA-seq samples with GATK.

    Configuration is read from the first sample. When "gatk" is not part of
    its variant caller setting, ``samples`` is returned untouched. Otherwise
    the per-sample variant files are jointly called into
    ``variation/combined.vcf`` under the work directory and every sample is
    returned, each wrapped in its own list, with that result as square VCF.
    """
    first = samples[0][0]
    callers = dd.get_variantcaller(first)
    reference = dd.get_ref_file(first)
    combined_vcf = os.path.join(dd.get_work_dir(first, "."), "variation", "combined.vcf")

    if not callers or "gatk" not in callers:
        return samples

    sample_vcfs = [dd.get_vrn_file(s) for s in dd.sample_data_iterator(samples)]
    combined_vcf = variation.gatk_joint_calling(first, sample_vcfs, reference, combined_vcf)
    return [[dd.set_square_vcf(s, combined_vcf)]
            for s in dd.sample_data_iterator(samples)]
Esempio n. 13
0
def run_rnaseq_joint_genotyping(*samples):
    """Run GATK joint calling across RNA-seq samples.

    Settings come from the first sample. If its variant caller setting
    includes "gatk", all per-sample variant files are jointly called and the
    result is stored as the square VCF of every sample; the samples are then
    returned as a list of single-element lists. Otherwise ``samples`` is
    returned as received.
    """
    lead = samples[0][0]
    caller_config = dd.get_variantcaller(lead)
    genome_fasta = dd.get_ref_file(lead)
    work_dir = dd.get_work_dir(lead, ".")
    joint_vcf = os.path.join(work_dir, "variation", "combined.vcf")

    wants_gatk = bool(caller_config) and "gatk" in caller_config
    if wants_gatk:
        per_sample_vcfs = []
        for entry in dd.sample_data_iterator(samples):
            per_sample_vcfs.append(dd.get_vrn_file(entry))
        joint_vcf = variation.gatk_joint_calling(lead, per_sample_vcfs,
                                                 genome_fasta, joint_vcf)
        squared = []
        for entry in dd.sample_data_iterator(samples):
            squared.append([dd.set_square_vcf(entry, joint_vcf)])
        return squared
    return samples
Esempio n. 14
0
def run_rnaseq_joint_genotyping(*samples):
    """Jointly genotype RNA-seq samples with GATK and annotate RNA editing.

    Configuration is read from the first sample. When no variant caller is
    configured, or "gatk" is not part of it, ``samples`` is returned
    untouched. Otherwise the per-sample variant files are jointly called, the
    result is annotated with the ``rnaedit`` vcfanno configuration, and every
    sample is returned, each wrapped in its own list, with that file stored as
    its square VCF.
    """
    data = samples[0][0]
    variantcaller = dd.get_variantcaller(data)
    if not variantcaller or "gatk" not in variantcaller:
        return samples
    ref_file = dd.get_ref_file(data)
    vrn_files = [dd.get_vrn_file(d) for d in dd.sample_data_iterator(samples)]
    out_file = variation.gatk_joint_calling(data, vrn_files, ref_file)
    # Fall back to the unannotated joint calls if annotation yields no file,
    # so the square VCF is never replaced by an empty value.
    vrn_file = vcfanno.run_vcfanno(out_file, ["rnaedit"], data) or out_file
    return [[dd.set_square_vcf(d, vrn_file)]
            for d in dd.sample_data_iterator(samples)]
Esempio n. 15
0
def run_rnaseq_joint_genotyping(*samples):
    """Jointly genotype RNA-seq samples with GATK and annotate RNA editing.

    Configuration is read from the first sample. When no variant caller is
    configured, or "gatk" is not part of it, ``samples`` is returned
    untouched. Otherwise the per-sample variant files are jointly called, the
    result is annotated with the ``rnaedit`` vcfanno configuration, and every
    sample is returned, each wrapped in its own list, with that file stored as
    its square VCF.
    """
    data = samples[0][0]
    variantcaller = dd.get_variantcaller(data)
    if not variantcaller or "gatk" not in variantcaller:
        return samples
    ref_file = dd.get_ref_file(data)
    vrn_files = [dd.get_vrn_file(d) for d in dd.sample_data_iterator(samples)]
    out_file = variation.gatk_joint_calling(data, vrn_files, ref_file)
    # Fall back to the unannotated joint calls if annotation yields no file,
    # so the square VCF is never replaced by an empty value.
    vrn_file = vcfanno.run_vcfanno(out_file, "rnaedit", data) or out_file
    return [[dd.set_square_vcf(d, vrn_file)]
            for d in dd.sample_data_iterator(samples)]
Esempio n. 16
0
def run(bam_file, data, out_dir):
    """Retrieve high level variant statistics from Gemini.

    Uses the first Gemini database found under ``data["variants"]`` when one
    is present; computed statistics are cached in a ``-stats.yaml`` file named
    after the database and re-read while that cache is up to date. Without a
    database, the total record count is taken from ``bcftools stats`` run on
    the sample's variant file (kept as the string read from that output).

    ``bam_file`` and ``out_dir`` are part of the signature but are not used.

    Returns a flat dict holding the top-level statistics plus the per-sample
    statistics stored under the sample's name.
    """
    out = {}
    gemini_dbs = [
        db for db in
        (tz.get_in(["population", "db"], x) for x in data.get("variants", []))
        if db
    ]
    if gemini_dbs:
        gemini_db = gemini_dbs[0]
        gemini_stat_file = "%s-stats.yaml" % os.path.splitext(gemini_db)[0]
        if not utils.file_uptodate(gemini_stat_file, gemini_db):
            gemini = config_utils.get_program("gemini", data["config"])
            # universal_newlines=True returns text instead of bytes so the
            # string parsing below works on both Python 2 and Python 3.
            tstv = subprocess.check_output(
                [gemini, "stats", "--tstv", gemini_db],
                universal_newlines=True)
            gt_counts = subprocess.check_output(
                [gemini, "stats", "--gts-by-sample", gemini_db],
                universal_newlines=True)
            dbsnp_count = subprocess.check_output(
                [gemini, "query", gemini_db, "-q",
                 "SELECT count(*) FROM variants WHERE in_dbsnp==1"],
                universal_newlines=True)
            # Second line of the output, last column.
            out["Transition/Transversion"] = tstv.split("\n")[1].split()[-1]
            for line in gt_counts.split("\n"):
                parts = line.rstrip().split()
                # Skip blank lines and the header row.
                if parts and parts[0] != "sample":
                    name, _hom_ref, het, hom_var, _, total = parts
                    out[name] = {
                        "Variations (heterozygous)": int(het),
                        "Variations (homozygous)": int(hom_var),
                    }
                    # same total variations for all samples, keep that top level as well.
                    out["Variations (total)"] = int(total)
            out["Variations (in dbSNP)"] = int(dbsnp_count.strip())
            # Default to 0 so a missing total is never compared as None.
            if out.get("Variations (total)", 0) > 0:
                out["Variations (in dbSNP) pct"] = "%.1f%%" % (
                    out["Variations (in dbSNP)"] /
                    float(out["Variations (total)"]) * 100.0)
            with open(gemini_stat_file, "w") as out_handle:
                yaml.safe_dump(out,
                               out_handle,
                               default_flow_style=False,
                               allow_unicode=False)
        else:
            with open(gemini_stat_file) as in_handle:
                # An empty cache file loads as None; treat it as no statistics.
                out = yaml.safe_load(in_handle) or {}
    else:
        vcf_file = dd.get_vrn_file(data)
        if isinstance(vcf_file, list):
            vcf_file = vcf_file[0]
        if vcf_file:
            out_file = "%s-bcfstats.tsv" % utils.splitext_plus(vcf_file)[0]
            bcftools = config_utils.get_program("bcftools", data["config"])
            if not utils.file_exists(out_file):
                cmd = "{bcftools} stats -f PASS {vcf_file} > {out_file}"
                do.run(cmd.format(bcftools=bcftools, vcf_file=vcf_file,
                                  out_file=out_file),
                       "basic vcf stats %s" % dd.get_sample_name(data))
            with open(out_file) as in_handle:
                for line in in_handle:
                    if line.startswith("SN") and "records" in line:
                        out["Variations (total)"] = line.split()[-1]

    # Flatten: keep scalar entries and merge in this sample's nested entries.
    res = {}
    for k, v in out.items():
        if not isinstance(v, dict):
            res[k] = v
        if k == dd.get_sample_name(data):
            res.update(v)
    return res