예제 #1
0
def postprocess_variants(items):
    """Provide post-processing of variant calls: filtering and effects annotation.

    `items` is either a single sample dictionary or a list of samples. For a
    list, each entry is passed through `utils.to_single_data` and the
    `vrn_file_joint` entry is processed when the first sample has that key;
    otherwise `vrn_file` is processed.

    Every step reads its input from, and writes its output to, the selected
    key so that the annotations accumulate on one file whichever key is
    active.

    Returns the processed representative sample wrapped as `[[data]]`.
    """
    vrn_key = "vrn_file"
    if not isinstance(items, dict):
        items = [utils.to_single_data(x) for x in items]
        if "vrn_file_joint" in items[0]:
            vrn_key = "vrn_file_joint"
    data, items = _get_batch_representative(items, vrn_key)
    items = cwlutils.unpack_tarballs(items, data)
    data = cwlutils.unpack_tarballs(data, data)
    cur_name = "%s, %s" % (dd.get_sample_name(data), get_variantcaller(data))
    logger.info("Finalizing variant calls: %s" % cur_name)
    # Remember the incoming path so it can be restored if nothing changes it.
    orig_vrn_file = data.get(vrn_key)
    data = _symlink_to_workdir(data, [vrn_key])
    data = _symlink_to_workdir(data,
                               ["config", "algorithm", "variant_regions"])
    if data.get(vrn_key):
        logger.info("Calculating variation effects for %s" % cur_name)
        ann_vrn_file, vrn_stats = effects.add_to_vcf(data[vrn_key], data)
        if ann_vrn_file:
            data[vrn_key] = ann_vrn_file
        if vrn_stats:
            data["vrn_stats"] = vrn_stats
        orig_items = _get_orig_items(items)
        logger.info("Annotate VCF file: %s" % cur_name)
        data[vrn_key] = annotation.finalize_vcf(data[vrn_key],
                                                get_variantcaller(data),
                                                orig_items)
        if "rna-seq" in dd.get_analysis(data).lower():
            logger.info("Annotate RNA editing sites")
            # Annotate the file under the active key: reading the plain
            # variant file here would drop the steps above whenever the
            # joint key is the one being processed.
            ann_file = vcfanno.run_vcfanno(data[vrn_key], ["rnaedit"], data)
            if ann_file:
                data[vrn_key] = ann_file
        if cwlutils.is_cwl_run(data):
            logger.info("Annotate with population level variation data")
            # Same reasoning as above: keep working on the active key.
            ann_file = population.run_vcfanno(data[vrn_key], data,
                                              population.do_db_build([data]))
            if ann_file:
                data[vrn_key] = ann_file
        logger.info("Filtering for %s" % cur_name)
        data[vrn_key] = variant_filtration(
            data[vrn_key], dd.get_ref_file(data),
            tz.get_in(("genome_resources", "variation"), data, {}), data,
            orig_items)
        logger.info("Prioritization for %s" % cur_name)
        prio_vrn_file = prioritize.handle_vcf_calls(data[vrn_key], data,
                                                    orig_items)
        # Germline extraction only runs when prioritization made a new file.
        if prio_vrn_file != data[vrn_key]:
            data[vrn_key] = prio_vrn_file
            logger.info("Germline extraction for %s" % cur_name)
            data = germline.extract(data, orig_items)

        if dd.get_align_bam(data):
            data = damage.run_filter(data[vrn_key], dd.get_align_bam(data),
                                     dd.get_ref_file(data), data, orig_items)
    # If the entry still resolves to the input file on disk (presumably via
    # the work directory link created above), report the original path.
    if orig_vrn_file and os.path.samefile(data[vrn_key], orig_vrn_file):
        data[vrn_key] = orig_vrn_file
    return [[data]]
예제 #2
0
def run_rnaseq_variant_calling(data):
    """Run RNA-seq variant calling and annotate the resulting variant file.

    The variant file is read from and stored back through the `vrn_file`
    datadict accessors. Configuring more than one variant caller is logged
    as an error and terminates the process with exit status 1.

    Returns the sample wrapped as `[[data]]`.
    """
    callers = dd.get_variantcaller(data)
    has_multiple_callers = isinstance(callers, list) and len(callers) > 1
    if has_multiple_callers:
        logger.error("Only one variantcaller can be run for RNA-seq at "
                     "this time. Post an issue here "
                     "(https://github.com/bcbio/bcbio-nextgen/issues) "
                     "if this is something you need to do.")
        sys.exit(1)

    if callers:
        if "gatk-haplotype" in callers:
            data = variation.rnaseq_gatk_variant_calling(data)
        if vardict.get_vardict_command(data):
            data = variation.rnaseq_vardict_variant_calling(data)

    # Nothing to annotate when calling produced no variant file.
    if not dd.get_vrn_file(data):
        return [[data]]

    rnaedit_vcf = vcfanno.run_vcfanno(dd.get_vrn_file(data), ["rnaedit"], data)
    if rnaedit_vcf:
        data = dd.set_vrn_file(data, rnaedit_vcf)
    # The second pass starts from whatever file the first pass left in place.
    population_vcf = population.run_vcfanno(
        dd.get_vrn_file(data), data, population.do_db_build([data]))
    if population_vcf:
        data = dd.set_vrn_file(data, population_vcf)
    return [[data]]
예제 #3
0
def handle_vcf_calls(vcf_file, data, orig_items):
    """Prioritize VCF calls using external annotations added to the file.

    Returns `vcf_file` unchanged when prioritization is not requested for
    `orig_items` or when annotation yields no file; otherwise returns the
    result of applying the priority filter to the annotated file.
    """
    if not _do_prioritize(orig_items):
        return vcf_file
    annotated_vcf = population.run_vcfanno(vcf_file, data)
    if not annotated_vcf:
        # No data available for filtering, hand back the original file
        return vcf_file
    filter_input = _prep_priority_filter_vcfanno(annotated_vcf, data)
    return _apply_priority_filter(annotated_vcf, filter_input, data)
예제 #4
0
def run_rnaseq_ann_filter(data):
    """Annotate and filter the RNA-seq variant file of a single sample.

    When a variant file is present it is annotated via
    `population.run_vcfanno`. The GATK RNA-seq filter then runs whenever
    "gatk-haplotype" is among the configured variant callers.

    Returns the sample wrapped as `[[data]]`.
    """
    data = to_single_data(data)
    if dd.get_vrn_file(data):
        annotated_vcf = population.run_vcfanno(dd.get_vrn_file(data), data)
        if annotated_vcf:
            data = dd.set_vrn_file(data, annotated_vcf)
    callers = dd.get_variantcaller(data)
    uses_gatk = callers and ("gatk-haplotype" in callers)
    if uses_gatk:
        data = dd.set_vrn_file(
            data, variation.gatk_filter_rnaseq(dd.get_vrn_file(data), data))
    return [[data]]
예제 #5
0
def handle_vcf_calls(vcf_file, data, orig_items):
    """Prioritize VCF calls based on external annotations.

    The input file is returned as-is when prioritization is not enabled for
    `orig_items` or when the annotation step produces nothing; otherwise the
    annotated file is run through the priority filter and that result is
    returned.
    """
    wants_prioritization = _do_prioritize(orig_items)
    if not wants_prioritization:
        return vcf_file
    ann_result = population.run_vcfanno(vcf_file, data)
    if ann_result:
        prepared = _prep_priority_filter_vcfanno(ann_result, data)
        return _apply_priority_filter(ann_result, prepared, data)
    # No data available for filtering, fall back to the original file
    return vcf_file
예제 #6
0
def postprocess_variants(items):
    """Finalize variant calls: effects annotation, filtering and prioritization.

    `items` is either a single sample dictionary or a list of samples. For a
    list, each entry goes through `utils.to_single_data` and the
    `vrn_file_joint` entry is processed when the first sample has that key;
    otherwise `vrn_file` is processed.

    Returns the processed representative sample wrapped as `[[data]]`.
    """
    is_batch = not isinstance(items, dict)
    if is_batch:
        items = [utils.to_single_data(entry) for entry in items]
    # Joint calls are only looked for on list input, using the first sample.
    if is_batch and "vrn_file_joint" in items[0]:
        vrn_key = "vrn_file_joint"
    else:
        vrn_key = "vrn_file"
    data, items = _get_batch_representative(items, vrn_key)
    items = cwlutils.unpack_tarballs(items, data)
    data = cwlutils.unpack_tarballs(data, data)
    sample_label = "%s, %s" % (dd.get_sample_name(data),
                               get_variantcaller(data, require_bam=False))
    logger.info("Finalizing variant calls: %s" % sample_label)
    # Keep the incoming path so it can be restored if nothing replaces it.
    input_vcf = data.get(vrn_key)
    for key_path in ([vrn_key], ["config", "algorithm", "variant_regions"]):
        data = _symlink_to_workdir(data, key_path)
    if data.get(vrn_key):
        logger.info("Calculating variation effects for %s" % sample_label)
        effects_vcf, effects_stats = effects.add_to_vcf(data[vrn_key], data)
        if effects_vcf:
            data[vrn_key] = effects_vcf
        if effects_stats:
            data["vrn_stats"] = effects_stats
        batch_items = _get_orig_items(items)
        logger.info("Annotate VCF file: %s" % sample_label)
        caller = get_variantcaller(data, require_bam=False)
        data[vrn_key] = annotation.finalize_vcf(data[vrn_key], caller,
                                                batch_items)
        if cwlutils.is_cwl_run(data):
            logger.info("Annotate with population level variation data")
            population_vcf = population.run_vcfanno(data[vrn_key], data)
            if population_vcf:
                data[vrn_key] = population_vcf
        logger.info("Filtering for %s" % sample_label)
        ref_file = dd.get_ref_file(data)
        variation_resources = tz.get_in(("genome_resources", "variation"),
                                        data, {})
        data[vrn_key] = variant_filtration(data[vrn_key], ref_file,
                                           variation_resources, data,
                                           batch_items)
        logger.info("Prioritization for %s" % sample_label)
        prioritized_vcf = prioritize.handle_vcf_calls(data[vrn_key], data,
                                                      batch_items)
        # Germline extraction only runs when prioritization made a new file.
        if prioritized_vcf != data[vrn_key]:
            data[vrn_key] = prioritized_vcf
            logger.info("Germline extraction for %s" % sample_label)
            data = germline.extract(data, batch_items)

        if dd.get_align_bam(data):
            data = damage.run_filter(data[vrn_key], dd.get_align_bam(data),
                                     dd.get_ref_file(data), data, batch_items)
    # If the entry still resolves to the input file on disk, report the
    # original path rather than the work directory one.
    still_input_file = input_vcf and os.path.samefile(data[vrn_key], input_vcf)
    if still_input_file:
        data[vrn_key] = input_vcf
    return [[data]]
예제 #7
0
def run_rnaseq_ann_filter(data):
    """Annotate and filter the RNA-seq variant file of a single sample.

    When a variant file is present, effects are added first and the result
    is then annotated via `population.run_vcfanno`. The GATK RNA-seq filter
    runs when "gatk-haplotype" is among the configured callers, and the
    junction variant filter is always applied last.

    Returns the sample wrapped as `[[data]]`.
    """
    data = to_single_data(data)
    if dd.get_vrn_file(data):
        effects_result = effects.add_to_vcf(dd.get_vrn_file(data), data)
        effects_vcf = effects_result[0]
        if effects_vcf:
            data = dd.set_vrn_file(data, effects_vcf)
        annotated_vcf = population.run_vcfanno(dd.get_vrn_file(data), data)
        if annotated_vcf:
            data = dd.set_vrn_file(data, annotated_vcf)
    callers = dd.get_variantcaller(data)
    uses_gatk = callers and ("gatk-haplotype" in callers)
    if uses_gatk:
        data = dd.set_vrn_file(
            data, variation.gatk_filter_rnaseq(dd.get_vrn_file(data), data))
    # drop variants close to splice junctions
    junction_filtered = variation.filter_junction_variants(
        dd.get_vrn_file(data), data)
    return [[dd.set_vrn_file(data, junction_filtered)]]
예제 #8
0
def run_rnaseq_ann_filter(data):
    """Run annotation and filtering on an RNA-seq variant file.

    Steps, in order: add effects and population annotations when a variant
    file exists, apply the GATK RNA-seq filter if "gatk-haplotype" is one of
    the configured callers, then apply the junction variant filter.

    Returns the sample wrapped as `[[data]]`.
    """
    data = to_single_data(data)
    if dd.get_vrn_file(data):
        with_effects = effects.add_to_vcf(dd.get_vrn_file(data), data)[0]
        if with_effects:
            data = dd.set_vrn_file(data, with_effects)
        with_population = population.run_vcfanno(dd.get_vrn_file(data), data)
        if with_population:
            data = dd.set_vrn_file(data, with_population)

    caller_config = dd.get_variantcaller(data)
    if caller_config and ("gatk-haplotype" in caller_config):
        gatk_filtered = variation.gatk_filter_rnaseq(dd.get_vrn_file(data),
                                                     data)
        data = dd.set_vrn_file(data, gatk_filtered)

    # take out variants close to splice junctions
    current_vcf = dd.get_vrn_file(data)
    data = dd.set_vrn_file(
        data, variation.filter_junction_variants(current_vcf, data))
    return [[data]]