コード例 #1
0
ファイル: population.py プロジェクト: wshands/bcbio-nextgen
def create_gemini_db(gemini_vcf, data, gemini_db=None, ped_file=None):
    """Load variants into a GEMINI database via vcfanno annotation and vcf2db.

    The database path defaults to the VCF path with its extension swapped for
    ``.db``. Returns None when the VCF has no variants; otherwise returns the
    database path. Annotation and loading are skipped when
    ``utils.file_exists`` already reports the database.
    """
    if not gemini_db:
        gemini_db = "%s.db" % utils.splitext_plus(gemini_vcf)[0]
    if not vcfutils.vcf_has_variants(gemini_vcf):
        return None
    if utils.file_exists(gemini_db):
        return gemini_db

    if support_gemini_orig(data):
        data_basepath = install.get_gemini_dir(data)
    else:
        data_basepath = None
    # Prefer the vcfanno configuration from the sample; default to "gemini".
    conf_files = dd.get_vcfanno(data) or ["gemini"]
    ann_file = vcfanno.run_vcfanno(gemini_vcf, conf_files, data, data_basepath)
    with file_transaction(data, gemini_db) as tx_gemini_db:
        vcf2db = config_utils.get_program("vcf2db.py", data)
        expand_args = []
        if "vcf2db_expand" in dd.get_tools_on(data):
            for field in ("gt_types", "gt_ref_depths", "gt_alt_depths"):
                expand_args.extend(["--expand", field])
        cmd = [vcf2db, ann_file, ped_file, tx_gemini_db] + expand_args
        do.run(cmd, "GEMINI: create database with vcf2db")
    return gemini_db
コード例 #2
0
def run_vcfanno(vcf_file, data, decomposed=False):
    """Annotate a VCF with vcfanno using external annotation databases.

    Configured vcfanno files are grouped by the base path they need: files
    whose name contains "gemini" get the GEMINI directory when
    ``is_human(data, builds=["37"])`` holds, everything else gets None.
    Without configured files the groups come from ``_default_conf_files``.
    Each group is run on the output of the previous successful run.

    Returns the last annotated file produced, or None if nothing was produced.
    """
    configured = dd.get_vcfanno(data)
    if configured:
        if not isinstance(configured, (list, tuple)):
            configured = [configured]
        by_basepath = collections.defaultdict(list)
        for conf in configured:
            if conf.find("gemini") != -1 and is_human(data, builds=["37"]):
                basepath = install.get_gemini_dir(data)
            else:
                basepath = None
            by_basepath[basepath].append(conf)
        groups = by_basepath.items()
    else:
        groups = _default_conf_files(data)

    out_file = None
    if not groups:
        return out_file
    for data_basepath, group_files in groups:
        ann_file = vcfanno.run_vcfanno(vcf_file,
                                       group_files,
                                       data,
                                       data_basepath=data_basepath,
                                       decomposed=decomposed)
        if ann_file:
            # Chain annotations: the next group starts from this output.
            out_file = vcf_file = ann_file
    return out_file
コード例 #3
0
def postprocess_variants(items):
    """Provide post-processing of variant calls: filtering and effects annotation.

    Picks a representative sample from ``items`` and, when it has a variant
    file, runs these steps in order, each updating ``data[vrn_key]``: effects
    annotation, VCF finalization, RNA editing annotation (RNA-seq analyses
    only), population annotation (CWL runs only), filtering, prioritization
    with germline extraction, and damage filtering (when an aligned BAM is
    available).

    ``items`` is either a dict or an iterable whose elements are normalized
    with ``utils.to_single_data``. The variant file key is "vrn_file", or
    "vrn_file_joint" when that key is present in the first normalized item.

    Returns the updated representative sample wrapped as ``[[data]]``.
    """
    vrn_key = "vrn_file"
    if not isinstance(items, dict):
        items = [utils.to_single_data(x) for x in items]
        # Joint-called inputs carry their variants under a separate key.
        if "vrn_file_joint" in items[0]:
            vrn_key = "vrn_file_joint"
    data, items = _get_batch_representative(items, vrn_key)
    items = cwlutils.unpack_tarballs(items, data)
    data = cwlutils.unpack_tarballs(data, data)
    cur_name = "%s, %s" % (dd.get_sample_name(data), get_variantcaller(data))
    logger.info("Finalizing variant calls: %s" % cur_name)
    # Remember the incoming path so it can be restored at the end if
    # processing leaves the underlying file unchanged.
    orig_vrn_file = data.get(vrn_key)
    data = _symlink_to_workdir(data, [vrn_key])
    data = _symlink_to_workdir(data,
                               ["config", "algorithm", "variant_regions"])
    if data.get(vrn_key):
        logger.info("Calculating variation effects for %s" % cur_name)
        ann_vrn_file, vrn_stats = effects.add_to_vcf(data[vrn_key], data)
        if ann_vrn_file:
            data[vrn_key] = ann_vrn_file
        if vrn_stats:
            data["vrn_stats"] = vrn_stats
        orig_items = _get_orig_items(items)
        logger.info("Annotate VCF file: %s" % cur_name)
        data[vrn_key] = annotation.finalize_vcf(data[vrn_key],
                                                get_variantcaller(data),
                                                orig_items)
        if dd.get_analysis(data).lower().find("rna-seq") >= 0:
            logger.info("Annotate RNA editing sites")
            # NOTE(review): the input is read with dd.get_vrn_file(data) rather
            # than data[vrn_key]; confirm this is intended when vrn_key is
            # "vrn_file_joint".
            ann_file = vcfanno.run_vcfanno(dd.get_vrn_file(data), ["rnaedit"],
                                           data)
            if ann_file:
                data[vrn_key] = ann_file
        if cwlutils.is_cwl_run(data):
            logger.info("Annotate with population level variation data")
            # NOTE(review): same dd.get_vrn_file(data) vs data[vrn_key]
            # question as above. The third argument is passed positionally;
            # verify it matches the signature of population.run_vcfanno.
            ann_file = population.run_vcfanno(dd.get_vrn_file(data), data,
                                              population.do_db_build([data]))
            if ann_file:
                data[vrn_key] = ann_file
        logger.info("Filtering for %s" % cur_name)
        data[vrn_key] = variant_filtration(
            data[vrn_key], dd.get_ref_file(data),
            tz.get_in(("genome_resources", "variation"), data, {}), data,
            orig_items)
        logger.info("Prioritization for %s" % cur_name)
        prio_vrn_file = prioritize.handle_vcf_calls(data[vrn_key], data,
                                                    orig_items)
        # Germline extraction only runs when prioritization produced a
        # different file than the one passed in.
        if prio_vrn_file != data[vrn_key]:
            data[vrn_key] = prio_vrn_file
            logger.info("Germline extraction for %s" % cur_name)
            data = germline.extract(data, orig_items)

        if dd.get_align_bam(data):
            data = damage.run_filter(data[vrn_key], dd.get_align_bam(data),
                                     dd.get_ref_file(data), data, orig_items)
    # If the current path still refers to the same file as the input, report
    # the original path instead of the current one.
    if orig_vrn_file and os.path.samefile(data[vrn_key], orig_vrn_file):
        data[vrn_key] = orig_vrn_file
    return [[data]]
コード例 #4
0
ファイル: rnaseq.py プロジェクト: tischfis/bcbio-nextgen
def run_rnaseq_variant_calling(data):
    """
    Call variants on RNA-seq data and annotate the result.

    The variation file is stored in `vrn_file` in the datadict. Only one
    variant caller is supported; configuring several logs an error and exits
    the process with status 1. A resulting variant file is annotated with the
    "rnaedit" vcfanno configuration and then with population annotations; each
    annotation replaces `vrn_file` only when it produced a file.

    Returns the updated datadict wrapped as ``[[data]]``.
    """
    variantcaller = dd.get_variantcaller(data)
    multiple_callers = (isinstance(variantcaller, list)
                        and len(variantcaller) > 1)
    if multiple_callers:
        logger.error("Only one variantcaller can be run for RNA-seq at "
                     "this time. Post an issue here "
                     "(https://github.com/bcbio/bcbio-nextgen/issues) "
                     "if this is something you need to do.")
        sys.exit(1)

    if variantcaller and "gatk-haplotype" in variantcaller:
        data = variation.rnaseq_gatk_variant_calling(data)
    if variantcaller and vardict.get_vardict_command(data):
        data = variation.rnaseq_vardict_variant_calling(data)

    if not dd.get_vrn_file(data):
        return [[data]]

    rnaedit_file = vcfanno.run_vcfanno(dd.get_vrn_file(data), ["rnaedit"],
                                       data)
    if rnaedit_file:
        data = dd.set_vrn_file(data, rnaedit_file)
    population_file = population.run_vcfanno(dd.get_vrn_file(data), data,
                                             population.do_db_build([data]))
    if population_file:
        data = dd.set_vrn_file(data, population_file)
    return [[data]]
コード例 #5
0
def _run_vcfanno(gemini_vcf, data, use_gemini=False):
    """Annotate ``gemini_vcf`` with the sample's vcfanno configuration.

    When no configuration is set and ``use_gemini`` is true, the "gemini"
    configuration is used. Without any configuration the input VCF is
    returned unchanged; otherwise the result of ``vcfanno.run_vcfanno`` is
    returned.
    """
    if support_gemini_orig(data):
        data_basepath = install.get_gemini_dir(data)
    else:
        data_basepath = None
    conf_files = dd.get_vcfanno(data)
    if use_gemini and not conf_files:
        conf_files = ["gemini"]
    if not conf_files:
        return gemini_vcf
    return vcfanno.run_vcfanno(gemini_vcf, conf_files, data, data_basepath)
コード例 #6
0
def run_rnaseq_ann_filter(data):
    """Run RNA-seq annotation and filtering.

    Annotates the sample's variant file with the "rnaedit" vcfanno
    configuration, then applies ``variation.gatk_filter_rnaseq`` to the
    current variant file and stores the filtered file as ``vrn_file``.

    Returns the updated datadict wrapped as ``[[data]]``.
    """
    ann_file = vcfanno.run_vcfanno(dd.get_vrn_file(data), ["rnaedit"], data)
    if ann_file:
        # Rebind to the setter's return value: the other call sites use
        # ``data = dd.set_vrn_file(...)``, and discarding the result loses the
        # update whenever the setter returns a new datadict.
        data = dd.set_vrn_file(data, ann_file)
    filter_file = variation.gatk_filter_rnaseq(dd.get_vrn_file(data), data)
    data = dd.set_vrn_file(data, filter_file)
    return [[data]]
コード例 #7
0
def handle_vcf_calls(vcf_file, data, orig_items):
    """Prioritize VCF calls using external annotations supplied through GEMINI.

    Returns the prioritized file when prioritization is requested, GEMINI data
    is available and annotation produced a file. In every other case the
    input ``vcf_file`` is returned unchanged.
    """
    if not _do_prioritize(orig_items):
        return vcf_file
    if not population.has_gemini_data(data):
        # No GEMINI database for filtering, return original file
        return vcf_file

    if population.support_gemini_orig(data):
        data_basepath = install.get_gemini_dir(data)
    else:
        data_basepath = None
    ann_vcf = vcfanno.run_vcfanno(vcf_file, ["gemini"], data, data_basepath)
    if not ann_vcf:
        return vcf_file
    priority_file = _prep_priority_filter_vcfanno(ann_vcf, data)
    return _apply_priority_filter(vcf_file, priority_file, data)
コード例 #8
0
ファイル: population.py プロジェクト: chapmanb/bcbio-nextgen
def create_gemini_db(gemini_vcf, data, gemini_db=None, ped_file=None):
    """Load variants into a GEMINI database via vcfanno annotation and vcf2db.

    The database path defaults to the VCF path with its extension swapped for
    ``.db``. Returns None when the VCF has no variants; otherwise returns the
    database path. Annotation and loading are skipped when
    ``utils.file_exists`` already reports the database.
    """
    if not gemini_db:
        gemini_db = "%s.db" % utils.splitext_plus(gemini_vcf)[0]
    if not vcfutils.vcf_has_variants(gemini_vcf):
        return None
    if utils.file_exists(gemini_db):
        return gemini_db

    if support_gemini_orig(data):
        data_basepath = install.get_gemini_dir(data)
    else:
        data_basepath = None
    ann_file = vcfanno.run_vcfanno(gemini_vcf, "gemini", data, data_basepath)
    with file_transaction(data, gemini_db) as tx_gemini_db:
        vcf2db = config_utils.get_program("vcf2db.py", data)
        do.run([vcf2db, ann_file, ped_file, tx_gemini_db],
               "GEMINI: create database with vcf2db")
    return gemini_db
コード例 #9
0
def run_rnaseq_ann_filter(data):
    """Annotate and filter RNA-seq variant calls.

    The sample's variant file is annotated with the "rnaedit" vcfanno
    configuration and then with population annotations; each step replaces
    ``vrn_file`` only when it produced a file. When the variant caller
    includes "gatk-haplotype", the file is also passed through
    ``variation.gatk_filter_rnaseq``.

    Returns the updated datadict wrapped as ``[[data]]``.
    """
    data = to_single_data(data)

    rnaedit_file = vcfanno.run_vcfanno(dd.get_vrn_file(data), ["rnaedit"],
                                       data)
    if rnaedit_file:
        data = dd.set_vrn_file(data, rnaedit_file)

    population_file = population.run_vcfanno(dd.get_vrn_file(data), data)
    if population_file:
        data = dd.set_vrn_file(data, population_file)

    variantcaller = dd.get_variantcaller(data)
    if not variantcaller or "gatk-haplotype" not in variantcaller:
        return [[data]]
    filtered = variation.gatk_filter_rnaseq(dd.get_vrn_file(data), data)
    return [[dd.set_vrn_file(data, filtered)]]
コード例 #10
0
ファイル: prioritize.py プロジェクト: yodeng/bcbio-nextgen
def handle_vcf_calls(vcf_file, data, orig_items):
    """Prioritize VCF calls using external annotations supplied through GEMINI.

    The input ``vcf_file`` is handed back untouched unless prioritization is
    requested, GEMINI data is available, and the vcfanno run produced an
    annotated file. In that case the priority-filtered file is returned.
    """
    wants_priority = _do_prioritize(orig_items)
    if not wants_priority:
        return vcf_file
    if not population.has_gemini_data(data):
        # No GEMINI database for filtering, return original file
        return vcf_file

    gemini_dir = None
    if population.support_gemini_orig(data):
        gemini_dir = install.get_gemini_dir(data)
    annotated = vcfanno.run_vcfanno(vcf_file, ["gemini"], data, gemini_dir)
    if annotated:
        priority_file = _prep_priority_filter_vcfanno(annotated, data)
        return _apply_priority_filter(vcf_file, priority_file, data)
    return vcf_file
コード例 #11
0
ファイル: rnaseq.py プロジェクト: DoaneAS/bcbio-nextgen
def run_rnaseq_joint_genotyping(*samples):
    """Jointly genotype RNA-seq samples with GATK and annotate RNA editing sites.

    The first sample supplies the variant caller and reference file. When no
    variant caller is configured, or it does not include "gatk", ``samples``
    is returned unchanged. Otherwise the per-sample variant files are
    joint-called, the result is annotated with the "rnaedit" vcfanno
    configuration, and the file is stored on every sample with
    ``dd.set_square_vcf``.

    Returns a list of single-item lists, one per sample, or the original
    ``samples`` when joint calling does not apply.
    """
    data = samples[0][0]
    variantcaller = dd.get_variantcaller(data)
    # One guard covers both skip conditions. The original checked for "gatk"
    # a third time and ended with a ``return samples`` that could never run.
    if not variantcaller or "gatk" not in variantcaller:
        return samples
    ref_file = dd.get_ref_file(data)
    vrn_files = [dd.get_vrn_file(d) for d in dd.sample_data_iterator(samples)]
    out_file = variation.gatk_joint_calling(data, vrn_files, ref_file)
    # Keep the joint-called file if annotation yields nothing, so the samples
    # never end up with an empty square VCF entry.
    vrn_file = vcfanno.run_vcfanno(out_file, ["rnaedit"], data) or out_file
    return [[dd.set_square_vcf(sample, vrn_file)]
            for sample in dd.sample_data_iterator(samples)]
コード例 #12
0
def run_rnaseq_joint_genotyping(*samples):
    """Jointly genotype RNA-seq samples with GATK and annotate RNA editing sites.

    The first sample supplies the variant caller and reference file. When no
    variant caller is configured, or it does not include "gatk", ``samples``
    is returned unchanged. Otherwise the per-sample variant files are
    joint-called, the result is annotated with the "rnaedit" vcfanno
    configuration, and the file is stored on every sample with
    ``dd.set_square_vcf``.

    Returns a list of single-item lists, one per sample, or the original
    ``samples`` when joint calling does not apply.
    """
    data = samples[0][0]
    variantcaller = dd.get_variantcaller(data)
    # One guard covers both skip conditions. The original checked for "gatk"
    # a third time and ended with a ``return samples`` that could never run.
    if not variantcaller or "gatk" not in variantcaller:
        return samples
    ref_file = dd.get_ref_file(data)
    vrn_files = [dd.get_vrn_file(d) for d in dd.sample_data_iterator(samples)]
    out_file = variation.gatk_joint_calling(data, vrn_files, ref_file)
    # Keep the joint-called file if annotation yields nothing, so the samples
    # never end up with an empty square VCF entry.
    vrn_file = vcfanno.run_vcfanno(out_file, "rnaedit", data) or out_file
    return [[dd.set_square_vcf(sample, vrn_file)]
            for sample in dd.sample_data_iterator(samples)]
コード例 #13
0
ファイル: population.py プロジェクト: skanwal/bcbio-nextgen
def run_vcfanno(vcf_file, data, decomposed=False):
    """Annotate a VCF with vcfanno using external annotation databases.

    Uses the sample's configured vcfanno files as a single group with no base
    path, or the groups from ``_default_conf_files`` when nothing is
    configured. Each group is run on the output of the previous successful
    run.

    Returns the last annotated file produced, or None if nothing was produced.
    """
    configured = dd.get_vcfanno(data)
    groups = [(None, configured)] if configured else _default_conf_files(data)

    out_file = None
    if not groups:
        return out_file
    for data_basepath, group_files in groups:
        ann_file = vcfanno.run_vcfanno(vcf_file, group_files, data,
                                       data_basepath=data_basepath,
                                       decomposed=decomposed)
        if ann_file:
            # Chain annotations: the next group starts from this output.
            out_file = vcf_file = ann_file
    return out_file
コード例 #14
0
ファイル: rnaseq.py プロジェクト: DoaneAS/bcbio-nextgen
def run_rnaseq_variant_calling(data):
    """
    run RNA-seq variant calling, variation file is stored in `vrn_file`
    in the datadict

    Only one variant caller is supported; configuring several logs an error
    and exits the process with status 1. GATK calling runs when the variant
    caller includes "gatk". VarDict calling runs when
    ``vardict.get_vardict_command`` returns a command, and its output is
    annotated with the "rnaedit" vcfanno configuration.

    Returns the updated datadict wrapped as ``[[data]]``.
    """
    variantcaller = dd.get_variantcaller(data)
    if isinstance(variantcaller, list) and len(variantcaller) > 1:
        logger.error("Only one variantcaller can be run for RNA-seq at "
                     "this time. Post an issue here "
                     "(https://github.com/chapmanb/bcbio-nextgen/issues) "
                     "if this is something you need to do.")
        sys.exit(1)

    if variantcaller and "gatk" in variantcaller:
        data = variation.rnaseq_gatk_variant_calling(data)
    if vardict.get_vardict_command(data):
        data = variation.rnaseq_vardict_variant_calling(data)
        if dd.get_vrn_file(data):
            ann_file = vcfanno.run_vcfanno(dd.get_vrn_file(data), ["rnaedit"], data)
            # Replace the called variants only when annotation produced a
            # file; an empty result must not wipe out `vrn_file`.
            if ann_file:
                data = dd.set_vrn_file(data, ann_file)
    return [[data]]
コード例 #15
0
ファイル: rnaseq.py プロジェクト: Yixf-Self/bcbio-nextgen
def run_rnaseq_variant_calling(data):
    """
    run RNA-seq variant calling, variation file is stored in `vrn_file`
    in the datadict

    Only one variant caller is supported; configuring several logs an error
    and exits the process with status 1. GATK calling runs when the variant
    caller includes "gatk", and VarDict calling runs when
    ``vardict.get_vardict_command`` returns a command. Any resulting variant
    file is annotated with the "rnaedit" vcfanno configuration.

    Returns the updated datadict wrapped as ``[[data]]``.
    """
    variantcaller = dd.get_variantcaller(data)
    if isinstance(variantcaller, list) and len(variantcaller) > 1:
        logger.error("Only one variantcaller can be run for RNA-seq at "
                     "this time. Post an issue here "
                     "(https://github.com/chapmanb/bcbio-nextgen/issues) "
                     "if this is something you need to do.")
        sys.exit(1)

    if variantcaller and "gatk" in variantcaller:
        data = variation.rnaseq_gatk_variant_calling(data)
    if vardict.get_vardict_command(data):
        data = variation.rnaseq_vardict_variant_calling(data)
    # annotate RNA-editing events with vcfanno
    if dd.get_vrn_file(data):
        ann_file = vcfanno.run_vcfanno(dd.get_vrn_file(data), "rnaedit", data)
        # Replace the called variants only when annotation produced a file;
        # an empty result must not wipe out `vrn_file`.
        if ann_file:
            data = dd.set_vrn_file(data, ann_file)
    return [[data]]