コード例 #1
0
def prep_db_parallel(samples, parallel_fn):
    """Prepare gemini databases in parallel, handling jointly called populations.

    Args:
        samples: sequence of items whose first element (``x[0]``) is the
            sample data passed to ``do_db_build`` and ``dd.get_vcfanno``.
        parallel_fn: callable invoked as ``parallel_fn("prep_gemini_db", args)``;
            it must yield ``(batch_id, out_file)`` pairs.

    Returns:
        ``samples`` unchanged when there is nothing to do (no database build
        requested, no batches and no vcfanno configuration on any sample).
        Otherwise a list of single-item lists: one per retrieved sample, with
        the ``"population"`` entry of each variant filled in from the
        parallel output, followed by one per extra.
    """
    batch_groups, singles, out_retrieve, extras = _group_by_batches(samples, _has_variant_calls)
    to_process = []
    for (name, caller), info in batch_groups.items():
        fnames = [x[0] for x in info]
        to_process.append([fnames, (str(name), caller, True), [x[1] for x in info], extras])
    # Only batch entries have been queued so far, so a non-empty queue means batches exist.
    has_batches = bool(to_process)
    for name, caller, data, fname in singles:
        to_process.append([[fname], (str(name), caller, False), [data], extras])
    # Check vcfanno configuration per sample; the lookup must be applied to each
    # sample individually rather than to the generator as a whole.
    if (len(samples) > 0 and not do_db_build([x[0] for x in samples])
          and not has_batches and not any(dd.get_vcfanno(x[0]) for x in samples)):
        return samples
    output = parallel_fn("prep_gemini_db", to_process)
    # Index results by (name, caller) so they can be matched back to variants.
    out_fetch = {tuple(batch_id): out_file for batch_id, out_file in output}
    out = []
    for batch_name, data in out_retrieve:
        out_variants = []
        for vrn in data["variants"]:
            # A variant may opt out by carrying a falsy "population" value;
            # the key is removed either way and re-added only when in use.
            use_population = vrn.pop("population", True)
            if use_population:
                vrn["population"] = out_fetch[(batch_name, vrn["variantcaller"])]
            out_variants.append(vrn)
        data["variants"] = out_variants
        out.append([data])
    out.extend([x] for x in extras)
    return out
コード例 #2
0
ファイル: population.py プロジェクト: wshands/bcbio-nextgen
def create_gemini_db(gemini_vcf, data, gemini_db=None, ped_file=None):
    """Load variants into a GEMINI database using vcfanno followed by vcf2db.

    The database path defaults to the VCF path with a ``.db`` extension.
    Returns None when the VCF has no variants; otherwise returns the database
    path, building it only when it does not already exist.
    """
    if not gemini_db:
        gemini_db = "%s.db" % utils.splitext_plus(gemini_vcf)[0]
    if not vcfutils.vcf_has_variants(gemini_vcf):
        return None
    if utils.file_exists(gemini_db):
        return gemini_db

    if support_gemini_orig(data):
        data_basepath = install.get_gemini_dir(data)
    else:
        data_basepath = None
    # Fall back to the stock gemini annotations when none are configured.
    conf_files = dd.get_vcfanno(data) or ["gemini"]
    ann_file = vcfanno.run_vcfanno(gemini_vcf, conf_files, data, data_basepath)
    with file_transaction(data, gemini_db) as tx_gemini_db:
        vcf2db = config_utils.get_program("vcf2db.py", data)
        vcf2db_args = []
        if "vcf2db_expand" in dd.get_tools_on(data):
            for field in ("gt_types", "gt_ref_depths", "gt_alt_depths"):
                vcf2db_args += ["--expand", field]
        do.run([vcf2db, ann_file, ped_file, tx_gemini_db] + vcf2db_args,
               "GEMINI: create database with vcf2db")
    return gemini_db
コード例 #3
0
def run_vcfanno(vcf_file, data, decomposed=False):
    """Annotate a VCF with vcfanno using annotations from external databases.

    Configured annotation files are grouped by the base path they need and
    run group by group, each run taking the previous successful output as its
    input. Without configured files the defaults from ``_default_conf_files``
    are used. Returns the last annotated file, or None if nothing was produced.
    """
    configured = dd.get_vcfanno(data)
    if configured:
        if not isinstance(configured, (list, tuple)):
            configured = [configured]
        by_basepath = collections.defaultdict(list)
        for conf in configured:
            # gemini annotations on human build 37 need the gemini data directory
            if conf.find("gemini") >= 0 and is_human(data, builds=["37"]):
                basepath = install.get_gemini_dir(data)
            else:
                basepath = None
            by_basepath[basepath].append(conf)
        groups = by_basepath.items()
    else:
        groups = _default_conf_files(data)

    out_file = None
    if groups:
        for basepath, group_confs in groups:
            annotated = vcfanno.run_vcfanno(vcf_file, group_confs, data,
                                            data_basepath=basepath,
                                            decomposed=decomposed)
            if annotated:
                # chain: the next group annotates on top of this result
                out_file = vcf_file = annotated
    return out_file
コード例 #4
0
ファイル: population.py プロジェクト: wshands/bcbio-nextgen
def prep_gemini_db(fnames, call_info, samples, extras):
    """Build a gemini database from the input VCFs for one (name, caller) pair.

    Returns a single-item list holding ``[(name, caller), info]`` where
    ``info["db"]`` is the database path if it exists (else None) and
    ``info["vcf"]`` is the multisample VCF for batches (else None).
    """
    data = samples[0]
    use_gemini = do_db_build(samples) and any(vcfutils.vcf_has_variants(f) for f in fnames)
    name, caller, is_batch = call_info
    work_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "gemini"))
    multisample_vcf = get_multisample_vcf(fnames, name, caller, data)
    gemini_db = os.path.join(work_dir, "%s-%s.db" % (name, caller))
    if not utils.file_exists(gemini_db) and use_gemini:
        # Restrict to passing variants unless any sample asks for all of them.
        passonly = not any("gemini_allvariants" in dd.get_tools_on(d) for d in samples)
        gemini_vcf = multiallelic.to_single(multisample_vcf, data, passonly=passonly)
        ped_file = create_ped_file(samples + extras, gemini_vcf)
        # Use original approach for hg19/GRCh37 pending additional testing
        use_orig = support_gemini_orig(data) and not any(dd.get_vcfanno(d) for d in samples)
        build_db = create_gemini_db_orig if use_orig else create_gemini_db
        gemini_db = build_db(gemini_vcf, data, gemini_db, ped_file)
    result = {}
    result["db"] = gemini_db if utils.file_exists(gemini_db) else None
    result["vcf"] = multisample_vcf if is_batch else None
    return [[(name, caller), result]]
コード例 #5
0
def find_annotations(data):
    """Locate vcfanno configuration files, preferring pre-installed inputs.

    User specified inputs that exist on disk are used directly; other names
    are resolved to ``<name>.conf`` inside the ``config/vcfanno`` directory
    next to the reference genome. A matching ``.lua`` file is included when
    present. Missing configurations are logged and skipped.

    Default annotations if not specified:
      - gemini for variant pipelines
      - somatic for variant tumor pipelines
      - rnaedit for RNA-seq variant calling
    """
    conf_files = dd.get_vcfanno(data) or _default_conf_files(data)
    if not isinstance(conf_files, (list, tuple)):
        conf_files = [conf_files]
    ref_dir = os.path.dirname(dd.get_ref_file(data))
    annodir = os.path.normpath(os.path.abspath(os.path.join(ref_dir, os.pardir, "config", "vcfanno")))
    found = []
    for conf_file in conf_files:
        is_local_file = utils.file_exists(conf_file) and os.path.isfile(conf_file)
        conffn = conf_file if is_local_file else os.path.join(annodir, conf_file + ".conf")
        if not utils.file_exists(conffn):
            build = dd.get_genome_build(data)
            logger.warn(
                "The vcfanno configuration {conffn} was not found for {build}, skipping."
                .format(conffn=conffn, build=build))
            continue
        found.append(conffn)
        # Pick up a lua script sharing the configuration's base name
        luafn = "%s.lua" % utils.splitext_plus(conffn)[0]
        if os.path.exists(luafn):
            found.append(luafn)
    return found
コード例 #6
0
def _run_vcfanno(gemini_vcf, data, use_gemini=False):
    """Run vcfanno on the input VCF when annotations are configured.

    Falls back to the ``gemini`` annotations when none are configured and
    ``use_gemini`` is set. Returns the input VCF untouched when there is
    nothing to annotate with.
    """
    if support_gemini_orig(data):
        data_basepath = install.get_gemini_dir(data)
    else:
        data_basepath = None
    conf_files = dd.get_vcfanno(data)
    if use_gemini and not conf_files:
        conf_files = ["gemini"]
    if not conf_files:
        return gemini_vcf
    return vcfanno.run_vcfanno(gemini_vcf, conf_files, data, data_basepath)
コード例 #7
0
def find_annotations(data, retriever=None):
    """Find annotation configuration files for vcfanno, using pre-installed inputs.

    Creates absolute paths for user specified inputs and finds locally
    installed defaults.

    Default annotations:
      - gemini for variant pipelines
      - somatic for variant tumor pipelines
      - rnaedit for RNA-seq variant calling

    Args:
        data: sample data; read for the configured vcfanno files, the
            reference file location, the genome build and ``data["config"]``.
        retriever: optional object providing ``add_remotes``; when given,
            only files it resolves to remote locations are kept.

    Returns:
        List of usable configuration files, each followed by its matching
        ``.lua`` file when one is available. Configurations that are missing
        or fail their input check are logged and skipped.
    """
    configured = dd.get_vcfanno(data)
    # Work on a private list: the configured value may be a tuple (which has no
    # append) and appending defaults to the caller's own list would modify the
    # sample configuration as a side effect.
    if isinstance(configured, (list, tuple)):
        conf_files = list(configured)
    else:
        conf_files = [configured]
    for c in _default_conf_files(data, retriever):
        if c not in conf_files:
            conf_files.append(c)
    # Named configurations that need their input files verified before use
    conf_checkers = {"gemini": annotate_gemini, "somatic": _annotate_somatic}
    out = []
    annodir = os.path.normpath(
        os.path.join(os.path.dirname(dd.get_ref_file(data)), os.pardir,
                     "config", "vcfanno"))
    if not retriever:
        annodir = os.path.abspath(annodir)
    for conf_file in conf_files:
        if objectstore.is_remote(conf_file) or (os.path.exists(conf_file)
                                                and os.path.isfile(conf_file)):
            conffn = conf_file
        elif not retriever:
            conffn = os.path.join(annodir, conf_file + ".conf")
        else:
            conffn = conf_file + ".conf"
        luafn = "%s.lua" % utils.splitext_plus(conffn)[0]
        if retriever:
            # Keep only what the retriever resolved to a remote location
            conffn, luafn = [
                (x if objectstore.is_remote(x) else None)
                for x in retriever.add_remotes([conffn, luafn], data["config"])
            ]
        if not conffn:
            pass
        elif conf_file in conf_checkers and not conf_checkers[conf_file](
                data, retriever):
            logger.warn(
                "Skipping vcfanno configuration: %s. Not all input files found."
                % conf_file)
        elif not objectstore.file_exists_or_remote(conffn):
            build = dd.get_genome_build(data)
            CONF_NOT_FOUND = (
                "The vcfanno configuration {conffn} was not found for {build}, skipping."
            )
            logger.warn(CONF_NOT_FOUND.format(conffn=conffn, build=build))
        else:
            out.append(conffn)
            if luafn and objectstore.file_exists_or_remote(luafn):
                out.append(luafn)
    return out
コード例 #8
0
ファイル: population.py プロジェクト: xlec/bcbio-nextgen
def run_vcfanno(vcf_file, data, decomposed=False):
    """Run vcfanno, providing annotations from external databases if needed.

    Pairs up conf and lua files from the configured inputs by base file name,
    groups the pairs by the base path they require and runs vcfanno once per
    group, feeding each successful output into the next run. Returns the last
    annotated file, or None when nothing was run or produced.
    """
    conf_files = dd.get_vcfanno(data)
    if conf_files:
        grouped = collections.defaultdict(list)
        gemini_basepath = _back_compatible_gemini(conf_files, data)
        for fname in conf_files:
            name = os.path.splitext(os.path.basename(fname))[0]
            if fname.endswith(".lua"):
                conf_file, lua_file = None, fname
            else:
                conf_file = fname
                lua_file = "%s.lua" % utils.splitext_plus(fname)[0]
            # Only the gemini configuration needs the gemini base path
            key = (gemini_basepath if name == "gemini" else None, name)
            if conf_file and os.path.exists(conf_file):
                grouped[key].append(conf_file)
            if lua_file and os.path.exists(lua_file):
                grouped[key].append(lua_file)
        conf_files = grouped.items()

    out_file = None
    if conf_files:
        VcfannoIn = collections.namedtuple("VcfannoIn", ["conf", "lua"])
        by_basepath = collections.defaultdict(list)
        for (data_basepath, _), group in conf_files:
            anno_files = list(set(group))
            n_files = len(anno_files)
            if n_files != 1 and n_files != 2:
                raise ValueError("Unexpected annotation group %s" % anno_files)
            if n_files == 1:
                pair = VcfannoIn(anno_files[0], None)
            else:
                # Two files: exactly one must be the lua script, the other the conf
                lua_files = [x for x in anno_files if x.endswith(".lua")]
                assert len(lua_files) == 1, anno_files
                lua_file = lua_files[0]
                anno_files.remove(lua_file)
                pair = VcfannoIn(anno_files[0], lua_file)
            by_basepath[data_basepath].append(pair)
        for data_basepath, pairs in by_basepath.items():
            confs = [p.conf for p in pairs]
            luas = [p.lua for p in pairs]
            ann_file = vcfanno.run(vcf_file, confs, luas, data,
                                   basepath=data_basepath,
                                   decomposed=decomposed)
            if ann_file:
                out_file = vcf_file = ann_file
    return out_file
コード例 #9
0
ファイル: vcfanno.py プロジェクト: vladsaveliev/bcbio-nextgen
def find_annotations(data, retriever=None):
    """Find annotation configuration files for vcfanno, using pre-installed inputs.

    Creates absolute paths for user specified inputs and finds locally
    installed defaults.

    Default annotations:
      - gemini for variant pipelines
      - somatic for variant tumor pipelines
      - rnaedit for RNA-seq variant calling

    Args:
        data: sample data; read for the configured vcfanno files, the
            reference file location, the genome build and ``data["config"]``.
        retriever: optional object providing ``add_remotes``; when given,
            only files it resolves to remote locations are kept.

    Returns:
        List of usable configuration files, each followed by its matching
        ``.lua`` file when one is available. Configurations that are missing
        or fail their input check are logged and skipped.
    """
    configured = dd.get_vcfanno(data)
    # Work on a private list: the configured value may be a tuple (which has no
    # append) and appending defaults to the caller's own list would modify the
    # sample configuration as a side effect.
    if isinstance(configured, (list, tuple)):
        conf_files = list(configured)
    else:
        conf_files = [configured]
    for c in _default_conf_files(data, retriever):
        if c not in conf_files:
            conf_files.append(c)
    # Named configurations that need their input files verified before use
    conf_checkers = {"gemini": annotate_gemini, "somatic": _annotate_somatic}
    out = []
    annodir = os.path.normpath(os.path.join(os.path.dirname(dd.get_ref_file(data)), os.pardir, "config", "vcfanno"))
    if not retriever:
        annodir = os.path.abspath(annodir)
    for conf_file in conf_files:
        if objectstore.is_remote(conf_file) or (os.path.exists(conf_file) and os.path.isfile(conf_file)):
            conffn = conf_file
        elif not retriever:
            conffn = os.path.join(annodir, conf_file + ".conf")
        else:
            conffn = conf_file + ".conf"
        luafn = "%s.lua" % utils.splitext_plus(conffn)[0]
        if retriever:
            # Keep only what the retriever resolved to a remote location
            conffn, luafn = [(x if objectstore.is_remote(x) else None)
                             for x in retriever.add_remotes([conffn, luafn], data["config"])]
        if not conffn:
            pass
        elif conf_file in conf_checkers and not conf_checkers[conf_file](data, retriever):
            logger.warn("Skipping vcfanno configuration: %s. Not all input files found." % conf_file)
        elif not objectstore.file_exists_or_remote(conffn):
            build = dd.get_genome_build(data)
            CONF_NOT_FOUND = (
                "The vcfanno configuration {conffn} was not found for {build}, skipping.")
            logger.warn(CONF_NOT_FOUND.format(conffn=conffn, build=build))
        else:
            out.append(conffn)
            if luafn and objectstore.file_exists_or_remote(luafn):
                out.append(luafn)
    return out
コード例 #10
0
ファイル: population.py プロジェクト: skanwal/bcbio-nextgen
def run_vcfanno(vcf_file, data, decomposed=False):
    """Annotate a VCF with vcfanno using annotations from external databases.

    Uses the configured annotation files as a single group without a base
    path, or the groups from ``_default_conf_files`` when none are configured.
    Each successful run feeds the next. Returns the last annotated file, or
    None if nothing was produced.
    """
    configured = dd.get_vcfanno(data)
    groups = [(None, configured)] if configured else _default_conf_files(data)
    out_file = None
    if groups:
        for basepath, group_confs in groups:
            annotated = vcfanno.run_vcfanno(vcf_file, group_confs, data,
                                            data_basepath=basepath,
                                            decomposed=decomposed)
            if annotated:
                out_file = vcf_file = annotated
    return out_file
コード例 #11
0
ファイル: population.py プロジェクト: DoaneAS/bcbio-nextgen
def prep_gemini_db(fnames, call_info, samples, extras):
    """Annotate the multisample VCF and build a gemini database when requested.

    Returns a single-item list holding ``[(name, caller), info]`` where
    ``info`` carries the database path (None if it does not exist), the
    annotated VCF and whether the VCF went through multiallelic splitting.
    """
    data = samples[0]
    use_gemini = do_db_build(samples) and any(vcfutils.vcf_has_variants(f) for f in fnames)
    name, caller, _ = call_info
    work_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "gemini"))
    gemini_vcf = get_multisample_vcf(fnames, name, caller, data)
    if use_gemini:
        # Restrict to passing variants unless any sample asks for all of them.
        passonly = not any("gemini_allvariants" in dd.get_tools_on(d) for d in samples)
        gemini_vcf = multiallelic.to_single(gemini_vcf, data, passonly=passonly)
    gemini_vcf = _run_vcfanno(gemini_vcf, data, use_gemini)
    gemini_db = os.path.join(work_dir, "%s-%s.db" % (name, caller))
    needs_build = (vcfutils.vcf_has_variants(gemini_vcf)
                   and not utils.file_exists(gemini_db)
                   and use_gemini)
    if needs_build:
        ped_file = create_ped_file(samples + extras, gemini_vcf)
        # Use original approach for hg19/GRCh37 pending additional testing
        use_orig = support_gemini_orig(data) and not any(dd.get_vcfanno(d) for d in samples)
        build_db = create_gemini_db_orig if use_orig else create_gemini_db
        gemini_db = build_db(gemini_vcf, data, gemini_db, ped_file)
    info = {"db": gemini_db if utils.file_exists(gemini_db) else None,
            "vcf": gemini_vcf,
            "decomposed": use_gemini}
    return [[(name, caller), info]]