Ejemplo n.º 1
0
def _do_run(paired):
    """Run Battenberg CNV calling on a tumor/normal pair.

    Results are written straight into the structural variant work directory
    rather than a temporary transaction directory, because Battenberg
    handles restarts itself.

    Returns the dictionary from ``_get_battenberg_out`` with an additional
    ``"ignore"`` entry holding the path of the ignore-chromosomes file.
    """
    tumor = paired.tumor_data
    work_dir = _sv_workdir(tumor)
    ignore_file = os.path.join(work_dir, "ignore_chromosomes.txt")
    out = _get_battenberg_out(paired, work_dir)
    if len(_missing_files(out)) > 0:
        ref_file = dd.get_ref_file(tumor)
        # Battenberg resources live in a "battenberg" directory next to the
        # directory containing the reference file.
        bat_datadir = os.path.normpath(
            os.path.join(os.path.dirname(ref_file), os.pardir, "battenberg"))
        impute_info = os.path.join(bat_datadir, "impute", "impute_info.txt")
        ignore_file = _make_ignore_file(work_dir, ref_file, impute_info, ignore_file)
        template = ("export R_LIBS_USER={local_sitelib} && "
                    "{perl_exports} && "
                    "battenberg.pl -t {cores} -o {work_dir} -r {ref_file}.fai "
                    "-tb {tumor_bam} -nb {normal_bam} -e {bat_datadir}/impute/impute_info.txt "
                    "-u {bat_datadir}/1000genomesloci -c {bat_datadir}/probloci.txt "
                    "-ig {ignore_file} "
                    "-assembly {genome_build} -species Human -platform {platform}")
        fields = {
            "work_dir": work_dir,
            "ref_file": ref_file,
            "bat_datadir": bat_datadir,
            "ignore_file": ignore_file,
            "local_sitelib": os.path.join(install.get_defaults().get("tooldir", "/usr/local"),
                                          "lib", "R", "site-library"),
            "perl_exports": utils.get_perl_exports(),
            "tumor_bam": paired.tumor_bam,
            "normal_bam": paired.normal_bam,
            "platform": dd.get_platform(tumor),
            "genome_build": tumor["genome_build"],
            # Use half of the available cores (at least one) to avoid
            # over-using memory during imputation.
            "cores": max(1, int(dd.get_num_cores(tumor) * 0.5)),
        }
        do.run(template.format(**fields), "Battenberg CNV calling")
    assert len(_missing_files(out)) == 0, "Missing Battenberg output: %s" % _missing_files(out)
    out["ignore"] = ignore_file
    return out
Ejemplo n.º 2
0
def run(data):
    """Run miRDeep2 on the prepared small RNA reads and parse novel miRNAs.

    Only the first sample (``data[0][0]``) is consulted for the work
    directory, reference genome and miRBase/Rfam resources.

    miRDeep2 is run only when the ``miRDeep2.pl`` script and the Rfam file
    exist and no results file is present yet. A failing run is logged as a
    warning instead of aborting the pipeline.

    Returns the result of ``_parse_novel`` when the results file exists,
    otherwise ``None``.
    """
    sample = data[0][0]
    work_dir = dd.get_work_dir(sample)
    genome = dd.get_ref_file(sample)
    mirdeep2 = os.path.join(os.path.dirname(sys.executable), "miRDeep2.pl")
    perl_exports = get_perl_exports()
    # Placeholders passed to miRDeep2 when no miRBase hairpin file is available.
    hairpin, mature, species = "none", "none", "na"
    rfam_file = dd.get_mirdeep2_file(sample)
    if file_exists(dd.get_mirbase_hairpin(sample)):
        species = dd.get_species(sample)
        hairpin = dd.get_mirbase_hairpin(sample)
        mature = dd.get_mirbase_mature(sample)

    logger.debug("Preparing for mirdeep2 analysis.")
    bam_file = op.join(work_dir, "align", "seqs.bam")
    seqs_dir = op.join(work_dir, "seqcluster", "prepare")
    collapsed = op.join(seqs_dir, "seqs.ma")
    out_dir = op.join(work_dir, "mirdeep2")
    out_file = op.join(out_dir, "result_res.csv")
    safe_makedir(out_dir)
    with chdir(out_dir):
        collapsed, bam_file = _prepare_inputs(collapsed, bam_file, out_dir)
        # Format the command exactly once: formatting the already-substituted
        # string again would re-interpret any braces present in the paths.
        cmd = ("{perl_exports} && perl {mirdeep2} {collapsed} {genome} {bam_file} {mature} none {hairpin} "
               "-f {rfam_file} -r simple -c -P -t {species} -z res").format(
                   perl_exports=perl_exports, mirdeep2=mirdeep2, collapsed=collapsed,
                   genome=genome, bam_file=bam_file, mature=mature, hairpin=hairpin,
                   rfam_file=rfam_file, species=species)
        if file_exists(mirdeep2) and not file_exists(out_file) and file_exists(rfam_file):
            try:
                do.run(cmd, "Running mirdeep2.")
            except Exception:
                # Best effort: a miRDeep2 failure must not stop the pipeline,
                # but KeyboardInterrupt/SystemExit are allowed to propagate.
                logger.warning("mirdeep2 failed. Please report the error to https://github.com/lpantano/mirdeep2_core/issues.")
        if file_exists(out_file):
            return _parse_novel(out_file, dd.get_species(sample))
    return None
Ejemplo n.º 3
0
def _run_scalpel_paired(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None):
    """Detect indels with Scalpel.

    This is used for paired tumor / normal samples. When the pairing has no
    normal BAM, calling is delegated to ``_run_scalpel_caller`` and its
    result is returned directly.

    If ``out_file`` is not given it is derived from the first BAM name with
    a ``-paired-variants.vcf.gz`` suffix. Calling is skipped when
    ``out_file`` already exists.

    Returns the result of ``annotation.annotate_nongatk_vcf`` for ``out_file``.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            paired = get_paired_bams(align_bams, items)
            # No matched normal: fall back to the single sample caller.
            if not paired.normal_bam:
                ann_file = _run_scalpel_caller(align_bams, items, ref_file,
                                               assoc_files, region, out_file)
                return ann_file
            vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
            perl_exports = utils.get_perl_exports(os.path.dirname(tx_out_file))
            tmp_path = "%s-scalpel-work" % utils.splitext_plus(out_file)[0]
            db_file = os.path.join(tmp_path, "main", "somatic.db")
            # Only run discovery when the somatic database ".dir" file is absent
            # (presumably left by an earlier completed discovery run -- TODO confirm).
            if not os.path.exists(db_file + ".dir"):
                # Remove any existing work directory before re-running discovery.
                if os.path.exists(tmp_path):
                    utils.remove_safe(tmp_path)
                opts = " ".join(_scalpel_options_from_config(items, config, out_file, region, tmp_path))
                opts += " --ref {}".format(ref_file)
                opts += " --dir %s" % tmp_path
                # calling
                cl = ("{perl_exports} && "
                      "scalpel-discovery --somatic {opts} --tumor {paired.tumor_bam} --normal {paired.normal_bam}")
                do.run(cl.format(**locals()), "Genotyping paired variants with Scalpel", {})
            # filtering to adjust input parameters
            bed_opts = " ".join(_scalpel_bed_file_opts(items, config, out_file, region, tmp_path))
            # NOTE: use_defaults is hard-coded to True, so the else branch below
            # (custom scalpel-export filtering) is currently never executed.
            use_defaults = True
            if use_defaults:
                scalpel_tmp_file = os.path.join(tmp_path, "main/somatic.indel.vcf")
            # Uses default filters but can tweak min-alt-count-tumor and min-phred-fisher
            # to swap precision for sensitivity
            else:
                scalpel_tmp_file = os.path.join(tmp_path, "main/somatic-indel-filter.vcf.gz")
                with file_transaction(config, scalpel_tmp_file) as tx_indel_file:
                    cmd = ("{perl_exports} && "
                           "scalpel-export --somatic {bed_opts} --ref {ref_file} --db {db_file} "
                           "--min-alt-count-tumor 5 --min-phred-fisher 10 --min-vaf-tumor 0.1 "
                           "| bgzip -c > {tx_indel_file}")
                    do.run(cmd.format(**locals()), "Scalpel somatic indel filter", {})
            scalpel_tmp_file = bgzip_and_index(scalpel_tmp_file, config)
            scalpel_tmp_file_common = bgzip_and_index(os.path.join(tmp_path, "main/common.indel.vcf"), config)
            # Only compress the final output when the target name ends in "gz".
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            bcftools_cmd_chi2 = get_scalpel_bcftools_filter_expression("chi2", config)
            bcftools_cmd_common = get_scalpel_bcftools_filter_expression("reject", config)
            fix_ambig = vcfutils.fix_ambiguous_cl()
            # Concatenate the filtered somatic and common indel calls, then
            # fix ambiguous bases, sort and write to the transactional output.
            cl2 = ("vcfcat <({bcftools_cmd_chi2} {scalpel_tmp_file}) "
                   "<({bcftools_cmd_common} {scalpel_tmp_file_common}) | "
                   " {fix_ambig} | {vcfstreamsort} {compress_cmd} > {tx_out_file}")
            do.run(cl2.format(**locals()), "Finalising Scalpel variants", {})

    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files.get("dbsnp"), ref_file,
                                               config)
    return ann_file
Ejemplo n.º 4
0
def _mint_trna_annotation(data):
    """Quantify tRNA fragments for one sample with MINTmap.

    Returns the per-sample ``trna_mint`` work directory. MINTmap is not run
    when the lookup table or the ``MINTmap.pl`` script is missing (an info
    message is logged), nor when the expected expression file is already
    present in the work directory.
    """
    trna_lookup = op.join(dd.get_srna_mint_lookup(data))
    trna_space = op.join(dd.get_srna_mint_space(data))
    trna_other = op.join(dd.get_srna_mint_other(data))
    sample = dd.get_sample_name(data)
    work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "trna_mint", sample))
    fastq_name = op.basename(data["clean_fastq"])
    bin_dir = os.path.dirname(sys.executable)
    mintmap = os.path.realpath(os.path.join(bin_dir, "MINTmap.pl"))
    perl_export = utils.get_perl_exports()
    if not (file_exists(trna_lookup) and file_exists(mintmap)):
        logger.info("There is no tRNA annotation to run MINTmap.")
        return work_dir
    jar_folder = os.path.join(os.path.dirname(mintmap), "MINTplates")
    expected = op.join(work_dir, sample + "-MINTmap_v1-exclusive-tRFs.expression.txt")
    if file_exists(expected):
        return work_dir
    # Run inside a temporary directory, then move the MINTmap outputs
    # into the sample work directory.
    with tx_tmpdir(data) as txdir, utils.chdir(txdir):
        utils.symlink_plus(data["clean_fastq"], op.join(txdir, fastq_name))
        template = ("{perl_export} && {mintmap} -f {in_file} -p {name} "
                    "-l {trna_lookup} -s {trna_space} -j {jar_folder} "
                    "-o {trna_other}")
        cmd = template.format(perl_export=perl_export, mintmap=mintmap,
                              in_file=fastq_name, name=sample,
                              trna_lookup=trna_lookup, trna_space=trna_space,
                              jar_folder=jar_folder, trna_other=trna_other)
        do.run(cmd, "tRNA for %s" % sample)
        for produced in glob.glob("*MINTmap*"):
            shutil.move(produced, work_dir)
    return work_dir
Ejemplo n.º 5
0
def _trna_annotation(data):
    """Quantify tRNAs for one sample with tDRmapper.

    Returns the per-sample ``trna`` work directory. tDRmapper is not run
    when the tRNA reference or the ``TdrMappingScripts.pl`` script is
    missing (an info message is logged), nor when the expected
    ``.hq_cs.mapped`` file is already present in the work directory.
    """
    trna_ref = op.join(dd.get_srna_trna_file(data))
    sample = dd.get_sample_name(data)
    sample_dir = os.path.join(dd.get_work_dir(data), "trna", sample)
    work_dir = utils.safe_makedir(sample_dir)
    fastq_name = op.basename(data["clean_fastq"])
    bin_dir = os.path.dirname(sys.executable)
    tdrmapper = os.path.join(bin_dir, "TdrMappingScripts.pl")
    perl_export = utils.get_perl_exports()
    if not (file_exists(trna_ref) and file_exists(tdrmapper)):
        logger.info("There is no tRNA annotation to run TdrMapper.")
        return work_dir
    expected = op.join(work_dir, fastq_name + ".hq_cs.mapped")
    if file_exists(expected):
        return work_dir
    # Run inside a temporary directory, then move the mapped outputs
    # into the sample work directory.
    with tx_tmpdir(data) as txdir, utils.chdir(txdir):
        utils.symlink_plus(data["clean_fastq"], op.join(txdir, fastq_name))
        template = "{perl_export} && perl {tdrmapper} {trna_ref} {in_file}"
        cmd = template.format(perl_export=perl_export, tdrmapper=tdrmapper,
                              trna_ref=trna_ref, in_file=fastq_name)
        do.run(cmd, "tRNA for %s" % sample)
        for produced in glob.glob("*mapped*"):
            shutil.move(produced, work_dir)
    return work_dir
Ejemplo n.º 6
0
def run(data):
    """Run miRDeep2 on the prepared small RNA reads and parse novel miRNAs.

    Only the first sample (``data[0][0]``) is consulted for the work
    directory, reference genome, species and miRBase location. The hairpin,
    mature and Rfam FASTA files are expected in the directory that holds
    the miRBase reference.

    miRDeep2 is run only when the ``miRDeep2.pl`` script, the mature miRNA
    file and the Rfam file exist and no results file is present yet.

    Returns the result of ``_parse_novel`` when the results file exists,
    otherwise ``None``.
    """
    sample = data[0][0]
    work_dir = dd.get_work_dir(sample)
    genome = dd.get_ref_file(sample)
    mirdeep2 = os.path.join(os.path.dirname(sys.executable), "miRDeep2.pl")
    perl_exports = get_perl_exports()
    mirbase = op.abspath(op.dirname(dd.get_mirbase_ref(sample)))
    species = dd.get_species(sample)
    hairpin = op.join(mirbase, "hairpin.fa")
    mature = op.join(mirbase, "mature.fa")
    rfam_file = op.join(mirbase, "Rfam_for_miRDeep.fa")
    bam_file = op.join(work_dir, "align", "seqs.bam")
    seqs_dir = op.join(work_dir, "seqcluster", "prepare")
    collapsed = op.join(seqs_dir, "seqs.ma")
    out_dir = op.join(work_dir, "mirdeep2")
    out_file = op.join(out_dir, "result_res.csv")
    safe_makedir(out_dir)
    with chdir(out_dir):
        collapsed, bam_file = _prepare_inputs(collapsed, bam_file, out_dir)
        # Format the command exactly once: formatting the already-substituted
        # string again would re-interpret any braces present in the paths.
        cmd = ("{perl_exports} && {mirdeep2} {collapsed} {genome} {bam_file} {mature} none {hairpin} "
               "-f {rfam_file} -r simple -c -d -P -t {species} -z res").format(
                   perl_exports=perl_exports, mirdeep2=mirdeep2, collapsed=collapsed,
                   genome=genome, bam_file=bam_file, mature=mature, hairpin=hairpin,
                   rfam_file=rfam_file, species=species)
        if (file_exists(mirdeep2) and not file_exists(out_file)
                and file_exists(mature) and file_exists(rfam_file)):
            do.run(cmd, "Running mirdeep2.")
        if file_exists(out_file):
            return _parse_novel(out_file, species)
    return None
Ejemplo n.º 7
0
def _mint_trna_annotation(data):
    """Quantify tRNA fragments for one sample with MINTmap.

    Returns the per-sample ``trna_mint`` work directory. MINTmap is not run
    when the lookup table or the ``MINTmap.pl`` script is missing (an info
    message is logged), nor when the expected expression file is already
    present in the work directory.
    """
    trna_lookup = op.join(dd.get_srna_mint_lookup(data))
    trna_space = op.join(dd.get_srna_mint_space(data))
    trna_other = op.join(dd.get_srna_mint_other(data))
    sample = dd.get_sample_name(data)
    work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "trna_mint", sample))
    fastq_name = op.basename(data["clean_fastq"])
    bin_dir = os.path.dirname(sys.executable)
    mintmap = os.path.realpath(os.path.join(bin_dir, "MINTmap.pl"))
    perl_export = utils.get_perl_exports()
    if not (file_exists(trna_lookup) and file_exists(mintmap)):
        logger.info("There is no tRNA annotation to run MINTmap.")
        return work_dir
    jar_folder = os.path.join(os.path.dirname(mintmap), "MINTplates")
    expected = op.join(work_dir, sample + "-MINTmap_v1-exclusive-tRFs.expression.txt")
    if file_exists(expected):
        return work_dir
    # Run inside a temporary directory, then move the MINTmap outputs
    # into the sample work directory.
    with tx_tmpdir(data) as txdir, utils.chdir(txdir):
        utils.symlink_plus(data["clean_fastq"], op.join(txdir, fastq_name))
        template = ("{perl_export} && {mintmap} -f {in_file} -p {name} "
                    "-l {trna_lookup} -s {trna_space} -j {jar_folder} "
                    "-o {trna_other}")
        cmd = template.format(perl_export=perl_export, mintmap=mintmap,
                              in_file=fastq_name, name=sample,
                              trna_lookup=trna_lookup, trna_space=trna_space,
                              jar_folder=jar_folder, trna_other=trna_other)
        do.run(cmd, "tRNA for %s" % sample)
        for produced in glob.glob("*MINTmap*"):
            shutil.move(produced, work_dir)
    return work_dir
Ejemplo n.º 8
0
def run_vep(in_file, data):
    """Annotate a bgzipped VCF with the Ensembl Variant Effect Predictor.

    Returns ``None`` when the input has no variants or when no annotated
    file exists afterwards (for example because no VEP cache directory was
    found). Otherwise returns the path of the ``-vepeffects`` output file
    after it has been passed through ``vcfutils.bgzip_and_index``.
    """
    if not vcfutils.vcf_has_variants(in_file):
        return None
    out_file = utils.append_stem(in_file, "-vepeffects")
    assert in_file.endswith(".gz") and out_file.endswith(".gz")
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            vep_dir, ensembl_name = prep_vep_cache(
                data["genome_build"], tz.get_in(["reference", "fasta", "base"], data))
            if vep_dir:
                num_cores = tz.get_in(("config", "algorithm", "num_cores"), data, 1)
                fork_args = []
                if num_cores > 1:
                    fork_args = ["--fork", str(num_cores)]
                vep = config_utils.get_program("vep", data["config"])
                is_human = tz.get_in(["genome_resources", "aliases", "human"], data, False)
                # HGVS annotation needs a bgzip compressed, faidx indexed
                # reference; without one it is unusably slow, so it is only
                # enabled when the compressed reference is available.
                compressed_ref = dd.get_ref_file_compressed(data)
                hgvs_compatible = bool(compressed_ref)
                if hgvs_compatible:
                    config_args = ["--fasta", compressed_ref]
                else:
                    config_args = ["--fasta", dd.get_ref_file(data)]
                if is_human:
                    plugin_fns = {"loftee": _get_loftee, "maxentscan": _get_maxentscan,
                                  "genesplicer": _get_genesplicer, "spliceregion": _get_spliceregion}
                    plugins = ["loftee"]
                    if "vep_splicesite_annotations" in dd.get_tools_on(data):
                        # "genesplicer" is too unstable, so it is not enabled here
                        plugins.extend(["maxentscan", "spliceregion"])
                    for plugin in plugins:
                        config_args.extend(plugin_fns[plugin](data))
                    config_args.extend(["--sift", "b", "--polyphen", "b"])
                    if hgvs_compatible:
                        config_args.extend(["--hgvs", "--shift_hgvs", "1"])
                wants_canonical = dd.get_effects_transcripts(data).startswith("canonical")
                if wants_canonical or tz.get_in(("config", "algorithm", "clinical_reporting"), data):
                    config_args.append("--pick_allele")
                if ensembl_name.endswith("_merged"):
                    config_args.append("--merged")
                    ensembl_name = ensembl_name.replace("_merged", "")
                resources = config_utils.get_resources("vep", data["config"])
                extra_args = [str(opt) for opt in resources.get("options", [])]
                base_args = [vep, "--vcf", "-o", "stdout", "-i", in_file]
                annotation_args = [
                    "--species", ensembl_name,
                    "--no_stats", "--cache",
                    "--offline", "--dir", vep_dir,
                    "--symbol", "--numbers", "--biotype", "--total_length", "--canonical",
                    "--gene_phenotype", "--ccds", "--uniprot", "--domains", "--regulatory",
                    "--protein", "--tsl", "--appris", "--af", "--max_af", "--af_1kg", "--af_esp", "--af_gnomad",
                    "--pubmed", "--variant_class", "--allele_number"]
                vep_cl = base_args + fork_args + extra_args + annotation_args + config_args
                perl_exports = utils.get_perl_exports()
                # The sed step collapses empty fields (';;') on non-header
                # lines, since they can cause parsing errors downstream.
                cmd = "%s && %s | sed '/^#/! s/;;/;/g' | bgzip -c > %s" % (
                    perl_exports, " ".join(vep_cl), tx_out_file)
                do.run(cmd, "Ensembl variant effect predictor", data)
    if not utils.file_exists(out_file):
        return None
    vcfutils.bgzip_and_index(out_file, data["config"])
    return out_file
Ejemplo n.º 9
0
def run_vep(in_file, data):
    """Annotate a bgzipped VCF with the Ensembl Variant Effect Predictor.

    Returns ``None`` when the input has no variants or when no annotated
    file exists afterwards (for example because no VEP cache directory was
    found). Otherwise returns the result of ``vcfutils.bgzip_and_index``
    applied to the ``-vepeffects`` output file.
    """
    if not vcfutils.vcf_has_variants(in_file):
        return None
    out_file = utils.append_stem(in_file, "-vepeffects")
    assert in_file.endswith(".gz") and out_file.endswith(".gz")
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            vep_dir, ensembl_name = prep_vep_cache(
                data["genome_build"], tz.get_in(["reference", "fasta", "base"], data))
            if vep_dir:
                num_cores = tz.get_in(("config", "algorithm", "num_cores"), data, 1)
                fork_args = []
                if num_cores > 1:
                    fork_args = ["--fork", str(num_cores)]
                vep = config_utils.get_program("vep", data["config"])
                is_human = tz.get_in(["genome_resources", "aliases", "human"], data, False)
                # HGVS annotation needs a bgzip compressed, faidx indexed
                # reference; without one it is unusably slow, so it is only
                # enabled when the compressed reference is available.
                compressed_ref = dd.get_ref_file_compressed(data)
                hgvs_compatible = bool(compressed_ref)
                if hgvs_compatible:
                    config_args = ["--fasta", compressed_ref]
                else:
                    config_args = ["--fasta", dd.get_ref_file(data)]
                if is_human:
                    plugin_fns = {"loftee": _get_loftee, "maxentscan": _get_maxentscan,
                                  "genesplicer": _get_genesplicer, "spliceregion": _get_spliceregion}
                    plugins = ["loftee"]
                    if "vep_splicesite_annotations" in dd.get_tools_on(data):
                        # "genesplicer" is too unstable, so it is not enabled here
                        plugins.extend(["maxentscan", "spliceregion"])
                    for plugin in plugins:
                        config_args.extend(plugin_fns[plugin](data))
                    config_args.extend(["--sift", "b", "--polyphen", "b"])
                    if hgvs_compatible:
                        config_args.extend(["--hgvs", "--shift_hgvs", "1"])
                wants_canonical = dd.get_effects_transcripts(data).startswith("canonical")
                if wants_canonical or tz.get_in(("config", "algorithm", "clinical_reporting"), data):
                    config_args.append("--pick_allele")
                if ensembl_name.endswith("_merged"):
                    config_args.append("--merged")
                    ensembl_name = ensembl_name.replace("_merged", "")
                resources = config_utils.get_resources("vep", data["config"])
                extra_args = [str(opt) for opt in resources.get("options", [])]
                base_args = [vep, "--vcf", "-o", "stdout", "-i", in_file]
                annotation_args = [
                    "--species", ensembl_name,
                    "--no_stats", "--cache",
                    "--offline", "--dir", vep_dir,
                    "--symbol", "--numbers", "--biotype", "--total_length", "--canonical",
                    "--gene_phenotype", "--ccds", "--uniprot", "--domains", "--regulatory",
                    "--protein", "--tsl", "--appris", "--af", "--max_af", "--af_1kg", "--af_esp", "--af_gnomad",
                    "--pubmed", "--variant_class", "--allele_number"]
                vep_cl = base_args + fork_args + extra_args + annotation_args + config_args
                perl_exports = utils.get_perl_exports()
                # The sed step collapses empty fields (';;') on non-header
                # lines, since they can cause parsing errors downstream.
                cmd = "%s && %s | sed '/^#/! s/;;/;/g' | bgzip -c > %s" % (
                    perl_exports, " ".join(vep_cl), tx_out_file)
                do.run(cmd, "Ensembl variant effect predictor", data)
    if not utils.file_exists(out_file):
        return None
    return vcfutils.bgzip_and_index(out_file, data["config"])
Ejemplo n.º 10
0
def prep_vep_cache(dbkey, ref_file, tooldir=None, config=None):
    """Ensure correct installation of the VEP cache for a genome build.

    Reads ``<dbkey>-resources.yaml`` next to ``ref_file`` and looks up the
    ``aliases -> ensembl`` entry, which must have the form
    ``<species>_vep_<version>``.

    Args:
        dbkey: Genome build name used to locate the resources YAML file.
        ref_file: Path to the reference file; the resources file is
            expected in the same directory and the cache is installed in a
            ``vep`` directory one level above it.
        tooldir: Optional installation prefix; when given, the VEP helper
            scripts are run from ``<tooldir>/bin/``.
        config: Unused by this function; kept for interface compatibility.

    Returns:
        ``(cache_dir, species)`` when an ensembl alias is configured,
        otherwise ``(None, None)``.

    Raises:
        ValueError: If the ensembl alias does not contain ``_vep_``.
    """
    if config is None:
        config = {}
    resource_file = os.path.join(os.path.dirname(ref_file),
                                 "%s-resources.yaml" % dbkey)
    if not os.path.exists(resource_file):
        return None, None
    with open(resource_file) as in_handle:
        # safe_load: the resources file only needs plain YAML, and calling
        # yaml.load without an explicit Loader is unsafe / unsupported in
        # recent PyYAML releases.
        resources = yaml.safe_load(in_handle)
    ensembl_name = tz.get_in(["aliases", "ensembl"], resources)
    symlink_dir = _special_dbkey_maps(dbkey, ref_file)
    if not ensembl_name:
        return None, None
    if "_vep_" not in ensembl_name:
        raise ValueError("%s has an incorrect ensembl value (%s). "
                         "It should have _vep_ in the name. "
                         "Remove line or fix the name to avoid error."
                         % (resource_file, ensembl_name))
    species, vepv = ensembl_name.split("_vep_")
    if symlink_dir:
        return symlink_dir, species
    vep_dir = utils.safe_makedir(
        os.path.normpath(
            os.path.join(os.path.dirname(os.path.dirname(ref_file)), "vep")))
    out_dir = os.path.join(vep_dir, species, vepv)
    if not os.path.exists(out_dir):
        tmp_dir = utils.safe_makedir(os.path.join(vep_dir, species, "txtmp"))
        eversion = vepv.split("_")[0]
        url = "ftp://ftp.ensembl.org/pub/release-%s/variation/VEP/%s.tar.gz" % (
            eversion, ensembl_name)
        with utils.chdir(tmp_dir):
            # -c lets wget continue a partially downloaded archive.
            subprocess.check_call(
                ["wget", "--no-check-certificate", "-c", url])
        vep_path = "%s/bin/" % tooldir if tooldir else ""
        perl_exports = utils.get_perl_exports()
        cmd = [
            "%svep_install" % vep_path, "-a", "c", "-s", ensembl_name,
            "-c", vep_dir, "-u", tmp_dir, "--NO_UPDATE", "--VERSION",
            eversion
        ]
        do.run("%s && %s" % (perl_exports, " ".join(cmd)),
               "Prepare VEP directory for %s" % ensembl_name)
        cmd = [
            "%svep_convert_cache" % vep_path, "--species", species,
            "--version", vepv, "--dir", vep_dir, "--force_overwrite",
            "--remove"
        ]
        do.run("%s && %s" % (perl_exports, " ".join(cmd)),
               "Convert VEP cache to tabix %s" % ensembl_name)
        # Clean up the downloaded archive(s) and the download directory.
        for tmp_fname in os.listdir(tmp_dir):
            os.remove(os.path.join(tmp_dir, tmp_fname))
        os.rmdir(tmp_dir)
    # Remove a leftover "tmp" directory inside the cache, if present.
    tmp_dir = os.path.join(vep_dir, "tmp")
    if os.path.exists(tmp_dir):
        shutil.rmtree(tmp_dir)
    return vep_dir, species
Ejemplo n.º 11
0
def _run_scalpel_caller(align_bams, items, ref_file, assoc_files,
                        region=None, out_file=None):
    """Call indels on a single sample with Scalpel.

    If ``out_file`` is not given it is derived from the first BAM name with
    a ``-variants.vcf.gz`` suffix. Calling is skipped when ``out_file``
    already exists.

    Raises ``ValueError`` when more than one BAM is supplied.

    Returns the result of ``annotation.annotate_nongatk_vcf`` for ``out_file``.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            if len(align_bams) > 1:
                raise ValueError("Scalpel does not currently support batch calling!")
            input_bams = " ".join(["%s" % bam for bam in align_bams])
            final_work = "%s-scalpel-work" % utils.splitext_plus(out_file)[0]
            tx_work = "%s-scalpel-work" % utils.splitext_plus(tx_out_file)[0]
            if os.path.exists(final_work):
                utils.remove_safe(final_work)
            min_cov = "3"  # minimum coverage
            base_opts = " ".join(
                _scalpel_options_from_config(items, config, out_file, region, final_work))
            opts = base_opts + " --dir %s" % tx_work + " --mincov %s" % min_cov
            perl_exports = utils.get_perl_exports(os.path.dirname(tx_out_file))
            discovery = (
                "{perl_exports} && "
                "scalpel-discovery --single {opts} --ref {ref_file} --bam {input_bams} "
            )
            do.run(discovery.format(perl_exports=perl_exports, opts=opts,
                                    ref_file=ref_file, input_bams=input_bams),
                   "Genotyping with Scalpel", {})
            # Scalpel wrote into the transactional work directory; move it to
            # its final location before post-processing.
            shutil.move(tx_work, final_work)
            # parse produced variant file further
            raw_indels = bgzip_and_index(
                os.path.join(final_work, "variants.indel.vcf"), config)
            if out_file.endswith("gz"):
                compress_cmd = "| bgzip -c"
            else:
                compress_cmd = ""
            chi2_filter = get_scalpel_bcftools_filter_expression("chi2", config)
            sample_name = items[0]["name"][1]
            fix_ambig = vcfutils.fix_ambiguous_cl()
            # The sed step rewrites the generic sample column header to the
            # sample name taken from the first item.
            finalize = (
                "{bcftools_cmd_chi2} {scalpel_tmp_file} | "
                r"sed 's/FORMAT\tsample\(_name\)\{{0,1\}}/FORMAT\t{sample_name_str}/g' "
                "| {fix_ambig} | vcfallelicprimitives -t DECOMPOSED --keep-geno | vcffixup - | vcfstreamsort "
                "{compress_cmd} > {tx_out_file}")
            do.run(finalize.format(bcftools_cmd_chi2=chi2_filter,
                                   scalpel_tmp_file=raw_indels,
                                   sample_name_str=sample_name,
                                   fix_ambig=fix_ambig,
                                   compress_cmd=compress_cmd,
                                   tx_out_file=tx_out_file),
                   "Finalising Scalpel variants", {})
    return annotation.annotate_nongatk_vcf(out_file, align_bams,
                                           assoc_files.get("dbsnp"),
                                           ref_file, config)
Ejemplo n.º 12
0
def run_vep(in_file, data):
    """Annotate a bgzipped VCF with the Ensembl Variant Effect Predictor.

    Returns ``None`` when the input has no variants or when no annotated
    file exists afterwards (for example because no VEP cache directory was
    found). Otherwise returns the path of the ``-vepeffects`` output file
    after it has been passed through ``vcfutils.bgzip_and_index``.
    """
    if not vcfutils.vcf_has_variants(in_file):
        return None
    out_file = utils.append_stem(in_file, "-vepeffects")
    assert in_file.endswith(".gz") and out_file.endswith(".gz")
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            vep_dir, ensembl_name = prep_vep_cache(
                data["genome_build"], tz.get_in(["reference", "fasta", "base"], data))
            if vep_dir:
                num_cores = tz.get_in(("config", "algorithm", "num_cores"), data, 1)
                fork_args = []
                if num_cores > 1:
                    fork_args = ["--fork", str(num_cores)]
                vep = config_utils.get_program("variant_effect_predictor.pl", data["config"])
                # Plugin and prediction options stay empty unless the genome is human.
                dbnsfp_args, dbnsfp_fields = [], []
                loftee_args, loftee_fields = [], []
                prediction_args, prediction_fields = [], []
                if tz.get_in(["genome_resources", "aliases", "human"], data, False):
                    dbnsfp_args, dbnsfp_fields = _get_dbnsfp(data)
                    loftee_args, loftee_fields = _get_loftee(data)
                    prediction_args = ["--sift", "b", "--polyphen", "b"]
                    prediction_fields = ["PolyPhen", "SIFT"]
                clinical_args, clinical_fields = [], []
                if tz.get_in(("config", "algorithm", "clinical_reporting"), data, False):
                    # In case of clinical reporting, we need one and only one variant per gene
                    # http://useast.ensembl.org/info/docs/tools/vep/script/vep_other.html#pick
                    # HGVS reporting is also enabled, which requires an indexed reference file
                    clinical_args = ["--pick", "--hgvs", "--shift_hgvs", "1", "--fasta", dd.get_ref_file(data)]
                    clinical_fields = ["HGVSc", "HGVSp"]
                std_fields = (["Consequence", "Codons", "Amino_acids", "Gene", "SYMBOL", "Feature", "EXON"]
                              + prediction_fields
                              + ["Protein_position", "BIOTYPE", "CANONICAL", "CCDS"])
                resources = config_utils.get_resources("vep", data["config"])
                extra_args = [str(opt) for opt in resources.get("options", [])]
                all_fields = std_fields + dbnsfp_fields + loftee_fields + clinical_fields
                base_args = [vep, "--vcf", "-o", "stdout", "-i", in_file]
                annotation_args = [
                    "--species", ensembl_name,
                    "--no_stats",
                    "--cache", "--offline", "--dir", vep_dir,
                    "--symbol", "--numbers", "--biotype", "--total_length", "--canonical", "--gene_phenotype", "--ccds",
                    "--fields", ",".join(all_fields)]
                vep_cl = (base_args + fork_args + extra_args + annotation_args
                          + prediction_args + dbnsfp_args + loftee_args + clinical_args)

                perl_exports = utils.get_perl_exports()
                # The sed step collapses empty fields (';;') on non-header
                # lines, since they can cause parsing errors downstream.
                cmd = "%s && %s | sed '/^#/! s/;;/;/g' | bgzip -c > %s" % (
                    perl_exports, " ".join(vep_cl), tx_out_file)
                do.run(cmd, "Ensembl variant effect predictor", data)
    if not utils.file_exists(out_file):
        return None
    vcfutils.bgzip_and_index(out_file, data["config"])
    return out_file
Ejemplo n.º 13
0
def run_vep(in_file, data):
    """Annotate a bgzipped VCF with the Ensembl Variant Effect Predictor.

    Returns ``None`` when the input has no variants or when no annotated
    file exists afterwards (for example because no VEP cache directory was
    found). Otherwise returns the path of the ``-vepeffects`` output file
    after it has been passed through ``vcfutils.bgzip_and_index``.
    """
    if not vcfutils.vcf_has_variants(in_file):
        return None
    out_file = utils.append_stem(in_file, "-vepeffects")
    assert in_file.endswith(".gz") and out_file.endswith(".gz")
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            vep_dir, ensembl_name = prep_vep_cache(
                data["genome_build"], tz.get_in(["reference", "fasta", "base"], data))
            if vep_dir:
                num_cores = tz.get_in(("config", "algorithm", "num_cores"), data, 1)
                fork_args = []
                if num_cores > 1:
                    fork_args = ["--fork", str(num_cores)]
                vep = config_utils.get_program("variant_effect_predictor.pl", data["config"])
                is_human = tz.get_in(["genome_resources", "aliases", "human"], data, False)
                config_args, config_fields, prediction_fields = [], [], []
                if is_human:
                    plugin_fns = {"dbnsfp": _get_dbnsfp, "loftee": _get_loftee, "dbscsnv": _get_dbscsnv,
                                  "maxentscan": _get_maxentscan, "genesplicer": _get_genesplicer}
                    # Plugins come from the VEP resources configuration,
                    # defaulting to dbnsfp and loftee.
                    plugins = tz.get_in(("config", "resources", "vep", "plugins"), data, ["dbnsfp", "loftee"])
                    for plugin in plugins:
                        plugin_args, plugin_fields = plugin_fns[plugin](data)
                        config_args.extend(plugin_args)
                        config_fields.extend(plugin_fields)
                    config_args.extend(["--sift", "b", "--polyphen", "b"])
                    prediction_fields.extend(["PolyPhen", "SIFT"])
                    # Use HGVS by default, requires indexing the reference genome
                    config_args.extend(["--hgvs", "--shift_hgvs", "1", "--fasta", dd.get_ref_file(data)])
                    config_fields.extend(["HGVSc", "HGVSp"])
                wants_canonical = dd.get_effects_transcripts(data).startswith("canonical")
                if wants_canonical or tz.get_in(("config", "algorithm", "clinical_reporting"), data):
                    config_args.append("--pick")
                std_fields = (["Consequence", "Codons", "Amino_acids", "Gene", "SYMBOL", "Feature", "EXON"]
                              + prediction_fields
                              + ["Protein_position", "BIOTYPE", "CANONICAL", "CCDS"])
                resources = config_utils.get_resources("vep", data["config"])
                extra_args = [str(opt) for opt in resources.get("options", [])]
                base_args = [vep, "--vcf", "-o", "stdout", "-i", in_file]
                annotation_args = [
                    "--species", ensembl_name,
                    "--no_stats",
                    "--cache", "--offline", "--dir", vep_dir,
                    "--symbol", "--numbers", "--biotype", "--total_length", "--canonical",
                    "--gene_phenotype", "--ccds",
                    "--fields", ",".join(std_fields + config_fields)]
                vep_cl = base_args + fork_args + extra_args + annotation_args + config_args
                perl_exports = utils.get_perl_exports()
                # The sed step collapses empty fields (';;') on non-header
                # lines, since they can cause parsing errors downstream.
                cmd = "%s && %s | sed '/^#/! s/;;/;/g' | bgzip -c > %s" % (
                    perl_exports, " ".join(vep_cl), tx_out_file)
                do.run(cmd, "Ensembl variant effect predictor", data)
    if not utils.file_exists(out_file):
        return None
    vcfutils.bgzip_and_index(out_file, data["config"])
    return out_file
Ejemplo n.º 14
0
def prep_vep_cache(dbkey, ref_file, tooldir=None, config=None):
    """Ensure correct installation of VEP cache file.

    Looks for ``<dbkey>-resources.yaml`` next to ``ref_file`` and reads its
    ``aliases -> ensembl`` entry, which must have the form
    ``<species>_vep_<version>``.

    dbkey -- genome build key, used to name the resources file
    ref_file -- path to the reference file; the resources file is looked up
        in its directory and the cache is installed under ``../vep``
    tooldir -- optional install prefix; when given, the VEP helper scripts
        are run from ``<tooldir>/bin/``
    config -- not used by this function; kept for interface compatibility

    Returns a ``(vep_dir, species)`` tuple, or ``(None, None)`` when the
    resources file is missing or defines no ensembl alias.
    Raises ValueError when the ensembl alias does not contain ``_vep_``.
    """
    resource_file = os.path.join(os.path.dirname(ref_file), "%s-resources.yaml" % dbkey)
    if not os.path.exists(resource_file):
        return None, None
    with open(resource_file) as in_handle:
        resources = yaml.safe_load(in_handle)
    ensembl_name = tz.get_in(["aliases", "ensembl"], resources)
    # Called before validation, as in the original ordering, in case the
    # helper does work of its own.
    symlink_dir = _special_dbkey_maps(dbkey, ref_file)
    if not ensembl_name:
        return None, None
    if "_vep_" not in ensembl_name:
        raise ValueError("%s has an incorrect ensembl alias value: %s. "
                         "It should have _vep_ in the name. "
                         "Remove the line or fix the name to avoid this error."
                         % (resource_file, ensembl_name))
    species, vepv = ensembl_name.split("_vep_")
    if symlink_dir:
        return symlink_dir, species
    vep_dir = utils.safe_makedir(os.path.normpath(os.path.join(
        os.path.dirname(os.path.dirname(ref_file)), "vep")))
    out_dir = os.path.join(vep_dir, species, vepv)
    if not os.path.exists(out_dir):
        # Download into a scratch directory, install and convert the cache,
        # then clean the scratch directory up.
        tmp_dir = utils.safe_makedir(os.path.join(vep_dir, species, "txtmp"))
        eversion = vepv.split("_")[0]
        url = "http://ftp.ensembl.org/pub/release-%s/variation/VEP/%s.tar.gz" % (eversion, ensembl_name)
        with utils.chdir(tmp_dir):
            # -c resumes a partial download left by an earlier attempt
            subprocess.check_call(["wget", "--no-check-certificate", "-c", url])
        vep_path = "%s/bin/" % tooldir if tooldir else ""
        perl_exports = utils.get_perl_exports()
        cmd = ["%svep_install" % vep_path, "-a", "c", "-s", ensembl_name,
               "-c", vep_dir, "-u", tmp_dir, "--NO_UPDATE", "--VERSION", eversion]
        do.run("%s && %s" % (perl_exports, " ".join(cmd)), "Prepare VEP directory for %s" % ensembl_name)
        cmd = ["%svep_convert_cache" % vep_path, "--species", species, "--version", vepv,
               "--dir", vep_dir, "--force_overwrite", "--remove"]
        do.run("%s && %s" % (perl_exports, " ".join(cmd)), "Convert VEP cache to tabix %s" % ensembl_name)
        for tmp_fname in os.listdir(tmp_dir):
            os.remove(os.path.join(tmp_dir, tmp_fname))
        os.rmdir(tmp_dir)
    # Remove a leftover top-level "tmp" directory if one exists
    tmp_dir = os.path.join(vep_dir, "tmp")
    if os.path.exists(tmp_dir):
        shutil.rmtree(tmp_dir)
    return vep_dir, species
Ejemplo n.º 15
0
def _run_scalpel_caller(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None):
    """Call indels on a single sample with Scalpel.

    Runs ``scalpel-discovery --single`` inside a transactional work
    directory, then filters and normalises the resulting indel VCF into
    ``out_file``. Raises ValueError if more than one BAM is supplied.
    Returns the output of ``annotation.annotate_nongatk_vcf`` for
    ``out_file``.
    """
    config = items[0]["config"]
    if out_file is None:
        bam_base = os.path.splitext(align_bams[0])[0]
        out_file = "%s-variants.vcf.gz" % bam_base
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            if len(align_bams) > 1:
                raise ValueError("Scalpel does not currently support batch calling!")
            bam_arg = " ".join("%s" % bam for bam in align_bams)
            final_work = "%s-scalpel-work" % utils.splitext_plus(out_file)[0]
            tx_work = "%s-scalpel-work" % utils.splitext_plus(tx_out_file)[0]
            # Clear out any work directory left by a previous run
            if os.path.exists(final_work):
                utils.remove_safe(final_work)
            base_opts = " ".join(_scalpel_options_from_config(items, config, out_file, region, final_work))
            min_cov = "3"  # minimum coverage
            scalpel_opts = "%s --dir %s --mincov %s" % (base_opts, tx_work, min_cov)
            discovery_tmpl = ("{perl_exports} && "
                              "scalpel-discovery --single {opts} --ref {ref_file} --bam {input_bams} ")
            discovery_cmd = discovery_tmpl.format(
                perl_exports=utils.get_perl_exports(os.path.dirname(tx_out_file)),
                opts=scalpel_opts,
                ref_file=ref_file,
                input_bams=bam_arg)
            do.run(discovery_cmd, "Genotyping with Scalpel", {})
            shutil.move(tx_work, final_work)
            # Post-process the raw indel calls produced by Scalpel
            raw_vcf = bgzip_and_index(os.path.join(final_work, "variants.indel.vcf"), config)
            if out_file.endswith("gz"):
                compress_step = "| bgzip -c"
            else:
                compress_step = ""
            chi2_filter = get_scalpel_bcftools_filter_expression("chi2", config)
            sample_name = items[0]["name"][1]
            ambig_fix = vcfutils.fix_ambiguous_cl()
            finalize_tmpl = ("{bcftools_cmd_chi2} {scalpel_tmp_file} | "
                             r"sed 's/FORMAT\tsample\(_name\)\{{0,1\}}/FORMAT\t{sample_name_str}/g' "
                             "| {fix_ambig} | vcfallelicprimitives -t DECOMPOSED --keep-geno | vcffixup - | vcfstreamsort "
                             "{compress_cmd} > {tx_out_file}")
            finalize_cmd = finalize_tmpl.format(
                bcftools_cmd_chi2=chi2_filter,
                scalpel_tmp_file=raw_vcf,
                sample_name_str=sample_name,
                fix_ambig=ambig_fix,
                compress_cmd=compress_step,
                tx_out_file=tx_out_file)
            do.run(finalize_cmd, "Finalising Scalpel variants", {})
    return annotation.annotate_nongatk_vcf(out_file, align_bams,
                                           assoc_files.get("dbsnp"),
                                           ref_file, config)
Ejemplo n.º 16
0
def run(data):
    """Run miRDeep2 on the collapsed small RNA reads of a project.

    Settings are read from the first sample, ``data[0][0]``. Results are
    written under ``<work_dir>/mirdeep2``; the miRDeep2 command is only run
    when ``result_res.csv`` is not already there.

    Returns the value of ``_parse_novel`` for the result file, or None when
    the Rfam file or the miRDeep2 script is missing, or when no result file
    was produced.
    """
    sample = data[0][0]
    work_dir = dd.get_work_dir(sample)
    genome = dd.get_ref_file(sample)
    mirdeep2 = os.path.join(os.path.dirname(sys.executable), "miRDeep2.pl")
    perl_exports = get_perl_exports()
    # Defaults used when no miRBase hairpin file is available
    hairpin, mature, species = "none", "none", "na"
    rfam_file = dd.get_mirdeep2_file(sample)
    if file_exists(dd.get_mirbase_hairpin(sample)):
        species = dd.get_species(sample)
        hairpin = dd.get_mirbase_hairpin(sample)
        mature = dd.get_mirbase_mature(sample)

    logger.debug("Preparing for mirdeep2 analysis.")
    bam_file = op.join(work_dir, "align", "seqs.bam")
    seqs_dir = op.join(work_dir, "seqcluster", "prepare")
    collapsed = op.join(seqs_dir, "seqs.ma")
    out_dir = op.join(work_dir, "mirdeep2")
    out_file = op.join(out_dir, "result_res.csv")
    safe_makedir(out_dir)
    if not file_exists(rfam_file):
        logger.warning("mirdeep2 Rfam file not instaled. Skipping...")
        return None
    if not file_exists(mirdeep2):
        logger.warning("mirdeep2 executable file not found. Skipping...")
        return None
    with chdir(out_dir):
        collapsed, bam_file = _prepare_inputs(collapsed, bam_file, out_dir)
        if not file_exists(out_file):
            # Format the template exactly once: a second pass would try to
            # re-interpret any braces contained in the substituted paths.
            cmd = (
                "{perl_exports} && perl {mirdeep2} {collapsed} {genome} {bam_file} {mature} none {hairpin} -f {rfam_file} -r simple -c -P -t {species} -z res"
            ).format(perl_exports=perl_exports, mirdeep2=mirdeep2,
                     collapsed=collapsed, genome=genome, bam_file=bam_file,
                     mature=mature, hairpin=hairpin, rfam_file=rfam_file,
                     species=species)
            try:
                do.run(cmd, "Running mirdeep2.")
            except Exception:
                # Best effort: a miRDeep2 failure is reported but does not
                # abort the caller. KeyboardInterrupt and SystemExit are
                # deliberately not caught here.
                logger.warning(
                    "mirdeep2 failed. Please report the error to https://github.com/lpantano/mirdeep2_core/issues."
                )
        if file_exists(out_file):
            return _parse_novel(out_file, dd.get_species(sample))
    return None
Ejemplo n.º 17
0
def _do_run(paired):
    """Run Battenberg copy number calling on a tumor/normal pair.

    Output is written directly into the structural variant work directory
    rather than a temporary one, because Battenberg can resume from
    partially completed runs. Returns the dictionary of expected outputs
    with the ignore file added under the "ignore" key.
    """
    work_dir = _sv_workdir(paired.tumor_data)
    ignore_file = os.path.join(work_dir, "ignore_chromosomes.txt")
    out = _get_battenberg_out(paired, work_dir)
    needs_run = len(_missing_files(out)) > 0
    if needs_run:
        ref_file = dd.get_ref_file(paired.tumor_data)
        ref_parent = os.path.join(os.path.dirname(ref_file), os.pardir)
        bat_datadir = os.path.normpath(os.path.join(ref_parent, "battenberg"))
        impute_info = os.path.join(bat_datadir, "impute", "impute_info.txt")
        ignore_file = _make_ignore_file(work_dir, ref_file, impute_info, ignore_file)
        tooldir = install.get_defaults().get("tooldir", "/usr/local")
        fields = {
            "local_sitelib": os.path.join(tooldir, "lib", "R", "site-library"),
            "perl_exports": utils.get_perl_exports(),
            "work_dir": work_dir,
            "ref_file": ref_file,
            "tumor_bam": paired.tumor_bam,
            "normal_bam": paired.normal_bam,
            "bat_datadir": bat_datadir,
            "ignore_file": ignore_file,
        }
        fields["platform"] = dd.get_platform(paired.tumor_data)
        fields["genome_build"] = paired.tumor_data["genome_build"]
        # Use half of the available cores so imputation does not run out of memory
        fields["cores"] = max(1, int(dd.get_num_cores(paired.tumor_data) * 0.5))
        template = (
            "export R_LIBS_USER={local_sitelib} && "
            "{perl_exports} && "
            "battenberg.pl -t {cores} -o {work_dir} -r {ref_file}.fai "
            "-tb {tumor_bam} -nb {normal_bam} -e {bat_datadir}/impute/impute_info.txt "
            "-u {bat_datadir}/1000genomesloci -c {bat_datadir}/probloci.txt "
            "-ig {ignore_file} "
            "-assembly {genome_build} -species Human -platform {platform}")
        do.run(template.format(**fields), "Battenberg CNV calling")
    assert len(_missing_files(out)) == 0, \
        "Missing Battenberg output: %s" % _missing_files(out)
    out["ignore"] = ignore_file
    return out
Ejemplo n.º 18
0
def _trna_annotation(data):
    """Quantify tRNAs in the sample's cleaned reads with tDRmapper.

    Returns the per-sample tRNA work directory. The mapper is skipped when
    the tRNA reference or the tDRmapper script is missing, or when the
    mapped output is already present.
    """
    trna_ref = op.join(dd.get_srna_trna_file(data))
    name = dd.get_sample_name(data)
    sample_dir = os.path.join(dd.get_work_dir(data), "trna", name)
    work_dir = utils.safe_makedir(sample_dir)
    in_file = op.basename(data["clean_fastq"])
    tdrmapper = os.path.join(os.path.dirname(sys.executable), "TdrMappingScripts.pl")
    perl_export = utils.get_perl_exports()
    if not (file_exists(trna_ref) and file_exists(tdrmapper)):
        logger.info("There is no tRNA annotation to run TdrMapper.")
        return work_dir
    out_file = op.join(work_dir, in_file + ".hq_cs.mapped")
    if file_exists(out_file):
        return work_dir
    # Run inside a scratch directory and move the mapped results out afterwards
    with tx_tmpdir(data) as txdir, utils.chdir(txdir):
        utils.symlink_plus(data["clean_fastq"], op.join(txdir, in_file))
        mapper_cmd = "{perl_export} && perl {tdrmapper} {trna_ref} {in_file}".format(
            perl_export=perl_export, tdrmapper=tdrmapper,
            trna_ref=trna_ref, in_file=in_file)
        do.run(mapper_cmd, "tRNA for %s" % name)
        for mapped in glob.glob("*mapped*"):
            shutil.move(mapped, work_dir)
    return work_dir
Ejemplo n.º 19
0
def run_vep(in_file, data):
    """Annotate input VCF file with Ensembl variant effect predictor.

    in_file -- bgzipped input VCF (must end in ``.gz``)
    data -- sample dictionary; read for the genome build, reference paths,
        core count, VEP resources/plugins and reporting options

    Returns the path of the ``-vepeffects`` output file when it exists after
    the run, otherwise None (no variants in the input, or no VEP cache
    available).
    """
    if not vcfutils.vcf_has_variants(in_file):
        return None
    out_file = utils.append_stem(in_file, "-vepeffects")
    assert in_file.endswith(".gz") and out_file.endswith(".gz")
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            vep_dir, ensembl_name = prep_vep_cache(
                data["genome_build"],
                tz.get_in(["reference", "fasta", "base"], data))
            if vep_dir:
                cores = tz.get_in(("config", "algorithm", "num_cores"), data,
                                  1)
                fork_args = ["--fork", str(cores)] if cores > 1 else []
                vep = config_utils.get_program("variant_effect_predictor.pl",
                                               data["config"])
                is_human = tz.get_in(["genome_resources", "aliases", "human"],
                                     data, False)
                config_args, config_fields, prediction_fields = [], [], []
                if is_human:
                    # Plugin name -> function returning (arguments, fields)
                    plugin_fns = {
                        "dbnsfp": _get_dbnsfp,
                        "loftee": _get_loftee,
                        "dbscsnv": _get_dbscsnv,
                        "maxentscan": _get_maxentscan,
                        "genesplicer": _get_genesplicer
                    }
                    plugins = tz.get_in(
                        ("config", "resources", "vep", "plugins"), data,
                        ["dbnsfp", "loftee"])
                    for plugin in plugins:
                        plugin_args, plugin_fields = plugin_fns[plugin](data)
                        config_args += plugin_args
                        # Accumulate into config_fields so the plugin output
                        # columns are included in --fields below.
                        config_fields += plugin_fields
                    config_args += ["--sift", "b", "--polyphen", "b"]
                    prediction_fields += ["PolyPhen", "SIFT"]
                    # Use HGVS by default, requires indexing the reference genome
                    config_args += [
                        "--hgvs", "--shift_hgvs", "1", "--fasta",
                        dd.get_ref_file(data)
                    ]
                    config_fields += ["HGVSc", "HGVSp"]
                if (dd.get_effects_transcripts(data).startswith("canonical")
                        or tz.get_in(
                            ("config", "algorithm", "clinical_reporting"),
                            data)):
                    # Report a single consequence per variant
                    config_args += ["--pick"]
                std_fields = [
                    "Consequence", "Codons", "Amino_acids", "Gene", "SYMBOL",
                    "Feature", "EXON"
                ] + prediction_fields + [
                    "Protein_position", "BIOTYPE", "CANONICAL", "CCDS"
                ]
                resources = config_utils.get_resources("vep", data["config"])
                extra_args = [str(x) for x in resources.get("options", [])]
                cmd = [vep, "--vcf", "-o", "stdout", "-i", in_file] + fork_args + extra_args + \
                      ["--species", ensembl_name,
                       "--no_stats",
                       "--cache", "--offline", "--dir", vep_dir,
                       "--symbol", "--numbers", "--biotype", "--total_length", "--canonical",
                       "--gene_phenotype", "--ccds",
                       "--fields", ",".join(std_fields + config_fields)] + config_args
                perl_exports = utils.get_perl_exports()
                # Remove empty fields (';;') which can cause parsing errors downstream
                cmd = "%s && %s | sed '/^#/! s/;;/;/g' | bgzip -c > %s" % (
                    perl_exports, " ".join(cmd), tx_out_file)
                do.run(cmd, "Ensembl variant effect predictor", data)
    if utils.file_exists(out_file):
        vcfutils.bgzip_and_index(out_file, data["config"])
        return out_file
    return None
Ejemplo n.º 20
0
def run_vep(in_file, data):
    """Run Ensembl VEP on a bgzipped VCF and return the annotated file.

    Gives back None when the input has no variants; otherwise returns the
    ``-vepeffects`` output path if that file exists once processing is done.
    """
    if not vcfutils.vcf_has_variants(in_file):
        return None
    out_file = utils.append_stem(in_file, "-vepeffects")
    assert in_file.endswith(".gz") and out_file.endswith(".gz")
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            base_ref = tz.get_in(["reference", "fasta", "base"], data)
            vep_dir, ensembl_name = prep_vep_cache(data["genome_build"], base_ref)
            if vep_dir:
                cores = tz.get_in(("config", "algorithm", "num_cores"), data, 1)
                if cores > 1:
                    fork_args = ["--fork", str(cores)]
                else:
                    fork_args = []
                vep = config_utils.get_program("variant_effect_predictor.pl", data["config"])
                # Plugin and prediction options start empty and are only
                # filled in for human genomes.
                dbnsfp_args, dbnsfp_fields = [], []
                loftee_args, loftee_fields = [], []
                prediction_args, prediction_fields = [], []
                if tz.get_in(["genome_resources", "aliases", "human"], data, False):
                    dbnsfp_args, dbnsfp_fields = _get_dbnsfp(data)
                    loftee_args, loftee_fields = _get_loftee(data)
                    prediction_args = ["--sift", "b", "--polyphen", "b"]
                    prediction_fields = ["PolyPhen", "SIFT"]
                leading_fields = ["Consequence", "Codons", "Amino_acids", "Gene", "SYMBOL",
                                  "Feature", "EXON"]
                trailing_fields = ["Protein_position", "BIOTYPE", "CANONICAL", "CCDS"]
                report_fields = (leading_fields + prediction_fields + trailing_fields
                                 + dbnsfp_fields + loftee_fields)
                resources = config_utils.get_resources("vep", data["config"])
                extra_args = [str(opt) for opt in resources.get("options", [])]
                vep_cl = [vep, "--vcf", "-o", "stdout", "-i", in_file]
                vep_cl += fork_args
                vep_cl += extra_args
                vep_cl += ["--species", ensembl_name,
                           "--no_stats",
                           "--cache", "--offline", "--dir", vep_dir,
                           "--symbol", "--numbers", "--biotype", "--total_length",
                           "--canonical", "--ccds",
                           "--fields", ",".join(report_fields)]
                vep_cl += prediction_args + dbnsfp_args + loftee_args
                if tz.get_in(("config", "algorithm", "clinical_reporting"), data, False):
                    # Clinical reporting needs exactly one consequence per
                    # variant, so ask VEP to pick a single line of
                    # consequence data for each one.
                    vep_cl += ["--pick"]
                perl_exports = utils.get_perl_exports()
                # Collapse empty INFO entries (';;') on non-header lines; they
                # can break downstream parsers.
                shell_cmd = "%s && %s | sed '/^#/! s/;;/;/g' | bgzip -c > %s" % (
                    perl_exports, " ".join(vep_cl), tx_out_file)
                do.run(shell_cmd, "Ensembl variant effect predictor", data)
    if utils.file_exists(out_file):
        vcfutils.bgzip_and_index(out_file, data["config"])
        return out_file
Ejemplo n.º 21
0
def run_vep(in_file, data):
    """Annotate a bgzipped VCF with the Ensembl variant effect predictor.

    Returns None for an input without variants. Otherwise returns the path
    of the ``-vepeffects`` file when it is present after processing.
    """
    if not vcfutils.vcf_has_variants(in_file):
        return None
    out_file = utils.append_stem(in_file, "-vepeffects")
    assert in_file.endswith(".gz") and out_file.endswith(".gz")
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            vep_dir, ensembl_name = prep_vep_cache(
                data["genome_build"], tz.get_in(["reference", "fasta", "base"], data))
            if vep_dir:
                cores = tz.get_in(("config", "algorithm", "num_cores"), data, 1)
                fork_args = [] if not cores > 1 else ["--fork", str(cores)]
                vep = config_utils.get_program("variant_effect_predictor.pl", data["config"])
                is_human = tz.get_in(["genome_resources", "aliases", "human"], data, False)
                if not is_human:
                    # No plugin or prediction options outside of human
                    dbnsfp_args, dbnsfp_fields = [], []
                    loftee_args, loftee_fields = [], []
                    prediction_args, prediction_fields = [], []
                else:
                    dbnsfp_args, dbnsfp_fields = _get_dbnsfp(data)
                    loftee_args, loftee_fields = _get_loftee(data)
                    prediction_args = ["--sift", "b", "--polyphen", "b"]
                    prediction_fields = ["PolyPhen", "SIFT"]
                std_fields = ["Consequence", "Codons", "Amino_acids", "Gene", "SYMBOL", "Feature", "EXON"]
                std_fields = std_fields + prediction_fields
                std_fields = std_fields + ["Protein_position", "BIOTYPE", "CANONICAL", "CCDS"]
                resources = config_utils.get_resources("vep", data["config"])
                extra_args = [str(x) for x in resources.get("options", [])]
                input_args = [vep, "--vcf", "-o", "stdout", "-i", in_file]
                cache_args = ["--species", ensembl_name,
                              "--no_stats",
                              "--cache", "--offline", "--dir", vep_dir]
                report_args = ["--symbol", "--numbers", "--biotype", "--total_length",
                               "--canonical", "--ccds",
                               "--fields", ",".join(std_fields + dbnsfp_fields + loftee_fields)]
                plugin_args = prediction_args + dbnsfp_args + loftee_args
                cmd = input_args + fork_args + extra_args + cache_args + report_args + plugin_args
                clinical = tz.get_in(("config", "algorithm", "clinical_reporting"), data, False)
                if clinical:
                    # For clinical reports keep a single consequence per
                    # variant by letting VEP pick one line of consequence
                    # data.
                    cmd.append("--pick")
                perl_exports = utils.get_perl_exports()
                # Strip empty fields (';;') from non-header lines to avoid
                # parsing errors downstream.
                pipeline = "%s && %s | sed '/^#/! s/;;/;/g' | bgzip -c > %s" % (
                    perl_exports, " ".join(cmd), tx_out_file)
                do.run(pipeline, "Ensembl variant effect predictor", data)
    if utils.file_exists(out_file):
        vcfutils.bgzip_and_index(out_file, data["config"])
        return out_file
Ejemplo n.º 22
0
def run_vep(in_file, data):
    """Annotate a bgzipped VCF with Ensembl VEP, with optional clinical output.

    Returns None when the input holds no variants. Otherwise returns the
    ``-vepeffects`` output path if that file exists after processing.
    """
    if not vcfutils.vcf_has_variants(in_file):
        return None
    out_file = utils.append_stem(in_file, "-vepeffects")
    assert in_file.endswith(".gz") and out_file.endswith(".gz")
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            base_ref = tz.get_in(["reference", "fasta", "base"], data)
            vep_dir, ensembl_name = prep_vep_cache(data["genome_build"], base_ref)
            if vep_dir:
                cores = tz.get_in(("config", "algorithm", "num_cores"), data, 1)
                if cores > 1:
                    fork_args = ["--fork", str(cores)]
                else:
                    fork_args = []
                vep = config_utils.get_program("variant_effect_predictor.pl", data["config"])
                # Human-only plugin and prediction options; empty otherwise
                dbnsfp_args, dbnsfp_fields = [], []
                loftee_args, loftee_fields = [], []
                prediction_args, prediction_fields = [], []
                if tz.get_in(["genome_resources", "aliases", "human"], data, False):
                    dbnsfp_args, dbnsfp_fields = _get_dbnsfp(data)
                    loftee_args, loftee_fields = _get_loftee(data)
                    prediction_args = ["--sift", "b", "--polyphen", "b"]
                    prediction_fields = ["PolyPhen", "SIFT"]
                clinical_args, clinical_fields = [], []
                if tz.get_in(("config", "algorithm", "clinical_reporting"), data, False):
                    # Clinical reporting: one consequence per variant, see
                    # http://useast.ensembl.org/info/docs/tools/vep/script/vep_other.html#pick
                    # HGVS output is added too, which needs the reference
                    # file to be indexed.
                    clinical_args = ["--pick", "--hgvs", "--shift_hgvs", "1", "--fasta",
                                     dd.get_ref_file(data)]
                    clinical_fields = ["HGVSc", "HGVSp"]
                std_fields = (["Consequence", "Codons", "Amino_acids", "Gene", "SYMBOL",
                               "Feature", "EXON"]
                              + prediction_fields
                              + ["Protein_position", "BIOTYPE", "CANONICAL", "CCDS"])
                all_fields = std_fields + dbnsfp_fields + loftee_fields + clinical_fields
                resources = config_utils.get_resources("vep", data["config"])
                extra_args = [str(opt) for opt in resources.get("options", [])]
                vep_cl = [vep, "--vcf", "-o", "stdout", "-i", in_file]
                vep_cl += fork_args
                vep_cl += extra_args
                vep_cl += ["--species", ensembl_name,
                           "--no_stats",
                           "--cache", "--offline", "--dir", vep_dir,
                           "--symbol", "--numbers", "--biotype", "--total_length",
                           "--canonical", "--gene_phenotype", "--ccds",
                           "--fields", ",".join(all_fields)]
                vep_cl += prediction_args + dbnsfp_args + loftee_args + clinical_args
                perl_exports = utils.get_perl_exports()
                # Drop empty fields (';;') on non-header lines; they can
                # cause parsing errors downstream.
                shell_cmd = "%s && %s | sed '/^#/! s/;;/;/g' | bgzip -c > %s" % (
                    perl_exports, " ".join(vep_cl), tx_out_file)
                do.run(shell_cmd, "Ensembl variant effect predictor", data)
    if utils.file_exists(out_file):
        vcfutils.bgzip_and_index(out_file, data["config"])
        return out_file