def run_vep(mt: hl.MatrixTable,
            genome_version: str,
            name: str = 'vep',
            block_size: int = 1000,
            vep_config_json_path=None) -> hl.MatrixTable:
    """Annotate a MatrixTable with VEP.

    When ``vep_config_json_path`` is supplied it is used as the VEP config, the
    global ``gencodeVersion`` is set to "unknown", and ``genome_version`` is not
    checked. Otherwise ``genome_version`` must be "37" or "38" and the config at
    ``file:///vep_data/vep-gcloud.json`` is used.

    :param MatrixTable mt: MT to annotate with VEP
    :param str genome_version: "37" or "38"
    :param str name: Name for resulting row field
    :param int block_size: Number of rows to process per VEP invocation.
    :param vep_config_json_path: Optional VEP config path that replaces the default config
    :return: annotated MT
    :rtype: MatrixTable
    :raises ValueError: if no config path is given and ``genome_version`` is not "37" or "38"
    """
    use_default_config = vep_config_json_path is None

    if use_default_config:
        # The build is only validated on this path; it does not change the config used.
        if genome_version not in ("37", "38"):
            raise ValueError(f"Invalid genome version: {genome_version}")
        config = "file:///vep_data/vep-gcloud.json"
    else:
        mt = mt.annotate_globals(gencodeVersion="unknown")
        config = vep_config_json_path

    annotated = hl.vep(mt, config=config, name=name, block_size=block_size)

    logger.info("==> Done with VEP")
    return annotated
def format_clinvar_variants(ds):
    """Run VEP on ClinVar variants and reduce each row to the fields selected below.

    Rows whose ``alleles`` array does not contain exactly two entries are
    removed before annotation. The raw ``vep`` field is replaced by
    ``sorted_transcript_consequences``.

    :param ds: Hail table of ClinVar variants with ``alleles``, ``info`` and ``rsid`` fields
    :return: table with the selected fields plus ``chrom``, ``variant_id`` and ``xpos``
    """

    def _sorted_labels(labels):
        # The sort key replaces "^_" with "z"; the sorted values then have "^_" removed.
        # NOTE(review): presumably "^_" is a regex for a leading underscore - confirm
        # against Hail's StringExpression.replace.
        ordered = hl.sorted(labels, key=lambda s: s.replace("^_", "z"))
        return ordered.map(lambda s: s.replace("^_", ""))

    # There are some variants with only one entry in alleles, ignore them for now.
    # TODO: These could be displayed in the ClinVar track even though they will never match a gnomAD variant.
    ds = ds.filter(hl.len(ds.alleles) == 2)

    # When a cluster is started with hailctl dataproc start cluster_name --vep, the init script for the
    # selected version of VEP links the appropriate configuration file to /vep_data/vep-gcloud.json
    ds = hl.vep(ds, "file:///vep_data/vep-gcloud.json", name="vep", block_size=1000)
    consequences = sorted_transcript_consequences_v3(ds.vep)
    ds = ds.annotate(sorted_transcript_consequences=consequences).drop("vep")

    ds = ds.select(
        clinical_significance=_sorted_labels(ds.info.CLNSIG),
        clinvar_variation_id=ds.rsid,
        gold_stars=get_gold_stars(ds.info.CLNREVSTAT),
        review_status=_sorted_labels(ds.info.CLNREVSTAT),
        sorted_transcript_consequences=ds.sorted_transcript_consequences,
    )

    return ds.annotate(
        chrom=normalized_contig(ds.locus),
        variant_id=variant_id(ds.locus, ds.alleles),
        xpos=x_position(ds.locus),
    )
def make_clinvar_hail2(clinvar_vcf_path, clinvar_variants_table,
                       clinvar_mt_out_path):
    """
    Import ClinVar vcf file, and turn it into a usable Hail2 mt

    The VCF rows are annotated (as ``va``) with the matching row of the variants
    table, split with ``split_multi_dynamic``, repartitioned to 100 partitions,
    annotated with VEP using the module-level ``vep_config`` and written to
    ``clinvar_mt_out_path`` (overwriting any existing output). The written table
    is then read back, its rows are shown, and it is returned.

    :param str clinvar_vcf_path: Example : "gs://gnomad-berylc/tx-annotation/hail2/clinvar_alleles_single.b37.vcf.bgz"
    :param str clinvar_variants_table: Example : "gs://gnomad-berylc/tx-annotation/hail2/clinvar_alleles_single.b37.variants_table.tsv"
    :param str clinvar_mt_out_path: "gs://gnomad-resources/clinvar/hail-0.2/clinvar_alleles.single.b37.hail2.vepped.mt"
    :return: split and VEP'd MT, read back from ``clinvar_mt_out_path``
    :rtype: MatrixTable
    """
    clinvar_mt = hl.import_vcf(clinvar_vcf_path)

    # Key the variants table by (locus, alleles) parsed from its `v` column so it
    # can be joined onto the VCF rows.
    variants_table = hl.import_table(clinvar_variants_table, impute=True)
    variants_table = variants_table.annotate(
        v=hl.parse_variant(variants_table.v))
    variants_table = (variants_table.annotate(
        locus=variants_table.v.locus,
        alleles=variants_table.v.alleles).key_by('locus', 'alleles'))

    clinvar_mt = clinvar_mt.annotate_rows(
        va=variants_table[clinvar_mt.locus, clinvar_mt.alleles])

    clinvar_mt = split_multi_dynamic(clinvar_mt, left_aligned=False)
    clinvar_mt = clinvar_mt.repartition(100)
    clinvar_vep = hl.vep(clinvar_mt, vep_config)
    clinvar_vep.write(clinvar_mt_out_path, overwrite=True)

    # Read the result back from disk for a quick visual check, and return it so
    # the function honours its documented return value (it used to return None).
    t = hl.read_matrix_table(clinvar_mt_out_path)
    t.rows().show()
    return t
Example #4
0
def prepare_clinvar_variants(vcf_path, reference_genome):
    """Import a ClinVar VCF, annotate it with VEP and select the output fields.

    Rows whose ``alleles`` array does not contain exactly two entries are
    removed. ``hl.vep`` is called without an explicit config.

    :param vcf_path: path passed to ``import_clinvar_vcf``
    :param reference_genome: reference genome passed to ``import_clinvar_vcf``
    :return: table with the selected fields plus ``chrom``, ``variant_id`` and ``xpos``
    """

    def _sorted_labels(labels):
        # The sort key replaces "^_" with "z"; the sorted values then have "^_" removed.
        # NOTE(review): presumably "^_" is a regex for a leading underscore - confirm
        # against Hail's StringExpression.replace.
        ordered = hl.sorted(labels, key=lambda s: s.replace("^_", "z"))
        return ordered.map(lambda s: s.replace("^_", ""))

    ds = import_clinvar_vcf(vcf_path, reference_genome)

    # There are some variants with only one entry in alleles, ignore them for now.
    # These could be displayed in the ClinVar track even though they will never match a gnomAD variant.
    ds = hl.vep(ds.filter(hl.len(ds.alleles) == 2))

    ds = ds.select(
        clinical_significance=_sorted_labels(ds.info.CLNSIG),
        clinvar_variation_id=ds.rsid,
        gold_stars=get_gold_stars(ds.info.CLNREVSTAT),
        review_status=_sorted_labels(ds.info.CLNREVSTAT),
        vep=ds.vep,
    )

    return ds.annotate(
        chrom=normalized_contig(ds.locus.contig),
        variant_id=variant_id(ds.locus, ds.alleles),
        xpos=x_position(ds.locus),
    )
def run_vep(
        mt: hl.MatrixTable,
        genome_version: str,
        name: str = 'vep',
        block_size: int = 1000) -> hl.MatrixTable:
    """Annotate a MatrixTable with VEP using the config for the given genome build.

    The global ``gencodeVersion`` is set to "19" for build "37" and to "25" for
    build "38" before VEP is run.

    :param MatrixTable mt: MT to annotate with VEP
    :param str genome_version: "37" or "38"
    :param str name: Name for resulting row field
    :param int block_size: Number of rows to process per VEP invocation.
    :return: annotated MT
    :rtype: MatrixTable
    :raises ValueError: if ``genome_version`` is neither "37" nor "38"
    """
    if genome_version not in ("37", "38"):
        raise ValueError(f"Invalid genome version: {genome_version}")

    if genome_version == "37":
        # see gs://hail-common/vep/vep/homo_sapiens/85_GRCh38/info.txt
        # NOTE(review): the referenced path names GRCh38 although this is the
        # build-37 branch - confirm the source of the gencode version.
        gencode_version = "19"
        config = "hdfs:///user/hdfs/vep85-loftee-gcloud.json"
    else:
        gencode_version = "25"
        config = "gs://hail-common/vep/vep/vep95-GRCh38-loftee-gcloud.json"

    mt = mt.annotate_globals(gencodeVersion=gencode_version)
    annotated = hl.vep(mt, config=config, name=name, block_size=block_size)

    logger.info("==> Done with VEP")
    return annotated
Example #6
0
def main(mt: str):
    """
    Run vep using main.py wrapper

    Initialises Hail with GRCh38 as the default reference, reads the
    MatrixTable at path ``mt``, keeps rows with exactly two alleles whose
    second allele is not '*', annotates them with VEP and writes the result
    to ``output_path('vep105_GRCh38.mt')``.

    :param str mt: path of the MatrixTable to annotate
    """
    hl.init(default_reference='GRCh38')

    dataset = hl.read_matrix_table(mt)

    # filter to biallelic loci only, then drop rows whose alt allele is '*'
    biallelic = dataset.filter_rows(hl.len(dataset.alleles) == 2)
    without_star = biallelic.filter_rows(biallelic.alleles[1] != '*')

    annotated = hl.vep(without_star, config='file:///vep_data/vep-gcloud.json')
    annotated.write(output_path('vep105_GRCh38.mt'))
def make_exac_release_hail2(vcf_path, mt_out):
    """
    Import the ExAC sites VCF, split it, annotate it with VEP and write it out.

    From Konrad, who had already done this. I didn't actually run the code.
    VEP is run with the module-level ``vep_config``.

    :param str vcf_path: Example   "gs://gnomad/raw/source/ExAC.r1.sites.vep.vcf.gz"
    :param str mt_out: Example: "gs://gnomad/raw/hail-0.2/vds/exac/exac.r1.sites.vep.vds" should be mt but whatevs
    :return: Writes out VEP'd Hail0.2 MatrixTable
    :rtype: None
    """
    imported = hl.import_vcf(vcf_path, force_bgz=True, min_partitions=1000)
    annotated = hl.vep(split_multi_dynamic(imported), vep_config)
    annotated.write(mt_out)
def make_gnomad_release_hail2(vcf_path, mt_out):
    """
    Used to import, filter and VEP existing "bootleg" gnomAD VCF (01.26.2018) and write out as a Hail 0.2 MatrixTable

    After splitting, a row is kept only when its allele-specific filter status
    is "PASS" and its ``filters`` field is empty. VEP is run with the
    module-level ``vep_config``.

    :param str vcf_path:
    Example: "gs://gnomad-public/release/2.0.2/vcf/exomes/gnomad.exomes.r2.0.2.sites.vcf.bgz"
    :param str mt_out:
    Example: "gs://gnomad-berylc/tx-annotation/hail2/gnomad.exomes.r2.0.2.sites.split.vep.030818.mt"
    :return: Writes out VEP'd Hail0.2 MatrixTable
    :rtype: None
    """
    split_mt = split_multi_dynamic(hl.import_vcf(vcf_path, min_partitions=8000))

    # AS_FilterStatus is indexed with a_index - 1 to pick the entry for this row's allele.
    allele_status = split_mt.info.AS_FilterStatus[split_mt.a_index - 1]
    passes = (allele_status == "PASS") & (split_mt.filters.length() == 0)

    flagged = split_mt.annotate_rows(as_pass=passes)
    passing = flagged.filter_rows(flagged.as_pass)

    hl.vep(passing, vep_config).write(mt_out)
Example #9
0
def vep_or_lookup_vep(ht,
                      reference_vep_ht=None,
                      reference=None,
                      vep_config=None):
    """
    VEP a table, or lookup variants in a reference database

    Rows found in the reference table take its ``vep`` annotation; rows that
    are not found are passed to ``hl.vep``. Both parts are unioned and returned.

    :param ht: Input Table
    :param reference_vep_ht: A reference database with VEP annotations (must be in top-level `vep`)
    :param reference: If reference_vep_ht is not specified, find a suitable one in reference (if None, grabs from hl.default_reference)
    :param vep_config: vep_config to pass to hl.vep (if None, a suitable one for `reference` is chosen)
    :return: VEPped Table
    :raises ValueError: if no reference table is given and `reference` is not GRCh37 or GRCh38
    """
    reference = hl.default_reference().name if reference is None else reference

    if reference_vep_ht is None:
        possible_refs = ("GRCh37", "GRCh38")
        if reference not in possible_refs:
            raise ValueError(
                f'vep_or_lookup_vep got {reference}. Expected one of {", ".join(possible_refs)}'
            )
        reference_vep_ht = hl.read_table(vep_context_ht_path(reference))

    looked_up = ht.annotate(vep=reference_vep_ht[ht.key].vep)
    already_annotated = looked_up.filter(hl.is_defined(looked_up.vep))
    needs_vep = looked_up.filter(hl.is_missing(looked_up.vep))

    config = vep_config_path(reference) if vep_config is None else vep_config

    return already_annotated.union(hl.vep(needs_vep, config))
Example #10
0
import hail as hl

GOLD_STD = 'gs://hail-common/vep/vep/vep_examplars/vep_no_csq_35d9e30.mt/'
GOLD_STD_CSQ = 'gs://hail-common/vep/vep/vep_examplars/vep_csq_23673e70.mt/'

# Re-run VEP on each stored exemplar (without and with csq=True) and require
# that the fresh annotation is the same as the stored table.
for path, csq in [(GOLD_STD, False), (GOLD_STD_CSQ, True)]:
    print(f"Checking 'hl.vep' replicates on '{path}'")
    expected = hl.read_matrix_table(path)
    # select_rows() with no arguments strips the stored row fields before re-annotating.
    actual = hl.vep(
        expected.select_rows(),
        'gs://hail-common/vep/vep/vep85-loftee-gcloud.json',
        csq=csq,
    )
    vep_result_agrees = actual._same(expected)
    print('TEST PASSED' if vep_result_agrees else 'TEST FAILED')
    assert vep_result_agrees
Example #11
0
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# perform sample and variant qc on remaining variants
# (`vds` and the output path variables used below are defined earlier in the
# script, outside this excerpt)
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

print("sample QC...")
vds = hl.sample_qc(vds)

print("variant QC...")
vds = hl.variant_qc(vds)


#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# add VEP
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

vds = hl.vep(vds, "gs://ccdg-qc-multi/data/vep85-GRCh38-gcloud.json")


#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# write output VDS
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# Any existing output at qced_vds_excl_file is overwritten.
print("writing VDS...")
vds.write(qced_vds_excl_file, overwrite=True)


# Export the column (sample) table, flattened, to sample_qc_info_postqc_file.
print("writing sample QC...")
vds.cols().flatten().export(sample_qc_info_postqc_file)


    import_clinvar_xml,
    "/clinvar/clinvar.ht",
    {"clinvar_xml_path": pipeline.get_task("download_clinvar_xml")},
)

# GRCh38 ClinVar tasks. Each add_task call passes a task name, a callable, an
# output path and a dict of inputs taken from other tasks; the first call also
# passes a dict of extra keyword values.
# NOTE(review): argument meanings are inferred from the call shape - confirm
# against the pipeline's add_task definition.
pipeline.add_task(
    "prepare_clinvar_grch38_variants",
    prepare_clinvar_variants,
    "/clinvar/clinvar_grch38_base.ht",
    {"clinvar_path": pipeline.get_task("import_clinvar_xml")},
    {"reference_genome": "GRCh38"},
)

# Run VEP (no explicit config) on the prepared table and drop the
# "vep_proc_id" field from the result.
pipeline.add_task(
    "vep_clinvar_grch38_variants",
    lambda path: hl.vep(hl.read_table(path)).drop("vep_proc_id"),
    "/clinvar/clinvar_grch38_vepped.ht",
    {"path": pipeline.get_task("prepare_clinvar_grch38_variants")},
)

# Transcript-consequence annotation; takes transcripts and MANE Select
# transcripts from the genes pipeline as additional inputs.
pipeline.add_task(
    "annotate_clinvar_grch38_transcript_consequences",
    annotate_transcript_consequences,
    "/clinvar/clinvar_grch38_annotated_1.ht",
    {
        "variants_path": pipeline.get_task("vep_clinvar_grch38_variants"),
        "transcripts_path": genes_pipeline.get_task("extract_grch38_transcripts"),
        "mane_transcripts_path": genes_pipeline.get_task("import_mane_select_transcripts"),
    },
)
Example #13
0
def populate_clinvar():
    """Build ``clinvar.ht`` from the local ``clinvar.vcf.gz``.

    Imports the VCF, stores the parsed release date in the ``version`` global,
    runs VEP, derives transcript-consequence annotations, selects the output
    row fields, prints a variant summary and writes the rows (ordered by
    ``variant_id``, without ``locus`` and ``alleles``) to ``clinvar.ht``,
    overwriting any existing table.
    """
    vcf_file = 'clinvar.vcf.gz'

    def _sorted_unique(values):
        # De-duplicate through a set, then sort with "^_" replaced by "z" in the key.
        return hl.sorted(hl.array(hl.set(values)),
                         key=lambda s: s.replace("^_", "z"))

    release_date = _parse_clinvar_release_date(vcf_file)
    mt = import_vcf(vcf_file,
                    "38",
                    drop_samples=True,
                    min_partitions=2000,
                    skip_invalid_loci=True)
    mt = mt.annotate_globals(version=release_date)

    print("\n=== Running VEP ===")
    mt = hl.vep(mt, 'vep85-loftee-ruddle-b38.json', name="vep")

    print("\n=== Processing ===")
    # Each annotation below reads a field added by the previous step, so the
    # three annotate_rows calls have to stay in this order.
    sorted_consequences = get_expr_for_vep_sorted_transcript_consequences_array(
        vep_root=mt.vep)
    mt = mt.annotate_rows(sortedTranscriptConsequences=sorted_consequences)

    worst_consequence = get_expr_for_worst_transcript_consequence_annotations_struct(
        vep_sorted_transcript_consequences_root=mt.sortedTranscriptConsequences)
    mt = mt.annotate_rows(main_transcript=worst_consequence)

    gene_id_set = get_expr_for_vep_gene_ids_set(
        vep_transcript_consequences_root=mt.sortedTranscriptConsequences)
    mt = mt.annotate_rows(gene_ids=gene_id_set)

    review_status_str = hl.delimit(_sorted_unique(mt.info.CLNREVSTAT))

    # Dict order is the order in which the fields are handed to select_rows;
    # the flattened main_transcript_* fields sit between gold_stars and pos.
    row_fields = {
        "allele_id": mt.info.ALLELEID,
        "alt": get_expr_for_alt_allele(mt),
        "chrom": get_expr_for_contig(mt.locus),
        "clinical_significance": hl.delimit(_sorted_unique(mt.info.CLNSIG)),
        "domains": get_expr_for_vep_protein_domains_set(
            vep_transcript_consequences_root=mt.vep.transcript_consequences),
        "gene_ids": mt.gene_ids,
        "gene_id_to_consequence_json": get_expr_for_vep_gene_id_to_consequence_map(
            vep_sorted_transcript_consequences_root=mt.sortedTranscriptConsequences,
            gene_ids=mt.gene_ids),
        "gold_stars": CLINVAR_GOLD_STARS_LOOKUP[review_status_str],
        **{
            f"main_transcript_{field}": mt.main_transcript[field]
            for field in mt.main_transcript.dtype.fields
        },
        "pos": get_expr_for_start_pos(mt),
        "ref": get_expr_for_ref_allele(mt),
        "review_status": review_status_str,
        "transcript_consequence_terms": get_expr_for_vep_consequence_terms_set(
            vep_transcript_consequences_root=mt.sortedTranscriptConsequences),
        "transcript_ids": get_expr_for_vep_transcript_ids_set(
            vep_transcript_consequences_root=mt.sortedTranscriptConsequences),
        "transcript_id_to_consequence_json": get_expr_for_vep_transcript_id_to_consequence_map(
            vep_transcript_consequences_root=mt.sortedTranscriptConsequences),
        "variant_id": get_expr_for_variant_id(mt),
        "xpos": get_expr_for_xpos(mt.locus),
    }
    mt = mt.select_rows(**row_fields)

    print("\n=== Summary ===")
    hl.summarize_variants(mt)

    # Drop key columns for export
    rows = mt.rows()
    export_rows = rows.order_by(rows.variant_id).drop("locus", "alleles")
    export_rows.write('clinvar.ht', overwrite=True)
    '''
Example #14
0
               help="Elasticsearch block size to use when exporting",
               default=200,
               type=int)
args = p.parse_args()

# Use the lower-cased index name when one was given; otherwise derive it from
# the genome version.
if args.index_name:
    index_name = args.index_name.lower()
else:
    index_name = "clinvar_grch{}".format(args.genome_version)

print("\n=== Downloading VCF ===")
mt = download_and_import_latest_clinvar_vcf(args.genome_version)
# Print the table's global fields.
print(dict(mt.globals.value))

print("\n=== Running VEP ===")
mt = hl.vep(mt, "file:///vep/vep85-gcloud.json", name="vep", block_size=1000)

print("\n=== Processing ===")
# Each annotation below reads a field added by the previous step, so the three
# annotate_rows calls have to stay in this order.
mt = mt.annotate_rows(sortedTranscriptConsequences=
                      get_expr_for_vep_sorted_transcript_consequences_array(
                          vep_root=mt.vep))

mt = mt.annotate_rows(
    main_transcript=
    get_expr_for_worst_transcript_consequence_annotations_struct(
        vep_sorted_transcript_consequences_root=mt.sortedTranscriptConsequences
    ))

mt = mt.annotate_rows(gene_ids=get_expr_for_vep_gene_ids_set(
    vep_transcript_consequences_root=mt.sortedTranscriptConsequences), )
    "/clinvar/clinvar.ht",
    {"clinvar_xml_path": pipeline.get_task("download_clinvar_xml")},
)

# GRCh37 ClinVar tasks. Each add_task call passes a task name, a callable, an
# output path and a dict of inputs taken from other tasks; the first call also
# passes a dict of extra keyword values.
# NOTE(review): argument meanings are inferred from the call shape - confirm
# against the pipeline's add_task definition.
pipeline.add_task(
    "prepare_clinvar_grch37_variants",
    prepare_clinvar_variants,
    "/clinvar/clinvar_grch37_base.ht",
    {"clinvar_path": pipeline.get_task("import_clinvar_xml")},
    {"reference_genome": "GRCh37"},
)

# Run VEP (no explicit config) on the prepared table and drop the
# "vep_proc_id" field from the result.
pipeline.add_task(
    "vep_clinvar_grch37_variants",
    # tolerate_parse_error to ignore not a number error from "NaN" gene symbol
    lambda path: hl.vep(hl.read_table(path), tolerate_parse_error=True).drop("vep_proc_id"),
    "/clinvar/clinvar_grch37_vepped.ht",
    {"path": pipeline.get_task("prepare_clinvar_grch37_variants")},
)

# Transcript-consequence annotation; takes GRCh37 transcripts from the genes
# pipeline as an additional input.
pipeline.add_task(
    "annotate_clinvar_grch37_transcript_consequences",
    annotate_transcript_consequences,
    "/clinvar/clinvar_grch37_annotated_1.ht",
    {
        "variants_path": pipeline.get_task("vep_clinvar_grch37_variants"),
        "transcripts_path": genes_pipeline.get_task("extract_grch37_transcripts"),
    },
)

pipeline.add_task(
Example #16
0
            (snp2_con == "stop_retained_variant")):
            return "gained_stop_loss"
        else:
            return ("Unchanged")
    elif mnv_con == "stop_retained_variant":  #this case, by definition one of the variant is stop_lost, and the other is stop_retained
        return ("Rescued stop loss")
    else:
        return ("Noncoding_or_else")


# Read the MNV table from the path given as the first command-line argument.
mnv = hl.read_table(sys.argv[1])

# Annotate SNV effects. VEP is run twice, once per SNV of each pair: first with
# the table keyed by (locus, alleles), stored as `snp2_vep`, then again after
# the prev_* columns have been renamed to locus/alleles, stored as `snp1_vep`.
mnv = mnv.key_by("locus", "alleles")
mnv = hl.vep(mnv, vep_config, name="snp2_vep")
mnv = mnv.key_by()  # unkey first so the key fields can be renamed
mnv = mnv.rename({
    'locus': 'snp2_locus',
    'alleles': 'snp2_alleles',
    "prev_locus": "locus",
    "prev_alleles": "alleles"
})  # rename the snp1 locus and alleles as locus, alleles
mnv = mnv.key_by('locus', 'alleles')  # and re-key
mnv = hl.vep(mnv, vep_config, name="snp1_vep")  # and vep
#annotate MNV effects, specified by the distance
if sys.argv[2] == 1:
    t = mnv.filter(mnv.dist == 1)
    t = t.annotate(refs=t.alleles[0] + t.snp2_alleles[0],
                   alts=t.alleles[1] +
import hail as hl

GNOMAD_CHR22_FIRST_1000 = "gs://hail-us-vep/vep_examplars/gnomad3_chr22_first_1000.mt"

# Run VEP over the row keys of each exemplar and force evaluation. The result
# is not compared with the stored exemplar.
for path, csq in [(GNOMAD_CHR22_FIRST_1000, False)]:
    print(f"Checking 'hl.vep' replicates on '{path}'")
    expected = hl.read_matrix_table(path)
    actual = hl.vep(
        expected.rows().select(),
        'gs://hail-us-vep/vep95-GRCh38-loftee-gcloud.json',
        csq=csq,
    )
    actual._force_count()
Example #18
0
import hail as hl

# Read in the MatrixTable
mt = hl.read_matrix_table('table.mt')

# Apply VEP
vep = hl.vep(mt, "vep85-loftee-ruddle-b38.json")

# Write the MatrixTable (you don't want to re-apply VEP every time, it takes ~1 hour)
vep.write('vep_matrixtable.mt')

# Continue from the written copy so the queries below read the stored
# annotations instead of triggering VEP again.
vep = hl.read_matrix_table('vep_matrixtable.mt')

# Get SNVs. The filter has to run on the annotated table: the input `mt` was
# never annotated, so it has no `vep` row field.
snvs = vep.filter_rows(vep.vep.variant_class == "SNV")

# Filter for specific consequence_terms
# NOTE(review): `consequence_terms` is accessed through the
# `transcript_consequences` array, so each contains([...]) below presumably
# matches only a transcript whose term list is exactly that single term -
# confirm this is the intended test.
consequence_terms = snvs.vep.transcript_consequences.consequence_terms
snvs.filter_rows(
    consequence_terms.contains(["missense_variant"])
    | consequence_terms.contains(["splice_acceptor_variant"])
    | consequence_terms.contains(["splice_donor_variant"])
    | consequence_terms.contains(["splice_region_variant"])
    | consequence_terms.contains(["start_lost"])
    | consequence_terms.contains(["stop_gained"])
    | consequence_terms.contains(["stop_lost"])).show()

# Filter for rows that do NOT have those consequence_terms
                mt1 = matrix_tables[i]
                next_mt = matrix_tables[i+1]

                if i == 0:  # if first, combing first two MTs
                    mt = mt1.union_cols(next_mt)
                else:  # Else combine the combined MT with the next MT
                    mt = mt.union_cols(next_mt)

        logging.info('Joined count: ' + str(mt.count()))

    logging.info('Splitting multiallelic variants')
    mt_split = hl.split_multi_hts(mt)
    logging.info('Split count: ' + str(mt_split.count()))

    logging.info('VEP annotating dataset.')
    mt_vep = hl.vep(mt_split, args.vep_config)

    if args.out_file is None:
        out_name = vcf_files[0]
    else:
        out_name = args.out_file

    if args.test:
        out_name = out_name + "_test"

    logging.info('Writing matrix table to bucket.')
    mt_vep.write(os.path.join(args.data_dir, out_name))

    logging.info('Successfully completed import and VEP annotation. Copying logs to bucket and shutting down in 10 min.')
    h.copy_logs_output(log_dir, timestr=timestr, log_file=log_file, plot_dir=args.data_dir)
    AN=mt_split.info.AN,
    EAS_AF=mt_split.info.EAS_AF[mt_split.a_index - 1],
    EUR_AF=mt_split.info.EUR_AF[mt_split.a_index - 1],
    AFR_AF=mt_split.info.AFR_AF[mt_split.a_index - 1],
    AMR_AF=mt_split.info.AMR_AF[mt_split.a_index - 1],
    SAS_AF=mt_split.info.SAS_AF[mt_split.a_index - 1],
    VT=(hl.case().when((mt_split.alleles[0].length() == 1)
                       & (mt_split.alleles[1].length() == 1), 'SNP').when(
                           mt_split.alleles[0].matches('<CN*>')
                           | mt_split.alleles[1].matches('<CN*>'),
                           'SV').default('INDEL')),
    EX_TARGET=mt_split.info.EX_TARGET,
    MULTI_ALLELIC=mt_split.info.MULTI_ALLELIC,
    DP=mt_split.info.DP))
# Print the schema, then drop the three fields that are no longer needed.
mt_split.describe()
mt_split = mt_split.drop('old_locus', 'old_alleles', 'a_index')

# Attach per-sample metadata by looking each sample ID up in ht_samples.
mt_split = mt_split.annotate_cols(
    sex=ht_samples[mt_split.s].gender,
    super_population=ht_samples[mt_split.s].super_pop,
    population=ht_samples[mt_split.s].pop)

# Sample QC, variant QC and VEP annotations, in that order.
mt_split = hl.sample_qc(mt_split)
mt_split = hl.variant_qc(mt_split)
mt_split = hl.vep(mt_split, 'gs://hail-common/vep/vep/vep85-gcloud.json')

# Print the final schema and write the table, overwriting any existing output.
mt_split.describe()
mt_split.write(
    'gs://hail-datasets/hail-data/1000_genomes_phase3_chrY.GRCh37.mt',
    overwrite=True)
import hail as hl
hl.init()

from hail.plot import show
from pprint import pprint
hl.plot.output_notebook()

# Input MatrixTable; only its rows table is needed for annotation.
MT = 'gs://raw_data_bipolar_dalio_w1_w2_hail_02/bipolar_wes_dalio_W1_W2/filterGT_GRCh38_6_multi.mt'

ht = hl.read_matrix_table(MT).rows()

# Annotate the rows with VEP and write the result, overwriting any existing table.
ht_vep = hl.vep(ht, "gs://hail-common/vep/vep/vep95-GRCh38-loftee-gcloud.json")
ht_vep.write(
    "gs://dalio_bipolar_w1_w2_hail_02/data/annotations/vep_annotate.ht",
    overwrite=True)
Example #22
0
def vep_or_lookup_vep(ht,
                      reference_vep_ht=None,
                      reference=None,
                      vep_config_path=None,
                      vep_version=None):
    """
    VEP a table, or lookup variants in a reference database

    Rows found in the reference table take its ``vep`` annotation; rows that
    are not found are passed to ``hl.vep``. When no VEPed context Table exists
    for the requested version, the whole input is passed to ``hl.vep`` instead.

    .. warning::
        If `reference_vep_ht` is supplied, no check is performed to confirm `reference_vep_ht` was
        generated with the same version of VEP / VEP configuration as the VEP referenced in `vep_config_path`.

    :param ht: Input Table
    :param reference_vep_ht: A reference database with VEP annotations (must be in top-level `vep`)
    :param reference: If reference_vep_ht is not specified, find a suitable one in reference (if None, grabs from hl.default_reference)
    :param vep_config_path: vep_config to pass to hl.vep (if None, a suitable one for `reference` is chosen)
    :param vep_version: Version of VEPed context Table to use (if None, the default `vep_context` resource will be used)
    :return: VEPed Table
    :raises ValueError: if no reference table is given and `reference` is not in POSSIBLE_REFS
    :raises AssertionError: if the context Table's VEP help text or config differs from the current one
    """
    reference = hl.default_reference().name if reference is None else reference
    vep_config_path = VEP_CONFIG_PATH if vep_config_path is None else vep_config_path

    # The help text and raw config are read up front; they are only compared
    # when the context Table is looked up below.
    vep_help = get_vep_help(vep_config_path)
    with hl.hadoop_open(vep_config_path) as config_file:
        vep_config = config_file.read()

    if reference_vep_ht is None:
        if reference not in POSSIBLE_REFS:
            raise ValueError(
                f'vep_or_lookup_vep got {reference}. Expected one of {", ".join(POSSIBLE_REFS)}'
            )

        context = get_vep_context(reference)
        version = context.default_version if vep_version is None else vep_version

        if version not in context.versions:
            # Nothing to look up against: annotate every row with VEP directly.
            logger.warning(
                f"No VEPed context Table available for genome build {reference} and VEP version {version}, "
                f"all variants will be VEPed using the following VEP:\n{vep_help}"
            )
            return hl.vep(ht, vep_config_path)

        logger.info(
            f"Using VEPed context Table from genome build {reference} and VEP version {version}"
        )

        reference_vep_ht = context.versions[version].ht()
        context_help = hl.eval(reference_vep_ht.vep_help)
        context_config = hl.eval(reference_vep_ht.vep_config)

        assert vep_help == context_help, (
            f"The VEP context HT version does not match the version referenced in the VEP config file."
            f"\nVEP context:\n{context_help}\n\n VEP config:\n{vep_help}")

        assert vep_config == context_config, (
            f"The VEP context HT configuration does not match the configuration in {vep_config_path}."
            f"\nVEP context:\n{context_config}\n\n Current config:\n{vep_config}"
        )

    looked_up = ht.annotate(vep=reference_vep_ht[ht.key].vep)
    already_annotated = looked_up.filter(hl.is_defined(looked_up.vep))
    needs_vep = looked_up.filter(hl.is_missing(looked_up.vep))

    return already_annotated.union(hl.vep(needs_vep, vep_config_path))
Example #23
0
import hail as hl

GOLD_STD = 'gs://hail-common/vep/vep/vep_examplars/vep_no_csq_35d9e30.mt/'
GOLD_STD_CSQ = 'gs://hail-common/vep/vep/vep_examplars/vep_csq_23673e70.mt/'

# Re-run VEP on each stored exemplar (without and with csq=True) and require
# that the fresh annotation is the same as the stored table.
for path, csq in [(GOLD_STD, False), (GOLD_STD_CSQ, True)]:
    print(f"Checking 'hl.vep' replicates on '{path}'")
    expected = hl.read_matrix_table(path)
    # select_rows() with no arguments strips the stored row fields before re-annotating.
    actual = hl.vep(
        expected.select_rows(),
        'gs://hail-common/vep/vep/vep85-loftee-gcloud.json',
        csq=csq,
    )
    vep_result_agrees = actual._same(expected)
    print('TEST PASSED' if vep_result_agrees else 'TEST FAILED')
    assert vep_result_agrees
Example #24
0
import hail as hl

GOLD_STD = 'gs://hail-us-vep/vep_examplars/vep_no_csq_4dc19bc1b.mt/'
GOLD_STD_CSQ = 'gs://hail-us-vep/vep_examplars/vep_csq_4dc19bc1b.mt/'

# Run VEP over the row keys of each exemplar (without and with csq=True) and
# force evaluation. The result is not compared with the stored exemplar.
for path, csq in [(GOLD_STD, False), (GOLD_STD_CSQ, True)]:
    print(f"Checking 'hl.vep' replicates on '{path}'")
    expected = hl.read_matrix_table(path)
    actual = hl.vep(
        expected.rows().select(),
        'gs://hail-us-vep/vep85-loftee-gcloud-testing.json',
        csq=csq,
    )
    actual._force_count()
Example #25
0
import hail.expr.aggregators as agg
from bokeh.plotting import figure, output_file
import numpy as np


hl.init(default_reference='GRCh38', min_block_size=6)

# Annotations: gsutil -m cp /medpop/esp2/mzekavat/CHIP/CHUD/data/variant_annot/somVariants.txt.bgz gs://maryam_lipids/UKBB_CHIP/somVariants.txt.bgz

# The file has no header, so the single variant column is named f0.
kt = hl.import_table('gs://maryam_lipids/UKBB_CHIP/somVariants.txt.bgz',
                     impute=True,
                     min_partitions=2000,
                     no_header=True)
# Key by the fields that parse_variant extracts from f0 (the closing
# parenthesis of this call was missing, which made the script unparsable).
kt2 = kt.key_by(**hl.parse_variant(kt.f0))
kt2.describe()
kt2.write('gs://maryam_lipids/UKBB_CHIP/all_somatic_var_list.ht')

# Re-read the written table, repartition it and annotate it with VEP.
kt2 = hl.read_table('gs://maryam_lipids/UKBB_CHIP/all_somatic_var_list.ht').repartition(1000)
kt2 = hl.vep(kt2, 'gs://hail-us-vep/vep95-GRCh38-loftee-gcloud.json')

consequence_in_severity_order = [
  "transcript_ablation"
, "splice_acceptor_variant"
, "splice_donor_variant"
, "stop_gained"
, "frameshift_variant"
, "stop_lost"
, "start_lost"
, "transcript_amplification"
, "inframe_insertion"
, "inframe_deletion"
, "missense_variant"
, "protein_altering_variant"
, "splice_region_variant"
        & (r_de_novo_mt.father.DP >= args.min_parent_dp)
        & (r_de_novo_mt.AF[0] > args.min_aaf)
        & (r_de_novo_mt.AF[0] < 1 - args.min_aaf))
    dnm_ht = dnm.key_cols_by().entries()
    dnm_ht = dnm_ht.filter((dnm_ht.alleles[0].length() == 1)
                           & (dnm_ht.alleles[1].length() == 1))
    dnm_ht = dnm_ht.filter((dnm_ht.father.AD[1] <= args.max_parent_alleles)
                           & (dnm_ht.mother.AD[1] <= args.max_parent_alleles))
    if args.dnm_checkpoint is not None:
        dnm_ht = dnm_ht.checkpoint(args.dnm_checkpoint)
else:
    dnm_ht = hl.read_table(args.dnm_checkpoint)

# Annotate the de novo table with VEP, or reuse an existing checkpoint.
# VEP is run when no checkpoint path was given or the path does not exist;
# the result is checkpointed only when a path was given.
# NOTE(review): os.path.exists checks the local filesystem - if checkpoints
# live in cloud storage this may never find them; confirm.
logger.info("Annotating data using vep...")
if args.vep_checkpoint is None or not os.path.exists(args.vep_checkpoint):
    vep_dnm_ht = hl.vep(dnm_ht, config=args.vep_config, name="vep", csq=False)
    if args.vep_checkpoint is not None:
        vep_dnm_ht = vep_dnm_ht.checkpoint(args.vep_checkpoint, overwrite=True)
else:
    vep_dnm_ht = hl.read_table(args.vep_checkpoint)


def max_if_defined(expr):
    """Return ``hl.nanmax`` of `expr`, using ``[.0]`` in its place when `expr` is missing.

    :param expr: Hail collection expression to take the maximum of
    :return: result of ``hl.nanmax`` over `expr` or over the ``[.0]`` fallback
    """
    fallback = [.0]
    values = hl.if_else(hl.is_defined(expr), expr, fallback)
    return hl.nanmax(values)

vep_dnm_ht = vep_dnm_ht.annotate(
    gene_symbol=hl.set(vep_dnm_ht.vep.transcript_consequences.gene_symbol),
            denovos = hl.read_table(file_path)

    else:
        if not file_exists(file_path):
            print(f"{data_label}: Generating {file_path}")
            denovos = compute_samocha_denovos(mt, pedigree)
            denovos = denovos.checkpoint(file_path, overwrite=True, _read_if_exists=not force)
        else:
            print(f"Reading table {file_path}")
            denovos = hl.read_table(file_path)

    file_path = os.path.join(BASE_DIR, f"{data_label}.{algo_label}_denovos.vep.ht")
    if not file_exists(file_path):
        print(f"{data_label}: Generating {file_path}")
        denovos = denovos.key_by('locus', 'alleles')
        denovos = hl.vep(denovos, "file:///vep_data/vep-gcloud.json", name="vep", block_size=100)

        denovos = denovos.checkpoint(file_path, overwrite=True, _read_if_exists=not force)
    else:
        print(f"Reading table {file_path}")
        denovos = hl.read_table(file_path)

    file_path = os.path.join(BASE_DIR, f"{data_label}.{algo_label}_de_novos_table.tsv")
    print(f"{data_label}: annotate and export {file_path}")

    denovos = denovos.annotate(sorted_transcript_consequences = get_expr_for_vep_sorted_transcript_consequences_array(denovos.vep))
    denovos = denovos.annotate(transcript_consequence_terms = get_expr_for_vep_consequence_terms(denovos.sorted_transcript_consequences))
    denovos = denovos.annotate(transcript_consequence_categories = denovos.sorted_transcript_consequences.map(lambda c: c.category))
    denovos = denovos.annotate(transcript_consequence = hl.cond(hl.len(denovos.transcript_consequence_terms) > 0, denovos.transcript_consequence_terms[0], "other"))
    denovos = denovos.annotate(transcript_consequence_category = hl.cond(hl.len(denovos.transcript_consequence_categories) > 0, denovos.transcript_consequence_categories[0], "other"))
Example #28
0
import hail as hl

# Read the genotype MatrixTable, annotate it with VEP and print the resulting schema.
an_gen = hl.read_matrix_table('gs://phenotype_31063/hail/ukb31063.genotype.mt')
an_gen = hl.vep(an_gen, 'gs://hail-common/vep/vep/vep85-loftee-gcloud.json')
an_gen.describe()
# Write to a bucket, overwriting any existing output.
an_gen.write('gs://rec_project/genotype_annotations.mt', overwrite=True)
# Now we need --requester-pays-allow-all --vep GRCh37 when starting the cluster
# hailctl dataproc start dp --vep GRCh37 --requester-pays-allow-all --region us-central1

# Ensure that the variant list is moved to the cloud
# gsutil cp ~/Repositories/BipEx/BSC_variant_data/BSC_BipEx_gene_variants.v2.txt gs://dalio_bipolar_w1_w2_hail_02/data/BSC_variants/
# gsutil cp ~/Repositories/BipEx/BSC_data/variant_data/BSC_BipEx_gene_variants.v3.20201018.txt gs://dalio_bipolar_w1_w2_hail_02/data/BSC_variants/ <- New file with updated definition based on SCHEMA thresholds, MAC <= 5 not in gnomAD top 10.

# Read in the file, and annotate using VEP...making sure that the reference file is build 37.

# Import the headerless variant list; columns arrive as f0, f1, ... and are
# renamed below (f1, the chromosome column, is handled separately).
ht = hl.import_table(  #'gs://dalio_bipolar_w1_w2_hail_02/data/BSC_variants/BSC_BipEx_gene_variants.v2.txt',
    'gs://dalio_bipolar_w1_w2_hail_02/data/BSC_variants/BSC_BipEx_gene_variants.v3.20201018.txt',
    impute=True,
    no_header=True)
ht = ht.rename({'f0': 'gene', 'f2': 'position', 'f3': 'ref', 'f4': 'alt'})
# Remove the 'chr' prefix from the chromosome column before building GRCh37 loci.
ht = ht.annotate(chr=ht.f1.replace('chr', ''))
# Create locus and alleles.
ht = ht.annotate(locus=hl.locus(ht.chr, ht.position,
                                reference_genome='GRCh37'),
                 alleles=[ht.ref, ht.alt])
# Create a compound row key to allow us to annotate.
ht = ht.key_by(ht.locus, ht.alleles)

# Note that the location of the loftee file changed
# (check https://hail.is/docs/0.2/methods/genetics.html?highlight=vep#hail.methods.vep for most up to date location).
ht_vep = hl.vep(ht, "gs://hail-us-vep/vep85-loftee-gcloud.json")
# Write the annotated table, overwriting any existing output.
ht_vep.write(
    "gs://dalio_bipolar_w1_w2_hail_02/data/annotations/bsc_variants_vep_annotate.ht",
    overwrite=True)