Ejemplo n.º 1
0
def prepare_clinvar_variants(vcf_path, reference_genome):
    """Import ClinVar variants from a VCF and shape them for the browser.

    :param vcf_path: Path to the ClinVar VCF.
    :param reference_genome: Reference genome name passed through to the import.
    :return: Table with clinical significance, review status, gold stars, VEP
        annotations, and derived ID/position fields.
    """

    def _sorted_stripped(values):
        # ClinVar prefixes some terms with "_"; sort those last (by mapping a
        # leading "_" to "z" in the sort key) and then strip the underscore.
        ordered = hl.sorted(values, key=lambda term: term.replace("^_", "z"))
        return ordered.map(lambda term: term.replace("^_", ""))

    ds = import_clinvar_vcf(vcf_path, reference_genome)

    # Some variants have only one entry in alleles; ignore them for now. These
    # could be displayed in the ClinVar track even though they will never
    # match a gnomAD variant.
    ds = ds.filter(hl.len(ds.alleles) == 2)

    ds = hl.vep(ds)

    ds = ds.select(
        clinical_significance=_sorted_stripped(ds.info.CLNSIG),
        clinvar_variation_id=ds.rsid,
        gold_stars=get_gold_stars(ds.info.CLNREVSTAT),
        review_status=_sorted_stripped(ds.info.CLNREVSTAT),
        vep=ds.vep,
    )

    ds = ds.annotate(
        chrom=normalized_contig(ds.locus.contig),
        variant_id=variant_id(ds.locus, ds.alleles),
        xpos=x_position(ds.locus),
    )

    return ds
Ejemplo n.º 2
0
def prepare_clinvar_variants(clinvar_path, reference_genome):
    """Extract ClinVar variants for one reference genome from a combined table.

    :param clinvar_path: Path to a Hail table with per-genome locus/alleles
        fields (``locus_<genome>`` / ``alleles_<genome>``) and a ``variant``
        struct.
    :param reference_genome: Which genome build's coordinates to select.
    :return: Table keyed by (locus, alleles) with derived ID/position fields.
    """
    ds = hl.read_table(clinvar_path)

    locus_field = f"locus_{reference_genome}"
    alleles_field = f"alleles_{reference_genome}"

    # Keep only variants that have coordinates on the requested genome build.
    ds = ds.filter(hl.is_defined(ds[locus_field]) & hl.is_defined(ds[alleles_field]))

    ds = ds.select(locus=ds[locus_field], alleles=ds[alleles_field], **ds.variant)

    # Remove any variants with alleles other than ACGT.
    allele_characters = hl.set(hl.delimit(ds.alleles, "").split(""))
    ds = ds.filter(hl.len(allele_characters.difference(hl.set(["A", "C", "G", "T", ""]))) == 0)

    ds = ds.annotate(
        variant_id=variant_id(ds.locus, ds.alleles),
        chrom=normalized_contig(ds.locus.contig),
        pos=ds.locus.position,
        ref=ds.alleles[0],
        alt=ds.alleles[1],
    )

    return ds.key_by("locus", "alleles")
Ejemplo n.º 3
0
def prepare_mitochondrial_variants(path, mnvs_path=None):
    """Shape the gnomAD mitochondrial variants Hail table for the browser.

    :param path: Path to the mitochondrial variants Hail table.
    :param mnvs_path: Optional path to a TSV of multi-nucleotide variants with
        pos/ref/alt/AC_hom_MNV columns; when given, matching variants get an
        ``ac_hom_mnv`` count and an "mnv" flag.
    :return: Table with one selected row struct per variant.
    """
    ds = hl.read_table(path)

    # Evaluated eagerly so the haplogroup list can be iterated in Python below.
    haplogroups = hl.eval(ds.globals.hap_order)

    # Round heteroplasmy histogram bin edges to two decimal places.
    ds = ds.annotate(hl_hist=ds.hl_hist.annotate(
        bin_edges=ds.hl_hist.bin_edges.map(
            lambda n: hl.float(hl.format("%.2f", n)))))

    # Human-readable labels for site filters; unknown filters pass through as-is.
    filter_names = hl.dict({
        "artifact_prone_site": "Artifact-prone site",
        "indel_stack": "Indel stack",
        "npg": "No passing genotype"
    })

    ds = ds.select(
        # ID
        variant_id=variant_id(ds.locus, ds.alleles),
        reference_genome=ds.locus.dtype.reference_genome.name,
        chrom=normalized_contig(ds.locus.contig),
        pos=ds.locus.position,
        ref=ds.alleles[0],
        alt=ds.alleles[1],
        rsid=ds.rsid,
        # Quality
        filters=ds.filters.map(lambda f: filter_names.get(f, f)),
        qual=ds.qual,
        genotype_quality_metrics=[
            hl.struct(name="Depth", alt=ds.dp_hist_alt, all=ds.dp_hist_all)
        ],
        # NOTE(review): all of these histograms reuse the heteroplasmy
        # bin edges (ds.hl_hist.bin_edges) — confirm the *_hist arrays are
        # binned on the same edges.
        genotype_quality_filters=[
            hl.struct(
                name="Base Quality",
                filtered=hl.struct(bin_edges=ds.hl_hist.bin_edges,
                                   bin_freq=ds.base_qual_hist),
            ),
            hl.struct(
                name="Contamination",
                filtered=hl.struct(bin_edges=ds.hl_hist.bin_edges,
                                   bin_freq=ds.contamination_hist),
            ),
            hl.struct(
                name="Heteroplasmy below 10%",
                filtered=hl.struct(
                    bin_edges=ds.hl_hist.bin_edges,
                    bin_freq=ds.heteroplasmy_below_10_percent_hist),
            ),
            hl.struct(name="Position",
                      filtered=hl.struct(bin_edges=ds.hl_hist.bin_edges,
                                         bin_freq=ds.position_hist)),
            hl.struct(
                name="Strand Bias",
                filtered=hl.struct(bin_edges=ds.hl_hist.bin_edges,
                                   bin_freq=ds.strand_bias_hist),
            ),
            hl.struct(
                name="Weak Evidence",
                filtered=hl.struct(bin_edges=ds.hl_hist.bin_edges,
                                   bin_freq=ds.weak_evidence_hist),
            ),
        ],
        site_quality_metrics=[
            hl.struct(name="Mean Depth", value=nullify_nan(ds.dp_mean)),
            hl.struct(name="Mean MQ", value=nullify_nan(ds.mq_mean)),
            hl.struct(name="Mean TLOD", value=nullify_nan(ds.tlod_mean)),
        ],
        # Frequency
        an=ds.AN,
        ac_hom=ds.AC_hom,
        ac_het=ds.AC_het,
        excluded_ac=ds.excluded_AC,
        # Heteroplasmy
        common_low_heteroplasmy=ds.common_low_heteroplasmy,
        heteroplasmy_distribution=ds.hl_hist,
        max_heteroplasmy=ds.max_hl,
        # Populations: one struct per entry in the pop_order global, sorted by
        # population id.
        populations=hl.sorted(
            hl.range(hl.len(
                ds.globals.pop_order)).map(lambda pop_index: hl.struct(
                    id=ds.globals.pop_order[pop_index],
                    an=ds.pop_AN[pop_index],
                    ac_het=ds.pop_AC_het[pop_index],
                    ac_hom=ds.pop_AC_hom[pop_index],
                    heteroplasmy_distribution=hl.struct(
                        bin_edges=ds.hl_hist.bin_edges,
                        bin_freq=ds.pop_hl_hist[pop_index],
                        n_smaller=0,
                        n_larger=0,
                    ),
                )),
            key=lambda pop: pop.id,
        ),
        # Haplogroups
        hapmax_af_hom=ds.hapmax_AF_hom,
        hapmax_af_het=ds.hapmax_AF_het,
        faf_hapmax_hom=ds.faf_hapmax_hom,
        haplogroup_defining=ds.hap_defining_variant,
        # One struct per haplogroup, in the order given by the hap_order global.
        haplogroups=[
            hl.struct(
                id=haplogroup,
                an=ds.hap_AN[i],
                ac_het=ds.hap_AC_het[i],
                ac_hom=ds.hap_AC_hom[i],
                faf_hom=ds.hap_faf_hom[i],
                heteroplasmy_distribution=ds.hap_hl_hist[i],
            ) for i, haplogroup in enumerate(haplogroups)
        ],
        # Other
        age_distribution=hl.struct(het=ds.age_hist_het, hom=ds.age_hist_hom),
        flags=hl.set([
            hl.or_missing(ds.common_low_heteroplasmy,
                          "common_low_heteroplasmy")
        ]).filter(hl.is_defined),
        mitotip_score=ds.mitotip_score,
        mitotip_trna_prediction=ds.mitotip_trna_prediction,
        pon_ml_probability_of_pathogenicity=ds.
        pon_ml_probability_of_pathogenicity,
        pon_mt_trna_prediction=ds.pon_mt_trna_prediction,
        variant_collapsed=ds.variant_collapsed,
        vep=ds.vep,
    )

    if mnvs_path:
        # Join in homozygous MNV counts keyed by (chrM locus, alleles); flag
        # variants with any MNV homozygotes.
        mnvs = hl.import_table(mnvs_path,
                               types={
                                   "pos": hl.tint,
                                   "ref": hl.tstr,
                                   "alt": hl.tstr,
                                   "AC_hom_MNV": hl.tint
                               })
        mnvs = mnvs.key_by(
            locus=hl.locus("chrM",
                           mnvs.pos,
                           reference_genome=ds.locus.dtype.reference_genome),
            alleles=[mnvs.ref, mnvs.alt],
        )
        ds = ds.annotate(ac_hom_mnv=hl.or_else(mnvs[ds.key].AC_hom_MNV, 0))
        ds = ds.annotate(
            flags=hl.if_else(ds.ac_hom_mnv > 0, ds.flags.add("mnv"), ds.flags))

    return ds
def import_gnomad_v2_lof_curation_results(curation_result_paths, genes_path):
    """Combine gnomAD v2 LoF curation result CSVs into one Hail table.

    Files are read in order; when the same variant/gene pair was curated in
    more than one project, the result from the earliest path wins (selected
    by lowest ``project_index`` below).

    :param curation_result_paths: Ordered list of paths to curation CSVs.
        Each file has a "Variant ID" column (chrom-pos-ref-alt), a "Gene"
        column (semicolon-separated "gene_id:gene_symbol" pairs), a
        "Verdict" column, and one "Flag <name>" column per flag.
    :param genes_path: Path to a Hail table keyed by gene ID with ``symbol``
        and ``gene_version`` fields.
    :return: Table keyed by (locus, alleles) with a ``lof_curations`` array
        (one element per curated gene) and ``variant_id``.
    """
    all_flags = set()

    # Flatten all input CSVs into a single temporary TSV for hl.import_table.
    with hl.hadoop_open("/tmp/import_temp.tsv", "w") as temp_output_file:
        writer = csv.writer(temp_output_file, delimiter="\t", quotechar='"')
        writer.writerow(["chrom", "position", "ref", "alt", "genes", "verdict", "flags", "project_index"])

        for project_index, path in enumerate(curation_result_paths):
            with hl.hadoop_open(path, "r") as input_file:
                reader = csv.DictReader(input_file)

                # Flag columns are named "Flag <name>". Slice off the prefix
                # rather than using str.lstrip("Flag "): lstrip strips any
                # leading run of the characters {F, l, a, g, space}, so it
                # would mangle flag names whose first character is in that
                # set (e.g. "Flag genotyping error" -> "enotyping error").
                flag_prefix = "Flag "
                raw_dataset_flags = [f[len(flag_prefix) :] for f in reader.fieldnames if f.startswith(flag_prefix)]

                dataset_flags = [FLAG_MAPPING.get(f, f) for f in raw_dataset_flags]

                all_flags = all_flags.union(set(dataset_flags))

                for row in reader:
                    [chrom, pos, ref, alt] = row["Variant ID"].split("-")

                    variant_flags = [FLAG_MAPPING.get(f, f) for f in raw_dataset_flags if row[f"Flag {f}"] == "TRUE"]

                    genes = [gene_id for (gene_id, gene_symbol) in (gene.split(":") for gene in row["Gene"].split(";"))]

                    verdict = row["Verdict"]

                    # Correct a known typo in the source data.
                    if verdict == "inufficient_evidence":
                        verdict = "insufficient_evidence"

                    verdict = VERDICT_MAPPING[verdict]

                    output_row = [
                        chrom,
                        pos,
                        ref,
                        alt,
                        ",".join(genes),
                        verdict,
                        ",".join(variant_flags),
                        project_index,
                    ]

                    writer.writerow(output_row)

    ds = hl.import_table("/tmp/import_temp.tsv")

    ds = ds.transmute(locus=hl.locus(ds.chrom, hl.int(ds.position)), alleles=[ds.ref, ds.alt])

    ds = ds.annotate(
        genes=ds.genes.split(","),
        flags=hl.set(hl.if_else(ds.flags == "", hl.empty_array(hl.tstr), ds.flags.split(","))),
    )

    # One row per (variant, gene).
    ds = ds.explode(ds.genes, name="gene_id")

    genes = hl.read_table(genes_path)
    ds = ds.annotate(gene_symbol=genes[ds.gene_id].symbol, gene_version=genes[ds.gene_id].gene_version)

    # Keep only the result with the lowest project_index for each variant/gene pair.
    ds = ds.group_by(ds.locus, ds.alleles, ds.gene_id).aggregate(
        result=hl.agg.take(ds.row.drop("locus", "alleles", "gene_id"), 1, ds.project_index)
    )

    ds = ds.annotate(**ds.result[0]).drop("result", "project_index")

    ds = ds.group_by("locus", "alleles").aggregate(lof_curations=hl.agg.collect(ds.row.drop("locus", "alleles")))

    ds = ds.annotate(variant_id=variant_id(ds.locus, ds.alleles))

    # Log the union of flags seen across all projects.
    for flag in sorted(all_flags):
        print(flag)

    return ds
def prepare_gnomad_v2_variants(exome_variants_path, genome_variants_path):
    """Combine gnomAD v2 exome and genome variant tables into one table.

    :param exome_variants_path: Path to the exome variants table.
    :param genome_variants_path: Path to the genome variants table.
    :return: Outer-joined table with top-level shared fields, derived
        ID/position fields, flags, and per-subset colocated variant IDs.
    """
    exome_variants = prepare_gnomad_v2_variants_helper(exome_variants_path, "exome")
    genome_variants = prepare_gnomad_v2_variants_helper(genome_variants_path, "genome")

    # Fields present in both exome and genome structs; promoted to the top
    # level below, preferring the exome value when both are defined.
    shared_fields = [
        "lcr",
        "nonpar",
        "rsid",
        "segdup",
        "vep",
    ]

    variants = exome_variants.join(genome_variants, "outer")

    variants = variants.annotate(
        **{field: hl.or_else(variants.exome[field], variants.genome[field]) for field in shared_fields}
    )

    variants = variants.annotate(exome=variants.exome.drop(*shared_fields), genome=variants.genome.drop(*shared_fields))

    variants = variants.annotate(
        variant_id=variant_id(variants.locus, variants.alleles),
        reference_genome="GRCh37",
        chrom=normalized_contig(variants.locus.contig),
        pos=variants.locus.position,
        xpos=x_position(variants.locus),
        ref=variants.alleles[0],
        alt=variants.alleles[1],
    )

    # Single rsid becomes a (possibly missing) set of rsids.
    variants = variants.transmute(rsids=hl.or_missing(hl.is_defined(variants.rsid), hl.set([variants.rsid])))

    # Variant is in a subset if it is in the subset in either exome or genome samples
    variants = variants.annotate(subsets=variants.exome.subsets.union(variants.genome.subsets))

    # Flags
    # "par": on chrX/chrY and not in the non-pseudoautosomal region.
    variants = variants.annotate(
        flags=hl.set(
            [
                hl.or_missing(variants.lcr, "lcr"),
                hl.or_missing(((variants.chrom == "X") | (variants.chrom == "Y")) & ~variants.nonpar, "par"),
            ]
        ).filter(hl.is_defined)
    )

    # Colocated variants: for each subset, other variants at the same locus
    # with a nonzero raw AC in that subset (in exomes or genomes).
    variants = variants.cache()
    variants_by_locus = variants.select(
        variants.variant_id,
        exome_ac_raw=hl.struct(**{f: variants.exome.freq[f].ac_raw for f in variants.exome.freq.dtype.fields}),
        # NOTE(review): genome freq has no non_cancer entry, so the full
        # gnomAD raw AC is reused for it — confirm this is intended.
        genome_ac_raw=hl.struct(
            non_cancer=variants.genome.freq.gnomad.ac_raw,
            **{f: variants.genome.freq[f].ac_raw for f in variants.genome.freq.dtype.fields},
        ),
    )
    variants_by_locus = variants_by_locus.group_by("locus").aggregate(
        variants=hl.agg.collect(variants_by_locus.row_value)
    )

    def subset_filter(subset):
        # True when the variant has any raw allele count in the subset.
        return lambda variant: (variant.exome_ac_raw[subset] > 0) | (variant.genome_ac_raw[subset] > 0)

    variants_by_locus = variants_by_locus.annotate(
        variant_ids=hl.struct(
            **{
                subset: variants_by_locus.variants.filter(subset_filter(subset)).map(lambda variant: variant.variant_id)
                for subset in ["gnomad", "controls", "non_cancer", "non_neuro", "non_topmed"]
            }
        )
    )

    variants = variants.annotate(colocated_variants=variants_by_locus[variants.locus].variant_ids)
    # Exclude the variant itself from its own colocated lists.
    variants = variants.annotate(
        colocated_variants=hl.struct(
            **{
                subset: variants.colocated_variants[subset].filter(lambda variant_id: variant_id != variants.variant_id)
                for subset in ["gnomad", "controls", "non_cancer", "non_neuro", "non_topmed"]
            }
        )
    )

    return variants
Ejemplo n.º 6
0
def prepare_gnomad_v3_variants(path):
    """Shape the gnomAD v3 variants Hail table for the browser.

    :param path: Path to the gnomAD v3 variants Hail table.
    :return: Table with per-subset frequencies, FAF popmax, age distribution,
        quality metrics, flags, colocated variants, and annotations nested
        under a ``genome`` struct.
    """
    ds = hl.read_table(path)

    # Materialize globals (freq_meta, freq_index_dict, faf_index_dict) for
    # Python-side iteration below.
    g = hl.eval(ds.globals)

    # Subsets present in freq_meta; the full dataset appears as None.
    subsets = set(m.get("subset", None) for m in g.freq_meta)

    def freq(ds, *args, **kwargs):
        # Look up a frequency struct by subset/pop/sex/raw via the global
        # frequency index dict.
        return ds.freq[g.freq_index_dict[freq_index_key(*args, **kwargs)]]

    ############################
    # Derived top level fields #
    ############################

    ds = ds.annotate(variant_id=variant_id(ds.locus, ds.alleles))

    ######################
    # Colocated variants #
    ######################

    # For each subset, collect IDs of variants at the same locus with a
    # nonzero raw AC in that subset.
    variants_by_locus = ds.select(
        ds.variant_id,
        ac_raw=hl.struct(
            **{
                subset or "all": freq(ds, subset=subset, raw=True).AC
                for subset in subsets
            }),
    )
    variants_by_locus = variants_by_locus.group_by("locus").aggregate(
        variants=hl.agg.collect(variants_by_locus.row_value))

    def subset_filter(subset):
        # True when the variant has any raw allele count in the subset.
        return lambda variant: variant.ac_raw[subset] > 0

    variants_by_locus = variants_by_locus.annotate(variant_ids=hl.struct(
        **{
            subset or "all": variants_by_locus.variants.filter(
                subset_filter(subset or "all")).map(
                    lambda variant: variant.variant_id)
            for subset in subsets
        }))

    ds = ds.annotate(
        colocated_variants=variants_by_locus[ds.locus].variant_ids)
    # Exclude the variant itself from its own colocated lists.
    ds = ds.annotate(colocated_variants=hl.struct(
        **{
            subset: ds.colocated_variants[subset].filter(
                lambda variant_id: variant_id != ds.variant_id)
            for subset in ds.colocated_variants._fields
        }))

    ###############
    # Frequencies #
    ###############

    subset_populations = {}
    for subset in subsets:
        subset_populations[subset] = set(
            m.get("pop", None) for m in g.freq_meta
            if m.get("subset", None) == subset)

        subset_populations[subset].discard(None)

        # "global" population is used for downsamplings
        subset_populations[subset].discard("global")

    ds = ds.annotate(in_autosome_or_par=ds.locus.in_autosome_or_par())

    # Per-subset frequency structs; hemizygote counts are zero on autosomes
    # and in PAR, otherwise taken from the XY allele count.
    ds = ds.annotate(genome=hl.struct(freq=hl.struct(
        **{
            subset or "all": hl.struct(
                ac=freq(ds, subset=subset).AC,
                ac_raw=freq(ds, subset=subset, raw=True).AC,
                an=freq(ds, subset=subset).AN,
                hemizygote_count=hl.if_else(
                    ds.in_autosome_or_par,
                    0,
                    freq(ds, subset=subset, sex="XY").AC,
                ),
                homozygote_count=freq(ds, subset=subset).homozygote_count,
                # One entry per (population, sex) combination; sex=None gives
                # the population total (id is just the population).
                populations=[
                    hl.struct(
                        id="_".join(filter(bool, [pop, sex])),
                        ac=freq(ds, subset=subset, pop=pop, sex=sex).AC,
                        an=freq(ds, subset=subset, pop=pop, sex=sex).AN,
                        hemizygote_count=0 if sex == "XX" else hl.if_else(
                            ds.in_autosome_or_par,
                            0,
                            freq(ds, subset=subset, pop=pop, sex="XY").AC,
                        ),
                        homozygote_count=freq(
                            ds, subset=subset, pop=pop,
                            sex=sex).homozygote_count,
                    ) for pop, sex in itertools.product(
                        subset_populations[subset], [None, "XX", "XY"])
                ],
            )
            for subset in subsets
        })))

    ds = ds.drop("freq", "in_autosome_or_par")

    ##############################
    # Filtering allele frequency #
    ##############################

    # Populations of the full dataset (subset None) that have FAF entries.
    faf_populations = [
        pop for pop in subset_populations[None]
        if f"{pop}-adj" in g.faf_index_dict
    ]

    # Get popmax FAFs: highest FAF across populations, ties broken by
    # population id; missing struct when there are no FAF populations.
    ds = ds.annotate(genome=ds.genome.annotate(
        faf95=hl.rbind(
            hl.sorted(
                hl.array([
                    hl.struct(faf=ds.faf[g.faf_index_dict[f"{pop}-adj"]].faf95,
                              population=pop) for pop in faf_populations
                ]),
                key=lambda f: (-f.faf, f.population),
            ),
            lambda fafs: hl.if_else(
                hl.len(fafs) > 0,
                hl.struct(popmax=fafs[0].faf,
                          popmax_population=fafs[0].population),
                hl.struct(popmax=hl.null(hl.tfloat),
                          popmax_population=hl.null(hl.tstr)),
            ),
        ),
        faf99=hl.rbind(
            hl.sorted(
                hl.array([
                    hl.struct(faf=ds.faf[g.faf_index_dict[f"{pop}-adj"]].faf99,
                              population=pop) for pop in faf_populations
                ]),
                key=lambda f: (-f.faf, f.population),
            ),
            lambda fafs: hl.if_else(
                hl.len(fafs) > 0,
                hl.struct(popmax=fafs[0].faf,
                          popmax_population=fafs[0].population),
                hl.struct(popmax=hl.null(hl.tfloat),
                          popmax_population=hl.null(hl.tstr)),
            ),
        ),
    ))

    ds = ds.drop("faf")

    ####################
    # Age distribution #
    ####################

    ds = ds.annotate(genome=ds.genome.annotate(
        age_distribution=hl.struct(het=ds.age_hist_het, hom=ds.age_hist_hom)))

    ds = ds.drop("age_hist_het", "age_hist_hom")

    ###################
    # Quality metrics #
    ###################

    ds = ds.annotate(genome=ds.genome.annotate(
        filters=ds.filters,
        quality_metrics=hl.struct(
            # Round allele balance bin edges to three decimal places.
            allele_balance=hl.struct(alt=ds.qual_hists.ab_hist_alt.annotate(
                bin_edges=ds.qual_hists.ab_hist_alt.bin_edges.map(
                    lambda n: hl.float(hl.format("%.3f", n))))),
            genotype_depth=hl.struct(all=ds.qual_hists.dp_hist_all,
                                     alt=ds.qual_hists.dp_hist_alt),
            genotype_quality=hl.struct(all=ds.qual_hists.gq_hist_all,
                                       alt=ds.qual_hists.gq_hist_alt),
            site_quality_metrics=[
                hl.struct(metric="SiteQuality",
                          value=hl.float(nullify_nan(ds.info.QUALapprox)))
            ] + [
                hl.struct(metric=metric,
                          value=hl.float(nullify_nan(ds.info[metric])))
                for metric in [
                    "InbreedingCoeff",
                    "AS_FS",
                    "AS_MQ",
                    "AS_MQRankSum",
                    "AS_pab_max",
                    "AS_QUALapprox",
                    "AS_QD",
                    "AS_ReadPosRankSum",
                    "AS_SOR",
                    "AS_VarDP",
                    "AS_VQSLOD",
                ]
            ],
        ),
    ))

    ds = ds.drop("filters", "qual_hists", "raw_qual_hists", "vqsr")

    #########
    # Flags #
    #########

    # "par": in a pseudoautosomal region of chrX or chrY.
    ds = ds.annotate(flags=hl.set([
        hl.or_missing(ds.region_flag.lcr, "lcr"),
        hl.or_missing(ds.region_flag.segdup, "segdup"),
        hl.or_missing(
            ((ds.locus.contig == "chrX") & ds.locus.in_x_par())
            | ((ds.locus.contig == "chrY") & ds.locus.in_y_par()),
            "par",
        ),
        hl.or_missing(ds.info.monoallelic, "monoallelic"),
    ]).filter(hl.is_defined))

    ds = ds.drop("region_flag")

    ###############
    # Annotations #
    ###############

    ds = ds.transmute(annotations=hl.struct(cadd=ds.cadd,
                                            primate_ai=ds.primate_ai,
                                            revel=ds.revel,
                                            splice_ai=ds.splice_ai))

    ################
    # Other fields #
    ################

    # Drop unused fields
    ds = ds.drop("allele_info", "a_index", "info", "popmax", "was_split")

    return ds
Ejemplo n.º 7
0
def import_exac_vcf(path):
    """Import the ExAC sites VCF and shape it to match the gnomAD variant schema.

    :param path: Path to the ExAC sites VCF (bgzipped).
    :return: Table with one row per bi-allelic (split) variant, exome
        frequency/quality data under ``exome``, reformatted VEP annotations,
        and a null ``genome`` field (ExAC contains exome data only).
    """
    ds = hl.import_vcf(path, force_bgz=True, skip_invalid_loci=True).rows()

    # Split multi-allelic sites into one row per alt allele (adds a_index,
    # old_locus, old_alleles).
    ds = hl.split_multi(ds)

    ds = ds.repartition(5000, shuffle=True)

    # Get value corresponding to the split variant
    ds = ds.annotate(
        info=ds.info.annotate(
            **{
                field: hl.or_missing(hl.is_defined(ds.info[field]), ds.info[field][ds.a_index - 1])
                for field in PER_ALLELE_FIELDS
            }
        )
    )

    # For DP_HIST and GQ_HIST, the first value in the array contains the histogram for all individuals,
    # which is the same in each alt allele's variant.
    ds = ds.annotate(
        info=ds.info.annotate(
            DP_HIST=hl.struct(all=ds.info.DP_HIST[0], alt=ds.info.DP_HIST[ds.a_index]),
            GQ_HIST=hl.struct(all=ds.info.GQ_HIST[0], alt=ds.info.GQ_HIST[ds.a_index]),
        )
    )

    ds = ds.cache()

    # Convert "NA" and empty strings into null values
    # Convert fields in chunks to avoid "Method code too large" errors
    for i in range(0, len(SELECT_INFO_FIELDS), 10):
        ds = ds.annotate(
            info=ds.info.annotate(
                **{
                    field: hl.or_missing(
                        hl.is_defined(ds.info[field]),
                        hl.if_else(
                            (hl.str(ds.info[field]) == "") | (hl.str(ds.info[field]) == "NA"),
                            hl.null(ds.info[field].dtype),
                            ds.info[field],
                        ),
                    )
                    for field in SELECT_INFO_FIELDS[i : i + 10]
                }
            )
        )

    # Convert field types
    ds = ds.annotate(
        info=ds.info.annotate(
            **{
                field: hl.if_else(ds.info[field] == "", hl.null(hl.tint), hl.int(ds.info[field]))
                for field in CONVERT_TO_INT_FIELDS
            }
        )
    )
    ds = ds.annotate(
        info=ds.info.annotate(
            **{
                field: hl.if_else(ds.info[field] == "", hl.null(hl.tfloat), hl.float(ds.info[field]))
                for field in CONVERT_TO_FLOAT_FIELDS
            }
        )
    )

    # Format VEP annotations to mimic the output of hail.vep
    # First decode the VCF percent-escapes in the CSQ strings.
    ds = ds.annotate(
        info=ds.info.annotate(
            CSQ=ds.info.CSQ.map(
                lambda s: s.replace("%3A", ":")
                .replace("%3B", ";")
                .replace("%3D", "=")
                .replace("%25", "%")
                .replace("%2C", ",")
            )
        )
    )
    # Each CSQ string is a "|"-separated record whose columns are named by
    # VEP_FIELDS. Keep only Ensembl transcript consequences for this row's
    # alt allele, then rename fields to the hail.vep schema.
    ds = ds.annotate(
        vep=hl.struct(
            transcript_consequences=ds.info.CSQ.map(
                lambda csq_str: hl.bind(
                    lambda csq_values: hl.struct(
                        **{
                            field: hl.if_else(csq_values[index] == "", hl.null(hl.tstr), csq_values[index])
                            for index, field in enumerate(VEP_FIELDS)
                        }
                    ),
                    csq_str.split(r"\|"),
                )
            )
            .filter(lambda annotation: annotation.Feature.startswith("ENST"))
            .filter(lambda annotation: hl.int(annotation.ALLELE_NUM) == ds.a_index)
            .map(
                lambda annotation: annotation.select(
                    amino_acids=annotation.Amino_acids,
                    biotype=annotation.BIOTYPE,
                    canonical=annotation.CANONICAL == "YES",
                    # cDNA_position may contain either "start-end" or, when start == end, "start"
                    cdna_start=split_position_start(annotation.cDNA_position),
                    cdna_end=split_position_end(annotation.cDNA_position),
                    codons=annotation.Codons,
                    consequence_terms=annotation.Consequence.split("&"),
                    distance=hl.int(annotation.DISTANCE),
                    domains=hl.or_missing(
                        hl.is_defined(annotation.DOMAINS),
                        annotation.DOMAINS.split("&").map(
                            lambda d: hl.struct(db=d.split(":")[0], name=d.split(":")[1])
                        ),
                    ),
                    exon=annotation.EXON,
                    gene_id=annotation.Gene,
                    gene_symbol=annotation.SYMBOL,
                    gene_symbol_source=annotation.SYMBOL_SOURCE,
                    hgnc_id=annotation.HGNC_ID,
                    hgvsc=annotation.HGVSc,
                    hgvsp=annotation.HGVSp,
                    lof=annotation.LoF,
                    lof_filter=annotation.LoF_filter,
                    lof_flags=annotation.LoF_flags,
                    lof_info=annotation.LoF_info,
                    # PolyPhen field contains "polyphen_prediction(polyphen_score)"
                    polyphen_prediction=hl.or_missing(
                        hl.is_defined(annotation.PolyPhen), annotation.PolyPhen.split(r"\(")[0]
                    ),
                    protein_id=annotation.ENSP,
                    # Protein_position may contain either "start-end" or, when start == end, "start"
                    protein_start=split_position_start(annotation.Protein_position),
                    protein_end=split_position_end(annotation.Protein_position),
                    # SIFT field contains "sift_prediction(sift_score)"
                    sift_prediction=hl.or_missing(hl.is_defined(annotation.SIFT), annotation.SIFT.split(r"\(")[0]),
                    transcript_id=annotation.Feature,
                )
            )
        )
    )

    # Most severe consequence across all transcript consequences, by rank.
    ds = ds.annotate(
        vep=ds.vep.annotate(
            most_severe_consequence=hl.bind(
                lambda all_consequence_terms: hl.or_missing(
                    all_consequence_terms.size() != 0, hl.sorted(all_consequence_terms, key=consequence_term_rank)[0]
                ),
                ds.vep.transcript_consequences.flatmap(lambda c: c.consequence_terms),
            )
        )
    )

    ds = ds.cache()

    # DP/GQ histograms: 20 bins of width 5 (edges 0 through 100).
    QUALITY_METRIC_HISTOGRAM_BIN_EDGES = [i * 5 for i in range(21)]

    ds = ds.select(
        variant_id=variant_id(ds.locus, ds.alleles),
        reference_genome="GRCh37",
        chrom=normalized_contig(ds.locus.contig),
        pos=ds.locus.position,
        xpos=x_position(ds.locus),
        ref=ds.alleles[0],
        alt=ds.alleles[1],
        rsid=ds.rsid,
        exome=hl.struct(
            ac=ds.info.AC_Adj,
            an=ds.info.AN_Adj,
            homozygote_count=ds.info.AC_Hom,
            hemizygote_count=hl.or_else(ds.info.AC_Hemi, 0),
            # Add an AC0 filter for variants with no passing-quality alleles.
            filters=hl.set(hl.if_else(ds.info.AC_Adj == 0, ds.filters.add("AC0"), ds.filters)),
            populations=[
                hl.struct(
                    id=pop_id,
                    ac=ds.info[f"AC_{pop_id}"],
                    an=ds.info[f"AN_{pop_id}"],
                    hemizygote_count=hl.or_else(ds.info[f"Hemi_{pop_id}"], 0),
                    homozygote_count=ds.info[f"Hom_{pop_id}"],
                )
                for pop_id in ["AFR", "AMR", "EAS", "FIN", "NFE", "OTH", "SAS"]
            ],
            # AGE_HISTOGRAM_* is 12 "|"-separated values: n_smaller, ten
            # 5-year bins (30-80), n_larger. Missing histograms become all
            # zeros.
            age_distribution=hl.struct(
                het=hl.rbind(
                    hl.or_else(ds.info.AGE_HISTOGRAM_HET, "0|0|0|0|0|0|0|0|0|0|0|0").split(r"\|").map(hl.float),
                    lambda bins: hl.struct(
                        bin_edges=[30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80],
                        bin_freq=bins[1:11],
                        n_smaller=bins[0],
                        n_larger=bins[11],
                    ),
                ),
                hom=hl.rbind(
                    hl.or_else(ds.info.AGE_HISTOGRAM_HOM, "0|0|0|0|0|0|0|0|0|0|0|0").split(r"\|").map(hl.float),
                    lambda bins: hl.struct(
                        bin_edges=[30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80],
                        bin_freq=bins[1:11],
                        n_smaller=bins[0],
                        n_larger=bins[11],
                    ),
                ),
            ),
            quality_metrics=hl.struct(
                genotype_depth=hl.struct(
                    all=hl.struct(
                        bin_edges=QUALITY_METRIC_HISTOGRAM_BIN_EDGES,
                        bin_freq=ds.info.DP_HIST.all.split(r"\|").map(hl.float),
                    ),
                    alt=hl.struct(
                        bin_edges=QUALITY_METRIC_HISTOGRAM_BIN_EDGES,
                        bin_freq=ds.info.DP_HIST.alt.split(r"\|").map(hl.float),
                    ),
                ),
                genotype_quality=hl.struct(
                    all=hl.struct(
                        bin_edges=QUALITY_METRIC_HISTOGRAM_BIN_EDGES,
                        bin_freq=ds.info.GQ_HIST.all.split(r"\|").map(hl.float),
                    ),
                    alt=hl.struct(
                        bin_edges=QUALITY_METRIC_HISTOGRAM_BIN_EDGES,
                        bin_freq=ds.info.GQ_HIST.alt.split(r"\|").map(hl.float),
                    ),
                ),
                site_quality_metrics=[
                    hl.struct(metric="BaseQRankSum", value=hl.float(ds.info.BaseQRankSum)),
                    hl.struct(metric="ClippingRankSum", value=hl.float(ds.info.ClippingRankSum)),
                    hl.struct(metric="DP", value=hl.float(ds.info.DP)),
                    hl.struct(metric="FS", value=hl.float(ds.info.FS)),
                    hl.struct(metric="InbreedingCoeff", value=hl.float(ds.info.InbreedingCoeff)),
                    hl.struct(metric="MQ", value=hl.float(ds.info.MQ)),
                    hl.struct(metric="MQRankSum", value=hl.float(ds.info.MQRankSum)),
                    hl.struct(metric="QD", value=hl.float(ds.info.QD)),
                    hl.struct(metric="ReadPosRankSum", value=hl.float(ds.info.ReadPosRankSum)),
                    hl.struct(metric="SiteQuality", value=hl.float(ds.qual)),
                    hl.struct(metric="VQSLOD", value=hl.float(ds.info.VQSLOD)),
                ],
            ),
        ),
        # Other variants split from the same original multi-allelic site.
        colocated_variants=hl.rbind(
            variant_id(ds.locus, ds.alleles),
            lambda this_variant_id: variant_ids(ds.old_locus, ds.old_alleles).filter(
                lambda v_id: v_id != this_variant_id
            ),
        ),
        vep=ds.vep,
    )

    # ExAC has no genome data; add a null genome field with the exome dtype.
    ds = ds.annotate(genome=hl.null(ds.exome.dtype))

    return ds