Example #1
def main(args):

    hl.init(log='/liftover.log')

    if args.gnomad:
        gnomad = True
        path = None

        if args.exomes:
            data_type = 'exomes'
        if args.genomes:
            data_type = 'genomes'

        logger.info('Working on gnomAD {} release ht'.format(data_type))
        logger.info('Reading in release ht')
        t = public_release(data_type).ht()
        logger.info('Variants in release ht: {}'.format(t.count()))

    else:
        data_type = None
        gnomad = False

        if args.ht:
            path = args.ht
            t = hl.read_table(args.ht)
        if args.mt:
            path = args.mt
            t = hl.read_matrix_table(args.mt)

    logger.info('Checking if input data has been split')
    if 'was_split' not in t.row:
        t = hl.split_multi(t) if isinstance(
            t, hl.Table) else hl.split_multi_hts(t)

    logger.info('Preparing reference genomes for liftover')
    source, target = get_liftover_genome(t)

    if args.test:
        logger.info('Filtering to chr21 for testing')
        if source.name == 'GRCh38':
            contig = 'chr21'
        else:
            contig = '21'
        t = hl.filter_intervals(
            t, [hl.parse_locus_interval(contig, reference_genome=source.name)])

    logger.info(f'Lifting data to {target.name}')
    t = lift_data(t, gnomad, data_type, path, target, args.overwrite)

    logger.info('Checking SNPs for reference mismatches')
    t = annotate_snp_mismatch(t, data_type, target)

    mismatch = check_mismatch(t) if isinstance(
        t, hl.Table) else check_mismatch(t.rows())
    logger.info('{} total SNPs'.format(mismatch['total_variants']))
    logger.info('{} SNPs on minus strand'.format(mismatch['negative_strand']))
    logger.info('{} reference mismatches in SNPs'.format(
        mismatch['total_mismatch']))
    logger.info('{} mismatches on minus strand'.format(
        mismatch['negative_strand_mismatch']))
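One detail worth calling out in the test branch above: hl.parse_locus_interval accepts a bare contig name as shorthand for the entire contig, so the interval covers all of chr21. A minimal sketch of that call in isolation:

import hail as hl

# A bare contig name parses as an interval spanning the whole contig,
# so filter_intervals keeps every row whose locus falls on chr21.
interval = hl.parse_locus_interval('chr21', reference_genome='GRCh38')
# t = hl.filter_intervals(t, [interval])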
Example #2
def _import_dbsnp(**kwargs) -> hl.Table:
    dbsnp = import_sites_vcf(**kwargs)
    # Note: permit_shuffle is set because the dbSNP VCF contains duplicate loci,
    # so rows may fall out of key order after splitting; the group_by below
    # collects the rsids at each (locus, alleles) key into a set
    dbsnp = hl.split_multi(dbsnp, permit_shuffle=True)
    dbsnp = dbsnp.group_by(
        dbsnp.locus,
        dbsnp.alleles).aggregate(rsid=hl.agg.collect_as_set(dbsnp.rsid))

    return dbsnp
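To make the shuffle caveat concrete, here is a minimal sketch (with made-up data) of what split-then-group does: a multi-allelic row becomes one row per alt allele, and any duplicate (locus, alleles) keys are collapsed into a set of rsids:

import hail as hl

ht = hl.Table.parallelize(
    [{'locus': hl.Locus('1', 100), 'alleles': ['A', 'T', 'C'], 'rsid': 'rs1'}],
    hl.tstruct(locus=hl.tlocus('GRCh37'),
               alleles=hl.tarray(hl.tstr),
               rsid=hl.tstr),
    key=['locus', 'alleles'])

# One row per alt allele: A->T (a_index=1) and A->C (a_index=2).
split = hl.split_multi(ht, permit_shuffle=True)

# Duplicate keys, if present, collapse to one row with a set of rsids.
split = split.group_by(split.locus, split.alleles).aggregate(
    rsid=hl.agg.collect_as_set(split.rsid))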
Example #3
def split_info() -> hl.Table:
    """
    Generates an info table that splits multi-allelic sites from the multi-allelic info table.

    :return: Info table with split multi-allelics
    :rtype: Table
    """
    info_ht = get_info(split=False).ht()

    # Create split version
    info_ht = hl.split_multi(info_ht)

    info_ht = info_ht.annotate(
        info=info_ht.info.annotate(
            **split_info_annotation(info_ht.info, info_ht.a_index)),
        AS_lowqual=split_lowqual_annotation(info_ht.AS_lowqual,
                                            info_ht.a_index),
    )
    return info_ht
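split_info_annotation and split_lowqual_annotation come from the gnomAD utilities; the underlying idea, sketched below on a hypothetical allele-specific field AS_QUALapprox, is simply to index allele-length arrays by the 1-based a_index that hl.split_multi adds:

# A minimal sketch of the idea (not the gnomad library's implementation):
# after splitting, each row's allele-indexed arrays reduce to the single
# value for that row's alt allele.
info_ht = info_ht.annotate(
    info=info_ht.info.annotate(
        AS_QUALapprox=info_ht.info.AS_QUALapprox[info_ht.a_index - 1]))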
Example #4
import hail as hl

ht_samples = hl.read_table(
    'gs://hail-datasets-hail-data/1000_Genomes_phase3_samples.ht')
ht_relationships = hl.read_table(
    'gs://hail-datasets-hail-data/1000_Genomes_phase3_sample_relationships.ht')

mt = hl.import_vcf(
    'gs://hail-datasets-raw-data/1000_Genomes/1000_Genomes_phase3_chrMT_GRCh37.vcf.bgz',
    reference_genome='GRCh37')

mt = mt.annotate_cols(**ht_samples[mt.s])
mt = mt.annotate_cols(**ht_relationships[mt.s])

mt_split = hl.split_multi(mt)
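# hl.downcode (used on the next line) recodes each call relative to alt allele
# a_index, sending every other alt allele to reference; e.g. with a_index=1,
# a 1/2 call becomes 0/1.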
mt_split = mt_split.select_entries(
    GT=hl.downcode(mt_split.GT, mt_split.a_index))
mt_split = mt_split.annotate_rows(info=hl.struct(
    AC=mt_split.info.AC[mt_split.a_index - 1],
    VT=(hl.case().when((mt_split.alleles[0].length() == 1) & (
        mt_split.alleles[1].length() == 1), 'SNP').when(
            mt_split.alleles[0].matches('<CN*>')
            | mt_split.alleles[1].matches('<CN*>'), 'SV').default('INDEL'))))

n_rows, n_cols = mt_split.count()
n_partitions = mt_split.n_partitions()

mt_split = hl.sample_qc(mt_split)
mt_split = hl.variant_qc(mt_split)

mt_split = mt_split.annotate_globals(
Example #5
def split_multi():
    mt = hl.read_matrix_table(resource('profile.mt'))
    hl.split_multi(mt)._force_count_rows()
Example #6
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--input-url",
        help="URL of ExAC sites VCF",
        default=
        "gs://gnomad-public/legacy/exac_browser/ExAC.r1.sites.vep.vcf.gz")
    parser.add_argument("--output-url",
                        help="URL to write Hail table to",
                        required=True)
    parser.add_argument("--subset",
                        help="Filter variants to this chrom:start-end range")
    args = parser.parse_args()

    hl.init(log="/tmp/hail.log")

    print("\n=== Importing VCF ===")

    ds = hl.import_vcf(args.input_url,
                       force_bgz=True,
                       min_partitions=2000,
                       skip_invalid_loci=True).rows()

    if args.subset:
        print(f"\n=== Filtering to interval {args.subset} ===")
        subset_interval = hl.parse_locus_interval(args.subset)
        ds = ds.filter(subset_interval.contains(ds.locus))

    print("\n=== Splitting multiallelic variants ===")

    ds = hl.split_multi(ds)

    ds = ds.repartition(2000, shuffle=True)

    # Keep only the per-allele value corresponding to this row's alt allele
    # (a_index is 1-based)
    ds = ds.annotate(info=ds.info.annotate(
        **{
            field: hl.or_missing(hl.is_defined(ds.info[field]), ds.info[field][
                ds.a_index - 1])
            for field in PER_ALLELE_FIELDS
        }))

    # For DP_HIST and GQ_HIST, the first array element is the histogram over all
    # individuals, which is identical for every split variant from the same site.
    ds = ds.annotate(info=ds.info.annotate(
        DP_HIST=hl.struct(all=ds.info.DP_HIST[0],
                          alt=ds.info.DP_HIST[ds.a_index]),
        GQ_HIST=hl.struct(all=ds.info.GQ_HIST[0],
                          alt=ds.info.GQ_HIST[ds.a_index]),
    ))

    ds = ds.cache()

    print("\n=== Munging data ===")

    # Convert "NA" and empty strings into null values
    # Convert fields in chunks to avoid "Method code too large" errors
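    # (a single generated JVM method's bytecode is capped at 64 KB, so one huge
    # annotate expression can fail to compile; chunking keeps each method small)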
    for i in range(0, len(SELECT_INFO_FIELDS), 10):
        ds = ds.annotate(info=ds.info.annotate(
            **{
                field: hl.or_missing(
                    hl.is_defined(ds.info[field]),
                    hl.bind(
                        lambda value: hl.cond(
                            (value == "") | (value == "NA"),
                            hl.null(ds.info[field].dtype), ds.info[field]),
                        hl.str(ds.info[field]),
                    ),
                )
                for field in SELECT_INFO_FIELDS[i:i + 10]
            }))

    # Convert field types
    ds = ds.annotate(info=ds.info.annotate(
        **{
            field: hl.cond(ds.info[field] == "", hl.null(hl.tint),
                           hl.int(ds.info[field]))
            for field in CONVERT_TO_INT_FIELDS
        }))
    ds = ds.annotate(info=ds.info.annotate(
        **{
            field: hl.cond(ds.info[field] == "", hl.null(hl.tfloat),
                           hl.float(ds.info[field]))
            for field in CONVERT_TO_FLOAT_FIELDS
        }))

    # Format VEP annotations to mimic the output of hail.vep
    ds = ds.annotate(info=ds.info.annotate(CSQ=ds.info.CSQ.map(
        lambda s: s.replace("%3A", ":").replace("%3B", ";").replace(
            "%3D", "=").replace("%25", "%").replace("%2C", ","))))
    ds = ds.annotate(vep=hl.struct(
        transcript_consequences=ds.info.CSQ.map(lambda csq_str: hl.bind(
            lambda csq_values: hl.struct(
                **{
                    field: hl.cond(csq_values[index] == "", hl.null(hl.tstr),
                                   csq_values[index])
                    for index, field in enumerate(VEP_FIELDS)
                }),
            csq_str.split("\\|"),
        )).filter(lambda annotation: annotation.Feature.startswith("ENST")).
        filter(lambda annotation: hl.int(annotation.ALLELE_NUM) == ds.a_index).
        map(lambda annotation: annotation.select(
            amino_acids=annotation.Amino_acids,
            biotype=annotation.BIOTYPE,
            canonical=annotation.CANONICAL == "YES",
            # cDNA_position may contain either "start-end" or, when start == end, "start"
            cdna_start=split_position_start(annotation.cDNA_position),
            cdna_end=split_position_end(annotation.cDNA_position),
            codons=annotation.Codons,
            consequence_terms=annotation.Consequence.split("&"),
            distance=hl.int(annotation.DISTANCE),
            domains=hl.or_missing(
                hl.is_defined(annotation.DOMAINS),
                annotation.DOMAINS.split("&").map(lambda d: hl.struct(
                    db=d.split(":")[0], name=d.split(":")[1])),
            ),
            exon=annotation.EXON,
            gene_id=annotation.Gene,
            gene_symbol=annotation.SYMBOL,
            gene_symbol_source=annotation.SYMBOL_SOURCE,
            hgnc_id=annotation.HGNC_ID,
            hgvsc=annotation.HGVSc,
            hgvsp=annotation.HGVSp,
            lof=annotation.LoF,
            lof_filter=annotation.LoF_filter,
            lof_flags=annotation.LoF_flags,
            lof_info=annotation.LoF_info,
            # PolyPhen field contains "polyphen_prediction(polyphen_score)"
            polyphen_prediction=hl.or_missing(
                hl.is_defined(annotation.PolyPhen),
                annotation.PolyPhen.split("\\(")[0]),
            protein_id=annotation.ENSP,
            # Protein_position may contain either "start-end" or, when start == end, "start"
            protein_start=split_position_start(annotation.Protein_position),
            protein_end=split_position_end(annotation.Protein_position),
            # SIFT field contains "sift_prediction(sift_score)"
            sift_prediction=hl.or_missing(hl.is_defined(annotation.SIFT),
                                          annotation.SIFT.split("\\(")[0]),
            transcript_id=annotation.Feature,
        ))))

    ds = ds.annotate(vep=ds.vep.annotate(most_severe_consequence=hl.bind(
        lambda all_consequence_terms: hl.or_missing(
            all_consequence_terms.size() != 0,
            hl.sorted(all_consequence_terms, key=consequence_term_rank)[0]),
        ds.vep.transcript_consequences.flatmap(lambda c: c.consequence_terms),
    )))

    ds = ds.cache()

    print("\n=== Adding derived fields ===")

    ds = ds.annotate(
        sorted_transcript_consequences=sorted_transcript_consequences_v3(
            ds.vep))

    ds = ds.select(
        "filters",
        "qual",
        "rsid",
        "sorted_transcript_consequences",
        AC=ds.info.AC,
        AC_Adj=ds.info.AC_Adj,
        AC_Hemi=ds.info.AC_Hemi,
        AC_Hom=ds.info.AC_Hom,
        AF=ds.info.AF,
        AN=ds.info.AN,
        AN_Adj=ds.info.AN_Adj,
        BaseQRankSum=ds.info.BaseQRankSum,
        CCC=ds.info.CCC,
        ClippingRankSum=ds.info.ClippingRankSum,
        DB=ds.info.DB,
        DP=ds.info.DP,
        DS=ds.info.DS,
        END=ds.info.END,
        FS=ds.info.FS,
        GQ_MEAN=ds.info.GQ_MEAN,
        GQ_STDDEV=ds.info.GQ_STDDEV,
        HWP=ds.info.HWP,
        HaplotypeScore=ds.info.HaplotypeScore,
        InbreedingCoeff=ds.info.InbreedingCoeff,
        MLEAC=ds.info.MLEAC,
        MLEAF=ds.info.MLEAF,
        MQ=ds.info.MQ,
        MQ0=ds.info.MQ0,
        MQRankSum=ds.info.MQRankSum,
        NCC=ds.info.NCC,
        NEGATIVE_TRAIN_SITE=ds.info.NEGATIVE_TRAIN_SITE,
        POSITIVE_TRAIN_SITE=ds.info.POSITIVE_TRAIN_SITE,
        QD=ds.info.QD,
        ReadPosRankSum=ds.info.ReadPosRankSum,
        VQSLOD=ds.info.VQSLOD,
        culprit=ds.info.culprit,
        DP_HIST=ds.info.DP_HIST,
        GQ_HIST=ds.info.GQ_HIST,
        DOUBLETON_DIST=ds.info.DOUBLETON_DIST,
        AC_CONSANGUINEOUS=ds.info.AC_CONSANGUINEOUS,
        AN_CONSANGUINEOUS=ds.info.AN_CONSANGUINEOUS,
        Hom_CONSANGUINEOUS=ds.info.Hom_CONSANGUINEOUS,
        AGE_HISTOGRAM_HET=ds.info.AGE_HISTOGRAM_HET,
        AGE_HISTOGRAM_HOM=ds.info.AGE_HISTOGRAM_HOM,
        AC_POPMAX=ds.info.AC_POPMAX,
        AN_POPMAX=ds.info.AN_POPMAX,
        POPMAX=ds.info.POPMAX,
        K1_RUN=ds.info.K1_RUN,
        K2_RUN=ds.info.K2_RUN,
        K3_RUN=ds.info.K3_RUN,
        ESP_AF_POPMAX=ds.info.ESP_AF_POPMAX,
        ESP_AF_GLOBAL=ds.info.ESP_AF_GLOBAL,
        ESP_AC=ds.info.ESP_AC,
        KG_AF_POPMAX=ds.info.KG_AF_POPMAX,
        KG_AF_GLOBAL=ds.info.KG_AF_GLOBAL,
        KG_AC=ds.info.KG_AC,
        AC_FEMALE=ds.info.AC_FEMALE,
        AN_FEMALE=ds.info.AN_FEMALE,
        AC_MALE=ds.info.AC_MALE,
        AN_MALE=ds.info.AN_MALE,
        populations=hl.struct(
            **{
                pop_id: hl.struct(
                    AC=ds.info[f"AC_{pop_id}"],
                    AN=ds.info[f"AN_{pop_id}"],
                    hemi=ds.info[f"Hemi_{pop_id}"],
                    hom=ds.info[f"Hom_{pop_id}"],
                )
                for pop_id in
                ["AFR", "AMR", "EAS", "FIN", "NFE", "OTH", "SAS"]
            }),
        colocated_variants=hl.bind(
            lambda this_variant_id: variant_ids(ds.old_locus, ds.old_alleles).
            filter(lambda v_id: v_id != this_variant_id),
            variant_id(ds.locus, ds.alleles),
        ),
        variant_id=variant_id(ds.locus, ds.alleles),
        xpos=x_position(ds.locus),
    )

    print("\n=== Writing table ===")

    ds.write(args.output_url)
Example #7
def split_multi(mt_path):
    mt = hl.read_matrix_table(mt_path)
    hl.split_multi(mt)._force_count_rows()
Example #8
def _get_filtered_mt(self, rsid='rs35471880'):
    mt = hl.import_vcf('tests/data/1kg_30variants.vcf.bgz')
    mt = hl.split_multi(mt.filter_rows(mt.rsid == rsid))
    return mt
Example #9
def import_exac_vcf(path):
    ds = hl.import_vcf(path, force_bgz=True, skip_invalid_loci=True).rows()

    ds = hl.split_multi(ds)

    ds = ds.repartition(5000, shuffle=True)

    # Keep only the per-allele value corresponding to this row's alt allele
    # (a_index is 1-based)
    ds = ds.annotate(
        info=ds.info.annotate(
            **{
                field: hl.or_missing(hl.is_defined(ds.info[field]), ds.info[field][ds.a_index - 1])
                for field in PER_ALLELE_FIELDS
            }
        )
    )

    # For DP_HIST and GQ_HIST, the first array element is the histogram over all
    # individuals, which is identical for every split variant from the same site.
    ds = ds.annotate(
        info=ds.info.annotate(
            DP_HIST=hl.struct(all=ds.info.DP_HIST[0], alt=ds.info.DP_HIST[ds.a_index]),
            GQ_HIST=hl.struct(all=ds.info.GQ_HIST[0], alt=ds.info.GQ_HIST[ds.a_index]),
        )
    )

    ds = ds.cache()

    # Convert "NA" and empty strings into null values
    # Convert fields in chunks to avoid "Method code too large" errors
    for i in range(0, len(SELECT_INFO_FIELDS), 10):
        ds = ds.annotate(
            info=ds.info.annotate(
                **{
                    field: hl.or_missing(
                        hl.is_defined(ds.info[field]),
                        hl.if_else(
                            (hl.str(ds.info[field]) == "") | (hl.str(ds.info[field]) == "NA"),
                            hl.null(ds.info[field].dtype),
                            ds.info[field],
                        ),
                    )
                    for field in SELECT_INFO_FIELDS[i : i + 10]
                }
            )
        )

    # Convert field types
    ds = ds.annotate(
        info=ds.info.annotate(
            **{
                field: hl.if_else(ds.info[field] == "", hl.null(hl.tint), hl.int(ds.info[field]))
                for field in CONVERT_TO_INT_FIELDS
            }
        )
    )
    ds = ds.annotate(
        info=ds.info.annotate(
            **{
                field: hl.if_else(ds.info[field] == "", hl.null(hl.tfloat), hl.float(ds.info[field]))
                for field in CONVERT_TO_FLOAT_FIELDS
            }
        )
    )

    # Format VEP annotations to mimic the output of hail.vep
    ds = ds.annotate(
        info=ds.info.annotate(
            CSQ=ds.info.CSQ.map(
                lambda s: s.replace("%3A", ":")
                .replace("%3B", ";")
                .replace("%3D", "=")
                .replace("%25", "%")
                .replace("%2C", ",")
            )
        )
    )
    ds = ds.annotate(
        vep=hl.struct(
            transcript_consequences=ds.info.CSQ.map(
                lambda csq_str: hl.bind(
                    lambda csq_values: hl.struct(
                        **{
                            field: hl.if_else(csq_values[index] == "", hl.null(hl.tstr), csq_values[index])
                            for index, field in enumerate(VEP_FIELDS)
                        }
                    ),
                    csq_str.split(r"\|"),
                )
            )
            .filter(lambda annotation: annotation.Feature.startswith("ENST"))
            .filter(lambda annotation: hl.int(annotation.ALLELE_NUM) == ds.a_index)
            .map(
                lambda annotation: annotation.select(
                    amino_acids=annotation.Amino_acids,
                    biotype=annotation.BIOTYPE,
                    canonical=annotation.CANONICAL == "YES",
                    # cDNA_position may contain either "start-end" or, when start == end, "start"
                    cdna_start=split_position_start(annotation.cDNA_position),
                    cdna_end=split_position_end(annotation.cDNA_position),
                    codons=annotation.Codons,
                    consequence_terms=annotation.Consequence.split("&"),
                    distance=hl.int(annotation.DISTANCE),
                    domains=hl.or_missing(
                        hl.is_defined(annotation.DOMAINS),
                        annotation.DOMAINS.split("&").map(
                            lambda d: hl.struct(db=d.split(":")[0], name=d.split(":")[1])
                        ),
                    ),
                    exon=annotation.EXON,
                    gene_id=annotation.Gene,
                    gene_symbol=annotation.SYMBOL,
                    gene_symbol_source=annotation.SYMBOL_SOURCE,
                    hgnc_id=annotation.HGNC_ID,
                    hgvsc=annotation.HGVSc,
                    hgvsp=annotation.HGVSp,
                    lof=annotation.LoF,
                    lof_filter=annotation.LoF_filter,
                    lof_flags=annotation.LoF_flags,
                    lof_info=annotation.LoF_info,
                    # PolyPhen field contains "polyphen_prediction(polyphen_score)"
                    polyphen_prediction=hl.or_missing(
                        hl.is_defined(annotation.PolyPhen), annotation.PolyPhen.split(r"\(")[0]
                    ),
                    protein_id=annotation.ENSP,
                    # Protein_position may contain either "start-end" or, when start == end, "start"
                    protein_start=split_position_start(annotation.Protein_position),
                    protein_end=split_position_end(annotation.Protein_position),
                    # SIFT field contains "sift_prediction(sift_score)"
                    sift_prediction=hl.or_missing(hl.is_defined(annotation.SIFT), annotation.SIFT.split(r"\(")[0]),
                    transcript_id=annotation.Feature,
                )
            )
        )
    )

    ds = ds.annotate(
        vep=ds.vep.annotate(
            most_severe_consequence=hl.bind(
                lambda all_consequence_terms: hl.or_missing(
                    all_consequence_terms.size() != 0, hl.sorted(all_consequence_terms, key=consequence_term_rank)[0]
                ),
                ds.vep.transcript_consequences.flatmap(lambda c: c.consequence_terms),
            )
        )
    )

    ds = ds.cache()

    QUALITY_METRIC_HISTOGRAM_BIN_EDGES = [i * 5 for i in range(21)]

    ds = ds.select(
        variant_id=variant_id(ds.locus, ds.alleles),
        reference_genome="GRCh37",
        chrom=normalized_contig(ds.locus.contig),
        pos=ds.locus.position,
        xpos=x_position(ds.locus),
        ref=ds.alleles[0],
        alt=ds.alleles[1],
        rsid=ds.rsid,
        exome=hl.struct(
            ac=ds.info.AC_Adj,
            an=ds.info.AN_Adj,
            homozygote_count=ds.info.AC_Hom,
            hemizygote_count=hl.or_else(ds.info.AC_Hemi, 0),
            filters=hl.set(hl.if_else(ds.info.AC_Adj == 0, ds.filters.add("AC0"), ds.filters)),
            populations=[
                hl.struct(
                    id=pop_id,
                    ac=ds.info[f"AC_{pop_id}"],
                    an=ds.info[f"AN_{pop_id}"],
                    hemizygote_count=hl.or_else(ds.info[f"Hemi_{pop_id}"], 0),
                    homozygote_count=ds.info[f"Hom_{pop_id}"],
                )
                for pop_id in ["AFR", "AMR", "EAS", "FIN", "NFE", "OTH", "SAS"]
            ],
            age_distribution=hl.struct(
                het=hl.rbind(
                    hl.or_else(ds.info.AGE_HISTOGRAM_HET, "0|0|0|0|0|0|0|0|0|0|0|0").split(r"\|").map(hl.float),
                    lambda bins: hl.struct(
                        bin_edges=[30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80],
                        bin_freq=bins[1:11],
                        n_smaller=bins[0],
                        n_larger=bins[11],
                    ),
                ),
                hom=hl.rbind(
                    hl.or_else(ds.info.AGE_HISTOGRAM_HOM, "0|0|0|0|0|0|0|0|0|0|0|0").split(r"\|").map(hl.float),
                    lambda bins: hl.struct(
                        bin_edges=[30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80],
                        bin_freq=bins[1:11],
                        n_smaller=bins[0],
                        n_larger=bins[11],
                    ),
                ),
            ),
            quality_metrics=hl.struct(
                genotype_depth=hl.struct(
                    all=hl.struct(
                        bin_edges=QUALITY_METRIC_HISTOGRAM_BIN_EDGES,
                        bin_freq=ds.info.DP_HIST.all.split(r"\|").map(hl.float),
                    ),
                    alt=hl.struct(
                        bin_edges=QUALITY_METRIC_HISTOGRAM_BIN_EDGES,
                        bin_freq=ds.info.DP_HIST.alt.split(r"\|").map(hl.float),
                    ),
                ),
                genotype_quality=hl.struct(
                    all=hl.struct(
                        bin_edges=QUALITY_METRIC_HISTOGRAM_BIN_EDGES,
                        bin_freq=ds.info.GQ_HIST.all.split(r"\|").map(hl.float),
                    ),
                    alt=hl.struct(
                        bin_edges=QUALITY_METRIC_HISTOGRAM_BIN_EDGES,
                        bin_freq=ds.info.GQ_HIST.alt.split(r"\|").map(hl.float),
                    ),
                ),
                site_quality_metrics=[
                    hl.struct(metric="BaseQRankSum", value=hl.float(ds.info.BaseQRankSum)),
                    hl.struct(metric="ClippingRankSum", value=hl.float(ds.info.ClippingRankSum)),
                    hl.struct(metric="DP", value=hl.float(ds.info.DP)),
                    hl.struct(metric="FS", value=hl.float(ds.info.FS)),
                    hl.struct(metric="InbreedingCoeff", value=hl.float(ds.info.InbreedingCoeff)),
                    hl.struct(metric="MQ", value=hl.float(ds.info.MQ)),
                    hl.struct(metric="MQRankSum", value=hl.float(ds.info.MQRankSum)),
                    hl.struct(metric="QD", value=hl.float(ds.info.QD)),
                    hl.struct(metric="ReadPosRankSum", value=hl.float(ds.info.ReadPosRankSum)),
                    hl.struct(metric="SiteQuality", value=hl.float(ds.qual)),
                    hl.struct(metric="VQSLOD", value=hl.float(ds.info.VQSLOD)),
                ],
            ),
        ),
        colocated_variants=hl.rbind(
            variant_id(ds.locus, ds.alleles),
            lambda this_variant_id: variant_ids(ds.old_locus, ds.old_alleles).filter(
                lambda v_id: v_id != this_variant_id
            ),
        ),
        vep=ds.vep,
    )

    ds = ds.annotate(genome=hl.null(ds.exome.dtype))

    return ds
print("\n=== Importing VCF ===")

mt = hl.import_vcf(args.input_url, force_bgz=True, min_partitions=2000, skip_invalid_loci=True)

# Drop entry values
mt = mt.drop("AD", "DP", "GQ", "GT", "MIN_DP", "PL", "SB")

if args.subset:
    print(f"\n=== Filtering to interval {args.subset} ===")
    subset_interval = hl.parse_locus_interval(args.subset)
    mt = mt.filter_rows(subset_interval.contains(mt.locus))

print("\n=== Splitting multiallelic variants ===")

mt = hl.split_multi(mt)

# For multiallelic variants, these fields contain a value for each alt allele
PER_ALLELE_FIELDS = [
    "AC",
    "AC_Adj",
    "AC_Hemi",
    "AC_Hom",
    "AC_MALE",
    "AC_FEMALE",
    "AF",
    "AC_AFR",
    "AC_AMR",
    "AC_EAS",
    "AC_FIN",
    "AC_NFE",