def prepare_coverage(coverage_path):
    """Load a gnomAD coverage Hail table and normalize it to the browser schema."""
    ds = hl.read_table(coverage_path)
    ds = ds.annotate(xpos=x_position(ds.locus))

    # Median field name is different in v3.0.1 vs v2: v3.0.1 stores it as
    # "median_approx", so expose it under the common "median" name.
    if "median" not in ds.row.dtype.fields:
        ds = ds.annotate(median=ds.median_approx)

    # Keep only the fields the browser consumes (v3 tables carry extras).
    ds = ds.select(
        "xpos",
        "mean",
        "median",
        "over_1",
        "over_5",
        "over_10",
        "over_15",
        "over_20",
        "over_25",
        "over_30",
        "over_50",
        "over_100",
    )

    return ds
def prepare_mitochondrial_coverage(coverage_path):
    """Load a mitochondrial coverage Hail table and keep the browser fields."""
    ds = hl.read_table(coverage_path)
    ds = ds.annotate(xpos=x_position(ds.locus))
    ds = ds.select("xpos", "mean", "median", "over_100", "over_1000")
    return ds
def prepare_gnomad_v2_variants(exome_variants_path, genome_variants_path):
    """Combine gnomAD v2 exome and genome variant tables into one table.

    Site-level annotations shared between the two tables are promoted to the
    top level, per-sample-type data stays under ``exome``/``genome`` structs,
    and colocated variant IDs are computed per subset.
    """
    exomes = prepare_gnomad_v2_variants_helper(exome_variants_path, "exome")
    genomes = prepare_gnomad_v2_variants_helper(genome_variants_path, "genome")

    shared_fields = [
        "lcr",
        "nonpar",
        "rsid",
        "segdup",
        "vep",
    ]

    variants = exomes.join(genomes, "outer")

    # Promote the shared site-level fields out of the exome/genome structs,
    # preferring the exome value when both are defined.
    variants = variants.annotate(
        **{f: hl.or_else(variants.exome[f], variants.genome[f]) for f in shared_fields}
    )
    variants = variants.annotate(
        exome=variants.exome.drop(*shared_fields),
        genome=variants.genome.drop(*shared_fields),
    )

    variants = variants.annotate(
        variant_id=variant_id(variants.locus, variants.alleles),
        reference_genome="GRCh37",
        chrom=normalized_contig(variants.locus.contig),
        pos=variants.locus.position,
        xpos=x_position(variants.locus),
        ref=variants.alleles[0],
        alt=variants.alleles[1],
    )

    variants = variants.transmute(
        rsids=hl.or_missing(hl.is_defined(variants.rsid), hl.set([variants.rsid]))
    )

    # Variant is in a subset if it is in the subset in either exome or genome samples.
    variants = variants.annotate(subsets=variants.exome.subsets.union(variants.genome.subsets))

    # Flags
    variants = variants.annotate(
        flags=hl.set(
            [
                hl.or_missing(variants.lcr, "lcr"),
                hl.or_missing(
                    ((variants.chrom == "X") | (variants.chrom == "Y")) & ~variants.nonpar,
                    "par",
                ),
            ]
        ).filter(hl.is_defined)
    )

    # Colocated variants
    variants = variants.cache()

    variants_by_locus = variants.select(
        variants.variant_id,
        exome_ac_raw=hl.struct(
            **{f: variants.exome.freq[f].ac_raw for f in variants.exome.freq.dtype.fields}
        ),
        genome_ac_raw=hl.struct(
            non_cancer=variants.genome.freq.gnomad.ac_raw,
            **{f: variants.genome.freq[f].ac_raw for f in variants.genome.freq.dtype.fields},
        ),
    )
    variants_by_locus = variants_by_locus.group_by("locus").aggregate(
        variants=hl.agg.collect(variants_by_locus.row_value)
    )

    def subset_filter(subset):
        # Bind ``subset`` now so each predicate tests its own subset's raw ACs.
        return lambda variant: (variant.exome_ac_raw[subset] > 0) | (
            variant.genome_ac_raw[subset] > 0
        )

    subsets = ["gnomad", "controls", "non_cancer", "non_neuro", "non_topmed"]

    variants_by_locus = variants_by_locus.annotate(
        variant_ids=hl.struct(
            **{
                subset: variants_by_locus.variants.filter(subset_filter(subset)).map(
                    lambda variant: variant.variant_id
                )
                for subset in subsets
            }
        )
    )

    variants = variants.annotate(
        colocated_variants=variants_by_locus[variants.locus].variant_ids
    )
    # A variant is not considered colocated with itself.
    variants = variants.annotate(
        colocated_variants=hl.struct(
            **{
                subset: variants.colocated_variants[subset].filter(
                    lambda other_id: other_id != variants.variant_id
                )
                for subset in subsets
            }
        )
    )

    return variants
def import_exac_coverage(base_path="gs://gnomad-public/legacy/exac_browser/coverage"):
    """Import per-base ExAC coverage files into a Hail table keyed by locus.

    Args:
        base_path: GCS directory containing one bgzipped
            ``Panel.chr<contig>.coverage.txt.gz`` file per chromosome.
            Defaults to the public ExAC browser bucket.

    Returns:
        Hail table keyed by GRCh37 locus with ``mean``/``median`` coverage,
        fraction-over-depth columns (``over_1`` ... ``over_100``), and ``xpos``.
    """
    # One file per chromosome: 1-22, X, Y. Generating the paths avoids
    # maintaining 24 near-identical hard-coded literals.
    contigs = [str(c) for c in range(1, 23)] + ["X", "Y"]
    paths = [f"{base_path}/Panel.chr{contig}.coverage.txt.gz" for contig in contigs]

    # Depth thresholds appear as bare column names ("1", "5", ...) in the files.
    depth_thresholds = ["1", "5", "10", "15", "20", "25", "30", "50", "100"]

    column_types = {
        "#chrom": hl.tstr,
        "pos": hl.tint,
        "mean": hl.tfloat,
        "median": hl.tfloat,
        **{threshold: hl.tfloat for threshold in depth_thresholds},
    }

    ds = hl.import_table(paths, types=column_types, force_bgz=True)

    # Rename the bare depth columns to "over_N" to match the coverage schema
    # used elsewhere in this pipeline.
    ds = ds.rename(
        {
            "#chrom": "chrom",
            **{threshold: f"over_{threshold}" for threshold in depth_thresholds},
        }
    )

    ds = ds.transmute(locus=hl.locus(ds.chrom, ds.pos, reference_genome="GRCh37"))
    ds = ds.key_by(ds.locus)

    ds = ds.annotate(xpos=x_position(ds.locus))

    ds = ds.repartition(1000, shuffle=True)

    return ds
def prepare_clinvar_variants(vcf_path, reference_genome):
    """Import ClinVar variants from a VCF, run VEP, and shape them for the browser."""
    ds = import_clinvar_vcf(vcf_path, reference_genome)

    # There are some variants with only one entry in alleles, ignore them for now.
    # These could be displayed in the ClinVar track even though they will never
    # match a gnomAD variant.
    ds = ds.filter(hl.len(ds.alleles) == 2)

    ds = hl.vep(ds)

    def _sorted_terms(terms):
        # Terms with a leading underscore sort last (mapped to "z" for the
        # sort key), then the underscore is stripped for display.
        return hl.sorted(terms, key=lambda s: s.replace("^_", "z")).map(
            lambda s: s.replace("^_", "")
        )

    ds = ds.select(
        clinical_significance=_sorted_terms(ds.info.CLNSIG),
        clinvar_variation_id=ds.rsid,
        gold_stars=get_gold_stars(ds.info.CLNREVSTAT),
        review_status=_sorted_terms(ds.info.CLNREVSTAT),
        vep=ds.vep,
    )

    ds = ds.annotate(
        chrom=normalized_contig(ds.locus.contig),
        variant_id=variant_id(ds.locus, ds.alleles),
        xpos=x_position(ds.locus),
    )

    return ds
def import_exac_vcf(path):
    """Import the ExAC sites VCF and reshape it into the browser variant schema.

    Multi-allelic sites are split; per-allele INFO fields are subset to each
    split variant's allele; string-encoded INFO values are cleaned and converted
    to numeric types; and the CSQ field is expanded into a ``vep`` struct that
    mimics the output of ``hail.vep``.
    """
    ds = hl.import_vcf(path, force_bgz=True, skip_invalid_loci=True).rows()
    ds = hl.split_multi(ds)
    ds = ds.repartition(5000, shuffle=True)

    # Get value corresponding to the split variant's allele.
    ds = ds.annotate(
        info=ds.info.annotate(
            **{
                field: hl.or_missing(
                    hl.is_defined(ds.info[field]), ds.info[field][ds.a_index - 1]
                )
                for field in PER_ALLELE_FIELDS
            }
        )
    )

    # For DP_HIST and GQ_HIST, the first value in the array contains the
    # histogram for all individuals, which is the same in each alt allele's variant.
    ds = ds.annotate(
        info=ds.info.annotate(
            DP_HIST=hl.struct(all=ds.info.DP_HIST[0], alt=ds.info.DP_HIST[ds.a_index]),
            GQ_HIST=hl.struct(all=ds.info.GQ_HIST[0], alt=ds.info.GQ_HIST[ds.a_index]),
        )
    )

    ds = ds.cache()

    # Convert "NA" and empty strings into null values.
    # Convert fields in chunks to avoid "Method code too large" errors.
    chunk_size = 10
    for start in range(0, len(SELECT_INFO_FIELDS), chunk_size):
        ds = ds.annotate(
            info=ds.info.annotate(
                **{
                    field: hl.or_missing(
                        hl.is_defined(ds.info[field]),
                        hl.if_else(
                            (hl.str(ds.info[field]) == "") | (hl.str(ds.info[field]) == "NA"),
                            hl.null(ds.info[field].dtype),
                            ds.info[field],
                        ),
                    )
                    for field in SELECT_INFO_FIELDS[start : start + chunk_size]
                }
            )
        )

    # Convert field types.
    ds = ds.annotate(
        info=ds.info.annotate(
            **{
                field: hl.if_else(ds.info[field] == "", hl.null(hl.tint), hl.int(ds.info[field]))
                for field in CONVERT_TO_INT_FIELDS
            }
        )
    )
    ds = ds.annotate(
        info=ds.info.annotate(
            **{
                field: hl.if_else(
                    ds.info[field] == "", hl.null(hl.tfloat), hl.float(ds.info[field])
                )
                for field in CONVERT_TO_FLOAT_FIELDS
            }
        )
    )

    # Format VEP annotations to mimic the output of hail.vep: undo VEP's
    # percent-encoding of separator characters in the CSQ strings.
    ds = ds.annotate(
        info=ds.info.annotate(
            CSQ=ds.info.CSQ.map(
                lambda s: s.replace("%3A", ":")
                .replace("%3B", ";")
                .replace("%3D", "=")
                .replace("%25", "%")
                .replace("%2C", ",")
            )
        )
    )

    ds = ds.annotate(
        vep=hl.struct(
            transcript_consequences=ds.info.CSQ.map(
                lambda csq_str: hl.bind(
                    lambda csq_values: hl.struct(
                        **{
                            field: hl.if_else(
                                csq_values[index] == "", hl.null(hl.tstr), csq_values[index]
                            )
                            for index, field in enumerate(VEP_FIELDS)
                        }
                    ),
                    csq_str.split(r"\|"),
                )
            )
            .filter(lambda annotation: annotation.Feature.startswith("ENST"))
            .filter(lambda annotation: hl.int(annotation.ALLELE_NUM) == ds.a_index)
            .map(
                lambda annotation: annotation.select(
                    amino_acids=annotation.Amino_acids,
                    biotype=annotation.BIOTYPE,
                    canonical=annotation.CANONICAL == "YES",
                    # cDNA_position may contain either "start-end" or, when start == end, "start"
                    cdna_start=split_position_start(annotation.cDNA_position),
                    cdna_end=split_position_end(annotation.cDNA_position),
                    codons=annotation.Codons,
                    consequence_terms=annotation.Consequence.split("&"),
                    distance=hl.int(annotation.DISTANCE),
                    domains=hl.or_missing(
                        hl.is_defined(annotation.DOMAINS),
                        annotation.DOMAINS.split("&").map(
                            lambda d: hl.struct(db=d.split(":")[0], name=d.split(":")[1])
                        ),
                    ),
                    exon=annotation.EXON,
                    gene_id=annotation.Gene,
                    gene_symbol=annotation.SYMBOL,
                    gene_symbol_source=annotation.SYMBOL_SOURCE,
                    hgnc_id=annotation.HGNC_ID,
                    hgvsc=annotation.HGVSc,
                    hgvsp=annotation.HGVSp,
                    lof=annotation.LoF,
                    lof_filter=annotation.LoF_filter,
                    lof_flags=annotation.LoF_flags,
                    lof_info=annotation.LoF_info,
                    # PolyPhen field contains "polyphen_prediction(polyphen_score)"
                    polyphen_prediction=hl.or_missing(
                        hl.is_defined(annotation.PolyPhen), annotation.PolyPhen.split(r"\(")[0]
                    ),
                    protein_id=annotation.ENSP,
                    # Protein_position may contain either "start-end" or, when start == end, "start"
                    protein_start=split_position_start(annotation.Protein_position),
                    protein_end=split_position_end(annotation.Protein_position),
                    # SIFT field contains "sift_prediction(sift_score)"
                    sift_prediction=hl.or_missing(
                        hl.is_defined(annotation.SIFT), annotation.SIFT.split(r"\(")[0]
                    ),
                    transcript_id=annotation.Feature,
                )
            )
        )
    )

    ds = ds.annotate(
        vep=ds.vep.annotate(
            most_severe_consequence=hl.bind(
                lambda all_consequence_terms: hl.or_missing(
                    all_consequence_terms.size() != 0,
                    hl.sorted(all_consequence_terms, key=consequence_term_rank)[0],
                ),
                ds.vep.transcript_consequences.flatmap(lambda c: c.consequence_terms),
            )
        )
    )

    ds = ds.cache()

    QUALITY_METRIC_HISTOGRAM_BIN_EDGES = [i * 5 for i in range(21)]

    def _age_histogram(bin_string):
        # Ages are encoded as "n_smaller|<10 five-year bins>|n_larger".
        return hl.rbind(
            hl.or_else(bin_string, "0|0|0|0|0|0|0|0|0|0|0|0").split(r"\|").map(hl.float),
            lambda bins: hl.struct(
                bin_edges=[30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80],
                bin_freq=bins[1:11],
                n_smaller=bins[0],
                n_larger=bins[11],
            ),
        )

    def _histogram(bin_freq_string):
        # Pipe-delimited bin frequencies over the shared 0-100 depth/quality edges.
        return hl.struct(
            bin_edges=QUALITY_METRIC_HISTOGRAM_BIN_EDGES,
            bin_freq=bin_freq_string.split(r"\|").map(hl.float),
        )

    ds = ds.select(
        variant_id=variant_id(ds.locus, ds.alleles),
        reference_genome="GRCh37",
        chrom=normalized_contig(ds.locus.contig),
        pos=ds.locus.position,
        xpos=x_position(ds.locus),
        ref=ds.alleles[0],
        alt=ds.alleles[1],
        rsid=ds.rsid,
        exome=hl.struct(
            ac=ds.info.AC_Adj,
            an=ds.info.AN_Adj,
            homozygote_count=ds.info.AC_Hom,
            hemizygote_count=hl.or_else(ds.info.AC_Hemi, 0),
            # Add an AC0 filter when the adjusted allele count is zero.
            filters=hl.set(hl.if_else(ds.info.AC_Adj == 0, ds.filters.add("AC0"), ds.filters)),
            populations=[
                hl.struct(
                    id=pop_id,
                    ac=ds.info[f"AC_{pop_id}"],
                    an=ds.info[f"AN_{pop_id}"],
                    hemizygote_count=hl.or_else(ds.info[f"Hemi_{pop_id}"], 0),
                    homozygote_count=ds.info[f"Hom_{pop_id}"],
                )
                for pop_id in ["AFR", "AMR", "EAS", "FIN", "NFE", "OTH", "SAS"]
            ],
            age_distribution=hl.struct(
                het=_age_histogram(ds.info.AGE_HISTOGRAM_HET),
                hom=_age_histogram(ds.info.AGE_HISTOGRAM_HOM),
            ),
            quality_metrics=hl.struct(
                genotype_depth=hl.struct(
                    all=_histogram(ds.info.DP_HIST.all),
                    alt=_histogram(ds.info.DP_HIST.alt),
                ),
                genotype_quality=hl.struct(
                    all=_histogram(ds.info.GQ_HIST.all),
                    alt=_histogram(ds.info.GQ_HIST.alt),
                ),
                site_quality_metrics=[
                    hl.struct(metric="BaseQRankSum", value=hl.float(ds.info.BaseQRankSum)),
                    hl.struct(metric="ClippingRankSum", value=hl.float(ds.info.ClippingRankSum)),
                    hl.struct(metric="DP", value=hl.float(ds.info.DP)),
                    hl.struct(metric="FS", value=hl.float(ds.info.FS)),
                    hl.struct(metric="InbreedingCoeff", value=hl.float(ds.info.InbreedingCoeff)),
                    hl.struct(metric="MQ", value=hl.float(ds.info.MQ)),
                    hl.struct(metric="MQRankSum", value=hl.float(ds.info.MQRankSum)),
                    hl.struct(metric="QD", value=hl.float(ds.info.QD)),
                    hl.struct(metric="ReadPosRankSum", value=hl.float(ds.info.ReadPosRankSum)),
                    hl.struct(metric="SiteQuality", value=hl.float(ds.qual)),
                    hl.struct(metric="VQSLOD", value=hl.float(ds.info.VQSLOD)),
                ],
            ),
        ),
        # Other variants at the same pre-split locus, excluding this variant.
        colocated_variants=hl.rbind(
            variant_id(ds.locus, ds.alleles),
            lambda this_variant_id: variant_ids(ds.old_locus, ds.old_alleles).filter(
                lambda v_id: v_id != this_variant_id
            ),
        ),
        vep=ds.vep,
    )

    # ExAC has no genome data; keep the schema consistent with gnomAD tables.
    ds = ds.annotate(genome=hl.null(ds.exome.dtype))

    return ds
def import_mnv_file(path, **kwargs):
    """Import the gnomAD v2 multi-nucleotide variant (MNV) flat file.

    Produces one row per MNV keyed by variant ID, with constituent SNV details,
    exome/genome counts, and consequences sorted by severity. Extra keyword
    arguments are forwarded to ``hl.import_table``.
    """
    column_types = {
        "AC_mnv_ex": hl.tint,
        "AC_mnv_gen": hl.tint,
        "AC_mnv": hl.tint,
        "AC_snp1_ex": hl.tint,
        "AC_snp1_gen": hl.tint,
        "AC_snp1": hl.tint,
        "AC_snp2_ex": hl.tint,
        "AC_snp2_gen": hl.tint,
        "AC_snp2": hl.tint,
        "AN_snp1_ex": hl.tfloat,
        "AN_snp1_gen": hl.tfloat,
        "AN_snp2_ex": hl.tfloat,
        "AN_snp2_gen": hl.tfloat,
        "categ": hl.tstr,
        "filter_snp1_ex": hl.tarray(hl.tstr),
        "filter_snp1_gen": hl.tarray(hl.tstr),
        "filter_snp2_ex": hl.tarray(hl.tstr),
        "filter_snp2_gen": hl.tarray(hl.tstr),
        "gene_id": hl.tstr,
        "gene_name": hl.tstr,
        "locus.contig": hl.tstr,
        "locus.position": hl.tint,
        "mnv_amino_acids": hl.tstr,
        "mnv_codons": hl.tstr,
        "mnv_consequence": hl.tstr,
        "mnv_lof": hl.tstr,
        "mnv": hl.tstr,
        "n_homhom_ex": hl.tint,
        "n_homhom_gen": hl.tint,
        "n_homhom": hl.tint,
        "n_indv_ex": hl.tint,
        "n_indv_gen": hl.tint,
        "n_indv": hl.tint,
        "snp1_amino_acids": hl.tstr,
        "snp1_codons": hl.tstr,
        "snp1_consequence": hl.tstr,
        "snp1_lof": hl.tstr,
        "snp1": hl.tstr,
        "snp2_amino_acids": hl.tstr,
        "snp2_codons": hl.tstr,
        "snp2_consequence": hl.tstr,
        "snp2_lof": hl.tstr,
        "snp2": hl.tstr,
        "transcript_id": hl.tstr,
    }

    ds = hl.import_table(path, key="mnv", missing="", types=column_types, **kwargs)

    ds = ds.rename({"mnv": "variant_id"})

    ds = ds.transmute(locus=hl.locus(ds["locus.contig"], ds["locus.position"]))
    ds = ds.transmute(
        chrom=normalized_contig(ds.locus.contig),
        pos=ds.locus.position,
        xpos=x_position(ds.locus),
    )

    ds = ds.annotate(ref=ds.variant_id.split("-")[2], alt=ds.variant_id.split("-")[3])

    # Keep copies of the SNV IDs: the transmute below consumes the fields it
    # references, and the original snp1/snp2 IDs are still needed afterwards.
    ds = ds.annotate(snp1_copy=ds.snp1, snp2_copy=ds.snp2)

    ds = ds.transmute(
        constituent_snvs=[
            hl.bind(
                lambda variant_id_parts: hl.struct(
                    variant_id=ds[f"{snp}_copy"],
                    chrom=variant_id_parts[0],
                    pos=hl.int(variant_id_parts[1]),
                    ref=variant_id_parts[2],
                    alt=variant_id_parts[3],
                    exome=hl.or_missing(
                        hl.is_defined(ds[f"AN_{snp}_ex"]),
                        hl.struct(
                            filters=ds[f"filter_{snp}_ex"],
                            ac=ds[f"AC_{snp}_ex"],
                            an=hl.int(ds[f"AN_{snp}_ex"]),
                        ),
                    ),
                    genome=hl.or_missing(
                        hl.is_defined(ds[f"AN_{snp}_gen"]),
                        hl.struct(
                            filters=ds[f"filter_{snp}_gen"],
                            ac=ds[f"AC_{snp}_gen"],
                            an=hl.int(ds[f"AN_{snp}_gen"]),
                        ),
                    ),
                ),
                ds[f"{snp}_copy"].split("-"),
            )
            for snp in ["snp1", "snp2"]
        ]
    )

    ds = ds.annotate(constituent_snv_ids=[ds.snp1, ds.snp2])

    # An MNV is present in a data type only if both constituent SNVs are.
    ds = ds.annotate(
        mnv_in_exome=ds.constituent_snvs.all(lambda s: hl.is_defined(s.exome)),
        mnv_in_genome=ds.constituent_snvs.all(lambda s: hl.is_defined(s.genome)),
    )

    ds = ds.transmute(
        n_individuals=ds.n_indv,
        ac=ds.AC_mnv,
        ac_hom=ds.n_homhom,
        exome=hl.or_missing(
            ds.mnv_in_exome,
            hl.struct(n_individuals=ds.n_indv_ex, ac=ds.AC_mnv_ex, ac_hom=ds.n_homhom_ex),
        ),
        genome=hl.or_missing(
            ds.mnv_in_genome,
            hl.struct(n_individuals=ds.n_indv_gen, ac=ds.AC_mnv_gen, ac_hom=ds.n_homhom_gen),
        ),
    )

    ds = ds.drop("AC_snp1", "AC_snp2")

    ds = ds.transmute(
        consequence=hl.struct(
            category=ds.categ,
            gene_id=ds.gene_id,
            gene_name=ds.gene_name,
            transcript_id=ds.transcript_id,
            consequence=ds.mnv_consequence,
            codons=ds.mnv_codons,
            amino_acids=ds.mnv_amino_acids,
            lof=ds.mnv_lof,
            snv_consequences=[
                hl.struct(
                    variant_id=ds[snp],
                    amino_acids=ds[f"{snp}_amino_acids"],
                    codons=ds[f"{snp}_codons"],
                    consequence=ds[f"{snp}_consequence"],
                    lof=ds[f"{snp}_lof"],
                )
                for snp in ["snp1", "snp2"]
            ],
        )
    )

    # Collapse table to one row per MNV, with all consequences for the MNV
    # collected into an array.
    consequences = ds.group_by(ds.variant_id).aggregate(
        consequences=hl.agg.collect(ds.consequence)
    )
    ds = ds.drop("consequence")
    ds = ds.distinct()
    ds = ds.join(consequences)

    # Sort consequences by severity.
    ds = ds.annotate(
        consequences=hl.sorted(ds.consequences, key=lambda c: consequence_term_rank(c.consequence))
    )

    # IDs of constituent SNVs whose amino acid change differs from the MNV's
    # in any consequence.
    ds = ds.annotate(
        changes_amino_acids_for_snvs=hl.literal([0, 1])
        .filter(
            lambda idx: ds.consequences.any(
                lambda csq: csq.snv_consequences[idx].amino_acids.lower() != csq.amino_acids.lower()
            )
        )
        .map(lambda idx: ds.constituent_snv_ids[idx])
    )

    return ds