def main(args):
    mt = get_gnomad_v3_mt(key_by_locus_and_alleles=True)
    mt = mt.annotate_entries(
        gvcf_info=mt.gvcf_info.drop('ClippingRankSum', 'ReadPosRankSum'))
    mt = mt.annotate_rows(
        n_unsplit_alleles=hl.len(mt.alleles),
        mixed_site=(hl.len(mt.alleles) > 2)
        & hl.any(lambda a: hl.is_indel(mt.alleles[0], a), mt.alleles[1:])
        & hl.any(lambda a: hl.is_snp(mt.alleles[0], a), mt.alleles[1:]))
    mt = hl.experimental.sparse_split_multi(mt, filter_changed_loci=True)
    mt.write(args.split_mt_location, overwrite=args.overwrite)
def get_gene_intervals(gene_symbols=None, gene_ids=None, transcript_ids=None,
                       verbose=True, reference_genome=None, gtf_file=None):
    """Get intervals of genes or transcripts.

    Get the boundaries of genes or transcripts from a GTF file, for quick filtering of a Table or MatrixTable.

    On Google Cloud platform:
    Gencode v19 (GRCh37) GTF available at: gs://hail-common/references/gencode/gencode.v19.annotation.gtf.bgz
    Gencode v29 (GRCh38) GTF available at: gs://hail-common/references/gencode/gencode.v29.annotation.gtf.bgz

    Example
    -------
    >>> hl.filter_intervals(ht, get_gene_intervals(gene_symbols=['PCSK9'], reference_genome='GRCh37'))  # doctest: +SKIP

    Parameters
    ----------
    gene_symbols : :obj:`list` of :obj:`str`, optional
       Gene symbols (e.g. PCSK9).
    gene_ids : :obj:`list` of :obj:`str`, optional
       Gene IDs (e.g. ENSG00000223972).
    transcript_ids : :obj:`list` of :obj:`str`, optional
       Transcript IDs (e.g. ENST00000456328).
    verbose : :obj:`bool`
       If ``True``, print which genes and transcripts were matched in the GTF file.
    reference_genome : :obj:`str` or :class:`.ReferenceGenome`, optional
       Reference genome to use (passed along to import_gtf).
    gtf_file : :obj:`str`
       GTF file to load. If none is provided, but `reference_genome` is one of
       `GRCh37` or `GRCh38`, a default will be used (on Google Cloud Platform).

    Returns
    -------
    :obj:`list` of :class:`.Interval`
    """
    if gene_symbols is None and gene_ids is None and transcript_ids is None:
        raise ValueError('get_gene_intervals requires at least one of gene_symbols, gene_ids, or transcript_ids')

    ht = _load_gencode_gtf(gtf_file, reference_genome)

    criteria = []
    if gene_symbols:
        criteria.append(hl.any(lambda y: (ht.feature == 'gene') & (ht.gene_name == y), gene_symbols))
    if gene_ids:
        criteria.append(hl.any(lambda y: (ht.feature == 'gene') & (ht.gene_id == y.split('\\.')[0]), gene_ids))
    if transcript_ids:
        criteria.append(hl.any(lambda y: (ht.feature == 'transcript') & (ht.transcript_id == y.split('\\.')[0]), transcript_ids))

    ht = ht.filter(functools.reduce(operator.ior, criteria))
    gene_info = ht.aggregate(hl.agg.collect((ht.feature, ht.gene_name, ht.gene_id, ht.transcript_id, ht.interval)))
    if verbose:
        info(f'get_gene_intervals found {len(gene_info)} entries:\n'
             + "\n".join(map(lambda x: f'{x[0]}: {x[1]} ({x[2] if x[0] == "gene" else x[3]})', gene_info)))
    intervals = list(map(lambda x: x[-1], gene_info))
    return intervals
def get_gnomad_v3_mt(
    split=False,
    key_by_locus_and_alleles: bool = False,
    remove_hard_filtered_samples: bool = True,
    release_only: bool = False,
    samples_meta: bool = False,
) -> hl.MatrixTable:
    """
    Wrapper function to get gnomAD data with desired filtering and metadata annotations.

    :param split: Perform split on MT - Note: this will perform a split on the MT rather than grab an already split MT
    :param key_by_locus_and_alleles: Whether to key the MatrixTable by locus and alleles (only needed for v3)
    :param remove_hard_filtered_samples: Whether to remove samples that failed hard filters (only relevant after sample QC)
    :param release_only: Whether to filter the MT to only samples available for release (can only be used if metadata is present)
    :param samples_meta: Whether to add metadata to MT in 'meta' column
    :return: gnomAD v3 dataset with chosen annotations and filters
    """
    mt = gnomad_v3_genotypes.mt()
    if key_by_locus_and_alleles:
        # Prevents Hail from running a sort on the genotype MT, which is already sorted by a unique locus
        mt = hl.MatrixTable(
            hl.ir.MatrixKeyRowsBy(mt._mir, ["locus", "alleles"],
                                  is_sorted=True))

    if remove_hard_filtered_samples:
        mt = mt.filter_cols(
            hl.is_missing(hard_filtered_samples.ht()[mt.col_key]))

    if samples_meta:
        mt = mt.annotate_cols(meta=meta.ht()[mt.col_key])

        if release_only:
            mt = mt.filter_cols(mt.meta.release)

    elif release_only:
        mt = mt.filter_cols(meta.ht()[mt.col_key].release)

    if split:
        mt = mt.annotate_rows(
            n_unsplit_alleles=hl.len(mt.alleles),
            mixed_site=(hl.len(mt.alleles) > 2)
            & hl.any(lambda a: hl.is_indel(mt.alleles[0], a), mt.alleles[1:])
            & hl.any(lambda a: hl.is_snp(mt.alleles[0], a), mt.alleles[1:]),
        )
        mt = hl.experimental.sparse_split_multi(mt, filter_changed_loci=True)

    return mt
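# Usage sketch (illustrative, not from the source): assumes the gnomad_qc v3
# resources referenced above (gnomad_v3_genotypes, meta, hard_filtered_samples)
# are importable in the current environment.
mt = get_gnomad_v3_mt(key_by_locus_and_alleles=True,
                      samples_meta=True,
                      release_only=True)
mt = hl.experimental.sparse_split_multi(mt, filter_changed_loci=True)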
def generate_allele_data(mt: hl.MatrixTable) -> hl.Table:
    """
    Returns a bi-allelic sites Table with the following annotations:
     - allele_data (nonsplit_alleles, has_star, variant_type, and n_alt_alleles)

    :param MatrixTable mt: Full unsplit MT
    :return: Table with allele data annotations
    :rtype: Table
    """
    ht = mt.rows().select()
    allele_data = hl.struct(nonsplit_alleles=ht.alleles,
                            has_star=hl.any(lambda a: a == '*', ht.alleles))
    ht = ht.annotate(allele_data=allele_data.annotate(
        **add_variant_type(ht.alleles)))

    ht = hl.split_multi_hts(ht)
    allele_type = (hl.case()
                   .when(hl.is_snp(ht.alleles[0], ht.alleles[1]), 'snv')
                   .when(hl.is_insertion(ht.alleles[0], ht.alleles[1]), 'ins')
                   .when(hl.is_deletion(ht.alleles[0], ht.alleles[1]), 'del')
                   .default('complex'))
    ht = ht.annotate(allele_data=ht.allele_data.annotate(
        allele_type=allele_type,
        was_mixed=ht.allele_data.variant_type == 'mixed'))
    return ht
def get_filtered_mt(chrom: str = 'all',
                    pop: str = 'all',
                    imputed: bool = True,
                    min_mac: int = 20,
                    entry_fields=('GP', )):
    if imputed:
        ht = hl.read_table(get_ukb_af_ht_path())
        if pop == 'all':
            ht = ht.filter(
                hl.any(lambda x: ht.af[x] * ht.an[x] >= min_mac,
                       hl.literal(POPS)))
        else:
            ht = ht.filter(ht.af[pop] * ht.an[pop] >= min_mac)
        mt = get_ukb_imputed_data(chrom,
                                  variant_list=ht,
                                  entry_fields=entry_fields)
    else:
        mt = hl.read_matrix_table('gs://ukb31063/ukb31063.genotype.mt')

    covariates_ht = get_covariates()
    hq_samples_ht = get_hq_samples()  # TODO: confirm that this is the correct set
    mt = mt.annotate_cols(**covariates_ht[mt.s])
    mt = mt.filter_cols(
        hl.is_defined(mt.pop) & hl.is_defined(hq_samples_ht[mt.s]))

    if pop != 'all':
        mt = mt.filter_cols(mt.pop == pop)
    return mt
def generate_allele_data(ht: hl.Table) -> hl.Table:
    """
    Returns bi-allelic sites HT with the following annotations:
     - allele_data (nonsplit_alleles, has_star, variant_type, and n_alt_alleles)

    :param Table ht: Full unsplit HT
    :return: Table with allele data annotations
    :rtype: Table
    """
    ht = ht.select()
    allele_data = hl.struct(nonsplit_alleles=ht.alleles,
                            has_star=hl.any(lambda a: a == "*", ht.alleles))
    ht = ht.annotate(allele_data=allele_data.annotate(
        **add_variant_type(ht.alleles)))

    ht = hl.split_multi_hts(ht)
    ht = ht.filter(hl.len(ht.alleles) > 1)
    allele_type = (hl.case()
                   .when(hl.is_snp(ht.alleles[0], ht.alleles[1]), "snv")
                   .when(hl.is_insertion(ht.alleles[0], ht.alleles[1]), "ins")
                   .when(hl.is_deletion(ht.alleles[0], ht.alleles[1]), "del")
                   .default("complex"))
    ht = ht.annotate(allele_data=ht.allele_data.annotate(
        allele_type=allele_type,
        was_mixed=ht.allele_data.variant_type == "mixed"))
    return ht
def conditional_phenotypes(mt: hl.MatrixTable,
                           column_field,
                           entry_field,
                           lists_of_columns,
                           new_col_name='grouping',
                           new_entry_name='new_entry'):
    """
    Create a conditional phenotype by setting phenotype1 to missing for any
    individual without phenotype2.

    Pheno1  Pheno2  new_pheno
    T       T       T
    T       F       NA
    F       F       NA
    F       T       F

    `lists_of_columns` should be a list of lists (of length 2 for the inner
    list). The first element corresponds to the phenotype to maintain, except
    for setting to missing when the phenotype coded by the second element is
    False.

    new_entry = Pheno1 conditioned on having Pheno2

    Example:

    mt = hl.balding_nichols_model(1, 3, 10).drop('GT')
    mt = mt.annotate_entries(pheno=hl.rand_bool(0.5))
    lists_of_columns = [[0, 1], [2, 1]]
    entry_field = mt.pheno
    column_field = mt.sample_idx

    :param MatrixTable mt: Input MatrixTable
    :param Expression column_field: Column-indexed Expression to group by
    :param Expression entry_field: Entry-indexed Expression to which to apply `grouping_function`
    :param list of list lists_of_columns: Entry in this list should be the same type as `column_field`
    :param str new_col_name: Name for new column key (default 'grouping')
    :param str new_entry_name: Name for new entry expression (default 'new_entry')
    :return: Re-grouped MatrixTable
    :rtype: MatrixTable
    """
    assert all([len(x) == 2 for x in lists_of_columns])
    lists_of_columns = hl.literal(lists_of_columns)

    mt = mt._annotate_all(col_exprs={'_col_expr': column_field},
                          entry_exprs={'_entry_expr': entry_field})
    mt = mt.annotate_cols(
        _col_expr=lists_of_columns.filter(lambda x: x.contains(
            mt._col_expr)).map(lambda y: (y, y[0] == mt._col_expr)))
    mt = mt.explode_cols('_col_expr')
    # If the second element is False (~mt._col_expr[1]) and the entry is False
    # (~mt._entry_expr), return missing; otherwise return the actual entry
    bool_array = hl.agg.collect(
        hl.if_else(~mt._col_expr[1] & ~mt._entry_expr, hl.null(hl.tbool),
                   mt._entry_expr))
    # If any element is missing, return missing; otherwise combine both elements
    return mt.group_cols_by(**{
        new_col_name: mt._col_expr[0]
    }).aggregate(
        **{
            new_entry_name:
            hl.if_else(hl.any(lambda x: hl.is_missing(x), bool_array),
                       hl.null(hl.tbool), bool_array[0] & bool_array[1])
        })
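# Usage sketch (illustrative, built from the docstring's own example): create toy
# boolean phenotypes, then keep phenotypes 0 and 2 only where phenotype 1 is present.
mt_example = hl.balding_nichols_model(1, 3, 10).drop('GT')
mt_example = mt_example.annotate_entries(pheno=hl.rand_bool(0.5))
result = conditional_phenotypes(mt_example,
                                mt_example.sample_idx,
                                mt_example.pheno,
                                lists_of_columns=[[0, 1], [2, 1]])
result.entries().show()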
def generate_split_alleles(mt: hl.MatrixTable) -> hl.MatrixTable:
    allele_data = hl.struct(nonsplit_alleles=mt.alleles,
                            has_star=hl.any(lambda a: a == '*', mt.alleles))
    mt = mt.annotate_rows(allele_data=allele_data.annotate(
        **add_variant_type(mt.alleles)))
    mt = hl.split_multi_hts(mt, left_aligned=True)

    allele_type = (hl.case()
                   .when(hl.is_snp(mt.alleles[0], mt.alleles[1]), 'snv')
                   .when(hl.is_insertion(mt.alleles[0], mt.alleles[1]), 'ins')
                   .when(hl.is_deletion(mt.alleles[0], mt.alleles[1]), 'del')
                   .default('complex'))
    mt = mt.annotate_rows(allele_data=mt.allele_data.annotate(
        allele_type=allele_type,
        was_mixed=mt.allele_data.variant_type == 'mixed'))
    return mt
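# Usage sketch (illustrative): run the split-and-annotate pipeline on a raw VCF.
# The path is hypothetical, `add_variant_type` (a gnomAD utility) must be in
# scope, and the input variants are assumed left-aligned (the function passes
# left_aligned=True to split_multi_hts).
mt_raw = hl.import_vcf('gs://my-bucket/raw.vcf.bgz', reference_genome='GRCh38')
split_mt = generate_split_alleles(mt_raw)
split_mt.rows().show()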
def get_filtered_mt(pop: str = 'all',
                    imputed: bool = True,
                    chrom: str = 'all',
                    min_mac: int = 20):
    if imputed:
        ht = hl.read_table(ukb_af_ht_path)
        if pop == 'all':
            ht = ht.filter(
                hl.any(lambda x: ht.af[x] * ht.an[x] >= min_mac,
                       hl.literal(POPS)))
        else:
            ht = ht.filter(ht.af[pop] * ht.an[pop] >= min_mac)
        mt = get_ukb_imputed_data(chrom, variant_list=ht)
    else:
        mt = hl.read_matrix_table('gs://ukb31063/ukb31063.genotype.mt')

    meta_ht = get_ukb_meta()
    mt = mt.annotate_cols(**meta_ht.key_by(s=hl.str(meta_ht.s))[mt.s])
    if pop != 'all':
        mt = mt.filter_cols(mt.pop == pop)
    return mt
def all_and_leave_one_out(x,
                          pop_array,
                          all_f=hl.sum,
                          loo_f=lambda i, x: hl.sum(x) - hl.or_else(x[i], 0)):
    """
    Applies a function to an input array for all populations, and for each of
    leave-one-out populations.

    :param x: Input array
    :param pop_array: Population array
    :param all_f: Function for all populations. It takes the input array and returns a new value
    :param loo_f: Function for each of leave-one-out populations. It takes an index of leave-one-out population and the input array, and returns an array of new values.
    ...
    :return: Array of new values for all populations and for each of leave-one-out populations.
    :rtype: ArrayExpression
    """
    arr = hl.array([all_f(x)])
    arr = arr.extend(hl.map(lambda i: loo_f(i, x),
                            hl.range(hl.len(pop_array))))
    return hl.or_missing(hl.any(hl.is_defined, x), arr)
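# Minimal demonstration of the leave-one-out semantics (not from the source):
# the first element is the all-population sum, followed by the sum excluding
# each population in turn. pop_array only supplies the number of populations.
x = hl.literal([1, 2, 3])
pops = hl.literal(['afr', 'eas', 'nfe'])
hl.eval(all_and_leave_one_out(x, pops))  # [6, 5, 4, 3]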
def get_r_within_gene(
    bm: BlockMatrix,
    ld_index: hl.Table,
    gene: str,
    vep_ht: hl.Table = None,
    reference_genome: str = None,
):
    """
    Get LD information (`r`) for all pairs of variants within `gene`.

    Warning: this returns a table quadratic in number of variants. Exercise
    caution with large genes.

    :param bm: Input Block Matrix
    :param ld_index: Corresponding index table
    :param gene: Gene symbol as string
    :param vep_ht: Table with VEP annotations (if None, gets from get_gnomad_public_data())
    :param reference_genome: Reference genome to pass to get_gene_intervals for fast filtering to gene
    :return: Table with pairs of variants
    """
    if vep_ht is None:
        vep_ht = public_release("exomes").ht()
    if reference_genome is None:
        reference_genome = hl.default_reference().name
    intervals = hl.experimental.get_gene_intervals(
        gene_symbols=[gene], reference_genome=reference_genome)
    ld_index = hl.filter_intervals(ld_index, intervals)
    ld_index = ld_index.annotate(vep=vep_ht[ld_index.key].vep)
    ld_index = ld_index.filter(
        hl.any(lambda tc: tc.gene_symbol == gene,
               ld_index.vep.transcript_consequences))

    indices_to_keep = ld_index.idx.collect()
    filt_bm = bm.filter(indices_to_keep, indices_to_keep)
    ht = filt_bm.entries()
    ld_index = ld_index.add_index("new_idx").key_by("new_idx")
    return ht.transmute(r=ht.entry,
                        i_variant=ld_index[ht.i],
                        j_variant=ld_index[ht.j])
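# Usage sketch (illustrative): the paths are hypothetical stand-ins for a
# precomputed LD BlockMatrix and its matching variant-index Table.
from hail.linalg import BlockMatrix

bm = BlockMatrix.read('gs://my-bucket/ld.bm')
ld_index_ht = hl.read_table('gs://my-bucket/ld.variant_indices.ht')
pcsk9_ld = get_r_within_gene(bm, ld_index_ht, 'PCSK9',
                             reference_genome='GRCh38')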
def get_filtered_mt(chrom: str = 'all',
                    pop: str = 'all',
                    imputed: bool = True,
                    min_mac: int = 20,
                    entry_fields=('GP', ),
                    filter_mac_instead_of_ac: bool = False):
    # Get AC or MAC depending on filter_mac_instead_of_ac
    def get_ac(af, an):
        if filter_mac_instead_of_ac:
            # Note that the underlying file behind get_ukb_af_ht_path() accidentally doubles af and halves an
            return (1.0 - hl.abs(1.0 - af)) * an
        else:
            return af * an

    if imputed:
        ht = hl.read_table(get_ukb_af_ht_path())
        if pop == 'all':
            ht = ht.filter(
                hl.any(lambda x: get_ac(ht.af[x], ht.an[x]) >= min_mac,
                       hl.literal(POPS)))
        else:
            ht = ht.filter(get_ac(ht.af[pop], ht.an[pop]) >= min_mac)
        mt = get_ukb_imputed_data(chrom,
                                  variant_list=ht,
                                  entry_fields=entry_fields)
    else:
        mt = hl.read_matrix_table('gs://ukb31063/ukb31063.genotype.mt')

    covariates_ht = get_covariates()
    hq_samples_ht = get_hq_samples()
    mt = mt.annotate_cols(**covariates_ht[mt.s])
    mt = mt.filter_cols(
        hl.is_defined(mt.pop) & hl.is_defined(hq_samples_ht[mt.s]))

    if pop != 'all':
        mt = mt.filter_cols(mt.pop == pop)
    return mt
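# Usage sketch (illustrative): MAC-filtered imputed dosages for one population on
# one chromosome, using the doubled-AF workaround described above. The population
# label is illustrative and depends on what POPS contains in this codebase.
eur_mt = get_filtered_mt(chrom='20', pop='EUR', min_mac=20,
                         filter_mac_instead_of_ac=True)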
def process_consequences(mt: hl.MatrixTable,
                         vep_root: str = 'vep',
                         penalize_flags: bool = True) -> hl.MatrixTable:
    """
    Adds most_severe_consequence (worst consequence for a transcript) into
    [vep_root].transcript_consequences, and worst_csq_by_gene, any_lof into
    [vep_root]

    :param MatrixTable mt: Input MT
    :param str vep_root: Root for vep annotation (probably vep)
    :param bool penalize_flags: Whether to penalize LOFTEE flagged variants, or treat them as equal to HC
    :return: MT with better formatted consequences
    :rtype: MatrixTable
    """
    csqs = hl.literal(CSQ_ORDER)
    csq_dict = hl.literal(dict(zip(CSQ_ORDER, range(len(CSQ_ORDER)))))

    def add_most_severe_consequence(
            tc: hl.expr.StructExpression) -> hl.expr.StructExpression:
        """
        Add most_severe_consequence annotation to transcript consequences.
        This is for a given transcript, as there are often multiple annotations
        for a single transcript, e.g. splice_region_variant&intron_variant ->
        splice_region_variant
        """
        return tc.annotate(most_severe_consequence=csqs.find(
            lambda c: tc.consequence_terms.contains(c)))

    def find_worst_transcript_consequence(
            tcl: hl.expr.ArrayExpression) -> hl.expr.StructExpression:
        """
        Gets the worst transcript_consequence from an array of them.
        """
        flag_score = 500
        no_flag_score = flag_score * (1 + penalize_flags)

        def csq_score(tc):
            return csq_dict[csqs.find(
                lambda x: x == tc.most_severe_consequence)]

        tcl = tcl.map(lambda tc: tc.annotate(
            csq_score=hl.case(missing_false=True)
            .when((tc.lof == 'HC') & (tc.lof_flags == ''),
                  csq_score(tc) - no_flag_score)
            .when((tc.lof == 'HC') & (tc.lof_flags != ''),
                  csq_score(tc) - flag_score)
            .when(tc.lof == 'LC', csq_score(tc) - 10)
            .when(tc.polyphen_prediction == 'probably_damaging',
                  csq_score(tc) - 0.5)
            .when(tc.polyphen_prediction == 'possibly_damaging',
                  csq_score(tc) - 0.25)
            .when(tc.polyphen_prediction == 'benign', csq_score(tc) - 0.1)
            .default(csq_score(tc))))
        return hl.or_missing(
            hl.len(tcl) > 0,
            hl.sorted(tcl, lambda x: x.csq_score)[0])

    transcript_csqs = mt[vep_root].transcript_consequences.map(
        add_most_severe_consequence)

    gene_dict = transcript_csqs.group_by(lambda tc: tc.gene_symbol)
    worst_csq_gene = gene_dict.map_values(find_worst_transcript_consequence)
    sorted_scores = hl.sorted(worst_csq_gene.values(),
                              key=lambda tc: tc.csq_score)
    lowest_score = hl.or_missing(
        hl.len(sorted_scores) > 0, sorted_scores[0].csq_score)
    gene_with_worst_csq = sorted_scores.filter(
        lambda tc: tc.csq_score == lowest_score).map(
            lambda tc: tc.gene_symbol)
    ensg_with_worst_csq = sorted_scores.filter(
        lambda tc: tc.csq_score == lowest_score).map(lambda tc: tc.gene_id)

    vep_data = mt[vep_root].annotate(
        transcript_consequences=transcript_csqs,
        worst_csq_by_gene=worst_csq_gene,
        any_lof=hl.any(lambda x: x.lof == 'HC', worst_csq_gene.values()),
        gene_with_most_severe_csq=gene_with_worst_csq,
        ensg_with_most_severe_csq=ensg_with_worst_csq)

    return mt.annotate_rows(**{vep_root: vep_data})
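# Usage sketch (illustrative): after VEP annotation, keep rows with any
# high-confidence LOFTEE loss-of-function call. The path is hypothetical, and
# CSQ_ORDER (a constant ordering of VEP consequence terms) must be in scope.
vepped_mt = hl.read_matrix_table('gs://my-bucket/vepped.mt')
vepped_mt = process_consequences(vepped_mt)
lof_mt = vepped_mt.filter_rows(vepped_mt.vep.any_lof)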
def main(args):
    subsets = args.subsets
    hl.init(
        log=f"/generate_frequency_data{'.' + '_'.join(subsets) if subsets else ''}.log",
        default_reference="GRCh38",
    )

    invalid_subsets = []
    n_subsets_use_subpops = 0
    for s in subsets:
        if s not in SUBSETS:
            invalid_subsets.append(s)
        if s in COHORTS_WITH_POP_STORED_AS_SUBPOP:
            n_subsets_use_subpops += 1

    if invalid_subsets:
        raise ValueError(
            f"{', '.join(invalid_subsets)} subset(s) are not one of the following official subsets: {SUBSETS}"
        )
    if n_subsets_use_subpops and (n_subsets_use_subpops != len(subsets)):
        raise ValueError(
            f"All or none of the supplied subset(s) should be in the list of cohorts that need to use subpops instead "
            f"of pops in frequency calculations: {COHORTS_WITH_POP_STORED_AS_SUBPOP}"
        )

    try:
        logger.info("Reading full sparse MT and metadata table...")
        mt = get_gnomad_v3_mt(
            key_by_locus_and_alleles=True,
            release_only=not args.include_non_release,
            samples_meta=True,
        )

        if args.test:
            logger.info("Filtering to two partitions on chr20")
            mt = hl.filter_intervals(
                mt, [hl.parse_locus_interval("chr20:1-1000000")])
            mt = mt._filter_partitions(range(2))

        mt = hl.experimental.sparse_split_multi(mt, filter_changed_loci=True)

        if args.include_non_release:
            logger.info("Filtering MT columns to high quality samples")
            total_sample_count = mt.count_cols()
            mt = mt.filter_cols(mt.meta.high_quality)
            high_quality_sample_count = mt.count_cols()
            logger.info(
                f"Filtered {total_sample_count - high_quality_sample_count} from the full set of {total_sample_count} "
                f"samples...")

        if subsets:
            mt = mt.filter_cols(hl.any([mt.meta.subsets[s] for s in subsets]))
            logger.info(
                f"Running frequency generation pipeline on {mt.count_cols()} samples in {', '.join(subsets)} subset(s)..."
            )
        else:
            logger.info(
                f"Running frequency generation pipeline on {mt.count_cols()} samples..."
            )

        logger.info("Computing adj and sex adjusted genotypes...")
        mt = mt.annotate_entries(
            GT=adjusted_sex_ploidy_expr(mt.locus, mt.GT,
                                        mt.meta.sex_imputation.sex_karyotype),
            adj=get_adj_expr(mt.GT, mt.GQ, mt.DP, mt.AD),
        )

        logger.info("Densify-ing...")
        mt = hl.experimental.densify(mt)
        mt = mt.filter_rows(hl.len(mt.alleles) > 1)

        # Temporary hotfix for depletion of homozygous alternate genotypes
        logger.info(
            "Setting het genotypes at sites with >1% AF (using v3.0 frequencies) and > 0.9 AB to homalt..."
        )
        # Load v3.0 allele frequencies to avoid an extra frequency calculation
        # NOTE: Using previous callset AF works for small incremental changes to a callset,
        # but we will need to revisit for large increments
        freq_ht = get_freq(version="3").ht()
        freq_ht = freq_ht.select(AF=freq_ht.freq[0].AF)

        mt = mt.annotate_entries(GT=hl.if_else(
            (freq_ht[mt.row_key].AF > 0.01)
            & mt.GT.is_het()
            & (mt.AD[1] / mt.DP > 0.9),
            hl.call(1, 1),
            mt.GT,
        ))

        logger.info("Generating frequency data...")
        if subsets:
            mt = annotate_freq(
                mt,
                sex_expr=mt.meta.sex_imputation.sex_karyotype,
                pop_expr=mt.meta.population_inference.pop
                if not n_subsets_use_subpops
                else mt.meta.project_meta.project_subpop,
                # NOTE: TGP and HGDP labeled populations are highly specific and are stored in the project_subpop meta field
            )

            # NOTE: no FAFs or popmax needed for subsets
            mt = mt.select_rows("freq")

            logger.info(
                f"Writing out frequency data for {', '.join(subsets)} subset(s)..."
            )
            if args.test:
                mt.rows().write(
                    get_checkpoint_path(
                        f"chr20_test_freq.{'_'.join(subsets)}"),
                    overwrite=True,
                )
            else:
                mt.rows().write(get_freq(subset="_".join(subsets)).path,
                                overwrite=args.overwrite)

        else:
            logger.info("Computing age histograms for each variant...")
            mt = mt.annotate_cols(age=hl.if_else(
                hl.is_defined(mt.meta.project_meta.age),
                mt.meta.project_meta.age,
                mt.meta.project_meta.age_alt,
                # NOTE: most age data is stored as integers in the 'age' annotation, but for a
                # select number of samples, age is stored as a bin range and 'age_alt'
                # corresponds to an integer in the middle of the bin
            ))
            mt = mt.annotate_rows(**age_hists_expr(mt.adj, mt.GT, mt.age))

            # Compute callset-wide age histogram global
            mt = mt.annotate_globals(age_distribution=mt.aggregate_cols(
                hl.agg.hist(mt.age, 30, 80, 10)))

            mt = annotate_freq(
                mt,
                sex_expr=mt.meta.sex_imputation.sex_karyotype,
                pop_expr=mt.meta.population_inference.pop,
                downsamplings=DOWNSAMPLINGS,
            )
            # Remove all loci with raw AC=0
            mt = mt.filter_rows(mt.freq[1].AC > 0)

            logger.info("Calculating InbreedingCoeff...")
            # NOTE: This is not the ideal location to calculate this, but added here to avoid another densify
            mt = mt.annotate_rows(
                InbreedingCoeff=bi_allelic_site_inbreeding_expr(mt.GT))

            logger.info("Computing filtering allele frequencies and popmax...")
            faf, faf_meta = faf_expr(mt.freq, mt.freq_meta, mt.locus,
                                     POPS_TO_REMOVE_FOR_POPMAX)
            mt = mt.select_rows(
                "InbreedingCoeff",
                "freq",
                faf=faf,
                popmax=pop_max_expr(mt.freq, mt.freq_meta,
                                    POPS_TO_REMOVE_FOR_POPMAX),
            )
            mt = mt.annotate_globals(
                faf_meta=faf_meta,
                faf_index_dict=make_faf_index_dict(faf_meta))
            mt = mt.annotate_rows(popmax=mt.popmax.annotate(
                faf95=mt.faf[mt.faf_meta.index(
                    lambda x: x.values() == ["adj", mt.popmax.pop])].faf95))

            logger.info("Annotating quality metrics histograms...")
            # NOTE: these are performed here as the quality metrics histograms also require densifying
            mt = mt.annotate_rows(
                qual_hists=qual_hist_expr(mt.GT, mt.GQ, mt.DP, mt.AD, mt.adj))
            ht = mt.rows()
            ht = ht.annotate(
                qual_hists=hl.Struct(
                    **{
                        i.replace("_adj", ""): ht.qual_hists[i]
                        for i in ht.qual_hists if "_adj" in i
                    }),
                raw_qual_hists=hl.Struct(**{
                    i: ht.qual_hists[i]
                    for i in ht.qual_hists if "_adj" not in i
                }),
            )

            logger.info("Writing out frequency data...")
            if args.test:
                ht.write(get_checkpoint_path("chr20_test_freq"),
                         overwrite=True)
            else:
                ht.write(get_freq().path, overwrite=args.overwrite)

    finally:
        logger.info("Copying hail log to logging bucket...")
        hl.copy_log(f"{qc_temp_prefix()}logs/")
# Mixture of non-empty with empty PL fields causes problems with sample QC for some reason; setting field to all empty
mt = mt.annotate_entries(PL=hl.missing(mt.PL.dtype))

# Add variant-level annotations necessary for variant QC later
## Annotate variants in one of the categories: SNV, multi-SNV, indel, multi-indel, mixed
mt = mt.annotate_rows(**add_variant_type(mt.alleles))
## Number of alleles at the site
mt = mt.annotate_rows(n_alleles=hl.len(mt.alleles))
## Mixed sites (SNVs and indels present at the site)
mt = mt.annotate_rows(mixed_site=mt.variant_type == "mixed")
## Spanning deletions
mt = mt.annotate_rows(spanning_deletion=hl.any(lambda a: a == "*", mt.alleles))

# Number of rows and columns
mt.count()

# Number of columns
mt.count_cols()

# Variants breakdown
hl.summarize_variants(mt)

# Split variants with multiple alleles into biallelic configuration. Note that mt.count()
# and hl.summarize_variants() will give different numbers after multi-allelic site
# splitting than before
mt = hl.split_multi_hts(mt)

# Remove monomorphic sites
mt = mt.filter_rows(mt.n_alleles > 1)
def load_icd_data(pre_phesant_data_path,
                  icd_codings_path,
                  temp_directory,
                  force_overwrite_intermediate: bool = False,
                  include_dates: bool = False,
                  icd9: bool = False):
    """
    Load raw (pre-PHESANT) phenotype data and extract ICD codes into a Hail
    MatrixTable with booleans as entries.

    :param str pre_phesant_data_path: Input phenotype file
    :param str icd_codings_path: Input coding metadata
    :param str temp_directory: Temp bucket/directory to write intermediate file
    :param bool force_overwrite_intermediate: Whether to overwrite intermediate loaded file
    :param bool include_dates: Whether to also load date data (not implemented yet)
    :param bool icd9: Whether to load ICD9 data
    :return: MatrixTable with ICD codes
    :rtype: MatrixTable
    """
    if icd9:
        code_locations = {'primary_codes': '41203', 'secondary_codes': '41205'}
    else:
        code_locations = {
            'primary_codes': '41202',
            'secondary_codes': '41204',
            'external_codes': '41201',
            'cause_of_death_codes': '40001'
        }
    date_locations = {'primary_codes': '41262'}

    ht = hl.import_table(pre_phesant_data_path,
                         impute=not icd9,
                         min_partitions=100,
                         missing='',
                         key='userId',
                         types={'userId': hl.tint32})
    ht = ht.checkpoint(f'{temp_directory}/pre_phesant.ht',
                       _read_if_exists=not force_overwrite_intermediate)
    all_phenos = list(ht.row_value)
    fields_to_select = {
        code: [ht[x] for x in all_phenos if x.startswith(f'x{loc}')]
        for code, loc in code_locations.items()
    }
    if include_dates:
        fields_to_select.update({
            f'date_{code}':
            [ht[x] for x in all_phenos if x.startswith(f'x{loc}')]
            for code, loc in date_locations.items()
        })
    ht = ht.select(**fields_to_select)
    ht = ht.annotate(
        **{
            code: ht[code].filter(lambda x: hl.is_defined(x))
            for code in code_locations
        },
        # **{f'date_{code}': ht[code].filter(lambda x: hl.is_defined(x)) for code in date_locations}
    )
    # ht = ht.annotate(primary_codes_with_date=hl.dict(hl.zip(ht.primary_codes, ht.date_primary_codes)))
    all_codes = hl.sorted(
        hl.array(
            hl.set(
                hl.flatmap(
                    lambda x: hl.array(x),
                    ht.aggregate([
                        hl.agg.explode(lambda c: hl.agg.collect_as_set(c),
                                       ht[code]) for code in code_locations
                    ],
                                 _localize=True)))))
    ht = ht.select(bool_codes=all_codes.map(lambda x: hl.struct(
        **{code: ht[code].contains(x)
           for code in code_locations})))
    ht = ht.annotate_globals(
        all_codes=all_codes.map(lambda x: hl.struct(icd_code=x)))
    mt = ht._unlocalize_entries('bool_codes', 'all_codes', ['icd_code'])
    mt = mt.annotate_entries(
        any_codes=hl.any(lambda x: x, list(mt.entry.values())))
    # mt = mt.annotate_entries(date=hl.cond(mt.primary_codes, mt.primary_codes_with_date[mt.icd_code], hl.null(hl.tstr)))
    mt = mt.annotate_cols(truncated=False).annotate_globals(
        code_locations=code_locations)
    mt = mt.checkpoint(f'{temp_directory}/raw_icd.mt',
                       _read_if_exists=not force_overwrite_intermediate)
    trunc_mt = mt.filter_cols((hl.len(mt.icd_code) == 3)
                              | (hl.len(mt.icd_code) == 4))
    trunc_mt = trunc_mt.key_cols_by(icd_code=trunc_mt.icd_code[:3])
    trunc_mt = trunc_mt.group_cols_by('icd_code').aggregate_entries(
        **{
            code: hl.agg.any(trunc_mt[code])
            for code in list(code_locations.keys()) + ['any_codes']
        }).aggregate_cols(n_phenos_truncated=hl.agg.count()).result()
    trunc_mt = trunc_mt.filter_cols(trunc_mt.n_phenos_truncated > 1)
    trunc_mt = trunc_mt.annotate_cols(
        **mt.cols().drop('truncated', 'code_locations')[trunc_mt.icd_code],
        truncated=True).drop('n_phenos_truncated')
    mt = mt.union_cols(trunc_mt)
    coding_ht = hl.read_table(icd_codings_path)
    return mt.annotate_cols(**coding_ht[mt.col_key])
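# Usage sketch (illustrative, paths hypothetical): load ICD10 codes into a
# sample-by-code boolean MatrixTable.
icd_mt = load_icd_data('gs://my-bucket/ukb_pre_phesant.tsv.bgz',
                       'gs://my-bucket/icd10_codings.ht',
                       'gs://my-bucket/tmp')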
def annotate_variants(mt):
    '''
    Takes a matrix table and annotates variants with gene, LOF, and missense
    annotations by parsing VEP annotations.

    :param mt: matrix table to annotate
    :return: returns matrix table with new row annotations gene, LOF, and missense.
    '''
    try:
        hl.is_defined(mt.row.was_split)
    except Exception as e:
        print('Split multi-allelics before running!')
        print(e)
        return

    # If there is no canonical and protein-coding transcript consequence for that variant,
    # give the gene corresponding to the most severe consequence.
    # If there is a canonical and protein-coding transcript consequence for that variant,
    # give the gene symbol associated with that transcript consequence.
    canon_pc = mt.row.vep.transcript_consequences.filter(
        lambda x: (x.canonical == 1) & (x.biotype == 'protein_coding'))
    most_severe = mt.row.vep.transcript_consequences.filter(
        lambda x: x.consequence_terms.contains(
            mt.row.vep.most_severe_consequence))

    mt = mt.annotate_rows(gene=hl.if_else(
        hl.any(lambda x: (x.canonical == 1) & (x.biotype == 'protein_coding'),
               mt.row.vep.transcript_consequences),
        canon_pc.map(lambda x: x.gene_symbol),
        most_severe.map(lambda x: x.gene_symbol)))

    # The above returns gene symbols for all canonical and protein-coding transcripts,
    # not just the one related to the most severe consequence. So we keep the above, but
    # also annotate the gene corresponding to the most severe consequence (useful for
    # synonymous, missense, and LOF annotations).
    canon_pc = mt.row.vep.transcript_consequences.filter(
        lambda x: (x.canonical == 1) & (x.biotype == 'protein_coding')
        & x.consequence_terms.contains(mt.vep.most_severe_consequence))
    most_severe = mt.vep.transcript_consequences.filter(
        lambda x: x.consequence_terms.contains(
            mt.row.vep.most_severe_consequence))

    mt = mt.annotate_rows(gene_most_severe_conseq=hl.if_else(
        hl.any(lambda x: (x.canonical == 1) & (x.biotype == 'protein_coding'),
               mt.vep.transcript_consequences),
        canon_pc.map(lambda x: x.gene_symbol),
        most_severe.map(lambda x: x.gene_symbol)))

    # True either if there is a canonical and protein-coding transcript consequence for
    # that variant whose lof annotation is defined and equal to HC with missing or blank
    # lof flags, or if there is no canonical and protein-coding transcript consequence
    # for that variant and the transcript consequence whose consequence terms contain the
    # most severe consequence term has lof defined and equal to HC with missing or blank
    # lof flags; false otherwise.
    canon_pc = mt.row.vep.transcript_consequences\
        .filter(lambda x: (x.canonical == 1) & (x.biotype == 'protein_coding'))
    most_severe = mt.row.vep.transcript_consequences\
        .filter(lambda x: x.consequence_terms.contains(
            mt.row.vep.most_severe_consequence))

    canon_bool = (
        hl.any(lambda x: (x.canonical == 1) & (x.biotype == 'protein_coding'),
               mt.row.vep.transcript_consequences)
        & hl.any(lambda x: hl.is_defined(x.lof), canon_pc)
        & (canon_pc.map(lambda x: x.lof) == ["HC"])
        & (hl.all(lambda x: hl.is_missing(x.lof_flags) | (x.lof_flags == ""),
                  canon_pc)))
    non_canon_bool = (
        ~(hl.any(
            lambda x: (x.canonical == 1) & (x.biotype == 'protein_coding'),
            mt.row.vep.transcript_consequences))
        & hl.any(lambda x: hl.is_defined(x.lof), most_severe)
        & (most_severe.map(lambda x: x.lof) == ["HC"])
        & (hl.all(lambda x: hl.is_missing(x.lof_flags) | (x.lof_flags == ""),
                  most_severe)))

    mt = mt.annotate_rows(LOF=hl.if_else(canon_bool | non_canon_bool, True,
                                         False))

    # True either if there is a canonical and protein-coding transcript consequence for
    # that variant whose consequence terms contain "missense_variant",
    # "inframe_deletion", or "inframe_insertion", or if there is no canonical and
    # protein-coding transcript consequence for that variant but the variant's most
    # severe consequence is one of those terms; false otherwise.
    canon_pc = mt.row.vep.transcript_consequences\
        .filter(lambda x: (x.canonical == 1) & (x.biotype == 'protein_coding'))
    canon_missense_bool = canon_pc.map(
        lambda x: x.consequence_terms).contains(["missense_variant"])
    noncanon_missense_bool = (
        ~(hl.any(
            lambda x: (x.canonical == 1) & (x.biotype == 'protein_coding'),
            mt.row.vep.transcript_consequences))
        & (mt.row.vep.most_severe_consequence == "missense_variant"))
    canon_inframe_bool = canon_pc.map(
        lambda x: x.consequence_terms).contains(["inframe_deletion"])
    noncanon_inframe_bool = (
        ~(hl.any(
            lambda x: (x.canonical == 1) & (x.biotype == 'protein_coding'),
            mt.row.vep.transcript_consequences))
        & (mt.row.vep.most_severe_consequence == "inframe_deletion"))
    canon_inframe_ins_bool = canon_pc.map(
        lambda x: x.consequence_terms).contains(["inframe_insertion"])
    noncanon_inframe_ins_bool = (
        ~(hl.any(
            lambda x: (x.canonical == 1) & (x.biotype == 'protein_coding'),
            mt.row.vep.transcript_consequences))
        & (mt.row.vep.most_severe_consequence == "inframe_insertion"))

    mt = mt.annotate_rows(
        missense=hl.if_else((canon_missense_bool | noncanon_missense_bool
                             | canon_inframe_bool | noncanon_inframe_bool
                             | canon_inframe_ins_bool
                             | noncanon_inframe_ins_bool), True, False))

    # If the most severe consequence is "synonymous_variant", true; else false
    mt = mt.annotate_rows(synonymous=hl.if_else(
        mt.row.vep.most_severe_consequence == "synonymous_variant", True,
        False))

    # When there is a transcript consequence for that variant that is canonical,
    # protein-coding, and lof == "HC", report its lof flags.
    # When there is no canonical and protein-coding transcript consequence for that
    # variant, but there is a transcript consequence whose consequence terms contain the
    # most severe consequence and whose lof == "HC", report its lof flags.
    # Otherwise report blank.
    canon_bool = hl.any(
        lambda x: (x.canonical == 1) & (x.biotype == 'protein_coding'),
        mt.row.vep.transcript_consequences)
    canon_hc_bool = hl.any(
        lambda x: (x.canonical == 1) & (x.biotype == 'protein_coding')
        & (x.lof == 'HC'), mt.row.vep.transcript_consequences)
    canon_pc_hc = mt.row.vep.transcript_consequences.filter(
        lambda x: (x.canonical == 1) & (x.biotype == 'protein_coding')
        & (x.lof == "HC"))
    most_severe_bool = hl.any(
        lambda x: (x.consequence_terms.contains(
            mt.row.vep.most_severe_consequence)) & (x.lof == 'HC'),
        mt.row.vep.transcript_consequences)
    most_severe_hc = mt.row.vep.transcript_consequences.filter(
        lambda x: (x.consequence_terms.contains(
            mt.row.vep.most_severe_consequence)) & (x.lof == "HC"))

    mt = mt.annotate_rows(LOF_flag=hl.case()
                          .when(canon_hc_bool,
                                canon_pc_hc.map(lambda x: x.lof_flags))
                          .when(~canon_bool & most_severe_bool,
                                most_severe_hc.map(lambda x: x.lof_flags))
                          .default([""]))

    return mt
def get_gene_intervals(gene_symbols=None, gene_ids=None, transcript_ids=None,
                       verbose=True, reference_genome=None, gtf_file=None):
    """Get intervals of genes or transcripts.

    Get the boundaries of genes or transcripts from a GTF file, for quick filtering of a Table or MatrixTable.

    On Google Cloud platform:
    Gencode v19 (GRCh37) GTF available at: gs://hail-common/references/gencode/gencode.v19.annotation.gtf.bgz
    Gencode v29 (GRCh38) GTF available at: gs://hail-common/references/gencode/gencode.v29.annotation.gtf.bgz

    Example
    -------
    >>> hl.filter_intervals(ht, get_gene_intervals(gene_symbols=['PCSK9'], reference_genome='GRCh37'))  # doctest: +SKIP

    Parameters
    ----------
    gene_symbols : :obj:`list` of :obj:`str`, optional
       Gene symbols (e.g. PCSK9).
    gene_ids : :obj:`list` of :obj:`str`, optional
       Gene IDs (e.g. ENSG00000223972).
    transcript_ids : :obj:`list` of :obj:`str`, optional
       Transcript IDs (e.g. ENST00000456328).
    verbose : :obj:`bool`
       If ``True``, print which genes and transcripts were matched in the GTF file.
    reference_genome : :obj:`str` or :class:`.ReferenceGenome`, optional
       Reference genome to use (passed along to import_gtf).
    gtf_file : :obj:`str`
       GTF file to load. If none is provided, but `reference_genome` is one of
       `GRCh37` or `GRCh38`, a default will be used (on Google Cloud Platform).

    Returns
    -------
    :obj:`list` of :class:`.Interval`
    """
    GTFS = {
        'GRCh37': 'gs://hail-common/references/gencode/gencode.v19.annotation.gtf.bgz',
        'GRCh38': 'gs://hail-common/references/gencode/gencode.v29.annotation.gtf.bgz',
    }

    if reference_genome is None:
        reference_genome = hl.default_reference().name
    if gtf_file is None:
        gtf_file = GTFS.get(reference_genome)
        if gtf_file is None:
            raise ValueError('get_gene_intervals requires a GTF file, or the reference genome be one of GRCh37 or GRCh38 (when on Google Cloud Platform)')

    if gene_symbols is None and gene_ids is None and transcript_ids is None:
        raise ValueError('get_gene_intervals requires at least one of gene_symbols, gene_ids, or transcript_ids')

    ht = hl.experimental.import_gtf(gtf_file,
                                    reference_genome=reference_genome,
                                    skip_invalid_contigs=True,
                                    min_partitions=12)
    ht = ht.annotate(gene_id=ht.gene_id.split('\\.')[0],
                     transcript_id=ht.transcript_id.split('\\.')[0])

    criteria = []
    if gene_symbols:
        criteria.append(hl.any(lambda y: (ht.feature == 'gene') & (ht.gene_name == y), gene_symbols))
    if gene_ids:
        criteria.append(hl.any(lambda y: (ht.feature == 'gene') & (ht.gene_id == y.split('\\.')[0]), gene_ids))
    if transcript_ids:
        criteria.append(hl.any(lambda y: (ht.feature == 'transcript') & (ht.transcript_id == y.split('\\.')[0]), transcript_ids))

    ht = ht.filter(functools.reduce(operator.ior, criteria))
    gene_info = ht.aggregate(hl.agg.collect((ht.feature, ht.gene_name, ht.gene_id, ht.transcript_id, ht.interval)))
    if verbose:
        info(f'get_gene_intervals found {len(gene_info)} entries:\n'
             + "\n".join(map(lambda x: f'{x[0]}: {x[1]} ({x[2] if x[0] == "gene" else x[3]})', gene_info)))
    intervals = list(map(lambda x: x[-1], gene_info))
    return intervals
def filter_by_frequency(
    t: Union[hl.MatrixTable, hl.Table],
    direction: str,
    frequency: float = None,
    allele_count: int = None,
    population: str = None,
    subpop: str = None,
    downsampling: int = None,
    keep: bool = True,
    adj: bool = True,
) -> Union[hl.MatrixTable, hl.Table]:
    """
    Filter MatrixTable or Table with gnomAD-format frequency data (assumed bi-allelic/split).

    gnomAD frequency data format expectation is:
    Array[Struct(Array[AC], Array[AF], AN, homozygote_count, meta)].

    At least one of frequency or allele_count is required.
    Subpop can be specified without a population if desired.

    :param t: Input MatrixTable or Table
    :param direction: One of "above", "below", and "equal" (how to apply the filter)
    :param frequency: Frequency to filter by (one of frequency or allele_count is required)
    :param allele_count: Allele count to filter by (one of frequency or allele_count is required)
    :param population: Population in which to filter frequency
    :param subpop: Sub-population in which to filter frequency
    :param downsampling: Downsampling in which to filter frequency
    :param keep: Whether to keep rows passing this frequency (passed to filter_rows)
    :param adj: Whether to use adj frequency
    :return: Filtered MatrixTable or Table
    """
    if frequency is None and allele_count is None:
        raise ValueError(
            "At least one of frequency or allele_count must be specified")
    if direction not in ("above", "below", "equal"):
        raise ValueError(
            'direction needs to be one of "above", "below", or "equal"')
    group = "adj" if adj else "raw"
    criteria = [lambda f: f.meta.get("group", "") == group]
    if frequency is not None:
        if direction == "above":
            criteria.append(lambda f: f.AF[1] > frequency)
        elif direction == "below":
            criteria.append(lambda f: f.AF[1] < frequency)
        else:
            criteria.append(lambda f: f.AF[1] == frequency)
    if allele_count is not None:
        if direction == "above":
            criteria.append(lambda f: f.AC[1] > allele_count)
        elif direction == "below":
            criteria.append(lambda f: f.AC[1] < allele_count)
        else:
            criteria.append(lambda f: f.AC[1] == allele_count)
    size = 1
    if population:
        criteria.append(lambda f: f.meta.get("pop", "") == population)
        size += 1
    if subpop:
        criteria.append(lambda f: f.meta.get("subpop", "") == subpop)
        size += 1
        # If one supplies a subpop but not a population, this ensures the size check is still correct
        if not population:
            size += 1
    if downsampling:
        criteria.append(
            lambda f: f.meta.get("downsampling", "") == str(downsampling))
        size += 1
        if not population:
            size += 1
            criteria.append(lambda f: f.meta.get("pop", "") == "global")
        if subpop:
            raise Exception(
                "No downsampling data for subpopulations implemented")
    criteria.append(lambda f: f.meta.size() == size)

    def combine_functions(func_list, x):
        cond = func_list[0](x)
        for c in func_list[1:]:
            cond &= c(x)
        return cond

    filt = lambda x: combine_functions(criteria, x)
    criteria = hl.any(filt, t.freq)
    return (t.filter_rows(criteria, keep=keep) if isinstance(
        t, hl.MatrixTable) else t.filter(criteria, keep=keep))
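# Usage sketch (illustrative): keep common variants (global adj AF above 1%) in a
# gnomAD-format sites Table. The path is hypothetical.
sites_ht = hl.read_table('gs://my-bucket/gnomad_sites.ht')
common_ht = filter_by_frequency(sites_ht, direction='above', frequency=0.01)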
def summarize_variant_filters(
    t: Union[hl.MatrixTable, hl.Table],
    variant_filter_field: str = "RF",
    problematic_regions: List[str] = ["lcr", "segdup", "nonpar"],
    single_filter_count: bool = False,
    monoallelic_expr: Optional[hl.expr.BooleanExpression] = None,
    extra_filter_checks: Optional[Dict[str, hl.expr.Expression]] = None,
    n_rows: int = 50,
    n_cols: int = 140,
) -> None:
    """
    Summarize variants filtered under various conditions in input MatrixTable or Table.

    Summarize counts for:
        - Total number of variants
        - Fraction of variants removed due to:
            - Any filter
            - Inbreeding coefficient filter in combination with any other filter
            - AC0 filter in combination with any other filter
            - `variant_filter_field` filtering in combination with any other filter
            - Only inbreeding coefficient filter
            - Only AC0 filter
            - Only `variant_filter_field` filtering

    :param t: Input MatrixTable or Table to be checked.
    :param variant_filter_field: String of variant filtration used in the filters annotation on `ht` (e.g. RF, VQSR, AS_VQSR). Default is "RF".
    :param problematic_regions: List of regions considered problematic to run filter check in. Default is ["lcr", "segdup", "nonpar"].
    :param single_filter_count: If True, explode the Table's filter column and give a supplemental total count of each filter. Default is False.
    :param monoallelic_expr: Optional boolean expression of monoallelic status that logs how many monoallelic sites are in the Table.
    :param extra_filter_checks: Optional dictionary containing filter condition name (key) and extra filter expressions (value) to be examined.
    :param n_rows: Number of rows to display only when showing percentages of filtered variants grouped by multiple conditions. Default is 50.
    :param n_cols: Number of columns to display only when showing percentages of filtered variants grouped by multiple conditions. Default is 140.
    :return: None
    """
    t = t.rows() if isinstance(t, hl.MatrixTable) else t

    filters = t.aggregate(hl.agg.counter(t.filters))
    logger.info("Variant filter counts: %s", filters)

    if single_filter_count:
        exp_t = t.explode(t.filters)
        filters = exp_t.aggregate(hl.agg.counter(exp_t.filters))
        logger.info("Exploded variant filter counts: %s", filters)

    if monoallelic_expr is not None:
        if isinstance(t, hl.MatrixTable):
            mono_sites = t.filter_rows(monoallelic_expr).count_rows()
        else:
            mono_sites = t.filter(monoallelic_expr).count()
        logger.info("There are %d monoallelic sites in the dataset.",
                    mono_sites)

    filtered_expr = hl.len(t.filters) > 0
    problematic_region_expr = hl.any(
        lambda x: x, [t.info[region] for region in problematic_regions])

    t = t.annotate(is_filtered=filtered_expr,
                   in_problematic_region=problematic_region_expr)

    def _filter_agg_order(
        t: Union[hl.MatrixTable, hl.Table],
        group_exprs: Dict[str, hl.expr.Expression],
        n_rows: Optional[int] = None,
        n_cols: Optional[int] = None,
    ) -> None:
        """
        Perform validity checks to measure percentages of variants filtered under different conditions.

        :param t: Input MatrixTable or Table.
        :param group_exprs: Dictionary of expressions to group the Table by.
        :param n_rows: Number of rows to show. Default is None (to display 10 rows).
        :param n_cols: Number of columns to show. Default is None (to display 10 cols).
        :return: None
        """
        t = t.rows() if isinstance(t, hl.MatrixTable) else t
        # NOTE: make_filters_expr_dict returns a dict with %ages of variants filtered
        t.group_by(**group_exprs).aggregate(**make_filters_expr_dict(
            t, extra_filter_checks, variant_filter_field)).order_by(
                hl.desc("n")).show(n_rows, n_cols)

    logger.info(
        "Checking distributions of filtered variants amongst variant filters..."
    )
    _filter_agg_order(t, {"is_filtered": t.is_filtered})

    logger.info(
        "Checking distributions of variant type amongst variant filters...")
    _filter_agg_order(t, {"allele_type": t.info.allele_type})

    logger.info(
        "Checking distributions of variant type and region type amongst variant filters..."
    )
    _filter_agg_order(
        t,
        {
            "allele_type": t.info.allele_type,
            "in_problematic_region": t.in_problematic_region,
        },
        n_rows,
        n_cols,
    )

    logger.info(
        "Checking distributions of variant type, region type, and number of alt alleles amongst variant filters..."
    )
    _filter_agg_order(
        t,
        {
            "allele_type": t.info.allele_type,
            "in_problematic_region": t.in_problematic_region,
            "n_alt_alleles": t.info.n_alt_alleles,
        },
        n_rows,
        n_cols,
    )
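# Usage sketch (illustrative): summarize filters on a gnomAD-style sites Table
# with `filters` and `info` annotations. The path is hypothetical.
release_ht = hl.read_table('gs://my-bucket/release_sites.ht')
summarize_variant_filters(release_ht,
                          variant_filter_field='AS_VQSR',
                          single_filter_count=True)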