def add_filters_expr(
    filters: Dict[str, hl.expr.BooleanExpression],
    current_filters: Optional[hl.expr.SetExpression] = None,
) -> hl.expr.SetExpression:
    """
    Create an expression to create or add filters.

    For each entry in the `filters` dictionary, if the value evaluates to
    `True`, then the key is added as a filter name. Current filters are kept
    if provided using `current_filters`.

    :param filters: The filters and their expressions; each key is added to
        the result set wherever its expression is True
    :param current_filters: The set of current filters to keep (defaults to
        an empty set when None)
    :return: An expression that can be used to annotate the filters
    """
    if current_filters is None:
        current_filters = hl.empty_set(hl.tstr)

    # Fold a union over one singleton-or-empty set per filter condition,
    # starting from the existing filters.
    return hl.fold(
        lambda acc, s: acc.union(s),
        current_filters,
        [
            hl.cond(filter_condition, hl.set([filter_name]),
                    hl.empty_set(hl.tstr))
            for filter_name, filter_condition in filters.items()
        ],
    )
def test_complex_round_trips():
    """Round-trip a variety of compound Hail expressions."""
    cases = [
        hl.struct(),
        hl.empty_array(hl.tint32),
        hl.empty_set(hl.tint32),
        hl.empty_dict(hl.tint32, hl.tint32),
        hl.locus('1', 100),
        hl.struct(x=3),
        hl.set([3, 4, 5, 3]),
        hl.array([3, 4, 5]),
        hl.dict({3: 'a', 4: 'b', 5: 'c'}),
        hl.struct(
            x=hl.dict({3: 'a', 4: 'b', 5: 'c'}),
            y=hl.array([3, 4, 5]),
            z=hl.set([3, 4, 5, 3]),
        ),
    ]
    for expression in cases:
        assert_round_trip(expression)
def pop_max_expr(
    freq: hl.expr.ArrayExpression,
    freq_meta: hl.expr.ArrayExpression,
    pops_to_exclude: Optional[Set[str]] = None,
) -> hl.expr.StructExpression:
    """
    Create an expression containing the frequency information about the
    population that has the highest AF in `freq_meta`.

    Populations specified in `pops_to_exclude` are excluded and only
    frequencies from adj populations are considered.

    This resulting struct contains the following fields:

        - AC: int32
        - AF: float64
        - AN: int32
        - homozygote_count: int32
        - pop: str

    :param freq: ArrayExpression of Structs with fields ['AC', 'AF', 'AN', 'homozygote_count']
    :param freq_meta: ArrayExpression of meta dictionaries corresponding to freq (as returned by annotate_freq)
    :param pops_to_exclude: Set of populations to skip for popmax calculation
    :return: Popmax struct
    """
    # Lift the Python-side exclusion set into a Hail expression; an empty set
    # means nothing is excluded.
    _pops_to_exclude = (hl.literal(pops_to_exclude)
                        if pops_to_exclude is not None else
                        hl.empty_set(hl.tstr))

    # pylint: disable=invalid-unary-operand-type
    # Keep only freq entries stratified by exactly {"group", "pop"}, with adj
    # genotypes, whose population is not excluded.
    popmax_freq_indices = hl.range(0, hl.len(freq_meta)).filter(
        lambda i: (hl.set(freq_meta[i].keys()) == {"group", "pop"})
        & (freq_meta[i]["group"] == "adj")
        & (~_pops_to_exclude.contains(freq_meta[i]["pop"])))
    # Attach the population name to each qualifying freq struct and drop
    # entries with no alt alleles observed (AC == 0).
    freq_filtered = popmax_freq_indices.map(lambda i: freq[i].annotate(
        pop=freq_meta[i]["pop"])).filter(lambda f: f.AC > 0)
    # Highest AF first; result is missing when no population qualifies.
    sorted_freqs = hl.sorted(freq_filtered, key=lambda x: x.AF, reverse=True)
    return hl.or_missing(hl.len(sorted_freqs) > 0, sorted_freqs[0])
def prepare_genes(gencode_path, hgnc_path, reference_genome):
    """Build the genes table by joining GENCODE gene models with HGNC data.

    Annotates each gene with a display symbol (HGNC symbol, falling back to
    the GENCODE symbol), an upper-cased symbol, upper-cased search terms
    (current, GENCODE, previous, and alias symbols), and the reference genome
    on both the gene and each of its transcripts.
    """
    ds = import_gencode(gencode_path, reference_genome)
    hgnc_table = import_hgnc(hgnc_path)
    ds = ds.annotate(**hgnc_table[ds.gene_id])

    # If a symbol was not present in HGNC data, use the symbol from GENCODE
    ds = ds.annotate(symbol=hl.or_else(ds.symbol, ds.gencode_symbol))

    ds = ds.annotate(
        symbol_upper_case=ds.symbol.upper(),
        search_terms=(
            hl.empty_set(hl.tstr)
            .add(ds.symbol)
            .add(ds.gencode_symbol)
            .union(ds.previous_symbols)
            .union(ds.alias_symbols)
            .map(lambda s: s.upper())
        ),
    )

    ds = ds.annotate(
        reference_genome=reference_genome,
        transcripts=ds.transcripts.map(
            lambda transcript: transcript.annotate(
                reference_genome=reference_genome)),
    )

    return ds
def faf_expr(
    freq: hl.expr.ArrayExpression,
    freq_meta: hl.expr.ArrayExpression,
    locus: hl.expr.LocusExpression,
    pops_to_exclude: Optional[Set[str]] = None,
    faf_thresholds: Optional[List[float]] = None,
) -> Tuple[hl.expr.ArrayExpression, List[Dict[str, str]]]:
    """
    Calculate the filtering allele frequency (FAF) for each threshold
    specified in `faf_thresholds`.

    See http://cardiodb.org/allelefrequencyapp/ for more information.

    The FAF is computed for each of the following population stratification
    if found in `freq_meta`:

        - All samples, with adj criteria
        - For each population, with adj criteria
        - For all sex/population on the non-PAR regions of sex chromosomes
          (will be missing on autosomes and PAR regions of sex chromosomes)

    Each of the FAF entry is a struct with one entry per threshold specified
    in `faf_thresholds` of type float64.

    This returns a tuple with two expressions:

        1. An array of FAF expressions as described above
        2. An array of dict containing the metadata for each of the array
           elements, in the same format as that produced by `annotate_freq`

    :param freq: ArrayExpression of call stats structs (typically generated by hl.agg.call_stats)
    :param freq_meta: ArrayExpression of meta dictionaries corresponding to freq (typically generated using annotate_freq)
    :param locus: locus
    :param pops_to_exclude: Set of populations to exclude from faf calculation (typically bottlenecked or consanguineous populations)
    :param faf_thresholds: List of FAF thresholds to compute (defaults to [0.95, 0.99])
    :return: (FAF expression, FAF metadata)
    """
    # None default avoids a shared mutable default argument; behavior is
    # unchanged for callers relying on the previous [0.95, 0.99] default.
    if faf_thresholds is None:
        faf_thresholds = [0.95, 0.99]

    _pops_to_exclude = (hl.literal(pops_to_exclude)
                        if pops_to_exclude is not None else
                        hl.empty_set(hl.tstr))

    # pylint: disable=invalid-unary-operand-type
    # Indices of the overall adj entry (size == 1, i.e. only "group") plus
    # each non-excluded adj population entry.
    faf_freq_indices = hl.range(0, hl.len(freq_meta)).filter(
        lambda i: (freq_meta[i].get("group") == "adj")
        & ((freq_meta[i].size() == 1)
           | ((hl.set(freq_meta[i].keys()) == {"pop", "group"})
              & (~_pops_to_exclude.contains(freq_meta[i]["pop"])))))
    # Same selection, restricted to entries additionally stratified by sex;
    # these are only emitted outside autosomes/PAR regions below.
    sex_faf_freq_indices = hl.range(0, hl.len(freq_meta)).filter(
        lambda i: (freq_meta[i].get("group") == "adj")
        & (freq_meta[i].contains("sex"))
        & ((freq_meta[i].size() == 2)
           | ((hl.set(freq_meta[i].keys()) == {"pop", "group", "sex"})
              & (~_pops_to_exclude.contains(freq_meta[i]["pop"])))))

    # One struct per selected index, with a faf<threshold> field per
    # threshold (e.g. "faf95" for 0.95 — str(0.95)[2:] == "95").
    # Local renamed from `faf_expr` to avoid shadowing the function name.
    faf_exprs = faf_freq_indices.map(lambda i: hl.struct(
        **{
            f"faf{str(threshold)[2:]}":
            hl.experimental.filtering_allele_frequency(
                freq[i].AC, freq[i].AN, threshold)
            for threshold in faf_thresholds
        }))
    faf_exprs = faf_exprs.extend(
        sex_faf_freq_indices.map(lambda i: hl.or_missing(
            ~locus.in_autosome_or_par(),
            hl.struct(
                **{
                    f"faf{str(threshold)[2:]}":
                    hl.experimental.filtering_allele_frequency(
                        freq[i].AC, freq[i].AN, threshold)
                    for threshold in faf_thresholds
                }),
        )))

    # Metadata rows in the same order as the FAF array entries.
    faf_meta = faf_freq_indices.extend(sex_faf_freq_indices).map(
        lambda i: freq_meta[i])
    return faf_exprs, hl.eval(faf_meta)
def get_expr_for_vep_sorted_transcript_consequences_array(
        vep_root,
        include_coding_annotations=True,
        omit_consequences=OMIT_CONSEQUENCE_TERMS):
    """Sort transcripts by 3 properties:

    1. coding > non-coding
    2. transcript consequence severity
    3. canonical > non-canonical

    so that the 1st array entry will be for the coding, most-severe,
    canonical transcript (assuming one exists).

    Also, for each transcript in the array, computes these additional fields:
        domains: converts Array[Struct] to string of comma-separated domain names
        hgvs: set to hgvsp if it exists, or else hgvsc. formats hgvsp for synonymous variants.
        major_consequence: set to most severe consequence for that transcript (
            VEP sometimes provides multiple consequences for a single transcript)
        major_consequence_rank: major_consequence rank based on VEP SO ontology (most severe = 1)
            (see http://www.ensembl.org/info/genome/variation/predicted_data.html)
        category: set to one of: "lof", "missense", "synonymous", "other" based on the value of major_consequence.

    Args:
        vep_root (StructExpression): root path of the VEP struct in the MT
        include_coding_annotations (bool): if True, fields relevant to protein-coding variants will be included
    """

    # Fields kept for every transcript consequence.
    selected_annotations = [
        "biotype",
        "canonical",
        "cdna_start",
        "cdna_end",
        "codons",
        "gene_id",
        "gene_symbol",
        "hgvsc",
        "hgvsp",
        "transcript_id",
    ]

    if include_coding_annotations:
        selected_annotations.extend([
            "amino_acids",
            "lof",
            "lof_filter",
            "lof_flags",
            "lof_info",
            "polyphen_prediction",
            "protein_id",
            "protein_start",
            "sift_prediction",
        ])

    # Consequence terms to drop; a falsy `omit_consequences` keeps everything.
    omit_consequence_terms = hl.set(
        omit_consequences) if omit_consequences else hl.empty_set(hl.tstr)

    result = hl.sorted(
        vep_root.transcript_consequences.map(lambda c: c.select(
            *selected_annotations,
            consequence_terms=c.consequence_terms.filter(
                lambda t: ~omit_consequence_terms.contains(t)),
            domains=c.domains.map(lambda domain: domain.db + ":" + domain.name
                                  ),
            # Most severe = lowest rank in CONSEQUENCE_TERM_RANK_LOOKUP;
            # missing when all terms were filtered out above.
            major_consequence=hl.cond(
                c.consequence_terms.size() > 0,
                hl.sorted(c.consequence_terms,
                          key=lambda t: CONSEQUENCE_TERM_RANK_LOOKUP.get(t))[0
                                                                             ],
                hl.null(hl.tstr),
            ))).filter(lambda c: c.consequence_terms.size() > 0).
        # Bucket each transcript by severity of its major consequence and
        # attach the formatted HGVS string and numeric rank.
        map(lambda c: c.annotate(
            category=(hl.case().when(
                CONSEQUENCE_TERM_RANK_LOOKUP.get(c.major_consequence) <=
                CONSEQUENCE_TERM_RANK_LOOKUP.get("frameshift_variant"),
                "lof",
            ).when(
                CONSEQUENCE_TERM_RANK_LOOKUP.get(c.major_consequence) <=
                CONSEQUENCE_TERM_RANK_LOOKUP.get("missense_variant"),
                "missense",
            ).when(
                CONSEQUENCE_TERM_RANK_LOOKUP.get(c.major_consequence) <=
                CONSEQUENCE_TERM_RANK_LOOKUP.get("synonymous_variant"),
                "synonymous",
            ).default("other")),
            hgvs=get_expr_for_formatted_hgvs(c),
            major_consequence_rank=CONSEQUENCE_TERM_RANK_LOOKUP.get(
                c.major_consequence),
        )),
        # Sort key 1-8: coding beats non-coding, then most-severe beats not,
        # then canonical beats non-canonical.
        lambda c: (hl.bind(
            lambda is_coding, is_most_severe, is_canonical: (hl.cond(
                is_coding,
                hl.cond(is_most_severe, hl.cond(is_canonical, 1, 2),
                        hl.cond(is_canonical, 3, 4)),
                hl.cond(is_most_severe, hl.cond(is_canonical, 5, 6),
                        hl.cond(is_canonical, 7, 8)),
            )),
            hl.or_else(c.biotype, "") == "protein_coding",
            hl.set(c.consequence_terms).contains(vep_root.
                                                 most_severe_consequence),
            hl.or_else(c.canonical, 0) == 1,
        )),
    )

    if not include_coding_annotations:
        # for non-coding variants, drop fields here that are hard to exclude in the above code
        result = result.map(lambda c: c.drop("domains", "hgvsp"))

    # Record each transcript's position in the sorted order as its rank.
    return hl.zip_with_index(result).map(lambda csq_with_index: csq_with_index[
        1].annotate(transcript_rank=csq_with_index[0]))
gnomad_svs_AF=hl.null('float'), pos=10000, filters=['LOW_CALL_RATE'], xpos=1000010000, cpx_intervals=NULL_INTERVALS, xstart=1000010000, xstop=1000017000, svType='DUP', transcriptConsequenceTerms=['DUP'], sv_type_detail=hl.null('str'), sortedTranscriptConsequences=[ hl.struct(gene_symbol='OR4F5', gene_id='ENSG00000284662', predicted_consequence='NEAREST_TSS') ], geneIds=hl.empty_set(hl.dtype('str')), samples_no_call=EMPTY_STR_ARRAY, samples_num_alt_1=['SAMPLE-1', 'SAMPLE-2', 'SAMPLE-3'], samples_num_alt_2=EMPTY_STR_ARRAY, genotypes=[ hl.struct(sample_id='SAMPLE-1', gq=999, num_alt=1, cn=3), hl.struct(sample_id='SAMPLE-2', gq=52, num_alt=1, cn=3), hl.struct(sample_id='SAMPLE-3', gq=19, num_alt=1,