def annotate_sex(ds: Union[hl.MatrixTable, hl.Table],
                 f_stat_field: str = 'f_stat',
                 sex_field: str = 'sex',
                 female_upper_threshold: float = 0.4,
                 male_lower_threshold: float = 0.6,
                 ) -> Union[hl.MatrixTable, hl.Table]:
    """
    Annotate sex (0-female/1-male) from the F-stat (inbreeding coefficient computed on chr X).

    Samples whose F-stat lies strictly between the two thresholds are annotated as missing.

    :param ds: Input MatrixTable or HailTable
    :param f_stat_field: Name of the field holding the F-stat
    :param sex_field: Name of the sex field to be annotated
    :param female_upper_threshold: F-stat values at or below this are called female (0)
    :param male_lower_threshold: F-stat values at or above this are called male (1)
    :return: Annotated ds (column annotation for a MatrixTable, row annotation for a Table)
    """
    f_stat = ds[f_stat_field]
    # The female check comes first, so it wins if the thresholds overlap.
    sex_expr = (
        hl.case()
        .when(f_stat <= female_upper_threshold, 0)
        .when(f_stat >= male_lower_threshold, 1)
        .or_missing()
    )
    new_fields = {sex_field: sex_expr}

    if isinstance(ds, hl.MatrixTable):
        return ds.annotate_cols(**new_fields)
    return ds.annotate(**new_fields)
def solve(p_de_novo):
    """
    Build the case expression that turns a de novo probability into a call.

    Returns `failure` when any of the quality filters trips, a struct
    (p_de_novo, confidence) with confidence 'HIGH'/'MEDIUM'/'LOW' when one of
    the confidence rules matches, and missing otherwise. Non-SNPs and SNPs use
    separate rule sets. All thresholds and genotype expressions come from the
    enclosing scope.
    """
    def call(confidence):
        return hl.struct(p_de_novo=p_de_novo, confidence=confidence)

    kid_ab_above_03 = kid_ad_ratio > 0.3
    single_alt = n_alt_alleles == 1
    parent_ad_total = hl.sum(parent.AD)

    # Rules applied when the variant is not a SNP.
    non_snp_call = (
        hl.case()
        .when((p_de_novo > 0.99) & kid_ab_above_03 & single_alt, call('HIGH'))
        .when((p_de_novo > 0.5) & kid_ab_above_03 & (n_alt_alleles <= 5), call('MEDIUM'))
        .when((p_de_novo > 0.05) & kid_ab_above_03, call('LOW'))
        .or_missing()
    )

    # Rules applied to SNPs (the default branch below).
    snp_high = (
        ((p_de_novo > 0.99) & kid_ab_above_03 & (dp_ratio > 0.2))
        | ((p_de_novo > 0.99) & kid_ab_above_03 & single_alt)
        | ((p_de_novo > 0.5) & kid_ab_above_03 & (n_alt_alleles < 10) & (kid.DP > 10))
    )
    snp_call = (
        hl.case()
        .when(snp_high, call('HIGH'))
        .when((p_de_novo > 0.5) & (kid_ab_above_03 | single_alt), call('MEDIUM'))
        .when((p_de_novo > 0.05) & (kid_ad_ratio > 0.2), call('LOW'))
        .or_missing()
    )

    # Quality filters are checked first, in this order, before any confidence rule.
    return (
        hl.case()
        .when(kid.GQ < min_gq, failure)
        .when((kid.DP / (parent.DP) < min_dp_ratio) | (kid_ad_ratio < min_child_ab), failure)
        .when((parent_ad_total == 0), failure)
        .when(parent.AD[1] / parent_ad_total > max_parent_ab, failure)
        .when(p_de_novo < min_p, failure)
        .when(~is_snp, non_snp_call)
        .default(snp_call)
    )
def kyle_sex_specific_qc(mt_path):
    """
    Compute sex-aware call rate, AC and AN per variant and force evaluation.

    Sex is assigned at random ('Male'/'Female' with probability 0.5) to every
    column of the MatrixTable read from `mt_path`; the row table is then
    force-counted so that all annotations are actually computed.
    """
    mt = hl.read_matrix_table(mt_path)
    mt = mt.annotate_cols(sex=hl.cond(hl.rand_bool(0.5), 'Male', 'Female'))

    is_male = mt.sex == 'Male'
    is_female = mt.sex == 'Female'
    num_males, num_females = mt.aggregate_cols(
        (hl.agg.count_where(is_male), hl.agg.count_where(is_female)))

    is_het = mt.GT.is_het()
    is_hom_var = mt.GT.is_hom_var()
    is_called = hl.is_defined(mt.GT)
    mt = mt.annotate_rows(
        male_hets=hl.agg.count_where(is_het & is_male),
        male_homvars=hl.agg.count_where(is_hom_var & is_male),
        male_calls=hl.agg.count_where(is_called & is_male),
        female_hets=hl.agg.count_where(is_het & is_female),
        female_homvars=hl.agg.count_where(is_hom_var & is_female),
        female_calls=hl.agg.count_where(is_called & is_female),
    )

    on_y_nonpar = mt.locus.in_y_nonpar()
    on_x_nonpar = mt.locus.in_x_nonpar()

    # Y non-PAR uses males only; X non-PAR weights females twice; everything else is diploid.
    call_rate = (
        hl.case()
        .when(on_y_nonpar, (mt.male_calls / num_males))
        .when(on_x_nonpar, (mt.male_calls + 2 * mt.female_calls) / (num_males + 2 * num_females))
        .default((mt.male_calls + mt.female_calls) / (num_males + num_females))
    )
    allele_count = (
        hl.case()
        .when(on_y_nonpar, mt.male_homvars)
        .when(on_x_nonpar, mt.male_homvars + mt.female_hets + 2 * mt.female_homvars)
        .default(mt.male_hets + 2 * mt.male_homvars + mt.female_hets + 2 * mt.female_homvars)
    )
    allele_number = (
        hl.case()
        .when(on_y_nonpar, mt.male_calls)
        .when(on_x_nonpar, mt.male_calls + 2 * mt.female_calls)
        .default(2 * mt.male_calls + 2 * mt.female_calls)
    )
    mt = mt.annotate_rows(call_rate=call_rate, AC=allele_count, AN=allele_number)

    mt.rows()._force_count()
def pheno_ht_to_mt(pheno_ht: hl.Table, data_type: str, special_fields: str = ('age', 'sex'), rekey: bool = True):
    """
    Distill a Table with many phenotype row fields into a MatrixTable whose
    entries are either categorical (boolean) or continuous (float) values.

    :param Table pheno_ht: Input hail Table with phenotypes as row fields
    :param str data_type: "categorical" selects boolean fields; any other value
        selects int/float fields and converts them to float
    :param special_fields: Field names that, when present, are kept as row fields
        instead of being turned into phenotype columns. NOTE: annotated as `str`
        but iterated as a collection of names (the default is a tuple).
    :param bool rekey: If True, re-key columns by trait_type, phenocode, pheno_sex,
        coding and modifier, derived from the '_'-separated column name
    :return: Hail MatrixTable with phenotypes as entries
    :rtype: MatrixTable
    """
    if data_type == 'categorical':
        filter_type, value_type = {hl.tbool}, hl.bool
    else:
        filter_type, value_type = {hl.tint, hl.tfloat}, hl.float

    fields = set(pheno_ht.row_value)
    special_fields_to_include = []
    for field in special_fields:
        if field not in fields:
            continue
        fields.remove(field)
        special_fields_to_include.append(field)

    select_fields = {
        name: value_type(pheno_ht[name])
        for name in fields
        if pheno_ht[name].dtype in filter_type
    }
    pheno_ht = pheno_ht.select(*special_fields_to_include, **select_fields)
    mt = pheno_ht.to_matrix_table_row_major(
        columns=list(select_fields),
        entry_field_name='value',
        col_field_name='phesant_pheno',
    )
    if not rekey:
        return mt

    name_parts = mt.phesant_pheno.split('_')
    has_suffix = hl.len(name_parts) > 1
    # TODO: fix to 1 when https://github.com/hail-is/hail/issues/7893 is fixed
    suffix = mt.phesant_pheno.split('_', 2)[1]
    coding = (hl.case()
              .when((data_type == 'categorical') & has_suffix, suffix)
              .default(NULL_STR_KEY))
    modifier = (hl.case()
                .when((data_type == 'continuous') & has_suffix, suffix)
                .default(NULL_STR_KEY))
    return mt.key_cols_by(
        trait_type=data_type,
        phenocode=name_parts[0],
        pheno_sex='both_sexes',
        coding=coding,
        modifier=modifier,
    )
def make_sample_rank_table(phe_ht: hl.Table) -> hl.Table:
    """
    Make a table ranking samples by retention priority (lower rank has higher priority).

    Two pieces of information drive the ordering:

      - cases are prioritised over controls
      - cohorts are preferred in the order chd > ddd > ukbb

    :param phe_ht: Table with sample meta-data annotations (e.g. phenotype, cohort info...)
    :return: Hail Table keyed by 'ega_id' with a 1-based 'rank' field
    """
    # When several cohort flags are set, the first matching branch (ukbb, ddd, chd) wins.
    cohort_rank = (
        hl.case()
        .when(phe_ht.is_ukbb, 10)
        .when(phe_ht.is_ddd, 100)
        .when(phe_ht.is_chd, 1000)
        .or_missing()
    )
    ranked = phe_ht.annotate(
        case_control_rank=hl.int(phe_ht['phe.is_case']),  # 0: control, 1: cases
        cohort_rank=cohort_rank,
    )
    ranked = ranked.key_by().select('ega_id', 'case_control_rank', 'cohort_rank')

    # Sort descending on both criteria, then number the rows.
    ranked = ranked.order_by(hl.desc(ranked.case_control_rank),
                             hl.desc(ranked.cohort_rank))
    ranked = ranked.add_index(name='rank').key_by('ega_id')
    # add_index is 0-based; shift so the best sample has rank 1.
    return ranked.annotate(rank=ranked.rank + 1)
def get_worst_gene_csq_code_expr(vep_expr: hl.expr.StructExpression) -> hl.expr.DictExpression:
    """
    Map each gene to its protein-coding transcript consequence with the lowest CSQ code.

    Protein-coding transcript consequences are reduced to (gene_id, gene_symbol, csq),
    where csq is an index into CSQ_CODES. Transcripts matching none of the categories
    are dropped.

    :param vep_expr: VEP annotation struct with a `transcript_consequences` array
    :return: Dict from gene_id to the struct with the smallest csq for that gene
    """
    def with_csq_code(ts):
        csq = (
            hl.case(missing_false=True)
            .when(ts.lof == 'HC', CSQ_CODES.index('lof'))
            .when(ts.polyphen_prediction == 'probably_damaging', CSQ_CODES.index('damaging_missense'))
            .when(ts.consequence_terms.any(lambda x: x == 'missense_variant'), CSQ_CODES.index('missense_variant'))
            .when(ts.consequence_terms.all(lambda x: x == 'synonymous_variant'), CSQ_CODES.index('synonymous_variant'))
            .or_missing()
        )
        return ts.select('gene_id', 'gene_symbol', csq=csq)

    protein_coding = vep_expr.transcript_consequences.filter(
        lambda tc: tc.biotype == 'protein_coding')
    coded = protein_coding.map(with_csq_code).filter(lambda x: hl.is_defined(x.csq))
    by_gene = coded.group_by(lambda x: x.gene_id)
    return by_gene.map_values(lambda x: hl.sorted(x, key=lambda y: y.csq)[0])
def find_worst_transcript_consequence(
    tcl: hl.expr.ArrayExpression,
) -> hl.expr.StructExpression:
    """
    Pick the transcript consequence with the lowest score from an array of them.

    Each element gets a `csq_score`: the `csq_dict` entry for its most severe
    consequence, lowered by a bonus that depends on LOFTEE / PolyPhen annotations.
    The lowest-scoring element is returned, or missing when the array is empty.
    """
    flag_score = 500
    no_flag_score = flag_score * (1 + penalize_flags)

    def csq_score(tc):
        return csq_dict[csqs.find(
            lambda x: x == tc.most_severe_consequence)]

    def adjusted_score(tc):
        base = csq_score(tc)
        is_high_confidence = tc.lof == "HC"
        return (
            hl.case(missing_false=True)
            .when(is_high_confidence & (tc.lof_flags == ""), base - no_flag_score)
            .when(is_high_confidence & (tc.lof_flags != ""), base - flag_score)
            .when(tc.lof == "OS", base - 20)
            .when(tc.lof == "LC", base - 10)
            .when(tc.polyphen_prediction == "probably_damaging", base - 0.5)
            .when(tc.polyphen_prediction == "possibly_damaging", base - 0.25)
            .when(tc.polyphen_prediction == "benign", base - 0.1)
            .default(base)
        )

    scored = tcl.map(lambda tc: tc.annotate(csq_score=adjusted_score(tc)))
    best = hl.sorted(scored, lambda x: x.csq_score)[0]
    return hl.or_missing(hl.len(scored) > 0, best)
def check_sex(
    sex_ht: hl.Table,
    output_dir: str,
    output_name: str,
) -> None:
    """
    Compare inferred sex to given sex and export a file flagging discrepancies.

    The output directory and prefix are also used to locate the functioning
    pedigree (`<output_dir>/<output_name>_functioning_pedigree.ped`) that holds
    the given sexes.

    :param sex_ht: Table of inferred sexes for each sample
    :param output_dir: Path to directory to output results
    :param output_name: Output prefix to use for results
    :return: None
    """
    # Read in the functioning pedigree with given sexes, keyed by sample.
    pedigree_path = f"{output_dir}/{output_name}_functioning_pedigree.ped"
    ped_ht = hl.import_table(pedigree_path)
    ped_ht = ped_ht.key_by(s=ped_ht.Individual_ID).select("Sex")

    # Translate the "M"/"F" codes; any other value passes through unchanged.
    given_sex = (
        hl.case()
        .when(ped_ht.Sex == "M", "male")
        .when(ped_ht.Sex == "F", "female")
        .default(ped_ht.Sex)
    )
    ped_ht = ped_ht.annotate(given_sex=given_sex).drop("Sex")

    # The outer join keeps samples present in only one of the two tables.
    compared = sex_ht.join(ped_ht, how="outer")
    compared = compared.annotate(discrepant_sex=compared.sex != compared.given_sex)
    compared.export(f"{output_dir}/{output_name}_sex_check.txt")
def generate_allele_data(mt: hl.MatrixTable) -> hl.Table:
    """
    Return a bi-allelic (split) sites Table with an `allele_data` annotation.

    `allele_data` holds nonsplit_alleles and has_star, the fields returned by
    `add_variant_type` on the unsplit alleles, and, computed after splitting,
    allele_type and was_mixed.

    :param MatrixTable mt: Full unsplit MT
    :return: Table with allele data annotations
    :rtype: Table
    """
    sites = mt.rows().select()

    # Computed before splitting so they describe the original multi-allelic site.
    unsplit_data = hl.struct(
        nonsplit_alleles=sites.alleles,
        has_star=hl.any(lambda a: a == '*', sites.alleles),
    )
    unsplit_data = unsplit_data.annotate(**add_variant_type(sites.alleles))
    sites = sites.annotate(allele_data=unsplit_data)

    sites = hl.split_multi_hts(sites)

    ref, alt = sites.alleles[0], sites.alleles[1]
    allele_type = (
        hl.case()
        .when(hl.is_snp(ref, alt), 'snv')
        .when(hl.is_insertion(ref, alt), 'ins')
        .when(hl.is_deletion(ref, alt), 'del')
        .default('complex')
    )
    return sites.annotate(
        allele_data=sites.allele_data.annotate(
            allele_type=allele_type,
            was_mixed=sites.allele_data.variant_type == 'mixed',
        )
    )
def compressed_variant_id(locus: hl.expr.LocusExpression, alleles: hl.expr.ArrayExpression) -> hl.expr.StringExpression:
    """
    Build a shortened variant ID for length-changing variants.

    Deletions are encoded as `<contig>-<pos>d<n>-<alt>`, insertions as
    `<contig>-<pos>i<n>-<encoded alt>`, where n is the length difference between
    ref and alt. Variants whose ref and alt have equal length fall back to
    `variant_id`.
    """
    def build_id(ref_len, alt_len):
        position_prefix = normalized_contig(locus.contig) + "-" + hl.str(locus.position)
        deletion_id = position_prefix + "d" + hl.str(ref_len - alt_len) + "-" + alleles[1]
        insertion_id = position_prefix + "i" + hl.str(alt_len - ref_len) + "-" + _encode_allele(alleles[1])
        return (
            hl.case()
            .when(ref_len > alt_len, deletion_id)
            .when(ref_len < alt_len, insertion_id)
            .default(variant_id(locus, alleles))
        )

    return hl.rbind(hl.len(alleles[0]), hl.len(alleles[1]), build_id)
def generate_allele_data(ht: hl.Table) -> hl.Table:
    """
    Return a bi-allelic (split) sites HT with an `allele_data` annotation.

    `allele_data` holds nonsplit_alleles and has_star, the fields returned by
    `add_variant_type` on the unsplit alleles, and, computed after splitting,
    allele_type and was_mixed. Rows with fewer than two alleles after splitting
    are removed.

    :param Table ht: Full unsplit HT
    :return: Table with allele data annotations
    :rtype: Table
    """
    sites = ht.select()

    # Computed before splitting so they describe the original multi-allelic site.
    unsplit_data = hl.struct(
        nonsplit_alleles=sites.alleles,
        has_star=hl.any(lambda a: a == "*", sites.alleles),
    )
    unsplit_data = unsplit_data.annotate(**add_variant_type(sites.alleles))
    sites = sites.annotate(allele_data=unsplit_data)

    sites = hl.split_multi_hts(sites)
    # alleles[1] is indexed below, so rows without an alt allele are dropped first.
    sites = sites.filter(hl.len(sites.alleles) > 1)

    ref, alt = sites.alleles[0], sites.alleles[1]
    allele_type = (
        hl.case()
        .when(hl.is_snp(ref, alt), "snv")
        .when(hl.is_insertion(ref, alt), "ins")
        .when(hl.is_deletion(ref, alt), "del")
        .default("complex")
    )
    return sites.annotate(
        allele_data=sites.allele_data.annotate(
            allele_type=allele_type,
            was_mixed=sites.allele_data.variant_type == "mixed",
        )
    )
def get_expr_for_variant_type(table: hl.Table) -> hl.expr.StringExpression:
    """
    Classify a variant by comparing the lengths of its ref and alt alleles.

    :param table: Table passed to `get_expr_for_ref_allele` / `get_expr_for_alt_allele`
    :return: "D" when ref is longer than alt, "I" when ref is shorter than alt,
        "M" when the lengths are equal and greater than 1, otherwise "S"
    """
    def classify(ref_len, alt_len):
        return (
            hl.case()
            .when(ref_len > alt_len, "D")
            .when(ref_len < alt_len, "I")
            .when(ref_len > 1, "M")
            .default("S")
        )

    return hl.bind(
        classify,
        hl.len(get_expr_for_ref_allele(table)),
        hl.len(get_expr_for_alt_allele(table)),
    )
def adjusted_sex_ploidy_expr(
    locus_expr: hl.expr.LocusExpression,
    gt_expr: hl.expr.CallExpression,
    karyotype_expr: hl.expr.StringExpression,
    xy_karyotype_str: str = "XY",
    xx_karyotype_str: str = "XX",
) -> hl.expr.CallExpression:
    """
    Create an entry expression that converts males to haploid on non-PAR X/Y
    and sets females to missing on Y.

    Heterozygous male calls on non-PAR X/Y are set to missing. Samples whose
    karyotype matches neither string keep their genotype unchanged.

    :param locus_expr: Locus
    :param gt_expr: Genotype
    :param karyotype_expr: Karyotype
    :param xy_karyotype_str: Male sex karyotype representation
    :param xx_karyotype_str: Female sex karyotype representation
    :return: Genotype adjusted for sex ploidy
    """
    is_male = karyotype_expr == xy_karyotype_str
    is_female = karyotype_expr == xx_karyotype_str

    in_x_nonpar = locus_expr.in_x_nonpar()
    in_y_par = locus_expr.in_y_par()
    in_y_nonpar = locus_expr.in_y_nonpar()

    female_on_y = is_female & (in_y_par | in_y_nonpar)
    male_hemizygous = is_male & (in_x_nonpar | in_y_nonpar)

    return (
        hl.case(missing_false=True)
        .when(female_on_y, hl.null(hl.tcall))
        # A het call cannot be collapsed to a single allele.
        .when(male_hemizygous & gt_expr.is_het(), hl.null(hl.tcall))
        .when(male_hemizygous, hl.call(gt_expr[0], phased=False))
        .default(gt_expr)
    )
def require_biallelic(dataset, method) -> MatrixTable:
    """
    Return `dataset` with a row check that errors on any non-biallelic variant.

    `require_row_key_variant` is applied first. The error is raised when a row
    whose 'alleles' field is not of length 2 is evaluated.
    """
    require_row_key_variant(dataset, method)

    error_message = (
        f"'{method}' expects biallelic variants ('alleles' field of length 2), found "
        + hl.str(dataset.locus) + ", " + hl.str(dataset.alleles)
    )
    checked_row = (
        hl.case()
        .when(dataset.alleles.length() == 2, dataset._rvrow)
        .or_error(error_message)
    )
    return dataset._select_rows(method, checked_row)
def sex_aware_sample_annotations(mt, mt_to_annotate, args):
    """
    Creates sex-aware sample annotations for call rate

    For samples whose sex equals `args.female_tag`, the call rate is computed
    over variants outside Y non-PAR only; for all other samples it is computed
    over every variant.

    :param mt: Matrix table where failing samples and variants have been filtered out
    :param mt_to_annotate: matrix table to copy the annotation to, without samples filtered out
    :param args: Object providing `sex_col` (name of the column field holding sex)
        and `female_tag` (value of that field marking females)
    :return: Returns non-filtered matrix table with additional column annotation sexaware_sample_call_rate
    """
    logging.info(
        f"Annotating sex aware sample call rate using column {args.sex_col}")

    in_y_nonpar = mt.locus.in_y_nonpar()

    # Both variant counts are gathered in a single pass over the rows.
    num_y_non_par_vars, num_all_other_vars = mt.aggregate_rows(
        (hl.agg.count_where(in_y_nonpar), hl.agg.count_where(~in_y_nonpar)))

    is_called = hl.is_defined(mt.GT)
    female_call_rate = hl.agg.count_where(is_called & ~in_y_nonpar) / num_all_other_vars
    other_call_rate = hl.agg.count_where(is_called) / (num_y_non_par_vars + num_all_other_vars)

    mt = mt.annotate_cols(
        sexaware_sample_call_rate=(
            hl.case()
            .when(mt[args.sex_col] == args.female_tag, female_call_rate)
            .default(other_call_rate)
        )
    )

    # Samples absent from `mt` get a missing call rate through the keyed lookup.
    mt_to_annotate = mt_to_annotate.annotate_cols(
        sexaware_sample_call_rate=mt.cols()[mt_to_annotate.s].sexaware_sample_call_rate)
    return mt_to_annotate
def test_maximal_independent_set3(self):
    """The tie breaker must keep the case node of every case/control edge."""
    is_case = {"A", "C", "E", "G", "H"}

    def node(name):
        return {"id": name, "is_case": name in is_case}

    pairs = [("A", "B"), ("C", "D"), ("E", "F"), ("G", "H")]
    edges = [{"i": node(l), "j": node(r)} for l, r in pairs]

    node_type = hl.tstruct(id=hl.tstr, is_case=hl.tbool)
    t = hl.Table.parallelize(edges, hl.tstruct(i=node_type, j=node_type))

    def tiebreaker(l, r):
        # Negative when only the left node is a case, positive when only the right one is.
        return (hl.case()
                .when(l.is_case & (~r.is_case), -1)
                .when(~(l.is_case) & r.is_case, 1)
                .default(0))

    mis = hl.maximal_independent_set(t.i, t.j, tie_breaker=tiebreaker)

    # G and H are both cases, so either may be kept.
    expected_sets = [{"A", "C", "E", "G"}, {"A", "C", "E", "H"}]

    self.assertTrue(mis.all(mis.node.is_case))
    kept_ids = {row.id for row in mis.select(mis.node.id).collect()}
    self.assertTrue(kept_ids in expected_sets)
def get_adj_expr(
    gt_expr: hl.expr.CallExpression,
    gq_expr: Union[hl.expr.Int32Expression, hl.expr.Int64Expression],
    dp_expr: Union[hl.expr.Int32Expression, hl.expr.Int64Expression],
    ad_expr: hl.expr.ArrayNumericExpression,
    adj_gq: int = 20,
    adj_dp: int = 10,
    adj_ab: float = 0.2,
    haploid_adj_dp: int = 10
) -> hl.expr.BooleanExpression:
    """
    Gets adj genotype annotation. Defaults correspond to gnomAD values.

    A genotype is adj when its GQ and DP meet the minimums (DP has a separate
    minimum for haploid calls) and, for heterozygous calls, the allele balance
    of each non-reference allele in the call is at least `adj_ab`.
    """
    def allele_balance_ok(allele_position):
        return ad_expr[gt_expr[allele_position]] / dp_expr >= adj_ab

    gq_ok = gq_expr >= adj_gq
    dp_ok = hl.cond(
        gt_expr.is_haploid(),
        dp_expr >= haploid_adj_dp,
        dp_expr >= adj_dp
    )
    ab_ok = (
        hl.case()
        .when(~gt_expr.is_het(), True)
        # het-ref: only the second allele of the call is checked
        .when(gt_expr.is_het_ref(), allele_balance_ok(1))
        # het-non-ref: both alleles must pass
        .default(allele_balance_ok(0) & allele_balance_ok(1))
    )
    return gq_ok & dp_ok & ab_ok
def require_biallelic(dataset, method) -> MatrixTable:
    """
    Validate that every variant of `dataset` is biallelic.

    After `require_row_key_variant` passes, the rows are re-selected through a
    case expression that errors, naming the offending locus and alleles, for
    any row whose 'alleles' field does not have length 2.
    """
    require_row_key_variant(dataset, method)

    is_biallelic = dataset.alleles.length() == 2
    prefix = f"'{method}' expects biallelic variants ('alleles' field of length 2), found "
    row_or_error = hl.case().when(is_biallelic, dataset._rvrow).or_error(
        prefix + hl.str(dataset.locus) + ", " + hl.str(dataset.alleles))

    return dataset._select_rows(method, row_or_error)
def freq_bin_expr(freq_expr: hl.expr.ArrayExpression,
                  index: int = 0) -> hl.expr.StringExpression:
    """
    Return frequency string annotations based on input AC or AF.

    .. note::

        - Default index is 0 because function assumes freq_expr was calculated with `annotate_freq`.
        - Frequency index 0 from `annotate_freq` is frequency for all pops calculated on adj genotypes only.

    :param freq_expr: Array of structs containing frequency information.
    :param index: Which index of freq_expr to use for annotation. Default is 0.
    :return: StringExpression containing bin name based on input AC or AF.
    """
    freq = freq_expr[index]
    allele_count = freq.AC
    allele_freq = freq.AF

    # Branches are tested in order, so each bin implicitly excludes the earlier ones.
    return (
        hl.case(missing_false=True)
        .when(allele_count == 0, "Not found")
        .when(allele_count == 1, "Singleton")
        .when(allele_count == 2, "Doubleton")
        .when(allele_count <= 5, "AC 3 - 5")
        .when(allele_freq < 1e-4, "AC 6 - 0.01%")
        .when(allele_freq < 1e-3, "0.01% - 0.1%")
        .when(allele_freq < 1e-2, "0.1% - 1%")
        .when(allele_freq < 1e-1, "1% - 10%")
        .when(allele_freq > 0.95, ">95%")
        .default("10% - 95%")
    )
def test_annotate_intervals(self):
    """Annotating by BED files and by interval lists must agree with the fixtures."""
    ds = get_dataset()

    def load_bed(name):
        return hl.import_bed(resource(name), reference_genome='GRCh37')

    bed1 = load_bed('example1.bed')
    bed2 = load_bed('example2.bed')
    bed3 = load_bed('example3.bed')

    self.assertTrue(list(bed2.key.dtype) == ['interval'])
    self.assertTrue(list(bed2.row.dtype) == ['interval', 'target'])

    interval_list1 = hl.import_locus_intervals(resource('exampleAnnotation1.interval_list'))
    interval_list2 = hl.import_locus_intervals(resource('exampleAnnotation2.interval_list'))

    self.assertTrue(list(interval_list2.key.dtype) == ['interval'])
    self.assertTrue(list(interval_list2.row.dtype) == ['interval', 'target'])

    # Loci strictly between the two position bounds must not be inside any bed1 interval.
    rows = ds.annotate_rows(in_interval=bed1[ds.locus]).rows()
    position = rows.locus.position
    self.assertTrue(rows.all((position <= 14000000)
                             | (position >= 17000000)
                             | (hl.is_missing(rows.in_interval))))

    for bed in [bed2, bed3]:
        rows = ds.annotate_rows(target=bed[ds.locus].target).rows()
        target_matches = (
            hl.case()
            .when(rows.locus.position <= 14000000, rows.target == 'gene1')
            .when(rows.locus.position >= 17000000, rows.target == 'gene2')
            .default(rows.target == hl.null(hl.tstr))
        )
        self.assertTrue(rows.all(target_matches))

    # Interval lists and BED files are expected to yield identical annotations.
    from_interval_list1 = ds.annotate_rows(in_interval=interval_list1[ds.locus]).rows()
    from_bed1 = ds.annotate_rows(in_interval=bed1[ds.locus]).rows()
    self.assertTrue(from_interval_list1._same(from_bed1))

    from_interval_list2 = ds.annotate_rows(target=interval_list2[ds.locus].target).rows()
    from_bed2 = ds.annotate_rows(target=bed2[ds.locus].target).rows()
    self.assertTrue(from_interval_list2._same(from_bed2))
def contig_number(contig: hl.expr.StringExpression) -> hl.expr.Int32Expression:
    """
    Convert a contig name to an integer.

    The name is first passed through `normalized_contig`; "X", "Y" and "M" map
    to 23, 24 and 25, and any other value is parsed with `hl.int`.
    """
    def to_number(name):
        return (
            hl.case()
            .when(name == "X", 23)
            .when(name == "Y", 24)
            .when(name == "M", 25)
            .default(hl.int(name))
        )

    return hl.bind(to_number, normalized_contig(contig))
def min_rep(locus, ref, alt):
    """
    Return the minimal representation of (ref, alt) as a struct(ref, alt).

    A '<NON_REF>' alt is kept as-is and paired with the first base of ref.
    Otherwise the result of `hl.min_rep` is used, and an error is raised if
    minimal representation would move the locus.
    """
    minimal = hl.min_rep(locus, [ref, alt])
    non_ref_alleles = hl.struct(ref=ref[0:1], alt=alt)
    minimal_alleles = hl.struct(ref=minimal.alleles[0], alt=minimal.alleles[1])
    return (
        hl.case()
        .when(alt == '<NON_REF>', non_ref_alleles)
        .when(locus == minimal.locus, minimal_alleles)
        .or_error("locus before and after minrep differ")
    )
def add_strand_flip_annotation(reference_ref, reference_alt, ds_a1, ds_a2):
    """
    List the (swap, flip) combinations relating a dataset allele pair to the
    reference allele pair.

    The result is an array of struct(swap: bool, flip: bool):

      - a1 == reference alt and a2 == reference ref: [(swap=False, flip=False)],
        preceded by (swap=True, flip=True) when the pair is strand ambiguous
      - a1 == reference ref and a2 == reference alt: [(swap=True, flip=False)],
        followed by (swap=False, flip=True) when the pair is strand ambiguous
      - the strand-flipped alleles match the reference (SNPs only):
        [(swap=False, flip=True)] when flipped a1 == reference alt, or
        [(swap=True, flip=True)] when flipped a1 == reference ref
      - otherwise: an empty array

    :param reference_ref: Reference allele of the reference panel
    :param reference_alt: Alternate allele of the reference panel
    :param ds_a1: First allele in the dataset
    :param ds_a2: Second allele in the dataset
    :return: Array expression of struct(swap, flip)
    """
    is_strand_ambig = hl.is_strand_ambiguous(ds_a1, ds_a2)
    ds_a1_flipped = flip_strand(ds_a1)
    ds_a2_flipped = flip_strand(ds_a2)
    is_snp = hl.is_snp(ds_a1, ds_a2)

    return (
        hl.case()
        .when(
            (ds_a1 == reference_alt) & (ds_a2 == reference_ref),
            hl.cond(
                is_strand_ambig,
                [hl.struct(swap=True, flip=True),
                 hl.struct(swap=False, flip=False)],
                [hl.struct(swap=False, flip=False)]))
        .when(
            (ds_a1 == reference_ref) & (ds_a2 == reference_alt),
            hl.cond(
                is_strand_ambig,
                [hl.struct(swap=True, flip=False),
                 hl.struct(swap=False, flip=True)],
                [hl.struct(swap=True, flip=False)]))
        # Matches on the flipped strand are only accepted for SNPs.
        .when(
            (ds_a1_flipped == reference_alt) & (ds_a2_flipped == reference_ref) & is_snp,
            [hl.struct(swap=False, flip=True)])
        .when(
            (ds_a1_flipped == reference_ref) & (ds_a2_flipped == reference_alt) & is_snp,
            [hl.struct(swap=True, flip=True)])
        .default(hl.empty_array(hl.tstruct(swap=hl.tbool, flip=hl.tbool)))
    )
def vep_protein_domain_ann_expr(
        s: hl.expr.StringExpression) -> hl.expr.DictExpression:
    """
    Parse and annotate protein domain(s) from VEP annotation.
    Expected StringExpression as input (e.g. 'Pfam:PF13853&Prints:PR00237&PROSITE_profiles:PS50262')
    It will generate a dict<k,v> where keys (k) represent source/database and values (v) the annotated domain_id.
    The result is missing when no well-formed `<source:domain_id>` entry is found.

    :param s: hl.expr.StringExpression
    :return: hl.expr.DictExpression
    """
    entries = s.split(delim="&")

    # keep only well-annotated domain(s) (i.e. <source:domain_id>)
    source_domain_pairs = (
        entries
        .map(lambda entry: entry.split(delim=":"))
        .filter(lambda parts: parts.length() == 2)
    )

    # Build the (source, domain_id) tuples in a single pass over the array.
    key_values = source_domain_pairs.map(lambda parts: hl.tuple([parts[0], parts[1]]))

    return (
        hl.case()
        .when(hl.len(source_domain_pairs) > 0, hl.dict(key_values))
        .or_missing()
    )
def adjust_sex_ploidy(mt: hl.MatrixTable,
                      sex_expr: hl.expr.StringExpression,
                      male_str: str = 'male',
                      female_str: str = 'female') -> hl.MatrixTable:
    """
    Converts males to haploid on non-PAR X/Y, sets females to missing on Y

    Heterozygous male calls on non-PAR X/Y are set to missing.

    :param MatrixTable mt: Input MatrixTable
    :param StringExpression sex_expr: Expression pointing to sex in MT (if not male_str or female_str, no change)
    :param str male_str: String for males (default 'male')
    :param str female_str: String for females (default 'female')
    :return: MatrixTable with fixed ploidy for sex chromosomes
    :rtype: MatrixTable
    """
    is_male = sex_expr == male_str
    is_female = sex_expr == female_str

    in_x_nonpar = mt.locus.in_x_nonpar()
    in_y_par = mt.locus.in_y_par()
    in_y_nonpar = mt.locus.in_y_nonpar()

    female_on_y = is_female & (in_y_par | in_y_nonpar)
    male_hemizygous = is_male & (in_x_nonpar | in_y_nonpar)

    adjusted_gt = (
        hl.case(missing_false=True)
        .when(female_on_y, hl.null(hl.tcall))
        # A het call cannot be collapsed to a single allele.
        .when(male_hemizygous & mt.GT.is_het(), hl.null(hl.tcall))
        .when(male_hemizygous, hl.call(mt.GT[0], phased=False))
        .default(mt.GT)
    )
    return mt.annotate_entries(GT=adjusted_gt)
def get_relationship_expr(  # TODO: The threshold detection could be easily automated by fitting distributions over the data.
        kin_expr: hl.expr.NumericExpression,
        ibd0_expr: hl.expr.NumericExpression,
        ibd1_expr: hl.expr.NumericExpression,
        ibd2_expr: hl.expr.NumericExpression,
        first_degree_kin_thresholds: Tuple[float, float] = (0.19, 0.4),
        second_degree_min_kin: float = 0.1,
        ibd0_0_max: float = 0.025,
        ibd0_25_thresholds: Tuple[float, float] = (0.1, 0.425),
        # ibd0_50_thresholds = [0.37, 0.625], Not useful for relationship inference
        # ibd0_100_threshold = 0.625 , Not useful for relationship inference
        ibd1_0_thresholds: Tuple[float, float] = (-0.15, 0.1),
        # ibd1_25_thresholds: Tuple[float, float] = (0.1, 0.37), Not useful for relationship inference
        ibd1_50_thresholds: Tuple[float, float] = (0.275, 0.75),
        ibd1_100_min: float = 0.75,
        ibd2_0_max: float = 0.125,
        ibd2_25_thresholds: Tuple[float, float] = (0.1, 0.5),
        ibd2_100_thresholds: Tuple[float, float] = (0.75, 1.25)
) -> hl.expr.StringExpression:
    """
    Returns an expression that gives the relationship between a pair of samples given their kin coefficient and IBD0, IBD1, IBD2 values.
    The kinship coefficient values in the defaults are in line with those output from
    `hail.methods.pc_relate <https://hail.is/docs/0.2/methods/genetics.html?highlight=pc_relate#hail.methods.pc_relate>`.

    :param kin_expr: Kin coefficient expression
    :param ibd0_expr: IBD0 expression
    :param ibd1_expr: IBD1 expression
    :param ibd2_expr: IBD2 expression
    :param first_degree_kin_thresholds: (min, max) kinship threshold for 1st degree relatives
    :param second_degree_min_kin: min kinship threshold for 2nd degree relatives
    :param ibd0_0_max: max IBD0 threshold for 0 IBD0 sharing
    :param ibd0_25_thresholds: (min, max) thresholds for 0.25 IBD0 sharing
    :param ibd1_0_thresholds: (min, max) thresholds for 0 IBD1 sharing. Note that the min is there because pc_relate can output large negative values in some corner cases.
    :param ibd1_50_thresholds: (min, max) thresholds for 0.5 IBD1 sharing
    :param ibd1_100_min: min IBD1 threshold for 1.0 IBD1 sharing
    :param ibd2_0_max: max IBD2 threshold for 0 IBD2 sharing
    :param ibd2_25_thresholds: (min, max) thresholds for 0.25 IBD2 sharing
    :param ibd2_100_thresholds: (min, max) thresholds for 1.00 IBD2 sharing. Note that the min is there because pc_relate can output much larger IBD2 values in some corner cases.
    :return: The relationship annotation using the constants defined in this module.
    """
    def within(expr, bounds):
        # Inclusive on both ends.
        return (expr >= bounds[0]) & (expr <= bounds[1])

    first_degree_min_kin = first_degree_kin_thresholds[0]
    first_degree_max_kin = first_degree_kin_thresholds[1]

    is_parent_child = (
        (kin_expr < first_degree_max_kin)
        & (ibd0_expr <= ibd0_0_max)
        & (ibd1_expr >= ibd1_100_min)
        & (ibd2_expr <= ibd2_0_max)
    )
    is_sibling = (
        (kin_expr < first_degree_max_kin)
        & within(ibd0_expr, ibd0_25_thresholds)
        & within(ibd1_expr, ibd1_50_thresholds)
        & within(ibd2_expr, ibd2_25_thresholds)
    )
    is_duplicate_or_twin = (
        (kin_expr > first_degree_max_kin)
        & (ibd0_expr < ibd0_0_max)
        & within(ibd1_expr, ibd1_0_thresholds)
        & within(ibd2_expr, ibd2_100_thresholds)
    )

    # Kinship bands are tested from least to most related; the first match wins.
    return (
        hl.case()
        .when(kin_expr < second_degree_min_kin, UNRELATED)
        .when((kin_expr < first_degree_min_kin), SECOND_DEGREE_RELATIVES)
        .when(is_parent_child, PARENT_CHILD)
        .when(is_sibling, SIBLINGS)
        .when(is_duplicate_or_twin, DUPLICATE_OR_TWINS)
        .default(AMBIGUOUS_RELATIONSHIP)
    )
def test_lambda_gc_nans(self):
    """lambda_gc should stay close to 1 when every other value is NaN."""
    N = 5000000
    ht = hl.utils.range_table(N)
    ht = ht.annotate(x=hl.scan.count() / N,
                     is_even=hl.scan.count() % 2 == 0)

    values_with_nans = (
        hl.case()
        .when(ht.is_even, hl.float('nan'))
        .default(ht.x)
    )
    lgc_nan = hl.lambda_gc(values_with_nans)
    self.assertAlmostEqual(lgc_nan, 1, places=1)  # approximate, 1 place is safe
def mac_category_case_builder(call_stats_expr):
    """
    Bin a call-stats struct into an allele count / frequency category.

    AC up to 5 is returned as-is; larger AC values map to 10 or 20; beyond
    that the AF upper bound of the bin (0.001, 0.01, 0.1) is returned, with
    0.99 for everything else.
    """
    allele_count = call_stats_expr.AC
    allele_freq = call_stats_expr.AF
    return (
        hl.case()
        .when(allele_count <= 5, allele_count)
        .when(allele_count <= 10, 10)
        .when(allele_count <= 20, 20)
        .when(allele_freq <= 0.001, 0.001)
        .when(allele_freq <= 0.01, 0.01)
        .when(allele_freq <= 0.1, 0.1)
        .default(0.99)
    )
def unphase_mt(mt: hl.MatrixTable) -> hl.MatrixTable:
    """
    Generate unphased version of MatrixTable (assumes call is in mt.GT and is diploid or haploid only).

    Calls that are neither diploid nor haploid become missing.
    """
    genotype = mt.GT
    unphased_gt = (
        hl.case()
        .when(genotype.is_diploid(), hl.call(genotype[0], genotype[1], phased=False))
        .when(genotype.is_haploid(), hl.call(genotype[0], phased=False))
        .default(hl.null(hl.tcall))
    )
    return mt.annotate_entries(GT=unphased_gt)
def test_interval_join(self):
    """Rows must match the interval covering them, or be missing if none does."""
    left = hl.utils.range_table(50, n_partitions=10)

    # Four intervals: [0, 5), [10, 15), [20, 25), [30, 35).
    intervals = hl.utils.range_table(4)
    interval_start = intervals.idx * 10
    intervals = intervals.key_by(interval=hl.interval(interval_start, interval_start + 5))

    left = left.annotate(interval_matches=intervals.index(left.key))

    row_is_correct = (
        hl.case()
        .when(left.idx % 10 < 5, left.interval_matches.idx == left.idx // 10)
        .default(hl.is_missing(left.interval_matches))
    )
    self.assertTrue(left.all(row_is_correct))
def get_lgt(e, n_alleles, has_non_ref, row):
    """
    Return the entry's GT, or missing when the call involves the non-ref allele.

    The unphased diploid genotype index is compared against the number of
    genotypes available without the non-ref allele: below it, GT is kept;
    below the total for `n_alleles`, the call is set to missing; anything
    larger raises an error naming the GT and the site.
    """
    gt_index = e.GT.unphased_diploid_gt_index()
    n_alleles_without_nonref = n_alleles - hl.int(has_non_ref)
    n_genotypes_without_nonref = hl.triangle(n_alleles_without_nonref)
    n_genotypes_total = hl.triangle(n_alleles)

    error_message = 'invalid GT ' + hl.str(e.GT) + ' at site ' + hl.str(row.locus)
    return (
        hl.case()
        .when(gt_index < n_genotypes_without_nonref, e.GT)
        .when(gt_index < n_genotypes_total, hl.null('call'))
        .or_error(error_message)
    )
def _get_copy_state(
        locus: hl.expr.LocusExpression) -> hl.expr.Int32Expression:
    """
    Helper method to go from LocusExpression to a copy-state int for indexing into the
    trans_count_map.

    Returns missing for loci that are neither autosomal/PAR, X non-PAR nor Y non-PAR.
    """
    copy_state = hl.case()
    copy_state = copy_state.when(locus.in_autosome_or_par(), auto_or_par)
    copy_state = copy_state.when(locus.in_x_nonpar(), hemi_x)
    copy_state = copy_state.when(locus.in_y_nonpar(), hemi_y)
    return copy_state.or_missing()
def struct_from_min_rep(i):
    """
    Build the split-variant struct for alternate allele `i` of the current row.

    The (ref, alt_i) pair is reduced with `hl.min_rep`; an error is raised if
    that would move the locus, i.e. the variant is not left-aligned.
    """
    def to_struct(mr):
        split_variant = hl.struct(
            locus=ds.locus,
            alleles=[mr.alleles[0], mr.alleles[1]],
            a_index=i,
            was_split=True,
        )
        return (
            hl.case()
            .when(ds.locus == mr.locus, split_variant)
            .or_error("Found non-left-aligned variant in sparse_split_multi")
        )

    minimal = hl.min_rep(ds.locus, [ds.alleles[0], ds.alleles[i]])
    return hl.bind(to_struct, minimal)
def histogram(data, range=None, bins=50, legend=None, title=None):
    """Create a histogram.

    Parameters
    ----------
    data : :class:`.Struct` or :class:`.Float64Expression`
        Sequence of data to plot.
    range : Tuple[float]
        Range of x values in the histogram.
    bins : int
        Number of bins in the histogram.
    legend : str
        Label of data on the x-axis.
    title : str
        Title of the histogram.

    Returns
    -------
    :class:`bokeh.plotting.figure.Figure`

    Raises
    ------
    ValueError
        If `data` is an expression with no source, or if no `range` is given
        and `data` contains no defined, finite values.
    """
    if isinstance(data, Expression):
        if data._indices.source is None:
            # An expression without a source cannot be aggregated.
            raise ValueError('Invalid input')

        agg_f = data._aggregation_method()
        if range is not None:
            start = range[0]
            end = range[1]
        else:
            # Derive the range from the data, ignoring NaN and infinite values.
            finite_data = hail.bind(lambda x: hail.case().when(hail.is_finite(x), x).or_missing(), data)
            start, end = agg_f((aggregators.min(finite_data),
                                aggregators.max(finite_data)))
            if start is None and end is None:
                raise ValueError("'data' contains no values that are defined and finite")
        data = agg_f(aggregators.hist(data, start, end, bins))

    p = figure(
        title=title,
        x_axis_label=legend,
        y_axis_label='Frequency',
        background_fill_color='#EEEEEE')
    p.quad(
        bottom=0, top=data.bin_freq,
        left=data.bin_edges[:-1], right=data.bin_edges[1:],
        legend=legend, line_color='black')

    # Outlier bars are one bin wide and sit just outside the histogram range.
    bin_width = data.bin_edges[1] - data.bin_edges[0]
    if data.n_larger > 0:
        p.quad(
            bottom=0, top=data.n_larger,
            left=data.bin_edges[-1], right=(data.bin_edges[-1] + bin_width),
            line_color='black', fill_color='green', legend='Outliers Above')
    if data.n_smaller > 0:
        p.quad(
            bottom=0, top=data.n_smaller,
            left=data.bin_edges[0] - bin_width, right=data.bin_edges[0],
            line_color='black', fill_color='red', legend='Outliers Below')
    return p
def test_maximal_independent_set3(self):
    """Only case nodes may survive when the tie breaker prefers cases."""
    is_case = {"A", "C", "E", "G", "H"}
    edge_names = [("A", "B"), ("C", "D"), ("E", "F"), ("G", "H")]

    edges = []
    for left_id, right_id in edge_names:
        edges.append({
            "i": {"id": left_id, "is_case": left_id in is_case},
            "j": {"id": right_id, "is_case": right_id in is_case},
        })

    endpoint_type = hl.tstruct(id=hl.tstr, is_case=hl.tbool)
    t = hl.Table.parallelize(edges, hl.tstruct(i=endpoint_type, j=endpoint_type))

    tiebreaker = lambda l, r: (
        hl.case()
        .when(l.is_case & (~r.is_case), -1)
        .when(~(l.is_case) & r.is_case, 1)
        .default(0)
    )

    mis = hl.maximal_independent_set(t.i, t.j, tie_breaker=tiebreaker)

    # The G-H edge joins two cases, so either endpoint is acceptable.
    expected_sets = [{"A", "C", "E", "G"}, {"A", "C", "E", "H"}]

    self.assertTrue(mis.all(mis.node.is_case))
    collected_rows = mis.select(mis.node.id).collect()
    self.assertTrue(set(row.id for row in collected_rows) in expected_sets)
def histogram(data, range=None, bins=50, legend=None, title=None, log=False, interactive=False):
    """Create a histogram.

    Notes
    -----
    `data` can be a :class:`.Float64Expression`, or the result of the
    :func:`.agg.hist` or :func:`.agg.approx_cdf` aggregators.

    Parameters
    ----------
    data : :class:`.Struct` or :class:`.Float64Expression`
        Sequence of data to plot.
    range : Tuple[float]
        Range of x values in the histogram.
    bins : int
        Number of bins in the histogram.
    legend : str
        Label of data on the x-axis.
    title : str
        Title of the histogram.
    log : bool
        Plot the log10 of the bin counts.
    interactive : bool
        If ``True``, additionally return a function that, given a notebook
        handle, sets up `ipywidgets` controls for re-binning the plot. Only
        supported when `data` is the result of :func:`.agg.approx_cdf`.

    Returns
    -------
    :class:`bokeh.plotting.figure.Figure` if `interactive` is ``False``,
    otherwise a tuple of the figure and the ``mk_interact(handle)`` function.

    Raises
    ------
    ValueError
        If `data` is an expression that is not bound to a source, if
        `interactive` is set for data that did not come from ``approx_cdf``,
        or if `range` is not given and `data` has no defined, finite values.
    """
    if isinstance(data, Expression):
        if data._indices.source is None:
            # The original code *returned* this exception object, which
            # handed callers a ValueError instance instead of a figure.
            raise ValueError('Invalid input')
        if interactive:
            raise ValueError("'interactive' flag can only be used on data from 'approx_cdf'.")
        agg_f = data._aggregation_method()
        if range is not None:
            start = range[0]
            end = range[1]
        else:
            # Derive the plotting range from the finite values only.
            finite_data = hail.bind(lambda x: hail.case().when(hail.is_finite(x), x).or_missing(), data)
            start, end = agg_f((aggregators.min(finite_data),
                                aggregators.max(finite_data)))
            if start is None and end is None:
                raise ValueError("'data' contains no values that are defined and finite")
        data = agg_f(aggregators.hist(data, start, end, bins))
    elif 'values' in data:
        # `data` is an approx_cdf result: rebuild a density histogram from
        # the cdf's values, weighting each by the rank mass it covers.
        cdf = data
        hist, edges = np.histogram(cdf.values, bins=bins, weights=np.diff(cdf.ranks), density=True)
        data = Struct(bin_freq=hist, bin_edges=edges, n_larger=0, n_smaller=0)
    elif interactive:
        # The interactive updater closes over `cdf`, which only exists for
        # approx_cdf input; fail here instead of with a NameError later.
        raise ValueError("'interactive' flag can only be used on data from 'approx_cdf'.")

    if log:
        # NOTE(review): log10 of an empty bin / zero outlier count is not
        # special-cased here; confirm how the imported `log10` treats 0.
        data.bin_freq = [log10(x) for x in data.bin_freq]
        data.n_larger = log10(data.n_larger)
        data.n_smaller = log10(data.n_smaller)
        y_axis_label = 'log10 Frequency'
    else:
        y_axis_label = 'Frequency'

    # Pad the x range by 5% of the data span on each side.
    x_span = data.bin_edges[-1] - data.bin_edges[0]
    x_start = data.bin_edges[0] - .05 * x_span
    x_end = data.bin_edges[-1] + .05 * x_span
    p = figure(
        title=title,
        x_axis_label=legend,
        y_axis_label=y_axis_label,
        background_fill_color='#EEEEEE',
        x_range=(x_start, x_end))
    q = p.quad(
        bottom=0, top=data.bin_freq,
        left=data.bin_edges[:-1], right=data.bin_edges[1:],
        legend=legend, line_color='black')

    # Values outside the binned range are drawn as one extra bar, one bin
    # wide, on the corresponding side of the histogram.
    if data.n_larger > 0:
        p.quad(
            bottom=0, top=data.n_larger,
            left=data.bin_edges[-1], right=(data.bin_edges[-1] + (data.bin_edges[1] - data.bin_edges[0])),
            line_color='black', fill_color='green', legend='Outliers Above')
    if data.n_smaller > 0:
        p.quad(
            bottom=0, top=data.n_smaller,
            left=data.bin_edges[0] - (data.bin_edges[1] - data.bin_edges[0]), right=data.bin_edges[0],
            line_color='black', fill_color='red', legend='Outliers Below')

    if interactive:
        def mk_interact(handle):
            def update(bins=bins, phase=0):
                if phase > 0 and phase < 1:
                    # Shift the bin grid by a fraction `phase` of one bin
                    # width, adding a bin so the data range stays covered.
                    bins = bins + 1
                    delta = (cdf.values[-1] - cdf.values[0]) / bins
                    edges = np.linspace(cdf.values[0] - (1 - phase) * delta, cdf.values[-1] + phase * delta, bins)
                else:
                    edges = np.linspace(cdf.values[0], cdf.values[-1], bins)
                hist, edges = np.histogram(cdf.values, bins=edges, weights=np.diff(cdf.ranks), density=True)
                new_data = {'top': hist, 'left': edges[:-1], 'right': edges[1:], 'bottom': np.full(len(hist), 0)}
                q.data_source.data = new_data
                # `handle` must be passed by keyword: the first positional
                # parameter of push_notebook is the document, not the handle.
                bokeh.io.push_notebook(handle=handle)

            from ipywidgets import interact
            interact(update, bins=(0, 5*bins), phase=(0, 1, .01))

        return p, mk_interact
    else:
        return p
def transform_one(mt, vardp_outlier=100_000) -> Table:
    """transforms a gvcf into a form suitable for combining

    The input to this should be some result of either :func:`.import_vcf` or
    :func:`.import_vcfs` with `array_elements_required=False`.

    There is a strong assumption that this function will be called on a matrix
    table with one column.

    The per-row transformation is registered once per row type with
    ``hl.experimental.define_function`` and cached in
    ``_transform_rows_function_map``; later calls with the same row type
    reuse the registered function.

    :param mt: Input dataset; it is passed through ``localize`` first.
    :param vardp_outlier: When ``info.VarDP`` exceeds this value, ``info.DP``
        is stored as ``VarDP`` instead.
    :return: Table whose rows are the transformed rows.
    """
    mt = localize(mt)
    if mt.row.dtype not in _transform_rows_function_map:
        f = hl.experimental.define_function(
            lambda row: hl.rbind(
                # Bind the allele count and whether the last allele is the
                # gVCF '<NON_REF>' placeholder, so both are computed once.
                hl.len(row.alleles), '<NON_REF>' == row.alleles[-1],
                lambda alleles_len, has_non_ref: hl.struct(
                    locus=row.locus,
                    # Drop the trailing '<NON_REF>' allele when present.
                    alleles=hl.cond(has_non_ref, row.alleles[:-1], row.alleles),
                    rsid=row.rsid,
                    __entries=row.__entries.map(
                        lambda e: hl.struct(
                            DP=e.DP,
                            END=row.info.END,
                            GQ=e.GQ,
                            # Indices 0..n-1 of the alleles kept above.
                            LA=hl.range(0, alleles_len - hl.cond(has_non_ref, 1, 0)),
                            LAD=hl.cond(has_non_ref, e.AD[:-1], e.AD),
                            LGT=e.GT,
                            LPGT=e.PGT,
                            # With '<NON_REF>': drop the last `alleles_len` PL
                            # entries (presumably those involving '<NON_REF>'
                            # -- TODO confirm); missing if no other alt allele
                            # remains. Without it: keep PL when there is at
                            # least one alt allele, otherwise missing.
                            LPL=hl.cond(has_non_ref,
                                        hl.cond(alleles_len > 2,
                                                e.PL[:-alleles_len],
                                                hl.null(e.PL.dtype)),
                                        hl.cond(alleles_len > 1,
                                                e.PL,
                                                hl.null(e.PL.dtype))),
                            MIN_DP=e.MIN_DP,
                            PID=e.PID,
                            # PL of the genotype (0, last allele); only taken
                            # when the last allele is '<NON_REF>'.
                            RGQ=hl.cond(
                                has_non_ref,
                                e.PL[hl.call(0, alleles_len - 1).unphased_diploid_gt_index()],
                                hl.null(e.PL.dtype.element_type)),
                            SB=e.SB,
                            # Site-level INFO fields are carried into the entry
                            # only for rows without an END value; rows with END
                            # get a missing gvcf_info.
                            gvcf_info=hl.case()
                            .when(hl.is_missing(row.info.END),
                                  hl.struct(
                                      ClippingRankSum=row.info.ClippingRankSum,
                                      BaseQRankSum=row.info.BaseQRankSum,
                                      MQ=row.info.MQ,
                                      MQRankSum=row.info.MQRankSum,
                                      MQ_DP=row.info.MQ_DP,
                                      QUALapprox=row.info.QUALapprox,
                                      RAW_MQ=row.info.RAW_MQ,
                                      ReadPosRankSum=row.info.ReadPosRankSum,
                                      # Replace outlier VarDP values by DP.
                                      VarDP=hl.cond(row.info.VarDP > vardp_outlier,
                                                    row.info.DP, row.info.VarDP)))
                            .or_missing()
                        ))),
            ),
            mt.row.dtype)
        _transform_rows_function_map[mt.row.dtype] = f
    transform_row = _transform_rows_function_map[mt.row.dtype]
    # Apply the registered function to every row by name at the IR level.
    return Table(TableMapRows(mt._tir, Apply(transform_row._name, TopLevelReference('row'))))
def test_trio_matrix(self):
    """Compare `hl.trio_matrix` against a hand-built equivalent.

    This test depends on certain properties of the trio matrix VCF and
    pedigree structure. It is NOT a valid test if the pedigree includes
    quads: `trio_matrix` duplicates the parents appropriately, but the
    independently constructed entry and column tables used for comparison
    would need another duplication/explode that has not been written.
    """
    entry_fields = ('GT', 'AD', 'DP', 'GQ', 'PL')

    def role_of(table):
        # 1 = father, 2 = mother, 0 = anyone else (the proband)
        return hl.case().when(table.is_dad, 1).when(table.is_mom, 2).default(0)

    def with_parent_flags(table):
        # turn the left-joined marker fields into plain booleans
        return table.annotate(is_dad=hl.is_defined(table.is_dad),
                              is_mom=hl.is_defined(table.is_mom))

    fam_path = resource('triomatrix.fam')
    ped = hl.Pedigree.read(fam_path)
    ht = hl.import_fam(resource('triomatrix.fam'))

    mt = hl.import_vcf(resource('triomatrix.vcf'))
    mt = mt.annotate_cols(fam=ht[mt.s].fam_id)

    with_father = ht.filter(hl.is_defined(ht.pat_id))
    dads = with_father.select(with_father.pat_id, is_dad=True).key_by('pat_id')

    with_mother = ht.filter(hl.is_defined(ht.mat_id))
    moms = with_mother.select(with_mother.mat_id, is_mom=True).key_by('mat_id')

    # expected entries, built directly from the matrix table
    et = mt.entries().key_by('s')
    et = et.join(dads, how='left').join(moms, how='left')
    et = with_parent_flags(et)

    member_entry = hl.struct(
        role=role_of(et),
        g=hl.struct(GT=et.GT, AD=et.AD, DP=et.DP, GQ=et.GQ, PL=et.PL))
    et = (et
          .group_by(et.locus, et.alleles, fam=et.fam)
          .aggregate(data=hl.agg.collect(member_entry)))

    et = et.filter(hl.len(et.data) == 3)
    et = et.select('data').explode('data')

    # entries as produced by trio_matrix
    tt = hl.trio_matrix(mt, ped, complete_trios=True).entries().key_by('locus', 'alleles')
    trio_entries = [
        hl.struct(role=0, g=tt.proband_entry.select(*entry_fields)),
        hl.struct(role=1, g=tt.father_entry.select(*entry_fields)),
        hl.struct(role=2, g=tt.mother_entry.select(*entry_fields)),
    ]
    tt = tt.annotate(fam=tt.proband.fam, data=trio_entries)
    tt = tt.select('fam', 'data').explode('data')
    tt = tt.filter(hl.is_defined(tt.data.g)).key_by('locus', 'alleles', 'fam')

    self.assertEqual(et.key.dtype, tt.key.dtype)
    self.assertEqual(et.row.dtype, tt.row.dtype)
    self.assertTrue(et._same(tt))

    # test annotations
    e_cols = mt.cols().join(dads, how='left').join(moms, how='left')
    e_cols = with_parent_flags(e_cols)

    member_col = hl.struct(
        role=role_of(e_cols),
        sa=hl.struct(**e_cols.row.select(*mt.col)))
    e_cols = e_cols.group_by(fam=e_cols.fam).aggregate(data=hl.agg.collect(member_col))
    e_cols = e_cols.filter(hl.len(e_cols.data) == 3).select('data').explode('data')

    t_cols = hl.trio_matrix(mt, ped, complete_trios=True).cols()
    trio_cols = [
        hl.struct(role=0, sa=t_cols.proband),
        hl.struct(role=1, sa=t_cols.father),
        hl.struct(role=2, sa=t_cols.mother),
    ]
    t_cols = t_cols.annotate(fam=t_cols.proband.fam, data=trio_cols)
    t_cols = t_cols.key_by('fam').select('data').explode('data')
    t_cols = t_cols.filter(hl.is_defined(t_cols.data.sa))

    self.assertEqual(e_cols.key.dtype, t_cols.key.dtype)
    self.assertEqual(e_cols.row.dtype, t_cols.row.dtype)
    self.assertTrue(e_cols._same(t_cols))
def import_gtf(path, reference_genome=None, skip_invalid_contigs=False, min_partitions=None) -> hl.Table:
    """Import a GTF file.

    The GTF file format is identical to the GFF version 2 file format,
    and so this function can be used to import GFF version 2 files as
    well.

    See https://www.ensembl.org/info/website/upload/gff.html for more
    details on the GTF/GFF2 file format.

    The :class:`.Table` returned by this function will be keyed by the
    ``interval`` row field and will include the following row fields:

    .. code-block:: text

        'source': str
        'feature': str
        'score': float64
        'strand': str
        'frame': int32
        'interval': interval<>

    There will also be corresponding fields for every tag found in the
    attribute field of the GTF file.

    Note
    ----

    This function will return an ``interval`` field of type :class:`.tinterval`
    constructed from the ``seqname``, ``start``, and ``end`` fields in the
    GTF file. This interval is inclusive of both the start and end positions
    in the GTF file.

    If the ``reference_genome`` parameter is specified, the start and end
    points of the ``interval`` field will be of type :class:`.tlocus`.
    Otherwise, the start and end points of the ``interval`` field will be of
    type :class:`.tstruct` with fields ``seqname`` (type :class:`str`) and
    ``position`` (type :class:`.tint32`).

    Furthermore, if the ``reference_genome`` parameter is specified and
    ``skip_invalid_contigs`` is ``True``, this import function will skip
    lines in the GTF where ``seqname`` is not consistent with the reference
    genome specified.

    When a reference genome is given, ``seqname`` is normalized before the
    interval is built: for ``GRCh37`` a leading ``chr`` is removed; for any
    other reference genome, names starting with ``HLA`` are kept, names
    starting with ``chrHLA`` lose the ``chr`` prefix, names already starting
    with ``chr`` are kept, and every other name gets a ``chr`` prefix.

    Example
    -------

    >>> ht = hl.experimental.import_gtf('data/test.gtf',
    ...                                 reference_genome='GRCh37',
    ...                                 skip_invalid_contigs=True)

    >>> ht.describe()  # doctest: +NOTEST
    ----------------------------------------
    Global fields:
        None
    ----------------------------------------
    Row fields:
        'source': str
        'feature': str
        'score': float64
        'strand': str
        'frame': int32
        'gene_type': str
        'exon_id': str
        'havana_transcript': str
        'level': str
        'transcript_name': str
        'gene_status': str
        'gene_id': str
        'transcript_type': str
        'tag': str
        'transcript_status': str
        'gene_name': str
        'transcript_id': str
        'exon_number': str
        'havana_gene': str
        'interval': interval<locus<GRCh37>>
    ----------------------------------------
    Key: ['interval']
    ----------------------------------------

    Parameters
    ----------

    path : :obj:`str`
        File to import.
    reference_genome : :obj:`str` or :class:`.ReferenceGenome`, optional
        Reference genome to use.
    skip_invalid_contigs : :obj:`bool`
        If ``True`` and `reference_genome` is not ``None``, skip lines where
        ``seqname`` is not consistent with the reference genome.
    min_partitions : :obj:`int` or :obj:`None`
        Minimum number of partitions (passed to import_table).

    Returns
    -------
    :class:`.Table`
    """

    # GTF has no header; the nine tab-separated columns arrive as f0..f8.
    ht = hl.import_table(path,
                         min_partitions=min_partitions,
                         comment='#',
                         no_header=True,
                         types={'f3': hl.tint,
                                'f4': hl.tint,
                                'f5': hl.tfloat,
                                'f7': hl.tint},
                         missing='.',
                         delimiter='\t')

    ht = ht.rename({'f0': 'seqname',
                    'f1': 'source',
                    'f2': 'feature',
                    'f3': 'start',
                    'f4': 'end',
                    'f5': 'score',
                    'f6': 'strand',
                    'f7': 'frame',
                    'f8': 'attribute'})

    # Parse the attribute column ('key "value"; key "value";') into a dict,
    # stripping the quotes and the trailing ';' from each value.
    ht = ht.annotate(attribute=hl.dict(
        hl.map(lambda x: (x.split(' ')[0],
                          x.split(' ')[1].replace('"', '').replace(';$', '')),
               ht['attribute'].split('; '))))

    # Collect every attribute key present in the file so that each can be
    # promoted to its own (possibly missing) row field.
    attributes = ht.aggregate(hl.agg.explode(lambda x: hl.agg.collect_as_set(x), ht['attribute'].keys()))

    ht = ht.transmute(**{x: hl.or_missing(ht['attribute'].contains(x),
                                          ht['attribute'][x])
                         for x in attributes if x})

    if reference_genome:
        # `reference_genome` is documented as either a name or a
        # ReferenceGenome object. Comparing an object with the string
        # 'GRCh37' is never true, and `hl.get_reference` looks genomes up
        # by name, so resolve the name once and use it for both.
        if isinstance(reference_genome, str):
            rg_name = reference_genome
        else:
            rg_name = reference_genome.name

        if rg_name == 'GRCh37':
            ht = ht.annotate(seqname=ht['seqname'].replace('^chr', ''))
        else:
            ht = ht.annotate(seqname=hl.case()
                             .when(ht['seqname'].startswith('HLA'), ht['seqname'])
                             .when(ht['seqname'].startswith('chrHLA'), ht['seqname'].replace('^chr', ''))
                             .when(ht['seqname'].startswith('chr'), ht['seqname'])
                             .default('chr' + ht['seqname']))
        if skip_invalid_contigs:
            valid_contigs = hl.literal(set(hl.get_reference(rg_name).contigs))
            ht = ht.filter(valid_contigs.contains(ht['seqname']))
        ht = ht.transmute(interval=hl.locus_interval(ht['seqname'],
                                                     ht['start'],
                                                     ht['end'],
                                                     includes_start=True,
                                                     includes_end=True,
                                                     reference_genome=reference_genome))
    else:
        ht = ht.transmute(interval=hl.interval(hl.struct(seqname=ht['seqname'], position=ht['start']),
                                               hl.struct(seqname=ht['seqname'], position=ht['end']),
                                               includes_start=True,
                                               includes_end=True))

    ht = ht.key_by('interval')

    return ht
def variant_qc(mt, name='variant_qc') -> MatrixTable:
    """Compute common variant statistics (quality control metrics).

    .. include:: ../_templates/req_tvariant.rst

    Examples
    --------

    >>> dataset_result = hl.variant_qc(dataset)

    Notes
    -----
    This method computes variant statistics from the genotype data, returning
    a new struct field `name` with the following metrics based on the fields
    present in the entry schema.

    If `mt` contains an entry field `DP` of type :py:data:`.tint32`, then the
    field `dp_stats` is computed. If `mt` contains an entry field `GQ` of type
    :py:data:`.tint32`, then the field `gq_stats` is computed. Both `dp_stats`
    and `gq_stats` are structs with four fields:

    - `mean` (``float64``) -- Mean value.
    - `stdev` (``float64``) -- Standard deviation (zero degrees of freedom).
    - `min` (``int32``) -- Minimum value.
    - `max` (``int32``) -- Maximum value.

    If the dataset does not contain an entry field `GT` of type
    :py:data:`.tcall`, then an error is raised. The following fields are always
    computed from `GT`:

    - `AF` (``array<float64>``) -- Calculated allele frequency, one element
      per allele, including the reference. Sums to one. Equivalent to
      `AC` / `AN`.
    - `AC` (``array<int32>``) -- Calculated allele count, one element per
      allele, including the reference. Sums to `AN`.
    - `AN` (``int32``) -- Total number of called alleles.
    - `homozygote_count` (``array<int32>``) -- Number of homozygotes per
      allele. One element per allele, including the reference.
    - `call_rate` (``float64``) -- Fraction of calls neither missing nor filtered.
      Equivalent to `n_called` / :meth:`.count_cols`.
    - `n_called` (``int64``) -- Number of samples with a defined `GT`.
    - `n_not_called` (``int64``) -- Number of samples with a missing `GT`.
    - `n_filtered` (``int64``) -- Number of filtered entries.
    - `n_het` (``int64``) -- Number of heterozygous samples.
    - `n_non_ref` (``int64``) -- Number of samples with at least one called
      non-reference allele.
    - `het_freq_hwe` (``float64``) -- Expected frequency of heterozygous
      samples under Hardy-Weinberg equilibrium. See
      :func:`.functions.hardy_weinberg_test` for details.
    - `p_value_hwe` (``float64``) -- p-value from test of Hardy-Weinberg equilibrium.
      See :func:`.functions.hardy_weinberg_test` for details.

    Warning
    -------
    `het_freq_hwe` and `p_value_hwe` are calculated as in
    :func:`.functions.hardy_weinberg_test`, with non-diploid calls
    (``ploidy != 2``) ignored in the counts. As this test is only
    statistically rigorous in the biallelic setting, :func:`.variant_qc`
    sets both fields to missing for multiallelic variants. Consider using
    :func:`~hail.methods.split_multi` to split multi-allelic variants beforehand.

    Parameters
    ----------
    mt : :class:`.MatrixTable`
        Dataset.
    name : :obj:`str`
        Name for resulting field.

    Returns
    -------
    :class:`.MatrixTable`
    """
    require_row_key_variant(mt, 'variant_qc')

    def entry_field_has_type(field, dtype):
        return field in mt.entry and mt[field].dtype == dtype

    # Optional depth / genotype-quality summaries, only for int32 fields.
    optional_stats = {}
    for field, out_name in (('DP', 'dp_stats'), ('GQ', 'gq_stats')):
        if entry_field_has_type(field, hl.tint32):
            optional_stats[out_name] = hl.agg.stats(mt[field]).select('mean', 'stdev', 'min', 'max')

    if not entry_field_has_type('GT', hl.tcall):
        raise ValueError(f"'variant_qc': expect an entry field 'GT' of type 'call'")

    # Aggregations that several output fields share; they are bound once
    # below so each is referenced rather than repeated.
    shared_aggs = hl.struct(
        n_called=hl.agg.count_where(hl.is_defined(mt['GT'])),
        n_not_called=hl.agg.count_where(hl.is_missing(mt['GT'])),
        # filtered entries are not visited by aggregators, hence the difference
        n_filtered=mt.count_cols(_localize=False) - hl.agg.count(),
        call_stats=hl.agg.call_stats(mt.GT, mt.alleles))

    def build_qc(aggs):
        stats = aggs.call_stats
        hom_counts = stats.homozygote_count

        # HWE is only computed for biallelic variants; missing otherwise.
        hwe_or_missing = (hl.case()
                          .when(hl.len(mt.alleles) == 2,
                                hl.hardy_weinberg_test(hom_counts[0],
                                                       stats.AC[1] - 2 * hom_counts[1],
                                                       hom_counts[1]))
                          .or_missing())

        def assemble(hwe):
            total_entries = aggs.n_called + aggs.n_not_called + aggs.n_filtered
            fields = dict(optional_stats)
            fields.update(stats)
            fields['call_rate'] = hl.float(aggs.n_called) / total_entries
            fields['n_called'] = aggs.n_called
            fields['n_not_called'] = aggs.n_not_called
            fields['n_filtered'] = aggs.n_filtered
            fields['n_het'] = aggs.n_called - hl.sum(hom_counts)
            fields['n_non_ref'] = aggs.n_called - hom_counts[0]
            fields['het_freq_hwe'] = hwe.het_freq_hwe
            fields['p_value_hwe'] = hwe.p_value
            return hl.struct(**fields)

        return hl.rbind(hwe_or_missing, assemble)

    qc_struct = hl.rbind(shared_aggs, build_qc)

    return mt.annotate_rows(**{name: qc_struct})
def locus_windows(locus_expr, radius, coord_expr=None, _localize=True):
    """Returns start and stop indices for window around each locus.

    Examples
    --------

    Windows with 2bp radius for one contig with positions 1, 2, 3, 4, 5:

    >>> starts, stops = hl.linalg.utils.locus_windows(
    ...     hl.balding_nichols_model(1, 5, 5).locus,
    ...     radius=2)
    >>> starts, stops
    (array([0, 0, 0, 1, 2]), array([3, 4, 5, 5, 5]))

    The following examples involve three contigs.

    >>> loci = [{'locus': hl.Locus('1', 1), 'cm': 1.0},
    ...         {'locus': hl.Locus('1', 2), 'cm': 3.0},
    ...         {'locus': hl.Locus('1', 4), 'cm': 4.0},
    ...         {'locus': hl.Locus('2', 1), 'cm': 2.0},
    ...         {'locus': hl.Locus('2', 1), 'cm': 2.0},
    ...         {'locus': hl.Locus('3', 3), 'cm': 5.0}]

    >>> ht = hl.Table.parallelize(
    ...         loci,
    ...         hl.tstruct(locus=hl.tlocus('GRCh37'), cm=hl.tfloat64),
    ...         key=['locus'])

    Windows with 1bp radius:

    >>> hl.linalg.utils.locus_windows(ht.locus, 1)
    (array([0, 0, 2, 3, 3, 5]), array([2, 2, 3, 5, 5, 6]))

    Windows with 1cm radius:

    >>> hl.linalg.utils.locus_windows(ht.locus, 1.0, coord_expr=ht.cm)
    (array([0, 1, 1, 3, 3, 5]), array([1, 3, 3, 5, 5, 6]))

    Notes
    -----
    This function returns two 1-dimensional ndarrays of integers,
    ``starts`` and ``stops``, each of size equal to the number of rows.

    By default, for all indices ``i``, ``[starts[i], stops[i])`` is the maximal
    range of row indices ``j`` such that ``contig[i] == contig[j]`` and
    ``position[i] - radius <= position[j] <= position[i] + radius``.

    If the :meth:`.global_position` on `locus_expr` is not in ascending order,
    this method will fail. Ascending order should hold for a matrix table keyed
    by locus or variant (and the associated row table), or for a table that has
    been ordered by `locus_expr`.

    Set `coord_expr` to use a value other than position to define the windows.
    This row-indexed numeric expression must be non-missing, non-``nan``, on the
    same source as `locus_expr`, and ascending with respect to locus
    position for each contig; otherwise the function will fail.

    The last example above uses centimorgan coordinates, so
    ``[starts[i], stops[i])`` is the maximal range of row indices ``j`` such
    that ``contig[i] == contig[j]`` and
    ``cm[i] - radius <= cm[j] <= cm[i] + radius``.

    Index ranges are start-inclusive and stop-exclusive. This function is
    especially useful in conjunction with
    :meth:`.BlockMatrix.sparsify_row_intervals`.

    Parameters
    ----------
    locus_expr : :class:`.LocusExpression`
        Row-indexed locus expression on a table or matrix table.
    radius: :obj:`int`
        Radius of window for row values.
    coord_expr: :class:`.Float64Expression`, optional
        Row-indexed numeric expression for the row value.
        Must be on the same table or matrix table as `locus_expr`.
        By default, the row value is given by the locus position.

    Returns
    -------
    (:class:`ndarray` of :obj:`int64`, :class:`ndarray` of :obj:`int64`)
        Tuple of start indices array and stop indices array.
    """
    if radius < 0:
        raise ValueError(f"locus_windows: 'radius' must be non-negative, found {radius}")
    check_row_indexed('locus_windows', locus_expr)
    has_coord = coord_expr is not None
    if has_coord:
        check_row_indexed('locus_windows', coord_expr)

    source = locus_expr._indices.source
    if locus_expr not in source._fields_inverse:
        # `locus_expr` is a computed expression rather than a plain field:
        # materialize it (and `coord_expr`, if that is not a field either)
        # as temporary row fields and continue with those.
        locus_name = Env.get_uid()
        new_fields = {locus_name: locus_expr}

        if has_coord:
            if coord_expr in source._fields_inverse:
                coord_name = source._fields_inverse[coord_expr]
            else:
                coord_name = Env.get_uid()
                new_fields[coord_name] = coord_expr

        if isinstance(source, hl.MatrixTable):
            annotated = source.annotate_rows(**new_fields)
        else:
            annotated = source.annotate(**new_fields)

        locus_expr = annotated[locus_name]
        if has_coord:
            coord_expr = annotated[coord_name]

    if not has_coord:
        coord_expr = locus_expr.position

    reference = locus_expr.dtype.reference_genome
    # Group coordinates by contig; keying on the locus (contig, 1) lets the
    # groups be put in contig order by sorting below.
    contig_key = hl.locus(locus_expr.contig, 1, reference_genome=reference)
    coords_by_contig = hl.agg.group_by(contig_key, hl.agg.collect(coord_expr))

    # check loci are in sorted order
    def keep_if_ascending(previous, current):
        return (hl.case()
                .when(previous <= current, current)
                .or_error("locus_windows: 'locus_expr' global position must be in ascending order."))

    defined_global_positions = hl.agg.collect(
        hl.case()
        .when(hl.is_defined(locus_expr), locus_expr.global_position())
        .or_error("locus_windows: missing value for 'locus_expr'."))
    final_position = hl.fold(keep_if_ascending, -1, defined_global_positions)

    # The fold starts at -1, so a non-negative result means at least one row.
    validated_groups = (hl.case()
                        .when(final_position >= 0, coords_by_contig)
                        .or_error("locus_windows: 'locus_expr' has length 0"))

    aggregate = locus_expr._aggregation_method()
    grouped = aggregate(validated_groups, _localize=False)

    per_contig_coords = hl.sorted(hl.array(grouped)).map(lambda pair: pair[1])
    windows = hl._locus_windows_per_contig(per_contig_coords, radius)

    if _localize:
        starts, stops = hl.eval(windows)
        return np.array(starts), np.array(stops)

    return windows
def phase_by_transmission(
        locus: hl.expr.LocusExpression,
        alleles: hl.expr.ArrayExpression,
        proband_call: hl.expr.CallExpression,
        father_call: hl.expr.CallExpression,
        mother_call: hl.expr.CallExpression
) -> hl.expr.ArrayExpression:
    """Phases genotype calls in a trio based on allele transmission.

    Notes
    -----
    In the phased calls returned, the order is as follows:

    - Proband: father_allele | mother_allele
    - Parents: transmitted_allele | untransmitted_allele

    Phasing of sex chromosomes:

    - Sex chromosomes of male individuals should be haploid to be phased correctly.
    - If `proband_call` is diploid on non-par regions of the sex chromosomes, it is assumed to be female.

    Returns `NA` when genotype calls cannot be phased.
    The following genotype calls combinations cannot be phased by transmission:

    1. One of the calls in the trio is missing
    2. The proband genotype cannot be obtained from the parents alleles (Mendelian violation)
    3. All individuals of the trio are heterozygous for the same two alleles
    4. Father is diploid on non-PAR region of X or Y
    5. Proband is diploid on non-PAR region of Y

    In addition, individual phased genotype calls are returned as missing in the following situations:

    1. All mother genotype calls on non-PAR region of Y
    2. Diploid father genotype calls on non-PAR region of X for a male proband (proband and mother are still phased as father doesn't participate in allele transmission)

    Note
    ----
    :meth:`.experimental.phase_trio_matrix_by_transmission` provides a convenience wrapper for phasing a trio matrix.

    Parameters
    ----------
    locus : :class:`.LocusExpression`
        Expression for the locus in the trio matrix
    alleles : :class:`.ArrayExpression`
        Expression for the alleles in the trio matrix
    proband_call : :class:`.CallExpression`
        Expression for the proband call in the trio matrix
    father_call : :class:`.CallExpression`
        Expression for the father call in the trio matrix
    mother_call : :class:`.CallExpression`
        Expression for the mother call in the trio matrix

    Returns
    -------
    :class:`.ArrayExpression`
        Array containing: [phased proband call, phased father call, phased mother call]"""

    def call_to_one_hot_alleles_array(call: hl.expr.CallExpression, alleles: hl.expr.ArrayExpression) -> hl.expr.ArrayExpression:
        """
        Get the set of all different one-hot-encoded allele-vectors in a genotype call.
        It is returned as an ordered array where the first vector corresponds to the first allele,
        and the second vector (only present if het) the second allele.

        :param CallExpression call: genotype
        :param ArrayExpression alleles: Alleles at the site
        :return: Array of one-hot-encoded alleles
        :rtype: ArrayExpression
        """
        # Each allele is wrapped in its own single-allele call so that
        # `one_hot_alleles` yields one vector per allele. Any non-het call
        # contributes only the vector of its first allele.
        return hl.cond(
            call.is_het(),
            hl.array([
                hl.call(call[0]).one_hot_alleles(alleles),
                hl.call(call[1]).one_hot_alleles(alleles),
            ]),
            hl.array([hl.call(call[0]).one_hot_alleles(alleles)])
        )

    def phase_parent_call(call: hl.expr.CallExpression, transmitted_allele_index: int) -> hl.expr.CallExpression:
        """
        Given a genotype and which allele was transmitted to the offspring, returns the parent phased genotype.

        :param CallExpression call: Parent genotype
        :param int transmitted_allele_index: index of transmitted allele (0 or 1)
        :return: Phased parent genotype
        :rtype: CallExpression
        """
        # The transmitted allele goes first. The other allele's index is
        # int(index == 0): 1 when the transmitted index is 0, otherwise 0.
        return hl.call(
            call[transmitted_allele_index],
            call[hl.int(transmitted_allele_index == 0)],
            phased=True
        )

    def phase_diploid_proband(
            locus: hl.expr.LocusExpression,
            alleles: hl.expr.ArrayExpression,
            proband_call: hl.expr.CallExpression,
            father_call: hl.expr.CallExpression,
            mother_call: hl.expr.CallExpression
    ) -> hl.expr.ArrayExpression:
        """
        Returns phased genotype calls in the case of a diploid proband
        (autosomes, PAR regions of sex chromosomes or non-PAR regions of a female proband)

        :param LocusExpression locus: Locus in the trio MatrixTable
        :param ArrayExpression alleles: Alleles in the trio MatrixTable
        :param CallExpression proband_call: Input proband genotype call
        :param CallExpression father_call: Input father genotype call
        :param CallExpression mother_call: Input mother genotype call
        :return: Array containing: phased proband call, phased father call, phased mother call
        :rtype: ArrayExpression
        """

        proband_v = proband_call.one_hot_alleles(alleles)
        # On non-PAR X/Y the father must be haploid: a haploid call gives a
        # single allele vector, anything else makes `father_v` missing.
        father_v = hl.cond(
            locus.in_x_nonpar() | locus.in_y_nonpar(),
            hl.or_missing(father_call.is_haploid(), hl.array([father_call.one_hot_alleles(alleles)])),
            call_to_one_hot_alleles_array(father_call, alleles)
        )
        mother_v = call_to_one_hot_alleles_array(mother_call, alleles)

        # Every (father allele index, mother allele index) pair whose allele
        # vectors add up to the proband's allele vector.
        combinations = hl.flatmap(
            lambda f:
            hl.zip_with_index(mother_v)
                .filter(lambda m: m[1] + f[1] == proband_v)
                .map(lambda m: hl.struct(m=m[0], f=f[0])),
            hl.zip_with_index(father_v)
        )

        # Phase only when exactly one such pair exists: none or several
        # pairs give a missing result.
        return (
            hl.or_missing(
                hl.is_defined(combinations) & (hl.len(combinations) == 1),
                hl.array([
                    hl.call(father_call[combinations[0].f], mother_call[combinations[0].m], phased=True),
                    hl.cond(father_call.is_haploid(), hl.call(father_call[0], phased=True), phase_parent_call(father_call, combinations[0].f)),
                    phase_parent_call(mother_call, combinations[0].m)
                ])
            )
        )

    def phase_haploid_proband_x_nonpar(
            proband_call: hl.expr.CallExpression,
            father_call: hl.expr.CallExpression,
            mother_call: hl.expr.CallExpression
    ) -> hl.expr.ArrayExpression:
        """
        Returns phased genotype calls in the case of a haploid proband in the non-PAR region of X

        :param CallExpression proband_call: Input proband genotype call
        :param CallExpression father_call: Input father genotype call
        :param CallExpression mother_call: Input mother genotype call
        :return: Array containing: phased proband call, phased father call, phased mother call
        :rtype: ArrayExpression
        """

        # First maternal allele equal to the proband's single allele, as an
        # (index, allele) pair; missing if neither maternal allele matches.
        transmitted_allele = hl.zip_with_index(hl.array([mother_call[0], mother_call[1]])).find(lambda m: m[1] == proband_call[0])
        return hl.or_missing(
            hl.is_defined(transmitted_allele),
            hl.array([
                hl.call(proband_call[0], phased=True),
                # The father's call is not used to decide the transmission
                # here; it is echoed back phased only when it is haploid.
                hl.or_missing(father_call.is_haploid(), hl.call(father_call[0], phased=True)),
                phase_parent_call(mother_call, transmitted_allele[0])
            ])
        )

    def phase_y_nonpar(
            proband_call: hl.expr.CallExpression,
            father_call: hl.expr.CallExpression,
    ) -> hl.expr.ArrayExpression:
        """
        Returns phased genotype calls in the non-PAR region of Y (requires both father and proband to be haploid to return phase)

        :param CallExpression proband_call: Input proband genotype call
        :param CallExpression father_call: Input father genotype call
        :return: Array containing: phased proband call, phased father call, phased mother call
        :rtype: ArrayExpression
        """
        # The mother's slot is always a missing call in this region.
        return hl.or_missing(
            proband_call.is_haploid() & father_call.is_haploid() & (father_call[0] == proband_call[0]),
            hl.array([
                hl.call(proband_call[0], phased=True),
                hl.call(father_call[0], phased=True),
                hl.null(hl.tcall)
            ])
        )

    # Dispatch in order: haploid proband on non-PAR X, then anything on
    # non-PAR Y, then any remaining diploid proband; otherwise missing.
    return (
        hl.case()
        .when(locus.in_x_nonpar() & proband_call.is_haploid(), phase_haploid_proband_x_nonpar(proband_call, father_call, mother_call))
        .when(locus.in_y_nonpar(), phase_y_nonpar(proband_call, father_call))
        .when(proband_call.is_diploid(), phase_diploid_proband(locus, alleles, proband_call, father_call, mother_call))
        .or_missing()
    )
def full_outer_join_mt(left: hl.MatrixTable, right: hl.MatrixTable) -> hl.MatrixTable:
    """Performs a full outer join on `left` and `right`.

    Replaces row, column, and entry fields with the following:

     - `left_row` / `right_row`: structs of row fields from left and right.
     - `left_col` / `right_col`: structs of column fields from left and right.
     - `left_entry` / `right_entry`: structs of entry fields from left and right.

    Rows are matched with an outer join on the row key. Columns are matched
    on the column key values: every left column is paired with every right
    column that has the same key, and a column whose key appears on one side
    only is kept with the other side missing.

    Parameters
    ----------
    left : :class:`.MatrixTable`
    right : :class:`.MatrixTable`

    Returns
    -------
    :class:`.MatrixTable`

    Raises
    ------
    ValueError
        If the row key types or the column key types of `left` and `right`
        differ.
    """

    def key_dtypes(key_struct):
        return [field.dtype for field in key_struct.values()]

    if key_dtypes(left.row_key) != key_dtypes(right.row_key):
        raise ValueError(f"row key types do not match:\n"
                         f" left: {list(left.row_key.values())}\n"
                         f" right: {list(right.row_key.values())}")

    if key_dtypes(left.col_key) != key_dtypes(right.col_key):
        raise ValueError(f"column key types do not match:\n"
                         f" left: {list(left.col_key.values())}\n"
                         f" right: {list(right.col_key.values())}")

    # Pack each side's row fields into one struct, then move entries and
    # columns into array fields so the two sides can be joined as tables.
    left_mt = left.select_rows(left_row=left.row)
    left_t = left_mt.localize_entries('left_entries', 'left_cols')
    right_mt = right.select_rows(right_row=right.row)
    right_t = right_mt.localize_entries('right_entries', 'right_cols')

    ht = left_t.join(right_t, how='outer')

    def indices_by_key(cols, key_fields):
        # dict: column-key tuple -> positions of the columns having that key
        keyed = hl.zip_with_index(
            cols.map(lambda col: hl.tuple([col[f] for f in key_fields])),
            index_first=False)
        grouped = hl.group_by(lambda pair: pair[0], keyed)
        return grouped.map_values(lambda pairs: pairs.map(lambda pair: pair[1]))

    ht = ht.annotate_globals(
        left_keys=indices_by_key(ht.left_cols, left_mt.col_key),
        right_keys=indices_by_key(ht.right_cols, right_mt.col_key))

    def lookup(key):
        return hl.struct(k=key,
                         left_indices=ht.left_keys.get(key),
                         right_indices=ht.right_keys.get(key))

    def pair_up(s):
        has_left = hl.is_defined(s.left_indices)
        has_right = hl.is_defined(s.right_indices)

        # key on both sides: cross product of the matching column positions
        both = hl.range(0, s.left_indices.length()).flatmap(
            lambda i: hl.range(0, s.right_indices.length()).map(
                lambda j: hl.struct(k=s.k,
                                    left_index=s.left_indices[i],
                                    right_index=s.right_indices[j])))
        left_only = s.left_indices.map(
            lambda elt: hl.struct(k=s.k, left_index=elt, right_index=hl.null('int32')))
        right_only = s.right_indices.map(
            lambda elt: hl.struct(k=s.k, left_index=hl.null('int32'), right_index=elt))

        return (hl.case()
                .when(has_left & has_right, both)
                .when(has_left, left_only)
                .when(has_right, right_only)
                .or_error('assertion error'))

    all_keys = hl.array(ht.left_keys.key_set().union(ht.right_keys.key_set()))
    ht = ht.annotate_globals(key_indices=all_keys.map(lookup).flatmap(pair_up))

    # One output entry / column per (left index, right index) pair; indexing
    # with a missing index gives a missing struct for the absent side.
    ht = ht.annotate(__entries=ht.key_indices.map(
        lambda s: hl.struct(left_entry=ht.left_entries[s.left_index],
                            right_entry=ht.right_entries[s.right_index])))
    ht = ht.annotate_globals(__cols=ht.key_indices.map(
        lambda s: hl.struct(**{f: s.k[i] for i, f in enumerate(left_mt.col_key)},
                            left_col=ht.left_cols[s.left_index],
                            right_col=ht.right_cols[s.right_index])))

    ht = ht.drop('left_entries', 'left_cols', 'left_keys',
                 'right_entries', 'right_cols', 'right_keys',
                 'key_indices')
    return ht._unlocalize_entries('__entries', '__cols', list(left_mt.col_key))
def de_novo(mt: MatrixTable,
            pedigree: Pedigree,
            pop_frequency_prior,
            *,
            min_gq: int = 20,
            min_p: float = 0.05,
            max_parent_ab: float = 0.05,
            min_child_ab: float = 0.20,
            min_dp_ratio: float = 0.10) -> Table:
    r"""Call putative *de novo* events from trio data.

    .. include:: ../_templates/req_tstring.rst

    .. include:: ../_templates/req_tvariant.rst

    .. include:: ../_templates/req_biallelic.rst

    Examples
    --------

    Call de novo events:

    >>> pedigree = hl.Pedigree.read('data/trios.fam')
    >>> priors = hl.import_table('data/gnomadFreq.tsv', impute=True)
    >>> priors = priors.transmute(**hl.parse_variant(priors.Variant)).key_by('locus', 'alleles')
    >>> de_novo_results = hl.de_novo(dataset, pedigree, pop_frequency_prior=priors[dataset.row_key].AF)

    Notes
    -----
    This method assumes the GATK high-throughput sequencing fields exist:
    `GT`, `AD`, `DP`, `GQ`, `PL`.

    This method replicates the functionality of `Kaitlin Samocha's de novo
    caller <https://github.com/ksamocha/de_novo_scripts>`__. The version
    corresponding to git commit ``bde3e40`` is implemented in Hail with her
    permission and assistance.

    This method produces a :class:`.Table` with the following fields:

     - `locus` (``locus``) -- Variant locus.
     - `alleles` (``array<str>``) -- Variant alleles.
     - `id` (``str``) -- Proband sample ID.
     - `prior` (``float64``) -- Site frequency prior. It is the maximum of:
       the computed dataset alternate allele frequency (with one alternate
       allele subtracted to correct for the observed genotype), the
       `pop_frequency_prior` parameter, and the minimum population prior
       ``100 / 3e7``.
     - `proband` (``struct``) -- Proband column fields from `mt`.
     - `father` (``struct``) -- Father column fields from `mt`.
     - `mother` (``struct``) -- Mother column fields from `mt`.
     - `proband_entry` (``struct``) -- Proband entry fields from `mt`.
     - `father_entry` (``struct``) -- Father entry fields from `mt`.
     - `mother_entry` (``struct``) -- Mother entry fields from `mt`.
     - `is_female` (``bool``) -- ``True`` if proband is female.
     - `p_de_novo` (``float64``) -- Unfiltered posterior probability
       that the event is *de novo* rather than a missed heterozygous
       event in a parent.
     - `confidence` (``str``) Validation confidence. One of: ``'HIGH'``,
       ``'MEDIUM'``, ``'LOW'``.

    The key of the table is ``['locus', 'alleles', 'id']``.

    The model looks for de novo events in which both parents are homozygous
    reference and the proband is heterozygous. The model makes the simplifying
    assumption that when this configuration ``x = (AA, AA, AB)`` of calls
    occurs, exactly one of the following is true:

     - ``d``: a de novo mutation occurred in the proband and all calls are
       accurate.
     - ``m``: at least one parental allele is actually heterozygous and
       the proband call is accurate.

    We can then estimate the posterior probability of a de novo mutation as:

    .. math::

        \mathrm{P_{\text{de novo}}} = \frac{\mathrm{P}(d\,|\,x)}{\mathrm{P}(d\,|\,x) + \mathrm{P}(m\,|\,x)}

    Applying Bayes rule to the numerator and denominator yields

    .. math::

        \frac{\mathrm{P}(x\,|\,d)\,\mathrm{P}(d)}{\mathrm{P}(x\,|\,d)\,\mathrm{P}(d) +
        \mathrm{P}(x\,|\,m)\,\mathrm{P}(m)}

    The prior on de novo mutation is estimated from the rate in the literature:

    .. math::

        \mathrm{P}(d) = \frac{1 \text{mutation}}{30,000,000\, \text{bases}}

    The prior used for at least one alternate allele between the parents
    depends on the alternate allele frequency:

    .. math::

        \mathrm{P}(m) = 1 - (1 - AF)^4

    The likelihoods :math:`\mathrm{P}(x\,|\,d)` and :math:`\mathrm{P}(x\,|\,m)`
    are computed from the PL (genotype likelihood) fields using these
    factorizations:

    .. math::
        \mathrm{P}(x = (AA, AA, AB) \,|\,d) = \Big(
        &\mathrm{P}(x_{\mathrm{father}} = AA \,|\, \mathrm{father} = AA) \\
        \cdot &\mathrm{P}(x_{\mathrm{mother}} = AA \,|\, \mathrm{mother} = AA) \\
        \cdot &\mathrm{P}(x_{\mathrm{proband}} = AB \,|\, \mathrm{proband} = AB) \Big)

    .. math::
        \mathrm{P}(x = (AA, AA, AB) \,|\,m) = \Big( &
        \mathrm{P}(x_{\mathrm{father}} = AA \,|\, \mathrm{father} = AB)
        \cdot \mathrm{P}(x_{\mathrm{mother}} = AA \,|\, \mathrm{mother} = AA) \\
        + \, &\mathrm{P}(x_{\mathrm{father}} = AA \,|\, \mathrm{father} = AA)
        \cdot \mathrm{P}(x_{\mathrm{mother}} = AA \,|\, \mathrm{mother} = AB) \Big) \\
        \cdot \, &\mathrm{P}(x_{\mathrm{proband}} = AB \,|\, \mathrm{proband} = AB)

    (Technically, the second factorization assumes there is exactly (rather
    than at least) one alternate allele among the parents, which may be
    justified on the grounds that it is typically the most likely case by far.)

    While this posterior probability is a good metric for grouping putative de
    novo mutations by validation likelihood, there exist error modes in
    high-throughput sequencing data that are not appropriately accounted for by
    the phred-scaled genotype likelihoods. To this end, a number of hard filters
    are applied in order to assign validation likelihood.

    These filters are different for SNPs and insertions/deletions. In the below
    rules, the following variables are used:

     - ``DR`` refers to the ratio of the read depth in the proband to the
       combined read depth in the parents.
     - ``DP`` refers to the read depth of the proband.
     - ``AB`` refers to the read allele balance of the proband (number of
       alternate reads divided by total reads).
     - ``AC`` refers to the count of alternate alleles across all individuals
       in the dataset at the site.
     - ``p`` refers to :math:`\mathrm{P_{\text{de novo}}}`.
     - ``min_p`` refers to the ``min_p`` function parameter.

    The rules are tried in the order HIGH, MEDIUM, LOW; the first one that
    matches determines the confidence.

    HIGH-quality SNV:

    .. code-block:: text

        p > 0.99 && AB > 0.3 && DR > 0.2
            or
        p > 0.99 && AB > 0.3 && AC == 1
            or
        p > 0.5 && AB > 0.3 && AC < 10 && DP > 10

    MEDIUM-quality SNV:

    .. code-block:: text

        p > 0.5 && AB > 0.3
            or
        p > 0.5 && AC == 1

    LOW-quality SNV:

    .. code-block:: text

        p > min_p && AB > 0.2

    HIGH-quality indel:

    .. code-block:: text

        p > 0.99 && AB > 0.3 && AC == 1

    MEDIUM-quality indel:

    .. code-block:: text

        p > 0.5 && AB > 0.3 && AC <= 5

    LOW-quality indel:

    .. code-block:: text

        p > min_p && AB > 0.2

    Additionally, de novo candidates are not considered if the proband GQ is
    smaller than the ``min_gq`` parameter, if the proband allele balance is
    lower than the ``min_child_ab`` parameter, if the depth ratio between the
    proband and parents is smaller than the ``min_dp_ratio`` parameter, if a
    considered parent has no reads in `AD`, or if the allele balance in a
    parent is above the ``max_parent_ab`` parameter. For hemizygous sites
    only one parent is considered (the mother on X and the mitochondrion, the
    father on Y), and the ``min_dp_ratio`` filter compares the proband depth
    against the depth of that parent alone.

    Parameters
    ----------
    mt : :class:`.MatrixTable`
        High-throughput sequencing dataset.
    pedigree : :class:`.Pedigree`
        Sample pedigree.
    pop_frequency_prior : :class:`.Float64Expression`
        Expression for population alternate allele frequency prior.
    min_gq
        Minimum proband GQ to be considered for *de novo* calling.
    min_p
        Minimum posterior probability to be considered for *de novo* calling.
    max_parent_ab
        Maximum parent allele balance.
    min_child_ab
        Minimum proband allele balance.
    min_dp_ratio
        Minimum ratio between proband read depth and parental read depth.

    Returns
    -------
    :class:`.Table`

    Raises
    ------
    ValueError
        If `mt` lacks any of the entry fields `GT`, `AD`, `DP`, `GQ`, `PL`.
    """
    DE_NOVO_PRIOR = 1 / 30000000
    MIN_POP_PRIOR = 100 / 30000000

    required_entry_fields = {'GT', 'AD', 'DP', 'GQ', 'PL'}
    missing_fields = required_entry_fields - set(mt.entry)
    if missing_fields:
        raise ValueError(f"'de_novo': expected 'MatrixTable' to have at least {required_entry_fields}, "
                         f"missing {missing_fields}")

    mt = mt.annotate_rows(__prior=pop_frequency_prior,
                          __alt_alleles=hl.agg.sum(mt.GT.n_alt_alleles()),
                          __total_alleles=2 * hl.agg.sum(hl.is_defined(mt.GT)))
    # subtract 1 from __alt_alleles to correct for the observed genotype
    mt = mt.annotate_rows(__site_freq=hl.max((mt.__alt_alleles - 1) / mt.__total_alleles,
                                             mt.__prior,
                                             MIN_POP_PRIOR))
    mt = require_biallelic(mt, 'de_novo')

    # FIXME check that __site_freq is between 0 and 1 when possible in expr
    tm = trio_matrix(mt, pedigree, complete_trios=True)

    # Non-PAR X in a female proband is diploid and handled like an autosome.
    autosomal = tm.locus.in_autosome_or_par() | (tm.locus.in_x_nonpar() & tm.is_female)
    hemi_x = tm.locus.in_x_nonpar() & ~tm.is_female
    hemi_y = tm.locus.in_y_nonpar() & ~tm.is_female
    # NOTE(review): mitochondrial sites are only handled for female probands;
    # male probands at mitochondrial loci match no branch below and yield no
    # call. Confirm this is intended.
    hemi_mt = tm.locus.in_mito() & tm.is_female

    is_snp = hl.is_snp(tm.alleles[0], tm.alleles[1])
    n_alt_alleles = tm.__alt_alleles
    prior = tm.__site_freq
    het_hom_hom = tm.proband_entry.GT.is_het() & tm.father_entry.GT.is_hom_ref() & tm.mother_entry.GT.is_hom_ref()
    kid_ad_fail = tm.proband_entry.AD[1] / hl.sum(tm.proband_entry.AD) < min_child_ab

    # Every rejection is represented as a missing call struct.
    failure = hl.null(hl.tstruct(p_de_novo=hl.tfloat64, confidence=hl.tstr))

    kid = tm.proband_entry
    dad = tm.father_entry
    mom = tm.mother_entry

    # Convert phred-scaled PLs to linear scale and normalize them to sum to 1.
    kid_linear_pl = 10 ** (-kid.PL / 10)
    kid_pp = hl.bind(lambda x: x / hl.sum(x), kid_linear_pl)

    dad_linear_pl = 10 ** (-dad.PL / 10)
    dad_pp = hl.bind(lambda x: x / hl.sum(x), dad_linear_pl)

    mom_linear_pl = 10 ** (-mom.PL / 10)
    mom_pp = hl.bind(lambda x: x / hl.sum(x), mom_linear_pl)

    kid_ad_ratio = kid.AD[1] / hl.sum(kid.AD)
    dp_ratio = kid.DP / (dad.DP + mom.DP)

    def classify(p_de_novo, kid_ad_ratio, kid_fail, parent_no_reads, parent_ab_fail):
        """Apply the hard filters and assign a confidence tier to one candidate.

        `kid_fail`, `parent_no_reads` and `parent_ab_fail` are the
        ploidy-specific rejection conditions supplied by the caller; the tier
        rules themselves are shared by autosomal and hemizygous calls.
        """

        def tier(confidence):
            return hl.struct(p_de_novo=p_de_novo, confidence=confidence)

        # The floor is `min_p`, not a hard-coded 0.05, so that lowering
        # `min_p` actually admits lower-probability LOW calls.
        low = (p_de_novo > min_p) & (kid_ad_ratio > 0.2)

        indel_tier = (
            hl.case()
            .when((p_de_novo > 0.99) & (kid_ad_ratio > 0.3) & (n_alt_alleles == 1), tier('HIGH'))
            .when((p_de_novo > 0.5) & (kid_ad_ratio > 0.3) & (n_alt_alleles <= 5), tier('MEDIUM'))
            .when(low, tier('LOW'))
            .or_missing()
        )
        snp_tier = (
            hl.case()
            .when(((p_de_novo > 0.99) & (kid_ad_ratio > 0.3) & (dp_ratio > 0.2))
                  | ((p_de_novo > 0.99) & (kid_ad_ratio > 0.3) & (n_alt_alleles == 1))
                  | ((p_de_novo > 0.5) & (kid_ad_ratio > 0.3) & (n_alt_alleles < 10) & (kid.DP > 10)),
                  tier('HIGH'))
            .when((p_de_novo > 0.5) & ((kid_ad_ratio > 0.3) | (n_alt_alleles == 1)), tier('MEDIUM'))
            .when(low, tier('LOW'))
            .or_missing()
        )
        return (
            hl.case()
            .when(kid.GQ < min_gq, failure)
            .when(kid_fail, failure)
            .when(parent_no_reads, failure)
            .when(parent_ab_fail, failure)
            .when(p_de_novo < min_p, failure)
            .when(~is_snp, indel_tier)
            .default(snp_tier)
        )

    def call_auto(kid_pp, dad_pp, mom_pp, kid_ad_ratio):
        # Diploid site: either parent may hide the missed heterozygous call.
        p_data_given_dn = dad_pp[0] * mom_pp[0] * kid_pp[1] * DE_NOVO_PRIOR
        p_het_in_parent = 1 - (1 - prior) ** 4
        p_data_given_missed_het = (dad_pp[1] * mom_pp[0] + dad_pp[0] * mom_pp[1]) * kid_pp[1] * p_het_in_parent
        p_de_novo = p_data_given_dn / (p_data_given_dn + p_data_given_missed_het)

        # `~(x >= t)` rather than `x < t` so that a NaN allele balance
        # (no reads in the proband) is rejected as well.
        kid_fail = (kid.DP / (dad.DP + mom.DP) < min_dp_ratio) | ~(kid_ad_ratio >= min_child_ab)
        parent_no_reads = (hl.sum(mom.AD) == 0) | (hl.sum(dad.AD) == 0)
        parent_ab_fail = ((mom.AD[1] / hl.sum(mom.AD) > max_parent_ab)
                          | (dad.AD[1] / hl.sum(dad.AD) > max_parent_ab))
        return hl.bind(
            lambda p: classify(p, kid_ad_ratio, kid_fail, parent_no_reads, parent_ab_fail),
            p_de_novo)

    def call_hemi(kid_pp, parent, parent_pp, kid_ad_ratio):
        # Hemizygous site: only the transmitting parent is considered.
        p_data_given_dn = parent_pp[0] * kid_pp[1] * DE_NOVO_PRIOR
        p_het_in_parent = 1 - (1 - prior) ** 4
        # NOTE(review): this term uses kid_pp[2] while the de novo term uses
        # kid_pp[1]; kept as in the original, confirm against the reference
        # implementation.
        p_data_given_missed_het = (parent_pp[1] + parent_pp[2]) * kid_pp[2] * p_het_in_parent
        p_de_novo = p_data_given_dn / (p_data_given_dn + p_data_given_missed_het)

        kid_fail = (kid.DP / (parent.DP) < min_dp_ratio) | (kid_ad_ratio < min_child_ab)
        parent_no_reads = (hl.sum(parent.AD) == 0)
        parent_ab_fail = parent.AD[1] / hl.sum(parent.AD) > max_parent_ab
        return hl.bind(
            lambda p: classify(p, kid_ad_ratio, kid_fail, parent_no_reads, parent_ab_fail),
            p_de_novo)

    de_novo_call = (
        hl.case()
        .when(~het_hom_hom | kid_ad_fail, failure)
        .when(autosomal, hl.bind(call_auto, kid_pp, dad_pp, mom_pp, kid_ad_ratio))
        .when(hemi_x | hemi_mt, hl.bind(call_hemi, kid_pp, mom, mom_pp, kid_ad_ratio))
        .when(hemi_y, hl.bind(call_hemi, kid_pp, dad, dad_pp, kid_ad_ratio))
        .or_missing()
    )

    # Keep only trio entries that received a call, then flatten to a table.
    tm = tm.annotate_entries(__call=de_novo_call)
    tm = tm.filter_entries(hl.is_defined(tm.__call))
    entries = tm.entries()
    return (entries.select('__site_freq',
                           'proband',
                           'father',
                           'mother',
                           'proband_entry',
                           'father_entry',
                           'mother_entry',
                           'is_female',
                           **entries.__call)
            .rename({'__site_freq': 'prior'}))