# Keep only genotypes supported by at least one read in both forward
# and reverse orientations, then remove monomorphic variants.
mt3 = mt2.filter_entries(
    ((mt2.AD[1] < 2) | (mt2.F1R2[1] == 0) | (mt2.F2R1[1] == 0)),
    keep=False)
mt3 = hl.variant_qc(mt3)
mt3 = mt3.filter_rows(
    (mt3.variant_qc.AF[1] > 0) & (mt3.variant_qc.AF[1] < 1), keep=True)

mt4 = mt3.annotate_rows(
    v=hl.variant_str(mt3.locus, mt3.alleles),
    NumAltAlleles=hl.agg.max(mt3.GT.n_alt_alleles()),
    VAF=hl.agg.explode(lambda x: hl.agg.mean(x), mt3.AF),
    TLOD=mt3.info.TLOD[0],
    GERMQ=mt3.info.GERMQ,
    STR=mt3.info.STR,
    AD_alt=hl.agg.mean(mt3.AD[1]),
    AD_ref=hl.agg.mean(mt3.AD[0]))
mt4 = mt4.annotate_entries(
    Binomial_Prob=hl.binom_test(mt4.AD[1], mt4.DP, 0.5, 'greater'))
mt4 = mt4.key_rows_by("v")
mt4 = mt4.drop('locus', 'alleles', 'qual', 'filters', 'variant_qc', 'GQ',
               'PGT', 'PID', 'PL', 'PS', 'info', 'rsid', 'a_index',
               'was_split')
filt2 = mt4.count_rows()
mt4.entries().export(filenamev2 + "." + str(filt2) + ".GTs.bgz")
del mt
del mt2
del mt3
del mt4
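The `Binomial_Prob` entry above is a one-sided test of alt-read excess against the 50/50 expectation for a true heterozygote. A minimal sketch of what a single call computes, with made-up depths (values are illustrative only):

import hail as hl

hl.init()  # a local backend is enough for a quick check

# P(X >= 6) for X ~ Binomial(n=10, p=0.5), i.e. the chance of seeing
# 6 or more alt reads out of 10 at a balanced het site.
print(hl.eval(hl.binom_test(6, 10, 0.5, 'greater')))  # ~0.377: unremarkable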
def main(args):
    # init hail
    hl.init(default_reference=args.default_ref_genome)

    # import MT
    mt = hl.read_matrix_table(args.mt_input_path)

    n_variants, n_samples = mt.count()

    # Get the variant table: a table keyed by <locus> or <locus, alleles>
    # with all variants in the dataset and no extra fields (a.k.a. a reference table).
    tb_variants = (mt.select_rows().rows())

    # compute overall coverage
    if args.compute_overall_coverage:
        logger.info(
            f"Computing coverage stats for {n_variants} variants across {n_samples} samples..."
        )
        ht_cov_overall = compute_coverage_stats(mt=mt, reference_ht=tb_variants)

        tb_variants = (tb_variants.annotate(
            overall=ht_cov_overall[tb_variants.key]))

    # compute coverage stratified by phenotype status (expected binary);
    # forces the input MT to have a boolean case/control field (is_case)
    if args.compute_phe_coverage:
        logger.info(
            f"Computing coverage stats stratified by phenotype status...")

        # Annotate sample meta info
        # Note: temporary solution; better to import an annotated MT
        mt = (mt.annotate_cols(**get_sample_meta_data()[mt.col_key]))

        mt = (mt.annotate_cols(
            case_control=hl.if_else(mt[args.phe_field], 'case', 'control')))

        strata = (mt.aggregate_cols(hl.agg.collect_as_set(mt['case_control'])))

        dict_strata_ht = {
            s: compute_coverage_stats(mt=mt.filter_cols(mt['case_control'] == s),
                                      reference_ht=tb_variants)
            for s in strata
        }

        for k in dict_strata_ht.keys():
            _tb = dict_strata_ht.get(k)
            tb_variants = tb_variants.annotate(**{k: _tb[tb_variants.key]})

        if args.run_binomial_test:
            logger.info(f"Running binomial test...")

            # perform a binomial test on coverage and case/control status
            # DOI: https://doi.org/10.1002/acn3.582
            tb_binomial = (tb_variants.annotate(
                n_cases_over_10=hl.int(tb_variants.case.over_10 * 100),
                n_controls_over_10=hl.int(tb_variants.control.over_10 * 100),
                total_cases=tb_variants.case.n_samples,
                total_controls=tb_variants.control.n_samples,
            ).select('n_cases_over_10', 'n_controls_over_10', 'total_cases',
                     'total_controls'))

            binomial_expr = {
                'p_value':
                hl.binom_test(
                    x=tb_binomial.n_cases_over_10,
                    n=tb_binomial.n_cases_over_10 + tb_binomial.n_controls_over_10,
                    p=tb_binomial.total_cases /
                    (tb_binomial.total_cases + tb_binomial.total_controls),
                    alternative='two-sided')
            }

            tb_binomial = (tb_binomial.annotate(**binomial_expr))

            tb_variants = (tb_variants.annotate(
                binomial_stats=tb_binomial[tb_variants.key]))

    # make coverage filter expressions
    # Note: the default coverage cutoff is 10X
    logger.info(f"Assigning per-site coverage filters...")

    significant_level = args.pvalue_threshold
    min_sample_prop = args.min_sample_proportion

    coverage_filter_dict_expr = {}

    if args.compute_overall_coverage:
        coverage_filter_dict_expr.update({
            'overall_hard_cutoff':
            hl.if_else((tb_variants.overall.over_10 >= min_sample_prop),
                       "pass", "fail")
        })
    if args.compute_phe_coverage:
        # DOI: https://doi.org/10.1016/j.ajhg.2018.08.016
        coverage_filter_dict_expr.update({
            'phe_hard_cutoff':
            hl.if_else((tb_variants.case.over_10 >= min_sample_prop) &
                       (tb_variants.control.over_10 >= min_sample_prop),
                       "concordant", "discordant")
        })
    if args.run_binomial_test:
        coverage_filter_dict_expr.update({
            'phe_binomial':
            hl.if_else(tb_variants.binomial_stats.p_value < significant_level,
                       'dependent', 'independent')
        })

    # annotate coverage filters
    tb_variants = (tb_variants.annotate(coverage_filter=hl.struct(
        **coverage_filter_dict_expr)))

    # add useful global annotations to the final coverage stats HT,
    # as well as case/control summary counts per filter
    global_ann_dict_expr = {
        'date': current_date(),
        'mt_path': args.mt_input_path,
        'min_sample_prop': min_sample_prop
    }
    if args.compute_overall_coverage:
        global_ann_dict_expr.update({
            'overall_hard_cutoff':
            tb_variants.aggregate(
                hl.agg.counter(
                    tb_variants.coverage_filter.overall_hard_cutoff))
        })
    if args.compute_phe_coverage:
        global_ann_dict_expr.update({
            'phe_hard_cutoff':
            tb_variants.aggregate(
                hl.agg.counter(tb_variants.coverage_filter.phe_hard_cutoff))
        })
    if args.run_binomial_test:
        global_ann_dict_expr.update({
            'phe_binomial':
            tb_variants.aggregate(
                hl.agg.counter(tb_variants.coverage_filter.phe_binomial)),
            # the conditional on args.run_binomial_test is redundant here:
            # this branch only runs when the test was requested
            'binomial_pvalue_cutoff': significant_level
        })

    tb_variants = (tb_variants.annotate_globals(**global_ann_dict_expr))

    # check
    tb_variants.globals.show()
    tb_variants.describe()

    # write HT
    tb_variants = tb_variants.checkpoint(output=args.ht_output_path,
                                         overwrite=args.overwrite)

    # export to file if true
    if args.write_to_file:
        (tb_variants.export(f'{args.ht_output_path}.tsv.bgz'))

    hl.stop()
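For a feel of the per-site test above, here is a self-contained toy version with invented counts (80 of 100 cases and 40 of 100 controls covered at >= 10X); the null hypothesis is that being covered at a site is independent of case/control status:

import hail as hl

hl.init()

# Invented counts, for illustration only.
n_cases_over_10, n_controls_over_10 = 80, 40
total_cases, total_controls = 100, 100

# Under the null, each covered sample is a case with probability
# total_cases / (total_cases + total_controls).
p_value = hl.eval(hl.binom_test(
    x=n_cases_over_10,
    n=n_cases_over_10 + n_controls_over_10,
    p=total_cases / (total_cases + total_controls),
    alternative='two-sided'))
print(p_value)  # a small p-value flags coverage that depends on phenotype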
def compute_info() -> hl.Table:
    """
    Computes a HT with the typical GATK allele-specific (AS) and site-level
    info fields, as well as ACs and lowqual fields.
    Note that this table doesn't split multi-allelic sites.

    :return: Table with info fields
    :rtype: Table
    """
    mt = get_gnomad_v3_mt(key_by_locus_and_alleles=True,
                          remove_hard_filtered_samples=False)
    mt = mt.filter_rows((hl.len(mt.alleles) > 1))
    mt = mt.transmute_entries(**mt.gvcf_info)
    mt = mt.annotate_rows(
        alt_alleles_range_array=hl.range(1, hl.len(mt.alleles)))

    # Compute AS and site-level info expressions.
    # Note that production defaults have changed:
    # for new releases, the `RAW_MQandDP` field replaces the `RAW_MQ` and `MQ_DP` fields.
    info_expr = get_site_info_expr(
        mt,
        sum_agg_fields=INFO_SUM_AGG_FIELDS + ["RAW_MQ"],
        int32_sum_agg_fields=INFO_INT32_SUM_AGG_FIELDS + ["MQ_DP"],
        array_sum_agg_fields=["SB"],
    )
    info_expr = info_expr.annotate(**get_as_info_expr(
        mt,
        sum_agg_fields=INFO_SUM_AGG_FIELDS + ["RAW_MQ"],
        int32_sum_agg_fields=INFO_INT32_SUM_AGG_FIELDS + ["MQ_DP"],
        array_sum_agg_fields=["SB"],
    ))

    # Add AC and AC_raw:
    # First compute ACs for each non-ref allele, grouped by adj
    grp_ac_expr = hl.agg.array_agg(
        lambda ai: hl.agg.filter(
            mt.LA.contains(ai),
            hl.agg.group_by(
                get_adj_expr(mt.LGT, mt.GQ, mt.DP, mt.LAD),
                hl.agg.sum(
                    mt.LGT.one_hot_alleles(mt.LA.map(lambda x: hl.str(x)))[
                        mt.LA.index(ai)]),
            ),
        ),
        mt.alt_alleles_range_array,
    )

    # Then, for each non-ref allele, compute
    # AC as the adj group
    # AC_raw as the sum of adj and non-adj groups
    info_expr = info_expr.annotate(
        AC_raw=grp_ac_expr.map(
            lambda i: hl.int32(i.get(True, 0) + i.get(False, 0))),
        AC=grp_ac_expr.map(lambda i: hl.int32(i.get(True, 0))),
    )

    # Annotate the raw MT with pab max
    info_expr = info_expr.annotate(AS_pab_max=hl.agg.array_agg(
        lambda ai: hl.agg.filter(
            mt.LA.contains(ai) & mt.LGT.is_het(),
            hl.agg.max(
                hl.binom_test(mt.LAD[mt.LA.index(ai)], hl.sum(mt.LAD), 0.5,
                              "two-sided")),
        ),
        mt.alt_alleles_range_array,
    ))

    info_ht = mt.select_rows(info=info_expr).rows()

    # Add lowqual flag
    info_ht = info_ht.annotate(
        lowqual=get_lowqual_expr(
            info_ht.alleles,
            info_ht.info.QUALapprox,
            # The indel het prior used for gnomAD v3 was 1/10k bases (phred=40).
            # This value is usually 1/8k bases (phred=39).
            indel_phred_het_prior=40,
        ),
        AS_lowqual=get_lowqual_expr(info_ht.alleles,
                                    info_ht.info.AS_QUALapprox,
                                    indel_phred_het_prior=40),
    )

    return info_ht.naive_coalesce(7500)
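`AS_pab_max` keeps, per alt allele, the largest p-value for the allele balance observed in het carriers under a Binomial(DP, 0.5) model. A stand-alone sketch of the per-genotype quantity being maximized, with illustrative depths:

import hail as hl

hl.init()

# One het genotype with LAD = [ref_depth, alt_depth].
lad = hl.literal([12, 9])
p_ab = hl.eval(hl.binom_test(lad[1], hl.sum(lad), 0.5, 'two-sided'))
print(p_ab)  # ~0.66: 9 alt reads out of 21 is consistent with a true het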
def test_deprecated_binom_test():
    assert hl.eval(hl.binom_test(2, 10, 0.5, 'two.sided')) == \
        pytest.approx(spst.binom_test(2, 10, 0.5, 'two-sided'))
def test_binom_test():
    arglists = [[2, 10, 0.5, 'two-sided'],
                [4, 10, 0.5, 'less'],
                [32, 50, 0.4, 'greater']]
    for args in arglists:
        assert hl.eval(hl.binom_test(*args)) == pytest.approx(
            spst.binom_test(*args)), args
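Both tests pin `hl.binom_test` to scipy's reference implementation; the first additionally covers the deprecated R-style `'two.sided'` spelling. Assuming that spelling is still accepted (as the deprecation test implies), the two forms should agree exactly:

import hail as hl

# The deprecated and current spellings of the two-sided alternative
# should evaluate to the same p-value.
assert hl.eval(hl.binom_test(2, 10, 0.5, 'two.sided')) == \
    hl.eval(hl.binom_test(2, 10, 0.5, 'two-sided'))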
INBR_COEFF = -0.3
AB_LOWER_LIM = 0.2
AB_UPPER_LIM = 1 - AB_LOWER_LIM

# Read MatrixTable with the sample-QC-passing dataset
mt = hl.read_matrix_table("sampleqc_pass.mt")

# Calculate variant statistics
mt = hl.variant_qc(mt)

# Calculate the inbreeding coefficient
mt = mt.annotate_rows(inbr_coeff=bi_allelic_site_inbreeding_expr(mt.GT))

# Determine the maximum p-value for sampling the observed allele balance
# under a binomial model
mt = mt.annotate_rows(
    pab_max=hl.agg.max(hl.binom_test(mt.AD[1], mt.DP, 0.5, "two-sided")))

# Remove variants with an excess of heterozygotes
mt = mt.filter_rows(mt.inbr_coeff > INBR_COEFF)

# Remove variants for which no sample had a high-quality genotype
mt = mt.filter_rows(hl.agg.any(mt.GQ >= 20))
mt = mt.filter_rows(hl.agg.any(mt.DP >= 10))

# Keep variants where at least one sample shows an allele balance
# consistent with its genotype call
mt = mt.annotate_entries(AB=(mt.AD[1] / hl.sum(mt.AD)))
mt = mt.filter_rows(
    hl.agg.any((mt.GT.is_hom_ref() & (mt.AB < AB_LOWER_LIM)) |
               (mt.GT.is_het() & (mt.AB >= AB_LOWER_LIM) &
                (mt.AB <= AB_UPPER_LIM)) |
               (mt.GT.is_hom_var() & (mt.AB > AB_UPPER_LIM))))
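To see why `pab_max` separates real hets from artifacts, consider a badly imbalanced call; the numbers below are invented for illustration:

import hail as hl

hl.init()

# Toy het entry: AD = [ref, alt] = [3, 17] at DP = 20.
ad = hl.literal([3, 17])
ab = hl.eval(ad[1] / hl.sum(ad))  # allele balance = 0.85, outside [0.2, 0.8]
p = hl.eval(hl.binom_test(ad[1], hl.sum(ad), 0.5, 'two-sided'))
print(ab, p)  # p ~ 0.0026: such a call contributes a very small pab_max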