def test_tdt(self):
    pedigree = hl.Pedigree.read(resource('tdt.fam'))
    tdt_tab = (hl.transmission_disequilibrium_test(
        hl.split_multi_hts(hl.import_vcf(resource('tdt.vcf'), min_partitions=4)),
        pedigree))

    truth = hl.import_table(
        resource('tdt_results.tsv'),
        types={'POSITION': hl.tint32, 'T': hl.tint32, 'U': hl.tint32,
               'Chi2': hl.tfloat64, 'Pval': hl.tfloat64})
    truth = (truth
             .transmute(locus=hl.locus(truth.CHROM, truth.POSITION),
                        alleles=[truth.REF, truth.ALT])
             .key_by('locus', 'alleles'))

    if tdt_tab.count() != truth.count():
        self.fail('Result has {} rows but should have {} rows'.format(
            tdt_tab.count(), truth.count()))

    bad = (tdt_tab.filter(hl.is_nan(tdt_tab.p_value), keep=False)
           .join(truth.filter(hl.is_nan(truth.Pval), keep=False), how='outer'))
    bad = bad.filter(~(
        (bad.t == bad.T) &
        (bad.u == bad.U) &
        (hl.abs(bad.chi_sq - bad.Chi2) < 0.001) &
        (hl.abs(bad.p_value - bad.Pval) < 0.001)))

    if bad.count() != 0:
        bad.order_by(hl.asc(bad.locus)).show()
        self.fail('Found rows in violation of the predicate (see show output)')
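# For reference, the transmission disequilibrium statistic compares counts of
# transmitted (t) vs untransmitted (u) alternate alleles across trios:
#   chi_sq = (t - u)**2 / (t + u)
# with p_value taken from a 1-df chi-squared distribution; these are the
# t, u, chi_sq and p_value fields checked against the truth table above.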
def export_ldscore(ht, pop):
    hm3_snps = hl.read_table(get_hm3_snplist_path(pop))

    ht = ht.select(CHR=ht.locus.contig,
                   SNP=hl.variant_str(ht.locus, ht.alleles),
                   RSID=ht.rsid,
                   BP=ht.locus.position,
                   L2=ht.ld_score,
                   MAF=0.5 - hl.abs(0.5 - ht.AF))
    count = ht.aggregate(
        hl.struct(M=hl.agg.count(), M_5_50=hl.agg.sum(ht.MAF > 0.05)))
    ht = ht.filter(hl.is_defined(hm3_snps[ht.locus, ht.alleles]))
    ht = ht.key_by().drop('locus', 'alleles', 'MAF')

    with hadoop_open(get_ld_score_flat_file_path(pop, extension='M'), 'w') as f:
        f.write(f'{count.M}\n')
    with hadoop_open(get_ld_score_flat_file_path(pop, extension='M_5_50'), 'w') as f:
        f.write(f'{count.M_5_50}\n')

    # LD score with variant ids
    ht.drop('RSID').export(get_ld_score_flat_file_path(pop))
    # with rsids
    ht.transmute(SNP=ht.RSID).export(
        get_ld_score_flat_file_path(pop, rsid=True))
def test_de_novo(self):
    mt = hl.import_vcf(resource('denovo.vcf'))
    mt = mt.filter_rows(mt.locus.in_y_par(), keep=False)  # de_novo_finder doesn't know about y PAR
    ped = hl.Pedigree.read(resource('denovo.fam'))
    r = hl.de_novo(mt, ped, mt.info.ESP)
    r = r.select(
        prior=r.prior,
        kid_id=r.proband.s,
        dad_id=r.father.s,
        mom_id=r.mother.s,
        p_de_novo=r.p_de_novo,
        confidence=r.confidence).key_by('locus', 'alleles', 'kid_id',
                                        'dad_id', 'mom_id')

    truth = hl.import_table(resource('denovo.out'), impute=True, comment='#')
    truth = truth.select(
        locus=hl.locus(truth['Chr'], truth['Pos']),
        alleles=[truth['Ref'], truth['Alt']],
        kid_id=truth['Child_ID'],
        dad_id=truth['Dad_ID'],
        mom_id=truth['Mom_ID'],
        p_de_novo=truth['Prob_dn'],
        confidence=truth['Validation_Likelihood'].split('_')[0]).key_by(
            'locus', 'alleles', 'kid_id', 'dad_id', 'mom_id')

    j = r.join(truth, how='outer')
    self.assertTrue(j.all((j.confidence == j.confidence_1) &
                          (hl.abs(j.p_de_novo - j.p_de_novo_1) < 1e-4)))
def get_metric_expr(ht, metric):
    metric_values = hl.agg.collect(ht[metric])
    metric_median = hl.median(metric_values)
    metric_mad = 1.4826 * hl.median(hl.abs(metric_values - metric_median))
    return hl.struct(
        median=metric_median,
        mad=metric_mad,
        upper=metric_median + 4 * metric_mad if metric != 'callrate' else 1,
        lower=metric_median - 4 * metric_mad if metric != 'callrate' else 0.99)
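# Usage sketch: get_metric_expr returns an aggregation struct, so it must be
# evaluated with Table.aggregate. The metric name 'r_ti_tv' below is a
# hypothetical example; any numeric sample QC field works the same way:
#
#   bounds = ht.aggregate(get_metric_expr(ht, 'r_ti_tv'))
#   ht = ht.annotate(is_outlier=(ht.r_ti_tv < bounds.lower) |
#                               (ht.r_ti_tv > bounds.upper))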
def get_freq(mt, sex, n_remove, seed):
    r'''
    Get allele frequencies and other SNP information
    (needed to fix previously created sumstats files)
    '''
    print('... Calculating allele frequency ...')
    mt = mt.annotate_rows(freq=hl.agg.mean(mt.dosage) / 2)  # frequency of alternate allele
    mt_rows = mt.rows()
    mt_rows = mt_rows.key_by('rsid')
    mt_rows = mt_rows.annotate(chr=mt_rows.locus.contig,
                               bpos=mt_rows.locus.position)

    ss = hl.import_table(
        wd + f'{phen}.gwas.{sex}.n_remove_{n_remove_per_sex}.seed_{seed}.old.tsv.bgz',
        impute=True,
        key='SNP')
    ss = ss.annotate(
        chr=mt_rows[ss.SNP].chr,
        bpos=mt_rows[ss.SNP].bpos,
        freq=mt_rows[ss.SNP].freq,
        z=((-1) * (ss.beta < 0) * hl.abs(hl.qnorm(ss.p_value / 2)) +
           (ss.beta > 0) * hl.abs(hl.qnorm(ss.p_value / 2))))
    if 'N' in ss.row:
        if 'n' not in ss.row:
            ss = ss.annotate(n=ss.N)
        ss = ss.drop('N')
    ss = ss.rename({'SNP': 'snpid', 'A1': 'a1', 'A2': 'a2', 'p_value': 'pval'})
    ss = ss.key_by()
    ss = ss.select('snpid', 'chr', 'bpos', 'a1', 'a2', 'freq', 'beta', 'z',
                   'pval', 'n')
    ss = ss.key_by('snpid')
    ss.export(
        wd + f'{phen}.gwas.{sex}.n_remove_{n_remove_per_sex}.seed_{seed}.tsv.bgz')
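# The z annotation above is the standard reconstruction of a z-score from a
# two-sided p-value and the sign of beta: z = sign(beta) * |qnorm(p / 2)|.
# The boolean factors (ss.beta < 0) and (ss.beta > 0) act as 0/1 indicators,
# so exactly one term contributes and the result carries beta's sign.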
def test_import_bgen_dosage_and_gp_dosage_function_agree(self):
    recoding = {'0{}'.format(i): str(i) for i in range(1, 10)}

    sample_file = resource('example.sample')
    bgen_file = resource('example.8bits.bgen')
    hl.index_bgen(bgen_file, contig_recoding=recoding)

    bgenmt = hl.import_bgen(bgen_file, ['GP', 'dosage'], sample_file)
    et = bgenmt.entries()
    et = et.transmute(gp_dosage=hl.gp_dosage(et.GP))
    self.assertTrue(et.all(
        (hl.is_missing(et.dosage) & hl.is_missing(et.gp_dosage)) |
        (hl.abs(et.dosage - et.gp_dosage) < 1e-6)))
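# For reference, for a biallelic genotype probability array GP the expected
# dosage is GP[1] + 2 * GP[2], which is what hl.gp_dosage computes -- so the
# imported dosage field and the recomputed gp_dosage should agree up to
# floating-point precision, as the test asserts.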
def get_freq_alt(mt, sex, n_remove, seed):
    ss = hl.import_table(
        wd + f'{phen}.gwas.{sex}.n_remove_{n_remove_per_sex}.seed_{seed}.tsv.bgz',
        impute=True,
        key='snpid')
    ss = ss.annotate(z=((-1) * (ss.beta < 0) * hl.abs(hl.qnorm(ss.pval / 2)) +
                        (ss.beta > 0) * hl.abs(hl.qnorm(ss.pval / 2))))
    if 'N' in ss.row:
        if 'n' not in ss.row:
            ss = ss.annotate(n=ss.N)
        ss = ss.drop('N')
    ss = ss.key_by()
    ss = ss.select('snpid', 'chr', 'bpos', 'a1', 'a2', 'freq', 'beta', 'z',
                   'pval', 'n')
    ss = ss.key_by('snpid')
    ss.export(
        wd + f'{phen}.gwas.{sex}.n_remove_{n_remove_per_sex}.seed_{seed}.tsv.bgz')
def get_platform_specific_intervals(platform_pc_loadings_ht: hl.Table,
                                    threshold: float) -> List[hl.Interval]:
    """
    Takes the platform PC loadings and returns the list of intervals where the
    sum of the absolute loadings is above the given threshold. The
    experimental / untested idea behind this is that those intervals may be
    problematic on some platforms.

    :param Table platform_pc_loadings_ht: Platform PCA loadings indexed by interval
    :param float threshold: Minimal threshold
    :return: List of intervals with PC loadings above the given threshold
    :rtype: list of Interval
    """
    platform_specific_intervals = platform_pc_loadings_ht.filter(
        hl.sum(hl.abs(platform_pc_loadings_ht.loadings)) >= threshold)
    return platform_specific_intervals.interval.collect()
def get_median_and_mad_expr(
    metric_expr: hl.expr.ArrayNumericExpression, k: float = 1.4826
) -> hl.expr.StructExpression:
    """
    Computes the median and median absolute deviation (MAD) for the given expression.

    Note that the default value of k assumes normally distributed data.

    :param metric_expr: Expression to compute median and MAD for
    :param k: The scaling factor for MAD calculation. Default assumes normally distributed data.
    :return: Struct with median and MAD
    """
    return hl.bind(
        lambda x: hl.struct(median=x[1], mad=k * hl.median(hl.abs(x[0] - x[1]))),
        hl.bind(lambda x: hl.tuple([x, hl.median(x)]), hl.agg.collect(metric_expr)),
    )
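# Usage sketch: the returned struct wraps an hl.agg.collect aggregation, so it
# must be evaluated in an aggregation context. The table `ht` and field `dp`
# below are hypothetical:
#
#   stats = ht.aggregate(get_median_and_mad_expr(ht.dp))
#   upper_cutoff = stats.median + 4 * stats.mad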
def filter(self, mt):
    row_filter = mt[self._row_filter].filters if self._row_filter else mt.exclude_row
    col_filter = mt[self._col_filter].filters if self._col_filter else mt.exclude_col

    mt = mt.annotate_rows(cr=hl.or_missing(
        row_filter == False,
        hl.agg.group_by(mt.is_case,
                        hl.agg.filter(col_filter == False,
                                      variant_qc_aggregator(mt).call_rate))))
    mt = mt.annotate_rows(diff=hl.abs(mt.cr[False] - mt.cr[True]))
    mt = mt.annotate_rows(**{
        'cr_diff': hl.struct(
            filters=hl.agg.any((mt.diff > self._cr_thresh) &
                               (mt[self._initial_row_filter].filters == False)))})
    return mt
def metaanalyze_gwas(subsets, gwas_ht_list, sim_name, param_suffix, wd):
    if len(gwas_ht_list) == 1:  # if list is a single GWAS, don't meta-analyze
        return gwas_ht_list[0]

    sample_ct_dict = {}
    for subset_idx, tmp_gwas_ht in enumerate(gwas_ht_list, 1):
        sample_ct = subsets.filter(subsets.subset_idx == subset_idx).count()
        sample_ct_dict[subset_idx] = sample_ct
        print(f'\n\nmeta-analysis sample count subset {subset_idx}: {sample_ct}\n\n')

    comb_gwas_ht = gwas_ht_list[0].annotate(subset_idx=1, n=sample_ct_dict[1])
    union_args = [
        ht.annotate(subset_idx=subset_idx, n=sample_ct_dict[subset_idx])
        for subset_idx, ht in enumerate(gwas_ht_list[1:], 2)
    ]  # list of gwas_ht's to join
    comb_gwas_ht = comb_gwas_ht.union(*union_args)

    comb_gwas_ht = comb_gwas_ht.annotate(w=1 / (comb_gwas_ht['standard_error'] ** 2))
    agg_expr = {
        'meta_se': hl.sqrt(1 / (hl.agg.sum(comb_gwas_ht.w))),
        'meta_beta': hl.agg.sum(comb_gwas_ht['beta'] * comb_gwas_ht.w) /
                     hl.agg.sum(comb_gwas_ht.w),
        'meta_EAF': hl.agg.sum(comb_gwas_ht['EAF'] * comb_gwas_ht['n']) /
                    hl.agg.sum(comb_gwas_ht['n'])
    }
    comb_gwas_ht = comb_gwas_ht.group_by('locus', 'alleles').aggregate(**agg_expr)
    comb_gwas_ht = comb_gwas_ht.annotate(
        meta_pval=2 * hl.pnorm(-hl.abs(comb_gwas_ht.meta_beta / comb_gwas_ht.meta_se)))

    meta_gwas_path = f'{wd}/gwas.logreg.{sim_name}.{param_suffix}.tsv.gz'
    comb_gwas_ht.export(meta_gwas_path)
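# Reference for the fixed-effect (inverse-variance weighted) estimator used
# above: with per-subset weights w_i = 1 / se_i^2,
#   meta_beta = sum(w_i * beta_i) / sum(w_i)
#   meta_se   = sqrt(1 / sum(w_i))
# and the two-sided p-value is 2 * pnorm(-|meta_beta / meta_se|).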
def assert_c_king_same_as_hail_king(c_king_path, hail_king_mt):
    actual = hail_king_mt.entries()
    expected = hl.import_table(c_king_path,
                               types={'Kinship': hl.tfloat},
                               key=['ID1', 'ID2'])
    expected = expected.rename({'ID1': 's_1', 'ID2': 's', 'Kinship': 'phi'})
    expected = expected.key_by('s_1', 's')
    expected = expected.annotate(actual=actual[expected.key])
    expected = expected.select(
        expected=expected.phi,
        actual=expected.actual.phi,
        diff=expected.phi - expected.actual.phi
    )
    expected = expected.annotate(
        # KING prints 4 significant digits; but there are several instances
        # where we calculate 0.XXXX5 whereas KING outputs 0.XXXX
        failure=hl.abs(expected.diff) > 0.00006)
    expected = expected.filter(expected.failure)
    assert expected.count() == 0, expected.collect()
def query(output):  # pylint: disable=too-many-locals
    """Query script entry point."""
    hl.init(default_reference='GRCh38')

    # get frequency of loadings values
    loadings = hl.read_table(LOADINGS)
    number_of_pcs = hl.len(loadings.loadings).take(1)[0]
    print(loadings.count())
    for i in range(0, (number_of_pcs)):
        pc = i + 1
        freq = Counter(hl.abs(loadings.loadings[i]).collect())
        filename = 'loadings_pc' + str(pc) + '.txt'
        with open(filename, 'w') as f:
            for key, value in freq.items():
                str_value = repr(key) + ' ' + repr(value)
                f.write(str_value + '\n')
        subprocess.run(['gsutil', 'cp', filename, output], check=False)

    # pull out variants that looked like they're capped in the loadings plot
    mt = hl.read_matrix_table(HGDP1KG_TOBWGS)
    # Get NFE samples only
    mt = mt.filter_cols(
        (mt.hgdp_1kg_metadata.population_inference.pop == 'nfe')
        | (mt.s.contains('TOB')))
    intervals = [
        hl.parse_locus(x, reference_genome='GRCh38')
        for x in [
            'chr1:176163025',
            'chr5:272714',
            'chr5:36104012',
            'chr1:183565810',
            'chr3:58111799',
        ]
    ]
    mt_hits = mt.filter_rows(hl.literal(intervals).contains(mt.locus))
    mt_path = f'{output}/capped_loadings_intervals.mt'
    mt_hits.write(mt_path)
def query(output):  # pylint: disable=too-many-locals
    """Query script entry point."""
    hl.init(default_reference='GRCh38')

    loadings_ht = hl.read_table(LOADINGS)
    number_of_pcs = hl.len(loadings_ht.loadings).take(1)[0]
    for i in range(0, (number_of_pcs)):
        pc = i + 1
        p = manhattan_loadings(
            pvals=hl.abs(loadings_ht.loadings[i]),
            locus=loadings_ht.locus,
            title='Loadings of PC ' + str(pc),
            collect_all=True,
        )
        plot_filename = f'{output}/loadings_manhattan_plot_pc' + str(pc) + '.png'
        with hl.hadoop_open(plot_filename, 'wb') as f:
            get_screenshot_as_png(p).save(f, format='PNG')
        plot_filename_html = 'loadings_pc' + str(pc) + '.html'
        output_file(plot_filename_html)
        save(p)
        subprocess.run(['gsutil', 'cp', plot_filename_html, output], check=False)
def create_binned_concordance(data_type: str, truth_sample: str, metric: str,
                              nbins: int, overwrite: bool) -> None:
    """
    Creates and writes a concordance table binned by rank (both absolute and
    relative) for a given data type, truth sample and metric.

    :param str data_type: One of 'exomes' or 'genomes'
    :param str truth_sample: Which truth sample concordance to load
    :param str metric: One of the evaluation metrics (or a RF hash)
    :param int nbins: Number of bins for the rank
    :param bool overwrite: Whether to overwrite existing table
    :return: Nothing -- just writes the table
    :rtype: None
    """
    if hl.hadoop_exists(
            binned_concordance_path(data_type, truth_sample, metric) +
            '/_SUCCESS') and not overwrite:
        logger.warn(
            f"Skipping binned concordance creation as "
            f"{binned_concordance_path(data_type, truth_sample, metric)} exists and overwrite=False"
        )
    else:
        ht = hl.read_table(
            annotations_ht_path(data_type, f'{truth_sample}_concordance'))
        # Remove 1bp indels for syndip as these cannot be trusted
        if truth_sample == 'syndip':
            ht = ht.filter(
                hl.is_indel(ht.alleles[0], ht.alleles[1]) &
                (hl.abs(hl.len(ht.alleles[0]) - hl.len(ht.alleles[1])) == 1),
                keep=False)
            high_conf_intervals = hl.import_locus_intervals(
                syndip_high_conf_regions_bed_path)
        else:
            high_conf_intervals = hl.import_locus_intervals(
                NA12878_high_conf_regions_bed_path)

        lcr = hl.import_locus_intervals(lcr_intervals_path)
        segdup = hl.import_locus_intervals(segdup_intervals_path)
        ht = ht.filter(
            hl.is_defined(high_conf_intervals[ht.locus]) &
            hl.is_missing(lcr[ht.locus]) & hl.is_missing(segdup[ht.locus]))

        if metric in ['vqsr', 'rf_2.0.2', 'rf_2.0.2_beta', 'cnn']:
            metric_ht = hl.read_table(score_ranking_path(data_type, metric))
        else:
            metric_ht = hl.read_table(
                rf_path(data_type, 'rf_result', run_hash=metric))

        metric_snvs, metric_indels = metric_ht.aggregate([
            hl.agg.count_where(
                hl.is_snp(metric_ht.alleles[0], metric_ht.alleles[1])),
            hl.agg.count_where(
                ~hl.is_snp(metric_ht.alleles[0], metric_ht.alleles[1]))
        ])

        snvs, indels = ht.aggregate([
            hl.agg.count_where(hl.is_snp(ht.alleles[0], ht.alleles[1])),
            hl.agg.count_where(~hl.is_snp(ht.alleles[0], ht.alleles[1]))
        ])

        ht = ht.annotate_globals(
            global_counts=hl.struct(snvs=metric_snvs, indels=metric_indels),
            counts=hl.struct(snvs=snvs, indels=indels))

        ht = ht.annotate(
            snv=hl.is_snp(ht.alleles[0], ht.alleles[1]),
            score=metric_ht[ht.key].score,
            global_rank=metric_ht[ht.key].rank,
            # TP => allele is found in both data sets
            n_tp=ht.concordance[3][3] + ht.concordance[3][4] +
            ht.concordance[4][3] + ht.concordance[4][4],
            # FP => allele is found only in test data set
            n_fp=hl.sum(ht.concordance[3][:2]) + hl.sum(ht.concordance[4][:2]),
            # FN => allele is found only in truth data set
            n_fn=hl.sum(ht.concordance[:2].map(lambda x: x[3] + x[4])))

        ht = add_rank(ht, -1.0 * ht.score)

        ht = ht.annotate(rank=[
            hl.tuple([
                'global_rank', (ht.global_rank + 1) /
                hl.cond(ht.snv, ht.globals.global_counts.snvs,
                        ht.globals.global_counts.indels)
            ]),
            hl.tuple([
                'truth_sample_rank', (ht.rank + 1) /
                hl.cond(ht.snv, ht.globals.counts.snvs, ht.globals.counts.indels)
            ])
        ])

        ht = ht.explode(ht.rank)
        ht = ht.annotate(rank_name=ht.rank[0], bin=hl.int(ht.rank[1] * nbins))

        ht = ht.group_by('rank_name', 'snv', 'bin').aggregate(
            # Look at site-level metrics -> tp > fp > fn -- only important for
            # multi-sample comparisons
            tp=hl.agg.count_where(ht.n_tp > 0),
            fp=hl.agg.count_where((ht.n_tp == 0) & (ht.n_fp > 0)),
            fn=hl.agg.count_where((ht.n_tp == 0) & (ht.n_fp == 0) & (ht.n_fn > 0)),
            min_score=hl.agg.min(ht.score),
            max_score=hl.agg.max(ht.score),
            n_alleles=hl.agg.count()).repartition(5)

        ht.write(binned_concordance_path(data_type, truth_sample, metric),
                 overwrite=overwrite)
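# Index reference for the 5x5 concordance matrix used above (as produced by
# hl.concordance, one axis per dataset): 0 = no data, 1 = no call,
# 2 = hom ref, 3 = het, 4 = hom var. Indices 3 and 4 therefore mark genotypes
# carrying the alt allele, which is why TP sums entries [3:][3:], FP sums
# rows 3-4 against columns 0-1, and FN the transpose of that.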
def main(args):
    hl.init()

    # Read in all sumstats
    mt = load_final_sumstats_mt(filter_phenos=True,
                                filter_variants=False,
                                filter_sumstats=True,
                                separate_columns_by_pop=False,
                                annotate_with_nearest_gene=False)

    # Annotate per-entry sample size
    def get_n(pheno_data, i):
        return pheno_data[i].n_cases + hl.or_else(pheno_data[i].n_controls, 0)

    mt = mt.annotate_entries(summary_stats=hl.map(
        lambda x: x[1].annotate(N=hl.or_missing(hl.is_defined(x[1]),
                                                get_n(mt.pheno_data, x[0]))),
        hl.zip_with_index(mt.summary_stats)))

    # Exclude entries with low confidence flag.
    if not args.keep_low_confidence_variants:
        mt = mt.annotate_entries(summary_stats=hl.map(
            lambda x: hl.or_missing(~x.low_confidence, x), mt.summary_stats))

    # Run fixed-effect meta-analysis (all + leave-one-out)
    mt = mt.annotate_entries(
        unnorm_beta=mt.summary_stats.BETA / (mt.summary_stats.SE ** 2),
        inv_se2=1 / (mt.summary_stats.SE ** 2))
    mt = mt.annotate_entries(
        sum_unnorm_beta=all_and_leave_one_out(mt.unnorm_beta, mt.pheno_data.pop),
        sum_inv_se2=all_and_leave_one_out(mt.inv_se2, mt.pheno_data.pop))
    mt = mt.transmute_entries(
        META_BETA=mt.sum_unnorm_beta / mt.sum_inv_se2,
        META_SE=hl.map(lambda x: hl.sqrt(1 / x), mt.sum_inv_se2))
    mt = mt.annotate_entries(META_Pvalue=hl.map(
        lambda x: 2 * hl.pnorm(x), -hl.abs(mt.META_BETA / mt.META_SE)))

    # Run heterogeneity test (Cochran's Q)
    mt = mt.annotate_entries(
        META_Q=hl.map(
            lambda x: hl.sum((mt.summary_stats.BETA - x) ** 2 * mt.inv_se2),
            mt.META_BETA),
        variant_exists=hl.map(lambda x: ~hl.is_missing(x),
                              mt.summary_stats.BETA))
    mt = mt.annotate_entries(META_N_pops=all_and_leave_one_out(
        mt.variant_exists, mt.pheno_data.pop))
    mt = mt.annotate_entries(META_Pvalue_het=hl.map(
        lambda i: hl.pchisqtail(mt.META_Q[i], mt.META_N_pops[i] - 1),
        hl.range(hl.len(mt.META_Q))))

    # Add other annotations
    mt = mt.annotate_entries(
        ac_cases=hl.map(lambda x: x["AF.Cases"] * x.N, mt.summary_stats),
        ac_controls=hl.map(lambda x: x["AF.Controls"] * x.N, mt.summary_stats),
        META_AC_Allele2=all_and_leave_one_out(
            mt.summary_stats.AF_Allele2 * mt.summary_stats.N,
            mt.pheno_data.pop),
        META_N=all_and_leave_one_out(mt.summary_stats.N, mt.pheno_data.pop))
    mt = mt.annotate_entries(
        META_AF_Allele2=mt.META_AC_Allele2 / mt.META_N,
        META_AF_Cases=all_and_leave_one_out(mt.ac_cases, mt.pheno_data.pop) / mt.META_N,
        META_AF_Controls=all_and_leave_one_out(mt.ac_controls, mt.pheno_data.pop) / mt.META_N)
    mt = mt.drop('unnorm_beta', 'inv_se2', 'variant_exists', 'ac_cases',
                 'ac_controls', 'summary_stats', 'META_AC_Allele2')

    # Format everything into array<struct>
    def is_finite_or_missing(x):
        return hl.or_missing(hl.is_finite(x), x)

    meta_fields = ['BETA', 'SE', 'Pvalue', 'Q', 'Pvalue_het', 'N', 'N_pops',
                   'AF_Allele2', 'AF_Cases', 'AF_Controls']
    mt = mt.transmute_entries(meta_analysis=hl.map(
        lambda i: hl.struct(**{
            field: is_finite_or_missing(mt[f'META_{field}'][i])
            for field in meta_fields
        }), hl.range(hl.len(mt.META_BETA))))

    col_fields = ['n_cases', 'n_controls']
    mt = mt.annotate_cols(**{
        field: all_and_leave_one_out(mt.pheno_data[field], mt.pheno_data.pop)
        for field in col_fields
    })
    col_fields += ['pop']
    mt = mt.annotate_cols(pop=all_and_leave_one_out(
        mt.pheno_data.pop,
        mt.pheno_data.pop,
        all_f=lambda x: x,
        loo_f=lambda i, x: hl.filter(lambda y: y != x[i], x),
    ))
    mt = mt.transmute_cols(meta_analysis_data=hl.map(
        lambda i: hl.struct(**{field: mt[field][i] for field in col_fields}),
        hl.range(hl.len(mt.pop))))

    mt.describe()
    mt.write(get_meta_analysis_results_path(), overwrite=args.overwrite)

    hl.copy_log('gs://ukb-diverse-pops/combined_results/meta_analysis.log')
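# `all_and_leave_one_out` is defined elsewhere in this codebase; a minimal
# sketch of the behavior the script above assumes (element 0 = statistic over
# all populations, elements 1..n = the statistic with each population left
# out). The name and defaults below are assumptions, not the actual helper:
def all_and_leave_one_out_sketch(x, pop_array, all_f=hl.sum,
                                 loo_f=lambda i, x: hl.sum(x) - hl.or_else(x[i], 0)):
    arr = hl.array([all_f(x)])  # statistic over all populations
    # one leave-one-out statistic per population
    arr = arr.extend(hl.map(lambda i: loo_f(i, x), hl.range(hl.len(pop_array))))
    return arr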
def query():
    """Query script entry point."""
    hl.init(default_reference='GRCh38')

    scores = hl.read_table(SCORES)
    scores = scores.annotate(
        study=hl.if_else(scores.s.contains('TOB'), 'TOB-WGS', 'HGDP-1kG'))
    sample_names = scores.s.collect()
    labels = scores.study.collect()
    study = list(set(labels))
    tooltips = [('labels', '@label'), ('samples', '@samples')]
    eigenvalues = hl.import_table(EIGENVALUES)
    eigenvalues = eigenvalues.to_pandas()
    eigenvalues.columns = ['eigenvalue']
    eigenvalues = pd.to_numeric(eigenvalues.eigenvalue)
    variance = eigenvalues.divide(float(eigenvalues.sum())) * 100
    variance = variance.round(2)

    # Get number of PCs
    number_of_pcs = len(eigenvalues)

    # plot by study
    for i in range(0, (number_of_pcs - 1)):
        pc1 = i
        pc2 = i + 1
        plot = figure(
            title='Study',
            x_axis_label=f'PC{pc1 + 1} ({variance[pc1]}%)',
            y_axis_label=f'PC{pc2 + 1} ({variance[pc2]}%)',
            tooltips=tooltips,
        )
        source = ColumnDataSource(
            dict(
                x=scores.scores[pc1].collect(),
                y=scores.scores[pc2].collect(),
                label=labels,
                samples=sample_names,
            ))
        plot.circle(
            'x',
            'y',
            alpha=0.5,
            source=source,
            size=4,
            color=factor_cmap('label', ['#1b9e77', '#d95f02'], study),
            legend_group='label',
        )
        plot.add_layout(plot.legend[0], 'left')
        plot_filename = output_path(f'study_pc{pc2}.png', 'web')
        with hl.hadoop_open(plot_filename, 'wb') as f:
            get_screenshot_as_png(plot).save(f, format='PNG')
        html = file_html(plot, CDN, 'my plot')
        plot_filename_html = output_path(f'study_pc{pc2}.html', 'web')
        with hl.hadoop_open(plot_filename_html, 'w') as f:
            f.write(html)

    # plot by continental population
    hgdp1kg_tobwgs = hl.read_matrix_table(HGDP1KG_TOBWGS)
    scores = scores.annotate(continental_pop=hgdp1kg_tobwgs.cols()[
        scores.s].hgdp_1kg_metadata.population_inference.pop)
    labels = scores.continental_pop.collect()
    # Change TOB-WGS 'none' values to 'TOB-WGS'
    labels = ['TOB-NFE' if x is None else x for x in labels]
    continental_population = list(set(labels))
    tooltips = [('labels', '@label'), ('samples', '@samples')]

    for i in range(0, (number_of_pcs - 1)):
        pc1 = i
        pc2 = i + 1
        plot = figure(
            title='Continental Population',
            x_axis_label=f'PC{pc1 + 1} ({variance[pc1]}%)',
            y_axis_label=f'PC{pc2 + 1} ({variance[pc2]}%)',
            tooltips=tooltips,
        )
        source = ColumnDataSource(
            dict(
                x=scores.scores[pc1].collect(),
                y=scores.scores[pc2].collect(),
                label=labels,
                samples=sample_names,
            ))
        plot.circle(
            'x',
            'y',
            alpha=0.5,
            source=source,
            size=4,
            color=factor_cmap('label', turbo(len(continental_population)),
                              continental_population),
            legend_group='label',
        )
        plot.add_layout(plot.legend[0], 'left')
        plot_filename = output_path(f'continental_pop_pc{pc2}.png', 'web')
        with hl.hadoop_open(plot_filename, 'wb') as f:
            get_screenshot_as_png(plot).save(f, format='PNG')
        html = file_html(plot, CDN, 'my plot')
        plot_filename_html = output_path(f'continental_pop_pc{pc2}.html', 'web')
        with hl.hadoop_open(plot_filename_html, 'w') as f:
            f.write(html)

    # plot by subpopulation
    scores = scores.annotate(subpop=hgdp1kg_tobwgs.cols()[
        scores.s].hgdp_1kg_metadata.labeled_subpop)
    labels = scores.subpop.collect()
    labels = ['TOB-NFE' if x is None else x for x in labels]
    sub_population = list(set(labels))
    tooltips = [('labels', '@label'), ('samples', '@samples')]

    for i in range(0, (number_of_pcs - 1)):
        pc1 = i
        pc2 = i + 1
        plot = figure(
            title='Subpopulation',
            x_axis_label=f'PC{pc1 + 1} ({variance[pc1]}%)',
            y_axis_label=f'PC{pc2 + 1} ({variance[pc2]}%)',
            tooltips=tooltips,
        )
        source = ColumnDataSource(
            dict(
                x=scores.scores[pc1].collect(),
                y=scores.scores[pc2].collect(),
                label=labels,
                samples=sample_names,
            ))
        plot.circle(
            'x',
            'y',
            alpha=0.5,
            source=source,
            size=4,
            color=factor_cmap('label', turbo(len(sub_population)),
                              sub_population),
            legend_group='label',
        )
        plot.add_layout(plot.legend[0], 'left')
        plot_filename = output_path(f'subpop_pc{pc2}.png', 'web')
        with hl.hadoop_open(plot_filename, 'wb') as f:
            get_screenshot_as_png(plot).save(f, format='PNG')
        html = file_html(plot, CDN, 'my plot')
        plot_filename_html = output_path(f'subpop_pc{pc2}.html', 'web')
        with hl.hadoop_open(plot_filename_html, 'w') as f:
            f.write(html)

    # Plot loadings
    loadings_ht = hl.read_table(LOADINGS)
    for i in range(0, (number_of_pcs)):
        pc = i + 1
        plot = manhattan_loadings(
            pvals=hl.abs(loadings_ht.loadings[i]),
            locus=loadings_ht.locus,
            title='Loadings of PC ' + str(pc),
            collect_all=True,
        )
        plot_filename = output_path(f'loadings_pc{pc}.png', 'web')
        with hl.hadoop_open(plot_filename, 'wb') as f:
            get_screenshot_as_png(plot).save(f, format='PNG')
        html = file_html(plot, CDN, 'my plot')
        plot_filename_html = output_path(f'loadings_pc{pc}.html', 'web')
        with hl.hadoop_open(plot_filename_html, 'w') as f:
            f.write(html)
def get_ac(af, an):
    if filter_mac_instead_of_ac:
        # Note that the underlying file behind get_ukb_af_ht_path()
        # accidentally doubles af and halves an
        return (1.0 - hl.abs(1.0 - af)) * an
    else:
        return af * an
def get_ac(af, an):
    if filter_mac_instead_of_ac:
        return (0.5 - hl.abs(0.5 - af)) * an
    else:
        return af * an
def get_maf(af):
    return 0.5 - hl.abs(0.5 - af)
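# Worked example of the folding trick used in get_maf and the MAC variant of
# get_ac: minor allele frequency equals af below 0.5 and mirrors above it,
# e.g. for af = 0.9,
#   0.5 - abs(0.5 - 0.9) = 0.5 - 0.4 = 0.1
# which is the frequency of the minor allele.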
def query():  # pylint: disable=too-many-locals
    """Query script entry point."""
    hl.init(default_reference='GRCh38')

    loadings_ht = hl.read_table(LOADINGS)
    gtf_ht = hl.experimental.import_gtf(
        GTF_FILE,
        reference_genome='GRCh38',
        skip_invalid_contigs=True,
        min_partitions=12,
    )
    number_of_pcs = hl.len(loadings_ht.loadings).take(1)[0] - 1
    for i in range(0, (number_of_pcs)):
        pc = i + 1
        plot_filename = output_path(f'loadings_manhattan_plot_pc{pc}.png', 'web')
        if not hl.hadoop_exists(plot_filename):
            p = manhattan_loadings(
                iteration=i,
                gtf=gtf_ht,
                loadings=loadings_ht,
                title=f'Loadings of PC{pc}',
                collect_all=True,
            )
            with hl.hadoop_open(plot_filename, 'wb') as f:
                get_screenshot_as_png(p).save(f, format='PNG')
            html = file_html(p, CDN, 'my plot')
            plot_filename_html = output_path(f'loadings_pc{pc}.html', 'web')
            with hl.hadoop_open(plot_filename_html, 'w') as f:
                f.write(html)

    # Get samples which are driving loadings
    mt = hl.read_matrix_table(HGDP1KG_TOBWGS)
    scores = hl.read_table(SCORES)
    mt = mt.semi_join_cols(scores)
    loadings_ht = loadings_ht.key_by('locus')
    mt = mt.annotate_rows(loadings=loadings_ht[mt.locus].loadings)

    for dim in range(0, number_of_pcs):
        max_value = mt.aggregate_rows(hl.agg.stats(hl.abs(mt.loadings[dim]))).max
        significant_variants = mt.filter_rows(
            hl.abs(mt.loadings[dim]) == max_value)
        significant_variants = hl.sample_qc(significant_variants)
        significant_variant_list = significant_variants.locus.collect()
        print(f'PC{dim}:', significant_variant_list)
        heterozygous_samples = significant_variants.filter_cols(
            significant_variants.sample_qc.n_het > 0).s.collect()
        homozygous_alternate_samples = significant_variants.filter_cols(
            significant_variants.sample_qc.n_hom_var > 0).s.collect()
        # pad the shorter list with 'null' so both columns have equal length
        if len(heterozygous_samples) > len(homozygous_alternate_samples):
            homozygous_alternate_samples.extend(
                'null' for _ in range(
                    len(heterozygous_samples) - len(homozygous_alternate_samples)))
        elif len(heterozygous_samples) < len(homozygous_alternate_samples):
            heterozygous_samples.extend(
                'null' for _ in range(
                    len(homozygous_alternate_samples) - len(heterozygous_samples)))

        # save as html
        html = pd.DataFrame({
            'heterozygous_samples': heterozygous_samples,
            'homozygous_alternate_samples': homozygous_alternate_samples,
        }).to_html()
        plot_filename_html = output_path(
            f'significant_variants_non_ref_samples{dim}.html', 'web')
        with hl.hadoop_open(plot_filename_html, 'w') as f:
            f.write(html)
for phen_i in idx:
    phen = phens[phen_i]
    output_path = wd + f'{phen}.diffgwasloci.tsv.bgz'

    try:
        # raises CalledProcessError if the output doesn't already exist
        subprocess.check_output(['gsutil', 'ls', output_path])
        print(f'\n#############\n{phen} already completed!\n#############\n')
    except:
        print(
            f'\n#############\nStarting phenotype {phen} '
            f'({idx.index(phen_i)+1} of {len(phens)} for paridx {paridx})\n#############\n'
        )
        start = dt.datetime.now()
        f = hl.import_table(path + phen + '.gwas.imputed_v3.female.tsv.bgz',
                            force_bgz=True, impute=True, key='variant')
        m = hl.import_table(path + phen + '.gwas.imputed_v3.male.tsv.bgz',
                            force_bgz=True, impute=True, key='variant')
        both = f.join(m)
        both1 = both.filter(
            ~(both.low_confidence_variant |
              both.low_confidence_variant_1))  # remove low confidence variants
        both2 = both1.annotate(diff=(both1.beta - both1.beta_1),
                               diff_se=hl.sqrt(both1.se ** 2 + both1.se_1 ** 2))
        both3 = both2.annotate(
            diff_pval=2 * hl.pnorm(-hl.abs(both2.diff / both2.diff_se)))
        both3.select('diff', 'diff_se', 'diff_pval').export(output_path)
        print(
            f'\n#############\nTime for phenotype {phen}: '
            f'{round((dt.datetime.now()-start).seconds/60, 2)} min\n#############'
        )
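# The female-vs-male difference test above is a standard two-sample z-test on
# the effect sizes:
#   z = (beta_f - beta_m) / sqrt(se_f^2 + se_m^2)
#   diff_pval = 2 * pnorm(-|z|)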
def manhattan_loadings(
    iteration,
    gtf,
    loadings,
    title=None,
    size=4,
    hover_fields=None,
    collect_all=False,
    n_divisions=500,
):
    """modify hail manhattan plot"""
    palette = [
        '#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd',
        '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf',
    ]

    # add gene names, p-values, and locus info
    loadings = loadings.annotate(gene_names=gtf[loadings.locus].gene_name)
    pvals = hl.abs(loadings.loadings[iteration])
    locus = loadings.locus

    if hover_fields is None:
        hover_fields = {}
    hover_fields['locus'] = hl.str(locus)
    hover_fields['gene'] = hl.str(loadings.gene_names)

    source_pd = hl.plot.plots._collect_scatter_plot_data(  # pylint: disable=protected-access
        ('_global_locus', locus.global_position()),
        ('_pval', pvals),
        fields=hover_fields,
        n_divisions=None if collect_all else n_divisions,
    )
    source_pd['p_value'] = source_pd['_pval']
    source_pd['_contig'] = [loc.split(':')[0] for loc in source_pd['locus']]

    observed_contigs = set(source_pd['_contig'])
    ref = locus.dtype.reference_genome
    observed_contigs = [
        contig for contig in ref.contigs.copy() if contig in observed_contigs
    ]

    contig_ticks = [
        ref._contig_global_position(contig) + ref.contig_length(contig) // 2  # pylint: disable=protected-access
        for contig in observed_contigs
    ]
    color_mapper = CategoricalColorMapper(
        factors=ref.contigs,
        palette=palette[:2] * int((len(ref.contigs) + 1) / 2))

    p = figure(title=title,
               x_axis_label='Chromosome',
               y_axis_label='Loadings',
               width=1000)
    (
        p,
        _,
        legend,
        _,
        _,
        _,
    ) = hl.plot.plots._get_scatter_plot_elements(  # pylint: disable=protected-access
        p,
        source_pd,
        x_col='_global_locus',
        y_col='_pval',
        label_cols=['_contig'],
        colors={'_contig': color_mapper},
        size=size,
    )
    legend.visible = False
    p.xaxis.ticker = contig_ticks
    p.xaxis.major_label_overrides = dict(zip(contig_ticks, observed_contigs))
    p.select_one(HoverTool).tooltips = [
        t for t in p.select_one(HoverTool).tooltips if not t[0].startswith('_')
    ]
    return p
# aggregation per TSS distance, eQTL p-value, and MAC in GTEx
ems = hl.read_table(
    "gs://qingbowang/ems_v1_test/ems_pcausal_gtexvg_all{0}.ht".format(tissue_name))
vg = hl.read_table(
    "gs://qingbowang/ems_v1_test/{0}_allpairs.ht".format(tissue_name))
vg = vg.annotate(vg=vg.variant_id + "_" + vg.gene_id)
vg = vg.key_by("vg")
ems = ems.join(vg, how="left")
ems = ems.annotate(conf_gain_log10_bin=hl.ceil(ems.confidence_gain_log10))

# tss dist bin
ems = ems.annotate(
    tss_dist_bin_unsigned=hl.ceil(hl.log10(hl.abs(ems.tss_distance))))
ems = ems.transmute(
    tss_dist_bin=hl.cond(ems.tss_distance > 0, ems.tss_dist_bin_unsigned,
                         ems.tss_dist_bin_unsigned * -1))
agged = ems.group_by("tss_dist_bin",
                     "conf_gain_log10_bin").aggregate(n=hl.agg.count())
agged.export("gs://qingbowang/ems_v1_test/tmp/{0}_tssdist_vs_EMS.tsv".format(
    tissue_name))

# p value
ems = ems.annotate(
    pval_bin=hl.case().when(ems.pval_nominal < 5 * 10 ** -8, -1).when(
        ems.pval_nominal > 0.05, 1).default(0))
agged = ems.group_by("pval_bin",
                     "conf_gain_log10_bin").aggregate(n=hl.agg.count())
agged.export(
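# Worked example for the signed TSS-distance bin above: for
# tss_distance = -2500, ceil(log10(|-2500|)) = ceil(3.398) = 4, and the sign
# is restored by the hl.cond, giving bin -4.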
def score_bin_agg(
    ht: hl.GroupedTable, fam_stats_ht: hl.Table
) -> Dict[str, hl.expr.Aggregation]:
    """
    Default aggregation function to add aggregations for min/max of score, number of ClinVar variants, number of truth variants (omni, mills, hapmap, and kgp_phase1), and family statistics.

    .. note::

        This function uses `ht._parent` to get the original Table from the GroupedTable for the aggregation

    This can easily be combined with the GroupedTable returned by `compute_grouped_binned_ht`

    Example:

    .. code-block:: python

        binned_ht = create_binned_ht(...)
        grouped_binned_ht = compute_grouped_binned_ht(binned_ht)
        agg_ht = grouped_binned_ht.aggregate(score_bin_agg(**grouped_binned_ht, ...))

    .. note::

        The following annotations should be present:

        In ht:
            - score
            - singleton
            - positive_train_site
            - negative_train_site
            - ac_raw - expected that this is the raw allele count before adj filtering
            - ac - expected that this is the allele count after adj filtering
            - ac_qc_samples_unrelated_raw - allele count before adj filtering for unrelated samples passing sample QC
            - info - struct that includes QD, FS, and MQ in order to add an annotation for fail_hard_filters

        In truth_ht:
            - omni
            - mills
            - hapmap
            - kgp_phase1_hc

        In fam_stats_ht:
            - n_de_novos_adj
            - n_de_novos_raw
            - n_transmitted_raw
            - n_untransmitted_raw

    Automatic aggregations that will be done are:
        - `min_score` - minimum of score annotation per group
        - `max_score` - maximum of score annotation per group
        - `n` - count of variants per group
        - `n_ins` - count of insertions per group
        - `n_del` - count of deletions per group
        - `n_ti` - count of transitions per group
        - `n_tv` - count of transversions per group
        - `n_1bp_indel` - count of one base pair indels per group
        - `n_mod3bp_indel` - count of indels with a length divisible by three per group
        - `n_singleton` - count of singletons per group
        - `fail_hard_filters` - count of variants per group with QD < 2 | FS > 60 | MQ < 30
        - `n_pos_train` - count of variants that were a VQSR positive train site per group
        - `n_neg_train` - count of variants that were a VQSR negative train site per group
        - `n_clinvar` - count of clinvar variants
        - `n_de_novos_singleton_adj` - count of singleton de novo variants after adj filtration
        - `n_de_novo_singleton` - count of raw unfiltered singleton de novo variants
        - `n_de_novos_adj` - count of adj filtered de novo variants
        - `n_de_novos` - count of raw unfiltered de novo variants
        - `n_trans_singletons` - count of transmitted singletons
        - `n_untrans_singletons` - count of untransmitted singletons
        - `n_omni` - count of omni truth variants
        - `n_mills` - count of mills truth variants
        - `n_hapmap` - count of hapmap truth variants
        - `n_kgp_phase1_hc` - count of 1000 genomes phase 1 high confidence truth variants

    :param ht: Table that aggregation will be performed on
    :param fam_stats_ht: Path to family statistics HT
    :return: a dictionary containing aggregations to perform on ht
    """
    # Annotate binned table with the evaluation data
    ht = ht._parent
    indel_length = hl.abs(ht.alleles[0].length() - ht.alleles[1].length())

    # Load external evaluation data
    build = get_reference_genome(ht.locus).name
    clinvar = (
        grch37_resources.reference_data.clinvar
        if build == "GRCh37"
        else grch38_resources.reference_data.clinvar
    ).ht()[ht.key]
    truth_data = (
        grch37_resources.reference_data.get_truth_ht()
        if build == "GRCh37"
        else grch38_resources.reference_data.get_truth_ht()
    )[ht.key]
    fam = fam_stats_ht[ht.key]

    return dict(
        min_score=hl.agg.min(ht.score),
        max_score=hl.agg.max(ht.score),
        n=hl.agg.count(),
        n_ins=hl.agg.count_where(hl.is_insertion(ht.alleles[0], ht.alleles[1])),
        n_del=hl.agg.count_where(hl.is_deletion(ht.alleles[0], ht.alleles[1])),
        n_ti=hl.agg.count_where(hl.is_transition(ht.alleles[0], ht.alleles[1])),
        n_tv=hl.agg.count_where(hl.is_transversion(ht.alleles[0], ht.alleles[1])),
        n_1bp_indel=hl.agg.count_where(indel_length == 1),
        n_mod3bp_indel=hl.agg.count_where((indel_length % 3) == 0),
        n_singleton=hl.agg.count_where(ht.singleton),
        fail_hard_filters=hl.agg.count_where(
            (ht.info.QD < 2) | (ht.info.FS > 60) | (ht.info.MQ < 30)
        ),
        n_pos_train=hl.agg.count_where(ht.positive_train_site),
        n_neg_train=hl.agg.count_where(ht.negative_train_site),
        n_clinvar=hl.agg.count_where(hl.is_defined(clinvar)),
        n_de_novos_singleton_adj=hl.agg.filter(
            ht.ac == 1, hl.agg.sum(fam.n_de_novos_adj)
        ),
        n_de_novo_singleton=hl.agg.filter(
            ht.ac_raw == 1, hl.agg.sum(fam.n_de_novos_raw)
        ),
        n_de_novos_adj=hl.agg.sum(fam.n_de_novos_adj),
        n_de_novo=hl.agg.sum(fam.n_de_novos_raw),
        n_trans_singletons=hl.agg.filter(
            ht.ac_raw == 2, hl.agg.sum(fam.n_transmitted_raw)
        ),
        n_untrans_singletons=hl.agg.filter(
            (ht.ac_raw < 3) & (ht.ac_qc_samples_unrelated_raw == 1),
            hl.agg.sum(fam.n_untransmitted_raw),
        ),
        n_train_trans_singletons=hl.agg.filter(
            (ht.ac_raw == 2) & ht.positive_train_site,
            hl.agg.sum(fam.n_transmitted_raw),
        ),
        n_omni=hl.agg.count_where(truth_data.omni),
        n_mills=hl.agg.count_where(truth_data.mills),
        n_hapmap=hl.agg.count_where(truth_data.hapmap),
        n_kgp_phase1_hc=hl.agg.count_where(truth_data.kgp_phase1_hc),
    )
def create_binned_data_initial(ht: hl.Table, data: str, data_type: str, n_bins: int) -> hl.Table:
    # Count variants for ranking
    count_expr = {
        x: hl.agg.filter(
            hl.is_defined(ht[x]),
            hl.agg.counter(
                hl.cond(hl.is_snp(ht.alleles[0], ht.alleles[1]), 'snv', 'indel')))
        for x in ht.row if x.endswith('rank')
    }
    rank_variant_counts = ht.aggregate(hl.Struct(**count_expr))
    logger.info(
        f"Found the following variant counts:\n {pformat(rank_variant_counts)}")
    ht_truth_data = hl.read_table(
        f"{temp_dir}/ddd-elgh-ukbb/variant_qc/truthset_table.ht")
    ht = ht.annotate_globals(rank_variant_counts=rank_variant_counts)
    ht = ht.annotate(
        **ht_truth_data[ht.key],
        # **fam_ht[ht.key],
        # **gnomad_ht[ht.key],
        # **denovo_ht[ht.key],
        # clinvar=hl.is_defined(clinvar_ht[ht.key]),
        indel_length=hl.abs(ht.alleles[0].length() - ht.alleles[1].length()),
        rank_bins=hl.array([
            hl.Struct(
                rank_id=rank_name,
                bin=hl.int(
                    hl.ceil(
                        hl.float(ht[rank_name] + 1) / hl.floor(
                            ht.globals.rank_variant_counts[rank_name][hl.cond(
                                hl.is_snp(ht.alleles[0], ht.alleles[1]),
                                'snv', 'indel')] / n_bins))))
            for rank_name in rank_variant_counts
        ]),
        # lcr=hl.is_defined(lcr_intervals[ht.locus])
    )

    ht = ht.explode(ht.rank_bins)
    ht = ht.transmute(rank_id=ht.rank_bins.rank_id, bin=ht.rank_bins.bin)
    ht = ht.filter(hl.is_defined(ht.bin))
    ht = ht.checkpoint(
        f'{tmp_dir}/gnomad_score_binning_tmp.ht', overwrite=True)

    # Create binned data
    return (
        ht
        .group_by(
            rank_id=ht.rank_id,
            contig=ht.locus.contig,
            snv=hl.is_snp(ht.alleles[0], ht.alleles[1]),
            bi_allelic=hl.is_defined(ht.biallelic_rank),
            singleton=ht.transmitted_singleton,
            trans_singletons=hl.is_defined(ht.singleton_rank),
            de_novo_high_quality=ht.de_novo_high_quality_rank,
            de_novo_medium_quality=hl.is_defined(ht.de_novo_medium_quality_rank),
            de_novo_synonymous=hl.is_defined(ht.de_novo_synonymous_rank),
            # release_adj=ht.ac > 0,
            bin=ht.bin)._set_buffer_size(20000)
        .aggregate(
            min_score=hl.agg.min(ht.score),
            max_score=hl.agg.max(ht.score),
            n=hl.agg.count(),
            n_ins=hl.agg.count_where(
                hl.is_insertion(ht.alleles[0], ht.alleles[1])),
            n_del=hl.agg.count_where(
                hl.is_deletion(ht.alleles[0], ht.alleles[1])),
            n_ti=hl.agg.count_where(
                hl.is_transition(ht.alleles[0], ht.alleles[1])),
            n_tv=hl.agg.count_where(
                hl.is_transversion(ht.alleles[0], ht.alleles[1])),
            n_1bp_indel=hl.agg.count_where(ht.indel_length == 1),
            n_mod3bp_indel=hl.agg.count_where((ht.indel_length % 3) == 0),
            # n_clinvar=hl.agg.count_where(ht.clinvar),
            n_singleton=hl.agg.count_where(ht.transmitted_singleton),
            n_high_quality_de_novos=hl.agg.count_where(
                ht.de_novo_data.p_de_novo[0] > 0.99),
            n_validated_DDD_denovos=hl.agg.count_where(
                ht.inheritance.contains("De novo")),
            n_medium_quality_de_novos=hl.agg.count_where(
                ht.de_novo_data.p_de_novo[0] > 0.5),
            n_high_confidence_de_novos=hl.agg.count_where(
                ht.de_novo_data.confidence[0] == 'HIGH'),
            n_de_novo=hl.agg.filter(
                ht.family_stats.unrelated_qc_callstats.AC[0][1] == 0,
                hl.agg.sum(ht.family_stats.mendel[0].errors)),
            n_high_quality_de_novos_synonymous=hl.agg.count_where(
                (ht.de_novo_data.p_de_novo[0] > 0.99) &
                (ht.consequence == "synonymous_variant")),
            # n_de_novo_no_lcr=hl.agg.filter(~ht.lcr & (
            #     ht.family_stats.unrelated_qc_callstats.AC[1] == 0),
            #     hl.agg.sum(ht.family_stats.mendel.errors)),
            n_de_novo_sites=hl.agg.filter(
                ht.family_stats.unrelated_qc_callstats.AC[0][1] == 0,
                hl.agg.count_where(ht.family_stats.mendel[0].errors > 0)),
            # n_de_novo_sites_no_lcr=hl.agg.filter(~ht.lcr & (
            #     ht.family_stats.unrelated_qc_callstats.AC[1] == 0),
            #     hl.agg.count_where(ht.family_stats.mendel.errors > 0)),
            n_trans_singletons=hl.agg.filter(
                (ht.ac_raw < 3) &
                (ht.family_stats.unrelated_qc_callstats.AC[0][1] == 1),
                hl.agg.sum(ht.family_stats.tdt[0].t)),
            n_trans_singletons_synonymous=hl.agg.filter(
                (ht.ac_raw < 3) & (ht.consequence == "synonymous_variant") &
                (ht.family_stats.unrelated_qc_callstats.AC[0][1] == 1),
                hl.agg.sum(ht.family_stats.tdt[0].t)),
            n_untrans_singletons=hl.agg.filter(
                (ht.ac_raw < 3) &
                (ht.family_stats.unrelated_qc_callstats.AC[0][1] == 1),
                hl.agg.sum(ht.family_stats.tdt[0].u)),
            n_untrans_singletons_synonymous=hl.agg.filter(
                (ht.ac_raw < 3) & (ht.consequence == "synonymous_variant") &
                (ht.family_stats.unrelated_qc_callstats.AC[0][1] == 1),
                hl.agg.sum(ht.family_stats.tdt[0].u)),
            n_train_trans_singletons=hl.agg.count_where(
                (ht.family_stats.unrelated_qc_callstats.AC[0][1] == 1) &
                (ht.family_stats.tdt[0].t == 1)),
            n_omni=hl.agg.count_where(ht.omni),
            n_mills=hl.agg.count_where(ht.mills),
            n_hapmap=hl.agg.count_where(ht.hapmap),
            n_kgp_high_conf_snvs=hl.agg.count_where(ht.kgp_phase1_hc),
            fail_hard_filters=hl.agg.count_where(ht.fail_hard_filters),
            # n_vqsr_pos_train=hl.agg.count_where(ht.vqsr_positive_train_site),
            # n_vqsr_neg_train=hl.agg.count_where(ht.vqsr_negative_train_site)
        )
    )
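# Recap of the rank-bin assignment used in both binning functions here: for a
# variant of rank r among N variants of its class (snv or indel),
#   bin = ceil((r + 1) / floor(N / n_bins))
# so each bin holds roughly N / n_bins variants, ordered by score rank.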
def create_binned_data(ht: hl.Table, data: str, data_type: str, n_bins: int) -> hl.Table:
    """
    Creates binned data from a rank Table grouped by rank_id (rank, biallelic, etc.), contig, snv, bi_allelic and singleton containing the information needed for evaluation plots.

    :param Table ht: Input rank table
    :param str data: Which data/run hash is being created
    :param str data_type: one of 'exomes' or 'genomes'
    :param int n_bins: Number of bins.
    :return: Binned Table
    :rtype: Table
    """

    # Count variants for ranking
    count_expr = {
        x: hl.agg.filter(
            hl.is_defined(ht[x]),
            hl.agg.counter(
                hl.cond(hl.is_snp(ht.alleles[0], ht.alleles[1]), 'snv', 'indel')))
        for x in ht.row if x.endswith('rank')
    }
    rank_variant_counts = ht.aggregate(hl.Struct(**count_expr))
    logger.info(
        f"Found the following variant counts:\n {pformat(rank_variant_counts)}")
    ht = ht.annotate_globals(rank_variant_counts=rank_variant_counts)

    # Load external evaluation data
    clinvar_ht = hl.read_table(clinvar_ht_path)
    denovo_ht = get_validated_denovos_ht()
    if data_type == 'exomes':
        denovo_ht = denovo_ht.filter(denovo_ht.gnomad_exomes.high_quality)
    else:
        denovo_ht = denovo_ht.filter(denovo_ht.gnomad_genomes.high_quality)
    denovo_ht = denovo_ht.select(
        validated_denovo=denovo_ht.validated,
        high_confidence_denovo=denovo_ht.Confidence == 'HIGH')
    ht_truth_data = hl.read_table(annotations_ht_path(data_type, 'truth_data'))
    fam_ht = hl.read_table(annotations_ht_path(data_type, 'family_stats'))
    fam_ht = fam_ht.select(family_stats=fam_ht.family_stats[0])
    gnomad_ht = get_gnomad_data(data_type).rows()
    gnomad_ht = gnomad_ht.select(
        vqsr_negative_train_site=gnomad_ht.info.NEGATIVE_TRAIN_SITE,
        vqsr_positive_train_site=gnomad_ht.info.POSITIVE_TRAIN_SITE,
        fail_hard_filters=(gnomad_ht.info.QD < 2) | (gnomad_ht.info.FS > 60) |
        (gnomad_ht.info.MQ < 30))
    lcr_intervals = hl.import_locus_intervals(lcr_intervals_path)

    ht = ht.annotate(
        **ht_truth_data[ht.key],
        **fam_ht[ht.key],
        **gnomad_ht[ht.key],
        **denovo_ht[ht.key],
        clinvar=hl.is_defined(clinvar_ht[ht.key]),
        indel_length=hl.abs(ht.alleles[0].length() - ht.alleles[1].length()),
        rank_bins=hl.array([
            hl.Struct(
                rank_id=rank_name,
                bin=hl.int(
                    hl.ceil(
                        hl.float(ht[rank_name] + 1) / hl.floor(
                            ht.globals.rank_variant_counts[rank_name][hl.cond(
                                hl.is_snp(ht.alleles[0], ht.alleles[1]),
                                'snv', 'indel')] / n_bins))))
            for rank_name in rank_variant_counts
        ]),
        lcr=hl.is_defined(lcr_intervals[ht.locus]))

    ht = ht.explode(ht.rank_bins)
    ht = ht.transmute(rank_id=ht.rank_bins.rank_id, bin=ht.rank_bins.bin)
    ht = ht.filter(hl.is_defined(ht.bin))
    ht = ht.checkpoint(
        f'gs://gnomad-tmp/gnomad_score_binning_{data_type}_tmp_{data}.ht',
        overwrite=True)

    # Create binned data
    return (ht.group_by(
        rank_id=ht.rank_id,
        contig=ht.locus.contig,
        snv=hl.is_snp(ht.alleles[0], ht.alleles[1]),
        bi_allelic=hl.is_defined(ht.biallelic_rank),
        singleton=ht.singleton,
        release_adj=ht.ac > 0,
        bin=ht.bin)._set_buffer_size(20000).aggregate(
            min_score=hl.agg.min(ht.score),
            max_score=hl.agg.max(ht.score),
            n=hl.agg.count(),
            n_ins=hl.agg.count_where(
                hl.is_insertion(ht.alleles[0], ht.alleles[1])),
            n_del=hl.agg.count_where(
                hl.is_deletion(ht.alleles[0], ht.alleles[1])),
            n_ti=hl.agg.count_where(
                hl.is_transition(ht.alleles[0], ht.alleles[1])),
            n_tv=hl.agg.count_where(
                hl.is_transversion(ht.alleles[0], ht.alleles[1])),
            n_1bp_indel=hl.agg.count_where(ht.indel_length == 1),
            n_mod3bp_indel=hl.agg.count_where((ht.indel_length % 3) == 0),
            n_clinvar=hl.agg.count_where(ht.clinvar),
            n_singleton=hl.agg.count_where(ht.singleton),
            n_validated_de_novos=hl.agg.count_where(ht.validated_denovo),
            n_high_confidence_de_novos=hl.agg.count_where(
                ht.high_confidence_denovo),
            n_de_novo=hl.agg.filter(
                ht.family_stats.unrelated_qc_callstats.AC[1] == 0,
                hl.agg.sum(ht.family_stats.mendel.errors)),
            n_de_novo_no_lcr=hl.agg.filter(
                ~ht.lcr & (ht.family_stats.unrelated_qc_callstats.AC[1] == 0),
                hl.agg.sum(ht.family_stats.mendel.errors)),
            n_de_novo_sites=hl.agg.filter(
                ht.family_stats.unrelated_qc_callstats.AC[1] == 0,
                hl.agg.count_where(ht.family_stats.mendel.errors > 0)),
            n_de_novo_sites_no_lcr=hl.agg.filter(
                ~ht.lcr & (ht.family_stats.unrelated_qc_callstats.AC[1] == 0),
                hl.agg.count_where(ht.family_stats.mendel.errors > 0)),
            n_trans_singletons=hl.agg.filter(
                (ht.info_ac < 3) &
                (ht.family_stats.unrelated_qc_callstats.AC[1] == 1),
                hl.agg.sum(ht.family_stats.tdt.t)),
            n_untrans_singletons=hl.agg.filter(
                (ht.info_ac < 3) &
                (ht.family_stats.unrelated_qc_callstats.AC[1] == 1),
                hl.agg.sum(ht.family_stats.tdt.u)),
            n_train_trans_singletons=hl.agg.count_where(
                (ht.family_stats.unrelated_qc_callstats.AC[1] == 1) &
                (ht.family_stats.tdt.t == 1)),
            n_omni=hl.agg.count_where(ht.truth_data.omni),
            n_mills=hl.agg.count_where(ht.truth_data.mills),
            n_hapmap=hl.agg.count_where(ht.truth_data.hapmap),
            n_kgp_high_conf_snvs=hl.agg.count_where(
                ht.truth_data.kgp_high_conf_snvs),
            fail_hard_filters=hl.agg.count_where(ht.fail_hard_filters),
            n_vqsr_pos_train=hl.agg.count_where(ht.vqsr_positive_train_site),
            n_vqsr_neg_train=hl.agg.count_where(ht.vqsr_negative_train_site)))
hl.set_global_seed(seed)

# suffix for genotype simulation (empty string if using ukb data)
gt_sim_suffix = f'bn.npops_{n_pops}.nvars_{n_vars}.nsim_{n_sim}' if sim_name[:3] == 'bn_' else ''
param_suffix = f'{gt_sim_suffix}.h2_{h2}.pi_{pi}.K_{K}.seed_{seed}'
betas_path = f'{smiles_wd}/betas.{param_suffix}.tsv.gz'
phens_path = f'{smiles_wd}/phens.{param_suffix}.tsv.gz'

if sim_name[:3] == 'bn_':
    mt = hl.balding_nichols_model(n_populations=n_pops,
                                  n_samples=n_sim,
                                  n_variants=n_vars,
                                  fst=fst)
    mt = mt.filter_rows(
        hl.abs(hl.agg.mean(mt.GT.n_alt_alleles()) / 2 - 0.5) < 0.5)  # remove invariant SNPs
    mt = mt.annotate_cols(s=hl.str(mt.sample_idx))

if hl.hadoop_is_file(betas_path) and hl.hadoop_is_file(phens_path):
    # betas = hl.import_table(betas_path, impute=True, force=True)
    # betas = betas.annotate(locus=hl.parse_locus(betas.locus),
    #                        alleles=betas.alleles.replace('\[\"', '').replace('\"\]', '').split('\",\"'))
    # betas = betas.key_by('locus', 'alleles')
    phens = hl.import_table(phens_path,
                            key=['s'],
                            types={'s': hl.tstr},
                            impute=True,
                            force=True)
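# The invariant-SNP filter above works because hl.agg.mean(mt.GT.n_alt_alleles()) / 2
# is the alternate allele frequency: |af - 0.5| < 0.5 holds for 0 < af < 1 and
# fails exactly when af is 0 or 1 (monomorphic sites).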
def main():
    args = parse_args()

    tables = []
    for i, path in enumerate(args.paths):
        ht = import_SJ_out_tab(path)
        ht = ht.key_by("chrom", "start_1based", "end_1based")

        if args.normalize_read_counts:
            ht = ht.annotate_globals(
                unique_reads_in_sample=ht.aggregate(hl.agg.sum(ht.unique_reads)),
                multi_mapped_reads_in_sample=ht.aggregate(
                    hl.agg.sum(ht.multi_mapped_reads)),
            )

        # add 'interval' column
        #ht = ht.annotate(interval=hl.interval(
        #    hl.locus(ht.chrom, ht.start_1based, reference_genome=reference_genome),
        #    hl.locus(ht.chrom, ht.end_1based, reference_genome=reference_genome),))

        tables.append(ht)

    # compute mean
    if args.normalize_read_counts:
        mean_unique_reads_in_sample = sum(
            [hl.eval(ht.unique_reads_in_sample) for ht in tables]) / float(len(tables))
        mean_multi_mapped_reads_in_sample = sum(
            [hl.eval(ht.multi_mapped_reads_in_sample) for ht in tables]) / float(len(tables))
        print(
            f"mean_unique_reads_in_sample: {mean_unique_reads_in_sample:01f}, "
            f"mean_multi_mapped_reads_in_sample: {mean_multi_mapped_reads_in_sample:01f}"
        )

    combined_ht = None
    for i, ht in enumerate(tables):
        print(f"Processing table #{i} out of {len(tables)}")

        if args.normalize_read_counts:
            unique_reads_multiplier = mean_unique_reads_in_sample / float(
                hl.eval(ht.unique_reads_in_sample))
            multi_mapped_reads_multiplier = mean_multi_mapped_reads_in_sample / float(
                hl.eval(ht.multi_mapped_reads_in_sample))
            print(
                f"unique_reads_multiplier: {unique_reads_multiplier:01f}, "
                f"multi_mapped_reads_multiplier: {multi_mapped_reads_multiplier:01f}"
            )

        ht = ht.annotate(
            strand_counter=hl.or_else(
                hl.switch(ht.strand).when(1, 1).when(2, -1).or_missing(), 0),
            num_samples_with_this_junction=1,
        )

        if args.normalize_read_counts:
            ht = ht.annotate(
                unique_reads=hl.int32(ht.unique_reads * unique_reads_multiplier),
                multi_mapped_reads=hl.int32(
                    ht.multi_mapped_reads * multi_mapped_reads_multiplier),
            )

        if combined_ht is None:
            combined_ht = ht
            continue

        print("----")
        print_stats(path, ht)

        combined_ht = combined_ht.join(ht, how="outer")
        combined_ht = combined_ht.transmute(
            # in rare cases, the strand for the same junction may differ across
            # samples, so use a 2-step process that assigns strand based on the
            # majority of samples
            strand=hl.or_else(combined_ht.strand, combined_ht.strand_1),
            # samples vote on whether strand = 1 (eg. '+') or 2 (eg. '-')
            strand_counter=hl.sum(
                [combined_ht.strand_counter, combined_ht.strand_counter_1]),
            intron_motif=hl.or_else(
                combined_ht.intron_motif,
                combined_ht.intron_motif_1),  # double-check that left == right?
            known_splice_junction=hl.or_else(
                hl.cond((combined_ht.known_splice_junction == 1) |
                        (combined_ht.known_splice_junction_1 == 1), 1, 0),
                0),  # double-check that left == right?
            unique_reads=hl.sum(
                [combined_ht.unique_reads, combined_ht.unique_reads_1]),
            multi_mapped_reads=hl.sum(
                [combined_ht.multi_mapped_reads, combined_ht.multi_mapped_reads_1]),
            maximum_overhang=hl.max(
                [combined_ht.maximum_overhang, combined_ht.maximum_overhang_1]),
            num_samples_with_this_junction=hl.sum([
                combined_ht.num_samples_with_this_junction,
                combined_ht.num_samples_with_this_junction_1
            ]),
        )

        combined_ht = combined_ht.checkpoint(
            f"checkpoint{i % 2}.ht", overwrite=True)  #, _read_if_exists=True)

    total_junctions_count = combined_ht.count()
    strand_conflicts_count = combined_ht.filter(
        hl.abs(combined_ht.strand_counter) /
        hl.float(combined_ht.num_samples_with_this_junction) < 0.1,
        keep=True).count()

    # set final strand value to 1 (eg. '+') or 2 (eg. '-') or 0 (eg. unknown)
    # based on the setting in the majority of samples
    combined_ht = combined_ht.annotate(
        strand=hl.case().when(combined_ht.strand_counter > 0, 1).when(
            combined_ht.strand_counter < 0, 2).default(0))

    combined_ht = combined_ht.annotate_globals(
        combined_tables=args.paths,
        n_combined_tables=len(args.paths))

    if strand_conflicts_count:
        print(
            f"WARNING: Found {strand_conflicts_count} strand_conflicts out of "
            f"{total_junctions_count} total_junctions"
        )

    # write as HT
    combined_ht = combined_ht.checkpoint(
        "combined.SJ.out.ht", overwrite=True)  #, _read_if_exists=True)

    # write as tsv
    output_prefix = f"combined.{len(tables)}_samples{'.normalized_counts' if args.normalize_read_counts else ''}"
    combined_ht = combined_ht.key_by()
    combined_ht.export(f"{output_prefix}.with_header.combined.SJ.out.tab",
                       header=True)
    combined_ht = combined_ht.select(
        "chrom",
        "start_1based",
        "end_1based",
        "strand",
        "intron_motif",
        "known_splice_junction",
        "unique_reads",
        "multi_mapped_reads",
        "maximum_overhang",
    )
    combined_ht.export(f"{output_prefix}.SJ.out.tab", header=False)

    print(
        f"unique_reads_in combined table: "
        f"{combined_ht.aggregate(hl.agg.sum(combined_ht.unique_reads))}"
    )
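# Recap of the strand vote used above: each sample contributes +1 for strand 1
# ('+') and -1 for strand 2 ('-'); the final strand is the sign of
# strand_counter, and a junction counts as a strand conflict when the winning
# margin is small, i.e.
#   abs(strand_counter) / num_samples_with_this_junction < 0.1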