def query():
    """Query script entry point."""
    hl.init(default_reference='GRCh38')
    mt = hl.read_matrix_table(HGDP1KG_TOBWGS)
    # Perform kinship test with pc_relate
    pc_rel_path = output_path('pc_relate_kinship_estimate.ht')
    pc_rel = hl.pc_relate(mt.GT, 0.01, k=10, statistics='kin')
    pc_rel.write(pc_rel_path, overwrite=True)
    pairs = pc_rel.filter(pc_rel['kin'] >= 0.125)
    related_samples_to_remove = hl.maximal_independent_set(
        pairs.i, pairs.j, False
    )
    n_related_samples = related_samples_to_remove.count()
    print(f'related_samples_to_remove.count() = {n_related_samples}')

    # save as html
    html = pd.DataFrame(
        {'removed_individual': related_samples_to_remove.node.s.collect()}
    ).to_html()
    plot_filename_html = output_path('removed_samples.html', 'web')
    with hl.hadoop_open(plot_filename_html, 'w') as f:
        f.write(html)

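# ---------------------------------------------------------------------------
# Each `query()` / `main()` in this listing is the entry point of a separate
# script run via the analysis-runner; the excerpts omit the module-level
# headers. The block below is a hedged reconstruction of the shared setup the
# scripts assume: the gs:// dataset-path constant is an illustrative
# placeholder only, and `output_path` is assumed to be the analysis-runner
# helper that prefixes a filename with the dataset's output bucket (with an
# optional category such as 'web', 'analysis' or 'tmp').
# ---------------------------------------------------------------------------
import re

import hail as hl
import numpy as np
import pandas as pd
from analysis_runner import output_path  # assumed import
from bokeh.embed import file_html
from bokeh.io.export import get_screenshot_as_png
from bokeh.models import CategoricalColorMapper, ColumnDataSource
from bokeh.palettes import Dark2, turbo
from bokeh.plotting import figure
from bokeh.resources import CDN
from bokeh.transform import factor_cmap

# Illustrative placeholder; each script defines its own input constants along
# these lines (HGDP1KG_TOBWGS, TOB_WGS, SNP_CHIP, GNOMAD_HGDP_1KG_MT, SCORES,
# EIGENVALUES, LOADINGS, FILTERED_VARIANTS, GTF_FILE, and the relatedness
# estimate tables).
HGDP1KG_TOBWGS = 'gs://<bucket>/hgdp1kg_tobwgs_joined_all_samples.mt'
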
def query():  # pylint: disable=too-many-locals
    """Query script entry point."""
    hl.init(default_reference='GRCh38')
    tob_wgs = hl.read_matrix_table(TOB_WGS).key_rows_by('locus', 'alleles')
    snp_chip = hl.read_matrix_table(SNP_CHIP).key_rows_by('locus', 'alleles')
    # Densify, then restrict to loci that are contained in the SNP-chip data
    # (union_cols below keeps only the shared rows)
    tob_wgs = hl.experimental.densify(tob_wgs)
    tob_wgs = tob_wgs.select_entries(
        GT=lgt_to_gt(tob_wgs.LGT, tob_wgs.LA)
    ).select_cols()
    snp_chip = snp_chip.select_entries(snp_chip.GT).select_cols()
    snp_chip = snp_chip.key_cols_by(s=snp_chip.s + '_snp_chip')
    tob_combined = tob_wgs.union_cols(snp_chip)
    tob_combined = tob_combined.cache()
    print(tob_combined.count_rows())

    # Perform PCA
    eigenvalues_path = output_path('eigenvalues.ht')
    scores_path = output_path('scores.ht')
    loadings_path = output_path('loadings.ht')
    eigenvalues, scores, loadings = hl.hwe_normalized_pca(
        tob_combined.GT, compute_loadings=True, k=20
    )
    hl.Table.from_pandas(pd.DataFrame(eigenvalues)).export(eigenvalues_path)
    scores.write(scores_path, overwrite=True)
    loadings.write(loadings_path, overwrite=True)

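# `lgt_to_gt` converts sparse (local) genotype calls to global ones after
# densify: LGT indexes into the per-sample local-alleles array LA, which in
# turn indexes the row's global alleles array. The scripts import it from
# elsewhere; a minimal sketch of the conversion, modelled on the helper of
# the same name in the gnomad_methods library (an assumption, not shown in
# these excerpts):
def lgt_to_gt(
    lgt: hl.expr.CallExpression, la: hl.expr.ArrayExpression
) -> hl.expr.CallExpression:
    """Map a local call (LGT) through the local alleles (LA) to a global call."""
    return hl.call(la[lgt[0]], la[lgt[1]])
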
def query():
    """Query script entry point."""
    hl.init(default_reference='GRCh38')
    tob_wgs = hl.read_matrix_table(TOB_WGS)
    hgdp_1kg = hl.read_matrix_table(GNOMAD_HGDP_1KG_MT)
    # keep loci that are contained in the densified, filtered tob-wgs mt
    hgdp_1kg = hgdp_1kg.semi_join_rows(tob_wgs.rows())
    # Entries and columns must be identical
    tob_wgs_select = tob_wgs.select_entries(
        GT=lgt_to_gt(tob_wgs.LGT, tob_wgs.LA)
    ).select_cols()
    hgdp_1kg_select = hgdp_1kg.select_entries(hgdp_1kg.GT).select_cols()
    # Join datasets
    hgdp1kg_tobwgs_joined = hgdp_1kg_select.union_cols(tob_wgs_select)
    # Add in metadata information
    hgdp_1kg_metadata = hgdp_1kg.cols()
    hgdp1kg_tobwgs_joined = hgdp1kg_tobwgs_joined.annotate_cols(
        hgdp_1kg_metadata=hgdp_1kg_metadata[hgdp1kg_tobwgs_joined.s]
    )
    # save this for population-level PCAs
    mt_path = output_path('hgdp1kg_tobwgs_joined_all_samples.mt')
    if not hl.hadoop_exists(mt_path):
        hgdp1kg_tobwgs_joined.write(mt_path)

    # Perform PCA
    eigenvalues_path = output_path('eigenvalues.ht')
    scores_path = output_path('scores.ht')
    loadings_path = output_path('loadings.ht')
    eigenvalues, scores, loadings = hl.hwe_normalized_pca(
        hgdp1kg_tobwgs_joined.GT, compute_loadings=True, k=20
    )
    hl.Table.from_pandas(pd.DataFrame(eigenvalues)).export(eigenvalues_path)
    scores.write(scores_path, overwrite=True)
    loadings.write(loadings_path, overwrite=True)

def query():
    """Query script entry point."""
    hl.init(default_reference='GRCh38')
    mt = hl.read_matrix_table(HGDP1KG_TOBWGS)
    mt = mt.filter_cols(
        (mt.hgdp_1kg_metadata.population_inference.pop == 'nfe')
        | (mt.s.contains('TOB'))
    )
    # Remove related samples (at the 2nd degree or closer)
    king = hl.king(mt.GT)
    king_path = output_path('king_kinship_estimate_NFE.ht')
    king.write(king_path)
    ht = king.entries()
    # Keep distinct pairs with kinship above the 2nd-degree threshold
    related_samples = ht.filter((ht.s_1 != ht.s) & (ht.phi > 0.125), keep=True)
    related_samples_to_remove = hl.maximal_independent_set(
        related_samples.s_1, related_samples.s, False
    )
    n_related_samples = related_samples_to_remove.count()
    print(f'related_samples_to_remove.count() = {n_related_samples}')

    # save as html
    html = pd.DataFrame(
        {'related_individual': related_samples_to_remove.node.collect()}
    ).to_html()
    plot_filename_html = output_path('related_samples.html', 'web')
    with hl.hadoop_open(plot_filename_html, 'w') as f:
        f.write(html)

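# For reference when reading the thresholds above and below: under both
# PC-Relate's `kin` and KING's `phi`, the expected kinship coefficient is
# ~0.25 for first-degree pairs (parent-offspring, full siblings), ~0.125 for
# second-degree pairs, and ~0.0625 for third degree, so a >= 0.125 cutoff
# targets pairs related at roughly the second degree or closer.
EXPECTED_KINSHIP = {
    'monozygotic_twin_or_duplicate': 0.5,
    'first_degree': 0.25,
    'second_degree': 0.125,
    'third_degree': 0.0625,
}
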
def query():
    """Query script entry point."""
    hl.init(default_reference='GRCh38')
    mt = hl.read_matrix_table(HGDP1KG_TOBWGS)
    # Get samples from the specified population only
    mt = mt.filter_cols(
        (mt.hgdp_1kg_metadata.population_inference.pop == 'nfe')
        | (mt.s.contains('TOB'))
    )
    # remove outlier samples, as identified by PCA
    outliers = [
        'TOB1734', 'TOB1714', 'TOB1126', 'TOB1653', 'TOB1668', 'TOB1681',
        'TOB1116', 'TOB1107', 'TOB1635', 'HG01628', 'TOB1675', 'TOB1125',
        'TOB1762', 'TOB1263', 'TOB1640', 'HG01669', 'TOB1795', 'TOB1707',
        'HG01695', 'HG01694', 'TOB1673', 'HG01630',
    ]
    mt = mt.filter_cols(hl.literal(outliers).contains(mt.s), keep=False)
    # Remove related samples at the 2nd degree or closer, as indicated by
    # gnomAD's release flag (TOB samples are kept regardless)
    mt = mt.filter_cols(
        mt.hgdp_1kg_metadata.gnomad_release | mt.s.startswith('TOB')
    )

    # Perform PCA
    eigenvalues_path = output_path('eigenvalues.ht')
    scores_path = output_path('scores.ht')
    loadings_path = output_path('loadings.ht')
    eigenvalues, scores, loadings = hl.hwe_normalized_pca(
        mt.GT, compute_loadings=True, k=20
    )
    hl.Table.from_pandas(pd.DataFrame(eigenvalues)).export(eigenvalues_path)
    scores.write(scores_path, overwrite=True)
    loadings.write(loadings_path, overwrite=True)

def query():  # pylint: disable=too-many-locals
    """Query script entry point."""
    hl.init(default_reference='GRCh38')
    mt = hl.read_matrix_table(FILTERED_VARIANTS)
    nrows = mt.count_rows()
    print(f'mt.count_rows() = {nrows}')

    # Plot the allele frequency
    fig = figure(
        title='Variant AF',
        x_axis_label='Allele Frequency',
        y_axis_label='Frequency (%)',
    )
    variant_af = mt.variant_qc.AF[1].collect()
    af_count, edges = np.histogram(
        variant_af, bins=100, weights=np.ones(len(variant_af)) / len(variant_af)
    )
    variant_af_count = pd.DataFrame(
        {'variant_af_count': af_count, 'left': edges[:-1], 'right': edges[1:]}
    )
    fig.quad(
        bottom=0,
        top=variant_af_count['variant_af_count'],
        left=variant_af_count['left'],
        right=variant_af_count['right'],
        fill_color='blue',
        line_color='black',
    )
    # Add in the cumulative distribution
    cumulative_af = np.cumsum(af_count)
    fig.line(
        x=variant_af_count['right'],
        y=cumulative_af,
        color='gray',
        line_width=1,
        legend='Cum dist',
    )
    fig.legend.location = 'top_left'
    fig_filename = output_path('variant_selection_histogram.png', 'web')
    with hl.hadoop_open(fig_filename, 'wb') as f:
        get_screenshot_as_png(fig).save(f, format='PNG')
    html = file_html(fig, CDN, 'my plot')
    fig_filename_html = output_path('variant_selection_histogram.html', 'web')
    with hl.hadoop_open(fig_filename_html, 'w') as f:
        f.write(html)

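# A tiny check of the weights trick used above: weighting each observation by
# 1/N makes np.histogram return per-bin fractions that sum to 1, rather than
# raw counts (values here are illustrative only).
vals = np.array([0.1, 0.1, 0.4, 0.9])
frac, bin_edges = np.histogram(vals, bins=2, weights=np.ones(len(vals)) / len(vals))
assert frac.sum() == 1.0  # frac == array([0.75, 0.25])
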
def query():
    """Query script entry point."""
    hl.init(default_reference='GRCh38')
    snp_chip = hl.read_matrix_table(SNP_CHIP)
    eigenvalues_path = output_path('eigenvalues.ht')
    scores_path = output_path('scores.ht')
    loadings_path = output_path('loadings.ht')
    # Perform PCA
    eigenvalues, scores, loadings = hl.hwe_normalized_pca(
        snp_chip.GT, compute_loadings=True, k=5
    )
    hl.Table.from_pandas(pd.DataFrame(eigenvalues)).export(eigenvalues_path)
    scores.write(scores_path, overwrite=True)
    loadings.write(loadings_path, overwrite=True)

def query():
    """Query script entry point."""
    hl.init(default_reference='GRCh38')
    loadings = hl.read_table(LOADINGS)
    loadings = loadings.repartition(100, shuffle=False)
    loadings_path = output_path('gnomad_loadings_90k_liftover_repartitioned.ht')
    loadings.write(loadings_path)

def query():
    """Query script entry point."""
    hl.init(default_reference='GRCh38')
    snp_chip = hl.read_matrix_table(SNP_CHIP).key_rows_by('locus', 'alleles')
    snp_chip = snp_chip.repartition(10000)
    snp_chip_path = output_path('snp_chip_10000_partitions.mt')
    snp_chip.write(snp_chip_path, overwrite=True)

def query():
    """Query script entry point."""
    hl.init(default_reference='GRCh38')
    tob_wgs = hl.read_matrix_table(TOB_WGS)
    tob_wgs = hl.experimental.densify(tob_wgs)
    tob_wgs = hl.split_multi_hts(tob_wgs)
    tob_wgs_path = output_path('tob_wgs_plink')
    hl.export_plink(tob_wgs, tob_wgs_path, ind_id=tob_wgs.s)

def query(rerun):
    """Query script entry point."""
    hl.init(default_reference='GRCh38')
    sample_qc_path = output_path('sample_qc.mt')
    if rerun or not hl.hadoop_exists(sample_qc_path):
        mt = hl.read_matrix_table(GNOMAD_HGDP_1KG_MT)
        mt = mt.head(100, n_cols=100)
        mt_qc = hl.sample_qc(mt)
        mt_qc.write(sample_qc_path)
    mt_qc = hl.read_matrix_table(sample_qc_path)

    plot_filename = output_path('call_rate_plot.png', 'web')
    if rerun or not hl.hadoop_exists(plot_filename):
        call_rate_plot = hl.plot.histogram(
            mt_qc.sample_qc.call_rate, range=(0, 1), legend='Call rate'
        )
        with hl.hadoop_open(plot_filename, 'wb') as f:
            get_screenshot_as_png(call_rate_plot).save(f, format='PNG')

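# The other plotting scripts in this listing also save an interactive HTML
# copy of each figure. For parity, the same pattern would apply to the
# call-rate histogram above (a sketch, `call_rate_plot` being the figure
# built inside query()):
#
#     html = file_html(call_rate_plot, CDN, 'call rate')
#     with hl.hadoop_open(output_path('call_rate_plot.html', 'web'), 'w') as f:
#         f.write(html)
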
def main(mt_path: str):
    """Run VEP via the main.py wrapper."""
    hl.init(default_reference='GRCh38')
    mt = hl.read_matrix_table(mt_path)
    # Filter to biallelic loci only, and drop star (spanning-deletion) alleles
    mt = mt.filter_rows(hl.len(mt.alleles) == 2)
    mt = mt.filter_rows(mt.alleles[1] != '*')
    vep = hl.vep(mt, config='file:///vep_data/vep-gcloud.json')
    vep_path = output_path('vep105_GRCh38.mt')
    vep.write(vep_path)

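# The main.py wrapper itself is not shown. A hypothetical command-line entry
# point for main() above might look like this (click and the flag name are
# assumptions, not the original wrapper):
import click

@click.command()
@click.option('--mt', 'mt_path', required=True, help='Path to the input MatrixTable')
def cli(mt_path: str):
    """Run VEP over the given MatrixTable."""
    main(mt_path)

if __name__ == '__main__':
    cli()  # pylint: disable=no-value-for-parameter
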
def query():
    """Query script entry point."""
    hl.init(default_reference='GRCh38')
    tob_wgs = hl.read_matrix_table(TOB_WGS)
    tob_wgs = hl.experimental.densify(tob_wgs)
    # filter to biallelic loci only
    tob_wgs = tob_wgs.filter_rows(hl.len(tob_wgs.alleles) == 2)
    tob_wgs = tob_wgs.head(30000)
    ld = hl.ld_matrix(tob_wgs.GT.n_alt_alleles(), tob_wgs.locus, radius=2e6)
    ld = pd.DataFrame(ld.to_numpy())
    # save pandas df
    ld_filename = output_path('ld_matrix.csv', 'analysis')
    ld.to_csv(ld_filename, index=False)

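# Note: hl.ld_matrix returns a distributed BlockMatrix, and the to_numpy()
# call above localises the whole matrix to driver memory (30,000 x 30,000
# float64 entries is ~7.2 GB). For larger variant windows it may be safer to
# keep it distributed and persist in Hail's native format instead (a sketch;
# the path is illustrative):
#
#     ld = hl.ld_matrix(tob_wgs.GT.n_alt_alleles(), tob_wgs.locus, radius=2e6)
#     ld.write(output_path('ld_matrix.bm', 'analysis'), overwrite=True)
#     # read back later with hl.linalg.BlockMatrix.read(...)
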
def query():
    """Query script entry point."""
    hl.init(default_reference='GRCh38')
    mt = hl.read_matrix_table(HGDP1KG_TOBWGS)
    scores = hl.read_table(SCORES)
    # Filter outliers and related samples
    mt = mt.semi_join_cols(scores)
    mt = mt.annotate_cols(scores=scores[mt.s].scores)
    mt = mt.annotate_cols(
        study=hl.if_else(mt.s.contains('TOB'), 'TOB-WGS', 'HGDP-1kG')
    )

    # Fields used in the PCA plot must all come from the same object
    columns = mt.cols()
    pca_scores = columns.scores
    labels = columns.study
    sample_names = columns.s
    cohort_sample_codes = list(set(labels.collect()))
    tooltips = [('labels', '@label'), ('samples', '@samples')]

    # get percent variance explained
    eigenvalues = hl.import_table(EIGENVALUES)
    eigenvalues = eigenvalues.to_pandas()
    eigenvalues.columns = ['eigenvalue']
    eigenvalues = pd.to_numeric(eigenvalues.eigenvalue)
    variance = eigenvalues.divide(float(eigenvalues.sum())) * 100
    variance = variance.round(2)

    # Get number of PCs
    number_of_pcs = len(eigenvalues)

    print('Making PCA plots labelled by study')
    for i in range(0, (number_of_pcs - 1)):
        pc1 = i
        pc2 = i + 1
        print(f'PC{pc1 + 1} vs PC{pc2 + 1}')
        plot = figure(
            title='TOB-WGS + HGDP/1kG Dataset',
            x_axis_label=f'PC{pc1 + 1} ({variance[pc1]}%)',
            y_axis_label=f'PC{pc2 + 1} ({variance[pc2]}%)',
            tooltips=tooltips,
        )
        source = ColumnDataSource(
            dict(
                x=pca_scores[pc1].collect(),
                y=pca_scores[pc2].collect(),
                label=labels.collect(),
                samples=sample_names.collect(),
            )
        )
        plot.circle(
            'x',
            'y',
            alpha=0.5,
            source=source,
            size=4,
            color=factor_cmap('label', ['#1b9e77', '#d95f02'], cohort_sample_codes),
            legend_group='label',
        )
        plot.add_layout(plot.legend[0], 'left')
        plot_filename = output_path(f'study_pc{pc2}.png', 'web')
        with hl.hadoop_open(plot_filename, 'wb') as f:
            get_screenshot_as_png(plot).save(f, format='PNG')
        html = file_html(plot, CDN, 'my plot')
        plot_filename_html = output_path(f'study_pc{pc2}.html', 'web')
        with hl.hadoop_open(plot_filename_html, 'w') as f:
            f.write(html)

    print('Making PCA plots labelled by the subpopulation')
    labels = columns.hgdp_1kg_metadata.labeled_subpop.collect()
    labels = ['TOB-WGS' if x is None else x for x in labels]
    subpopulation = list(set(labels))
    # change ordering of subpopulations
    # so TOB-WGS is at the end and glyphs appear on top
    subpopulation.append(subpopulation.pop(subpopulation.index('TOB-WGS')))
    tooltips = [('labels', '@label'), ('samples', '@samples')]
    for i in range(0, (number_of_pcs - 1)):
        pc1 = i
        pc2 = i + 1
        print(f'PC{pc1 + 1} vs PC{pc2 + 1}')
        plot = figure(
            title='Subpopulation',
            x_axis_label=f'PC{pc1 + 1} ({variance[pc1]}%)',
            y_axis_label=f'PC{pc2 + 1} ({variance[pc2]}%)',
            tooltips=tooltips,
        )
        source = ColumnDataSource(
            dict(
                x=pca_scores[pc1].collect(),
                y=pca_scores[pc2].collect(),
                label=labels,
                samples=sample_names.collect(),
            )
        )
        plot.circle(
            'x',
            'y',
            alpha=0.5,
            source=source,
            size=4,
            color=factor_cmap('label', turbo(len(subpopulation)), subpopulation),
            legend_group='label',
        )
        plot.add_layout(plot.legend[0], 'left')
        plot_filename = output_path(f'subpopulation_pc{pc2}.png', 'web')
        with hl.hadoop_open(plot_filename, 'wb') as f:
            get_screenshot_as_png(plot).save(f, format='PNG')
        html = file_html(plot, CDN, 'my plot')
        plot_filename_html = output_path(f'subpopulation_pc{pc2}.html', 'web')
        with hl.hadoop_open(plot_filename_html, 'w') as f:
            f.write(html)

def query():
    """Query script entry point."""
    hl.init(default_reference='GRCh38')

    # save relatedness estimates for pc_relate global populations
    ht = hl.read_table(PC_RELATE_ESTIMATE_GLOBAL)
    related_samples = ht.filter(ht.kin > 0.1)
    pc_relate_global = pd.DataFrame(
        {
            'i_s': related_samples.i.s.collect(),
            'j_s': related_samples.j.s.collect(),
            'kin': related_samples.kin.collect(),
        }
    )
    filename = output_path('pc_relate_global_matrix.csv', 'analysis')
    pc_relate_global.to_csv(filename, index=False)

    # get maximal independent set
    pairs = ht.filter(ht['kin'] >= 0.125)
    related_samples_to_remove = hl.maximal_independent_set(
        pairs.i, pairs.j, False
    )
    related_samples = pd.DataFrame(
        {'removed_individual': related_samples_to_remove.node.s.collect()}
    )
    filename = output_path(
        'pc_relate_global_maximal_independent_set.csv', 'analysis'
    )
    related_samples.to_csv(filename, index=False)

    # save relatedness estimates for pc_relate NFE samples
    ht = hl.read_table(PC_RELATE_ESTIMATE_NFE)
    related_samples = ht.filter(ht.kin > 0.1)
    pc_relate_nfe = pd.DataFrame(
        {
            'i_s': related_samples.i.s.collect(),
            'j_s': related_samples.j.s.collect(),
            'kin': related_samples.kin.collect(),
        }
    )
    filename = output_path('pc_relate_nfe_matrix.csv', 'analysis')
    pc_relate_nfe.to_csv(filename, index=False)

    # get maximal independent set
    pairs = ht.filter(ht['kin'] >= 0.125)
    related_samples_to_remove = hl.maximal_independent_set(
        pairs.i, pairs.j, False
    )
    related_samples = pd.DataFrame(
        {'removed_individual': related_samples_to_remove.node.s.collect()}
    )
    filename = output_path(
        'pc_relate_nfe_maximal_independent_set.csv', 'analysis'
    )
    related_samples.to_csv(filename, index=False)

    # save relatedness estimates for KING NFE samples
    mt = hl.read_matrix_table(KING_ESTIMATE_NFE)
    ht = mt.entries()
    # remove entries where samples are identical, then keep phi > 0.1
    # (the original applied these as two separate filters on ht, so the
    # identity filter was discarded; combining them preserves the intent)
    related_samples = ht.filter((ht.s_1 != ht.s) & (ht.phi > 0.1))
    king_nfe = pd.DataFrame(
        {
            'i_s': related_samples.s_1.collect(),
            'j_s': related_samples.s.collect(),
            'kin': related_samples.phi.collect(),
        }
    )
    filename = output_path('king_nfe_matrix_90k.csv', 'analysis')
    king_nfe.to_csv(filename, index=False)

    # save KING NFE maximal independent set
    second_degree_related_samples = ht.filter(
        (ht.s_1 != ht.s) & (ht.phi > 0.125), keep=True
    )
    related_samples_to_remove = hl.maximal_independent_set(
        second_degree_related_samples.s_1,
        second_degree_related_samples.s,
        False,
    )
    related_samples = pd.DataFrame(
        {'related_individual': related_samples_to_remove.node.collect()}
    )
    filename = output_path(
        'king_90k_related_samples_maximal_independent_set.csv', 'analysis'
    )
    related_samples.to_csv(filename, index=False)

def query():
    """Query script entry point."""
    hl.init(default_reference='GRCh38')
    scores = hl.read_table(SCORES)
    scores = scores.annotate(
        study=hl.if_else(scores.s.contains('TOB'), 'TOB-WGS', 'HGDP-1kG')
    )
    sample_names = scores.s.collect()
    labels = scores.study.collect()
    study = list(set(labels))
    tooltips = [('labels', '@label'), ('samples', '@samples')]

    # get percent variance explained
    eigenvalues = hl.import_table(EIGENVALUES)
    eigenvalues = eigenvalues.to_pandas()
    eigenvalues.columns = ['eigenvalue']
    eigenvalues = pd.to_numeric(eigenvalues.eigenvalue)
    variance = eigenvalues.divide(float(eigenvalues.sum())) * 100
    variance = variance.round(2)

    # Get number of PCs
    number_of_pcs = len(eigenvalues)

    # plot by study
    for i in range(0, (number_of_pcs - 1)):
        pc1 = i
        pc2 = i + 1
        plot = figure(
            title='Study',
            x_axis_label=f'PC{pc1 + 1} ({variance[pc1]}%)',
            y_axis_label=f'PC{pc2 + 1} ({variance[pc2]}%)',
            tooltips=tooltips,
        )
        source = ColumnDataSource(
            dict(
                x=scores.scores[pc1].collect(),
                y=scores.scores[pc2].collect(),
                label=labels,
                samples=sample_names,
            )
        )
        plot.circle(
            'x',
            'y',
            alpha=0.5,
            source=source,
            size=4,
            color=factor_cmap('label', ['#1b9e77', '#d95f02'], study),
            legend_group='label',
        )
        plot.add_layout(plot.legend[0], 'left')
        plot_filename = output_path(f'study_pc{pc2}.png', 'web')
        with hl.hadoop_open(plot_filename, 'wb') as f:
            get_screenshot_as_png(plot).save(f, format='PNG')
        html = file_html(plot, CDN, 'my plot')
        plot_filename_html = output_path(f'study_pc{pc2}.html', 'web')
        with hl.hadoop_open(plot_filename_html, 'w') as f:
            f.write(html)

    # plot by continental population
    hgdp1kg_tobwgs = hl.read_matrix_table(HGDP1KG_TOBWGS)
    scores = scores.annotate(
        continental_pop=hgdp1kg_tobwgs.cols()[
            scores.s
        ].hgdp_1kg_metadata.population_inference.pop
    )
    labels = scores.continental_pop.collect()
    # Change TOB-WGS missing values to 'TOB-NFE'
    labels = ['TOB-NFE' if x is None else x for x in labels]
    continental_population = list(set(labels))
    tooltips = [('labels', '@label'), ('samples', '@samples')]

    for i in range(0, (number_of_pcs - 1)):
        pc1 = i
        pc2 = i + 1
        plot = figure(
            title='Continental Population',
            x_axis_label=f'PC{pc1 + 1} ({variance[pc1]}%)',
            y_axis_label=f'PC{pc2 + 1} ({variance[pc2]}%)',
            tooltips=tooltips,
        )
        source = ColumnDataSource(
            dict(
                x=scores.scores[pc1].collect(),
                y=scores.scores[pc2].collect(),
                label=labels,
                samples=sample_names,
            )
        )
        plot.circle(
            'x',
            'y',
            alpha=0.5,
            source=source,
            size=4,
            color=factor_cmap(
                'label', turbo(len(continental_population)), continental_population
            ),
            legend_group='label',
        )
        plot.add_layout(plot.legend[0], 'left')
        plot_filename = output_path(f'continental_pop_pc{pc2}.png', 'web')
        with hl.hadoop_open(plot_filename, 'wb') as f:
            get_screenshot_as_png(plot).save(f, format='PNG')
        html = file_html(plot, CDN, 'my plot')
        plot_filename_html = output_path(f'continental_pop_pc{pc2}.html', 'web')
        with hl.hadoop_open(plot_filename_html, 'w') as f:
            f.write(html)

    # plot by subpopulation
    scores = scores.annotate(
        subpop=hgdp1kg_tobwgs.cols()[scores.s].hgdp_1kg_metadata.labeled_subpop
    )
    labels = scores.subpop.collect()
    labels = ['TOB-NFE' if x is None else x for x in labels]
    sub_population = list(set(labels))
    tooltips = [('labels', '@label'), ('samples', '@samples')]

    for i in range(0, (number_of_pcs - 1)):
        pc1 = i
        pc2 = i + 1
        plot = figure(
            title='Subpopulation',
            x_axis_label=f'PC{pc1 + 1} ({variance[pc1]}%)',
            y_axis_label=f'PC{pc2 + 1} ({variance[pc2]}%)',
            tooltips=tooltips,
        )
        source = ColumnDataSource(
            dict(
                x=scores.scores[pc1].collect(),
                y=scores.scores[pc2].collect(),
                label=labels,
                samples=sample_names,
            )
        )
        plot.circle(
            'x',
            'y',
            alpha=0.5,
            source=source,
            size=4,
            color=factor_cmap('label', turbo(len(sub_population)), sub_population),
            legend_group='label',
        )
        plot.add_layout(plot.legend[0], 'left')
        plot_filename = output_path(f'subpop_pc{pc2}.png', 'web')
        with hl.hadoop_open(plot_filename, 'wb') as f:
            get_screenshot_as_png(plot).save(f, format='PNG')
        html = file_html(plot, CDN, 'my plot')
        plot_filename_html = output_path(f'subpop_pc{pc2}.html', 'web')
        with hl.hadoop_open(plot_filename_html, 'w') as f:
            f.write(html)

    # Plot loadings
    loadings_ht = hl.read_table(LOADINGS)
    for i in range(0, number_of_pcs):
        pc = i + 1
        plot = manhattan_loadings(
            pvals=hl.abs(loadings_ht.loadings[i]),
            locus=loadings_ht.locus,
            title='Loadings of PC ' + str(pc),
            collect_all=True,
        )
        plot_filename = output_path(f'loadings_pc{pc}.png', 'web')
        with hl.hadoop_open(plot_filename, 'wb') as f:
            get_screenshot_as_png(plot).save(f, format='PNG')
        html = file_html(plot, CDN, 'my plot')
        plot_filename_html = output_path(f'loadings_pc{pc}.html', 'web')
        with hl.hadoop_open(plot_filename_html, 'w') as f:
            f.write(html)

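# `manhattan_loadings` is a project-specific plotting helper not included in
# these excerpts. The call above mirrors hl.plot.manhattan's signature but
# plots absolute loadings rather than -log10 p-values; a later script calls a
# variant with (iteration, gtf, loadings) that presumably also annotates
# genes. A minimal sketch covering only the signature used here, without
# per-chromosome colouring (an assumption, not the original helper):
def manhattan_loadings(pvals, locus, title=None, size=4, collect_all=True):
    """Manhattan-style scatter of per-variant values against genome position."""
    x = locus.global_position().collect()  # cumulative position across contigs
    y = pvals.collect()
    p = figure(title=title, x_axis_label='Global position', y_axis_label='Loading')
    p.circle(x, y, size=size, alpha=0.5)
    return p
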
def query():
    """Query script entry point."""
    hl.init(default_reference='GRCh38')
    scores = hl.read_table(SCORES)
    tob_wgs = hl.read_matrix_table(TOB_WGS)
    snp_chip_names = scores.s.collect()
    wgs_names = tob_wgs.s.collect()

    def sample_type(sample_name):
        return 'dual_sample' if sample_name in wgs_names else 'snp_chip_only'

    labels = list(map(sample_type, snp_chip_names))

    # get percent variance explained
    eigenvalues = hl.import_table(EIGENVALUES)
    eigenvalues = eigenvalues.to_pandas()
    eigenvalues.columns = ['eigenvalue']
    eigenvalues = pd.to_numeric(eigenvalues.eigenvalue)
    variance = eigenvalues.divide(float(eigenvalues.sum())) * 100
    variance = variance.round(2)

    # Get number of PCs
    number_of_pcs = len(eigenvalues)

    # plot
    cohort_sample_codes = list(set(labels))
    tooltips = [('labels', '@label'), ('samples', '@samples')]
    for i in range(0, (number_of_pcs - 1)):
        pc1 = i
        pc2 = i + 1
        plot = figure(
            title='SNP Chip Samples',
            x_axis_label=f'PC{pc1 + 1} ({variance[pc1]}%)',
            y_axis_label=f'PC{pc2 + 1} ({variance[pc2]}%)',
            tooltips=tooltips,
        )
        source = ColumnDataSource(
            dict(
                x=scores.scores[pc1].collect(),
                y=scores.scores[pc2].collect(),
                label=labels,
                samples=snp_chip_names,
            )
        )
        plot.circle(
            'x',
            'y',
            alpha=0.5,
            source=source,
            size=8,
            color=factor_cmap('label', ['#1b9e77', '#d95f02'], cohort_sample_codes),
            legend_group='label',
        )
        plot.add_layout(plot.legend[0], 'left')
        plot_filename = output_path(f'pc{pc2}.png', 'web')
        with hl.hadoop_open(plot_filename, 'wb') as f:
            get_screenshot_as_png(plot).save(f, format='PNG')
        html = file_html(plot, CDN, 'my plot')
        plot_filename_html = output_path(f'pc{pc2}.html', 'web')
        with hl.hadoop_open(plot_filename_html, 'w') as f:
            f.write(html)

def query():  # pylint: disable=too-many-locals
    """Query script entry point."""
    hl.init(default_reference='GRCh38')
    loadings_ht = hl.read_table(LOADINGS)
    gtf_ht = hl.experimental.import_gtf(
        GTF_FILE,
        reference_genome='GRCh38',
        skip_invalid_contigs=True,
        min_partitions=12,
    )
    number_of_pcs = hl.len(loadings_ht.loadings).take(1)[0] - 1
    for i in range(0, number_of_pcs):
        pc = i + 1
        plot_filename = output_path(f'loadings_manhattan_plot_pc{pc}.png', 'web')
        if not hl.hadoop_exists(plot_filename):
            p = manhattan_loadings(
                iteration=i,
                gtf=gtf_ht,
                loadings=loadings_ht,
                title=f'Loadings of PC{pc}',
                collect_all=True,
            )
            with hl.hadoop_open(plot_filename, 'wb') as f:
                get_screenshot_as_png(p).save(f, format='PNG')
            html = file_html(p, CDN, 'my plot')
            plot_filename_html = output_path(f'loadings_pc{pc}.html', 'web')
            with hl.hadoop_open(plot_filename_html, 'w') as f:
                f.write(html)

    # Get samples which are driving loadings
    mt = hl.read_matrix_table(HGDP1KG_TOBWGS)
    scores = hl.read_table(SCORES)
    mt = mt.semi_join_cols(scores)
    loadings_ht = loadings_ht.key_by('locus')
    mt = mt.annotate_rows(loadings=loadings_ht[mt.locus].loadings)

    for dim in range(0, number_of_pcs):
        max_value = mt.aggregate_rows(
            hl.agg.stats(hl.abs(mt.loadings[dim]))
        ).max
        significant_variants = mt.filter_rows(
            hl.abs(mt.loadings[dim]) == max_value
        )
        significant_variants = hl.sample_qc(significant_variants)
        significant_variant_list = significant_variants.locus.collect()
        print(f'PC{dim + 1}:', significant_variant_list)
        heterozygous_samples = significant_variants.filter_cols(
            significant_variants.sample_qc.n_het > 0
        ).s.collect()
        homozygous_alternate_samples = significant_variants.filter_cols(
            significant_variants.sample_qc.n_hom_var > 0
        ).s.collect()
        # pad the shorter list with 'null' so both columns have equal length
        if len(heterozygous_samples) > len(homozygous_alternate_samples):
            homozygous_alternate_samples.extend(
                'null'
                for _ in range(
                    len(heterozygous_samples) - len(homozygous_alternate_samples)
                )
            )
        elif len(heterozygous_samples) < len(homozygous_alternate_samples):
            heterozygous_samples.extend(
                'null'
                for _ in range(
                    len(homozygous_alternate_samples) - len(heterozygous_samples)
                )
            )

        # save as html
        html = pd.DataFrame(
            {
                'heterozygous_samples': heterozygous_samples,
                'homozygous_alternate_samples': homozygous_alternate_samples,
            }
        ).to_html()
        plot_filename_html = output_path(
            f'significant_variants_non_ref_samples{dim}.html', 'web'
        )
        with hl.hadoop_open(plot_filename_html, 'w') as f:
            f.write(html)

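# A tidier equivalent of the manual 'null' padding above, using
# itertools.zip_longest (the sample IDs here are hypothetical):
from itertools import zip_longest

het = ['TOB0001', 'TOB0002', 'TOB0003']  # illustrative only
hom = ['TOB0004']
df = pd.DataFrame(
    list(zip_longest(het, hom, fillvalue='null')),
    columns=['heterozygous_samples', 'homozygous_alternate_samples'],
)
# df has three rows; the second column is padded with 'null'.
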
def query():
    """Query script entry point."""
    hl.init(default_reference='GRCh38')
    mt = hl.read_matrix_table(HGDP1KG_TOBWGS)
    # Get NFE samples only
    mt = mt.filter_cols(
        (mt.hgdp_1kg_metadata.population_inference.pop == 'nfe')
        | (mt.s.contains('TOB'))
    )
    scores = hl.read_table(SCORES)
    mt = mt.annotate_cols(scores=scores[mt.s].scores)
    mt = mt.annotate_cols(TOB_WGS=mt.s.contains('TOB'))

    # Fields used in the PCA plot must all come from the same object
    columns = mt.cols()
    pca_scores = columns.scores
    labels = columns.TOB_WGS
    hover_fields = {'s': columns.s}

    # get percent variance explained
    eigenvalues = hl.import_table(EIGENVALUES)
    eigenvalues = eigenvalues.to_pandas()
    eigenvalues.columns = ['eigenvalue']
    eigenvalues = pd.to_numeric(eigenvalues.eigenvalue)
    variance = eigenvalues.divide(float(eigenvalues.sum())) * 100
    variance = variance.round(2)

    # Get number of PCs
    number_of_pcs = len(eigenvalues)

    print('Making PCA plots labelled by study')
    for i in range(0, (number_of_pcs - 1)):
        pc1 = i
        pc2 = i + 1
        print(f'PC{pc1 + 1} vs PC{pc2 + 1}')
        p = hl.plot.scatter(
            pca_scores[pc1],
            pca_scores[pc2],
            label=labels,
            title='TOB-WGS',
            xlabel=f'PC{pc1 + 1} ({variance[pc1]}%)',
            ylabel=f'PC{pc2 + 1} ({variance[pc2]}%)',
            collect_all=True,
            hover_fields=hover_fields,
        )
        plot_filename = output_path(f'study_pc{pc2}.png', 'web')
        with hl.hadoop_open(plot_filename, 'wb') as f:
            get_screenshot_as_png(p).save(f, format='PNG')
        html = file_html(p, CDN, 'my plot')
        plot_filename_html = output_path(f'study_pc{pc2}.html', 'web')
        with hl.hadoop_open(plot_filename_html, 'w') as f:
            f.write(html)

    print('Making PCA plots labelled by the subpopulation')
    labels = columns.hgdp_1kg_metadata.labeled_subpop
    pops = list(set(labels.collect()))

    for i in range(0, (number_of_pcs - 1)):
        pc1 = i
        pc2 = i + 1
        print(f'PC{pc1 + 1} vs PC{pc2 + 1}')
        p = hl.plot.scatter(
            pca_scores[pc1],
            pca_scores[pc2],
            label=labels,
            title='Subpopulation',
            xlabel=f'PC{pc1 + 1} ({variance[pc1]}%)',
            ylabel=f'PC{pc2 + 1} ({variance[pc2]}%)',
            collect_all=True,
            colors=CategoricalColorMapper(palette=turbo(len(pops)), factors=pops),
        )
        plot_filename = output_path(f'subpopulation_pc{pc2}.png', 'web')
        with hl.hadoop_open(plot_filename, 'wb') as f:
            get_screenshot_as_png(p).save(f, format='PNG')
        html = file_html(p, CDN, 'my plot')
        plot_filename_html = output_path(f'subpopulation_pc{pc2}.html', 'web')
        with hl.hadoop_open(plot_filename_html, 'w') as f:
            f.write(html)

def query():  # pylint: disable=too-many-locals
    """Query script entry point."""
    hl.init(default_reference='GRCh38')
    scores = hl.read_table(SCORES)
    scores = scores.annotate(
        cohort_sample_codes=hl.if_else(
            scores.s.contains('snp_chip'), 'snp_chip', 'tob_wgs'
        )
    )
    labels = scores.cohort_sample_codes
    hover_fields = {'s': scores.s}

    # get percent variance explained
    eigenvalues = hl.import_table(EIGENVALUES)
    eigenvalues = eigenvalues.to_pandas()
    eigenvalues.columns = ['eigenvalue']
    eigenvalues = pd.to_numeric(eigenvalues.eigenvalue)
    variance = eigenvalues.divide(float(eigenvalues.sum())) * 100
    variance = variance.round(2)

    # Get number of PCs
    number_of_pcs = len(eigenvalues)

    for i in range(0, (number_of_pcs - 1)):
        pc1 = i
        pc2 = i + 1
        print(f'PC{pc1 + 1} vs PC{pc2 + 1}')
        p = hl.plot.scatter(
            scores.scores[pc1],
            scores.scores[pc2],
            label=labels,
            title='TOB-WGS + TOB SNP Chip',
            xlabel=f'PC{pc1 + 1} ({variance[pc1]}%)',
            ylabel=f'PC{pc2 + 1} ({variance[pc2]}%)',
            hover_fields=hover_fields,
        )
        plot_filename = output_path(f'pc{pc2}.png', 'web')
        with hl.hadoop_open(plot_filename, 'wb') as f:
            get_screenshot_as_png(p).save(f, format='PNG')
        html = file_html(p, CDN, 'my plot')
        plot_filename_html = output_path(f'pc{pc2}.html', 'web')
        with hl.hadoop_open(plot_filename_html, 'w') as f:
            f.write(html)

    # Get partner sample information
    sample_names = scores.s.collect()

    def sample_type(sample_name):
        if sample_name.endswith('snp_chip'):
            partner_name = re.sub('_snp_chip', '', sample_name)
            tech = 'snp'
        else:
            partner_name = sample_name + '_snp_chip'
            tech = 'wgs'
        if partner_name in sample_names:
            prefix = 'dual_'
        else:
            prefix = ''
        return prefix + tech

    # save as html
    labels = list(map(sample_type, sample_names))
    html = pd.DataFrame(
        {'sample_name': sample_names, 'sample_tech': labels}
    ).to_html()
    plot_filename_html = output_path('sample_technology.html', 'web')
    with hl.hadoop_open(plot_filename_html, 'w') as f:
        f.write(html)

    # plot
    cohort_sample_codes = list(set(labels))
    tooltips = [('labels', '@label'), ('samples', '@samples')]
    for i in range(0, (number_of_pcs - 1)):
        pc1 = i
        pc2 = i + 1
        plot = figure(
            title='Reprocessed Sample Projection',
            x_axis_label=f'PC{pc1 + 1} ({variance[pc1]}%)',
            y_axis_label=f'PC{pc2 + 1} ({variance[pc2]}%)',
            tooltips=tooltips,
        )
        source = ColumnDataSource(
            dict(
                x=scores.scores[pc1].collect(),
                y=scores.scores[pc2].collect(),
                label=labels,
                samples=sample_names,
            )
        )
        plot.circle(
            'x',
            'y',
            alpha=0.5,
            source=source,
            size=8,
            color=factor_cmap(
                'label', Dark2[len(cohort_sample_codes)], cohort_sample_codes
            ),
            legend_group='label',
        )
        plot.add_layout(plot.legend[0], 'left')
        plot_filename = output_path(f'technology_type_pc{pc2}.png', 'web')
        with hl.hadoop_open(plot_filename, 'wb') as f:
            get_screenshot_as_png(plot).save(f, format='PNG')
        html = file_html(plot, CDN, 'my plot')
        plot_filename_html = output_path(f'technology_type_pc{pc2}.html', 'web')
        with hl.hadoop_open(plot_filename_html, 'w') as f:
            f.write(html)

def query():  # pylint: disable=too-many-locals
    """Query script entry point."""
    hl.init(default_reference='GRCh38')
    snp_chip = hl.read_matrix_table(SNP_CHIP)
    tob_wgs = hl.read_matrix_table(TOB_WGS)
    tob_wgs = hl.experimental.densify(tob_wgs)
    tob_wgs = tob_wgs.annotate_entries(GT=lgt_to_gt(tob_wgs.LGT, tob_wgs.LA))
    snp_chip = snp_chip.semi_join_rows(tob_wgs.rows())
    snp_chip_path = output_path('snp_chip_filtered_by_tob_wgs.mt', 'tmp')
    snp_chip = snp_chip.checkpoint(snp_chip_path)

    # Perform PCA on the SNP-chip samples
    eigenvalues, scores, loadings = hl.hwe_normalized_pca(
        snp_chip.GT, compute_loadings=True, k=5
    )
    scores_path = output_path('scores.ht', 'tmp')
    loadings_path = output_path('loadings.ht', 'tmp')
    scores = scores.checkpoint(scores_path)
    loadings = loadings.checkpoint(loadings_path)

    # make tob_wgs rows equivalent to the snp_chip rows
    tob_wgs = tob_wgs.semi_join_rows(snp_chip.rows())
    tob_wgs_path = output_path('tob_wgs_filtered_by_snp_chip.mt', 'tmp')
    tob_wgs = tob_wgs.checkpoint(tob_wgs_path)
    snp_chip = snp_chip.annotate_rows(
        af=hl.agg.mean(snp_chip.GT.n_alt_alleles()) / 2
    )
    loadings = loadings.annotate(af=snp_chip.rows()[loadings.key].af)

    # project WGS samples onto the SNP-chip PCA
    ht = pc_project(tob_wgs.GT, loadings.loadings, loadings.af)
    ht_path = output_path('pc_project_tob_wgs.ht', 'tmp')
    ht = ht.checkpoint(ht_path)
    scores = scores.key_by(s=scores.s + '_snp_chip')
    union_scores = ht.union(scores)

    variance = [(x / sum(eigenvalues) * 100) for x in eigenvalues]
    variance = [round(x, 2) for x in variance]

    # Get partner sample information
    sample_names = union_scores.s.collect()

    def sample_type(sample_name):
        if sample_name.endswith('snp_chip'):
            partner_name = re.sub('_snp_chip', '', sample_name)
            tech = 'snp'
        else:
            partner_name = sample_name + '_snp_chip'
            tech = 'wgs'
        if partner_name in sample_names:
            prefix = 'dual_'
        else:
            prefix = ''
        return prefix + tech

    # plot
    labels = list(map(sample_type, sample_names))
    cohort_sample_codes = list(set(labels))
    tooltips = [('labels', '@label'), ('samples', '@samples')]

    # Get number of PCs
    number_of_pcs = len(eigenvalues)
    for i in range(0, (number_of_pcs - 1)):
        pc1 = i
        pc2 = i + 1
        plot = figure(
            title='TOB-WGS + TOB SNP Chip',
            x_axis_label=f'PC{pc1 + 1} ({variance[pc1]}%)',
            y_axis_label=f'PC{pc2 + 1} ({variance[pc2]}%)',
            tooltips=tooltips,
        )
        source = ColumnDataSource(
            dict(
                x=union_scores.scores[pc1].collect(),
                y=union_scores.scores[pc2].collect(),
                label=labels,
                samples=sample_names,
            )
        )
        source_filename = output_path(f'source_{pc2}.ht', 'tmp')
        hl.Table.from_pandas(pd.DataFrame(source.data)).export(source_filename)
        plot.circle(
            'x',
            'y',
            alpha=0.5,
            source=source,
            size=8,
            color=factor_cmap(
                'label', Dark2[len(cohort_sample_codes)], cohort_sample_codes
            ),
            legend_group='label',
        )
        plot.add_layout(plot.legend[0], 'left')
        plot_filename = output_path(f'pc{pc2}.png', 'web')
        with hl.hadoop_open(plot_filename, 'wb') as f:
            get_screenshot_as_png(plot).save(f, format='PNG')
        html = file_html(plot, CDN, 'my plot')
        plot_filename_html = output_path(f'pc{pc2}.html', 'web')
        with hl.hadoop_open(plot_filename_html, 'w') as f:
            f.write(html)

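# `pc_project` (imported elsewhere in the original script) projects new
# samples onto precomputed PCA loadings; Hail ships an equivalent as
# hl.experimental.pc_project(call_expr, loadings_expr, af_expr). A simplified
# sketch of the standard computation, taking the loadings table directly
# rather than expressions (an assumption; the real helper's normalisation may
# differ in detail):
def pc_project_sketch(mt, loadings_ht):
    """HWE-normalised projection of mt's genotypes onto loadings_ht.loadings.

    Assumes loadings_ht is keyed like mt.rows() and carries row fields
    'loadings' (array<float64>) and 'af' (float64).
    """
    mt = mt.annotate_rows(
        pca_loadings=loadings_ht[mt.row_key].loadings,
        pca_af=loadings_ht[mt.row_key].af,
    )
    mt = mt.filter_rows(
        hl.is_defined(mt.pca_loadings)
        & hl.is_defined(mt.pca_af)
        & (mt.pca_af > 0)
        & (mt.pca_af < 1)
    )
    n_variants = mt.count_rows()
    # Normalise each genotype by its expected mean and HWE standard deviation
    gt_norm = (mt.GT.n_alt_alleles() - 2 * mt.pca_af) / hl.sqrt(
        n_variants * 2 * mt.pca_af * (1 - mt.pca_af)
    )
    mt = mt.annotate_cols(scores=hl.agg.array_sum(mt.pca_loadings * gt_norm))
    return mt.cols().select('scores')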