""" Estimate kinship coefficient using KING on NFE samples from the HGDP/1KG dataset. """ import hail as hl import pandas as pd from analysis_runner import bucket_path, output_path HGDP1KG_TOBWGS = bucket_path( '1kg_hgdp_densified_pca_new_variants/v0/hgdp1kg_tobwgs_joined_all_samples.mt' ) def query(): """Query script entry point.""" hl.init(default_reference='GRCh38') mt = hl.read_matrix_table(HGDP1KG_TOBWGS) mt = mt.filter_cols( (mt.hgdp_1kg_metadata.population_inference.pop == 'nfe') | (mt.s.contains('TOB')) ) # Remove related samples (at the 2nd degree or closer) king = hl.king(mt.GT) king_path = output_path('king_kinship_estimate_NFE.ht') king.write(king_path) ht = king.entries() related_samples = ht.filter((ht.s_1 != ht.s) & (ht.phi > 0.125), keep=True) struct = hl.struct(i=related_samples.s_1, j=related_samples.s) struct = struct.annotate(phi=related_samples.phi)
"""Create PCA plots for the combined TOB-WGS/SNP-chip data""" import re from bokeh.io.export import get_screenshot_as_png from bokeh.resources import CDN from bokeh.embed import file_html from bokeh.transform import factor_cmap from bokeh.plotting import ColumnDataSource, figure from bokeh.palettes import Dark2 # pylint: disable=no-name-in-module import pandas as pd import hail as hl import click from analysis_runner import bucket_path, output_path SCORES = bucket_path('tob_wgs_snp_chip_variant_pca/v6/scores.ht/') EIGENVALUES = bucket_path('tob_wgs_snp_chip_variant_pca/v6/eigenvalues.ht') @click.command() def query(): # pylint: disable=too-many-locals """Query script entry point.""" hl.init(default_reference='GRCh38') scores = hl.read_table(SCORES) scores = scores.annotate(cohort_sample_codes=hl.if_else( scores.s.contains('snp_chip'), 'snp_chip', 'tob_wgs')) labels = scores.cohort_sample_codes hover_fields = dict([('s', scores.s)]) # get percent variance explained
"""QC of newly-selected variants""" import click import hail as hl import numpy as np import pandas as pd from analysis_runner import bucket_path, output_path from bokeh.plotting import figure from bokeh.io.export import get_screenshot_as_png from bokeh.resources import CDN from bokeh.embed import file_html FILTERED_VARIANTS = bucket_path( 'tob_wgs_hgdp_1kg_variant_selection/v8/tob_wgs_hgdp_1kg_filtered_variants.mt' ) @click.command() def query(): # pylint: disable=too-many-locals """Query script entry point.""" hl.init(default_reference='GRCh38') mt = hl.read_matrix_table(FILTERED_VARIANTS) nrows = mt.count_rows() print(f'mt.count_rows() = {nrows}') # Plot the allele frequency fig = figure( title='Variant AF', x_axis_label='Allele Frequency',
"""Export the TOB-WGS joint callset in PLINK format."""

import hail as hl
from analysis_runner import bucket_path, output_path

TOB_WGS = bucket_path('mt/v5.1.mt/')


def query():
    """Query script entry point.

    Reads the matrix table at ``TOB_WGS``, densifies it, splits
    multi-allelic sites with ``hl.split_multi_hts`` and exports the result
    with ``hl.export_plink`` under the ``tob_wgs_plink`` output path, using
    the column key ``s`` as the PLINK individual ID.
    """
    hl.init(default_reference='GRCh38')

    # Densify the stored matrix table before splitting multi-allelic sites.
    dense_mt = hl.experimental.densify(hl.read_matrix_table(TOB_WGS))
    split_mt = hl.split_multi_hts(dense_mt)

    hl.export_plink(
        split_mt,
        output_path('tob_wgs_plink'),
        ind_id=split_mt.s,
    )


if __name__ == '__main__':
    query()
""" Generate PCA on SNP-chip data only. """ import click import hail as hl import pandas as pd from analysis_runner import bucket_path, output_path SNP_CHIP = bucket_path('snpchip/v1/snpchip_grch38.mt') @click.command() def query(): """Query script entry point.""" hl.init(default_reference='GRCh38') snp_chip = hl.read_matrix_table(SNP_CHIP) eigenvalues_path = output_path('eigenvalues.ht') scores_path = output_path('scores.ht') loadings_path = output_path('loadings.ht') # Perform PCA eigenvalues, scores, loadings = hl.hwe_normalized_pca( snp_chip.GT, compute_loadings=True, k=5) hl.Table.from_pandas(pd.DataFrame(eigenvalues)).export(eigenvalues_path) scores.write(scores_path, overwrite=True) loadings.write(loadings_path, overwrite=True) if __name__ == '__main__':
"""Plot PCA loadings for HGDP/1kG + TOB-WGS samples""" from bokeh.models import CategoricalColorMapper, HoverTool from bokeh.io.export import get_screenshot_as_png from bokeh.plotting import figure from bokeh.embed import file_html from bokeh.resources import CDN from analysis_runner import bucket_path, output_path import hail as hl import pandas as pd LOADINGS = bucket_path('tob_wgs_hgdp_1kg_nfe_pca_new_variants/v9/loadings.ht/') GTF_FILE = 'gs://hail-common/references/gencode/gencode.v29.annotation.gtf.bgz' SCORES = bucket_path('tob_wgs_hgdp_1kg_nfe_pca_new_variants/v9/scores.ht/') HGDP1KG_TOBWGS = bucket_path( '1kg_hgdp_densified_pca_new_variants/v0/hgdp1kg_tobwgs_joined_all_samples.mt' ) def manhattan_loadings( iteration, gtf, loadings, title=None, size=4, hover_fields=None, collect_all=False, n_divisions=500, ): """modify hail manhattan plot""" palette = [
"""Create PCA plots for the combined TOB-WGS/SNP-chip data""" from bokeh.io.export import get_screenshot_as_png from bokeh.resources import CDN from bokeh.embed import file_html from bokeh.transform import factor_cmap from bokeh.plotting import ColumnDataSource, figure import pandas as pd import hail as hl import click from analysis_runner import bucket_path, output_path SCORES = bucket_path('tob_snp_chip_pca/v0/scores.ht') EIGENVALUES = bucket_path('tob_snp_chip_pca/v0/eigenvalues.ht') TOB_WGS = bucket_path('mt/v3-raw.mt') @click.command() def query(): """Query script entry point.""" hl.init(default_reference='GRCh38') scores = hl.read_table(SCORES) tob_wgs = hl.read_matrix_table(TOB_WGS) snp_chip_names = scores.s.collect() wgs_names = tob_wgs.s.collect() def sample_type(sample_name): return 'dual_sample' if sample_name in wgs_names else 'snp_chip_only'
from the HGDP/1kG + TOB-WGS datasets, removing outliers. """ import hail as hl import pandas as pd from analysis_runner import bucket_path, output_path from bokeh.io.export import get_screenshot_as_png from bokeh.resources import CDN from bokeh.embed import file_html from bokeh.plotting import ColumnDataSource, figure from bokeh.transform import factor_cmap from bokeh.palettes import turbo # pylint: disable=no-name-in-module HGDP1KG_TOBWGS = bucket_path( '1kg_hgdp_densified_pca_new_variants/v0/hgdp1kg_tobwgs_joined_all_samples.mt' ) SCORES = bucket_path('tob_wgs_hgdp_1kg_nfe_pca_new_variants/v9/scores.ht/') EIGENVALUES = bucket_path( 'tob_wgs_hgdp_1kg_nfe_pca_new_variants/v9/eigenvalues.ht') def query(): """Query script entry point.""" hl.init(default_reference='GRCh38') mt = hl.read_matrix_table(HGDP1KG_TOBWGS) scores = hl.read_table(SCORES) # Filter outliers and related samples
"""Calculate LD using the ld_matrix function"""

import hail as hl
import pandas as pd
from analysis_runner import bucket_path, output_path

TOB_WGS = bucket_path('mt/v7.mt/')


def query():
    """Query script entry point.

    Reads the matrix table at ``TOB_WGS``, densifies it, keeps rows with
    exactly two alleles, truncates to the first 30000 rows and computes a
    windowed LD matrix on the alt-allele counts with ``hl.ld_matrix``
    (``radius=2e6`` around each locus). The matrix is converted to a pandas
    DataFrame and written as ``ld_matrix.csv`` in the ``analysis`` output
    category.
    """
    hl.init(default_reference='GRCh38')

    tob_wgs = hl.read_matrix_table(TOB_WGS)
    tob_wgs = hl.experimental.densify(tob_wgs)

    # Keep only rows with exactly two alleles (one ref, one alt).
    # NOTE(review): the original comment said "filter out constant variants",
    # but this filter does not look at genotype variance. If zero-variance
    # rows must be excluded before computing correlations, an explicit
    # variance filter is still needed -- confirm against the Hail docs.
    tob_wgs = tob_wgs.filter_rows(hl.len(tob_wgs.alleles) == 2)

    # Limit the number of rows so the matrix can be materialised locally.
    tob_wgs = tob_wgs.head(30000)

    ld_block_matrix = hl.ld_matrix(
        tob_wgs.GT.n_alt_alleles(), tob_wgs.locus, radius=2e6
    )
    ld_df = pd.DataFrame(ld_block_matrix.to_numpy())

    # save pandas df
    ld_filename = output_path('ld_matrix.csv', 'analysis')
    ld_df.to_csv(ld_filename, index=False)


if __name__ == '__main__':
    query()
"""QC of newly-selected variants""" import hail as hl import pandas as pd from analysis_runner import bucket_path, output_path from bokeh.io.export import get_screenshot_as_png from bokeh.resources import CDN from bokeh.embed import file_html from bokeh.transform import factor_cmap from bokeh.plotting import ColumnDataSource, figure from bokeh.palettes import turbo # pylint: disable=no-name-in-module from bokeh.models import CategoricalColorMapper, HoverTool HGDP1KG_TOBWGS = bucket_path( '1kg_hgdp_densified_pca_new_variants/v0/hgdp1kg_tobwgs_joined_all_samples.mt' ) SCORES = bucket_path('1kg_hgdp_densified_pca_new_variants/v0/scores.ht/') EIGENVALUES = bucket_path( '1kg_hgdp_densified_pca_new_variants/v0/eigenvalues.ht') LOADINGS = bucket_path('1kg_hgdp_densified_pca_new_variants/v0/loadings.ht/') def manhattan_loadings( pvals, locus=None, title=None, size=4, hover_fields=None, collect_all=False, n_divisions=500, ):
""" Perform PCA on densified TOB-WGS data. Reliant on output from ``` hgdp1kg_tobwgs_densified_pca_new_variants/ hgdp_1kg_tob_wgs_densified_pca_new_variants.py ```` """ import hail as hl import pandas as pd from hail.experimental import lgt_to_gt from analysis_runner import bucket_path, output_path TOB_WGS = bucket_path('1kg_hgdp_densify_new_variants/v0/tob_wgs_filtered.mt/') GNOMAD_HGDP_1KG_MT = ('gs://gcp-public-data--gnomad/release/3.1/mt/genomes/' 'gnomad.genomes.v3.1.hgdp_1kg_subset_dense.mt') def query(): """Query script entry point.""" hl.init(default_reference='GRCh38') tob_wgs = hl.read_matrix_table(TOB_WGS) hgdp_1kg = hl.read_matrix_table(GNOMAD_HGDP_1KG_MT) # keep loci that are contained in the densified, filtered tob-wgs mt hgdp_1kg = hgdp_1kg.semi_join_rows(tob_wgs.rows()) # Entries and columns must be identical tob_wgs_select = tob_wgs.select_entries(
""" Save scores of related individuals after running pc_relate. """ import hail as hl import pandas as pd from analysis_runner import bucket_path, output_path PC_RELATE_ESTIMATE_NFE = bucket_path( 'tob_wgs_hgdp_1kg_nfe_pc_relate/v0/pc_relate_kinship_estimate.ht') PC_RELATE_ESTIMATE_GLOBAL = bucket_path( 'tob_wgs_hgdp_1kg_pc_relate/v0/pc_relate_kinship_estimate.ht') KING_ESTIMATE_NFE = bucket_path('king/v0/king_kinship_estimate_NFE.ht') def query(): """Query script entry point.""" hl.init(default_reference='GRCh38') # save relatedness estimates for pc_relate global populations ht = hl.read_table(PC_RELATE_ESTIMATE_GLOBAL) related_samples = ht.filter(ht.kin > 0.1) pc_relate_global = pd.DataFrame({ 'i_s': related_samples.i.s.collect(), 'j_s': related_samples.j.s.collect(), 'kin': related_samples.kin.collect(), }) filename = output_path(f'pc_relate_global_matrix.csv', 'analysis') pc_relate_global.to_csv(filename, index=False)
""" import re import hail as hl import pandas as pd from analysis_runner import bucket_path, output_path from hail.experimental import pc_project from hail.experimental import lgt_to_gt from bokeh.plotting import ColumnDataSource, figure from bokeh.palettes import Dark2 # pylint: disable=no-name-in-module from bokeh.transform import factor_cmap from bokeh.resources import CDN from bokeh.embed import file_html from bokeh.io.export import get_screenshot_as_png SNP_CHIP = bucket_path( 'tob_wgs_snp_chip_pca/increase_partitions/v2/snp_chip_10000_partitions.mt') TOB_WGS = bucket_path('mt/v3-raw.mt') def query(): # pylint: disable=too-many-locals """Query script entry point.""" hl.init(default_reference='GRCh38') snp_chip = hl.read_matrix_table(SNP_CHIP) tob_wgs = hl.read_matrix_table(TOB_WGS) tob_wgs = hl.experimental.densify(tob_wgs) tob_wgs = tob_wgs.annotate_entries(GT=lgt_to_gt(tob_wgs.LGT, tob_wgs.LA)) snp_chip = snp_chip.semi_join_rows(tob_wgs.rows()) snp_chip_path = output_path('snp_chip_filtered_by_tob_wgs.mt', 'tmp') snp_chip = snp_chip.checkpoint(snp_chip_path)