def test_hwe_normalized_pca():
    mt = hl.balding_nichols_model(3, 100, 50)
    eigenvalues, scores, loadings = hl.hwe_normalized_pca(mt.GT, k=2, compute_loadings=True)

    assert len(eigenvalues) == 2
    assert isinstance(scores, hl.Table)
    assert scores.count() == 100
    assert isinstance(loadings, hl.Table)

    _, _, loadings = hl.hwe_normalized_pca(mt.GT, k=2, compute_loadings=False)
    assert loadings is None
def query():
    """Query script entry point."""
    hl.init(default_reference='GRCh38')
    tob_wgs = hl.read_matrix_table(TOB_WGS)
    hgdp_1kg = hl.read_matrix_table(GNOMAD_HGDP_1KG_MT)

    # keep loci that are contained in the densified, filtered tob-wgs mt
    hgdp_1kg = hgdp_1kg.semi_join_rows(tob_wgs.rows())

    # Entries and columns must be identical
    tob_wgs_select = tob_wgs.select_entries(
        GT=lgt_to_gt(tob_wgs.LGT, tob_wgs.LA)
    ).select_cols()
    hgdp_1kg_select = hgdp_1kg.select_entries(hgdp_1kg.GT).select_cols()

    # Join datasets
    hgdp1kg_tobwgs_joined = hgdp_1kg_select.union_cols(tob_wgs_select)

    # Add in metadata information
    hgdp_1kg_metadata = hgdp_1kg.cols()
    hgdp1kg_tobwgs_joined = hgdp1kg_tobwgs_joined.annotate_cols(
        hgdp_1kg_metadata=hgdp_1kg_metadata[hgdp1kg_tobwgs_joined.s])

    # save this for population-level PCAs
    mt_path = output_path('hgdp1kg_tobwgs_joined_all_samples.mt')
    if not hl.hadoop_exists(mt_path):
        hgdp1kg_tobwgs_joined.write(mt_path)

    # Perform PCA
    eigenvalues_path = output_path('eigenvalues.ht')
    scores_path = output_path('scores.ht')
    loadings_path = output_path('loadings.ht')
    eigenvalues, scores, loadings = hl.hwe_normalized_pca(
        hgdp1kg_tobwgs_joined.GT, compute_loadings=True, k=20)
    hl.Table.from_pandas(pd.DataFrame(eigenvalues)).export(eigenvalues_path)
    scores.write(scores_path, overwrite=True)
    loadings.write(loadings_path, overwrite=True)
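# Note: `lgt_to_gt` is used by the snippet above (and several below) but is not
# defined in any of them. A minimal sketch of such a helper, assuming a recent
# Hail version that provides hl.vds.lgt_to_gt: it maps the local allele indices
# in LGT through the LA array to produce a conventional global GT call. This is
# an illustrative assumption, not the original project's implementation.
import hail as hl


def lgt_to_gt(lgt: hl.expr.CallExpression, la: hl.expr.ArrayExpression) -> hl.expr.CallExpression:
    """Convert a local genotype call (LGT) plus local alleles (LA) to a global GT call."""
    return hl.vds.lgt_to_gt(lgt, la)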
def query(output, pop):  # pylint: disable=too-many-locals
    """Query script entry point."""
    hl.init(default_reference='GRCh38')
    mt = hl.read_matrix_table(HGDP1KG_TOBWGS)
    if pop:
        # Get samples from the specified population only
        mt = mt.filter_cols(
            (mt.hgdp_1kg_metadata.population_inference.pop == pop.lower())
            | (mt.s.contains('TOB'))
        )
    else:
        mt = mt.filter_cols(mt.s.contains('TOB'))

    # Perform PCA
    eigenvalues_path = f'{output}/eigenvalues.ht'
    scores_path = f'{output}/scores.ht'
    loadings_path = f'{output}/loadings.ht'
    eigenvalues, scores, loadings = hl.hwe_normalized_pca(
        mt.GT, compute_loadings=True, k=20
    )
    hl.Table.from_pandas(pd.DataFrame(eigenvalues)).export(eigenvalues_path)
    scores.write(scores_path, overwrite=True)
    loadings.write(loadings_path, overwrite=True)
def hwe_normalized_pca(
    qc_mt: hl.MatrixTable,
    related_samples_to_drop: Optional[hl.Table] = None,
    n_pcs: int = 10
) -> Tuple[List[float], hl.Table, hl.Table]:
    """
    First runs PCA excluding the given related samples, then projects these
    samples into the PC space to return scores for all samples.

    The `related_samples_to_drop` Table has to be keyed by the sample ID, and
    all samples present in this table will be excluded from the PCA.

    The loadings Table returned also contains a `pca_af` annotation, which is
    the allele frequency used for PCA. This is useful to project other samples
    into the PC space.

    :param qc_mt: Input QC MT
    :param related_samples_to_drop: Optional table of related samples to drop
    :param n_pcs: Number of PCs to compute
    :return: eigenvalues, scores and loadings
    """
    unrelated_mt = qc_mt
    if related_samples_to_drop:
        unrelated_mt = qc_mt.filter_cols(
            hl.is_missing(related_samples_to_drop[qc_mt.col_key]))

    pca_evals, pca_scores, pca_loadings = hl.hwe_normalized_pca(
        unrelated_mt.GT, k=n_pcs, compute_loadings=True)
    pca_af_ht = unrelated_mt.annotate_rows(
        pca_af=hl.agg.mean(unrelated_mt.GT.n_alt_alleles()) / 2).rows()
    pca_loadings = pca_loadings.annotate(
        pca_af=pca_af_ht[pca_loadings.key].pca_af)

    if not related_samples_to_drop:
        return pca_evals, pca_scores, pca_loadings
    else:
        related_mt = qc_mt.filter_cols(
            hl.is_defined(related_samples_to_drop[qc_mt.col_key]))
        related_scores = pc_project(related_mt, pca_loadings)
        pca_scores = pca_scores.union(related_scores)
        return pca_evals, pca_scores, pca_loadings
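# Note: `pc_project` above comes from the surrounding codebase and is not shown
# here. A minimal sketch of the same idea using Hail's experimental helper (an
# assumption for illustration, not the original implementation): each genotype
# is HWE-normalized with the stored `pca_af` and dotted with the variant
# loadings to produce per-sample scores.
import hail as hl


def project_onto_loadings(mt: hl.MatrixTable, loadings_ht: hl.Table) -> hl.Table:
    # loadings_ht is expected to carry `loadings` and `pca_af` row annotations,
    # as produced by the wrapper above.
    return hl.experimental.pc_project(mt.GT, loadings_ht.loadings, loadings_ht.pca_af)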
def compute_relatedness(
    data_type: str = "genomes",
    overwrite: bool = False,
) -> hl.Table:
    """
    Compute a relatedness table for the CCDG QC VDS using `hl.pc_relate`.

    :param data_type: Whether data is from genomes or exomes, default is genomes
    :param overwrite: Whether to overwrite the file
    :return: Table after running pc_relate
    :rtype: hl.Table
    """
    logger.info("Computing relatedness table on CCDG %s VDS", data_type)
    pca_var_ht = hl.read_table(get_pca_variants_path())
    mt = hl.vds.to_dense_mt(get_qc_vds(data_type))
    mt = mt.filter_rows(hl.is_defined(pca_var_ht[mt.row_key]))
    eig, scores, _ = hl.hwe_normalized_pca(mt.GT, k=10, compute_loadings=False)
    scores = scores.checkpoint(
        get_ccdg_results_path(data_type=data_type, result="pc_scores"),
        overwrite=overwrite,
        _read_if_exists=not overwrite,
    )
    relatedness_ht = hl.pc_relate(
        mt.GT,
        min_individual_maf=0.01,
        scores_expr=scores[mt.col_key].scores,
        block_size=4096,
        min_kinship=0.05,
        statistics="all",
    )
    return relatedness_ht.checkpoint(
        get_ccdg_results_path(data_type=data_type, result="relatedness"),
        overwrite=overwrite,
        _read_if_exists=(not overwrite),
    )
def query():  # pylint: disable=too-many-locals
    """Query script entry point."""
    hl.init(default_reference='GRCh38')
    tob_wgs = hl.read_matrix_table(TOB_WGS).key_rows_by('locus', 'alleles')
    snp_chip = hl.read_matrix_table(SNP_CHIP).key_rows_by('locus', 'alleles')

    # filter to loci that are contained in snp-chip data after densifying
    tob_wgs = hl.experimental.densify(tob_wgs)
    tob_wgs = tob_wgs.select_entries(
        GT=lgt_to_gt(tob_wgs.LGT, tob_wgs.LA)
    ).select_cols()
    snp_chip = snp_chip.select_entries(snp_chip.GT).select_cols()
    snp_chip = snp_chip.key_cols_by(s=snp_chip.s + '_snp_chip')
    tob_combined = tob_wgs.union_cols(snp_chip)
    tob_combined = tob_combined.cache()
    print(tob_combined.count_rows())

    # Perform PCA
    eigenvalues_path = output_path('eigenvalues.ht')
    scores_path = output_path('scores.ht')
    loadings_path = output_path('loadings.ht')
    eigenvalues, scores, loadings = hl.hwe_normalized_pca(
        tob_combined.GT, compute_loadings=True, k=20
    )
    hl.Table.from_pandas(pd.DataFrame(eigenvalues)).export(eigenvalues_path)
    scores.write(scores_path, overwrite=True)
    loadings.write(loadings_path, overwrite=True)
def run_gwas(vcf_file, phenotypes_file, output_file):
    table = hl.import_table(phenotypes_file, impute=True).key_by('Sample')
    hl.import_vcf(vcf_file).write('tmp.mt')
    mt = hl.read_matrix_table('tmp.mt')
    mt = mt.annotate_cols(pheno=table[mt.s])

    downsampled = mt.sample_rows(0.01, seed=11223344)
    eigenvalues, pcs, _ = hl.hwe_normalized_pca(downsampled.GT)

    mt = mt.annotate_cols(scores=pcs[mt.s].scores)
    gwas = hl.linear_regression_rows(
        y=mt.pheno.CaffeineConsumption,
        x=mt.GT.n_alt_alleles(),
        covariates=[1.0, mt.scores[0], mt.scores[1], mt.scores[2]])

    gwas = gwas.select(SNP=hl.variant_str(gwas.locus, gwas.alleles), P=gwas.p_value)
    gwas = gwas.key_by(gwas.SNP)
    gwas = gwas.select(gwas.P)
    gwas.export(f'{output_file}.assoc', header=True)

    hl.export_plink(mt, output_file, fam_id=mt.s, ind_id=mt.s)
def test_pcrelate_paths():
    mt = hl.balding_nichols_model(3, 50, 100)
    _, scores3, _ = hl.hwe_normalized_pca(mt.GT, k=3, compute_loadings=False)

    kin1 = hl.pc_relate(mt.GT, 0.10, k=2, statistics='kin', block_size=64)
    kin2 = hl.pc_relate(mt.GT, 0.05, k=2, min_kinship=0.01, statistics='kin2', block_size=128).cache()
    kin3 = hl.pc_relate(mt.GT, 0.02, k=3, min_kinship=0.1, statistics='kin20', block_size=64).cache()
    kin_s1 = hl.pc_relate(mt.GT, 0.10, scores_expr=scores3[mt.col_key].scores[:2],
                          statistics='kin', block_size=32)

    assert kin1._same(kin_s1, tolerance=1e-4)

    assert kin1.count() == 50 * 49 / 2

    assert kin2.count() > 0
    assert kin2.filter(kin2.kin < 0.01).count() == 0

    assert kin3.count() > 0
    assert kin3.filter(kin3.kin < 0.1).count() == 0
def query(output):
    """Query script entry point."""
    hl.init(default_reference='GRCh38')
    mt = hl.read_matrix_table(GNOMAD_HGDP_1KG_MT)
    eigenvalues_path = f'{output}/eigenvalues_10k.csv'
    scores_path = f'{output}/scores_10k.ht'
    loadings_path = f'{output}/loadings_10k.ht'
    downsampled_mt_path = f'{output}/downsampled_mt.mt'

    # filter out variants with a call rate <0.99 and variants where there
    # is no non-reference allele called.
    mt_qc = hl.variant_qc(mt)
    filt_mt = mt_qc.filter_rows(
        (mt_qc.variant_qc.call_rate >= 0.99) & (mt_qc.variant_qc.n_non_ref >= 1))
    nrows = filt_mt.count_rows()

    # Downsample the dataset to approximately 10k randomly-selected rows
    # (the input must be a proportion)
    downsampled_mt = filt_mt.sample_rows(10000 / nrows, seed=12345)
    eigenvalues, scores, loadings = hl.hwe_normalized_pca(
        downsampled_mt.GT, compute_loadings=True, k=20)

    # save the list of eigenvalues
    eigenvalues_df = pd.DataFrame(eigenvalues)
    eigenvalues_df.to_csv(eigenvalues_path, index=False)
    # save the scores, loadings, and downsampled matrix table
    scores.write(scores_path, overwrite=True)
    loadings.write(loadings_path, overwrite=True)
    downsampled_mt.write(downsampled_mt_path, overwrite=True)
def test_pc_project(self):
    mt = hl.balding_nichols_model(3, 100, 50)
    _, _, loadings_ht = hl.hwe_normalized_pca(mt.GT, k=10, compute_loadings=True)
    mt = mt.annotate_rows(af=hl.agg.mean(mt.GT.n_alt_alleles()) / 2)
    loadings_ht = loadings_ht.annotate(af=mt.rows()[loadings_ht.key].af)
    mt_to_project = hl.balding_nichols_model(3, 100, 50)
    ht = hl.experimental.pc_project(
        mt_to_project.GT, loadings_ht.loadings, loadings_ht.af)
    assert ht._force_count() == 100
def joint_pca(
        ref_dirname: str = 'gs://hgdp-1kg/hgdp_tgp/datasets_for_others/lindo/ds_without_outliers/',
        ref_basename: str = 'unrelated',
        in_mt: hl.MatrixTable = None,
        data_basename: str = None,
        npcs: int = 20,
        out_dir: str = None):
    """
    Merges input dataset with ref by [locus, alleles] and runs PCA on merged dataset

    :param ref_dirname: directory name where reference data is
    :param ref_basename: base filename for reference data
    :param in_mt: input data MatrixTable
    :param data_basename: base filename for input data
    :param npcs: number of principal components to be used in PCA
    :param out_dir: output directory where files are going to be saved to
    :return:
    """
    print('\nReading reference data mt')
    ref_mt = hl.read_matrix_table(f'{ref_dirname}{ref_basename}.mt')

    # We need to unkey the datasets and take only cols common between the two
    # in order to be able to merge in Hail
    ref_mt = ref_mt.key_cols_by().key_rows_by()
    ref_downsampled = ref_mt.select_cols('s').select_rows(
        'locus', 'alleles').select_entries('GT')
    ref_downsampled = ref_downsampled.key_cols_by('s').key_rows_by(
        'locus', 'alleles')

    data_mt = in_mt.key_cols_by().key_rows_by()
    data_downsampled = data_mt.select_cols('s').select_rows(
        'locus', 'alleles').select_entries('GT')
    data_downsampled = data_downsampled.key_cols_by('s').key_rows_by(
        'locus', 'alleles')

    print('\nJoining Data with Ref by locus & alleles')
    joined = ref_downsampled.union_cols(data_downsampled)

    pca_snps = joined.count_rows()
    if pca_snps > 1000000:
        import warnings
        warnings.warn(
            f'Too many SNPs to be used in PCA: {pca_snps}. This will make PCA run longer'
        )

    print(f'\nRunning PCA with {npcs} principal components')
    pca_evals, pca_scores, _ = hl.hwe_normalized_pca(joined.GT, k=npcs)
    pca_scores = pca_scores.transmute(
        **{f'PC{i}': pca_scores.scores[i - 1] for i in range(1, npcs + 1)})

    print(f'\nExporting PCA scores to {out_dir}')
    pca_scores.export(
        f'{out_dir}GWASpy/PCA/{data_basename}/pca_joint/{data_basename}.1kg_hgdp.joint.pca.scores.txt.bgz'
    )
def query():
    """Query script entry point."""
    hl.init(default_reference='GRCh38')
    mt = hl.read_matrix_table(HGDP1KG_TOBWGS)

    # Get samples from the specified population only
    mt = mt.filter_cols(
        (mt.hgdp_1kg_metadata.population_inference.pop == 'nfe')
        | (mt.s.contains('TOB')))

    # remove outlier samples, as identified by PCA
    outliers = [
        'TOB1734', 'TOB1714', 'TOB1126', 'TOB1653', 'TOB1668', 'TOB1681',
        'TOB1116', 'TOB1107', 'TOB1635', 'HG01628', 'TOB1675', 'TOB1125',
        'TOB1762', 'TOB1263', 'TOB1640', 'HG01669', 'TOB1795', 'TOB1707',
        'HG01695', 'HG01694', 'TOB1673', 'HG01630',
    ]
    mt = mt.filter_cols(hl.literal(outliers).contains(mt.s), keep=False)

    # Remove related samples at the 2nd degree or closer, as indicated by gnomAD
    mt = mt.filter_cols(mt.hgdp_1kg_metadata.gnomad_release | mt.s.startswith('TOB'))

    # Perform PCA
    eigenvalues_path = output_path('eigenvalues.ht')
    scores_path = output_path('scores.ht')
    loadings_path = output_path('loadings.ht')
    eigenvalues, scores, loadings = hl.hwe_normalized_pca(
        mt.GT, compute_loadings=True, k=20)
    hl.Table.from_pandas(pd.DataFrame(eigenvalues)).export(eigenvalues_path)
    scores.write(scores_path, overwrite=True)
    loadings.write(loadings_path, overwrite=True)
def query(output, pop):  # pylint: disable=too-many-locals
    """Query script entry point."""
    hl.init(default_reference='GRCh38')
    mt = hl.read_matrix_table(HGDP1KG_TOBWGS)
    if pop:
        # Get samples from the specified population only
        mt = mt.filter_cols(
            (mt.hgdp_1kg_metadata.population_inference.pop == pop.lower())
            | (mt.s.contains('TOB')))
    else:
        mt = mt.filter_cols(mt.s.contains('TOB'))

    # Perform PCA
    eigenvalues_path = f'{output}/eigenvalues.csv'
    scores_path = f'{output}/scores.ht'
    loadings_path = f'{output}/loadings.ht'
    eigenvalues, scores, loadings = hl.hwe_normalized_pca(
        mt.GT, compute_loadings=True, k=20)
    eigenvalues_df = pd.DataFrame(eigenvalues)
    eigenvalues_df.to_csv(eigenvalues_path, index=False)
    scores.write(scores_path, overwrite=True)
    loadings.write(loadings_path, overwrite=True)

    # get TOB-WGS allele frequencies
    tob_wgs = hl.read_matrix_table(TOB_WGS).key_rows_by('locus', 'alleles')
    tob_wgs = tob_wgs.annotate_entries(GT=lgt_to_gt(tob_wgs.LGT, tob_wgs.LA))
    tob_wgs = tob_wgs.annotate_rows(
        gt_stats=hl.agg.call_stats(tob_wgs.GT, tob_wgs.alleles))

    # Get gnomAD allele frequency of variants that aren't in TOB-WGS
    loadings_gnomad = hl.read_table(GNOMAD_LIFTOVER_LOADINGS).key_by(
        'locus', 'alleles')
    hgdp_1kg = hl.read_matrix_table(GNOMAD_HGDP_1KG_MT)
    hgdp_1kg_row = hgdp_1kg.rows()[loadings_gnomad.locus, loadings_gnomad.alleles]
    tob_wgs_row = tob_wgs.rows()[loadings_gnomad.locus, loadings_gnomad.alleles]
    loadings_gnomad = loadings_gnomad.annotate(
        gnomad_AF=hgdp_1kg_row.gnomad_freq.AF,
        gnomad_popmax_AF=hgdp_1kg_row.gnomad_popmax.AF,
        TOB_WGS_AF=tob_wgs_row.gt_stats.AF,
    )
    population_af_metadata = hgdp_1kg.gnomad_freq_meta.collect()
    loadings_gnomad = loadings_gnomad.annotate_globals(
        gnomad_freq_meta=population_af_metadata)
    gnomad_variants = loadings_gnomad.drop('loadings')
    gnomad_variants_path = f'{output}/gnomad_annotated_variants.mt'
    gnomad_variants.write(gnomad_variants_path)
def query():
    """Query script entry point."""
    hl.init(default_reference='GRCh38')
    snp_chip = hl.read_matrix_table(SNP_CHIP)
    eigenvalues_path = output_path('eigenvalues.ht')
    scores_path = output_path('scores.ht')
    loadings_path = output_path('loadings.ht')
    # Perform PCA
    eigenvalues, scores, loadings = hl.hwe_normalized_pca(
        snp_chip.GT, compute_loadings=True, k=5)
    hl.Table.from_pandas(pd.DataFrame(eigenvalues)).export(eigenvalues_path)
    scores.write(scores_path, overwrite=True)
    loadings.write(loadings_path, overwrite=True)
def project_pcs_relateds(mt_ldpruned, mt, covar_pc_num):
    """
    Takes an LD-pruned matrix table, calculates PCs, and projects those PCs back to
    related individuals included in mt

    :param mt_ldpruned: matrix table with relatives removed, MAF and LD pruned
    :param mt: matrix table with relatives included
    :param covar_pc_num: number of principal components as covariates to calculate
    :return: returns matrix table with relatives, with PCs annotated
    """
    logging.info('Calculating principal components, annotating main dataset.')
    eigenvalues, scores, loadings = hl.hwe_normalized_pca(
        mt_ldpruned.GT, k=covar_pc_num, compute_loadings=True)

    # Project PCs to related individuals
    # mt of related individuals only, not pop outliers or failing samples QC
    related_mt = mt.filter_cols(
        (mt.related_to_remove == True) & (mt.pop_outlier_sample == False) &
        (hl.len(mt.failing_samples_qc) == 0), keep=True)

    mt_ldpruned = mt_ldpruned.annotate_rows(
        pca_af=hl.agg.mean(mt_ldpruned.GT.n_alt_alleles()) / 2)
    mtrows = mt_ldpruned.rows()
    loadings = loadings.annotate(
        pca_af=mtrows[loadings.locus, loadings.alleles].pca_af)
    related_scores = pc_project(related_mt, loadings)

    # Add pcs as annotations to main table
    mt = mt.annotate_cols(**{
        'pc' + str(k + 1): scores[mt.s].scores[k]
        for k in range(covar_pc_num)
    })
    # Explanation: for k principal components in range 0 to covar_pc_num-1,
    # make pc k+1 (to start at pc1 instead of pc0) be the corresponding score
    # (keyed by mt.s) from the table scores

    # Add pcs for related individuals
    mt = mt.annotate_cols(**{
        'pc' + str(k + 1): hl.or_else(
            mt['pc' + str(k + 1)], related_scores[mt.s].scores[k])
        for k in range(covar_pc_num)
    })
    # Explanation: for k principal components in range from 0 to (covar_pc_num-1)
    # give either the existing pcX, or if missing give the corresponding score
    # (keyed by mt.s) from the table related_scores
    return mt
def run_pca(prune_out: str, pca_prefix: str, overwrite: bool = False):
    """
    Run PCA on a dataset

    :param prune_out: path to the LD-pruned dataset to run PCA on
    :param pca_prefix: directory and filename prefix for where to put PCA output
    :return:
    """
    mt = hl.read_matrix_table(prune_out)
    pca_evals, pca_scores, pca_loadings = hl.hwe_normalized_pca(
        mt.GT, k=20, compute_loadings=True)
    pca_mt = mt.annotate_rows(pca_af=hl.agg.mean(mt.GT.n_alt_alleles()) / 2)
    pca_loadings = pca_loadings.annotate(
        pca_af=pca_mt.rows()[pca_loadings.key].pca_af)

    pca_scores.write(pca_prefix + 'scores.ht', overwrite)
    pca_scores = hl.read_table(pca_prefix + 'scores.ht')
    pca_scores = pca_scores.transmute(
        **{f'PC{i}': pca_scores.scores[i - 1] for i in range(1, 21)})
    pca_scores.export(pca_prefix + 'scores.txt.bgz')  # individual-level PCs

    pca_loadings.export(pca_prefix + 'loadings.txt.bgz')
    pca_loadings.write(pca_prefix + 'loadings.ht', overwrite)  # PCA loadings

    # export loadings in plink format
    ht = hl.read_table(pca_prefix + 'loadings.ht')
    ht = ht.key_by()
    ht_loadings = ht.select(
        ID=hl.variant_str(ht.locus, ht.alleles),
        ALT=ht.alleles[1],
        **{f"PC{i}": ht.loadings[i - 1] for i in range(1, 21)})
    ht_afreq = ht.select(
        **{
            "#ID": hl.variant_str(ht.locus, ht.alleles),
            "REF": ht.alleles[0],
            "ALT": ht.alleles[1],
            "ALT1_FREQ": ht.pca_af
        })
    ht_loadings.export(pca_prefix + 'loadings.plink.tsv')
    ht_afreq.export(pca_prefix + 'loadings.plink.afreq')
def query(output):  # pylint: disable=too-many-locals
    """Query script entry point."""
    hl.init(default_reference='GRCh38')
    hgdp_1kg = hl.read_matrix_table(GNOMAD_HGDP_1KG_MT)
    tob_wgs = hl.read_matrix_table(TOB_WGS).key_rows_by('locus', 'alleles')
    loadings = hl.read_table(GNOMAD_LIFTOVER_LOADINGS).key_by('locus', 'alleles')

    # filter to loci that are contained in both tables and the loadings after densifying
    tob_wgs = hl.experimental.densify(tob_wgs)
    hgdp_1kg = hgdp_1kg.filter_rows(
        hl.is_defined(loadings.index(hgdp_1kg['locus'], hgdp_1kg['alleles']))
        & hl.is_defined(
            tob_wgs.index_rows(hgdp_1kg['locus'], hgdp_1kg['alleles'])))
    tob_wgs = tob_wgs.semi_join_rows(hgdp_1kg.rows())

    # Entries and columns must be identical
    tob_wgs_select = tob_wgs.select_entries(
        GT=lgt_to_gt(tob_wgs.LGT, tob_wgs.LA))
    hgdp_1kg_select = hgdp_1kg.select_entries(hgdp_1kg.GT)
    hgdp_1kg_select = hgdp_1kg_select.select_cols()

    # Join datasets
    hgdp1kg_tobwgs_joined = hgdp_1kg_select.union_cols(tob_wgs_select)
    # Add in metadata information
    hgdp_1kg_metadata = hgdp_1kg.cols()
    hgdp1kg_tobwgs_joined = hgdp1kg_tobwgs_joined.annotate_cols(
        hgdp_1kg_metadata=hgdp_1kg_metadata[hgdp1kg_tobwgs_joined.s])

    mt_path = f'{output}/hgdp1kg_tobwgs_joined_all_samples.mt'
    if not hl.hadoop_exists(mt_path):
        hgdp1kg_tobwgs_joined.write(mt_path)
    hgdp1kg_tobwgs_joined = hl.read_matrix_table(mt_path)

    # Perform PCA
    eigenvalues_path = f'{output}/eigenvalues.csv'
    scores_path = f'{output}/scores.ht'
    loadings_path = f'{output}/loadings.ht'
    eigenvalues, scores, loadings = hl.hwe_normalized_pca(
        hgdp1kg_tobwgs_joined.GT, compute_loadings=True, k=20)

    # save the list of eigenvalues
    eigenvalues_df = pd.DataFrame(eigenvalues)
    eigenvalues_df.to_csv(eigenvalues_path, index=False)
    # save the scores and loadings as a hail table
    scores.write(scores_path, overwrite=True)
    loadings.write(loadings_path, overwrite=True)
def run_pca(my_data, out_prefix):
    pca_evals, pca_scores, pca_loadings = hl.hwe_normalized_pca(
        my_data.GT, k=20, compute_loadings=True)
    pca_mt = my_data.annotate_rows(
        pca_af=hl.agg.mean(my_data.GT.n_alt_alleles()) / 2)
    pca_loadings = pca_loadings.annotate(
        pca_af=pca_mt.rows()[pca_loadings.key].pca_af)

    pca_scores.write(out_prefix + 'scores.ht', args.overwrite)
    pca_scores = hl.read_table(out_prefix + 'scores.ht')
    pca_scores = pca_scores.transmute(
        **{f'PC{i}': pca_scores.scores[i - 1] for i in range(1, 21)})
    pca_scores.export(out_prefix + 'scores.txt.bgz')  # individual-level PCs

    pca_loadings.write(out_prefix + 'loadings.ht', args.overwrite)  # PCA loadings
def query(output):
    """Query script entry point."""
    hl.init(default_reference='GRCh38')
    eigenvalues_path = f'{output}/eigenvalues.csv'
    scores_path = f'{output}/scores.ht'
    loadings_path = f'{output}/loadings.ht'
    mt = hl.read_matrix_table(GNOMAD_HGDP_1KG_MT)

    # test on 100 samples
    mt_head = mt.head(n=mt.count_rows(), n_cols=100)
    eigenvalues, scores, loadings = hl.hwe_normalized_pca(
        mt_head.GT, compute_loadings=True, k=20
    )

    # save the list of eigenvalues
    eigenvalues_df = pd.DataFrame(eigenvalues)
    eigenvalues_df.to_csv(eigenvalues_path, index=False)
    # save the scores and loadings as a hail table
    scores.write(scores_path, overwrite=True)
    loadings.write(loadings_path, overwrite=True)
def run_pca(mt: hl.MatrixTable, out_prefix: str, overwrite: bool = False):
    """
    Run PCA on a dataset

    :param mt: dataset to run PCA on
    :param out_prefix: directory and filename prefix for where to put PCA output
    :return:
    """
    pca_evals, pca_scores, pca_loadings = hl.hwe_normalized_pca(
        mt.GT, k=20, compute_loadings=True)
    pca_mt = mt.annotate_rows(pca_af=hl.agg.mean(mt.GT.n_alt_alleles()) / 2)
    pca_loadings = pca_loadings.annotate(
        pca_af=pca_mt.rows()[pca_loadings.key].pca_af)

    pca_scores.write(out_prefix + 'scores.ht', overwrite)
    pca_scores = hl.read_table(out_prefix + 'scores.ht')
    pca_scores = pca_scores.transmute(
        **{f'PC{i}': pca_scores.scores[i - 1] for i in range(1, 21)})
    pca_scores.export(out_prefix + 'scores.txt.bgz')  # individual-level PCs

    pca_loadings.write(out_prefix + 'loadings.ht', overwrite)  # PCA loadings
def main(args):
    if args.join_qc_mt:
        v2_qc_mt_liftover = get_liftover_v2_qc_mt('exomes', ld_pruned=True, release_only=True)
        v2_qc_mt_liftover = v2_qc_mt_liftover.key_cols_by(s=v2_qc_mt_liftover.s, data_type="v2_exomes")
        v3_qc_mt = qc.mt()
        v3_qc_mt = v3_qc_mt.filter_cols(meta.ht()[v3_qc_mt.col_key].release)
        v3_qc_mt = v3_qc_mt.select_rows().select_cols()
        v3_qc_mt = v3_qc_mt.key_cols_by(s=v3_qc_mt.s, data_type="v3_genomes")
        joint_qc_mt = v2_qc_mt_liftover.union_cols(v3_qc_mt)
        joint_qc_mt.write("gs://gnomad-tmp/v2_exomes_v3_joint_qc.mt", overwrite=args.overwrite)

    if args.run_pc_relate:
        logger.info('Running PC-Relate')
        logger.warning("PC-relate requires SSDs and doesn't work with preemptible workers!")
        joint_qc_mt = hl.read_matrix_table("gs://gnomad-tmp/v2_exomes_v3_joint_qc.mt")
        joint_qc_mt = joint_qc_mt.sample_rows(0.1)
        eig, scores, _ = hl.hwe_normalized_pca(joint_qc_mt.GT, k=10, compute_loadings=False)
        scores = scores.checkpoint(v2_v3_pc_relate_pca_scores.path,
                                   overwrite=args.overwrite,
                                   _read_if_exists=not args.overwrite)
        relatedness_ht = hl.pc_relate(joint_qc_mt.GT,
                                      min_individual_maf=0.01,
                                      scores_expr=scores[joint_qc_mt.col_key].scores,
                                      block_size=4096,
                                      min_kinship=0.1,
                                      statistics='all')
        relatedness_ht.write(v2_v3_relatedness.path, args.overwrite)
def run_ref_pca(
        mt: hl.MatrixTable = None,
        npcs: int = 20,
        data_basename: str = None,
        out_dir: str = None):
    """
    Run PCA on a dataset

    :param mt: dataset to run PCA on
    :param npcs: number of principal components to be used in PCA
    :param data_basename: input data basename so outputs can be saved in correct dir
    :param out_dir: directory and filename prefix for where to put PCA output
    :return:
    """
    pca_evals, pca_scores, pca_loadings = hl.hwe_normalized_pca(
        mt.GT, k=npcs, compute_loadings=True)
    pca_mt = mt.annotate_rows(pca_af=hl.agg.mean(mt.GT.n_alt_alleles()) / 2)
    pca_loadings = pca_loadings.annotate(
        pca_af=pca_mt.rows()[pca_loadings.key].pca_af)

    # pca_scores.write(out_dir + 'GWASpy/PCA/' + '1000G_scores.ht', overwrite=True)
    # pca_scores = hl.read_table(out_dir + 'GWASpy/PCA/' + '1000G_scores.ht')
    pca_scores = pca_scores.transmute(
        **{f'PC{i}': pca_scores.scores[i - 1] for i in range(1, npcs + 1)})
    pca_scores.export(
        f'{out_dir}GWASpy/PCA/{data_basename}/pca_project/1kg_hgdp.project.pca.scores.txt.bgz')  # individual-level PCs

    pca_loadings.write(
        f'{out_dir}GWASpy/PCA/{data_basename}/pca_project/1kg_hgdp_loadings.ht',
        overwrite=True)  # PCA loadings
def run_gwas(vcf_file, phenotypes_file, output_file):
    table = hl.import_table(phenotypes_file, impute=True).key_by('Sample')
    hl.import_vcf(vcf_file).write('tmp.mt')
    mt = hl.read_matrix_table('tmp.mt')
    mt = mt.annotate_cols(pheno=table[mt.s])

    mt = hl.sample_qc(mt)
    mt = mt.filter_cols((mt.sample_qc.dp_stats.mean >= 4) & (mt.sample_qc.call_rate >= 0.97))

    ab = mt.AD[1] / hl.sum(mt.AD)
    filter_condition_ab = ((mt.GT.is_hom_ref() & (ab <= 0.1)) |
                           (mt.GT.is_het() & (ab >= 0.25) & (ab <= 0.75)) |
                           (mt.GT.is_hom_var() & (ab >= 0.9)))
    mt = mt.filter_entries(filter_condition_ab)

    mt = hl.variant_qc(mt)
    mt = mt.filter_rows(mt.variant_qc.AF[1] > 0.01)

    eigenvalues, pcs, _ = hl.hwe_normalized_pca(mt.GT)

    mt = mt.annotate_cols(scores=pcs[mt.s].scores)
    gwas = hl.linear_regression_rows(
        y=mt.pheno.CaffeineConsumption,
        x=mt.GT.n_alt_alleles(),
        covariates=[
            1.0, mt.pheno.isFemale, mt.scores[0], mt.scores[1], mt.scores[2]
        ])

    gwas = gwas.select(SNP=hl.variant_str(gwas.locus, gwas.alleles), P=gwas.p_value)
    gwas = gwas.key_by(gwas.SNP)
    gwas = gwas.select(gwas.P)
    gwas.export(f'{output_file}.assoc', header=True)

    hl.export_plink(mt, output_file, fam_id=mt.s, ind_id=mt.s)
print(num_rows)
prob = min(1, 80000 / num_rows)
vds = vds.sample_rows(prob)

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# pca
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
print('filter VDS...')
vds = vds.filter_cols(hl.is_defined(rel_exclusion[vds.s]), keep=False)
vds = vds.filter_rows(hl.is_defined(mhc_chr8inv[vds.locus]), keep=False)
vds = vds.filter_rows(
    (vds.locus.contig == "chrX") | (vds.locus.contig == "chrY"), keep=False)

print('PCA...')
# hwe_normalized_pca expects a call expression rather than the whole dataset
eigenvalues, scores, loadings = hl.hwe_normalized_pca(vds.GT, k=10)

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# write output
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
with hl.utils.hadoop_open(pca_value_file, 'w') as f:
    for val in eigenvalues:
        f.write(str(val) + '\n')
scores.flatten().export(pca_score_file)

# print runtime
stop = timeit.default_timer()
print("runtime: " + str(stop - start) + " seconds")
# print(mt.count())  # (12194564, 1092)

# annotate MT file
table = (hl.import_table('gs://ines-work/KG-annotation-with-sexencoder.csv',
                         delimiter=',', missing='', quote='"',
                         types={'Gender_Classification': hl.tfloat64})
         .key_by('Sample'))
mt = mt.annotate_cols(**table[mt.s])
# print(mt.aggregate_cols(agg.counter(mt.Gender_Classification)))  # {'0.0': 567, '1.0': 525}

# pca
pca_eigenvalues, pca_scores, _ = hl.hwe_normalized_pca(mt.GT, k=2)
mt = mt.annotate_cols(pca=pca_scores[mt.s])

x = pca_scores.scores[0]
y = pca_scores.scores[1]
label = mt.cols()[pca_scores.s].Super_Population
collect_all = nullable(bool)

if isinstance(x, Expression) and isinstance(y, Expression):
    agg_f = x._aggregation_method()
    if isinstance(label, Expression):
        if collect_all:
            res = hail.tuple([x, y, label]).collect()
            label = [point[2] for point in res]
        else:
            res = agg_f(
def main(args):
    mt = hl.read_matrix_table(args.matrixtable)

    # ld pruning
    pruned_ht = hl.ld_prune(mt.GT, r2=0.1)
    pruned_mt = mt.filter_rows(hl.is_defined(pruned_ht[mt.row_key]))
    pruned_mt.write(f"{args.output_dir}/mt_ldpruned.mt", overwrite=True)

    # PC relate
    pruned_mt = pruned_mt.select_entries(
        GT=hl.unphased_diploid_gt_index_call(pruned_mt.GT.n_alt_alleles()))
    eig, scores, _ = hl.hwe_normalized_pca(pruned_mt.GT, k=10, compute_loadings=False)
    scores.write(f"{args.output_dir}/mt_pruned.pca_scores.ht", overwrite=True)
    relatedness_ht = hl.pc_relate(pruned_mt.GT,
                                  min_individual_maf=0.05,
                                  scores_expr=scores[pruned_mt.col_key].scores,
                                  block_size=4096,
                                  min_kinship=0.05,
                                  statistics='kin2')
    relatedness_ht.write(f"{args.output_dir}/mt_relatedness.ht", overwrite=True)
    pairs = relatedness_ht.filter(relatedness_ht['kin'] > 0.125)
    related_samples_to_remove = hl.maximal_independent_set(pairs.i, pairs.j, keep=False)
    related_samples_to_remove.write(
        f"{args.output_dir}/mt_related_samples_to_remove.ht", overwrite=True)

    pca_mt = pruned_mt.filter_cols(
        hl.is_defined(related_samples_to_remove[pruned_mt.col_key]), keep=False)
    related_mt = pruned_mt.filter_cols(
        hl.is_defined(related_samples_to_remove[pruned_mt.col_key]), keep=True)
    variants, samples = pca_mt.count()
    print(f"{samples} samples after relatedness step.")

    # Population pca
    plink_mt = pca_mt.annotate_cols(uid=pca_mt.s).key_cols_by('uid')
    hl.export_plink(plink_mt, f"{args.output_dir}/mt_unrelated.plink",
                    fam_id=plink_mt.uid, ind_id=plink_mt.uid)
    pca_evals, pca_scores, pca_loadings = hl.hwe_normalized_pca(
        pca_mt.GT, k=20, compute_loadings=True)
    pca_af_ht = pca_mt.annotate_rows(
        pca_af=hl.agg.mean(pca_mt.GT.n_alt_alleles()) / 2).rows()
    pca_loadings = pca_loadings.annotate(
        pca_af=pca_af_ht[pca_loadings.key].pca_af)
    pca_scores.write(f"{args.output_dir}/mt_pca_scores.ht", overwrite=True)
    pca_loadings.write(f"{args.output_dir}/mt_pca_loadings.ht", overwrite=True)

    pca_mt = pca_mt.annotate_cols(scores=pca_scores[pca_mt.col_key].scores)

    variants, samples = related_mt.count()
    print('Projecting population PCs for {} related samples...'.format(samples))
    # related_scores = pc_project(related_mt, pca_loadings)
    # relateds = related_mt.cols()
    # relateds = relateds.annotate(scores=related_scores[relateds.key].scores)

    pca_mt.write(f"{args.output_dir}/mt_pca.mt", overwrite=True)
    p = hl.plot.scatter(pca_mt.scores[0], pca_mt.scores[1],
                        title='PCA', xlabel='PC1', ylabel='PC2')
    output_file(f"{args.plot_dir}/pca.html")
    save(p)
def hwe_normalized_pca():
    mt = get_mt()
    mt = mt.filter_rows(mt.info.AF[0] > 0.01)
    hl.hwe_normalized_pca(mt.GT)
mt = mt.annotate_rows(qc=mt.qc.annotate(
    p_value_hwe=hl.case()
    .when(mt.locus.in_autosome(), mt.qc.het_freq_hwe)
    .default(hl.agg.filter(
        mt.imputesex.impute_sex.is_female,
        hl.agg.hardy_weinberg_test(mt.GT).het_freq_hwe))))

mt = mt.annotate_rows(annotation=mt.annotation.annotate(
    info=mt.annotation.info.annotate(
        AC=mt.annotation.info.AC[mt.annotation.a_index - 1],
        AF=mt.annotation.info.AF[mt.annotation.a_index - 1],
    )))

mt = hl.sample_qc(mt)

mt_pca = mt.filter_rows(hl.is_defined(ht_final_pruned_variants[mt.row_key]))
pca_output = hl.hwe_normalized_pca(mt_pca.GT, k=10)
# hwe_normalized_pca returns (eigenvalues, scores, loadings); keep the scores table
pca_output = pca_output[1].key_by('s')
pca_output = pca_output.annotate(
    PC1=pca_output.scores[0], PC2=pca_output.scores[1],
    PC3=pca_output.scores[2], PC4=pca_output.scores[3],
    PC5=pca_output.scores[4], PC6=pca_output.scores[5],
    PC7=pca_output.scores[6], PC8=pca_output.scores[7],
    PC9=pca_output.scores[8], PC10=pca_output.scores[9])

mt = mt.annotate_cols(pca=pca_output[mt.s])
n = mt.count()
# pruned_ht = hl.ld_prune(mt_vqc_filtered.GT, r2=0.2, bp_window_size=500000)
pruned_ht = hl.ld_prune(mt_vqc_filtered.GT, r2=0.1)
pruned_mt = mt_vqc_filtered.filter_rows(
    hl.is_defined(pruned_ht[mt_vqc_filtered.row_key]))
pruned_mt = pruned_mt.filter_rows(
    hl.is_defined(bed_to_exclude_pca[pruned_mt.locus]), keep=False)
pruned_mt.write(
    f"{tmp_dir}/ddd-elgh-ukbb/1000g_chr1_20_snps_filtered_ldpruned.mt",
    overwrite=True)

# run pca
logger.info("run pca")
pca_evals, pca_scores, loadings_ht = hl.hwe_normalized_pca(
    pruned_mt.GT, k=10, compute_loadings=True)
pruned_mt = pruned_mt.annotate_rows(
    af=hl.agg.mean(pruned_mt.GT.n_alt_alleles()) / 2)
loadings_ht = loadings_ht.annotate(af=pruned_mt.rows()[loadings_ht.key].af)
pca_scores.write(f"{tmp_dir}/ddd-elgh-ukbb/100g_pca_scores.ht",
                 overwrite=True)
loadings_ht.write(f"{tmp_dir}/ddd-elgh-ukbb/1000g_pca_loadings.ht",
                  overwrite=True)

with open(f"{temp_dir}/ddd-elgh-ukbb/1000g_pca_evals.txt", 'w') as f:
    for val in pca_evals:
        f.write(str(val))

ht = pc_project(project_mt.GT, loadings_ht.loadings, loadings_ht.af)
ht.write(f"{tmp_dir}/ddd-elgh-ukbb/pc_project_our_data.ht", overwrite=True)
    10000,
    pop_dist=[0.1, 0.2, 0.3, 0.2, 0.2],
    fst=[.02, .06, .04, .12, .08],
    af_dist=hl.rand_beta(a=0.01, b=2.0, lower=0.05, upper=1.0),
    mixture=True)
mt = hl.variant_qc(mt)
mt.write('bn.mt', overwrite=True)

mt = hl.read_matrix_table('bn.mt')

if not hl.hadoop_exists('scores.t'):
    # Generate data for demonstration purposes; this should already exist
    scores = hl.hwe_normalized_pca(mt.GT, k=5)[1]
    scores = scores.annotate(**mt.cols()[scores.sample_idx])
    scores.write('scores.t')
pcs = hl.read_table('scores.t')


@routes.get('')
@routes.get('/')
async def get_sha(request):  # pylint: disable=unused-argument
    arr = pcs.collect()
    pca_plot = px.scatter_3d([{
        'id': x['sample_idx'],
        'pop': np.argmax(x['pop']),
        **{f'PC{i}': x['scores'][i] for i in range(5)}