def test_hadoop_exists(self):
    with hadoop_open(f'{BUCKET}/test_exists.txt', 'w') as f:
        f.write("HELLO WORLD")
    r_exists = f'{BUCKET}/test_exists.txt'
    r_not_exists = f'{BUCKET}/not_exists.txt'
    self.assertTrue(hl.hadoop_exists(r_exists))
    self.assertFalse(hl.hadoop_exists(r_not_exists))
def test_hadoop_exists(self, bucket=None):
    if bucket is None:
        bucket = self.remote_bucket
    with hadoop_open(f'{bucket}/test_exists.txt', 'w') as f:
        f.write("HELLO WORLD")
    r_exists = f'{bucket}/test_exists.txt'
    r_not_exists = f'{bucket}/not_exists.txt'
    self.assertTrue(hl.hadoop_exists(r_exists))
    self.assertFalse(hl.hadoop_exists(r_not_exists))
def test_hadoop_exists(self, prefix: Optional[str] = None):
    if prefix is None:
        prefix = self.remote_tmpdir
    with hadoop_open(f'{prefix}/test_exists.txt', 'w') as f:
        f.write("HELLO WORLD")
    r_exists = f'{prefix}/test_exists.txt'
    r_not_exists = f'{prefix}/not_exists.txt'
    self.assertTrue(hl.hadoop_exists(r_exists))
    self.assertFalse(hl.hadoop_exists(r_not_exists))
def get_liftover_v2_qc_mt(data_type: str, ld_pruned: bool, release_only: bool = False, overwrite: bool = False) -> hl.MatrixTable:
    """
    Returns MatrixTable for sample QC purposes on build 38: can be exomes, genomes, or joint
    (joint dataset can also be ld_pruned=True)
    Criteria: callrate > 0.99, AF > 0.001, SNPs only, bi-allelics only
    Note: sites where the locus changes chromosome are discarded
    """
    path = qc_mt_path(data_type, ld_pruned, 'GRCh38')
    if not overwrite and hl.hadoop_exists(path):
        grch38_qc_mt = hl.read_matrix_table(path)
    else:
        grch38_qc_mt = hl.read_matrix_table(qc_mt_path(data_type, ld_pruned=ld_pruned))
        get_liftover_genome(grch38_qc_mt)
        grch38_qc_mt = grch38_qc_mt.key_rows_by()
        grch38_qc_mt = grch38_qc_mt.transmute_rows(
            locus=hl.liftover(grch38_qc_mt.locus, 'GRCh38'),
            locus37=grch38_qc_mt.locus)
        grch38_qc_mt = grch38_qc_mt.filter_rows(
            grch38_qc_mt.locus.contig == 'chr' + grch38_qc_mt.locus37.contig)
        grch38_qc_mt = grch38_qc_mt.key_rows_by(
            locus=grch38_qc_mt.locus, alleles=grch38_qc_mt.alleles)
        grch38_qc_mt = grch38_qc_mt.checkpoint(path, overwrite=overwrite)
    if release_only:
        meta = get_gnomad_meta(data_type)
        grch38_qc_mt = grch38_qc_mt.filter_cols(meta[grch38_qc_mt.col_key].release)
    return grch38_qc_mt
def to_plink(pops: list, subsets_dir, mt, ht_sample, bfile_path, export_varid: bool = True, overwrite=False):
    r'''
    Exports matrix table to PLINK2 files
    NOTE: These files will need to be split up by chromosome before plink_clump.py can be run.
    '''
    assert 'GT' in mt.entry, "mt must have 'GT' as an entry field"
    assert mt.GT.dtype == hl.tcall, "entry field 'GT' must be of type `Call`"
    if not overwrite and all([hl.hadoop_exists(f'{bfile_path}.{suffix}') for suffix in ['bed', 'bim']]):
        print(f'\nPLINK .bed and .bim files already exist for {bfile_path}')
        print(bfile_path)
    else:
        print(f'Saving to bfile prefix {bfile_path}')
        mt_sample = mt.annotate_rows(varid=hl.str(mt.locus) + ':' + mt.alleles[0] + ':' + mt.alleles[1])
        mt_sample = mt_sample.filter_cols(hl.is_defined(ht_sample[mt_sample.s]))
        hl.export_plink(dataset=mt_sample,
                        output=bfile_path,
                        ind_id=mt_sample.s,
                        varid=mt_sample.varid)  # varid used to be rsid
def query(): """Query script entry point.""" hl.init(default_reference='GRCh38') tob_wgs = hl.read_matrix_table(TOB_WGS) hgdp_1kg = hl.read_matrix_table(GNOMAD_HGDP_1KG_MT) # keep loci that are contained in the densified, filtered tob-wgs mt hgdp_1kg = hgdp_1kg.semi_join_rows(tob_wgs.rows()) # Entries and columns must be identical tob_wgs_select = tob_wgs.select_entries( GT=lgt_to_gt(tob_wgs.LGT, tob_wgs.LA)).select_cols() hgdp_1kg_select = hgdp_1kg.select_entries(hgdp_1kg.GT).select_cols() # Join datasets hgdp1kg_tobwgs_joined = hgdp_1kg_select.union_cols(tob_wgs_select) # Add in metadata information hgdp_1kg_metadata = hgdp_1kg.cols() hgdp1kg_tobwgs_joined = hgdp1kg_tobwgs_joined.annotate_cols( hgdp_1kg_metadata=hgdp_1kg_metadata[hgdp1kg_tobwgs_joined.s]) # save this for population-level PCAs mt_path = output_path('hgdp1kg_tobwgs_joined_all_samples.mt') if not hl.hadoop_exists(mt_path): hgdp1kg_tobwgs_joined.write(mt_path) # Perform PCA eigenvalues_path = output_path('eigenvalues.ht') scores_path = output_path('scores.ht') loadings_path = output_path('loadings.ht') eigenvalues, scores, loadings = hl.hwe_normalized_pca( hgdp1kg_tobwgs_joined.GT, compute_loadings=True, k=20) hl.Table.from_pandas(pd.DataFrame(eigenvalues)).export(eigenvalues_path) scores.write(scores_path, overwrite=True) loadings.write(loadings_path, overwrite=True)
def main(df_x_path, df_y_path, output_path, python_image):
    backend = hb.ServiceBackend()
    b = hb.Batch(name='rf-loo', default_python_image=python_image)
    with hl.hadoop_open(df_y_path) as f:
        local_df_y = pd.read_table(f, header=0, index_col=0)
    df_x_input = b.read_input(df_x_path)
    df_y_input = b.read_input(df_y_path)
    results = []
    for window in local_df_y.index.to_list():
        checkpoint = checkpoint_path(window)
        if hl.hadoop_exists(checkpoint):
            result = b.read_input(checkpoint)
            results.append(result)
            continue
        j = b.new_python_job()
        result = j.call(random_forest, df_x_input, df_y_input, window)
        tsv_result = j.call(as_tsv, result)
        tsv_result = tsv_result.as_str()
        b.write_output(tsv_result, checkpoint)
        results.append(tsv_result)
    output = hb.concatenate(b, results)
    b.write_output(output, output_path)
    b.run(wait=False)
    backend.close()
def create_rf_2_0_2_rank(data_type: str, beta: bool) -> None: """ Creates a rank file for 2.0.2 RF and writes it to its correct location. :param str data_type: One of 'exomes' or 'genomes' :param bool beta: If set, then creates the table for the "beta" 2.0.2 RF with QD / max(p(AB)) :return: Nothing :rtype: None """ logger.info( f"Creating rank file for {data_type} RF 2.0.2{'beta' if beta else ''}") if not hl.hadoop_exists( f'gs://gnomad-tmp/gnomad_rf_2_0_2_{data_type}_{str(beta)}_tmp.ht'): ht = hl.import_table(get_2_0_2_rf_path(data_type, beta), types={'chrom': hl.tstr}, impute=True, min_partitions=1000) if 'chrom' in ht.row: ht = ht.transmute(locus=hl.locus(ht.chrom, ht.pos), alleles=[ht.ref, ht.alt]) else: ht = ht.transmute( v=hl.parse_variant(ht.v), rfprob=ht.rf_rpob_tp # Yes, this is awful ) ht = ht.transmute(locus=ht.v.locus, alleles=ht.v.alleles) ht = ht.key_by('locus', 'alleles') gnomad_ht = get_gnomad_annotations(data_type) ht = ht.annotate(**gnomad_ht[ht.key], score=ht.rfprob) ht.write( f'gs://gnomad-tmp/gnomad_rf_2_0_2_{data_type}_{str(beta)}_tmp.ht') ht = hl.read_table( f'gs://gnomad-tmp/gnomad_rf_2_0_2_{data_type}_{str(beta)}_tmp.ht') ht = add_rank(ht, score_expr=1 - ht.score, subrank_expr={ 'singleton_rank': ht.singleton, 'biallelic_rank': ~ht.was_split, 'biallelic_singleton_rank': ~ht.was_split & ht.singleton, 'adj_rank': ht.ac > 0, 'adj_biallelic_rank': ~ht.was_split & (ht.ac > 0), 'adj_singleton_rank': ht.singleton & (ht.ac > 0), 'adj_biallelic_singleton_rank': ~ht.was_split & ht.singleton & (ht.ac > 0) }) ht.write(score_ranking_path(data_type, 'rf_2.0.2{}'.format('_beta' if beta else '')), overwrite=True)
def get_files_in_parent_directory(parent_dir, fname: str = 'variant_results.ht'):
    all_outputs = []
    for directory in parent_dir:
        if not directory['is_dir']:
            continue
        file_path = f'{directory["path"]}/{fname}'
        if hl.hadoop_exists(f'{file_path}/_SUCCESS'):
            all_outputs.append(file_path)
    return all_outputs
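# A minimal usage sketch for the helper above, with a hypothetical bucket path. It assumes
# the listing comes from hl.utils.hadoop_ls, which yields dicts carrying the 'path' and
# 'is_dir' entries that get_files_in_parent_directory reads.
listing = hl.utils.hadoop_ls('gs://my-bucket/pipeline-output')  # hypothetical path
for result_path in get_files_in_parent_directory(listing):
    print(result_path)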
def query(rerun):
    """Query script entry point."""
    hl.init(default_reference='GRCh38')
    sample_qc_path = output_path('sample_qc.mt')
    if rerun or not hl.hadoop_exists(sample_qc_path):
        mt = hl.read_matrix_table(GNOMAD_HGDP_1KG_MT)
        mt = mt.head(100, n_cols=100)
        mt_qc = hl.sample_qc(mt)
        mt_qc.write(sample_qc_path)
    mt_qc = hl.read_matrix_table(sample_qc_path)
    plot_filename = output_path('call_rate_plot.png', 'web')
    if rerun or not hl.hadoop_exists(plot_filename):
        call_rate_plot = hl.plot.histogram(mt_qc.sample_qc.call_rate,
                                           range=(0, 1),
                                           legend='Call rate')
        with hl.hadoop_open(plot_filename, 'wb') as f:
            get_screenshot_as_png(call_rate_plot).save(f, format='PNG')
def test_hadoop_mkdir_p(self):
    test_text = "HELLO WORLD"
    with hadoop_open(resource('./some/foo/bar.txt'), 'w') as out:
        out.write(test_text)
    self.assertTrue(hl.hadoop_exists(resource('./some/foo/bar.txt')))
    with hadoop_open(resource('./some/foo/bar.txt')) as f:
        assert (f.read() == test_text)
    hl.current_backend().fs.rmtree(resource('./some'))
def main(args): hl.init(default_reference='GRCh38', log='/load_results.log') start_time = time.time() all_phenos_ht = hl.import_table('gs://finngen-public-data-r2/summary_stats/r2_manifest.tsv', impute=True) # all_phenos_ht = all_phenos_ht.annotate(code=all_phenos_ht.phenocode.split('_', 2)[0]) all_phenos = all_phenos_ht.collect() backend = pipeline.BatchBackend(billing_project='ukb_round2') # backend = pipeline.LocalBackend(gsa_key_file='/Users/konradk/.hail/ukb-diverse-pops.json') p = pipeline.Pipeline(name='finngen_load', backend=backend, default_image='gcr.io/ukbb-exome-pharma/hail_utils:3.3', default_storage='500Mi', default_cpu=8) tasks = [] for i, pheno in enumerate(all_phenos): variant_results_ht_path = f'{results_dir}/ht/{pheno.phenocode}.ht' if not args.overwrite_results and hl.hadoop_exists(f'{variant_results_ht_path.replace(".ht", ".mt")}/_SUCCESS'): continue t: pipeline.pipeline.Task = p.new_task(name='load_pheno', attributes={'pheno': pheno.phenocode}).cpu(args.n_threads) t.command(f""" PYTHONPATH=$PYTHONPATH:/ PYSPARK_SUBMIT_ARGS="--conf spark.driver.memory=24g pyspark-shell" python3 /ukb_exomes/hail/load_finngen_results_hail.py --input_file {pheno.path_bucket} --n_threads {args.n_threads} --load_single --vep_path {vep_path} --additional_dict {shq(json.dumps(dict(pheno)))} --output_ht {variant_results_ht_path} --output_mt {variant_results_ht_path.replace('.ht', '.mt')} --overwrite """.replace('\n', ' ')) tasks.append(t) if args.limit and i == args.limit: break t: pipeline.pipeline.Task = p.new_task(name='combine').cpu(args.n_threads) t.depends_on(*tasks) t.command(f""" PYTHONPATH=$PYTHONPATH:/ PYSPARK_SUBMIT_ARGS="--conf spark.driver.memory=4g --conf spark.executor.memory=24g pyspark-shell" python3 /ukb_exomes/hail/load_finngen_results_hail.py --combine_all --input_directory {results_dir}/ht --output_ht {final_results_ht} --output_mt {final_results_ht.replace('.ht', '.mt')} --overwrite --n_threads {args.n_threads} """.replace('\n', ' ')) logger.info(f'Setup took: {time.strftime("%H:%M:%S", time.gmtime(time.time() - start_time))}') logger.info(f'Submitting: {get_tasks_from_pipeline(p)}') p.run(dry_run=args.dry_run, verbose=True, delete_scratch_on_exit=False) logger.info(f'Finished: {get_tasks_from_pipeline(p)}')
def file_exists(fname: str) -> bool:
    """
    Check whether a file exists.
    Supports either local or Google cloud (gs://) paths.
    If the file is a Hail file (.ht, .mt extensions), it checks that _SUCCESS is present.

    :param fname: File name
    :return: Whether the file exists
    """
    fext = os.path.splitext(fname)[1]
    if fext in [".ht", ".mt"]:
        fname += "/_SUCCESS"
    if fname.startswith("gs://"):
        return hl.hadoop_exists(fname)
    else:
        return os.path.isfile(fname)
def query(output): # pylint: disable=too-many-locals """Query script entry point.""" hl.init(default_reference='GRCh38') hgdp_1kg = hl.read_matrix_table(GNOMAD_HGDP_1KG_MT) tob_wgs = hl.read_matrix_table(TOB_WGS).key_rows_by('locus', 'alleles') loadings = hl.read_table(GNOMAD_LIFTOVER_LOADINGS).key_by( 'locus', 'alleles') # filter to loci that are contained in both tables and the loadings after densifying tob_wgs = hl.experimental.densify(tob_wgs) hgdp_1kg = hgdp_1kg.filter_rows( hl.is_defined(loadings.index(hgdp_1kg['locus'], hgdp_1kg['alleles'])) & hl.is_defined( tob_wgs.index_rows(hgdp_1kg['locus'], hgdp_1kg['alleles']))) tob_wgs = tob_wgs.semi_join_rows(hgdp_1kg.rows()) # Entries and columns must be identical tob_wgs_select = tob_wgs.select_entries( GT=lgt_to_gt(tob_wgs.LGT, tob_wgs.LA)) hgdp_1kg_select = hgdp_1kg.select_entries(hgdp_1kg.GT) hgdp_1kg_select = hgdp_1kg_select.select_cols() # Join datasets hgdp1kg_tobwgs_joined = hgdp_1kg_select.union_cols(tob_wgs_select) # Add in metadata information hgdp_1kg_metadata = hgdp_1kg.cols() hgdp1kg_tobwgs_joined = hgdp1kg_tobwgs_joined.annotate_cols( hgdp_1kg_metadata=hgdp_1kg_metadata[hgdp1kg_tobwgs_joined.s]) mt_path = f'{output}/hgdp1kg_tobwgs_joined_all_samples.mt' if not hl.hadoop_exists(mt_path): hgdp1kg_tobwgs_joined.write(mt_path) hgdp1kg_tobwgs_joined = hl.read_matrix_table(mt_path) # Perform PCA eigenvalues_path = f'{output}/eigenvalues.csv' scores_path = f'{output}/scores.ht' loadings_path = f'{output}/loadings.ht' eigenvalues, scores, loadings = hl.hwe_normalized_pca( hgdp1kg_tobwgs_joined.GT, compute_loadings=True, k=20) # save the list of eigenvalues eigenvalues_df = pd.DataFrame(eigenvalues) eigenvalues_df.to_csv(eigenvalues_path, index=False) # save the scores and loadings as a hail table scores.write(scores_path, overwrite=True) loadings.write(loadings_path, overwrite=True)
def file_exists(fname: str) -> bool:
    """
    Check whether a file exists.
    Supports either local or Google cloud (gs://) paths.
    If the file is a Hail file (.ht, .mt extensions), it checks that _SUCCESS is present.

    :param str fname: File name
    :return: Whether the file exists
    :rtype: bool
    """
    _, fext = os.path.splitext(fname)
    if fext in ['.ht', '.mt']:
        fname = os.path.join(fname, '_SUCCESS')
    if fname.startswith('gs://'):
        return hl.hadoop_exists(fname)
    else:
        return os.path.isfile(fname)
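# A minimal usage sketch for file_exists above; the paths are hypothetical. A Hail Table or
# MatrixTable only counts as existing once its _SUCCESS marker has been written, so a
# partially written table is treated as absent, while plain files are checked directly.
if not file_exists('gs://my-bucket/annotations.ht'):  # hypothetical path
    print('Annotations table missing or incomplete; recompute it')
if file_exists('/tmp/notes.txt'):  # local paths fall back to os.path.isfile
    print('Found local file')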
def create_rf_rank(data_type: str, run_hash: str) -> None: """ Creates a ranked table for a RF run and writes it to its correct location in annotations. :param str data_type: One of 'exomes' or 'genomes' :param str run_hash: RF run hash :return: Nothing :rtype: None """ logger.info(f"Creating rank file for {data_type} RF run {run_hash}") if not hl.hadoop_exists( f'gs://gnomad-tmp/gnomad_{data_type}_rf_{run_hash}.ht/_SUCCESS'): gnomad_ht = get_gnomad_annotations(data_type) ht = hl.read_table(rf_path(data_type, 'rf_result', run_hash=run_hash)) ht = ht.annotate(**gnomad_ht[ht.key], score=ht.rf_probability['TP']) # Write to temp location as result will be overwritten ht.write(f'gs://gnomad-tmp/gnomad_{data_type}_rf_{run_hash}.ht', overwrite=True) ht = hl.read_table(f'gs://gnomad-tmp/gnomad_{data_type}_rf_{run_hash}.ht') ht = add_rank(ht, score_expr=1 - ht.score, subrank_expr={ 'singleton_rank': ht.singleton, 'biallelic_rank': ~ht.was_split, 'biallelic_singleton_rank': ~ht.was_split & ht.singleton, 'adj_rank': ht.ac > 0, 'adj_biallelic_rank': ~ht.was_split & (ht.ac > 0), 'adj_singleton_rank': ht.singleton & (ht.ac > 0), 'adj_biallelic_singleton_rank': ~ht.was_split & ht.singleton & (ht.ac > 0) }) ht.write(rf_path(data_type, 'rf_result', run_hash=run_hash), overwrite=True)
def test_hadoop_mkdir_p(self):
    with self.assertRaises(Exception):
        hadoop_open(resource('./some2/foo/bar.txt'), 'r')
    self.assertFalse(hl.hadoop_exists(resource('./some2')))
def test_hadoop_exists(self):
    self.assertTrue(hl.hadoop_exists(resource('ls_test')))
    self.assertFalse(hl.hadoop_exists(resource('doesnt.exist')))
import jinja2
import numpy as np
import hail as hl
import plotly
import plotly.express as px
import json
from aiohttp import web
import aiohttp_jinja2

app = web.Application()
routes = web.RouteTableDef()

if not hl.hadoop_exists('bn.mt'):
    # Generate data for demonstration purposes, this should already exist
    mt = hl.balding_nichols_model(5, 100, 10000,
                                  pop_dist=[0.1, 0.2, 0.3, 0.2, 0.2],
                                  fst=[.02, .06, .04, .12, .08],
                                  af_dist=hl.rand_beta(a=0.01, b=2.0, lower=0.05, upper=1.0),
                                  mixture=True)
    mt = hl.variant_qc(mt)
    mt.write('bn.mt', overwrite=True)
mt = hl.read_matrix_table('bn.mt')

if not hl.hadoop_exists('scores.t'):
    # Generate data for demonstration purposes, this should already exist
def query(): # pylint: disable=too-many-locals """Query script entry point.""" hl.init(default_reference='GRCh38') loadings_ht = hl.read_table(LOADINGS) gtf_ht = hl.experimental.import_gtf( GTF_FILE, reference_genome='GRCh38', skip_invalid_contigs=True, min_partitions=12, ) number_of_pcs = hl.len(loadings_ht.loadings).take(1)[0] - 1 for i in range(0, (number_of_pcs)): pc = i + 1 plot_filename = output_path(f'loadings_manhattan_plot_pc{pc}.png', 'web') if not hl.hadoop_exists(plot_filename): p = manhattan_loadings( iteration=i, gtf=gtf_ht, loadings=loadings_ht, title=f'Loadings of PC{pc}', collect_all=True, ) with hl.hadoop_open(plot_filename, 'wb') as f: get_screenshot_as_png(p).save(f, format='PNG') html = file_html(p, CDN, 'my plot') plot_filename_html = output_path(f'loadings_pc{pc}.html', 'web') with hl.hadoop_open(plot_filename_html, 'w') as f: f.write(html) # Get samples which are driving loadings mt = hl.read_matrix_table(HGDP1KG_TOBWGS) scores = hl.read_table(SCORES) mt = mt.semi_join_cols(scores) loadings_ht = loadings_ht.key_by('locus') mt = mt.annotate_rows(loadings=loadings_ht[mt.locus].loadings) for dim in range(0, number_of_pcs): max_value = mt.aggregate_rows(hl.agg.stats(hl.abs( mt.loadings[dim]))).max significant_variants = mt.filter_rows( hl.abs(mt.loadings[dim]) == max_value) significant_variants = hl.sample_qc(significant_variants) significant_variant_list = significant_variants.locus.collect() print(f'PC{dim}:', significant_variant_list) heterozygous_samples = significant_variants.filter_cols( significant_variants.sample_qc.n_het > 0).s.collect() homozygous_alternate_samples = significant_variants.filter_cols( significant_variants.sample_qc.n_hom_var > 0).s.collect() if len(heterozygous_samples) > len(homozygous_alternate_samples): homozygous_alternate_samples.extend('null' for _ in range( len(heterozygous_samples) - len(homozygous_alternate_samples))) elif len(heterozygous_samples) < len(homozygous_alternate_samples): heterozygous_samples.extend('null' for _ in range( len(homozygous_alternate_samples) - len(heterozygous_samples))) # save as html html = pd.DataFrame({ 'heterozygous_samples': heterozygous_samples, 'homozygous_alternate_samples': homozygous_alternate_samples, }).to_html() plot_filename_html = output_path( f'significant_variants_non_ref_samples{dim}.html', 'web') with hl.hadoop_open(plot_filename_html, 'w') as f: f.write(html)
def exists(self, path):  # pylint: disable=no-self-use
    return hl.hadoop_exists(path)
def query(output, pop): # pylint: disable=too-many-locals """Query script entry point.""" hl.init(default_reference='GRCh38') mt = hl.read_matrix_table(HGDP1KG_TOBWGS) if pop: # Get samples from the specified population only mt = mt.filter_cols(( mt.hgdp_1kg_metadata.population_inference.pop == pop.lower()) | (mt.s.contains('TOB'))) else: mt = mt.filter_cols(mt.s.contains('TOB')) # Get allele-frequency and loadings for pc_project function mt = mt.annotate_rows(af=hl.agg.mean(mt.GT.n_alt_alleles()) / 2) loadings = hl.read_table(LOADINGS) loadings = loadings.annotate(af=mt.rows()[loadings.key].af) reprocessed_samples = hl.read_matrix_table(REPROCESSED_1KG) reprocessed_samples = hl.experimental.densify(reprocessed_samples) reprocessed_samples = reprocessed_samples.annotate_entries( GT=lgt_to_gt(reprocessed_samples.LGT, reprocessed_samples.LA)) # Project new genotypes onto loadings ht = pc_project(reprocessed_samples.GT, loadings.loadings, loadings.af) ht = ht.key_by(s=ht.s + '_reprocessed') pcs = hl.read_table(SCORES) union_scores = ht.union(pcs) union_scores = union_scores.annotate( original=(union_scores.s == 'HG01513') | (union_scores.s == 'HG02238') | (union_scores.s == 'NA12248') | (union_scores.s == 'NA20502') | (union_scores.s == 'NA20826'), reprocessed=union_scores.s.contains('reprocessed'), ) expr = ( hl.case().when( (union_scores.original) & ( union_scores.reprocessed # pylint: disable=singleton-comparison == False # noqa: E712 ), 'original', ).when( (union_scores.original == False) # pylint: disable=singleton-comparison & (union_scores.reprocessed), 'reprocessed', ).default('unedited')) union_scores = union_scores.annotate(cohort_sample_codes=expr) # get percentage of variance explained eigenvalues = hl.import_table(EIGENVALUES) eigenvalues = eigenvalues.to_pandas() eigenvalues.columns = ['eigenvalue'] eigenvalues = pd.to_numeric(eigenvalues.eigenvalue) variance = eigenvalues.divide(float(eigenvalues.sum())) * 100 variance = variance.round(2) # plot labels = union_scores.cohort_sample_codes sample_names = union_scores.s cohort_sample_codes = list(set(labels.collect())) tooltips = [('labels', '@label'), ('samples', '@samples')] for i in range(0, 10): pc1 = i pc2 = i + 1 plot_filename = (f'{output}/reprocessed_sample_projection_pc' + str(i + 1) + '.png') if not hl.hadoop_exists(plot_filename): plot = figure( title='Reprocessed Sample Projection', x_axis_label='PC' + str(pc1 + 1) + ' (' + str(variance[pc1]) + '%)', y_axis_label='PC' + str(pc2 + 1) + ' (' + str(variance[pc1]) + '%)', tooltips=tooltips, ) source = ColumnDataSource( dict( x=union_scores.scores[pc1].collect(), y=union_scores.scores[pc2].collect(), label=labels.collect(), samples=sample_names.collect(), )) plot.circle( 'x', 'y', alpha=0.5, source=source, size=8, color=factor_cmap('label', Dark2[len(cohort_sample_codes)], cohort_sample_codes), legend_group='label', ) plot.add_layout(plot.legend[0], 'left') with hl.hadoop_open(plot_filename, 'wb') as f: get_screenshot_as_png(plot).save(f, format='PNG') plot_filename_html = ('reprocessed_sample_projection_pc' + str(i + 1) + '.html') output_file(plot_filename_html) save(plot) subprocess.run(['gsutil', 'cp', plot_filename_html, output], check=False)
def run_pca_normal(dirname: str = None, basename: str = None, input_type: str = None, reference: str = 'GRCh38', maf: float = 0.05, hwe: float = 1e-3, call_rate: float = 0.98, ld_cor: float = 0.2, ld_window: int = 250000, n_pcs: int = 20, relatedness_method: str = 'pc_relate', relatedness_thresh: float = 0.98, out_dir: str = None): print('\nReading mt') if reference.lower() == 'grch37': lifted_over = f'{dirname}{basename}.liftover.grch38.mt' if not hl.hadoop_exists(lifted_over): from gwaspy.utils.reference_liftover import liftover_to_grch38 mt = liftover_to_grch38(dirname=dirname, basename=basename, input_type=input_type) else: print(f'\nFound lifted-over over file: {lifted_over}') mt = hl.read_matrix_table(lifted_over) else: from gwaspy.utils.read_file import read_infile mt = read_infile(input_type=input_type, dirname=dirname, basename=basename) print('\nFiltering mt') mt = pca_filter_mt(in_mt=mt, maf=maf, hwe=hwe, call_rate=call_rate, ld_cor=ld_cor, ld_window=ld_window) mt = relatedness_check(in_mt=mt, method=relatedness_method, outdir=out_dir, kin_estimate=relatedness_thresh) pca_snps = mt.count_rows() if pca_snps > 1000000: import warnings warnings.warn( f'Too many SNPs to be used in PCA: {pca_snps}. This will make PCA run longer' ) print('\nRunning PCA') eigenvalues, pcs, _ = hl.hwe_normalized_pca(mt.GT, k=n_pcs) pcs_ht = pcs.transmute( **{f'PC{i}': pcs.scores[i - 1] for i in range(1, n_pcs + 1)}) # add phenotype and sex to the output, using information from the mt # first check if is_case and os_female fields exist in the mt all_column_field_names = list(mt.col) # sex status is a MUST but not phenotype status if 'is_case' in all_column_field_names: ann_cols = ['is_case', 'is_female'] else: ann_cols = ['is_female'] annotations_ht = mt.cols().select(*ann_cols) if 'is_case' in all_column_field_names: pcs_ht = pcs_ht.annotate(is_case=annotations_ht[pcs_ht.s].is_case) pcs_ht = pcs_ht.annotate(is_female=annotations_ht[pcs_ht.s].is_female) print('\nSaving PC scores file') out_scores_file = f'{out_dir}GWASpy/PCA/pca_normal/{basename}.pca.normal.scores.tsv' pcs_ht.export(out_scores_file) print('\nGenerating PCA plots') pcs_scores = pd.read_table(out_scores_file, header=0, sep='\t') if 'is_case' in all_column_field_names: pcs_scores[['is_case' ]] = pcs_scores[['is_case' ]].replace([True, False, None], ['case', 'control', 'unknown']) pcs_scores[['is_female' ]] = pcs_scores[['is_female' ]].replace([True, False, None], ['female', 'male', 'unknown']) figs_dict = {} for col in ann_cols: for i in range(1, n_pcs, 2): xpc = f'PC{i}' ypc = f'PC{i + 1}' figs_dict["fig{}{}".format(col, i)] = plot_pca(pcs_scores, xpc, ypc, col) pdf = PdfPages('/tmp/pca.no.ref.plots.pdf') for figname, figure in figs_dict.items(): pdf.savefig(figure) pdf.close() hl.hadoop_copy( 'file:///tmp/pca.no.ref.plots.pdf', f'{out_dir}GWASpy/PCA/pca_normal/{basename}.pca.no.ref.plots.pdf')
def run_impute(backend: Union[hb.ServiceBackend, hb.LocalBackend] = None, input_vcf: str = None,
               females_file: str = None, n_samples: int = None, n_panel_samples: int = 4099,
               phasing_software: str = None, memory: str = 'highmem', buffer_region: int = 250,
               out_dir: str = None):
    global phased_bcf
    print(f'\n1. IMPUTATION ON {input_vcf} PHASED CHUNKS\n')
    vcf_filebase = get_vcf_filebase(input_vcf)
    impute_b = hb.Batch(backend=backend, name=f'impute-phased-chunks-{vcf_filebase}')

    # use regions file to update the regions for imputation so that there's no overlaps like in phasing
    regions = pd.read_csv(f'{out_dir}/GWASpy/{vcf_filebase}/Phasing/refscatter.bed',
                          delim_whitespace=True, names=['chrom', 'start', 'end'])
    chroms_dfs = []

    for chrom, df_group in regions.groupby('chrom'):
        # print(df_group.loc[df_group.index[0], 'end'])
        df_group.loc[df_group.index[0], 'stop'] = df_group.loc[df_group.index[0], 'end']
        for i in range(1, len(df_group)):
            df_group.loc[df_group.index[i], 'stop'] = df_group.loc[df_group.index[i - 1], 'end'] + 1
        df_group['stop'] = df_group['stop'].astype(int)
        # add index column
        df_group['ind'] = df_group.index
        # update the first line to start at 1
        df_group.loc[df_group.index[0], 'stop'] = 1
        # combine the chromosome, start, and end positions into one
        df_group['reg'] = df_group['chrom'].astype(str) + ":" + df_group['stop'].astype(str) + "-" + df_group['end'].astype(str)
        # select only the two needed columns
        regions_to_import_group = df_group[['reg', 'ind']]
        chroms_dfs.append(regions_to_import_group)

    regions_to_import = pd.concat(chroms_dfs, axis=0)
    regions_to_import = regions_to_import.sort_values('ind')
    regions_to_import.to_csv(f'{out_dir}/GWASpy/{vcf_filebase}/Imputation/imputation.regions',
                             sep='\t', header=False, index=False)

    regions_dict = pd.Series(regions_to_import.reg.values, index=regions_to_import.ind).to_dict()

    if phasing_software == 'shapeit':
        phased_vcfs_chunks = hl.utils.hadoop_ls(
            f'{out_dir}/GWASpy/{vcf_filebase}/Phasing/phased_scatter/*.shapeit.bcf')
    else:
        phased_vcfs_chunks = hl.utils.hadoop_ls(
            f'{out_dir}/GWASpy/{vcf_filebase}/Phasing/phased_scatter/*.eagle.bcf')

    for i in range(1, 24):
        if i == 23:
            chrom = 'chrX'
        else:
            chrom = f'chr{i}'

        ref_bcf = f'gs://gcp-public-data--gnomad/resources/hgdp_1kg/phased_haplotypes/hgdp.tgp.gwaspy.merged.{chrom}.merged.bcf'
        ref_size = bytes_to_gb(ref_bcf)
        ref = impute_b.read_input_group(**{'bcf': ref_bcf, 'bcf.csi': f'{ref_bcf}.csi'})

        # output is not always bcf
        phased_filename = f'{out_dir}/GWASpy/{vcf_filebase}/Phasing/phased_merged/{vcf_filebase}.{chrom}.phased.{phasing_software}'
        if hl.hadoop_exists(f'{phased_filename}.bcf'):
            phased_bcf = f'{phased_filename}.bcf'
        elif hl.hadoop_exists(f'{phased_filename}.vcf.gz'):
            phased_bcf = f'{phased_filename}.vcf.gz'

        in_vcf = impute_b.read_input_group(**{'bcf': phased_bcf, 'bcf.csi': f'{phased_bcf}.csi'})

        vcf_size = bytes_to_gb(input_vcf)
        disk_size = int(round(10.0 + 3.0 * vcf_size + ((1.0 + 2.0 * n_samples / n_panel_samples) * ref_size)))
        job_memory = memory
        job_cpu = 16 if job_memory == 'highmem' else 8

        for file in phased_vcfs_chunks:
            f = file['path']
            vcf_basename = get_vcf_filebase(f)
            file_index = int(vcf_basename.split('.')[-3])
            file_region = regions_dict[file_index]
            map_chrom = file_region.split(':')[0]

            imp_out_filename = f'{vcf_basename}.imputed.bcf'
            # file_dir = vcf_basename.split('.')[0]
            output_filepath_name = f'{out_dir}/GWASpy/{vcf_filebase}/Imputation/imputed_chunks/{imp_out_filename}'

            if map_chrom == chrom:
                # check if imputed file already exists
                if hl.hadoop_exists(output_filepath_name):
                    continue
                else:
                    if chrom == 'chrX':
                        females_in = impute_b.read_input(females_file)
                        sex_impute(b=impute_b, vcf=in_vcf, females_list=females_in,
                                   vcf_filename_no_ext=vcf_basename, ref=ref, region=file_region,
                                   buffer=buffer_region, storage=disk_size, memory=job_memory,
                                   cpu=job_cpu, out_dir=out_dir)
                    else:
                        aut_impute(b=impute_b, vcf=in_vcf, vcf_filename_no_ext=vcf_basename,
                                   ref=ref, region=file_region, chromosome=chrom,
                                   buffer=buffer_region, storage=disk_size, memory=job_memory,
                                   cpu=job_cpu, out_dir=out_dir)

    impute_b.run()
def create_binned_concordance(data_type: str, truth_sample: str, metric: str, nbins: int, overwrite: bool) -> None:
    """
    Creates and writes a concordance table binned by rank (both absolute and relative) for a given data type, truth sample and metric.

    :param str data_type: One of 'exomes' or 'genomes'
    :param str truth_sample: Which truth sample concordance to load
    :param str metric: One of the evaluation metrics (or a RF hash)
    :param int nbins: Number of bins for the rank
    :param bool overwrite: Whether to overwrite existing table
    :return: Nothing -- just writes the table
    :rtype: None
    """
    if hl.hadoop_exists(binned_concordance_path(data_type, truth_sample, metric) + '/_SUCCESS') and not overwrite:
        logger.warn(
            f"Skipping binned concordance creation as {binned_concordance_path(data_type, truth_sample, metric)} exists and overwrite=False"
        )
    else:
        ht = hl.read_table(annotations_ht_path(data_type, f'{truth_sample}_concordance'))

        # Remove 1bp indels for syndip as cannot be trusted
        if truth_sample == 'syndip':
            ht = ht.filter(
                hl.is_indel(ht.alleles[0], ht.alleles[1]) &
                (hl.abs(hl.len(ht.alleles[0]) - hl.len(ht.alleles[1])) == 1),
                keep=False)
            high_conf_intervals = hl.import_locus_intervals(syndip_high_conf_regions_bed_path)
        else:
            high_conf_intervals = hl.import_locus_intervals(NA12878_high_conf_regions_bed_path)

        lcr = hl.import_locus_intervals(lcr_intervals_path)
        segdup = hl.import_locus_intervals(segdup_intervals_path)
        ht = ht.filter(
            hl.is_defined(high_conf_intervals[ht.locus]) &
            hl.is_missing(lcr[ht.locus]) &
            hl.is_missing(segdup[ht.locus]))

        if metric in ['vqsr', 'rf_2.0.2', 'rf_2.0.2_beta', 'cnn']:
            metric_ht = hl.read_table(score_ranking_path(data_type, metric))
        else:
            metric_ht = hl.read_table(rf_path(data_type, 'rf_result', run_hash=metric))

        metric_snvs, metrics_indels = metric_ht.aggregate([
            hl.agg.count_where(hl.is_snp(metric_ht.alleles[0], metric_ht.alleles[1])),
            hl.agg.count_where(~hl.is_snp(metric_ht.alleles[0], metric_ht.alleles[1]))
        ])

        snvs, indels = ht.aggregate([
            hl.agg.count_where(hl.is_snp(ht.alleles[0], ht.alleles[1])),
            hl.agg.count_where(~hl.is_snp(ht.alleles[0], ht.alleles[1]))
        ])

        ht = ht.annotate_globals(
            global_counts=hl.struct(snvs=metric_snvs, indels=metrics_indels),
            counts=hl.struct(snvs=snvs, indels=indels))

        ht = ht.annotate(
            snv=hl.is_snp(ht.alleles[0], ht.alleles[1]),
            score=metric_ht[ht.key].score,
            global_rank=metric_ht[ht.key].rank,
            # TP => allele is found in both data sets
            n_tp=ht.concordance[3][3] + ht.concordance[3][4] + ht.concordance[4][3] + ht.concordance[4][4],
            # FP => allele is found only in test data set
            n_fp=hl.sum(ht.concordance[3][:2]) + hl.sum(ht.concordance[4][:2]),
            # FN => allele is found only in truth data set
            n_fn=hl.sum(ht.concordance[:2].map(lambda x: x[3] + x[4])))

        ht = add_rank(ht, -1.0 * ht.score)

        ht = ht.annotate(rank=[
            hl.tuple([
                'global_rank',
                (ht.global_rank + 1) / hl.cond(ht.snv,
                                               ht.globals.global_counts.snvs,
                                               ht.globals.global_counts.indels)
            ]),
            hl.tuple([
                'truth_sample_rank',
                (ht.rank + 1) / hl.cond(ht.snv,
                                        ht.globals.counts.snvs,
                                        ht.globals.counts.indels)
            ])
        ])

        ht = ht.explode(ht.rank)
        ht = ht.annotate(rank_name=ht.rank[0], bin=hl.int(ht.rank[1] * nbins))

        ht = ht.group_by('rank_name', 'snv', 'bin').aggregate(
            # Look at site-level metrics -> tp > fp > fn -- only important for multi-sample comparisons
            tp=hl.agg.count_where(ht.n_tp > 0),
            fp=hl.agg.count_where((ht.n_tp == 0) & (ht.n_fp > 0)),
            fn=hl.agg.count_where((ht.n_tp == 0) & (ht.n_fp == 0) & (ht.n_fn > 0)),
            min_score=hl.agg.min(ht.score),
            max_score=hl.agg.max(ht.score),
            n_alleles=hl.agg.count()).repartition(5)

        ht.write(binned_concordance_path(data_type, truth_sample, metric), overwrite=overwrite)
def query(output, pop): # pylint: disable=too-many-locals """Query script entry point.""" hl.init(default_reference='GRCh38') mt = hl.read_matrix_table(HGDP1KG_TOBWGS) if pop: # Get samples from the specified population only mt = mt.filter_cols(( mt.hgdp_1kg_metadata.population_inference.pop == pop.lower()) | (mt.s.contains('TOB'))) else: mt = mt.filter_cols(mt.s.contains('TOB')) mt = mt.annotate_rows(af=hl.agg.mean(mt.GT.n_alt_alleles()) / 2) loadings = hl.read_table(LOADINGS) loadings = loadings.annotate(af=mt.rows()[loadings.key].af) tob_wgs_snp_chip = hl.read_matrix_table(SNP_CHIP).key_rows_by( 'locus', 'alleles') ht = pc_project(tob_wgs_snp_chip.GT, loadings.loadings, loadings.af) ht = ht.key_by(s=ht.s + '_SNP_CHIP') pcs = hl.read_table(SCORES) union_scores = ht.union(pcs) union_scores = union_scores.annotate( snp_chip=(union_scores.s.contains('_SNP_CHIP')), tob_wgs=(union_scores.s.contains('_SNP_CHIP') | union_scores.s.contains('TOB')), ) expr = ( hl.case().when( (union_scores.snp_chip), 'snp_chip', ).when( ( union_scores.snp_chip # noqa: E501; pylint: disable=singleton-comparison; == False # noqa: E712 ) & (union_scores.tob_wgs), 'tob_wgs', ).default('hgdp_1kg')) union_scores = union_scores.annotate(cohort_sample_codes=expr) # get percentage of variance explained eigenvalues = hl.import_table(EIGENVALUES) eigenvalues = eigenvalues.to_pandas() eigenvalues.columns = ['eigenvalue'] eigenvalues = pd.to_numeric(eigenvalues.eigenvalue) variance = eigenvalues.divide(float(eigenvalues.sum())) * 100 variance = variance.round(2) # plot labels = union_scores.cohort_sample_codes sample_names = union_scores.s cohort_sample_codes = list(set(labels.collect())) tooltips = [('labels', '@label'), ('samples', '@samples')] number_of_pcs = len(eigenvalues) union_scores = union_scores.persist() for i in range(0, (number_of_pcs - 1)): pc1 = i pc2 = i + 1 plot_filename = (f'{output}/reprocessed_sample_projection_pc' + str(i + 1) + '.png') if not hl.hadoop_exists(plot_filename): plot = figure( title='SNP-Chip Sample Projection', x_axis_label='PC' + str(pc1 + 1) + ' (' + str(variance[pc1]) + '%)', y_axis_label='PC' + str(pc2 + 1) + ' (' + str(variance[pc1]) + '%)', tooltips=tooltips, ) source = ColumnDataSource( dict( x=union_scores.scores[pc1].collect(), y=union_scores.scores[pc2].collect(), label=labels.collect(), samples=sample_names.collect(), )) plot.circle( 'x', 'y', alpha=0.5, source=source, size=8, color=factor_cmap('label', Dark2[len(cohort_sample_codes)], cohort_sample_codes), legend_group='label', ) plot.add_layout(plot.legend[0], 'left') with hl.hadoop_open(plot_filename, 'wb') as f: get_screenshot_as_png(plot).save(f, format='PNG') plot_filename_html = 'snp_chip_sample_projection_pc' + str( i + 1) + '.html' output_file(plot_filename_html) save(plot) subprocess.run(['gsutil', 'cp', plot_filename_html, output], check=False)
def main(): # # Args (local) # chrom = 11 # chain_file = '/Users/em21/Projects/ot_genetics/genetics-sumstats_data/extras/prepare_uk_biobank_gwas_catalog/sitelist/input_data/grch37_to_grch38.over.chain.gz' # in_bgen = 'example_data/ukb_imp_chr{chrom}_v3.example.bgen' # in_sample = 'output/ukb_10k_downsampled.sample' # to_keep_list = 'output/ukb_10k_downsampled.sample_list.tsv' # out_plink = 'output/ukb_v3_downsampled10k_plink/ukb_v3_chr{chrom}.downsampled10k' # cores = 1 # Use "*" for all # maf_threshold = 0.001 # Args (server) chrom = sys.argv[1] chain_file = '/nfs/users/nfs_e/em21/otcoregen/em21/uk_biobank_analysis/create_10k_subsample/input_data/grch37_to_grch38.over.chain.gz' in_bgen = '/nfs/users/nfs_e/em21/otcoregen/uk_biobank_data/data/genetics/imputation/ukb_imp_chr{chrom}_v3.bgen' in_sample = '/nfs/users/nfs_e/em21/otcoregen/em21/uk_biobank_analysis/create_10k_subsample/input_data/ukb_10k_downsampled.sample' to_keep_list = '/nfs/users/nfs_e/em21/otcoregen/em21/uk_biobank_analysis/create_10k_subsample/input_data/ukb_10k_downsampled.sample_list.tsv' out_plink = '/nfs/users/nfs_e/em21/otcoregen/em21/uk_biobank_analysis/create_10k_subsample/output/ukb_v3_downsampled10k_plink/ukb_v3_chr{chrom}.downsampled10k' cores = sys.argv[2] # Use "*" for all maf_threshold = 0.001 # Set the maximum number of cores hl.init(master="local[{}]".format(cores)) # Prepare liftover rg37 = hl.get_reference('GRCh37') rg38 = hl.get_reference('GRCh38') rg37.add_liftover(chain_file, rg38) # Create my own rg38 with altered names rg38_custom_contigs = [ contig.replace('chr', '') for contig in rg38.contigs ] rg38_custom_lens = {} for contig in rg38.lengths: rg38_custom_lens[contig.replace('chr', '')] = rg38.lengths[contig] rg38_custom = hl.ReferenceGenome('rg38_custom', rg38_custom_contigs, rg38_custom_lens) print('Processing chromosome {0}'.format(chrom)) # Index bgen if not existing if not hl.hadoop_exists(in_bgen.format(chrom=chrom) + '.idx2'): hl.index_bgen(in_bgen.format(chrom=chrom), contig_recoding={ "01": "1", "02": "2", "03": "3", "04": "4", "05": "5", "06": "6", "07": "7", "08": "8", "09": "9" }, reference_genome='GRCh37') # Load bgen mt = hl.import_bgen(in_bgen.format(chrom=chrom), entry_fields=['GT'], sample_file=in_sample) # Load list samples to keep samples_to_keep = hl.import_table(to_keep_list, no_header=True, impute=False, types={ 'f0': hl.tstr }).key_by('f0') # Downsample to required subset of samples mt = mt.filter_cols(hl.is_defined(samples_to_keep[mt.s])) # Re-call to remove phasing (required for plink output) # mt = mt.annotate_entries(GT=hl.call(mt.GT[0], mt.GT[1], phased=False)) # Filter on MAF mt = hl.variant_qc(mt) mt = mt.annotate_rows(variant_qc=mt.variant_qc.annotate( MAF=hl.min(mt.variant_qc.AF))) mt = mt.filter_rows(mt.variant_qc.MAF >= maf_threshold) # Liftover mt = mt.annotate_rows(locus_GRCh38=hl.liftover(mt.locus, 'GRCh38')) # Strip chr from contig name (causes problems with GCTA) mt = mt.annotate_rows( contig_GRCh38=mt.locus_GRCh38.contig.replace('chr', '')) # Swap GRCh37 locus for GRCh38 (but have to use rg38_custom) mt = mt.key_rows_by() mt = mt.annotate_rows(locus=hl.locus(mt.contig_GRCh38, mt.locus_GRCh38.position, reference_genome=rg38_custom)) mt = mt.key_rows_by(mt.locus, mt.alleles) # Remove rows with missing locus (after liftover) mt = mt.filter_rows(hl.is_defined(mt.locus)) # Write plink format hl.export_plink(dataset=mt, output=out_plink.format(chrom=chrom)) return 0
def run_pca_project(
        ref_dirname: str = 'gs://hgdp-1kg/hgdp_tgp/datasets_for_others/lindo/ds_without_outliers/',
        ref_basename: str = 'unrelated',
        ref_info: str = 'gs://hgdp-1kg/hgdp_tgp/gwaspy_pca_ref/hgdp_1kg_sample_info.unrelateds.pca_outliers_removed.with_project.tsv',
        data_dirname: str = None,
        data_basename: str = None,
        out_dir: str = None,
        input_type: str = None,
        reference: str = 'GRCh38',
        npcs: int = 20,
        maf: float = 0.05,
        hwe: float = 1e-3,
        call_rate: float = 0.98,
        ld_cor: float = 0.2,
        ld_window: int = 250000,
        relatedness_method: str = 'pc_relate',
        relatedness_thresh: float = 0.98,
        prob_threshold: float = 0.8):
    """
    Project samples into predefined PCA space

    :param ref_dirname: directory name where reference data is
    :param ref_basename: base filename for reference data
    :param ref_info: reference sample information
    :param data_dirname: matrix table of data to project
    :param data_basename: matrix table of data to project
    :param out_dir: directory and filename prefix for where to put PCA projection output
    :param input_type: input file(s) type: hail, plink, or vcf
    :param reference: reference build
    :param npcs: number of principal components to be used in PCA
    :param maf: minor allele frequency threshold
    :param hwe: hardy-weinberg filter threshold
    :param call_rate: variant call rate filter threshold
    :param ld_cor: LD correlation threshold
    :param ld_window: window size
    :param relatedness_method: method to use for relatedness filtering
    :param relatedness_thresh: threshold to use for filtering out related individuals
    :param prob_threshold: a list of probability thresholds to use for classifying samples
    :return: a pandas Dataframe with data PCA scores projected on the same PCA space using the Human Genome Diversity
    """
    print('\nReading data mt')
    if reference.lower() == 'grch37':
        lifted_over = f'{data_dirname}{data_basename}.liftover.grch38.mt'
        if not hl.hadoop_exists(lifted_over):
            from gwaspy.utils.reference_liftover import liftover_to_grch38
            mt = liftover_to_grch38(dirname=data_dirname, basename=data_basename, input_type=input_type)
        else:
            print(f'\nFound lifted-over over file: {lifted_over}')
            mt = hl.read_matrix_table(lifted_over)
    else:
        from gwaspy.utils.read_file import read_infile
        mt = read_infile(input_type=input_type, dirname=data_dirname, basename=data_basename)

    print('\nFiltering data mt')
    mt = pca_filter_mt(in_mt=mt, maf=maf, hwe=hwe, call_rate=call_rate, ld_cor=ld_cor, ld_window=ld_window)
    mt = relatedness_check(in_mt=mt, method=relatedness_method, outdir=out_dir, kin_estimate=relatedness_thresh)

    # Intersect data with reference
    intersect_ref(ref_dirname=ref_dirname, ref_basename=ref_basename, data_mt=mt,
                  data_basename=data_basename, out_dir=out_dir)

    ref_in_data = hl.read_matrix_table(f'{out_dir}GWASpy/PCA/{data_basename}/pca_project/1kg_hgdp_intersect_{data_basename}.mt')

    print('\nComputing reference PCs')
    run_ref_pca(mt=ref_in_data, npcs=npcs, out_dir=out_dir, data_basename=data_basename)

    # project data
    pca_loadings = hl.read_table(f'{out_dir}GWASpy/PCA/{data_basename}/pca_project/1kg_hgdp_loadings.ht')
    project_mt = hl.read_matrix_table(f'{out_dir}GWASpy/PCA/{data_basename}/pca_project/{data_basename}_intersect_1kg_hgdp.mt')

    ht_projections = pc_project(mt=project_mt, loadings_ht=pca_loadings)
    ht_projections = ht_projections.transmute(**{f'PC{i}': ht_projections.scores[i - 1] for i in range(1, npcs + 1)})
    ht_projections.export(f'{out_dir}GWASpy/PCA/{data_basename}/pca_project/{data_basename}.project.pca.scores.tsv')

    ref_scores = f'{out_dir}GWASpy/PCA/{data_basename}/pca_project/1kg_hgdp.project.pca.scores.txt.bgz'
    data_scores = f'{out_dir}GWASpy/PCA/{data_basename}/pca_project/{data_basename}.project.pca.scores.tsv'
    data_ref = merge_data_with_ref(ref_scores=ref_scores, ref_info=ref_info, data_scores=data_scores)

    from gwaspy.pca.assign_pop_labels import assign_population_pcs
    pcs_df, clf = assign_population_pcs(pop_pc_pd=data_ref, num_pcs=npcs, min_prob=prob_threshold)

    data_pops = pcs_df.loc[pcs_df['SuperPop'].isnull()]
    data_pops['pop'].value_counts()
    cols = ['s', 'pop'] + [f'prob_{i}' for i in ["AFR", "AMR", "CSA", "EAS", "EUR", "MID", "OCE"]] + [f'PC{i}' for i in range(1, npcs + 1)]
    data_pops_df = data_pops[cols]
    data_pops_df.to_csv(f'{out_dir}GWASpy/PCA/{data_basename}/pca_project/pca_sup_pops_{prob_threshold}_probs.project.pca.txt',
                        sep='\t', index=False)

    print("\nGenerating PCA plots")
    data_scores_prob = f'{out_dir}GWASpy/PCA/{data_basename}/pca_project/pca_sup_pops_{prob_threshold}_probs.project.pca.txt'

    figs_dict = {}
    # plotting more than 10 PCA plots in HTML generates wobbly, large files
    for i in range(1, 10, 2):
        xpc = f'PC{i}'
        ypc = f'PC{i + 1}'
        figs_dict["fig{}{}".format(xpc, ypc)] = plot_pca_ref(data_scores=data_scores_prob,
                                                             ref_scores=ref_scores,
                                                             ref_info=ref_info,
                                                             x_pc=xpc, y_pc=ypc)
    with open('/tmp/pca.project.plots.html', 'a') as f:
        for figname, figure in figs_dict.items():
            f.write(figure.to_html(include_plotlyjs='cdn'))

    hl.hadoop_copy('file:///tmp/pca.project.plots.html',
                   f'{out_dir}GWASpy/PCA/{data_basename}/pca_project/{data_basename}.pca.project.plots.html')
def main(args):
    ########################################################################
    ### initialize
    print('Getting started: ' + datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
    # 1. Read in summary stats data
    # 2. Annotate matrix table with effect sizes for each phenotype
    # 3. Compute PRS for each
    start = time.time()
    pheno_gwas = hl.import_table(f'gs://apcdr/pheno_code_ukb_code.txt')
    pheno_ss = dict([(x.pheno_code, x.ukb_code) for x in pheno_gwas.collect()])
    #pheno_ss = dict([(x.ss_code, x.pheno_code) for x in pheno_gwas.collect()])
    # mt = hl.read_matrix_table('gs://apcdr/prs_sumstats_clumps/ukb_holdout/ukb31063.gwas_holdout_sumstats_pheno37_subset.mt')
    mt = hl.read_matrix_table('gs://apcdr/dosage_bgen/apcdr.mt')
    ss_keys = dict(zip(['CHR', 'POS', 'REF', 'ALT', 'P', 'BETA'],
                       args.chr_pos_ref_alt_p_beta.split(',')))

    for pheno in list(pheno_ss.keys()):
    #for pheno in ['WHR']:
        print('Pheno: ' + pheno + ', Time: ' + datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
        suffix_replace = args.ss_suffix.split('.')
        suffix_replace[-2] = 'clumped'
        suffix_replace = '.'.join(suffix_replace)
        if hl.hadoop_exists(args.ss_clump_prefix + pheno + suffix_replace):
            ss_path = args.ss_clump_prefix + pheno + args.ss_suffix
            clump_path = args.ss_clump_prefix + pheno + suffix_replace
        elif hl.hadoop_exists(args.ss_clump_prefix + pheno_ss[pheno] + suffix_replace):
            ss_path = args.ss_clump_prefix + pheno_ss[pheno] + args.ss_suffix
            clump_path = args.ss_clump_prefix + pheno_ss[pheno] + suffix_replace
        else:
            continue

        ss = hl.import_table(ss_path, impute=True, delimiter='\s+', min_partitions=1000)
        ss = ss.annotate(locus=hl.locus(hl.str(ss[ss_keys['CHR']]), ss[ss_keys['POS']]),
                         alleles=[ss[ss_keys['REF']], ss[ss_keys['ALT']]])
        ss = ss.key_by(ss.locus, ss.alleles)

        ## Read in summary statistics and true phenotypes
        mt_annot = mt.annotate_rows(ss=ss[mt.locus, mt.alleles])  # come back to this
        # ht_samples = hl.import_table('gs://apcdr/ukb_holdout/ukb31063.gwas_samples.gwas_vs_holdout.txt',
        #                              types={'s': hl.tstr}, key='s')
        # ht_samples = hl.import_table('gs://apcdr/ukb_holdout/ukb31063.gwas_samples.holdout_and_target.txt',
        #                              types={'s': hl.tstr}, key='s')
        #
        # mt_annot = mt_annot.filter_cols(hl.or_else(ht_samples[mt_annot.s].in_gwas != 'TRUE', True))
        # mt_annot = mt_annot.filter_cols(hl.is_defined(ht_samples[mt_annot.s]))
        #
        # print(mt.count())  # 13364303, 136265)

        print('Starting ' + pheno + ': ' + datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

        p_max = {'s1': 5e-8, 's2': 1e-6, 's3': 1e-4, 's4': 1e-3, 's5': 1e-2,
                 's6': .05, 's7': .1, 's8': .2, 's9': .5, 's10': 1}

        pheno_clump = specific_clumps(clump_path)

        mt_annot = mt_annot.filter_rows(pheno_clump.get(mt_annot.locus, False))
        # print(mt.count())

        annot_expr = {
            k: hl.agg.sum(hl.float(mt_annot.ss[ss_keys['BETA']]) * mt_annot.dosage * hl.int(mt_annot.ss[ss_keys['P']] < v))
            for k, v in p_max.items()
        }

        mt_annot = mt_annot.annotate_cols(**annot_expr)

        ht_out = mt_annot.cols()
        #ht_out.describe()
        #covs = hl.read_table('gs://apcdr/ukb_holdout/uk_round2_allSamples_phenos_phesant.ht').select('age', 'sex') # added
        # need to add in PCs
        #ht_out = ht_out.annotate(**covs[ht_out.key])
        ht_comb = ht_out.select(*p_max.keys(),
                                age=ht_out.phenotypes.age,
                                sex=ht_out.phenotypes.sex,
                                pheno=ht_out.phenotypes[pheno])

        output_location = args.ss_clump_prefix + pheno + '_apcdr_PRS'
        #ht_comb.describe()
        #ht_comb.write(output_location + '.ht', overwrite=args.overwrite)
        #ht_comb = hl.read_table(output_location + '.ht')
        ht_comb.export(output_location + '.txt.bgz')

    end = time.time()
    print("Success! Job was completed in %s" % time.strftime("%H:%M:%S", time.gmtime(end - start)))
fn = "gs://qingbowang/ems_v1_test/ems_p_causal_interpolated_{0}.tsv".format( tissue_name) with hl.hadoop_open(fn, 'r') as f: pcausal = pd.read_csv(f, sep="\t", index_col=0) pcausal["rf_score_bin"] = pcausal.index del pcausal["rf_score_bin.1"] #duplicated columns pcausal = hl.Table.from_pandas(pcausal) pcausal = pcausal.transmute( rf_score_bin=hl.format('%.3f', pcausal["rf_score_bin"])) #score all chunks: #get the max for i in range(10000): #just take the upperbound if not hl.hadoop_exists( "gs://qingbowang/ems_v1_test/ems_rawscore_gtexvg_all{0}_chunk{1}.tsv.gz" .format(tissue_name, i)): imax = i break dfall = [] for i in range(imax): print("starting chunk {0} of {1}, {2}".format(i, imax - 1, tm.ctime())) df = hl.import_table( "gs://qingbowang/ems_v1_test/ems_rawscore_gtexvg_all{0}_chunk{1}.tsv.gz" .format(tissue_name, i), force=True, impute=True) df = df.repartition(80) #80 partition for 1000mann lines df = df.annotate(rf_score_bin=hl.format('%.3f', df["0"])) pcausal = pcausal.key_by("rf_score_bin") df = df.key_by("rf_score_bin")