def test_hadoop_ls(self):
    path1 = resource('ls_test/f_50')
    ls1 = hl.hadoop_ls(path1)
    self.assertEqual(len(ls1), 1)
    self.assertEqual(ls1[0]['size_bytes'], 50)
    self.assertEqual(ls1[0]['is_dir'], False)
    self.assertTrue('path' in ls1[0])
    self.assertTrue('owner' in ls1[0])
    self.assertTrue('modification_time' in ls1[0])

    path2 = resource('ls_test')
    ls2 = hl.hadoop_ls(path2)
    self.assertEqual(len(ls2), 3)
    ls2_dict = {x['path'].split("/")[-1]: x for x in ls2}
    self.assertEqual(ls2_dict['f_50']['size_bytes'], 50)
    self.assertEqual(ls2_dict['f_100']['size_bytes'], 100)
    self.assertEqual(ls2_dict['f_100']['is_dir'], False)
    self.assertEqual(ls2_dict['subdir']['is_dir'], True)
    self.assertTrue('owner' in ls2_dict['f_50'])
    self.assertTrue('modification_time' in ls2_dict['f_50'])

    path3 = resource('ls_test/f*')
    ls3 = hl.hadoop_ls(path3)
    assert len(ls3) == 2, ls3

    with self.assertRaisesRegex(Exception, "FileNotFound"):
        hl.hadoop_ls('a_file_that_does_not_exist')
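# A minimal sketch (not part of the test above) illustrating the record
# shape that hl.hadoop_ls returns, based only on the keys the test asserts;
# the bucket paths below are hypothetical placeholders.
import hail as hl

for entry in hl.hadoop_ls('gs://my-bucket/some_dir'):  # hypothetical path
    print(entry['path'],               # full path of the file or directory
          entry['size_bytes'],         # size in bytes
          entry['is_dir'],             # True if the entry is a directory
          entry['owner'],              # owner reported by the filesystem
          entry['modification_time'])  # last-modified timestamp

# Glob patterns are also accepted, as the test's 'ls_test/f*' case shows:
matches = hl.hadoop_ls('gs://my-bucket/some_dir/f*')  # hypothetical path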
def join_clump_hts(pop, not_pop, max_pops, high_quality=False, overwrite=False):
    r'''
    Wrapper for mwzj_hts_by_tree()
    '''
    assert not (not_pop and max_pops), '`not_pop` and `max_pops` cannot both be True'
    mt_path = get_clumping_results_path(pop=pop,
                                        not_pop=not_pop,
                                        max_pops=max_pops,
                                        high_quality=high_quality)
    if hl.hadoop_is_file(f'{mt_path}/_SUCCESS') and not overwrite:
        print(f'\nMT already written to {mt_path}! To overwrite, use overwrite=True')
        return
    else:
        print(f'Writing MT to {mt_path}')
    pop = pop.upper() if pop is not None else None
    clump_results_dir = (f'{ldprune_dir}/results{"_high_quality" if high_quality else ""}/' +
                         ('max_pops' if max_pops else f'{"not_" if not_pop else ""}{pop}'))
    ls = hl.hadoop_ls(f'{clump_results_dir}/*')
    all_hts = [x['path'] for x in ls if 'clump_results.ht' in x['path']]

    temp_dir = ('gs://ukbb-diverse-temp-30day/nb-temp/' +
                ('max_pops' if max_pops else f'{"not_" if not_pop else ""}{pop}') +
                f'{"-hq" if high_quality else ""}')
    globals_for_col_key = ukb_common.PHENO_KEY_FIELDS
    mt = mwzj_hts_by_tree(all_hts=all_hts,
                          temp_dir=temp_dir,
                          globals_for_col_key=globals_for_col_key)
    # mt = resume_mwzj(temp_dir=temp_dir,  # NOTE: only use if all the temp hts have been created
    #                  globals_for_col_key=globals_for_col_key)
    mt.write(mt_path, overwrite=overwrite)
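# A hedged usage sketch for join_clump_hts(); the argument values are
# hypothetical, chosen to respect the assertion that `not_pop` and
# `max_pops` cannot both be set.
join_clump_hts(pop='EUR', not_pop=False, max_pops=False,
               high_quality=True, overwrite=False)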
def get_rows_data(rows_files):  # noqa: D103
    file_sizes = []
    partition_bounds = []
    parts_file = [x["path"] for x in rows_files if x["path"].endswith("parts")]
    if parts_file:
        parts = hl.hadoop_ls(parts_file[0])
        for i, x in enumerate(parts):
            index = x["path"].split(f"{parts_file[0]}/part-")[1].split("-")[0]
            if i < len(parts) - 1:
                test_index = (parts[i + 1]["path"].split(
                    f"{parts_file[0]}/part-")[1].split("-")[0])
                if test_index == index:
                    continue
            file_sizes.append(x["size_bytes"])
    metadata_file = [
        x["path"] for x in rows_files if x["path"].endswith("metadata.json.gz")
    ]
    if metadata_file:
        with hl.hadoop_open(metadata_file[0], "rb") as f:
            rows_meta = json.loads(f.read())
        try:
            partition_bounds = [(
                x["start"]["locus"]["contig"],
                x["start"]["locus"]["position"],
                x["end"]["locus"]["contig"],
                x["end"]["locus"]["position"],
            ) for x in rows_meta["jRangeBounds"]]
        except KeyError:
            pass
    return partition_bounds, file_sizes
def get_rows_data(rows_files):
    file_sizes = []
    partition_bounds = []
    parts_file = [x['path'] for x in rows_files if x['path'].endswith('parts')]
    if parts_file:
        parts = hl.hadoop_ls(parts_file[0])
        for i, x in enumerate(parts):
            index = x['path'].split(f'{parts_file[0]}/part-')[1].split('-')[0]
            if i < len(parts) - 1:
                test_index = parts[i + 1]['path'].split(
                    f'{parts_file[0]}/part-')[1].split('-')[0]
                if test_index == index:
                    continue
            file_sizes.append(x['size_bytes'])
    metadata_file = [
        x['path'] for x in rows_files if x['path'].endswith('metadata.json.gz')
    ]
    if metadata_file:
        with hl.hadoop_open(metadata_file[0], 'rb') as f:
            rows_meta = json.loads(f.read())
        try:
            partition_bounds = [(x['start']['locus']['contig'],
                                 x['start']['locus']['position'],
                                 x['end']['locus']['contig'],
                                 x['end']['locus']['position'])
                                for x in rows_meta['jRangeBounds']]
        except KeyError:
            pass
    return partition_bounds, file_sizes
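# A minimal usage sketch for get_rows_data(), assuming a hypothetical Table
# path; it mirrors how plot_hail_file_metadata() below feeds the function
# the hl.hadoop_ls() listing of a table's 'rows' directory.
rows_files = hl.hadoop_ls('gs://my-bucket/my_table.ht/rows')  # hypothetical path
partition_bounds, file_sizes = get_rows_data(rows_files)
# partition_bounds: list of (start_contig, start_pos, end_contig, end_pos)
#                   tuples; empty if the metadata has no jRangeBounds
# file_sizes:       one size_bytes value per distinct part file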
def create_full_results_file(prune, overwrite=False):
    r'''
    Concatenates PRS-phenotype regression results into a single table.
    '''
    reg_path_regex = prs_dir + f'prs_phen_reg.*.*.n_remove_{int(n_remove_per_sex)}.seed_*.{"" if prune else "not_"}pruned*.tsv'
    ls = hl.hadoop_ls(reg_path_regex)
    reg_paths = sorted([f['path'] for f in ls])
    df_list = []
    for reg_path in reg_paths:
        with hl.hadoop_open(reg_path) as f:
            df_list.append(pd.read_csv(f, sep='\t'))
    df = pd.concat(df_list, sort=False)
    df.insert(1, 'phen_desc',
              df.phen.astype(str).apply(lambda x: phen_dict[x][0]))  # add phenotype description to dataframe
    all_reg_results_path = prs_dir + f'prs_phen_reg.all_phens.n_remove_{int(n_remove_per_sex)}.{"" if prune else "not_"}pruned.tsv'
    if hl.hadoop_is_file(all_reg_results_path) and not overwrite:
        print('\n... Full PRS-phen regression results already written! ...')
        print(all_reg_results_path)
    else:
        print('\n... Writing PRS-phen regression results ...')
        print(all_reg_results_path)
        with hl.hadoop_open(all_reg_results_path, 'w') as f:
            df.to_csv(f, sep='\t', index=False)
def export_loo(batch_size=256, update=False):
    r'''
    For exporting p-values of meta-analysis of leave-one-out population sets
    '''
    meta_mt0 = hl.read_matrix_table(get_meta_analysis_results_path())

    meta_mt0 = meta_mt0.select_rows()

    meta_mt0 = meta_mt0.annotate_cols(pheno_id=get_pheno_id(tb=meta_mt0))
    meta_mt0 = meta_mt0.filter_cols(hl.len(meta_mt0.pheno_data.pop) == 6)

    if update:
        current_dir = f'{ldprune_dir}/loo/sumstats/batch1'  # directory of current results to update
        ss_list = hl.hadoop_ls(current_dir)
        pheno_id_list = [x['path'].replace('.tsv.bgz', '').replace(f'{current_dir}/', '')
                         for x in ss_list if 'bgz' in x['path']]
        meta_mt0 = meta_mt0.filter_cols(~hl.literal(pheno_id_list).contains(meta_mt0.pheno_id))

    meta_mt0 = meta_mt0.annotate_rows(chr=meta_mt0.locus.contig,
                                      pos=meta_mt0.locus.position,
                                      SNP=(meta_mt0.locus.contig + ':' +
                                           hl.str(meta_mt0.locus.position) + ':' +
                                           meta_mt0.alleles[0] + ':' +
                                           meta_mt0.alleles[1]))

    all_pops = sorted(['AFR', 'AMR', 'CSA', 'EAS', 'EUR', 'MID'])

    annotate_dict = {}
    '''
    pop_idx corresponds to the alphabetic ordering of the pops
    (entry with idx=0 is 6-pop meta-analysis, entry with idx=1 is 5-pop
    not-AFR meta-analysis, idx=2 is 5-pop not-AMR, etc.)
    '''
    for pop_idx, pop in enumerate(all_pops, 1):
        annotate_dict.update({f'pval_not_{pop}': meta_mt0.meta_analysis.Pvalue[pop_idx]})
    meta_mt1 = meta_mt0.annotate_entries(**annotate_dict)

    meta_mt1 = meta_mt1.key_cols_by('pheno_id')
    meta_mt1 = meta_mt1.key_rows_by().drop('locus', 'alleles', 'meta_analysis')

    meta_mt1.describe()

    batch_idx = 1
    get_export_path = lambda batch_idx: f'{ldprune_dir}/loo/sumstats/batch{batch_idx}'
    while hl.hadoop_is_dir(get_export_path(batch_idx)):
        batch_idx += 1
    print(f'\nExporting to: {get_export_path(batch_idx)}\n')
    print(meta_mt1.count_cols())
    hl.experimental.export_entries_by_col(mt=meta_mt1,
                                          path=get_export_path(batch_idx),
                                          bgzip=True,
                                          batch_size=batch_size,
                                          use_string_key_as_file_name=True,
                                          header_json_in_file=False)
def annotate_sites(ht, bed_dir="gs://gnomad-qingbowang/finucane_et_al_hg38_ht/"):
    # Currently only supports hg38. Returns an annotated hail Table.
    # Per functional annotation, annotate binary membership.
    ls = hl.hadoop_ls(bed_dir)
    for i in range(len(ls)):
        func_interval = hl.read_table(ls[i]["path"])
        func_name = ls[i]["path"].split("/")[-1].split(".")[0]
        ht = ht.annotate(func_name_tmp=hl.is_defined(func_interval[ht.locus]))  # temp for the column name
        ht = ht.rename({"func_name_tmp": func_name})  # and change it to the functional annotation name itself
        print("done {0}".format(func_name))
    return ht
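# A hedged usage sketch for annotate_sites(): `ht` is assumed to be a
# locus-keyed hail Table; the input path is hypothetical and the default
# bed_dir from the function signature is reused.
ht = hl.read_table('gs://my-bucket/sites.ht')  # hypothetical input table
ht = annotate_sites(ht)
# ht now has one Boolean column per interval table found under bed_dir,
# True wherever ht.locus falls inside that functional annotation.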
def get_paths():
    pheno_manifest = hl.import_table(f'{ldprune_dir}/phenotype_manifest.tsv.bgz',
                                     impute=True)
    pheno_manifest = pheno_manifest.filter(pheno_manifest.num_pops == 6)
    filenames = pheno_manifest.filename.collect()
    all_files = hl.hadoop_ls(loo_sumstats_dir)
    print(len(all_files))
    all_paths = list(map(lambda f: f['path'], all_files))
    print(len(all_paths))
    filename_path_dict = dict(zip([path.split('/')[-1] for path in all_paths], all_paths))
    paths = [filename_path_dict[f] for f in filenames
             if f in filename_path_dict and filename_path_dict[f] in all_paths]
    print(len(paths))
    return paths
def annotate_sites_specific_histonemark(ht,
                                        bed_dir="gs://gnomad-qingbowang/finucane_et_al_hg38_ht/",
                                        mark="H3K4me1"):
    # Per histone mark, since otherwise this will blow up.
    # Per functional annotation, annotate binary membership.
    ls = hl.hadoop_ls(bed_dir)
    for i in range(len(ls)):
        func_interval = hl.read_table(ls[i]["path"])
        func_name = ls[i]["path"].split("/")[-1].replace("_narrowpeak.ht", "")
        his_name = func_name.split("-")[-1]
        if his_name == mark:
            ht = ht.annotate(func_name_tmp=hl.is_defined(func_interval[ht.locus]))  # temp for the column name
            ht = ht.rename({"func_name_tmp": func_name})  # and change it to the functional annotation name itself
            print("done {0}".format(func_name))
    return ht
def resume_mwzj(temp_dir, globals_for_col_key):
    r'''
    For resuming multiway zip join if intermediate tables have already been written
    '''
    ls = hl.hadoop_ls(temp_dir)
    paths = [x['path'] for x in ls if 'temp_output' in x['path']]
    chunk_size = len(paths)
    outer_hts = []
    for i in range(chunk_size):
        outer_hts.append(hl.read_table(f'{temp_dir}/temp_output_{i}.ht'))
    ht = hl.Table.multi_way_zip_join(outer_hts, 'row_field_name_outer', 'global_field_name_outer')
    ht = ht.transmute(inner_row=hl.flatmap(
        lambda i: hl.cond(
            hl.is_missing(ht.row_field_name_outer[i].row_field_name),
            hl.range(0, hl.len(ht.global_field_name_outer[i].global_field_name))
              .map(lambda _: hl.null(ht.row_field_name_outer[i].row_field_name.dtype.element_type)),
            ht.row_field_name_outer[i].row_field_name),
        hl.range(hl.len(ht.global_field_name_outer))))
    ht = ht.transmute_globals(inner_global=hl.flatmap(
        lambda x: x.global_field_name, ht.global_field_name_outer))
    mt = ht._unlocalize_entries('inner_row', 'inner_global', globals_for_col_key)
    return mt
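# A minimal sketch of resuming the zip join, assuming the intermediate
# tables temp_output_0.ht ... temp_output_{n-1}.ht were already written
# under temp_dir (this matches the commented-out call in join_clump_hts()
# above, where globals_for_col_key is ukb_common.PHENO_KEY_FIELDS); the
# paths below are hypothetical.
mt = resume_mwzj(temp_dir='gs://my-bucket/nb-temp/EUR',
                 globals_for_col_key=ukb_common.PHENO_KEY_FIELDS)
mt.write('gs://my-bucket/clump_results.mt', overwrite=True)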
def prs_phen_reg(test_mt, phen, sex, n_remove, prune, percentiles, seed,
                 use_sex_spec_irnt=False, overwrite=False):
    if use_sex_spec_irnt and 'irnt' not in phen:
        print(f'NOTE: Setting use_sex_spec_irnt=False because phen {phen} is not IRNT')
    test_ht = test_mt.cols()
    reg_path = prs_dir + f'prs_phen_reg.{phen}.{sex}.n_remove_{int(n_remove_per_sex)}.seed_{seed}.{"" if prune else "not_"}pruned{".sexspecirnt" if use_sex_spec_irnt else ""}.tsv'
    if hl.hadoop_is_file(reg_path) and not overwrite:
        print(f'... Phen ~ PRS + covariates regression already complete for all gwas versions & percentiles of {phen} {sex} {"sex_spec_irnt" if use_sex_spec_irnt else ""}! ...')
    else:
        row_struct_ls = []
        gwas_versions = ['unadjusted', f'mtag_{"def" if sex != "both_sexes" else "rg1"}']
        for gwas_version in gwas_versions:
            for percentile in percentiles:
                prs_path_without_threshold = prs_dir + f'prs.{phen}.{sex}.n_remove_{int(n_remove_per_sex)}.seed_{seed}.{gwas_version}.{"" if prune else "not_"}pruned*.perc_{percentile}.tsv'
                print(prs_path_without_threshold)
                ls = hl.hadoop_ls(prs_path_without_threshold)
                print(f'WARNING: More than one file matches {prs_path_without_threshold}' if len(ls) > 1 else '')
                print('\n'.join([x['path'] for x in ls]))
                prs_path = ls[0]['path']  # default to using the first path if more than one path exists for a given p-value percentile
                pval_thresh = prs_path.split('pval_thresh_')[1].split('.perc')[0]  # previously used for the both_sexes prs
                print(f'... {phen} {sex} {gwas_version} percentile={percentile} ...')
                print(f'... using {prs_path} ...')
                print(f'... pval threshold: {pval_thresh} ...')
                prs_ht = hl.import_table(prs_path,
                                         impute=True,
                                         key='s',
                                         types={'s': hl.tstr})
                test_ht = test_ht.annotate(prs=prs_ht[test_ht.s].prs)
                cov_list = ['prs', 'age', 'age_squared'] + ['PC{:}'.format(i) for i in range(1, 21)]
                for isFemale in [0, 1]:  # test in males, then females
                    test_ht_sex = test_ht.filter(test_ht.isFemale == isFemale)
                    reg = test_ht_sex.aggregate(hl.agg.linreg(
                        y=test_ht_sex.phen,
                        x=[1] + list(map(lambda x: test_ht_sex[x] if type(x) is str else x, cov_list))))
                    print(f'\n\n... {phen} {sex} {gwas_version} percentile={percentile} ' +
                          f'applied to {"fe" if isFemale else ""}males {"using sex-spec irnt" if use_sex_spec_irnt else ""} ...\n' +
                          f'\n... multiple R^2: {reg.multiple_r_squared} ...' +
                          f'\n... pval for multiple R^2: {reg.multiple_p_value} ...' +
                          f'\n... adjusted R^2: {reg.adjusted_r_squared} ...')
                    row_struct_ls.append({'phen': phen,
                                          'gwas_sex': sex,
                                          'gwas_version': gwas_version,
                                          'sex_spec_irnt': str(use_sex_spec_irnt),
                                          'percentile': str(percentile),
                                          'pval_threshold': pval_thresh,
                                          'sex_tested_on': f'{"fe" if isFemale else ""}males',
                                          'multiple_r2': str(reg.multiple_r_squared),
                                          'multiple_r2_pval': str(reg.multiple_p_value),
                                          'adjusted_r2': str(reg.adjusted_r_squared)})
        ht = hl.Table.parallelize(hl.literal(
            row_struct_ls,
            'array<struct{phen: str, gwas_sex: str, gwas_version: str, sex_spec_irnt: str, percentile: str, pval_threshold: str, sex_tested_on: str, multiple_r2: str, multiple_r2_pval: str, adjusted_r2: str}>'))
        ht = ht.annotate(percentile=hl.float(ht.percentile),
                         pval_threshold=hl.float(ht.pval_threshold),
                         multiple_r2=hl.float(ht.multiple_r2),
                         multiple_r2_pval=hl.float(ht.multiple_r2_pval),
                         adjusted_r2=hl.float(ht.adjusted_r2))
        ht.show(12)
        print(f'\n\n... Writing regression results to {reg_path} (overwrite={overwrite}) ...')
        ht.export(reg_path)
os.chdir("/Users/weisburd/code/methods/gcnv_viewer") print(os.getcwd()) #%% #google_storage_dir = "gs://fc-secure-e2c5f2a5-2e76-4c01-a264-419262b2c7c8/dcr_tabs" #google_storage_dir = "gs://seqr-datasets-gcnv/GRCh38/RDG_WES_Broad_Internal/v1/beds" google_storage_dir = "gs://seqr-datasets-gcnv/GRCh38/RDG_WES_Broad_Internal/v3/beds" assert hl.hadoop_is_dir(google_storage_dir) #%% batch_name_to_path_and_samples = {} for result in hl.hadoop_ls(google_storage_dir): if not result['path'].endswith('.bed.gz') and not result['path'].endswith('.bed'): continue if result['size_bytes'] < 1000: print(f"ERROR: file size of {result['path']} is too small: {result['size_bytes']}") with hl.hadoop_open(result['path'], 'r') as f: line = f.readline() fields = line.rstrip("\n").split("\t") sample_ids = fields[3:] batch_name = os.path.basename(result['path']).replace(".dcr", "").replace(".bed", "").replace(".gz", "") batch_name_to_path_and_samples[batch_name] = (result['path'], sample_ids)
ht = hl.read_table("gs://gnomad-bw2/gnomad_v3_1_readviz_crams__that_failed_AB_filter_exploded_keyed_by_sample.ht") current_samples_v31 = ht.distinct().S.collect() # 4,445 samples len(set(current_samples_v31) - set(v31_release_samples)) # 919 samples ht = hl.read_table("gs://gnomad-bw2/gnomad_v3_readviz_crams__that_failed_AB_filter_exploded_keyed_by_sample.ht") current_samples_v3 = ht.distinct().S.collect() # 68,639 samples len(set(current_samples_v3) - set(v31_release_samples)) # 1,655 """ #%% tsv_paths = hl.hadoop_ls("gs://gnomad-bw2/gnomad_v3_1_readviz_tsvs") #%% path_tuples = [(os.path.basename(t['path']).replace(".tsv.bgz", ""), t['path']) for t in tsv_paths] df = pd.DataFrame(path_tuples, columns=['entity:participant_id', 'variants_tsv_bgz']) df = df.set_index('entity:participant_id') #%% df2 = pd.read_table("./metadata/v3_1_new_releasable_cram_paths_with_sex.txt").rename(columns={'CRAM': 'cram_path', 'CRAI': 'crai_path'}) df2 = df2[['sample_id', 'cram_path', 'crai_path']] df2 = df2.set_index('sample_id') #%%
def plot_hail_file_metadata(
        t_path: str) -> Optional[Union[Grid, Tabs, bokeh.plotting.Figure]]:
    """
    Takes path to hail Table or MatrixTable (gs://bucket/path/hail.mt),
    outputs Grid or Tabs, respectively.
    Or if an unordered Table is provided, a Figure with file sizes is output.
    If metadata file or rows directory is missing, returns None.
    """
    panel_size = 600
    subpanel_size = 150

    files = hl.hadoop_ls(t_path)
    rows_file = [x['path'] for x in files if x['path'].endswith('rows')]
    entries_file = [x['path'] for x in files if x['path'].endswith('entries')]
    # cols_file = [x['path'] for x in files if x['path'].endswith('cols')]
    success_file = [
        x['modification_time'] for x in files if x['path'].endswith('SUCCESS')
    ]

    data_type = 'Table'

    metadata_file = [
        x['path'] for x in files if x['path'].endswith('metadata.json.gz')
    ]
    if not metadata_file:
        warnings.warn('No metadata file found. Exiting...')
        return None

    with hl.hadoop_open(metadata_file[0], 'rb') as f:
        overall_meta = json.loads(f.read())
        rows_per_partition = overall_meta['components']['partition_counts']['counts']

    if not rows_file:
        warnings.warn('No rows directory found. Exiting...')
        return None
    rows_files = hl.hadoop_ls(rows_file[0])

    if entries_file:
        data_type = 'MatrixTable'
        rows_file = [x['path'] for x in rows_files if x['path'].endswith('rows')]
        rows_files = hl.hadoop_ls(rows_file[0])
    row_partition_bounds, row_file_sizes = get_rows_data(rows_files)

    total_file_size, row_file_sizes, row_scale = scale_file_sizes(row_file_sizes)

    if not row_partition_bounds:
        warnings.warn('Table is not partitioned. Only plotting file sizes')
        row_file_sizes_hist, row_file_sizes_edges = np.histogram(row_file_sizes, bins=50)
        p_file_size = figure(plot_width=panel_size, plot_height=panel_size)
        p_file_size.quad(right=row_file_sizes_hist,
                         left=0,
                         bottom=row_file_sizes_edges[:-1],
                         top=row_file_sizes_edges[1:],
                         fill_color="#036564",
                         line_color="#033649")
        p_file_size.yaxis.axis_label = f'File size ({row_scale}B)'
        return p_file_size

    all_data = {
        'partition_widths':
            [-1 if x[0] != x[2] else x[3] - x[1] for x in row_partition_bounds],
        'partition_bounds':
            [f'{x[0]}:{x[1]}-{x[2]}:{x[3]}' for x in row_partition_bounds],
        'spans_chromosome': [
            'Spans chromosomes' if x[0] != x[2] else 'Within chromosome'
            for x in row_partition_bounds
        ],
        'row_file_sizes': row_file_sizes,
        'row_file_sizes_human':
            [f'{x:.1f} {row_scale}B' for x in row_file_sizes],
        'rows_per_partition': rows_per_partition,
        'index': list(range(len(rows_per_partition)))
    }

    if entries_file:
        entries_rows_files = hl.hadoop_ls(entries_file[0])
        entries_rows_file = [
            x['path'] for x in entries_rows_files if x['path'].endswith('rows')
        ]
        if entries_rows_file:
            entries_files = hl.hadoop_ls(entries_rows_file[0])
            entry_partition_bounds, entry_file_sizes = get_rows_data(entries_files)
            total_entry_file_size, entry_file_sizes, entry_scale = scale_file_sizes(entry_file_sizes)
            all_data['entry_file_sizes'] = entry_file_sizes
            all_data['entry_file_sizes_human'] = [
                f'{x:.1f} {entry_scale}B' for x in entry_file_sizes
            ]

    title = f'{data_type}: {t_path}'

    msg = (f"Rows: {sum(all_data['rows_per_partition']):,}<br/>"
           f"Partitions: {len(all_data['rows_per_partition']):,}<br/>"
           f"Size: {total_file_size}<br/>")
    if success_file:
        msg += success_file[0]

    source = ColumnDataSource(pd.DataFrame(all_data))
    p = figure(tools=TOOLS, plot_width=panel_size, plot_height=panel_size)
    p.title.text = title
    p.xaxis.axis_label = 'Number of rows'
    p.yaxis.axis_label = f'File size ({row_scale}B)'
    color_map = factor_cmap('spans_chromosome',
                            palette=Spectral8,
                            factors=list(set(all_data['spans_chromosome'])))
    p.scatter('rows_per_partition',
              'row_file_sizes',
              color=color_map,
              legend='spans_chromosome',
              source=source)
    p.legend.location = 'bottom_right'
    p.select_one(HoverTool).tooltips = [
        (x, f'@{x}') for x in ('rows_per_partition', 'row_file_sizes_human',
                               'partition_bounds', 'index')
    ]

    p_stats = Div(text=msg)
    p_rows_per_partition = figure(x_range=p.x_range,
                                  plot_width=panel_size,
                                  plot_height=subpanel_size)
    p_file_size = figure(y_range=p.y_range,
                         plot_width=subpanel_size,
                         plot_height=panel_size)

    rows_per_partition_hist, rows_per_partition_edges = np.histogram(
        all_data['rows_per_partition'], bins=50)
    p_rows_per_partition.quad(top=rows_per_partition_hist,
                              bottom=0,
                              left=rows_per_partition_edges[:-1],
                              right=rows_per_partition_edges[1:],
                              fill_color="#036564",
                              line_color="#033649")
    row_file_sizes_hist, row_file_sizes_edges = np.histogram(
        all_data['row_file_sizes'], bins=50)
    p_file_size.quad(right=row_file_sizes_hist,
                     left=0,
                     bottom=row_file_sizes_edges[:-1],
                     top=row_file_sizes_edges[1:],
                     fill_color="#036564",
                     line_color="#033649")

    rows_grid = gridplot([[p_rows_per_partition, p_stats], [p, p_file_size]])

    if 'entry_file_sizes' in all_data:
        title = f'Statistics for {data_type}: {t_path}'

        msg = (f"Rows: {sum(all_data['rows_per_partition']):,}<br/>"
               f"Partitions: {len(all_data['rows_per_partition']):,}<br/>"
               f"Size: {total_entry_file_size}<br/>")
        if success_file:
            msg += success_file[0]

        source = ColumnDataSource(pd.DataFrame(all_data))
        panel_size = 600
        subpanel_size = 150
        p = figure(tools=TOOLS, plot_width=panel_size, plot_height=panel_size)
        p.title.text = title
        p.xaxis.axis_label = 'Number of rows'
        p.yaxis.axis_label = f'File size ({entry_scale}B)'
        color_map = factor_cmap('spans_chromosome',
                                palette=Spectral8,
                                factors=list(set(all_data['spans_chromosome'])))
        p.scatter('rows_per_partition',
                  'entry_file_sizes',
                  color=color_map,
                  legend='spans_chromosome',
                  source=source)
        p.legend.location = 'bottom_right'
        p.select_one(HoverTool).tooltips = [
            (x, f'@{x}') for x in ('rows_per_partition', 'entry_file_sizes_human',
                                   'partition_bounds', 'index')
        ]

        p_stats = Div(text=msg)
        p_rows_per_partition = figure(x_range=p.x_range,
                                      plot_width=panel_size,
                                      plot_height=subpanel_size)
        p_rows_per_partition.quad(top=rows_per_partition_hist,
                                  bottom=0,
                                  left=rows_per_partition_edges[:-1],
                                  right=rows_per_partition_edges[1:],
                                  fill_color="#036564",
                                  line_color="#033649")
        p_file_size = figure(y_range=p.y_range,
                             plot_width=subpanel_size,
                             plot_height=panel_size)
        row_file_sizes_hist, row_file_sizes_edges = np.histogram(
            all_data['entry_file_sizes'], bins=50)
        p_file_size.quad(right=row_file_sizes_hist,
                         left=0,
                         bottom=row_file_sizes_edges[:-1],
                         top=row_file_sizes_edges[1:],
                         fill_color="#036564",
                         line_color="#033649")
        entries_grid = gridplot([[p_rows_per_partition, p_stats], [p, p_file_size]])

        return Tabs(tabs=[
            Panel(child=entries_grid, title='Entries'),
            Panel(child=rows_grid, title='Rows')
        ])
    else:
        return rows_grid
def plot_hail_file_metadata(
    t_path: str,
) -> Optional[Union[Grid, Tabs, bokeh.plotting.Figure]]:
    """
    Take path to hail Table or MatrixTable (gs://bucket/path/hail.mt),
    output Grid or Tabs, respectively.
    Or if an unordered Table is provided, a Figure with file sizes is output.
    If metadata file or rows directory is missing, returns None.
    """
    panel_size = 600
    subpanel_size = 150

    files = hl.hadoop_ls(t_path)
    rows_file = [x["path"] for x in files if x["path"].endswith("rows")]
    entries_file = [x["path"] for x in files if x["path"].endswith("entries")]
    # cols_file = [x['path'] for x in files if x['path'].endswith('cols')]
    success_file = [
        x["modification_time"] for x in files if x["path"].endswith("SUCCESS")
    ]

    data_type = "Table"

    metadata_file = [
        x["path"] for x in files if x["path"].endswith("metadata.json.gz")
    ]
    if not metadata_file:
        logger.warning("No metadata file found. Exiting...")
        return None

    with hl.hadoop_open(metadata_file[0], "rb") as f:
        overall_meta = json.loads(f.read())
        rows_per_partition = overall_meta["components"]["partition_counts"]["counts"]

    if not rows_file:
        logger.warning("No rows directory found. Exiting...")
        return None
    rows_files = hl.hadoop_ls(rows_file[0])

    if entries_file:
        data_type = "MatrixTable"
        rows_file = [x["path"] for x in rows_files if x["path"].endswith("rows")]
        rows_files = hl.hadoop_ls(rows_file[0])
    row_partition_bounds, row_file_sizes = get_rows_data(rows_files)

    total_file_size, row_file_sizes, row_scale = scale_file_sizes(row_file_sizes)

    if not row_partition_bounds:
        logger.warning("Table is not partitioned. Only plotting file sizes")
        row_file_sizes_hist, row_file_sizes_edges = np.histogram(row_file_sizes, bins=50)
        p_file_size = figure(plot_width=panel_size, plot_height=panel_size)
        p_file_size.quad(
            right=row_file_sizes_hist,
            left=0,
            bottom=row_file_sizes_edges[:-1],
            top=row_file_sizes_edges[1:],
            fill_color="#036564",
            line_color="#033649",
        )
        p_file_size.yaxis.axis_label = f"File size ({row_scale}B)"
        return p_file_size

    all_data = {
        "partition_widths":
            [-1 if x[0] != x[2] else x[3] - x[1] for x in row_partition_bounds],
        "partition_bounds":
            [f"{x[0]}:{x[1]}-{x[2]}:{x[3]}" for x in row_partition_bounds],
        "spans_chromosome": [
            "Spans chromosomes" if x[0] != x[2] else "Within chromosome"
            for x in row_partition_bounds
        ],
        "row_file_sizes": row_file_sizes,
        "row_file_sizes_human":
            [f"{x:.1f} {row_scale}B" for x in row_file_sizes],
        "rows_per_partition": rows_per_partition,
        "index": list(range(len(rows_per_partition))),
    }

    if entries_file:
        entries_rows_files = hl.hadoop_ls(entries_file[0])
        entries_rows_file = [
            x["path"] for x in entries_rows_files if x["path"].endswith("rows")
        ]
        if entries_rows_file:
            entries_files = hl.hadoop_ls(entries_rows_file[0])
            entry_partition_bounds, entry_file_sizes = get_rows_data(entries_files)
            total_entry_file_size, entry_file_sizes, entry_scale = scale_file_sizes(entry_file_sizes)
            all_data["entry_file_sizes"] = entry_file_sizes
            all_data["entry_file_sizes_human"] = [
                f"{x:.1f} {entry_scale}B" for x in entry_file_sizes
            ]

    title = f"{data_type}: {t_path}"

    msg = (f"Rows: {sum(all_data['rows_per_partition']):,}<br/>"
           f"Partitions: {len(all_data['rows_per_partition']):,}<br/>"
           f"Size: {total_file_size}<br/>")
    if success_file:
        msg += success_file[0]

    source = ColumnDataSource(pd.DataFrame(all_data))
    p = figure(tools=TOOLS, plot_width=panel_size, plot_height=panel_size)
    p.title.text = title
    p.xaxis.axis_label = "Number of rows"
    p.yaxis.axis_label = f"File size ({row_scale}B)"
    color_map = factor_cmap(
        "spans_chromosome",
        palette=Spectral8,
        factors=list(set(all_data["spans_chromosome"])),
    )
    p.scatter(
        "rows_per_partition",
        "row_file_sizes",
        color=color_map,
        legend="spans_chromosome",
        source=source,
    )
    p.legend.location = "bottom_right"
    p.select_one(HoverTool).tooltips = [(x, f"@{x}") for x in (
        "rows_per_partition",
        "row_file_sizes_human",
        "partition_bounds",
        "index",
    )]

    p_stats = Div(text=msg)
    p_rows_per_partition = figure(x_range=p.x_range,
                                  plot_width=panel_size,
                                  plot_height=subpanel_size)
    p_file_size = figure(y_range=p.y_range,
                         plot_width=subpanel_size,
                         plot_height=panel_size)

    rows_per_partition_hist, rows_per_partition_edges = np.histogram(
        all_data["rows_per_partition"], bins=50)
    p_rows_per_partition.quad(
        top=rows_per_partition_hist,
        bottom=0,
        left=rows_per_partition_edges[:-1],
        right=rows_per_partition_edges[1:],
        fill_color="#036564",
        line_color="#033649",
    )
    row_file_sizes_hist, row_file_sizes_edges = np.histogram(
        all_data["row_file_sizes"], bins=50)
    p_file_size.quad(
        right=row_file_sizes_hist,
        left=0,
        bottom=row_file_sizes_edges[:-1],
        top=row_file_sizes_edges[1:],
        fill_color="#036564",
        line_color="#033649",
    )

    rows_grid = gridplot([[p_rows_per_partition, p_stats], [p, p_file_size]])

    if "entry_file_sizes" in all_data:
        title = f"Statistics for {data_type}: {t_path}"

        msg = (f"Rows: {sum(all_data['rows_per_partition']):,}<br/>"
               f"Partitions: {len(all_data['rows_per_partition']):,}<br/>"
               f"Size: {total_entry_file_size}<br/>")
        if success_file:
            msg += success_file[0]

        source = ColumnDataSource(pd.DataFrame(all_data))
        panel_size = 600
        subpanel_size = 150
        p = figure(tools=TOOLS, plot_width=panel_size, plot_height=panel_size)
        p.title.text = title
        p.xaxis.axis_label = "Number of rows"
        p.yaxis.axis_label = f"File size ({entry_scale}B)"
        color_map = factor_cmap(
            "spans_chromosome",
            palette=Spectral8,
            factors=list(set(all_data["spans_chromosome"])),
        )
        p.scatter(
            "rows_per_partition",
            "entry_file_sizes",
            color=color_map,
            legend="spans_chromosome",
            source=source,
        )
        p.legend.location = "bottom_right"
        p.select_one(HoverTool).tooltips = [(x, f"@{x}") for x in (
            "rows_per_partition",
            "entry_file_sizes_human",
            "partition_bounds",
            "index",
        )]

        p_stats = Div(text=msg)
        p_rows_per_partition = figure(x_range=p.x_range,
                                      plot_width=panel_size,
                                      plot_height=subpanel_size)
        p_rows_per_partition.quad(
            top=rows_per_partition_hist,
            bottom=0,
            left=rows_per_partition_edges[:-1],
            right=rows_per_partition_edges[1:],
            fill_color="#036564",
            line_color="#033649",
        )
        p_file_size = figure(y_range=p.y_range,
                             plot_width=subpanel_size,
                             plot_height=panel_size)
        row_file_sizes_hist, row_file_sizes_edges = np.histogram(
            all_data["entry_file_sizes"], bins=50)
        p_file_size.quad(
            right=row_file_sizes_hist,
            left=0,
            bottom=row_file_sizes_edges[:-1],
            top=row_file_sizes_edges[1:],
            fill_color="#036564",
            line_color="#033649",
        )
        entries_grid = gridplot([[p_rows_per_partition, p_stats], [p, p_file_size]])

        return Tabs(tabs=[
            Panel(child=entries_grid, title="Entries"),
            Panel(child=rows_grid, title="Rows"),
        ])
    else:
        return rows_grid
def get_ss_path_list(sumstats_dir):
    ss_list = hl.hadoop_ls(sumstats_dir)
    ss_path_list = [x['path'] for x in ss_list if 'bgz' in x['path']]
    print(f'\nNumber of sumstats files: {len(ss_path_list)}\n')
    return ss_path_list
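# A minimal usage sketch, with a hypothetical sumstats directory; the
# returned paths could then be fed to hl.import_table (e.g. with
# force_bgz=True for .bgz files).
ss_path_list = get_ss_path_list('gs://my-bucket/loo/sumstats/batch1')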
def prs(mt, phen, sex, n_remove, prune, percentiles, seed, count=True):
    r'''
    Calculate PRS using betas from both sexes and sex-stratified GWAS,
    as well as MTAG meta-analyzed betas. PRS are always calculated on
    both sexes, regardless of the sex the GWAS was run on. P-value
    thresholds are determined by percentile.
    Set `count`=True if running this for the first time, to be sure that
    numbers make sense. To speed up, set count=False.
    '''
    assert sex in ['both_sexes', 'female', 'male'], (
        f'WARNING: sex={sex} not allowed. sex must be one of the following: both_sexes, female, male')

    # "def" uses the MTAG results created by using the default settings
    # "rg1" uses the MTAG results created by using the --perfect-gencov flag
    gwas_versions = ['unadjusted', f'mtag_{"rg1" if sex == "both_sexes" else "def"}']

    for gwas_version in gwas_versions:
        print(f'\n... Calculating PRS for "{phen_dict[phen][0]}" {sex} {gwas_version} ...\n')
        gwas_version_suffix = '' if gwas_version == 'unadjusted' else '.' + gwas_version
        gwas_path = (prs_dir +
                     f'{phen}.gwas.{sex}.n_remove_{n_remove_per_sex}.seed_{seed}{gwas_version_suffix}.tsv.{"b" if gwas_version == "unadjusted" else ""}gz')
        ss = hl.import_table(gwas_path,
                             impute=True,
                             key='snpid' if gwas_version == 'unadjusted' else 'SNP',
                             force=True)

        if prune:
            print('\n... Pruning SNPs ...\n')
            # define the set of SNPs
            pruned_snps_path = 'gs://nbaya/risk_gradients/ukb_imp_v3_pruned.bim'  # from Robert Maier (pruning threshold r2=0.2, random 10k UKB sample), download here: https://github.com/nikbaya/split/blob/master/ukb_imp_v3_pruned.bim.gz
            variants = hl.import_table(pruned_snps_path,
                                       delimiter='\t',
                                       no_header=True,
                                       impute=True)
            print(f'\n... Pruning to variants in {pruned_snps_path} ...\n')
            variants = variants.rename({
                'f0': 'chr',
                'f1': 'rsid',
                'f3': 'pos'
            }).key_by('rsid')
            # mt = mt.key_rows_by('rsid')
            # filter to variants defined in variants table
            ss = ss.filter(hl.is_defined(variants[ss['snpid' if gwas_version == 'unadjusted' else 'SNP']]))
            if count:
                ct_rows = ss.count()
                print(f'\n\n... SNP count after pruning filter: {ct_rows} ...\n')
        else:
            print(f'\n... Not pruning because prune={prune} ...\n')

        for percentile in percentiles:
            # use path without threshold to check if PRS was already run,
            # because it doesn't require calculating the pval threshold
            prs_path_without_threshold = (prs_dir +
                                          f'prs.{phen}.{sex}.n_remove_{int(n_remove_per_sex)}.seed_{seed}.{gwas_version}.{"" if prune else "not_"}pruned.pval_thresh_*.perc_{percentile}.tsv')
            # prs_path_without_threshold = (prs_dir + f'prs.{phen}.{sex}.n_remove_{int(n_remove_per_sex)}.seed_{seed}.{gwas_version}.{"" if prune else "not_"}pruned.pval_thresh_*.perc_{percentile}.opposite_sex.tsv')
            if len(hl.hadoop_ls(prs_path_without_threshold)) > 0:
                print(f'\n\n... Calculation of PRS for "{phen_dict[phen][0]}" {sex} {gwas_version} for percentile {percentile} already completed! ...\n')
            else:
                start = dt.now()
                if percentile != 1:
                    threshold = ss.aggregate(hl.agg.approx_quantiles(
                        ss[('' if gwas_version == 'unadjusted' else 'mtag_') + 'pval'],
                        percentile))
                    ss = ss.filter(ss[('' if gwas_version == 'unadjusted' else 'mtag_') + 'pval'] <= threshold)
                else:
                    threshold = 1
                threshold_str = '{:.4e}'.format(threshold)
                prs_path = prs_dir + f'prs.{phen}.{sex}.n_remove_{int(n_remove_per_sex)}.seed_{seed}.{gwas_version}.{"" if prune else "not_"}pruned.pval_thresh_{threshold_str}.perc_{percentile}.tsv'
                print(f'\n\n... Using p-value threshold of {threshold} for percentile {percentile} ...\n')
                mt = mt.annotate_rows(beta=ss[mt.rsid]['beta' if gwas_version == 'unadjusted' else 'mtag_beta'])
                if count:
                    if percentile != 1:
                        threshold_ct = mt.filter_rows(hl.is_defined(mt.beta)).count_rows()
                    else:
                        threshold_ct = ct_rows
                    print(f'\n\n... Variants remaining after thresholding filter: {threshold_ct} ...\n')
                mt = mt.annotate_cols(prs=hl.agg.sum(mt.dosage * mt.beta))
                if count:
                    mt_cols_ct = mt.filter_cols(hl.is_defined(mt.prs)).count_cols()
                    print(f'\n\n... Samples with PRS: {mt_cols_ct} ...\n')
                mt.cols().describe()
                mt.cols().select('prs').export(prs_path)
                elapsed = dt.now() - start
                print(f'\n\n... Completed calculation of PRS for "{phen_dict[phen][0]}" {sex} {gwas_version} ...')
                print(f'\n... Elapsed time: {round(elapsed.seconds/60, 2)} min ...\n')
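# A hedged usage sketch for prs(); every argument value here is
# hypothetical (including the phenotype code), and `test_mt` is assumed to
# be a dosage MatrixTable keyed by rsid containing the samples to score.
prs(mt=test_mt, phen='50_irnt', sex='both_sexes', n_remove=n_remove_per_sex,
    prune=True, percentiles=[0.1, 0.5, 1], seed=1, count=True)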