def test_row_count(foods_csv: str) -> None:
    df = pl.read_csv(foods_csv, row_count_name="row_count")
    assert df["row_count"].to_list() == list(range(27))

    df = (
        pl.scan_csv(foods_csv, row_count_name="row_count")
        .filter(pl.col("category") == pl.lit("vegetables"))
        .collect()
    )
    assert df["row_count"].to_list() == [0, 6, 11, 13, 14, 20, 25]

    df = (
        pl.scan_csv(foods_csv, row_count_name="row_count")
        .with_row_count("foo", 10)
        .filter(pl.col("category") == pl.lit("vegetables"))
        .collect()
    )
    assert df["foo"].to_list() == [10, 16, 21, 23, 24, 30, 35]
def load_dir(pheno, region, dir_):
    with open(f'{dir_}/converged.txt') as converged:
        assert converged.read().strip() == 'TRUE'
    alphas = pl.scan_csv(
        f'{dir_}/alpha.tab', sep='\t', has_header=False
    ).collect().to_numpy().T
    susie_pips = 1 - np.prod(1 - alphas, axis=1)

    df = pl.scan_csv(
        f'{dir_}/colnames.txt',
        has_header=False,
        with_column_names=lambda _: ['var_name']
    ).with_column(
        pl.lit(1).alias('row_number')
    ).with_columns([
        pl.col('row_number').cumsum(),
        pl.lit(None, int).alias('cs_num'),
        pl.lit(region).alias('region'),
        pl.lit(pheno).alias('phenotype'),
        pl.Series(susie_pips).alias('susie_pip'),
        pl.lit(None, float).alias('susie_cs_pip')
    ])

    for cs_num in range(1, 51):
        cs_fname = f'{dir_}/cs{cs_num}.txt'
        if not os.path.exists(cs_fname):
            continue
        with open(cs_fname) as cs:
            var_nums = [int(var_num) for var_num in next(cs).strip().split()]
            next(cs)
            min_ld = float(next(cs).split()[0])
        if min_ld < min_ld_thresh:
            continue
        df = df.with_columns([
            pl.when(pl.col('row_number').is_in(var_nums)).then(
                pl.when(~pl.col('cs_num').is_null()).then(-1).otherwise(cs_num)
            ).otherwise(pl.col('cs_num')).alias('cs_num'),
            pl.when(pl.col('row_number').is_in(var_nums)).then(
                pl.Series(alphas[:, cs_num - 1])
            ).otherwise(pl.col('susie_cs_pip')).alias('susie_cs_pip')
        ])

    df = df.with_column(
        pl.when(pl.col('cs_num') != -1).then(
            pl.col('susie_cs_pip')
        ).otherwise(-1).alias('susie_cs_pip'))
    df = df.filter(
        pl.col('var_name').str.contains('^STR')
        & ~pl.col('cs_num').is_null()
        & (pl.col('susie_pip') > 0.05)
    ).drop('row_number')
    return df
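# A minimal usage sketch of load_dir (hypothetical: the phenotype, region name,
# and directory layout below are assumptions based on the reads inside the
# function, and min_ld_thresh is a module-level global it expects to exist).
min_ld_thresh = 0.5
str_hits = load_dir('height', '1_100000_200000',
                    'finemapping/susie_results/height/1_100000_200000')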
def test_scan_csv_schema_overwrite_and_dtypes_overwrite(foods_csv: str) -> None:
    assert (
        pl.scan_csv(
            foods_csv,
            dtypes={"calories_foo": pl.Utf8, "fats_g_foo": pl.Float32},
            with_column_names=lambda names: [f"{a}_foo" for a in names],
        )
        .collect()
        .dtypes
    ) == [pl.Utf8, pl.Utf8, pl.Float32, pl.Int64]
def test_invalid_utf8() -> None:
    np.random.seed(1)
    bts = bytes(np.random.randint(0, 255, 200))
    file = path.join(path.dirname(__file__), "nonutf8.csv")
    with open(file, "wb") as f:
        f.write(bts)
    a = pl.read_csv(file, has_headers=False, encoding="utf8-lossy")
    b = pl.scan_csv(file, has_headers=False, encoding="utf8-lossy").collect()
    assert a.frame_equal(b, null_equal=True)
def test_csv_schema_offset(foods_csv: str) -> None:
    csv = """metadata
line
foo,bar
1,2
3,4
5,6
""".encode()
    df = pl.read_csv(csv, skip_rows=2)
    assert df.columns == ["foo", "bar"]
    assert df.shape == (3, 2)

    df = pl.read_csv(csv, skip_rows=2, skip_rows_after_header=2)
    assert df.columns == ["foo", "bar"]
    assert df.shape == (1, 2)

    df = pl.scan_csv(foods_csv, skip_rows=4).collect()
    assert df.columns == ["fruit", "60", "0", "11"]
    assert df.shape == (23, 4)

    df = pl.scan_csv(foods_csv, skip_rows_after_header=10).collect()
    assert df.columns == ["category", "calories", "fats_g", "sugars_g"]
    assert df.shape == (17, 4)
def get_str_loci(phenotype, my_str_fname, thresh):
    p_col = f'p_{phenotype}'
    csv = pl.scan_csv(
        my_str_fname,
        sep='\t',
        dtypes={'alleles': str, 'locus_filtered': str}
    ).filter(
        pl.col(p_col) <= thresh
    ).with_column(
        pl.when(pl.col(p_col) <= 1e-300)
        .then(0)
        .otherwise(pl.col(p_col))
        .alias(p_col)
    ).collect().to_dict(as_series=False)
    return sortedcontainers.SortedSet(
        iterable=zip(csv[p_col], csv['chrom'], csv['pos'],
                     itertools.repeat('STR'))
    )
def get_snp_loci(plink_imputed_snp_fname, thresh):
    csv = pl.scan_csv(
        plink_imputed_snp_fname,
        sep='\t',
        null_values='NA'
    ).filter(
        pl.col('P') <= thresh
    ).with_column(
        pl.when(pl.col('P') <= 1e-300)
        .then(0)
        .otherwise(pl.col('P'))
        .alias('P')
    ).filter(
        pl.col('ERRCODE') != 'CONST_OMITTED_ALLELE'
    ).collect()
    assert np.all((csv['ERRCODE'] == '.').to_numpy())
    dict_csv = csv.to_dict(as_series=False)
    return sortedcontainers.SortedSet(
        iterable=zip(dict_csv['P'], dict_csv['#CHROM'], dict_csv['POS'],
                     itertools.repeat('SNP'), dict_csv['REF'], dict_csv['ALT'])
    )
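# Hypothetical usage sketch: the filename and threshold here are assumptions.
# Since the p-value is the first element of each tuple, iterating the
# SortedSet yields loci from most to least significant.
snp_loci = get_snp_loci('plink_imputed_snp_results.tab', 5e-8)
for p_val, chrom, pos, _, ref, alt in snp_loci:
    print(chrom, pos, ref, alt, p_val)
    break  # just peek at the most significant locus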
import polars as pl

from ..paths import DATA_DIR

q = pl.scan_csv(f"{DATA_DIR}/reddit.csv").filter(
    (pl.col("comment_karma") > 0)
    & (pl.col("link_karma") > 0)
    & (pl.col("name").str.contains(r"^a"))
)

df = q.fetch(int(1e7))
import argparse

import bokeh.plotting
import numpy as np
import polars as pl
import scipy.stats

parser = argparse.ArgumentParser()
parser.add_argument('outdir')
parser.add_argument(
    'chrom_files',
    nargs='+',
    help='4 cols: pos, chance of length confusion, avg abs length confusion, '
         'normalized avg abs length confusion')
args = parser.parse_args()
outdir = args.outdir
chrom_fnames = args.chrom_files

loci = pl.concat([
    pl.scan_csv(chrom_fname, sep='\t') for chrom_fname in chrom_fnames
]).drop('pos').collect()

for col in loci.columns:
    print(f'Plotting column {col} ...', flush=True)
    max_val = loci.select(pl.col(col).max()).to_numpy()
    min_val = loci.select(pl.col(col).min()).to_numpy()
    n_steps = 1000
    step_size = (max_val - min_val) / n_steps
    xs = np.arange(min_val, max_val + step_size, step_size)
    ys = scipy.stats.gaussian_kde(loci[col].to_numpy())(xs)
    if col.startswith('chance'):
        unit = '%'
    elif col.startswith('avg'):
import time

import polars as pl
from polars.lazy import *

reddit = pl.scan_csv("data/reddit.csv")
runestar = pl.scan_csv("data/runescape.csv", has_headers=False).with_column(
    col("column_1").alias("name")
)

reddit = (
    reddit.filter(col("comment_karma") > 0)
    .filter(col("link_karma") > 0)
    .filter(col("name").str_contains(r"^a"))  # filter names that start with an "a"
)

joined = reddit.join(runestar, on="name", how="inner").select(
    ["name", "comment_karma", "link_karma"]
)

t0 = time.time()
joined.show_graph(True)
df = joined.fetch(int(1e7))
print(time.time() - t0)
print(df)
import time

import polars as pl
from polars.lazy import *

reddit = pl.scan_csv("data/reddit.csv")

# doesn't really matter due to predicate optimizations
optimal = True

# reddit = reddit.filter(
#     (col("comment_karma") > 0) &
#     (col("link_karma") > 0) &
#     (col("name").str_contains(r"^a"))
# )

reddit = (
    reddit.filter(col("comment_karma") > 0)
    .filter(col("link_karma") > 0)
    .filter(col("name").str_contains(r"^a"))  # filter names that start with an "a"
)

# if optimal:
#     # this is exactly the same result as below as the query optimizer will
#     # combine predicates.
#     reddit = reddit.filter(
#         (col("comment_karma") > 0) &
#         (col("link_karma") > 0) &
#         (col("name").str_contains(r"^a"))
#     )
# else:
#     reddit = (
#         reddit
#         .filter(col("comment_karma") > 0)
pheno_datas_d = json.loads(args.pheno_datas_json)
assert set(assoc_results_d.keys()) == set(pheno_datas_d.keys()) == set(ethnicities)

figs = []
for ethnicity in ethnicities:
    if not args.binary:
        stat_name = 'mean'
    else:
        stat_name = 'fraction'
    result = pl.scan_csv(
        assoc_results_d[ethnicity],
        sep='\t',
        dtypes={'locus_filtered': str}
    ).filter(
        (pl.col('chrom') == args.chrom) & (pl.col('pos') == args.pos)
    ).collect().select([
        # have to collect first due to some sort of bug
        'motif',
        '0.05_significance_CI',
        '5e-8_significance_CI',
        f'{stat_name}_{args.phenotype}_per_single_dosage',
    ])
    assert result.shape[0] == 1

    pheno_data = np.load(pheno_datas_d[ethnicity])

    bgen_samples = []
    with open(f'{ukb}/microarray/ukb46122_hap_chr1_v2_s487314.sample') as samplefile:
        for num, line in enumerate(samplefile):
            if num <= 1:
                # skip first two lines
                continue
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('outtable')
    parser.add_argument('outreadme')
    parser.add_argument('pos_to_snpstr_pos')
    parser.add_argument('intable')
    parser.add_argument('inreadme')
    parser.add_argument('spot_test_fname_json_dict_fname')
    args = parser.parse_args()

    with open(args.spot_test_fname_json_dict_fname) as json_file:
        spot_test_fname_json_dict = next(json_file)

    with open(args.outreadme, 'w+') as readme:
        with open(args.inreadme) as inreadme:
            readme.write(inreadme.read())
        readme.write(
            'other_ethnic_association_ps - association p-values for the other '
            'ethnicities in the order ' + ','.join(other_ethnicities) + '\n')
        readme.write(
            'other_ethnic_effect_directions - direction of association (+/-) '
            'for the other ethnicities in the order ' +
            ','.join(other_ethnicities) +
            " (NaN if that ethnicity's p > 0.05)\n")
        for ethnicity in other_ethnicities:
            readme.write(
                f'{ethnicity}_population_allele_frequencies - frequencies of each allele '
                "(by dosage) among the ethnicity's tested population\n")

    hits = pl.scan_csv(
        args.intable,
        sep='\t',
        # hack: arguments added here will be ignored when reading
        # putatively_causal but not when reading exonic_finemapped
        dtype={'alleles': str}
    )
    cols = hits.columns
    # hack to only clean in one of the two cases this function is running
    if 'white_brit_allele_frequencies' in cols:
        hits = hits.with_column(
            pl.col('white_brit_allele_frequencies').str.replace_all('"', "'"))

    hits = hits.join(
        pl.scan_csv(args.pos_to_snpstr_pos, sep='\t'),
        how='left',
        left_on=['chrom', 'start_pos'],
        right_on=['chrom', 'pos']
    )

    spot_tests_fnames = {
        tuple(key.split('__')): fname
        for key, fname in json.loads(spot_test_fname_json_dict).items()
    }

    spot_tests = {}
    for outer_ethnicity in other_ethnicities:
        spot_tests[outer_ethnicity] = pl.concat([
            pl.scan_csv(
                spot_test_fname,
                sep='\t',
                dtype={'alleles': str},
                null_values=['nan'],
                with_column_names=lambda cols: list(fix_cols(cols, phenotype))
            ).select([
                pl.lit(phenotype).alias('phenotype'),
                'chrom',
                'pos',
                pl.col('p_phenotype').cast(float).alias(f'{ethnicity}_p'),
                pl.when(pl.col('p_phenotype') >= 0.05)
                  .then(np.nan)
                  .when(pl.col('coeff_phenotype') > 0)
                  .then(pl.lit('+'))
                  .otherwise(pl.lit('-'))
                  .alias(f'{ethnicity}_effect_direction'),
                pl.col('subset_total_per_allele_dosages')
                  .apply(reformat_dosage_dict_str)
                  .alias(f'{ethnicity}_population_allele_frequencies')
            ])
            for (phenotype, _, _, ethnicity), spot_test_fname
            in spot_tests_fnames.items()
            if ethnicity == outer_ethnicity
        ])

    for ethnicity in other_ethnicities:
        hits = hits.join(
            spot_tests[ethnicity],
            how='left',
            left_on=['phenotype', 'chrom', 'snpstr_pos'],
            right_on=['phenotype', 'chrom', 'pos']
        )

    hits = hits.with_columns([
        pl.sum([
            pl.col(f'{ethnicity}_p').cast(str) + pl.lit(', ')
            for ethnicity in other_ethnicities
        ]).str.replace(', $', '').alias('other_ethnic_association_ps'),
        pl.sum([
            pl.col(f'{ethnicity}_effect_direction').cast(str) + pl.lit(', ')
            for ethnicity in other_ethnicities
        ]).str.replace(', $', '').alias('other_ethnic_effect_directions')
    ])

    hits = hits.select([
        *cols,
        'other_ethnic_association_ps',
        'other_ethnic_effect_directions',
        *[f'{ethnicity}_population_allele_frequencies'
          for ethnicity in other_ethnicities]
    ]).collect()

    assert hits.shape[0] == pl.read_csv(
        args.intable,
        sep='\t',
        # same hack as above
        dtype={'alleles': str}
    ).shape[0]

    hits.to_csv(args.outtable, sep='\t')
def test_scan_csv() -> None:
    df = pl.scan_csv(Path(__file__).parent.parent / "files" / "small.csv")
    assert df.collect().shape == (4, 3)
            info_arr[i][info_arr[i] == None] = 'Missing'
            if not first:
                findings += ':'
            else:
                first = False
            findings += info_arr[i][argsort][:count]
        findingss.append(', '.join(findings))
        pss.append(', '.join(str(x) for x in sort[:count]))
    return (pss, findingss, n_tests)

spliceSTR = pl.scan_csv(
    f'{workdir}/yang_spliceSTRs.tab', sep='\t'
).distinct().groupby('hg19_START').agg([
    pl.col('p_values').list(),
    pl.col('Tissue').list(),
    pl.col('gene_name').list(),
    pl.col('str-exon').str.split_exact('-', 1)
      .struct.field('field_1').list().alias('exon')
]).collect()

pss, findingss, n_tests = fdr_cols(
    spliceSTR['p_values'],
    [spliceSTR['Tissue'], spliceSTR['gene_name'], spliceSTR['exon']])

new_splice = pl.DataFrame({
    'hg19_START': spliceSTR['hg19_START'],
    'splice_p_vals': pd.Series(pss),
    'splice_associations (tissue:gene:exonID)':
import polars as pl

from ..paths import DATA_DIR

reddit = (
    pl.scan_csv(f"{DATA_DIR}/reddit.csv")
    .filter(pl.col("comment_karma") > 0)
    .filter(pl.col("link_karma") > 0)
    .filter(pl.col("name").str.contains(r"^a"))
)

runescape = pl.scan_csv("data/runescape.csv", has_headers=False).select(
    pl.col("column_1").alias("name")
)

dataset = reddit.join(runescape, on="name", how="inner").select(
    ["name", "comment_karma", "link_karma"]
)

df1 = dataset.fetch(int(1e7))
df2 = dataset.fetch(int(1e7), predicate_pushdown=True, projection_pushdown=True)
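# A sanity check one might add (an assumption, not part of the original
# example): pushdown optimizations change the query plan, not the result.
assert df1.frame_equal(df2)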
def load_plink_results(phenotype, binary, unconditional_results_fname,
                       conditional_results_fname=None):
    # TODO remove conditional snps
    # Load plink SNP results
    print(f"Loading plink SNP results for {phenotype} ... ", end='', flush=True)
    start_time = time.time()

    if binary:
        binary_colnames = {
            'A1_CASE_CT': 'alt_case_count',
            'A1_CTRL_CT': 'alt_control_count',
            'FIRTH?': 'firth?'
        }
    else:
        binary_colnames = {}

    start_time = time.time()
    unconditional_results = pl.scan_csv(
        unconditional_results_fname,
        sep='\t',
        null_values='NA'
    ).filter(
        pl.col('P') < 5e-5
    ).rename({
        '#CHROM': 'chr',
        'POS': 'pos',
        'ID': 'id',
        'REF': 'ref',
        'ALT': 'alt',
        'P': 'p_val',
        'ERRCODE': 'error',
        # these last three only occur in logistic regression
        **binary_colnames
    }).select([
        pl.col(col) for col in [
            'chr', 'pos', 'id', 'ref', 'alt', 'p_val', 'error',
            *binary_colnames.values()
        ]
    ]).collect().to_pandas()

    if not conditional_results_fname:
        results = unconditional_results
    else:
        results = pl.scan_csv(
            conditional_results_fname,
            sep='\t',
            null_values='NA'
        ).rename({
            '#CHROM': 'chr',
            'POS': 'pos',
            'ID': 'id',
            'REF': 'ref',
            'ALT': 'alt',
            'P': 'p_val',
            'ERRCODE': 'error',
            # these last three only occur in logistic regression
            **binary_colnames
        }).select([
            pl.col(col) for col in [
                'chr', 'pos', 'id', 'ref', 'alt', 'p_val', 'error',
                *binary_colnames.values()
            ]
        ]).collect().to_pandas()

        unconditional_results['p_val'] = np.maximum(
            unconditional_results['p_val'], 1 / 10**max_p_val)
        unconditional_results['p_val'] = -np.log10(unconditional_results['p_val'])
        unconditional_results.rename(columns={'p_val': 'unconditional_p'},
                                     inplace=True)
        unconditional_results = unconditional_results[['chr', 'pos', 'unconditional_p']]
        # subsets to only those which passed the p-val threshold in the
        # unconditional run
        results = results.merge(unconditional_results, on=['chr', 'pos'],
                                how='inner')

    if binary == 'logistic':
        results.rename(columns={'firth?': 'firth'}, inplace=True)
    results = utils.df_to_recarray(results)
    results = results[results['error'] != 'CONST_OMITTED_ALLELE']
    if binary == 'logistic':
        # in theory could keep unfinished error codes and just note them,
        # but easier to ignore
        results = results[(results['error'] != 'FIRTH_CONVERGE_FAIL') &
                          (results['error'] != 'UNFINISHED')]
    results['p_val'] = np.maximum(results['p_val'], 1 / 10**max_p_val)
    results['p_val'] = -np.log10(results['p_val'])

    # we've already filtered all the spots that had errors in the unconditional
    # run. Having a VIF_TOO_HIGH or CORR_TOO_HIGH only in the conditional run
    # just means that SNP is extremely correlated with the conditioning
    # variants, which means its p-value should be very small, so this isn't an
    # issue.
    if not conditional_results_fname:
        if not np.all(results['error'] == '.'):
            print(np.unique(results['error']))
            assert False
    else:
        assert np.all((results['error'] == '.') |
                      (results['error'] == 'VIF_TOO_HIGH') |
                      (results['error'] == 'CORR_TOO_HIGH'))
        # rename for readability
        results['error'][results['error'] == '.'] = 'none'
        results['p_val'][results['error'] == 'VIF_TOO_HIGH'] = 0

    print(f"done ({time.time() - start_time:.2e}s)", flush=True)
    return results
def load_my_str_results(phenotype, binary, unconditional_results_fname,
                        conditional_results_fname=None):
    print(f"Loading my STR results for {phenotype} ... ", end='', flush=True)
    start_time = time.time()

    with open(unconditional_results_fname) as tsv:
        header = tsv.readline().strip()
    unconditional_results = pl.scan_csv(
        unconditional_results_fname,
        sep='\t',
        skip_rows=1,
        has_header=False,
        with_column_names=lambda _: fix_cols(header),
        dtypes={'alleles': str, 'locus_filtered': str}
    ).filter(pl.col(f'p_{phenotype}') < 5e-5).collect().to_pandas()

    if not conditional_results_fname:
        results = unconditional_results
    else:
        results = pd.read_csv(
            conditional_results_fname,
            header=0,
            delimiter='\t',
            encoding='UTF-8',
            dtype=utils.get_dtypes(conditional_results_fname,
                                   {'locus_filtered': str}))

        unconditional_results[f'p_{phenotype}'] = np.maximum(
            unconditional_results[f'p_{phenotype}'], 1 / 10**max_p_val)
        unconditional_results[f'p_{phenotype}'] = -np.log10(
            unconditional_results[f'p_{phenotype}'])
        unconditional_results.rename(
            columns={f'p_{phenotype}': 'unconditional_p'}, inplace=True)
        unconditional_results = unconditional_results[['chrom', 'pos', 'unconditional_p']]
        # subsets to only those which passed the p-val threshold in the
        # unconditional run
        results = results.merge(unconditional_results, on=['chrom', 'pos'],
                                how='inner')

    if binary == 'logistic':
        results.rename(columns={'firth?': 'firth'}, inplace=True)

    rename_dict = {}
    for idx, name in my_results_rename.items():
        rename_dict[results.columns[idx]] = name
    rename_dict.update(my_str_results_rename)

    for colname in ('total_per_allele_dosages', 'total_hardcall_alleles',
                    'subset_total_per_allele_dosages',
                    'subset_total_hardcall_alleles',
                    'subset_allele_dosage_r2'):
        # convert allele lens from strings to floats; in addition, round allele
        # lens and values, but not NaN values
        new_col = np.array(list(map(
            lambda dict_str: {
                round(float(allele_len), 2): (round(val, 2) if val != 'NaN' else val)
                for allele_len, val in ast.literal_eval(dict_str).items()
            },
            results[colname])))
        # convert allele_lens to ints if they are close enough
        new_col = np.array(list(map(
            lambda d: str({
                (int(key) if key == int(key) else key): val
                for key, val in d.items()
            }),
            new_col)))
        results[colname] = new_col

    results.rename(columns=rename_dict, inplace=True)
    results = utils.df_to_recarray(results)
    results['p_val'] = np.maximum(results['p_val'], 1 / 10**max_p_val)
    results['p_val'] = -np.log10(results['p_val'])

    if conditional_results_fname:
        for STR in get_conditioned_strs(conditional_results_fname):
            results = results[results['pos'] != STR]

    print(f"done ({time.time() - start_time:.2e}s)", flush=True)
    return results
import polars as pl
from polars.lazy import *

reddit = pl.scan_csv("data/reddit.csv").select(
    [pl.sum("comment_karma"), pl.min("link_karma")])

if __name__ == "__main__":
    df = reddit.fetch()
    with open("book/src/outputs/how_can_i_aggregate.txt", "w") as f:
        f.write(str(df))
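# To run the same aggregation over the full file instead of the limited
# fetch() sample, one would collect the lazy frame (a sketch consistent with
# the other snippets here, not part of the book example):
full_df = reddit.collect()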
ukb = os.environ['UKB']

parser = argparse.ArgumentParser()
parser.add_argument('--load', action='store_true', default=False)
parser.add_argument('--calc', action='store_true', default=False)
args = parser.parse_args()

if args.load:
    # pos (start), snpstr_pos (hipstr)
    all_STRs = pl.read_csv(f'{ukb}/snpstr/flank_trimmed_vcf/vars.tab', sep='\t')
    # pos (hipstr)
    snpstr_strs = pl.scan_csv(
        f'{ukb}/snpstr/str_loci.txt',
        sep='\t',
        has_header=False,
        with_column_names=lambda _: ['chrom', 'pos'],
    )
    all_STRs = all_STRs.lazy().join(
        snpstr_strs,
        left_on=['chrom', 'snpstr_pos'],
        right_on=['chrom', 'pos'],
        how='inner',
        suffix='_other'
    ).select([
        'chrom', 'pos', 'end_pos', 'snpstr_pos'
    ]).with_column(
        pl.col('snpstr_pos').alias('SNPSTR_start_pos')
    ).drop('snpstr_pos').distinct(subset=['chrom', 'pos']).collect()

    assert ~np.any(np.isnan(all_STRs['chrom'].to_numpy()))
    assert ~np.any(np.isnan(all_STRs['pos'].to_numpy()))
    assert ~np.any(np.isnan(all_STRs['end_pos'].to_numpy()))
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('phenotypes', nargs='+')
    phenotypes = parser.parse_args().phenotypes

    all_dfs = []
    susie_cs_min_abs_corrs = []
    finemap_cs_coverages = []
    unconverged_regions = []
    #underexplored_regions = []
    unfinished_regions = []

    for phenotype in phenotypes:
        pheno_dfs = []
        str_assocs = pl.scan_csv(
            f'{ukb}/association/results/{phenotype}/my_str/results.tab',
            sep='\t',
        ).select([
            pl.lit(phenotype).alias('phenotype'),
            'chrom',
            'pos',
            pl.col(f'p_{phenotype}').alias('p_val'),
            pl.lit(True).alias('is_STR'),
            pl.lit(None).cast(int).alias('reflen'),
            pl.lit(None).cast(int).alias('altlen')
        ])

        snp_assocs = pl.scan_csv(
            f'{ukb}/association/results/{phenotype}/plink_snp/results.tab',
            sep='\t',
            null_values='NA',
        ).select([
            pl.col('#CHROM').alias('chrom'),
            pl.col('POS').alias('pos'),
            pl.col('REF').str.lengths().cast(int).alias('reflen'),
            pl.col('ALT').str.lengths().cast(int).alias('altlen'),
            pl.col('P').alias('p_val'),
        ]).groupby(['chrom', 'pos', 'reflen', 'altlen']).agg([
            pl.col('p_val').min().alias('p_val'),
        ]).with_columns([
            pl.lit(phenotype).alias('phenotype'),
            pl.lit(False).alias('is_STR')
        ]).select([
            'phenotype', 'chrom', 'pos', 'p_val', 'is_STR', 'reflen', 'altlen'
        ])

        assocs = pl.concat([str_assocs, snp_assocs]).filter(
            pl.col('p_val') <= p_val_thresh)

        regions_df = pl.read_csv(f'{ukb}/signals/regions/{phenotype}.tab',
                                 sep='\t')
        for chrom, start, end, any_strs in zip(regions_df['chrom'],
                                               regions_df['start'],
                                               regions_df['end'],
                                               regions_df['any_strs']):
            if not any_strs:
                continue

            converged_fname = f'{ukb}/finemapping/susie_results/{phenotype}/{chrom}_{start}_{end}/converged.txt'
            if not os.path.exists(converged_fname):
                unfinished_regions.append((phenotype, chrom, start, end))
                continue
            with open(converged_fname) as converged_file:
                if not next(converged_file).strip() == 'TRUE':
                    unconverged_regions.append((phenotype, chrom, start, end))
                    continue
            print(f'Loading {phenotype} region {chrom}:{start}-{end}', flush=True)

            with open(
                f'{ukb}/finemapping/susie_results/{phenotype}/{chrom}_{start}_{end}/colnames.txt'
            ) as var_file:
                susie_vars = [line.strip() for line in var_file]

            alphas = pl.scan_csv(
                f'{ukb}/finemapping/susie_results/{phenotype}/{chrom}_{start}_{end}/alpha.tab',
                sep='\t',
                has_header=False
            ).collect().to_numpy().T
            n_alphas = alphas.shape[1]
            susie_pips = 1 - np.prod(1 - alphas, axis=1)
            assert susie_pips.shape[0] == len(susie_vars)
            susie_idx = np.arange(len(susie_vars)) + 1
            susie_df = pl.DataFrame({
                'varname': susie_vars,
                'susie_pip': susie_pips,
                'susie_alpha': np.zeros(len(susie_vars)),
                'susie_cs': [-1] * len(susie_vars),
                'susie_idx': susie_idx,
                **{f'alpha_{i}': alphas[:, i] for i in range(n_alphas)}
            }).lazy()

            finemap_df = pl.scan_csv(
                f'{ukb}/finemapping/finemap_results/{phenotype}/{chrom}_{start}_{end}/finemap_output.snp',
                sep=' '
            ).select([
                pl.col('rsid').alias('varname'),
                pl.col('prob').alias('finemap_pip')
            ])

            df = susie_df.join(
                finemap_df, how='inner', on=['varname']
            ).with_columns([
                pl.col('varname').str.extract('^[^_]*_([^_]*)',
                                              1).cast(int).alias('pos'),
                pl.col('varname').str.extract(
                    '^[^_]*_[^_]*_([^_]*)_.*',
                    1).str.lengths().cast(int).alias('reflen'),
                pl.col('varname').str.extract(
                    '^[^_]*_[^_]*_[^_]*_([^_]*)',
                    1).str.lengths().cast(int).alias('altlen'),
                pl.col('varname').str.contains('^STR').alias('is_STR'),
                pl.lit(f'{phenotype}_{chrom}_{start}_{end}').alias('region'),
                pl.lit(chrom).alias('chrom').cast(int),
                pl.lit(phenotype).alias('phenotype')
            ]).sort('susie_idx')

            real_cs_count = 0
            for cs_fname in glob.glob(
                f'{ukb}/finemapping/susie_results/{phenotype}/{chrom}_{start}_{end}/cs*.txt'
            ):
                cs_id = int(cs_fname.split('cs')[-1].split('.')[0])
                with open(cs_fname) as cs_file:
                    # susie uses 1 based indexing, python uses 0
                    # make sure cs idxs are in increasing order
                    cs_susie_idx = np.array(
                        [int(idx) for idx in next(cs_file).strip().split()])
                    assert np.all(cs_susie_idx[1:] - cs_susie_idx[:-1] > 0)
                    cs_susie_idx = pl.Series('cs_susie_idx', cs_susie_idx)
                    next(cs_file)  # skip
                    # cs credibility
                    min_abs_corr, _, _ = [
                        float(idx) for idx in next(cs_file).strip().split()
                    ]
                susie_cs_min_abs_corrs.append(min_abs_corr)
                finemap_cs_coverages.append(
                    df.filter(pl.col('susie_idx').is_in(cs_susie_idx)).select(
                        pl.col('finemap_pip').sum()).collect())
                df = df.with_column(
                    pl.when(pl.col('susie_idx').is_in(cs_susie_idx)).then(
                        pl.when(
                            pl.col(f'alpha_{cs_id-1}') > pl.col('susie_alpha')
                        ).then(pl.col(f'alpha_{cs_id-1}')).otherwise(
                            pl.col('susie_alpha'))
                    ).otherwise(pl.col('susie_alpha')).alias('susie_alpha'))
                if min_abs_corr < corr_cutoff:
                    continue
                real_cs_count += 1
                # could worry about variants being in multiple CSes
                df = df.with_column(
                    pl.when(pl.col('susie_idx').is_in(cs_susie_idx)).then(
                        cs_id).otherwise(pl.col('susie_cs')).alias('susie_cs'))
            pheno_dfs.append(df)
            '''
            if real_cs_count >= 10:
                underexplored_regions.append((phenotype, chrom, start, end))
            '''
        pheno_dfs = [
            df.select(pl.col('*').exclude('^alpha.*$')) for df in pheno_dfs
        ]
        pheno_df = pl.concat(pheno_dfs).join(
            assocs,
            how='left',
            on=['phenotype', 'chrom', 'is_STR', 'pos', 'reflen', 'altlen']
        ).collect()
        all_dfs.append(pheno_df)
        del df, susie_df, finemap_df, assocs, pheno_dfs, pheno_df

    susie_cs_min_abs_corrs = np.array(susie_cs_min_abs_corrs)
    finemap_cs_coverages = np.array(finemap_cs_coverages)

    total_df = pl.concat(all_dfs)
    #total_assocs = pl.concat(all_assocs).filter(pl.col('p_val') <= p_val_thresh)
    '''
    start_time = time.time()
    print('Gathering data ... ', flush=True)
    total_df = total_df.join(
        total_assocs,
        how='left',
        on=['phenotype', 'chrom', 'is_STR', 'pos', 'reflen', 'altlen']
    ).collect()
    print(f'Done. Time: {time.time() - start_time:.2}')
    '''

    total_df.filter(
        ~pl.col('p_val').is_null() & (pl.col('p_val') <= p_val_thresh)
    ).to_csv(
        f'{ukb}/post_finemapping/intermediate_results/gathered_data.tab',
        sep='\t')

    print(
        'Any vars with null Ps?',
        total_df.select(pl.col('p_val').is_null().alias('null?')).select(
            pl.any('null?').alias('any_nulls'))['any_nulls'][0])
    print(
        'n regions',
        total_df.select(
            pl.col('region').unique().count().alias('region_count')
        )['region_count'][0])

    cses_per_region = total_df.filter(
        pl.col('susie_cs') >= 0
    ).filter(
        ~pl.col('p_val').is_null()
    ).groupby(['susie_cs', 'region']).agg(
        pl.col('p_val').min().alias('min_p'),
    ).filter(pl.col('min_p') <= p_val_thresh).groupby('region').agg(
        pl.col('region').count().alias('n_cses')
    ).to_dict(False)['n_cses']
    print(
        f'avg cses (total PIP >= .9, min_p_val of CS members <= {p_val_thresh}) '
        f'per region {np.mean(cses_per_region)}, ({np.std(cses_per_region)})')

    for filter_, text in ((pl.lit(True), ''), (pl.col('is_STR'), ' STR'),
                          (~pl.col('is_STR'), ' SNP')):
        susie_hits_per_region = total_df.filter(filter_).with_column(
            ((pl.col('susie_cs') >= 0) &
             (pl.col('susie_pip') >= pip_threshold) &
             (pl.col('p_val') <= p_val_thresh)).alias('susie_hit')
        ).groupby('region').agg(
            pl.col('susie_hit').sum().alias('n_susie_hits')
        ).to_dict(False)['n_susie_hits']
        print(
            f'avg susie{text} hits (var is in a CS, PIP >= {pip_threshold}, '
            f'p_val <= {p_val_thresh}) per region '
            f'{np.mean(susie_hits_per_region)}, ({np.std(susie_hits_per_region)})')

        finemap_hits_per_region = total_df.filter(filter_).with_column(
            ((pl.col('finemap_pip') >= pip_threshold) &
             (pl.col('p_val') <= p_val_thresh)).alias('finemap_hit')
        ).groupby('region').agg(
            pl.col('finemap_hit').sum().alias('n_finemap_hits')
        ).select('n_finemap_hits').to_numpy()
        print(
            f'avg finemap{text} hits (PIP >= {pip_threshold}, '
            f'p_val <= {p_val_thresh}) per region '
            f'{np.mean(finemap_hits_per_region)}, ({np.std(finemap_hits_per_region)})')

        print('Exporting FINEMAP vs SuSiE PIP plots', flush=True)
        comparison_thresh = 0.3
        title = (f'{text} with p-val <= {p_val_thresh} where at least one of '
                 f'SuSiE or FINEMAP PIP >= {comparison_thresh}')
        if text == '':
            title = 'Vars ' + title
        fig = bokeh.plotting.figure(
            width=1200,
            height=1200,
            title=title,
            x_axis_label='FINEMAP PIPs',
            y_axis_label='SuSiE PIPs',
        )
        fig.title.text_font_size = '30px'
        fig.axis.axis_label_text_font_size = '26px'
        fig.axis.major_label_text_font_size = '20px'
        fig.background_fill_color = None
        fig.border_fill_color = None
        fig.ygrid.grid_line_color = None
        fig.xgrid.grid_line_color = None
        fig.toolbar.logo = None
        fig.toolbar_location = None

        print(total_df.filter(filter_))
        print(total_df.filter(filter_ & (pl.col('p_val') <= p_val_thresh)))
        pips = total_df.filter(
            filter_ &
            (pl.col('p_val') <= p_val_thresh) &
            ((pl.col('finemap_pip') >= comparison_thresh) |
             ((pl.col('susie_pip') >= comparison_thresh) &
              (pl.col('susie_cs') >= 0)))
        ).select(['susie_pip', 'finemap_pip'])
        print(pips)

        bin_size = .05
        bins = bokeh.util.hex.hexbin(
            pips['finemap_pip'].to_numpy().reshape(-1),
            pips['susie_pip'].to_numpy().reshape(-1),
            size=bin_size)
        palette = [
            linear_int_interpolate((134, 204, 195), (9, 41, 46), i / 254)
            for i in range(-1, 255)
        ]
        cmap = bokeh.transform.log_cmap(
            'counts',
            palette=palette,
            low=1,
            high=max(bins.counts),
            low_color=(255, 255, 255))
        color_mapper = bokeh.models.LogColorMapper(
            palette=palette, low=1, high=max(bins.counts))
        fig.hex_tile(q='q', r='r', size=bin_size, line_color=None,
                     source=bins, fill_color=cmap)
        color_bar = bokeh.models.ColorBar(
            color_mapper=color_mapper,
            width=70,
            major_label_text_font_size='20px')
        fig.add_layout(color_bar, 'right')

        ext = text.replace(' ', '_')
        bokeh.io.export_png(
            fig,
            filename=f'{ukb}/export_scripts/results/finemap_pip_vs_susie_pip{ext}.png')
        bokeh.io.export_svg(
            fig,
            filename=f'{ukb}/export_scripts/results/finemap_pip_vs_susie_pip{ext}.svg')

    print(f'unconverged regions: {unconverged_regions}')
    print(f'unfinished regions: {unfinished_regions}')
    #print(f'underexplored regions: {underexplored_regions}')

    fig = bokeh.plotting.figure(
        width=1200,
        height=1200,
        title='SuSiE credible set min absolute correlations',
        x_axis_label='min absolute correlation',
        y_axis_label='# credible sets',
    )
    fig.axis.axis_label_text_font_size = '30px'
    fig.background_fill_color = None
    fig.border_fill_color = None
    fig.grid.grid_line_color = None
    fig.toolbar_location = None
    step = 0.01
    left_edges = np.arange(0, 1 + step, step)
    ys = [
        np.sum((left_edge <= susie_cs_min_abs_corrs) &
               (susie_cs_min_abs_corrs < left_edge + step))
        for left_edge in left_edges
    ]
    fig.quad(top=ys, bottom=0, left=left_edges, right=left_edges + step)

    print('Exporting cs plots', flush=True)
    bokeh.io.export_png(
        fig, filename=f'{ukb}/export_scripts/results/cs_min_abs_corrs.png')
    bokeh.io.export_svg(
        fig, filename=f'{ukb}/export_scripts/results/cs_min_abs_corrs.svg')

    fig = bokeh.plotting.figure(
        width=1200,
        height=1200,
        title=f'Number of SuSie CSes min absolute corr >= {corr_cutoff} per region',
        x_axis_label='# cses in the region',
        y_axis_label='# regions',
    )
    fig.axis.axis_label_text_font_size = '30px'
    fig.background_fill_color = None
    fig.border_fill_color = None
    fig.grid.grid_line_color = None
    fig.toolbar_location = None
    left_edges = np.arange(0, max(cses_per_region) + 1)
    ys = [
        np.sum((left_edge <= cses_per_region) &
               (cses_per_region < left_edge + 1))
        for left_edge in left_edges
    ]
    fig.quad(top=ys, bottom=0, left=left_edges, right=left_edges + 1)

    print('Exporting cs per region plots', flush=True)
    bokeh.io.export_png(
        fig, filename=f'{ukb}/export_scripts/results/cses_per_region.png')
    bokeh.io.export_svg(
        fig, filename=f'{ukb}/export_scripts/results/cses_per_region.svg')

    fig = bokeh.plotting.figure(
        width=1200,
        height=1200,
        title=f'Number of FINEMAP vars with PIP >= {pip_threshold} per region',
        x_axis_label='# hits in the region',
        y_axis_label='# regions',
    )
    fig.axis.axis_label_text_font_size = '30px'
    fig.background_fill_color = None
    fig.border_fill_color = None
    fig.grid.grid_line_color = None
    fig.toolbar_location = None
    left_edges = np.arange(0, max(finemap_hits_per_region) + 1)
    ys = [
        np.sum((left_edge <= finemap_hits_per_region) &
               (finemap_hits_per_region < left_edge + 1))
        for left_edge in left_edges
    ]
    fig.quad(top=ys, bottom=0, left=left_edges, right=left_edges + 1)

    print('Exporting finemap hits per region plots', flush=True)
    bokeh.io.export_png(
        fig,
        filename=f'{ukb}/export_scripts/results/finemap_hits_per_region.png')
    bokeh.io.export_svg(
        fig,
        filename=f'{ukb}/export_scripts/results/finemap_hits_per_region.svg')

    fig = bokeh.plotting.figure(
        width=1200,
        height=1200,
        title=f'FINEMAP total PIPs for SuSiE CSes with min_abs_corr >= {corr_cutoff}',
        x_axis_label='FINEMAP PIPs',
        y_axis_label='# credible sets',
    )
    fig.background_fill_color = None
    fig.border_fill_color = None
    fig.ygrid.grid_line_color = None
    fig.xgrid.grid_line_color = None
    fig.toolbar.logo = None
    fig.toolbar_location = None
    include = susie_cs_min_abs_corrs >= corr_cutoff
    max_total_pip = max(1, np.max(finemap_cs_coverages[include]))
    step = 0.01
    left_edges = np.arange(0, max_total_pip + step, step)
    ys = [
        np.sum((left_edge <= finemap_cs_coverages[include]) &
               (finemap_cs_coverages[include] < left_edge + step))
        for left_edge in left_edges
    ]
    fig.quad(top=ys, bottom=0, left=left_edges, right=left_edges + step)

    print('Exporting FINEMAP CS PIP plots', flush=True)
    bokeh.io.export_png(
        fig,
        filename=f'{ukb}/export_scripts/results/susie_cs_finemap_total_pips.png')
    bokeh.io.export_svg(
        fig,
        filename=f'{ukb}/export_scripts/results/susie_cs_finemap_total_pips.svg')

    total_cses = np.sum(include)
    total_cses_large_finemap_pip = np.sum(
        finemap_cs_coverages[include] >= pip_threshold)
    print(
        f'SuSiE CSes with min_abs_corr >= {corr_cutoff} with FINEMAP total PIP '
        f'>= {pip_threshold}: {total_cses_large_finemap_pip} '
        f'({total_cses_large_finemap_pip/total_cses:%})')

    susie_pip_threshold_for_finemap = .3
    n_replicates_from_finemap = total_df.filter(
        (pl.col('susie_cs') >= 0) &
        (pl.col('susie_pip') >= susie_pip_threshold_for_finemap) &
        (pl.col('finemap_pip') >= pip_threshold)).shape[0]
    n_finemap_total = total_df.filter(
        pl.col('finemap_pip') >= pip_threshold).shape[0]
    print(
        f'FINEMAP hits with PIP >= {pip_threshold} in a SuSiE CS with abs corr '
        f'>= {corr_cutoff} and SuSiE PIP >= {susie_pip_threshold_for_finemap}: '
        f'{n_replicates_from_finemap} '
        f'({n_replicates_from_finemap/n_finemap_total:%})')

    for (curr_df, text) in [(total_df, 'all hits no filter'),
                            (total_df.filter(pl.col('p_val') <= 1e-10),
                             'all hits p<=1e-10')]:
        print(text)
        var_thresh1 = .8
        var_thresh2 = .3
        for susie_thresh in (var_thresh1, var_thresh2):
            for finemap_thresh in (var_thresh1, var_thresh2):
                count = curr_df.filter(
                    (pl.col('susie_cs') >= 0) &
                    (pl.col('susie_pip') >= susie_thresh) &
                    (pl.col('finemap_pip') >= finemap_thresh)).shape[0]
                print(
                    f'Vars in a SuSiE CS with SuSIE PIP >= {susie_thresh} and '
                    f'with FINEMAP PIP >= {finemap_thresh}: {count}')
        for susie_thresh in (var_thresh1, var_thresh2):
            count = curr_df.filter(
                (pl.col('susie_cs') >= 0) &
                (pl.col('susie_pip') >= susie_thresh) &
                (pl.col('finemap_pip') < var_thresh2)).shape[0]
            print(
                f'Vars in a SuSiE CS with SuSIE PIP >= {susie_thresh} with '
                f'FINEMAP PIP < {var_thresh2}: {count}')
        for finemap_thresh in (var_thresh1, var_thresh2):
            count = curr_df.filter(
                (pl.col('finemap_pip') >= finemap_thresh) &
                ((pl.col('susie_cs') < 0) |
                 (pl.col('susie_pip') < var_thresh2))).shape[0]
            print(
                f'Vars with FINEMAP PIP >= {finemap_thresh} either not in a '
                f'SuSiE CS or having SuSiE PIP <= {var_thresh2}: {count}')

    # Not going to report susie alphas v pips - just know that they're similar
    # if we look at vars in good credible sets and not otherwise
    '''
def different_vars(susie_vars_to_compare_fname, phenotype, chrom, start_pos,
                   end_pos):
    filter_set_fname = f'{ukb}/finemapping/str_imp_snp_overlaps/chr{chrom}_to_filter.tab'
    p_cutoff = 5e-4

    # first choose STRs and SNPs only with p <= p_cutoff to lessen memory burden
    #print('Loading strs and snps var list ... ', flush=True)
    strs_to_include = set()
    snps_to_include = set()

    strs_to_include = pl.scan_csv(
        f'{ukb}/association/results/{phenotype}/my_str/results.tab',
        sep='\t'
    ).filter(
        (pl.col('chrom') == chrom) &
        (pl.col('pos') >= start_pos) &
        (pl.col('pos') <= end_pos) &
        (pl.col(f'p_{phenotype}') <= p_cutoff)
    ).select(pl.col('pos')).collect().to_numpy().flatten()
    assert len(strs_to_include) != 0

    snps_to_filter = set()
    snps_to_filter = pl.scan_csv(
        filter_set_fname,
        sep='\t'
    ).select([
        pl.col('snp_pos'),
        pl.col('snp_ref'),
        pl.col('snp_alt'),
        pl.lit('1').alias('join_marker')
    ])

    snps_to_include = pl.scan_csv(
        f'{ukb}/association/results/{phenotype}/plink_snp/results.tab',
        sep='\t',
        null_values='NA'
    ).filter(
        (pl.col('#CHROM') == chrom) &
        (pl.col('POS') >= start_pos) &
        (pl.col('POS') <= end_pos) &
        (pl.col('P') <= p_cutoff)
    ).join(
        snps_to_filter,
        how='left',
        left_on=['POS', 'REF', 'ALT'],
        right_on=['snp_pos', 'snp_ref', 'snp_alt']
    ).filter(
        pl.col('join_marker').is_null()
    ).select([
        pl.col('POS'), pl.col('REF'), pl.col('ALT')
    ]).collect().pipe(
        lambda df: list(zip(*df.to_dict().values()))
    )  # returns a list of tuples

    snp_sort_tuples = set(
        (pos, 'SNP', ref, alt) for (pos, ref, alt) in snps_to_include)
    str_sort_tuples = set((pos, 'STR') for pos in strs_to_include)
    vars_ = snp_sort_tuples.union(str_sort_tuples)
    var_names = {
        f'STR_{tuple[0]}' if tuple[1] == 'STR'
        else f'SNP_{tuple[0]}_{tuple[2]}_{tuple[3]}'
        for tuple in vars_
    }

    with open(susie_vars_to_compare_fname) as susie_vars_to_compare_file:
        susie_vars = {
            line.strip() for line in susie_vars_to_compare_file.readlines()
            if line.strip()
        }

    if susie_vars == var_names:
        return None
    else:
        assert all(x in var_names for x in susie_vars)
        return list(x for x in var_names if x not in susie_vars)
    }, **{
        f'{ethnicity}_se': float
        for ethnicity in other_ethnicities
    }
})).filter('is_STR')

fname = f'{ukb}/association/results/{phenotype}/my_str/results.tab'
with open(fname) as tsv:
    header = tsv.readline().strip()
assoc_df = pl.scan_csv(
    fname,
    sep='\t',
    skip_rows=1,
    has_header=False,
    # these duplicate column names won't be used anyway
    with_column_names=lambda _: header.replace(
        '0.05_significance_CI', 'foo', 1
    ).replace(
        '5e-8_significance_CI', 'bar', 1
    ).split('\t')
).select([
    'chrom',
    'pos',
    pl.col('subset_total_per_allele_dosages').alias('white_brit_allele_dosages')
])

df = df.lazy().join(assoc_df, how='left', on=['chrom', 'pos'])

for ethnicity in other_ethnicities:
    fname = f'{ukb}/association/results_finemapped_only/{ethnicity}/{phenotype}/my_str/results.tab'
    with open(fname) as tsv:
        header = tsv.readline().strip()
    assoc_df = pl.scan_csv(
        fname,
        sep='\t',
        skip_rows=1,
def choose_vars(readme_fname, outcols_fname, phenotype, chrom, start_pos,
                end_pos, p_cutoff, mac, use_PACSIN2):
    if use_PACSIN2:
        assert int(chrom) == 22
    filter_set_fname = f'{ukb}/finemapping/str_imp_snp_overlaps/chr{chrom}_to_filter.tab'

    if mac:
        mac_threshold = int(mac[0])
        snp_mac_fname = mac[1]
        str_mac_fname = mac[2]
        snps_exclude_mac = pl.scan_csv(
            snp_mac_fname,
            sep='\t'
        ).filter(
            pl.col('ALT_CTS') < mac_threshold
        ).select([
            '#POS', 'REF', 'ALT'
        ]).collect().pipe(lambda df: list(zip(*df.to_dict().values())))
        # need to make that look like a list of strings to polars b/c buggy,
        # so add a single nonsense entry to it
        snps_exclude_mac.append('asdf')

        strs_exclude_mac = pl.scan_csv(
            str_mac_fname,
            sep='\t'
        ).filter(
            pl.col('mac') < mac_threshold
        ).select('pos').collect()['pos'].to_list()

    today = datetime.datetime.now().strftime("%Y_%m_%d")
    with open(readme_fname, 'w') as readme:
        readme.write(
            f'Run date: {today}\n'
            f'Choosing variants for which association tests were not skipped '
            f'and with p <= {p_cutoff}. '
            'SNPs in the filter set are also skipped. '
            f'(Filter set at {filter_set_fname})\n')

    # first choose STRs and SNPs only with p <= p_cutoff to lessen memory burden
    print('Choosing which strs and snps to include ... ', flush=True)
    strs_to_include = set()
    snps_to_include = set()

    strs_to_include = pl.scan_csv(
        f'{ukb}/association/results/{phenotype}/my_str/results.tab',
        sep='\t'
    ).filter(
        (pl.col('chrom') == chrom) &
        (pl.col('pos') >= start_pos) &
        (pl.col('pos') <= end_pos) &
        (pl.col(f'p_{phenotype}') <= p_cutoff)
    ).select(pl.col('pos')).collect().to_numpy().flatten()
    if mac:
        strs_to_include = strs_to_include[
            ~np.isin(strs_to_include, strs_exclude_mac)]

    snps_to_filter = set()
    snps_to_filter = pl.scan_csv(filter_set_fname, sep='\t').select([
        pl.col('snp_pos'),
        pl.col('snp_ref'),
        pl.col('snp_alt'),
        pl.lit('1').alias('join_marker')
    ])

    snps_to_include = pl.scan_csv(
        f'{ukb}/association/results/{phenotype}/plink_snp/results.tab',
        sep='\t',
        null_values='NA'
    ).filter(
        (pl.col('#CHROM') == chrom) &
        (pl.col('POS') >= start_pos) &
        (pl.col('POS') <= end_pos) &
        (pl.col('P') <= p_cutoff)
    ).join(
        snps_to_filter,
        how='left',
        left_on=['POS', 'REF', 'ALT'],
        right_on=['snp_pos', 'snp_ref', 'snp_alt']
    ).filter(
        pl.col('join_marker').is_null()
    ).select([
        'POS', 'REF', 'ALT'
    ]).collect().pipe(
        lambda df: list(zip(*df.to_dict().values()))
    )  # returns a list of tuples
    if mac:
        snps_to_include = [
            snps_to_include[idx]
            for idx in np.where(~np.isin(snps_to_include, snps_exclude_mac))[0]
        ]

    snp_sort_tuples = set(
        (pos, 'SNP', ref, alt) for (pos, ref, alt) in snps_to_include)
    str_sort_tuples = set((pos, 'STR') for pos in strs_to_include)
    vars_ = snp_sort_tuples.union(str_sort_tuples)
    if use_PACSIN2:
        vars_.remove((43385872, 'STR'))
        vars_.add((43385866, 'PACSIN2_STR'))
        vars_.add((43385875, 'PACSIN2_STR'))
        vars_.add((43385893, 'PACSIN2_STR'))
    sorted_vars = sorted(vars_)
    sorted_var_names = [
        f'STR_{tuple[0]}' if tuple[1] == 'STR'
        else f'SNP_{tuple[0]}_{tuple[2]}_{tuple[3]}' if tuple[1] == 'SNP'
        else f'PACSIN2_STR_{tuple[0]}' if tuple[1] == 'PACSIN2_STR'
        else None  # break the sort
        for tuple in sorted_vars
    ]
    # make sure the names are unique
    assert len(set(sorted_var_names)) == len(sorted_var_names)

    print(f'# STRs: {len(strs_to_include)} # SNPs: {len(snps_to_include)}',
          flush=True)
    with open(outcols_fname, 'w') as colfile:
        for var_name in sorted_var_names:
            colfile.write(var_name + '\n')
def test_scan_empty_csv() -> None:
    with pytest.raises(Exception) as excinfo:
        pl.scan_csv(Path(__file__).parent.parent / "files" / "empty.csv").collect()
    assert str(excinfo.value) == "empty csv"
#!/usr/bin/env python3
import os

import polars as pl

import phenotypes

ukb = os.environ['UKB']

dfs = []
for phenotype in phenotypes.phenotypes_in_use:
    dfs.append(
        pl.scan_csv(
            f'{ukb}/signals/regions/{phenotype}.tab', sep='\t'
        ).with_column(pl.lit(phenotype).alias('phenotype')))

pl.concat(dfs).collect().with_column(
    (
        ((pl.col('phenotype') == 'total_bilirubin') &
         (pl.col('chrom') == 12) &
         (pl.col('start') == 19976272) &
         (pl.col('end') == 22524428)) |
        ((pl.col('phenotype') == 'urate') &
         (pl.col('chrom') == 4) &
         (pl.col('start') == 8165642) &
         (pl.col('end') == 11717761)) |
        ((pl.col('phenotype') == 'alkaline_phosphatase') &
         (pl.col('chrom') == 1) &
         (pl.col('start') == 19430673) &
         (pl.col('end') == 24309348))
    ).alias('filtered_due_to_computation_burden')
).select([
    'phenotype', 'chrom', 'start', 'end', 'filtered_due_to_computation_burden'
]).to_csv(
    f'{ukb}/export_scripts/results/supp_table_2_finemapping_regions.tab',
    sep='\t')
def generate_figure(assoc_results_fname, pheno_data_fname, chrom, pos,
                    phenotype, dosage_fraction_threshold, unit, binary,
                    publication):
    assert bool(unit) or binary
    assert 0 <= dosage_fraction_threshold <= 1

    if not binary:
        y_axis_label = 'Mean ' + phenotype.replace('_', ' ') + f' ({unit})'
    else:
        y_axis_label = 'Fraction ' + phenotype.replace('_', ' ') + ' cases'

    figure = bokeh.plotting.figure(
        width=600,
        height=600,
        y_axis_label=y_axis_label,
        x_axis_label='Sum of allele lengths (repeat copies)')
    figure.grid.grid_line_color = None
    figure.background_fill_color = None
    figure.border_fill_color = None
    figure.toolbar_location = None
    figure.title.text_font_size = '18px'
    figure.axis.axis_label_text_font_size = '18px'
    figure.axis.major_label_text_font_size = '14px'

    if not binary:
        stat_name = 'mean'
    else:
        stat_name = 'fraction'

    def fix_header(header):
        def fix_header_helper(_):
            part1 = header.rpartition('0.05_significance_CI')
            fix1 = part1[0] + 'foo' + part1[2]
            part2 = fix1.rpartition('5e-8_significance_CI')
            fix2 = part2[0] + 'bar' + part2[2]
            return fix2.split('\t')
        return fix_header_helper

    with open(assoc_results_fname) as tsv:
        header = tsv.readline().strip()
    result = pl.scan_csv(
        assoc_results_fname,
        sep='\t',
        dtypes={'locus_filtered': str},
        skip_rows=1,
        has_header=False,
        with_column_names=fix_header(header)
    ).filter(
        (pl.col('chrom') == chrom) & (pl.col('pos') == pos)
    ).collect().select([
        # have to collect first due to some sort of bug
        'motif',
        '0.05_significance_CI',
        '5e-8_significance_CI',
        f'{stat_name}_{phenotype}_per_single_dosage',
        'total_subset_dosage_per_summed_gt'
    ])
    assert result.shape[0] == 1

    pheno_data = np.load(pheno_data_fname)

    bgen_samples = sample_utils.get_all_samples()
    assert len(bgen_samples) == 487409
    samples_array = np.array(bgen_samples, dtype=float).reshape(-1, 1)
    merged_arr = utils.merge_arrays(samples_array, pheno_data)
    unfiltered_subset = ~np.isnan(merged_arr[:, 1])
    n_samples = np.sum(unfiltered_subset)

    subset_summed_dosage_fractions = {
        float(allele): val
        for allele, val in ast.literal_eval(
            result['total_subset_dosage_per_summed_gt'].to_numpy()[0]).items()
    }
    total_dosage = np.sum(list(subset_summed_dosage_fractions.values()))
    subset_summed_dosage_fractions = {
        key: val / total_dosage
        for key, val in subset_summed_dosage_fractions.items()
    }

    alleles = list(subset_summed_dosage_fractions.keys())
    alleles_copy = alleles.copy()
    for allele in alleles_copy:
        if subset_summed_dosage_fractions[allele] < dosage_fraction_threshold:
            alleles.remove(allele)
    alleles = sorted(alleles)

    mean_per_dosage = {
        float(allele): val
        for allele, val in ast.literal_eval(
            result[f'{stat_name}_{phenotype}_per_single_dosage'].to_numpy()[0]
        ).items()
    }
    ci5e_2 = {
        float(allele): val
        for allele, val in ast.literal_eval(
            result['0.05_significance_CI'].to_numpy()[0]).items()
    }
    ci5e_8 = {
        float(allele): val
        for allele, val in ast.literal_eval(
            result['5e-8_significance_CI'].to_numpy()[0]).items()
    }

    y_min = min(ci5e_8[allele][0] for allele in alleles)
    y_max = max(ci5e_8[allele][1] for allele in alleles)

    figure.varea(alleles,
                 [ci5e_2[allele][1] for allele in alleles],
                 [ci5e_8[allele][1] for allele in alleles],
                 color="red",
                 alpha=0.2,
                 legend_label='1 - 5e-8 Confidence Interval')
    figure.varea(alleles,
                 [ci5e_2[allele][0] for allele in alleles],
                 [ci5e_2[allele][1] for allele in alleles],
                 color="red",
                 alpha=0.4,
                 legend_label='0.95 Confidence Interval')
    figure.varea(alleles,
                 [ci5e_8[allele][0] for allele in alleles],
                 [ci5e_2[allele][0] for allele in alleles],
                 color="red",
                 alpha=0.2)
    figure.line(alleles,
                [mean_per_dosage[allele] for allele in alleles],
                line_width=2,
                color="black")
    figure.circle(alleles,
                  [mean_per_dosage[allele] for allele in alleles],
                  color="black",
                  size=6,
                  legend_label='mean')
    figure.legend.label_text_font_size = '10px'
    figure.y_range = bokeh.models.Range1d(
        y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min))

    figure.add_layout(
        bokeh.models.Title(text=f'STR {chrom}:{pos}',
                           align="center",
                           text_font_size='18px'), "above")
    figure.add_layout(
        bokeh.models.Title(text=phenotype.replace('_', ' ').capitalize() +
                           " vs genotype",
                           align="center",
                           text_font_size='18px'), "above")
    if not publication:
        figure.add_layout(
            bokeh.models.Title(
                text="Phenotype values are unadjusted for covariates",
                align="center"), "below")
        figure.add_layout(
            bokeh.models.Title(
                text="People contribute to each genotype based on their prob. "
                     "of having that genotype",
                align="center"), "below")
        figure.add_layout(
            bokeh.models.Title(text="Only considers tested individuals",
                               align="center"), "below")
        figure.add_layout(
            bokeh.models.Title(
                text=f"Genotypes with dosages less than "
                     f"{100*dosage_fraction_threshold}% of the population are omitted",
                align="center"), "below")
    return figure
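# A minimal usage sketch (hypothetical: these paths, the locus, and the 1%
# dosage threshold are illustrative assumptions, not from the source).
import bokeh.io
fig = generate_figure(
    'my_str/results.tab', 'height_pheno_data.npy', 1, 100000, 'height',
    dosage_fraction_threshold=0.01, unit='cm', binary=False, publication=True)
bokeh.io.export_png(fig, filename='str_1_100000_height.png')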
import polars as pl

from ..paths import DATA_DIR

q = (
    pl.scan_csv(f"{DATA_DIR}/reddit.csv")
    .filter(pl.col("comment_karma") > 0)
    .filter(pl.col("link_karma") > 0)
    .filter(pl.col("name").str.contains(r"^a"))  # filter names that start with an "a"
)

df1 = q.fetch(int(1e7))
df2 = q.fetch(int(1e7), predicate_pushdown=True)
import polars as pl

q = (
    pl.scan_csv("data/reddit.csv")
    .groupby("comment_karma")
    .agg([
        pl.col("name").n_unique().alias("unique_names"),
        pl.max("link_karma")
    ])
    .sort(by_columns="unique_names", reverse=True)
)

df = q.fetch()
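# Note: by_columns= and reverse= reflect the older polars sort() signature
# used throughout these snippets; newer releases renamed these parameters.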
def main():
    df = pl.scan_csv(
        'post_finemapping/intermediate_results/gathered_data.tab',
        sep='\t'
    ).filter((pl.col('susie_pip') >= 0.3) | (pl.col('finemap_pip') >= 0.3))
    df = df.with_column(
        (pl.col('susie_pip') - pl.col('finemap_pip')).alias('susie_f_pip_diff')
    ).with_column(
        pl.col('susie_f_pip_diff').abs().alias('abs_pip_diff'))

    locus_summary_df = pl.concat([
        pl.scan_csv(
            f'export_scripts/intermediate_results/chr{chrom}_loci_summary.tab',
            sep='\t')
        for chrom in range(1, 23)
    ]).select(['chr', 'pos', 'multiallelicness', 'allele_dist'])

    allele_threshes = (0.0004, 0.002, 0.01, 0.05)
    #allele_threshes = [0.01]

    df = df.join(
        locus_summary_df,
        how='left',
        #left_on=['chrom', 'snpstr_pos'],
        left_on=['chrom', 'pos'],
        right_on=['chr', 'pos']).collect()

    snp_df = df.filter(~pl.col('is_STR'))
    str_df = df.filter(pl.col('is_STR'))
    assert not str_df.select(
        pl.col('multiallelicness').is_null().any()).to_numpy()[0]

    str_df = str_df.with_columns([
        pl.apply('allele_dist', count_alleles(thresh),
                 pl.UInt32).alias(f'alleles_{thresh}')
        for thresh in allele_threshes
    ])

    confusions = pl.concat([
        pl.scan_csv(
            f'side_analyses/length_confusion/chr{i}.tab',
            sep='\t').with_column(pl.lit(i).alias('chrom').cast(int))
        for i in range(1, 23)
    ]).collect()

    merged_df = str_df.join(confusions, how='left', on=['chrom', 'pos'])

    step = 0.05
    fig = bokeh.plotting.figure(title='STR PIP histogram',
                                width=size,
                                height=size,
                                x_axis_label='PIP',
                                y_axis_label='density',
                                tools='',
                                toolbar_location=None)
    xs = np.arange(0, 1 + step, step)
    fig.line(
        x=xs[:-1],
        #y=scipy.stats.gaussian_kde(arr)(xs),
        y=np.histogram(str_df['susie_pip'], bins=xs, density=True)[0],
        color='red',
        legend_label='SuSiE STRs')
    fig.line(
        x=xs[:-1],
        #y=scipy.stats.gaussian_kde(arr)(xs),
        y=np.histogram(str_df['finemap_pip'], bins=xs, density=True)[0],
        color='blue',
        legend_label='FINEMAP STRs')
    fig.line(
        x=xs[:-1],
        #y=scipy.stats.gaussian_kde(arr)(xs),
        y=np.histogram(snp_df['susie_pip'], bins=xs, density=True)[0],
        color='green',
        legend_label='SuSiE SNPs')
    fig.line(
        x=xs[:-1],
        #y=scipy.stats.gaussian_kde(arr)(xs),
        y=np.histogram(snp_df['finemap_pip'], bins=xs, density=True)[0],
        color='purple',
        legend_label='FINEMAP SNPs')
    bokeh.io.export_png(fig,
                        filename='post_finemapping/results/pip_histogram.png')

    fig = bokeh.plotting.figure(title='STR PIP scatterplot',
                                width=size,
                                height=size,
                                x_axis_label='FINEMAP PIP',
                                y_axis_label='SuSiE PIP',
                                tools='',
                                toolbar_location=None)
    fig.circle(str_df['susie_pip'], str_df['finemap_pip'])
    bokeh.io.export_png(
        fig, filename='post_finemapping/results/str_comp_pip_scatter.png')

    fig = bokeh.plotting.figure(title='STR PIP heatmap',
                                width=size,
                                height=size,
                                x_axis_label='FINEMAP PIP',
                                y_axis_label='SuSiE PIP',
                                match_aspect=True,
                                tools='',
                                toolbar_location=None)
    heat_map(fig, str_df['finemap_pip'], str_df['susie_pip'],
             'post_finemapping/results/str_comp_pip_heatmap.png')

    fig = bokeh.plotting.figure(title='STR PIPs',
                                width=size,
                                height=size,
                                x_axis_label='FINEMAP PIP',
                                y_axis_label='SuSiE PIP',
                                match_aspect=True,
                                tools='',
                                toolbar_location=None)
    weighted_heat_map(
        fig, merged_df['finemap_pip'], merged_df['susie_pip'],
        merged_df['chance_of_length_confusion'],
        'average chance of misgenotyping per sample at any such locus',
        'post_finemapping/results/str_comp_pip_chance_map.png')

    fig = bokeh.plotting.figure(title='STR PIPs',
                                width=size,
                                height=size,
                                x_axis_label='FINEMAP PIP',
                                y_axis_label='SuSiE PIP',
                                match_aspect=True,
                                tools='',
                                toolbar_location=None)
    weighted_heat_map(
        fig, merged_df['finemap_pip'], merged_df['susie_pip'],
        merged_df['normalized_avg_abs_length_confusion'],
        'average number of standard deviations of misgenotyping per sample at any such locus',
        'post_finemapping/results/str_comp_pip_sd_map.png')

    fig = bokeh.plotting.figure(title='SNP PIP scatterplot',
                                width=size,
                                height=size,
                                x_axis_label='FINEMAP PIP',
                                y_axis_label='SuSiE PIP',
                                tools='',
                                toolbar_location=None)
    fig.circle(snp_df['susie_pip'], snp_df['finemap_pip'])
    bokeh.io.export_png(
        fig, filename='post_finemapping/results/snp_comp_pip_scatter.png')

    fig = bokeh.plotting.figure(title='SNP PIP heatmap',
                                width=size,
                                height=size,
                                x_axis_label='FINEMAP PIP',
                                y_axis_label='SuSiE PIP',
                                match_aspect=True,
                                tools='',
                                toolbar_location=None)
    heat_map(fig, snp_df['finemap_pip'], snp_df['susie_pip'],
             'post_finemapping/results/snp_comp_pip_heatmap.png')

    color_mapper = bokeh.models.LinearColorMapper(palette=palette, low=0, high=1)
    color_bar = bokeh.models.ColorBar(color_mapper=color_mapper, width=30)
    cmap = bokeh.transform.linear_cmap('foo', palette=palette, low=0, high=1)
    fig = bokeh.plotting.figure(title='STR PIP scatterplot',
                                width=size,
                                height=size,
                                x_axis_label='FINEMAP PIP',
                                y_axis_label='SuSiE PIP',
                                tools='',
                                match_aspect=True,
                                toolbar_location=None)
    cb_title = bokeh.models.Title(
        text='chance a genotype call at this locus is wrong', align='center')
    fig.add_layout(color_bar, 'right')
    fig.add_layout(cb_title, 'right')
    cds = bokeh.models.ColumnDataSource(
        dict(x=merged_df['finemap_pip'],
             y=merged_df['susie_pip'],
             color=[
                 linear_int_interpolate((134, 204, 195), (9, 41, 46), val)
                 for val in merged_df['chance_of_length_confusion']
             ]))
    fig.circle(x='x', y='y', color='color', source=cds)
    bokeh.io.export_png(
        fig,
        filename='post_finemapping/results/colored_str_comp_pip_scatter.png')

    step = 0.05
    for thresh in allele_threshes:
        for pip_thresh in (0.3, 0.8):
            for xs, x_label, out_loc, title, col in [
                (
                    np.arange(-1, 1 + step, step),
                    'SuSiE PIP - FINEMAP PIP',
                    f'post_finemapping/results/pip_diff_density_allele_thresh_{thresh}_pip_thresh_{pip_thresh}.png',
                    f'PIP diff, STR allele penetrance threshold = {thresh:.4}',
                    'susie_f_pip_diff',
                ),
                (
                    np.arange(0, 1 + step, step),
                    'absolute PIP difference',
                    f'post_finemapping/results/pip_abs_diff_density_allele_thresh_{thresh}_pip_thresh_{pip_thresh}.png',
                    f'absolute PIP diff, STR allele penetrance threshold = {thresh:.4}',
                    'abs_pip_diff'
                )
            ]:
                filter_exp = ((pl.col('susie_pip') >= pip_thresh) |
                              (pl.col('finemap_pip') >= pip_thresh))
                fig = bokeh.plotting.figure(title=title,
                                            width=size,
                                            height=size,
                                            x_axis_label=x_label,
                                            y_axis_label='density',
                                            tools='',
                                            toolbar_location=None)
                fig.line(
                    x=xs[:-1],
                    y=np.histogram(snp_df.filter(filter_exp)[col].to_numpy(),
                                   bins=xs, density=True)[0],
                    #y=scipy.stats.gaussian_kde(snp_df['susie_f_pip_diff'].to_numpy())(xs),
                    color='black',
                    legend_label=f'SNPs (n={snp_df.shape[0]})')
                for count, color in ((2, 'brown'), (3, 'red'), (4, 'orange')):
                    arr = str_df.filter(filter_exp).filter(
                        pl.col(f'alleles_{thresh}') == count)[col].to_numpy()
                    fig.line(
                        x=xs[:-1],
                        #y=scipy.stats.gaussian_kde(arr)(xs),
                        y=np.histogram(arr, bins=xs, density=True)[0],
                        color=color,
                        legend_label=f'{count}-allele STRs (n={arr.shape[0]})')
                arr = str_df.filter(filter_exp).filter(
                    pl.col(f'alleles_{thresh}') >= 5)[col].to_numpy()
                fig.line(
                    x=xs[:-1],
                    #y=scipy.stats.gaussian_kde(arr)(xs),
                    y=np.histogram(arr, bins=xs, density=True)[0],
                    color='gold',
                    legend_label=f'STRs with at least 5 alleles (n={arr.shape[0]})')
                fig.add_layout(
                    bokeh.models.Title(
                        text=f'Variants with PIP at least {pip_thresh} for SuSiE or FINEMAP'
                    ), 'below')
                bokeh.io.export_png(fig, filename=out_loc)

    fig = bokeh.plotting.figure(title='STR PIP diff',
                                width=size,
                                height=size,
                                x_axis_label='multiallelicness',
                                y_axis_label='SuSiE PIP - FINEMAP PIP',
                                tools='',
                                toolbar_location=None)
    heat_map(fig, str_df['multiallelicness'], str_df['susie_f_pip_diff'],
             'post_finemapping/results/str_pip_diff_heatmap.png', y_min=-1)

    fig = bokeh.plotting.figure(title='STR PIP abs diff',
                                width=size,
                                height=size,
                                x_axis_label='multiallelicness',
                                y_axis_label='absolute PIP difference',
                                tools='',
                                toolbar_location=None)
    heat_map(fig, str_df['multiallelicness'], str_df['abs_pip_diff'],
             'post_finemapping/results/str_pip_abs_diff_heatmap.png')

    fig = bokeh.plotting.figure(title='PIP abs diff',
                                width=size,
                                height=size,
                                x_axis_label='multiallelicness',
                                y_axis_label='absolute PIP difference',
                                tools='',
                                toolbar_location=None)