def test_concat():
    """Vertical concat stacks the rows and leaves the input frame unchanged."""
    frame = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3], "c": [1, 2, 3]})
    stacked = pl.concat([frame, frame])
    assert stacked.shape == (6, 3)

    # The source frame must keep its shape after being concatenated.
    source = pl.from_rows(((1, 2), (1, 2)))
    pl.concat([source, source, source])
    assert source.shape == (2, 2)
def test_lazy_concat(df: pl.DataFrame) -> None:
    """Concatenating two lazy views of ``df`` equals stacking ``df`` on a clone."""
    n_rows, n_cols = df.shape

    result = pl.concat([df.lazy(), df.lazy()]).collect()
    assert result.shape == (n_rows * 2, n_cols)

    expected = df.vstack(df.clone())
    assert result.frame_equal(expected, null_equal=True)
def build_gene_annotation_df(pset_dict):
    """
    Build a table mapping each gene in a dataset to its gene annotations.

    The `rowData` table of every molecular profile is reduced to its
    `.features` column and padded with all-null `symbol`, `gene_seq_start`
    and `gene_seq_end` columns. The per-profile tables are then stacked,
    `.features` is renamed to `gene_id`, everything from the first '.' in
    each gene id is removed, and duplicate rows are dropped.

    @param pset_dict: [`dict`] A nested dictionary containing all tables in the PSet
    @return: [`DataFrame`] A table of all gene annotations, mapped to genes
        (converted to pandas before being returned)
    """
    # Compiled once here because it is applied to every gene id below
    version_suffix = re.compile(r'\..*$')

    # Extract the all molecular data types for the pSet
    profiles = pset_dict['molecularProfiles']
    df_list = [
        pl.from_pandas(profiles[mDataType]['rowData'])
        for mDataType in profiles
    ]

    # Get columns of interest, add columns needed later
    for i, profile_df in enumerate(df_list):
        features_df = profile_df.select(['.features'])
        empty_column = [None] * len(features_df['.features'])
        features_df['symbol'] = pl.Series('symbol', empty_column, dtype=pl.Utf8)
        features_df['gene_seq_start'] = pl.Series(
            'gene_seq_start', empty_column, dtype=pl.Int64)
        features_df['gene_seq_end'] = pl.Series(
            'gene_seq_end', empty_column, dtype=pl.Int64)
        df_list[i] = features_df

    # Merge to a single DataFrame
    gene_annotation_df = pl.concat(df_list).rename({'.features': 'gene_id'})

    # Remove Ensembl gene version
    gene_annotation_df['gene_id'] = gene_annotation_df['gene_id'] \
        .apply(lambda gene_id: version_suffix.sub('', gene_id))

    return gene_annotation_df.drop_duplicates().to_pandas()
def concat_and_sort(blocks: List["pyarrow.Table"], key: "SortKeyT", descending: bool) -> "pyarrow.Table":
    """Concatenate Arrow tables with polars and sort them by one column.

    Only the first entry of ``key`` is consulted, and only its first element
    (the column name); ``descending`` decides the sort direction.
    """
    check_polars_installed()
    sort_column, _unused = key[0]
    frames = [pl.from_arrow(table) for table in blocks]
    combined = pl.concat(frames)
    ordered = combined.sort(sort_column, reverse=descending)
    return ordered.to_arrow()
def main():
    """Load every listed region, concatenate them and write one TSV.

    The first positional argument names a JSON file holding
    ``[pheno, region, dir]`` triples; the second is the output path. The
    urate region ``4_8165642_11717761`` is skipped, and the two PIP columns
    are rounded to 4 decimals before writing.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('json_dirs_list_fname')
    arg_parser.add_argument('outfname')
    cli = arg_parser.parse_args()

    with open(cli.json_dirs_list_fname) as handle:
        entries = json.loads(handle.read().strip())

    region_frames = []
    for pheno, region, dir_ in entries:
        is_skipped = pheno == 'urate' and region == '4_8165642_11717761'
        if not is_skipped:
            print(f'Loading region {pheno}: {region}', flush=True)
            region_frames.append(load_dir(pheno, region, dir_))

    rounded_pips = [
        pl.col('susie_pip').round(4),
        pl.col('susie_cs_pip').round(4),
    ]
    combined = pl.concat(region_frames).with_columns(rounded_pips)
    combined.collect().to_csv(cli.outfname, sep='\t')
def test_unset_sorted_on_append() -> None:
    """Group counts stay correct after concatenating two key-sorted frames."""

    def sorted_frame(values):
        # Same categorical keys every time; only the payload column differs.
        return pl.DataFrame([
            pl.Series("key", ["a", "b", "a", "b"], dtype=pl.Categorical),
            pl.Series("val", values),
        ]).sort("key")

    left = sorted_frame([1, 2, 3, 4])
    right = sorted_frame([5, 6, 7, 8])

    combined = pl.concat([left, right], rechunk=False)
    counts = combined.groupby("key").count()["count"].to_list()
    assert counts == [4, 4]
def test_expression_appends() -> None:
    """Check chunk counts of ``append`` vs ``concat`` at the expression level."""
    frame = pl.DataFrame({"a": [1, 1, 2]})

    # A plain append keeps the two pieces as separate chunks ...
    appended = frame.select(pl.repeat(None, 3).append(pl.col("a")))
    assert appended.n_chunks() == 2

    # ... an explicit rechunk merges them ...
    rechunked = frame.select(pl.repeat(None, 3).append(pl.col("a")).rechunk())
    assert rechunked.n_chunks() == 1

    # ... and concat yields a single chunk with the values in order.
    concatenated = frame.select(pl.concat([pl.repeat(None, 3), pl.col("a")]))
    assert concatenated.n_chunks() == 1
    values = concatenated.to_series().to_list()
    assert values == [None, None, None, 1, 1, 2]
def test_predicate_count_vstack() -> None:
    """A windowed count filter sees the rows of both concatenated frames."""
    first = pl.DataFrame({"k": ["x", "y"], "v": [3, 2]}).lazy()
    second = pl.DataFrame({"k": ["x", "y"], "v": [5, 7]}).lazy()

    # Each key occurs once per frame, so twice after concatenation.
    appears_twice = pl.count().over("k") == 2
    kept = pl.concat([first, second]).filter(appears_twice).collect()

    assert kept["v"].to_list() == [3, 2, 5, 7]
def test_concat_horizontal() -> None:
    """Horizontal concat pads the shorter frame's columns with nulls."""
    short = pl.DataFrame({"a": ["a", "b"], "b": [1, 2]})
    tall = pl.DataFrame({"c": [5, 7, 8, 9], "d": [1, 2, 1, 2], "e": [1, 2, 1, 2]})

    expected = pl.DataFrame({
        "a": ["a", "b", None, None],
        "b": [1, 2, None, None],
        "c": [5, 7, 8, 9],
        "d": [1, 2, 1, 2],
        "e": [1, 2, 1, 2],
    })

    result = pl.concat([short, tall], how="horizontal")
    assert result.frame_equal(expected)
def test_diag_concat() -> None:
    """Diagonal concat unions the columns and fills the gaps with nulls."""
    frames = [
        pl.DataFrame({"a": [1, 2]}),
        pl.DataFrame({"b": ["a", "b"], "c": [1, 2]}),
        pl.DataFrame({"a": [5, 7], "c": [1, 2], "d": [1, 2]}),
    ]

    expected = pl.DataFrame({
        "a": [1, 2, None, None, 5, 7],
        "b": [None, None, "a", "b", None, None],
        "c": [None, None, 1, 2, 1, 2],
        "d": [None, None, None, None, 1, 2],
    })

    result = pl.concat(frames, how="diagonal")
    assert result.frame_equal(expected, null_equal=True)
def test_categorical_lexical_ordering_after_concat() -> None:
    """Sorting on a lexically ordered categorical works after a lazy concat."""

    def lexical_frame(key1, key2):
        # Lazy frame whose "key2" column is a lexically ordered categorical.
        eager = pl.DataFrame([pl.Series("key1", key1), pl.Series("key2", key2)])
        as_lexical = pl.col("key2").cast(pl.Categorical).cat.set_ordering("lexical")
        return eager.lazy().with_column(as_lexical)

    with pl.StringCache():
        first = lexical_frame([8, 5], ["fox", "baz"])
        second = lexical_frame([6, 8, 6], ["fox", "foo", "bar"])

        combined = pl.concat([first, second])
        combined = combined.with_column(pl.col("key2").cat.set_ordering("lexical"))
        collected = combined.collect()

        # Only checks that the sort runs; the result is not inspected.
        collected.sort(["key1", "key2"])
def test_dtype_concat_3735() -> None:
    """Concatenating two single-column frames works for each numeric dtype."""
    numeric_dtypes = (
        pl.Int8, pl.Int16, pl.Int32, pl.Int64,
        pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64,
        pl.Float32, pl.Float64,
    )
    for dtype in numeric_dtypes:
        head = pl.DataFrame([pl.Series("val", [1, 2], dtype=dtype)])
        tail = pl.DataFrame([pl.Series("val", [3, 4], dtype=dtype)])
        combined = pl.concat([head, tail])
        assert combined.shape == (4, 1)
# Distinct loci from a header-less TSV; the first two auto-named columns are
# renamed to chrom/pos.
loci = pl.read_csv(args.str_loci, sep='\t', has_header=False).distinct().rename({
    'column_1': 'chrom',
    'column_2': 'pos'
})

# The fine-mapping regions do not depend on the chromosome being processed,
# so the file is read once here instead of once per loop iteration.
regions = list(zip(
    *pl.read_csv(args.finemapping_regions, sep='\t').select(
        ['chrom', 'start', 'end']).to_dict(False).values()))

# For each autosome, write the loci that fall inside any of its regions
# (bounds inclusive), sorted by position.
for chrom in range(1, 23):
    out_fname = f'{args.outdir}/{args.phenotype}_chr{chrom}.tab'
    dfs = [
        loci.filter((pl.col('pos') >= start) & (pl.col('pos') <= end) &
                    (pl.col('chrom') == chrom))
        for (region_chrom, start, end) in regions
        if region_chrom == chrom
    ]
    if dfs:
        pl.concat(dfs).sort('pos').to_csv(out_fname, sep='\t')
    else:
        # No region on this chromosome: write an empty chrom/pos table so the
        # output file still exists.
        pl.DataFrame({
            'chrom': [],
            'pos': []
        }).to_csv(out_fname, sep='\t')

# Earlier approach, kept as an unused string literal.
'''
df.filter(
    (pl.col('phenotype') == phenotype) & (pl.col('chrom') == chrom)
).select(['chrom', 'pos']).sort('pos').to_csv(, sep='\t')
'''
import bokeh.plotting import numpy as np import polars as pl import scipy.stats parser = argparse.ArgumentParser() parser.add_argument('outdir') parser.add_argument('chrom_files', nargs = '+', help='4 cols: pos, chance of length confusionn, avg abs length confusion, normalized avg abs lenght confusion') args = parser.parse_args() outdir = args.outdir chrom_fnames = args.chrom_files loci = pl.concat([ pl.scan_csv( chrom_fname, sep='\t' ) for chrom_fname in chrom_fnames ]).drop('pos').collect() for col in loci.columns: print(f'Plotting column {col} ...', flush=True) max_val = loci.select(pl.col(col).max()).to_numpy() min_val = loci.select(pl.col(col).min()).to_numpy() n_steps = 1000 step_size = (max_val - min_val)/n_steps xs = np.arange(min_val, max_val + step_size, step_size) ys = scipy.stats.gaussian_kde(loci[col].to_numpy())(xs) if col.startswith('chance'): unit = '%' elif col.startswith('avg'):
def test_concat() -> None:
    """Concatenating a Series with itself doubles the length, not the input."""
    series = pl.Series("a", [2, 1, 3])
    doubled = pl.concat([series, series])
    assert doubled.len() == 6
    # The original series must keep its length.
    assert series.len() == 3
def main():
    """Add other-ethnicity association columns to a table of hits.

    Command-line arguments, in order: output table, output README,
    pos -> snpstr_pos mapping table, input table, input README, and a file
    whose first line is a JSON dict mapping ``'__'``-joined keys to spot-test
    result file names.

    The output README is the input README followed by descriptions of the
    added columns. The output table holds the input columns plus the combined
    p-value and effect-direction columns and one allele-frequency column per
    ethnicity.

    Relies on ``other_ethnicities``, ``fix_cols`` and
    ``reformat_dosage_dict_str``, none of which are defined in this function.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('outtable')
    parser.add_argument('outreadme')
    parser.add_argument('pos_to_snpstr_pos')
    parser.add_argument('intable')
    parser.add_argument('inreadme')
    parser.add_argument('spot_test_fname_json_dict_fname')
    args = parser.parse_args()

    # Only the first line of the file is read; it is parsed as JSON further down.
    with open(args.spot_test_fname_json_dict_fname) as json_file:
        spot_test_fname_json_dict = next(json_file)

    # Copy the input README, then append the descriptions of the new columns.
    with open(args.outreadme, 'w+') as readme:
        with open(args.inreadme) as inreadme:
            readme.write(inreadme.read())
        readme.write(
            'other_ethnic_association_ps - association p-values for the other '
            'ethnicities in the order ' + ','.join(other_ethnicities) + '\n'
        )
        readme.write(
            'other_ethnic_effect_directions - direction of association (+/-) '
            'for the other ethnicities in the order ' + ','.join(other_ethnicities) +
            " (NaN if that ethnicity's p > 0.05)\n"
        )
        for ethnicity in other_ethnicities:
            readme.write(
                f'{ethnicity}_population_allele_frequencies - frequencies of each allele '
                "(by dosage) among the ethnicity's tested population\n"
            )

    hits = pl.scan_csv(
        args.intable,
        sep='\t',
        # hack added arguments here that will be ignored when reading putatively_causal but not when reading exonic_finemapped
        dtype={'alleles': str}
    )
    # Remember the input columns so the final select can emit them first.
    cols = hits.columns

    # hack to only clean in one of the two cases this function is running
    if 'white_brit_allele_frequencies' in cols:
        # Swap double quotes for single quotes in that column.
        hits = hits.with_column(
            pl.col('white_brit_allele_frequencies').str.replace_all('"', "'")
        )

    # Attach the columns of the pos_to_snpstr_pos table (the later joins use
    # its snpstr_pos column) by matching start_pos against its pos column.
    hits = hits.join(
        pl.scan_csv(args.pos_to_snpstr_pos, sep='\t'),
        how='left',
        left_on=['chrom', 'start_pos'],
        right_on=['chrom', 'pos']
    )

    # Keys are split on '__'; the comprehension below unpacks each key as a
    # 4-tuple whose first element is the phenotype and last is the ethnicity.
    spot_tests_fnames = {
        tuple(key.split('__')): fname
        for key, fname in json.loads(spot_test_fname_json_dict).items()
    }

    # One lazy frame per ethnicity: the concatenation of that ethnicity's
    # spot-test files across all phenotypes.
    spot_tests = {}
    for outer_ethnicity in other_ethnicities:
        spot_tests[outer_ethnicity] = pl.concat([
            (pl.scan_csv(
                spot_test_fname,
                sep='\t',
                dtype={'alleles': str},
                null_values=['nan'],
                # NOTE(review): the lambda closes over the comprehension
                # variable `phenotype`; this is only correct if polars calls
                # it before the comprehension advances — confirm.
                with_column_names=lambda cols: list(fix_cols(cols, phenotype))
            ).select([
                pl.lit(phenotype).alias('phenotype'),
                'chrom',
                'pos',
                pl.col('p_phenotype').cast(float).alias(f'{ethnicity}_p'),
                # NaN when p >= 0.05, otherwise '+' for a positive coefficient
                # and '-' for anything else.
                pl.when(pl.col('p_phenotype') >= 0.05).then(np.nan).when(pl.col('coeff_phenotype') > 0).then(pl.lit('+')).otherwise(pl.lit('-')).alias(f'{ethnicity}_effect_direction'),
                pl.col('subset_total_per_allele_dosages').apply(reformat_dosage_dict_str).alias(f'{ethnicity}_population_allele_frequencies')
            ]))
            for (phenotype, _, _, ethnicity), spot_test_fname in spot_tests_fnames.items()
            if ethnicity == outer_ethnicity
        ])

    # Left-join every ethnicity's results onto the hits.
    for ethnicity in other_ethnicities:
        hits = hits.join(
            spot_tests[ethnicity],
            how='left',
            left_on=['phenotype', 'chrom', 'snpstr_pos'],
            right_on=['phenotype', 'chrom', 'pos']
        )

    # Build the two combined columns from per-ethnicity '<value>, ' pieces and
    # strip the trailing ', '.
    # NOTE(review): presumably pl.sum over a list of expressions folds them
    # with `+` (row-wise string concatenation) — confirm for the polars
    # version in use.
    hits = hits.with_columns([
        pl.sum([pl.col(f'{ethnicity}_p').cast(str) + pl.lit(', ') for ethnicity in other_ethnicities])
            .str.replace(', $', '').alias('other_ethnic_association_ps'),
        pl.sum([pl.col(f'{ethnicity}_effect_direction').cast(str) + pl.lit(', ') for ethnicity in other_ethnicities])
            .str.replace(', $', '').alias('other_ethnic_effect_directions')
    ])

    # Original columns first, then the new ones; the lazy query runs here.
    hits = hits.select([
        *cols,
        'other_ethnic_association_ps',
        'other_ethnic_effect_directions',
        *[f'{ethnicity}_population_allele_frequencies' for ethnicity in other_ethnicities]
    ]).collect()

    # The joins must not have changed the number of rows of the input table.
    assert hits.shape[0] == pl.read_csv(
        args.intable,
        sep='\t',
        # same hack as above
        dtype = {'alleles': str}
    ).shape[0]

    hits.to_csv(args.outtable, sep='\t',)
help='cols: chrom, pos, FINEMAP_pcausal, SuSiE_CS_pcausal') parser.add_argument('pos_to_snpstr_pos', help='cols: chrom, pos, snpstr_pos') parser.add_argument( 'chrom_tables', nargs='+', help= 'In chromosome order. 4 cols: pos, chance of length confusionn, avg abs length confusion, normalized avg abs length confusion' ) args = parser.parse_args() outdir = args.outdir results_fname = args.results_table chrom_fnames = args.chrom_tables loci = pl.concat([ pl.scan_csv(chrom_fname, sep='\t').with_column( pl.lit(chrom_num + 1).cast(int).alias('chrom')) for chrom_num, chrom_fname in enumerate(chrom_fnames) ]).collect() pos_to_snpstr_pos = pl.scan_csv(args.pos_to_snpstr_pos, sep='\t').collect() cols = ['normalized_avg_abs_length_confusion', 'chance_of_length_confusion'] #loci_cols = [col for col in loci.columns if col != 'pos' and col != 'chrom'] results = pl.scan_csv(results_fname, sep='\t', null_values='NA').with_column( pl.when(pl.col('SuSiE_CS_pcausal').is_null()).then(0).otherwise( pl.col('SuSiE_CS_pcausal')).alias('SuSiE_pcausal')).with_column( (pl.col('FINEMAP_pcausal') - pl.col('SuSiE_pcausal')).alias('discrepancy')).with_column( pl.col('discrepancy').abs().alias('abs_discrepancy')).filter( (pl.col('FINEMAP_pcausal') >= .8)
dna_structures, how='left', left_on=['canonical_unit'], right_on=['repeat_unit'], suffixes=['', '_other'] ) ''' loci_summary_dfs = [] for chrom in range(1, 23): distribution_stats = pl.read_csv( f'{ukb}/export_scripts/intermediate_results/chr{chrom}_loci_summary.tab', sep='\t', ) loci_summary_dfs.append(distribution_stats) loci_summaries = pl.concat(loci_summary_dfs) n_before = all_STRs.shape[0] all_STRs = all_STRs.join(loci_summaries, how='left', left_on=['chrom', 'SNPSTR_start_pos'], right_on=['chr', 'pos'], suffix='_other') assert n_before == all_STRs.shape[0] print('Calculating mean lens ... ', flush=True, end='') all_STRs = all_STRs.with_column( pl.Series([ sum(key * val for (key, val) in ast.literal_eval(allele_dist).items()) for allele_dist in all_STRs['allele_dist'] ]).alias('mean_len')) print('done', flush=True)
def main(): parser = argparse.ArgumentParser() parser.add_argument('phenotypes', nargs='+') phenotypes = parser.parse_args().phenotypes all_dfs = [] susie_cs_min_abs_corrs = [] finemap_cs_coverages = [] unconverged_regions = [] #underexplored_regions = [] unfinished_regions = [] for phenotype in phenotypes: pheno_dfs = [] str_assocs = pl.scan_csv( f'{ukb}/association/results/{phenotype}/my_str/results.tab', sep='\t', ).select([ pl.lit(phenotype).alias('phenotype'), 'chrom', 'pos', pl.col(f'p_{phenotype}').alias('p_val'), pl.lit(True).alias('is_STR'), pl.lit(None).cast(int).alias('reflen'), pl.lit(None).cast(int).alias('altlen') ]) snp_assocs = pl.scan_csv( f'{ukb}/association/results/{phenotype}/plink_snp/results.tab', sep='\t', null_values='NA', ).select([ pl.col('#CHROM').alias('chrom'), pl.col('POS').alias('pos'), pl.col('REF').str.lengths().cast(int).alias('reflen'), pl.col('ALT').str.lengths().cast(int).alias('altlen'), pl.col('P').alias('p_val'), ]).groupby(['chrom', 'pos', 'reflen', 'altlen']).agg([ pl.col('p_val').min().alias('p_val'), ]).with_columns([ pl.lit(phenotype).alias('phenotype'), pl.lit(False).alias('is_STR') ]).select([ 'phenotype', 'chrom', 'pos', 'p_val', 'is_STR', 'reflen', 'altlen' ]) assocs = pl.concat([str_assocs, snp_assocs ]).filter(pl.col('p_val') <= p_val_thresh) regions_df = pl.read_csv(f'{ukb}/signals/regions/{phenotype}.tab', sep='\t') for chrom, start, end, any_strs in zip(regions_df['chrom'], regions_df['start'], regions_df['end'], regions_df['any_strs']): if not any_strs: continue converged_fname = f'{ukb}/finemapping/susie_results/{phenotype}/{chrom}_{start}_{end}/converged.txt' if not os.path.exists(converged_fname): unfinished_regions.append((phenotype, chrom, start, end)) continue with open(converged_fname) as converged_file: if not next(converged_file).strip() == 'TRUE': unconverged_regions.append((phenotype, chrom, start, end)) continue print(f'Loading {phenotype} region {chrom}:{start}-{end}', flush=True) with open( 
f'{ukb}/finemapping/susie_results/{phenotype}/{chrom}_{start}_{end}/colnames.txt' ) as var_file: susie_vars = [line.strip() for line in var_file] alphas = pl.scan_csv( f'{ukb}/finemapping/susie_results/{phenotype}/{chrom}_{start}_{end}/alpha.tab', sep='\t', has_header=False).collect().to_numpy().T n_alphas = alphas.shape[1] susie_pips = 1 - np.prod(1 - alphas, axis=1) assert susie_pips.shape[0] == len(susie_vars) susie_idx = np.arange(len(susie_vars)) + 1 susie_df = pl.DataFrame({ 'varname': susie_vars, 'susie_pip': susie_pips, 'susie_alpha': np.zeros(len(susie_vars)), 'susie_cs': [-1] * len(susie_vars), 'susie_idx': susie_idx, **{f'alpha_{i}': alphas[:, i] for i in range(n_alphas)} }).lazy() finemap_df = pl.scan_csv( f'{ukb}/finemapping/finemap_results/{phenotype}/{chrom}_{start}_{end}/finemap_output.snp', sep=' ').select([ pl.col('rsid').alias('varname'), pl.col('prob').alias('finemap_pip') ]) df = susie_df.join(finemap_df, how='inner', on=[ 'varname' ]).with_columns([ pl.col('varname').str.extract('^[^_]*_([^_]*)', 1).cast(int).alias('pos'), pl.col('varname').str.extract( '^[^_]*_[^_]*_([^_]*)_.*', 1).str.lengths().cast(int).alias('reflen'), pl.col('varname').str.extract( '^[^_]*_[^_]*_[^_]*_([^_]*)', 1).str.lengths().cast(int).alias('altlen'), pl.col('varname').str.contains('^STR').alias('is_STR'), pl.lit(f'{phenotype}_{chrom}_{start}_{end}').alias('region'), pl.lit(chrom).alias('chrom').cast(int), pl.lit(phenotype).alias('phenotype') ]).sort('susie_idx') real_cs_count = 0 for cs_fname in glob.glob( f'{ukb}/finemapping/susie_results/{phenotype}/{chrom}_{start}_{end}/cs*.txt' ): cs_id = int(cs_fname.split('cs')[-1].split('.')[0]) with open(cs_fname) as cs_file: # susie uses 1 based indexing, python uses 0 # make sure cs idxs are in increasing order cs_susie_idx = np.array( [int(idx) for idx in next(cs_file).strip().split()]) assert np.all(cs_susie_idx[1:] - cs_susie_idx[:-1] > 0) cs_susie_idx = pl.Series('cs_susie_idx', cs_susie_idx) next(cs_file) # skip cs 
credibility min_abs_corr, _, _ = [ float(idx) for idx in next(cs_file).strip().split() ] susie_cs_min_abs_corrs.append(min_abs_corr) finemap_cs_coverages.append( df.filter(pl.col('susie_idx').is_in(cs_susie_idx)).select( pl.col('finemap_pip').sum()).collect()) df = df.with_column( pl.when(pl.col('susie_idx').is_in(cs_susie_idx)).then( pl.when( pl.col(f'alpha_{cs_id-1}') > pl.col('susie_alpha') ).then(pl.col(f'alpha_{cs_id-1}')).otherwise( pl.col('susie_alpha'))).otherwise( pl.col('susie_alpha')).alias('susie_alpha')) if min_abs_corr < corr_cutoff: continue real_cs_count += 1 # could worry about variants being in multiple CSes df = df.with_column( pl.when(pl.col('susie_idx').is_in(cs_susie_idx)).then( cs_id).otherwise(pl.col('susie_cs')).alias('susie_cs')) pheno_dfs.append(df) ''' if real_cs_count >= 10: underexplored_regions.append((phenotype, chrom, start, end)) ''' pheno_dfs = [ df.select(pl.col('*').exclude('^alpha.*$')) for df in pheno_dfs ] pheno_df = pl.concat(pheno_dfs).join( assocs, how='left', on=['phenotype', 'chrom', 'is_STR', 'pos', 'reflen', 'altlen']).collect() all_dfs.append(pheno_df) del df, susie_df, finemap_df, assocs, pheno_dfs, pheno_df susie_cs_min_abs_corrs = np.array(susie_cs_min_abs_corrs) finemap_cs_coverages = np.array(finemap_cs_coverages) total_df = pl.concat(all_dfs) #total_assocs = pl.concat(all_assocs).filter(pl.col('p_val') <= p_val_thresh) '''' start_time = time.time() print('Gathering data ... ', flush=True) total_df = total_df.join( total_assocs, how='left', on=['phenotype', 'chrom', 'is_STR', 'pos', 'reflen', 'altlen'] ).collect() print(f'Done. 
Time: {time.time() - start_time:.2}') ''' total_df.filter( ~pl.col('p_val').is_null() & (pl.col('p_val') <= p_val_thresh)).to_csv( f'{ukb}/post_finemapping/intermediate_results/gathered_data.tab', sep='\t') print( 'Any vars with null Ps?', total_df.select(pl.col('p_val').is_null().alias('null?')).select( pl.any('null?').alias('any_nulls'))['any_nulls'][0]) print( 'n regions', total_df.select( pl.col('region').unique().count().alias('region_count')) ['region_count'][0]) cses_per_region = total_df.filter( pl.col('susie_cs') >= 0).filter(~pl.col('p_val').is_null()).groupby([ 'susie_cs', 'region' ]).agg( pl.col('p_val').min().alias('min_p'), ).filter(pl.col('min_p') <= p_val_thresh).groupby('region').agg( pl.col('region').count().alias('n_cses')).to_dict(False)['n_cses'] print( f'avg cses (total PIP >= .9, min_p_val of CS members <= {p_val_thresh}) per region {np.mean(cses_per_region)}, ({np.std(cses_per_region)})' ) for filter_, text in ((pl.lit(True), ''), (pl.col('is_STR'), ' STR'), (~pl.col('is_STR'), ' SNP')): susie_hits_per_region = total_df.filter(filter_).with_column( ((pl.col('susie_cs') >= 0) & (pl.col('susie_pip') >= pip_threshold) & (pl.col('p_val') <= p_val_thresh) ).alias('susie_hit')).groupby('region').agg( pl.col('susie_hit').sum().alias('n_susie_hits')).to_dict( False)['n_susie_hits'] print( f'avg susie{text} hits (var is in a CS, PIP >= {pip_threshold}, p_val <= {p_val_thresh}) per region {np.mean(susie_hits_per_region)}, ({np.std(susie_hits_per_region)})' ) finemap_hits_per_region = total_df.filter(filter_).with_column( ((pl.col('finemap_pip') >= pip_threshold) & (pl.col('p_val') <= p_val_thresh) ).alias('finemap_hit')).groupby('region').agg( pl.col('finemap_hit').sum().alias('n_finemap_hits')).select( 'n_finemap_hits').to_numpy() print( f'avg finemap{text} hits (PIP >= {pip_threshold}, p_val <= {p_val_thresh}) per region {np.mean(finemap_hits_per_region)}, ({np.std(finemap_hits_per_region)})' ) print('Exporting FINEMAP vs SuSiE PIP plots', 
flush=True) comparison_thresh = 0.3 title = f'{text} with p-val <= {p_val_thresh} where at least one of SuSiE or FINEMAP PIP >= {comparison_thresh}' if text == '': title = 'Vars ' + title fig = bokeh.plotting.figure( width=1200, height=1200, title=title, x_axis_label='FINEMAP PIPs', y_axis_label='SuSiE PIPs', ) fig.title.text_font_size = '30px' fig.axis.axis_label_text_font_size = '26px' fig.axis.major_label_text_font_size = '20px' fig.background_fill_color = None fig.border_fill_color = None fig.ygrid.grid_line_color = None fig.xgrid.grid_line_color = None fig.toolbar.logo = None fig.toolbar_location = None print(total_df.filter(filter_)) print(total_df.filter(filter_ & (pl.col('p_val') <= p_val_thresh))) pips = total_df.filter(filter_ & (pl.col('p_val') <= p_val_thresh) & ((pl.col('finemap_pip') >= comparison_thresh) | ((pl.col('susie_pip') >= comparison_thresh) & (pl.col('susie_cs') >= 0)))).select( ['susie_pip', 'finemap_pip']) print(pips) bin_size = .05 bins = bokeh.util.hex.hexbin( pips['finemap_pip'].to_numpy().reshape(-1), pips['susie_pip'].to_numpy().reshape(-1), size=bin_size) palette = [ linear_int_interpolate((134, 204, 195), (9, 41, 46), i / 254) for i in range(-1, 255) ] cmap = bokeh.transform.log_cmap('counts', palette=palette, low=1, high=max(bins.counts), low_color=(255, 255, 255)) color_mapper = bokeh.models.LogColorMapper(palette=palette, low=1, high=max(bins.counts)) fig.hex_tile(q='q', r='r', size=bin_size, line_color=None, source=bins, fill_color=cmap) color_bar = bokeh.models.ColorBar(color_mapper=color_mapper, width=70, major_label_text_font_size='20px') fig.add_layout(color_bar, 'right') ext = text.replace(' ', '_') bokeh.io.export_png( fig, filename= f'{ukb}/export_scripts/results/finemap_pip_vs_susie_pip{ext}.png') bokeh.io.export_svg( fig, filename= f'{ukb}/export_scripts/results/finemap_pip_vs_susie_pip{ext}.svg') print(f'unconverged regions: {unconverged_regions}') print(f'unfinished regions: {unfinished_regions}') 
#print(f'underexplored regions: {underexplored_regions}') fig = bokeh.plotting.figure( width=1200, height=1200, title='SuSiE credible set min absolute correlations', x_axis_label='min absolute correlation', y_axis_label='# credible sets', ) fig.axis.axis_label_text_font_size = '30px' fig.background_fill_color = None fig.border_fill_color = None fig.grid.grid_line_color = None fig.toolbar_location = None step = 0.01 left_edges = np.arange(0, 1 + step, step) ys = [ np.sum((left_edge <= susie_cs_min_abs_corrs) & (susie_cs_min_abs_corrs < left_edge + step)) for left_edge in left_edges ] fig.quad(top=ys, bottom=0, left=left_edges, right=left_edges + step) print('Exporting cs plots', flush=True) bokeh.io.export_png( fig, filename=f'{ukb}/export_scripts/results/cs_min_abs_corrs.png') bokeh.io.export_svg( fig, filename=f'{ukb}/export_scripts/results/cs_min_abs_corrs.svg') fig = bokeh.plotting.figure( width=1200, height=1200, title= f'Number of SuSie CSes min absolute corr >= {corr_cutoff} per region', x_axis_label='# cses in the region', y_axis_label='# regions', ) fig.axis.axis_label_text_font_size = '30px' fig.background_fill_color = None fig.border_fill_color = None fig.grid.grid_line_color = None fig.toolbar_location = None left_edges = np.arange(0, max(cses_per_region) + 1) ys = [ np.sum((left_edge <= cses_per_region) & (cses_per_region < left_edge + 1)) for left_edge in left_edges ] fig.quad(top=ys, bottom=0, left=left_edges, right=left_edges + 1) print('Exporting cs per region plots', flush=True) bokeh.io.export_png( fig, filename=f'{ukb}/export_scripts/results/cses_per_region.png') bokeh.io.export_svg( fig, filename=f'{ukb}/export_scripts/results/cses_per_region.svg') fig = bokeh.plotting.figure( width=1200, height=1200, title=f'Number of FINEMAP vars with PIP >= {pip_threshold} per region', x_axis_label='# hits in the region', y_axis_label='# regions', ) fig.axis.axis_label_text_font_size = '30px' fig.background_fill_color = None fig.border_fill_color = None 
fig.grid.grid_line_color = None fig.toolbar_location = None left_edges = np.arange(0, max(finemap_hits_per_region) + 1) ys = [ np.sum((left_edge <= finemap_hits_per_region) & (finemap_hits_per_region < left_edge + 1)) for left_edge in left_edges ] fig.quad(top=ys, bottom=0, left=left_edges, right=left_edges + 1) print('Exporting finemap hits per region plots', flush=True) bokeh.io.export_png( fig, filename=f'{ukb}/export_scripts/results/finemap_hits_per_region.png') bokeh.io.export_svg( fig, filename=f'{ukb}/export_scripts/results/finemap_hits_per_region.svg') fig = bokeh.plotting.figure( width=1200, height=1200, title= f'FINEMAP total PIPs for SuSiE CSes with min_abs_corr >= {corr_cutoff}', x_axis_label='FINEMAP PIPs', y_axis_label='# credible sets', ) fig.background_fill_color = None fig.border_fill_color = None fig.ygrid.grid_line_color = None fig.xgrid.grid_line_color = None fig.toolbar.logo = None fig.toolbar_location = None include = susie_cs_min_abs_corrs >= corr_cutoff max_total_pip = max(1, np.max(finemap_cs_coverages[include])) step = 0.01 left_edges = np.arange(0, max_total_pip + step, step) ys = [ np.sum((left_edge <= finemap_cs_coverages[include]) & (finemap_cs_coverages[include] < left_edge + step)) for left_edge in left_edges ] fig.quad(top=ys, bottom=0, left=left_edges, right=left_edges + step) print('Exporting FINEMAP CS PIP plots', flush=True) bokeh.io.export_png( fig, filename=f'{ukb}/export_scripts/results/susie_cs_finemap_total_pips.png' ) bokeh.io.export_svg( fig, filename=f'{ukb}/export_scripts/results/susie_cs_finemap_total_pips.svg' ) total_cses = np.sum(include) total_cses_large_finemap_pip = np.sum( finemap_cs_coverages[include] >= pip_threshold) print( f'SuSiE CSes with min_abs_corr >= {corr_cutoff} with FINEMAP total PIP >= {pip_threshold}: {total_cses_large_finemap_pip} ({total_cses_large_finemap_pip/total_cses:%})' ) susie_pip_threshold_for_finemap = .3 n_replicates_from_finemap = total_df.filter( (pl.col('susie_cs') >= 0) & 
(pl.col('susie_pip') >= susie_pip_threshold_for_finemap) & (pl.col('finemap_pip') >= pip_threshold)).shape[0] n_finemap_total = total_df.filter( pl.col('finemap_pip') >= pip_threshold).shape[0] print( f'FINEMAP hits with PIP >= {pip_threshold} in a SuSiE CS with abs corr >= {corr_cutoff} and SuSiE PIP >= {susie_pip_threshold_for_finemap}: {n_replicates_from_finemap} ({n_replicates_from_finemap/n_finemap_total:%})' ) for (curr_df, text) in [(total_df, 'all hits no filter'), (total_df.filter(pl.col('p_val') <= 1e-10), 'all hits p<=1e-10')]: print(text) var_thresh1 = .8 var_thresh2 = .3 for susie_thresh in (var_thresh1, var_thresh2): for finemap_thresh in (var_thresh1, var_thresh2): count = curr_df.filter( (pl.col('susie_cs') >= 0) & (pl.col('susie_pip') >= susie_thresh) & (pl.col('finemap_pip') >= finemap_thresh)).shape[0] print( f'Vars in a SuSiE CS with SuSIE PIP >= {susie_thresh} and with FINEMAP PIP >= {finemap_thresh}: {count}' ) for susie_thresh in (var_thresh1, var_thresh2): count = curr_df.filter( (pl.col('susie_cs') >= 0) & (pl.col('susie_pip') >= susie_thresh) & (pl.col('finemap_pip') < var_thresh2)).shape[0] print( f'Vars in a SuSiE CS with SuSIE PIP >= {susie_thresh} with FINEMAP PIP < {var_thresh2}: {count}' ) for finemap_thresh in (var_thresh1, var_thresh2): count = curr_df.filter( (pl.col('finemap_pip') >= finemap_thresh) & ((pl.col('susie_cs') < 0) | (pl.col('susie_pip') < var_thresh2))).shape[0] print( f'Vars with FINEMAP PIP >= {finemap_thresh} either not in a SuSiE CS or having SuSiE PIP <= {var_thresh2}: {count}' ) # Not going to report susie alphas v pips - just know that they're similar if we look # at vars in good credible sets and not otherwise '''
]) associations_df = pl.concat([ pl.scan_csv( f'{ukb}/post_finemapping/intermediate_results/finemapping_all_concordance_{phenotype}.tab', sep='\t', dtypes={ **{f'{ethnicity}_p_val': float for ethnicity in other_ethnicities}, **{f'{ethnicity}_coeff': float for ethnicity in other_ethnicities}, **{f'{ethnicity}_se': float for ethnicity in other_ethnicities} }).filter('is_STR') for phenotype in phenotypes.phenotypes_in_use ]).select([ 'phenotype', 'chrom', 'pos', 'region', 'p_val', 'coeff', 'se', ((pl.col('susie_alpha') >= 0.8) & (pl.col('susie_cs') >= 0) & (pl.col('p_val') <= 5e-8)).alias('finemapped_susie'), ((pl.col('finemap_pip') >= 0.8) & (pl.col('p_val') <= 5e-8)).alias('finemapped_finemap'), *[f'{ethnicity}_p_val' for ethnicity in other_ethnicities], *[f'{ethnicity}_coeff' for ethnicity in other_ethnicities], *[f'{ethnicity}_se' for ethnicity in other_ethnicities], ]).collect() df = associations_df.join(
import os

import polars as pl

import phenotypes

ukb = os.environ['UKB']

# One lazy frame per phenotype, tagged with the phenotype name.
dfs = [
    pl.scan_csv(f'{ukb}/signals/regions/{phenotype}.tab', sep='\t').with_column(
        pl.lit(phenotype).alias('phenotype'))
    for phenotype in phenotypes.phenotypes_in_use
]

# Regions flagged in the 'filtered_due_to_computation_burden' column,
# as (phenotype, chrom, start, end).
flagged_regions = (
    ('total_bilirubin', 12, 19976272, 22524428),
    ('urate', 4, 8165642, 11717761),
    ('alkaline_phosphatase', 1, 19430673, 24309348),
)

# OR together one exact-match predicate per flagged region.
is_flagged = None
for flagged_pheno, flagged_chrom, flagged_start, flagged_end in flagged_regions:
    matches_region = ((pl.col('phenotype') == flagged_pheno) &
                      (pl.col('chrom') == flagged_chrom) &
                      (pl.col('start') == flagged_start) &
                      (pl.col('end') == flagged_end))
    is_flagged = matches_region if is_flagged is None else (is_flagged | matches_region)

all_regions = pl.concat(dfs).collect()
annotated = all_regions.with_column(
    is_flagged.alias('filtered_due_to_computation_burden'))
annotated.select([
    'phenotype', 'chrom', 'start', 'end', 'filtered_due_to_computation_burden'
]).to_csv(
    f'{ukb}/export_scripts/results/supp_table_2_finemapping_regions.tab',
    sep='\t')
fname, sep='\t', skip_rows=1, has_header=False, with_column_names=lambda _: header. replace('0.05_significance_CI', 'foo', 1).replace( '5e-8_significance_CI', 'bar', 1).split( '\t') # these duplicate column names won't be used anyway ).select([ 'chrom', 'pos', pl.col('subset_total_per_allele_dosages').alias( f'{ethnicity}_allele_dosages') ]) df = df.join(assoc_df, how='left', on=['chrom', 'pos']) finemapping_dfs.append(df.collect()) finemapping_results = pl.concat(finemapping_dfs).rename({'pos': 'snpstr_pos'}) finemapping_results = finemapping_results.filter((pl.col('p_val') <= 5e-8) & ( ((pl.col('susie_alpha') >= 0.8) & (pl.col('susie_cs') >= 0)) | (pl.col('finemap_pip') >= 0.8)).any().over(['chrom', 'snpstr_pos'])) pos_table = pl.read_csv(f'{ukb}/snpstr/flank_trimmed_vcf/vars.tab', sep='\t') finemapping_results = finemapping_results.join(pos_table, how='left', on=['chrom', 'snpstr_pos']) repeat_units = pl.read_csv(f'{ukb}/snpstr/repeat_units.tab', sep='\t') finemapping_results = finemapping_results.join(repeat_units, how='left',
parser = argparse.ArgumentParser() parser.add_argument('outprefix') parser.add_argument('results_tables', nargs='+') args = parser.parse_args() other_ethnicities = ['black', 'south_asian', 'chinese', 'irish', 'white_other'] df = pl.concat([ pl.scan_csv( table, sep='\t', dtypes={ **{f'{ethnicity}_p_val': float for ethnicity in other_ethnicities}, **{f'{ethnicity}_coeff': float for ethnicity in other_ethnicities}, **{f'{ethnicity}_se': float for ethnicity in other_ethnicities} } ) for table in args.results_tables ]).filter(pl.col('p_val') <= 1e-10).with_columns([ ((pl.col('susie_alpha') >= 0.8) & (pl.col('susie_cs') >= 0)).alias('susie_result'), (pl.col('finemap_pip') >= 0.8).alias('finemap_result') ]).filter( pl.col('susie_result') | pl.col('finemap_result') ).collect() for var, condition in (('STR', pl.col('is_STR')), ('SNP', ~pl.col('is_STR'))): temp_df = df.filter(condition) s_total = temp_df.filter('susie_result').shape[0] f_total = temp_df.filter('finemap_result').shape[0] shared = temp_df.filter(pl.col('finemap_result') & pl.col('susie_result')).shape[0] plt.figure()
def main():
    """Plot comparisons of SuSiE and FINEMAP PIPs for STRs and SNPs.

    Loads the gathered fine-mapping table, keeping variants whose SuSiE or
    FINEMAP PIP is at least 0.3, joins it with the per-chromosome locus
    summaries and (for STRs) the length-confusion tables, then exports a
    series of PNG figures under ``post_finemapping/results``.

    Uses names defined outside this function: ``size``, ``palette``,
    ``count_alleles``, ``heat_map``, ``weighted_heat_map`` and
    ``linear_int_interpolate``.
    """
    df = pl.scan_csv(
        'post_finemapping/intermediate_results/gathered_data.tab',
        sep='\t'
    ).filter((pl.col('susie_pip') >= 0.3) | (pl.col('finemap_pip') >= 0.3))
    df = df.with_column(
        (pl.col('susie_pip') - pl.col('finemap_pip')).alias('susie_f_pip_diff')
    ).with_column(
        pl.col('susie_f_pip_diff').abs().alias('abs_pip_diff'))

    locus_summary_df = pl.concat([
        pl.scan_csv(
            f'export_scripts/intermediate_results/chr{chrom}_loci_summary.tab',
            sep='\t')
        for chrom in range(1, 23)
    ]).select(['chr', 'pos', 'multiallelicness', 'allele_dist'])

    allele_threshes = (0.0004, 0.002, 0.01, 0.05)
    #allele_threshes = [0.01]
    df = df.join(
        locus_summary_df,
        how='left',
        #left_on=['chrom', 'snpstr_pos'],
        left_on=['chrom', 'pos'],
        right_on=['chr', 'pos']).collect()

    snp_df = df.filter(~pl.col('is_STR'))
    str_df = df.filter(pl.col('is_STR'))
    # Every STR is expected to have found a row in the locus summaries.
    assert not str_df.select(
        pl.col('multiallelicness').is_null().any()).to_numpy()[0]

    # One allele-count column per threshold, derived from 'allele_dist'.
    str_df = str_df.with_columns([
        pl.apply('allele_dist', count_alleles(thresh),
                 pl.UInt32).alias(f'alleles_{thresh}')
        for thresh in allele_threshes
    ])

    confusions = pl.concat([
        pl.scan_csv(f'side_analyses/length_confusion/chr{i}.tab',
                    sep='\t').with_column(pl.lit(i).alias('chrom').cast(int))
        for i in range(1, 23)
    ]).collect()
    merged_df = str_df.join(confusions, how='left', on=['chrom', 'pos'])

    # --- PIP histograms (binned densities drawn as lines) ---
    step = 0.05
    fig = bokeh.plotting.figure(title='STR PIP histogram',
                                width=size,
                                height=size,
                                x_axis_label='PIP',
                                y_axis_label='density',
                                tools='',
                                toolbar_location=None)
    xs = np.arange(0, 1 + step, step)
    for frame, pip_col, color, label in (
        (str_df, 'susie_pip', 'red', 'SuSiE STRs'),
        (str_df, 'finemap_pip', 'blue', 'FINEMAP STRs'),
        (snp_df, 'susie_pip', 'green', 'SuSiE SNPs'),
        (snp_df, 'finemap_pip', 'purple', 'FINEMAP SNPs'),
    ):
        fig.line(
            x=xs[:-1],
            #y=scipy.stats.gaussian_kde(arr)(xs),
            y=np.histogram(frame[pip_col], bins=xs, density=True)[0],
            color=color,
            legend_label=label)
    bokeh.io.export_png(fig,
                        filename='post_finemapping/results/pip_histogram.png')

    # --- STR: FINEMAP PIP (x) against SuSiE PIP (y) ---
    fig = bokeh.plotting.figure(title='STR PIP scatterplot',
                                width=size,
                                height=size,
                                x_axis_label='FINEMAP PIP',
                                y_axis_label='SuSiE PIP',
                                tools='',
                                toolbar_location=None)
    # x must be the FINEMAP PIP and y the SuSiE PIP to agree with the axis
    # labels above (and with every other comparison plot in this function).
    fig.circle(str_df['finemap_pip'], str_df['susie_pip'])
    bokeh.io.export_png(
        fig, filename='post_finemapping/results/str_comp_pip_scatter.png')

    fig = bokeh.plotting.figure(title='STR PIP heatmap',
                                width=size,
                                height=size,
                                x_axis_label='FINEMAP PIP',
                                y_axis_label='SuSiE PIP',
                                match_aspect=True,
                                tools='',
                                toolbar_location=None)
    heat_map(fig, str_df['finemap_pip'], str_df['susie_pip'],
             'post_finemapping/results/str_comp_pip_heatmap.png')

    fig = bokeh.plotting.figure(title='STR PIPs',
                                width=size,
                                height=size,
                                x_axis_label='FINEMAP PIP',
                                y_axis_label='SuSiE PIP',
                                match_aspect=True,
                                tools='',
                                toolbar_location=None)
    weighted_heat_map(
        fig, merged_df['finemap_pip'], merged_df['susie_pip'],
        merged_df['chance_of_length_confusion'],
        'average chance of misgenotyping per sample at any such locus',
        'post_finemapping/results/str_comp_pip_chance_map.png')

    fig = bokeh.plotting.figure(title='STR PIPs',
                                width=size,
                                height=size,
                                x_axis_label='FINEMAP PIP',
                                y_axis_label='SuSiE PIP',
                                match_aspect=True,
                                tools='',
                                toolbar_location=None)
    weighted_heat_map(
        fig, merged_df['finemap_pip'], merged_df['susie_pip'],
        merged_df['normalized_avg_abs_length_confusion'],
        'average number of standard deviations of misgenotyping per sample at any such locus',
        'post_finemapping/results/str_comp_pip_sd_map.png')

    # --- SNP: FINEMAP PIP (x) against SuSiE PIP (y) ---
    fig = bokeh.plotting.figure(title='SNP PIP scatterplot',
                                width=size,
                                height=size,
                                x_axis_label='FINEMAP PIP',
                                y_axis_label='SuSiE PIP',
                                tools='',
                                toolbar_location=None)
    # Same axis convention as the STR scatterplot: x=FINEMAP, y=SuSiE.
    fig.circle(snp_df['finemap_pip'], snp_df['susie_pip'])
    bokeh.io.export_png(
        fig, filename='post_finemapping/results/snp_comp_pip_scatter.png')

    fig = bokeh.plotting.figure(title='SNP PIP heatmap',
                                width=size,
                                height=size,
                                x_axis_label='FINEMAP PIP',
                                y_axis_label='SuSiE PIP',
                                match_aspect=True,
                                tools='',
                                toolbar_location=None)
    heat_map(fig, snp_df['finemap_pip'], snp_df['susie_pip'],
             'post_finemapping/results/snp_comp_pip_heatmap.png')

    # --- STR scatterplot coloured by chance of length confusion ---
    color_mapper = bokeh.models.LinearColorMapper(palette=palette,
                                                  low=0,
                                                  high=1)
    color_bar = bokeh.models.ColorBar(color_mapper=color_mapper, width=30)
    cmap = bokeh.transform.linear_cmap('foo', palette=palette, low=0, high=1)
    fig = bokeh.plotting.figure(title='STR PIP scatterplot',
                                width=size,
                                height=size,
                                x_axis_label='FINEMAP PIP',
                                y_axis_label='SuSiE PIP',
                                tools='',
                                match_aspect=True,
                                toolbar_location=None)
    cb_title = bokeh.models.Title(
        text='chance a genotype call at this locus is wrong', align='center')
    fig.add_layout(color_bar, 'right')
    fig.add_layout(cb_title, 'right')
    cds = bokeh.models.ColumnDataSource(
        dict(x=merged_df['finemap_pip'],
             y=merged_df['susie_pip'],
             color=[
                 linear_int_interpolate((134, 204, 195), (9, 41, 46), val)
                 for val in merged_df['chance_of_length_confusion']
             ]))
    fig.circle(x='x', y='y', color='color', source=cds)
    bokeh.io.export_png(
        fig,
        filename='post_finemapping/results/colored_str_comp_pip_scatter.png')

    # --- PIP-difference densities per allele threshold and PIP threshold ---
    step = 0.05
    for thresh in allele_threshes:
        for pip_thresh in (0.3, 0.8):
            # The PIP filter depends only on pip_thresh, so apply it once here
            # rather than once per curve.
            filter_exp = (pl.col('susie_pip') >= pip_thresh) | (
                pl.col('finemap_pip') >= pip_thresh)
            pip_snp_df = snp_df.filter(filter_exp)
            pip_str_df = str_df.filter(filter_exp)
            for xs, x_label, out_loc, title, col in [
                (
                    np.arange(-1, 1 + step, step),
                    'SuSiE PIP - FINEMAP PIP',
                    f'post_finemapping/results/pip_diff_density_allele_thresh_{thresh}_pip_thresh_{pip_thresh}.png',
                    f'PIP diff, STR allele penetrance threshold = {thresh:.4}',
                    'susie_f_pip_diff',
                ),
                (
                    np.arange(0, 1 + step, step),
                    'absolute PIP difference',
                    f'post_finemapping/results/pip_abs_diff_density_allele_thresh_{thresh}_pip_thresh_{pip_thresh}.png',
                    f'absolute PIP diff, STR allele penetrance threshold = {thresh:.4}',
                    'abs_pip_diff',
                ),
            ]:
                fig = bokeh.plotting.figure(title=title,
                                            width=size,
                                            height=size,
                                            x_axis_label=x_label,
                                            y_axis_label='density',
                                            tools='',
                                            toolbar_location=None)
                snp_arr = pip_snp_df[col].to_numpy()
                fig.line(
                    x=xs[:-1],
                    y=np.histogram(snp_arr, bins=xs, density=True)[0],
                    #y=scipy.stats.gaussian_kde(snp_df['susie_f_pip_diff'].to_numpy())(xs),
                    color='black',
                    # Report the number of SNPs actually plotted (after the
                    # PIP filter), matching how the STR legends count.
                    legend_label=f'SNPs (n={snp_arr.shape[0]})')
                for count, color in ((2, 'brown'), (3, 'red'), (4, 'orange')):
                    arr = pip_str_df.filter(
                        pl.col(f'alleles_{thresh}') == count)[col].to_numpy()
                    fig.line(
                        x=xs[:-1],
                        #y=scipy.stats.gaussian_kde(arr)(xs),
                        y=np.histogram(arr, bins=xs, density=True)[0],
                        color=color,
                        legend_label=f'{count}-allele STRs (n={arr.shape[0]})')
                arr = pip_str_df.filter(
                    pl.col(f'alleles_{thresh}') >= 5)[col].to_numpy()
                fig.line(
                    x=xs[:-1],
                    #y=scipy.stats.gaussian_kde(arr)(xs),
                    y=np.histogram(arr, bins=xs, density=True)[0],
                    color='gold',
                    legend_label=
                    f'STRs with at least 5 alleles (n={arr.shape[0]})')
                fig.add_layout(
                    bokeh.models.Title(
                        text=
                        f'Variants with PIP at least {pip_thresh} for SuSiE or FINEMAP'
                    ), 'below')
                bokeh.io.export_png(fig, filename=out_loc)

    # --- PIP differences against multiallelicness ---
    fig = bokeh.plotting.figure(title='STR PIP diff',
                                width=size,
                                height=size,
                                x_axis_label='multiallelicness',
                                y_axis_label='SuSiE PIP - FINEMAP PIP',
                                tools='',
                                toolbar_location=None)
    heat_map(fig,
             str_df['multiallelicness'],
             str_df['susie_f_pip_diff'],
             'post_finemapping/results/str_pip_diff_heatmap.png',
             y_min=-1)

    fig = bokeh.plotting.figure(title='STR PIP abs diff',
                                width=size,
                                height=size,
                                x_axis_label='multiallelicness',
                                y_axis_label='absolute PIP difference',
                                tools='',
                                toolbar_location=None)
    heat_map(fig, str_df['multiallelicness'], str_df['abs_pip_diff'],
             'post_finemapping/results/str_pip_abs_diff_heatmap.png')

    fig = bokeh.plotting.figure(title='PIP abs diff',
                                width=size,
                                height=size,
                                x_axis_label='multiallelicness',
                                y_axis_label='absolute PIP difference',
                                tools='',
                                toolbar_location=None)
print(coords_df.shape)

qtl_strs = []
yang_dir = '/expanse/projects/gymreklab/yal084_storage/share_with_Jonathan'
qtl_sources = (
    ('eSTR', 'str-gene'),
    ('STR', 'str-exon'),
    ('eISOFORM', 'str-isoform'),
)
for fname, col_name in qtl_sources:
    # The QTL identifier column is '<field_0>-<field_1>'; field_0 is stored as
    # 'hg38' and used as the join key, field_1 becomes the 'target' list below.
    id_parts = pl.col(col_name).str.split_exact('-', 1).struct
    qtl_str = pl.read_csv(
        f'{yang_dir}/{fname}_GB_650pc_combined_fdr10p.csv',
        sep='\t',
    ).with_column(id_parts.field('field_0').alias('hg38'))

    # Join against each of the chrom_pos_{-10..10}_38 columns of coords_df in
    # turn, dropping the other offset columns so the pieces can be stacked.
    offsets = range(-10, 11)
    per_offset_matches = []
    for offset in offsets:
        unused_offset_cols = [
            f'chrom_pos_{other}_38' for other in offsets if other != offset
        ]
        matched = qtl_str.join(coords_df,
                               left_on='hg38',
                               right_on=f'chrom_pos_{offset}_38')
        per_offset_matches.append(matched.drop(unused_offset_cols))
    qtl_str = pl.concat(per_offset_matches)

    # One row per chrom_pos, collecting the per-association values into lists.
    aggregations = [
        pl.col('phenotype').first(),
        pl.col('association_p_value').first(),
        pl.col('p_values').list(),
        pl.col('Tissue').list(),
        pl.col('gene_name').list(),
        id_parts.field('field_1').list().alias('target'),
    ]
    qtl_str = qtl_str.distinct().groupby('chrom_pos').agg(aggregations)
    print(qtl_str.shape)
def write_input_variants(workdir, outdir, gts_dir, readme, phenotype, chrom,
                         start, end, inclusion_threshold, mac, snp_str_ratio,
                         total_prob, use_PACSIN2):
    '''
    Select the STRs and SNPs to fine-map in one region and write FINEMAP's
    input files.

    Writes:
      * the run description to ``readme``
      * ``{workdir}/finemap_input.z`` (space separated variant table)
      * ``{workdir}/finemap_input.master``

    Parameters
    ----------
    workdir : directory the ``.z`` and ``.master`` files are written to
    outdir, gts_dir : directories referenced by the paths inside the
        ``.master`` file
    readme : object with a ``write(str)`` method that receives the run notes
    phenotype : phenotype name; selects the association result files and the
        ``p_/coeff_/se_{phenotype}`` STR columns
    chrom, start, end : region; variants with start <= position <= end on
        ``chrom`` are considered
    inclusion_threshold : only variants with association p-value strictly
        below this are kept
    mac : falsy to disable, otherwise an indexable of
        (threshold, snp_mac_fname, str_mac_fname); variants whose count in
        those files is below the threshold are excluded
    snp_str_ratio : if not None, adds a ``prob`` column with
        1 / (n_strs + ratio * n_snps) for STRs and ratio times that for SNPs
    total_prob : if not None, sets ``prob`` to total_prob / (n_snps + n_strs)
        for every variant
    use_PACSIN2 : if truthy, drops the STR at position 43385872 and adds the
        STRs at 43385866, 43385875 and 43385893 from the PACSIN2 spot test
        results

    Uses the names ``ukb`` and ``sample_utils`` defined outside this function.
    '''
    sample_idx = sample_utils.get_samples_idx_phenotype(
        'white_brits', phenotype)
    n_samples = np.sum(sample_idx)

    if mac:
        mac_threshold = int(mac[0])
        snp_mac_fname = mac[1]
        str_mac_fname = mac[2]

        snps_exclude_mac = pl.scan_csv(
            snp_mac_fname,
            sep='\t').filter(pl.col('ALT_CTS') < mac_threshold).select(
                ('SNP_' + pl.col('#POS').cast(str) + '_' + pl.col('REF') +
                 '_' + pl.col('ALT')
                 ).alias('varname')).collect()['varname'].to_list()
        # need to make that look like a list of strings to polars b/c buggy,
        # so add a single nonsense to it
        snps_exclude_mac.append('asdf')

        strs_exclude_mac = pl.scan_csv(
            str_mac_fname,
            sep='\t').filter(pl.col('mac') < mac_threshold).select(
                'pos').collect()['pos'].to_list()

    plink_results_fname = f'{ukb}/association/results/{phenotype}/plink_snp/results.tab'
    str_results_fname = f'{ukb}/association/results/{phenotype}/my_str/results.tab'
    filter_set_fname = f'{ukb}/finemapping/str_imp_snp_overlaps/chr{chrom}_to_filter.tab'

    with open(f'{workdir}/finemap_input.master', 'w') as finemap_master:
        finemap_master.write('z;ld;snp;config;cred;log;n_samples\n'
                             f'{outdir}/finemap_input.z;'
                             f'{gts_dir}/all_variants.ld;'
                             f'{outdir}/finemap_output.snp;'
                             f'{outdir}/finemap_output.config;'
                             f'{outdir}/finemap_output.cred;'
                             f'{outdir}/finemap_output.log;'
                             f'{n_samples}')

    # Year_month_day. (%M is minutes and %D is a platform-specific
    # mm/dd/yy shorthand, so they must not be used here.)
    today = datetime.datetime.now().strftime("%Y_%m_%d")
    readme.write(
        f'Run date: {today}\n'
        'Manually generating variant-variant LD for each imputed SNP each STR in the region '
        'where an association was successfully '
        f'performed and had p < {inclusion_threshold} and the SNP was not in the filter set\n'
        f'(Filter set at {filter_set_fname})\n'
        'Correlation is STR length dosage vs SNP dosage.\n'
        'Running FINEMAP with that list of imputed SNPs and STRs.\n')

    # load STRs
    strs = pl.scan_csv(
        str_results_fname,
        sep='\t',
        dtypes={
            'locus_filtered': str
        }).filter((pl.col('chrom') == chrom) & (pl.col('pos') >= start) &
                  (pl.col('pos') <= end) &
                  (pl.col('locus_filtered') == 'False') &
                  (pl.col(f'p_{phenotype}') < inclusion_threshold)).select([
                      ('STR_' + pl.col('pos').cast(str)).alias('rsid'),
                      # zero-pad the chromosome to two characters
                      ('0' + pl.col('chrom').cast(str)
                       ).str.slice(-2).alias('chromosome'),
                      pl.col('pos').alias('position'),
                      pl.lit('nan').alias('allele1'),
                      pl.lit('nan').alias('allele2'),
                      pl.lit('nan').alias('maf'),
                      pl.col(f'coeff_{phenotype}').alias('beta'),
                      pl.col(f'se_{phenotype}').alias('se'),
                  ]).collect()

    if mac:
        strs = strs.filter(~pl.col('position').is_in(strs_exclude_mac))

    if use_PACSIN2:
        # The select above renamed 'pos' to 'position', so that is the column
        # that exists on strs at this point.
        strs = strs.filter(pl.col('position') != 43385872)
        pacsin2_strs = pl.read_csv(
            f'{ukb}/association/spot_test/white_brits/{phenotype}/PACSIN2.tab',
            sep='\t').filter(
                pl.col('pos').is_in([43385866, 43385875, 43385893])).select([
                    ('PACSIN2_STR_' + pl.col('pos').cast(str)).alias('rsid'),
                    ('0' + pl.col('chrom').cast(str)
                     ).str.slice(-2).alias('chromosome'),
                    pl.col('pos').alias('position'),
                    pl.lit('nan').alias('allele1'),
                    pl.lit('nan').alias('allele2'),
                    pl.lit('nan').alias('maf'),
                    pl.col(f'coeff_{phenotype}').alias('beta'),
                    pl.col(f'se_{phenotype}').alias('se'),
                ])
        strs = pl.concat([strs, pacsin2_strs])

    # Each STR must occupy a unique (chromosome, position).
    assert strs.distinct(
        subset=['chromosome', 'position']).shape[0] == strs.shape[0]
    n_strs = strs.shape[0]

    # load SNPs
    snps_to_filter = set()
    with open(filter_set_fname) as filter_file:
        next(filter_file)  # skip header
        for line in filter_file:
            # tab-separated fields 3..5 are read as pos, ref, alt
            pos, ref, alt = line.strip().split('\t')[3:6]
            snps_to_filter.add(f'{pos}_{ref}_{alt}')

    snps = pl.scan_csv(plink_results_fname, sep='\t', null_values='NA').filter(
        (pl.col('#CHROM') == chrom) & (pl.col('POS') >= start) &
        (pl.col('POS') <= end) & (pl.col('ERRCODE') == '.') &
        (pl.col('P') < inclusion_threshold) &
        ~(pl.col('POS').cast(str) + '_' + pl.col('REF') + '_' +
          pl.col('ALT')).is_in(list(snps_to_filter))).select([
              ('SNP_' + pl.col('POS').cast(str) + '_' + pl.col('REF') + '_' +
               pl.col('ALT')).alias('rsid'),
              ('0' +
               pl.col('#CHROM').cast(str)).str.slice(-2).alias('chromosome'),
              pl.col('POS').alias('position'),
              pl.col('REF').alias('allele1'),
              pl.col('ALT').alias('allele2'),
              pl.lit('nan').alias('maf'),
              pl.col('BETA').alias('beta'),
              pl.col('SE').alias('se'),
          ]).collect()

    if mac:
        snps = snps.filter(~pl.col('rsid').is_in(snps_exclude_mac))
    n_snps = snps.shape[0]

    if snp_str_ratio is not None:
        # Per-variant probabilities: each SNP gets snp_str_ratio times the
        # probability assigned to each STR.
        strs = strs.with_column(
            pl.lit(1 / (n_strs + snp_str_ratio * n_snps)).alias('prob'))
        snps = snps.with_column(
            pl.lit(snp_str_ratio /
                   (n_strs + snp_str_ratio * n_snps)).alias('prob'))

    vars_df = pl.concat([strs, snps])

    if total_prob is not None:
        # Spread total_prob evenly over all variants.
        vars_df = vars_df.with_column(
            pl.lit(total_prob / (n_snps + n_strs)).alias('prob'))

    vars_df.to_csv(f'{workdir}/finemap_input.z', sep=' ')