def test_list_ternary_concat() -> None:
    """Check `arr.concat` used in either branch of a when/then/otherwise."""
    source = {
        "list1": [["123", "456"], None],
        "list2": [["789"], ["zzz"]],
    }
    df = pl.DataFrame(source)
    list1_is_null = pl.col("list1").is_null()
    concatenated = pl.col("list1").arr.concat(pl.col("list2"))

    # Concatenation sits in the `then` branch, i.e. it is selected only where
    # "list1" is null.
    concat_if_null = df.with_column(
        pl.when(list1_is_null)
        .then(concatenated)
        .otherwise(pl.col("list2"))
        .alias("result")
    )
    assert concat_if_null.to_dict(False) == {
        "list1": [["123", "456"], None],
        "list2": [["789"], ["zzz"]],
        "result": [["789"], None],
    }

    # Concatenation sits in the `otherwise` branch, i.e. it is selected only
    # where "list1" is present.
    concat_if_present = df.with_column(
        pl.when(list1_is_null)
        .then(pl.col("list2"))
        .otherwise(concatenated)
        .alias("result")
    )
    assert concat_if_present.to_dict(False) == {
        "list1": [["123", "456"], None],
        "list2": [["789"], ["zzz"]],
        "result": [["123", "456", "789"], ["zzz"]],
    }
def load_dir(pheno, region, dir_):
    """Build a lazy per-variant table from the result files found in `dir_`.

    Reads `converged.txt` (must contain 'TRUE'), `alpha.tab`, `colnames.txt`
    and any `cs1.txt` .. `cs50.txt` files. A variant that appears in more than
    one retained credible-set file gets `cs_num == -1` and
    `susie_cs_pip == -1`. Only rows whose `var_name` matches '^STR', that
    belong to a retained credible set and have `susie_pip > 0.05` are kept.

    `min_ld_thresh` is read from the enclosing module.
    """
    with open(f'{dir_}/converged.txt') as converged_file:
        status = converged_file.read().strip()
    assert status == 'TRUE'

    alphas = pl.scan_csv(
        f'{dir_}/alpha.tab', sep='\t', has_header=False
    ).collect().to_numpy().T
    susie_pips = 1 - np.prod(1 - alphas, axis=1)

    variants = pl.scan_csv(
        f'{dir_}/colnames.txt',
        has_header=False,
        with_column_names=lambda _: ['var_name'],
    )
    # A column of ones turned into a running sum yields 1-based row numbers.
    df = variants.with_column(pl.lit(1).alias('row_number')).with_columns([
        pl.col('row_number').cumsum(),
        pl.lit(None, int).alias('cs_num'),
        pl.lit(region).alias('region'),
        pl.lit(pheno).alias('phenotype'),
        pl.Series(susie_pips).alias('susie_pip'),
        pl.lit(None, float).alias('susie_cs_pip')
    ])

    for cs_num in range(1, 51):
        cs_fname = f'{dir_}/cs{cs_num}.txt'
        if not os.path.exists(cs_fname):
            continue

        # First line: member row numbers; second line is skipped; the first
        # token of the third line is compared against min_ld_thresh.
        with open(cs_fname) as cs_file:
            var_nums = [int(token) for token in next(cs_file).strip().split()]
            next(cs_file)
            min_ld = float(next(cs_file).split()[0])
        if min_ld < min_ld_thresh:
            continue

        in_this_cs = pl.col('row_number').is_in(var_nums)
        already_assigned = ~pl.col('cs_num').is_null()
        df = df.with_columns([
            pl.when(in_this_cs).then(
                pl.when(already_assigned).then(-1).otherwise(cs_num)
            ).otherwise(pl.col('cs_num')).alias('cs_num'),
            pl.when(in_this_cs).then(
                pl.Series(alphas[:, cs_num - 1])
            ).otherwise(pl.col('susie_cs_pip')).alias('susie_cs_pip')
        ])

    # Variants flagged -1 (member of several sets) get a -1 per-set PIP too.
    df = df.with_column(
        pl.when(pl.col('cs_num') != -1)
        .then(pl.col('susie_cs_pip'))
        .otherwise(-1)
        .alias('susie_cs_pip')
    )
    keep = (
        pl.col('var_name').str.contains('^STR') &
        ~pl.col('cs_num').is_null() &
        (pl.col('susie_pip') > 0.05)
    )
    return df.filter(keep).drop('row_number')
def test_type_coercion_when_then_otherwise_2806() -> None:
    """Check the output dtype of when/then/otherwise with mixed branches."""
    frame = pl.DataFrame({
        "names": ["foo", "spam", "spam"],
        "nrs": [1, 2, 3]
    })
    # Integer branch combined with a string literal branch: the expected
    # values below are strings.
    mixed_branches = (
        pl.when(pl.col("names") == "spam")
        .then(pl.col("nrs") * 2)
        .otherwise(pl.lit("other"))
        .alias("new_col")
    )
    out = frame.select([mixed_branches]).to_series()
    expected = pl.Series("new_col", ["other", "4", "6"])
    assert out.to_list() == expected.to_list()

    # test it remains float32
    floats = pl.Series("a", [1.0, 2.0, 3.0], dtype=pl.Float32).to_frame()
    thresholded = floats.select(
        pl.when(pl.col("a") > 2.0).then(pl.col("a")).otherwise(0.0)
    )
    assert thresholded.to_series().dtype == pl.Float32
def calculate_friction_number(column_names: List[str]) -> "pl.Expr":
    """Return the expression for the "friction_number" column.

    When both "fs" and "qc" are available the result is ``fs / qc * 100``,
    with zero "qc" values replaced by null before dividing. Otherwise the
    expression is the literal 0.0.
    """
    has_inputs = "fs" in column_names and "qc" in column_names
    if not has_inputs:
        return lit(0.0).alias("friction_number")

    # Null out zero denominators so the division yields null instead.
    nonzero_qc = when(col("qc") == 0.0).then(None).otherwise(col("qc"))
    ratio = col("fs") / nonzero_qc * 100.0
    return ratio.alias("friction_number")
def full_genome_polars_df(df):
    """Add a 'plot_pos' column: 'pos' shifted by cumulative offsets.

    For every chromosome number 2..22, rows whose 'chr' is at least that
    number have ``chr_lens[number - 2]`` added to 'plot_pos'. `chr_lens` comes
    from the enclosing module (presumably chromosome lengths - verify).
    """
    shifted = df.with_column(pl.col('pos').alias('plot_pos'))
    for offset_idx, chrom in enumerate(range(2, 23)):
        offset = int(chr_lens[offset_idx])
        current = pl.col('plot_pos')
        shifted = shifted.with_column(
            pl.when(pl.col('chr') >= chrom)
            .then(current + offset)
            .otherwise(current)
            .alias('plot_pos')
        )
    return shifted
def test_set_null() -> None:
    """A `lit(None)` branch must produce nulls in the lazy result."""
    frame = pl.DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})
    null_above_one = (
        when(col("a") > 1).then(lit(None)).otherwise(100).alias("foo")
    )
    collected = frame.lazy().with_column(null_above_one).collect()
    foo = collected["foo"]

    # Only the first row has a <= 1, so it is the only non-null value.
    assert foo[0] == 100
    assert foo[1] is None
    assert foo[2] is None
def test_list_fill_list() -> None:
    """Empty lists can be replaced by a Python list literal."""
    frame = pl.DataFrame({"a": [[1, 2, 3], []]})
    is_empty = pl.col("a").arr.lengths() == 0
    filled = (
        pl.when(is_empty).then([5]).otherwise(pl.col("a")).alias("filled")
    )
    result = frame.select([filled]).to_dict(False)
    assert result == {"filled": [[1, 2, 3], [5]]}
def test_list_fill_null() -> None:
    """Empty lists can be replaced by null."""
    df = pl.DataFrame({"C": [["a", "b", "c"], [], [], ["d", "e"]]})
    is_empty = pl.col("C").arr.lengths() == 0
    nulled = pl.when(is_empty).then(None).otherwise(pl.col("C")).alias("C")
    values = df.with_columns([nulled]).to_series().to_list()
    assert values == [["a", "b", "c"], None, None, ["d", "e"]]
def test_when_then_flatten() -> None:
    """A chained when/then/when/then/otherwise evaluates branch by branch."""
    df = pl.DataFrame({"foo": [1, 2, 3], "bar": [3, 4, 5]})
    chained = (
        when(col("foo") > 1)
        .then(col("bar"))
        .when(col("bar") < 3)
        .then(10)
        .otherwise(30)
    )
    # Compare plain Python values: `Series == list` yields a Series, and
    # asserting on that object does not check the individual elements.
    assert df[chained]["bar"].to_list() == [30, 4, 5]
def test_when_then_edge_cases_3994() -> None:
    """Ternary expressions over an aggregated list column (issue 3994)."""
    df = pl.DataFrame(data={"id": [1, 1], "type": [2, 2]})

    # this tests if lazy correctly assigns the list schema to the column aggregation
    assert (
        df.lazy()
        .groupby(["id"])
        .agg(pl.col("type"))
        .with_column(
            pl.when(pl.col("type").arr.lengths() == 0)
            .then(pl.lit(None))
            .otherwise(pl.col("type"))
            .keep_name()
        )
        .collect()
    ).to_dict(False) == {"id": [1], "type": [[2, 2]]}

    # this tests ternary with an empty argument
    # `lengths` must be called: comparing the bound method itself to 0 is a
    # plain Python `False`, which would bypass the list-length predicate.
    assert (
        df.filter(pl.col("id") == 42)
        .groupby(["id"])
        .agg(pl.col("type"))
        .with_column(
            pl.when(pl.col("type").arr.lengths() == 0)
            .then(pl.lit(None))
            .otherwise(pl.col("type"))
            .keep_name()
        )
    ).to_dict(False) == {"id": [], "type": []}
def test_lazy() -> None:
    """Smoke test: basic lazy queries build and execute."""
    df = pl.DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})

    # Building the plan only; nothing is collected here.
    with_foo = df.lazy().with_column(lit(1).alias("foo"))
    _ = with_foo.select([col("a"), col("foo")])

    # test if it executes
    ternary = when(col("a").gt(lit(2))).then(lit(10)).otherwise(lit(1))
    _ = df.lazy().with_column(ternary.alias("new")).collect()

    # test if pl.list is available, this is `to_list` re-exported as list
    df.groupby("a").agg(pl.list("b"))
def test_list_arr_empty() -> None:
    """`arr.first` / `arr.contains` on a column that includes an empty list."""
    df = pl.DataFrame({"cars": [[1, 2, 3], [2, 3], [4], []]})
    cars = pl.col("cars")

    first_is_two = pl.col("cars").arr.first() == 2
    contains_two = cars.arr.contains(2)
    literal = (
        pl.when(first_is_two)
        .then(1)
        .when(contains_two)
        .then(2)
        .otherwise(3)
        .alias("cars_literal")
    )
    out = df.select([cars.arr.first().alias("cars_first"), literal])

    expected = pl.DataFrame({
        "cars_first": [1, 2, 4, None],
        "cars_literal": [2, 1, 3, 3]
    })
    assert out.frame_equal(expected)
def test_comp_categorical_lit_dtype() -> None:
    """Replacing a categorical value by a string literal keeps the dtypes."""
    schema = [("column", pl.Categorical), ("more", pl.Int32)]
    df = pl.DataFrame(
        data={
            "column": ["a", "b", "e"],
            "values": [1, 5, 9]
        },
        columns=schema,
    )
    replaced = (
        pl.when(pl.col("column") == "e")
        .then("d")
        .otherwise(pl.col("column"))
        .alias("column")
    )
    result_dtypes = df.with_column(replaced).dtypes
    assert result_dtypes == [pl.Categorical, pl.Int32]
def replace_column_void(lf: pl.LazyFrame, column_void) -> pl.LazyFrame:
    """Interpolate over cells equal to `column_void`, then drop null rows.

    :param lf: Frame to clean.
    :param column_void: Sentinel value, or a list of which only the first
        element is used. ``None`` returns `lf` untouched.
    :return: The cleaned lazy frame.
    """
    if column_void is None:
        return lf

    # TODO: what to do with multiple columnvoids?
    void_value = column_void[0] if isinstance(column_void, list) else column_void

    # Get all values matching column_void and change them to null
    is_void = pl.all() == pl.lit(void_value)
    nulled = lf.select(
        pl.when(is_void).then(pl.lit(None)).otherwise(pl.all()).keep_name()
    )
    # Interpolate all null values
    interpolated = nulled.select(pl.all().interpolate())
    # Remove the rows with null values
    return interpolated.drop_nulls()
def get_str_loci(phenotype, my_str_fname, thresh):
    """Return a SortedSet of (p, chrom, pos, 'STR') for rows with p <= thresh.

    The p-value is read from column ``p_<phenotype>`` of the tab-separated
    file `my_str_fname`; values at or below 1e-300 are stored as 0.
    """
    p_col = f'p_{phenotype}'
    p_value = pl.col(p_col)

    table = pl.scan_csv(
        my_str_fname,
        sep='\t',
        dtypes={'alleles': str, 'locus_filtered': str}
    )
    significant = table.filter(p_value <= thresh)
    floored = significant.with_column(
        pl.when(p_value <= 1e-300).then(0).otherwise(p_value).alias(p_col)
    )
    csv = floored.collect().to_dict(as_series=False)

    loci = zip(csv[p_col], csv['chrom'], csv['pos'], itertools.repeat('STR'))
    return sortedcontainers.SortedSet(iterable=loci)
def correct_depth_with_inclination(columns):
    """
    Return the expression needed to correct depth

    Preference order: absolute "corrected_depth" if present; otherwise the
    "penetration_length" increments scaled by the cosine of "inclination" and
    accumulated; otherwise plain "penetration_length".
    """
    if "corrected_depth" in columns:
        return col("corrected_depth").abs().alias("depth")

    if "inclination" not in columns:
        return col("penetration_length").alias("depth")

    length = "penetration_length"
    # every difference in depth needs to be corrected with the angle
    inclination = col("inclination").cast(pl.Float32).fill_null(0)
    correction_factor = np.cos(np.radians(inclination))
    corrected_depth = (correction_factor * col(length).diff()).cumsum()

    # Where the corrected value is null, fall back to the raw length.
    return (
        pl.when(corrected_depth.is_null())
        .then(col(length))
        .otherwise(corrected_depth)
        .alias("depth")
    )
def test_sum_to_one(self):
    """Component columns must add up to 1 on every row of `self.bore.df`."""
    cols = [
        "gravel_component",
        "sand_component",
        "clay_component",
        "loam_component",
        "peat_component",
        "silt_component",
    ]
    # Negative entries are replaced by an equal share before summing.
    equal_share = 1 / len(cols)
    terms = [
        pl.when(pl.col(name) < 0).then(equal_share).otherwise(pl.col(name))
        for name in cols
    ]
    row_sum = pl.fold(pl.lit(0), lambda acc, term: acc + term, terms)
    s = self.bore.df.select([row_sum.alias("sum")])
    self.assertTrue(np.all(np.isclose(s, 1)))
def ic_to_gamma(water_level):
    """
    Return the expression needed to compute "gamma_predict"

    :param water_level: Level compared against ``1.0 - depth`` to decide
        whether a row counts as below water.
    :return: Expression aliased "gamma_predict". Rows matching no branch get
        the fallback value 1.0.
    """
    below_water = (1.0 - col("depth")) < water_level
    ti = col("type_index")
    return (
        pl.when(ti > 3.22)
        .then(11.0)
        .when((ti <= 3.22) & (ti > 2.76))
        .then(16.0)
        .when((ti <= 2.76) & ~below_water)
        .then(18.0)
        # `&` binds tighter than `<=`, so each comparison needs its own
        # parentheses. The first matching branch wins, so the stricter 1.80
        # threshold is tested before 2.40; otherwise it could never match.
        .when((ti <= 1.80) & below_water)
        .then(20.0)
        .when((ti <= 2.40) & below_water)
        .then(19.0)
        .otherwise(1.0)
        .alias("gamma_predict")
    )
def ic_to_soil_type():
    """
    Assign the soil type to the corresponding Ic.

    Returns an expression aliased "soil_type"; rows matching none of the
    "type_index" bands get an empty string.
    """
    ti = col("type_index")
    # Bands below the first one, in evaluation order.
    lower_bands = [
        ((ti <= 3.22) & (ti > 2.67), "Clays"),
        ((ti <= 2.67) & (ti > 2.4), "Clayey silt to silty clay"),
        ((ti <= 2.4) & (ti > 1.8), "Silty sand to sandy silt"),
        ((ti <= 1.8) & (ti > 1.25), "Sands: clean sand to silty"),
        (ti <= 1.25, "Gravelly sands"),
    ]
    chain = pl.when(ti > 3.22).then("Peat")
    for condition, label in lower_bands:
        chain = chain.when(condition).then(label)
    return chain.otherwise("").alias("soil_type")
def get_snp_loci(plink_imputed_snp_fname, thresh):
    """Return a SortedSet of (P, chrom, pos, 'SNP', REF, ALT) with P <= thresh.

    P-values at or below 1e-300 are stored as 0. Rows whose ERRCODE is
    'CONST_OMITTED_ALLELE' are dropped; every remaining row must have
    ERRCODE '.'.
    """
    p_value = pl.col('P')
    table = pl.scan_csv(
        plink_imputed_snp_fname,
        sep='\t',
        null_values='NA'
    )
    significant = table.filter(p_value <= thresh)
    floored = significant.with_column(
        pl.when(p_value <= 1e-300).then(0).otherwise(p_value).alias('P')
    )
    csv = floored.filter(
        pl.col('ERRCODE') != 'CONST_OMITTED_ALLELE'
    ).collect()

    assert np.all((csv['ERRCODE'] == '.').to_numpy())

    columns = csv.to_dict(as_series=False)
    loci = zip(
        columns['P'],
        columns['#CHROM'],
        columns['POS'],
        itertools.repeat('SNP'),
        columns['REF'],
        columns['ALT'],
    )
    return sortedcontainers.SortedSet(iterable=loci)
sep='\t', dtypes={ col: (float if 'cs' not in col else int) for col in concordance_cols if 'finemap' in col or 'susie' in col or 'p_val' in col }) for phenotype in phenotypes.phenotypes_in_use if not os.path.exists( f'{ukb}/post_finemapping/intermediate_results/finemapping_putatively_causal_concordance_{phenotype}.tab.empty' ) ]).filter('is_STR').with_column(pl.col('pos').alias('snpstr_pos')) finemapping_results = finemapping_results.join( concordance_results, how='left', on=['phenotype', 'chrom', 'snpstr_pos']).with_columns([ pl.when(pl.col('susie_alpha').is_null()).then(None).when( pl.col('susie_cs') >= 0).then( pl.col('susie_alpha')).otherwise(0).alias('susie_CP'), pl.when(pl.col('susie_alpha_hardcall').is_null()).then(None).when( pl.col('susie_cs_hardcall') >= 0).then( pl.col('susie_alpha_hardcall')).otherwise(0).alias( 'susie_CP_best_guess_genotypes'), pl.when(pl.col('susie_alpha_ratio').is_null()).then(None).when( pl.col('susie_cs_ratio') >= 0).then( pl.col('susie_alpha_ratio')).otherwise(0).alias( 'susie_CP_prior_snps_over_strs'), pl.col('finemap_pip').alias('finemap_CP'), pl.col('finemap_pip_p_thresh').alias('finemap_CP_pval_thresh_5e-4'), pl.col('finemap_pip_mac').alias('finemap_CP_mac_thresh_100'), pl.col('finemap_pip_prior_std_derived').alias( 'finemap_CP_prior_effect_size_0.05%'), pl.col('finemap_pip_total_prob').alias('finemap_CP_prior_4_signals'),
from .dataset import df
import polars as pl
from polars import col

# Select a single derived column: "random" with values above 0.5 replaced by
# 0, multiplied by the sum of the "nrs" column.
df = df[[pl.when(col("random") > 0.5).then(0).otherwise(col("random")) * pl.sum("nrs")]]
import bokeh.io
import bokeh.models
import bokeh.plotting
import numpy as np
import polars as pl

# NOTE(review): `os` is used here but is not imported in the lines visible in
# this chunk - confirm it is imported earlier in the file.
ukb = os.environ['UKB']

# Load the fine-mapped STR table and replace the association p-value column
# by its -log10. P-values below 1e-300 are set to 300 instead of being passed
# through the logarithm.
df = pl.read_csv(
    f'{ukb}/post_finemapping/results/singly_finemapped_strs_for_paper.tab',
    sep='\t'
).with_column(
    pl.when(
        pl.col('association_p_value') < 1e-300
    ).then(
        pl.lit(300)
    ).otherwise(
        -pl.col('association_p_value').log10()
    ).alias('association_p_value')
)

# Min abs corr across all CSes
# NOTE(review): the comment above does not appear to describe the figure
# created below - confirm whether it is stale.
fig = bokeh.plotting.figure(
    width=1200,
    height=1200,
    title='Fine-mapped loci by association p-value',
    x_axis_label='-log10(p-value)',
    y_axis_label='Number of fine-mapped loci',
)
fig.axis.axis_label_text_font_size = '30px'
fig.title.text_font_size = '30px'
def map_expr(name: str) -> pl.Expr:
    """Return a struct expression (sum, non-null count) for column `name`.

    The struct is produced when `ignore_nulls` (taken from the enclosing
    scope) is truthy or the column has no nulls; otherwise the result is
    null. The expression is aliased "out".
    """
    null_total = pl.col(name).null_count()
    predicate = ignore_nulls or null_total == 0
    stats = pl.struct([
        pl.sum(name).alias("sum"),
        (pl.count() - pl.col(name).null_count()).alias("count"),
    ])
    guarded = pl.when(predicate).then(stats).otherwise(None)
    return guarded.alias("out")
def main():
    """Render the hits-by-phenotype summary figure and export it as a PNG.

    Command-line arguments: `outfile` (PNG path), `hits_table` and
    `qtl_STRs_table` (tab-separated inputs). Only hits with
    ``association_p_value <= 1e-10`` are plotted. The figure stacks several
    "topper" strips (common alleles, repeat unit, replication, QTL) above the
    main hit-by-phenotype grid, with two legends to the right.

    Raises AssertionError when a phenotype listed in `pheno_blocks` is not in
    `phenotypes.phenotypes_in_use`, and exits with an error message when the
    computed x coordinates are not contiguous.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('outfile')
    parser.add_argument('hits_table')
    parser.add_argument('qtl_STRs_table')
    args = parser.parse_args()

    df = pl.read_csv(
        args.hits_table,
        sep='\t',
    ).filter(pl.col('association_p_value') <= 1e-10)

    closest_gene_merge = annotation_utils.get_merged_annotations(
        df.with_column(pl.col('start_pos').alias('pos')).to_pandas(),
        f'{ukb}/side_analyses/str_annotations/closest_gene',
        distance=True
    )

    # Per hit: comma-separated "distance:relation:gene_name:gene_type"
    # entries, or None when no annotation row matches the hit.
    closest_gene = [None]*df.shape[0]

    nrows = df.shape[0]
    for idx in range(nrows):
        chrom = df['chrom'][idx]
        start_pos = df['start_pos'][idx]
        end_pos = df['end_pos'][idx]
        for line in closest_gene_merge[
            (closest_gene_merge['chrom'] == chrom) & (closest_gene_merge['STR_pos'] == start_pos)
        ].itertuples():
            if closest_gene[idx] is not None:
                closest_gene[idx] += ','
            else:
                closest_gene[idx] = ''
            closest_gene[idx] += str(line.annotation_distance) + ":"
            closest_gene[idx] += annotation_utils.get_relation(
                start_pos, end_pos, line.annotation_pos, line.annotation_end_pos, line.annotation_strand
            ) + ":"
            closest_gene[idx] += annotation_utils.get_gff_kvp(line.annotation_info, 'gene_name') + ":"
            closest_gene[idx] += annotation_utils.get_gff_kvp(line.annotation_info, 'gene_type')
    df = df.with_column(pl.Series(closest_gene).alias('closest_gene'))

    pheno_blocks = {
        'red blood cell' : [
            'haematocrit',
            'haemoglobin_concentration',
            'red_blood_cell_count',
            'mean_corpuscular_volume',
            'mean_corpuscular_haemoglobin',
            'mean_sphered_cell_volume',
            'red_blood_cell_distribution_width',
            'mean_corpuscular_haemoglobin_concentration',
        ],
        'platelet' : [
            'platelet_count',
            'platelet_crit',
            'mean_platelet_volume',
            'platelet_distribution_width',
        ],
        'white blood cell' : [
            'eosinophil_count',
            'eosinophil_percent',
            'neutrophil_count',
            'neutrophil_percent',
            'lymphocyte_count',
            'lymphocyte_percent',
            'white_blood_cell_count',
        ],
        'renal' : [
            'cystatin_c',
            'creatinine',
            'urate',
            'urea',
        ],
        'liver' : [
            'gamma_glutamyltransferase',
            'alkaline_phosphatase',
            'total_bilirubin',
            'alanine_aminotransferase',
            'total_protein',
            'albumin',
            'aspartate_aminotransferase',
        ],
        'endocrine' : [
            'glycated_haemoglobin',
            'igf_1',
            'shbg',
            'calcium',
            'glucose',
            'phosphate',
        ],
        'lipid' : [
            'apolipoprotein_a',
            'hdl_cholesterol',
            'apolipoprotein_b',
            'ldl_cholesterol_direct',
            'cholesterol',
            'triglycerides',
        ],
        '' : ['c_reactive_protein']
    }
    pheno_order = sum(pheno_blocks.values(), [])
    # Every second block is drawn with a grey background band.
    shaded_phenos = sum([pheno_blocks[key] for key in list(pheno_blocks.keys())[1::2]], [])
    # Position of each hit's phenotype within pheno_order.
    pheno_indices = np.where(df['phenotype'].to_numpy()[:, None] == np.array(pheno_order)[None, :])[1]

    for phenotype in pheno_order:
        if phenotype not in phenotypes.phenotypes_in_use:
            # Raised explicitly so the check also runs under `python -O`.
            raise AssertionError(f'{phenotype} is not in phenotypes.phenotypes_in_use')

    df = df.with_column(
        pl.Series(pheno_indices).alias('pheno_indices')
    ).with_row_count().with_column(
        # Per locus: smallest sort key over its associations. The row number
        # breaks ties; associations that are not 'confidently' fine-mapped
        # are pushed back by 100000.
        (
            pl.col('pheno_indices') +
            pl.col('row_nr')/1e5 +
            pl.when(pl.col('finemapping') != 'confidently').then(100000).otherwise(0)
        ).min().over(['chrom', 'start_pos']).alias('min_assoc_pheno_index')
    ).with_columns([
        pl.when(
            pl.col('association_p_value') == 0
        ).then(300).otherwise(
            -pl.col('association_p_value').log10()
        ).alias('p_val'),
        (pl.col('finemapping') == 'confidently').sum().over(['chrom', 'start_pos']).alias('n_confident_assocs'),
        pl.col('white_brit_allele_frequencies').apply(
            lambda dosage_dict_str: sum(float(part.split(' ')[-1]) >= 1 for part in dosage_dict_str.split('%')[:-1])
        ).alias('n_common_alleles')
    ])

    # Compact the integer part of the sort key so no phenotype index is
    # skipped.
    mapi = df['min_assoc_pheno_index'].to_numpy().copy()
    old_mapi = mapi.copy()
    for i in range(len(pheno_order)):
        if i not in np.floor(old_mapi):
            mapi[old_mapi > i] -= 1

    # Give each distinct key its own integer x coordinate.
    x_coords = np.floor(mapi).astype(int)
    for i in np.unique(mapi):
        num_dups = np.unique(mapi[(mapi < i) & (np.floor(i) == np.floor(mapi))]).shape[0]
        if num_dups > 0:
            x_coords[np.floor(mapi) > np.floor(i)] += 1
            x_coords[mapi == i] += num_dups

    for i in range(max(x_coords) + 1):
        if i not in x_coords:
            # A gap means the coordinate assignment above went wrong; stop
            # with a non-zero status instead of exiting silently.
            raise SystemExit(f'no hit was assigned x coordinate {i}')

    # Hand-picked column swaps; 1000000 is a temporary placeholder value.
    for pair in [(21, 23), (23, 24), (25, 28), (26, 30), (27, 31), (25, 27), (35, 39), (36, 39), (37, 38), (40, 42), (40, 41), (43, 45), (56, 57), (59, 60), (48, 53), (49, 55), (50, 54), (49, 51), (68, 71), (84, 85),(85, 87), (87, 88)]:
        x_coords[x_coords == pair[0]] = 1000000
        x_coords[x_coords == pair[1]] = pair[0]
        x_coords[x_coords == 1000000] = pair[1]

    # Label per hit: "<chrom>:<start_pos>_<display name>". The position
    # prefix keeps labels of different loci distinct until de-duplication.
    strs = df[
        pl.col('chrom').cast(str) + ':' + pl.col('start_pos').cast(str) + '_' +
        pl.when(
            pl.col('relation_to_gene').str.contains('protein_coding.*protein_coding')
        ).then(
            pl.col('relation_to_gene').str.extract("([^:]*):protein_coding.*:([^:]*):protein_coding", 1) +
            ' & ' +
            pl.col('relation_to_gene').str.extract("([^:]*):protein_coding.*:([^:]*):protein_coding", 2)
        ).when(
            pl.col('relation_to_gene').str.contains('protein_coding')
        ).then(
            pl.col('relation_to_gene').str.extract("([^:]*):protein_coding", 1)
        ).when(
            pl.col('relation_to_gene').str.contains('intergenic')
        ).then(
            pl.col('chrom').cast(str)+':'+pl.col('start_pos').cast(str)+' (' + pl.col('closest_gene').str.split_exact(',', 1).struct.field('field_0').str.split_exact(':', 2).struct.field('field_2') + ')'
        ).when(
            pl.col('relation_to_gene').str.contains('multigene')
        ).then(
            pl.col('relation_to_gene').str.extract(r"multigene;[^:]*:([^:]*):", 1) +
            '/' +
            pl.col('relation_to_gene').str.extract(r"multigene;[^;]*;[^:]*:([^:]*):", 1)
        ).otherwise(
            pl.col('relation_to_gene').str.extract(r"\w[^{:]*:([^:]*):", 1)
        )
    ].to_numpy()[np.argsort(x_coords)]
    # De-duplicate while keeping first-occurrence order, then drop the
    # position prefix.
    _, idxs = np.unique(strs, return_index=True)
    unique_stable_strs = strs[np.sort(idxs)].flatten()
    unique_stable_strs = list(np.char.partition(np.array(unique_stable_strs, dtype=str), '_')[:, 2])
    # two genes each containing two unique hits
    unique_stable_strs[unique_stable_strs.index('CCDC26')] += ' #1'
    unique_stable_strs[unique_stable_strs.index('CCDC26')] += ' #2'
    unique_stable_strs[unique_stable_strs.index('TFDP2')] += ' #1'
    unique_stable_strs[unique_stable_strs.index('TFDP2')] += ' #2'

    plot_width = 2500
    results_plot = bokeh.plotting.figure(
        width=plot_width,
        height=1400,
        x_axis_label='hits (containing gene, or position and nearest gene if intergenic)',
        y_axis_label='phenotypes',
        y_range=[pheno.replace('_', ' ') for pheno in pheno_order[::-1]],
        x_range=unique_stable_strs,
        toolbar_location=None,
        outline_line_color='black'
    )
    results_plot.axis.axis_label_text_font_size = '26px'
    results_plot.xaxis.major_label_orientation = 1.3

    # Horizontal grey bands behind the shaded phenotype blocks.
    cds = bokeh.models.ColumnDataSource(dict(
        x=[len(x_coords)/2]*len(shaded_phenos),
        y=[pheno.replace('_', ' ') for pheno in shaded_phenos],
        width=[len(x_coords)]*len(shaded_phenos),
        height=[1]*len(shaded_phenos),
        color=['grey']*len(shaded_phenos),
        alpha=['0.15']*len(shaded_phenos),
        line_color=[None]*len(shaded_phenos)
    ))
    results_plot.rect(
        x='x', y='y', width='width', height='height', color='color', alpha='alpha', line_color='line_color', source=cds
    )

    # Vertical grey bands on every second column.
    half_xs=np.arange(1, np.max(x_coords) + 1, 2)
    cds = bokeh.models.ColumnDataSource(dict(
        x=half_xs+0.5,
        y=[len(pheno_order)/2]*half_xs.shape[0],
        width=[1]*half_xs.shape[0],
        height=[len(pheno_order)+len(pheno_blocks)+20]*half_xs.shape[0],
        color=['grey']*half_xs.shape[0],
        alpha=['0.15']*half_xs.shape[0],
        line_color=[None]*half_xs.shape[0]
    ))
    results_plot.rect(
        x='x', y='y', width='width', height='height', color='color', alpha='alpha', line_color='line_color', source=cds
    )
    results_plot.xgrid.ticker = []

    def pheno_to_ycoords(phenos):
        return [pheno.replace('_', ' ') for pheno in phenos]

    # Fixed-width dtype wide enough for the longest colour name; an array
    # built from one-character strings would truncate 'grey' to 'g'.
    fill_colors = np.full(len(x_coords), 'grey', dtype='<U5')
    fill_colors[df['finemapping'].to_numpy() == 'confidently'] = 'black'
    fill_colors[df['finemapping'].to_numpy() == 'not'] = 'white'

    # Upward triangles for '+' associations, inverted ones for the rest;
    # marker size scales with sqrt(-log10 p).
    up_triangle = df['direction_of_association'].to_numpy() == '+'
    results_plot.triangle(
        (x_coords + 0.5)[up_triangle],
        pheno_to_ycoords(df['phenotype'].to_numpy()[up_triangle]),
        size=np.sqrt(df['p_val'].to_numpy())[up_triangle]*2,
        color=fill_colors[up_triangle],
        line_color='black'
    )
    results_plot.inverted_triangle(
        (x_coords + 0.5)[~up_triangle],
        pheno_to_ycoords(df['phenotype'].to_numpy()[~up_triangle]),
        size=np.sqrt(df['p_val'].to_numpy())[~up_triangle]*2,
        fill_color=fill_colors[~up_triangle],
        line_color='black'
    )

    other_ethnicities = ['Black', 'South Asian', 'Chinese', 'Irish', 'White Other']

    def get_topper(height, factors, color):
        # Strip sharing the main plot's x range; `color` toggles the
        # alternating grey column bands.
        topper = bokeh.plotting.figure(
            width=plot_width,
            height=height,
            y_range=bokeh.models.FactorRange(*factors),
            x_range=results_plot.x_range,
            toolbar_location=None,
            outline_line_color='black'
        )
        topper.xgrid.ticker = []
        topper.xaxis.ticker = []
        if color:
            cds = bokeh.models.ColumnDataSource(dict(
                x=half_xs+0.5,
                y=[7.5/2]*half_xs.shape[0],
                width=[1]*half_xs.shape[0],
                height=[7.5]*half_xs.shape[0],
                color=['grey']*half_xs.shape[0],
                alpha=['0.15']*half_xs.shape[0],
                line_color=[None]*half_xs.shape[0]
            ))
            topper.rect(
                x='x', y='y', width='width', height='height', color='color', alpha='alpha', line_color='line_color', source=cds
            )
        return topper

    qtl_topper = get_topper(
        60,
        ['expression QTL', 'splice or isoform QTL'],
        True
    )

    qtl_STRs = pl.read_csv(args.qtl_STRs_table, sep='\t')

    eqtl_STR_locs = qtl_STRs.filter(~pl.col('p_vals_expression').is_null())['chrom_pos']
    eqtl_STRs = df[
        ('chr' + pl.col('chrom').cast(str) + '_' + pl.col('start_pos').cast(str)).is_in(eqtl_STR_locs)
    ].to_numpy().flatten()
    qtl_topper.circle(
        (x_coords + 0.5)[eqtl_STRs],
        ['expression QTL']*np.sum(eqtl_STRs),
        color='black',
    )

    splice_iso_STR_locs = qtl_STRs.filter(~pl.col('p_vals_splice').is_null() | ~pl.col('p_vals_isoform').is_null())['chrom_pos']
    splice_iso_STRs = df[
        ('chr' + pl.col('chrom').cast(str) + '_' + pl.col('start_pos').cast(str)).is_in(splice_iso_STR_locs)
    ].to_numpy().flatten()
    qtl_topper.circle(
        (x_coords + 0.5)[splice_iso_STRs],
        ['splice or isoform QTL']*np.sum(splice_iso_STRs),
        color='black',
    )

    replication_topper = get_topper(
        150,
        [f'{ethnicity} replication'.replace('Other', 'other') for ethnicity in other_ethnicities],
        True
    )
    for ethnicity_num, ethnicity in enumerate(other_ethnicities):
        # Replicated: confidently fine-mapped, same effect direction, and
        # p-value times the locus' number of confident associations <= 0.05.
        replicates = df.select((
            (pl.col('finemapping') == 'confidently') &
            (pl.col('other_ethnicity_effect_directions').str.split_exact(",", ethnicity_num+1).struct.field(f'field_{ethnicity_num}').str.strip() == pl.col('direction_of_association')) &
            (pl.col('other_ethnicity_association_p_values').str.split_exact(",", ethnicity_num+1).struct.field(f'field_{ethnicity_num}').str.strip().cast(float)*pl.col('n_confident_assocs') <= 0.05)
        ).alias('out'))['out'].to_numpy()

        replication_topper.circle(
            (x_coords + 0.5)[replicates],
            [f'{ethnicity} replication'.replace('Other', 'other')]*np.sum(replicates),
            color='black',
        )

    repeat_unit_topper = get_topper(90, ['polyA', 'polyAC', 'polyCCG'], True)
    for repeat_unit in 'A', 'AC', 'CCG':
        selection = df['repeat_unit'].to_numpy() == repeat_unit
        repeat_unit_topper.circle(
            (x_coords + 0.5)[selection],
            [f'poly{repeat_unit}'] * np.sum(selection),
            color='black',
        )

    # Row index of the first hit in every column, in ascending row order.
    indices = np.sort(np.unique(x_coords, return_index=True)[1])
    assert len(indices) == max(x_coords) + 1

    multiallelic_topper = get_topper(30, ['number of common alleles'], False)
    max_common_alleles = np.max(df['n_common_alleles'].to_numpy())
    cds = bokeh.models.ColumnDataSource(dict(
        x=x_coords[indices]+0.5,
        y=['number of common alleles']*(int(np.max(x_coords)) + 1),
        width=[1]*(int(np.max(x_coords)) + 1),
        height=[1]*(int(np.max(x_coords)) + 1),
        color=['black']*(int(np.max(x_coords)) + 1),
        alpha=df['n_common_alleles'].to_numpy()[indices]/max_common_alleles,
        line_color=[None]*(int(np.max(x_coords)) + 1)
    ))
    multiallelic_topper.rect(
        x='x', y='y', width='width', height='height', color='color', alpha='alpha', line_color='line_color', source=cds
    )

    # Legend for the common-allele shading.
    multiallelic_scale = bokeh.plotting.figure(
        width=120,
        height=40*max_common_alleles,
        y_range=[0.5, max_common_alleles+0.5],
        y_axis_label='# common alleles',
        toolbar_location=None,
        outline_line_color='black'
    )
    multiallelic_scale.axis.axis_label_text_font_size = '26px'
    multiallelic_scale.xaxis.ticker = []
    multiallelic_scale.ygrid.ticker = []
    multiallelic_scale.yaxis.ticker = np.arange(1, max_common_alleles + 1)
    cds = bokeh.models.ColumnDataSource(dict(
        x=[0]*max_common_alleles,
        y=np.arange(1, max_common_alleles+1),
        width=[1]*max_common_alleles,
        height=[1]*max_common_alleles,
        color=['black']*max_common_alleles,
        alpha=np.arange(1, max_common_alleles+1)/max_common_alleles,
        line_color=[None]*max_common_alleles
    ))
    multiallelic_scale.rect(
        x='x', y='y', width='width', height='height', color='color', alpha='alpha', line_color='line_color', source=cds
    )

    # Legend for the marker sizes.
    scale_ps = [10, 20, 40, 80, 160, 300]
    str_scale_ps = [str(p) for p in scale_ps]
    p_val_scale = bokeh.plotting.figure(
        width=120,
        height=30*len(scale_ps) + 90,
        y_range=bokeh.models.FactorRange(*str_scale_ps),
        y_axis_label='-log10 p-value',
        toolbar_location=None,
        outline_line_color='black'
    )
    p_val_scale.axis.axis_label_text_font_size = '26px'
    p_val_scale.xaxis.ticker = []
    p_val_scale.grid.ticker = []
    p_val_scale.triangle(
        [0]*len(scale_ps),
        str_scale_ps,
        size=np.sqrt(scale_ps)*2,
        color='black'
    )

    bokeh.io.export_png(
        bokeh.layouts.row(
            bokeh.layouts.column(multiallelic_topper, repeat_unit_topper, replication_topper, qtl_topper, results_plot),
            bokeh.layouts.column(multiallelic_scale, p_val_scale)
        ),
        filename=args.outfile
    )
def main():
    """Gather SuSiE and FINEMAP fine-mapping results and summarize them.

    Command line: one or more phenotype names (positional ``phenotypes``).

    For every phenotype this:
      * lazily loads the STR and SNP association results and keeps rows with
        ``p_val <= p_val_thresh``;
      * walks the regions listed in ``signals/regions/<phenotype>.tab`` that
        have ``any_strs`` set, recording regions whose SuSiE run has no
        ``converged.txt`` (unfinished) or did not report ``TRUE``
        (unconverged);
      * for each remaining region, joins the SuSiE per-variant PIPs/alphas
        with the FINEMAP PIPs and annotates each variant with the credible
        set (CS) it belongs to.

    The combined table is written to ``gathered_data.tab`` and a number of
    summary statistics are printed and plotted (PNG + SVG via bokeh).

    Relies on names defined elsewhere in this module: ``ukb``,
    ``p_val_thresh``, ``corr_cutoff``, ``pip_threshold`` and
    ``linear_int_interpolate``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('phenotypes', nargs='+')
    phenotypes = parser.parse_args().phenotypes

    all_dfs = []
    susie_cs_min_abs_corrs = []
    finemap_cs_coverages = []
    unconverged_regions = []
    #underexplored_regions = []
    unfinished_regions = []

    for phenotype in phenotypes:
        pheno_dfs = []

        # STR associations: no ref/alt allele lengths, so those are null.
        str_assocs = pl.scan_csv(
            f'{ukb}/association/results/{phenotype}/my_str/results.tab',
            sep='\t',
        ).select([
            pl.lit(phenotype).alias('phenotype'),
            'chrom',
            'pos',
            pl.col(f'p_{phenotype}').alias('p_val'),
            pl.lit(True).alias('is_STR'),
            pl.lit(None).cast(int).alias('reflen'),
            pl.lit(None).cast(int).alias('altlen')
        ])

        # SNP associations: collapse rows sharing position and allele lengths
        # to their smallest p-value.
        snp_assocs = pl.scan_csv(
            f'{ukb}/association/results/{phenotype}/plink_snp/results.tab',
            sep='\t',
            null_values='NA',
        ).select([
            pl.col('#CHROM').alias('chrom'),
            pl.col('POS').alias('pos'),
            pl.col('REF').str.lengths().cast(int).alias('reflen'),
            pl.col('ALT').str.lengths().cast(int).alias('altlen'),
            pl.col('P').alias('p_val'),
        ]).groupby(['chrom', 'pos', 'reflen', 'altlen']).agg([
            pl.col('p_val').min().alias('p_val'),
        ]).with_columns([
            pl.lit(phenotype).alias('phenotype'),
            pl.lit(False).alias('is_STR')
        ]).select([
            'phenotype', 'chrom', 'pos', 'p_val', 'is_STR', 'reflen', 'altlen'
        ])

        assocs = pl.concat([str_assocs, snp_assocs
                            ]).filter(pl.col('p_val') <= p_val_thresh)

        regions_df = pl.read_csv(f'{ukb}/signals/regions/{phenotype}.tab', sep='\t')
        for chrom, start, end, any_strs in zip(regions_df['chrom'], regions_df['start'], regions_df['end'], regions_df['any_strs']):
            if not any_strs:
                continue
            converged_fname = f'{ukb}/finemapping/susie_results/{phenotype}/{chrom}_{start}_{end}/converged.txt'
            if not os.path.exists(converged_fname):
                unfinished_regions.append((phenotype, chrom, start, end))
                continue
            with open(converged_fname) as converged_file:
                # An empty file is treated as "not converged" instead of
                # raising StopIteration.
                if next(converged_file, '').strip() != 'TRUE':
                    unconverged_regions.append((phenotype, chrom, start, end))
                    continue
            print(f'Loading {phenotype} region {chrom}:{start}-{end}', flush=True)
            with open(
                f'{ukb}/finemapping/susie_results/{phenotype}/{chrom}_{start}_{end}/colnames.txt'
            ) as var_file:
                susie_vars = [line.strip() for line in var_file]
            # After the transpose: one row per variant, one column per
            # alpha vector stored in alpha.tab.
            alphas = pl.scan_csv(
                f'{ukb}/finemapping/susie_results/{phenotype}/{chrom}_{start}_{end}/alpha.tab',
                sep='\t',
                has_header=False).collect().to_numpy().T
            n_alphas = alphas.shape[1]
            # PIP = 1 - prod over alpha columns of (1 - alpha).
            susie_pips = 1 - np.prod(1 - alphas, axis=1)
            assert susie_pips.shape[0] == len(susie_vars)
            susie_idx = np.arange(len(susie_vars)) + 1
            susie_df = pl.DataFrame({
                'varname': susie_vars,
                'susie_pip': susie_pips,
                'susie_alpha': np.zeros(len(susie_vars)),
                'susie_cs': [-1] * len(susie_vars),
                'susie_idx': susie_idx,
                **{f'alpha_{i}': alphas[:, i] for i in range(n_alphas)}
            }).lazy()
            finemap_df = pl.scan_csv(
                f'{ukb}/finemapping/finemap_results/{phenotype}/{chrom}_{start}_{end}/finemap_output.snp',
                sep=' ').select([
                    pl.col('rsid').alias('varname'),
                    pl.col('prob').alias('finemap_pip')
                ])

            # Position and allele lengths are parsed out of the
            # underscore-delimited variant name.
            df = susie_df.join(finemap_df, how='inner', on=[
                'varname'
            ]).with_columns([
                pl.col('varname').str.extract('^[^_]*_([^_]*)', 1).cast(int).alias('pos'),
                pl.col('varname').str.extract(
                    '^[^_]*_[^_]*_([^_]*)_.*', 1).str.lengths().cast(int).alias('reflen'),
                pl.col('varname').str.extract(
                    '^[^_]*_[^_]*_[^_]*_([^_]*)', 1).str.lengths().cast(int).alias('altlen'),
                pl.col('varname').str.contains('^STR').alias('is_STR'),
                pl.lit(f'{phenotype}_{chrom}_{start}_{end}').alias('region'),
                pl.lit(chrom).alias('chrom').cast(int),
                pl.lit(phenotype).alias('phenotype')
            ]).sort('susie_idx')

            real_cs_count = 0
            for cs_fname in glob.glob(
                f'{ukb}/finemapping/susie_results/{phenotype}/{chrom}_{start}_{end}/cs*.txt'
            ):
                cs_id = int(cs_fname.split('cs')[-1].split('.')[0])
                with open(cs_fname) as cs_file:
                    # susie uses 1 based indexing, python uses 0
                    # make sure cs idxs are in increasing order
                    cs_susie_idx = np.array(
                        [int(idx) for idx in next(cs_file).strip().split()])
                    assert np.all(cs_susie_idx[1:] - cs_susie_idx[:-1] > 0)
                    cs_susie_idx = pl.Series('cs_susie_idx', cs_susie_idx)
                    next(cs_file)  # skip cs credibility
                    min_abs_corr, _, _ = [
                        float(idx) for idx in next(cs_file).strip().split()
                    ]
                susie_cs_min_abs_corrs.append(min_abs_corr)
                # Total FINEMAP PIP over the members of this credible set
                # (collected as a 1x1 frame; converted with np.array below).
                finemap_cs_coverages.append(
                    df.filter(pl.col('susie_idx').is_in(cs_susie_idx)).select(
                        pl.col('finemap_pip').sum()).collect())
                # For CS members, keep the largest alpha seen so far.
                df = df.with_column(
                    pl.when(pl.col('susie_idx').is_in(cs_susie_idx)).then(
                        pl.when(
                            pl.col(f'alpha_{cs_id-1}') > pl.col('susie_alpha')
                        ).then(pl.col(f'alpha_{cs_id-1}')).otherwise(
                            pl.col('susie_alpha'))).otherwise(
                                pl.col('susie_alpha')).alias('susie_alpha'))
                if min_abs_corr < corr_cutoff:
                    continue
                real_cs_count += 1
                # could worry about variants being in multiple CSes
                df = df.with_column(
                    pl.when(pl.col('susie_idx').is_in(cs_susie_idx)).then(
                        cs_id).otherwise(pl.col('susie_cs')).alias('susie_cs'))
            pheno_dfs.append(df)
            # Disabled:
            # if real_cs_count >= 10:
            #     underexplored_regions.append((phenotype, chrom, start, end))

        if not pheno_dfs:
            # No region was loaded for this phenotype: there is nothing to
            # concatenate, and the per-region names deleted below were never
            # bound.
            continue

        pheno_dfs = [
            df.select(pl.col('*').exclude('^alpha.*$')) for df in pheno_dfs
        ]
        pheno_df = pl.concat(pheno_dfs).join(
            assocs,
            how='left',
            on=['phenotype', 'chrom', 'is_STR', 'pos', 'reflen', 'altlen']).collect()
        all_dfs.append(pheno_df)
        # Drop references to the per-phenotype frames before the next pass.
        del df, susie_df, finemap_df, assocs, pheno_dfs, pheno_df

    susie_cs_min_abs_corrs = np.array(susie_cs_min_abs_corrs)
    finemap_cs_coverages = np.array(finemap_cs_coverages)

    total_df = pl.concat(all_dfs)
    #total_assocs = pl.concat(all_assocs).filter(pl.col('p_val') <= p_val_thresh)
    # Disabled:
    # start_time = time.time()
    # print('Gathering data ... ', flush=True)
    # total_df = total_df.join(
    #     total_assocs,
    #     how='left',
    #     on=['phenotype', 'chrom', 'is_STR', 'pos', 'reflen', 'altlen']
    # ).collect()
    # print(f'Done. Time: {time.time() - start_time:.2}')

    total_df.filter(
        ~pl.col('p_val').is_null() & (pl.col('p_val') <= p_val_thresh)).to_csv(
            f'{ukb}/post_finemapping/intermediate_results/gathered_data.tab',
            sep='\t')

    print(
        'Any vars with null Ps?',
        total_df.select(pl.col('p_val').is_null().alias('null?')).select(
            pl.any('null?').alias('any_nulls'))['any_nulls'][0])
    print(
        'n regions',
        total_df.select(
            pl.col('region').unique().count().alias('region_count'))
        ['region_count'][0])

    # Per region: number of credible sets whose best member p-value passes
    # the threshold.
    cses_per_region = total_df.filter(
        pl.col('susie_cs') >= 0).filter(~pl.col('p_val').is_null()).groupby([
            'susie_cs', 'region'
        ]).agg(
            pl.col('p_val').min().alias('min_p'),
        ).filter(pl.col('min_p') <= p_val_thresh).groupby('region').agg(
            pl.col('region').count().alias('n_cses')).to_dict(False)['n_cses']
    print(
        f'avg cses (total PIP >= .9, min_p_val of CS members <= {p_val_thresh}) per region {np.mean(cses_per_region)}, ({np.std(cses_per_region)})'
    )

    for filter_, text in ((pl.lit(True), ''), (pl.col('is_STR'), ' STR'),
                          (~pl.col('is_STR'), ' SNP')):
        susie_hits_per_region = total_df.filter(filter_).with_column(
            ((pl.col('susie_cs') >= 0) & (pl.col('susie_pip') >= pip_threshold) &
             (pl.col('p_val') <= p_val_thresh)
             ).alias('susie_hit')).groupby('region').agg(
                 pl.col('susie_hit').sum().alias('n_susie_hits')).to_dict(
                     False)['n_susie_hits']
        print(
            f'avg susie{text} hits (var is in a CS, PIP >= {pip_threshold}, p_val <= {p_val_thresh}) per region {np.mean(susie_hits_per_region)}, ({np.std(susie_hits_per_region)})'
        )
        # Flattened to 1-D so the later max()/np.arange() calls receive a
        # scalar rather than a size-1 array.
        finemap_hits_per_region = total_df.filter(filter_).with_column(
            ((pl.col('finemap_pip') >= pip_threshold) &
             (pl.col('p_val') <= p_val_thresh)
             ).alias('finemap_hit')).groupby('region').agg(
                 pl.col('finemap_hit').sum().alias('n_finemap_hits')).select(
                     'n_finemap_hits').to_numpy().reshape(-1)
        print(
            f'avg finemap{text} hits (PIP >= {pip_threshold}, p_val <= {p_val_thresh}) per region {np.mean(finemap_hits_per_region)}, ({np.std(finemap_hits_per_region)})'
        )

        print('Exporting FINEMAP vs SuSiE PIP plots', flush=True)
        comparison_thresh = 0.3
        title = f'{text} with p-val <= {p_val_thresh} where at least one of SuSiE or FINEMAP PIP >= {comparison_thresh}'
        if text == '':
            title = 'Vars ' + title
        fig = bokeh.plotting.figure(
            width=1200,
            height=1200,
            title=title,
            x_axis_label='FINEMAP PIPs',
            y_axis_label='SuSiE PIPs',
        )
        fig.title.text_font_size = '30px'
        fig.axis.axis_label_text_font_size = '26px'
        fig.axis.major_label_text_font_size = '20px'
        fig.background_fill_color = None
        fig.border_fill_color = None
        fig.ygrid.grid_line_color = None
        fig.xgrid.grid_line_color = None
        fig.toolbar.logo = None
        fig.toolbar_location = None
        print(total_df.filter(filter_))
        print(total_df.filter(filter_ & (pl.col('p_val') <= p_val_thresh)))
        pips = total_df.filter(filter_ & (pl.col('p_val') <= p_val_thresh) &
                               ((pl.col('finemap_pip') >= comparison_thresh) |
                                ((pl.col('susie_pip') >= comparison_thresh) &
                                 (pl.col('susie_cs') >= 0)))).select(
                                     ['susie_pip', 'finemap_pip'])
        print(pips)

        bin_size = .05
        bins = bokeh.util.hex.hexbin(
            pips['finemap_pip'].to_numpy().reshape(-1),
            pips['susie_pip'].to_numpy().reshape(-1),
            size=bin_size)

        palette = [
            linear_int_interpolate((134, 204, 195), (9, 41, 46), i / 254)
            for i in range(-1, 255)
        ]
        cmap = bokeh.transform.log_cmap('counts',
                                        palette=palette,
                                        low=1,
                                        high=max(bins.counts),
                                        low_color=(255, 255, 255))
        color_mapper = bokeh.models.LogColorMapper(palette=palette,
                                                   low=1,
                                                   high=max(bins.counts))

        fig.hex_tile(q='q',
                     r='r',
                     size=bin_size,
                     line_color=None,
                     source=bins,
                     fill_color=cmap)
        color_bar = bokeh.models.ColorBar(color_mapper=color_mapper,
                                          width=70,
                                          major_label_text_font_size='20px')
        fig.add_layout(color_bar, 'right')
        ext = text.replace(' ', '_')
        bokeh.io.export_png(
            fig,
            filename=
            f'{ukb}/export_scripts/results/finemap_pip_vs_susie_pip{ext}.png')
        bokeh.io.export_svg(
            fig,
            filename=
            f'{ukb}/export_scripts/results/finemap_pip_vs_susie_pip{ext}.svg')

    print(f'unconverged regions: {unconverged_regions}')
    print(f'unfinished regions: {unfinished_regions}')
    #print(f'underexplored regions: {underexplored_regions}')

    # Histogram of the min absolute correlation of every SuSiE credible set.
    fig = bokeh.plotting.figure(
        width=1200,
        height=1200,
        title='SuSiE credible set min absolute correlations',
        x_axis_label='min absolute correlation',
        y_axis_label='# credible sets',
    )
    fig.axis.axis_label_text_font_size = '30px'
    fig.background_fill_color = None
    fig.border_fill_color = None
    fig.grid.grid_line_color = None
    fig.toolbar_location = None
    step = 0.01
    left_edges = np.arange(0, 1 + step, step)
    ys = [
        np.sum((left_edge <= susie_cs_min_abs_corrs)
               & (susie_cs_min_abs_corrs < left_edge + step))
        for left_edge in left_edges
    ]
    fig.quad(top=ys, bottom=0, left=left_edges, right=left_edges + step)

    print('Exporting cs plots', flush=True)
    bokeh.io.export_png(
        fig, filename=f'{ukb}/export_scripts/results/cs_min_abs_corrs.png')
    bokeh.io.export_svg(
        fig, filename=f'{ukb}/export_scripts/results/cs_min_abs_corrs.svg')

    # Histogram of the number of credible sets per region.
    fig = bokeh.plotting.figure(
        width=1200,
        height=1200,
        title=
        f'Number of SuSie CSes min absolute corr >= {corr_cutoff} per region',
        x_axis_label='# cses in the region',
        y_axis_label='# regions',
    )
    fig.axis.axis_label_text_font_size = '30px'
    fig.background_fill_color = None
    fig.border_fill_color = None
    fig.grid.grid_line_color = None
    fig.toolbar_location = None
    left_edges = np.arange(0, max(cses_per_region) + 1)
    ys = [
        np.sum((left_edge <= cses_per_region)
               & (cses_per_region < left_edge + 1))
        for left_edge in left_edges
    ]
    fig.quad(top=ys, bottom=0, left=left_edges, right=left_edges + 1)

    print('Exporting cs per region plots', flush=True)
    bokeh.io.export_png(
        fig, filename=f'{ukb}/export_scripts/results/cses_per_region.png')
    bokeh.io.export_svg(
        fig, filename=f'{ukb}/export_scripts/results/cses_per_region.svg')

    # NOTE(review): finemap_hits_per_region is whatever the loop above left
    # behind, i.e. the counts from its last (SNP-only) filter, although the
    # title below says "vars". Behaviour kept as-is; confirm the intent.
    fig = bokeh.plotting.figure(
        width=1200,
        height=1200,
        title=f'Number of FINEMAP vars with PIP >= {pip_threshold} per region',
        x_axis_label='# hits in the region',
        y_axis_label='# regions',
    )
    fig.axis.axis_label_text_font_size = '30px'
    fig.background_fill_color = None
    fig.border_fill_color = None
    fig.grid.grid_line_color = None
    fig.toolbar_location = None
    left_edges = np.arange(0, max(finemap_hits_per_region) + 1)
    ys = [
        np.sum((left_edge <= finemap_hits_per_region)
               & (finemap_hits_per_region < left_edge + 1))
        for left_edge in left_edges
    ]
    fig.quad(top=ys, bottom=0, left=left_edges, right=left_edges + 1)

    print('Exporting finemap hits per region plots', flush=True)
    bokeh.io.export_png(
        fig,
        filename=f'{ukb}/export_scripts/results/finemap_hits_per_region.png')
    bokeh.io.export_svg(
        fig,
        filename=f'{ukb}/export_scripts/results/finemap_hits_per_region.svg')

    # Histogram of the total FINEMAP PIP captured by each SuSiE credible set
    # that passes the correlation cutoff.
    fig = bokeh.plotting.figure(
        width=1200,
        height=1200,
        title=
        f'FINEMAP total PIPs for SuSiE CSes with min_abs_corr >= {corr_cutoff}',
        x_axis_label='FINEMAP PIPs',
        y_axis_label='# credible sets',
    )
    fig.background_fill_color = None
    fig.border_fill_color = None
    fig.ygrid.grid_line_color = None
    fig.xgrid.grid_line_color = None
    fig.toolbar.logo = None
    fig.toolbar_location = None
    include = susie_cs_min_abs_corrs >= corr_cutoff
    max_total_pip = max(1, np.max(finemap_cs_coverages[include]))
    step = 0.01
    left_edges = np.arange(0, max_total_pip + step, step)
    ys = [
        np.sum((left_edge <= finemap_cs_coverages[include])
               & (finemap_cs_coverages[include] < left_edge + step))
        for left_edge in left_edges
    ]
    fig.quad(top=ys, bottom=0, left=left_edges, right=left_edges + step)

    print('Exporting FINEMAP CS PIP plots', flush=True)
    bokeh.io.export_png(
        fig,
        filename=f'{ukb}/export_scripts/results/susie_cs_finemap_total_pips.png'
    )
    bokeh.io.export_svg(
        fig,
        filename=f'{ukb}/export_scripts/results/susie_cs_finemap_total_pips.svg'
    )

    total_cses = np.sum(include)
    total_cses_large_finemap_pip = np.sum(
        finemap_cs_coverages[include] >= pip_threshold)
    print(
        f'SuSiE CSes with min_abs_corr >= {corr_cutoff} with FINEMAP total PIP >= {pip_threshold}: {total_cses_large_finemap_pip} ({total_cses_large_finemap_pip/total_cses:%})'
    )

    susie_pip_threshold_for_finemap = .3
    n_replicates_from_finemap = total_df.filter(
        (pl.col('susie_cs') >= 0)
        & (pl.col('susie_pip') >= susie_pip_threshold_for_finemap)
        & (pl.col('finemap_pip') >= pip_threshold)).shape[0]
    n_finemap_total = total_df.filter(
        pl.col('finemap_pip') >= pip_threshold).shape[0]
    print(
        f'FINEMAP hits with PIP >= {pip_threshold} in a SuSiE CS with abs corr >= {corr_cutoff} and SuSiE PIP >= {susie_pip_threshold_for_finemap}: {n_replicates_from_finemap} ({n_replicates_from_finemap/n_finemap_total:%})'
    )

    # Cross-tabulate SuSiE vs FINEMAP calls at two PIP thresholds, first on
    # every variant and then on strongly associated variants only.
    for (curr_df, text) in [(total_df, 'all hits no filter'),
                            (total_df.filter(pl.col('p_val') <= 1e-10),
                             'all hits p<=1e-10')]:
        print(text)
        var_thresh1 = .8
        var_thresh2 = .3
        for susie_thresh in (var_thresh1, var_thresh2):
            for finemap_thresh in (var_thresh1, var_thresh2):
                count = curr_df.filter(
                    (pl.col('susie_cs') >= 0)
                    & (pl.col('susie_pip') >= susie_thresh)
                    & (pl.col('finemap_pip') >= finemap_thresh)).shape[0]
                print(
                    f'Vars in a SuSiE CS with SuSIE PIP >= {susie_thresh} and with FINEMAP PIP >= {finemap_thresh}: {count}'
                )

        for susie_thresh in (var_thresh1, var_thresh2):
            count = curr_df.filter(
                (pl.col('susie_cs') >= 0)
                & (pl.col('susie_pip') >= susie_thresh)
                & (pl.col('finemap_pip') < var_thresh2)).shape[0]
            print(
                f'Vars in a SuSiE CS with SuSIE PIP >= {susie_thresh} with FINEMAP PIP < {var_thresh2}: {count}'
            )

        for finemap_thresh in (var_thresh1, var_thresh2):
            count = curr_df.filter(
                (pl.col('finemap_pip') >= finemap_thresh)
                & ((pl.col('susie_cs') < 0)
                   | (pl.col('susie_pip') < var_thresh2))).shape[0]
            print(
                f'Vars with FINEMAP PIP >= {finemap_thresh} either not in a SuSiE CS or having SuSiE PIP <= {var_thresh2}: {count}'
            )

    # Not going to report susie alphas v pips - just know that they're similar if we look
    # at vars in good credible sets and not otherwise
    # NOTE(review): the original opens a triple-quoted (disabled) block at
    # this point whose contents are not visible; it is omitted here.
import polars as pl

from .dataset import df

# Build a lazy query that adds "foo_or_bar": the value of "left" where
# "range" is at least 5, otherwise the value of "right".
q = (
    df.lazy()
    .with_column(
        pl.when(pl.col("range") >= 5)
        .then(pl.col("left"))
        .otherwise(pl.col("right"))
        .alias("foo_or_bar")
    )
)

# Execute the lazy query and rebind df to the materialized result.
df = q.collect()
joined = pl.read_csv( f'{ukb}/export_scripts/results/causal_STR_candidates_for_publication.tab', sep='\t').select(( 'chr' + pl.col('chrom').cast(str) + '_' + pl.col('start_pos').cast(str) ).alias('chr_pos')).distinct().join( results, how='inner', left_on='chr_pos', right_on='hg19_START').join(filtered, how='left', on='chr_pos').filter( pl.col('FILTER').is_null()).drop([ 'START', 'FILTER', 'CHROM_START', 'CHROM', 'POS' ]).filter((pl.col('splice_p_vals').str.lengths() > 0) | ( pl.col('expression_p_vals').str.lengths() > 0)).with_columns([ pl.when(pl.col('splice_p_vals').str.lengths() == 0).then( None).otherwise( pl.col('splice_p_vals')).alias('splice_p_vals'), pl.when(pl.col('splice_p_vals').str.lengths() == 0).then( None).otherwise( pl.col('splice_associations (tissue:gene:exonID)') ).alias('splice_associations (tissue:gene:exonID)'), pl.when(pl.col('splice_p_vals').str.lengths() == 0).then( None).otherwise( pl.col('splice_n_tests')).alias('splice_n_tests'), pl.when(pl.col('expression_p_vals').str.lengths() == 0).then(None).otherwise( pl.col('expression_p_vals')).alias( 'expression_p_vals'), pl.when(pl.col('expression_p_vals').str.lengths() == 0). then(None).otherwise( pl.col('expression_associations (tissue:gene)')).alias(
pl.col('FILTER').is_null() ).drop(['START', 'FILTER', 'CHROM_START', 'CHROM', 'POS']).filter( ''' ''' total_qtl_str = total_qtl_str.join( trait_assocs, on='chrom_pos' ) ''' total_qtl_str = total_qtl_str.filter( (pl.col('p_vals_expression').str.lengths() > 0) | (pl.col('p_vals_splice').str.lengths() > 0) | (pl.col('p_vals_isoform').str.lengths() > 0) ).with_columns([ pl.when( pl.col('p_vals_expression').str.lengths() == 0).then(None).otherwise( pl.col('p_vals_expression')).alias('p_vals_expression'), pl.when( pl.col('p_vals_expression').str.lengths() == 0).then(None).otherwise( pl.col('associations (tissue:target)_expression')).alias( 'associations (tissue:target)_expression'), pl.when( pl.col('p_vals_expression').str.lengths() == 0).then(None).otherwise( pl.col('n_tests_expression')).alias('n_tests_expression'), pl.when(pl.col('p_vals_splice').str.lengths() == 0).then(None).otherwise( pl.col('p_vals_splice')).alias('p_vals_splice'), pl.when(pl.col('p_vals_splice').str.lengths() == 0).then(None).otherwise( pl.col('associations (tissue:target)_splice')).alias( 'associations (tissue:target)_splice'), pl.when(pl.col('p_vals_splice').str.lengths() == 0).then(None).otherwise( pl.col('n_tests_splice')).alias('n_tests_splice'),
print('Loading eSTRs ... ') eSTRs = pl.read_csv(f'{ukb}/misc_data/eSTR/eSTRs.csv', sep=',').rename({ 'score': 'eSTR_CAVIAR_score' }).with_column(pl.col('chrom').str.slice(3).cast(int)).groupby( ['chrom', 'str.start']).agg(pl.col('eSTR_CAVIAR_score').max()) all_STRs = all_STRs.join( eSTRs, how='left', left_on=['chrom', 'pos'], right_on=['chrom', 'str.start'], ).with_columns([ (~pl.col('eSTR_CAVIAR_score').is_null()).alias('eSTR'), pl.when(pl.col('eSTR_CAVIAR_score').is_null()).then(False).otherwise( pl.col('eSTR_CAVIAR_score') >= .3).alias('FM_eSTR') ]) print('Getting promoters ... ', flush=True, end='') genes = pl.read_csv( f'{ukb}/misc_data/gencode/gencode.v38lift37.annotation.without_chr.sorted.gene.gff3', sep='\t', has_header=False, columns=[0, 3, 4, 6, 8], dtypes={ 'column_1': str }).select([ pl.col('column_1').alias('chrom'), pl.col('column_4').alias('start_pos'), pl.col('column_5').alias('end_pos'), pl.col('column_7').alias('strand'),