def test_apply_none() -> None:
    """Exercise multi-column ``pl.apply``/``pl.map``, including a ``None`` result.

    Three checks are made on the same frame: a grouped ``pl.apply`` over
    three input expressions, a ``pl.map`` over two columns inside ``select``,
    and a grouped ``pl.apply`` whose callback returns ``None`` when the first
    value of its first input equals 190.
    """
    frame = pl.DataFrame(
        {
            "g": [1, 1, 1, 2, 2, 2, 5],
            "a": [2, 4, 5, 190, 1, 4, 1],
            "b": [1, 3, 2, 1, 43, 3, 1],
        }
    )

    # Grouped apply: the callback indexes its argument once per input
    # expression (three inputs -> cols[0], cols[1], cols[2]).
    combined = pl.apply(
        exprs=["a", pl.col("b") ** 4, pl.col("a") / 4],
        f=lambda cols: cols[0] * cols[1] + cols[2].sum(),
    ).alias("multiple")
    grouped = frame.groupby("g", maintain_order=True).agg(combined)
    multiple = grouped["multiple"]
    assert multiple[0].to_list() == [4.75, 326.75, 82.75]
    assert multiple[1].to_list() == [238.75, 3418849.75, 372.75]

    # ``pl.map`` over two columns must match the plain column product.
    product = pl.map(exprs=["a", "b"], f=lambda cols: cols[0] * cols[1])
    mapped = frame.select(product)
    assert mapped["a"].to_list() == (frame["a"] * frame["b"]).to_list()

    # check if we can return None
    def first_unless_190(cols: List) -> Optional[int]:
        return None if cols[0][0] == 190 else cols[0]

    nullable = pl.apply(
        exprs=["a", pl.col("b") ** 4, pl.col("a") / 4],
        f=first_unless_190,  # type: ignore
    ).alias("multiple")
    multiple = frame.groupby("g", maintain_order=True).agg(nullable)["multiple"]
    # The second group (g == 2) starts with a == 190, so its result is None.
    assert multiple[1] is None
def test_agg_objects() -> None:
    """Check that a grouped ``pl.apply`` returning dicts yields an Object column."""
    frame = pl.DataFrame(
        {
            "names": ["foo", "ham", "spam", "cheese", "egg", "foo"],
            "dates": ["1", "1", "2", "3", "3", "4"],
            "groups": ["A", "A", "B", "B", "B", "C"],
        }
    )

    def dates_to_names(cols):
        # Pair each entry of the first input with the matching entry of the
        # second input and collect the pairs into a dict.
        return dict(zip(cols[0], cols[1]))

    date_name_map = pl.apply([pl.col("dates"), pl.col("names")], dates_to_names)
    aggregated = frame.groupby("groups").agg([date_name_map])

    # Only the schema is checked: the group key column and the applied column.
    assert aggregated.dtypes == [pl.Utf8, pl.Object]
def test_apply_numpy_out_3057() -> None:
    """Check a grouped ``pl.apply`` whose callback returns ``np.trapz`` output."""
    frame = pl.DataFrame(
        {
            "id": [0, 0, 0, 1, 1, 1],
            "t": [2.0, 4.3, 5, 10, 11, 14],
            "y": [0.0, 1, 1.3, 2, 3, 4],
        }
    )
    expected = pl.DataFrame({"id": [0, 1], "result": [1.955, 13.0]})

    # The callback passes its first input ("y") as y and its second ("t") as x.
    trapezoid = pl.apply(
        ["y", "t"],
        lambda cols: np.trapz(y=cols[0], x=cols[1]),
    ).alias("result")
    actual = frame.groupby("id", maintain_order=True).agg(trapezoid)

    assert actual.frame_equal(expected)
def main(): df = pl.scan_csv('post_finemapping/intermediate_results/gathered_data.tab', sep='\t').filter((pl.col('susie_pip') >= 0.3) | (pl.col('finemap_pip') >= 0.3)) df = df.with_column( (pl.col('susie_pip') - pl.col('finemap_pip')).alias('susie_f_pip_diff')).with_column( pl.col('susie_f_pip_diff').abs().alias('abs_pip_diff')) locus_summary_df = pl.concat([ pl.scan_csv( f'export_scripts/intermediate_results/chr{chrom}_loci_summary.tab', sep='\t') for chrom in range(1, 23) ]).select(['chr', 'pos', 'multiallelicness', 'allele_dist']) allele_threshes = (0.0004, 0.002, 0.01, 0.05) #allele_threshes = [0.01] df = df.join( locus_summary_df, how='left', #left_on=['chrom', 'snpstr_pos'], left_on=['chrom', 'pos'], right_on=['chr', 'pos']).collect() snp_df = df.filter(~pl.col('is_STR')) str_df = df.filter(pl.col('is_STR')) assert not str_df.select( pl.col('multiallelicness').is_null().any()).to_numpy()[0] str_df = str_df.with_columns([ pl.apply('allele_dist', count_alleles(thresh), pl.UInt32).alias(f'alleles_{thresh}') for thresh in allele_threshes ]) confusions = pl.concat([ pl.scan_csv(f'side_analyses/length_confusion/chr{i}.tab', sep='\t').with_column(pl.lit(i).alias('chrom').cast(int)) for i in range(1, 23) ]).collect() merged_df = str_df.join(confusions, how='left', on=['chrom', 'pos']) step = 0.05 fig = bokeh.plotting.figure(title='STR PIP histogram', width=size, height=size, x_axis_label='PIP', y_axis_label='density', tools='', toolbar_location=None) xs = np.arange(0, 1 + step, step) fig.line( x=xs[:-1], #y=scipy.stats.gaussian_kde(arr)(xs), y=np.histogram(str_df['susie_pip'], bins=xs, density=True)[0], color='red', legend_label='SuSiE STRs') fig.line( x=xs[:-1], #y=scipy.stats.gaussian_kde(arr)(xs), y=np.histogram(str_df['finemap_pip'], bins=xs, density=True)[0], color='blue', legend_label='FINEMAP STRs') fig.line( x=xs[:-1], #y=scipy.stats.gaussian_kde(arr)(xs), y=np.histogram(snp_df['susie_pip'], bins=xs, density=True)[0], color='green', legend_label='SuSiE SNPs') 
fig.line( x=xs[:-1], #y=scipy.stats.gaussian_kde(arr)(xs), y=np.histogram(snp_df['finemap_pip'], bins=xs, density=True)[0], color='purple', legend_label='FINEMAP SNPs') bokeh.io.export_png(fig, filename='post_finemapping/results/pip_histogram.png') fig = bokeh.plotting.figure(title='STR PIP scatterplot', width=size, height=size, x_axis_label='FINEMAP PIP', y_axis_label='SuSiE PIP', tools='', toolbar_location=None) fig.circle(str_df['susie_pip'], str_df['finemap_pip']) bokeh.io.export_png( fig, filename='post_finemapping/results/str_comp_pip_scatter.png') fig = bokeh.plotting.figure(title='STR PIP heatmap', width=size, height=size, x_axis_label='FINEMAP PIP', y_axis_label='SuSiE PIP', match_aspect=True, tools='', toolbar_location=None) heat_map(fig, str_df['finemap_pip'], str_df['susie_pip'], 'post_finemapping/results/str_comp_pip_heatmap.png') fig = bokeh.plotting.figure(title='STR PIPs', width=size, height=size, x_axis_label='FINEMAP PIP', y_axis_label='SuSiE PIP', match_aspect=True, tools='', toolbar_location=None) weighted_heat_map( fig, merged_df['finemap_pip'], merged_df['susie_pip'], merged_df['chance_of_length_confusion'], 'average chance of misgenotyping per sample at any such locus', 'post_finemapping/results/str_comp_pip_chance_map.png') fig = bokeh.plotting.figure(title='STR PIPs', width=size, height=size, x_axis_label='FINEMAP PIP', y_axis_label='SuSiE PIP', match_aspect=True, tools='', toolbar_location=None) weighted_heat_map( fig, merged_df['finemap_pip'], merged_df['susie_pip'], merged_df['normalized_avg_abs_length_confusion'], 'average number of standard deviations of misgenotyping per sample at any such locus', 'post_finemapping/results/str_comp_pip_sd_map.png') fig = bokeh.plotting.figure(title='SNP PIP scatterplot', width=size, height=size, x_axis_label='FINEMAP PIP', y_axis_label='SuSiE PIP', tools='', toolbar_location=None) fig.circle(snp_df['susie_pip'], snp_df['finemap_pip']) bokeh.io.export_png( fig, 
filename='post_finemapping/results/snp_comp_pip_scatter.png') fig = bokeh.plotting.figure(title='SNP PIP heatmap', width=size, height=size, x_axis_label='FINEMAP PIP', y_axis_label='SuSiE PIP', match_aspect=True, tools='', toolbar_location=None) heat_map(fig, snp_df['finemap_pip'], snp_df['susie_pip'], 'post_finemapping/results/snp_comp_pip_heatmap.png') color_mapper = bokeh.models.LinearColorMapper(palette=palette, low=0, high=1) color_bar = bokeh.models.ColorBar(color_mapper=color_mapper, width=30) cmap = bokeh.transform.linear_cmap('foo', palette=palette, low=0, high=1) fig = bokeh.plotting.figure(title='STR PIP scatterplot', width=size, height=size, x_axis_label='FINEMAP PIP', y_axis_label='SuSiE PIP', tools='', match_aspect=True, toolbar_location=None) cb_title = bokeh.models.Title( text='chance a genotype call at this locus is wrong', align='center') fig.add_layout(color_bar, 'right') fig.add_layout(cb_title, 'right') cds = bokeh.models.ColumnDataSource( dict(x=merged_df['finemap_pip'], y=merged_df['susie_pip'], color=[ linear_int_interpolate((134, 204, 195), (9, 41, 46), val) for val in merged_df['chance_of_length_confusion'] ])) fig.circle(x='x', y='y', color='color', source=cds) bokeh.io.export_png( fig, filename='post_finemapping/results/colored_str_comp_pip_scatter.png') step = 0.05 for thresh in allele_threshes: for pip_thresh in (0.3, 0.8): for xs, x_label, out_loc, title, col in [ ( np.arange(-1, 1 + step, step), 'SuSiE PIP - FINEMAP PIP', f'post_finemapping/results/pip_diff_density_allele_thresh_{thresh}_pip_thresh_{pip_thresh}.png', f'PIP diff, STR allele penetrance threshold = {thresh:.4}', 'susie_f_pip_diff', ), (np.arange(0, 1 + step, step), 'absolute PIP difference', f'post_finemapping/results/pip_abs_diff_density_allele_thresh_{thresh}_pip_thresh_{pip_thresh}.png', f'absolute PIP diff, STR allele penetrance threshold = {thresh:.4}', 'abs_pip_diff') ]: filter_exp = (pl.col('susie_pip') >= pip_thresh) | ( pl.col('finemap_pip') >= pip_thresh) fig 
= bokeh.plotting.figure(title=title, width=size, height=size, x_axis_label=x_label, y_axis_label='density', tools='', toolbar_location=None) fig.line( x=xs[:-1], y=np.histogram(snp_df.filter(filter_exp)[col].to_numpy(), bins=xs, density=True)[0], #y=scipy.stats.gaussian_kde(snp_df['susie_f_pip_diff'].to_numpy())(xs), color='black', legend_label=f'SNPs (n={snp_df.shape[0]})') for count, color in ((2, 'brown'), (3, 'red'), (4, 'orange')): arr = str_df.filter(filter_exp).filter( pl.col(f'alleles_{thresh}') == count)[col].to_numpy() fig.line( x=xs[:-1], #y=scipy.stats.gaussian_kde(arr)(xs), y=np.histogram(arr, bins=xs, density=True)[0], color=color, legend_label=f'{count}-allele STRs (n={arr.shape[0]})') arr = str_df.filter(filter_exp).filter( pl.col(f'alleles_{thresh}') >= 5)[col].to_numpy() fig.line( x=xs[:-1], #y=scipy.stats.gaussian_kde(arr)(xs), y=np.histogram(arr, bins=xs, density=True)[0], color='gold', legend_label= f'STRs with at least 5 alleles (n={arr.shape[0]})') fig.add_layout( bokeh.models.Title( text= f'Variants with PIP at least {pip_thresh} for SuSiE or FINEMAP' ), 'below') bokeh.io.export_png(fig, filename=out_loc) fig = bokeh.plotting.figure(title='STR PIP diff', width=size, height=size, x_axis_label='multiallelicness', y_axis_label='SuSiE PIP - FINEMAP PIP', tools='', toolbar_location=None) heat_map(fig, str_df['multiallelicness'], str_df['susie_f_pip_diff'], 'post_finemapping/results/str_pip_diff_heatmap.png', y_min=-1) fig = bokeh.plotting.figure(title='STR PIP abs diff', width=size, height=size, x_axis_label='multiallelicness', y_axis_label='absolute PIP difference', tools='', toolbar_location=None) heat_map(fig, str_df['multiallelicness'], str_df['abs_pip_diff'], 'post_finemapping/results/str_pip_abs_diff_heatmap.png') fig = bokeh.plotting.figure(title='PIP abs diff', width=size, height=size, x_axis_label='multiallelicness', y_axis_label='absolute PIP difference', tools='', toolbar_location=None)