def test_lazy_functions():
    df = pl.DataFrame({
        "a": ["foo", "bar", "2"],
        "b": [1, 2, 3],
        "c": [1.0, 2.0, 3.0],
    })
    out = df[[pl.count("a")]]
    assert out["a"] == 3
    assert pl.count(df["a"]) == 3

    out = df[[
        pl.var("b"),
        pl.std("b"),
        pl.max("b"),
        pl.min("b"),
        pl.sum("b"),
        pl.mean("b"),
        pl.median("b"),
        pl.n_unique("b"),
        pl.first("b"),
        pl.last("b"),
    ]]
    expected = 1.0
    assert np.isclose(out.select_at_idx(0), expected)
    assert np.isclose(pl.var(df["b"]), expected)
    expected = 1.0
    assert np.isclose(out.select_at_idx(1), expected)
    assert np.isclose(pl.std(df["b"]), expected)
    expected = 3
    assert np.isclose(out.select_at_idx(2), expected)
    assert np.isclose(pl.max(df["b"]), expected)
    expected = 1
    assert np.isclose(out.select_at_idx(3), expected)
    assert np.isclose(pl.min(df["b"]), expected)
    expected = 6
    assert np.isclose(out.select_at_idx(4), expected)
    assert np.isclose(pl.sum(df["b"]), expected)
    expected = 2
    assert np.isclose(out.select_at_idx(5), expected)
    assert np.isclose(pl.mean(df["b"]), expected)
    expected = 2
    assert np.isclose(out.select_at_idx(6), expected)
    assert np.isclose(pl.median(df["b"]), expected)
    expected = 3
    assert np.isclose(out.select_at_idx(7), expected)
    assert np.isclose(pl.n_unique(df["b"]), expected)
    expected = 1
    assert np.isclose(out.select_at_idx(8), expected)
    assert np.isclose(pl.first(df["b"]), expected)
    expected = 3
    assert np.isclose(out.select_at_idx(9), expected)
    assert np.isclose(pl.last(df["b"]), expected)
def test_lazy_functions():
    df = pl.DataFrame({
        "a": ["foo", "bar", "2"],
        "b": [1, 2, 3],
        "c": [1.0, 2.0, 3.0],
    })
    out = df[[pl.count("a")]]
    assert out[0] == 3
    assert pl.count(df["a"]) == 3

    out = df[[
        pl.var("b"),
        pl.std("b"),
        pl.max("b"),
        pl.min("b"),
        pl.sum("b"),
        pl.mean("b"),
        pl.median("b"),
        pl.n_unique("b"),
        pl.first("b"),
        pl.last("b"),
    ]]
    expected = 1.0
    assert np.isclose(out[0], expected)
    assert np.isclose(pl.var(df["b"]), expected)
    expected = 1.0
    assert np.isclose(out[1], expected)
    assert np.isclose(pl.std(df["b"]), expected)
    expected = 3
    assert np.isclose(out[2], expected)
    assert np.isclose(pl.max(df["b"]), expected)
    expected = 1
    assert np.isclose(out[3], expected)
    assert np.isclose(pl.min(df["b"]), expected)
    expected = 6
    assert np.isclose(out[4], expected)
    assert np.isclose(pl.sum(df["b"]), expected)
    expected = 2
    assert np.isclose(out[5], expected)
    assert np.isclose(pl.mean(df["b"]), expected)
    expected = 2
    assert np.isclose(out[6], expected)
    assert np.isclose(pl.median(df["b"]), expected)
    expected = 3
    assert np.isclose(out[7], expected)
    assert np.isclose(pl.n_unique(df["b"]), expected)
    expected = 1
    assert np.isclose(out[8], expected)
    assert np.isclose(pl.first(df["b"]), expected)
    expected = 3
    assert np.isclose(out[9], expected)
    assert np.isclose(pl.last(df["b"]), expected)
def test_rolling() -> None:
    dates = [
        "2020-01-01 13:45:48",
        "2020-01-01 16:42:13",
        "2020-01-01 16:45:09",
        "2020-01-02 18:12:48",
        "2020-01-03 19:45:32",
        "2020-01-08 23:16:43",
    ]
    df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_column(
        pl.col("dt").str.strptime(pl.Datetime)
    )
    out = df.groupby_rolling(index_column="dt", period="2d").agg(
        [
            pl.sum("a").alias("sum_a"),
            pl.min("a").alias("min_a"),
            pl.max("a").alias("max_a"),
        ]
    )
    assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1]
    assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1]
    assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1]
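# Note (added, not part of the original test): groupby_rolling with
# period="2d" builds, for every row, a window that looks back two days from
# that row's own timestamp. For example, the fourth row (2020-01-02 18:12:48)
# still covers all four observations seen so far, so sum_a == 3 + 7 + 5 + 9 == 24,
# matching the assertion above.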
def test_fold():
    df = pl.DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})
    out = df.lazy().select(pl.sum(["a", "b"])).collect()
    assert out["sum"].series_equal(pl.Series("sum", [2, 4, 6]))

    out = df.select(
        pl.fold(acc=pl.lit(0), f=lambda acc, x: acc + x, exprs=pl.col("*")).alias("foo")
    )
    assert out["foo"] == [2, 4, 6]
def test_fold() -> None:
    df = pl.DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})
    out = df.select([
        pl.sum(["a", "b"]),
        pl.max(["a", pl.col("b") ** 2]),
        pl.min(["a", pl.col("b") ** 2]),
    ])
    assert out["sum"].series_equal(pl.Series("sum", [2.0, 4.0, 6.0]))
    assert out["max"].series_equal(pl.Series("max", [1.0, 4.0, 9.0]))
    assert out["min"].series_equal(pl.Series("min", [1.0, 2.0, 3.0]))

    out = df.select(
        pl.fold(acc=pl.lit(0), f=lambda acc, x: acc + x, exprs=pl.col("*")).alias("foo")
    )
    assert out["foo"] == [2, 4, 6]
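# Sketch (added; not from the original tests): the pl.fold call above can be
# read as unrolling to a plain row-wise expression. The frame below mirrors
# the test data; treating the fold as "acc + a + b" is an assumption about
# how it expands, not an assertion from the source.
import polars as pl

_df = pl.DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})
_unrolled = _df.select((pl.lit(0) + pl.col("a") + pl.col("b")).alias("foo"))
# _unrolled["foo"] holds [2.0, 4.0, 6.0], the same values the fold-based
# assertion above checks for the "foo" column.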
def test_window_function():
    df = pl.DataFrame({
        "A": [1, 2, 3, 4, 5],
        "fruits": ["banana", "banana", "apple", "apple", "banana"],
        "B": [5, 4, 3, 2, 1],
        "cars": ["beetle", "audi", "beetle", "beetle", "beetle"],
    })
    q = df.lazy().with_columns([
        pl.sum("A").over("fruits").alias("fruit_sum_A"),
        pl.first("B").over("fruits").alias("fruit_first_B"),
        pl.max("B").over("cars").alias("cars_max_B"),
    ])
    out = q.collect()
    assert out["cars_max_B"] == [5, 4, 5, 5, 5]

    out = df[[pl.first("B").over(["fruits", "cars"]).alias("B_first")]]
    assert out["B_first"] == [5, 4, 3, 3, 5]
id037,id097,id0000062401,48,8,53992,5,15,83.565443
id073,id081,id0000017280,54,90,28480,5,4,17.078693
id081,id073,id0000073423,51,22,39788,2,12,45.883758
id062,id080,id0000092749,1,75,67857,3,10,80.418674
id045,id031,id0000076210,2,42,80312,4,5,48.668692
id082,id048,id0000080227,56,62,16760,3,11,34.933239
id035,id032,id0000033279,55,13,80560,5,5,61.372678
id053,id013,id0000073898,61,63,12387,4,7,29.949863"""
f = io.BytesIO(csv)
x = pl.read_csv(
    f,
    dtype={
        "id4": pl.Int32,
        "id5": pl.Int32,
        "id6": pl.Int32,
        "v1": pl.Int32,
        "v2": pl.Int32,
        "v3": pl.Float64,
    },
)
x["id1"] = x["id1"].cast(pl.Categorical)
x["id2"] = x["id2"].cast(pl.Categorical)
x["id3"] = x["id3"].cast(pl.Categorical)
x = x.lazy()

question = "sum v1 by id1"  # q1
ans = x.groupby("id1").agg(pl.sum("v1")).collect()
print(ans.shape, flush=True)
    then('doubly').when(
        (pl.col('susie_CP') >= 0.8) | (pl.col('finemap_CP') >= 0.8)
    ).then('singly').otherwise('not').alias('finemapping'),
    'susie_CP',
    'finemap_CP',
    'susie_CP_best_guess_genotypes',
    'finemap_CP_pval_thresh_5e-4',
    'finemap_CP_mac_thresh_100',
    'finemap_CP_prior_effect_size_0.05%',
    'finemap_CP_prior_4_signals',
    'finemap_CP_stopping_thresh_1e-4',
    'susie_CP_prior_snps_over_strs',
    'finemap_CP_prior_snps_over_strs',
    'finemap_CP_prior_effect_size_0.0025%',
    pl.sum([
        pl.col(f'{ethnicity}_p_val').cast(str) + pl.lit(', ')
        for ethnicity in other_ethnicities
    ]).str.replace(', $', '').alias('other_ethnicity_association_p_values'),
    pl.sum([
        pl.when(pl.col(f'{ethnicity}_p_val') > .05).then('NA').when(
            pl.col('coeff') > 0
        ).then('+').otherwise('-') + pl.lit(', ')
        for ethnicity in other_ethnicities
    ]).str.replace(', $', '').alias('other_ethnicity_effect_directions'),
    *[
        pl.col(f'{ethnicity}_allele_dosages').apply(
            dosages_to_frequencies
        ).alias(f'{ethnicity}_allele_frequencies')
        for ethnicity in other_ethnicities
    ],
])
finemapping_results.write_csv(
    f'{ukb}/post_finemapping/results/singly_finemapped_strs_for_paper.tab',
"v3": pl.Float64, }, ) ON_STRINGS = sys.argv.pop() == "on_strings" if not ON_STRINGS: x["id1"] = x["id1"].cast(pl.Categorical) x["id2"] = x["id2"].cast(pl.Categorical) x["id3"] = x["id3"].cast(pl.Categorical) df = x.clone() x = df.lazy() t00 = time.time() t0 = time.time() print("q1") out = x.groupby("id1").agg(pl.sum("v1")).collect() print(time.time() - t0) print("out.shape", out.shape) print('out["v1_sum"].sum()', out["v1_sum"].sum()) t0easy = time.time() t0 = time.time() print("q2") out = x.groupby(["id1", "id2"]).agg(pl.sum("v1")).collect() print(time.time() - t0) print("out.shape", out.shape) print('out["v1_sum"].sum()', out["v1_sum"].sum()) t0 = time.time() print("q3") out = x.groupby("id3").agg([pl.sum("v1"), pl.mean("v3")]).collect()
from .dataset import df
import polars as pl
from polars import col

df = df[[pl.when(col("random") > 0.5).then(0).otherwise(col("random")) * pl.sum("nrs")]]
from .dataset import df
import polars as pl
from polars import col

df = df.select([pl.sum("nrs"), pl.col("names").sort()])
from .dataset import df
import polars as pl
from polars import col

df = df.lazy().select([pl.sum("nrs"), col("names").sort()]).collect()
from .dataset import df
import polars as pl
from polars import col

df = df[[
    pl.sum("random").alias("sum"),
    pl.min("random").alias("min"),
    pl.max("random").alias("max"),
    col("random").max().alias("other_max"),
    pl.std("random").alias("std dev"),
    pl.var("random").alias("variance"),
]]
def map_expr(name: str) -> pl.Expr:
    return (
        pl.when(ignore_nulls or pl.col(name).null_count() == 0)
        .then(
            pl.struct([
                pl.sum(name).alias("sum"),
                (pl.count() - pl.col(name).null_count()).alias("count"),
            ]),
        )
        .otherwise(None)
    ).alias("out")
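# Usage sketch for map_expr above (added; the frame, the column name "x", and
# the module-level ignore_nulls flag are assumptions, since the original
# snippet only defines the expression builder):
import polars as pl

ignore_nulls = False
_df = pl.DataFrame({"x": [1, None, 3]})
_out = _df.select(map_expr("x"))
# With a null present and ignore_nulls=False the when-condition is false, so
# "out" is null; with no nulls (or ignore_nulls=True) it is a struct holding
# the column's sum and its non-null count.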
def test_groupby():
    df = DataFrame({
        "a": ["a", "b", "a", "b", "b", "c"],
        "b": [1, 2, 3, 4, 5, 6],
        "c": [6, 5, 4, 3, 2, 1],
    })

    # use __getitem__ to map to select
    assert (df.groupby("a")["b"].sum().sort(by="a").frame_equal(
        DataFrame({"a": ["a", "b", "c"], "": [4, 11, 6]})))
    assert (df.groupby("a").select("b").sum().sort(by="a").frame_equal(
        DataFrame({"a": ["a", "b", "c"], "": [4, 11, 6]})))
    assert (df.groupby("a").select("c").sum().sort(by="a").frame_equal(
        DataFrame({"a": ["a", "b", "c"], "": [10, 10, 1]})))
    assert (df.groupby("a").select("b").min().sort(by="a").frame_equal(
        DataFrame({"a": ["a", "b", "c"], "": [1, 2, 6]})))
    assert (df.groupby("a").select("b").max().sort(by="a").frame_equal(
        DataFrame({"a": ["a", "b", "c"], "": [3, 5, 6]})))
    assert (df.groupby("a").select("b").mean().sort(by="a").frame_equal(
        DataFrame({"a": ["a", "b", "c"], "": [2.0, (2 + 4 + 5) / 3, 6.0]})))
    assert (df.groupby("a").select("b").last().sort(by="a").frame_equal(
        DataFrame({"a": ["a", "b", "c"], "": [3, 5, 6]})))

    # check if it runs
    (df.groupby("a").select("b").n_unique())
    (df.groupby("a").select("b").quantile(0.3))
    (df.groupby("a").select("b").agg_list())

    gb_df = df.groupby("a").agg({"b": ["sum", "min"], "c": "count"})
    assert "b_sum" in gb_df.columns
    assert "b_min" in gb_df.columns

    # # TODO: is false because count is u32
    # df.groupby(by="a", select="b", agg="count").frame_equal(
    #     DataFrame({"a": ["a", "b", "c"], "": [2, 3, 1]})
    # )

    assert df.groupby("a").apply(lambda df: df[["c"]].sum()).sort("c")["c"][0] == 1
    assert df.groupby("a").groups().sort("a")["a"].series_equal(
        Series(["a", "b", "c"]))

    for subdf in df.groupby("a"):
        if subdf["a"][0] == "b":
            assert subdf.shape == (3, 3)

    assert df.groupby("a").get_group("c").shape == (1, 3)
    assert df.groupby("a").get_group("b").shape == (3, 3)
    assert df.groupby("a").get_group("a").shape == (2, 3)

    # Use lazy API in eager groupby
    assert df.groupby("a").agg([pl.sum("b")]).shape == (3, 2)
import polars as pl
from .dataset import parsed_sorted as dataset


# creates a new polars.Series with differences per row
def mkdiff(cumcases: pl.Series) -> pl.Series:
    return cumcases - cumcases.shift(1)


q = dataset.with_columns(
    [
        pl.col("cumcases")
        .apply(mkdiff)
        .over(pl.col("country"))
        .take(pl.col("country").arg_unique())
        .explode()
        .alias("diffcases"),
        pl.sum("cumcases").over("country").alias("cases/country"),
        pl.sum("cumcases").over("date").alias("sum_cases/day"),
        pl.min("cumcases").over("date").alias("min_cases/day"),
        pl.max("cumcases").over("date").alias("max_cases/day"),
        pl.sum("cumcases").over(pl.col("date").year()).alias("cases/year"),
    ]
)
df = q.collect()
import polars as pl

dataset = pl.DataFrame({
    "A": [1, 2, 3, 4, 5],
    "fruits": ["banana", "banana", "apple", "apple", "banana"],
    "B": [5, 4, 3, 2, 1],
    "cars": ["beetle", "audi", "beetle", "beetle", "beetle"],
})

q = dataset.lazy().with_columns([
    pl.sum("A").over("fruits").alias("fruit_sum_A"),
    pl.first("B").over("fruits").alias("fruit_first_B"),
    pl.max("B").over("cars").alias("cars_max_B"),
])

df = q.collect()
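# Expected window columns (added; derived by hand from the five-row frame
# above, and consistent with the cars_max_B assertion in the
# test_window_function snippet earlier in this collection):
# fruit_sum_A   -> [8, 8, 7, 7, 8]   sum of A within each fruit group
# fruit_first_B -> [5, 5, 3, 3, 5]   first B within each fruit group
# cars_max_B    -> [5, 4, 5, 5, 5]   max of B within each car group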
import polars as pl
from polars.lazy import *

reddit = pl.scan_csv("data/reddit.csv").select(
    [pl.sum("comment_karma"), pl.min("link_karma")])

if __name__ == "__main__":
    df = reddit.fetch()
    with open("book/src/outputs/how_can_i_aggregate.txt", "w") as f:
        f.write(str(df))
from .dataset import df
import polars as pl
from polars import col

df = df[[pl.sum("nrs"), col("names").sort()]]
def test_fold():
    df = DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})
    out = df.lazy().select(pl.sum(["a", "b"])).collect()
    assert out["sum"].series_equal(Series("sum", [2, 4, 6]))
from .dataset import df
import polars as pl
from polars import col

df = (df.lazy().with_columns(
    [pl.sum("nrs").alias("nrs_sum"), col("random").count().alias("count")]
).collect())
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('outtable')
    parser.add_argument('outreadme')
    parser.add_argument('pos_to_snpstr_pos')
    parser.add_argument('intable')
    parser.add_argument('inreadme')
    parser.add_argument('spot_test_fname_json_dict_fname')
    args = parser.parse_args()

    with open(args.spot_test_fname_json_dict_fname) as json_file:
        spot_test_fname_json_dict = next(json_file)

    with open(args.outreadme, 'w+') as readme:
        with open(args.inreadme) as inreadme:
            readme.write(inreadme.read())
        readme.write(
            'other_ethnic_association_ps - association p-values for the other '
            'ethnicities in the order ' + ','.join(other_ethnicities) + '\n'
        )
        readme.write(
            'other_ethnic_effect_directions - direction of association (+/-) '
            'for the other ethnicities in the order ' + ','.join(other_ethnicities) +
            " (NaN if that ethnicity's p > 0.05)\n"
        )
        for ethnicity in other_ethnicities:
            readme.write(
                f'{ethnicity}_population_allele_frequencies - frequencies of each allele '
                "(by dosage) among the ethnicity's tested population\n"
            )

    hits = pl.scan_csv(
        args.intable,
        sep='\t',
        # hack: added arguments here that will be ignored when reading
        # putatively_causal but not when reading exonic_finemapped
        dtype={'alleles': str}
    )
    cols = hits.columns

    # hack to only clean in one of the two cases this function is running
    if 'white_brit_allele_frequencies' in cols:
        hits = hits.with_column(
            pl.col('white_brit_allele_frequencies').str.replace_all('"', "'")
        )

    hits = hits.join(
        pl.scan_csv(args.pos_to_snpstr_pos, sep='\t'),
        how='left',
        left_on=['chrom', 'start_pos'],
        right_on=['chrom', 'pos']
    )

    spot_tests_fnames = {
        tuple(key.split('__')): fname
        for key, fname in json.loads(spot_test_fname_json_dict).items()
    }

    spot_tests = {}
    for outer_ethnicity in other_ethnicities:
        spot_tests[outer_ethnicity] = pl.concat([
            (pl.scan_csv(
                spot_test_fname,
                sep='\t',
                dtype={'alleles': str},
                null_values=['nan'],
                with_column_names=lambda cols: list(fix_cols(cols, phenotype))
            ).select([
                pl.lit(phenotype).alias('phenotype'),
                'chrom',
                'pos',
                pl.col('p_phenotype').cast(float).alias(f'{ethnicity}_p'),
                pl.when(pl.col('p_phenotype') >= 0.05).then(np.nan).when(
                    pl.col('coeff_phenotype') > 0
                ).then(pl.lit('+')).otherwise(pl.lit('-')).alias(f'{ethnicity}_effect_direction'),
                pl.col('subset_total_per_allele_dosages').apply(
                    reformat_dosage_dict_str
                ).alias(f'{ethnicity}_population_allele_frequencies')
            ]))
            for (phenotype, _, _, ethnicity), spot_test_fname in spot_tests_fnames.items()
            if ethnicity == outer_ethnicity
        ])

    for ethnicity in other_ethnicities:
        hits = hits.join(
            spot_tests[ethnicity],
            how='left',
            left_on=['phenotype', 'chrom', 'snpstr_pos'],
            right_on=['phenotype', 'chrom', 'pos']
        )

    hits = hits.with_columns([
        pl.sum([
            pl.col(f'{ethnicity}_p').cast(str) + pl.lit(', ')
            for ethnicity in other_ethnicities
        ]).str.replace(', $', '').alias('other_ethnic_association_ps'),
        pl.sum([
            pl.col(f'{ethnicity}_effect_direction').cast(str) + pl.lit(', ')
            for ethnicity in other_ethnicities
        ]).str.replace(', $', '').alias('other_ethnic_effect_directions')
    ])

    hits = hits.select([
        *cols,
        'other_ethnic_association_ps',
        'other_ethnic_effect_directions',
        *[f'{ethnicity}_population_allele_frequencies' for ethnicity in other_ethnicities]
    ]).collect()

    assert hits.shape[0] == pl.read_csv(
        args.intable,
        sep='\t',
        # same hack as above
        dtype={'alleles': str}
    ).shape[0]

    hits.to_csv(args.outtable, sep='\t')
from .dataset import df
import polars as pl
from polars import col

df = (df.lazy().groupby("groups").agg(
    [pl.sum("nrs"), col("random").count().alias("count")]
).collect())
import polars as pl
from polars.lazy import *
import time

t0 = time.time()

left = pl.scan_csv("data/join_left_80000.csv")
right = pl.scan_csv("data/join_right_80000.csv")
other = pl.scan_csv("data/10000000.csv")

q = (left.join(right, on="key", how="inner")
     .filter(col("value") > 0.5)
     .with_column((col("value") * 10).cast(int))
     .join(
         other.groupby("groups").agg(pl.sum("values")),
         left_on="value",
         right_on="groups",
         how="inner",
     )
     .select(["key", "values_sum"]))

print(q._la)
df = q.collect()
t = time.time() - t0

# with open("data/macro_bench_polars.txt", "w") as f:
#     f.write(str(t))
print(df)
print(q.describe_optimized_plan())