Example #1
def test_lazy_functions():
    df = pl.DataFrame({
        "a": ["foo", "bar", "2"],
        "b": [1, 2, 3],
        "c": [1.0, 2.0, 3.0]
    })
    out = df[[pl.count("a")]]
    assert out["a"] == 3
    assert pl.count(df["a"]) == 3
    out = df[[
        pl.var("b"),
        pl.std("b"),
        pl.max("b"),
        pl.min("b"),
        pl.sum("b"),
        pl.mean("b"),
        pl.median("b"),
        pl.n_unique("b"),
        pl.first("b"),
        pl.last("b"),
    ]]
    expected = 1.0
    assert np.isclose(out.select_at_idx(0), expected)
    assert np.isclose(pl.var(df["b"]), expected)
    expected = 1.0
    assert np.isclose(out.select_at_idx(1), expected)
    assert np.isclose(pl.std(df["b"]), expected)
    expected = 3
    assert np.isclose(out.select_at_idx(2), expected)
    assert np.isclose(pl.max(df["b"]), expected)
    expected = 1
    assert np.isclose(out.select_at_idx(3), expected)
    assert np.isclose(pl.min(df["b"]), expected)
    expected = 6
    assert np.isclose(out.select_at_idx(4), expected)
    assert np.isclose(pl.sum(df["b"]), expected)
    expected = 2
    assert np.isclose(out.select_at_idx(5), expected)
    assert np.isclose(pl.mean(df["b"]), expected)
    expected = 2
    assert np.isclose(out.select_at_idx(6), expected)
    assert np.isclose(pl.median(df["b"]), expected)
    expected = 3
    assert np.isclose(out.select_at_idx(7), expected)
    assert np.isclose(pl.n_unique(df["b"]), expected)
    expected = 1
    assert np.isclose(out.select_at_idx(8), expected)
    assert np.isclose(pl.first(df["b"]), expected)
    expected = 3
    assert np.isclose(out.select_at_idx(9), expected)
    assert np.isclose(pl.last(df["b"]), expected)
Example #2
def test_lazy_functions():
    df = pl.DataFrame({
        "a": ["foo", "bar", "2"],
        "b": [1, 2, 3],
        "c": [1.0, 2.0, 3.0]
    })
    out = df[[pl.count("a")]]
    assert out[0] == 3
    assert pl.count(df["a"]) == 3
    out = df[[
        pl.var("b"),
        pl.std("b"),
        pl.max("b"),
        pl.min("b"),
        pl.sum("b"),
        pl.mean("b"),
        pl.median("b"),
        pl.n_unique("b"),
        pl.first("b"),
        pl.last("b"),
    ]]
    expected = 1.0
    assert np.isclose(out[0], expected)
    assert np.isclose(pl.var(df["b"]), expected)
    expected = 1.0
    assert np.isclose(out[1], expected)
    assert np.isclose(pl.std(df["b"]), expected)
    expected = 3
    assert np.isclose(out[2], expected)
    assert np.isclose(pl.max(df["b"]), expected)
    expected = 1
    assert np.isclose(out[3], expected)
    assert np.isclose(pl.min(df["b"]), expected)
    expected = 6
    assert np.isclose(out[4], expected)
    assert np.isclose(pl.sum(df["b"]), expected)
    expected = 2
    assert np.isclose(out[5], expected)
    assert np.isclose(pl.mean(df["b"]), expected)
    expected = 2
    assert np.isclose(out[6], expected)
    assert np.isclose(pl.median(df["b"]), expected)
    expected = 3
    assert np.isclose(out[7], expected)
    assert np.isclose(pl.n_unique(df["b"]), expected)
    expected = 1
    assert np.isclose(out[8], expected)
    assert np.isclose(pl.first(df["b"]), expected)
    expected = 3
    assert np.isclose(out[9], expected)
    assert np.isclose(pl.last(df["b"]), expected)
Example #3
def test_rolling() -> None:
    dates = [
        "2020-01-01 13:45:48",
        "2020-01-01 16:42:13",
        "2020-01-01 16:45:09",
        "2020-01-02 18:12:48",
        "2020-01-03 19:45:32",
        "2020-01-08 23:16:43",
    ]

    df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_column(
        pl.col("dt").str.strptime(pl.Datetime)
    )

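    # groupby_rolling builds, for every row, a window over the preceding
    # "2d" (two days, ending at that row's dt value) and aggregates "a"
    # within that window.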
    out = df.groupby_rolling(index_column="dt", period="2d").agg(
        [
            pl.sum("a").alias("sum_a"),
            pl.min("a").alias("min_a"),
            pl.max("a").alias("max_a"),
        ]
    )

    assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1]
    assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1]
    assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1]
Example #4
def test_fold():
    df = pl.DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})
    out = df.lazy().select(pl.sum(["a", "b"])).collect()
    assert out["sum"].series_equal(pl.Series("sum", [2, 4, 6]))

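    # pl.fold folds horizontally across the columns selected by `exprs`,
    # starting from `acc`; here it sums columns "a" and "b" row-wise.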
    out = df.select(
        pl.fold(acc=lit(0), f=lambda acc, x: acc + x,
                exprs=pl.col("*")).alias("foo"))
    assert out["foo"] == [2, 4, 6]
Example #5
def test_fold() -> None:
    df = pl.DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})
    out = df.select([
        pl.sum(["a", "b"]),
        pl.max(["a", pl.col("b")**2]),
        pl.min(["a", pl.col("b")**2]),
    ])
    assert out["sum"].series_equal(pl.Series("sum", [2.0, 4.0, 6.0]))
    assert out["max"].series_equal(pl.Series("max", [1.0, 4.0, 9.0]))
    assert out["min"].series_equal(pl.Series("min", [1.0, 2.0, 3.0]))

    out = df.select(
        pl.fold(acc=lit(0), f=lambda acc, x: acc + x,
                exprs=pl.col("*")).alias("foo"))
    assert out["foo"] == [2, 4, 6]
Example #6
def test_window_function():
    df = pl.DataFrame({
        "A": [1, 2, 3, 4, 5],
        "fruits": ["banana", "banana", "apple", "apple", "banana"],
        "B": [5, 4, 3, 2, 1],
        "cars": ["beetle", "audi", "beetle", "beetle", "beetle"],
    })

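    # .over(...) turns each aggregation into a window function: the result is
    # computed per group and broadcast back to every row of that group.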
    q = df.lazy().with_columns([
        pl.sum("A").over("fruits").alias("fruit_sum_A"),
        pl.first("B").over("fruits").alias("fruit_first_B"),
        pl.max("B").over("cars").alias("cars_max_B"),
    ])
    out = q.collect()
    assert out["cars_max_B"] == [5, 4, 5, 5, 5]

    out = df[[pl.first("B").over(["fruits", "cars"]).alias("B_first")]]
    assert out["B_first"] == [5, 4, 3, 3, 5]
Example #7
id037,id097,id0000062401,48,8,53992,5,15,83.565443
id073,id081,id0000017280,54,90,28480,5,4,17.078693
id081,id073,id0000073423,51,22,39788,2,12,45.883758
id062,id080,id0000092749,1,75,67857,3,10,80.418674
id045,id031,id0000076210,2,42,80312,4,5,48.668692
id082,id048,id0000080227,56,62,16760,3,11,34.933239
id035,id032,id0000033279,55,13,80560,5,5,61.372678
id053,id013,id0000073898,61,63,12387,4,7,29.949863"""

f = io.BytesIO(csv)

x = pl.read_csv(
    f,
    dtype={
        "id4": pl.Int32,
        "id5": pl.Int32,
        "id6": pl.Int32,
        "v1": pl.Int32,
        "v2": pl.Int32,
        "v3": pl.Float64,
    },
)
x["id1"] = x["id1"].cast(pl.Categorical)
x["id2"] = x["id2"].cast(pl.Categorical)
x["id3"] = x["id3"].cast(pl.Categorical)
x = x.lazy()

question = "sum v1 by id1"  # q1
ans = x.groupby("id1").agg(pl.sum("v1")).collect()
print(ans.shape, flush=True)
Example #8
    then('doubly').when((pl.col('susie_CP') >= 0.8)
                        | (pl.col('finemap_CP') >= 0.8)).then(
                            'singly').otherwise('not').alias('finemapping'),
    'susie_CP',
    'finemap_CP',
    'susie_CP_best_guess_genotypes',
    'finemap_CP_pval_thresh_5e-4',
    'finemap_CP_mac_thresh_100',
    'finemap_CP_prior_effect_size_0.05%',
    'finemap_CP_prior_4_signals',
    'finemap_CP_stopping_thresh_1e-4',
    'susie_CP_prior_snps_over_strs',
    'finemap_CP_prior_snps_over_strs',
    'finemap_CP_prior_effect_size_0.0025%',
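    # pl.sum over a list of Utf8 expressions folds them with "+", i.e. it
    # concatenates the per-ethnicity strings row-wise; the trailing ", " is
    # stripped by str.replace afterwards.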
    pl.sum([
        pl.col(f'{ethnicity}_p_val').cast(str) + pl.lit(', ')
        for ethnicity in other_ethnicities
    ]).str.replace(', $', '').alias('other_ethnicity_association_p_values'),
    pl.sum([
        pl.when(pl.col(f'{ethnicity}_p_val') > .05).then('NA').when(
            pl.col('coeff') > 0).then('+').otherwise('-') + pl.lit(', ')
        for ethnicity in other_ethnicities
    ]).str.replace(', $', '').alias('other_ethnicity_effect_directions'),
    *[
        pl.col(f'{ethnicity}_allele_dosages').apply(
            dosages_to_frequencies).alias(f'{ethnicity}_allele_frequencies')
        for ethnicity in other_ethnicities
    ],
])

finemapping_results.write_csv(
    f'{ukb}/post_finemapping/results/singly_finemapped_strs_for_paper.tab',
Example #9
        "v3": pl.Float64,
    },
)
ON_STRINGS = sys.argv.pop() == "on_strings"

if not ON_STRINGS:
    x["id1"] = x["id1"].cast(pl.Categorical)
    x["id2"] = x["id2"].cast(pl.Categorical)
    x["id3"] = x["id3"].cast(pl.Categorical)
df = x.clone()
x = df.lazy()

t00 = time.time()
t0 = time.time()
print("q1")
out = x.groupby("id1").agg(pl.sum("v1")).collect()
print(time.time() - t0)
print("out.shape", out.shape)
print('out["v1_sum"].sum()', out["v1_sum"].sum())

t0easy = time.time()
t0 = time.time()
print("q2")
out = x.groupby(["id1", "id2"]).agg(pl.sum("v1")).collect()
print(time.time() - t0)
print("out.shape", out.shape)
print('out["v1_sum"].sum()', out["v1_sum"].sum())

t0 = time.time()
print("q3")
out = x.groupby("id3").agg([pl.sum("v1"), pl.mean("v3")]).collect()
Example #10
from .dataset import df
import polars as pl
from polars import col

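# Replace values above 0.5 with 0, keep the rest, then scale by the total of
# "nrs" (pl.sum("nrs") broadcasts as a scalar across the whole column).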
df = df[[pl.when(col("random") > 0.5).then(0).otherwise(col("random")) * pl.sum("nrs")]]
Example #11
from .dataset import df
import polars as pl
from polars import col

df = df.select([pl.sum("nrs"), pl.col("names").sort()])
Example #12
from .dataset import df
import polars as pl
from polars import col

df = df.lazy().select([pl.sum("nrs"), col("names").sort()]).collect()
Example #13
from .dataset import df
import polars as pl
from polars import col

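# Aggregations in a selection context return a single-row DataFrame with one
# column per aggregate.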
df = df[[
    pl.sum("random").alias("sum"),
    pl.min("random").alias("min"),
    pl.max("random").alias("max"),
    col("random").max().alias("other_max"),
    pl.std("random").alias("std dev"),
    pl.var("random").alias("variance"),
]]
Example #14
def map_expr(name: str) -> pl.Expr:
    return (
        pl.when(ignore_nulls or pl.col(name).null_count() == 0)
        .then(
            pl.struct([
                pl.sum(name).alias("sum"),
                (pl.count() - pl.col(name).null_count()).alias("count"),
            ])
        )
        .otherwise(None)
    ).alias("out")
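A minimal usage sketch for map_expr (the ignore_nulls flag, the DataFrame and the column name "x" are illustrative assumptions, not part of the original snippet):

ignore_nulls = False
df = pl.DataFrame({"x": [1, None, 3]})
# "out" holds a struct {"sum", "count"} when the column has no nulls
# (or ignore_nulls is True), otherwise a null value
out = df.select(map_expr("x"))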
Example #15
def test_groupby():
    df = DataFrame({
        "a": ["a", "b", "a", "b", "b", "c"],
        "b": [1, 2, 3, 4, 5, 6],
        "c": [6, 5, 4, 3, 2, 1],
    })

    # use __getitem__ to map to select
    assert (df.groupby("a")["b"].sum().sort(by="a").frame_equal(
        DataFrame({
            "a": ["a", "b", "c"],
            "": [4, 11, 6]
        })))

    assert (df.groupby("a").select("b").sum().sort(by="a").frame_equal(
        DataFrame({
            "a": ["a", "b", "c"],
            "": [4, 11, 6]
        })))
    assert (df.groupby("a").select("c").sum().sort(by="a").frame_equal(
        DataFrame({
            "a": ["a", "b", "c"],
            "": [10, 10, 1]
        })))
    assert (df.groupby("a").select("b").min().sort(by="a").frame_equal(
        DataFrame({
            "a": ["a", "b", "c"],
            "": [1, 2, 6]
        })))
    assert (df.groupby("a").select("b").max().sort(by="a").frame_equal(
        DataFrame({
            "a": ["a", "b", "c"],
            "": [3, 5, 6]
        })))
    assert (df.groupby("a").select("b").mean().sort(by="a").frame_equal(
        DataFrame({
            "a": ["a", "b", "c"],
            "": [2.0, (2 + 4 + 5) / 3, 6.0]
        })))
    assert (df.groupby("a").select("b").last().sort(by="a").frame_equal(
        DataFrame({
            "a": ["a", "b", "c"],
            "": [3, 5, 6]
        })))
    # check if it runs
    (df.groupby("a").select("b").n_unique())

    (df.groupby("a").select("b").quantile(0.3))
    (df.groupby("a").select("b").agg_list())

    gb_df = df.groupby("a").agg({"b": ["sum", "min"], "c": "count"})
    assert "b_sum" in gb_df.columns
    assert "b_min" in gb_df.columns

    #
    # # TODO: is false because count is u32
    # df.groupby(by="a", select="b", agg="count").frame_equal(
    #     DataFrame({"a": ["a", "b", "c"], "": [2, 3, 1]})
    # )
    assert df.groupby("a").apply(lambda df: df[["c"]].sum()).sort(
        "c")["c"][0] == 1

    assert df.groupby("a").groups().sort("a")["a"].series_equal(
        Series(["a", "b", "c"]))

    for subdf in df.groupby("a"):
        if subdf["a"][0] == "b":
            assert subdf.shape == (3, 3)

    assert df.groupby("a").get_group("c").shape == (1, 3)
    assert df.groupby("a").get_group("b").shape == (3, 3)
    assert df.groupby("a").get_group("a").shape == (2, 3)

    # Use lazy API in eager groupby
    assert df.groupby("a").agg([pl.sum("b")]).shape == (3, 2)
Example #16
import polars as pl

from .dataset import parsed_sorted as dataset


# creates a new polars.Series with differences per row
def mkdiff(cumcases: pl.Series) -> pl.Series:
    return cumcases - cumcases.shift(1)


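# Each .over(...) below is a window aggregation: the value is computed per
# group (country, date, or year) and broadcast back to every row.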
q = dataset.with_columns(
    [
        pl.col("cumcases")
        .apply(mkdiff)
        .over(pl.col("country"))
        .take(pl.col("country").arg_unique())
        .explode()
        .alias("diffcases"),
        pl.sum("cumcases").over("country").alias("cases/country"),
        pl.sum("cumcases").over("date").alias("sum_cases/day"),
        pl.min("cumcases").over("date").alias("min_cases/day"),
        pl.max("cumcases").over("date").alias("max_cases/day"),
        pl.sum("cumcases").over(pl.col("date").year()).alias("cases/year"),
    ]
)

df = q.collect()
Example #17
import polars as pl

dataset = pl.DataFrame({
    "A": [1, 2, 3, 4, 5],
    "fruits": ["banana", "banana", "apple", "apple", "banana"],
    "B": [5, 4, 3, 2, 1],
    "cars": ["beetle", "audi", "beetle", "beetle", "beetle"],
})

q = dataset.lazy().with_columns([
    pl.sum("A").over("fruits").alias("fruit_sum_A"),
    pl.first("B").over("fruits").alias("fruit_first_B"),
    pl.max("B").over("cars").alias("cars_max_B"),
])

df = q.collect()
Example #18
import polars as pl
from polars.lazy import *

reddit = pl.scan_csv("data/reddit.csv").select(
    [pl.sum("comment_karma"), pl.min("link_karma")])

if __name__ == "__main__":
    df = reddit.fetch()
    with open("book/src/outputs/how_can_i_aggregate.txt", "w") as f:
        f.write(str(df))
Example #19
from .dataset import df
import polars as pl
from polars import col

df = df[[pl.sum("nrs"), col("names").sort()]]
Example #20
def test_fold():
    df = DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})
    out = df.lazy().select(pl.sum(["a", "b"])).collect()
    assert out["sum"].series_equal(Series("sum", [2, 4, 6]))
Example #21
from .dataset import df
import polars as pl
from polars import col

df = (df.lazy().with_columns(
    [pl.sum("nrs").alias("nrs_sum"),
     col("random").count().alias("count")]).collect())
Example #22
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('outtable')
    parser.add_argument('outreadme')
    parser.add_argument('pos_to_snpstr_pos')
    parser.add_argument('intable')
    parser.add_argument('inreadme')
    parser.add_argument('spot_test_fname_json_dict_fname')

    args = parser.parse_args()

    with open(args.spot_test_fname_json_dict_fname) as json_file:
        spot_test_fname_json_dict = next(json_file)

    with open(args.outreadme, 'w+') as readme:
        with open(args.inreadme) as inreadme:
            readme.write(inreadme.read())
        readme.write(
            'other_ethnic_association_ps - association p-values for the other '
            'ethnicities in the order ' +
            ','.join(other_ethnicities) + '\n'
        )
        readme.write(
            'other_ethnic_effect_directions - direction of association (+/-) '
            'for the other ethnicities in the order ' +
            ','.join(other_ethnicities) +
            " (NaN if that ethnicity's p > 0.05)\n"
        )
        for ethnicity in other_ethnicities:
            readme.write(
                f'{ethnicity}_population_allele_frequencies - frequencies of each allele '
                "(by dosage) among the ethnicity's tested population\n"
            )

    hits = pl.scan_csv(
        args.intable,
        sep='\t',
        # hack: these added arguments are ignored when reading putatively_causal but not when reading exonic_finemapped
        dtype={'alleles': str}
    )
    cols = hits.columns

    # hack: only clean in one of the two cases in which this function runs
    if 'white_brit_allele_frequencies' in cols:
        hits = hits.with_column(
            pl.col('white_brit_allele_frequencies').str.replace_all('"', "'")
        )

    hits = hits.join(
        pl.scan_csv(args.pos_to_snpstr_pos, sep='\t'),
        how='left',
        left_on=['chrom', 'start_pos'],
        right_on=['chrom', 'pos']
    )

    spot_tests_fnames = {
        tuple(key.split('__')): fname
        for key, fname in
        json.loads(spot_test_fname_json_dict).items()
    }

    spot_tests = {}
    for outer_ethnicity in other_ethnicities:
        spot_tests[outer_ethnicity] = pl.concat([
            (pl.scan_csv(
                    spot_test_fname,
                    sep='\t',
                    dtype={'alleles': str},
                    null_values=['nan'],
                    with_column_names=lambda cols: list(fix_cols(cols, phenotype))
                ).select([
                    pl.lit(phenotype).alias('phenotype'),
                    'chrom',
                    'pos',
                    pl.col('p_phenotype').cast(float).alias(f'{ethnicity}_p'),
                    pl.when(pl.col('p_phenotype') >= 0.05).then(np.nan).when(pl.col('coeff_phenotype') > 0).then(pl.lit('+')).otherwise(pl.lit('-')).alias(f'{ethnicity}_effect_direction'),
                    pl.col('subset_total_per_allele_dosages').apply(reformat_dosage_dict_str).alias(f'{ethnicity}_population_allele_frequencies')
                ]))
            for (phenotype, _, _, ethnicity), spot_test_fname
            in spot_tests_fnames.items()
            if ethnicity == outer_ethnicity
        ])

    for ethnicity in other_ethnicities:
        hits = hits.join(
            spot_tests[ethnicity],
            how='left',
            left_on=['phenotype', 'chrom', 'snpstr_pos'],
            right_on=['phenotype', 'chrom', 'pos']
        )

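    # Fold the per-ethnicity columns into a single comma-separated string per
    # row: pl.sum over Utf8 expressions concatenates with "+", and the trailing
    # separator is stripped afterwards.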
    hits = hits.with_columns([
        pl.sum([pl.col(f'{ethnicity}_p').cast(str) + pl.lit(', ') for ethnicity in other_ethnicities])
             .str.replace(', $', '').alias('other_ethnic_association_ps'),
        pl.sum([pl.col(f'{ethnicity}_effect_direction').cast(str) + pl.lit(', ') for ethnicity in other_ethnicities])
             .str.replace(', $', '').alias('other_ethnic_effect_directions')
    ])

    hits = hits.select([
        *cols,
        'other_ethnic_association_ps',
        'other_ethnic_effect_directions',
        *[f'{ethnicity}_population_allele_frequencies' for ethnicity in other_ethnicities]
    ]).collect()
    assert hits.shape[0] == pl.read_csv(
        args.intable,
        sep='\t',
        # same hack as above
        dtype = {'alleles': str}
    ).shape[0]

    hits.to_csv(args.outtable, sep='\t',)
Example #23
from .dataset import df
import polars as pl
from polars import col

df = (df.lazy().groupby("groups").agg(
    [pl.sum("nrs"), col("random").count().alias("count")]).collect())
Example #24
import polars as pl
from polars.lazy import *
import time

t0 = time.time()

left = pl.scan_csv("data/join_left_80000.csv")
right = pl.scan_csv("data/join_right_80000.csv")
other = pl.scan_csv("data/10000000.csv")

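# Build one lazy query: join, filter, derive a column, join against a grouped
# aggregation, then project; Polars optimizes the whole plan at collect().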
q = (
    left.join(right, on="key", how="inner")
    .filter(col("value") > 0.5)
    .with_column((col("value") * 10).cast(int))
    .join(
        other.groupby("groups").agg(pl.sum("values")),
        left_on="value",
        right_on="groups",
        how="inner",
    )
    .select(["key", "values_sum"])
)
print(q._la)
df = q.collect()

t = time.time() - t0
# with open("data/macro_bench_polars.txt", "w") as f:
#     f.write(str(t))
print(df)
print(q.describe_optimized_plan())