Exemple #1
0
def test_pearson_corr() -> None:
    """Check the grouped Pearson correlation produced by ``pl.pearson_corr``.

    The correlation between "prediction" and "target" is aggregated per "era"
    group, once with ``pl.col`` expressions and once with bare column names.
    Both spellings are asserted against the same expected values.
    """
    frame = pl.DataFrame(
        {
            "era": [1, 1, 1, 2, 2, 2],
            "prediction": [2, 4, 5, 190, 1, 4],
            "target": [1, 3, 2, 1, 43, 3],
        }
    )
    # One expected correlation per era; the groupby is called with
    # maintain_order=True so the result order is stable.
    expected = [0.6546536707079772, -0.5477514993831792]

    operand_pairs = (
        (pl.col("prediction"), pl.col("target")),
        # we can also pass in column names directly
        ("prediction", "target"),
    )
    for left, right in operand_pairs:
        corr_expr = pl.pearson_corr(left, right).alias("c")
        grouped = frame.groupby("era", maintain_order=True).agg(corr_expr)
        assert grouped["c"].to_list() == pytest.approx(expected)
Exemple #2
0
# Report the result of the preceding query; `out` is assigned before this
# excerpt begins.
# NOTE(review): `x` is presumably a polars LazyFrame, since every query below
# ends with .collect() -- confirm against the part of the script that builds it.
print("out.shape", out.shape)
print('out["range_v1_v2"].sum()', out["range_v1_v2"].sum())

# q8: drop rows where v3 is null, sort by v3 with reverse=True, keep the first
# two v3 values of each id6 group, then explode the per-group lists into rows.
t0 = time.time()
print("q8")
out = (x.drop_nulls("v3").sort("v3", reverse=True).groupby("id6").agg(
    pl.col("v3").head(2).alias("largest2_v3")).explode(
        "largest2_v3").collect())
print(time.time() - t0)
print("out.shape", out.shape)
print('out["largest2_v3"].sum()', out["largest2_v3"].sum())

# q9: squared Pearson correlation of v1 and v2 for each (id2, id4) group,
# stored in column "r2".
t0 = time.time()
print("q9")
out = (x.groupby(["id2", "id4"]).agg(
    (pl.pearson_corr("v1", "v2")**2).alias("r2")).collect())
print(time.time() - t0)
print("out.shape", out.shape)
print('out["r2"].sum()', out["r2"].sum())

# q10: sum of v3 and count of v1 for each group over all six id columns.
t0 = time.time()
print("q10")
out = (x.groupby(["id1", "id2", "id3", "id4", "id5", "id6"]).agg(
    [pl.sum("v3").alias("v3"),
     pl.count("v1").alias("count")]).collect())
print(time.time() - t0)
print("out.shape", out.shape)
# Timing summary. `easy_time`, `t0advanced` and `t00` are set outside this
# excerpt; the last two are used here as start timestamps from time.time().
print("easy took:", easy_time, "s")
print("advanced took:", time.time() - t0advanced, "s")
print("total took:", time.time() - t00, "s")