Example #1
def test_lazy() -> None:
    df = pl.DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})
    _ = df.lazy().with_column(lit(1).alias("foo")).select(
        [col("a"), col("foo")])

    # test if it executes
    _ = (df.lazy().with_column(
        when(col("a").gt(lit(2))).then(lit(10)).otherwise(
            lit(1)).alias("new")).collect())

    # test that pl.list is available; this is `to_list` re-exported as `list`
    df.groupby("a").agg(pl.list("b"))
Example #2
def test_row_count(foods_csv: str) -> None:
    df = pl.read_csv(foods_csv, row_count_name="row_count")
    assert df["row_count"].to_list() == list(range(27))

    df = (pl.scan_csv(foods_csv, row_count_name="row_count").filter(
        pl.col("category") == pl.lit("vegetables")).collect())

    assert df["row_count"].to_list() == [0, 6, 11, 13, 14, 20, 25]

    df = (pl.scan_csv(foods_csv, row_count_name="row_count").with_row_count(
        "foo",
        10).filter(pl.col("category") == pl.lit("vegetables")).collect())

    assert df["foo"].to_list() == [10, 16, 21, 23, 24, 30, 35]
Example #3
def test_str_split() -> None:
    a = pl.Series("a", ["a, b", "a", "ab,c,de"])
    for out in [a.str.split(","), pl.select(pl.lit(a).str.split(",")).to_series()]:
        assert out[0].to_list() == ["a", " b"]
        assert out[1].to_list() == ["a"]
        assert out[2].to_list() == ["ab", "c", "de"]

    for out in [
        a.str.split(",", inclusive=True),
        pl.select(pl.lit(a).str.split(",", inclusive=True)).to_series(),
    ]:
        assert out[0].to_list() == ["a,", " b"]
        assert out[1].to_list() == ["a"]
        assert out[2].to_list() == ["ab,", "c,", "de"]
Example #4
def load_dir(pheno, region, dir_):
    with open(f'{dir_}/converged.txt') as converged:
        assert converged.read().strip() == 'TRUE'

    alphas = pl.scan_csv(f'{dir_}/alpha.tab', sep='\t',
                         has_header=False).collect().to_numpy().T
    susie_pips = 1 - np.prod(1 - alphas, axis=1)

    df = pl.scan_csv(f'{dir_}/colnames.txt',
                     has_header=False,
                     with_column_names=lambda _: ['var_name']).with_column(
                         pl.lit(1).alias('row_number')).with_columns([
                             pl.col('row_number').cumsum(),
                             pl.lit(None, int).alias('cs_num'),
                             pl.lit(region).alias('region'),
                             pl.lit(pheno).alias('phenotype'),
                             pl.Series(susie_pips).alias('susie_pip'),
                             pl.lit(None, float).alias('susie_cs_pip')
                         ])

    for cs_num in range(1, 51):
        cs_fname = f'{dir_}/cs{cs_num}.txt'
        if not os.path.exists(cs_fname):
            continue
        with open(cs_fname) as cs:
            var_nums = [int(var_num) for var_num in next(cs).strip().split()]
            next(cs)
            min_ld = float(next(cs).split()[0])
            if min_ld < min_ld_thresh:
                continue
            df = df.with_columns([
                pl.when(pl.col('row_number').is_in(var_nums)).then(
                    pl.when(~pl.col('cs_num').is_null()).then(-1).otherwise(
                        cs_num)).otherwise(pl.col('cs_num')).alias('cs_num'),
                pl.when(pl.col('row_number').is_in(var_nums)).then(
                    pl.Series(alphas[:, cs_num - 1])).otherwise(
                        pl.col('susie_cs_pip')).alias('susie_cs_pip')
            ])

    df = df.with_column(
        pl.when(pl.col('cs_num') != -1).then(
            pl.col('susie_cs_pip')).otherwise(-1).alias('susie_cs_pip'))
    df = df.filter(
        pl.col('var_name').str.contains('^STR') & ~pl.col('cs_num').is_null()
        & (pl.col('susie_pip') > 0.05)).drop('row_number')

    return df
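A hypothetical invocation, for illustration only (the arguments and the `min_ld_thresh` value are made up): `load_dir` expects `converged.txt`, `alpha.tab`, `colnames.txt`, and `cs*.txt` under the given directory and returns a LazyFrame, so the caller still has to collect.

min_ld_thresh = 0.5  # assumed module-level threshold read inside load_dir

lazy_df = load_dir('height', 'chr1:1000000-2000000', 'susie_output/height')
result = lazy_df.collect()  # the scan_csv pipeline stays lazy until here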
Example #5
def test_filter_date():
    dataset = pl.DataFrame({
        "date": ["2020-01-02", "2020-01-03", "2020-01-04"],
        "index": [1, 2, 3]
    })
    df = dataset.with_column(
        pl.col("date").str.strptime(pl.Date32, "%Y-%m-%d"))
    assert df.filter(
        col("date") <= pl.lit_date(datetime(2019, 1, 3))).is_empty()
    assert df.filter(
        col("date") < pl.lit_date(datetime(2020, 1, 4))).shape[0] == 2
    assert df.filter(
        col("date") < pl.lit_date(datetime(2020, 1, 5))).shape[0] == 3
    assert df.filter(col("date") <= pl.lit(datetime(2019, 1, 3))).is_empty()
    assert df.filter(col("date") < pl.lit(datetime(2020, 1, 4))).shape[0] == 2
    assert df.filter(col("date") < pl.lit(datetime(2020, 1, 5))).shape[0] == 3
Example #6
def calculate_friction_number(column_names: List[str]) -> "pl.Expr":
    if "fs" in column_names and "qc" in column_names:
        return (col("fs") /
                when(col("qc") == 0.0).then(None).otherwise(col("qc")) *
                100.0).alias("friction_number")
    else:
        return lit(0.0).alias("friction_number")
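The expression encodes the usual friction-number definition, R_f = (f_s / q_c) * 100 %, with the `when/then/otherwise` guard turning a zero cone resistance into null so the expression never divides by zero.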
Example #7
def replace_column_void(lf: pl.LazyFrame, column_void) -> pl.LazyFrame:
    if column_void is None:
        return lf

    # TODO: what to do with multiple column_void values?
    if isinstance(column_void, list):
        column_void = column_void[0]

    return (
        # Get all values matching column_void and change them to null
        lf.select(
            pl.when(pl.all() == pl.lit(column_void)).then(
                pl.lit(None)).otherwise(pl.all()).keep_name())
        # Interpolate all null values
        .select(pl.all().interpolate())
        # Remove the rows with null values
        .drop_nulls())
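A minimal usage sketch, assuming an illustrative frame and a made-up sentinel of -999.0 marking void readings:

lf = pl.DataFrame({"depth": [0.0, -999.0, 2.0, 3.0]}).lazy()

# -999.0 becomes null, the null is interpolated away, and any rows still
# holding nulls (e.g. a void at either edge of the column) are dropped
cleaned = replace_column_void(lf, column_void=-999.0).collect()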
Example #8
def test_contains() -> None:
    a = pl.Series("a", [[1, 2, 3], [2, 5], [6, 7, 8, 9]])
    out = a.arr.contains(2)
    expected = pl.Series("a", [True, True, False])
    testing.assert_series_equal(out, expected)

    out = pl.select(pl.lit(a).arr.contains(2)).to_series()
    testing.assert_series_equal(out, expected)
Example #9
def test_shuffle() -> None:
    a = pl.Series("a", [1, 2, 3])
    out = a.shuffle(2)
    expected = pl.Series("a", [2, 1, 3])
    testing.assert_series_equal(out, expected)

    out = pl.select(pl.lit(a).shuffle(2)).to_series()
    testing.assert_series_equal(out, expected)
Example #10
def test_set_null() -> None:
    df = pl.DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})
    out = (df.lazy().with_column(
        when(col("a") > 1).then(
            lit(None)).otherwise(100).alias("foo")).collect())
    s = out["foo"]
    assert s[0] == 100
    assert s[1] is None
    assert s[2] is None
Example #11
def test_str_concat():
    df = pl.DataFrame({
        "nrs": [1, 2, 3, 4],
        "name": ["ham", "spam", "foo", None],
    })
    out = df.with_column(
        (pl.lit("Dr. ") + pl.col("name")).alias("graduated_name"))
    assert out["graduated_name"][0] == "Dr. ham"
    assert out["graduated_name"][1] == "Dr. spam"
Example #12
def test_fold_filter():
    df = pl.DataFrame({"a": [1, 2, 3], "b": [0, 1, 2]})

    out = df.filter(
        pl.fold(
            acc=pl.lit(True),
            f=lambda a, b: a & b,
            exprs=[col(c) > 1 for c in df.columns],
        ))

    assert out.shape == (1, 2)

    out = df.filter(
        pl.fold(
            acc=pl.lit(True),
            f=lambda a, b: a | b,
            exprs=[col(c) > 1 for c in df.columns],
        ))

    assert out.shape == (3, 2)
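Both folds start from `acc=pl.lit(True)`. Folding with `&` keeps only rows where every column exceeds 1, hence the single row; folding with `|` is trivially true for every row, since `True | x` is always `True`, so all three rows come back. An any-style filter would instead start the fold from `pl.lit(False)`.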
Example #13
def test_list_empty_groupby_result_3521() -> None:
    # Create a left relation where the join column contains a null value
    left = pl.DataFrame().with_columns([
        pl.lit(1).alias("groupby_column"),
        pl.lit(None).cast(pl.Int32).alias("join_column"),
    ])

    # Create a right relation where there is a column to count distinct on
    right = pl.DataFrame().with_columns([
        pl.lit(1).alias("join_column"),
        pl.lit(1).alias("n_unique_column"),
    ])

    # Calculate n_unique after dropping nulls
    # This will panic on polars version 0.13.38 and 0.13.39
    assert (left.join(
        right, on="join_column", how="left").groupby("groupby_column").agg(
            pl.col("n_unique_column").drop_nulls())).to_dict(False) == {
                "groupby_column": [1],
                "n_unique_column": [[]]
            }
Example #14
def test_when_then_edge_cases_3994() -> None:
    df = pl.DataFrame(data={"id": [1, 1], "type": [2, 2]})

    # this tests if lazy correctly assigns the list schema to the column aggregation
    assert (df.lazy().groupby(["id"]).agg(pl.col("type")).with_column(
        pl.when(pl.col("type").arr.lengths() == 0).then(
            pl.lit(None)).otherwise(
                pl.col("type")).keep_name()).collect()).to_dict(False) == {
                    "id": [1],
                    "type": [[2, 2]]
                }

    # this tests ternary with an empty argument
    assert (df.filter(pl.col("id") == 42).groupby([
        "id"
    ]).agg(pl.col("type")).with_column(
        pl.when(pl.col("type").arr.lengths == 0).then(pl.lit(None)).otherwise(
            pl.col("type")).keep_name())).to_dict(False) == {
                "id": [],
                "type": []
            }
Example #15
def test_list_arr_get() -> None:
    a = pl.Series("a", [[1, 2, 3], [4, 5], [6, 7, 8, 9]])
    out = a.arr.get(0)
    expected = pl.Series("a", [1, 4, 6])
    testing.assert_series_equal(out, expected)
    out = a.arr.first()
    testing.assert_series_equal(out, expected)
    out = pl.select(pl.lit(a).arr.first()).to_series()
    testing.assert_series_equal(out, expected)

    out = a.arr.get(-1)
    expected = pl.Series("a", [3, 5, 9])
    testing.assert_series_equal(out, expected)
    out = a.arr.last()
    testing.assert_series_equal(out, expected)
    out = pl.select(pl.lit(a).arr.last()).to_series()
    testing.assert_series_equal(out, expected)

    a = pl.Series("a", [[1, 2, 3], [4, 5], [6, 7, 8, 9]])
    out = a.arr.get(-3)
    expected = pl.Series("a", [1, None, 7])
    testing.assert_series_equal(out, expected)
Example #16
def test_fill_null() -> None:
    dt = datetime.strptime("2021-01-01", "%Y-%m-%d")
    s = pl.Series("A", [dt, None])

    for fill_val in (dt, pl.lit(dt)):
        out = s.fill_null(fill_val)  # type: ignore[arg-type]

        assert out.null_count() == 0
        assert out.dt[0] == dt
        assert out.dt[1] == dt

    dt1 = date(2001, 1, 1)
    dt2 = date(2001, 1, 2)
    dt3 = date(2001, 1, 3)
    s = pl.Series("a", [dt1, dt2, dt3, None])
    dt_2 = date(2001, 1, 4)
    for fill_val in (dt_2, pl.lit(dt_2)):
        out = s.fill_null(fill_val)  # type: ignore[arg-type]

        assert out.null_count() == 0
        assert out.dt[0] == dt1
        assert out.dt[1] == dt2
        assert out.dt[-1] == dt_2
Example #17
def test_preservation_of_subclasses() -> None:
    """Tests for LazyFrame inheritance."""

    # We should be able to inherit from polars.LazyFrame
    class SubClassedLazyFrame(pl.LazyFrame):
        pass

    # Reassigning __class__ yields an object that is an instance of both
    # the superclass and the subclass
    ldf = pl.DataFrame({"column_1": [1, 2, 3]}).lazy()
    ldf.__class__ = SubClassedLazyFrame
    extended_ldf = ldf.with_column(pl.lit(1).alias("column_2"))
    assert isinstance(extended_ldf, pl.LazyFrame)
    assert isinstance(extended_ldf, SubClassedLazyFrame)
Example #18
def test_abs() -> None:
    # ints
    s = pl.Series([1, -2, 3, -4])
    testing.assert_series_equal(s.abs(), pl.Series([1, 2, 3, 4]))
    testing.assert_series_equal(np.abs(s), pl.Series([1, 2, 3, 4]))  # type: ignore

    # floats
    s = pl.Series([1.0, -2.0, 3, -4.0])
    testing.assert_series_equal(s.abs(), pl.Series([1.0, 2.0, 3.0, 4.0]))
    testing.assert_series_equal(
        np.abs(s), pl.Series([1.0, 2.0, 3.0, 4.0])  # type: ignore
    )
    testing.assert_series_equal(
        pl.select(pl.lit(s).abs()).to_series(), pl.Series([1.0, 2.0, 3.0, 4.0])
    )
Example #19
def test_list_arr_get() -> None:
    a = pl.Series("a", [[1, 2, 3], [4, 5], [6, 7, 8, 9]])
    out = a.arr.get(0)
    expected = pl.Series("a", [1, 4, 6])
    assert_series_equal(out, expected)
    out = a.arr.first()
    assert_series_equal(out, expected)
    out = pl.select(pl.lit(a).arr.first()).to_series()
    assert_series_equal(out, expected)

    out = a.arr.get(-1)
    expected = pl.Series("a", [3, 5, 9])
    assert_series_equal(out, expected)
    out = a.arr.last()
    assert_series_equal(out, expected)
    out = pl.select(pl.lit(a).arr.last()).to_series()
    assert_series_equal(out, expected)

    a = pl.Series("a", [[1, 2, 3], [4, 5], [6, 7, 8, 9]])
    out = a.arr.get(-3)
    expected = pl.Series("a", [1, None, 7])
    assert_series_equal(out, expected)

    assert pl.DataFrame(
        {"a": [[1], [2], [3], [4, 5, 6], [7, 8, 9], [None, 11]]}
    ).with_columns(
        [pl.col("a").arr.get(i).alias(f"get_{i}") for i in range(4)]
    ).to_dict(
        False
    ) == {
        "a": [[1], [2], [3], [4, 5, 6], [7, 8, 9], [None, 11]],
        "get_0": [1, 2, 3, 4, 7, None],
        "get_1": [None, None, None, 5, 8, 11],
        "get_2": [None, None, None, 6, 9, None],
        "get_3": [None, None, None, None, None, None],
    }
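As the `get_0` through `get_3` columns show, `arr.get` is null-safe: an index past the end of a sublist, or a negative index reaching before its start (the `get(-3)` case above), yields null rather than an error.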
Example #20
def test_fold() -> None:
    df = pl.DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})
    out = df.select([
        pl.sum(["a", "b"]),
        pl.max(["a", pl.col("b")**2]),
        pl.min(["a", pl.col("b")**2]),
    ])
    assert out["sum"].series_equal(pl.Series("sum", [2.0, 4.0, 6.0]))
    assert out["max"].series_equal(pl.Series("max", [1.0, 4.0, 9.0]))
    assert out["min"].series_equal(pl.Series("min", [1.0, 2.0, 3.0]))

    out = df.select(
        pl.fold(acc=lit(0), f=lambda acc, x: acc + x,
                exprs=pl.col("*")).alias("foo"))
    assert out["foo"] == [2, 4, 6]
Example #21
def test_type_coercion_when_then_otherwise_2806() -> None:
    out = (pl.DataFrame({
        "names": ["foo", "spam", "spam"],
        "nrs": [1, 2, 3]
    }).select([
        pl.when(pl.col("names") == "spam").then(pl.col("nrs") * 2).otherwise(
            pl.lit("other")).alias("new_col"),
    ]).to_series())
    expected = pl.Series("new_col", ["other", "4", "6"])
    assert out.to_list() == expected.to_list()

    # test it remains float32
    assert (pl.Series(
        "a", [1.0, 2.0, 3.0], dtype=pl.Float32).to_frame().select(
            pl.when(pl.col("a") > 2.0).then(
                pl.col("a")).otherwise(0.0))).to_series().dtype == pl.Float32
Example #22
def type_index() -> pl.Expr:
    return (
        (
            (
                pl.lit(3.0)
                - np.log10(
                    col("normalized_cone_resistance")
                    * (1.0 - col("excess_pore_pressure_ratio"))
                    + 1.0
                )
            )
            ** 2
            + (1.5 + 1.3 * np.log10(col("normalized_friction_ratio"))) ** 2
        )
        ** 0.5
    ).alias("type_index")
Example #23
def test_reshape() -> None:
    s = pl.Series("a", [1, 2, 3, 4])
    out = s.reshape((-1, 2))
    expected = pl.Series("a", [[1, 2], [3, 4]])
    assert out.series_equal(expected)
    out = s.reshape((2, 2))
    assert out.series_equal(expected)
    out = s.reshape((2, -1))
    assert out.series_equal(expected)

    out = s.reshape((-1, 1))
    expected = pl.Series("a", [[1], [2], [3], [4]])
    assert out.series_equal(expected)

    # test lazy_dispatch
    out = pl.select(pl.lit(s).reshape((-1, 1))).to_series()
    assert out.series_equal(expected)
Example #24
def test_take(fruits_cars: pl.DataFrame) -> None:
    df = fruits_cars

    # out of bounds error
    with pytest.raises(RuntimeError):
        (df.sort("fruits").select(
            [col("B").reverse().take([1, 2]).list().over("fruits"),
             "fruits"]  # type: ignore
        ))

    for index in [[0, 1], pl.Series([0, 1]), np.array([0, 1]), pl.lit(1)]:
        out = df.sort("fruits").select(
            [col("B").reverse().take(index).list().over("fruits"),
             "fruits"]  # type: ignore
        )

        assert out[0, "B"] == [2, 3]
        assert out[4, "B"] == [1, 4]
Example #25
def test_sum_to_one(self):
    cols = [
        "gravel_component",
        "sand_component",
        "clay_component",
        "loam_component",
        "peat_component",
        "silt_component",
    ]
    s = self.bore.df.select([
        pl.fold(
            pl.lit(0),
            lambda a, b: a + b,
            [
                pl.when(pl.col(a) < 0).then(1 / len(cols)).otherwise(
                    pl.col(a)) for a in cols
            ],
        ).alias("sum")
    ])
    self.assertTrue(np.all(np.isclose(s, 1)))
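The fold performs a row-wise sum of the six soil-component columns, substituting 1 / len(cols) for any negative (sentinel) component before adding, and the test asserts that every row's components sum to approximately one.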
Example #26
def test_add_eager_column():
    df = pl.DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})
    out = df.lazy().with_column(pl.lit(pl.Series("c", [1, 2, 3]))).collect()
    assert out["c"].sum() == 6
Example #27
    then('doubly').when((pl.col('susie_CP') >= 0.8)
                        | (pl.col('finemap_CP') >= 0.8)).then(
                            'singly').otherwise('not').alias('finemapping'),
    'susie_CP',
    'finemap_CP',
    'susie_CP_best_guess_genotypes',
    'finemap_CP_pval_thresh_5e-4',
    'finemap_CP_mac_thresh_100',
    'finemap_CP_prior_effect_size_0.05%',
    'finemap_CP_prior_4_signals',
    'finemap_CP_stopping_thresh_1e-4',
    'susie_CP_prior_snps_over_strs',
    'finemap_CP_prior_snps_over_strs',
    'finemap_CP_prior_effect_size_0.0025%',
    pl.sum([
        pl.col(f'{ethnicity}_p_val').cast(str) + pl.lit(', ')
        for ethnicity in other_ethnicities
    ]).str.replace(', $', '').alias('other_ethnicity_association_p_values'),
    pl.sum([
        pl.when(pl.col(f'{ethnicity}_p_val') > .05).then('NA').when(
            pl.col('coeff') > 0).then('+').otherwise('-') + pl.lit(', ')
        for ethnicity in other_ethnicities
    ]).str.replace(', $', '').alias('other_ethnicity_effect_directions'),
    *[
        pl.col(f'{ethnicity}_allele_dosages').apply(
            dosages_to_frequencies).alias(f'{ethnicity}_allele_frequencies')
        for ethnicity in other_ethnicities
    ],
])

finemapping_results.write_csv(
Example #28
print("q10")
out = (x.groupby(["id1", "id2", "id3", "id4", "id5", "id6"]).agg(
    [pl.sum("v3").alias("v3"),
     pl.count("v1").alias("count")]).collect())
print(time.time() - t0)
print("easy took:", easy_time, "s")
print("advanced took:", time.time() - t0advanced, "s")
total_time = time.time() - t00
print("total took:", total_time, "s")
assert out.shape == (9999995, 8)

if not ON_STRINGS:
    if total_time > 11:
        print("query took longer than 11s, may be noise")
        exit(1)

# Additional tests
# The code below does not belong to the db-benchmark,
# but it triggers other code paths, so the checksum
# assertions serve as a sort of integration test.
out = (x.filter(pl.col("id1") == pl.lit("id046")).select(
    [pl.sum("id6"), pl.sum("v3")]).collect())
assert out["id6"] == 430957682
assert np.isclose(out["v3"], 4.724150165888001e6)

out = (x.filter(~(pl.col("id1") == pl.lit("id046"))).select(
    [pl.sum("id6"), pl.sum("v3")]).collect())

assert out["id6"] == 2137755425
assert np.isclose(out["v3"], 4.7040828499563754e8)
Example #29
def test_numpy_to_lit() -> None:
    out = pl.select(pl.lit(np.array([1, 2, 3]))).to_series().to_list()
    assert out == [1, 2, 3]
    out = pl.select(pl.lit(np.float32(0))).to_series().to_list()
    assert out == [0.0]
Example #30
def test_datetime_consistency() -> None:
    # a microsecond value like 123456 would be truncated, since only
    # millisecond precision is kept:
    # dt = datetime(2021, 1, 1, 10, 30, 45, 123456)
    dt = datetime(2021, 1, 1, 10, 30, 45, 123000)
    df = pl.DataFrame({"date": [dt]})
    assert df["date"].dt[0] == dt
    assert df.select(pl.lit(dt))["literal"].dt[0] == dt