def test_lazy() -> None:
    df = pl.DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})
    _ = df.lazy().with_column(lit(1).alias("foo")).select([col("a"), col("foo")])

    # test if it executes
    _ = (
        df.lazy()
        .with_column(
            when(col("a").gt(lit(2))).then(lit(10)).otherwise(lit(1)).alias("new")
        )
        .collect()
    )

    # test if pl.list is available, this is `to_list` re-exported as list
    df.groupby("a").agg(pl.list("b"))

def test_row_count(foods_csv: str) -> None:
    df = pl.read_csv(foods_csv, row_count_name="row_count")
    assert df["row_count"].to_list() == list(range(27))

    df = (
        pl.scan_csv(foods_csv, row_count_name="row_count")
        .filter(pl.col("category") == pl.lit("vegetables"))
        .collect()
    )
    assert df["row_count"].to_list() == [0, 6, 11, 13, 14, 20, 25]

    df = (
        pl.scan_csv(foods_csv, row_count_name="row_count")
        .with_row_count("foo", 10)
        .filter(pl.col("category") == pl.lit("vegetables"))
        .collect()
    )
    assert df["foo"].to_list() == [10, 16, 21, 23, 24, 30, 35]

def test_str_split() -> None:
    a = pl.Series("a", ["a, b", "a", "ab,c,de"])
    for out in [a.str.split(","), pl.select(pl.lit(a).str.split(",")).to_series()]:
        assert out[0].to_list() == ["a", " b"]
        assert out[1].to_list() == ["a"]
        assert out[2].to_list() == ["ab", "c", "de"]

    for out in [
        a.str.split(",", inclusive=True),
        pl.select(pl.lit(a).str.split(",", inclusive=True)).to_series(),
    ]:
        assert out[0].to_list() == ["a,", " b"]
        assert out[1].to_list() == ["a"]
        assert out[2].to_list() == ["ab,", "c,", "de"]

def load_dir(pheno, region, dir_):
    with open(f'{dir_}/converged.txt') as converged:
        assert converged.read().strip() == 'TRUE'

    alphas = (
        pl.scan_csv(f'{dir_}/alpha.tab', sep='\t', has_header=False)
        .collect()
        .to_numpy()
        .T
    )
    susie_pips = 1 - np.prod(1 - alphas, axis=1)

    df = (
        pl.scan_csv(
            f'{dir_}/colnames.txt',
            has_header=False,
            with_column_names=lambda _: ['var_name'],
        )
        .with_column(pl.lit(1).alias('row_number'))
        .with_columns([
            pl.col('row_number').cumsum(),
            pl.lit(None, int).alias('cs_num'),
            pl.lit(region).alias('region'),
            pl.lit(pheno).alias('phenotype'),
            pl.Series(susie_pips).alias('susie_pip'),
            pl.lit(None, float).alias('susie_cs_pip'),
        ])
    )

    for cs_num in range(1, 51):
        cs_fname = f'{dir_}/cs{cs_num}.txt'
        if not os.path.exists(cs_fname):
            continue
        with open(cs_fname) as cs:
            var_nums = [int(var_num) for var_num in next(cs).strip().split()]
            next(cs)
            min_ld = float(next(cs).split()[0])
        if min_ld < min_ld_thresh:
            continue
        df = df.with_columns([
            # variants already assigned to a credible set get marked -1
            pl.when(pl.col('row_number').is_in(var_nums))
            .then(pl.when(~pl.col('cs_num').is_null()).then(-1).otherwise(cs_num))
            .otherwise(pl.col('cs_num'))
            .alias('cs_num'),
            pl.when(pl.col('row_number').is_in(var_nums))
            .then(pl.Series(alphas[:, cs_num - 1]))
            .otherwise(pl.col('susie_cs_pip'))
            .alias('susie_cs_pip'),
        ])

    df = df.with_column(
        pl.when(pl.col('cs_num') != -1)
        .then(pl.col('susie_cs_pip'))
        .otherwise(-1)
        .alias('susie_cs_pip')
    )
    df = df.filter(
        pl.col('var_name').str.contains('^STR')
        & ~pl.col('cs_num').is_null()
        & (pl.col('susie_pip') > 0.05)
    ).drop('row_number')
    return df

def test_filter_date() -> None:
    dataset = pl.DataFrame(
        {"date": ["2020-01-02", "2020-01-03", "2020-01-04"], "index": [1, 2, 3]}
    )
    df = dataset.with_column(pl.col("date").str.strptime(pl.Date32, "%Y-%m-%d"))

    assert df.filter(col("date") <= pl.lit_date(datetime(2019, 1, 3))).is_empty()
    assert df.filter(col("date") < pl.lit_date(datetime(2020, 1, 4))).shape[0] == 2
    assert df.filter(col("date") < pl.lit_date(datetime(2020, 1, 5))).shape[0] == 3
    assert df.filter(col("date") <= pl.lit(datetime(2019, 1, 3))).is_empty()
    assert df.filter(col("date") < pl.lit(datetime(2020, 1, 4))).shape[0] == 2
    assert df.filter(col("date") < pl.lit(datetime(2020, 1, 5))).shape[0] == 3

def calculate_friction_number(column_names: List[str]) -> "pl.Expr":
    if "fs" in column_names and "qc" in column_names:
        # Guard against division by zero: a zero cone resistance (qc) is
        # turned into null, so the ratio becomes null instead of inf.
        return (
            col("fs")
            / when(col("qc") == 0.0).then(None).otherwise(col("qc"))
            * 100.0
        ).alias("friction_number")
    else:
        return lit(0.0).alias("friction_number")

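# A minimal, self-contained sketch of the divide-by-zero guard above, on
# made-up data (the "fs"/"qc" values are illustrative, not from a real CPT):
# rows where qc == 0 yield a null friction number rather than a division error.
def _friction_number_example() -> None:
    df = pl.DataFrame({"fs": [10.0, 20.0, 30.0], "qc": [100.0, 0.0, 50.0]})
    out = df.select(calculate_friction_number(df.columns))
    assert out["friction_number"].to_list() == [10.0, None, 60.0]
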
def replace_column_void(lf: pl.LazyFrame, column_void) -> pl.LazyFrame:
    if column_void is None:
        return lf

    # TODO: what to do with multiple column voids?
    if isinstance(column_void, list):
        column_void = column_void[0]

    return (
        # Get all values matching column_void and change them to null
        lf.select(
            pl.when(pl.all() == pl.lit(column_void))
            .then(pl.lit(None))
            .otherwise(pl.all())
            .keep_name()
        )
        # Interpolate all null values
        .select(pl.all().interpolate())
        # Remove the rows with null values
        .drop_nulls()
    )

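# A small usage sketch with made-up data: -9999.0 plays the role of the void
# sentinel. It is nulled out, interpolated from its neighbours, and any rows
# that are still null (e.g. a leading void with nothing to interpolate from)
# are dropped.
def _replace_column_void_example() -> None:
    lf = pl.DataFrame({"depth": [1.0, 2.0, 3.0], "qc": [0.5, -9999.0, 1.5]}).lazy()
    out = replace_column_void(lf, -9999.0).collect()
    assert out["qc"].to_list() == [0.5, 1.0, 1.5]
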
def test_contains() -> None:
    a = pl.Series("a", [[1, 2, 3], [2, 5], [6, 7, 8, 9]])
    out = a.arr.contains(2)
    expected = pl.Series("a", [True, True, False])
    testing.assert_series_equal(out, expected)

    out = pl.select(pl.lit(a).arr.contains(2)).to_series()
    testing.assert_series_equal(out, expected)

def test_shuffle() -> None:
    a = pl.Series("a", [1, 2, 3])
    out = a.shuffle(2)
    expected = pl.Series("a", [2, 1, 3])
    testing.assert_series_equal(out, expected)

    out = pl.select(pl.lit(a).shuffle(2)).to_series()
    testing.assert_series_equal(out, expected)

def test_set_null() -> None:
    df = pl.DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})
    out = (
        df.lazy()
        .with_column(when(col("a") > 1).then(lit(None)).otherwise(100).alias("foo"))
        .collect()
    )
    s = out["foo"]
    assert s[0] == 100
    assert s[1] is None
    assert s[2] is None

def test_str_concat() -> None:
    df = pl.DataFrame({
        "nrs": [1, 2, 3, 4],
        "name": ["ham", "spam", "foo", None],
    })
    out = df.with_column((pl.lit("Dr. ") + pl.col("name")).alias("graduated_name"))
    assert out["graduated_name"][0] == "Dr. ham"
    assert out["graduated_name"][1] == "Dr. spam"

def test_fold_filter() -> None:
    df = pl.DataFrame({"a": [1, 2, 3], "b": [0, 1, 2]})

    # fold with & is a horizontal "all": every column must be > 1
    out = df.filter(
        pl.fold(
            acc=pl.lit(True),
            f=lambda a, b: a & b,
            exprs=[col(c) > 1 for c in df.columns],
        )
    )
    assert out.shape == (1, 2)

    # fold with | and a True accumulator is always true, so no row is filtered
    out = df.filter(
        pl.fold(
            acc=pl.lit(True),
            f=lambda a, b: a | b,
            exprs=[col(c) > 1 for c in df.columns],
        )
    )
    assert out.shape == (3, 2)

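# pl.fold reduces expressions horizontally (across columns) with an arbitrary
# binary function; booleans above are one case, arithmetic is another. A
# minimal sketch of a per-row sum, using the same fold signature as above:
def _fold_row_sum_example() -> None:
    df = pl.DataFrame({"a": [1, 2, 3], "b": [10, 20, 30]})
    out = df.select(
        pl.fold(acc=pl.lit(0), f=lambda acc, x: acc + x, exprs=pl.col("*")).alias(
            "row_sum"
        )
    )
    assert out["row_sum"].to_list() == [11, 22, 33]
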
def test_list_empty_groupby_result_3521() -> None:
    # Create a left relation where the join column contains a null value
    left = pl.DataFrame().with_columns([
        pl.lit(1).alias("groupby_column"),
        pl.lit(None).cast(pl.Int32).alias("join_column"),
    ])

    # Create a right relation where there is a column to count distinct on
    right = pl.DataFrame().with_columns([
        pl.lit(1).alias("join_column"),
        pl.lit(1).alias("n_unique_column"),
    ])

    # Calculate n_unique after dropping nulls
    # This will panic on polars version 0.13.38 and 0.13.39
    assert (
        left.join(right, on="join_column", how="left")
        .groupby("groupby_column")
        .agg(pl.col("n_unique_column").drop_nulls())
    ).to_dict(False) == {"groupby_column": [1], "n_unique_column": [[]]}

def test_when_then_edge_cases_3994() -> None:
    df = pl.DataFrame(data={"id": [1, 1], "type": [2, 2]})

    # this tests if lazy correctly assigns the list schema to the column aggregation
    assert (
        df.lazy()
        .groupby(["id"])
        .agg(pl.col("type"))
        .with_column(
            pl.when(pl.col("type").arr.lengths() == 0)
            .then(pl.lit(None))
            .otherwise(pl.col("type"))
            .keep_name()
        )
        .collect()
    ).to_dict(False) == {"id": [1], "type": [[2, 2]]}

    # this tests ternary with an empty argument
    assert (
        df.filter(pl.col("id") == 42)
        .groupby(["id"])
        .agg(pl.col("type"))
        .with_column(
            pl.when(pl.col("type").arr.lengths() == 0)
            .then(pl.lit(None))
            .otherwise(pl.col("type"))
            .keep_name()
        )
    ).to_dict(False) == {"id": [], "type": []}

def test_list_arr_get() -> None:
    a = pl.Series("a", [[1, 2, 3], [4, 5], [6, 7, 8, 9]])
    out = a.arr.get(0)
    expected = pl.Series("a", [1, 4, 6])
    testing.assert_series_equal(out, expected)
    out = a.arr.first()
    testing.assert_series_equal(out, expected)
    out = pl.select(pl.lit(a).arr.first()).to_series()
    testing.assert_series_equal(out, expected)

    out = a.arr.get(-1)
    expected = pl.Series("a", [3, 5, 9])
    testing.assert_series_equal(out, expected)
    out = a.arr.last()
    testing.assert_series_equal(out, expected)
    out = pl.select(pl.lit(a).arr.last()).to_series()
    testing.assert_series_equal(out, expected)

    a = pl.Series("a", [[1, 2, 3], [4, 5], [6, 7, 8, 9]])
    out = a.arr.get(-3)
    expected = pl.Series("a", [1, None, 7])
    testing.assert_series_equal(out, expected)

def test_fill_null() -> None:
    dt = datetime.strptime("2021-01-01", "%Y-%m-%d")
    s = pl.Series("A", [dt, None])

    for fill_val in (dt, pl.lit(dt)):
        out = s.fill_null(fill_val)  # type: ignore[arg-type]
        assert out.null_count() == 0
        assert out.dt[0] == dt
        assert out.dt[1] == dt

    dt1 = date(2001, 1, 1)
    dt2 = date(2001, 1, 2)
    dt3 = date(2001, 1, 3)
    s = pl.Series("a", [dt1, dt2, dt3, None])

    dt_2 = date(2001, 1, 4)
    for fill_val in (dt_2, pl.lit(dt_2)):
        out = s.fill_null(fill_val)  # type: ignore[arg-type]
        assert out.null_count() == 0
        assert out.dt[0] == dt1
        assert out.dt[1] == dt2
        assert out.dt[-1] == dt_2

def test_preservation_of_subclasses() -> None:
    """Tests for LazyFrame inheritance."""

    # We should be able to inherit from polars.LazyFrame
    class SubClassedLazyFrame(pl.LazyFrame):
        pass

    # The constructor creates an object which is an instance of both the
    # superclass and subclass
    ldf = pl.DataFrame({"column_1": [1, 2, 3]}).lazy()
    ldf.__class__ = SubClassedLazyFrame
    extended_ldf = ldf.with_column(pl.lit(1).alias("column_2"))
    assert isinstance(extended_ldf, pl.LazyFrame)
    assert isinstance(extended_ldf, SubClassedLazyFrame)

def test_abs() -> None:
    # ints
    s = pl.Series([1, -2, 3, -4])
    testing.assert_series_equal(s.abs(), pl.Series([1, 2, 3, 4]))
    testing.assert_series_equal(np.abs(s), pl.Series([1, 2, 3, 4]))  # type: ignore

    # floats
    s = pl.Series([1.0, -2.0, 3, -4.0])
    testing.assert_series_equal(s.abs(), pl.Series([1.0, 2.0, 3.0, 4.0]))
    testing.assert_series_equal(
        np.abs(s), pl.Series([1.0, 2.0, 3.0, 4.0])  # type: ignore
    )
    testing.assert_series_equal(
        pl.select(pl.lit(s).abs()).to_series(), pl.Series([1.0, 2.0, 3.0, 4.0])
    )

def test_list_arr_get() -> None:
    a = pl.Series("a", [[1, 2, 3], [4, 5], [6, 7, 8, 9]])
    out = a.arr.get(0)
    expected = pl.Series("a", [1, 4, 6])
    assert_series_equal(out, expected)
    out = a.arr.first()
    assert_series_equal(out, expected)
    out = pl.select(pl.lit(a).arr.first()).to_series()
    assert_series_equal(out, expected)

    out = a.arr.get(-1)
    expected = pl.Series("a", [3, 5, 9])
    assert_series_equal(out, expected)
    out = a.arr.last()
    assert_series_equal(out, expected)
    out = pl.select(pl.lit(a).arr.last()).to_series()
    assert_series_equal(out, expected)

    a = pl.Series("a", [[1, 2, 3], [4, 5], [6, 7, 8, 9]])
    out = a.arr.get(-3)
    expected = pl.Series("a", [1, None, 7])
    assert_series_equal(out, expected)

    assert pl.DataFrame(
        {"a": [[1], [2], [3], [4, 5, 6], [7, 8, 9], [None, 11]]}
    ).with_columns(
        [pl.col("a").arr.get(i).alias(f"get_{i}") for i in range(4)]
    ).to_dict(False) == {
        "a": [[1], [2], [3], [4, 5, 6], [7, 8, 9], [None, 11]],
        "get_0": [1, 2, 3, 4, 7, None],
        "get_1": [None, None, None, 5, 8, 11],
        "get_2": [None, None, None, 6, 9, None],
        "get_3": [None, None, None, None, None, None],
    }

def test_fold() -> None:
    df = pl.DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})
    out = df.select([
        pl.sum(["a", "b"]),
        pl.max(["a", pl.col("b") ** 2]),
        pl.min(["a", pl.col("b") ** 2]),
    ])
    assert out["sum"].series_equal(pl.Series("sum", [2.0, 4.0, 6.0]))
    assert out["max"].series_equal(pl.Series("max", [1.0, 4.0, 9.0]))
    assert out["min"].series_equal(pl.Series("min", [1.0, 2.0, 3.0]))

    out = df.select(
        pl.fold(acc=lit(0), f=lambda acc, x: acc + x, exprs=pl.col("*")).alias("foo")
    )
    assert out["foo"].to_list() == [2, 4, 6]

def test_type_coercion_when_then_otherwise_2806() -> None:
    out = (
        pl.DataFrame({"names": ["foo", "spam", "spam"], "nrs": [1, 2, 3]})
        .select([
            pl.when(pl.col("names") == "spam")
            .then(pl.col("nrs") * 2)
            .otherwise(pl.lit("other"))
            .alias("new_col"),
        ])
        .to_series()
    )
    expected = pl.Series("new_col", ["other", "4", "6"])
    assert out.to_list() == expected.to_list()

    # test it remains float32
    assert (
        pl.Series("a", [1.0, 2.0, 3.0], dtype=pl.Float32)
        .to_frame()
        .select(pl.when(pl.col("a") > 2.0).then(pl.col("a")).otherwise(0.0))
    ).to_series().dtype == pl.Float32

def type_index() -> pl.Expr:
    return (
        (
            (
                pl.lit(3.0)
                - np.log10(
                    col("normalized_cone_resistance")
                    * (1.0 - col("excess_pore_pressure_ratio"))
                    + 1.0
                )
            )
            ** 2
            + (1.5 + 1.3 * np.log10(col("normalized_friction_ratio"))) ** 2
        )
        ** 0.5
    ).alias("type_index")

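# np.log10 applied to a polars expression dispatches through NumPy's ufunc
# protocol and returns a new lazy expression, which is why the whole formula
# above stays a single pl.Expr. A minimal sketch of that dispatch on made-up
# data:
def _ufunc_dispatch_example() -> None:
    df = pl.DataFrame({"x": [1.0, 10.0, 100.0]})
    out = df.select(np.log10(col("x")).alias("log10_x"))
    assert out["log10_x"].to_list() == [0.0, 1.0, 2.0]
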
def test_reshape() -> None:
    s = pl.Series("a", [1, 2, 3, 4])
    out = s.reshape((-1, 2))
    expected = pl.Series("a", [[1, 2], [3, 4]])
    assert out.series_equal(expected)
    out = s.reshape((2, 2))
    assert out.series_equal(expected)
    out = s.reshape((2, -1))
    assert out.series_equal(expected)

    out = s.reshape((-1, 1))
    expected = pl.Series("a", [[1], [2], [3], [4]])
    assert out.series_equal(expected)

    # test lazy_dispatch
    out = pl.select(pl.lit(s).reshape((-1, 1))).to_series()
    assert out.series_equal(expected)

def test_take(fruits_cars: pl.DataFrame) -> None:
    df = fruits_cars

    # out of bounds error
    with pytest.raises(RuntimeError):
        (
            df.sort("fruits").select(
                [col("B").reverse().take([1, 2]).list().over("fruits"), "fruits"]  # type: ignore
            )
        )

    for index in [[0, 1], pl.Series([0, 1]), np.array([0, 1]), pl.lit(1)]:
        out = df.sort("fruits").select(
            [col("B").reverse().take(index).list().over("fruits"), "fruits"]  # type: ignore
        )
        assert out[0, "B"] == [2, 3]
        assert out[4, "B"] == [1, 4]

def test_sum_to_one(self):
    cols = [
        "gravel_component",
        "sand_component",
        "clay_component",
        "loam_component",
        "peat_component",
        "silt_component",
    ]
    s = self.bore.df.select([
        pl.fold(
            pl.lit(0),
            lambda a, b: a + b,
            [
                pl.when(pl.col(a) < 0).then(1 / len(cols)).otherwise(pl.col(a))
                for a in cols
            ],
        ).alias("sum")
    ])
    self.assertTrue(np.all(np.isclose(s, 1)))

def test_add_eager_column() -> None:
    df = pl.DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})
    out = df.lazy().with_column(pl.lit(pl.Series("c", [1, 2, 3]))).collect()
    assert out["c"].sum() == 6

    then('doubly')
    .when((pl.col('susie_CP') >= 0.8) | (pl.col('finemap_CP') >= 0.8))
    .then('singly')
    .otherwise('not')
    .alias('finemapping'),
    'susie_CP',
    'finemap_CP',
    'susie_CP_best_guess_genotypes',
    'finemap_CP_pval_thresh_5e-4',
    'finemap_CP_mac_thresh_100',
    'finemap_CP_prior_effect_size_0.05%',
    'finemap_CP_prior_4_signals',
    'finemap_CP_stopping_thresh_1e-4',
    'susie_CP_prior_snps_over_strs',
    'finemap_CP_prior_snps_over_strs',
    'finemap_CP_prior_effect_size_0.0025%',
    pl.sum([
        pl.col(f'{ethnicity}_p_val').cast(str) + pl.lit(', ')
        for ethnicity in other_ethnicities
    ]).str.replace(', $', '').alias('other_ethnicity_association_p_values'),
    pl.sum([
        pl.when(pl.col(f'{ethnicity}_p_val') > .05).then('NA')
        .when(pl.col('coeff') > 0).then('+')
        .otherwise('-') + pl.lit(', ')
        for ethnicity in other_ethnicities
    ]).str.replace(', $', '').alias('other_ethnicity_effect_directions'),
    *[
        pl.col(f'{ethnicity}_allele_dosages')
        .apply(dosages_to_frequencies)
        .alias(f'{ethnicity}_allele_frequencies')
        for ethnicity in other_ethnicities
    ],
])

finemapping_results.write_csv(
print("q10") out = (x.groupby(["id1", "id2", "id3", "id4", "id5", "id6"]).agg( [pl.sum("v3").alias("v3"), pl.count("v1").alias("count")]).collect()) print(time.time() - t0) print("easy took:", easy_time, "s") print("advanced took:", time.time() - t0advanced, "s") total_time = time.time() - t00 print("total took:", total_time, "s") assert out.shape == (9999995, 8) if not ON_STRINGS: if total_time > 11: print("query took longer than 11s, may be noise") exit(1) # Additional tests # the code below, does not belong to the db-benchmark # but it triggers other code paths so the checksums assertion # are a sort of integration tests out = (x.filter(pl.col("id1") == pl.lit("id046")).select( [pl.sum("id6"), pl.sum("v3")]).collect()) assert out["id6"] == 430957682 assert np.isclose(out["v3"], 4.724150165888001e6) out = (x.filter(~(pl.col("id1") == pl.lit("id046"))).select( [pl.sum("id6"), pl.sum("v3")]).collect()) assert out["id6"] == 2137755425 assert np.isclose(out["v3"], 4.7040828499563754e8)
def test_numpy_to_lit() -> None:
    out = pl.select(pl.lit(np.array([1, 2, 3]))).to_series().to_list()
    assert out == [1, 2, 3]
    out = pl.select(pl.lit(np.float32(0))).to_series().to_list()
    assert out == [0.0]

def test_datetime_consistency() -> None:
    # dt = datetime(2021, 1, 1, 10, 30, 45, 123456)
    dt = datetime(2021, 1, 1, 10, 30, 45, 123000)
    df = pl.DataFrame({"date": [dt]})
    assert df["date"].dt[0] == dt
    assert df.select(pl.lit(dt))["literal"].dt[0] == dt