def test_filter_str():
    """Filter on a boolean column, then take the last row of the result.

    NOTE(review): the original comment says "use a str instead of a column
    expr", but the code filters with ``pl.col("bools")`` — confirm whether
    ``q.filter("bools")`` was the intended variant.
    """
    # use a str instead of a column expr
    df = pl.DataFrame(
        {
            "time": ["11:11:00", "11:12:00", "11:13:00", "11:14:00"],
            "bools": [True, False, True, False],
        }
    )
    q = df.lazy()

    # last row based on a filter.
    # Bug fix: the original never called .collect(), so the lazy plan was
    # built but never executed and nothing was verified.
    result = q.filter(pl.col("bools")).select(pl.last("*")).collect()
    # last(*) aggregates every column down to a single row
    assert result.shape[0] == 1
def get_complete_df():
    """Return a small fixture DataFrame.

    Covers bool/int/float/string columns, each in a fully-populated and a
    null-containing variant.
    """
    columns = {
        "bools": [False, True, False],
        "bools_nulls": [None, True, False],
        "int": [1, 2, 3],
        "int_nulls": [1, None, 3],
        "floats": [1.0, 2.0, 3.0],
        "floats_nulls": [1.0, None, 3.0],
        "strings": ["foo", "bar", "ham"],
        "strings_nulls": ["foo", None, "ham"],
    }
    return pl.DataFrame(columns)
import pypolars as pl
from pypolars.lazy import *

# Lookup table used by the udf below.
my_map = {1: "foo", 2: "bar", 3: "ham", 4: "spam", 5: "eggs"}

df = pl.DataFrame({"foo": [1, 2, 3, 4, 5]})


# create a udf
def my_custom_func(s: Series) -> Series:
    """Map every element of `s` through `my_map`."""
    def lookup(value):
        return my_map[value]

    return s.apply(lookup)


# run query with udf
out = df.lazy().with_column(col("foo").map(my_custom_func).alias("mapped"))

if __name__ == "__main__":
    with open("book/src/outputs/how_can_i_use_custom_functions_2.txt", "w") as f:
        f.write(str(out.collect()))
from pandas.util.testing import rands
import pypolars as pl

# NOTE(review): `np` is used throughout but `import numpy as np` is not
# visible in this chunk — presumably imported elsewhere; confirm.

data_dir = "./data"

# Ten integer group labels and ten single-character string labels to
# draw from when generating groupby benchmark data.
groups = np.arange(10)
str_groups = np.array(list("0123456789"))

np.random.seed(1)  # fixed seed so benchmark inputs are reproducible

# Write one CSV per dataset size: 10k, 100k, 1M, 10M rows.
for size in [1e4, 1e5, 1e6, 1e7]:
    print(f"create groupby data for {int(size)} rows")
    size = int(size)
    g = np.random.choice(groups, size)       # integer group column
    sg = np.random.choice(str_groups, size)  # string group column
    v = np.random.randn(size)                # float values column

    df = pl.DataFrame({"groups": g, "values": v, "str": sg})
    df.to_csv(f"{data_dir}/{size}.csv")

print("groupby data created")

# Join benchmark data
# https://wesmckinney.com/blog/high-performance-database-joins-with-pandas-dataframe-more-benchmarks/
# https://github.com/wesm/pandas/blob/23669822819808bbaeb6ea36a6b2ef98026884db/bench/bench_merge_sqlite.py
N = 10000
# Two pools of random 10-char string keys for the join benchmark.
indices = np.array([rands(10) for _ in range(N)], dtype="O")
indices2 = np.array([rands(10) for _ in range(N)], dtype="O")
# Tile the first 8000 keys 10x so the join has duplicate matches.
key = np.tile(indices[:8000], 10)
key2 = np.tile(indices2[:8000], 10)

# NOTE(review): this snippet is truncated here — the dict literal below is
# never closed in the visible source.
left = pl.DataFrame({
    "key": key,
import numpy as np

# NOTE(review): `pl` is used below but `import pypolars as pl` is not
# visible in this chunk — presumably imported elsewhere; confirm.

# Three uids (0, 1, 2), each repeated 4 times -> 12 rows total.
uid = [item for sublist in [4 * [r] for r in range(3)] for item in sublist]
date = [
    "2020-12-20",
    "2020-12-21",
    "2020-12-22",
    "2020-12-23",
]
cumcases = [20, 40, 67, 80]

# One 4-day series per uid; uid 1 doubles and uid 2 triples the base counts.
df = pl.DataFrame({
    "uid": uid,
    "date": np.hstack([date, date, date]),
    "cumcases": np.hstack([cumcases, [2 * c for c in cumcases], [3 * c for c in cumcases]]),
})


def mkdiff(cumcases: pl.Series) -> pl.Series:
    """
    Creates a new Series with differences per row
    """
    # First row pairs with a null from shift(1), so the first diff is null.
    return cumcases - cumcases.shift(1)


# NOTE(review): this snippet is truncated here — the expression below is
# never closed in the visible source.
base_df = (
    df.lazy()
import pypolars as pl
from pypolars.lazy import *
import numpy as np

df = pl.DataFrame({
    "range": np.arange(10),
    "left": ["foo"] * 10,
    "right": ["bar"] * 10,
})

# Pick the "left" value where range >= 5, otherwise the "right" value.
condition = col("range") >= 5
foo_or_bar = when(condition).then(col("left")).otherwise(col("right"))
out = df.lazy().with_column(foo_or_bar.alias("foo_or_bar"))

if __name__ == "__main__":
    with open("book/src/outputs/how_can_i_conditionally_apply.txt", "w") as f:
        f.write(str(out.collect()))
import pypolars as pl
from pypolars.lazy import *

df = pl.DataFrame(
    {
        "A": [1, 2, 3, 4, 5],
        "fruits": ["banana", "banana", "apple", "apple", "banana"],
        "B": [5, 4, 3, 2, 1],
        "cars": ["beetle", "audi", "beetle", "beetle", "beetle"],
    }
)


def _group_length(groups):
    """Length of a single group's Series."""
    return groups.len()


# two ways to determine the length groups.
aggregations = [
    col("cars").apply(_group_length).alias("custom_1"),
    col("cars").apply(_group_length).alias("custom_2"),
    pl.count("cars"),
]
out = df.lazy().groupby("fruits").agg(aggregations)

if __name__ == "__main__":
    with open("book/src/outputs/how_can_i_use_custom_functions_3.txt", "w") as f:
        f.write(str(out.collect()))
                }"""
        # Per-element CSS rules for the generated table: (selector, property, value).
        element_props = [
            ("tbody tr th:only-of-type", "vertical-align", "middle"),
            ("tbody tr th", "vertical-align", "top"),
        ]
        element_props.append(("thead th", "text-align", "right"))
        # Fill the `template_select` CSS template once per rule and join with
        # blank lines, then wrap with the style-tag open/close templates.
        template_mid = "\n\n".join(
            map(lambda t: template_select % t, element_props))
        template = dedent("\n".join(
            (template_first, template_mid, template_last)))
        self.write(template)

    def render(self) -> List[str]:
        """
        Return the lines needed to render a HTML table.
        """
        # Emit the <style> block and the table inside a wrapping <div>.
        with Tag(self.elements, "div"):
            self.write_style()
            super().render()
        return self.elements


if __name__ == "__main__":
    # Quick manual smoke test: render a tiny frame to stdout.
    import pypolars as pl

    df = pl.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]})
    fmt = NotebookFormatter(df)
    print("\n".join(fmt.render()))
import pypolars as pl
from pypolars.lazy import *
import numpy as np

np.random.seed(1)
df = pl.DataFrame({"foo": np.arange(10), "bar": np.random.rand(10)})


# create a udf
def my_custom_func(s: Series) -> Series:
    """Elementwise exp(s) / log(s), computed with numpy."""
    numerator = np.exp(s)
    denominator = np.log(s)
    return numerator / denominator


# a simple wrapper that take a function and sets output type
my_udf = udf(my_custom_func, output_type=pl.Float64)

# run query with udf
predicate = col("bar").map(my_udf) > -1
out = df.lazy().filter(predicate)

if __name__ == "__main__":
    with open("book/src/outputs/how_can_i_use_custom_functions_1.txt", "w") as f:
        f.write(str(out.collect()))
def test_from_arrow_table():
    """Round-trip an Arrow table through pl.from_arrow_table."""
    data = {"a": [1, 2], "b": [1, 2]}
    tbl = pa.table(data)
    df = pl.from_arrow_table(tbl)
    # Bug fix: frame_equal returns a bool that was previously discarded,
    # so the test could never fail.
    assert df.frame_equal(pl.DataFrame(data))
import pypolars as pl
from pypolars.lazy import *

# Per-word string lengths.
df = pl.DataFrame({"shakespeare": "All that glitters is not gold".split(" ")})
letter_count = col("shakespeare").str_lengths().alias("letter_count")
str_lengths = df.lazy().with_column(letter_count)

# Drop articles ("the"/"a", case-insensitive) by negating a regex match.
df = pl.DataFrame({"a": "The man that ate a whole cake".split(" ")})
is_article = col("a").str_contains(r"(?i)^the$|^a$")
filtered = df.lazy().filter(is_article.is_not())

if __name__ == "__main__":
    with open("book/src/outputs/how_can_i_process_strings.txt", "w") as f:
        f.write(str(str_lengths.collect()))
    with open("book/src/outputs/how_can_i_process_strings_1.txt", "w") as f:
        f.write(str(filtered.collect()))