Example No. 1
def test_filter_str():
    # use a str instead of a column expr
    df = pl.DataFrame(
        {
            "time": ["11:11:00", "11:12:00", "11:13:00", "11:14:00"],
            "bools": [True, False, True, False],
        }
    )
    q = df.lazy()
    # collect the last row of the rows that pass the boolean filter
    result = q.filter(pl.col("bools")).select(pl.last("*")).collect()
    # two rows pass the filter; the result is a single row with both columns
    assert result.shape == (1, 2)
Example No. 2
def get_complete_df():
    return pl.DataFrame(
        {
            "bools": [False, True, False],
            "bools_nulls": [None, True, False],
            "int": [1, 2, 3],
            "int_nulls": [1, None, 3],
            "floats": [1.0, 2.0, 3.0],
            "floats_nulls": [1.0, None, 3.0],
            "strings": ["foo", "bar", "ham"],
            "strings_nulls": ["foo", None, "ham"],
        }
    )
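A fixture like this is normally consumed by tests. A minimal sketch of such a test (the assertions are illustrative, assume Series.null_count is available in this version, and are not taken from the original suite):

def test_get_complete_df():
    df = get_complete_df()
    # three rows and eight columns, as constructed above
    assert df.shape == (3, 8)
    # each *_nulls column was built with exactly one missing value
    assert df["int_nulls"].null_count() == 1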
Example No. 3
import pypolars as pl
from pypolars.lazy import *

my_map = {1: "foo", 2: "bar", 3: "ham", 4: "spam", 5: "eggs"}

df = pl.DataFrame({"foo": [1, 2, 3, 4, 5]})

# create a udf that maps every value through the my_map dict
def my_custom_func(s: Series) -> Series:
    return s.apply(lambda x: my_map[x])


# run query with udf
out = df.lazy().with_column(col("foo").map(my_custom_func).alias("mapped"))

if __name__ == "__main__":
    with open("book/src/outputs/how_can_i_use_custom_functions_2.txt", "w") as f:
        f.write(str(out.collect()))
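For comparison, the same dict lookup can be run eagerly on the Series itself, reusing the apply call from the udf above (a sketch, not part of the original snippet):

# eager equivalent: apply the dict lookup directly to the "foo" column
mapped = df["foo"].apply(lambda x: my_map[x])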
Example No. 4
import numpy as np
from pandas.util.testing import rands
import pypolars as pl

data_dir = "./data"

groups = np.arange(10)
str_groups = np.array(list("0123456789"))
np.random.seed(1)

for size in [1e4, 1e5, 1e6, 1e7]:
    print(f"create groupby data for {int(size)} rows")
    size = int(size)
    g = np.random.choice(groups, size)
    sg = np.random.choice(str_groups, size)
    v = np.random.randn(size)
    df = pl.DataFrame({"groups": g, "values": v, "str": sg})
    df.to_csv(f"{data_dir}/{size}.csv")

print("groupby data created")

# Join benchmark data
# https://wesmckinney.com/blog/high-performance-database-joins-with-pandas-dataframe-more-benchmarks/
# https://github.com/wesm/pandas/blob/23669822819808bbaeb6ea36a6b2ef98026884db/bench/bench_merge_sqlite.py
N = 10000
indices = np.array([rands(10) for _ in range(N)], dtype="O")
indices2 = np.array([rands(10) for _ in range(N)], dtype="O")
key = np.tile(indices[:8000], 10)
key2 = np.tile(indices2[:8000], 10)

left = pl.DataFrame({
    "key": key,
Example No. 5
import numpy as np
import pypolars as pl

uid = [item for sublist in [4 * [r] for r in range(3)] for item in sublist]
date = [
    "2020-12-20",
    "2020-12-21",
    "2020-12-22",
    "2020-12-23",
]
cumcases = [20, 40, 67, 80]

df = pl.DataFrame({
    "uid": uid,
    "date": np.hstack([date, date, date]),
    "cumcases": np.hstack(
        [cumcases, [2 * c for c in cumcases], [3 * c for c in cumcases]]
    ),
})


def mkdiff(cumcases: pl.Series) -> pl.Series:
    """
    Creates a new Series with differences per row
    """
    return cumcases - cumcases.shift(1)


base_df = (
    df.lazy()
Example No. 6
import pypolars as pl
from pypolars.lazy import *
import numpy as np

df = pl.DataFrame({
    "range": np.arange(10),
    "left": ["foo"] * 10,
    "right": ["bar"] * 10
})

out = df.lazy().with_column(
    when(col("range") >= 5)
    .then(col("left"))
    .otherwise(col("right"))
    .alias("foo_or_bar")
)

if __name__ == "__main__":
    with open("book/src/outputs/how_can_i_conditionally_apply.txt", "w") as f:
        f.write(str(out.collect()))
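For reference, the same conditional choice expressed with plain NumPy (illustrative only, not part of the original query):

# rows where range >= 5 get "foo" (the "left" value), the rest get "bar"
expected = np.where(np.arange(10) >= 5, "foo", "bar")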
Example No. 7
import pypolars as pl
from pypolars.lazy import *


df = pl.DataFrame(
    {
        "A": [1, 2, 3, 4, 5],
        "fruits": ["banana", "banana", "apple", "apple", "banana"],
        "B": [5, 4, 3, 2, 1],
        "cars": ["beetle", "audi", "beetle", "beetle", "beetle"],
    }
)

# determine the group lengths in two ways: with a Python UDF (apply) and with the built-in pl.count
out = (
    df.lazy()
    .groupby("fruits")
    .agg(
        [
            col("cars").apply(lambda groups: groups.len()).alias("custom_1"),
            col("cars").apply(lambda groups: groups.len()).alias("custom_2"),
            pl.count("cars"),
        ]
    )
)


if __name__ == "__main__":
    with open("book/src/outputs/how_can_i_use_custom_functions_3.txt", "w") as f:
        f.write(str(out.collect()))
Example No. 8
                }"""
        element_props = [
            ("tbody tr th:only-of-type", "vertical-align", "middle"),
            ("tbody tr th", "vertical-align", "top"),
        ]
        element_props.append(("thead th", "text-align", "right"))
        template_mid = "\n\n".join(
            map(lambda t: template_select % t, element_props))
        template = dedent("\n".join(
            (template_first, template_mid, template_last)))
        self.write(template)

    def render(self) -> List[str]:
        """
        Return the lines needed to render a HTML table.
        """
        with Tag(self.elements, "div"):
            self.write_style()
            super().render()
        return self.elements


if __name__ == "__main__":
    import pypolars as pl

    df = pl.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]})

    fmt = NotebookFormatter(df)

    print("\n".join(fmt.render()))
Example No. 9
import pypolars as pl
from pypolars.lazy import *
import numpy as np

np.random.seed(1)

df = pl.DataFrame({"foo": np.arange(10), "bar": np.random.rand(10)})


# create a udf
def my_custom_func(s: Series) -> Series:
    return np.exp(s) / np.log(s)


# a simple wrapper that takes a function and sets the output type
my_udf = udf(my_custom_func, output_type=pl.Float64)

# run query with udf
out = df.lazy().filter(col("bar").map(my_udf) > -1)

if __name__ == "__main__":
    with open("book/src/outputs/how_can_i_use_custom_functions_1.txt",
              "w") as f:
        f.write(str(out.collect()))
Example No. 10
def test_from_arrow_table():
    data = {"a": [1, 2], "b": [1, 2]}
    tbl = pa.table(data)

    df = pl.from_arrow_table(tbl)
    # check the round-trip against a DataFrame built directly from the dict
    assert df.frame_equal(pl.DataFrame(data))
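A round-trip in the other direction could be checked the same way (a sketch; it assumes DataFrame.to_arrow is available in this version):

def test_to_arrow_roundtrip():
    df = pl.DataFrame({"a": [1, 2], "b": [1, 2]})
    tbl = df.to_arrow()
    assert tbl.num_rows == 2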
Example No. 11
import pypolars as pl
from pypolars.lazy import *

df = pl.DataFrame({"shakespeare": "All that glitters is not gold".split(" ")})

str_lengths = df.lazy().with_column(
    col("shakespeare").str_lengths().alias("letter_count"))

df = pl.DataFrame({"a": "The man that ate a whole cake".split(" ")})

filtered = df.lazy().filter(col("a").str_contains(r"(?i)^the$|^a$").is_not())

if __name__ == "__main__":
    with open("book/src/outputs/how_can_i_process_strings.txt", "w") as f:
        f.write(str(str_lengths.collect()))

    with open("book/src/outputs/how_can_i_process_strings_1.txt", "w") as f:
        f.write(str(filtered.collect()))
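The filter above keeps every word that does not fully match "the" or "a", ignoring case. The same predicate in plain Python, for reference:

import re

# keep words that are not "the" or "a" (case-insensitive)
pattern = re.compile(r"(?i)^the$|^a$")
kept = [w for w in "The man that ate a whole cake".split(" ") if not pattern.match(w)]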