def fetch(
    self,
    n_rows: int = 500,
    type_coercion: bool = True,
    predicate_pushdown: bool = True,
    projection_pushdown: bool = True,
    simplify_expression: bool = True,
    string_cache: bool = True,
    no_optimization: bool = False,
    slice_pushdown: bool = True,
) -> pli.DataFrame:
    """
    Fetch is like a collect operation, but it limits the number of rows read by every
    scan operation. This is a utility that helps debug a query on a smaller number
    of rows.

    Note that `fetch` does not guarantee the final number of rows in the DataFrame.
    Filters, join operations and a lower number of rows available in the scanned
    file influence the final number of rows.

    Parameters
    ----------
    n_rows
        Collect n_rows from the data sources.
    type_coercion
        Run type coercion optimization.
    predicate_pushdown
        Run predicate pushdown optimization.
    projection_pushdown
        Run projection pushdown optimization.
    simplify_expression
        Run simplify expressions optimization.
    string_cache
        Use a global string cache in this query.
        This is needed if you want to join on categorical columns.
    no_optimization
        Turn off optimizations.
    slice_pushdown
        Run slice pushdown optimization.

    Returns
    -------
    DataFrame
    """
    if no_optimization:
        predicate_pushdown = False
        projection_pushdown = False
        slice_pushdown = False

    ldf = self._ldf.optimization_toggle(
        type_coercion,
        predicate_pushdown,
        projection_pushdown,
        simplify_expression,
        string_cache,
        slice_pushdown,
    )
    return pli.wrap_df(ldf.fetch(n_rows))
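
# Illustrative usage of `fetch` (a sketch, not part of the library source): debug a
# lazy query against a small sample of the data. The file name "data.csv" and the
# column names below are hypothetical placeholders.
#
# >>> import polars as pl
# >>> lf = pl.scan_csv("data.csv").filter(pl.col("a") > 1)
# >>> lf.fetch(100)  # each scan reads at most 100 rows; the filter may reduce this further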
def arrow_to_pydf(
    data: "pa.Table", columns: Optional[Sequence[str]] = None, rechunk: bool = True
) -> "PyDataFrame":
    """
    Construct a PyDataFrame from an Arrow Table.
    """
    if not _PYARROW_AVAILABLE:  # pragma: no cover
        raise ImportError(
            "'pyarrow' is required when constructing a PyDataFrame from an Arrow Table."
        )
    if columns is not None:
        try:
            data = data.rename_columns(columns)
        except pa.lib.ArrowInvalid as e:
            raise ValueError(
                "Dimensions of columns arg must match data dimensions."
            ) from e

    data_dict = {}
    # dictionary columns cannot be built in different batches (categorical does not
    # allow that), so we rechunk them and create them separately.
    dictionary_cols = {}
    names = []
    for i, column in enumerate(data):
        # extract the name before casting
        if column._name is None:
            name = f"column_{i}"
        else:
            name = column._name
        names.append(name)

        column = coerce_arrow(column)
        if pa.types.is_dictionary(column.type):
            ps = arrow_to_pyseries(name, column, rechunk)
            dictionary_cols[i] = pli.wrap_s(ps)
        else:
            data_dict[name] = column

    if len(data_dict) > 0:
        tbl = pa.table(data_dict)

        # path for a table without rows that keeps the datatypes
        if tbl.shape[0] == 0:
            pydf = pli.DataFrame._from_pandas(tbl.to_pandas())._df
        else:
            pydf = PyDataFrame.from_arrow_record_batches(tbl.to_batches())
    else:
        pydf = pli.DataFrame([])._df
    if rechunk:
        pydf = pydf.rechunk()

    if len(dictionary_cols) > 0:
        df = pli.wrap_df(pydf)
        for i, s in dictionary_cols.items():
            df[s.name] = s
        df = df[names]
        pydf = df._df

    return pydf
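
# Illustrative path into `arrow_to_pydf` (a sketch, not part of the library source):
# users normally reach it through the public `pl.from_arrow` constructor. The column
# names and values below are hypothetical.
#
# >>> import pyarrow as pa
# >>> import polars as pl
# >>> tbl = pa.table(
# ...     {
# ...         "ints": [1, 2, 3],
# ...         "cats": pa.array(["a", "b", "a"]).dictionary_encode(),
# ...     }
# ... )
# >>> pl.from_arrow(tbl)  # the dictionary-encoded column takes the separate `dictionary_cols` path above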
def collect(
    self,
    type_coercion: bool = True,
    predicate_pushdown: bool = True,
    projection_pushdown: bool = True,
    simplify_expression: bool = True,
    string_cache: bool = False,
    no_optimization: bool = False,
    slice_pushdown: bool = True,
) -> pli.DataFrame:
    """
    Collect into a DataFrame.

    Note: use `fetch` if you want to run this query on the first `n` rows only.
    This can be a huge time saver when debugging queries.

    Parameters
    ----------
    type_coercion
        Do type coercion optimization.
    predicate_pushdown
        Do predicate pushdown optimization.
    projection_pushdown
        Do projection pushdown optimization.
    simplify_expression
        Run simplify expressions optimization.
    string_cache
        Use a global string cache in this query.
        This is needed if you want to join on categorical columns.

        Caution!
            If you have already set a global string cache, set this to `False` as
            this will reset the global cache when the query is finished.
    no_optimization
        Turn off optimizations.
    slice_pushdown
        Run slice pushdown optimization.

    Returns
    -------
    DataFrame
    """
    if no_optimization:
        predicate_pushdown = False
        projection_pushdown = False
        slice_pushdown = False

    ldf = self._ldf.optimization_toggle(
        type_coercion,
        predicate_pushdown,
        projection_pushdown,
        simplify_expression,
        string_cache,
        slice_pushdown,
    )
    return pli.wrap_df(ldf.collect())
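
# Illustrative usage of `collect` (a sketch, not part of the library source): build a
# lazy query and materialize it into a DataFrame. The file name "data.csv" and the
# column names are hypothetical.
#
# >>> import polars as pl
# >>> df = (
# ...     pl.scan_csv("data.csv")
# ...     .filter(pl.col("a") > 1)
# ...     .select(["a", "b"])
# ...     .collect()  # optimizations such as predicate/projection pushdown run here
# ... )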
def concat(
    items: (
        Sequence[pli.DataFrame]
        | Sequence[pli.Series]
        | Sequence[pli.LazyFrame]
        | Sequence[pli.Expr]
    ),
    rechunk: bool = True,
    how: str = "vertical",
) -> pli.DataFrame | pli.Series | pli.LazyFrame | pli.Expr:
    """
    Aggregate all the DataFrames/Series in a list into a single DataFrame/Series.

    Parameters
    ----------
    items
        DataFrames/Series/LazyFrames to concatenate.
    rechunk
        Rechunk the final DataFrame/Series.
    how
        Only used if the items are DataFrames.
        One of {"vertical", "diagonal", "horizontal"}.

        - Vertical: Applies multiple `vstack` operations.
        - Diagonal: Finds a union between the column schemas and fills missing column values with null.
        - Horizontal: Stacks Series horizontally and fills with nulls if the lengths don't match.

    Examples
    --------
    >>> df1 = pl.DataFrame({"a": [1], "b": [3]})
    >>> df2 = pl.DataFrame({"a": [2], "b": [4]})
    >>> pl.concat([df1, df2])
    shape: (2, 2)
    ┌─────┬─────┐
    │ a   ┆ b   │
    │ --- ┆ --- │
    │ i64 ┆ i64 │
    ╞═════╪═════╡
    │ 1   ┆ 3   │
    ├╌╌╌╌╌┼╌╌╌╌╌┤
    │ 2   ┆ 4   │
    └─────┴─────┘

    """
    if not len(items) > 0:
        raise ValueError("cannot concat empty list")

    out: pli.Series | pli.DataFrame | pli.LazyFrame | pli.Expr
    first = items[0]
    if isinstance(first, pli.DataFrame):
        if how == "vertical":
            out = pli.wrap_df(_concat_df(items))
        elif how == "diagonal":
            out = pli.wrap_df(_diag_concat_df(items))
        elif how == "horizontal":
            out = pli.wrap_df(_hor_concat_df(items))
        else:
            raise ValueError(
                f"how must be one of {{'vertical', 'diagonal', 'horizontal'}}, got {how}"
            )
    elif isinstance(first, pli.LazyFrame):
        return pli.wrap_ldf(_concat_lf(items, rechunk))
    elif isinstance(first, pli.Series):
        out = pli.wrap_s(_concat_series(items))
    elif isinstance(first, pli.Expr):
        out = first
        for e in items[1:]:
            out = out.append(e)  # type: ignore[arg-type]
    else:
        raise ValueError(f"did not expect type: {type(first)} in 'pl.concat'.")
    if rechunk:
        return out.rechunk()
    return out
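
# Illustrative usage of the other `how` strategies (a sketch, not part of the library
# source); the frames and column names below are hypothetical.
#
# >>> df1 = pl.DataFrame({"a": [1], "b": [3]})
# >>> df3 = pl.DataFrame({"a": [2], "c": [5]})
# >>> pl.concat([df1, df3], how="diagonal")  # union of schemas; missing values become null
# >>> pl.concat([df1, df3.rename({"a": "d"})], how="horizontal")  # columns stacked side by side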
def collect_all(
    lazy_frames: list[pli.LazyFrame],
    type_coercion: bool = True,
    predicate_pushdown: bool = True,
    projection_pushdown: bool = True,
    simplify_expression: bool = True,
    string_cache: bool = False,
    no_optimization: bool = False,
    slice_pushdown: bool = False,
) -> list[pli.DataFrame]:
    """
    Collect multiple LazyFrames at the same time. This runs all the computation
    graphs in parallel on the Polars threadpool.

    Parameters
    ----------
    lazy_frames
        LazyFrames to collect.
    type_coercion
        Do type coercion optimization.
    predicate_pushdown
        Do predicate pushdown optimization.
    projection_pushdown
        Do projection pushdown optimization.
    simplify_expression
        Run simplify expressions optimization.
    string_cache
        Use a global string cache in this query.
        This is needed if you want to join on categorical columns.

        Caution!
            If you have already set a global string cache, set this to `False` as
            this will reset the global cache when the query is finished.
    no_optimization
        Turn off optimizations.
    slice_pushdown
        Run slice pushdown optimization.

    Returns
    -------
    List[DataFrame]
    """
    if no_optimization:
        predicate_pushdown = False
        projection_pushdown = False
        slice_pushdown = False

    prepared = []

    for lf in lazy_frames:
        ldf = lf._ldf.optimization_toggle(
            type_coercion,
            predicate_pushdown,
            projection_pushdown,
            simplify_expression,
            string_cache,
            slice_pushdown,
        )
        prepared.append(ldf)

    out = _collect_all(prepared)

    # wrap the PyDataFrames into DataFrames
    result = [pli.wrap_df(pydf) for pydf in out]

    return result
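
# Illustrative usage of `collect_all` (a sketch, not part of the library source):
# materialize several lazy queries in one call so they share the thread pool. The
# frames and columns below are hypothetical.
#
# >>> import polars as pl
# >>> lf1 = pl.DataFrame({"a": [1, 2, 3]}).lazy().select(pl.col("a").sum())
# >>> lf2 = pl.DataFrame({"b": [1.0, 2.0]}).lazy().select(pl.col("b").mean())
# >>> dfs = pl.collect_all([lf1, lf2])  # returns a list of DataFrames, one per LazyFrame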