def arrow_to_pydf(
    data: "pa.Table",
    columns: Optional[Sequence[str]] = None,
    rechunk: bool = True,
) -> "PyDataFrame":
    """
    Build a PyDataFrame out of an Arrow Table.

    Parameters
    ----------
    data
        The Arrow Table to convert.
    columns
        Optional replacement column names; applied with
        ``Table.rename_columns`` before conversion.
    rechunk
        Forwarded to ``arrow_to_pyseries`` for dictionary columns, and, when
        true, the assembled frame is rechunked as well.

    Returns
    -------
    The PyDataFrame holding the converted columns in their original order.

    Raises
    ------
    ImportError
        If pyarrow is not available.
    ValueError
        If ``columns`` cannot be applied to ``data`` (pyarrow raises
        ``ArrowInvalid``).
    """
    if not _PYARROW_AVAILABLE:  # pragma: no cover
        raise ImportError(
            "'pyarrow' is required when constructing a PyDataFrame from an Arrow Table."
        )

    if columns is not None:
        try:
            data = data.rename_columns(columns)
        except pa.lib.ArrowInvalid as e:
            raise ValueError(
                "Dimensions of columns arg must match data dimensions."
            ) from e

    # Dictionary (categorical) columns cannot be built from separate batches,
    # so they are converted one by one into Series and attached to the frame
    # afterwards; every other column goes through the regular batch path.
    plain_columns = {}
    dictionary_series = {}
    ordered_names = []

    for idx, arrow_col in enumerate(data):
        # The name has to be read before coercion.
        col_name = arrow_col._name if arrow_col._name is not None else f"column_{idx}"
        ordered_names.append(col_name)

        arrow_col = coerce_arrow(arrow_col)
        if not pa.types.is_dictionary(arrow_col.type):
            plain_columns[col_name] = arrow_col
            continue
        pyseries = arrow_to_pyseries(col_name, arrow_col, rechunk)
        dictionary_series[idx] = pli.wrap_s(pyseries)

    if not plain_columns:
        pydf = pli.DataFrame([])._df
    else:
        table = pa.table(plain_columns)
        if table.shape[0] == 0:
            # Zero-row tables take the pandas route so the dtypes are kept.
            pydf = pli.DataFrame._from_pandas(table.to_pandas())._df
        else:
            pydf = PyDataFrame.from_arrow_record_batches(table.to_batches())

    if rechunk:
        pydf = pydf.rechunk()

    if dictionary_series:
        frame = pli.wrap_df(pydf)
        for series in dictionary_series.values():
            frame[series.name] = series
        # Put the columns back into the order of the incoming table.
        frame = frame[ordered_names]
        pydf = frame._df

    return pydf
def concat(
    items: (
        Sequence[pli.DataFrame]
        | Sequence[pli.Series]
        | Sequence[pli.LazyFrame]
        | Sequence[pli.Expr]
    ),
    rechunk: bool = True,
    how: str = "vertical",
) -> pli.DataFrame | pli.Series | pli.LazyFrame | pli.Expr:
    """
    Combine a sequence of DataFrames, Series, LazyFrames or Exprs into one.

    The type of the first item decides how the whole sequence is combined.

    Parameters
    ----------
    items
        DataFrames/Series/LazyFrames/Exprs to concatenate.
    rechunk
        rechunk the final DataFrame/Series.
    how
        Only used if the items are DataFrames.

        One of {"vertical", "diagonal", "horizontal"}.

        - Vertical: Applies multiple `vstack` operations.
        - Diagonal: Finds a union between the column schemas and fills missing
          column values with null.
        - Horizontal: Stacks Series horizontally and fills with nulls if the
          lengths don't match.

    Returns
    -------
    An object of the same kind as the first item in ``items``.

    Raises
    ------
    ValueError
        If ``items`` is empty, if ``how`` is not a supported strategy for
        DataFrames, or if the first item has an unsupported type.

    Examples
    --------
    >>> df1 = pl.DataFrame({"a": [1], "b": [3]})
    >>> df2 = pl.DataFrame({"a": [2], "b": [4]})
    >>> pl.concat([df1, df2])
    shape: (2, 2)
    ┌─────┬─────┐
    │ a   ┆ b   │
    │ --- ┆ --- │
    │ i64 ┆ i64 │
    ╞═════╪═════╡
    │ 1   ┆ 3   │
    ├╌╌╌╌╌┼╌╌╌╌╌┤
    │ 2   ┆ 4   │
    └─────┴─────┘

    """
    if len(items) == 0:
        raise ValueError("cannot concat empty list")

    out: pli.Series | pli.DataFrame | pli.LazyFrame | pli.Expr
    first = items[0]

    if isinstance(first, pli.DataFrame):
        if how == "vertical":
            out = pli.wrap_df(_concat_df(items))
        elif how == "diagonal":
            out = pli.wrap_df(_diag_concat_df(items))
        elif how == "horizontal":
            out = pli.wrap_df(_hor_concat_df(items))
        else:
            # List every strategy that is actually accepted above.
            raise ValueError(
                f"how should be one of {'vertical', 'diagonal', 'horizontal'}, got {how}"
            )
    elif isinstance(first, pli.LazyFrame):
        # The lazy concat receives `rechunk` directly, so return right away.
        return pli.wrap_ldf(_concat_lf(items, rechunk))
    elif isinstance(first, pli.Series):
        out = pli.wrap_s(_concat_series(items))
    elif isinstance(first, pli.Expr):
        out = first
        for e in items[1:]:
            out = out.append(e)  # type: ignore[arg-type]
    else:
        raise ValueError(f"did not expect type: {type(first)} in 'pl.concat'.")

    if rechunk:
        return out.rechunk()
    return out
def date_range(
    low: date | datetime,
    high: date | datetime,
    interval: str | timedelta,
    closed: str | None = "both",
    name: str | None = None,
    time_unit: str | None = None,
) -> pli.Series:
    """
    Build a Series holding a range of `Datetime` (or `Date`) values.

    Parameters
    ----------
    low
        Lower bound of the date range.
    high
        Upper bound of the date range.
    interval
        Step between consecutive values. Either a python timedelta object,
        like ``timedelta(days=10)``, or a polars duration string, such as
        ``3d12h4m25s`` representing 3 days, 12 hours, 4 minutes, and 25 seconds.
    closed : {None, 'left', 'right', 'both', 'none'}
        Make the interval closed to the 'left', 'right', 'none' or 'both' sides.
    name
        Name of the output Series; an empty name is used when omitted.
    time_unit : {'ns', 'us', 'ms'}
        Set the time unit. When omitted, 'ns' is used if both bounds fall in
        the nanosecond window, otherwise 'ms'.

    Notes
    -----
    If both ``low`` and ``high`` are passed as date types (not datetime), and
    the interval granularity is no finer than 1d, the returned range is also
    of type date. All other permutations return a datetime Series.

    Returns
    -------
    A Series of type `Datetime` or `Date`.

    Examples
    --------
    Using polars duration string to specify the interval:

    >>> from datetime import date
    >>> pl.date_range(date(2022, 1, 1), date(2022, 3, 1), "1mo", name="drange")
    shape: (3,)
    Series: 'drange' [date]
    [
        2022-01-01
        2022-02-01
        2022-03-01
    ]

    Using `timedelta` object to specify the interval:

    >>> from datetime import datetime, timedelta
    >>> pl.date_range(
    ...     datetime(1985, 1, 1),
    ...     datetime(1985, 1, 10),
    ...     timedelta(days=1, hours=12),
    ...     time_unit="ms",
    ... )
    shape: (7,)
    Series: '' [datetime[ms]]
    [
        1985-01-01 00:00:00
        1985-01-02 12:00:00
        1985-01-04 00:00:00
        1985-01-05 12:00:00
        1985-01-07 00:00:00
        1985-01-08 12:00:00
        1985-01-10 00:00:00
    ]

    """
    if isinstance(interval, timedelta):
        interval = _timedelta_to_pl_duration(interval)

    low_dt, low_is_date = _ensure_datetime(low)
    high_dt, high_is_date = _ensure_datetime(high)

    # An explicit time unit always wins; otherwise the bounds decide between
    # nanosecond and millisecond precision.
    fits_ns = in_nanoseconds_window(low_dt) and in_nanoseconds_window(high_dt)
    if time_unit is not None:
        unit = time_unit
    elif fits_ns:
        unit = "ns"
    else:
        unit = "ms"

    start = _datetime_to_pl_timestamp(low_dt, unit)
    stop = _datetime_to_pl_timestamp(high_dt, unit)
    series_name = "" if name is None else name

    result = pli.wrap_s(
        _py_date_range(start, stop, interval, closed, series_name, unit)
    )

    # Pure date bounds with a granularity coarser than hours/minutes/seconds
    # yield a Date Series instead of a Datetime one.
    if low_is_date and high_is_date:
        granularity = _interval_granularity(interval)
        if not granularity.endswith(("h", "m", "s")):
            result = result.cast(Date)

    return result
def date_range(
    low: datetime,
    high: datetime,
    interval: Union[str, timedelta],
    closed: Optional[str] = "both",
    name: Optional[str] = None,
    time_unit: Optional[str] = None,
) -> "pli.Series":
    """
    Build a Series holding a range of `Datetime` values.

    Parameters
    ----------
    low
        Lower bound of the date range
    high
        Upper bound of the date range
    interval
        Step between consecutive values.
        A python timedelta object or a polars duration `str`
        e.g.: "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
    closed {None, 'left', 'right', 'both', 'none'}
        Make the interval closed to the 'left', 'right', 'none' or 'both' sides.
    name
        Name of the output Series; an empty name is used when omitted.
    time_unit
        Set the time unit; one of {'ns', 'ms'}. When omitted, 'ns' is used if
        both bounds fall in the nanosecond window, otherwise 'ms'.

    Returns
    -------
    A Series of type `Datetime`

    Examples
    --------
    >>> from datetime import datetime
    >>> pl.date_range(datetime(1985, 1, 1), datetime(2015, 7, 1), "1d12h")
    shape: (7426,)
    Series: '' [datetime[ns]]
    [
        1985-01-01 00:00:00
        1985-01-02 12:00:00
        1985-01-04 00:00:00
        1985-01-05 12:00:00
        1985-01-07 00:00:00
        1985-01-08 12:00:00
        1985-01-10 00:00:00
        1985-01-11 12:00:00
        1985-01-13 00:00:00
        1985-01-14 12:00:00
        1985-01-16 00:00:00
        1985-01-17 12:00:00
        ...
        2015-06-14 00:00:00
        2015-06-15 12:00:00
        2015-06-17 00:00:00
        2015-06-18 12:00:00
        2015-06-20 00:00:00
        2015-06-21 12:00:00
        2015-06-23 00:00:00
        2015-06-24 12:00:00
        2015-06-26 00:00:00
        2015-06-27 12:00:00
        2015-06-29 00:00:00
        2015-06-30 12:00:00
    ]

    """
    if isinstance(interval, timedelta):
        interval = _timedelta_to_pl_duration(interval)

    # An explicit time unit always wins; otherwise the bounds decide between
    # nanosecond and millisecond precision.
    fits_ns = in_nanoseconds_window(low) and in_nanoseconds_window(high)
    if time_unit is not None:
        unit = time_unit
    elif fits_ns:
        unit = "ns"
    else:
        unit = "ms"

    start = _datetime_to_pl_timestamp(low, unit)
    stop = _datetime_to_pl_timestamp(high, unit)
    series_name = "" if name is None else name

    py_series = _py_date_range(start, stop, interval, closed, series_name, unit)
    return pli.wrap_s(py_series)