Example #1
    def _arrow_table_to_pandas(
        cls, arrow_table: pa.Table, categories, **kwargs
    ) -> pd.DataFrame:
        # Merge any caller-supplied Arrow-to-pandas options, then force
        # single-threaded conversion and keep the pandas schema metadata.
        _kwargs = kwargs.get("arrow_to_pandas", {})
        _kwargs.update({"use_threads": False, "ignore_metadata": False})

        return arrow_table.to_pandas(categories=categories, **_kwargs)
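The merge is a plain dict update, so the forced flags always win over the caller's; a minimal sketch against plain pyarrow (the caller's option dict below is hypothetical):

import pyarrow as pa

table = pa.table({"a": [1, 2, 3]})
user_opts = {"arrow_to_pandas": {"use_threads": True, "split_blocks": True}}
# Same merge as in _arrow_table_to_pandas: user options first, forced flags last
opts = user_opts.get("arrow_to_pandas", {})
opts.update({"use_threads": False, "ignore_metadata": False})
df = table.to_pandas(categories=None, **opts)  # split_blocks survives; use_threads is forced off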
Example #2
    def arrow_to_pandas(cls, table: pa.Table) -> pd.DataFrame:
        return table.to_pandas(
            ignore_metadata=True,  # noqa
            date_as_object=False,  # noqa
            timestamp_as_object=False,  # noqa
            types_mapper=cls.__ARROW_TO_PANDAS_TYPE_MAPPING.get)
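The class-level __ARROW_TO_PANDAS_TYPE_MAPPING dict is not part of the snippet; a minimal sketch of what such a mapping could look like, assuming it targets pandas' nullable extension dtypes (.get returns None for unmapped types, so pyarrow falls back to its defaults):

import pyarrow as pa
import pandas as pd

# Hypothetical contents: Arrow types as keys, pandas extension dtypes as values
__ARROW_TO_PANDAS_TYPE_MAPPING = {
    pa.int64(): pd.Int64Dtype(),
    pa.bool_(): pd.BooleanDtype(),
    pa.string(): pd.StringDtype(),
}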
Example #3
def _arrowtable2df(
    table: pa.Table,
    categories: Optional[List[str]],
    safe: bool,
    use_threads: bool,
    dataset: bool,
    path: str,
    path_root: Optional[str],
) -> pd.DataFrame:
    df: pd.DataFrame = _apply_partitions(
        df=table.to_pandas(
            use_threads=use_threads,
            split_blocks=True,
            self_destruct=True,
            integer_object_nulls=False,
            date_as_object=True,
            ignore_metadata=True,
            categories=categories,
            safe=safe,
            types_mapper=_data_types.pyarrow2pandas_extension,
        ),
        dataset=dataset,
        path=path,
        path_root=path_root,
    )
    return _utils.ensure_df_is_mutable(df=df)
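split_blocks and self_destruct trade safety for memory; a minimal sketch of the underlying pyarrow pattern, independent of the helper above:

import pyarrow as pa

table = pa.table({"x": [1.0, 2.0, 3.0]})
# self_destruct=True lets to_pandas release Arrow buffers as it converts, and
# split_blocks=True emits one pandas block per column so more columns can be
# zero-copy. The table must not be touched again after this call.
df = table.to_pandas(split_blocks=True, self_destruct=True)
del table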
Example #4
def _arrowtable2df(
    table: pa.Table,
    categories: Optional[List[str]],
    safe: bool,
    use_threads: bool,
    dataset: bool,
    path: str,
    path_root: Optional[str],
) -> pd.DataFrame:
    metadata: Dict[str, Any] = {}
    if table.schema.metadata is not None and b"pandas" in table.schema.metadata:
        metadata = json.loads(table.schema.metadata[b"pandas"])
    df: pd.DataFrame = _apply_partitions(
        df=table.to_pandas(
            use_threads=use_threads,
            split_blocks=True,
            self_destruct=True,
            integer_object_nulls=False,
            date_as_object=True,
            ignore_metadata=True,
            strings_to_categorical=False,
            safe=safe,
            categories=categories,
            types_mapper=_data_types.pyarrow2pandas_extension,
        ),
        dataset=dataset,
        path=path,
        path_root=path_root,
    )
    df = _utils.ensure_df_is_mutable(df=df)
    if metadata:
        _logger.debug("metadata: %s", metadata)
        df = _apply_index(df=df, metadata=metadata)
        df = _apply_timezone(df=df, metadata=metadata)
    return df
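The b"pandas" schema metadata read above is the JSON blob pyarrow itself writes when a table is built from a DataFrame; a small round trip showing where it comes from:

import json

import pandas as pd
import pyarrow as pa

src = pd.DataFrame({"a": [1, 2]}, index=pd.Index([10, 20], name="idx"))
table = pa.Table.from_pandas(src)
meta = json.loads(table.schema.metadata[b"pandas"])
print(meta["index_columns"])  # ['idx'] -- what helpers like _apply_index consume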
Example #5
def test_write_pandas(tmp_path: pathlib.Path, sample_data: pa.Table):
    # When timestamp is converted to Pandas, it gets cast to ns resolution,
    # but Delta Lake schemas only support us resolution.
    sample_pandas = sample_data.to_pandas().drop(["timestamp"], axis=1)
    write_deltalake(str(tmp_path), sample_pandas)

    delta_table = DeltaTable(str(tmp_path))
    df = delta_table.to_pandas()
    assert_frame_equal(df, sample_pandas)
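Whether the nanosecond cast mentioned in the comment actually happens depends on the pyarrow and pandas versions in play; a quick probe:

import datetime

import pyarrow as pa

table = pa.table({"ts": pa.array([datetime.datetime(2024, 1, 1)], type=pa.timestamp("us"))})
df = table.to_pandas()
print(df["ts"].dtype)  # datetime64[ns] on older stacks; newer pandas can keep datetime64[us]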
Example #6
def _table2df(table: pa.Table,
              categories: Optional[List[str]] = None,
              use_threads: bool = True) -> pd.DataFrame:
    return table.to_pandas(
        use_threads=use_threads,
        split_blocks=True,
        self_destruct=True,
        integer_object_nulls=False,
        date_as_object=True,
        ignore_metadata=True,
        categories=categories,
        types_mapper=_data_types.pyarrow2pandas_extension,
    )
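The categories argument converts the named columns to pandas Categorical on the way out:

import pyarrow as pa

table = pa.table({"c": ["x", "y", "x"]})
df = table.to_pandas(categories=["c"])
print(df["c"].dtype)  # category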
Example #7
def _write_partitioned_table_from_source(
    column_names: List[str],
    table: pa.Table,
    feature_table_date_partition_column: str,
    feature_table_timestamp_column: str,
) -> str:
    """
    Partitions dataset by date based on timestamp_column.
    Assumes date_partition_column is in date format if provided.

    Args:
        column_names: Column names in provided ingestion source
        table: PyArrow table of Dataset
        feature_table_date_partition_column: Date-partition column of FeatureTable
        feature_table_timestamp_column: Timestamp column of FeatureTable
    Returns:
        str:
            Root directory which contains date partitioned files.
    """
    dir_path = tempfile.mkdtemp()

    # Case: date_partition_column is provided and dataset does not contain it
    if feature_table_date_partition_column not in column_names:
        df = table.to_pandas()
        df[feature_table_date_partition_column] = df[
            feature_table_timestamp_column
        ].dt.date
        table = pa.Table.from_pandas(df)

    pq.write_to_dataset(
        table=table,
        root_path=dir_path,
        partition_cols=[feature_table_date_partition_column],
    )

    # Remove table from memory
    del table

    return dir_path
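The resulting directory layout is Hive-style key=value partitioning; a minimal sketch of the same derive-then-partition flow with hypothetical column names:

import tempfile

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

df = pd.DataFrame({"ts": pd.to_datetime(["2024-01-01 10:00", "2024-01-02 11:00"])})
df["date"] = df["ts"].dt.date  # same .dt.date derivation as above
out = tempfile.mkdtemp()
pq.write_to_dataset(table=pa.Table.from_pandas(df), root_path=out, partition_cols=["date"])
# out/ now holds date=2024-01-01/... and date=2024-01-02/... subdirectories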
Example #8
    def _handle_table_dataframe(
        table: pa.Table,
        mappings: Optional[Dict],
        raise_on_empty: bool = True,
        sort_columns: Optional[List] = None,
        as_type: Optional[Dict] = None,
    ):
        df = table.to_pandas().drop_duplicates()
        # mappings is Optional, so guard against None before remapping columns
        for col, mapping in (mappings or {}).items():
            df.loc[:, col] = df[col].map(mapping)

        if df.empty and raise_on_empty:
            # filter_expr, instrument_ids, start and end belong to the enclosing
            # scope in the original source; .get() keeps this excerpt from
            # raising a KeyError when those names are absent
            local_vars = dict(locals())
            kw = [
                f"{k}={local_vars.get(k)}"
                for k in ("filter_expr", "instrument_ids", "start", "end")
            ]
            raise ValueError(f"Data empty for {kw}")
        if sort_columns:
            df = df.sort_values(sort_columns)
        if as_type:
            df = df.astype(as_type)
        return df
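A hypothetical top-level call against the helper above, remapping raw codes through mappings:

import pyarrow as pa

table = pa.table({"side": ["B", "S", "B"]})
df = _handle_table_dataframe(table, mappings={"side": {"B": "BUY", "S": "SELL"}})
print(df["side"].tolist())  # ['BUY', 'SELL'] -- duplicates dropped before mapping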
Example #9
    def extract_column(self, pa_table: pa.Table) -> np.ndarray:
        series = pa_table.to_pandas(
            types_mapper=pandas_types_mapper)[pa_table.column_names[0]]
        return self._series_to_numpy(series)
Example #10
def convert_table_to_df(table: pa.Table) -> pd.DataFrame:
    return table.to_pandas(integer_object_nulls=True)
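Without integer_object_nulls=True, an integer column containing nulls is promoted to float64; with it, the values stay Python ints with None in the gaps:

import pyarrow as pa

table = pa.table({"n": pa.array([1, None, 3], type=pa.int64())})
print(table.to_pandas()["n"].tolist())                           # [1.0, nan, 3.0]
print(table.to_pandas(integer_object_nulls=True)["n"].tolist())  # [1, None, 3]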
Example #11
def print_expected(expected: Table):
    print('==================== EXPECTED ========================')
    print(expected.to_pandas())
Example #12
def print_actual(actual: Table):
    print('==================== ACTUAL ==========================')
    print(actual.to_pandas())
Example #13
    def extract_batch(self, pa_table: pa.Table) -> pd.DataFrame:
        return pa_table.to_pandas(types_mapper=pandas_types_mapper)
Example #14
    def extract_column(self, pa_table: pa.Table) -> pd.Series:
        return pa_table.to_pandas(
            types_mapper=pandas_types_mapper)[pa_table.column_names[0]]
Example #15
    def extract_batch(self, pa_table: pa.Table) -> dict:
        df = pa_table.to_pandas(types_mapper=pandas_types_mapper)
        return {k: self._series_to_numpy(v) for k, v in df.items()}
Example #16
def m_o(engine: NativeExecutionEngine, df: pa.Table) -> None:
    assert 1 == df.to_pandas().shape[0]
Example #17
def convert_table_to_df(table: pa.Table) -> pd.DataFrame:
    try:
        return table.to_pandas(integer_object_nulls=True)
    except pa.lib.ArrowInvalid:
        return table.to_pandas(integer_object_nulls=True,
                               timestamp_as_object=True)
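The fallback guards against timestamps that overflow the datetime64[ns] range on stacks that coerce to nanoseconds; a sketch of the pattern it enables:

import datetime

import pyarrow as pa

table = pa.table({"ts": pa.array([datetime.datetime(3000, 1, 1)], type=pa.timestamp("us"))})
# Year 3000 does not fit in datetime64[ns]; where the plain conversion raises
# pa.lib.ArrowInvalid, timestamp_as_object=True keeps Python datetime objects.
df = table.to_pandas(timestamp_as_object=True)
print(type(df["ts"][0]))  # <class 'datetime.datetime'>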
Example #18
def __arrow_to_pandas(table: pyarrow.Table) -> pd.DataFrame:
    return table.to_pandas(
        date_as_object=False, deduplicate_objects=True,
        ignore_metadata=True)  # TODO ensure dictionaries stay dictionaries
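With deduplicate_objects enabled (the default in recent pyarrow), equal strings share a single Python object after conversion, which can cut memory sharply for repetitive text:

import pyarrow as pa

table = pa.table({"s": ["repeated"] * 3})
df = table.to_pandas(deduplicate_objects=True)
print(df["s"][0] is df["s"][1])  # True: one shared str object rather than three copies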