Ejemplo n.º 1
0
class PandasBackend(DataBackend):
    """Data backend that keeps its contents in a pandas ``DataFrame``.

    The wrapped frame lives in ``_data`` and its index wrapper in ``_index``.
    Nearly every operation delegates to pandas and wraps the result in a new
    ``PandasBackend``.
    """

    _data: DataFrame  # the wrapped pandas frame
    _index: PandasIndex  # wrapper around the frame's index
    _loc: _LocIndexer  # label-based indexer bound to this backend
    _iloc: _ILocIndexer  # position-based indexer bound to this backend

    def __init__(
        self,
        data: Optional[Union[Series, DataFrame, dict[str, list]]] = None,
        index: Optional[PandasIndex] = None,
    ) -> None:
        """Build the backend from ``data`` and an optional ``index``.

        Args:
            data: ``None`` (or an empty dict) gives an empty object-dtype
                frame; a ``Series`` becomes a single-row frame; a
                ``DataFrame`` is re-wrapped with ``DataFrame(data)``; a dict
                whose first value is a scalar or string becomes a single
                row, any other dict is passed to ``DataFrame`` as columns.
            index: Index to install on the frame. When omitted, the frame's
                own index is named ``"index"`` and wrapped with no columns.

        Raises:
            ValueError: If ``data`` is none of the supported types.
        """
        if data is None:
            self._data = DataFrame(dtype="object")
        elif isinstance(data, Series):
            self._data = data.to_frame().transpose()
        elif isinstance(data, DataFrame):
            self._data = DataFrame(data)
        elif isinstance(data, dict):
            if not data:
                # An empty dict has no value to sample; treat it like None
                # instead of letting next() raise StopIteration.
                self._data = DataFrame(dtype="object")
            else:
                # One value is enough to tell "row of scalars" apart from
                # "columns of sequences"; strings count as scalars.
                sample_value = next(iter(data.values()))
                is_scalar = (not isinstance(sample_value, Iterable)
                             or isinstance(sample_value, str))
                if is_scalar:
                    self._data = Series(data).to_frame().transpose()
                else:
                    self._data = DataFrame(data)
        else:
            raise ValueError(
                f"Received unexpected value type {type(data)}: {data}")

        if index is None:
            self._data.index.name = "index"
            self._index = PandasIndex(self._data.index, [])
        else:
            if not isinstance(index, PandasIndex):
                # NOTE(review): every other call site passes a column list
                # as second argument; confirm PandasIndex accepts one arg.
                index = PandasIndex(index)
            self._data.index = index._data
            self._index = index
        self._loc = _LocIndexer(self)
        self._iloc = _ILocIndexer(self)

    @staticmethod
    def _unwrap_operand(other: Any) -> Any:
        """Return the wrapped frame of a ``PandasBackend``, else ``other``."""
        return other._data if isinstance(other, PandasBackend) else other

    def is_link(self) -> bool:
        """Always ``False`` for this backend."""
        return False

    def link_token(self) -> Optional[DataToken]:
        """Always ``None`` for this backend."""
        return None

    def to_pandas(self) -> DataFrame:
        """Return the wrapped frame itself (not a copy)."""
        return self._data

    @property
    def columns(self) -> list[str]:
        """Column labels of the wrapped frame as a list."""
        return self._data.columns.tolist()

    @property
    def values(self) -> np.ndarray:
        """Frame values with a length-1 axis squeezed away.

        A single-column frame loses axis 1; otherwise a single-row frame
        loses axis 0; anything else is returned as the 2-D array.
        """
        data_values = self._data.values
        n_rows, n_cols = data_values.shape[0], data_values.shape[1]
        if n_cols == 1:
            return np.squeeze(data_values, axis=1)
        if n_rows == 1:
            return np.squeeze(data_values, axis=0)
        return data_values

    @property
    def dtypes(self) -> dict[str, DataType]:
        """Map each column label to a ``DataType`` built from its dtype."""
        return {
            col: DataType(dtype)
            for col, dtype in self._data.dtypes.items()
        }

    def cast_columns(self, column_dtypes: dict[str, type]) -> PandasBackend:
        """Return a new backend with columns cast via ``DataFrame.astype``.

        ``errors="ignore"`` is passed through, so failed casts do not raise.
        """
        return PandasBackend(self._data.astype(column_dtypes, errors="ignore"))

    def to_dict(self) -> dict[str, Any]:
        """Return the frame as a ``{column: list_of_values}`` dict."""
        return self._data.to_dict("list")

    @property
    def index(self) -> Index:
        """The index wrapper held by this backend."""
        return self._index

    @property
    def index_name(self) -> Union[str, list[str]]:
        """Name of the wrapped frame's index."""
        return self._data.index.name

    @property
    def loc(self: PandasBackend) -> LocIndexer[PandasBackend]:
        """Label-based indexer created in ``__init__``."""
        return self._loc

    @property
    def iloc(self: PandasBackend) -> ILocIndexer[PandasBackend]:
        """Position-based indexer created in ``__init__``."""
        return self._iloc

    def equals(self, other: PandasBackend) -> bool:
        """Whole-object equality: same values array and equal index.

        Only exact ``PandasBackend`` instances can compare equal; any other
        type (including subclasses) yields ``False``.
        """
        if type(other) is not PandasBackend:
            return False
        same_values = np.array_equal(self._data.values, other._data.values)
        return same_values and self._index.equals(other._index)

    def __eq__(self, other: Any) -> DataFrame:
        """Element-wise ``==``; returns the pandas comparison result.

        Overriding ``__eq__`` without ``__hash__`` makes instances
        unhashable.
        """
        return self._data == self._unwrap_operand(other)

    def __ne__(self, other: Any) -> DataFrame:
        """Element-wise ``!=`` against ``other`` (unwrapped if a backend)."""
        return self._data != self._unwrap_operand(other)

    def __gt__(self, other: Any) -> DataFrame:
        """Element-wise ``>`` against ``other`` (unwrapped if a backend)."""
        return self._data > self._unwrap_operand(other)

    def __ge__(self, other: Any) -> DataFrame:
        """Element-wise ``>=`` against ``other`` (unwrapped if a backend)."""
        return self._data >= self._unwrap_operand(other)

    def __lt__(self, other: Any) -> DataFrame:
        """Element-wise ``<`` against ``other`` (unwrapped if a backend)."""
        return self._data < self._unwrap_operand(other)

    def __le__(self, other: Any) -> DataFrame:
        """Element-wise ``<=`` against ``other`` (unwrapped if a backend)."""
        return self._data <= self._unwrap_operand(other)

    def __len__(self) -> int:
        """Number of rows in the wrapped frame."""
        return len(self._data)

    def __iter__(self) -> Generator[str, None, None]:
        """Iterate the wrapped frame, i.e. its column labels."""
        return iter(self._data)

    def iterrows(self) -> Generator[tuple[int, PandasBackend], None, None]:
        """Yield ``(index_label, single_row_backend)`` for each row."""
        for i, row in self._data.iterrows():
            yield (i, PandasBackend(row.to_frame().transpose()))

    def itertuples(self, ignore_index: bool = False):
        """Yield pandas row tuples; the index is omitted if ``ignore_index``."""
        yield from self._data.itertuples(index=not ignore_index)

    def __getitem__(self, item: str) -> Any:
        """Return column ``item`` as a single-column backend."""
        return PandasBackend(self._data[item].to_frame())

    def getitems(self, items: list[str]) -> PandasBackend:
        """Return a backend restricted to the columns in ``items``."""
        return PandasBackend(self._data[items])

    def getmask(self, mask: list[bool]) -> PandasBackend:
        """Return a backend with the frame indexed by ``mask``."""
        return PandasBackend(self._data[mask])

    def query(self, query: "Query") -> PandasBackend:
        """Compile ``query`` against the frame and index the frame with it."""
        # Imported locally in the original; kept that way.
        from tanuki.database.adapter.query.pandas_query_compiler import PandasQueryCompiler

        query_compiler = PandasQueryCompiler(self._data)
        compiled = query_compiler.compile(query)
        return PandasBackend(self._data[compiled])

    def __setitem__(self, items: str, value: Any) -> None:
        """Assign ``value`` to ``items`` in place, unwrapping backend values."""
        if isinstance(value, PandasBackend):
            value = value._data
        self._data[items] = value

    def get_index(self, index_alias: IndexAlias) -> Index:
        """Build a ``PandasIndex`` from the alias's columns.

        The wrapped frame is not modified; ``DataFrame.set_index`` returns a
        new frame whose index is named after the alias.
        """
        cols = [str(col) for col in index_alias.columns]
        new_data = self._data.set_index(cols)
        new_data.index.name = index_alias.name
        return PandasIndex(new_data.index, cols)

    def set_index(self, index: Union[Index, IndexAlias]) -> PandasBackend:
        """Return a new backend indexed by the columns of ``index``."""
        cols = [str(col) for col in index.columns]
        new_data = self._data.set_index(cols)
        new_data.index.name = index.name
        new_index = PandasIndex(new_data.index, cols)
        return PandasBackend(new_data, new_index)

    def reset_index(self: PandasBackend) -> PandasBackend:
        """Return a new backend with the index dropped and renamed ``"index"``."""
        new_data = self._data.reset_index(drop=True)
        new_data.index.name = "index"
        new_index = PandasIndex(new_data.index, [])
        return PandasBackend(new_data, new_index)

    def append(
        self: PandasBackend,
        new_backend: PandasBackend,
        ignore_index: bool = False,
    ) -> PandasBackend:
        """Return a new backend with ``new_backend``'s rows after this one's.

        Uses ``pd.concat`` because ``DataFrame.append`` was deprecated and
        later removed from pandas.
        """
        combined = pd.concat([self._data, new_backend._data],
                             ignore_index=ignore_index)
        return PandasBackend(combined)

    def drop_indices(self: PandasBackend, indices: list[int]) -> PandasBackend:
        """Return a new backend without the rows labelled by ``indices``."""
        return PandasBackend(self._data.drop(indices))

    @classmethod
    def concat(
        cls: type[PandasBackend],
        all_backends: list[PandasBackend],
        ignore_index: bool = False,
    ) -> PandasBackend:
        """Concatenate the frames of ``all_backends`` into one backend."""
        all_data = [backend._data for backend in all_backends]
        return PandasBackend(pd.concat(all_data, ignore_index=ignore_index))

    def nunique(self) -> int:
        """Delegate to ``DataFrame.nunique``.

        NOTE(review): pandas documents that call as returning a per-column
        Series, which does not match the ``int`` annotation; confirm.
        """
        return self._data.nunique()

    def __str__(self) -> str:
        """String form of the wrapped frame."""
        return str(self._data)

    def __repr__(self) -> str:
        """Same text as ``str(self)``."""
        return str(self)
Ejemplo n.º 2
0
class TestPandasIndex:
    """Tests for ``PandasIndex`` built over the integer index ``[0, 1, 2]``.

    Every test uses the fixture from ``setup_method``: a pandas index named
    ``"index"`` with the column list ``["a", "b"]``.
    """

    def setup_method(self) -> None:
        """Create a fresh three-element index before each test."""
        self.index = PandasIndex(PIndex(np.arange(0, 3), name="index"), ["a", "b"])

    def test_name(self) -> None:
        """The wrapper reports the pandas index name."""
        assert_that(self.index.name, equal_to("index"))

    def test_columns(self) -> None:
        """The wrapper reports the column list it was built with."""
        assert_that(self.index.columns, equal_to(["a", "b"]))

    def test_to_pandas(self) -> None:
        """``to_pandas()`` yields something that ``equals`` the wrapper."""
        # NOTE(review): the wrapper itself is passed to the pandas-side
        # ``equals``; confirm this is intended rather than a raw pandas index.
        assert_that(self.index.to_pandas().equals(self.index), equal_to(True))

    def test_getitem(self) -> None:
        """Scalar lookup returns the value; list lookup returns an index."""
        expected = PandasIndex(PIndex([1], name="index"), ["a", "b"])
        assert_that(self.index[1], equal_to(1))
        assert_that(self.index[[1]].equals(expected), equal_to(True))

    def test_values(self) -> None:
        """``values`` exposes the underlying array."""
        assert_that(np.array_equal(self.index.values, np.array([0, 1, 2])), equal_to(True))

    def test_tolist(self) -> None:
        """``tolist()`` returns the entries as a plain list."""
        assert_that(self.index.tolist(), equal_to([0, 1, 2]))

    def test_equals(self) -> None:
        """``equals`` is sensitive to values, name and column list."""
        # Identical in every respect.
        same = PandasIndex(PIndex([0, 1, 2], name="index"), ["a", "b"])
        assert_that(self.index.equals(same), equal_to(True))

        # Each remaining candidate differs in exactly one respect.
        shorter = PandasIndex(PIndex([0, 1], name="index"), ["a", "b"])
        assert_that(self.index.equals(shorter), equal_to(False))

        renamed = PandasIndex(PIndex([0, 1, 2], name="index2"), ["a", "b"])
        assert_that(self.index.equals(renamed), equal_to(False))

        fewer_columns = PandasIndex(PIndex([0, 1, 2], name="index"), ["a"])
        assert_that(self.index.equals(fewer_columns), equal_to(False))

    def test_eq(self) -> None:
        """``==`` compares element-wise."""
        expected = np.array([False, True, False])
        actual = self.index == 1
        assert_that(np.array_equal(actual, expected), equal_to(True))

    def test_ne(self) -> None:
        """``!=`` compares element-wise."""
        expected = np.array([True, False, True])
        actual = self.index != 1
        assert_that(np.array_equal(actual, expected), equal_to(True))

    def test_gt(self) -> None:
        """``>`` compares element-wise."""
        expected = np.array([False, False, True])
        actual = self.index > 1
        assert_that(np.array_equal(actual, expected), equal_to(True))

    def test_ge(self) -> None:
        """``>=`` compares element-wise."""
        expected = np.array([False, True, True])
        actual = self.index >= 1
        assert_that(np.array_equal(actual, expected), equal_to(True))

    def test_lt(self) -> None:
        """``<`` compares element-wise."""
        expected = np.array([True, False, False])
        actual = self.index < 1
        assert_that(np.array_equal(actual, expected), equal_to(True))

    def test_le(self) -> None:
        """``<=`` compares element-wise."""
        expected = np.array([True, True, False])
        actual = self.index <= 1
        assert_that(np.array_equal(actual, expected), equal_to(True))

    def test_len(self) -> None:
        """``len`` counts the entries."""
        assert_that(len(self.index), equal_to(3))

    def test_str(self) -> None:
        """``str`` matches the text of an equivalent pandas index."""
        # Derived from pandas instead of hard-coded: the class name in the
        # text ("Int64Index" vs "Index") depends on the pandas version.
        # Assumes PandasIndex delegates str() to pandas — confirm.
        expected = str(PIndex(np.arange(0, 3), name="index"))
        assert_that(str(self.index), equal_to(expected))

    def test_repr(self) -> None:
        """``repr`` matches the text of an equivalent pandas index."""
        # Same version-independence reasoning as in test_str.
        expected = repr(PIndex(np.arange(0, 3), name="index"))
        assert_that(repr(self.index), equal_to(expected))