def __init__(
    self: Column[T],
    name: str,
    data: Optional[list[T]] = None,
    index: Optional[list] = None,
    dtype: Optional[T] = None,
) -> None:
    """Create a named column, wrapping `data` in a backend when needed.

    When `data` is absent the column is empty with dtype `Object`; when
    present, the dtype is either the explicit `dtype` or inferred from the
    backend, and the data is validated/cast against it.
    """
    self.name = name
    if data is None:
        # Empty column: blank backend, generic object dtype, nothing to validate.
        self._data_backend = PandasBackend(index=index)
        self.dtype = Object
    else:
        if issubclass(type(data), DataBackend):
            self._data_backend = data
        else:
            self._data_backend = PandasBackend({name: data}, index=index)
        self.dtype = (
            DataType(dtype)
            if dtype is not None
            else self.infer_dtype(self.name, self._data_backend)
        )
        self._validate_column()
    self.loc = Column._LocIndexer[T](self)
    self.iloc = Column._ILocIndexer[T](self)
def test_equals(self) -> None:
    """equals() is True for identical column data and False for any difference."""
    identical = PandasBackend(
        {"a": ["a", "b", "c"], "b": [1, 2, 3], "c": [True, False, True]}
    )
    assert_that(self.data_backend.equals(identical), equal_to(True))
    differing = PandasBackend(
        {"a": ["a", "b", "d"], "b": [1, 2, 3], "c": [True, False, True]}
    )
    assert_that(self.data_backend.equals(differing), equal_to(False))
def test_lt(self) -> None:
    """Less-than queries keep only rows below the threshold, for scalar and backend RHS."""
    wanted = PandasBackend({"a": ["a"], "b": [1], "c": [True]})
    # Scalar comparison against a single column.
    result = self.data_backend.query(self.data_backend["b"] < 2)
    assert_that(result.equals(wanted), equal_to(True))
    # Backend-to-backend comparison.
    threshold = PandasBackend({"b": [2]})
    result = self.data_backend.query(self.data_backend < threshold)
    assert_that(result.equals(wanted), equal_to(True))
def test_concat(self) -> None:
    """concat stacks the rows of multiple backends in order."""
    extra_row = PandasBackend({"a": ["d"], "b": [4], "c": [False]})
    combined = PandasBackend.concat([self.data_backend, extra_row], ignore_index=True)
    wanted = PandasBackend(
        {
            "a": ["a", "b", "c", "d"],
            "b": [1, 2, 3, 4],
            "c": [True, False, True, False],
        }
    )
    assert_that(combined.equals(wanted), equal_to(True))
def test_ge(self) -> None:
    """Greater-or-equal queries keep rows at or above the threshold, scalar or backend RHS."""
    wanted = PandasBackend({"a": ["b", "c"], "b": [2, 3], "c": [False, True]})
    result = self.data_backend.query(self.data_backend["b"] >= 2)
    assert_that(result.equals(wanted), equal_to(True))
    threshold = PandasBackend({"b": [2]})
    result = self.data_backend.query(self.data_backend >= threshold)
    assert_that(result.equals(wanted), equal_to(True))
def __init__(
    self: T,
    metadata: Optional[M] = None,
    index: Optional[Iterable] = None,
    **column_data: dict[str, list],
) -> None:
    """Build a store from keyword column data, or an empty store when none is given.

    Column keyword names are coerced to `str` before reaching the backend;
    non-empty data is validated against the declared column types.
    """
    self.metadata = metadata
    if not column_data:
        # No columns supplied: start from an empty backend (no validation needed).
        self._data_backend = PandasBackend(index=index)
    else:
        normalised = {str(name): values for name, values in column_data.items()}
        self._data_backend = PandasBackend(normalised, index=index)
        self._validate_data_frame()
    self._compile()
def test_eq(self) -> None:
    """Equality queries keep matching rows, for scalar and backend right-hand sides."""
    wanted = PandasBackend({"a": ["a"], "b": [1], "c": [True]})
    result = self.data_backend.query(self.data_backend["a"] == "a")
    assert_that(result.equals(wanted), equal_to(True))
    other = PandasBackend({"a": ["a", "c"], "b": [1, 3]})
    wanted = PandasBackend({"a": ["a", "c"], "b": [1, 3], "c": [True, True]})
    result = self.data_backend.query(self.data_backend == other)
    assert_that(result.equals(wanted), equal_to(True))
def test_drop_indices(self) -> None:
    """Dropping index 1 leaves rows 0 and 2 with their original index labels."""
    remaining = self.data_backend.drop_indices([1])
    wanted = PandasBackend(
        {"a": ["a", "c"], "b": [1, 3], "c": [True, True]},
        index=PandasIndex(PIndex([0, 2], name="index"), []),
    )
    assert_that(remaining.equals(wanted), equal_to(True))
def test_query(self) -> None:
    """An OR query keeps every row matching either predicate, labels preserved."""
    predicate = (ExampleStore.a == "a") | (ExampleStore.b == 3)
    result = self.data_backend.query(predicate)
    wanted = PandasBackend(
        {"a": ["a", "c"], "b": [1, 3], "c": [True, True]},
        index=PandasIndex(PIndex([0, 2], name="index"), []),
    )
    assert_that(result.equals(wanted), equal_to(True))
def test_getmask(self) -> None:
    """A boolean mask keeps only rows flagged True, preserving index labels."""
    masked = self.data_backend.getmask([True, False, True])
    wanted = PandasBackend(
        {"a": ["a", "c"], "b": [1, 3], "c": [True, True]},
        index=PandasIndex(PIndex([0, 2], name="index"), []),
    )
    # Smoke-check that repr of an explicitly-indexed backend does not raise.
    repr(wanted)
    assert_that(masked.equals(wanted), equal_to(True))
def test_append(self) -> None:
    """Appending to a database-linked store is unsupported and must raise."""
    extra_row = PandasBackend({"a": ["d"], "b": [4], "c": [False]})
    try:
        self.data_backend.append(extra_row, ignore_index=True)
    except NotImplementedError as e:
        assert_that(
            str(e),
            equal_to(
                "The current version of Tanuki does not support Store to DB writing"
            ),
        )
    else:
        fail("Expected exception")
def test_ne(self) -> None:
    """!= compares element-wise and flags every differing cell."""
    other = PandasBackend(
        {"a": ["d", "b", "c"], "b": [1, 4, 3], "c": [True, False, False]}
    )
    wanted = DataFrame(
        {
            "a": [True, False, False],
            "b": [False, True, False],
            "c": [False, False, True],
        }
    )
    assert_that((self.data_backend != other).equals(wanted), equal_to(True))
def from_rows(
    cls: Type[T],
    data_rows: list[tuple],
    columns: Optional[list[str]] = None,
    metadata: Optional[M] = None,
) -> T:
    """Build a store from row tuples.

    Column names default to the class's declared columns; explicit names are
    coerced to `str` first.
    """
    column_names = (
        list(cls._parse_columns().keys())
        if columns is None
        else [str(col) for col in columns]
    )
    frame = DataFrame.from_records(data_rows, columns=column_names)
    return cls.from_backend(PandasBackend(frame), metadata)
def setup_method(self) -> None:
    # Seed a three-row ExampleStore into a fresh Sqlite3 database, then link a
    # store back to that table so each test exercises the database-backed
    # backend rather than the in-memory one.
    self.test_store = ExampleStore(a=["a", "b", "c"], b=[1, 2, 3], c=[True, False, True])
    conn_config = self.sql_db.connection_config()
    self.db = Sqlite3Database(conn_config)
    # Insert must happen before link() so the linked store sees the rows.
    self.db.insert(ExampleStore.data_token, self.test_store)
    self.db_store = ExampleStore.link(self.db, ExampleStore.data_token)
    self.data_backend = self.db_store._data_backend
    # Single-row fixtures used by iloc/loc comparison tests.
    self.test_series0 = PandasBackend(Series({
        "a": "a",
        "b": 1,
        "c": True
    }), index=[0])
    self.test_series2 = PandasBackend(Series({
        "a": "c",
        "b": 3,
        "c": True
    }), index=[2])
def test_equals(self) -> None:
    """Database-backed equality holds only for an identical, full-width link."""
    same_link = DatabaseBackend(ExampleStore, self.db, ExampleStore.data_token)
    assert_that(self.data_backend.equals(same_link), equal_to(True))
    # A single selected column is not equal to the full link.
    single_column = self.data_backend[ExampleStore.a]
    full_link = DatabaseBackend(ExampleStore, self.db, ExampleStore.data_token)
    assert_that(single_column.equals(full_link), equal_to(False))
    # Nor is the full backend equal to a column sliced from a narrowed link.
    narrowed = DatabaseBackend(ExampleStore, self.db, ExampleStore.data_token, None, ["a"])
    assert_that(self.data_backend.equals(narrowed[ExampleStore.a]), equal_to(False))
    # Differing in-memory data also compares unequal.
    in_memory = PandasBackend(
        {"a": ["a", "b", "d"], "b": [1, 2, 3], "c": [True, False, True]}
    )
    assert_that(self.data_backend.equals(in_memory), equal_to(False))
class TestPandasBackend:
    """Unit tests for the in-memory PandasBackend.

    `setup_method` builds a three-row backend plus two single-row fixtures;
    the tests exercise indexing, iteration, comparison operators, queries,
    mutation and concatenation against it.
    """

    def setup_method(self) -> None:
        # Shared three-row frame and single-row fixtures (index 0 and 2).
        self.data_backend = PandasBackend(
            DataFrame({"a": ["a", "b", "c"], "b": [1, 2, 3], "c": [True, False, True]})
        )
        self.test_series0 = PandasBackend(Series({"a": "a", "b": 1, "c": True}), index=[0])
        self.test_series2 = PandasBackend(Series({"a": "c", "b": 3, "c": True}), index=[2])

    def test_iloc(self) -> None:
        actual_series = self.data_backend.iloc[0]
        assert_that(actual_series.equals(self.test_series0), is_(True))

    def test_loc(self) -> None:
        # loc addresses by label, so row label 2 survives the iloc slice.
        test_slice = self.data_backend.iloc[[0, 2]]
        actual_series = test_slice.loc[2]
        assert_that(actual_series.equals(self.test_series2), is_(True))

    def test_to_dict(self) -> None:
        frame_expected_dict = {
            "a": ["a", "b", "c"],
            "b": [1, 2, 3],
            "c": [True, False, True],
        }
        assert_that(self.data_backend.to_dict(), equal_to(frame_expected_dict))
        series_expected_dict = {"a": ["a"], "b": [1], "c": [True]}
        assert_that(self.test_series0.to_dict(), equal_to(series_expected_dict))

    def test_single_row(self) -> None:
        assert_that(self.test_series0["a"].values[0], equal_to("a"))
        assert_that(self.test_series0["b"].values[0], equal_to(1))
        assert_that(self.test_series0["c"].values[0], equal_to(True))
        example_row = self.data_backend.iloc[0]
        assert_that(example_row["a"].values[0], equal_to("a"))
        assert_that(example_row["b"].values[0], equal_to(1))
        assert_that(example_row["c"].values[0], equal_to(True))

    def test_set_index(self) -> None:
        test_slice = self.data_backend.iloc[[0, 2]]
        assert_that(test_slice.index.tolist(), equal_to([0, 2]))
        test_slice = test_slice.set_index(ExampleStore.a_index)
        assert_that(test_slice.index.tolist(), equal_to(["a", "c"]))

    def test_get_index(self) -> None:
        assert_that(self.data_backend.index.tolist(), equal_to([0, 1, 2]))
        test_slice = self.data_backend.iloc[[0, 2]]
        assert_that(test_slice.index.tolist(), equal_to([0, 2]))

    def test_reset_index(self) -> None:
        test_slice = self.data_backend.iloc[[0, 2]]
        assert_that(test_slice.index.tolist(), equal_to([0, 2]))
        test_slice = test_slice.reset_index()
        assert_that(test_slice.index.tolist(), equal_to([0, 1]))

    def test_contains(self) -> None:
        # Membership is by column name.
        assert_that("a", is_in(self.data_backend))

    def test_len(self) -> None:
        assert_that(len(self.data_backend), equal_to(3))

    def test_columns(self) -> None:
        assert_that(self.data_backend.columns, equal_to(["a", "b", "c"]))
        assert_that(self.test_series0.columns, equal_to(["a", "b", "c"]))

    def test_iter(self) -> None:
        # Iteration yields column names in declaration order.
        columns = ["a", "b", "c"]
        for actual_col, expected_col in zip(self.data_backend, columns):
            assert_that(actual_col, equal_to(expected_col))

    def test_iterows(self) -> None:
        # NOTE(review): method name drops an "r" ("iterrows"); kept as-is since
        # renaming would change which tests are collected.
        for i, row in self.data_backend.iterrows():
            iloc_row = self.data_backend.iloc[i]
            assert_that(row.equals(iloc_row), is_(True))

    def test_itertuples(self) -> None:
        # Tuples lead with the index value, then the column values.
        for i, a, b, c in self.data_backend.itertuples():
            iloc_row = self.data_backend.iloc[i]
            assert_that(a, equal_to(iloc_row["a"].values[0]))
            assert_that(b, equal_to(iloc_row["b"].values[0]))
            assert_that(c, equal_to(iloc_row["c"].values[0]))

    def test_str(self) -> None:
        expected = " a b c\nindex \n0 a 1 True\n1 b 2 False\n2 c 3 True"
        assert_that(str(self.data_backend), equal_to(expected))

    def test_repr(self) -> None:
        # repr mirrors str for this backend.
        expected = " a b c\nindex \n0 a 1 True\n1 b 2 False\n2 c 3 True"
        assert_that(repr(self.data_backend), equal_to(expected))

    def test_is_link(self) -> None:
        assert_that(self.data_backend.is_link(), equal_to(False))

    def test_link_token(self) -> None:
        assert_that(self.data_backend.link_token(), equal_to(None))

    def test_to_pandas(self) -> None:
        expected = DataFrame(
            {"a": ["a", "b", "c"], "b": [1, 2, 3], "c": [True, False, True]}
        )
        assert_that(self.data_backend.to_pandas().equals(expected), equal_to(True))

    def test_values(self) -> None:
        expected = DataFrame(
            {"a": ["a", "b", "c"], "b": [1, 2, 3], "c": [True, False, True]}
        )
        assert_that(
            np.array_equal(self.data_backend.values, expected.values), equal_to(True)
        )

    def test_dtypes(self) -> None:
        expected = {"a": Object, "b": Int64, "c": Boolean}
        assert_that(self.data_backend.dtypes, equal_to(expected))

    def test_cast_columns(self) -> None:
        # Casting the boolean column to int yields 1/0 values.
        expected = DataFrame({"a": ["a", "b", "c"], "b": [1, 2, 3], "c": [1, 0, 1]})
        actual = self.data_backend.cast_columns({"c": int})
        assert_that(np.array_equal(actual.values, expected.values), equal_to(True))

    def test_index(self) -> None:
        expected = PandasIndex(PIndex([0, 1, 2], name="index"), [])
        assert_that(self.data_backend.index.equals(expected), equal_to(True))
        # A multi-column index becomes a tuple-valued index named after the alias.
        new_frame = self.data_backend.set_index(ExampleStore.ab_index)
        pindex = PIndex([("a", 1), ("b", 2), ("c", 3)])
        pindex.name = "ab_index"
        expected = PandasIndex(pindex, ["a", "b"])
        assert_that(new_frame.index.equals(expected), equal_to(True))

    def test_index_name(self) -> None:
        assert_that(self.data_backend.index_name, equal_to("index"))
        new_frame = self.data_backend.set_index(ExampleStore.ab_index)
        assert_that(new_frame.index_name, equal_to("ab_index"))

    def test_equals(self) -> None:
        test = PandasBackend(
            {"a": ["a", "b", "c"], "b": [1, 2, 3], "c": [True, False, True]}
        )
        assert_that(self.data_backend.equals(test), equal_to(True))
        test = PandasBackend(
            {"a": ["a", "b", "d"], "b": [1, 2, 3], "c": [True, False, True]}
        )
        assert_that(self.data_backend.equals(test), equal_to(False))

    def test_eq(self) -> None:
        # == compares element-wise, producing a boolean frame.
        test = PandasBackend(
            {"a": ["d", "b", "c"], "b": [1, 4, 3], "c": [True, False, False]}
        )
        expected = DataFrame(
            {
                "a": [False, True, True],
                "b": [True, False, True],
                "c": [True, True, False],
            }
        )
        assert_that((self.data_backend == test).equals(expected), equal_to(True))

    def test_ne(self) -> None:
        test = PandasBackend(
            {"a": ["d", "b", "c"], "b": [1, 4, 3], "c": [True, False, False]}
        )
        expected = DataFrame(
            {
                "a": [True, False, False],
                "b": [False, True, False],
                "c": [False, False, True],
            }
        )
        assert_that((self.data_backend != test).equals(expected), equal_to(True))

    def test_gt(self) -> None:
        sample = ExampleStore(b=[1, 2, 3])
        test = ExampleStore(b=[1, 1, 3])
        expected = DataFrame({"b": [False, True, False]})
        assert_that((sample > test).equals(expected), equal_to(True))

    def test_ge(self) -> None:
        sample = ExampleStore(b=[0, 2, 3])
        test = ExampleStore(b=[1, 1, 3])
        expected = DataFrame({"b": [False, True, True]})
        assert_that((sample >= test).equals(expected), equal_to(True))

    def test_lt(self) -> None:
        sample = ExampleStore(b=[0, 2, 3])
        test = ExampleStore(b=[1, 1, 3])
        expected = DataFrame({"b": [True, False, False]})
        assert_that((sample < test).equals(expected), equal_to(True))

    def test_le(self) -> None:
        sample = ExampleStore(b=[0, 2, 3])
        test = ExampleStore(b=[1, 1, 3])
        expected = DataFrame({"b": [True, False, True]})
        assert_that((sample <= test).equals(expected), equal_to(True))

    def test_getitem(self) -> None:
        expected = PandasBackend({"b": [1, 2, 3]})
        assert_that(self.data_backend["b"].equals(expected), equal_to(True))

    def test_getitems(self) -> None:
        expected = PandasBackend({"a": ["a", "b", "c"], "b": [1, 2, 3]})
        assert_that(
            self.data_backend.getitems(["a", "b"]).equals(expected), equal_to(True)
        )

    def test_getmask(self) -> None:
        test = self.data_backend.getmask([True, False, True])
        expected = PandasBackend(
            {"a": ["a", "c"], "b": [1, 3], "c": [True, True]},
            index=PandasIndex(PIndex([0, 2], name="index"), []),
        )
        # Smoke-check that repr of an explicitly-indexed backend does not raise.
        repr(expected)
        assert_that(test.equals(expected), equal_to(True))

    def test_query(self) -> None:
        query = (ExampleStore.a == "a") | (ExampleStore.b == 3)
        test = self.data_backend.query(query)
        expected = PandasBackend(
            {"a": ["a", "c"], "b": [1, 3], "c": [True, True]},
            index=PandasIndex(PIndex([0, 2], name="index"), []),
        )
        assert_that(test.equals(expected), equal_to(True))

    def test_setitem(self) -> None:
        self.data_backend["a"] = ["d", "e", "f"]
        expected = PandasBackend(
            {"a": ["d", "e", "f"], "b": [1, 2, 3], "c": [True, False, True]}
        )
        assert_that(self.data_backend.equals(expected), equal_to(True))

    def test_append(self) -> None:
        postfix = PandasBackend({"a": ["d"], "b": [4], "c": [False]})
        new_frame = self.data_backend.append(postfix, ignore_index=True)
        expected = PandasBackend(
            {
                "a": ["a", "b", "c", "d"],
                "b": [1, 2, 3, 4],
                "c": [True, False, True, False],
            }
        )
        assert_that(new_frame.equals(expected), equal_to(True))

    def test_drop_indices(self) -> None:
        new_frame = self.data_backend.drop_indices([1])
        expected = PandasBackend(
            {"a": ["a", "c"], "b": [1, 3], "c": [True, True]},
            index=PandasIndex(PIndex([0, 2], name="index"), []),
        )
        assert_that(new_frame.equals(expected), equal_to(True))

    def test_concat(self) -> None:
        postfix = PandasBackend({"a": ["d"], "b": [4], "c": [False]})
        new_frame = PandasBackend.concat([self.data_backend, postfix], ignore_index=True)
        expected = PandasBackend(
            {
                "a": ["a", "b", "c", "d"],
                "b": [1, 2, 3, 4],
                "c": [True, False, True, False],
            }
        )
        assert_that(new_frame.equals(expected), equal_to(True))
def from_pandas(cls: Type[T], data: Union[Series, DataFrame], metadata: Optional[M] = None) -> T:
    """Wrap a pandas Series/DataFrame in a PandasBackend and build a store from it."""
    backend = PandasBackend(data)
    return cls.from_backend(backend, metadata)
class DataStore:
    """Typed, columnar data container backed by a pluggable DataBackend.

    Subclassing registers the store under (class name, version) and attaches
    its declared column/index aliases as class attributes. Instances wrap a
    backend (in-memory pandas or a database link) and expose indexing,
    comparison, query, and builder APIs over it.
    """

    # Class vars
    _registered_stores: ClassVar[dict[str, dict[int, Type[T]]]] = {}

    @classmethod
    def store_type(cls, store_class: str, store_version: int) -> Optional[Type[DataStore]]:
        """Look up a registered store class by name and version, or None."""
        return cls._registered_stores.get(store_class, {}).get(store_version)

    type_factory: ClassVar[StorableTypeFactory]
    version: ClassVar[int]
    metadata: ClassVar[Type[M]]
    columns: ClassVar[list[ColumnAlias]]
    indices: ClassVar[list[IndexAlias]]

    # Instance vars
    columns: list[ColumnAlias]
    _data_backend: B
    loc: DataStore._LocIndexer[T]
    iloc: DataStore._ILocIndexer[T]
    index: Index
    metadata: Optional[M]

    def __init_subclass__(cls: Type[T], version: int = 1, register: bool = True) -> None:
        """Register the subclass and attach its column/index aliases.

        Raises TypeError if the same (name, version) pair is registered twice.
        """
        super(DataStore, cls).__init_subclass__()
        if register and DataStore.store_type(cls.__name__, version) is not None:
            raise TypeError(
                f"Duplicate DataStore version found for {cls.__name__} version {version}"
            )
        cls.type_factory = StorableTypeFactory(list(cls.__mro__), cls.__annotations__)
        cls.version = version
        cls.metadata = cls.type_factory.metadata
        cls.columns = []
        cls.indices = []
        for name, col in cls._parse_columns().items():
            cls.columns.append(col)
            setattr(cls, name, col)
        for name, index in cls._parse_indices().items():
            cls.indices.append(index)
            setattr(cls, name, index)
        if register:
            # BUG FIX: was `cls.__name__ is not DataStore._registered_stores`,
            # an identity check of a string against the registry dict that is
            # always True — so every registration re-created the inner dict and
            # wiped previously registered versions of the same store name.
            if cls.__name__ not in DataStore._registered_stores:
                DataStore._registered_stores[cls.__name__] = {}
            DataStore._registered_stores[cls.__name__][version] = cls

    def __init__(
        self: T,
        metadata: Optional[M] = None,
        index: Optional[Iterable] = None,
        **column_data: dict[str, list],
    ) -> None:
        """Build a store from keyword column data (empty when none is given)."""
        self.metadata = metadata
        if len(column_data) > 0:
            column_data = {str(name): col for name, col in column_data.items()}
            self._data_backend = PandasBackend(column_data, index=index)
            self._validate_data_frame()
        else:
            self._data_backend = PandasBackend(index=index)
        self._compile()

    def _compile(self: T) -> None:
        """Attach columns/indices and cache column metadata from the backend."""
        self._attach_columns()
        self._attach_indices()
        self._all_columns = self._parse_columns()
        self._active_columns = self._parse_active_columns()
        self.columns = list(self._active_columns.values())
        self.index = self._data_backend.index
        self.loc = DataStore._LocIndexer[T](self)
        self.iloc = DataStore._ILocIndexer[T](self)

    @classmethod
    def link(cls: Type[T], database: D, data_token: DataToken, read_only: bool = True) -> T:
        """Create a store whose backend is a live link to a database table."""
        # Local import avoids a circular dependency with the backend module.
        from tanuki.data_backend.database_backend import DatabaseBackend

        return cls.from_backend(
            DatabaseBackend[T](cls, database, data_token, read_only=read_only),
            validate=False,
        )

    @classmethod
    def from_rows(
        cls: Type[T],
        data_rows: list[tuple],
        columns: Optional[list[str]] = None,
        metadata: Optional[M] = None,
    ) -> T:
        """Build a store from row tuples; columns default to the declared set."""
        if columns is None:
            columns = list(cls._parse_columns().keys())
        else:
            columns = [str(col) for col in columns]
        data = DataFrame.from_records(data_rows, columns=columns)
        return cls.from_backend(PandasBackend(data), metadata)

    @classmethod
    def from_pandas(cls: Type[T], data: Union[Series, DataFrame], metadata: Optional[M] = None) -> T:
        """Build a store from a pandas Series/DataFrame."""
        return cls.from_backend(PandasBackend(data), metadata)

    def to_pandas(self: T) -> DataFrame:
        return self._data_backend.to_pandas()

    # Annotation fix: the backend returns a plain dict, not a DataFrame.
    def to_dict(self: T) -> dict:
        return self._data_backend.to_dict()

    @property
    def values(self: T) -> np.array:
        return self._data_backend.values

    @property
    def dtypes(self: T) -> dict[str, DataType]:
        return {col.name: col.dtype for col in self.columns}

    def is_link(self: T) -> bool:
        return self._data_backend.is_link()

    def link_token(self: T) -> Optional[DataToken]:
        return self._data_backend.link_token()

    def load(self: T) -> T:
        """Materialise a linked backend into a new in-memory store."""
        return self.from_backend(self._data_backend.load())

    @classmethod
    def from_backend(
        cls: Type[T],
        data_backend: B,
        metadata: Optional[M] = None,
        validate: bool = True,
    ) -> T:
        """Wrap an existing backend in a store, optionally validating types."""
        instance = cls()
        instance.metadata = metadata
        instance._data_backend = data_backend
        if validate:
            instance._validate_data_frame()
        instance._compile()
        return instance

    @classmethod
    def _parse_columns(cls) -> dict[str, ColumnAlias]:
        return cls.type_factory.columns

    @classmethod
    def _parse_indices(cls) -> dict[str, IndexAlias]:
        return cls.type_factory.indices

    def _parse_active_columns(self: T) -> dict[str, ColumnAlias]:
        """Map backend columns to declared aliases; reject undeclared columns."""
        columns = self._parse_columns()
        backend_columns = [str(col) for col in self._data_backend.columns]
        unmatched_columns = set(backend_columns) - columns.keys()
        if len(unmatched_columns) > 0:
            raise KeyError(
                f"Data backend contains columns which are not supported by {self.__class__.__name__}:\n{unmatched_columns}"
            )
        active_columns = {}
        for col in backend_columns:
            active_columns[col] = columns[col]
        return active_columns

    def _validate_data_frame(self: T) -> None:
        """Cast each active column to its declared dtype; collect failures.

        Raises TypeError listing every column whose data could not be cast.
        """
        columns = self._parse_active_columns()
        invalid_types = []
        for name, col in columns.items():
            if isinstance(col.dtype, TypeAlias):
                continue
            col_data = self._data_backend[name]
            data_dtype = Column.infer_dtype(name, col_data)
            # TODO: Run in batch
            if data_dtype is not type(None) and data_dtype != col.dtype:
                try:
                    cast_column = col(name, col_data)
                    self._data_backend[name] = cast_column._data_backend
                # Fix: was a bare `except:`, which also swallowed SystemExit
                # and KeyboardInterrupt.
                except Exception:
                    invalid_types.append(name)
        if len(invalid_types) != 0:
            raise TypeError(f"Invalid types provided for: {invalid_types}")

    def _attach_columns(self: T) -> None:
        """Set each declared column attribute to a Column, or None if inactive."""
        columns = self._parse_columns()
        active_columns = self._parse_active_columns()
        for name, col in columns.items():
            if name in active_columns:
                data = self._data_backend[name]
                setattr(self, name, col(name, data))
            else:
                setattr(self, name, None)

    def _attach_indices(self: T) -> None:
        """Set each declared index attribute, or None if any of its columns is missing."""
        indices = self._parse_indices()
        active_columns = self._parse_active_columns()
        for name, alias in indices.items():
            col_names = [col.name for col in alias.columns]
            has_columns = True
            for col in col_names:
                if col not in active_columns:
                    has_columns = False
                    break
            if not has_columns:
                setattr(self, name, None)
            else:
                index = self._data_backend.get_index(alias)
                setattr(self, name, index)

    def __contains__(self: T, key):
        # Membership is by declared column name.
        return str(key) in self._all_columns

    def __str__(self: T) -> str:
        if len(self._data_backend) == 0:
            return f"Empty {self.__class__.__name__}"
        else:
            return f"{self.__class__.__name__}\n{self._data_backend}"

    def __repr__(self: T) -> str:
        return str(self)

    def equals(self: T, other) -> bool:
        """Deep equality: same store class name, backend contents and metadata."""
        if not issubclass(type(other), DataStore):
            return False
        oc = cast(DataStore, other)
        return (other.__class__.__name__ == self.__class__.__name__
                and self._data_backend.equals(oc._data_backend)
                and self.metadata == oc.metadata)

    # Annotation fix: returns the other operand's backend (or the raw value),
    # never None.
    def _get_external_backend(self: T, other: Any) -> Any:
        """Unwrap a DataStore operand to its backend; reject mismatched store types."""
        if issubclass(type(other), DataStore):
            if not self.__class__.__name__ == other.__class__.__name__:
                raise UnsupportedOperation(
                    "Cannot compare different DataStore types: "
                    + f"{self.__class__.__name__} vs {other.__class__.__name__}")
            return cast(DataStore, other)._data_backend
        else:
            return other

    # NOTE(review): defining __eq__ without __hash__ makes instances unhashable;
    # kept as-is since callers may rely on element-wise __eq__ semantics.
    def __eq__(self: T, other: Any) -> Any:
        other = self._get_external_backend(other)
        return self._data_backend == other

    def __ne__(self: T, other: Any) -> Any:
        other = self._get_external_backend(other)
        return self._data_backend != other

    def __gt__(self: T, other: Any) -> Any:
        other = self._get_external_backend(other)
        return self._data_backend > other

    def __ge__(self: T, other: Any) -> Any:
        other = self._get_external_backend(other)
        return self._data_backend >= other

    def __lt__(self: T, other: Any) -> Any:
        other = self._get_external_backend(other)
        return self._data_backend < other

    def __le__(self: T, other: Any) -> Any:
        other = self._get_external_backend(other)
        return self._data_backend <= other

    def __len__(self: T) -> int:
        return len(self._data_backend)

    def __iter__(self: T) -> Generator[ColumnAlias, None, None]:
        for column in self._data_backend:
            yield self._active_columns[column]

    def iterrows(self: T) -> Generator[tuple[int, T], None, None]:
        """Yield (index, single-row store) pairs."""
        for i, row in self._data_backend.iterrows():
            yield (i, self.from_backend(row, self.metadata))

    def itertuples(self: T, ignore_index: bool = False) -> Generator[tuple, None, None]:
        return self._data_backend.itertuples(ignore_index=ignore_index)

    def _get_column(self: T, item: str) -> T:
        """Return the named Column, None if declared-but-inactive, or raise."""
        if item not in self._all_columns:
            raise ValueError(
                f"Could not match '{item}' to {self.__class__.__name__} column"
            )
        elif item not in self._active_columns:
            return None
        else:
            return self._all_columns[item](item, self._data_backend[item])

    def _get_columns(self: T, columns: list[str]) -> T:
        unused_columns = set(columns) - self._all_columns.keys()
        if len(unused_columns) > 0:
            raise ValueError(
                f"The following columns do not exist in {self.__class__.__name__}: {unused_columns}"
            )
        return self._data_backend.getitems(columns)

    def _get_mask(self: T, mask: list[bool]) -> T:
        return self._data_backend.getmask(mask)

    def query(self: T, query: Optional[Query] = None) -> T:
        return self.from_backend(self._data_backend.query(query), self.metadata)

    def __getitem__(
        self: T, item: Union[ColumnAlias, list[ColumnAlias], list[bool], Query]
    ) -> Union[Column, T]:
        """Dispatch on the item type: Query, "index", column name, name list, or mask."""
        if issubclass(type(item), Query):
            result = self._data_backend.query(item)
        elif item == "index":
            return self._data_backend.index
        elif type(item) is str or type(item) is ColumnAlias:
            result = self._get_column(str(item))
        elif isinstance(item, Iterable):
            # Drill into nested iterables to find a scalar sample whose type
            # decides between a column selection and a boolean mask.
            sample = item
            while type(sample) is not str and isinstance(sample, Iterable):
                sample = next(iter(sample))
            value_type = DataType(type(sample))
            if value_type is String or value_type is ColumnAlias:
                result = self._get_columns([str(value) for value in item])
            elif value_type is Boolean:
                result = self._get_mask(item)
            else:
                raise RuntimeError(f"Unknown get item request: {item}")
        else:
            raise RuntimeError(f"Unknown get item request: {item}")
        if issubclass(type(result), DataBackend):
            result = self.from_backend(result, self.metadata)
        return result

    def __getattr__(self: T, name: str) -> Any:
        # Public attribute misses become column-lookup errors; underscore
        # attributes fall through and implicitly resolve to None.
        if name[0] != "_":
            raise AttributeError(
                f"Could not match '{name}' to {self.__class__.__name__} column"
            )

    def set_index(self: T, index: Union[Index, IndexAlias]) -> T:
        return self.from_backend(self._data_backend.set_index(index), self.metadata)

    def reset_index(self: T) -> T:
        return self.from_backend(self._data_backend.reset_index(), self.metadata)

    def append(self: T, new_store: T, ignore_index: bool = False) -> T:
        return self.from_backend(
            self._data_backend.append(new_store._data_backend, ignore_index=ignore_index),
            self.metadata,
        )

    def drop(self: T, indices: list[int]) -> T:
        return self.from_backend(self._data_backend.drop_indices(indices), self.metadata)

    # Annotation fix: `cls` is the class itself (Type[T]), not an instance.
    @classmethod
    def concat(cls: Type[T], all_data_stores: list[T], ignore_index: bool = False) -> T:
        """Concatenate stores of this exact type; metadata comes from the first."""
        all_match = all([isinstance(item, cls) for item in all_data_stores])
        if not all_match:
            raise ValueError("All data stores must be same type for concat")
        backend_sample: B = all_data_stores[0]._data_backend
        all_backends = [store._data_backend for store in all_data_stores]
        return cls.from_backend(
            backend_sample.concat(all_backends, ignore_index=ignore_index),
            all_data_stores[0].metadata,
        )

    @classmethod
    def builder(cls: Type[T]) -> DataStore._Builder[T]:
        return DataStore._Builder[cls](cls)

    class _Builder(Generic[T]):
        """Incrementally assemble a store, by whole columns or by rows (not both)."""

        _store_class: Type[T]
        _column_data: dict[str, Column]
        _row_data: dict[str, list]

        def __init__(self, store_class: Type[T]) -> None:
            self._store_class = store_class
            self._column_data = {}
            self._row_data = {}

        def append_column(self, column_name: str, column_data: Column) -> DataStore._Builder[T]:
            if len(self._row_data) > 0:
                raise UnsupportedOperation(
                    "Cannot insert column when row data present")
            self._column_data[str(column_name)] = column_data
            return self

        def __setitem__(self, column_name: str, column_data) -> None:
            self._column_data[str(column_name)] = column_data

        # Annotation fix: `any` is the builtin function; `Any` is the type.
        def append_row(self, **row_data: Any) -> DataStore._Builder[T]:
            if len(self._column_data) > 0:
                raise UnsupportedOperation(
                    "Cannot insert row data when column data present")
            for key, value in row_data.items():
                if key not in self._row_data:
                    self._row_data[key] = []
                self._row_data[key].append(value)
            return self

        def build(self, metadata: Optional[M] = None) -> T:
            if len(self._column_data) > 0:
                return self._store_class(**self._column_data, metadata=metadata)
            else:
                return self._store_class(**self._row_data, metadata=metadata)

    class _ILocIndexer(Generic[T]):
        """Positional row indexing delegating to the backend's iloc."""

        _data_store: T

        def __init__(self, data_store: T) -> None:
            self._data_store = data_store

        def __getitem__(self, item: Union[int, list, slice]) -> T:
            return self._data_store.from_backend(
                self._data_store._data_backend.iloc[item], self._data_store.metadata)

    class _LocIndexer(Generic[T]):
        """Label-based row indexing delegating to the backend's loc."""

        _data_store: T

        def __init__(self, data_store: T) -> None:
            self._data_store = data_store

        def __getitem__(self, item: Union[Any, list, slice]) -> T:
            return self._data_store.from_backend(
                self._data_store._data_backend.loc[item], self._data_store.metadata)
def setup_method(self) -> None:
    """Create the shared three-row backend and two single-row fixtures."""
    frame = DataFrame({"a": ["a", "b", "c"], "b": [1, 2, 3], "c": [True, False, True]})
    self.data_backend = PandasBackend(frame)
    self.test_series0 = PandasBackend(Series({"a": "a", "b": 1, "c": True}), index=[0])
    self.test_series2 = PandasBackend(Series({"a": "c", "b": 3, "c": True}), index=[2])
def test_getitem(self) -> None:
    """Indexing by column name returns just that column."""
    wanted = PandasBackend({"b": [1, 2, 3]})
    actual = self.data_backend["b"]
    assert_that(actual.equals(wanted), equal_to(True))
class Column(Generic[T]):
    """A single named, typed column of data backed by a DataBackend.

    `Column[SomeType]` (via `__class_getitem__`) yields a ColumnAlias used in
    store declarations rather than a parameterised class.
    """

    name: str
    _data_backend: B
    loc: Column._LocIndexer[T]
    iloc: Column._ILocIndexer[T]
    dtype: DataType

    @classmethod
    def __class_getitem__(_, dtype: T) -> Type[Column]:
        # Subscription returns a declaration alias, not a generic class.
        from .column_alias import ColumnAlias

        return ColumnAlias(dtype)

    def __init__(
        self: Column[T],
        name: str,
        data: Optional[list[T]] = None,
        index: Optional[list] = None,
        dtype: Optional[T] = None,
    ) -> None:
        """Create a named column; wrap raw data in a PandasBackend if needed.

        With no data the column is empty with dtype Object; otherwise the
        dtype is taken from `dtype` or inferred, then validated/cast.
        """
        self.name = name
        if data is not None:
            if issubclass(type(data), DataBackend):
                self._data_backend = data
            else:
                self._data_backend = PandasBackend({name: data}, index=index)
            self.dtype = (
                self.infer_dtype(self.name, self._data_backend)
                if dtype is None
                else DataType(dtype)
            )
        else:
            self._data_backend = PandasBackend(index=index)
            self.dtype = Object
        if data is not None:
            self._validate_column()
        self.loc = Column._LocIndexer[T](self)
        self.iloc = Column._ILocIndexer[T](self)

    @staticmethod
    def determine_nested_dtype(data: Iterable) -> DataType:
        """Recursively derive a GenericAlias dtype for nested list/set data."""
        if len(data) == 0:
            return Object
        sample = next(iter(data))
        stype = type(sample)
        # NOTE(review): this guard appears to stop recursion when the first
        # element equals the whole value (e.g. a one-character string, whose
        # first element is itself) — TODO confirm intent.
        if sample == data:
            return type(data)
        elif stype is list or stype is set:
            nested_type = Column.determine_nested_dtype(sample)
        else:
            nested_type = stype
        return GenericAlias(type(data), nested_type)

    @staticmethod
    def infer_dtype(column_name: str, data_backend: DataBackend) -> DataType:
        """Infer a column's DataType, refining Object via the first value."""
        dtype: type = data_backend.dtypes[column_name]
        if dtype == Object:
            if len(data_backend) == 0:
                dtype = Object
            else:
                sample = data_backend.iloc[0].values[0]
                stype = type(sample)
                if stype is list or stype is set:
                    # Container values get a parameterised container dtype.
                    dtype = DataType(
                        GenericAlias(stype, Column.determine_nested_dtype(sample))
                    )
                else:
                    dtype = stype
        return DataType(dtype)

    def _validate_column(self: Column[T]) -> None:
        """Cast backing data to the declared dtype; TypeError on failure.

        Skipped for linked backends and TypeAlias dtypes.
        """
        if self._data_backend.is_link() or isinstance(self.dtype, TypeAlias):
            return
        data_type = self.infer_dtype(self.name, self._data_backend)
        if self.dtype != data_type:
            try:
                self._data_backend = self._data_backend.cast_columns(
                    {self.name: self.dtype.pdtype()}
                )
            except Exception as e:
                raise TypeError(
                    f"Failed to cast '{data_type.__name__}' to '{self.dtype.__name__}'",
                    e,
                )

    @property
    def index(self: Column[T]) -> Index:
        return self._data_backend.index

    def tolist(self: Column[T]) -> list:
        return self._data_backend.values.tolist()

    @property
    def values(self: Column[T]) -> np.ndarray:
        return self._data_backend.values

    @classmethod
    def _new_data_copy(
        cls: type[Column], name: str, data_backend: DataBackend, dtype: type[NT]
    ) -> "Column[NT]":
        """Build a column around an existing backend without re-validating."""
        instance: Column[dtype] = cls(name)
        instance._data_backend = data_backend
        instance.dtype = DataType(dtype)
        return instance

    def astype(self: Column[T], new_dtype: type[NT]) -> "Column[NT]":
        """Return a new column with the data cast to `new_dtype`."""
        return Column(
            self.name,
            self._data_backend.cast_columns({self.name: new_dtype}),
            dtype=new_dtype,
        )

    def first(
        self: Column[T], n: Optional[int] = 1, offset: Optional[int] = 0
    ) -> Column[T]:
        """Return a column holding `n` values starting at `offset`."""
        return Column(
            self.name,
            self._data_backend.iloc[self.index[offset : offset + n]],
            dtype=self.dtype,
        )

    def nunique(self) -> int:
        return self._data_backend.nunique()[self.name]

    def equals(self: Column[T], other: Any) -> bool:
        """Deep equality against another Column's backend, or a raw backend/value."""
        if type(other) is Column:
            return self._data_backend.equals(cast(Column, other)._data_backend)
        else:
            return self._data_backend.equals(other)

    # NOTE(review): the comparison dunders delegate element-wise to the backend;
    # defining __eq__ without __hash__ also makes Column unhashable.
    def __eq__(self: Column[T], other: Any) -> Column[Boolean]:
        if isinstance(other, Column):
            other = other._data_backend
        return self._data_backend == other

    def __ne__(self: Column[T], other: Any) -> bool:
        if isinstance(other, Column):
            other = other._data_backend
        return self._data_backend != other

    def __gt__(self: Column[T], other: Any) -> bool:
        if isinstance(other, Column):
            other = other._data_backend
        return self._data_backend > other

    def __ge__(self: Column[T], other: Any) -> bool:
        if isinstance(other, Column):
            other = other._data_backend
        return self._data_backend >= other

    def __lt__(self: Column[T], other: Any) -> bool:
        if isinstance(other, Column):
            other = other._data_backend
        return self._data_backend < other

    def __le__(self: Column[T], other: Any) -> bool:
        if isinstance(other, Column):
            other = other._data_backend
        return self._data_backend <= other

    def __len__(self: Column[T]):
        return len(self._data_backend)

    def __iter__(self: Column[T]) -> Iterator[T]:
        return self._data_backend.itertuples()

    def item(self) -> Any:
        """Return the single value of a one-element column; raise otherwise."""
        if len(self) > 1:
            raise RuntimeError("Cannot call `item` on list of column values")
        return self.values[0]

    def __getitem__(self: Column[T], indexable: Indexible) -> Column[T]:
        # Subscription is label-based (delegates to loc).
        return self.loc[indexable]

    def __str__(self: Column[T]) -> str:
        result = f"Column {self.name}"
        if len(self._data_backend) == 0:
            return f"{result}([], dtype: {self.dtype.__name__})"
        else:
            return str(self._data_backend)

    def __repr__(self: Column[T]) -> str:
        return str(self)

    class _ILocIndexer(Generic[T]):
        """Positional indexing that preserves the column's name and dtype."""

        _column: Column[T]

        def __init__(self, column: Column[T]) -> None:
            self._column = column

        def __getitem__(self, item: Union[int, list, slice]) -> Column[T]:
            data = self._column._data_backend.iloc[item]
            return Column._new_data_copy(self._column.name, data, self._column.dtype)

    class _LocIndexer(Generic[T]):
        """Label indexing that preserves the column's name and dtype."""

        _column: Column[T]

        def __init__(self, column: Column[T]) -> None:
            self._column = column

        def __getitem__(self, item: Union[Any, list, slice]) -> Column[T]:
            data = self._column._data_backend.loc[item]
            return Column._new_data_copy(self._column.name, data, self._column.dtype)
def test_getitems(self) -> None:
    """Selecting several columns returns exactly those columns."""
    wanted = PandasBackend({"a": ["a", "b", "c"], "b": [1, 2, 3]})
    selected = self.data_backend.getitems(["a", "b"])
    assert_that(selected.equals(wanted), equal_to(True))
def test_setitem(self) -> None:
    """Assigning a column in place replaces its values and nothing else."""
    self.data_backend["a"] = ["d", "e", "f"]
    wanted = PandasBackend(
        {"a": ["d", "e", "f"], "b": [1, 2, 3], "c": [True, False, True]}
    )
    assert_that(self.data_backend.equals(wanted), equal_to(True))