def test_reserve_dtype(self):
    """Check that a reserved column's dtype is applied on convert.

    The string ``"0.5"`` must come out as the float ``0.5``, and a value
    that cannot be cast (``"kitten"``) must raise ``ValueError``.
    """
    float_type = TypedDfBuilder("a").reserve("x", dtype=np.float32).build()

    castable = pd.DataFrame([pd.Series({"x": "0.5"})])
    converted = float_type.convert(castable)
    assert converted.column_names() == ["x"]
    assert converted.to_numpy().tolist() == [[0.5]]

    uncastable = pd.DataFrame([pd.Series({"x": "kitten"})])
    with pytest.raises(ValueError):
        float_type.convert(uncastable)
def test_to_instances(self):
    """Check row-to-dataclass conversion, including order and sortability.

    The instances must appear in row order, and sorting them must give the
    expected animal/age ordering.
    """
    typed = TypedDfBuilder("T").require("animal", dtype=str).reserve("age", dtype=int).build()
    rows = [
        ("goldfish", 2),
        ("goldfish", 1),
        ("gazelle", 8),
        ("pineapple", 114),
        ("anteater", 11),
    ]
    frame = typed.of([pd.Series(dict(animal=animal, age=age)) for animal, age in rows])
    dc = typed.create_dataclass()
    instances = frame.to_dataclass_instances()

    # Same order as the input rows.
    assert instances == [dc(animal, age) for animal, age in rows]

    # Expected order after sorting the dataclass instances.
    expected_sorted = [
        dc("anteater", 11),
        dc("gazelle", 8),
        dc("goldfish", 1),
        dc("goldfish", 2),
        dc("pineapple", 114),
    ]
    assert sorted(instances) == expected_sorted
def test_no_overwrite(self):
    """Check that a second write with ``overwrite=False`` raises ``FileExistsError``."""
    pair_type = TypedDfBuilder("a").reserve("x", "y").build()
    frame = pair_type.convert(pd.DataFrame([pd.Series({"x": "cat", "y": "dog"})]))
    with tmpfile(".csv") as target:
        # First write creates the file; the second must refuse to replace it.
        frame.write_file(target, overwrite=False)
        with pytest.raises(FileExistsError):
            frame.write_file(target, overwrite=False)
def test_bad_type(self):
    """Check that building with a non-string name (``None`` or ``5``) raises ``TypeError``."""
    with pytest.raises(TypeError):
        # noinspection PyTypeChecker
        TypedDfBuilder(None).build()

    with pytest.raises(TypeError):
        # noinspection PyTypeChecker
        TypedDfBuilder(5).build()
def test_mkdir(self):
    """Exercise ``write_file`` with and without ``mkdirs=True``.

    Writing with ``mkdirs=True`` must succeed; writing without it is
    expected to raise ``FileNotFoundError``.
    """
    pair_type = TypedDfBuilder("a").reserve("x", "y").build()
    frame = pair_type.convert(pd.DataFrame([pd.Series({"x": "cat", "y": "dog"})]))

    with tmpdir() as folder:
        frame.write_file(folder / "a.csv", mkdirs=True)

    with tmpdir() as folder:
        with pytest.raises(FileNotFoundError):
            frame.write_file(folder / "b.csv")
def test_bad_reserve(self):
    """Check that reserving ``level_0`` or ``index`` raises ``ClashError``.

    Each case is tried as both a column and an index name.
    """
    for as_index in (True, False):
        with pytest.raises(ClashError):
            TypedDfBuilder("a").reserve("level_0", index=as_index)
        with pytest.raises(ClashError):
            TypedDfBuilder("a").reserve("abc", "level_0", index=as_index)
        with pytest.raises(ClashError):
            TypedDfBuilder("a").reserve("abc", "index", index=as_index)
def test_drop(self):
    """Check that a column registered with ``drop`` is removed on convert.

    Other columns must be left alone.
    """
    dropping_type = TypedDfBuilder("a").reserve("column").drop("trash").build()
    typing_info: DfTyping = dropping_type.get_typing()
    assert typing_info.columns_to_drop == {"trash"}

    # No column named "trash": nothing is removed.
    untouched = dropping_type.convert(pd.DataFrame([pd.Series({"x": "x", "zz": "y"})]))
    assert untouched.column_names() == ["x", "zz"]

    # The "trash" column is gone after conversion.
    cleaned = dropping_type.convert(pd.DataFrame([pd.Series({"x": "x", "trash": "y"})]))
    assert cleaned.column_names() == ["x"]
def test_require_and_reserve_col(self):
    """Check the typing produced by requiring and reserving plain columns.

    Both names must land in the column lists, and the index lists and
    verifications must stay empty.
    """
    built = TypedDfBuilder("a").require("column").reserve("reserved").build()
    typing_info: DfTyping = built.get_typing()

    assert typing_info.required_columns == ["column"]
    assert typing_info.reserved_columns == ["reserved"]
    assert typing_info.required_index_names == []
    assert typing_info.reserved_index_names == []
    assert typing_info.verifications == []
def test_pass_io_options(self):
    """Check that write kwargs registered for CSV (``sep="&"``) are used by ``write_file``."""
    builder = TypedDfBuilder("a").reserve("x", "y")
    amp_type = builder.add_write_kwargs(FileFormat.csv, sep="&").build()
    frame = amp_type.convert(pd.DataFrame([pd.Series({"x": "cat", "y": "dog"})]))

    with tmpfile(".csv") as target:
        frame.write_file(target)
        written = target.read_text(encoding="utf8").splitlines()
        assert written == ["x&y", "cat&dog"]
def test_read_instances_empty_fields(self):
    """Check that a field-less dataclass instance gives one row and no ``animal`` column."""

    @dataclass(frozen=True)
    class Dc:
        pass

    animal_type = TypedDfBuilder("T").reserve("animal", dtype=str).build()
    frame = animal_type.from_dataclass_instances([Dc()])

    assert len(frame) == 1
    assert "animal" not in frame.columns
def test_create_any(self):
    """Check the dataclass fields generated for columns declared without a dtype.

    The required column must be typed ``Any`` and the reserved column
    ``Optional[Any]``.
    """
    greeting_type = TypedDfBuilder("T").require("greeting").reserve("bonus").build()
    dc = greeting_type.create_dataclass()

    assert dc is not None
    assert issubclass(dc, TypedDfDataclass)
    assert len(dc.get_fields()) == 2

    # Required column comes first.
    assert dc.get_fields()[0].name == "greeting"
    assert dc.get_fields()[0].type == Any

    # Reserved column follows and is optional.
    assert dc.get_fields()[1].name == "bonus"
    assert dc.get_fields()[1].type == Optional[Any]
def test_read_instances(self):
    """Check that dataclass instances become rows, in order, with their field values."""

    @dataclass(frozen=True)
    class Dc:
        animal: str
        val: Optional[int]

    animal_type = TypedDfBuilder("T").require("animal", dtype=str).reserve("age", dtype=int).build()
    records = [Dc("cat", 1), Dc("kitten", 2)]
    frame = animal_type.from_dataclass_instances(records)

    assert len(frame) == 2
    assert frame.to_numpy().tolist() == [["cat", 1], ["kitten", 2]]
def test_attrs_hard(self):
    """Round-trip a NumPy matrix stored in ``attrs`` through the sidecar file.

    After reading back, the matrix is expected as nested lists of strings.
    The ``.attrs.json`` sidecar is removed afterwards even if the test fails.
    """
    sidecar = None
    try:
        pair_type = TypedDfBuilder("a").reserve("x", "y").build()
        frame = pair_type.convert(pd.DataFrame([pd.Series({"x": "cat", "y": "kitten"})]))
        frame.attrs["matrix"] = np.zeros((2, 2))
        with tmpfile(".csv") as target:
            frame.write_file(target, attrs=True)
            sidecar = Path(str(target) + ".attrs.json")
            assert sidecar.exists()
            frame = pair_type.read_file(target, attrs=True)
            expected_attrs = {"matrix": [["0.0", "0.0"], ["0.0", "0.0"]]}
            assert frame.attrs == expected_attrs
    finally:
        # The sidecar is not covered by tmpfile(), so clean it up here.
        if sidecar is not None:
            sidecar.unlink(missing_ok=True)
def test_attrs(self):
    """Round-trip a simple string attribute through the ``.attrs.json`` sidecar.

    Writes a frame with ``attrs=True``, checks that the sidecar exists and
    contains the attribute (ignoring whitespace), then reads it back and
    compares ``attrs``. The sidecar is removed afterwards even on failure.
    """
    meta = None
    try:
        t = TypedDfBuilder("a").reserve("x", "y").build()
        df = t.convert(pd.DataFrame([pd.Series(dict(x="cat", y="kitten"))]))
        df.attrs["fruit"] = "apple"
        with tmpfile(".csv") as path:
            df.write_file(path, attrs=True)
            meta = Path(str(path) + ".attrs.json")
            assert meta.exists()
            # Strip ALL newlines and spaces so the check does not depend on
            # how the JSON was indented; the expected text must therefore
            # contain no spaces either.
            data = meta.read_text(encoding="utf-8").replace("\n", "").replace(" ", "")
            assert data == '{"fruit":"apple"}'
            df = t.read_file(path, attrs=True)
            assert df.attrs == {"fruit": "apple"}
    finally:
        # The sidecar is not covered by tmpfile(), so clean it up here.
        if meta is not None:
            meta.unlink(missing_ok=True)
def test_condition(self):
    """Check that ``verify`` conditions are recorded and enforced.

    ``always_ok`` must appear in the typing's verifications, and converting
    with ``always_fail`` must raise ``VerificationFailedError``.
    """
    passing_type = TypedDfBuilder("a").verify(always_ok).build()
    typing_info: DfTyping = passing_type.get_typing()
    assert typing_info.required_columns == []
    assert typing_info.required_index_names == []
    assert typing_info.verifications == [always_ok]
    TypedDf(pd.DataFrame())

    failing_type = TypedDfBuilder("a").verify(always_fail).build()
    with pytest.raises(VerificationFailedError):
        failing_type.convert(pd.DataFrame())
def test_require_and_reserve_index(self):
    """Check the typing produced by requiring and reserving index names.

    Both names must land in the index lists and the combined ``known_*``
    lists, and the column lists must stay empty.
    """
    builder = TypedDfBuilder("a")
    builder = builder.require("column", index=True).reserve("reserved", index=True)
    built = builder.build()
    typing_info: DfTyping = built.get_typing()

    # Nothing is registered as a regular column.
    assert typing_info.required_columns == []
    assert typing_info.reserved_columns == []

    # Both names are index levels.
    assert typing_info.required_index_names == ["column"]
    assert typing_info.reserved_index_names == ["reserved"]

    # Aggregated views.
    assert typing_info.known_index_names == ["column", "reserved"]
    assert typing_info.known_column_names == []
    assert typing_info.known_names == ["column", "reserved"]
    assert typing_info.verifications == []
def test_already_added(self):
    """Check that registering the name ``"a"`` twice raises ``ClashError``.

    Every combination is exercised: the first and the second registration
    can each be ``require`` or ``reserve``, and each can target a column or
    an index level.
    """
    for cola in [True, False]:
        for indexa in [True, False]:
            for colb in [True, False]:
                for indexb in [True, False]:
                    builder = TypedDfBuilder("a")
                    if cola:
                        builder = builder.require("a", index=indexa)
                    else:
                        # Rebind ``builder`` here, never the loop flag ``cola``:
                        # overwriting the flag would change which branch the
                        # remaining inner iterations take.
                        builder = builder.reserve("a", index=indexa)
                    with pytest.raises(ClashError):
                        if colb:
                            builder.require("a", index=indexb)
                        else:
                            builder.reserve("a", index=indexb)
def typed(cls, name: str, doc: Optional[str] = None) -> TypedDfBuilder:
    """
    Starts the definition of a new DataFrame type with flexible requirements.

    The class that is eventually built enforces its constraints and is a
    subclass of :class:`typeddfs.typed_dfs.TypedDf`.

    Args:
        name: Name to give the new class
        doc: Docstring to give the new class

    Returns:
        A builder (builder pattern) on which further calls can be chained

    Example:
        ``TypedDfs.typed("MyClass").require("name", index=True).build()``
    """
    builder = TypedDfBuilder(name, doc)
    return builder
def test_dir_hash(self):
    """Check that ``dir_hash=True`` records a 64-character hash that reads verify.

    The directory checksum file must hold one entry for the written file,
    and ``read_file`` must accept both ``dir_hash=True`` and the hash
    itself via ``hex_hash``.
    """
    pair_type = TypedDfBuilder("a").reserve("x", "y").build()
    frame = pair_type.convert(pd.DataFrame([pd.Series({"x": "cat", "y": "kitten"})]))

    with tmpfile(".csv") as target:
        dirsum_path = Checksums().get_dirsum_of_file(target)
        # Start from a clean slate in case an earlier run left a checksum file.
        dirsum_path.unlink(missing_ok=True)

        frame.write_file(target, dir_hash=True)
        assert dirsum_path.exists()

        entries = Checksums().load_dirsum_exact(dirsum_path)
        assert list(entries.keys()) == [target]
        digest = entries[target]
        assert len(digest) == 64

        # Both verification modes must accept the file.
        pair_type.read_file(target, dir_hash=True)
        pair_type.read_file(target, hex_hash=digest)
def test_file_hash(self):
    """Check that ``file_hash=True`` records a 64-character hash that reads verify.

    The per-file checksum must point at the written file, and ``read_file``
    must accept both ``file_hash=True`` and the hash itself via ``hex_hash``.
    """
    pair_type = TypedDfBuilder("a").reserve("x", "y").build()
    frame = pair_type.convert(pd.DataFrame([pd.Series({"x": "cat", "y": "dog"})]))

    # The written bytes differ between operating systems (\n vs \r\n),
    # so only the shape of the hash is checked, not its exact value.
    with tmpfile(".csv") as target:
        frame.write_file(target, file_hash=True)
        filesum_path = Checksums().get_filesum_of_file(target)
        assert filesum_path.exists()

        record = Checksums().load_filesum_of_file(target)
        assert record.file_path == target
        digest = record.hash_value
        assert len(digest) == 64

        # Both verification modes must accept the file.
        pair_type.read_file(target, file_hash=True)
        pair_type.read_file(target, hex_hash=digest)
def test_dtype_post_process(self):
    """Check the order of operations during convert.

    Expected order:
      1. dtype conversions
      2. post-processing
      3. final conditions
    """

    def add_nine(frame: BaseDf) -> BaseDf:
        # The dtype must already be converted when post-processing runs.
        assert frame["x"].dtype == np.float32
        shifted = frame.copy()
        shifted["x"] += 9
        return shifted

    def still_float32(frame: BaseDf):
        # The final condition sees the post-processed frame.
        if frame["x"].dtype == np.float32:
            return None
        return "failed"

    builder = TypedDfBuilder("a").reserve("x", dtype=np.float32)
    builder = builder.post(add_nine).verify(still_float32)
    ordered_type = builder.build()

    result = ordered_type.convert(pd.DataFrame([pd.Series({"x": "0.5"})]))
    assert result.to_numpy().tolist() == [[9.5]]
def test_create_empty(self):
    """Check that a type with no declared columns gives a field-less ``TypedDfDataclass``."""
    empty_type = TypedDfBuilder("T").build()
    dc = empty_type.create_dataclass()

    assert dc is not None
    assert issubclass(dc, TypedDfDataclass)
    assert dc.get_fields() == []
def test_strict(self):
    """Check ``strict`` for columns and index names independently.

    Three configurations are covered: strict columns only, strict index
    only, and neither strict. In each strict case an unexpected name must
    raise the matching error, and the non-strict dimension must accept
    unknown names.
    """
    # strict columns but not index
    t = TypedDfBuilder("a").strict(index=False, cols=True).build()
    typ: DfTyping = t.get_typing()
    assert typ.more_indices_allowed
    assert not typ.more_columns_allowed
    t.convert(pd.DataFrame([pd.Series(dict(x="x"))]).set_index("x"))
    with pytest.raises(UnexpectedColumnError):
        t.convert(pd.DataFrame([pd.Series(dict(x="x"))]))

    # strict index but not columns
    t = TypedDfBuilder("a").strict(True, False).build()
    typ = t.get_typing()
    assert typ.more_columns_allowed
    assert not typ.more_indices_allowed
    t.convert(pd.DataFrame([pd.Series(dict(x="x"))]))
    # Build and sanity-check the indexed frame OUTSIDE the raises block so
    # that only ``t.convert`` can satisfy the expected exception.
    df = PrettyDf(pd.DataFrame([pd.Series(dict(x="x"))]).set_index("x"))
    assert df.index_names() == ["x"]
    assert df.column_names() == []
    with pytest.raises(UnexpectedIndexNameError):
        t.convert(df)

    # neither strict
    t = TypedDfBuilder("a").strict(False, False).build()
    t.convert(pd.DataFrame([pd.Series(dict(x="x"))]))
def test_read_write_insecure(self):
    """Check that secure-only and recommended-only types reject disallowed formats.

    A type built with ``secure()`` must refuse to read a plain ``http`` URL
    and must raise ``FormatInsecureError`` when reading or writing any
    format whose ``is_secure`` flag is false. A type built with
    ``recommended_only()`` must raise ``FormatDiscouragedError`` for any
    format whose ``is_recommended`` flag is false. Every suffix of every
    ``FileFormat`` member is exercised.
    """
    secure_type = TypedDfBuilder("a").secure().build()
    bad_type = TypedDfBuilder("a").recommended_only().build()
    with pytest.raises(UnsupportedOperationError):
        # noinspection HttpUrlsUsage
        secure_type.read_url("http://google.com")  # nosec
    # Empty frames of each type, used for the write checks below.
    secure = secure_type.new_df()
    bad = bad_type.new_df()
    for fmt in FileFormat:
        for suffix in fmt.suffixes:
            try:
                with tmpfile(suffix) as path:
                    # should always complain about insecurity FIRST
                    if not fmt.is_secure:
                        with pytest.raises(FormatInsecureError):
                            secure_type.read_file(path)
                        with pytest.raises(FormatInsecureError):
                            secure.write_file(path)
                    # Make sure no file is left over before the next checks.
                    path.unlink(missing_ok=True)
                    if not fmt.is_recommended:
                        with pytest.raises(FormatDiscouragedError):
                            bad_type.read_file(path)
                        with pytest.raises(FormatDiscouragedError):
                            bad.write_file(path)
            except Exception:
                # Log which suffix failed, then let the original error propagate.
                logger.error(f"Failed on suffix {suffix}")
                raise
def test_typed_subclass(self):
    """Check that ``subclass(parent)`` makes the built type a subclass of ``parent`` only."""
    parent = TypedDfBuilder("t1").build()
    child = TypedDfBuilder("t2").subclass(parent).build()

    assert issubclass(child, parent)
    assert not issubclass(parent, child)
def test_drop_clash(self):
    """Check that reserving and dropping the same column raises ``ClashError`` at build time."""
    conflicted = TypedDfBuilder("a").reserve("trash").drop("trash")
    with pytest.raises(ClashError):
        conflicted.build()
def test_secure(self):
    """Check which hash algorithms are accepted with and without ``secure()``.

    ``sha256`` with ``secure()`` and ``sha1`` without it must both build;
    ``sha1`` with ``secure()`` must raise ``DfTypeConstructionError``.
    """
    # Allowed combinations.
    TypedDfBuilder("a").secure().hash(alg="sha256").build()
    TypedDfBuilder("a").hash(alg="sha1").build()

    # Rejected combination.
    insecure_combo = TypedDfBuilder("a").secure().hash(alg="sha1")
    with pytest.raises(DfTypeConstructionError):
        insecure_combo.build()
def test_read_instances_empty(self):
    """Check that an empty list of instances gives an empty frame."""
    animal_type = TypedDfBuilder("T").require("animal", dtype=str).build()
    frame = animal_type.from_dataclass_instances([])
    assert len(frame) == 0
def _get(name: str, t: Type[AbsDf] = PlainTypedDf) -> LazyDf:
    """
    Builds a lazily loaded DataFrame for a CSV file in the seaborn-data repository.

    Args:
        name: Base name of the dataset; ``.csv`` is appended to form the URL
        t: DataFrame type to load as; if ``None``, a new type is built
           from the name that ``FileFormat.split`` yields for the URL

    Returns:
        A ``LazyDf`` created from the URL and the chosen type
    """
    source_url = f"https://raw.githubusercontent.com/mwaskom/seaborn-data/master/{name}.csv"
    df_type = t
    if df_type is None:
        # Only the first element of the split result is used.
        base, _, _ = FileFormat.split(source_url)
        df_type = TypedDfBuilder(base.name).build()
    return LazyDf.from_source(source_url, df_type)
def test_to_instances_empty(self):
    """Check that an empty frame gives an empty list of dataclass instances."""
    animal_type = TypedDfBuilder("T").reserve("animal", dtype=str).build()
    empty_frame: animal_type = animal_type.of([])
    produced = empty_frame.to_dataclass_instances()
    assert produced == []