Ejemplo n.º 1
0
 def test_reserve_dtype(self):
     """A reserved ``float32`` column is cast on convert; unparsable text raises ``ValueError``."""
     df_type = TypedDfBuilder("a").reserve("x", dtype=np.float32).build()
     parsable = pd.DataFrame([pd.Series({"x": "0.5"})])
     converted = df_type.convert(parsable)
     assert converted.column_names() == ["x"]
     assert converted.to_numpy().tolist() == [[0.5]]
     # A value that cannot be read as a float must be rejected by convert().
     unparsable = pd.DataFrame([pd.Series({"x": "kitten"})])
     with pytest.raises(ValueError):
         df_type.convert(unparsable)
Ejemplo n.º 2
0
 def test_to_instances(self):
     """Rows become dataclass instances that keep row order and can be sorted."""
     df_type = TypedDfBuilder("T").require("animal", dtype=str).reserve("age", dtype=int).build()
     # (animal, age) pairs, in the order they are inserted into the frame.
     rows = [
         ("goldfish", 2),
         ("goldfish", 1),
         ("gazelle", 8),
         ("pineapple", 114),
         ("anteater", 11),
     ]
     df: df_type = df_type.of(
         [pd.Series({"animal": animal, "age": age}) for animal, age in rows]
     )
     dc = df_type.create_dataclass()
     instances = df.to_dataclass_instances()
     # Unsorted: same order as the input rows.
     assert instances == [dc(animal, age) for animal, age in rows]
     # Sorted: ordered by animal, then age (same ordering as the plain tuples).
     assert sorted(instances) == [dc(animal, age) for animal, age in sorted(rows)]
Ejemplo n.º 3
0
 def test_no_overwrite(self):
     """A second ``write_file(..., overwrite=False)`` to the same path raises ``FileExistsError``."""
     df_type = TypedDfBuilder("a").reserve("x", "y").build()
     frame = pd.DataFrame([pd.Series({"x": "cat", "y": "dog"})])
     df = df_type.convert(frame)
     with tmpfile(".csv") as target:
         # The first write is expected to succeed ...
         df.write_file(target, overwrite=False)
         # ... and the second must refuse to replace the file it left behind.
         with pytest.raises(FileExistsError):
             df.write_file(target, overwrite=False)
Ejemplo n.º 4
0
 def test_bad_type(self):
     """Building with an invalid first argument (``None`` or ``5``) raises ``TypeError``."""
     for bad_name in (None, 5):
         with pytest.raises(TypeError):
             # noinspection PyTypeChecker
             TypedDfBuilder(bad_name).build()
Ejemplo n.º 5
0
 def test_mkdir(self):
     """``write_file`` works with ``mkdirs=True`` and raises ``FileNotFoundError`` without it."""
     builder = TypedDfBuilder("a").reserve("x", "y")
     df_type = builder.build()
     df = df_type.convert(pd.DataFrame([pd.Series({"x": "cat", "y": "dog"})]))
     with tmpdir() as folder:
         df.write_file(folder / "a.csv", mkdirs=True)
     # Same kind of target, but without mkdirs the write is expected to fail.
     with tmpdir() as folder, pytest.raises(FileNotFoundError):
         df.write_file(folder / "b.csv")
Ejemplo n.º 6
0
 def test_bad_reserve(self):
     """Reserving ``level_0`` or ``index`` raises ``ClashError``, as a column or as an index."""
     clashing_names = [
         ("level_0",),
         ("abc", "level_0"),
         ("abc", "index"),
     ]
     for index in (True, False):
         for names in clashing_names:
             with pytest.raises(ClashError):
                 TypedDfBuilder("a").reserve(*names, index=index)
Ejemplo n.º 7
0
 def test_drop(self):
     """Columns registered with ``drop`` are recorded in the typing and removed on convert."""
     df_type = TypedDfBuilder("a").reserve("column").drop("trash").build()
     typing_info: DfTyping = df_type.get_typing()
     assert typing_info.columns_to_drop == {"trash"}
     # No "trash" column in the input: both columns are kept.
     untouched = df_type.convert(pd.DataFrame([pd.Series({"x": "x", "zz": "y"})]))
     assert untouched.column_names() == ["x", "zz"]
     # "trash" is present in the input: only "x" remains.
     stripped = df_type.convert(pd.DataFrame([pd.Series({"x": "x", "trash": "y"})]))
     assert stripped.column_names() == ["x"]
Ejemplo n.º 8
0
 def test_require_and_reserve_col(self):
     """Required and reserved columns show up in the typing; index names and checks stay empty."""
     builder = TypedDfBuilder("a").require("column")
     df_type = builder.reserve("reserved").build()
     typing_info: DfTyping = df_type.get_typing()
     # Columns
     assert typing_info.required_columns == ["column"]
     assert typing_info.reserved_columns == ["reserved"]
     # Index names
     assert typing_info.required_index_names == []
     assert typing_info.reserved_index_names == []
     # Verification callbacks
     assert typing_info.verifications == []
Ejemplo n.º 9
0
 def test_pass_io_options(self):
     """Write kwargs registered for CSV (``sep="&"``) are used by ``write_file``."""
     builder = TypedDfBuilder("a").reserve("x", "y")
     df_type = builder.add_write_kwargs(FileFormat.csv, sep="&").build()
     df = df_type.convert(pd.DataFrame([pd.Series({"x": "cat", "y": "dog"})]))
     with tmpfile(".csv") as path:
         df.write_file(path)
         written = path.read_text(encoding="utf8")
         # Header and the single data row, both joined with the custom separator.
         assert written.splitlines() == ["x&y", "cat&dog"]
Ejemplo n.º 10
0
    def test_read_instances_empty_fields(self):
        """One instance of a field-less dataclass gives one row without the reserved column."""

        @dataclass(frozen=True)
        class Dc:
            pass

        df_type = TypedDfBuilder("T").reserve("animal", dtype=str).build()
        instances = [Dc()]
        df = df_type.from_dataclass_instances(instances)
        assert len(df) == 1
        # "animal" is only reserved, so it is not expected to appear.
        assert "animal" not in df.columns
Ejemplo n.º 11
0
 def test_create_any(self):
     """Untyped columns give ``Any`` fields; the reserved one gives ``Optional[Any]``."""
     df_type = TypedDfBuilder("T").require("greeting").reserve("bonus").build()
     dc = df_type.create_dataclass()
     assert dc is not None
     assert issubclass(dc, TypedDfDataclass)
     fields = dc.get_fields()
     assert len(fields) == 2
     required_field = fields[0]
     reserved_field = fields[1]
     assert required_field.name == "greeting"
     assert required_field.type == Any
     assert reserved_field.name == "bonus"
     assert reserved_field.type == Optional[Any]
Ejemplo n.º 12
0
    def test_read_instances(self):
        """Two dataclass instances convert into a two-row frame holding their values."""

        # NOTE(review): the second field is named ``val`` while the reserved
        # column below is ``age`` -- confirm that this mismatch is intentional.
        @dataclass(frozen=True)
        class Dc:
            animal: str
            val: Optional[int]

        df_type = TypedDfBuilder("T").require("animal", dtype=str).reserve("age", dtype=int).build()
        instances = [Dc("cat", 1), Dc("kitten", 2)]
        df = df_type.from_dataclass_instances(instances)
        assert len(df) == 2
        rows = df.to_numpy().tolist()
        assert rows == [["cat", 1], ["kitten", 2]]
Ejemplo n.º 13
0
 def test_attrs_hard(self):
     """A NumPy matrix stored in ``attrs`` is read back as nested lists of strings."""
     sidecar = None
     try:
         df_type = TypedDfBuilder("a").reserve("x", "y").build()
         frame = pd.DataFrame([pd.Series({"x": "cat", "y": "kitten"})])
         df = df_type.convert(frame)
         df.attrs["matrix"] = np.zeros((2, 2))
         with tmpfile(".csv") as path:
             df.write_file(path, attrs=True)
             # The attrs are expected next to the data file, with this suffix appended.
             sidecar = Path(str(path) + ".attrs.json")
             assert sidecar.exists()
             reloaded = df_type.read_file(path, attrs=True)
             assert reloaded.attrs == {"matrix": [["0.0", "0.0"], ["0.0", "0.0"]]}
     finally:
         # Remove the sidecar ourselves, whether or not the assertions passed.
         if sidecar is not None:
             sidecar.unlink(missing_ok=True)
Ejemplo n.º 14
0
 def test_attrs(self):
     """``attrs`` are written to a ``.attrs.json`` sidecar file and restored on read."""
     sidecar = None
     try:
         df_type = TypedDfBuilder("a").reserve("x", "y").build()
         frame = pd.DataFrame([pd.Series({"x": "cat", "y": "kitten"})])
         df = df_type.convert(frame)
         df.attrs["fruit"] = "apple"
         with tmpfile(".csv") as path:
             df.write_file(path, attrs=True)
             sidecar = Path(str(path) + ".attrs.json")
             assert sidecar.exists()
             # Strip newlines and two-space runs so the comparison ignores pretty-printing.
             raw = sidecar.read_text(encoding="utf-8")
             compact = raw.replace("\n", "").replace("  ", "")
             assert compact == '{"fruit": "apple"}'
             reloaded = df_type.read_file(path, attrs=True)
             assert reloaded.attrs == {"fruit": "apple"}
     finally:
         # Remove the sidecar ourselves, whether or not the assertions passed.
         if sidecar is not None:
             sidecar.unlink(missing_ok=True)
Ejemplo n.º 15
0
 def test_condition(self):
     """``verify`` callbacks are stored in the typing; a failing one makes ``convert`` raise."""
     passing_type = TypedDfBuilder("a").verify(always_ok).build()
     typing_info: DfTyping = passing_type.get_typing()
     assert typing_info.required_columns == []
     assert typing_info.required_index_names == []
     assert typing_info.verifications == [always_ok]
     # NOTE(review): this builds the base ``TypedDf`` rather than using the type
     # built above -- confirm whether ``convert`` on that type was intended.
     TypedDf(pd.DataFrame())
     failing_type = TypedDfBuilder("a").verify(always_fail).build()
     with pytest.raises(VerificationFailedError):
         failing_type.convert(pd.DataFrame())
Ejemplo n.º 16
0
 def test_require_and_reserve_index(self):
     """Names added with ``index=True`` are tracked as index names, not as columns."""
     builder = TypedDfBuilder("a").require("column", index=True)
     builder = builder.reserve("reserved", index=True)
     df_type = builder.build()
     typing_info: DfTyping = df_type.get_typing()
     # Nothing was registered as a column.
     assert typing_info.required_columns == []
     assert typing_info.reserved_columns == []
     # Both names are index names, in insertion order.
     assert typing_info.required_index_names == ["column"]
     assert typing_info.reserved_index_names == ["reserved"]
     assert typing_info.known_index_names == ["column", "reserved"]
     assert typing_info.known_column_names == []
     assert typing_info.known_names == ["column", "reserved"]
     assert typing_info.verifications == []
Ejemplo n.º 17
0
 def test_already_added(self):
     """Adding the name ``"a"`` a second time raises ``ClashError``.

     All 16 combinations are exercised: the first and the second addition can
     each be a ``require`` or a ``reserve``, and each can use ``index=True``
     or ``index=False``.
     """
     choices = (True, False)
     for first_required in choices:
         for first_index in choices:
             for second_required in choices:
                 for second_index in choices:
                     builder = TypedDfBuilder("a")
                     # Rebind ``builder`` in both branches; the loop flags must
                     # stay booleans so the ``reserve`` branch is taken on every
                     # iteration where ``first_required`` is False.
                     if first_required:
                         builder = builder.require("a", index=first_index)
                     else:
                         builder = builder.reserve("a", index=first_index)
                     with pytest.raises(ClashError):
                         if second_required:
                             builder.require("a", index=second_index)
                         else:
                             builder.reserve("a", index=second_index)
Ejemplo n.º 18
0
    def typed(cls, name: str, doc: Optional[str] = None) -> TypedDfBuilder:
        """
        Starts the definition of a new type with flexible requirements.
        The resulting class will enforce constraints and subclass
        :class:`typeddfs.typed_dfs.TypedDf`.

        Args:
            name: The name to give the new class
            doc: The docstring to give the new class

        Returns:
            A builder (builder pattern) on which further calls can be chained

        Example:
            ``TypedDfs.typed("MyClass").require("name", index=True).build()``
        """
        builder = TypedDfBuilder(name, doc)
        return builder
Ejemplo n.º 19
0
 def test_dir_hash(self):
     """``write_file(dir_hash=True)`` records a 64-character hash for the file in the dirsum file.

     Reading back with ``dir_hash=True`` or with the recorded hash passed as
     ``hex_hash`` must then succeed.
     """
     df_type = TypedDfBuilder("a").reserve("x", "y").build()
     df = df_type.convert(pd.DataFrame([pd.Series({"x": "cat", "y": "kitten"})]))
     with tmpfile(".csv") as path:
         dirsum_path = Checksums().get_dirsum_of_file(path)
         # Start without a dirsum file so the existence check below is meaningful.
         dirsum_path.unlink(missing_ok=True)
         df.write_file(path, dir_hash=True)
         assert dirsum_path.exists()
         entries = Checksums().load_dirsum_exact(dirsum_path)
         assert list(entries.keys()) == [path]
         digest = entries[path]
         assert len(digest) == 64
         df_type.read_file(path, dir_hash=True)
         df_type.read_file(path, hex_hash=digest)
Ejemplo n.º 20
0
 def test_file_hash(self):
     """``write_file(file_hash=True)`` writes a per-file checksum holding a 64-character hash.

     Reading back with ``file_hash=True`` or with the recorded hash passed as
     ``hex_hash`` must then succeed.
     """
     df_type = TypedDfBuilder("a").reserve("x", "y").build()
     df = df_type.convert(pd.DataFrame([pd.Series({"x": "cat", "y": "dog"})]))
     # The written bytes depend on the OS (\n vs \r\n), so the exact hash
     # cannot be checked; only its length is.
     with tmpfile(".csv") as path:
         df.write_file(path, file_hash=True)
         filesum_path = Checksums().get_filesum_of_file(path)
         assert filesum_path.exists()
         record = Checksums().load_filesum_of_file(path)
         assert record.file_path == path
         digest = record.hash_value
         assert len(digest) == 64
         df_type.read_file(path, file_hash=True)
         df_type.read_file(path, hex_hash=digest)
Ejemplo n.º 21
0
    def test_dtype_post_process(self):
        """Check the order of the conversion steps.

        Expected order:
        1. dtype conversions
        2. post-processing
        3. final conditions
        """

        def post(dd: BaseDf) -> BaseDf:
            # The dtype conversion must already have happened at this point.
            assert dd["x"].dtype == np.float32
            shifted = dd.copy()
            shifted["x"] += 9
            return shifted

        def cond(dd: BaseDf):
            # A verification returns None on success or a message on failure.
            if dd["x"].dtype == np.float32:
                return None
            return "failed"

        builder = TypedDfBuilder("a").reserve("x", dtype=np.float32)
        df_type = builder.post(post).verify(cond).build()
        df = df_type.convert(pd.DataFrame([pd.Series({"x": "0.5"})]))
        # 0.5 (after the cast) + 9 (added by the post-processing step)
        assert df.to_numpy().tolist() == [[9.5]]
Ejemplo n.º 22
0
 def test_create_empty(self):
     """A type declared without any columns gives a dataclass without fields."""
     empty_type = TypedDfBuilder("T").build()
     dc = empty_type.create_dataclass()
     assert dc is not None
     assert issubclass(dc, TypedDfDataclass)
     fields = dc.get_fields()
     assert fields == []
Ejemplo n.º 23
0
 def test_strict(self):
     """Check ``strict`` for columns and for index names independently.

     A strict dimension must reject names that were never declared
     (``UnexpectedColumnError`` / ``UnexpectedIndexNameError``), while a
     non-strict dimension accepts them.
     """
     # strict columns but not index
     t = TypedDfBuilder("a").strict(index=False, cols=True).build()
     typ: DfTyping = t.get_typing()
     assert typ.more_indices_allowed
     assert not typ.more_columns_allowed
     t.convert(pd.DataFrame([pd.Series(dict(x="x"))]).set_index("x"))
     with pytest.raises(UnexpectedColumnError):
         t.convert(pd.DataFrame([pd.Series(dict(x="x"))]))
     # strict index but not columns
     t = TypedDfBuilder("a").strict(True, False).build()
     typ = t.get_typing()
     assert typ.more_columns_allowed
     assert not typ.more_indices_allowed
     t.convert(pd.DataFrame([pd.Series(dict(x="x"))]))
     # Build and sanity-check the input up front, so that only the conversion
     # itself runs under ``pytest.raises`` and nothing else can satisfy it.
     df = PrettyDf(pd.DataFrame([pd.Series(dict(x="x"))]).set_index("x"))
     assert df.index_names() == ["x"]
     assert df.column_names() == []
     with pytest.raises(UnexpectedIndexNameError):
         t.convert(df)
     # neither strict
     t = TypedDfBuilder("a").strict(False, False).build()
     t.convert(pd.DataFrame([pd.Series(dict(x="x"))]))
Ejemplo n.º 24
0
 def test_read_write_insecure(self):
     """Secure types refuse insecure formats; recommended-only types refuse discouraged ones.

     Every suffix of every ``FileFormat`` is tried for both reading and writing.
     A secure type must also refuse to read from a plain ``http://`` URL.
     """
     secure_type = TypedDfBuilder("a").secure().build()
     recommended_type = TypedDfBuilder("a").recommended_only().build()
     with pytest.raises(UnsupportedOperationError):
         # noinspection HttpUrlsUsage
         secure_type.read_url("http://google.com")  # nosec
     secure_df = secure_type.new_df()
     recommended_df = recommended_type.new_df()
     cases = ((fmt, suffix) for fmt in FileFormat for suffix in fmt.suffixes)
     for fmt, suffix in cases:
         try:
             with tmpfile(suffix) as path:
                 # should always complain about insecurity FIRST
                 if not fmt.is_secure:
                     with pytest.raises(FormatInsecureError):
                         secure_type.read_file(path)
                     with pytest.raises(FormatInsecureError):
                         secure_df.write_file(path)
                 # NOTE(review): presumably clears anything left on disk before
                 # the discouraged-format checks -- confirm.
                 path.unlink(missing_ok=True)
                 if not fmt.is_recommended:
                     with pytest.raises(FormatDiscouragedError):
                         recommended_type.read_file(path)
                     with pytest.raises(FormatDiscouragedError):
                         recommended_df.write_file(path)
         except Exception:
             # Record which suffix broke, then let the failure propagate.
             logger.error(f"Failed on suffix {suffix}")
             raise
Ejemplo n.º 25
0
 def test_typed_subclass(self):
     """A type built with ``subclass(parent)`` is a subclass of ``parent``, not vice versa."""
     parent = TypedDfBuilder("t1").build()
     child_builder = TypedDfBuilder("t2").subclass(parent)
     child = child_builder.build()
     assert issubclass(child, parent)
     assert not issubclass(parent, child)
Ejemplo n.º 26
0
 def test_drop_clash(self):
     """Reserving and dropping the same column makes ``build`` raise ``ClashError``."""
     builder = TypedDfBuilder("a").reserve("trash")
     builder = builder.drop("trash")
     # The clash is expected when the type is built, not when drop() is called.
     with pytest.raises(ClashError):
         builder.build()
Ejemplo n.º 27
0
 def test_secure(self):
     """``secure()`` builds with sha256 but not with sha1; sha1 alone is accepted."""
     secure_sha256 = TypedDfBuilder("a").secure().hash(alg="sha256")
     secure_sha256.build()
     plain_sha1 = TypedDfBuilder("a").hash(alg="sha1")
     plain_sha1.build()
     # The whole chain stays inside the context: either hash() or build() may raise.
     with pytest.raises(DfTypeConstructionError):
         TypedDfBuilder("a").secure().hash(alg="sha1").build()
Ejemplo n.º 28
0
 def test_read_instances_empty(self):
     """An empty list of instances converts to an empty frame."""
     df_type = TypedDfBuilder("T").require("animal", dtype=str).build()
     no_instances = []
     df = df_type.from_dataclass_instances(no_instances)
     assert len(df) == 0
Ejemplo n.º 29
0
def _get(name: str, t: Type[AbsDf] = PlainTypedDf) -> LazyDf:
    """
    Wraps one of the seaborn-data example CSV files in a lazy DataFrame.

    Args:
        name: File name without the ``.csv`` suffix, as found in the seaborn-data repository
        t: The DataFrame type to load as; if ``None``, a new type is built
           and named after the path component that ``FileFormat.split`` returns

    Returns:
        A ``LazyDf`` created from the file's URL and the chosen type
    """
    source = f"https://raw.githubusercontent.com/mwaskom/seaborn-data/master/{name}.csv"
    df_type = t
    if df_type is None:
        # Only the first element of the split is needed, to name the new type.
        stem, _, _ = FileFormat.split(source)
        df_type = TypedDfBuilder(stem.name).build()
    return LazyDf.from_source(source, df_type)
Ejemplo n.º 30
0
 def test_to_instances_empty(self):
     """An empty frame converts to an empty list of dataclass instances."""
     df_type = TypedDfBuilder("T").reserve("animal", dtype=str).build()
     no_rows = []
     df: df_type = df_type.of(no_rows)
     instances = df.to_dataclass_instances()
     assert instances == []