Exemple #1
0
 def __init__(self, *args: Any, **kwargs: Any):
     """Build the spec from positional sources plus keyword overrides.

     Every positional argument is merged into one parameter dict, in
     order, through ``_update_dict``. A source may be ``None`` (skipped),
     another ``PartitionSpec`` (its ``jsondict`` is used), a dict, or a
     JSON string. ``kwargs`` are merged last.

     :raises TypeError: if a positional argument has any other type
     :raises SyntaxError: if a ``partition_by`` key also appears in the
         parsed presort expression (a ``SyntaxError`` is also handed to
         ``aot`` for the duplicated-keys check)
     """
     params = ParamDict()
     for source in args:
         if source is None:
             continue
         # Pick the mapping to merge first, then merge in a single place.
         if isinstance(source, PartitionSpec):
             values = source.jsondict
         elif isinstance(source, Dict):
             values = source
         elif isinstance(source, str):
             values = json.loads(source)
         else:
             raise TypeError(f"{source} is not supported")
         self._update_dict(params, values)
     self._update_dict(params, kwargs)

     self._num_partitions = params.get("num_partitions", "0")
     self._algo = params.get("algo", "").lower()
     self._partition_by = params.get("partition_by", [])
     keys_are_unique = len(self._partition_by) == len(set(self._partition_by))
     aot(
         keys_are_unique,
         SyntaxError(f"{self._partition_by} has duplicated keys"),
     )

     presort_exp = params.get_or_none("presort", object)
     self._presort = self._parse_presort_exp(presort_exp)
     overlapping = any(key in self._presort for key in self._partition_by)
     if overlapping:
         raise SyntaxError(
             "partition by overlap with presort: "
             + f"{self._partition_by}, {self._presort}"
         )

     # TODO: currently, size limit not in use
     self._size_limit = to_size(params.get("size_limit", "0"))
     self._row_limit = params.get("row_limit", 0)
Exemple #2
0
 def _load_csv(self, p: List[str], columns: Any = None, **kwargs: Any) -> DataFrame:
     """Read CSV data through the session's reader into a ``SparkDataFrame``.

     ``infer_schema`` and ``header`` are taken out of ``kwargs``; the
     remaining entries are passed to the reader as options. A truthy
     ``infer_schema`` is passed on as ``inferSchema=True``.

     The ``header`` value is stringified and lower-cased:

     * ``"true"``: ``columns`` may be ``None`` (keep everything), a list
       of column names to select, or anything accepted by ``Schema``.
     * ``"false"`` / ``"none"``: ``columns`` is required, either a list
       of names used to rename the loaded columns, or anything accepted
       by ``Schema``, which is then applied at read time.

     :raises InvalidOperationError: no header and ``columns`` is ``None``
     :raises NotImplementedError: any other ``header`` value
     """
     options = ParamDict(kwargs)
     if options.get("infer_schema", False):
         options["inferSchema"] = True
     if "infer_schema" in options:
         del options["infer_schema"]
     header = str(options.get_or_none("header", object)).lower()
     if "header" in options:
         del options["header"]

     reader = self._session.read.format("csv")
     reader.options(**options)

     if header == "true":
         reader.option("header", "true")
         if columns is None:
             return SparkDataFrame(reader.load(p))
         if isinstance(columns, list):  # column names
             selected = reader.load(p)[columns]
             return SparkDataFrame(selected)
         schema = Schema(columns)
         selected = reader.load(p)[schema.names]
         return SparkDataFrame(selected, schema)

     if header not in ["false", "none"]:
         raise NotImplementedError(f"{header} is not supported")

     # From here on the files are read without a header row.
     reader.option("header", "false")
     if columns is None:
         raise InvalidOperationError("columns must be set if without header")
     if isinstance(columns, list):  # column names
         raw = reader.load(p)
         current_names = to_schema(raw).names
         renames = [f"{old} AS {new}" for old, new in zip(current_names, columns)]
         return SparkDataFrame(raw.selectExpr(*renames))
     schema = Schema(columns)
     typed = reader.schema(to_spark_schema(schema)).load(p)
     return SparkDataFrame(typed, schema)
Exemple #3
0
 def __init__(self, *args: Any, **kwargs: Any):  # noqa: C901
     """Build the spec from positional sources plus keyword overrides.

     A single positional string equal to ``"per_row"`` (case-insensitive)
     with no keyword arguments is a shortcut for ``algo="even"`` and
     ``num_partitions="ROWCOUNT"``. Otherwise every positional argument
     is merged in order through ``_update_dict``. A source may be
     ``None`` (skipped), another ``PartitionSpec`` (its ``jsondict`` is
     used), a dict, or a JSON string. ``kwargs`` are merged last.

     ``partition_by`` may be a single string, a list or a tuple; it is
     stored as a list.

     :raises TypeError: if a positional argument has any other type
     :raises SyntaxError: if ``partition_by`` has an unsupported type, or
         one of its keys also appears in the parsed presort expression
         (a ``SyntaxError`` is also handed to ``aot`` for the
         duplicated-keys check)
     """
     params = ParamDict()
     per_row_shortcut = (
         len(args) == 1
         and not kwargs
         and isinstance(args[0], str)
         and args[0].lower() == "per_row"
     )
     if per_row_shortcut:
         params["algo"] = "even"
         params["num_partitions"] = "ROWCOUNT"
     else:
         for source in args:
             if source is None:
                 continue
             # Pick the mapping to merge first, then merge in one place.
             if isinstance(source, PartitionSpec):
                 values = source.jsondict
             elif isinstance(source, Dict):
                 values = source
             elif isinstance(source, str):
                 values = json.loads(source)
             else:
                 raise TypeError(f"{source} is not supported")
             self._update_dict(params, values)
         self._update_dict(params, kwargs)

     self._num_partitions = params.get("num_partitions", "0")
     self._algo = params.get("algo", "").lower()

     if "partition_by" not in params:
         self._partition_by: List[str] = []
     else:
         raw_keys = params["partition_by"]
         if isinstance(raw_keys, str):
             self._partition_by = [raw_keys]
         elif isinstance(raw_keys, (list, tuple)):
             self._partition_by = list(raw_keys)
         else:
             raise SyntaxError(raw_keys)
     keys_are_unique = len(self._partition_by) == len(set(self._partition_by))
     aot(
         keys_are_unique,
         SyntaxError(f"{self._partition_by} has duplicated keys"),
     )

     self._presort = parse_presort_exp(params.get_or_none("presort", object))
     overlapping = any(key in self._presort for key in self._partition_by)
     if overlapping:
         raise SyntaxError(
             "partition by overlap with presort: "
             + f"{self._partition_by}, {self._presort}"
         )

     # TODO: currently, size limit not in use
     self._size_limit = to_size(params.get("size_limit", "0"))
     self._row_limit = params.get("row_limit", 0)
Exemple #4
0
def _load_csv(p: FileParser,
              columns: Any = None,
              **kwargs: Any) -> Tuple[pd.DataFrame, Any]:
    """Load the CSV at ``p.uri`` and return ``(dataframe, schema_or_None)``.

    ``infer_schema`` and ``header`` are taken out of ``kwargs``; the rest
    is forwarded to ``_safe_load_csv``. Unless ``infer_schema`` is truthy,
    ``dtype=object`` is forced.

    Header handling, based on ``str(header)``:

    * ``"True"`` or ``"0"``: the first row is the header. ``columns`` may
      be ``None``, a list of names to select, or anything accepted by
      ``Schema`` (that schema is then returned as the second element).
    * ``None`` or ``"False"`` (the default): no header row. ``columns``
      is required and supplies the column names.

    :raises InvalidOperationError: no header and ``columns`` is ``None``
    :raises NotImplementedError: any other ``header`` value
    """
    options = ParamDict(kwargs)
    if not options.get("infer_schema", False):
        options["dtype"] = object
    if "infer_schema" in options:
        del options["infer_schema"]

    header: Any = False
    if "header" in options:
        header = options["header"]
        del options["header"]
    header_text = str(header)

    if header_text in ["True", "0"]:
        pdf = _safe_load_csv(
            p.uri, **{"index_col": False, "header": 0, **options})
        if columns is None:
            return pdf, None
        if isinstance(columns, list):  # column names
            return pdf[columns], None
        schema = Schema(columns)
        return pdf[schema.names], schema

    if header is not None and header_text != "False":
        raise NotImplementedError(f"{header} is not supported")

    # No header row: the column names must come from the caller.
    if columns is None:
        raise InvalidOperationError(
            "columns must be set if without header")
    if isinstance(columns, list):  # column names
        names, schema = columns, None
    else:
        schema = Schema(columns)
        names = schema.names
    pdf = _safe_load_csv(
        p.uri, **{
            "index_col": False,
            "header": None,
            "names": names,
            **options
        })
    return pdf, schema
Exemple #5
0
def test_param_dict():
    """Exercise ParamDict end to end.

    Covers key/index lookup, deep vs. shallow construction, the typed
    getters (``get``, ``get_or_none``, ``get_or_throw``), equality,
    boolean parsing, JSON output and the ``update`` conflict modes.
    The sections share and mutate ``d``/``p``, so their order matters.
    """
    # --- lookup by key and by position, with type conversion ---
    d = ParamDict([("a", 1), ("b", 2)])
    assert 1 == d["a"]
    assert 1 == d[0]
    assert 2 == d["b"]
    assert "2" == d.get_or_throw(1, str)
    # if giving index, it should ignore the throw flag and always throw
    raises(IndexError, lambda: d.get(2, "x"))
    raises(IndexError, lambda: d.get_or_none(2, str))

    # --- copy semantics ---
    # Default construction: a later change to the source's nested dict
    # must not be visible through p.
    d = {"a": "b", "b": {"x": 1, "y": "d"}}
    p = ParamDict(d)
    print({"test": p})
    d["b"]["x"] = 2
    assert 1 == p["b"]["x"]
    # deep=False: the nested dict is shared with the source.
    p = ParamDict(d, deep=False)
    d["b"]["x"] = 3
    assert 3 == p["b"]["x"]
    # Same checks when the source is itself a ParamDict.
    pp = ParamDict(p, deep=False)
    p["b"]["x"] = 4
    assert 4 == pp["b"]["x"]
    pp = ParamDict(p, deep=True)
    p["b"]["x"] = 5
    assert 4 == pp["b"]["x"]

    # --- size, iteration order, deletion, insertion, membership ---
    assert 2 == len(p)
    assert "a,b" == ",".join([k for k, _ in p.items()])
    del p["a"]
    assert 1 == len(p)
    p["c"] = 1
    assert 2 == len(p)
    assert "c" in p
    assert "a" not in p

    # --- get(): a None default is rejected; otherwise the stored value is
    # returned converted to the type of the default ---
    raises(ValueError, lambda: p.get("c", None))
    assert 1 == p.get("c", 2)
    assert "1" == p.get("c", "2")
    assert 1.0 == p.get("c", 2.0)
    raises(TypeError, lambda: p.get("c", ParamDict()))
    assert 2 == p.get("d", 2)
    p["arr"] = [1, 2]
    assert [1, 2] == p.get("arr", [])
    assert [] == p.get("arr2", [])

    # --- get_or_none(): a missing key yields None ---
    assert p.get_or_none("e", int) is None
    assert 1 == p.get_or_none("c", int)
    assert "1" == p.get_or_none("c", str)
    # exists but can't convert type
    raises(TypeError, lambda: p.get_or_none("c", ParamDict))

    # --- get_or_throw(): a missing key raises KeyError ---
    raises(KeyError, lambda: p.get_or_throw("e", int))
    assert 1 == p.get_or_throw("c", int)
    assert "1" == p.get_or_throw("c", str)
    # exists but can't convert type
    raises(TypeError, lambda: p.get_or_throw("c", ParamDict))

    # --- an empty ParamDict has length 0 and can be iterated ---
    p = ParamDict()
    assert 0 == len(p)
    for x in p:
        pass

    # A plain string is not an accepted source.
    raises(TypeError, lambda: ParamDict("abc"))

    # --- equality ---
    # a and b hold the same items in a different insertion order and are
    # expected to differ; a plain dict with the same items compares equal.
    a = ParamDict({"a": 1, "b": 2})
    b = ParamDict({"b": 2, "a": 1})
    c = ParamDict({"b": 2})
    assert a == a
    assert a != b
    assert a != c
    assert a == {"b": 2, "a": 1}
    assert a != {"b": 1, "a": 1}
    # NOTE(review): `!= None` / `== None` (rather than `is`) presumably
    # exist to exercise the comparison operators - keep as written.
    assert a != None
    assert not (a == None)

    # --- boolean parsing: bools, "true"/"false" in any case, "yes"/"NO",
    # 0 and "1" are accepted; an empty string is not ---
    p = ParamDict({
        "a": "True",
        "b": True,
        "c": "true",
        "d": "False",
        "e": False,
        "f": "false",
        "g": "yes",
        "h": "NO",
        "i": 0,
        "j": "1",
        "k": "",
    })
    assert p.get_or_throw("a", bool)
    assert p.get_or_throw("b", bool)
    assert p.get_or_throw("c", bool)
    assert not p.get_or_throw("d", bool)
    assert not p.get_or_throw("e", bool)
    assert not p.get_or_throw("f", bool)
    assert p.get_or_throw("g", bool)
    assert not p.get_or_throw("h", bool)
    assert not p.get_or_throw("i", bool)
    assert p.get_or_throw("j", bool)
    raises(TypeError, lambda: p.get_or_throw("k", bool))

    # --- values loaded from JSON ---
    s = '{"a":false,"b":10,"c":"cd"}'
    p = ParamDict(json.loads(s))
    assert not p.get_or_throw("a", bool)
    assert "10" == p.get_or_throw("b", str)
    assert "cd" == p.get_or_throw("c", str)
    raises(KeyError, lambda: p.get_or_throw("d", str))

    # to_json is only smoke-tested here: the output is printed, not asserted.
    print(p.to_json())
    print(p.to_json(True))

    # update
    # With no mode given, values from p1 replace existing ones.
    p = ParamDict(dict(a=1, b=2))
    p1 = ParamDict(dict(b=3, c=4))
    p.update(p1)
    assert dict(a=1, b=3, c=4) == p

    # OVERWRITE gives the same result as the call without a mode.
    p = ParamDict(dict(a=1, b=2))
    p.update(p1, ParamDict.OVERWRITE)
    assert dict(a=1, b=3, c=4) == p

    # IGNORE keeps the existing value of "b" and still adds "c".
    p = ParamDict(dict(a=1, b=2))
    p.update(p1, ParamDict.IGNORE)
    assert dict(a=1, b=2, c=4) == p

    # THROW raises KeyError on the conflicting key "b".
    p = ParamDict(dict(a=1, b=2))
    raises(KeyError, lambda: p.update(p1, ParamDict.THROW))

    # An unknown mode value is rejected.
    raises(ValueError, lambda: p.update(p1, 100))

    # After set_readonly(), update raises InvalidOperationError instead.
    p.set_readonly()
    raises(InvalidOperationError, lambda: p.update(p1, 100))