def append(self, obj: Any) -> "Schema":  # noqa: C901
    """Append schema like object to the current schema. Only new columns
    are allowed.

    :raises SchemaError: if a column exists or is invalid or obj is not convertible
    :return: the Schema object itself
    """
    try:
        # Guard-clause dispatch: each supported input form is handled and
        # returns immediately; anything unrecognized falls through to the raise.
        if obj is None:
            return self
        if isinstance(obj, pa.Field):
            self[obj.name] = obj.type
            return self
        if isinstance(obj, str):
            # schema expression such as "a:int,b:str"
            self._append_pa_schema(expression_to_schema(obj))
            return self
        if isinstance(obj, Dict):
            for name, dtype in obj.items():
                self[name] = dtype
            return self
        if isinstance(obj, pa.Schema):
            self._append_pa_schema(obj)
            return self
        if isinstance(obj, pd.DataFrame):
            # derive the schema from the dataframe, then append it
            self._append_pa_schema(PD_UTILS.to_schema(obj))
            return self
        if isinstance(obj, Tuple):  # type: ignore
            # (name, type) pair
            self[obj[0]] = obj[1]
            return self
        if isinstance(obj, List):
            # append each element recursively
            for item in obj:
                self.append(item)
            return self
        raise SchemaError(f"Invalid schema to add {obj}")
    except SchemaError:
        raise
    except Exception as e:
        # normalize any unexpected failure into a SchemaError
        raise SchemaError(str(e))
def test_binary():
    """Bytes columns should round-trip through DF.as_array unchanged."""
    b = pickle.dumps("xyz")
    data = [[b, b"xy"]]
    # removed an unused `s = expression_to_schema(...)` local; DF parses the
    # same expression itself (see DF.__init__)
    df = DF(data, "a:bytes,b:bytes")
    a = df.as_array(type_safe=True)
    assert [[b, b"xy"]] == a
def test_nested():
    """Nested list/struct types should be parsed and coerced from JSON strings."""
    # NOTE(review): struct-at-top-level case is disabled; kept for reference —
    # presumably pending support, confirm before deleting.
    # data = [[dict(b=[30, "40"])]]
    # df = DF(data, "a:{a:str,b:[int]}")
    # a = df.as_array(type_safe=True)
    # assert [[dict(a=None, b=[30, 40])]] == a

    # list of structs: missing key -> None, "40" coerced to 40
    # (removed unused `s = expression_to_schema(...)` locals; DF parses the
    # expression itself)
    data = [[[json.dumps(dict(b=[30, "40"]))]]]
    df = DF(data, "a:[{a:str,b:[int]}]")
    a = df.as_array(type_safe=True)
    assert [[[dict(a=None, b=[30, 40])]]] == a

    # list of ints: "1" coerced to 1
    data = [[json.dumps(["1", 2])]]
    df = DF(data, "a:[int]")
    a = df.as_array(type_safe=True)
    assert [[[1, 2]]] == a
def test_schemas_equal():
    """schemas_equal: sensitivity to column order and schema metadata."""
    a = expression_to_schema("a:int,b:int,c:int")
    b = expression_to_schema("a:int,b:int,c:int")
    c = expression_to_schema("a:int,c:int,b:int")

    # no metadata: equality respects order unless check_order=False
    assert schemas_equal(a, a)
    assert schemas_equal(a, b)
    assert not schemas_equal(a, c)
    assert schemas_equal(a, c, check_order=False)

    # metadata on one side only: equal only when metadata is ignored
    a = a.with_metadata({"a": "1"})
    assert schemas_equal(a, a)
    assert not schemas_equal(a, b)
    assert schemas_equal(a, b, check_metadata=False)
    assert not schemas_equal(a, c)
    assert not schemas_equal(a, c, check_order=False)
    assert not schemas_equal(a, c, check_metadata=False)
    assert schemas_equal(a, c, check_order=False, check_metadata=False)

    # matching metadata on both sides: only order still differs
    c = c.with_metadata({"a": "1"})
    assert not schemas_equal(a, c)
    assert schemas_equal(a, c, check_order=False)
def test_schemaed_data_partitioner():
    """Partition rows keyed on positions (c, a) under several row limits."""

    def make_partitioner(row_limit):
        # the three partitioners differed only in row_limit; factor the rest
        return SchemaedDataPartitioner(
            schema=expression_to_schema("a:int,b:int,c:int"),
            key_positions=[2, 0],
            row_limit=row_limit,
        )

    p0 = make_partitioner(0)  # 0 = unlimited rows per partition
    p1 = make_partitioner(1)
    p2 = make_partitioner(2)
    data = [[0, 0, 0], [0, 1, 0], [0, 2, 0], [1, 0, 0]]
    _test_partition(p0, data, "0,0,[0,1,2];1,0,[3]")
    _test_partition(p1, data, "0,0,[0];0,1,[1];0,2,[2];1,0,[3]")
    _test_partition(p2, data, "0,0,[0,1];0,1,[2];1,0,[3]")
    _test_partition(p2, data, "0,0,[0,1];0,1,[2];1,0,[3]")  # can reuse the partitioner
def __init__(self, *args: Any, **kwargs: Any):
    """Build a Schema from positional schema-like objects OR keyword
    name/type pairs — never both.

    :raises SchemaError: if both *args and **kwargs are provided
    """
    if len(args) > 0 and len(kwargs) > 0:
        raise SchemaError("Can't set both *args and **kwargs")
    if len(args) == 1:  # duplicate code for better performance
        first = args[0]
        if isinstance(first, Schema):
            super().__init__(first)  # type: ignore
            return
        # the isinstance cases are mutually exclusive, so chain them
        fields: Optional[List[pa.Field]] = None
        if isinstance(first, str):
            fields = list(expression_to_schema(first))
        elif isinstance(first, pa.Schema):
            fields = list(first)
        elif isinstance(first, pa.Field):
            fields = [first]
        if fields is not None:
            validated = [self._validate_field(f) for f in fields]
            super().__init__([(f.name, f) for f in validated])
            return
    # generic path: start empty and let append handle the conversion
    super().__init__()
    if len(args) > 0:
        self.append(list(args))
    elif len(kwargs) > 0:
        self.append(kwargs)
def test_expression_conversion():
    """Round-trip schema expressions and reject malformed ones."""
    # valid expressions (second argument is the normalized form, if different)
    _assert_from_expr("a:int,b:ubyte")
    _assert_from_expr(" a : int32 , b : uint8 ", "a:int,b:ubyte")
    _assert_from_expr("a:[int32],b:uint8", "a:[int],b:ubyte")
    _assert_from_expr(
        "a : { x : int32 , y : [string] } , b : [ uint8 ] ",
        "a:{x:int,y:[str]},b:[ubyte]",
    )
    _assert_from_expr(
        "a : [{ x : int32 , y : [string] }] , b : [ uint8 ] ",
        "a:[{x:int,y:[str]}],b:[ubyte]",
    )
    _assert_from_expr("a:decimal(5,2)")
    _assert_from_expr("a:bytes,b:bytes")
    _assert_from_expr("a:bytes,b: binary", "a:bytes,b:bytes")

    # invalid expressions must all fail to parse
    for bad in [
        "123:int",  # name can't start with a digit
        "int",  # missing name
        "a:dummytype",  # unknown type
        "a:int,a:str",  # duplicate column
        "a:int,b:{x:int,x:str}",  # duplicate key inside struct
        "_:int",  # reserved name
        "__:int",
    ]:
        # bad=bad binds the current value (avoids the late-binding closure trap)
        raises(SyntaxError, lambda bad=bad: expression_to_schema(bad))
def _assert_from_expr(expr, expected=None):
    """Parse expr to a schema, render it back, and compare with expected
    (defaults to expr itself when the expression is already normalized)."""
    roundtrip = schema_to_expression(expression_to_schema(expr))
    assert (expected or expr) == roundtrip
def extract(  # noqa: C901
    self,
    obj: Any,
    ignore_key_mismatch: bool = False,
    require_type_match: bool = True,
    ignore_type_mismatch: bool = False,
) -> "Schema":
    """Extract a sub-schema of this schema described by ``obj``.

    ``obj`` may be None (empty result), a schema expression or single
    column name string, a (pa.)Schema, a list mixing names and other
    schema-like items, or anything convertible via ``Schema(obj)``.

    :param obj: the schema-like selector
    :param ignore_key_mismatch: if True, silently skip keys not in this schema
    :param require_type_match: if True, a selected key's type must equal this
        schema's type for that key
    :param ignore_type_mismatch: if True, silently skip type-mismatched keys
        instead of raising
    :raises SchemaError: on a missing key (unless ignore_key_mismatch) or a
        type mismatch (unless ignore_type_mismatch)
    :return: a new Schema; this schema is not modified
    """
    if obj is None:
        return Schema()
    # Normalize the selector into (name, expected_type) pairs; expected_type
    # is None when only the name is specified.
    if isinstance(obj, str):
        if ":" in obj:  # expression
            ps = expression_to_schema(obj)
            pairs: List[Tuple[str, pa.DataType]] = list(zip(ps.names, ps.types))
        else:
            pairs = [(obj, None)]  # single key
    elif isinstance(obj, (pa.Schema, Schema)):
        pairs = list(zip(obj.names, obj.types))
    elif isinstance(obj, List):
        # Lists are handled field-by-field: bare names are looked up directly,
        # anything else is recursively extracted with the same flags.
        fields: List[pa.Field] = []
        for x in obj:
            if isinstance(x, str) and ":" not in x:
                if x not in self:
                    if not ignore_key_mismatch:
                        raise SchemaError(f"Can't extract {x} from {self}")
                else:
                    fields.append(self[x])
            else:
                fields += self.extract(
                    x,
                    ignore_key_mismatch=ignore_key_mismatch,
                    require_type_match=require_type_match,
                    ignore_type_mismatch=ignore_type_mismatch,
                ).fields
        return Schema(pa.schema(fields))
    else:
        # Fallback: convert obj into a Schema first, then extract from that.
        return self.extract(
            Schema(obj),
            ignore_key_mismatch=ignore_key_mismatch,
            require_type_match=require_type_match,
            ignore_type_mismatch=ignore_type_mismatch,
        )
    # Resolve the normalized pairs against this schema.
    fields = []
    for k, v in pairs:
        k = k.strip()
        if k == "":
            continue  # blank names are ignored, not an error
        if k not in self:
            if ignore_key_mismatch:
                continue
            raise SchemaError(f"Can't extract {k} from {self}")
        if v is None:
            fields.append(self[k])
        else:
            tp = self[k].type
            if not require_type_match or tp == v:
                fields.append(self[k])
            elif not ignore_type_mismatch:
                raise SchemaError(
                    f"Unable to extract {k}:{v} from {self}, type mismatch"
                )
    return Schema(pa.schema(fields))
def remove(  # noqa: C901
    self,
    obj: Any,
    ignore_key_mismatch: bool = False,
    require_type_match: bool = True,
    ignore_type_mismatch: bool = False,
) -> "Schema":
    """Return a copy of this schema with the columns described by ``obj``
    removed.

    ``obj`` may be None (plain copy), a schema expression or single column
    name string, a (pa.)Schema, a list/set mixing names and other
    schema-like items, or anything convertible via ``Schema(obj)``.

    :param obj: the schema-like selector of columns to remove
    :param ignore_key_mismatch: if True, silently skip keys not in the schema
    :param require_type_match: if True, a key is only removed when its type
        equals the selector's type for that key
    :param ignore_type_mismatch: if True, silently keep type-mismatched keys
        instead of raising
    :raises SchemaError: on a missing key (unless ignore_key_mismatch) or a
        type mismatch (unless ignore_type_mismatch)
    :return: a new Schema; this schema is not modified
    """
    if obj is None:
        return self.copy()
    # target accumulates intermediate results when list/set selectors recurse
    target = self
    # Normalize the selector into (name, expected_type) pairs; expected_type
    # is None when only the name is specified.
    if isinstance(obj, str):
        if ":" in obj:  # expression
            ps = expression_to_schema(obj)
            pairs: List[Tuple[str, pa.DataType]] = list(zip(ps.names, ps.types))
        else:
            pairs = [(obj, None)]  # single key
    elif isinstance(obj, (pa.Schema, Schema)):
        pairs = list(zip(obj.names, obj.types))
    elif isinstance(obj, (List, Set)):
        # Split the collection: bare names are removed in one pass below,
        # everything else is removed recursively with the same flags.
        keys: List[str] = []
        other: List[Any] = []
        for x in obj:
            if isinstance(x, str) and ":" not in x:
                keys.append(x)
            else:
                other.append(x)
        pairs = [(x, None) for x in keys]
        for o in other:
            target = target.remove(
                o,
                ignore_key_mismatch=ignore_key_mismatch,
                require_type_match=require_type_match,
                ignore_type_mismatch=ignore_type_mismatch,
            )
    else:
        # Fallback: convert obj into a Schema first, then remove that.
        return self.remove(
            Schema(obj),
            ignore_key_mismatch=ignore_key_mismatch,
            require_type_match=require_type_match,
            ignore_type_mismatch=ignore_type_mismatch,
        )
    # Apply the name-based removals on a mutable snapshot of target.
    od = OrderedDict(target)
    for k, v in pairs:
        k = k.strip()
        if k == "":
            continue  # blank names are ignored, not an error
        if k not in od:
            if ignore_key_mismatch:
                continue
            raise SchemaError(f"Can't remove {k} from {target}")
        if v is None:
            del od[k]
        else:
            tp = od[k].type
            if not require_type_match or tp == v:
                del od[k]
            elif not ignore_type_mismatch:
                raise SchemaError(
                    f"Unable to remove {k}:{v} from {self}, type mismatch")
    return Schema(od)
def __init__(self, data, schema, enforce=False):
    """Test-helper dataframe: wrap ``data`` in a pandas frame whose columns
    come from the given schema expression, with optional type enforcement."""
    parsed = expression_to_schema(schema)
    frame = pd.DataFrame(data, columns=parsed.names)
    self.native = PD_UTILS.enforce_type(frame, parsed, enforce)
    self.schema = parsed