Example #1
0
 def _get_altered_schema(self, subschema: Any) -> Schema:
     """Build the full schema that results from retyping some columns.

     ``subschema`` is converted with ``Schema(...)`` and checked against
     ``self.schema``. Every column whose type differs from the current one
     must have neither a struct, list nor binary type on either side.

     :param subschema: anything accepted by the ``Schema`` constructor,
         describing the columns to alter and their new types
     :raises FugueDataFrameOperationError: produced (through
         ``assert_or_throw``) when ``sub.names in self.schema`` is false
     :raises NotImplementedError: produced (through ``assert_or_throw``)
         when a changed column's current or requested type is struct,
         list or binary
     :return: a ``Schema`` with the columns of ``self.schema`` in their
         existing order, each taken from ``subschema`` when it is present
         there and from ``self.schema`` otherwise
     """

     def _is_unsupported(tp: Any) -> bool:
         # Conversions involving these pyarrow types are rejected.
         return (
             pa.types.is_struct(tp)
             or pa.types.is_list(tp)
             or pa.types.is_binary(tp)
         )

     requested = Schema(subschema)
     assert_or_throw(
         requested.names in self.schema,
         lambda: FugueDataFrameOperationError(
             f"{requested.names} are not all in {self.schema}"
         ),
     )
     for name, field in requested.items():
         current = self.schema[name].type
         target = field.type
         if current.equals(target):
             # Unchanged columns need no convertibility check.
             continue
         assert_or_throw(
             not _is_unsupported(current),
             lambda: NotImplementedError(f"can't convert from {current}"),
         )
         assert_or_throw(
             not _is_unsupported(target),
             lambda: NotImplementedError(f"can't convert to {target}"),
         )
     # Walk the existing schema so the original column order is kept.
     merged = [
         (name, requested.get(name, field))
         for name, field in self.schema.items()
     ]
     return Schema(merged)
Example #2
0
def _enforce_type(df: pd.DataFrame, schema: Schema) -> pd.DataFrame:
    """Cast the columns of ``df`` to the types described by ``schema``.

    For every ``(name, field)`` pair in ``schema.items()`` the column
    ``df[name]`` is converted according to ``field.type`` and written back
    into ``df``, so the input frame itself is modified:

    * string: ``astype(str)``, then originally-null entries are set to
      ``None`` again
    * integer or boolean: nulls are filled with ``0`` so the cast to
      ``to_pandas_dtype()`` can run, then the originally-null entries are
      set to ``None`` again
    * struct or list: the column is written back without conversion
    * anything else: a plain ``astype(to_pandas_dtype())``

    :param df: the pandas dataframe to convert (mutated in place)
    :param schema: mapping of column name to a field exposing ``.type``
    :return: the same ``df`` object that was passed in
    """

    def _convert_keeping_nulls(column, convert):
        # Record null positions first; the conversion would otherwise turn
        # them into ordinary values ("None"/"nan" strings or 0).
        null_mask = column.isnull()
        converted = convert(column)
        converted[null_mask] = None
        return converted

    # TODO: does this have higher latency?
    for name, field in schema.items():
        column = df[name]
        target = field.type
        if pa.types.is_string(target):
            column = _convert_keeping_nulls(column, lambda c: c.astype(str))
        elif pa.types.is_integer(target) or pa.types.is_boolean(target):
            column = _convert_keeping_nulls(
                column,
                lambda c: c.fillna(0).astype(target.to_pandas_dtype()),
            )
        elif not (pa.types.is_struct(target) or pa.types.is_list(target)):
            column = column.astype(target.to_pandas_dtype())
        df[name] = column
    return df