def is_strict_schema_match(schema1: Schema, schema2: Schema) -> bool:
    """Return True when the two schemas match strictly.

    A strict match requires identical sets of field names and a strict
    one-to-one field match (via ``is_strict_field_match``) for every field.
    """
    if set(schema1.field_names()) != set(schema2.field_names()):
        return False
    # Names are known to line up, so get_field cannot miss here.
    return all(
        is_strict_field_match(field, schema2.get_field(field.name))
        for field in schema1.fields
    )
def update_matching_field_definitions(
    env: Environment, schema: Schema, update_with_schema: Schema
) -> Schema:
    """Return a schema like ``schema`` but with same-named fields replaced by
    the definitions from ``update_with_schema``.

    If no field names overlap, ``schema`` is returned unchanged. Otherwise a
    new schema (named "<schema>_with_<update_with_schema>") is built,
    registered with ``env`` as a generated schema, and returned.
    """
    merged_fields = []
    any_replaced = False
    for field in schema.fields:
        replacement = field
        try:
            # get_field raises NameError when the name is absent (project
            # convention) -- in that case we keep the original field.
            replacement = update_with_schema.get_field(field.name)
            any_replaced = True
        except NameError:
            pass
        merged_fields.append(replacement)
    if not any_replaced:
        return schema
    spec = asdict(schema)
    spec["name"] = f"{schema.name}_with_{update_with_schema.name}"
    spec["fields"] = merged_fields
    new_schema = Schema.from_dict(spec)
    env.add_new_generated_schema(new_schema)
    return new_schema
def check_casts(
    from_schema: Schema,
    to_schema: Schema,
    cast_level: CastToSchemaLevel = CastToSchemaLevel.SOFT,
    warn_on_downcast=True,
    fail_on_downcast=False,
):
    """Inspect field-by-field casts from ``from_schema`` to ``to_schema``.

    TODO: This is ok, but not really matching the reality of casting.
    Casting is really a separate ETL step: it's runtime / storage specific,
    has many different plausible approaches, and is too complex to be managed
    at one level with no context (we don't even know what runtime we are on
    here). Best option for now is to inform the user (warnings / logging)
    rather than block anything -- let errors arise naturally as they will,
    otherwise "play on".

    NOTE: strict enforcement for CastToSchemaLevel.HARD (and the
    is_castable_to_type check) is intentionally disabled for now, per the
    reasoning above; ``cast_level`` is currently unused.
    """
    for src_field in from_schema.fields:
        try:
            dest_field = to_schema.get_field(src_field.name)
        except NameError:
            # Field not present in the target schema -- nothing to compare.
            continue
        if is_strict_field_match(src_field, dest_field):
            continue
        if warn_on_downcast:
            warnings.warn(
                f"Downcasting field '{src_field.name}': {src_field.field_type} to {dest_field.field_type}"
            )
        if fail_on_downcast:
            raise SchemaTypeError(
                f"Cannot cast (FAIL_ON_DOWNCAST=True) '{src_field.name}': {src_field.field_type} to {dest_field.field_type} "
            )
def create_empty(self, name, storage, schema: Schema):
    """Store an empty object under ``name``: a single CSV header line built
    from the schema's field names, written through the storage API."""
    header_line = ",".join(schema.field_names()) + "\n"
    # put() consumes an iterable of lines; there is exactly one here.
    storage.get_api().put(name, iter([header_line]))
def has_subset_fields(sub: Schema, supr: Schema) -> bool:
    """Return True when every field name of ``sub`` also appears in ``supr``."""
    return set(supr.field_names()).issuperset(sub.field_names())
def create_empty(self, name, storage, schema: Schema):
    """Create an object under ``name`` containing only the CSV header row.

    NOTE(review): unclear this is ever wanted in practice (kept from the
    original implementation).
    """
    header = ",".join(schema.field_names()) + "\n"
    api = storage.get_api()
    with api.open(name, "w") as out:
        out.write(header)