def patch_field_auth(schema: DatasetSchema, table_id, field_id, *subfields, auth: list[str]):
    """Monkeypatch an Amsterdam Schema to set "auth" on a field."""
    # This updates the low-level dict data so all high-level objects get it.
    schema.get_table_by_id(table_id).get_field_by_id(field_id)  # check existence
    raw_table = next(t for t in schema["tables"] if t.default["id"] == table_id)
    raw_field = next(
        f
        for f_id, f in raw_table.default["schema"]["properties"].items()
        if f_id == field_id
    )

    # Allow resolving subfields too
    for subfield in subfields:
        # Auto-jump over array, object or "array of objects"
        if raw_field["type"] == "array":
            raw_field = raw_field["items"]
        if raw_field["type"] == "object":
            raw_field = raw_field["properties"]

        raw_field = raw_field[subfield]

    raw_field["auth"] = auth

    # Also patch the active model
    model = apps.get_model(schema.id, table_id)
    model_field = model._meta.get_field(to_snake_case(field_id))
    for subfield in subfields:
        model_field = model_field.related_model._meta.get_field(subfield)
    model_field.field_schema["auth"] = auth
def patch_table_auth(schema: DatasetSchema, table_id, *, auth: list[str]):
    """Monkeypatch an Amsterdam Schema to set "auth" on a table."""
    # This updates the low-level dict data so all high-level objects get it.
    schema.get_table_by_id(table_id)  # raises if the table doesn't exist
    raw_table = next(t for t in schema["tables"] if t.default["id"] == table_id)
    raw_table.default["auth"] = auth

    # Also patch the active model, as that's already loaded and has a copy of the table schema
    model = apps.get_model(schema.id, table_id)
    model.table_schema()["auth"] = auth
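# Usage sketch for the two patch helpers above, e.g. inside a pytest test.
# The "gebieden_schema" fixture is defined further below; the table id
# "buurten", field id "naam" and scope "BAG/R" are illustrative assumptions.
def test_auth_patch_example(gebieden_schema):
    patch_table_auth(gebieden_schema, "buurten", auth=["BAG/R"])
    patch_field_auth(gebieden_schema, "buurten", "naam", auth=["BAG/R"])
    # ...request the table/field and assert access is denied without the scope...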
def import_schema(self, name: str, schema: DatasetSchema) -> Optional[Dataset]:
    """Import a single dataset schema."""
    try:
        dataset = Dataset.objects.get(name=schema.id)
    except Dataset.DoesNotExist:
        dataset = Dataset.objects.create(name=schema.id, schema_data=schema.json_data())
        self.stdout.write(f" Created {name}")
        return dataset
    else:
        dataset.schema_data = schema.json_data()
        if dataset.schema_data_changed():
            dataset.save()
            self.stdout.write(f" Updated {name}")
            return dataset

    return None
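# A minimal sketch of the Django management command that import_schema() above
# belongs to (note its use of self.stdout). Assume that method sits on this
# class; the CLI shape and the file-based input are illustrative assumptions.
import json
from pathlib import Path

from django.core.management.base import BaseCommand


class Command(BaseCommand):
    help = "Import Amsterdam Schema files into the Dataset table."

    def add_arguments(self, parser):
        parser.add_argument("schema_files", nargs="+", help="paths to dataset JSON files")

    def handle(self, *args, **options):
        for path in map(Path, options["schema_files"]):
            schema = DatasetSchema.from_dict(json.loads(path.read_text()))
            self.import_schema(path.stem, schema)  # the method sketched above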
def create_meta_table_data(engine, dataset_schema: DatasetSchema):
    session = sessionmaker(bind=engine)()

    ds_content = {
        camel_case_to_snake(k): v for k, v in dataset_schema.items() if k != "tables"
    }
    ds_content["contact_point"] = str(ds_content.get("contact_point", ""))
    ds_transformer = transformer_factory(models.Dataset)
    dataset = models.Dataset(**ds_transformer(ds_content))
    session.add(dataset)

    for table_data in dataset_schema["tables"]:
        table_content = {
            camel_case_to_snake(k): v for k, v in table_data.items() if k != "schema"
        }
        table = models.Table(
            **{
                **table_content,
                **{f: table_data["schema"].get(f) for f in ("required", "display")},
            }
        )
        table.dataset_id = dataset.id
        session.add(table)

        for field_name, field_value in table_data["schema"]["properties"].items():
            field_content = {
                k.replace("$", ""): v
                for k, v in field_value.items()
                if k not in {"$comment"}
            }
            field_content["name"] = field_name
            try:
                field = models.Field(**field_content)
            except TypeError as e:
                raise NotImplementedError(
                    f'Import failed: at "{field_name}": {field_value!r}:\n{e}'
                ) from e
            field.table_id = table.id
            field.dataset_id = dataset.id
            session.add(field)

    session.commit()
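# Usage sketch for create_meta_table_data(); the database DSN and the schema
# file path are placeholder assumptions.
import json
from pathlib import Path

from sqlalchemy import create_engine

engine = create_engine("postgresql://user:password@localhost:5432/dataservices")
dataset_schema = DatasetSchema.from_dict(json.loads(Path("datasets/afval.json").read_text()))
create_meta_table_data(engine, dataset_schema)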
def _load_geojson(postgres_conn_id):
    """As Airflow may execute tasks on different hosts, these steps need to
    happen in a single call. Otherwise, the (large) file would be downloaded
    by one host and have to be stored in the XCom table to be shared between
    tasks.
    """
    tmp_dir = Path(f"/tmp/{dag_id}")
    tmp_dir.mkdir(parents=True, exist_ok=True)

    # 1. download files
    files = {}
    for route in ROUTES:
        dest = f"{tmp_dir}/{route.name}.geojson"
        logger.info("Downloading %s to %s", route.url, dest)
        download_file(route.url, dest, http_conn_id=None)
        files[route.name] = dest

    # 2. generate schema ("schema introspect geojson *.geojson")
    schema = introspect_geojson_files("gevaarlijke-routes", files=list(files.values()))
    schema = DatasetSchema.from_dict(schema)  # TODO: move to schema-tools?

    # XXX This is not running as one transaction atm, but autocommitting per chunk
    # 3. import data
    db_engine = get_engine()
    importer = GeoJSONImporter(schema, db_engine, logger=logger)
    for route in ROUTES:
        geojson_path = files[route.name]
        logger.info("Importing %s into %s", route.name, route.tmp_db_table_name)
        importer.generate_db_objects(
            table_name=route.schema_table_name,
            db_table_name=route.tmp_db_table_name,
            truncate=True,  # when re-executing the same task
            ind_tables=True,
            ind_extra_index=False,
        )
        importer.load_file(geojson_path)

        if route.post_process:
            hook = PostgresHook(postgres_conn_id=postgres_conn_id)
            hook.run(route.post_process)
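# Wiring sketch: how _load_geojson() could be attached to the DAG (Airflow 2.x
# import path; the start date, schedule and connection id are illustrative
# assumptions).
from datetime import datetime

from airflow import DAG
from airflow.operators.python import PythonOperator

with DAG(dag_id=dag_id, start_date=datetime(2021, 1, 1), schedule_interval=None) as dag:
    load_geojson = PythonOperator(
        task_id="load_geojson",
        python_callable=_load_geojson,
        op_kwargs={"postgres_conn_id": "postgres_default"},
    )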
def execute(self, context=None):
    """Execute the 'generate_db_objects' method from schema-tools.

    This leads to the creation of tables and/or an index on the identifier
    (as specified in the data JSON schema). By default, both the tables and
    the identifier plus 'many-to-many table' indexes are created. Setting the
    boolean indicators in the method parameters restricts this to just the
    tables or just an identifier index (per table).
    """
    data_schema_url = (
        f"{SCHEMA_URL.split('//')[0]}//{self.data_schema_env}"
        f"{SCHEMA_URL.split('//')[1]}{self.data_schema_name}/{self.data_schema_name}"
    )
    data = schema_fetch_url_file(data_schema_url)
    engine = _get_engine(self.db_conn)
    parent_schema = SchemaType(data)
    dataset_schema = DatasetSchema(parent_schema)
    importer = BaseImporter(dataset_schema, engine)

    for table in data["tables"]:
        if self.data_schema_name + "_" + table["id"] == self.data_table_name:
            importer.generate_db_objects(
                table["id"],
                ind_tables=self.ind_table,
                ind_extra_index=self.ind_extra_index,
            )
def vestiging_schema(vestiging_schema_json) -> DatasetSchema:
    return DatasetSchema.from_dict(vestiging_schema_json)

def parkeervakken_schema(parkeervakken_schema_json) -> DatasetSchema:
    return DatasetSchema.from_dict(parkeervakken_schema_json)
def schema(self) -> DatasetSchema:
    """Provide access to the schema data"""
    if not self.schema_data:
        raise RuntimeError("Dataset.schema_data is empty")

    return DatasetSchema.from_dict(self.schema_data)
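# Usage sketch for the property above; the dataset name and table id are
# illustrative assumptions.
dataset = Dataset.objects.get(name="afvalwegingen")
table = dataset.schema.get_table_by_id("containers")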
def woningbouwplannen_schema(woningbouwplannen_schema_json) -> DatasetSchema:
    return DatasetSchema.from_dict(woningbouwplannen_schema_json)

def gebieden_schema(gebieden_schema_json) -> DatasetSchema:
    return DatasetSchema.from_dict(gebieden_schema_json)

def download_url_schema(download_url_schema_json) -> DatasetSchema:
    return DatasetSchema.from_dict(download_url_schema_json)

def explosieven_schema(explosieven_schema_json) -> DatasetSchema:
    return DatasetSchema.from_dict(explosieven_schema_json)

def afval_schema(afval_schema_json) -> DatasetSchema:
    return DatasetSchema.from_dict(afval_schema_json)

def fietspaaltjes_schema(fietspaaltjes_schema_json) -> DatasetSchema:
    return DatasetSchema.from_dict(fietspaaltjes_schema_json)

def fietspaaltjes_schema_no_display(fietspaaltjes_schema_json_no_display) -> DatasetSchema:
    return DatasetSchema.from_dict(fietspaaltjes_schema_json_no_display)

def afval_schema_backwards_embedded(afval_schema_backwards_embedded_json) -> DatasetSchema:
    return DatasetSchema.from_dict(afval_schema_backwards_embedded_json)

def indirect_self_ref_schema(indirect_self_ref_schema_json) -> DatasetSchema:
    return DatasetSchema.from_dict(indirect_self_ref_schema_json)

def afval_schema_backwards_summary(afval_schema_backwards_summary_json) -> DatasetSchema:
    return DatasetSchema.from_dict(afval_schema_backwards_summary_json)

def meldingen_schema(meldingen_schema_json) -> DatasetSchema:
    return DatasetSchema.from_dict(meldingen_schema_json)

def bommen_v2_schema(bommen_v2_schema_json) -> DatasetSchema:
    return DatasetSchema.from_dict(bommen_v2_schema_json)

def bag_schema(bag_schema_json) -> DatasetSchema:
    return DatasetSchema.from_dict(bag_schema_json)

def geometry_zoom_schema():
    return DatasetSchema.from_dict(
        json.loads((HERE / "files" / "geometry_zoom.json").read_text())
    )
TABLE_SCHEMA = {
    "id": "mytable",
    "type": "table",
    "version": str(V1),
    "schema": {
        "$schema": "http://json-schema.org/draft-07/schema#",
    },
}

REMOTE_SCHEMA = DatasetTableSchema(
    TABLE_SCHEMA,
    parent_schema=DatasetSchema(
        {
            "id": "adhoc",
            "tables": [
                TableVersions(
                    id=TABLE_SCHEMA["id"],
                    default_version_number=V1,
                    active=dict(V1=TABLE_SCHEMA),
                )
            ],
        }
    ),
)

@pytest.mark.parametrize(
    "case",
    [
        ("http://remote", "http://remote/foo?bar=baz"),
        ("http://remote/", "http://remote/foo?bar=baz"),
        ("http://remote/quux/{table_id}", "http://remote/quux/mytable/foo?bar=baz"),
        ("http://remote/quux/{table_id}/", "http://remote/quux/mytable/foo?bar=baz"),
def geometry_authdataset_schema(geometry_authdataset_schema_json) -> DatasetSchema:
    return DatasetSchema.from_dict(geometry_authdataset_schema_json)