import pandas as pd
import requests
from google.cloud import bigquery
from prefect import Flow, Parameter, task, unmapped

from nl_open_data.config import get_config

# CATALOGS maps catalog names to OData v3 URLs and is assumed to be defined
# or imported elsewhere in the package (see the sketch after this script).


@task
def odatav3_catalog_to_gbq(catalog, GCP, schema="cbs"):
    """Loads an OData v3 catalog into Google BigQuery.

    Args:
        - catalog (str, str): tuple with name of table and url
        - GCP: config object exposing `project` and `location`
        - schema (str): target BigQuery dataset; the default value is an assumption
    """
    bq = bigquery.Client(project=GCP.project)
    job_config = bigquery.LoadJobConfig()
    job_config.write_disposition = "WRITE_TRUNCATE"
    df = pd.DataFrame(requests.get(catalog[1]).json()["value"])
    job = bq.load_table_from_dataframe(
        dataframe=df,
        destination=f"{schema}.{catalog[0]}",
        project=GCP.project,
        location=GCP.location,
        job_config=job_config,  # without this, WRITE_TRUNCATE was never applied
    )
    return job


gcp = Parameter("gcp", required=True)

with Flow("CBS catalogs") as flow:
    odatav3 = odatav3_catalog_to_gbq.map(
        catalog=list(CATALOGS.items()), GCP=unmapped(gcp)
    )


def main(config):
    """Runs the CBS catalogs flow."""
    flow.run(parameters={"gcp": config.gcp})


if __name__ == "__main__":
    config = get_config("dataverbinders")
    main(config=config)
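# `CATALOGS` is used above but not defined in this file. A hypothetical sketch
# of the shape the mapped task expects: names that become BigQuery table names,
# mapped to OData v3 endpoints whose JSON response carries the records under a
# top-level "value" key. The entry below is illustrative, not taken from this repo:
CATALOGS = {
    "cbs_catalog": "https://opendata.cbs.nl/ODataCatalog/Tables?$format=json",
}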
# NOTE: flatten_table, get_dimensions_from_bq and write_join_dimensions are
# assumed to be helpers from elsewhere in this package; their import path is
# not shown in this file.
from nl_open_data.config import get_config


def main(GCP):
    # table_id and schema are read from the module globals set in the
    # __main__ block below.
    query = flatten_table(
        id=table_id,
        join_type="inner",
        schema=schema,
        credentials=None,
        GCP=GCP,
    )
    print(query)

    # dims_query = get_dimensions_from_bq(
    #     id=table_id,
    #     schema='mlz',
    #     credentials=None,
    #     GCP=my_gcp
    # )

    # # place dimensions in dicts according to type
    # # ALTERNATIVE OPTION - one iterable with 'Type' marked per item?
    # dims = {row['Key']: row['Title'] for row in dims_query if row['Type']=="Dimension"}
    # time_dims = {row['Key']: row['Title'] for row in dims_query if row['Type']=="TimeDimension"}
    # geo_dims = {row['Key']: row['Title'] for row in dims_query if row['Type']=="GeoDimension"}
    # geo_details = {row['Key']: row['Title'] for row in dims_query if row['Type']=="GeoDetail"}

    # print(write_join_dimensions(dims, "INNER", table_id, schema, my_gcp))


# for local testing purposes
if __name__ == "__main__":
    config = get_config("ag")
    my_gcp = config.gcp
    table_id = "40060NED"
    schema = "mlz"
    main(my_gcp)

    # data_properties = get_dimensions_from_bq(id=table_id, GCP=my_gcp, schema='mlz')
    # print(f"The dimensions for table {table_id}:")
    # for row in data_properties:
    #     print(f"Key = {row['Key']}, Title={row['Title']}, Type={row['Type']}")
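# A minimal sketch of what get_dimensions_from_bq is assumed to do here: query
# the table's DataProperties companion table in BigQuery and return rows
# carrying 'Key', 'Title' and 'Type' fields, matching the commented-out usage
# above. The f"{id}_DataProperties" naming convention is an assumption, not
# confirmed by this file:
from google.cloud import bigquery


def get_dimensions_from_bq(id, schema, GCP, credentials=None):
    client = bigquery.Client(project=GCP.project, credentials=credentials)
    sql = f"""
        SELECT Key, Title, Type
        FROM `{GCP.project}.{schema}.{id}_DataProperties`
        WHERE Type IN ('Dimension', 'TimeDimension', 'GeoDimension', 'GeoDetail')
    """
    return list(client.query(sql).result())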
from prefect.engine.executors import DaskExecutor

from nl_open_data.config import get_config
from nl_open_data.flows.cbs import regionaal

executor = DaskExecutor(n_workers=8)


def main(config):
    regionaal.main.run(config=config)


if __name__ == "__main__":
    config = get_config("dataverbinders")
    main(config=config)
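# NOTE: `executor` above is created but never handed to the flow run, so the
# flow executes with Prefect's default local executor. A minimal sketch of
# wiring it in, assuming `regionaal.main` is a prefect.Flow and that this
# Prefect 0.x Flow.run accepts an `executor` keyword:


def main_with_dask(config):
    regionaal.main.run(config=config, executor=executor)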
from pathlib import Path

import prefect
from prefect import task, Parameter, Flow
from prefect.tasks.shell import ShellTask
from prefect.engine.result_handlers import LocalResultHandler
from prefect.tasks.secrets import PrefectSecret
from prefect.tasks.gcp.bigquery import BigQueryLoadFile
from prefect.utilities.configuration import set_temporary_config
from lxml import etree, objectify
import xmltodict

from nl_open_data.config import get_config
from nimbletl.tasks import curl_cmd, unzip, create_dir

# TO DO: encapsulate config into an initialization task so this can be provided at runtime
CONFIG = get_config("dk")

BAG_VERSION = "08042020"
BAG_URL = (
    "http://geodata.nationaalgeoregister.nl/inspireadressen/extract/inspireadressen.zip"
)

# NUM data
NUM_FILE = CONFIG.path.root / CONFIG.path.bag / f"9999NUM{BAG_VERSION}.zip"
NUM_TMP_DIR = CONFIG.path.root / CONFIG.path.tmp / Path("NUM")
NUM_TAG = "Nummeraanduiding"
NUM_XPATH = ".//" + NUM_TAG

# VBO data
VBO_FILE = CONFIG.path.root / CONFIG.path.bag / f"9999VBO{BAG_VERSION}.zip"
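# The NUM_TAG/NUM_XPATH constants above set up stream-parsing of the BAG
# Nummeraanduiding XML. A minimal sketch of consuming them with lxml's
# iterparse; the {*} namespace wildcard and the xmltodict conversion are
# assumptions for illustration, not code from this repo:


def iter_num_records(xml_path):
    """Streams Nummeraanduiding elements from a BAG XML extract as dicts."""
    for _, elem in etree.iterparse(xml_path, events=("end",), tag=f"{{*}}{NUM_TAG}"):
        yield xmltodict.parse(etree.tostring(elem))
        elem.clear()  # free parsed elements to keep memory flat on large files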