    Args:
        - catalog (tuple[str, str]): name of the destination table and its catalog url
        - GCP: GCP configuration with `project` and `location` attributes
    """
    bq = bigquery.Client(project=GCP.project)
    job_config = bigquery.LoadJobConfig()
    job_config.write_disposition = "WRITE_TRUNCATE"
    df = pd.DataFrame(requests.get(catalog[1]).json()["value"])
    job = bq.load_table_from_dataframe(
        dataframe=df,
        destination=f"{schema}.{catalog[0]}",
        project=GCP.project,
        location=GCP.location,
    )
    return job


gcp = Parameter("gcp", required=True)
with Flow("CBS catalogs") as flow:
    odatav3 = odatav3_catalog_to_gbq.map(catalog=list(CATALOGS.items()), GCP=unmapped(gcp))
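# Illustrative assumption (not from the original module): CATALOGS is taken to
# map a destination table name to an OData v3 catalog URL, so CATALOGS.items()
# yields the (name, url) tuples expected by odatav3_catalog_to_gbq. The entry
# below is only an example:
#
#   CATALOGS = {
#       "cbs": "https://opendata.cbs.nl/ODataCatalog/Tables?$format=json",
#   }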


def main(config):
    """Executes vektis.agb.flow in DaskExecutor.
    """
    flow.run(parameters={"gcp": config.gcp})


if __name__ == "__main__":
    config = get_config("dataverbinders")
    main(config=config)
    query = flatten_table(id=table_id, join_type="inner", schema=schema, credentials=None, GCP=my_gcp)
    print(query)
    
    # dims_query = get_dimensions_from_bq(
    #     id=table_id,
    #     schema='mlz',
    #     credentials=None,
    #     GCP=my_gcp
    #     )

    # # Place dimensions in dicts according to type. Alternative option: a single iterable with 'Type' marked per item?
    # dims = {row['Key']: row['Title'] for row in dims_query if row['Type']=="Dimension"}
    # time_dims = {row['Key']: row['Title'] for row in dims_query if row['Type']=="TimeDimension"}
    # geo_dims = {row['Key']: row['Title'] for row in dims_query if row['Type']=="GeoDimension"}
    # geo_details = {row['Key']: row['Title'] for row in dims_query if row['Type']=="GeoDetail"}
    # print(write_join_dimensions(dims, "INNER", table_id, schema, my_gcp))


# for local testing purposes
if __name__ == "__main__":
    config = get_config("ag")
    my_gcp = config.gcp
    table_id = "40060NED"
    schema = "mlz"
    main(config)

    # data_properties = get_dimensions_from_bq(id=table_id, GCP=my_gcp, schema='mlz')

    # print(f"The dimensions for table {table_id}:")
    # for row in data_properties:
    #     print(f"Key = {row['Key']}, Title={row['Title']}, Type={row['Type']}")
Example #3
from prefect.engine.executors import DaskExecutor

from nl_open_data.config import get_config
from nl_open_data.flows.cbs import regionaal

executor = DaskExecutor(n_workers=8)


def main(config):
    regionaal.main.run(config=config)


if __name__ == "__main__":
    config = get_config('dataverbinders')
    main(config=config)
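# Note on the executor above (an assumption, not stated in the original
# source): the DaskExecutor is created but never passed anywhere in this
# snippet. In Prefect 0.x an executor is supplied at run time, typically as
# flow.run(executor=executor). If the regionaal module exposed its Prefect
# Flow directly (hypothetically as `regionaal.flow`), the call could look like:
#
#   regionaal.flow.run(parameters={"gcp": config.gcp}, executor=executor)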
Example #4
from pathlib import Path

import prefect
from prefect import task, Parameter, Flow
from prefect.tasks.shell import ShellTask
from prefect.engine.result_handlers import LocalResultHandler
from prefect.tasks.secrets import PrefectSecret
from prefect.tasks.gcp.bigquery import BigQueryLoadFile
from prefect.utilities.configuration import set_temporary_config
from lxml import etree, objectify
import xmltodict

from nl_open_data.config import get_config
from nimbletl.tasks import curl_cmd, unzip, create_dir


# TODO: encapsulate config into an initialization task so this can be provided at runtime
CONFIG = get_config("dk")

BAG_VERSION = "08042020"
BAG_URL = (
    "http://geodata.nationaalgeoregister.nl/inspireadressen/extract/inspireadressen.zip"
)

# NUM data
NUM_FILE = CONFIG.path.root / CONFIG.path.bag / f"9999NUM{BAG_VERSION}.zip"
NUM_TMP_DIR = CONFIG.path.root / CONFIG.path.tmp / Path("NUM")
NUM_TAG = "Nummeraanduiding"
NUM_XPATH = ".//" + NUM_TAG

# VBO data
VBO_FILE = CONFIG.path.root / CONFIG.path.bag / f"9999VBO{BAG_VERSION}.zip"