Beispiel #1
0
def create_or_append_table(context, dataset_id, table_id, path):
    """Load ``path`` into the staging table, then publish it if needed.

    When the table is missing from staging it is created from ``path``;
    otherwise ``path`` is appended to the existing staging table. After
    that, the table is published to prod unless it already exists there.

    Args:
        context: Execution context; only ``context.log`` is used here.
        dataset_id: Dataset identifier handed to ``Table``.
        table_id: Table identifier handed to ``Table``.
        path: Location of the data, forwarded to ``Table.create`` /
            ``Table.append``.
    """
    log = context.log
    table = Table(table_id=table_id, dataset_id=dataset_id)

    # Step 1: get the data into staging (append when possible, else create).
    if table.table_exists('staging'):
        log.info("Table already exists in STAGING, appending to it...")
        table.append(
            filepath=path,
            if_exists="replace",
            timeout=600,
            chunk_size=1024 * 1024 * 10,
        )
        log.info("Appended to table on STAGING successfully.")
    else:
        log.info("Table does not exist in STAGING, creating table...")
        table.create(
            path=path,
            if_table_exists="pass",
            if_storage_data_exists="replace",
            if_table_config_exists="pass",
        )
        log.info("Table created in STAGING")

    # Step 2: publishing only has to happen once per table.
    if table.table_exists("prod"):
        log.info("Table already published in PROD.")
        return

    log.info("Table does not exist in PROD, publishing...")
    table.publish(if_exists="pass")
    log.info("Published table in PROD successfully.")
Beispiel #2
0
def upload_logs_to_bq(context, timestamp, error):
    """Upload a one-row capture log to the ``<table_id>_logs`` table.

    A CSV with the columns ``timestamp_captura``, ``sucesso`` (``True`` when
    ``error`` is ``None``) and ``erro`` is written under
    ``<timestamp>/<table_id>_logs/data=<date>/`` and then uploaded: the
    staging table is created on first use and appended to afterwards, and
    the table is published whenever it is missing from prod.

    Previously the three steps were mutually exclusive (``if/elif/else``),
    so a freshly created table was not published in the same run, and the
    run that published it silently dropped its own log row.

    Args:
        context: Execution context; ``context.resources.basedosdados_config``
            must provide ``dataset_id`` and ``table_id``.
        timestamp: Capture timestamp as a string parseable by
            ``pendulum.parse``; also used as the local working directory name.
        error: The captured error, or ``None`` when the run succeeded.
    """
    dataset_id = context.resources.basedosdados_config['dataset_id']
    table_id = context.resources.basedosdados_config['table_id'] + "_logs"

    table_dir = f"{timestamp}/{table_id}"
    filepath = Path(
        f"{table_dir}/data={pendulum.parse(timestamp).date()}/{table_id}_{timestamp}.csv")
    # create partition directory
    filepath.parent.mkdir(exist_ok=True, parents=True)

    try:
        # create dataframe to be uploaded
        df = pd.DataFrame(
            {
                "timestamp_captura": [pd.to_datetime(timestamp)],
                "sucesso": [error is None],
                "erro": [error],
            }
        )
        # save local
        df.to_csv(filepath, index=False)

        # BD Table object
        tb = Table(table_id, dataset_id)

        # The log row must always reach staging: create the table on the
        # first run, append on every later run.
        if not tb.table_exists("staging"):
            tb.create(
                path=table_dir,
                if_table_exists="replace",
                if_storage_data_exists="replace",
                if_table_config_exists="pass",
            )
        else:
            tb.append(filepath=table_dir, if_exists='replace')

        # Publishing is independent of how the data reached staging.
        if not tb.table_exists("prod"):
            tb.publish(if_exists="replace")
    finally:
        # delete local files even when the upload fails, so failed runs do
        # not leave the working directory behind
        shutil.rmtree(f"{timestamp}")
Beispiel #3
0
def upload_to_bq(context, paths):
    """Upload every table in ``paths`` to staging and publish it if needed.

    For each ``table_id -> file path`` entry, the directory two levels above
    the file is uploaded: the staging table is created when missing and
    appended to otherwise, and the table is published when it does not yet
    exist in prod.

    Args:
        context: Execution context; ``context.log`` is used for logging and
            ``context.resources.basedosdados_config["dataset_id"]`` selects
            the dataset.
        paths: Mapping of table id to a ``pathlib``-style path of the data
            file (presumably ``<root>/<table>/<partition>/<file>`` — confirm
            against the caller).

    Returns:
        The parent of the upload directory of the last entry processed.

    Raises:
        ValueError: If ``paths`` is empty. Without this check the function
            failed with ``UnboundLocalError`` on the final log/return.
    """
    if not paths:
        raise ValueError("upload_to_bq received no paths to upload")

    dataset_id = context.resources.basedosdados_config["dataset_id"]

    for key, path in paths.items():
        context.log.info("#" * 80)
        context.log.info(f"KEY = {key}")
        tb = Table(key, dataset_id)
        # Upload the whole table directory, not just the single file.
        tb_dir = path.parent.parent
        context.log.info(f"tb_dir = {tb_dir}")

        if not tb.table_exists("staging"):
            context.log.info(
                "Table does not exist in STAGING, creating table...")
            tb.create(
                path=tb_dir,
                if_table_exists="pass",
                if_storage_data_exists="replace",
                if_table_config_exists="pass",
            )
            context.log.info("Table created in STAGING")
        else:
            context.log.info(
                "Table already exists in STAGING, appending to it...")
            tb.append(
                filepath=tb_dir,
                if_exists="replace",
                timeout=600,
                chunk_size=1024 * 1024 * 10,
            )
            context.log.info("Appended to table on STAGING successfully.")

        if not tb.table_exists("prod"):
            context.log.info("Table does not exist in PROD, publishing...")
            tb.publish(if_exists="pass")
            context.log.info("Published table in PROD successfully.")
        else:
            context.log.info("Table already published in PROD.")

    # tb_dir is the directory of the last entry processed in the loop.
    context.log.info(f"Returning -> {tb_dir.parent}")

    return tb_dir.parent
Beispiel #4
0
def create_or_append_table(context, csv_path, which_table, _df, date):
    """Write ``_df`` to ``csv_path`` and upload it to ``which_table``.

    If the prod table cannot be found (or the lookup returns a falsy value),
    ``_df`` is saved as-is and the table is created and published. Otherwise
    the published rows selected by the query below are downloaded, passed
    together with ``_df`` through ``drop_overlap``, and the result is
    appended to the table.

    Args:
        context: Execution context; provides
            ``resources.basedosdados_config["dataset_id"]`` and
            ``resources.bd_client.project`` (used as billing project).
        csv_path: Where the CSV to upload is written.
        which_table: Table id. ``"realized_trips"`` and ``"unplanned"`` get a
            filter on the day before ``date``; any other value selects the
            whole prod table.
        _df: DataFrame holding the new rows.
        date: Reference date interpolated into the query and the temporary
            download file name.
    """
    dataset_id = context.resources.basedosdados_config["dataset_id"]
    table_obj = Table(dataset_id=dataset_id, table_id=which_table)

    # The query is assembled up front, exactly as before, even though it is
    # only used when the prod table exists.
    query = f"""SELECT * FROM {table_obj.table_full_name['prod']} as t
            """
    if which_table == "realized_trips":
        query += f"""WHERE EXTRACT(DATE FROM t.departure_datetime) = DATE_SUB(DATE("{date}"), INTERVAL 1 DAY)"""
    elif which_table == "unplanned":
        query += f"""WHERE DATE(t.dia) = DATE_SUB(DATE("{date}"), INTERVAL 1 DAY)"""

    try:
        prod_table = table_obj._get_table_obj("prod")
    except google.api_core.exceptions.NotFound:
        prod_table = None

    if not prod_table:
        # First upload: nothing published yet, so create and publish.
        _df.to_csv(csv_path, index=False)
        table_obj.create(
            csv_path,
            if_table_config_exists="pass",
            if_storage_data_exists="replace",
        )
        table_obj.publish(if_exists="replace")
        return

    # Table already published: fetch the stored rows for comparison.
    savepath = f"tmp_data/{which_table}_{date}_from_bq.csv"
    bd.download(
        savepath=savepath,
        query=query,
        billing_project_id=context.resources.bd_client.project,
        from_file=True,
        index=False,
    )

    published_rows = pd.read_csv(savepath)
    # NOTE(review): drop_overlap presumably removes rows already present in
    # the published data — confirm against its definition.
    rows_to_upload = drop_overlap(published_rows, _df)
    rows_to_upload.to_csv(csv_path, index=False)

    table_obj.append(csv_path, if_exists="replace")