import shutil
from pathlib import Path

import pandas as pd
import pendulum
from basedosdados import Table


def create_or_append_table(context, dataset_id, table_id, path):
    """Create the table in STAGING if it does not exist, append to it otherwise,
    and publish it to PROD the first time it appears there."""
    tb = Table(table_id=table_id, dataset_id=dataset_id)
    if not tb.table_exists("staging"):
        context.log.info("Table does not exist in STAGING, creating table...")
        tb.create(
            path=path,
            if_table_exists="pass",
            if_storage_data_exists="replace",
            if_table_config_exists="pass",
        )
        context.log.info("Table created in STAGING")
    else:
        context.log.info("Table already exists in STAGING, appending to it...")
        tb.append(
            filepath=path,
            if_exists="replace",
            timeout=600,
            chunk_size=1024 * 1024 * 10,
        )
        context.log.info("Appended to table on STAGING successfully.")
    if not tb.table_exists("prod"):
        context.log.info("Table does not exist in PROD, publishing...")
        tb.publish(if_exists="pass")
        context.log.info("Published table in PROD successfully.")
    else:
        context.log.info("Table already published in PROD.")
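# Hedged usage sketch: a thin wrapper showing how create_or_append_table can be
# driven from the same `basedosdados_config` resource the functions below read.
# The wrapper name is illustrative and not part of the original pipeline; it
# only assumes `context` exposes `.resources.basedosdados_config` with the
# `dataset_id`/`table_id` keys already used by upload_logs_to_bq and upload_to_bq.
def upload_from_config(context, path):
    dataset_id = context.resources.basedosdados_config["dataset_id"]
    table_id = context.resources.basedosdados_config["table_id"]
    create_or_append_table(context, dataset_id, table_id, path)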
def upload_logs_to_bq(context, timestamp, error):
    """Upload a one-row capture log (timestamp, success flag, error message) to
    a `<table_id>_logs` table in BigQuery."""
    dataset_id = context.resources.basedosdados_config["dataset_id"]
    table_id = context.resources.basedosdados_config["table_id"] + "_logs"
    filepath = Path(
        f"{timestamp}/{table_id}/data={pendulum.parse(timestamp).date()}/{table_id}_{timestamp}.csv"
    )
    # create partition directory
    filepath.parent.mkdir(exist_ok=True, parents=True)
    # create dataframe to be uploaded
    df = pd.DataFrame(
        {
            "timestamp_captura": [pd.to_datetime(timestamp)],
            "sucesso": [error is None],
            "erro": [error],
        }
    )
    # save local
    df.to_csv(filepath, index=False)
    # BD Table object
    tb = Table(table_id=table_id, dataset_id=dataset_id)
    # create the table if it is missing in STAGING, publish it if it is missing
    # in PROD, append to it otherwise
    if not tb.table_exists("staging"):
        tb.create(
            path=f"{timestamp}/{table_id}",
            if_table_exists="replace",
            if_storage_data_exists="replace",
            if_table_config_exists="pass",
        )
    elif not tb.table_exists("prod"):
        tb.publish(if_exists="replace")
    else:
        tb.append(filepath=f"{timestamp}/{table_id}", if_exists="replace")
    # delete local file
    shutil.rmtree(f"{timestamp}")
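# Hedged usage sketch for upload_logs_to_bq: wrap a capture step and record
# whether it succeeded. The runner name, the capture callable and the ISO
# timestamp/timezone below are illustrative assumptions, not part of the
# original pipeline; the log table only needs None on success or the error
# message on failure.
def run_capture_with_logs(context, capture_fn):
    timestamp = pendulum.now("America/Sao_Paulo").isoformat()  # assumed timezone
    error = None
    try:
        capture_fn()
    except Exception as exc:  # only the message is needed for the log table
        error = str(exc)
    upload_logs_to_bq(context, timestamp, error)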
def upload_to_bq(context, paths):
    """Create/append and publish one table per entry in `paths`, where each key
    is a table_id and each value is a file two levels below the table folder."""
    for key in paths.keys():
        context.log.info("#" * 80)
        context.log.info(f"KEY = {key}")
        tb = Table(
            table_id=key,
            dataset_id=context.resources.basedosdados_config["dataset_id"],
        )
        tb_dir = paths[key].parent.parent
        context.log.info(f"tb_dir = {tb_dir}")
        if not tb.table_exists("staging"):
            context.log.info("Table does not exist in STAGING, creating table...")
            tb.create(
                path=tb_dir,
                if_table_exists="pass",
                if_storage_data_exists="replace",
                if_table_config_exists="pass",
            )
            context.log.info("Table created in STAGING")
        else:
            context.log.info("Table already exists in STAGING, appending to it...")
            tb.append(
                filepath=tb_dir,
                if_exists="replace",
                timeout=600,
                chunk_size=1024 * 1024 * 10,
            )
            context.log.info("Appended to table on STAGING successfully.")
        if not tb.table_exists("prod"):
            context.log.info("Table does not exist in PROD, publishing...")
            tb.publish(if_exists="pass")
            context.log.info("Published table in PROD successfully.")
        else:
            context.log.info("Table already published in PROD.")
    context.log.info(f"Returning -> {tb_dir.parent}")
    return tb_dir.parent
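# Hedged sketch of the `paths` mapping upload_to_bq expects: each key is a
# table_id and each value is the Path of a partitioned file two levels below
# the table folder, so that `paths[key].parent.parent` is the directory handed
# to Table.create/append. The base directory, table name and partition layout
# below mirror the one built in upload_logs_to_bq and are illustrative.
def build_paths_example(timestamp):
    base = Path(f"{timestamp}")
    partition = f"data={pendulum.parse(timestamp).date()}"
    return {
        "example_table": base / "example_table" / partition / f"example_table_{timestamp}.csv",
    }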