def create_or_append_table(context, dataset_id, table_id, path):
    tb = Table(table_id=table_id, dataset_id=dataset_id)
    if not tb.table_exists("staging"):
        context.log.info("Table does not exist in STAGING, creating table...")
        tb.create(
            path=path,
            if_table_exists="pass",
            if_storage_data_exists="replace",
            if_table_config_exists="pass",
        )
        context.log.info("Table created in STAGING")
    else:
        context.log.info("Table already exists in STAGING, appending to it...")
        tb.append(
            filepath=path,
            if_exists="replace",
            timeout=600,
            chunk_size=1024 * 1024 * 10,
        )
        context.log.info("Appended to table on STAGING successfully.")
    if not tb.table_exists("prod"):
        context.log.info("Table does not exist in PROD, publishing...")
        tb.publish(if_exists="pass")
        context.log.info("Published table in PROD successfully.")
    else:
        context.log.info("Table already published in PROD.")
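# Hypothetical invocation sketch for the helper above. The Dagster-style
# `context` (an object whose `.log` exposes `.info`) is stubbed out here, and
# the dataset/table names and CSV directory are placeholders, not values from
# the original pipeline. Assumes `Table` is imported from the basedosdados
# package in the module that defines create_or_append_table.
from pathlib import Path


class _StubLog:
    def info(self, msg):
        print(msg)


class _StubContext:
    log = _StubLog()


create_or_append_table(
    _StubContext(),
    dataset_id="example_dataset",        # placeholder dataset
    table_id="example_table",            # placeholder table
    path=Path("output/example_table"),   # placeholder directory holding the CSVs
)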
def upload_logs_to_bq(context, timestamp, error):
    dataset_id = context.resources.basedosdados_config["dataset_id"]
    table_id = context.resources.basedosdados_config["table_id"] + "_logs"
    filepath = Path(
        f"{timestamp}/{table_id}/data={pendulum.parse(timestamp).date()}/{table_id}_{timestamp}.csv"
    )
    # create partition directory
    filepath.parent.mkdir(exist_ok=True, parents=True)
    # create dataframe to be uploaded
    df = pd.DataFrame(
        {
            "timestamp_captura": [pd.to_datetime(timestamp)],
            "sucesso": [error is None],
            "erro": [error],
        }
    )
    # save locally
    df.to_csv(filepath, index=False)
    # BD Table object
    tb = Table(table_id, dataset_id)
    # create and publish if the table does not exist, append to it otherwise
    if not tb.table_exists("staging"):
        tb.create(
            path=f"{timestamp}/{table_id}",
            if_table_exists="replace",
            if_storage_data_exists="replace",
            if_table_config_exists="pass",
        )
    elif not tb.table_exists("prod"):
        tb.publish(if_exists="replace")
    else:
        tb.append(filepath=f"{timestamp}/{table_id}", if_exists="replace")
    # delete the local directory
    shutil.rmtree(f"{timestamp}")
def test_create_auto_partitions(metadatadir):
    table_part = Table(
        dataset_id=DATASET_ID,
        table_id=TABLE_ID + "_autopartitioned",
        metadata_path=metadatadir,
    )
    table_part.create(
        "tests/sample_data/partitions",
        partitioned=True,
        if_exists="replace",
    )
    table_part.publish()
def test_create_auto_partitions(metadatadir, data_path, sample_data):
    shutil.rmtree(metadatadir / "partitions", ignore_errors=True)
    table_part = Table(
        dataset_id=DATASET_ID,
        table_id=TABLE_ID + "_partitioned",
        metadata_path=metadatadir,
    )
    table_part.delete("all")
    table_part.init(
        data_sample_path=data_path,
        if_folder_exists="replace",
        if_table_config_exists="replace",
    )
    Path(metadatadir / "partitions").mkdir()
    shutil.copy(
        sample_data / "table_config_part.yaml",
        Path(table_part.table_folder / "table_config.yaml"),
    )
    shutil.copy(
        sample_data / "publish_part.sql",
        table_part.table_folder / "publish.sql",
    )
    for n in [1, 2]:
        Path(metadatadir / "partitions" / f"keys={n}").mkdir()
        shutil.copy(
            metadatadir / "municipios.csv",
            metadatadir / "partitions" / f"keys={n}" / "municipios.csv",
        )
    table_part.create(
        metadatadir / "partitions",
        partitioned=True,
        if_table_exists="replace",
        if_table_config_exists="pass",
        if_storage_data_exists="replace",
    )
    assert table_exists(table_part, "staging")
    table_part.publish()
    assert table_exists(table_part, "prod")
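# For reference, the loop in the test above lays the sample data out in
# Hive-style partition folders before calling create(partitioned=True); with
# the names used in the test, the upload directory ends up shaped like:
#
#   <metadatadir>/partitions/
#       keys=1/municipios.csv
#       keys=2/municipios.csv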
def upload_to_bq(context, paths):
    for key in paths.keys():
        context.log.info("#" * 80)
        context.log.info(f"KEY = {key}")
        tb = Table(
            key,
            context.resources.basedosdados_config["dataset_id"],
        )
        tb_dir = paths[key].parent.parent
        context.log.info(f"tb_dir = {tb_dir}")
        if not tb.table_exists("staging"):
            context.log.info("Table does not exist in STAGING, creating table...")
            tb.create(
                path=tb_dir,
                if_table_exists="pass",
                if_storage_data_exists="replace",
                if_table_config_exists="pass",
            )
            context.log.info("Table created in STAGING")
        else:
            context.log.info("Table already exists in STAGING, appending to it...")
            tb.append(
                filepath=tb_dir,
                if_exists="replace",
                timeout=600,
                chunk_size=1024 * 1024 * 10,
            )
            context.log.info("Appended to table on STAGING successfully.")
        if not tb.table_exists("prod"):
            context.log.info("Table does not exist in PROD, publishing...")
            tb.publish(if_exists="pass")
            context.log.info("Published table in PROD successfully.")
        else:
            context.log.info("Table already published in PROD.")
    context.log.info(f"Returning -> {tb_dir.parent}")
    return tb_dir.parent
def create_or_append_table(context, csv_path, which_table, _df, date):
    table_obj = Table(
        dataset_id=context.resources.basedosdados_config["dataset_id"],
        table_id=which_table,
    )
    # Query the previous day's rows from the prod table to check for overlap
    query = f"""SELECT * FROM {table_obj.table_full_name['prod']} as t """
    if which_table == "realized_trips":
        query += f"""WHERE EXTRACT(DATE FROM t.departure_datetime) = DATE_SUB(DATE("{date}"), INTERVAL 1 DAY)"""
    if which_table == "unplanned":
        query += f"""WHERE DATE(t.dia) = DATE_SUB(DATE("{date}"), INTERVAL 1 DAY)"""
    # Check whether the prod table already exists
    try:
        ref = table_obj._get_table_obj("prod")
    except google.api_core.exceptions.NotFound:
        ref = None
    if ref:
        # Table exists: download the existing rows, drop duplicates from the
        # new data and append only what is left.
        savepath = f"tmp_data/{which_table}_{date}_from_bq.csv"
        bd.download(
            savepath=savepath,
            query=query,
            billing_project_id=context.resources.bd_client.project,
            from_file=True,
            index=False,
        )
        tb = pd.read_csv(savepath)
        df = drop_overlap(tb, _df)
        df.to_csv(csv_path, index=False)
        table_obj.append(csv_path, if_exists="replace")
    else:
        # Table does not exist yet: create it in staging and publish to prod.
        _df.to_csv(csv_path, index=False)
        table_obj.create(
            csv_path,
            if_table_config_exists="pass",
            if_storage_data_exists="replace",
        )
        table_obj.publish(if_exists="replace")
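# The function above relies on a drop_overlap(tb, _df) helper that is not shown
# here. A minimal sketch, assuming its job is to return only the rows of the
# freshly captured dataframe that are not already present in the data
# downloaded from BigQuery (the string-based comparison is an assumption, not
# the original implementation):
import pandas as pd


def drop_overlap(existing: pd.DataFrame, new: pd.DataFrame) -> pd.DataFrame:
    """Return the rows of `new` that do not already appear in `existing`."""
    common_cols = [col for col in new.columns if col in existing.columns]
    # Compare as strings to avoid dtype mismatches from the CSV round-trip.
    merged = new[common_cols].astype(str).merge(
        existing[common_cols].astype(str).drop_duplicates(),
        on=common_cols,
        how="left",
        indicator=True,
    )
    return new[(merged["_merge"] == "left_only").to_numpy()]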