from datetime import datetime
from pathlib import Path

import basedosdados as bd


def upload_to_raw(tipo, save_raw_path):
    """Upload a raw CAGED file to storage for the given table type."""
    if tipo == "estabelecimentos":
        st = bd.Storage(table_id="microdados_estabelecimentos", dataset_id="br_me_caged")
    else:
        st = bd.Storage(table_id="microdados_movimentacoes", dataset_id="br_me_caged")
    st.upload(path=save_raw_path, mode="raw", if_exists="replace")
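# Usage sketch (the local file path below is hypothetical; `tipo` selects
# between the two br_me_caged tables, falling back to "movimentacoes" for
# any value other than "estabelecimentos"):
#
#     upload_to_raw("estabelecimentos", "/tmp/caged_estabelecimentos.csv")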
def fn_upload_file_to_storage(
    context, file_path, partitions=None, mode="raw", table_id=None, dataset_id=None
):
    """Upload a file to storage."""
    # If no table_id/dataset_id is given, fall back to the resource configuration
    if not table_id:
        table_id = context.resources.basedosdados_config["table_id"]
    if not dataset_id:
        dataset_id = context.resources.basedosdados_config["dataset_id"]
    st = bd.Storage(table_id=table_id, dataset_id=dataset_id)
    context.log.debug(f"Table ID: {table_id}, Dataset ID: {dataset_id}")
    context.log.debug(
        f"Uploading file {file_path} to mode {mode} with partitions {partitions}"
    )
    st.upload(path=file_path, mode=mode, partitions=partitions, if_exists="replace")
    return True
def append_to_bigquery(
    context,
    file_paths,
    partitions,
    modes=("raw", "staging"),  # immutable default avoids the mutable-default pitfall
    table_id=None,
    dataset_id=None,
):
    """Upload one file per mode to storage, deleting each local copy afterwards."""
    if not table_id:
        table_id = context.resources.basedosdados_config["table_id"]
    if not dataset_id:
        dataset_id = context.resources.basedosdados_config["dataset_id"]
    context.log.info(f"Table ID: {table_id} / Dataset ID: {dataset_id}")
    st = bd.Storage(dataset_id=dataset_id, table_id=table_id)
    for idx, mode in enumerate(modes):
        context.log.info(
            f"Uploading file {file_paths[idx]} to mode {mode} with partitions {partitions}"
        )
        st.upload(file_paths[idx], partitions=partitions, mode=mode, if_exists="replace")
        # remove the local file once it has been uploaded
        Path(file_paths[idx]).unlink(missing_ok=True)
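# Usage sketch (runs inside a Dagster solid, where `context` is provided;
# the file paths and partition string below are hypothetical). Note that
# `file_paths` must be ordered to match `modes`: file_paths[0] is uploaded
# in "raw" mode and file_paths[1] in "staging" mode.
#
#     append_to_bigquery(
#         context,
#         file_paths=["/tmp/data_raw.csv", "/tmp/data_staging.csv"],
#         partitions="data_versao=20210101",
#     )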
def download_gtfs_from_storage(context):
    """Download the GTFS zip that matches the scheduled run date from staging."""
    bucket = (
        bd.Storage(
            context.solid_config["dataset_id"], context.solid_config["table_id"]
        )
        .client["storage_staging"]
        .bucket("rj-smtr-staging")
    )
    prefix = context.solid_config["storage_path"]
    blobs = [(blob.name, blob) for blob in bucket.list_blobs(prefix=prefix)]
    # Blob names embed the version as "<key>=<YYYYMMDD>/"; collect the distinct dates
    gtfs_versions = list(
        {
            datetime.strptime(blob[0].split("=")[1].split("/")[0], "%Y%m%d").date()
            for blob in blobs
        }
    )
    gtfs_partition = build_gtfs_version_name(
        gtfs_versions, context.resources.schedule_run_date["date"]
    )
    blob_obj = [blob[1] for blob in blobs if (prefix + gtfs_partition) in blob[0]]
    Path("tmp_data").mkdir(exist_ok=True)
    gtfs_path = f"tmp_data/{gtfs_partition}.zip"
    blob_obj[0].download_to_filename(filename=gtfs_path)
    return gtfs_path
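# `build_gtfs_version_name` is an external helper not shown here. From the
# call sites above, it is assumed to select, among `gtfs_versions`, the
# version applicable to the scheduled run date and to return the matching
# partition segment of the blob path (the "<key>=YYYYMMDD" part), which is
# then used both to filter the blobs and to name the downloaded zip.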
def save_header_files(dataset_id, table_id):
    """Save a small sample of the table to storage as its header file."""
    query = f"""
    SELECT * FROM `basedosdados.{dataset_id}.{table_id}` LIMIT 20
    """
    df = bd.read_sql(query, billing_project_id="basedosdados", from_file=True)
    df.to_csv("header.csv", index=False, encoding="utf-8")
    st = bd.Storage(dataset_id=dataset_id, table_id=table_id)
    st.upload("header.csv", mode="header", if_exists="replace")
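# Usage sketch, assuming valid GCP credentials for the basedosdados billing
# project; the dataset/table pair is the one used in upload_to_raw above:
#
#     save_header_files(dataset_id="br_me_caged", table_id="microdados_estabelecimentos")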
def append_to_bigquery_v2(context, file_path, partitions, mode, table_id=None):
    """Upload a single file to storage, then delete the local copy."""
    if not table_id:
        table_id = context.resources.basedosdados_config["table_id"]
    dataset_id = context.resources.basedosdados_config["dataset_id"]
    bd.Storage(dataset_id=dataset_id, table_id=table_id).upload(
        file_path, partitions=partitions, mode=mode, if_exists="replace"
    )
    # delete the local file after upload
    Path(file_path).unlink(missing_ok=True)
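# Unlike append_to_bigquery above, this variant takes a single file/mode pair
# per call and always derives dataset_id from the resource configuration.
# Usage sketch (inside a Dagster solid; path and partition are hypothetical):
#
#     append_to_bigquery_v2(
#         context,
#         file_path="/tmp/data_staging.csv",
#         partitions="data_versao=20210101",
#         mode="staging",
#     )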