Example #1
import basedosdados as bd

def upload_to_raw(tipo, save_raw_path):
    # "tipo" selects the target table; anything other than
    # "estabelecimentos" goes to the movimentacoes table
    if tipo == "estabelecimentos":
        st = bd.Storage(table_id="microdados_estabelecimentos",
                        dataset_id="br_me_caged")
    else:
        st = bd.Storage(table_id="microdados_movimentacoes",
                        dataset_id="br_me_caged")

    st.upload(path=save_raw_path, mode="raw", if_exists="replace")
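
A minimal usage sketch for Example #1, assuming the CAGED files were already downloaded locally; the paths below are hypothetical:

# Any value of tipo other than "estabelecimentos" routes to the movimentacoes table
upload_to_raw("estabelecimentos", "/tmp/caged/estabelecimentos.csv")
upload_to_raw("movimentacoes", "/tmp/caged/movimentacoes.csv")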
Example #2
import basedosdados as bd

def fn_upload_file_to_storage(context,
                              file_path,
                              partitions=None,
                              mode="raw",
                              table_id=None,
                              dataset_id=None):

    # Fall back to the configured basedosdados resource IDs when
    # no table_id / dataset_id is passed in
    if not table_id:
        table_id = context.resources.basedosdados_config["table_id"]
    if not dataset_id:
        dataset_id = context.resources.basedosdados_config["dataset_id"]

    st = bd.Storage(table_id=table_id, dataset_id=dataset_id)

    context.log.debug(f"Table ID: {table_id}, Dataset ID: {dataset_id}")
    context.log.debug(
        f"Uploading file {file_path} to mode {mode} with partitions {partitions}"
    )
    st.upload(path=file_path,
              mode=mode,
              partitions=partitions,
              if_exists="replace")

    return True
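
A sketch of calling Example #2 outside Dagster, with a stand-in object for context; the resource IDs, file path, and partition string below are hypothetical (partitions uses the Hive-style "key=value" format basedosdados expects):

import logging
from types import SimpleNamespace

stub_context = SimpleNamespace(
    resources=SimpleNamespace(
        basedosdados_config={"dataset_id": "my_dataset", "table_id": "my_table"}
    ),
    log=logging.getLogger("upload"),
)

fn_upload_file_to_storage(stub_context,
                          "data/output.csv",
                          partitions="ano=2021/mes=01",
                          mode="staging")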
Example #3
from pathlib import Path

import basedosdados as bd

def append_to_bigquery(
    context,
    file_paths,
    partitions,
    modes=("raw", "staging"),  # tuple default avoids the mutable-default pitfall
    table_id=None,
    dataset_id=None,
):

    if not table_id:
        table_id = context.resources.basedosdados_config["table_id"]
    if not dataset_id:
        dataset_id = context.resources.basedosdados_config["dataset_id"]

    context.log.info(f"Table ID: {table_id} / Dataset ID: {dataset_id}")

    st = bd.Storage(dataset_id=dataset_id, table_id=table_id)

    # file_paths is expected to be index-aligned with modes
    for idx, mode in enumerate(modes):
        context.log.info(
            f"Uploading file {file_paths[idx]} to mode {mode} with partitions {partitions}"
        )
        st.upload(file_paths[idx],
                  partitions=partitions,
                  mode=mode,
                  if_exists="replace")
        # remove the local copy once it has been uploaded
        Path(file_paths[idx]).unlink(missing_ok=True)
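
A sketch of calling Example #3 with its default modes, reusing the stub_context from the sketch after Example #2; the paths and partition value are hypothetical. Note that file_paths must be index-aligned with modes, and each local file is deleted after its upload:

append_to_bigquery(
    stub_context,
    file_paths=["out/raw.csv", "out/staging.csv"],  # raw first, staging second
    partitions="data=2021-01-01",
)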
Example #4
from datetime import datetime
from pathlib import Path

import basedosdados as bd

def download_gtfs_from_storage(context):

    bucket = (bd.Storage(
        context.solid_config["dataset_id"],
        context.solid_config["table_id"]).client["storage_staging"].bucket(
            "rj-smtr-staging"))
    prefix = context.solid_config["storage_path"]
    blobs = [(blob.name, blob) for blob in bucket.list_blobs(prefix=prefix)]

    # Parse the version date out of each Hive-style blob name
    gtfs_versions = list({
        datetime.strptime(blob[0].split("=")[1].split("/")[0],
                          "%Y%m%d").date() for blob in blobs
    })

    gtfs_partition = build_gtfs_version_name(
        gtfs_versions, context.resources.schedule_run_date["date"])

    # Keep only the blob(s) under the chosen version partition
    blob_obj = [
        blob[1] for blob in blobs if (prefix + gtfs_partition) in blob[0]
    ]

    Path("tmp_data").mkdir(exist_ok=True)

    gtfs_path = f"tmp_data/{gtfs_partition}.zip"

    blob_obj[0].download_to_filename(filename=gtfs_path)

    return gtfs_path
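
Example #4 depends on a build_gtfs_version_name helper that is not shown. A hypothetical reconstruction, inferred only from how the blob names are parsed above and assuming both arguments are datetime.date values (the data_versao key name is also an assumption):

def build_gtfs_version_name(gtfs_versions, run_date):
    # Assumption: use the newest GTFS version published on or before run_date
    candidates = [version for version in gtfs_versions if version <= run_date]
    latest = max(candidates)
    # Rebuild the Hive-style partition segment the blob names were split on
    return f"data_versao={latest.strftime('%Y%m%d')}"  # hypothetical key name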
Example #5
import basedosdados as bd

def save_header_files(dataset_id, table_id):
    """Save a 20-row sample of the table as its header file in Storage."""
    query = f"""
    SELECT * FROM `basedosdados.{dataset_id}.{table_id}` LIMIT 20
    """
    df = bd.read_sql(query, billing_project_id="basedosdados", from_file=True)
    df.to_csv("header.csv", index=False, encoding="utf-8")
    st = bd.Storage(dataset_id=dataset_id, table_id=table_id)
    st.upload("header.csv", mode="header", if_exists="replace")
Example #6
from pathlib import Path

import basedosdados as bd

def append_to_bigquery_v2(context, file_path, partitions, mode, table_id=None):

    if not table_id:
        table_id = context.resources.basedosdados_config["table_id"]
    dataset_id = context.resources.basedosdados_config["dataset_id"]

    bd.Storage(dataset_id=dataset_id,
               table_id=table_id).upload(file_path,
                                         partitions=partitions,
                                         mode=mode,
                                         if_exists="replace")

    # delete the local copy once it has been uploaded
    Path(file_path).unlink(missing_ok=True)
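
A sketch of how Example #6 might be wired into the kind of legacy Dagster (pre-1.0) pipeline these snippets come from; the solid name, file path, and partition string are hypothetical:

from dagster import solid

@solid(required_resource_keys={"basedosdados_config"})
def upload_staging_file(context):
    # The basedosdados_config resource supplies dataset_id and table_id
    append_to_bigquery_v2(context,
                          "data/staging.csv",
                          partitions="ano=2021/mes=01",
                          mode="staging")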