Example 1
def test_delete_table(storage):

    # Delete the staging table from an explicit bucket
    Storage("br_ibge_pib",
            "municipio").delete_table(bucket_name="basedosdados-dev")

    # Deleting again, now that the data is gone, should raise
    with pytest.raises(FileNotFoundError):
        Storage("br_ibge_pib", "municipio").delete_table()
Example 2
def bq_upload(context, filepath, raw_filepath=None, partitions=None):
    table_id = context.resources.basedosdados_config['table_id']
    dataset_id = context.resources.basedosdados_config['dataset_id']
    context.log.info(f"""
    Received inputs:
    raw_filepath = {raw_filepath}, type = {type(raw_filepath)}
    treated_filepath = {filepath}, type = {type(filepath)}
    dataset_id = {dataset_id}, type = {type(dataset_id)}
    table_id = {table_id}, type = {type(table_id)}
    partitions = {partitions}, type = {type(partitions)}
    """)
    # Upload raw to staging
    if raw_filepath:
        st = Storage(table_id=table_id, dataset_id=dataset_id)
        context.log.info(
            f"Uploading raw file {raw_filepath} to {st.bucket_name}/{dataset_id}/{table_id}"
        )
        st.upload(path=raw_filepath,
                  partitions=partitions,
                  mode='raw',
                  if_exists='replace')

    # Create and publish the table if it does not exist; otherwise, append to it
    if partitions:
        # If the table is partitioned, get the parent directory in which partitions are stored
        tb_dir = filepath.split(partitions)[0]
        create_or_append_table(context, dataset_id, table_id, tb_dir)
    else:
        create_or_append_table(context, dataset_id, table_id, filepath)

    # Delete local files
    context.log.info(f"Deleting local files: {raw_filepath}, {filepath}")
    cleanup_local(filepath, raw_filepath)
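A minimal sketch of the partition split above, with hypothetical paths; filepath.split(partitions)[0] keeps everything before the first partition folder:

# Hypothetical values illustrating how the parent directory is derived.
filepath = "/tmp/data/ano=2020/mes=01/municipios.csv"
partitions = "ano=2020/mes=01"
tb_dir = filepath.split(partitions)[0]
print(tb_dir)  # -> /tmp/data/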
Example 3
def test_copy_table(storage):

    # Copy with the default source and destination buckets
    Storage("br_ibge_pib", "municipio").copy_table()

    # Copying a nonexistent table should raise
    with pytest.raises(FileNotFoundError):
        Storage("br_ibge_pib2", "municipio2").copy_table()

    # Copy to an explicit destination bucket
    Storage("br_ibge_pib", "municipio").copy_table(
        destination_bucket_name="basedosdados-dev")
Example 4
def upload(context, filename):
    dataset_id = context.resources.basedosdados_config["dataset_id"]
    table_id = context.resources.basedosdados_config["table_id"]

    st = Storage(dataset_id, table_id)

    context.log.info(
        f"Uploading {filename} to GCS at: {st.bucket_name}/staging/{dataset_id}/{table_id}",
    )
    st.upload(path=filename, mode="staging", if_exists="replace")

    return filename
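For reference, a minimal sketch of the object path the call above logs, using hypothetical values; note that when called positionally, Storage takes dataset_id before table_id, as the other examples on this page do:

# Hypothetical values mirroring the log message in upload().
bucket_name, dataset_id, table_id = "basedosdados-dev", "br_ibge_pib", "municipio"
print(f"{bucket_name}/staging/{dataset_id}/{table_id}")
# -> basedosdados-dev/staging/br_ibge_pib/municipio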
Example 5
def test_create_storage_data_exist_table_config_exist(table, metadatadir,
                                                      data_path, sample_data):

    shutil.rmtree(metadatadir / DATASET_ID / TABLE_ID, ignore_errors=True)

    Dataset(dataset_id=DATASET_ID,
            metadata_path=metadatadir).create(if_exists="pass")

    Storage(dataset_id=DATASET_ID,
            table_id=TABLE_ID,
            metadata_path=metadatadir).upload(data_path,
                                              mode="staging",
                                              if_exists="replace")

    table.init(
        data_sample_path=data_path,
        if_folder_exists="replace",
        if_table_config_exists="replace",
    )

    for file in TABLE_FILES:
        shutil.copy(sample_data / file, table.table_folder / file)

    table.delete(mode="all")

    table.create(
        data_path,
        if_storage_data_exists="pass",
        if_table_config_exists="pass",
    )
    assert table_exists(table, "staging")
Example 6
def test_create(table, metadatadir):

    shutil.rmtree(Path(metadatadir) / DATASET_ID / TABLE_ID,
                  ignore_errors=True)

    Dataset(dataset_id=DATASET_ID,
            metadata_path=metadatadir).create(if_exists="pass")

    Storage(dataset_id=DATASET_ID,
            table_id=TABLE_ID,
            metadata_path=metadatadir).upload(
                "tests/sample_data/municipios.csv",
                mode="staging",
                if_exists="replace")

    table.init(data_sample_path="tests/sample_data/municipios.csv",
               if_exists="replace")

    table.delete(mode="all")

    table.create()

    assert table_exists(table, mode="staging")

    table.create(if_exists="replace")

    assert table_exists(table, mode="staging")

    table.create("tests/sample_data/municipios.csv", if_exists="replace")
Example 7
def test_create_with_upload(table, metadatadir, data_path):

    table.delete("all")

    Storage(DATASET_ID, TABLE_ID).delete_table(not_found_ok=True)

    table.create(data_path, if_table_config_exists="replace")
    assert table_exists(table, mode="staging")
Example 8
def test_create_with_path(table, metadatadir, data_path, sample_data):

    table.delete("all")
    Storage(DATASET_ID, TABLE_ID).delete_table(not_found_ok=True)
    shutil.rmtree(metadatadir / DATASET_ID / TABLE_ID, ignore_errors=True)

    table.create(data_path)
    assert table_exists(table, mode="staging")
Example 9
def test_create_if_storage_data_raise(table, metadatadir, data_path):

    Storage(dataset_id=DATASET_ID, table_id=TABLE_ID, metadata_path=metadatadir).upload(
        data_path, mode="staging", if_exists="replace"
    )

    with pytest.raises(Exception):
        table.create(
            data_path,
            if_table_exists="replace",
            if_table_config_exists="replace",
            if_storage_data_exists="raise",
        )
Example 10
def test_create_no_path(table, metadatadir, data_path, sample_data):

    Storage(dataset_id=DATASET_ID, table_id=TABLE_ID, metadata_path=metadatadir).upload(
        data_path, mode="staging", if_exists="replace"
    )

    table.init(data_sample_path=data_path, if_folder_exists="replace")

    for file in TABLE_FILES:
        shutil.copy(sample_data / file, table.table_folder / file)

    table.create(if_storage_data_exists="pass", if_table_config_exists="pass")
    assert table_exists(table, "staging")
Example 11
File: main.py, project: avila/mais
def sync_bucket(
    source_bucket_name,
    dataset_id,
    table_id,
    destination_bucket_name,
    backup_bucket_name,
    mode="staging",
):
    """Copies proprosed data between storage buckets.
    Creates a backup of old data, then delete it and copies new data into the destination bucket.

    Args:
        source_bucket_name (str):
            The bucket name from which to copy data.
        dataset_id (str):
            Dataset id available in basedosdados. It should always come with table_id.
        table_id (str):
            Table id available in basedosdados.dataset_id.
            It should always come with dataset_id.
        destination_bucket_name (str):
            The bucket name which data will be copied to.
            If None, defaults to the bucket initialized when instantianting Storage object
            (check it with the Storage.bucket proprerty)
        backup_bucket_name (str):
            The bucket name for where backup data will be stored.
        mode (str): Optional.
        Folder of which dataset to update.

    Raises:
        ValueError:
            If there are no files corresponding to the given dataset_id and table_id on the source bucket
    """

    ref = Storage(dataset_id=dataset_id, table_id=table_id)

    prefix = f"{mode}/{dataset_id}/{table_id}/"

    source_ref = (
        ref.client["storage_staging"].bucket(source_bucket_name).list_blobs(
            prefix=prefix))

    destination_ref = ref.bucket.list_blobs(prefix=prefix)

    if len(list(source_ref)) == 0:
        raise ValueError("No objects found on the source bucket")

    # MAKE A BACKUP OF OLD DATA
    if len(list(destination_ref)):
        print(
            "\n########################################### COPY BACKUP ###########################################\n"
        )
        ref.copy_table(
            source_bucket_name=destination_bucket_name,
            destination_bucket_name=backup_bucket_name,
        )
        print(
            "\n########################################## DELETE OLD DATA ##########################################\n"
        )
        # DELETE OLD DATA FROM PROD
        ref.delete_table(not_found_ok=True)

    print(
        "\n########################################### COPY NEW DATA ###########################################\n"
    )
    # COPY NEW DATA TO DESTINATION
    ref.copy_table(source_bucket_name=source_bucket_name)
Example 12
def storage(metadatadir):
    # Fixture: a Storage handle for the test dataset/table constants
    return Storage(dataset_id=DATASET_ID,
                   table_id=TABLE_ID,
                   metadata_path=metadatadir)
Example 13
def sync_bucket(
    source_bucket_name,
    dataset_id,
    table_id,
    destination_bucket_name,
    backup_bucket_name,
    mode="staging",
):
    """Copies proprosed data between storage buckets.
    Creates a backup of old data, then delete it and copies new data into the destination bucket.

    Args:
        source_bucket_name (str):
            The bucket name from which to copy data.
        dataset_id (str):
            Dataset id available in basedosdados. It should always come with table_id.
        table_id (str):
            Table id available in basedosdados.dataset_id.
            It should always come with dataset_id.
        destination_bucket_name (str):
            The bucket name which data will be copied to.
            If None, defaults to the bucket initialized when instantianting Storage object
            (check it with the Storage.bucket proprerty)
        backup_bucket_name (str):
            The bucket name for where backup data will be stored.
        mode (str): Optional
            Folder of which dataset to update.[raw|staging|header|auxiliary_files|architecture]

    Raises:
        ValueError:
            If there are no files corresponding to the given dataset_id and table_id on the source bucket
    """

    ref = Storage(dataset_id=dataset_id, table_id=table_id)

    prefix = f"{mode}/{dataset_id}/{table_id}/"

    source_ref = (
        ref.client["storage_staging"].bucket(source_bucket_name).list_blobs(
            prefix=prefix))

    destination_ref = ref.bucket.list_blobs(prefix=prefix)

    if len(list(source_ref)) == 0:
        raise ValueError(
            f"No objects found on the source bucket {source_bucket_name}.{prefix}"
        )

    if len(list(destination_ref)):
        backup_bucket_blobs = list(ref.client["storage_staging"].bucket(
            backup_bucket_name).list_blobs(prefix=prefix))
        if len(backup_bucket_blobs):
            tprint(f"{mode.upper()}: DELETE BACKUP DATA")
            ref.delete_table(not_found_ok=True,
                             mode=mode,
                             bucket_name=backup_bucket_name)

        tprint(f"{mode.upper()}: BACKUP OLD DATA")
        ref.copy_table(
            source_bucket_name=destination_bucket_name,
            destination_bucket_name=backup_bucket_name,
            mode=mode,
        )

        tprint(f"{mode.upper()}: DELETE OLD DATA")
        ref.delete_table(not_found_ok=True,
                         mode=mode,
                         bucket_name=destination_bucket_name)

    tprint(f"{mode.upper()}: TRANSFER NEW DATA")
    ref.copy_table(
        source_bucket_name=source_bucket_name,
        destination_bucket_name=destination_bucket_name,
        mode=mode,
    )
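A minimal usage sketch for sync_bucket, with hypothetical bucket names; it assumes basedosdados credentials with access to all three buckets are configured:

# Hypothetical bucket names; substitute your project's buckets.
sync_bucket(
    source_bucket_name="basedosdados-dev",
    dataset_id="br_ibge_pib",
    table_id="municipio",
    destination_bucket_name="basedosdados",
    backup_bucket_name="basedosdados-backup",
    mode="staging",
)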