Example #1
def sync_bucket(
    source_bucket_name,
    dataset_id,
    table_id,
    destination_bucket_name,
    backup_bucket_name,
    mode="staging",
):
    """Copies proprosed data between storage buckets.
    Creates a backup of old data, then delete it and copies new data into the destination bucket.

    Args:
        source_bucket_name (str):
            The bucket name from which to copy data.
        dataset_id (str):
            Dataset id available in basedosdados. It should always come with table_id.
        table_id (str):
            Table id available in basedosdados.dataset_id.
            It should always come with dataset_id.
        destination_bucket_name (str):
            The bucket name which data will be copied to.
            If None, defaults to the bucket initialized when instantiating the Storage object
            (check it with the Storage.bucket property).
        backup_bucket_name (str):
            The bucket name for where backup data will be stored.
        mode (str): Optional.
            Which storage folder to update. Defaults to "staging".

    Raises:
        ValueError:
            If there are no files corresponding to the given dataset_id and table_id on the source bucket
    """

    ref = Storage(dataset_id=dataset_id, table_id=table_id)

    prefix = f"{mode}/{dataset_id}/{table_id}/"

    source_ref = (
        ref.client["storage_staging"]
        .bucket(source_bucket_name)
        .list_blobs(prefix=prefix)
    )

    destination_ref = ref.bucket.list_blobs(prefix=prefix)

    if len(list(source_ref)) == 0:
        raise ValueError("No objects found on the source bucket")

    # MAKE A BACKUP OF OLD DATA
    if len(list(destination_ref)):
        ref.copy_table(
            source_bucket_name=destination_bucket_name,
            destination_bucket_name=backup_bucket_name,
        )

        # DELETE OLD DATA FROM PROD
        ref.delete_table(not_found_ok=True)

    # COPIES DATA TO DESTINATION
    ref.copy_table(source_bucket_name=source_bucket_name)
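
A minimal usage sketch for sync_bucket, assuming basedosdados credentials are already configured; every bucket, dataset, and table name below is hypothetical:

# Hypothetical names; swap in your own buckets and basedosdados ids.
sync_bucket(
    source_bucket_name="my-staging-bucket",
    dataset_id="my_dataset",
    table_id="my_table",
    destination_bucket_name="my-prod-bucket",
    backup_bucket_name="my-backup-bucket",
    mode="staging",  # top-level storage folder; "staging" is the default
)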
Example #2
File: table.py Project: jvfe/mais
    def append(self, filepath, partitions=None, if_exists="raise", **upload_args):
        """Appends new data to existing BigQuery table.

        As long as the data has the same schema. It appends the data in the
        filepath to the existing table.

        Args:
            filepath (str or pathlib.PosixPath): Where to find the file that you want to append to the existing table
            partitions (str, pathlib.PosixPath, dict): Optional.
                Hive-structured partition as a string or dict

                * str : `<key>=<value>/<key2>=<value2>`
                * dict: `dict(key=value, key2=value2)`
            if_exists (str): Optional.
                What to do if data with the same name already exists in storage

                * 'raise' : Raises Conflict exception
                * 'replace' : Replace table
                * 'pass' : Do nothing
        """

        Storage(self.dataset_id, self.table_id, **self.main_vars).upload(
            filepath,
            mode="staging",
            partitions=partitions,
            if_exists=if_exists,
            **upload_args,
        )

        self.create(if_exists="replace")
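
A short sketch of calling append with each partition format the docstring lists, assuming Table is importable from the basedosdados package as in the mais project; the ids, file path, and partition values are hypothetical:

from basedosdados import Table

tb = Table(dataset_id="my_dataset", table_id="my_table")  # hypothetical ids

# Hive-style partition passed as a string...
tb.append("data/file.csv", partitions="year=2012/country=BR")

# ...or the equivalent dict form, replacing data that already exists
tb.append("data/file.csv", partitions={"year": 2012, "country": "BR"}, if_exists="replace")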
Example #3
def storage_copy_table(ctx, dataset_id, table_id, source_bucket_name,
                       dst_bucket_name, mode):
    Storage(dataset_id, table_id).copy_table(
        source_bucket_name=source_bucket_name,
        destination_bucket_name=dst_bucket_name,
        mode=mode,
    )
Example #4
def upload_storage(ctx, dataset_id, table_id, filepath, mode, partitions, if_exists):

    ctx.obj.pop("bucket_name")
    blob_name = Storage(dataset_id, table_id, **ctx.obj).upload(
        filepath=filepath, mode=mode, partitions=partitions, if_exists=if_exists
    )

    click.echo(click.style(f"Data was added to `{blob_name}`", fg="green",))
Example #5
def init_storage(ctx, bucket_name, replace, very_sure):

    # TODO: Create config file to store bucket_name, etc...
    ctx.obj.pop("bucket_name")
    Storage(bucket_name=bucket_name, **ctx.obj).init(
        replace=replace, very_sure=very_sure
    )

    click.echo(click.style(f"Bucket `{bucket_name}` was created", fg="green",))
Example #6
def storage_delete_table(ctx, dataset_id, table_id, mode, not_found_ok, bucket_name):
    Storage(dataset_id, table_id, **ctx.obj).delete_table(
        mode=mode, not_found_ok=not_found_ok, bucket_name=bucket_name
    )
    click.echo(
        click.style(
            f"Data was deleted from bucket `{bucket_name}`",
            fg="green",
        )
    )
Example #7
def download_storage(
    ctx, dataset_id, table_id, filename, savepath, partitions, mode, if_not_exists
):
    Storage(dataset_id, table_id, **ctx.obj).download(
        filename, savepath, partitions, mode, if_not_exists
    )

    click.echo(
        click.style(
            f"Data was downloaded to `{savepath}`",
            fg="green",
        )
    )
Example #8
File: table.py Project: jvfe/mais
    def create(
        self,
        path=None,
        job_config_params=None,
        partitioned=False,
        if_exists="raise",
        force_dataset=True,
    ):
        """Creates BigQuery table at staging dataset.

        If you add a path, it automatically saves the data in storage, creates
        the dataset folder and the BigQuery location, and creates the table and
        its configuration files.

        The new table should be located at `<dataset_id>_staging.<table_id>` in BigQuery.

        It looks for data saved in Storage at `<bucket_name>/staging/<dataset_id>/<table_id>/*`
        and builds the table.

        It currently supports the types:
            - Comma Delimited CSV

        Data can also be partitioned following the hive partitioning scheme
        `<key1>=<value1>/<key2>=<value2>`, for instance, `year=2012/country=BR`

        Args:
            path (str or pathlib.PosixPath): Where to find the file that you want to upload to create the table with
            job_config_params (dict): Optional.
                Job configuration params from bigquery
            partitioned (bool): Optional.
                Whether data is partitioned
            if_exists (str): Optional.
                What to do if table exists

                * 'raise' : Raises Conflict exception
                * 'replace' : Replace table
                * 'pass' : Do nothing
            force_dataset (bool): Creates `<dataset_id>` folder and BigQuery Dataset if it doesn't exist.

        Todo:

            * Implement if_exists=raise
            * Implement if_exists=pass
        """

        # Add data to storage
        if isinstance(path, (str, PosixPath)):

            Storage(self.dataset_id, self.table_id, **self.main_vars).upload(
                path, mode="staging", if_exists="replace"
            )

            # Create Dataset if it doesn't exist
            if force_dataset:

                dataset_obj = Dataset(self.dataset_id, **self.main_vars)

                try:
                    dataset_obj.init()
                except FileExistsError:
                    pass

                dataset_obj.create(if_exists="pass")

            self.init(data_sample_path=path, if_exists="replace")

        external_config = bigquery.ExternalConfig("CSV")
        external_config.options.skip_leading_rows = 1
        external_config.options.allow_quoted_newlines = True
        external_config.options.allow_jagged_rows = True
        external_config.autodetect = False
        external_config.schema = self._load_schema("staging")

        external_config.source_uris = (
            f"gs://basedosdados/staging/{self.dataset_id}/{self.table_id}/*"
        )

        if partitioned:

            hive_partitioning = bigquery.external_config.HivePartitioningOptions()
            hive_partitioning.mode = "AUTO"
            hive_partitioning.source_uri_prefix = self.uri.format(
                dataset=self.dataset_id, table=self.table_id
            ).replace("*", "")
            external_config.hive_partitioning = hive_partitioning

        table = bigquery.Table(self.table_full_name["staging"])

        table.external_data_configuration = external_config

        if if_exists == "replace":
            self.delete(mode="staging")

        self.client["bigquery_staging"].create_table(table)

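A sketch of how this create might be called with hive-partitioned data, per the docstring; the Table import, ids, and local path are hypothetical:

from basedosdados import Table

tb = Table(dataset_id="my_dataset", table_id="my_table")  # hypothetical ids

# Upload a local folder laid out as <key>=<value>/ partitions and build an
# external table over `staging/<dataset_id>/<table_id>/*` in the bucket.
tb.create(
    path="data/my_table/",  # e.g. contains year=2012/country=BR/data.csv
    partitioned=True,
    if_exists="replace",
)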
Example #9
def storage_delete_table(ctx, dataset_id, table_id, mode, not_found_ok,
                         bucket_name):
    Storage(dataset_id, table_id).delete_table(mode=mode,
                                               not_found_ok=not_found_ok,
                                               bucket_name=bucket_name)
Example #10
    def create(
        self,
        path=None,
        job_config_params=None,
        force_dataset=True,
        if_table_exists="raise",
        if_storage_data_exists="raise",
        if_table_config_exists="raise",
        source_format="csv",
    ):
        """Creates BigQuery table at staging dataset.

        If you add a path, it automatically saves the data in storage, creates
        the dataset folder and the BigQuery location, and creates the table and
        its configuration files.

        The new table should be located at `<dataset_id>_staging.<table_id>` in BigQuery.

        It looks for data saved in Storage at `<bucket_name>/staging/<dataset_id>/<table_id>/*`
        and builds the table.

        It currently supports the types:
            - Comma Delimited CSV

        Data can also be partitioned following the hive partitioning scheme
        `<key1>=<value1>/<key2>=<value2>`, for instance, `year=2012/country=BR`

        Args:
            path (str or pathlib.PosixPath): Where to find the file that you want to upload to create the table with
            job_config_params (dict): Optional.
                Job configuration params from bigquery
            if_table_exists (str): Optional.
                What to do if table exists

                * 'raise' : Raises Conflict exception
                * 'replace' : Replace table
                * 'pass' : Do nothing
            force_dataset (bool): Creates `<dataset_id>` folder and BigQuery Dataset if it doesn't exist.
            if_table_config_exists (str): Optional.
                What to do if config files already exist

                * 'raise' : Raises FileExistsError
                * 'replace' : Replace with blank template
                * 'pass' : Do nothing
            if_storage_data_exists (str): Optional.
                What to do if data already exists on your bucket:

                * 'raise' : Raises Conflict exception
                * 'replace' : Replace table
                * 'pass' : Do nothing
            source_format (str): Optional.
                Data source format. Only 'csv' is supported. Defaults to 'csv'.
        """

        if path is None:

            # Look if table data already exists at Storage
            data = self.client["storage_staging"].list_blobs(
                self.bucket_name,
                prefix=f"staging/{self.dataset_id}/{self.table_id}")

            # Raise: Cannot create table without external data
            if not list(data):
                raise BaseDosDadosException(
                    "You must provide a path for uploading data")

        # Add data to storage
        if isinstance(path, (str, Path)):
            Storage(self.dataset_id, self.table_id, **self.main_vars).upload(
                path, mode="staging", if_exists=if_storage_data_exists
            )

        # Create Dataset if it doesn't exist
        if force_dataset:

            dataset_obj = Dataset(self.dataset_id, **self.main_vars)

            try:
                dataset_obj.init()
            except FileExistsError:
                pass

            dataset_obj.create(if_exists="pass")

        self.init(
            data_sample_path=path,
            if_folder_exists="replace",
            if_table_config_exists=if_table_config_exists,
        )

        table = bigquery.Table(self.table_full_name["staging"])

        table.external_data_configuration = Datatype(
            self, source_format, "staging", partitioned=self._is_partitioned()
        ).external_config

        # Look up if the table already exists
        table_ref = None
        try:
            table_ref = self.client["bigquery_staging"].get_table(
                self.table_full_name["staging"])

        except google.api_core.exceptions.NotFound:
            pass

        if isinstance(table_ref, google.cloud.bigquery.table.Table):

            if if_table_exists == "pass":

                return None

            elif if_table_exists == "raise":

                raise FileExistsError(
                    "Table already exists, choose replace if you want to overwrite it"
                )

        if if_table_exists == "replace":

            self.delete(mode="staging")

        self.client["bigquery_staging"].create_table(table)