Example #1
def storage_copy_table(ctx, dataset_id, table_id, source_bucket_name,
                       dst_bucket_name, mode):
    Storage(dataset_id, table_id, **ctx.obj).copy_table(
        source_bucket_name=source_bucket_name,
        destination_bucket_name=dst_bucket_name,
        mode=mode,
    )
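Each of these CLI entry points follows the same click pattern: the function receives a click context whose `obj` dict carries the shared `Storage` constructor arguments, and the command-specific values arrive as parameters. A minimal sketch of how such a command might be wired up, assuming the `basedosdados` import path and using hypothetical group and command names:

import click

from basedosdados import Storage  # assumed import path


@click.group()
@click.pass_context
def cli(ctx):
    # Hypothetical group: shared Storage kwargs (credentials, bucket_name, ...)
    # are stashed in ctx.obj and forwarded to every subcommand.
    ctx.obj = {}


@cli.command(name="copy_table")  # hypothetical command name
@click.argument("dataset_id")
@click.argument("table_id")
@click.option("--source_bucket_name", required=True)
@click.option("--dst_bucket_name", required=True)
@click.option("--mode", default="staging")
@click.pass_context
def storage_copy_table(ctx, dataset_id, table_id, source_bucket_name,
                       dst_bucket_name, mode):
    Storage(dataset_id, table_id, **ctx.obj).copy_table(
        source_bucket_name=source_bucket_name,
        destination_bucket_name=dst_bucket_name,
        mode=mode,
    )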
Example #2
    def append(self,
               filepath,
               partitions=None,
               if_exists="replace",
               **upload_args):
        """Appends new data to existing BigQuery table.

        As long as the data has the same schema. It appends the data in the
        filepath to the existing table.

        Args:
            filepath (str or pathlib.PosixPath): Where to find the file that you want to upload and append to the table
            partitions (str, pathlib.PosixPath, dict): Optional.
                Hive structured partition as a string or dict

                * str : `<key>=<value>/<key2>=<value2>`
                * dict: `dict(key=value, key2=value2)`
            if_exists (str): Optional.
                What to do if data with the same name already exists in storage

                * 'raise' : Raises Conflict exception
                * 'replace' : Replace table
                * 'pass' : Do nothing
        """
        if not self.table_exists("staging"):
            raise BaseDosDadosException(
                "You cannot append to a table that does not exist")

        Storage(self.dataset_id, self.table_id, **self.main_vars).upload(
            filepath,
            mode="staging",
            partitions=partitions,
            if_exists=if_exists,
            **upload_args,
        )
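A minimal usage sketch for `append`, assuming the method lives on the library's `Table` class and that the staging table already exists (the IDs and path below are placeholders):

from basedosdados import Table  # assumed import path

table = Table(dataset_id="my_dataset", table_id="my_table")  # placeholder IDs
table.append(filepath="data/new_rows.csv", if_exists="replace")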
Example #3
def storage_delete_table(ctx, dataset_id, table_id, mode, not_found_ok,
                         bucket_name):
    Storage(dataset_id, table_id, **ctx.obj).delete_table(
        mode=mode,
        not_found_ok=not_found_ok,
        bucket_name=bucket_name,
    )
    click.echo(
        click.style(
            f"Data was deleted from bucket `{bucket_name}`",
            fg="green",
        ))
Example #4
def download_storage(ctx, dataset_id, table_id, filename, savepath, partitions,
                     mode, if_not_exists):
    Storage(dataset_id, table_id, **ctx.obj).download(
        filename, savepath, partitions, mode, if_not_exists)

    click.echo(
        click.style(
            f"Data was downloaded to `{savepath}`",
            fg="green",
        ))
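The download counterpart can also be called directly rather than through the CLI; a minimal sketch with placeholder arguments, assuming the `Storage.download` signature shown above and defaults for the remaining parameters:

from basedosdados import Storage  # assumed import path

Storage(dataset_id="my_dataset", table_id="my_table").download(
    filename="*",               # placeholder: fetch every file
    savepath="data/downloads",  # placeholder local destination
    mode="staging",
)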
Example #5
def init_storage(ctx, bucket_name, replace, very_sure):

    # TODO: Create config file to store bucket_name, etc...
    ctx.obj.pop("bucket_name")
    Storage(bucket_name=bucket_name, **ctx.obj).init(replace=replace,
                                                     very_sure=very_sure)

    click.echo(click.style(
        f"Bucket `{bucket_name}` was created",
        fg="green",
    ))
Example #6
def upload_storage(ctx, dataset_id, table_id, filepath, mode, partitions,
                   if_exists):

    ctx.obj.pop("bucket_name")
    blob_name = Storage(dataset_id, table_id, **ctx.obj).upload(
        filepath=filepath,
        mode=mode,
        partitions=partitions,
        if_exists=if_exists,
    )

    click.echo(click.style(
        f"Data was added to `{blob_name}`",
        fg="green",
    ))
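Outside the CLI, the same upload can be done directly with the `Storage` class; `upload` returns the blob name, as the command above relies on. A minimal sketch with placeholder IDs and path:

from basedosdados import Storage  # assumed import path

st = Storage(dataset_id="my_dataset", table_id="my_table")  # placeholder IDs
blob_name = st.upload(
    filepath="data/my_table.csv",  # placeholder local file
    mode="staging",
    if_exists="replace",
)
print(f"Data was added to `{blob_name}`")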
Example #7
    def create(
        self,
        path=None,
        job_config_params=None,
        force_dataset=True,
        if_table_exists="raise",
        if_storage_data_exists="raise",
        if_table_config_exists="raise",
        source_format="csv",
        columns_config_url=None,
    ):
        """Creates BigQuery table at staging dataset.

        If you add a path, it automatically saves the data in storage,
        creates the dataset folder and the BigQuery location, and creates the
        table and its configuration files.

        The new table should be located at `<dataset_id>_staging.<table_id>` in BigQuery.

        It looks for data saved in Storage at `<bucket_name>/staging/<dataset_id>/<table_id>/*`
        and builds the table.

        It currently supports the types:

        - Comma Delimited CSV

        Data can also be partitioned following the hive partitioning scheme
        `<key1>=<value1>/<key2>=<value2>` - for instance,
        `year=2012/country=BR`. Partitions are automatically detected
        by searching for `partitions` in the `table_config.yaml` file.

        Args:
            path (str or pathlib.PosixPath): Where to find the file that you want to upload to create the table
            job_config_params (dict): Optional.
                Job configuration params from bigquery
            if_table_exists (str): Optional.
                What to do if table exists

                * 'raise' : Raises Conflict exception
                * 'replace' : Replace table
                * 'pass' : Do nothing
            force_dataset (bool): Creates the `<dataset_id>` folder and BigQuery Dataset if they don't exist.
            if_table_config_exists (str): Optional.
                What to do if config files already exist

                * 'raise' : Raises FileExistsError
                * 'replace' : Replace with blank template
                * 'pass' : Do nothing
            if_storage_data_exists (str): Optional.
                What to do if data already exists on your bucket:

                * 'raise' : Raises Conflict exception
                * 'replace' : Replace table
                * 'pass' : Do nothing
            source_format (str): Optional.
                Data source format. Only 'csv' is supported. Defaults to 'csv'.
            columns_config_url (str): Optional.
                Google Sheets URL in the format
                https://docs.google.com/spreadsheets/d/<table_key>/edit#gid=<table_gid>.
                The sheet must contain the columns "coluna" (column name) and
                "descricao" (column description).

        """

        if path is None:

            # Check if table data already exists in Storage
            # (materialize the blob iterator so the emptiness check works)
            data = list(self.client["storage_staging"].list_blobs(
                self.bucket_name,
                prefix=f"staging/{self.dataset_id}/{self.table_id}"))

            # Raise: cannot create a table without external data
            if not data:
                raise BaseDosDadosException(
                    "You must provide a path for uploading data")

        # Add data to storage
        if isinstance(path, (str, Path)):

            Storage(self.dataset_id, self.table_id, **self.main_vars).upload(
                path, mode="staging", if_exists=if_storage_data_exists)

        # Create Dataset if it doesn't exist
        if force_dataset:

            dataset_obj = Dataset(self.dataset_id, **self.main_vars)

            try:
                dataset_obj.init()
            except FileExistsError:
                pass

            dataset_obj.create(if_exists="pass")

        self.init(
            data_sample_path=path,
            if_folder_exists="replace",
            if_table_config_exists=if_table_config_exists,
            columns_config_url=columns_config_url,
        )

        table = bigquery.Table(self.table_full_name["staging"])

        table.external_data_configuration = Datatype(
            self, source_format, "staging", partitioned=self._is_partitioned()
        ).external_config

        # Check if the table already exists
        table_ref = None
        try:
            table_ref = self.client["bigquery_staging"].get_table(
                self.table_full_name["staging"])

        except google.api_core.exceptions.NotFound:
            pass

        if isinstance(table_ref, google.cloud.bigquery.table.Table):

            if if_table_exists == "pass":
                return None

            if if_table_exists == "raise":
                raise FileExistsError(
                    "Table already exists, choose replace if you want to overwrite it"
                )

        if if_table_exists == "replace":
            self.delete(mode="staging")

        self.client["bigquery_staging"].create_table(table)