Ejemplo n.º 1
0
Archivo: table.py Proyecto: jvfe/mais
    def create(
        self,
        path=None,
        job_config_params=None,
        partitioned=False,
        if_exists="raise",
        force_dataset=True,
    ):
        """Creates BigQuery table at staging dataset.

        If you add a path, it automatically saves the data in the storage,
        creates a datasets folder and BigQuery location, besides creating the
        table and its configuration files.

        The new table should be located at `<dataset_id>_staging.<table_id>` in BigQuery.

        The table is an external CSV table whose source is
        `gs://basedosdados/staging/<dataset_id>/<table_id>/*` (the bucket name
        is hard-coded in this method). The first row of each file is skipped
        as a header, and the schema comes from `self._load_schema("staging")`
        rather than from autodetection.

        It currently supports the types:
            - Comma Delimited CSV

        Data can also be partitioned following the hive partitioning scheme
        `<key1>=<value1>/<key2>=<value2>`, for instance, `year=2012/country=BR`

        Args:
            path (str or pathlib.PosixPath): Where to find the file that you want to upload to create a table with.
                When omitted, nothing is uploaded and neither the dataset nor
                the table configuration files are initialised.
            job_config_params (dict): Optional.
                Job configuration params from bigquery. Currently unused by
                this method.
            partitioned (bool): Optional.
                Whether data is partitioned
            if_exists (str): Optional
                What to do if table exists

                * 'replace' : Calls `self.delete(mode="staging")` before creating the table
                * any other value : no explicit handling here; the call goes
                  straight to `create_table` (presumably failing with a
                  Conflict error if the table exists -- TODO confirm)
            force_dataset (bool): Creates `<dataset_id>` folder and BigQuery Dataset if it doesn't exists.
                Only takes effect when `path` is provided.

        Todo:

            * Implement if_exists=raise
            * Implement if_exists=pass
        """

        # Add data to storage
        if isinstance(path, (str, PosixPath)):

            Storage(self.dataset_id, self.table_id, **self.main_vars).upload(
                path, mode="staging", if_exists="replace"
            )

            # Create Dataset if it doesn't exist
            if force_dataset:

                dataset_obj = Dataset(self.dataset_id, **self.main_vars)

                # An already-initialised dataset folder is fine; keep going.
                try:
                    dataset_obj.init()
                except FileExistsError:
                    pass

                dataset_obj.create(if_exists="pass")

            self.init(data_sample_path=path, if_exists="replace")

        # Describe the external CSV source backing the staging table.
        external_config = bigquery.ExternalConfig("CSV")
        external_config.options.skip_leading_rows = 1
        external_config.options.allow_quoted_newlines = True
        external_config.options.allow_jagged_rows = True
        external_config.autodetect = False
        external_config.schema = self._load_schema("staging")

        external_config.source_uris = (
            f"gs://basedosdados/staging/{self.dataset_id}/{self.table_id}/*"
        )

        if partitioned:

            hive_partitioning = bigquery.external_config.HivePartitioningOptions()
            hive_partitioning.mode = "AUTO"
            # The prefix is the table's URI template with the wildcard removed.
            hive_partitioning.source_uri_prefix = self.uri.format(
                dataset=self.dataset_id, table=self.table_id
            ).replace("*", "")
            external_config.hive_partitioning = hive_partitioning

        table = bigquery.Table(self.table_full_name["staging"])
        table.external_data_configuration = external_config

        if if_exists == "replace":
            self.delete(mode="staging")

        self.client["bigquery_staging"].create_table(table)
Ejemplo n.º 2
0
    def create(
        self,
        path=None,
        job_config_params=None,
        force_dataset=True,
        if_table_exists="raise",
        if_storage_data_exists="raise",
        if_table_config_exists="raise",
        source_format="csv",
    ):
        """Creates BigQuery table at staging dataset.

        If you add a path, it automatically saves the data in the storage,
        creates a datasets folder and BigQuery location, besides creating the
        table and its configuration files.

        The new table should be located at `<dataset_id>_staging.<table_id>` in BigQuery.

        It looks for data saved in Storage at `<bucket_name>/staging/<dataset_id>/<table_id>/*`
        and builds the table.

        It currently supports the types:
            - Comma Delimited CSV

        Data can also be partitioned following the hive partitioning scheme
        `<key1>=<value1>/<key2>=<value2>`, for instance, `year=2012/country=BR`.
        Whether the table is partitioned is taken from `self._is_partitioned()`,
        not from an argument.

        Args:
            path (str or pathlib.Path): Where to find the file that you want to upload to create a table with.
                When omitted, data must already exist in Storage under the
                table's staging prefix.
            job_config_params (dict): Optional.
                Job configuration params from bigquery. Currently unused by
                this method.
            force_dataset (bool): Creates `<dataset_id>` folder and BigQuery Dataset if it doesn't exists.
            if_table_exists (str): Optional
                What to do if table exists

                * 'raise' : Raises FileExistsError
                * 'replace' : Replace table
                * 'pass' : Do nothing
            if_storage_data_exists (str): Optional.
                What to do if data already exists on your bucket. The value is
                forwarded as `if_exists` to `Storage.upload`:

                * 'raise' : Raises Conflict exception
                * 'replace' : Replace data
                * 'pass' : Do nothing
            if_table_config_exists (str): Optional.
                What to do if config files already exist. The value is
                forwarded to `self.init`:

                 * 'raise': Raises FileExistsError
                 * 'replace': Replace with blank template
                 * 'pass': Do nothing
            source_format (str): Optional
                Data source format. Only 'csv' is supported. Defaults to 'csv'.

        Raises:
            BaseDosDadosException: If `path` is None and no data is found in
                Storage under `staging/<dataset_id>/<table_id>`.
            FileExistsError: If the table already exists and
                `if_table_exists` is 'raise'.
        """

        if path is None:

            # Look if table data already exists at Storage
            blobs = self.client["storage_staging"].list_blobs(
                self.bucket_name,
                prefix=f"staging/{self.dataset_id}/{self.table_id}",
            )

            # Raise: Cannot create table without external data.
            # list_blobs returns a lazy iterator object that is truthy even
            # when it yields nothing, so the emptiness check must actually
            # pull the first element.
            if next(iter(blobs), None) is None:
                raise BaseDosDadosException(
                    "You must provide a path for uploading data")

        # Add data to storage
        if isinstance(path, (str, Path)):

            Storage(self.dataset_id, self.table_id,
                    **self.main_vars).upload(path,
                                             mode="staging",
                                             if_exists=if_storage_data_exists)

        # Create Dataset if it doesn't exist
        if force_dataset:

            dataset_obj = Dataset(self.dataset_id, **self.main_vars)

            # An already-initialised dataset folder is fine; keep going.
            try:
                dataset_obj.init()
            except FileExistsError:
                pass

            dataset_obj.create(if_exists="pass")

        self.init(
            data_sample_path=path,
            if_folder_exists="replace",
            if_table_config_exists=if_table_config_exists,
        )

        table = bigquery.Table(self.table_full_name["staging"])

        table.external_data_configuration = Datatype(
            self, source_format, "staging",
            partitioned=self._is_partitioned()).external_config

        # Lookup if table already exists
        table_ref = None
        try:
            table_ref = self.client["bigquery_staging"].get_table(
                self.table_full_name["staging"])

        except google.api_core.exceptions.NotFound:
            pass

        if isinstance(table_ref, google.cloud.bigquery.table.Table):

            if if_table_exists == "pass":

                return None

            elif if_table_exists == "raise":

                raise FileExistsError(
                    "Table already exists, choose replace if you want to overwrite it"
                )

        if if_table_exists == "replace":

            self.delete(mode="staging")

        self.client["bigquery_staging"].create_table(table)