def delete_dataset(ctx, dataset_id, mode):

    if click.confirm(f"Are you sure you want to delete `{dataset_id}`?"):

        Dataset(dataset_id=dataset_id, **ctx.obj).delete(mode=mode)

        click.echo(
            click.style(
                mode_text(mode, "deleted", dataset_id),
                fg="green",
            )
        )

def update_dataset(ctx, dataset_id, mode):

    Dataset(dataset_id=dataset_id, **ctx.obj).update(mode=mode)

    click.echo(
        click.style(
            mode_text(mode, "updated", dataset_id),
            fg="green",
        )
    )

def init_dataset(ctx, dataset_id, replace):

    d = Dataset(dataset_id=dataset_id, **ctx.obj).init(replace=replace)

    click.echo(
        click.style(
            f"Dataset `{dataset_id}` folder and metadata were created at {d.metadata_path}",
            fg="green",
        )
    )

def publicize_dataset(ctx, dataset_id):

    Dataset(dataset_id=dataset_id, **ctx.obj).publicize()

    click.echo(
        click.style(
            f"Dataset `{dataset_id}` became public!",
            fg="green",
        )
    )

def create_dataset(ctx, dataset_id, mode, if_exists):

    Dataset(dataset_id=dataset_id, **ctx.obj).create(mode=mode, if_exists=if_exists)

    click.echo(
        click.style(
            mode_text(mode, "created", dataset_id),
            fg="green",
        )
    )

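# Usage sketch (added for illustration, not part of the original module): the
# commands above are thin wrappers around the `Dataset` class, so the same flow
# can be driven from plain Python. The dataset_id and the "all" mode value are
# assumptions, and `ctx.obj` normally supplies extra config keywords that are
# omitted here.
def _example_dataset_workflow():
    """Mirrors the CLI flow: init -> create -> update -> publicize."""
    dataset = Dataset(dataset_id="example_dataset_id")  # hypothetical id

    dataset.init(replace=False)                   # local folder + metadata files
    dataset.create(mode="all", if_exists="pass")  # create the BigQuery dataset(s)
    dataset.update(mode="all")                    # push metadata changes
    dataset.publicize()                           # grant public read access
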
def create(
    self,
    path=None,
    job_config_params=None,
    partitioned=False,
    if_exists="raise",
    force_dataset=True,
):
    """Creates a BigQuery table at the staging dataset.

    If you add a path, it automatically saves the data in the storage,
    creates a dataset folder and BigQuery location, as well as creating the
    table and its configuration files.

    The new table should be located at `<dataset_id>_staging.<table_id>` in BigQuery.

    It looks for data saved in Storage at `<bucket_name>/staging/<dataset_id>/<table_id>/*`
    and builds the table.

    It currently supports the types:

        - Comma Delimited CSV

    Data can also be partitioned following the hive partitioning scheme
    `<key1>=<value1>/<key2>=<value2>`, for instance, `year=2012/country=BR`.

    Args:
        path (str or pathlib.PosixPath): Where to find the file that you want
            to upload to create the table with.
        job_config_params (dict): Optional.
            Job configuration params from bigquery.
        partitioned (bool): Optional.
            Whether data is partitioned.
        if_exists (str): Optional.
            What to do if table exists:

            * 'raise' : Raises Conflict exception
            * 'replace' : Replace table
            * 'pass' : Do nothing
        force_dataset (bool): Creates `<dataset_id>` folder and BigQuery
            Dataset if it doesn't exist.

    Todo:

        * Implement if_exists=raise
        * Implement if_exists=pass
    """
    # Add data to storage
    if isinstance(
        path,
        (
            str,
            PosixPath,
        ),
    ):
        Storage(self.dataset_id, self.table_id, **self.main_vars).upload(
            path, mode="staging", if_exists="replace"
        )

    # Create Dataset if it doesn't exist
    if force_dataset:
        dataset_obj = Dataset(self.dataset_id, **self.main_vars)

        try:
            dataset_obj.init()
        except FileExistsError:
            pass

        dataset_obj.create(if_exists="pass")

    self.init(data_sample_path=path, if_exists="replace")

    # Configure the external CSV source that backs the staging table
    external_config = bigquery.ExternalConfig("CSV")
    external_config.options.skip_leading_rows = 1
    external_config.options.allow_quoted_newlines = True
    external_config.options.allow_jagged_rows = True
    external_config.autodetect = False
    external_config.schema = self._load_schema("staging")
    external_config.source_uris = (
        f"gs://basedosdados/staging/{self.dataset_id}/{self.table_id}/*"
    )

    if partitioned:
        hive_partitioning = bigquery.external_config.HivePartitioningOptions()
        hive_partitioning.mode = "AUTO"
        hive_partitioning.source_uri_prefix = self.uri.format(
            dataset=self.dataset_id, table=self.table_id
        ).replace("*", "")

        external_config.hive_partitioning = hive_partitioning

    table = bigquery.Table(self.table_full_name["staging"])
    table.external_data_configuration = external_config

    if if_exists == "replace":
        self.delete(mode="staging")

    self.client["bigquery_staging"].create_table(table)

    table = bigquery.Table(self.table_full_name["staging"])

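# Usage sketch for the `create` signature above (added for illustration). It
# assumes the method belongs to a `Table` class constructed with `dataset_id`
# and `table_id`; the ids, the local path, and the hive-style folder layout
# (`year=2012/country=BR/...`) are hypothetical examples based on the docstring.
def _example_table_create():
    table = Table(dataset_id="example_dataset", table_id="example_table")

    # Uploads the local data to the staging bucket, then builds the external
    # table; partitioned=True switches on HivePartitioningOptions(mode="AUTO").
    table.create(
        path="data/example_table/",  # e.g. contains year=2012/country=BR/data.csv
        partitioned=True,
        if_exists="replace",         # drop the staging table before re-creating it
    )
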
def create(
    self,
    path=None,
    job_config_params=None,
    force_dataset=True,
    if_table_exists="raise",
    if_storage_data_exists="raise",
    if_table_config_exists="raise",
    source_format="csv",
):
    """Creates a BigQuery table at the staging dataset.

    If you add a path, it automatically saves the data in the storage,
    creates a dataset folder and BigQuery location, as well as creating the
    table and its configuration files.

    The new table should be located at `<dataset_id>_staging.<table_id>` in BigQuery.

    It looks for data saved in Storage at `<bucket_name>/staging/<dataset_id>/<table_id>/*`
    and builds the table.

    It currently supports the types:

        - Comma Delimited CSV

    Data can also be partitioned following the hive partitioning scheme
    `<key1>=<value1>/<key2>=<value2>`, for instance, `year=2012/country=BR`.

    Args:
        path (str or pathlib.PosixPath): Where to find the file that you want
            to upload to create the table with.
        job_config_params (dict): Optional.
            Job configuration params from bigquery.
        if_table_exists (str): Optional.
            What to do if table exists:

            * 'raise' : Raises Conflict exception
            * 'replace' : Replace table
            * 'pass' : Do nothing
        force_dataset (bool): Creates `<dataset_id>` folder and BigQuery
            Dataset if it doesn't exist.
        if_table_config_exists (str): Optional.
            What to do if config files already exist:

            * 'raise' : Raises FileExistsError
            * 'replace' : Replace with blank template
            * 'pass' : Do nothing
        if_storage_data_exists (str): Optional.
            What to do if data already exists on your bucket:

            * 'raise' : Raises Conflict exception
            * 'replace' : Replace data
            * 'pass' : Do nothing
        source_format (str): Optional.
            Data source format. Only 'csv' is supported. Defaults to 'csv'.
    """
    if path is None:
        # Look if table data already exists at Storage; materialise the
        # iterator so the emptiness check below actually works
        data = list(
            self.client["storage_staging"].list_blobs(
                self.bucket_name,
                prefix=f"staging/{self.dataset_id}/{self.table_id}",
            )
        )

        # Raise: Cannot create table without external data
        if not data:
            raise BaseDosDadosException(
                "You must provide a path for uploading data"
            )

    # Add data to storage
    if isinstance(
        path,
        (
            str,
            Path,
        ),
    ):
        Storage(self.dataset_id, self.table_id, **self.main_vars).upload(
            path, mode="staging", if_exists=if_storage_data_exists
        )

    # Create Dataset if it doesn't exist
    if force_dataset:
        dataset_obj = Dataset(self.dataset_id, **self.main_vars)

        try:
            dataset_obj.init()
        except FileExistsError:
            pass

        dataset_obj.create(if_exists="pass")

    self.init(
        data_sample_path=path,
        if_folder_exists="replace",
        if_table_config_exists=if_table_config_exists,
    )

    table = bigquery.Table(self.table_full_name["staging"])
    table.external_data_configuration = Datatype(
        self, source_format, "staging", partitioned=self._is_partitioned()
    ).external_config

    # Look up whether the table already exists
    table_ref = None
    try:
        table_ref = self.client["bigquery_staging"].get_table(
            self.table_full_name["staging"]
        )
    except google.api_core.exceptions.NotFound:
        pass

    if isinstance(table_ref, google.cloud.bigquery.table.Table):
        if if_table_exists == "pass":
            return None

        elif if_table_exists == "raise":
            raise FileExistsError(
                "Table already exists, choose replace if you want to overwrite it"
            )

    if if_table_exists == "replace":
        self.delete(mode="staging")

    self.client["bigquery_staging"].create_table(table)

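# Usage sketch for the newer `create` signature above (added for illustration;
# same `Table` class assumption as before, and the ids and path are
# hypothetical). The three `if_*` flags control the storage upload, the config
# files, and the BigQuery table independently, which is the main behavioural
# difference from the older signature.
def _example_table_create_with_flags():
    table = Table(dataset_id="example_dataset", table_id="example_table")

    table.create(
        path="data/example_table.csv",
        if_storage_data_exists="replace",  # re-upload the staging data
        if_table_config_exists="pass",     # keep existing config files
        if_table_exists="replace",         # drop and re-create the staging table
    )

    # Called without `path`, it only builds the table from whatever already
    # sits under `staging/<dataset_id>/<table_id>/` and raises
    # BaseDosDadosException if nothing is there.
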