def append(self, filepath, partitions=None, if_exists="raise", **upload_args):
    """Appends new data to an existing BigQuery table.

    As long as the data has the same schema, it appends the data in
    `filepath` to the existing table.

    Args:
        filepath (str or pathlib.PosixPath): Where to find the file that
            you want to upload to create a table with
        partitions (str, pathlib.PosixPath, dict): Optional.
            Hive structured partition as a string or dict

            * str : `<key>=<value>/<key2>=<value2>`
            * dict: `dict(key=value, key2=value2)`
        if_exists (str): Optional.
            What to do if data with same name exists in storage

            * 'raise' : Raises Conflict exception
            * 'replace' : Replace table
            * 'pass' : Do nothing
    """
    Storage(self.dataset_id, self.table_id, **self.main_vars).upload(
        filepath,
        mode="staging",
        partitions=partitions,
        if_exists=if_exists,
        **upload_args,
    )

    self.create(if_exists="replace")
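# Usage sketch (not part of the library source): appending a new partition of data
# to an already-created table. The import path, ids and file name below are
# illustrative assumptions, not values taken from the code above.
from pathlib import Path

from basedosdados import Table  # assumed public import

tb = Table(dataset_id="br_example", table_id="municipio")  # hypothetical ids

tb.append(
    filepath=Path("data/ano=2021/municipio.csv"),
    partitions="ano=2021",
    if_exists="replace",
)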
def storage_copy_table(
    ctx, dataset_id, table_id, source_bucket_name, dst_bucket_name, mode
):
    Storage(dataset_id, table_id).copy_table(
        source_bucket_name=source_bucket_name,
        destination_bucket_name=dst_bucket_name,
        mode=mode,
    )
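# Usage sketch: calling Storage.copy_table directly, mirroring the CLI wrapper above.
# Bucket names and ids are placeholders.
from basedosdados import Storage  # assumed public import

Storage(dataset_id="br_example", table_id="municipio").copy_table(
    source_bucket_name="source-bucket",
    destination_bucket_name="destination-bucket",
    mode="staging",
)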
def sync_bucket(
    source_bucket_name,
    dataset_id,
    table_id,
    destination_bucket_name,
    backup_bucket_name,
    mode="staging",
):
    """Copies proposed data between storage buckets.

    Creates a backup of the old data, then deletes it and copies the new
    data into the destination bucket.

    Args:
        source_bucket_name (str): The bucket name from which to copy data.
        dataset_id (str): Dataset id available in basedosdados.
            It should always come with table_id.
        table_id (str): Table id available in basedosdados.dataset_id.
            It should always come with dataset_id.
        destination_bucket_name (str): The bucket name to which data will be
            copied. If None, defaults to the bucket initialized when
            instantiating the Storage object
            (check it with the Storage.bucket property).
        backup_bucket_name (str): The bucket name where backup data will be stored.
        mode (str): Optional. Folder of which dataset to update.

    Raises:
        ValueError: If there are no files corresponding to the given
            dataset_id and table_id on the source bucket.
    """
    ref = Storage(dataset_id=dataset_id, table_id=table_id)

    prefix = f"{mode}/{dataset_id}/{table_id}/"

    source_ref = (
        ref.client["storage_staging"]
        .bucket(source_bucket_name)
        .list_blobs(prefix=prefix)
    )

    destination_ref = ref.bucket.list_blobs(prefix=prefix)

    if len(list(source_ref)) == 0:
        raise ValueError("No objects found on the source bucket")

    # MAKE A BACKUP OF OLD DATA
    if len(list(destination_ref)):
        ref.copy_table(
            source_bucket_name=destination_bucket_name,
            destination_bucket_name=backup_bucket_name,
        )

        # DELETE OLD DATA FROM PROD
        ref.delete_table(not_found_ok=True)

    # COPIES DATA TO DESTINATION
    ref.copy_table(source_bucket_name=source_bucket_name)
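# Usage sketch for sync_bucket: back up and replace a table's staging files in the
# destination bucket with the files found in the source bucket. All bucket names
# and ids are placeholders.
sync_bucket(
    source_bucket_name="ci-upload-bucket",
    dataset_id="br_example",
    table_id="municipio",
    destination_bucket_name="basedosdados-dev",
    backup_bucket_name="basedosdados-backup",
    mode="staging",
)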
def upload_storage(ctx, dataset_id, table_id, filepath, mode, partitions, if_exists):
    ctx.obj.pop("bucket_name")
    blob_name = Storage(dataset_id, table_id, **ctx.obj).upload(
        filepath=filepath, mode=mode, partitions=partitions, if_exists=if_exists
    )

    click.echo(
        click.style(
            f"Data was added to `{blob_name}`",
            fg="green",
        )
    )
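# Usage sketch: the direct Storage.upload call that the CLI wrapper above performs.
# Ids, file path and partition value are illustrative.
from basedosdados import Storage  # assumed public import

blob_name = Storage(dataset_id="br_example", table_id="municipio").upload(
    filepath="data/municipio.csv",
    mode="staging",
    partitions="ano=2021",
    if_exists="raise",
)
print(blob_name)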
def init_storage(ctx, bucket_name, replace, very_sure):
    # TODO: Create config file to store bucket_name, etc...
    ctx.obj.pop("bucket_name")
    Storage(bucket_name=bucket_name, **ctx.obj).init(
        replace=replace, very_sure=very_sure
    )

    click.echo(
        click.style(
            f"Bucket `{bucket_name}` was created",
            fg="green",
        )
    )
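# Usage sketch: initializing a bucket directly, as init_storage does, assuming the
# Storage constructor accepts a bare bucket_name here. The bucket name is a placeholder.
from basedosdados import Storage  # assumed public import

Storage(bucket_name="my-project-bucket").init(replace=False, very_sure=False)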
def storage_delete_table(ctx, dataset_id, table_id, mode, not_found_ok, bucket_name):
    Storage(dataset_id, table_id, **ctx.obj).delete_table(
        mode=mode, not_found_ok=not_found_ok, bucket_name=bucket_name
    )

    click.echo(
        click.style(
            f"Data was deleted from bucket `{bucket_name}`",
            fg="green",
        )
    )
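# Usage sketch: deleting a table's staging files, mirroring the CLI wrapper above.
# Ids and bucket name are placeholders.
from basedosdados import Storage  # assumed public import

Storage(dataset_id="br_example", table_id="municipio").delete_table(
    mode="staging", not_found_ok=True, bucket_name="basedosdados-dev"
)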
def download_storage(
    ctx, dataset_id, table_id, filename, savepath, partitions, mode, if_not_exists
):
    Storage(dataset_id, table_id, **ctx.obj).download(
        filename, savepath, partitions, mode, if_not_exists
    )

    click.echo(
        click.style(
            f"Data was downloaded to `{savepath}`",
            fg="green",
        )
    )
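# Usage sketch: downloading a table's staging files locally, mirroring the CLI wrapper
# above. Arguments are passed positionally in the same order the wrapper uses;
# ids and paths are placeholders.
from basedosdados import Storage  # assumed public import

Storage(dataset_id="br_example", table_id="municipio").download(
    "*", "./downloads", None, "staging", "raise"
)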
def create(
    self,
    path=None,
    job_config_params=None,
    partitioned=False,
    if_exists="raise",
    force_dataset=True,
):
    """Creates a BigQuery table in the staging dataset.

    If you add a path, it automatically saves the data in storage, creates
    the dataset folder and BigQuery location, besides creating the table
    and its configuration files.

    The new table should be located at `<dataset_id>_staging.<table_id>` in BigQuery.

    It looks for data saved in Storage at
    `<bucket_name>/staging/<dataset_id>/<table_id>/*` and builds the table.

    It currently supports the types:
        - Comma Delimited CSV

    Data can also be partitioned following the hive partitioning scheme
    `<key1>=<value1>/<key2>=<value2>`, for instance, `year=2012/country=BR`

    Args:
        path (str or pathlib.PosixPath): Where to find the file that you want
            to upload to create a table with
        job_config_params (dict): Optional.
            Job configuration params from bigquery
        partitioned (bool): Optional.
            Whether data is partitioned
        if_exists (str): Optional.
            What to do if table exists

            * 'raise' : Raises Conflict exception
            * 'replace' : Replace table
            * 'pass' : Do nothing
        force_dataset (bool): Creates `<dataset_id>` folder and BigQuery
            Dataset if it doesn't exist.

    Todo:

        * Implement if_exists=raise
        * Implement if_exists=pass
    """
    # Add data to storage
    if isinstance(
        path,
        (
            str,
            PosixPath,
        ),
    ):
        Storage(self.dataset_id, self.table_id, **self.main_vars).upload(
            path, mode="staging", if_exists="replace"
        )

        # Create Dataset if it doesn't exist
        if force_dataset:
            dataset_obj = Dataset(self.dataset_id, **self.main_vars)

            try:
                dataset_obj.init()
            except FileExistsError:
                pass

            dataset_obj.create(if_exists="pass")

        self.init(data_sample_path=path, if_exists="replace")

    external_config = bigquery.ExternalConfig("CSV")
    external_config.options.skip_leading_rows = 1
    external_config.options.allow_quoted_newlines = True
    external_config.options.allow_jagged_rows = True
    external_config.autodetect = False
    external_config.schema = self._load_schema("staging")
    external_config.source_uris = (
        f"gs://basedosdados/staging/{self.dataset_id}/{self.table_id}/*"
    )

    if partitioned:
        hive_partitioning = bigquery.external_config.HivePartitioningOptions()
        hive_partitioning.mode = "AUTO"
        hive_partitioning.source_uri_prefix = self.uri.format(
            dataset=self.dataset_id, table=self.table_id
        ).replace("*", "")

        external_config.hive_partitioning = hive_partitioning

    table = bigquery.Table(self.table_full_name["staging"])
    table.external_data_configuration = external_config

    if if_exists == "replace":
        self.delete(mode="staging")

    self.client["bigquery_staging"].create_table(table)
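# Usage sketch for this version of create: upload a local CSV and build the external
# staging table from it. Ids and path are illustrative assumptions.
from basedosdados import Table  # assumed public import

Table(dataset_id="br_example", table_id="municipio").create(
    path="data/municipio.csv",
    partitioned=False,
    if_exists="replace",
    force_dataset=True,
)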
def storage_delete_table(ctx, dataset_id, table_id, mode, not_found_ok, bucket_name):
    Storage(dataset_id, table_id).delete_table(
        mode=mode, not_found_ok=not_found_ok, bucket_name=bucket_name
    )
def create(
    self,
    path=None,
    job_config_params=None,
    force_dataset=True,
    if_table_exists="raise",
    if_storage_data_exists="raise",
    if_table_config_exists="raise",
    source_format="csv",
):
    """Creates a BigQuery table in the staging dataset.

    If you add a path, it automatically saves the data in storage, creates
    the dataset folder and BigQuery location, besides creating the table
    and its configuration files.

    The new table should be located at `<dataset_id>_staging.<table_id>` in BigQuery.

    It looks for data saved in Storage at
    `<bucket_name>/staging/<dataset_id>/<table_id>/*` and builds the table.

    It currently supports the types:
        - Comma Delimited CSV

    Data can also be partitioned following the hive partitioning scheme
    `<key1>=<value1>/<key2>=<value2>`, for instance, `year=2012/country=BR`

    Args:
        path (str or pathlib.PosixPath): Where to find the file that you want
            to upload to create a table with
        job_config_params (dict): Optional.
            Job configuration params from bigquery
        if_table_exists (str): Optional.
            What to do if table exists

            * 'raise' : Raises Conflict exception
            * 'replace' : Replace table
            * 'pass' : Do nothing
        force_dataset (bool): Creates `<dataset_id>` folder and BigQuery
            Dataset if it doesn't exist.
        if_table_config_exists (str): Optional.
            What to do if config files already exist

            * 'raise' : Raises FileExistsError
            * 'replace' : Replace with blank template
            * 'pass' : Do nothing
        if_storage_data_exists (str): Optional.
            What to do if data already exists on your bucket:

            * 'raise' : Raises Conflict exception
            * 'replace' : Replace table
            * 'pass' : Do nothing
        source_format (str): Optional.
            Data source format. Only 'csv' is supported. Defaults to 'csv'.
    """
    if path is None:
        # Look if table data already exists at Storage
        data = self.client["storage_staging"].list_blobs(
            self.bucket_name, prefix=f"staging/{self.dataset_id}/{self.table_id}"
        )

        # Raise: Cannot create table without external data
        if not data:
            raise BaseDosDadosException(
                "You must provide a path for uploading data"
            )

    # Add data to storage
    if isinstance(
        path,
        (
            str,
            Path,
        ),
    ):
        Storage(self.dataset_id, self.table_id, **self.main_vars).upload(
            path, mode="staging", if_exists=if_storage_data_exists
        )

    # Create Dataset if it doesn't exist
    if force_dataset:
        dataset_obj = Dataset(self.dataset_id, **self.main_vars)

        try:
            dataset_obj.init()
        except FileExistsError:
            pass

        dataset_obj.create(if_exists="pass")

    self.init(
        data_sample_path=path,
        if_folder_exists="replace",
        if_table_config_exists=if_table_config_exists,
    )

    table = bigquery.Table(self.table_full_name["staging"])
    table.external_data_configuration = Datatype(
        self, source_format, "staging", partitioned=self._is_partitioned()
    ).external_config

    # Look up if the table already exists
    table_ref = None
    try:
        table_ref = self.client["bigquery_staging"].get_table(
            self.table_full_name["staging"]
        )
    except google.api_core.exceptions.NotFound:
        pass

    if isinstance(table_ref, google.cloud.bigquery.table.Table):
        if if_table_exists == "pass":
            return None

        if if_table_exists == "raise":
            raise FileExistsError(
                "Table already exists, choose replace if you want to overwrite it"
            )

    if if_table_exists == "replace":
        self.delete(mode="staging")

    self.client["bigquery_staging"].create_table(table)
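# Usage sketch for the newer create signature, where the existence policy is split
# per resource (table, storage data, config files). Ids and path are illustrative.
from basedosdados import Table  # assumed public import

Table(dataset_id="br_example", table_id="municipio").create(
    path="data/municipio.csv",
    if_table_exists="replace",
    if_storage_data_exists="replace",
    if_table_config_exists="pass",
    source_format="csv",
)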