def execute(self, *args):
    super().execute()

    valid = EssentialParameters(
        self.__class__.__name__, [self._tblname, self._bucket, self._dest_dir]
    )
    valid()

    os.makedirs(self._dest_dir, exist_ok=True)

    if isinstance(self._credentials, str):
        self._logger.warning(
            (
                "DeprecationWarning: "
                "In the near future, "
                "the `credentials` will be changed to accept only dictionary types. "
            )
        )
        key_filepath = self._credentials
    else:
        key_filepath = self._source_path_reader(self._credentials)

    gbq_client = BigQuery.get_bigquery_client(key_filepath)
    gbq_ref = gbq_client.dataset(self._dataset).table(self._tblname)

    gcs_client = Gcs.get_gcs_client(key_filepath)
    gcs_bucket = gcs_client.bucket(self._bucket)

    ymd_hms = datetime.now().strftime("%Y%m%d%H%M%S%f")
    path = "%s-%s" % ("".join(random.choices(string.ascii_letters, k=8)), ymd_hms)
    prefix = "%s/%s/%s" % (self._dataset, self._tblname, path)

    """
    gcs dir -> gs://{bucket_name}
                  /{dataset_name}/{table_name}
                  /{XXXXXXXX}-{yyyyMMddHHmmssSSS}/*.csv.gz
    """
    if self._filename:
        dest_gcs = "gs://%s/%s/%s*.csv.gz" % (self._bucket, prefix, self._filename)
    else:
        dest_gcs = "gs://%s/%s/*.csv.gz" % (self._bucket, prefix)

    # job config settings
    job_config = bigquery.ExtractJobConfig()
    job_config.compression = bigquery.Compression.GZIP
    job_config.destination_format = bigquery.DestinationFormat.CSV

    # Extract table to GCS.
    job = gbq_client.extract_table(
        gbq_ref, dest_gcs, job_config=job_config, location=self._location
    )
    job.result()

    # Download from gcs
    for blob in gcs_client.list_blobs(gcs_bucket, prefix=prefix):
        dest = os.path.join(self._dest_dir, os.path.basename(blob.name))
        blob.download_to_filename(dest)

    # Cleanup temporary files
    for blob in gcs_client.list_blobs(gcs_bucket, prefix=prefix):
        blob.delete()
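# A minimal, self-contained sketch of the extract-and-download flow used in
# execute() above, written directly against the public google-cloud-bigquery
# and google-cloud-storage clients. The project, dataset, table, bucket, and
# local directory arguments are placeholder assumptions, not values from cliboa.
import os

from google.cloud import bigquery, storage


def extract_table_to_local_dir(project, dataset, table, bucket_name, dest_dir):
    """Extract a BigQuery table to GCS as gzip CSV, then download the shards."""
    bq_client = bigquery.Client(project=project)
    gcs_client = storage.Client(project=project)

    prefix = "%s/%s/example-extract" % (dataset, table)
    dest_uri = "gs://%s/%s/*.csv.gz" % (bucket_name, prefix)

    job_config = bigquery.ExtractJobConfig()
    job_config.compression = bigquery.Compression.GZIP
    job_config.destination_format = bigquery.DestinationFormat.CSV

    table_ref = bq_client.dataset(dataset).table(table)
    bq_client.extract_table(table_ref, dest_uri, job_config=job_config).result()

    os.makedirs(dest_dir, exist_ok=True)
    bucket = gcs_client.bucket(bucket_name)
    for blob in gcs_client.list_blobs(bucket, prefix=prefix):
        blob.download_to_filename(os.path.join(dest_dir, os.path.basename(blob.name)))
        blob.delete()  # clean up the temporary GCS object after download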
def test_get_destination_format_json(self):
    assert BigQuery.get_destination_format(".json") == "NEWLINE_DELIMITED_JSON"
def _save_as_file_via_gcs(self):
    self._logger.info("Save data as a file via GCS")
    os.makedirs(self._dest_dir, exist_ok=True)
    ymd_hms = datetime.now().strftime("%Y%m%d%H%M%S%f")
    path = "%s-%s" % (StringUtil().random_str(self._RANDOM_STR_LENGTH), ymd_hms)
    prefix = "%s/%s/%s" % (self._dataset, self._tblname, path)

    if isinstance(self._credentials, str):
        self._logger.warning(
            (
                "DeprecationWarning: "
                "In the near future, "
                "the `credentials` will be changed to accept only dictionary types. "
                "Please see more information "
                "https://github.com/BrainPad/cliboa/blob/master/docs/modules/bigquery_read.md"
            )
        )
        key_filepath = self._credentials
    else:
        key_filepath = self._source_path_reader(self._credentials)
    gbq_client = BigQuery.get_bigquery_client(key_filepath)

    if self._dataset and self._tblname:
        table_ref = gbq_client.dataset(self._dataset).table(self._tblname)
    elif self._dataset and not self._tblname:
        tmp_tbl = (
            "tmp_" + StringUtil().random_str(self._RANDOM_STR_LENGTH) + "_" + ymd_hms
        )
        table_ref = gbq_client.dataset(self._dataset).table(tmp_tbl)
    gcs_client = Gcs.get_gcs_client(key_filepath)
    gcs_bucket = gcs_client.bucket(self._bucket)

    # extract job config settings
    ext_job_config = BigQuery.get_extract_job_config()
    ext_job_config.compression = BigQuery.get_compression_type()
    ext = ".csv"
    if self._filename:
        _, ext = os.path.splitext(self._filename)
        support_ext = [".csv", ".json"]
        if ext not in support_ext:
            raise InvalidParameter("%s is not supported as filename." % ext)
    ext_job_config.destination_format = BigQuery.get_destination_format(ext)

    comp_format_and_ext = {"GZIP": ".gz"}
    comp_ext = comp_format_and_ext.get(str(BigQuery.get_compression_type()))
    if self._filename:
        dest_gcs = "gs://%s/%s/%s%s" % (
            self._bucket,
            prefix,
            self._filename,
            comp_ext,
        )
    else:
        dest_gcs = "gs://%s/%s/*%s%s" % (self._bucket, prefix, ext, comp_ext)

    # Execute query.
    if self._query:
        query_job_config = BigQuery.get_query_job_config()
        query_job_config.destination = table_ref
        query_job_config.write_disposition = BigQuery.get_write_disposition()
        query_job = gbq_client.query(
            self._query, location=self._location, job_config=query_job_config
        )
        query_job.result()

    # Extract to GCS
    extract_job = gbq_client.extract_table(
        table_ref, dest_gcs, job_config=ext_job_config, location=self._location
    )
    extract_job.result()

    # Download from gcs
    for blob in gcs_bucket.list_blobs(prefix=prefix):
        dest = os.path.join(self._dest_dir, os.path.basename(blob.name))
        blob.download_to_filename(dest)

    # Cleanup temporary table
    if self._query:
        gbq_client.delete_table(table_ref)

    # Cleanup temporary files
    for blob in gcs_bucket.list_blobs(prefix=prefix):
        blob.delete()
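# Sketch of the query branch in _save_as_file_via_gcs() above: when a SQL query
# is configured, its result is first materialized into a (temporary) destination
# table via QueryJobConfig, and the extract job then reads from that table.
# Written against the public google-cloud-bigquery API; the dataset, table, and
# SQL arguments are placeholder assumptions.
from google.cloud import bigquery


def materialize_query_result(bq_client, dataset, tmp_table, sql, location="US"):
    """Run `sql` and write its result into `dataset.tmp_table`, returning the ref."""
    table_ref = bq_client.dataset(dataset).table(tmp_table)
    job_config = bigquery.QueryJobConfig()
    job_config.destination = table_ref
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE
    bq_client.query(sql, location=location, job_config=job_config).result()
    return table_ref


# The caller extracts `table_ref` to GCS and finally drops the temporary table:
#     bq_client.delete_table(table_ref)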
def test_get_compression_type(self):
    assert BigQuery.get_compression_type() == "GZIP"
def test_get_destination_format_csv(self):
    assert BigQuery.get_destination_format(".csv") == "CSV"
def test_get_query_job_config(self):
    self.assertTrue(
        isinstance(BigQuery.get_query_job_config(), type(bigquery.QueryJobConfig()))
    )
def test_get_extract_job_config_with_no_header(self):
    self.assertTrue(
        isinstance(
            BigQuery.get_extract_job_config(print_header=False),
            type(bigquery.ExtractJobConfig(print_header=False)),
        )
    )
def test_get_extract_job_config_with_header(self):
    self.assertTrue(
        isinstance(BigQuery.get_extract_job_config(), type(bigquery.ExtractJobConfig()))
    )
def test_get_bigquery_client_no_credentials(self):
    assert BigQuery.get_bigquery_client(None) == bigquery.Client()
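# The tests above exercise a small `BigQuery` helper whose implementation is not
# shown in this section. The class below is only a plausible sketch inferred from
# the assertions (GZIP compression, CSV/NEWLINE_DELIMITED_JSON destination formats,
# ExtractJobConfig/QueryJobConfig factories), not the actual cliboa source.
from google.cloud import bigquery


class BigQuery:
    @staticmethod
    def get_bigquery_client(credentials):
        # Assumption: a string is treated as a service account key file path.
        if isinstance(credentials, str):
            return bigquery.Client.from_service_account_json(credentials)
        return bigquery.Client()

    @staticmethod
    def get_compression_type():
        return bigquery.Compression.GZIP

    @staticmethod
    def get_destination_format(ext):
        formats = {
            ".csv": bigquery.DestinationFormat.CSV,
            ".json": bigquery.DestinationFormat.NEWLINE_DELIMITED_JSON,
        }
        return formats.get(ext, bigquery.DestinationFormat.CSV)

    @staticmethod
    def get_extract_job_config(print_header=True):
        return bigquery.ExtractJobConfig(print_header=print_header)

    @staticmethod
    def get_query_job_config():
        return bigquery.QueryJobConfig()

    @staticmethod
    def get_write_disposition():
        # Assumption: overwrite the destination table when materializing a query.
        return bigquery.WriteDisposition.WRITE_TRUNCATE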
def _save_as_file_via_gcs(self):
    self._logger.info("Save data as a file via GCS")
    os.makedirs(self._dest_dir, exist_ok=True)
    ymd_hms = datetime.now().strftime("%Y%m%d%H%M%S%f")
    path = "%s-%s" % (StringUtil().random_str(self._RANDOM_STR_LENGTH), ymd_hms)
    prefix = "%s/%s/%s" % (self._dataset, self._tblname, path)

    gbq_client = BigQuery.get_bigquery_client(self._credentials)
    if self._dataset and self._tblname:
        table_ref = gbq_client.dataset(self._dataset).table(self._tblname)
    elif self._dataset and not self._tblname:
        tmp_tbl = (
            "tmp_" + StringUtil().random_str(self._RANDOM_STR_LENGTH) + "_" + ymd_hms
        )
        table_ref = gbq_client.dataset(self._dataset).table(tmp_tbl)
    gcs_client = Gcs.get_gcs_client(self._credentials)
    gcs_bucket = gcs_client.get_bucket(self._bucket)

    # extract job config settings
    ext_job_config = BigQuery.get_extract_job_config()
    ext_job_config.compression = BigQuery.get_compression_type()
    ext = ".csv"
    if self._filename:
        _, ext = os.path.splitext(self._filename)
        support_ext = [".csv", ".json"]
        if ext not in support_ext:
            raise InvalidParameter("%s is not supported as filename." % ext)
    ext_job_config.destination_format = BigQuery.get_destination_format(ext)

    comp_format_and_ext = {"GZIP": ".gz"}
    comp_ext = comp_format_and_ext.get(str(BigQuery.get_compression_type()))
    if self._filename:
        dest_gcs = "gs://%s/%s/%s%s" % (
            self._bucket,
            prefix,
            self._filename,
            comp_ext,
        )
    else:
        dest_gcs = "gs://%s/%s/*%s%s" % (self._bucket, prefix, ext, comp_ext)

    # Execute query.
    if self._query:
        query_job_config = BigQuery.get_query_job_config()
        query_job_config.destination = table_ref
        query_job_config.write_disposition = BigQuery.get_write_disposition()
        query_job = gbq_client.query(
            self._query, location=self._location, job_config=query_job_config
        )
        query_job.result()

    # Extract to GCS
    extract_job = gbq_client.extract_table(
        table_ref, dest_gcs, job_config=ext_job_config, location=self._location
    )
    extract_job.result()

    # Download from gcs
    for blob in gcs_bucket.list_blobs(prefix=prefix):
        dest = os.path.join(self._dest_dir, os.path.basename(blob.name))
        blob.download_to_filename(dest)

    # Cleanup temporary table
    if self._query:
        gbq_client.delete_table(table_ref)

    # Cleanup temporary files
    for blob in gcs_bucket.list_blobs(prefix=prefix):
        blob.delete()