def download_table_as_df(self, full_table_id, staging_location):
    """
    Download a BigQuery table as a Pandas DataFrame
    Args:
        full_table_id (str): fully qualified BigQuery table id
        staging_location: url to staging_location (currently
            supports a folder in GCS)

    Returns:
        pandas.DataFrame: dataframe of the training dataset
    """
    if not is_gs_path(staging_location):
        raise ValueError("staging_uri must be a directory in GCS")

    temp_file_name = "temp_{}".format(int(round(time.time() * 1000)))
    staging_file_path = os.path.join(staging_location, temp_file_name)

    job_config = ExtractJobConfig()
    job_config.destination_format = DestinationFormat.CSV
    job = self.bq.extract_table(Table.from_string(full_table_id),
                                staging_file_path, job_config=job_config)

    # await completion
    job.result()
    return gcs_to_df(staging_file_path)
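# A minimal standalone sketch of the same flow as the example above, using only the
# public google-cloud-bigquery API: extract the table to a temporary CSV in GCS and
# load it into pandas. The project, table, and bucket names are placeholders, and
# reading "gs://" paths with pandas assumes gcsfs is installed.
import time

import pandas as pd
from google.cloud import bigquery


def table_to_dataframe_via_gcs(full_table_id, staging_location):
    client = bigquery.Client()
    staging_uri = "{}/temp_{}.csv".format(staging_location.rstrip("/"),
                                          int(time.time() * 1000))

    job_config = bigquery.ExtractJobConfig()
    job_config.destination_format = bigquery.DestinationFormat.CSV

    extract_job = client.extract_table(
        bigquery.TableReference.from_string(full_table_id),
        staging_uri,
        job_config=job_config,
    )
    extract_job.result()  # block until the export finishes

    return pd.read_csv(staging_uri)  # requires gcsfs for gs:// URLs


# df = table_to_dataframe_via_gcs("my-project.my_dataset.my_table", "gs://my-bucket/tmp")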
def export(self, tbl_ref, gcs_base_dir, file_format="csv", compression=None,
           preview=True):
    """
    :param gcs_base_dir: gcs base bucket
    :param file_format: (Optional) csv, json or avro. default to csv if
        can't be determined from export_uri
    :param compression: default to None. could be gzip.
    """
    tables = self.get_tables(tbl_ref)
    jobs = list()

    jc = ExtractJobConfig()
    jc.compression = compression
    jc.destination_format = self.__get_bq_format(file_format)

    for tbl in tables:
        gcs_uri = "{}/{}/*.{}".format(gcs_base_dir, tbl, file_format)
        table_ref = TableReference(dataset_ref=tbl_ref.dataset_ref, table_id=tbl)
        print("-- {}{} => {} ".format("preview: " if preview else "", tbl, gcs_uri))
        if preview:
            continue
        jobs.append(
            self.connect(tbl_ref.project).extract_table(table_ref, gcs_uri,
                                                        job_config=jc))

    self.__check_jobs(jobs)
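# A hedged sketch of the same "preview, then export" idea written against the public
# client API: print the planned destinations first, then submit one extract job per
# table and wait on all of them. Project, dataset, table, and bucket names are
# placeholders.
from google.cloud import bigquery


def export_tables(project, dataset, tables, gcs_base_dir, file_format="csv",
                  compression=None, preview=True):
    client = bigquery.Client(project=project)

    job_config = bigquery.ExtractJobConfig()
    job_config.compression = compression
    job_config.destination_format = (
        bigquery.DestinationFormat.CSV if file_format == "csv"
        else bigquery.DestinationFormat.NEWLINE_DELIMITED_JSON)

    jobs = []
    for table in tables:
        gcs_uri = "{}/{}/*.{}".format(gcs_base_dir, table, file_format)
        print("-- {}{} => {}".format("preview: " if preview else "", table, gcs_uri))
        if preview:
            continue
        table_ref = bigquery.DatasetReference(project, dataset).table(table)
        jobs.append(client.extract_table(table_ref, gcs_uri, job_config=job_config))

    for job in jobs:
        job.result()  # raises if any export failed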
def send_to_gcs(self, query, project_id, output_uri, delimiter=","):
    job_results = {}
    client = self.bq_client

    queryJob = QueryJob(self.__create_job_id(project_id, "queryJob"), query, client)
    job_results["queryJob"] = self.__get_results(queryJob)

    output_type = output_uri.split(".")[-1]
    dest_format = self.__get_file_type(output_type)
    if dest_format == SourceFormat.CSV:
        config = ExtractJobConfig(destination_format=dest_format,
                                  field_delimiter=delimiter)
    else:
        config = ExtractJobConfig(destination_format=dest_format)

    extractJob = ExtractJob(self.__create_job_id(project_id, "extractJob"),
                            queryJob.destination, output_uri, client,
                            job_config=config)
    job_results["extractJob"] = self.__get_results(extractJob)
    return job_results
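# A rough equivalent of the query-then-extract pattern above, written against the public
# client API instead of constructing QueryJob/ExtractJob objects directly: run the query,
# then extract its destination table (the anonymous results table) to GCS. The output URI
# and delimiter are placeholders; field_delimiter only applies to CSV output.
from google.cloud import bigquery


def query_to_gcs(sql, output_uri, delimiter=","):
    client = bigquery.Client()

    query_job = client.query(sql)
    query_job.result()  # wait so that query_job.destination is populated

    config = bigquery.ExtractJobConfig(
        destination_format=bigquery.DestinationFormat.CSV,
        field_delimiter=delimiter,
    )
    extract_job = client.extract_table(query_job.destination, output_uri,
                                       job_config=config)
    return extract_job.result()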
def download_table_as_file(self, full_table_id, dest, staging_location, file_type):
    """
    Download a BigQuery table as a file
    Args:
        full_table_id (str): fully qualified BigQuery table id
        dest (str): destination filename
        staging_location (str): url to staging_location (currently
            supports a folder in GCS)
        file_type (feast.sdk.resources.feature_set.FileType): (default: FileType.CSV)
            exported file format
    Returns:
        (str) path to the downloaded file
    """
    if not is_gs_path(staging_location):
        raise ValueError("staging_uri must be a directory in GCS")

    temp_file_name = "temp_{}".format(int(round(time.time() * 1000)))
    staging_file_path = os.path.join(staging_location, temp_file_name)

    job_config = ExtractJobConfig()
    job_config.destination_format = file_type
    src_table = Table.from_string(full_table_id)
    job = self.bq.extract_table(src_table, staging_file_path, job_config=job_config)

    # await completion
    job.result()

    bucket_name, blob_name = split_gs_path(staging_file_path)
    bucket = self.gcs.get_bucket(bucket_name)
    blob = bucket.blob(blob_name)
    blob.download_to_filename(dest)
    return dest
def export_csv(self, bucket_name: str, bucket_path: str, dataset: str,
               table: str, sep: str = "\t") -> str:
    bucket_url = f"gs://{bucket_name}/{self.config.lake_path}/{bucket_path}"
    logging.info(
        f"DataWarehouse.export_csv {dataset}.{table} to {bucket_url} ...")

    client = self._get_client()
    dataset_ref = DatasetReference(self.config.gcp_project, dataset)
    to_export = TableReference(dataset_ref, table)

    config = ExtractJobConfig()
    config.field_delimiter = sep
    config.destination_format = bigquery.DestinationFormat.CSV

    extract_job = client.extract_table(to_export, bucket_url, job_config=config)
    extract_job.result()

    logging.info(
        f"DataWarehouse.export_csv {dataset}.{table} to {bucket_url} complete!")
    return bucket_url
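# A small sketch of the tab-separated variant above using only the public API:
# field_delimiter controls the CSV separator. The project, dataset, table, and bucket
# URL are placeholders.
from google.cloud import bigquery


def export_tsv(project, dataset, table, bucket_url):
    client = bigquery.Client(project=project)
    table_ref = bigquery.DatasetReference(project, dataset).table(table)

    config = bigquery.ExtractJobConfig()
    config.field_delimiter = "\t"
    config.destination_format = bigquery.DestinationFormat.CSV

    client.extract_table(table_ref, bucket_url, job_config=config).result()
    return bucket_url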
def export_table_to_bucket(self,
                           table_path,  # type: str
                           bucket_name,  # type: str
                           dir_in_bucket='',  # type: Optional[str]
                           output_format='csv',  # type: Optional[str]
                           compression=False,  # type: Optional[bool]
                           output_ext='',  # type: Optional[str]
                           max_wait_secs=None  # type: Optional[int]
                           ):
    # type: (...) -> None
    """
    Export a BigQuery table to a file in the given bucket. The output file has the same
    name as the table.

    Args:
        table_path: Path of the table
        bucket_name: Name of the bucket to store the spreadsheet
        dir_in_bucket: The directory in the bucket to store the output files
        output_format: Format of output. It must be among 'csv', 'json' and 'avro'
        compression: Whether to use GZIP compression. Avro cannot be used with GZIP
            compression
        output_ext: An optional extension to the output file, so that we can tell output
            files from different exports apart
        max_wait_secs: Maximum time to wait. Exporting a table to storage takes
            significantly longer than querying a table. If not set, it will use the class
            default.
    """
    # A mapping from supported formats to the format names BigQuery requires.
    bigquery_required_formats = {'csv': 'CSV', 'json': 'NEWLINE_DELIMITED_JSON', 'avro': 'AVRO'}

    if output_format not in bigquery_required_formats:
        raise ValueError('Invalid output format: {}. Must be among {}'.format(
            output_format, bigquery_required_formats.keys()))

    if compression and output_format == 'avro':
        raise ValueError('{} cannot be combined with GZIP compression'.format(output_format))

    src_table_ref = self.get_table_reference_from_path(table_path)

    # Generate the destination of the table content
    output_filename = src_table_ref.table_id
    if output_ext:
        output_filename += '_' + output_ext
    output_filename += '.' + output_format
    if compression:
        output_filename += '.gz'

    path = os.path.join(dir_in_bucket, output_filename)
    destination = 'gs://{}/{}'.format(bucket_name, path.lstrip().lstrip('/'))

    config = ExtractJobConfig()
    config.destination_format = bigquery_required_formats[output_format]
    config.compression = 'GZIP' if compression else 'NONE'

    extract_job = self.gclient.extract_table(src_table_ref, destination, job_config=config,
                                             retry=self.default_retry)

    # Wait for completion
    extract_job.result(timeout=max_wait_secs or self.max_wait_secs)
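# A hedged sketch of the naming scheme used above: the object name gains a ".gz" suffix
# and the job gains GZIP compression when compression is requested, and Avro (which cannot
# be GZIP-compressed) is rejected in that combination. Names below are placeholders.
from google.cloud import bigquery

_FORMATS = {"csv": "CSV", "json": "NEWLINE_DELIMITED_JSON", "avro": "AVRO"}


def export_to_bucket(client, table_ref, bucket_name, output_format="csv",
                     compression=False):
    if output_format == "avro" and compression:
        raise ValueError("avro cannot be combined with GZIP compression")

    filename = "{}.{}".format(table_ref.table_id, output_format)
    if compression:
        filename += ".gz"
    destination = "gs://{}/{}".format(bucket_name, filename)

    config = bigquery.ExtractJobConfig()
    config.destination_format = _FORMATS[output_format]
    config.compression = "GZIP" if compression else "NONE"

    client.extract_table(table_ref, destination, job_config=config).result()
    return destination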
def _extract_job_config(self, unload_plan: RecordsUnloadPlan) -> ExtractJobConfig:
    config = ExtractJobConfig()
    if isinstance(unload_plan.records_format, AvroRecordsFormat):
        config.destination_format = 'AVRO'
        # https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-avro#logical_types
        config.use_avro_logical_types = True
    else:
        raise NotImplementedError(
            f'Please add support for {unload_plan.records_format}')
    return config
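# A minimal usage sketch for the Avro branch above: use_avro_logical_types keeps
# TIMESTAMP/DATE columns as Avro logical types instead of strings. The table id and
# destination URI are placeholders.
from google.cloud import bigquery

client = bigquery.Client()

avro_config = bigquery.ExtractJobConfig()
avro_config.destination_format = bigquery.DestinationFormat.AVRO
avro_config.use_avro_logical_types = True

client.extract_table(
    "my-project.my_dataset.my_table",            # placeholder table id
    "gs://my-bucket/exports/my_table-*.avro",    # placeholder destination
    job_config=avro_config,
).result()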
def export_table_to_gcs(dataset_ref, source_table, destination_uri):
    table_ref = dataset_ref.table(source_table)

    config = ExtractJobConfig()
    config.print_header = False

    extract_job = bq.extract_table(
        table_ref,
        destination_uri,
        location="US",
        job_config=config,
    )
    extract_job.result()
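# A hedged sketch of a headerless CSV export like the helper above, but constructing its
# own client rather than relying on a module-level bq object; print_header=False drops
# the column-name row. Names are placeholders.
from google.cloud import bigquery


def export_without_header(project, dataset_id, table_id, destination_uri):
    client = bigquery.Client(project=project)
    table_ref = bigquery.DatasetReference(project, dataset_id).table(table_id)

    config = bigquery.ExtractJobConfig()
    config.print_header = False

    client.extract_table(table_ref, destination_uri,
                         location="US", job_config=config).result()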
def download_table_as_file(self, full_table_id, dest, file_type, staging_location=None):
    """
    Download a BigQuery table as a file
    Args:
        full_table_id (str): fully qualified BigQuery table id
        dest (str): destination filename
        file_type (feast.sdk.resources.feature_set.FileType): (default: FileType.CSV)
            exported file format
        staging_location (str, optional): url to staging_location (currently
            supports a folder in GCS)
    Returns:
        (str) path to the downloaded file
    """
    if not staging_location:
        df = self.download_table_as_df(full_table_id)
        if file_type == FileType.CSV:
            df.to_csv(dest, index=False)
        elif file_type == FileType.JSON:
            df.to_json(dest, index=False)
        else:
            raise ValueError(
                "Only FileType: CSV and JSON are supported for "
                "download_table_as_file without staging location"
            )
        return dest

    if not is_gs_path(staging_location):
        raise ValueError("staging_uri must be a directory in GCS")

    temp_file_name = "temp_{}".format(int(round(time.time() * 1000)))
    staging_file_path = os.path.join(staging_location, temp_file_name)

    job_config = ExtractJobConfig()
    job_config.destination_format = file_type
    src_table = Table.from_string(full_table_id)
    job = self.bqclient.extract_table(src_table, staging_file_path,
                                      job_config=job_config)

    # await completion
    job.result()

    bucket_name, blob_name = split_gs_path(staging_file_path)
    bucket = self.storageclient.get_bucket(bucket_name)
    blob = bucket.blob(blob_name)
    blob.download_to_filename(dest)
    return dest
def __extract_table_to_shard_folder(self, full_table_id, staging_location, file_type):
    shard_folder = os.path.join(staging_location,
                                'temp_%d' % int(round(time.time() * 1000)))
    staging_file_path = os.path.join(shard_folder, "shard_*")

    job_config = ExtractJobConfig()
    job_config.destination_format = file_type
    job = self.bqclient.extract_table(
        Table.from_string(full_table_id),
        staging_file_path,
        job_config=job_config
    )
    # await completion
    job.result()
    return shard_folder
def export_table_to_gcs(dataset_id, table_id, destination, file_format='.json',
                        compression=True):
    """
    Exports data from BigQuery to an object in Google Cloud Storage.
    For more information, see the README.rst.

    Example invocation:
        $ python export_data_to_gcs.py example_dataset example_table \\
            gs://example-bucket/example-data.csv

    The dataset and table should already exist.
    """
    dataset_ref = bigquery_client.dataset(dataset_id)
    table_ref = dataset_ref.table(table_id)
    job_config = ExtractJobConfig()

    if file_format == '.json':
        job_config.destination_format = DestinationFormat.NEWLINE_DELIMITED_JSON
        if compression:
            job_config.compression = Compression.GZIP
    elif file_format == '.avro':
        job_config.destination_format = DestinationFormat.AVRO
    else:
        job_config.destination_format = DestinationFormat.CSV
        if compression:
            job_config.compression = Compression.GZIP

    job = bigquery_client.extract_table(table_ref, destination, job_config=job_config)
    job.result(timeout=500)  # Waits for job to complete

    log.info('Exported {}:{} to {}'.format(dataset_id, table_id, destination))
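# A compact sketch of the extension-driven format choice above, using the same enums the
# sample relies on (DestinationFormat, Compression). The commented-out call shows how the
# returned config might be used; client, table_ref, and the bucket path are placeholders.
from google.cloud import bigquery


def config_for_extension(file_format, compression=True):
    job_config = bigquery.ExtractJobConfig()
    if file_format == '.json':
        job_config.destination_format = bigquery.DestinationFormat.NEWLINE_DELIMITED_JSON
    elif file_format == '.avro':
        job_config.destination_format = bigquery.DestinationFormat.AVRO
    else:
        job_config.destination_format = bigquery.DestinationFormat.CSV
    # Avro exports cannot be GZIP-compressed.
    if compression and job_config.destination_format != bigquery.DestinationFormat.AVRO:
        job_config.compression = bigquery.Compression.GZIP
    return job_config


# client.extract_table(table_ref, "gs://my-bucket/data.json.gz",
#                      job_config=config_for_extension(".json")).result()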
def test_begin_w_alternate_client(self):
    from google.cloud.bigquery.dataset import DatasetReference
    from google.cloud.bigquery.job import Compression
    from google.cloud.bigquery.job import DestinationFormat
    from google.cloud.bigquery.job import ExtractJobConfig

    PATH = "/projects/%s/jobs" % (self.PROJECT,)
    RESOURCE = self._make_resource(ended=True)
    EXTRACT_CONFIGURATION = {
        "sourceTable": {
            "projectId": self.PROJECT,
            "datasetId": self.DS_ID,
            "tableId": self.SOURCE_TABLE,
        },
        "destinationUris": [self.DESTINATION_URI],
        "compression": Compression.GZIP,
        "destinationFormat": DestinationFormat.NEWLINE_DELIMITED_JSON,
        "fieldDelimiter": "|",
        "printHeader": False,
    }
    RESOURCE["configuration"]["extract"] = EXTRACT_CONFIGURATION
    conn1 = _make_connection()
    client1 = _make_client(project=self.PROJECT, connection=conn1)
    conn2 = _make_connection(RESOURCE)
    client2 = _make_client(project=self.PROJECT, connection=conn2)
    source_dataset = DatasetReference(self.PROJECT, self.DS_ID)
    source = source_dataset.table(self.SOURCE_TABLE)

    config = ExtractJobConfig()
    config.compression = Compression.GZIP
    config.destination_format = DestinationFormat.NEWLINE_DELIMITED_JSON
    config.field_delimiter = "|"
    config.print_header = False
    job = self._make_one(self.JOB_ID, source, [self.DESTINATION_URI], client1, config)

    with mock.patch(
        "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes"
    ) as final_attributes:
        job._begin(client=client2)

    final_attributes.assert_called_with({"path": PATH}, client2, job)

    conn1.api_request.assert_not_called()
    conn2.api_request.assert_called_once_with(
        method="POST",
        path=PATH,
        data={
            "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID},
            "configuration": {"extract": EXTRACT_CONFIGURATION},
        },
        timeout=None,
    )
    self._verifyResourceProperties(job, RESOURCE)
def _execute_extract_table(self, destination_uri, bq_dataset_location,
                           export_gzip, export_json, print_header):
    """Starts a data export job and waits for its completion."""
    client = self._get_client()
    job_id = self._get_job_id(with_unique_suffix=True)

    if export_json:
        destination_format = 'NEWLINE_DELIMITED_JSON'
    else:
        destination_format = 'CSV'

    job_config = ExtractJobConfig(
        print_header=print_header,
        destination_format=destination_format,
        compression='GZIP' if export_gzip else 'NONE')

    try:
        job = client.get_job(job_id)
    except exceptions.NotFound:
        job = client.extract_table(self._get_full_table_name(),
                                   destination_uri,
                                   job_id=job_id,
                                   job_config=job_config,
                                   location=bq_dataset_location)

    self._wait(job)
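# A hedged sketch of the "reuse an existing job id" idempotency trick above: look the job
# up first and only submit a new extract job if it does not exist yet, so a retried caller
# does not export twice. The job id, table id, and destination URI are placeholders.
from google.api_core import exceptions
from google.cloud import bigquery


def extract_idempotently(client, job_id, table_id, destination_uri, gzip=False):
    job_config = bigquery.ExtractJobConfig(
        destination_format='CSV',
        compression='GZIP' if gzip else 'NONE',
    )
    try:
        job = client.get_job(job_id)          # already submitted earlier?
    except exceptions.NotFound:
        job = client.extract_table(table_id, destination_uri,
                                   job_id=job_id, job_config=job_config)
    job.result()
    return job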
def query(
    query,
    project_id,
    dataset_id=None,
    table_id=None,
    output_gcs_path=None,
    dataset_location='US',
    job_config=None,
    output_path=None,
    output_filename=None,
    output_destination_format="CSV",
    job_object_output_path='/tmp/kfp/output/bigquery/query-job.json',
    output_gcs_path_output_path='/tmp/kfp/output/bigquery/query-output-path.txt',
    output_dataset_id_output_path='/tmp/kfp/output/bigquery/query-dataset-id.txt',
    output_table_id_output_path='/tmp/kfp/output/bigquery/query-table-id.txt',
):
    """Submit a query to the BigQuery service and dump the outputs to a BigQuery table
    or a GCS blob.

    Args:
        query (str): The query used by the BigQuery service to fetch the results.
        project_id (str): The project to execute the query job.
        dataset_id (str): The ID of the persistent dataset to keep the results of the
            query. If the dataset does not exist, the operation will create a new one.
        table_id (str): The ID of the table to keep the results of the query. If absent,
            the operation will generate a random id for the table.
        output_gcs_path (str): The GCS blob path to dump the query results to.
        dataset_location (str): The location to create the dataset. Defaults to `US`.
        job_config (dict): The full config spec for the query job.
        output_path (str): The path where the query result will be stored.
        output_filename (str): The name of the file where the results will be stored.
        output_destination_format (str): The name of the output destination format.
            Defaults to CSV; NEWLINE_DELIMITED_JSON and AVRO are also available.
    Returns:
        The API representation of the completed query job.
    """
    client = bigquery.Client(project=project_id, location=dataset_location)
    if not job_config:
        job_config = bigquery.QueryJobConfig()
        job_config.create_disposition = bigquery.job.CreateDisposition.CREATE_IF_NEEDED
        job_config.write_disposition = bigquery.job.WriteDisposition.WRITE_TRUNCATE
    else:
        job_config = bigquery.QueryJobConfig.from_api_repr(job_config)
    job_id = None

    def cancel():
        if job_id:
            client.cancel_job(job_id)

    with KfpExecutionContext(on_cancel=cancel) as ctx:
        job_id = 'query_' + ctx.context_id()
        query_job = _get_job(client, job_id)
        table_ref = None
        if not query_job:
            dataset_ref = _prepare_dataset_ref(client, dataset_id, output_gcs_path,
                                               dataset_location)
            if dataset_ref:
                if not table_id:
                    table_id = job_id
                table_ref = dataset_ref.table(table_id)
                job_config.destination = table_ref
                gcp_common.dump_file(output_dataset_id_output_path, table_ref.dataset_id)
                gcp_common.dump_file(output_table_id_output_path, table_ref.table_id)
            query_job = client.query(query, job_config, job_id=job_id)
        _display_job_link(project_id, job_id)

        if output_path is not None:
            # Write to local file
            result = query_job.result()
            if not os.path.exists(output_path):
                os.makedirs(output_path)
            df = result.to_dataframe()
            df.to_csv(os.path.join(output_path, output_filename))
        else:
            query_job.result()
            if output_gcs_path:
                job_id = 'extract_' + ctx.context_id()
                extract_job = _get_job(client, job_id)
                logging.info('Extracting data from table {} to {}.'.format(
                    str(table_ref), output_gcs_path))
                if not extract_job:
                    job_config = ExtractJobConfig(
                        destination_format=output_destination_format)
                    extract_job = client.extract_table(table_ref, output_gcs_path,
                                                       job_config=job_config)
                extract_job.result()  # Wait for export to finish

        # TODO: Replace '-' with empty string when most users upgrade to the Argo version
        # which has the fix: https://github.com/argoproj/argo/pull/1653
        gcp_common.dump_file(output_gcs_path_output_path, output_gcs_path or '-')
        gcp_common.dump_file(job_object_output_path,
                             json.dumps(query_job.to_api_repr()))
        return query_job.to_api_repr()
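# A condensed, hedged sketch of the query-then-extract flow in the component above, minus
# the KFP bookkeeping: write the query results to an explicit destination table, then
# extract that table to GCS in the requested format. All ids and paths are placeholders.
import json

from google.cloud import bigquery


def query_then_extract(sql, project_id, dataset_id, table_id, output_gcs_path,
                       destination_format="CSV"):
    client = bigquery.Client(project=project_id)

    table_ref = bigquery.DatasetReference(project_id, dataset_id).table(table_id)
    query_config = bigquery.QueryJobConfig(
        destination=table_ref,
        write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
    )
    query_job = client.query(sql, job_config=query_config)
    query_job.result()  # wait for the query before extracting

    extract_config = bigquery.ExtractJobConfig(destination_format=destination_format)
    client.extract_table(table_ref, output_gcs_path,
                         job_config=extract_config).result()

    return json.dumps(query_job.to_api_repr())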
def export_table_to_bucket(self,
                           table_path,  # type: str
                           bucket_name,  # type: str
                           dir_in_bucket='',  # type: Optional[str]
                           output_format='csv',  # type: Optional[str]
                           compression=False,  # type: Optional[bool]
                           output_ext='',  # type: Optional[str]
                           max_wait_secs=None,  # type: Optional[int]
                           support_multifile_export=True,  # type: bool
                           explicit_filename=None  # type: Optional[str]
                           ):
    # type: (...) -> None
    """
    Export a BigQuery table to a file (or a set of files) in the given bucket. The output
    files will be in a directory with the same name as the table, within the bucket_name
    and dir_in_bucket provided.

    Args:
        table_path: Path of the table
        bucket_name: Name of the bucket to store the spreadsheet
        dir_in_bucket: The directory in the bucket to store the output files
        output_format: Format of output. It must be among 'csv', 'json' and 'avro'
        compression: Whether to use GZIP compression. Avro cannot be used with GZIP
            compression
        output_ext: An optional extension to the output file, so that we can tell output
            files from different exports apart
        max_wait_secs: Maximum time to wait. Exporting a table to storage takes
            significantly longer than querying a table. If not set, it will use the class
            default.
        support_multifile_export: If True, and the table is large enough, then the table
            will be exported as several files suffixed with a shard number. If False, it
            will be exported as a single file.
        explicit_filename: File name. Used as the file name if specified; otherwise the
            table ID, possibly with output_ext, is used as the file name.

    Raises:
        RuntimeError if there is a problem with the export job.
    """
    bq_output_format = self._convert_to_bq_format(output_format)

    if compression and output_format == 'avro':
        raise ValueError('{} cannot be combined with GZIP compression'.format(output_format))

    src_table_ref = self.get_table_reference_from_path(table_path)

    # Generate the destination of the table content.
    if explicit_filename:
        output_filename = explicit_filename
    else:
        output_filename = src_table_ref.table_id
        if output_ext:
            output_filename += '_' + output_ext

    if support_multifile_export:
        # End in a * so that multiple shards can be written out if needed.
        output_filename += '-*'

    output_filename += '.' + output_format
    if compression:
        output_filename += '.gz'

    path = os.path.join(dir_in_bucket, output_filename)
    destination = 'gs://{}/{}'.format(bucket_name, path.lstrip().lstrip('/'))

    config = ExtractJobConfig()
    config.destination_format = bq_output_format
    config.compression = 'GZIP' if compression else 'NONE'

    extract_job = self.gclient.extract_table(src_table_ref, destination, job_config=config,
                                             retry=self.default_retry_for_api_calls)

    # Wait for completion
    extract_job.result(timeout=max_wait_secs or self.max_wait_secs)
def export_table_to_bucket(
        self,
        table_path,  # type: str
        bucket_name,  # type: str
        dir_in_bucket='',  # type: Optional[str]
        output_format='csv',  # type: Optional[str]
        compression=False,  # type: Optional[bool]
        output_ext='',  # type: Optional[str]
        max_wait_secs=None,  # type: Optional[int]
        support_multifile_export=True,  # type: bool
        explicit_filename=None  # type: Optional[str]
):
    # type: (...) -> List[str]
    """
    Export a BigQuery table to a file (or a set of files) in the given bucket. The output
    files will be in a directory with the same name as the table, within the bucket_name
    and dir_in_bucket provided.

    Args:
        table_path: Path of the table
        bucket_name: Name of the bucket to store the spreadsheet
        dir_in_bucket: The directory in the bucket to store the output files
        output_format: Format of output. It must be among 'csv', 'json' and 'avro'
        compression: Whether to use GZIP compression. Avro cannot be used with GZIP
            compression
        output_ext: An optional extension to the output file, so that we can tell output
            files from different exports apart
        max_wait_secs: Maximum time to wait. Exporting a table to storage takes
            significantly longer than querying a table. If not set, it will use the class
            default.
        support_multifile_export: If True, and the table is large enough, then the table
            will be exported as several files suffixed with a shard number. If False, it
            will be exported as a single file.
        explicit_filename: File name. Used as the file name if specified; otherwise the
            table ID, possibly with output_ext, is used as the file name.

    Returns:
        A list of the names of the files exported.

    Raises:
        RuntimeError if there is a problem with the export job.
    """
    bq_output_format = self._convert_to_bq_format(output_format)

    if compression and output_format == 'avro':
        raise ValueError(
            '{} cannot be combined with GZIP compression'.format(output_format))

    src_table_ref = self.get_table_reference_from_path(table_path)

    # Generate the destination of the table content.
    if explicit_filename:
        output_filename = explicit_filename
    else:
        output_filename = src_table_ref.table_id
        if output_ext:
            output_filename += '_' + output_ext

    if support_multifile_export:
        # End in a * so that multiple shards can be written out if needed.
        output_filename += '-*'

    output_filename += '.' + output_format
    if compression:
        output_filename += '.gz'

    path = os.path.join(dir_in_bucket, output_filename)
    destination = 'gs://{}/{}'.format(bucket_name, path.lstrip().lstrip('/'))

    config = ExtractJobConfig()
    config.destination_format = bq_output_format
    config.compression = 'GZIP' if compression else 'NONE'

    extract_job = self.gclient.extract_table(
        src_table_ref, destination, job_config=config,
        retry=self.default_retry_for_api_calls)

    # Wait for completion
    extract_job.result(timeout=max_wait_secs or self.max_wait_secs)

    # Get the names of the files exported.
    if support_multifile_export:
        # destination_uri_file_counts is a list of ints representing the number of files
        # exported to each destination URI. Since we are only exporting to one destination,
        # we just need the first element.
        num_files_exported = extract_job.destination_uri_file_counts[0]
        # Multi-file export replaces the "*" with the file number, padded to 12 digits. See:
        # https://cloud.google.com/bigquery/docs/exporting-data#exporting_data_into_one_or_more_files  # noqa
        return [
            output_filename.replace('*', str(i).zfill(MULTIFILE_EXPORT_PAD_LENGTH))
            for i in range(num_files_exported)
        ]
    return [output_filename]
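# A hedged sketch of the multi-file naming convention relied on above: a "*" in the
# destination URI lets BigQuery shard large exports, and destination_uri_file_counts
# reports how many files were written, so the shard names (zero-padded to 12 digits) can
# be reconstructed. The bucket name and prefix are placeholders.
from google.cloud import bigquery


def export_sharded(client, table_ref, bucket_name, prefix):
    destination = "gs://{}/{}-*.csv".format(bucket_name, prefix)

    extract_job = client.extract_table(table_ref, destination,
                                       job_config=bigquery.ExtractJobConfig())
    extract_job.result()

    num_files = extract_job.destination_uri_file_counts[0]
    return ["{}-{}.csv".format(prefix, str(i).zfill(12)) for i in range(num_files)]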