Example #1
    def download_table_as_df(self, full_table_id, staging_location):
        """
        Download a BigQuery table as a pandas DataFrame
        Args:
            full_table_id (str): fully qualified BigQuery table id
            staging_location (str): url to staging_location (currently
                supports a folder in GCS)

        Returns: pandas.DataFrame: dataframe of the training dataset

        """
        if not is_gs_path(staging_location):
            raise ValueError("staging_uri must be a directory in GCS")

        temp_file_name = "temp_{}".format(int(round(time.time() * 1000)))
        staging_file_path = os.path.join(staging_location, temp_file_name)

        job_config = ExtractJobConfig()
        job_config.destination_format = DestinationFormat.CSV
        job = self.bq.extract_table(Table.from_string(full_table_id),
                                    staging_file_path,
                                    job_config=job_config)

        # await completion
        job.result()
        return gcs_to_df(staging_file_path)
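Note: gcs_to_df and is_gs_path are helpers from the surrounding SDK and are not shown above. As a minimal sketch only, assuming pandas with gcsfs installed and a single CSV object at the staging path, a reader-side helper could look like this (the name gcs_csv_to_df is hypothetical):

import pandas as pd

def gcs_csv_to_df(gcs_uri):
    # pandas reads gs:// URIs directly when the gcsfs package is installed.
    return pd.read_csv(gcs_uri)

# df = gcs_csv_to_df("gs://my-bucket/staging/temp_1700000000000")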
Example #2
    def export(self,
               tbl_ref,
               gcs_base_dir,
               file_format="csv",
               compression=None,
               preview=True):
        """
        :param gcs_base_dir: gcs base bucket
        :param file_format: (Optional) csv, json or avro. default to csv if can't be determined from export_uri
        :param compression: default to None. could be gzip.
        """
        tables = self.get_tables(tbl_ref)
        jobs = list()
        jc = ExtractJobConfig()
        jc.compression = compression
        jc.destination_format = self.__get_bq_format(file_format)
        for tbl in tables:
            gcs_uri = "{}/{}/*.{}".format(gcs_base_dir, tbl, file_format)
            table_ref = TableReference(dataset_ref=tbl_ref.dataset_ref,
                                       table_id=tbl)
            print("--  {}{} => {} ".format("preview: " if preview else "", tbl,
                                           gcs_uri))
            if preview:
                continue

            jobs.append(
                self.connect(tbl_ref.project).extract_table(table_ref,
                                                            gcs_uri,
                                                            job_config=jc))
        self.__check_jobs(jobs)
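Note: __get_bq_format is a private helper of the surrounding class and is not shown. A minimal sketch of such a mapping, assuming the standard DestinationFormat constants from google-cloud-bigquery (the helper name and the CSV fallback are assumptions):

from google.cloud.bigquery.job import DestinationFormat

# Hypothetical stand-in for the __get_bq_format helper used above.
_BQ_FORMATS = {
    "csv": DestinationFormat.CSV,
    "json": DestinationFormat.NEWLINE_DELIMITED_JSON,
    "avro": DestinationFormat.AVRO,
}

def get_bq_format(file_format):
    # Fall back to CSV when the format string is not recognized.
    return _BQ_FORMATS.get(file_format.lower(), DestinationFormat.CSV)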
Example #3
    def send_to_gcs(self, query, project_id, output_uri, delimiter=","):
        job_results = {}

        client = self.bq_client

        queryJob = QueryJob(self.__create_job_id(project_id, "queryJob"),
                            query, client)
        job_results["queryJob"] = self.__get_results(queryJob)

        output_type = output_uri.split(".")[-1]
        dest_format = self.__get_file_type(output_type)

        if dest_format == SourceFormat.CSV:
            config = ExtractJobConfig(destination_format=dest_format,
                                      field_delimiter=delimiter)
        else:
            config = ExtractJobConfig(destination_format=dest_format)

        extractJob = ExtractJob(self.__create_job_id(project_id, "extractJob"),
                                queryJob.destination,
                                output_uri,
                                client,
                                job_config=config)

        job_results["extractJob"] = self.__get_results(extractJob)

        return job_results
Example #4
    def download_table_as_file(self, full_table_id, dest, staging_location,
                               file_type):
        """
        Download a BigQuery table as a file
        Args:
            full_table_id (str): fully qualified BigQuery table id
            dest (str): destination filename
            staging_location (str): url to staging_location (currently
                supports a folder in GCS)
            file_type (feast.sdk.resources.feature_set.FileType): (default:
                FileType.CSV) exported file format
        Returns: (str) path to the downloaded file

        """
        if not is_gs_path(staging_location):
            raise ValueError("staging_uri must be a directory in GCS")

        temp_file_name = "temp_{}".format(int(round(time.time() * 1000)))
        staging_file_path = os.path.join(staging_location, temp_file_name)

        job_config = ExtractJobConfig()
        job_config.destination_format = file_type
        src_table = Table.from_string(full_table_id)
        job = self.bq.extract_table(src_table,
                                    staging_file_path,
                                    job_config=job_config)

        # await completion
        job.result()

        bucket_name, blob_name = split_gs_path(staging_file_path)
        bucket = self.gcs.get_bucket(bucket_name)
        blob = bucket.blob(blob_name)
        blob.download_to_filename(dest)
        return dest
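Note: is_gs_path and split_gs_path come from the surrounding SDK and are not reproduced here. A minimal sketch of what they might do, stated purely as an assumption about their behavior:

def is_gs_path(path):
    # True for URIs such as "gs://bucket/folder".
    return path is not None and path.startswith("gs://")

def split_gs_path(path):
    # "gs://bucket/folder/file" -> ("bucket", "folder/file")
    bucket, _, blob = path[len("gs://"):].partition("/")
    return bucket, blob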
Example #5
    def export_csv(self,
                   bucket_name: str,
                   bucket_path: str,
                   dataset: str,
                   table: str,
                   sep: str = "\t") -> str:

        bucket_url = f"gs://{bucket_name}/{self.config.lake_path}/{bucket_path}"

        logging.info(
            f"DataWarehouse.export_csv {dataset}.{table} to {bucket_url} ...")
        client = self._get_client()

        dataset_ref = DatasetReference(self.config.gcp_project, dataset)

        to_export = TableReference(dataset_ref, table)
        config = ExtractJobConfig()
        config.field_delimiter = sep
        config.destination_format = bigquery.DestinationFormat.CSV

        extract_job = client.extract_table(to_export,
                                           bucket_url,
                                           job_config=config)
        result = extract_job.result()

        logging.info(
            f"DataWarehouse.export_csv {dataset}.{table} to {bucket_url} Complete!"
        )

        return bucket_url
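Because the default separator is a tab while the destination format stays DestinationFormat.CSV, the export is effectively tab-separated. A hypothetical call, assuming dw is an instance of this DataWarehouse class with lake_path configured:

# Exports my_dataset.my_table as tab-separated values to
# gs://my-bucket/<lake_path>/daily/my_table.tsv
url = dw.export_csv(
    bucket_name="my-bucket",
    bucket_path="daily/my_table.tsv",
    dataset="my_dataset",
    table="my_table")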
Example #6
    def export_table_to_bucket(self,
                               table_path,  # type: str
                               bucket_name,  # type: str
                               dir_in_bucket='',  # type: Optional[str]
                               output_format='csv',  # type: Optional[str]
                               compression=False,  # type: Optional[bool]
                               output_ext='',  # type: Optional[str]
                               max_wait_secs=None  # type: Optional[int]
                               ):
        # type: (...) -> None
        """
        Export a BigQuery table to a file in the given bucket. The output file has the same name
        as the table.

        Args:
            table_path: Path of the table
            bucket_name: Name of the bucket to store the spreadsheet
            dir_in_bucket: The directory in the bucket to store the output files
            output_format: Format of output. It must be among 'csv', 'json' and 'avro'
            compression: Whether to use GZIP compression. Avro cannot be used with GZIP compression
            output_ext: An optional extension to output file. So that we can tell output files from
                different exports
            max_wait_secs: Maximum time to wait. Exporting a table to storage takes significantly
                longer than querying a table. If not set, it will use the class default.
        """
        # A mapping table from supported formats to bigquery required formats.
        bigquery_required_formats = {'csv': 'CSV', 'json': 'NEWLINE_DELIMITED_JSON', 'avro': 'AVRO'}

        if output_format not in bigquery_required_formats:
            raise ValueError('Invalid output format: {}. Must be among {}'.format(
                output_format, bigquery_required_formats.keys()))

        if compression and output_format == 'avro':
            raise ValueError('{} cannot be combined with GZIP compression'.format(output_format))

        src_table_ref = self.get_table_reference_from_path(table_path)

        # Generate the destination of the table content
        output_filename = src_table_ref.table_id
        if output_ext:
            output_filename += '_' + output_ext
        output_filename += '.' + output_format
        if compression:
            output_filename += '.gz'
        path = os.path.join(dir_in_bucket, output_filename)

        destination = 'gs://{}/{}'.format(bucket_name, path.lstrip().lstrip('/'))

        config = ExtractJobConfig()
        config.destination_format = bigquery_required_formats[output_format]
        config.compression = 'GZIP' if compression else 'NONE'

        extract_job = self.gclient.extract_table(src_table_ref, destination, job_config=config,
                                                 retry=self.default_retry)

        # Wait for completion
        extract_job.result(timeout=max_wait_secs or self.max_wait_secs)
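For illustration, a hypothetical call showing how the destination URI is assembled from the arguments (the bq instance and the table path below are assumptions, not part of the example):

# Assuming bq is an instance of the class above, this exports
# my_project.my_dataset.my_table to gs://my-bucket/exports/my_table_daily.csv.gz
bq.export_table_to_bucket(
    table_path='my_project.my_dataset.my_table',
    bucket_name='my-bucket',
    dir_in_bucket='exports',
    output_format='csv',
    compression=True,
    output_ext='daily')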
Example #7
    def _extract_job_config(
            self, unload_plan: RecordsUnloadPlan) -> ExtractJobConfig:
        config = ExtractJobConfig()
        if isinstance(unload_plan.records_format, AvroRecordsFormat):
            config.destination_format = 'AVRO'
            # https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-avro#logical_types
            config.use_avro_logical_types = True
        else:
            raise NotImplementedError(
                f'Please add support for {unload_plan.records_format}')
        return config
Example #8
def export_table_to_gcs(dataset_ref, source_table, destination_uri):
    table_ref = dataset_ref.table(source_table)

    config = ExtractJobConfig()
    config.print_header = False

    extract_job = bq.extract_table(
        table_ref,
        destination_uri,
        location="US",
        job_config=config,
    )
    extract_job.result()
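This snippet relies on a module-level bq client created elsewhere. A minimal setup sketch, assuming the standard google-cloud-bigquery client (the project id is a placeholder):

from google.cloud import bigquery

# Hypothetical client used by export_table_to_gcs above.
bq = bigquery.Client(project="my-project")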
Example #9
    def download_table_as_file(self,
                               full_table_id,
                               dest,
                               file_type,
                               staging_location=None):
        """
        Download a BigQuery table as a file
        Args:
            full_table_id (str): fully qualified BigQuery table id
            dest (str): destination filename
            file_type (feast.sdk.resources.feature_set.FileType): (default:
                FileType.CSV) exported file format
            staging_location (str, optional): url to staging_location (currently
                supports a folder in GCS)
        Returns: (str) path to the downloaded file

        """
        if not staging_location:
            df = self.download_table_as_df(full_table_id)
            if file_type == FileType.CSV:
                df.to_csv(dest, index=False)
            elif file_type == FileType.JSON:
                df.to_json(dest, index=False)
            else:
                raise ValueError(
                    "Only FileType: CSV and JSON are supported for download_table_as_file without staging location"
                )
            return dest

        if not is_gs_path(staging_location):
            raise ValueError("staging_uri must be a directory in GCS")

        temp_file_name = "temp_{}".format(int(round(time.time() * 1000)))
        staging_file_path = os.path.join(staging_location, temp_file_name)

        job_config = ExtractJobConfig()
        job_config.destination_format = file_type
        src_table = Table.from_string(full_table_id)
        job = self.bqclient.extract_table(src_table,
                                          staging_file_path,
                                          job_config=job_config)

        # await completion
        job.result()

        bucket_name, blob_name = split_gs_path(staging_file_path)
        bucket = self.storageclient.get_bucket(bucket_name)
        blob = bucket.blob(blob_name)
        blob.download_to_filename(dest)
        return dest
Example #10
    def __extract_table_to_shard_folder(self, full_table_id,
                                        staging_location, file_type):
        shard_folder = os.path.join(staging_location,
                                    'temp_%d' % int(round(time.time() * 1000)))
        staging_file_path = os.path.join(shard_folder, "shard_*")

        job_config = ExtractJobConfig()
        job_config.destination_format = file_type
        job = self.bqclient.extract_table(
            Table.from_string(full_table_id),
            staging_file_path,
            job_config=job_config
        )
        # await completion
        job.result()
        return shard_folder
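Because the destination URI ends in shard_*, BigQuery may write several shard files under the temp folder. A hedged sketch of reading the shards back into one DataFrame, assuming CSV output and that pandas and gcsfs are available (read_shard_folder is a hypothetical helper):

import gcsfs
import pandas as pd

def read_shard_folder(shard_folder):
    # List every exported shard and concatenate them into one DataFrame.
    fs = gcsfs.GCSFileSystem()
    shard_uris = ["gs://" + p for p in fs.glob(shard_folder + "/shard_*")]
    return pd.concat((pd.read_csv(uri) for uri in shard_uris),
                     ignore_index=True)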
Example #11
def export_table_to_gcs(dataset_id,
                        table_id,
                        destination,
                        file_format='.json',
                        compression=True):
    """
    Exports data from BigQuery to an object in Google Cloud Storage.
    For more information, see the README.rst.
    Example invocation:
        $ python export_data_to_gcs.py example_dataset example_table \\
            gs://example-bucket/example-data.csv
    The dataset and table should already exist.
    """
    dataset_ref = bigquery_client.dataset(dataset_id)
    table_ref = dataset_ref.table(table_id)
    job_config = ExtractJobConfig()
    if file_format == '.json':
        job_config.destination_format = DestinationFormat.NEWLINE_DELIMITED_JSON
        if compression:
            job_config.compression = Compression.GZIP
    elif file_format == '.avro':
        job_config.destination_format = DestinationFormat.AVRO
    else:
        job_config.destination_format = DestinationFormat.CSV
        if compression:
            job_config.compression = Compression.GZIP

    job = bigquery_client.extract_table(table_ref,
                                        destination,
                                        job_config=job_config)
    job.result(timeout=500)  # Waits for job to complete
    log.info('Exported {}:{} to {}'.format(dataset_id, table_id, destination))
Example #12
    def test_begin_w_alternate_client(self):
        from google.cloud.bigquery.dataset import DatasetReference
        from google.cloud.bigquery.job import Compression
        from google.cloud.bigquery.job import DestinationFormat
        from google.cloud.bigquery.job import ExtractJobConfig

        PATH = "/projects/%s/jobs" % (self.PROJECT, )
        RESOURCE = self._make_resource(ended=True)
        EXTRACT_CONFIGURATION = {
            "sourceTable": {
                "projectId": self.PROJECT,
                "datasetId": self.DS_ID,
                "tableId": self.SOURCE_TABLE,
            },
            "destinationUris": [self.DESTINATION_URI],
            "compression": Compression.GZIP,
            "destinationFormat": DestinationFormat.NEWLINE_DELIMITED_JSON,
            "fieldDelimiter": "|",
            "printHeader": False,
        }
        RESOURCE["configuration"]["extract"] = EXTRACT_CONFIGURATION
        conn1 = _make_connection()
        client1 = _make_client(project=self.PROJECT, connection=conn1)
        conn2 = _make_connection(RESOURCE)
        client2 = _make_client(project=self.PROJECT, connection=conn2)
        source_dataset = DatasetReference(self.PROJECT, self.DS_ID)
        source = source_dataset.table(self.SOURCE_TABLE)
        config = ExtractJobConfig()
        config.compression = Compression.GZIP
        config.destination_format = DestinationFormat.NEWLINE_DELIMITED_JSON
        config.field_delimiter = "|"
        config.print_header = False
        job = self._make_one(self.JOB_ID, source, [self.DESTINATION_URI],
                             client1, config)
        with mock.patch(
                "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes"
        ) as final_attributes:
            job._begin(client=client2)

        final_attributes.assert_called_with({"path": PATH}, client2, job)

        conn1.api_request.assert_not_called()
        conn2.api_request.assert_called_once_with(
            method="POST",
            path=PATH,
            data={
                "jobReference": {
                    "projectId": self.PROJECT,
                    "jobId": self.JOB_ID
                },
                "configuration": {
                    "extract": EXTRACT_CONFIGURATION
                },
            },
            timeout=None,
        )
        self._verifyResourceProperties(job, RESOURCE)
Example #13
    def _execute_extract_table(self, destination_uri, bq_dataset_location,
                               export_gzip, export_json, print_header):
        """Starts a data export job and waits for its completion."""
        client = self._get_client()
        job_id = self._get_job_id(with_unique_suffix=True)
        if export_json:
            destination_format = 'NEWLINE_DELIMITED_JSON'
        else:
            destination_format = 'CSV'
        job_config = ExtractJobConfig(
            print_header=print_header,
            destination_format=destination_format,
            compression='GZIP' if export_gzip else 'NONE')
        try:
            job = client.get_job(job_id)
        except exceptions.NotFound:
            job = client.extract_table(self._get_full_table_name(),
                                       destination_uri,
                                       job_id=job_id,
                                       job_config=job_config,
                                       location=bq_dataset_location)
        self._wait(job)
Example #14
def query(
    query,
    project_id,
    dataset_id=None,
    table_id=None,
    output_gcs_path=None,
    dataset_location='US',
    job_config=None,
    output_path=None,
    output_filename=None,
    output_destination_format="CSV",
    job_object_output_path='/tmp/kfp/output/bigquery/query-job.json',
    output_gcs_path_output_path='/tmp/kfp/output/bigquery/query-output-path.txt',
    output_dataset_id_output_path='/tmp/kfp/output/bigquery/query-dataset-id.txt',
    output_table_id_output_path='/tmp/kfp/output/bigquery/query-table-id.txt',
):
    """Submit a query to Bigquery service and dump outputs to Bigquery table or 
    a GCS blob.
    
    Args:
        query (str): The query used by Bigquery service to fetch the results.
        project_id (str): The project to execute the query job.
        dataset_id (str): The ID of the persistent dataset to keep the results
            of the query. If the dataset does not exist, the operation will 
            create a new one.
        table_id (str): The ID of the table to keep the results of the query. If
            absent, the operation will generate a random id for the table.
        output_gcs_path (str): The GCS blob path to dump the query results to.
        dataset_location (str): The location to create the dataset. Defaults to `US`.
        job_config (dict): The full config spec for the query job.
        output_path (str): The path to where query result will be stored
        output_filename (str): The name of the file where the results will be stored
        output_destination_format (str): The name of the output destination format.
            Default is CSV, and you can also choose NEWLINE_DELIMITED_JSON and AVRO.
    Returns:
        The API representation of the completed query job.
    """
    client = bigquery.Client(project=project_id, location=dataset_location)
    if not job_config:
        job_config = bigquery.QueryJobConfig()
        job_config.create_disposition = bigquery.job.CreateDisposition.CREATE_IF_NEEDED
        job_config.write_disposition = bigquery.job.WriteDisposition.WRITE_TRUNCATE
    else:
        job_config = bigquery.QueryJobConfig.from_api_repr(job_config)
    job_id = None

    def cancel():
        if job_id:
            client.cancel_job(job_id)

    with KfpExecutionContext(on_cancel=cancel) as ctx:
        job_id = 'query_' + ctx.context_id()
        query_job = _get_job(client, job_id)
        table_ref = None
        if not query_job:
            dataset_ref = _prepare_dataset_ref(client, dataset_id,
                                               output_gcs_path,
                                               dataset_location)
            if dataset_ref:
                if not table_id:
                    table_id = job_id
                table_ref = dataset_ref.table(table_id)
                job_config.destination = table_ref
                gcp_common.dump_file(output_dataset_id_output_path,
                                     table_ref.dataset_id)
                gcp_common.dump_file(output_table_id_output_path,
                                     table_ref.table_id)
            query_job = client.query(query, job_config, job_id=job_id)
        _display_job_link(project_id, job_id)
        if output_path is not None:  # Write to local file
            result = query_job.result()
            if not os.path.exists(output_path):
                os.makedirs(output_path)
            df = result.to_dataframe()
            df.to_csv(os.path.join(output_path, output_filename))
        else:
            query_job.result()
            if output_gcs_path:
                job_id = 'extract_' + ctx.context_id()
                extract_job = _get_job(client, job_id)
                logging.info('Extracting data from table {} to {}.'.format(
                    str(table_ref), output_gcs_path))
                if not extract_job:
                    job_config = ExtractJobConfig(
                        destination_format=output_destination_format)
                    extract_job = client.extract_table(table_ref,
                                                       output_gcs_path,
                                                       job_config=job_config)
                extract_job.result()  # Wait for export to finish
            # TODO: Replace '-' with empty string when most users upgrade to Argo version which has the fix: https://github.com/argoproj/argo/pull/1653
            gcp_common.dump_file(output_gcs_path_output_path, output_gcs_path
                                 or '-')

        gcp_common.dump_file(job_object_output_path,
                             json.dumps(query_job.to_api_repr()))
        return query_job.to_api_repr()
Example #15
    def export_table_to_bucket(self,
                               table_path,  # type: str
                               bucket_name,  # type: str
                               dir_in_bucket='',  # type: Optional[str]
                               output_format='csv',  # type: Optional[str]
                               compression=False,  # type: Optional[bool]
                               output_ext='',  # type: Optional[str]
                               max_wait_secs=None,  # type: Optional[int]
                               support_multifile_export=True, # type: bool
                               explicit_filename=None  # type: Optional[str]
                               ):
        # type: (...) -> None
        """
        Export a BigQuery table to a file (or a set of files)
        in the given bucket. The output files will be in a directory with the same name
        as the table, within the bucket_name and dir_in_bucket provided.

        Args:
            table_path: Path of the table
            bucket_name: Name of the bucket to store the spreadsheet
            dir_in_bucket: The directory in the bucket to store the output files
            output_format: Format of output. It must be among 'csv', 'json' and 'avro'
            compression: Whether to use GZIP compression. Avro cannot be used with GZIP compression
            output_ext: An optional extension to output file. So that we can tell output files from
                different exports
            max_wait_secs: Maximum time to wait. Exporting a table to storage takes significantly
                longer than querying a table. If not set, it will use the class default.
            support_multifile_export: If True, and the table is large enough, then the table will be
                exported as several files suffixed with a shard number. If False, it will be exported
                as a single file.
            explicit_filename: File name. Use it as file name if specified, otherwise use table ID,
                maybe with output_ext, as file name
        Raises:
            RuntimeError if there is a problem with the export job.
        """
        bq_output_format = self._convert_to_bq_format(output_format)

        if compression and output_format == 'avro':
            raise ValueError('{} cannot be combined with GZIP compression'.format(output_format))

        src_table_ref = self.get_table_reference_from_path(table_path)

        # Generate the destination of the table content.
        if explicit_filename:
            output_filename = explicit_filename
        else:
            output_filename = src_table_ref.table_id
            if output_ext:
                output_filename += '_' + output_ext

        if support_multifile_export:
            # End in a * so that multiple shards can be written out if needed.
            output_filename += '-*'
        output_filename += '.' + output_format
        if compression:
            output_filename += '.gz'

        path = os.path.join(dir_in_bucket, output_filename)

        destination = 'gs://{}/{}'.format(bucket_name, path.lstrip().lstrip('/'))

        config = ExtractJobConfig()
        config.destination_format = bq_output_format
        config.compression = 'GZIP' if compression else 'NONE'

        extract_job = self.gclient.extract_table(src_table_ref, destination, job_config=config,
                                                 retry=self.default_retry_for_api_calls)

        # Wait for completion
        extract_job.result(timeout=max_wait_secs or self.max_wait_secs)
Example #16
    def export_table_to_bucket(
            self,
            table_path,  # type: str
            bucket_name,  # type: str
            dir_in_bucket='',  # type: Optional[str]
            output_format='csv',  # type: Optional[str]
            compression=False,  # type: Optional[bool]
            output_ext='',  # type: Optional[str]
            max_wait_secs=None,  # type: Optional[int]
            support_multifile_export=True,  # type: bool
            explicit_filename=None  # type: Optional[str]
    ):
        # type: (...) -> List[str]
        """
        Export a BigQuery table to a file (or a set of files)
        in the given bucket. The output files will be in a directory with the same name
        as the table, within the bucket_name and dir_in_bucket provided.

        Args:
            table_path: Path of the table
            bucket_name: Name of the bucket to store the spreadsheet
            dir_in_bucket: The directory in the bucket to store the output files
            output_format: Format of output. It must be among 'csv', 'json' and 'avro'
            compression: Whether to use GZIP compression. Avro cannot be used with GZIP compression
            output_ext: An optional extension to output file. So that we can tell output files from
                different exports
            max_wait_secs: Maximum time to wait. Exporting a table to storage takes significantly
                longer than querying a table. If not set, it will use the class default.
            support_multifile_export: If True, and the table is large enough, then the table will be
                exported as several files suffixed with a shard number. If False, it will be exported
                as a single file.
            explicit_filename: File name. Use it as file name if specified, otherwise use table ID,
                maybe with output_ext, as file name

        Returns:
            A list of the names of the files exported.

        Raises:
            RuntimeError if there is a problem with the export job.
        """
        bq_output_format = self._convert_to_bq_format(output_format)

        if compression and output_format == 'avro':
            raise ValueError(
                '{} cannot be combined with GZIP compression'.format(
                    output_format))

        src_table_ref = self.get_table_reference_from_path(table_path)

        # Generate the destination of the table content.
        if explicit_filename:
            output_filename = explicit_filename
        else:
            output_filename = src_table_ref.table_id
            if output_ext:
                output_filename += '_' + output_ext

        if support_multifile_export:
            # End in a * so that multiple shards can be written out if needed.
            output_filename += '-*'
        output_filename += '.' + output_format
        if compression:
            output_filename += '.gz'

        path = os.path.join(dir_in_bucket, output_filename)

        destination = 'gs://{}/{}'.format(bucket_name,
                                          path.lstrip().lstrip('/'))

        config = ExtractJobConfig()
        config.destination_format = bq_output_format
        config.compression = 'GZIP' if compression else 'NONE'

        extract_job = self.gclient.extract_table(
            src_table_ref,
            destination,
            job_config=config,
            retry=self.default_retry_for_api_calls)

        # Wait for completion
        extract_job.result(timeout=max_wait_secs or self.max_wait_secs)

        # Get the names of the files exported.
        if support_multifile_export:
            # destination_uri_file_counts is a list of ints representing the number of files
            # exported to each destination URI. Since we are only exporting to one destination, we
            # just need the first element.
            num_files_exported = extract_job.destination_uri_file_counts[0]
            # Multi-file export replaces the "*" with the file number, padded to 12 digits. See:
            # https://cloud.google.com/bigquery/docs/exporting-data#exporting_data_into_one_or_more_files  # noqa
            return [
                output_filename.replace(
                    '*',
                    str(i).zfill(MULTIFILE_EXPORT_PAD_LENGTH))
                for i in range(num_files_exported)
            ]
        return [output_filename]
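For illustration, a hypothetical call and the kind of file list it would return when the export is split into three shards (the table name and shard count are made up; the 12-digit padding follows the comment above):

# Assuming bq is an instance of the class above:
filenames = bq.export_table_to_bucket(
    table_path='my_project.my_dataset.my_table',
    bucket_name='my-bucket',
    dir_in_bucket='exports')
# filenames would then look like:
# ['my_table-000000000000.csv',
#  'my_table-000000000001.csv',
#  'my_table-000000000002.csv']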