Example 1
    def write_to_table(
            self,
            query,
            dataset=None,
            table=None,
            allow_large_results=None,
            use_query_cache=None,
            priority=None,
            create_disposition=None,
            write_disposition=None,
    ):
        """
        Write query result to table. If dataset or table is not provided,
        BigQuery will write the result to a temporary table.
        Args:
            query: required BigQuery query string.
            dataset: optional string id of the dataset
            table: optional string id of the table
            allow_large_results: optional boolean
            use_query_cache: optional boolean
            priority: optional string
                    (one of the JOB_PRIORITY_* constants)
            create_disposition: optional string
                    (one of the JOB_CREATE_* constants)
            write_disposition: optional string
                    (one of the JOB_WRITE_* constants)

            Optional arguments with value None are determined by
            BigQuery as described:
            https://developers.google.com/bigquery/docs/reference/v2/jobs

        Returns:
            dict, a BigQuery job resource
        Raises:
            JobInsertException on http/auth failures or error in result
        """

        configuration = {
            "query": query,
        }

        if dataset and table:
            configuration['destinationTable'] = {
                "projectId": self.project_id,
                "tableId": table,
                "datasetId": dataset
            }

        if allow_large_results is not None:
            configuration['allowLargeResults'] = allow_large_results

        if use_query_cache is not None:
            configuration['useQueryCache'] = use_query_cache

        if priority:
            configuration['priority'] = priority

        if create_disposition:
            configuration['createDisposition'] = create_disposition

        if write_disposition:
            configuration['writeDisposition'] = write_disposition

        body = {
            "configuration": {
                'query': configuration
            }
        }

        logger.info("Creating write to table job %s" % body)
        job_resource = self.bigquery.jobs() \
            .insert(projectId=self.project_id, body=body) \
            .execute()
        self._raise_insert_exception_if_error(job_resource)
        return job_resource
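
A minimal usage sketch for write_to_table() follows. The client object, the dataset/table names, and the bare string literal are illustrative assumptions; in practice the module's JOB_WRITE_* / JOB_PRIORITY_* constants named in the docstring would be passed instead of raw strings.

    # Hypothetical usage: `client` stands for an instance of the class that
    # defines write_to_table(); how it is constructed depends on the project.
    job = client.write_to_table(
        query="SELECT word, COUNT(*) AS n FROM my_dataset.words GROUP BY word",
        dataset="my_dataset",   # omit dataset/table to let BigQuery use a temporary table
        table="word_counts",
        allow_large_results=True,
        write_disposition="WRITE_TRUNCATE",  # assumed literal for a JOB_WRITE_* constant
    )
    print(job["jobReference"]["jobId"])  # the returned dict is the BigQuery job resource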
Example 2
    def export_data_to_uris(
            self,
            destination_uris,
            dataset,
            table,
            job=None,
            compression=None,
            destination_format=None,
            print_header=None,
            field_delimiter=None,
    ):
        """
        Export data from a BigQuery table to Cloud Storage.
        Args:
            destination_uris: required string or list of strings representing
                              the URIs on Cloud Storage, of the form:
                              gs://bucket/filename
            dataset: required string id of the dataset
            table: required string id of the table
            job: optional string identifying the job (a unique jobid
                    is automatically generated if not provided)
            compression: optional string
                    (one of the JOB_COMPRESSION_* constants)
            destination_format: optional string
                    (one of the JOB_DESTINATION_FORMAT_* constants)
            print_header: optional boolean
            field_delimiter: optional string

            Optional arguments with value None are determined by
            BigQuery as described:
            https://developers.google.com/bigquery/docs/reference/v2/jobs

        Returns:
            dict, a BigQuery job resource
        Raises:
            JobInsertException on http/auth failures or error in result
        """
        destination_uris = destination_uris \
            if isinstance(destination_uris, list) else [destination_uris]

        configuration = {
            "sourceTable": {
                "projectId": self.project_id,
                "tableId": table,
                "datasetId": dataset
            },
            "destinationUris": destination_uris,
        }

        if compression:
            configuration['compression'] = compression

        if destination_format:
            configuration['destinationFormat'] = destination_format

        if print_header is not None:
            configuration['printHeader'] = print_header

        if field_delimiter:
            configuration['fieldDelimiter'] = field_delimiter

        if not job:
            hex = self._generate_hex_for_uris(destination_uris)
            job = "{dataset}-{table}-{digest}".format(
                dataset=dataset,
                table=table,
                digest=hex
            )

        body = {
            "configuration": {
                'extract': configuration
            },
            "jobReference": {
                "projectId": self.project_id,
                "jobId": job
            }
        }

        logger.info("Creating export job %s" % body)
        job_resource = self.bigquery.jobs() \
            .insert(projectId=self.project_id, body=body) \
            .execute()
        self._raise_insert_exception_if_error(job_resource)
        return job_resource
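
A hedged usage sketch for export_data_to_uris(): the client object, the bucket path, and the raw string literals are assumptions standing in for the JOB_COMPRESSION_* and JOB_DESTINATION_FORMAT_* constants mentioned in the docstring.

    # Hypothetical usage: export a table to Cloud Storage as gzipped JSON.
    job = client.export_data_to_uris(
        destination_uris="gs://my-bucket/export-*.json",  # a single string is wrapped into a list
        dataset="my_dataset",
        table="my_table",
        destination_format="NEWLINE_DELIMITED_JSON",  # assumed literal for a JOB_DESTINATION_FORMAT_* constant
        compression="GZIP",                           # assumed literal for a JOB_COMPRESSION_* constant
    )
    # No job id was passed, so one of the form "<dataset>-<table>-<digest>" is generated.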
Example 3
    def export_data_to_uris(
        self,
        destination_uris,
        dataset,
        table,
        job=None,
        compression=None,
        destination_format=None,
        print_header=None,
        field_delimiter=None,
    ):
        """
        Export data from a BigQuery table to Cloud Storage.
        Args:
            destination_uris: required string or list of strings representing
                              the URIs on Cloud Storage, of the form:
                              gs://bucket/filename
            dataset: required string id of the dataset
            table: required string id of the table
            job: optional string identifying the job (a unique jobid
                    is automatically generated if not provided)
            compression: optional string
                    (one of the JOB_COMPRESSION_* constants)
            destination_format: optional string
                    (one of the JOB_DESTINATION_FORMAT_* constants)
            print_header: optional boolean
            field_delimiter: optional string

            Optional arguments with value None are determined by
            BigQuery as described:
            https://developers.google.com/bigquery/docs/reference/v2/jobs

        Returns:
            dict, a BigQuery job resource
        Raises:
            JobInsertException on http/auth failures or error in result
        """
        destination_uris = destination_uris \
            if isinstance(destination_uris, list) else [destination_uris]

        configuration = {
            "sourceTable": {
                "projectId": self.project_id,
                "tableId": table,
                "datasetId": dataset
            },
            "destinationUris": destination_uris,
        }

        if compression:
            configuration['compression'] = compression

        if destination_format:
            configuration['destinationFormat'] = destination_format

        if print_header is not None:
            configuration['printHeader'] = print_header

        if field_delimiter:
            configuration['fieldDelimiter'] = field_delimiter

        if not job:
            hex = self._generate_hex_for_uris(destination_uris)
            job = "{dataset}-{table}-{digest}".format(dataset=dataset,
                                                      table=table,
                                                      digest=hex)

        body = {
            "configuration": {
                'extract': configuration
            },
            "jobReference": {
                "projectId": self.project_id,
                "jobId": job
            }
        }

        logger.info("Creating export job %s" % body)
        job_resource = self.bigquery.jobs() \
            .insert(projectId=self.project_id, body=body) \
            .execute()
        self._raise_insert_exception_if_error(job_resource)
        return job_resource
Example 4
    def write_to_table(
        self,
        query,
        dataset=None,
        table=None,
        allow_large_results=None,
        use_query_cache=None,
        priority=None,
        create_disposition=None,
        write_disposition=None,
    ):
        """
        Write query result to table. If dataset or table is not provided,
        BigQuery will write the result to a temporary table.
        Args:
            query: required BigQuery query string.
            dataset: optional string id of the dataset
            table: optional string id of the table
            allow_large_results: optional boolean
            use_query_cache: optional boolean
            priority: optional string
                    (one of the JOB_PRIORITY_* constants)
            create_disposition: optional string
                    (one of the JOB_CREATE_* constants)
            write_disposition: optional string
                    (one of the JOB_WRITE_* constants)

            Optional arguments with value None are determined by
            BigQuery as described:
            https://developers.google.com/bigquery/docs/reference/v2/jobs

        Returns:
            dict, a BigQuery job resource
        Raises:
            JobInsertException on http/auth failures or error in result
        """

        configuration = {
            "query": query,
        }

        if dataset and table:
            configuration['destinationTable'] = {
                "projectId": self.project_id,
                "tableId": table,
                "datasetId": dataset
            }

        if allow_large_results is not None:
            configuration['allowLargeResults'] = allow_large_results

        if use_query_cache is not None:
            configuration['useQueryCache'] = use_query_cache

        if priority:
            configuration['priority'] = priority

        if create_disposition:
            configuration['createDisposition'] = create_disposition

        if write_disposition:
            configuration['writeDisposition'] = write_disposition

        body = {"configuration": {'query': configuration}}

        logger.info("Creating write to table job %s" % body)
        job_resource = self.bigquery.jobs() \
            .insert(projectId=self.project_id, body=body) \
            .execute()
        self._raise_insert_exception_if_error(job_resource)
        return job_resource
Example 5
    def import_data_from_uris(
            self,
            source_uris,
            dataset,
            table,
            schema=None,
            job=None,
            source_format=None,
            create_disposition=None,
            write_disposition=None,
            encoding=None,
            ignore_unknown_values=None,
            max_bad_records=None,
            allow_jagged_rows=None,
            allow_quoted_newlines=None,
            field_delimiter=None,
            quote=None,
            skip_leading_rows=None,
    ):
        """
        Import data into a BigQuery table from Cloud Storage.
        Args:
            source_uris: required string or list of strings representing
                            the URIs on Cloud Storage, of the form:
                            gs://bucket/filename
            dataset: required string id of the dataset
            table: required string id of the table
            job: optional string identifying the job (a unique jobid
                    is automatically generated if not provided)
            schema: optional list representing the bigquery schema
            source_format: optional string
                    (one of the JOB_SOURCE_FORMAT_* constants)
            create_disposition: optional string
                    (one of the JOB_CREATE_* constants)
            write_disposition: optional string
                    (one of the JOB_WRITE_* constants)
            encoding: optional string
                    (one of the JOB_ENCODING_* constants)
            ignore_unknown_values: optional boolean
            max_bad_records: optional int
            allow_jagged_rows: optional boolean for csv only
            allow_quoted_newlines: optional boolean for csv only
            field_delimiter: optional string for csv only
            quote: optional string, the quote character, for csv only
            skip_leading_rows: optional int for csv only

            Optional arguments with value None are determined by
            BigQuery as described:
            https://developers.google.com/bigquery/docs/reference/v2/jobs

        Returns:
            dict, a BigQuery job resource
        Raises:
            JobInsertException on http/auth failures or error in result
        """
        source_uris = source_uris if isinstance(source_uris, list) \
            else [source_uris]

        configuration = {
            "destinationTable": {
                "projectId": self.project_id,
                "tableId": table,
                "datasetId": dataset
            },
            "sourceUris": source_uris,
        }

        if max_bad_records:
            configuration['maxBadRecords'] = max_bad_records

        if ignore_unknown_values:
            configuration['ignoreUnknownValues'] = ignore_unknown_values

        if create_disposition:
            configuration['createDisposition'] = create_disposition

        if write_disposition:
            configuration['writeDisposition'] = write_disposition

        if encoding:
            configuration['encoding'] = encoding

        if schema:
            configuration['schema'] = {'fields': schema}

        if source_format:
            configuration['sourceFormat'] = source_format

        if not job:
            hex = self._generate_hex_for_uris(source_uris)
            job = "{dataset}-{table}-{digest}".format(
                dataset=dataset,
                table=table,
                digest=hex
            )

        if source_format == JOB_SOURCE_FORMAT_CSV:
            if field_delimiter:
                configuration['fieldDelimiter'] = field_delimiter

            if allow_jagged_rows:
                configuration['allowJaggedRows'] = allow_jagged_rows

            if allow_quoted_newlines:
                configuration['allowQuotedNewlines'] = allow_quoted_newlines

            if quote:
                configuration['quote'] = quote

            if skip_leading_rows:
                configuration['skipLeadingRows'] = skip_leading_rows

        elif field_delimiter or allow_jagged_rows \
                or allow_quoted_newlines or quote or skip_leading_rows:
            all_values = dict(field_delimiter=field_delimiter,
                              allow_jagged_rows=allow_jagged_rows,
                              allow_quoted_newlines=allow_quoted_newlines,
                              skip_leading_rows=skip_leading_rows,
                              quote=quote)
            non_null_values = dict((k, v) for k, v
                                   in all_values.items()
                                   if v)
            raise Exception("Parameters field_delimiter, allow_jagged_rows, "
                            "allow_quoted_newlines, quote and "
                            "skip_leading_rows are only allowed when "
                            "source_format=JOB_SOURCE_FORMAT_CSV: %s"
                            % non_null_values)

        body = {
            "configuration": {
                'load': configuration
            },
            "jobReference": {
                "projectId": self.project_id,
                "jobId": job
            }
        }

        logger.info("Creating load job %s" % body)
        job_resource = self.bigquery.jobs() \
            .insert(projectId=self.project_id, body=body) \
            .execute()
        self._raise_insert_exception_if_error(job_resource)
        return job_resource
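
A usage sketch for the CSV path of import_data_from_uris(): the client object, the schema fields, and the bucket URI are illustrative assumptions, and the CSV-only keyword arguments are passed only because source_format is the CSV constant (other formats raise the exception shown above).

    # Hypothetical usage: load CSV files from Cloud Storage into a table.
    schema = [
        {"name": "word", "type": "STRING", "mode": "REQUIRED"},
        {"name": "count", "type": "INTEGER", "mode": "NULLABLE"},
    ]
    job = client.import_data_from_uris(
        source_uris=["gs://my-bucket/words-*.csv"],
        dataset="my_dataset",
        table="word_counts",
        schema=schema,
        source_format="CSV",     # assumed literal for JOB_SOURCE_FORMAT_CSV
        skip_leading_rows=1,     # csv-only option, allowed because source_format is CSV
        field_delimiter=",",
    )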