Example #1
    def query(self, query, max_results=None, timeout=10, dry_run=False):
        """Submit a query to BigQuery.

        Args:
            query: BigQuery query string.
            max_results: maximum number of rows to return per page of results.
            timeout: how long to wait for the query to complete, in seconds,
                     before the request times out and returns.
            dry_run: if True, the query isn't actually run. A valid query
                     returns an empty response, while an invalid one returns
                     the same error message it would if it were not a dry
                     run.

        Returns:
            A tuple of (job id, query results) if the query completed. If
            dry_run is True, the job id is None and the results are empty
            for a valid query, or a dict containing the error response for
            an invalid one.
        """

        logger.debug('Executing query: %s', query)

        job_collection = self.bigquery.jobs()
        query_data = {
            'query': query,
            'timeoutMs': timeout * 1000,
            'dryRun': dry_run,
            'maxResults': max_results,
        }

        try:
            query_reply = job_collection.query(
                projectId=self.project_id, body=query_data).execute()
        except HttpError as e:
            # On a dry run, BigQuery reports an invalid query through an
            # HttpError; return the decoded error response instead of
            # raising.
            if dry_run:
                return None, json.loads(e.content)
            raise

        # Success path, following the same pattern as Example #2 below.
        # .get() keeps job_id None on a valid dry run, per the docstring.
        job_id = query_reply['jobReference'].get('jobId')
        schema = query_reply.get('schema', {'fields': None})['fields']
        rows = query_reply.get('rows', [])

        return job_id, [self._transform_row(row, schema) for row in rows]
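A minimal usage sketch of the dry-run flow; `client` and the sample query are hypothetical, assuming an instance of the class above already authorized against the BigQuery v2 API:

    # Hypothetical usage of the method above.
    query = 'SELECT word FROM [publicdata:samples.shakespeare] LIMIT 5'

    # Validate without running: a valid query yields (None, []), an
    # invalid one yields (None, <error response dict>).
    job_id, results = client.query(query, dry_run=True)
    if results:
        print('Invalid query: %s' % results)
    else:
        # Run it for real, waiting up to 30 seconds for completion.
        job_id, rows = client.query(query, timeout=30, max_results=100)
        print('Job %s returned %d rows' % (job_id, len(rows)))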
Example #2
    def query(self, query, max_results=None, timeout=10):
        """Submit a query to BigQuery.

        Args:
            query: BigQuery query string.
            max_results: maximum number of rows to return per page of results.
            timeout: how long to wait for the query to complete, in seconds,
                     before the request times out and returns.

        Returns:
            A tuple of (job id, query rows) if the query completed.
        """

        logger.debug('Executing query: %s', query)

        job_collection = self.bigquery.jobs()
        query_data = {'query': query, 'timeoutMs': timeout * 1000}

        if max_results:
            query_data['maxResults'] = max_results

        query_reply = job_collection.query(
            projectId=self.project_id, body=query_data).execute()

        job_id = query_reply['jobReference']['jobId']
        schema = query_reply.get('schema', {'fields': None})['fields']
        rows = query_reply.get('rows', [])

        return job_id, [self._transform_row(row, schema) for row in rows]
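`_transform_row` is not part of this excerpt. A rough sketch of what such a helper has to do, assuming the v2 API's row format (each row arrives as {'f': [{'v': value}, ...]} and the schema is a list of field dicts); the body here is an illustration, not the library's actual implementation:

    def _transform_row(self, row, schema):
        # Pair each raw cell with its schema field and key the result by
        # field name. A full implementation would also coerce each value
        # according to field['type'] (INTEGER, FLOAT, TIMESTAMP, nested
        # RECORDs, and so on).
        return dict(
            (field['name'], cell['v'])
            for field, cell in zip(schema, row['f'])
        )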
Example #3
    def import_data_from_uris(
            self,
            source_uris,
            dataset,
            table,
            schema=None,
            job=None,
            source_format=None,
            create_disposition=None,
            write_disposition=None,
            encoding=None,
            ignore_unknown_values=None,
            max_bad_records=None,
            allow_jagged_rows=None,
            allow_quoted_newlines=None,
            field_delimiter=None,
            quote=None,
            skip_leading_rows=None,
    ):
        """
        Imports data into a BigQuery table from cloud storage.
        Args:
            source_uris: required string or list of strings representing
                            the uris on cloud storage of the form:
                             gs://bucket/filename
            dataset: required string id of the dataset
            table: required string id of the table
            job: optional string identifying the job (a unique jobid
                    is automatically generated if not provided)
            schema: optional list representing the bigquery schema
            source_format: optional string
                    (one of the JOB_SOURCE_FORMAT_* constants)
            create_disposition: optional string
                    (one of the JOB_CREATE_* constants)
            write_disposition: optional string
                    (one of the JOB_WRITE_* constants)
            encoding: optional string default
                    (one of the JOB_ENCODING_* constants)
            ignore_unknown_values: optional boolean
            max_bad_records: optional boolean
            allow_jagged_rows: optional boolean for csv only
            allow_quoted_newlines: optional boolean for csv only
            field_delimiter: optional string for csv only
            quote: optional string the quote character for csv only
            skip_leading_rows: optional int for csv only

            Optional arguments with value None are determined by
            BigQuery as described:
            https://developers.google.com/bigquery/docs/reference/v2/jobs

        Returns:
            dict, a BigQuery job resource
        Raises:
            JobInsertException on http/auth failures or error in result
        """
        if not isinstance(source_uris, list):
            source_uris = [source_uris]

        configuration = {
            "destinationTable": {
                "projectId": self.project_id,
                "tableId": table,
                "datasetId": dataset
            },
            "sourceUris": source_uris,
        }

        # Check against None explicitly so deliberately-passed falsy
        # values (0, False) are still forwarded to BigQuery.
        if max_bad_records is not None:
            configuration['maxBadRecords'] = max_bad_records

        if ignore_unknown_values is not None:
            configuration['ignoreUnknownValues'] = ignore_unknown_values

        if create_disposition is not None:
            configuration['createDisposition'] = create_disposition

        if write_disposition is not None:
            configuration['writeDisposition'] = write_disposition

        if encoding is not None:
            configuration['encoding'] = encoding

        if schema is not None:
            configuration['schema'] = schema

        if source_format is not None:
            configuration['sourceFormat'] = source_format

        if not job:
            # Named `digest` rather than `hex` to avoid shadowing the
            # built-in.
            digest = self._generate_hex_for_uris(source_uris)
            job = "{dataset}-{table}-{digest}".format(
                dataset=dataset,
                table=table,
                digest=digest
            )

        if source_format == JOB_SOURCE_FORMAT_CSV:
            if field_delimiter is not None:
                configuration['fieldDelimiter'] = field_delimiter

            if allow_jagged_rows is not None:
                configuration['allowJaggedRows'] = allow_jagged_rows

            if allow_quoted_newlines is not None:
                configuration['allowQuotedNewlines'] = allow_quoted_newlines

            if quote is not None:
                configuration['quote'] = quote

            if skip_leading_rows is not None:
                configuration['skipLeadingRows'] = skip_leading_rows

        elif any(v is not None for v in (field_delimiter, allow_jagged_rows,
                                         allow_quoted_newlines, quote,
                                         skip_leading_rows)):
            all_values = dict(field_delimiter=field_delimiter,
                              allow_jagged_rows=allow_jagged_rows,
                              allow_quoted_newlines=allow_quoted_newlines,
                              skip_leading_rows=skip_leading_rows,
                              quote=quote)
            non_null_values = {k: v for k, v in all_values.items()
                               if v is not None}
            raise Exception("Parameters field_delimiter, allow_jagged_rows, "
                            "allow_quoted_newlines, quote and "
                            "skip_leading_rows are only allowed when "
                            "source_format=JOB_SOURCE_FORMAT_CSV: %s"
                            % non_null_values)

        body = {
            "configuration": {
                "load": configuration
            },
            "jobReference": {
                "projectId": self.project_id,
                "jobId": job
            }
        }

        logger.debug("Creating load job %s", body)
        job_resource = self.bigquery.jobs().insert(
            projectId=self.project_id, body=body).execute()
        self._raise_insert_exception_if_error(job_resource)
        return job_resource
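A hypothetical call that loads a CSV with a header row; the client instance, bucket, dataset, table, and schema are invented for illustration, and JOB_SOURCE_FORMAT_CSV / JOB_WRITE_TRUNCATE are assumed to be the module-level constants the docstring refers to:

    # Hypothetical usage of the method above.
    schema = [
        {'name': 'word', 'type': 'STRING', 'mode': 'REQUIRED'},
        {'name': 'count', 'type': 'INTEGER', 'mode': 'NULLABLE'},
    ]

    job_resource = client.import_data_from_uris(
        'gs://example-bucket/words-*.csv',  # a bare string is wrapped in a list
        'my_dataset',
        'word_counts',
        schema=schema,
        source_format=JOB_SOURCE_FORMAT_CSV,
        write_disposition=JOB_WRITE_TRUNCATE,
        skip_leading_rows=1,  # skip the CSV header row
    )
    print(job_resource['jobReference']['jobId'])

Note that the insert returns as soon as the job is created: load jobs run asynchronously, so completion has to be polled for separately.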