def query(self, query, max_results=None, timeout=10, dry_run=False):
    """Submit a query to BigQuery.

    Args:
        query: BigQuery query string.
        max_results: maximum number of rows to return per page of results.
        timeout: how long to wait for the query to complete, in seconds,
                 before the request times out and returns.
        dry_run: if True, the query isn't actually run. A valid query will
                 return an empty response, while an invalid one will return
                 the same error message it would if it wasn't a dry run.

    Returns:
        job id and query results if query completed. If dry_run is True,
        job id will be None and results will be empty if the query is valid
        or a dict containing the response if invalid.
    """
    logger.debug('Executing query: %s' % query)

    job_collection = self.bigquery.jobs()
    query_data = {
        'query': query,
        'timeoutMs': timeout * 1000,
        'dryRun': dry_run,
        'maxResults': max_results,
    }

    try:
        query_reply = job_collection.query(
            projectId=self.project_id, body=query_data).execute()
    except HttpError as e:
        # "except X as e" replaces the Python-2-only "except X, e" form.
        if dry_run:
            return None, json.loads(e.content)
        raise

    # Success path (was missing: the function previously fell off the end
    # and returned None, contradicting the docstring). A dry-run reply may
    # carry a jobReference without a jobId, hence .get().
    job_id = query_reply['jobReference'].get('jobId')
    schema = query_reply.get('schema', {'fields': None})['fields']
    rows = query_reply.get('rows', [])
    return job_id, [self._transform_row(row, schema) for row in rows]
def query(self, query, max_results=None, timeout=10, dry_run=False):
    """Submit a query to BigQuery.

    Args:
        query: BigQuery query string.
        max_results: maximum number of rows to return per page of results.
        timeout: how long to wait for the query to complete, in seconds,
                 before the request times out and returns.
        dry_run: if True, the query isn't actually run. A valid query will
                 return an empty response, while an invalid one will return
                 the same error message it would if it wasn't a dry run.

    Returns:
        job id and query results if query completed. If dry_run is True,
        job id will be None and results will be empty if the query is valid
        or a dict containing the response if invalid.
    """
    logger.debug('Executing query: %s' % query)

    job_collection = self.bigquery.jobs()
    query_data = {
        'query': query,
        'timeoutMs': timeout * 1000,
        'dryRun': dry_run,
        'maxResults': max_results,
    }

    try:
        query_reply = job_collection.query(
            projectId=self.project_id, body=query_data).execute()
    except HttpError as e:
        # "except X as e" replaces the Python-2-only "except X, e" form.
        if dry_run:
            return None, json.loads(e.content)
        raise

    # Success path (was missing: the function previously fell off the end
    # and returned None, contradicting the docstring). A dry-run reply may
    # carry a jobReference without a jobId, hence .get().
    job_id = query_reply['jobReference'].get('jobId')
    schema = query_reply.get('schema', {'fields': None})['fields']
    rows = query_reply.get('rows', [])
    return job_id, [self._transform_row(row, schema) for row in rows]
def query(self, query, max_results=None, timeout=10):
    """Submit a query to BigQuery.

    Args:
        query: BigQuery query string.
        max_results: maximum number of rows to return per page of results.
        timeout: how long to wait for the query to complete, in seconds,
                 before the request times out and returns.

    Returns:
        job id and query rows if query completed.
    """
    logger.debug('Executing query: %s' % query)

    # Build the request body; maxResults is only sent when given,
    # letting BigQuery apply its own default page size otherwise.
    request_body = {'query': query, 'timeoutMs': timeout * 1000}
    if max_results:
        request_body['maxResults'] = max_results

    reply = self.bigquery.jobs().query(
        projectId=self.project_id, body=request_body).execute()

    job_id = reply['jobReference']['jobId']
    # A reply with no schema/rows (e.g. still-running job) yields
    # schema=None and an empty row list.
    schema = reply.get('schema', {'fields': None})['fields']
    transformed = [self._transform_row(row, schema)
                   for row in reply.get('rows', [])]
    return job_id, transformed
def import_data_from_uris(
        self,
        source_uris,
        dataset,
        table,
        schema=None,
        job=None,
        source_format=None,
        create_disposition=None,
        write_disposition=None,
        encoding=None,
        ignore_unknown_values=None,
        max_bad_records=None,
        allow_jagged_rows=None,
        allow_quoted_newlines=None,
        field_delimiter=None,
        quote=None,
        skip_leading_rows=None,
):
    """
    Imports data into a BigQuery table from cloud storage.

    Args:
        source_uris: required string or list of strings representing
                     the uris on cloud storage of the form:
                     gs://bucket/filename
        dataset: required string id of the dataset
        table: required string id of the table
        job: optional string identifying the job (a unique jobid
             is automatically generated if not provided)
        schema: optional list representing the bigquery schema
        source_format: optional string
                       (one of the JOB_SOURCE_FORMAT_* constants)
        create_disposition: optional string
                            (one of the JOB_CREATE_* constants)
        write_disposition: optional string
                           (one of the JOB_WRITE_* constants)
        encoding: optional string default
                  (one of the JOB_ENCODING_* constants)
        ignore_unknown_values: optional boolean
        max_bad_records: optional int, number of bad records tolerated
                         before the job fails
        allow_jagged_rows: optional boolean for csv only
        allow_quoted_newlines: optional boolean for csv only
        field_delimiter: optional string for csv only
        quote: optional string the quote character for csv only
        skip_leading_rows: optional int for csv only

        Optional arguments with value None are determined by
        BigQuery as described:
        https://developers.google.com/bigquery/docs/reference/v2/jobs

    Returns:
        dict, a BigQuery job resource

    Raises:
        JobInsertException on http/auth failures or error in result
    """
    # Accept a single uri or a list of uris uniformly.
    source_uris = source_uris if isinstance(source_uris, list) \
        else [source_uris]

    configuration = {
        "destinationTable": {
            "projectId": self.project_id,
            "tableId": table,
            "datasetId": dataset
        },
        "sourceUris": source_uris,
    }

    if max_bad_records:
        configuration['maxBadRecords'] = max_bad_records

    if ignore_unknown_values:
        configuration['ignoreUnknownValues'] = ignore_unknown_values

    if create_disposition:
        configuration['createDisposition'] = create_disposition

    if write_disposition:
        configuration['writeDisposition'] = write_disposition

    if encoding:
        configuration['encoding'] = encoding

    if schema:
        configuration['schema'] = schema

    if source_format:
        configuration['sourceFormat'] = source_format

    if not job:
        # Derive a deterministic job id from the source uris.
        # ("digest" — renamed from "hex", which shadowed the builtin.)
        digest = self._generate_hex_for_uris(source_uris)
        job = "{dataset}-{table}-{digest}".format(
            dataset=dataset,
            table=table,
            digest=digest
        )

    if source_format == JOB_SOURCE_FORMAT_CSV:
        if field_delimiter:
            configuration['fieldDelimiter'] = field_delimiter

        if allow_jagged_rows:
            configuration['allowJaggedRows'] = allow_jagged_rows

        if allow_quoted_newlines:
            configuration['allowQuotedNewlines'] = allow_quoted_newlines

        if quote:
            configuration['quote'] = quote

        if skip_leading_rows:
            configuration['skipLeadingRows'] = skip_leading_rows

    elif field_delimiter or allow_jagged_rows \
            or allow_quoted_newlines or quote or skip_leading_rows:
        # CSV-only options were supplied with a non-CSV source format;
        # report exactly which ones so the caller can fix the call.
        all_values = dict(field_delimiter=field_delimiter,
                          allow_jagged_rows=allow_jagged_rows,
                          allow_quoted_newlines=allow_quoted_newlines,
                          skip_leading_rows=skip_leading_rows,
                          quote=quote)
        non_null_values = dict((k, v) for k, v
                               in all_values.items()
                               if v)
        raise Exception("Parameters field_delimiter, allow_jagged_rows, "
                        "allow_quoted_newlines, quote and "
                        "skip_leading_rows are only allowed when "
                        "source_format=JOB_SOURCE_FORMAT_CSV: %s"
                        % non_null_values)

    body = {
        "configuration": {
            'load': configuration
        },
        "jobReference": {
            "projectId": self.project_id,
            "jobId": job
        }
    }

    logger.debug("Creating load job %s" % body)
    job_resource = self.bigquery.jobs() \
        .insert(projectId=self.project_id, body=body) \
        .execute()
    self._raise_insert_exception_if_error(job_resource)
    return job_resource
def import_data_from_uris(
        self,
        source_uris,
        dataset,
        table,
        schema=None,
        job=None,
        source_format=None,
        create_disposition=None,
        write_disposition=None,
        encoding=None,
        ignore_unknown_values=None,
        max_bad_records=None,
        allow_jagged_rows=None,
        allow_quoted_newlines=None,
        field_delimiter=None,
        quote=None,
        skip_leading_rows=None,
):
    """
    Imports data into a BigQuery table from cloud storage.

    Args:
        source_uris: required string or list of strings representing
                     the uris on cloud storage of the form:
                     gs://bucket/filename
        dataset: required string id of the dataset
        table: required string id of the table
        job: optional string identifying the job (a unique jobid
             is automatically generated if not provided)
        schema: optional list representing the bigquery schema
        source_format: optional string
                       (one of the JOB_SOURCE_FORMAT_* constants)
        create_disposition: optional string
                            (one of the JOB_CREATE_* constants)
        write_disposition: optional string
                           (one of the JOB_WRITE_* constants)
        encoding: optional string default
                  (one of the JOB_ENCODING_* constants)
        ignore_unknown_values: optional boolean
        max_bad_records: optional int, number of bad records tolerated
                         before the job fails
        allow_jagged_rows: optional boolean for csv only
        allow_quoted_newlines: optional boolean for csv only
        field_delimiter: optional string for csv only
        quote: optional string the quote character for csv only
        skip_leading_rows: optional int for csv only

        Optional arguments with value None are determined by
        BigQuery as described:
        https://developers.google.com/bigquery/docs/reference/v2/jobs

    Returns:
        dict, a BigQuery job resource

    Raises:
        JobInsertException on http/auth failures or error in result
    """
    # Accept a single uri or a list of uris uniformly.
    source_uris = source_uris if isinstance(source_uris, list) \
        else [source_uris]

    configuration = {
        "destinationTable": {
            "projectId": self.project_id,
            "tableId": table,
            "datasetId": dataset
        },
        "sourceUris": source_uris,
    }

    if max_bad_records:
        configuration['maxBadRecords'] = max_bad_records

    if ignore_unknown_values:
        configuration['ignoreUnknownValues'] = ignore_unknown_values

    if create_disposition:
        configuration['createDisposition'] = create_disposition

    if write_disposition:
        configuration['writeDisposition'] = write_disposition

    if encoding:
        configuration['encoding'] = encoding

    if schema:
        configuration['schema'] = schema

    if source_format:
        configuration['sourceFormat'] = source_format

    if not job:
        # Derive a deterministic job id from the source uris.
        # ("digest" — renamed from "hex", which shadowed the builtin.)
        digest = self._generate_hex_for_uris(source_uris)
        job = "{dataset}-{table}-{digest}".format(
            dataset=dataset,
            table=table,
            digest=digest
        )

    if source_format == JOB_SOURCE_FORMAT_CSV:
        if field_delimiter:
            configuration['fieldDelimiter'] = field_delimiter

        if allow_jagged_rows:
            configuration['allowJaggedRows'] = allow_jagged_rows

        if allow_quoted_newlines:
            configuration['allowQuotedNewlines'] = allow_quoted_newlines

        if quote:
            configuration['quote'] = quote

        if skip_leading_rows:
            configuration['skipLeadingRows'] = skip_leading_rows

    elif field_delimiter or allow_jagged_rows \
            or allow_quoted_newlines or quote or skip_leading_rows:
        # CSV-only options were supplied with a non-CSV source format;
        # report exactly which ones so the caller can fix the call.
        all_values = dict(field_delimiter=field_delimiter,
                          allow_jagged_rows=allow_jagged_rows,
                          allow_quoted_newlines=allow_quoted_newlines,
                          skip_leading_rows=skip_leading_rows,
                          quote=quote)
        non_null_values = dict((k, v) for k, v
                               in all_values.items()
                               if v)
        raise Exception("Parameters field_delimiter, allow_jagged_rows, "
                        "allow_quoted_newlines, quote and "
                        "skip_leading_rows are only allowed when "
                        "source_format=JOB_SOURCE_FORMAT_CSV: %s"
                        % non_null_values)

    body = {
        "configuration": {
            'load': configuration
        },
        "jobReference": {
            "projectId": self.project_id,
            "jobId": job
        }
    }

    logger.debug("Creating load job %s" % body)
    job_resource = self.bigquery.jobs() \
        .insert(projectId=self.project_id, body=body) \
        .execute()
    self._raise_insert_exception_if_error(job_resource)
    return job_resource