def run_query(self, query, **kwargs):
    from concurrent.futures import TimeoutError
    from google.auth.exceptions import RefreshError
    from google.cloud import bigquery

    job_config = {
        "query": {
            "useLegacySql": self.dialect == "legacy"
            # 'allowLargeResults', 'createDisposition',
            # 'preserveNulls', destinationTable, useQueryCache
        }
    }
    config = kwargs.get("configuration")
    if config is not None:
        job_config.update(config)

        if "query" in config and "query" in config["query"]:
            if query is not None:
                raise ValueError(
                    "Query statement can't be specified "
                    "inside config while it is specified "
                    "as parameter"
                )
            query = config["query"].pop("query")

    self._start_timer()

    try:
        logger.debug("Requesting query... ")
        query_reply = self.client.query(
            query,
            job_config=bigquery.QueryJobConfig.from_api_repr(job_config),
            location=self.location,
            project=self.project_id,
        )
        logger.debug("Query running...")
    except (RefreshError, ValueError):
        if self.private_key:
            raise AccessDenied("The service account credentials are not valid")
        else:
            raise AccessDenied(
                "The credentials have been revoked or expired, "
                "please re-run the application to re-authorize"
            )
    except self.http_error as ex:
        self.process_http_error(ex)

    job_id = query_reply.job_id
    logger.debug("Job ID: %s" % job_id)

    while query_reply.state != "DONE":
        self.log_elapsed_seconds(" Elapsed", "s. Waiting...")

        timeout_ms = job_config["query"].get("timeoutMs")
        if timeout_ms and timeout_ms < self.get_elapsed_seconds() * 1000:
            raise QueryTimeout("Query timeout: {} ms".format(timeout_ms))

        timeout_sec = 1.0
        if timeout_ms:
            # Wait at most 1 second so we can show progress bar
            timeout_sec = min(1.0, timeout_ms / 1000.0)

        try:
            query_reply.result(timeout=timeout_sec)
        except TimeoutError:
            # Use our own timeout logic
            pass
        except self.http_error as ex:
            self.process_http_error(ex)

    if query_reply.cache_hit:
        logger.debug("Query done.\nCache hit.\n")
    else:
        bytes_processed = query_reply.total_bytes_processed or 0
        bytes_billed = query_reply.total_bytes_billed or 0
        logger.debug(
            "Query done.\nProcessed: {} Billed: {}".format(
                self.sizeof_fmt(bytes_processed),
                self.sizeof_fmt(bytes_billed),
            )
        )
        logger.debug(
            "Standard price: ${:,.2f} USD\n".format(
                bytes_billed * self.query_price_for_TB
            )
        )

    try:
        rows_iter = query_reply.result()
    except self.http_error as ex:
        self.process_http_error(ex)

    schema_fields = [field.to_api_repr() for field in rows_iter.schema]
    nullsafe_dtypes = _bqschema_to_nullsafe_dtypes(schema_fields)
    df = rows_iter.to_dataframe(
        dtypes=nullsafe_dtypes, bqstorage_client=self.bqstorage_client
    )

    if df.empty:
        df = _cast_empty_df_dtypes(schema_fields, df)

    # Ensure any TIMESTAMP columns are tz-aware.
    df = _localize_df(schema_fields, df)

    logger.debug("Got {} rows.\n".format(rows_iter.total_rows))

    return df
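# Illustrative sketch only: _bqschema_to_nullsafe_dtypes above is a
# pandas-gbq internal helper whose body is not shown in this snippet.
# The general idea is to map BigQuery column types to pandas dtypes that
# can represent NULLs (e.g. nullable "Int64" rather than numpy int64,
# which cannot hold missing values). The type table below is hypothetical
# and may differ from the library's actual mapping.
def bqschema_to_nullsafe_dtypes_sketch(schema_fields):
    type_map = {
        "INTEGER": "Int64",    # pandas nullable integer
        "BOOLEAN": "boolean",  # pandas nullable boolean
        "FLOAT": "float64",    # NaN already encodes NULL
    }
    dtypes = {}
    for field in schema_fields:
        if field.get("mode", "NULLABLE").upper() == "REPEATED":
            continue  # array columns stay as Python objects
        dtype = type_map.get(field.get("type", "").upper())
        if dtype:
            dtypes[field["name"]] = dtype
    return dtypes


# Example with a fabricated schema, for illustration only:
print(bqschema_to_nullsafe_dtypes_sketch(
    [{"name": "n", "type": "INTEGER"}, {"name": "x", "type": "FLOAT"}]
))  # -> {'n': 'Int64', 'x': 'float64'}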
def run_query(self, query, max_results=None, progress_bar_type=None, **kwargs):
    from concurrent.futures import TimeoutError
    from google.auth.exceptions import RefreshError
    from google.cloud import bigquery  # needed for QueryJobConfig below

    job_config = {
        "query": {
            "useLegacySql": self.dialect == "legacy"
            # 'allowLargeResults', 'createDisposition',
            # 'preserveNulls', destinationTable, useQueryCache
        }
    }
    config = kwargs.get("configuration")
    if config is not None:
        job_config.update(config)

        if "query" in config and "query" in config["query"]:
            if query is not None:
                raise ValueError(
                    "Query statement can't be specified "
                    "inside config while it is specified "
                    "as parameter"
                )
            query = config["query"].pop("query")

    self._start_timer()

    try:
        logger.debug("Requesting query... ")
        query_reply = self.client.query(
            query,
            job_config=bigquery.QueryJobConfig.from_api_repr(job_config),
            location=self.location,
            project=self.project_id,
        )
        logger.debug("Query running...")
    except (RefreshError, ValueError):
        if self.private_key:
            raise AccessDenied("The service account credentials are not valid")
        else:
            raise AccessDenied(
                "The credentials have been revoked or expired, "
                "please re-run the application to re-authorize"
            )
    except self.http_error as ex:
        self.process_http_error(ex)

    job_id = query_reply.job_id
    logger.debug("Job ID: %s" % job_id)

    while query_reply.state != "DONE":
        self.log_elapsed_seconds(" Elapsed", "s. Waiting...")

        timeout_ms = job_config.get("jobTimeoutMs") or job_config["query"].get(
            "timeoutMs"
        )
        timeout_ms = int(timeout_ms) if timeout_ms else None
        if timeout_ms and timeout_ms < self.get_elapsed_seconds() * 1000:
            raise QueryTimeout("Query timeout: {} ms".format(timeout_ms))

        timeout_sec = 1.0
        if timeout_ms:
            # Wait at most 1 second so we can show progress bar
            timeout_sec = min(1.0, timeout_ms / 1000.0)

        try:
            query_reply.result(timeout=timeout_sec)
        except TimeoutError:
            # Use our own timeout logic
            pass
        except self.http_error as ex:
            self.process_http_error(ex)

    if query_reply.cache_hit:
        logger.debug("Query done.\nCache hit.\n")
    else:
        bytes_processed = query_reply.total_bytes_processed or 0
        bytes_billed = query_reply.total_bytes_billed or 0
        logger.debug(
            "Query done.\nProcessed: {} Billed: {}".format(
                self.sizeof_fmt(bytes_processed),
                self.sizeof_fmt(bytes_billed),
            )
        )
        logger.debug(
            "Standard price: ${:,.2f} USD\n".format(
                bytes_billed * self.query_price_for_TB
            )
        )

    return self._download_results(
        query_reply,
        max_results=max_results,
        progress_bar_type=progress_bar_type,
    )
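# The variant above hands the finished job to self._download_results, whose
# body is not included in this snippet. A rough, hypothetical sketch of such
# a helper, assuming the same RowIterator/to_dataframe plumbing the first
# variant uses directly; the real implementation may differ.
def download_results_sketch(query_job, max_results=None, progress_bar_type=None):
    rows_iter = query_job.result(max_results=max_results)
    # RowIterator.to_dataframe() accepts a progress_bar_type such as "tqdm"
    # when the tqdm package is installed.
    return rows_iter.to_dataframe(progress_bar_type=progress_bar_type)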
def run_query(self, query, **kwargs):
    from concurrent.futures import TimeoutError
    from google.auth.exceptions import RefreshError
    from google.cloud import bigquery

    job_config = {
        'query': {
            'useLegacySql': self.dialect == 'legacy'
            # 'allowLargeResults', 'createDisposition',
            # 'preserveNulls', destinationTable, useQueryCache
        }
    }
    config = kwargs.get('configuration')
    if config is not None:
        job_config.update(config)

        if 'query' in config and 'query' in config['query']:
            if query is not None:
                raise ValueError("Query statement can't be specified "
                                 "inside config while it is specified "
                                 "as parameter")
            query = config['query'].pop('query')

    self._start_timer()

    try:
        logger.info('Requesting query... ')
        query_reply = self.client.query(
            query,
            job_config=bigquery.QueryJobConfig.from_api_repr(job_config),
            location=self.location)
        logger.info('ok.\nQuery running...')
    except (RefreshError, ValueError):
        if self.private_key:
            raise AccessDenied(
                "The service account credentials are not valid")
        else:
            raise AccessDenied(
                "The credentials have been revoked or expired, "
                "please re-run the application to re-authorize")
    except self.http_error as ex:
        self.process_http_error(ex)

    job_id = query_reply.job_id
    logger.info('Job ID: %s\nQuery running...' % job_id)

    while query_reply.state != 'DONE':
        self.log_elapsed_seconds(' Elapsed', 's. Waiting...')

        timeout_ms = job_config['query'].get('timeoutMs')
        if timeout_ms and timeout_ms < self.get_elapsed_seconds() * 1000:
            raise QueryTimeout('Query timeout: {} ms'.format(timeout_ms))

        timeout_sec = 1.0
        if timeout_ms:
            # Wait at most 1 second so we can show progress bar
            timeout_sec = min(1.0, timeout_ms / 1000.0)

        try:
            query_reply.result(timeout=timeout_sec)
        except TimeoutError:
            # Use our own timeout logic
            pass
        except self.http_error as ex:
            self.process_http_error(ex)

    if query_reply.cache_hit:
        logger.debug('Query done.\nCache hit.\n')
    else:
        bytes_processed = query_reply.total_bytes_processed or 0
        bytes_billed = query_reply.total_bytes_billed or 0
        logger.debug('Query done.\nProcessed: {} Billed: {}'.format(
            self.sizeof_fmt(bytes_processed),
            self.sizeof_fmt(bytes_billed)))
        logger.debug('Standard price: ${:,.2f} USD\n'.format(
            bytes_billed * self.query_price_for_TB))

    try:
        rows_iter = query_reply.result()
    except self.http_error as ex:
        self.process_http_error(ex)
    result_rows = list(rows_iter)
    total_rows = rows_iter.total_rows
    schema = {
        'fields': [field.to_api_repr() for field in rows_iter.schema],
    }

    # log basic query stats
    logger.info('Got {} rows.\n'.format(total_rows))

    return schema, result_rows
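# Unlike the later variants, this older version returns the raw schema dict
# and a list of rows, leaving DataFrame assembly to the caller. A minimal
# sketch of that step; the schema and rows below are fabricated stand-ins
# for what run_query would return (real rows are bigquery Row objects).
import pandas

schema = {'fields': [{'name': 'name', 'type': 'STRING'},
                     {'name': 'total', 'type': 'INTEGER'}]}
result_rows = [('alice', 3), ('bob', 5)]

columns = [field['name'] for field in schema['fields']]
df = pandas.DataFrame(data=result_rows, columns=columns)
print(df)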
def run_query(self, query, max_results=None, progress_bar_type=None, **kwargs):
    from google.auth.exceptions import RefreshError
    from google.cloud import bigquery
    import pandas

    job_config = {
        "query": {
            "useLegacySql": self.dialect == "legacy"
            # 'allowLargeResults', 'createDisposition',
            # 'preserveNulls', destinationTable, useQueryCache
        }
    }
    config = kwargs.get("configuration")
    if config is not None:
        job_config.update(config)

    self._start_timer()

    try:
        logger.debug("Requesting query... ")
        query_reply = self.client.query(
            query,
            job_config=bigquery.QueryJobConfig.from_api_repr(job_config),
            location=self.location,
            project=self.project_id,
        )
        logger.debug("Query running...")
    except (RefreshError, ValueError):
        if self.private_key:
            raise AccessDenied("The service account credentials are not valid")
        else:
            raise AccessDenied(
                "The credentials have been revoked or expired, "
                "please re-run the application to re-authorize"
            )
    except self.http_error as ex:
        self.process_http_error(ex)

    job_id = query_reply.job_id
    logger.debug("Job ID: %s" % job_id)

    timeout_ms = job_config.get("jobTimeoutMs") or job_config["query"].get(
        "timeoutMs"
    )
    timeout_ms = int(timeout_ms) if timeout_ms else None
    self._wait_for_query_job(query_reply, timeout_ms)

    if query_reply.cache_hit:
        logger.debug("Query done.\nCache hit.\n")
    else:
        bytes_processed = query_reply.total_bytes_processed or 0
        bytes_billed = query_reply.total_bytes_billed or 0
        logger.debug(
            "Query done.\nProcessed: {} Billed: {}".format(
                self.sizeof_fmt(bytes_processed),
                self.sizeof_fmt(bytes_billed),
            )
        )
        logger.debug(
            "Standard price: ${:,.2f} USD\n".format(
                bytes_billed * self.query_price_for_TB
            )
        )

    dtypes = kwargs.get("dtypes")

    # Ensure destination is populated.
    try:
        query_reply.result()
    except self.http_error as ex:
        self.process_http_error(ex)

    # Avoid attempting to download results from DML queries, which have no
    # destination.
    if query_reply.destination is None:
        return pandas.DataFrame()

    rows_iter = self.client.list_rows(
        query_reply.destination, max_results=max_results
    )
    return self._download_results(
        rows_iter,
        max_results=max_results,
        progress_bar_type=progress_bar_type,
        user_dtypes=dtypes,
    )
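# Standalone sketch of the download pattern this last variant relies on:
# wait for the job, then page rows from its destination table (DML jobs
# have no destination). Written directly against google-cloud-bigquery;
# assumes application-default credentials, and the SQL is illustrative.
import pandas
from google.cloud import bigquery

client = bigquery.Client()
query_job = client.query("SELECT 1 AS x, 'a' AS y")
query_job.result()  # block until done so the destination is populated

if query_job.destination is None:
    df = pandas.DataFrame()  # e.g. a DML statement: nothing to download
else:
    rows_iter = client.list_rows(query_job.destination, max_results=10)
    df = rows_iter.to_dataframe()
print(df)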