Esempio n. 1
0
    def run_query(self, query, **kwargs):
        """Run ``query`` as a BigQuery job and return its rows as a DataFrame.

        Parameters
        ----------
        query : str or None
            SQL statement to execute. Must be ``None`` when the statement is
            supplied through ``configuration["query"]["query"]`` instead.
        **kwargs
            Only ``configuration`` is read: a dict in the job-configuration
            API representation. Its top-level keys replace the defaults
            wholesale, so a ``"query"`` section given here replaces the
            default one (including ``useLegacySql``). The caller's dict is
            not modified.

        Returns
        -------
        DataFrame
            Built by ``to_dataframe`` on the job's row iterator; empty frames
            get their dtypes cast from the schema and the frame is passed
            through ``_localize_df`` so TIMESTAMP columns are tz-aware.

        Raises
        ------
        ValueError
            If the statement is given both as ``query`` and inside
            ``configuration``.
        AccessDenied
            If submitting the job fails with ``RefreshError`` or
            ``ValueError``.
        QueryTimeout
            If ``configuration["query"]["timeoutMs"]`` elapses before the job
            reaches the ``DONE`` state.
        """
        from concurrent.futures import TimeoutError
        from google.auth.exceptions import RefreshError
        from google.cloud import bigquery

        job_config = {
            "query": {
                "useLegacySql": self.dialect
                == "legacy"
                # 'allowLargeResults', 'createDisposition',
                # 'preserveNulls', destinationTable, useQueryCache
            }
        }
        config = kwargs.get("configuration")
        if config is not None:
            job_config.update(config)

            if "query" in config and "query" in config["query"]:
                if query is not None:
                    raise ValueError(
                        "Query statement can't be specified "
                        "inside config while it is specified "
                        "as parameter"
                    )
                # Extract the statement from a copy of the nested section so
                # the caller's configuration dict is left untouched.
                query_section = dict(config["query"])
                query = query_section.pop("query")
                job_config["query"] = query_section

        self._start_timer()

        try:
            logger.debug("Requesting query... ")
            query_reply = self.client.query(
                query,
                job_config=bigquery.QueryJobConfig.from_api_repr(job_config),
                location=self.location,
                project=self.project_id,
            )
            logger.debug("Query running...")
        except (RefreshError, ValueError):
            if self.private_key:
                raise AccessDenied(
                    "The service account credentials are not valid"
                )
            else:
                raise AccessDenied(
                    "The credentials have been revoked or expired, "
                    "please re-run the application to re-authorize"
                )
        except self.http_error as ex:
            # NOTE(review): presumably process_http_error always raises;
            # otherwise query_reply is unbound below -- confirm.
            self.process_http_error(ex)

        job_id = query_reply.job_id
        logger.debug("Job ID: %s", job_id)

        # The timeout does not change while polling, so resolve it once.
        # It is coerced to int because a string value would make the
        # comparison with the elapsed time fail.
        timeout_ms = job_config["query"].get("timeoutMs")
        timeout_ms = int(timeout_ms) if timeout_ms else None

        # Wait at most 1 second per poll so we can show progress.
        timeout_sec = 1.0
        if timeout_ms:
            timeout_sec = min(1.0, timeout_ms / 1000.0)

        while query_reply.state != "DONE":
            self.log_elapsed_seconds("  Elapsed", "s. Waiting...")

            if timeout_ms and timeout_ms < self.get_elapsed_seconds() * 1000:
                raise QueryTimeout("Query timeout: {} ms".format(timeout_ms))

            try:
                query_reply.result(timeout=timeout_sec)
            except TimeoutError:
                # Use our own timeout logic
                pass
            except self.http_error as ex:
                self.process_http_error(ex)

        if query_reply.cache_hit:
            logger.debug("Query done.\nCache hit.\n")
        else:
            bytes_processed = query_reply.total_bytes_processed or 0
            bytes_billed = query_reply.total_bytes_billed or 0
            logger.debug(
                "Query done.\nProcessed: {} Billed: {}".format(
                    self.sizeof_fmt(bytes_processed),
                    self.sizeof_fmt(bytes_billed),
                )
            )
            logger.debug(
                "Standard price: ${:,.2f} USD\n".format(
                    bytes_billed * self.query_price_for_TB
                )
            )

        try:
            rows_iter = query_reply.result()
        except self.http_error as ex:
            self.process_http_error(ex)

        schema_fields = [field.to_api_repr() for field in rows_iter.schema]
        nullsafe_dtypes = _bqschema_to_nullsafe_dtypes(schema_fields)
        df = rows_iter.to_dataframe(
            dtypes=nullsafe_dtypes, bqstorage_client=self.bqstorage_client
        )

        if df.empty:
            df = _cast_empty_df_dtypes(schema_fields, df)

        # Ensure any TIMESTAMP columns are tz-aware.
        df = _localize_df(schema_fields, df)

        logger.debug("Got {} rows.\n".format(rows_iter.total_rows))
        return df
Esempio n. 2
0
    def run_query(self,
                  query,
                  max_results=None,
                  progress_bar_type=None,
                  **kwargs):
        """Run ``query`` as a BigQuery job and download its results.

        Parameters
        ----------
        query : str or None
            SQL statement to execute. Must be ``None`` when the statement is
            supplied through ``configuration["query"]["query"]`` instead.
        max_results : optional
            Forwarded unchanged to ``self._download_results``.
        progress_bar_type : optional
            Forwarded unchanged to ``self._download_results``.
        **kwargs
            Only ``configuration`` is read: a dict in the job-configuration
            API representation. Its top-level keys replace the defaults
            wholesale, so a ``"query"`` section given here replaces the
            default one (including ``useLegacySql``). The caller's dict is
            not modified.

        Returns
        -------
        Whatever ``self._download_results`` returns for the finished job.

        Raises
        ------
        ValueError
            If the statement is given both as ``query`` and inside
            ``configuration``.
        AccessDenied
            If submitting the job fails with ``RefreshError`` or
            ``ValueError``.
        QueryTimeout
            If ``jobTimeoutMs`` (or, failing that, the query-level
            ``timeoutMs``) elapses before the job reaches ``DONE``.
        """
        from concurrent.futures import TimeoutError
        from google.auth.exceptions import RefreshError

        # NOTE(review): `bigquery` is used below but not imported in this
        # method; presumably it is a module-level import -- confirm.

        job_config = {
            "query": {
                "useLegacySql": self.dialect == "legacy"
                # 'allowLargeResults', 'createDisposition',
                # 'preserveNulls', destinationTable, useQueryCache
            }
        }
        config = kwargs.get("configuration")
        if config is not None:
            job_config.update(config)

            if "query" in config and "query" in config["query"]:
                if query is not None:
                    raise ValueError("Query statement can't be specified "
                                     "inside config while it is specified "
                                     "as parameter")
                # Extract the statement from a copy of the nested section so
                # the caller's configuration dict is left untouched.
                query_section = dict(config["query"])
                query = query_section.pop("query")
                job_config["query"] = query_section

        self._start_timer()

        try:
            logger.debug("Requesting query... ")
            query_reply = self.client.query(
                query,
                job_config=bigquery.QueryJobConfig.from_api_repr(job_config),
                location=self.location,
                project=self.project_id,
            )
            logger.debug("Query running...")
        except (RefreshError, ValueError):
            if self.private_key:
                raise AccessDenied(
                    "The service account credentials are not valid")
            else:
                raise AccessDenied(
                    "The credentials have been revoked or expired, "
                    "please re-run the application to re-authorize")
        except self.http_error as ex:
            # NOTE(review): presumably process_http_error always raises;
            # otherwise query_reply is unbound below -- confirm.
            self.process_http_error(ex)

        job_id = query_reply.job_id
        logger.debug("Job ID: %s", job_id)

        # The timeout does not change while polling, so resolve and coerce
        # it once instead of on every loop iteration. The job-level value
        # takes precedence over the query-level one.
        timeout_ms = job_config.get(
            "jobTimeoutMs") or job_config["query"].get("timeoutMs")
        timeout_ms = int(timeout_ms) if timeout_ms else None

        # Wait at most 1 second per poll so we can show progress bar
        timeout_sec = 1.0
        if timeout_ms:
            timeout_sec = min(1.0, timeout_ms / 1000.0)

        while query_reply.state != "DONE":
            self.log_elapsed_seconds("  Elapsed", "s. Waiting...")

            if timeout_ms and timeout_ms < self.get_elapsed_seconds() * 1000:
                raise QueryTimeout("Query timeout: {} ms".format(timeout_ms))

            try:
                query_reply.result(timeout=timeout_sec)
            except TimeoutError:
                # Use our own timeout logic
                pass
            except self.http_error as ex:
                self.process_http_error(ex)

        if query_reply.cache_hit:
            logger.debug("Query done.\nCache hit.\n")
        else:
            bytes_processed = query_reply.total_bytes_processed or 0
            bytes_billed = query_reply.total_bytes_billed or 0
            logger.debug("Query done.\nProcessed: {} Billed: {}".format(
                self.sizeof_fmt(bytes_processed),
                self.sizeof_fmt(bytes_billed),
            ))
            logger.debug("Standard price: ${:,.2f} USD\n".format(
                bytes_billed * self.query_price_for_TB))

        return self._download_results(
            query_reply,
            max_results=max_results,
            progress_bar_type=progress_bar_type,
        )
Esempio n. 3
0
    def run_query(self, query, **kwargs):
        """Run ``query`` as a BigQuery job and return its schema and rows.

        Parameters
        ----------
        query : str or None
            SQL statement to execute. Must be ``None`` when the statement is
            supplied through ``configuration['query']['query']`` instead.
        **kwargs
            Only ``configuration`` is read: a dict in the job-configuration
            API representation. Its top-level keys replace the defaults
            wholesale, so a ``'query'`` section given here replaces the
            default one (including ``useLegacySql``). The caller's dict is
            not modified.

        Returns
        -------
        tuple
            ``(schema, result_rows)`` where ``schema`` is
            ``{'fields': [...]}`` holding the API representation of each
            result field and ``result_rows`` is the list of rows produced by
            the job's row iterator.

        Raises
        ------
        ValueError
            If the statement is given both as ``query`` and inside
            ``configuration``.
        AccessDenied
            If submitting the job fails with ``RefreshError`` or
            ``ValueError``.
        QueryTimeout
            If ``configuration['query']['timeoutMs']`` elapses before the job
            reaches the ``DONE`` state.
        """
        from concurrent.futures import TimeoutError
        from google.auth.exceptions import RefreshError
        from google.cloud import bigquery

        job_config = {
            'query': {
                'useLegacySql': self.dialect == 'legacy'
                # 'allowLargeResults', 'createDisposition',
                # 'preserveNulls', destinationTable, useQueryCache
            }
        }
        config = kwargs.get('configuration')
        if config is not None:
            job_config.update(config)

            if 'query' in config and 'query' in config['query']:
                if query is not None:
                    raise ValueError("Query statement can't be specified "
                                     "inside config while it is specified "
                                     "as parameter")
                # Extract the statement from a copy of the nested section so
                # the caller's configuration dict is left untouched.
                query_section = dict(config['query'])
                query = query_section.pop('query')
                job_config['query'] = query_section

        self._start_timer()

        try:
            logger.info('Requesting query... ')
            query_reply = self.client.query(
                query,
                job_config=bigquery.QueryJobConfig.from_api_repr(job_config),
                location=self.location)
            logger.info('ok.\nQuery running...')
        except (RefreshError, ValueError):
            if self.private_key:
                raise AccessDenied(
                    "The service account credentials are not valid")
            else:
                raise AccessDenied(
                    "The credentials have been revoked or expired, "
                    "please re-run the application to re-authorize")
        except self.http_error as ex:
            # NOTE(review): presumably process_http_error always raises;
            # otherwise query_reply is unbound below -- confirm.
            self.process_http_error(ex)

        job_id = query_reply.job_id
        logger.info('Job ID: %s\nQuery running...', job_id)

        # The timeout does not change while polling, so resolve it once.
        # It is coerced to int because a string value would make the
        # comparison with the elapsed time fail.
        timeout_ms = job_config['query'].get('timeoutMs')
        timeout_ms = int(timeout_ms) if timeout_ms else None

        # Wait at most 1 second per poll so we can show progress bar
        timeout_sec = 1.0
        if timeout_ms:
            timeout_sec = min(1.0, timeout_ms / 1000.0)

        while query_reply.state != 'DONE':
            self.log_elapsed_seconds('  Elapsed', 's. Waiting...')

            if timeout_ms and timeout_ms < self.get_elapsed_seconds() * 1000:
                raise QueryTimeout('Query timeout: {} ms'.format(timeout_ms))

            try:
                query_reply.result(timeout=timeout_sec)
            except TimeoutError:
                # Use our own timeout logic
                pass
            except self.http_error as ex:
                self.process_http_error(ex)

        if query_reply.cache_hit:
            logger.debug('Query done.\nCache hit.\n')
        else:
            bytes_processed = query_reply.total_bytes_processed or 0
            bytes_billed = query_reply.total_bytes_billed or 0
            logger.debug('Query done.\nProcessed: {} Billed: {}'.format(
                self.sizeof_fmt(bytes_processed),
                self.sizeof_fmt(bytes_billed)))
            logger.debug('Standard price: ${:,.2f} USD\n'.format(
                bytes_billed * self.query_price_for_TB))

        try:
            rows_iter = query_reply.result()
        except self.http_error as ex:
            self.process_http_error(ex)
        result_rows = list(rows_iter)
        total_rows = rows_iter.total_rows
        schema = {
            'fields': [
                field.to_api_repr()
                for field in rows_iter.schema],
        }

        # log basic query stats
        logger.info('Got {} rows.\n'.format(total_rows))

        return schema, result_rows
Esempio n. 4
0
    def run_query(self,
                  query,
                  max_results=None,
                  progress_bar_type=None,
                  **kwargs):
        """Submit ``query`` as a BigQuery job, wait for it, and fetch its rows.

        Parameters
        ----------
        query : str
            SQL statement handed to ``self.client.query``.
        max_results : optional
            Passed both to ``self.client.list_rows`` and to
            ``self._download_results``.
        progress_bar_type : optional
            Passed to ``self._download_results``.
        **kwargs
            ``configuration`` (a job-configuration dict whose top-level keys
            replace the defaults) and ``dtypes`` (forwarded as
            ``user_dtypes``) are read; anything else is ignored.

        Returns
        -------
        An empty ``pandas.DataFrame`` when the finished job has no
        destination; otherwise the result of ``self._download_results``.

        Raises
        ------
        AccessDenied
            If submitting the job fails with ``RefreshError`` or
            ``ValueError``.
        """
        from google.auth.exceptions import RefreshError
        from google.cloud import bigquery
        import pandas

        # Other query options a caller may supply via ``configuration``:
        # 'allowLargeResults', 'createDisposition', 'preserveNulls',
        # destinationTable, useQueryCache
        job_config = {"query": {"useLegacySql": self.dialect == "legacy"}}
        overrides = kwargs.get("configuration")
        if overrides is not None:
            job_config.update(overrides)

        self._start_timer()

        try:
            logger.debug("Requesting query... ")
            api_config = bigquery.QueryJobConfig.from_api_repr(job_config)
            query_reply = self.client.query(
                query,
                job_config=api_config,
                location=self.location,
                project=self.project_id,
            )
            logger.debug("Query running...")
        except (RefreshError, ValueError):
            # Pick the message by credential kind, then raise once.
            if self.private_key:
                reason = "The service account credentials are not valid"
            else:
                reason = ("The credentials have been revoked or expired, "
                          "please re-run the application to re-authorize")
            raise AccessDenied(reason)
        except self.http_error as ex:
            self.process_http_error(ex)

        logger.debug("Job ID: %s" % query_reply.job_id)

        # Job-level timeout wins; fall back to the query-level one.
        raw_timeout = job_config.get("jobTimeoutMs")
        if not raw_timeout:
            raw_timeout = job_config["query"].get("timeoutMs")
        if raw_timeout:
            timeout_ms = int(raw_timeout)
        else:
            timeout_ms = None
        self._wait_for_query_job(query_reply, timeout_ms)

        if not query_reply.cache_hit:
            processed = query_reply.total_bytes_processed or 0
            billed = query_reply.total_bytes_billed or 0
            size_report = "Query done.\nProcessed: {} Billed: {}".format(
                self.sizeof_fmt(processed),
                self.sizeof_fmt(billed),
            )
            logger.debug(size_report)
            price_report = "Standard price: ${:,.2f} USD\n".format(
                billed * self.query_price_for_TB)
            logger.debug(price_report)
        else:
            logger.debug("Query done.\nCache hit.\n")

        # Ensure destination is populated.
        try:
            query_reply.result()
        except self.http_error as ex:
            self.process_http_error(ex)

        # DML queries have no destination, so there is nothing to download.
        destination = query_reply.destination
        if destination is None:
            return pandas.DataFrame()

        rows_iter = self.client.list_rows(destination,
                                          max_results=max_results)
        return self._download_results(
            rows_iter,
            max_results=max_results,
            progress_bar_type=progress_bar_type,
            user_dtypes=kwargs.get("dtypes"),
        )