Example #1
def _bq_cell_magic(line, query):
    from IPython.core import magic_arguments
    from google.cloud.bigquery.magics import _cell_magic
    import time

    start = time.time()
    args = magic_arguments.parse_argstring(_cell_magic, line)

    if args.params is not None:
        try:
            from google.cloud.bigquery.dbapi import _helpers
            import ast

            params = _helpers.to_query_parameters(
                ast.literal_eval("".join(args.params)))
            query_params = dict()
            for p in params:
                query_params[p.name] = p.value
            query = query.format(**query_params)
        except Exception:
            raise SyntaxError(
                "--params is not a correctly formatted JSON string or a JSON "
                "serializable dictionary")
    result = _cell_magic(line, query)
    print(f"BigQuery execution took {int(time.time() - start)} seconds.")
    return result
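
A minimal usage sketch for the timing wrapper above, assuming an interactive IPython session (the magic name "bigquery" is an assumption; any name works with register_magic_function):

from IPython import get_ipython

# Hypothetical registration: route the %%bigquery cell magic through the
# timing wrapper defined above so every run reports its elapsed time.
ip = get_ipython()
ip.register_magic_function(_bq_cell_magic, magic_kind="cell", magic_name="bigquery")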
Example #2
    def execute(self, operation, parameters=None, job_id=None):
        """Prepare and execute a database operation.

        .. note::
            When setting query parameters, values which are "text"
            (``unicode`` in Python2, ``str`` in Python3) will use
            the 'STRING' BigQuery type. Values which are "bytes" (``str`` in
            Python2, ``bytes`` in Python3) will use the 'BYTES' type.

            A `~datetime.datetime` parameter without timezone information uses
            the 'DATETIME' BigQuery type (example: Global Pi Day Celebration
            March 14, 2017 at 1:59pm). A `~datetime.datetime` parameter with
            timezone information uses the 'TIMESTAMP' BigQuery type (example:
            a wedding on April 29, 2011 at 11am, British Summer Time).

            For more information about BigQuery data types, see:
            https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types

            ``STRUCT``/``RECORD`` and ``REPEATED`` query parameters are not
            yet supported. See:
            https://github.com/GoogleCloudPlatform/google-cloud-python/issues/3524

        :type operation: str
        :param operation: A Google BigQuery query string.

        :type parameters: Mapping[str, Any] or Sequence[Any]
        :param parameters:
            (Optional) dictionary or sequence of parameter values.

        :type job_id: str
        :param job_id: (Optional) The job_id to use. If not set, a job ID
            is generated at random.
        """
        self._query_data = None
        self._query_job = None
        client = self.connection._client

        # The DB-API uses the pyformat formatting, since the way BigQuery does
        # query parameters was not one of the standard options. Convert both
        # the query and the parameters to the format expected by the client
        # libraries.
        formatted_operation = _format_operation(operation, parameters=parameters)
        query_parameters = _helpers.to_query_parameters(parameters)

        config = job.QueryJobConfig()
        config.query_parameters = query_parameters
        config.use_legacy_sql = False
        self._query_job = client.query(
            formatted_operation, job_config=config, job_id=job_id
        )

        # Wait for the query to finish.
        try:
            self._query_job.result()
        except google.cloud.exceptions.GoogleCloudError as exc:
            raise exceptions.DatabaseError(exc)

        query_results = self._query_job._query_results
        self._set_rowcount(query_results)
        self._set_description(query_results.schema)
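
For context, a hedged usage sketch of this DB-API surface (credentials come from the environment; the public dataset and query are illustrative). A plain str parameter maps to the STRING type, as the docstring above explains:

from google.cloud import bigquery
from google.cloud.bigquery import dbapi

client = bigquery.Client()
conn = dbapi.connect(client)
cursor = conn.cursor()

# pyformat-style named placeholders, per the comment in execute() above.
cursor.execute(
    "SELECT word, word_count "
    "FROM `bigquery-public-data.samples.shakespeare` "
    "WHERE corpus = %(corpus)s "
    "ORDER BY word_count DESC LIMIT 5",
    {"corpus": "hamlet"},
)
print(cursor.fetchall())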
Example #3
    def execute(self, operation, parameters=None, job_id=None):
        """Prepare and execute a database operation.

        .. note::
            When setting query parameters, values which are "text"
            (``unicode`` in Python2, ``str`` in Python3) will use
            the 'STRING' BigQuery type. Values which are "bytes" (``str`` in
            Python2, ``bytes`` in Python3) will use the 'BYTES' type.

            A `~datetime.datetime` parameter without timezone information uses
            the 'DATETIME' BigQuery type (example: Global Pi Day Celebration
            March 14, 2017 at 1:59pm). A `~datetime.datetime` parameter with
            timezone information uses the 'TIMESTAMP' BigQuery type (example:
            a wedding on April 29, 2011 at 11am, British Summer Time).

            For more information about BigQuery data types, see:
            https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types

            ``STRUCT``/``RECORD`` and ``REPEATED`` query parameters are not
            yet supported. See:
            https://github.com/GoogleCloudPlatform/google-cloud-python/issues/3524

        :type operation: str
        :param operation: A Google BigQuery query string.

        :type parameters: Mapping[str, Any] or Sequence[Any]
        :param parameters:
            (Optional) dictionary or sequence of parameter values.

        :type job_id: str
        :param job_id: (Optional) The job_id to use. If not set, a job ID
            is generated at random.
        """
        self._query_data = None
        self._query_job = None
        client = self.connection._client

        # The DB-API uses the pyformat formatting, since the way BigQuery does
        # query parameters was not one of the standard options. Convert both
        # the query and the parameters to the format expected by the client
        # libraries.
        formatted_operation = _format_operation(
            operation, parameters=parameters)
        query_parameters = _helpers.to_query_parameters(parameters)

        config = job.QueryJobConfig()
        config.query_parameters = query_parameters
        config.use_legacy_sql = False
        self._query_job = client.query(
            formatted_operation, job_config=config, job_id=job_id)

        # Wait for the query to finish.
        try:
            self._query_job.result()
        except google.cloud.exceptions.GoogleCloudError as exc:
            raise exceptions.DatabaseError(exc)

        query_results = self._query_job._query_results
        self._set_rowcount(query_results)
        self._set_description(query_results.schema)
Example #4
def _cell_magic(line, query=None):
  args = magic_arguments.parse_argstring(_cell_magic, line)

  args_dict = vars(args)

  active_unsupported_args = []
  for unsupported_arg in UNSUPPORTED_ARGS:
    if args_dict[unsupported_arg] is not None and args_dict[
        unsupported_arg] is not False:
      active_unsupported_args.append(unsupported_arg)

  if active_unsupported_args:
    active_unsupported_args.sort()
    print(f"Unsupported args: {', '.join(active_unsupported_args)}")

  params = None

  if args.maximum_bytes_billed is None:
    maximum_bytes_billed = None
  elif args.maximum_bytes_billed == 'None':
    maximum_bytes_billed = 0
  else:
    maximum_bytes_billed = int(args.maximum_bytes_billed)

  # test params syntax
  if args.params is not None:
    try:
      params = ast.literal_eval("".join(args.params))
      _helpers.to_query_parameters(params)
    except Exception:
      raise SyntaxError("--params is not a correctly formatted\
             JSON string or a JSON "
                        "serializable dictionary")

  query_flags = {
      'destination_table': args.destination_table,
      'project': args.project,
      'maximum_bytes_billed': maximum_bytes_billed,
      'params': params,
      'use_legacy_sql': args.use_legacy_sql,
  }

  e = QueryIncellEditor()
  e.query = query.strip('\n') if isinstance(query, str) else ''
  e.query_flags = json.dumps(query_flags)
  return e
Example #5
def _cell_magic(line, query):
    """Underlying function for bigquery cell magic

    Note:
        This function contains the underlying logic for the 'bigquery' cell
        magic. This function is not meant to be called directly.

    Args:
        line (str): "%%bigquery" followed by arguments as required
        query (str): SQL query to run

    Returns:
        pandas.DataFrame: the query results.
    """
    args = magic_arguments.parse_argstring(_cell_magic, line)

    params = []
    if args.params is not None:
        try:
            params = _helpers.to_query_parameters(
                ast.literal_eval("".join(args.params)))
        except Exception:
            raise SyntaxError(
                "--params is not a correctly formatted JSON string or a JSON "
                "serializable dictionary")

    project = args.project or context.project
    client = bigquery.Client(
        project=project,
        credentials=context.credentials,
        default_query_job_config=context.default_query_job_config,
        client_info=client_info.ClientInfo(
            user_agent="ipython-{}".format(IPython.__version__)),
    )
    if context._connection:
        client._connection = context._connection
    bqstorage_client = _make_bqstorage_client(
        args.use_bqstorage_api or context.use_bqstorage_api,
        context.credentials)
    job_config = bigquery.job.QueryJobConfig()
    job_config.query_parameters = params
    job_config.use_legacy_sql = args.use_legacy_sql

    if args.maximum_bytes_billed == "None":
        job_config.maximum_bytes_billed = 0
    elif args.maximum_bytes_billed is not None:
        value = int(args.maximum_bytes_billed)
        job_config.maximum_bytes_billed = value
    query_job = _run_query(client, query, job_config)

    if not args.verbose:
        display.clear_output()

    result = query_job.to_dataframe(bqstorage_client=bqstorage_client)
    if args.destination_var:
        IPython.get_ipython().push({args.destination_var: result})
    else:
        return result
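
To see what the --params branch produces, here is a standalone sketch of the same round trip, no BigQuery call involved (the option text is illustrative): the string is parsed with ast.literal_eval and handed to to_query_parameters, which infers a BigQuery type per value:

import ast
from google.cloud.bigquery.dbapi import _helpers

raw = '{"corpus": "hamlet", "min_count": 10}'  # what --params receives
params = _helpers.to_query_parameters(ast.literal_eval(raw))
for p in params:
    # Each entry is a ScalarQueryParameter with an inferred type.
    print(p.name, p.type_, p.value)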
Example #6
def test_to_query_parameters_complex_types(parameters, parameter_types, expect):
    from google.cloud.bigquery.dbapi._helpers import to_query_parameters

    result = [
        p.to_api_repr()
        for p in to_query_parameters(parameters, parameter_types)
    ]
    assert result == expect
Example #7
def test_to_query_parameters_struct_error():
    from google.cloud.bigquery.dbapi._helpers import to_query_parameters

    with pytest.raises(
            NotImplementedError,
            match=_expected_error_match(
                "STRUCT-like parameter values are not supported, "
                "unless an explicit type is give in the parameter placeholder "
                "(e.g. '%(:struct<...>)s')."),
    ):
        to_query_parameters([dict(x=1)], [None])

    with pytest.raises(
            NotImplementedError,
            match=_expected_error_match(
                "STRUCT-like parameter values are not supported (parameter foo), "
                "unless an explicit type is give in the parameter placeholder "
                "(e.g. '%(foo:struct<...>)s')."),
    ):
        to_query_parameters(dict(foo=dict(x=1)), {})
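
The error text points at the workaround; a hedged sketch, assuming a client version whose DB-API accepts explicit complex types in placeholders, and reusing the cursor from the DB-API sketch after Example #2:

# Illustrative only: naming the struct type inside the pyformat placeholder
# lets the DB-API build a STRUCT parameter instead of raising.
cursor.execute(
    "SELECT %(point:STRUCT<x INT64, y INT64>)s AS point",
    {"point": {"x": 1, "y": 2}},
)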
Example #8
def _cell_magic(line, query=None):
    args = magic_arguments.parse_argstring(_cell_magic, line)

    params = None

    if args.maximum_bytes_billed is None:
        maximum_bytes_billed = None
    elif args.maximum_bytes_billed == 'None':
        maximum_bytes_billed = 0
    else:
        maximum_bytes_billed = int(args.maximum_bytes_billed)

    # test params syntax
    if args.params is not None:
        try:
            params = ast.literal_eval("".join(args.params))
            _helpers.to_query_parameters(params)
        except Exception:
            raise SyntaxError(
                "--params is not a correctly formatted JSON string or a JSON "
                "serializable dictionary")

    query_flags = {
        'destination_table': args.destination_table,
        'project': args.project,
        'maximum_bytes_billed': maximum_bytes_billed,
        'params': params,
        'use_legacy_sql': args.use_legacy_sql,
        'destination_var': args.destination_var,
    }

    e = QueryIncellEditor()
    e.query = query if isinstance(query, str) else ''
    e.query_flags = json.dumps(query_flags)
    if args.destination_var:
        e.observe(partial(handler, args.destination_var, e))
    return e
Example #9
def _cell_magic(line, query):
    """Underlying function for bigquery cell magic

    Note:
        This function contains the underlying logic for the 'bigquery' cell
        magic. This function is not meant to be called directly.

    Args:
        line (str): "%%bigquery" followed by arguments as required
        query (str): SQL query to run

    Returns:
        pandas.DataFrame: the query results.
    """
    args = magic_arguments.parse_argstring(_cell_magic, line)

    params = []
    if args.params is not None:
        try:
            params = _helpers.to_query_parameters(
                ast.literal_eval("".join(args.params))
            )
        except Exception:
            raise SyntaxError(
                "--params is not a correctly formatted JSON string or a JSON "
                "serializable dictionary"
            )

    project = args.project or context.project
    client = bigquery.Client(project=project, credentials=context.credentials)
    bqstorage_client = _make_bqstorage_client(
        args.use_bqstorage_api or context.use_bqstorage_api, context.credentials
    )
    job_config = bigquery.job.QueryJobConfig()
    job_config.query_parameters = params
    job_config.use_legacy_sql = args.use_legacy_sql
    query_job = _run_query(client, query, job_config)

    if not args.verbose:
        display.clear_output()

    result = query_job.to_dataframe(bqstorage_client=bqstorage_client)
    if args.destination_var:
        IPython.get_ipython().push({args.destination_var: result})
    else:
        return result
Example #10
    def _execute(self, formatted_operation, parameters, job_id, job_config,
                 parameter_types):
        self._query_data = None
        self._query_job = None
        client = self.connection._client

        # The DB-API uses the pyformat formatting, since the way BigQuery does
        # query parameters was not one of the standard options. Convert both
        # the query and the parameters to the format expected by the client
        # libraries.
        query_parameters = _helpers.to_query_parameters(
            parameters, parameter_types)

        if client._default_query_job_config:
            if job_config:
                config = job_config._fill_from_default(
                    client._default_query_job_config)
            else:
                config = copy.deepcopy(client._default_query_job_config)
        else:
            config = job_config or job.QueryJobConfig(use_legacy_sql=False)

        config.query_parameters = query_parameters
        self._query_job = client.query(formatted_operation,
                                       job_config=config,
                                       job_id=job_id)

        if self._query_job.dry_run:
            self._set_description(schema=None)
            self.rowcount = 0
            return

        # Wait for the query to finish.
        try:
            self._query_job.result()
        except google.cloud.exceptions.GoogleCloudError as exc:
            raise exceptions.DatabaseError(exc)

        query_results = self._query_job._query_results
        self._set_rowcount(query_results)
        self._set_description(query_results.schema)
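
The dry-run branch above never waits on results; a minimal sketch of the same check against the public client API (query text illustrative):

from google.cloud import bigquery

client = bigquery.Client()
config = bigquery.QueryJobConfig(dry_run=True, use_query_cache=False)
job = client.query("SELECT 1", job_config=config)
# A dry-run job completes immediately and reports the bytes it would scan.
print(job.total_bytes_processed)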
Example #11
def _cell_magic(line, query):
    """Underlying function for bigquery cell magic

    Note:
        This function contains the underlying logic for the 'bigquery' cell
        magic. This function is not meant to be called directly.

    Args:
        line (str): "%%bigquery" followed by arguments as required
        query (str): SQL query to run

    Returns:
        pandas.DataFrame: the query results.
    """
    args = magic_arguments.parse_argstring(_cell_magic, line)

    params = []
    if args.params is not None:
        try:
            params = _helpers.to_query_parameters(
                ast.literal_eval("".join(args.params)))
        except Exception:
            raise SyntaxError(
                "--params is not a correctly formatted JSON string or a JSON "
                "serializable dictionary")

    project = args.project or context.project
    client = bigquery.Client(project=project, credentials=context.credentials)
    job_config = bigquery.job.QueryJobConfig()
    job_config.query_parameters = params
    job_config.use_legacy_sql = args.use_legacy_sql
    query_job = _run_query(client, query, job_config)

    if not args.verbose:
        display.clear_output()

    result = query_job.to_dataframe()
    if args.destination_var:
        IPython.get_ipython().push({args.destination_var: result})
    return result
Example #12
    def query(self, request_body, page_size):
        query = request_body['query']
        jobConfig = request_body['jobConfig']
        dryRunOnly = request_body['dryRunOnly']

        # process flags
        processed_flags = {
            support_flag: jobConfig[support_flag]
            for support_flag in SUPPORTED_JOB_CONFIG_FLAGS
            if support_flag in jobConfig
        }

        if 'params' in processed_flags:
            processed_flags['query_parameters'] = _helpers.to_query_parameters(
                processed_flags['params'])

        if ('maximum_bytes_billed' in processed_flags
                and processed_flags['maximum_bytes_billed'] is None):
            del processed_flags['maximum_bytes_billed']

        if ('use_legacy_sql' in processed_flags
                and not isinstance(processed_flags['use_legacy_sql'], bool)):
            raise ValueError(
                'use_legacy_sql should be a boolean, instead received {}'.format(
                    processed_flags['use_legacy_sql']))
        if 'destination_table' in processed_flags:
            processed_flags['destination'] = processed_flags[
                'destination_table']
            del processed_flags['destination_table']

        # dry run, will throw exception if fail
        dry_run_job_config = bigquery.QueryJobConfig(**processed_flags)
        dry_run_job_config.dry_run = True
        dry_run_job_config.use_query_cache = False

        try:
            with PagedQueryHandler.client_lock:
                if 'project' in jobConfig and jobConfig['project'] is not None:
                    PagedQueryHandler.client.project = jobConfig['project']
                else:
                    PagedQueryHandler.client.project = PagedQueryHandler.orig_project
                dry_run_job = PagedQueryHandler.client.query(
                    query, job_config=dry_run_job_config)
                PagedQueryHandler.client.project = PagedQueryHandler.orig_project
        except Exception as err:
            if hasattr(err, 'errors'):
                raise Exception(err.errors[0]['message'])
            else:
                raise Exception(err)
        total_bytes_processed = dry_run_job.total_bytes_processed

        if dryRunOnly:
            job_id = 'dry_run' if dry_run_job.job_id is None else dry_run_job.job_id
            yield dry_run_job, job_id
            yield {
                'content': json.dumps(None),
                'labels': json.dumps(None),
                'bytesProcessed': json.dumps(total_bytes_processed)
            }
            return

        # actual run
        job_config = bigquery.QueryJobConfig(**processed_flags)

        # need synchronization since all query handler share the same client
        with PagedQueryHandler.client_lock:
            if 'project' in jobConfig and jobConfig['project'] is not None:
                PagedQueryHandler.client.project = jobConfig['project']
            else:
                PagedQueryHandler.client.project = PagedQueryHandler.orig_project
            query_job = PagedQueryHandler.client.query(query,
                                                       job_config=job_config)
            PagedQueryHandler.client.project = PagedQueryHandler.orig_project

        if query_job.error_result is not None:
            raise Exception(query_job.error_result)

        yield query_job, query_job.job_id

        # send contents
        en = query_job.result(page_size)
        schema_fields = format_preview_fields(en.schema)
        duration = (query_job.ended - query_job.started).total_seconds()

        for page in en.pages:
            if page.num_items > USE_PARALLEL_THRESH:
                content = parallel_format_preview_rows(page,
                                                       en.schema,
                                                       pool=self.pool)
            else:
                content = format_preview_rows(page, en.schema)

            response = {
                'content': json.dumps(content),
                'labels': schema_fields,
                'bytesProcessed': total_bytes_processed,
                'project': query_job.project,
                'duration': duration,
            }
            yield response
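
The page loop relies on the standard RowIterator; a reduced sketch of the same pattern without the handler machinery (query text and page size illustrative):

from google.cloud import bigquery

client = bigquery.Client()
job = client.query(
    "SELECT word FROM `bigquery-public-data.samples.shakespeare`")
iterator = job.result(page_size=1000)
for page in iterator.pages:  # the same .pages traversal used above
    for row in page:
        print(row)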
Example #13
def _cell_magic(line, query):
    """Underlying function for bigquery cell magic

    Note:
        This function contains the underlying logic for the 'bigquery' cell
        magic. This function is not meant to be called directly.

    Args:
        line (str): "%%bigquery" followed by arguments as required
        query (str): SQL query to run

    Returns:
        pandas.DataFrame: the query results.
    """
    args = magic_arguments.parse_argstring(_cell_magic, line)

    params = []
    if args.params is not None:
        try:
            params = _helpers.to_query_parameters(
                ast.literal_eval("".join(args.params)))
        except Exception:
            raise SyntaxError(
                "--params is not a correctly formatted JSON string or a JSON "
                "serializable dictionary")

    project = args.project or context.project
    client = bigquery.Client(
        project=project,
        credentials=context.credentials,
        default_query_job_config=context.default_query_job_config,
        client_info=client_info.ClientInfo(user_agent=IPYTHON_USER_AGENT),
    )
    if context._connection:
        client._connection = context._connection
    bqstorage_client = _make_bqstorage_client(
        args.use_bqstorage_api or context.use_bqstorage_api,
        context.credentials)

    if args.max_results:
        max_results = int(args.max_results)
    else:
        max_results = None

    query = query.strip()

    # Any query that does not contain whitespace (aside from leading and trailing whitespace)
    # is assumed to be a table id
    if not re.search(r"\s", query):
        try:
            rows = client.list_rows(query, max_results=max_results)
        except Exception as ex:
            _print_error(str(ex), args.destination_var)
            return

        result = rows.to_dataframe(bqstorage_client=bqstorage_client)
        if args.destination_var:
            IPython.get_ipython().push({args.destination_var: result})
            return
        else:
            return result

    job_config = bigquery.job.QueryJobConfig()
    job_config.query_parameters = params
    job_config.use_legacy_sql = args.use_legacy_sql
    job_config.dry_run = args.dry_run

    if args.maximum_bytes_billed == "None":
        job_config.maximum_bytes_billed = 0
    elif args.maximum_bytes_billed is not None:
        value = int(args.maximum_bytes_billed)
        job_config.maximum_bytes_billed = value

    try:
        query_job = _run_query(client, query, job_config=job_config)
    except Exception as ex:
        _print_error(str(ex), args.destination_var)
        return

    if not args.verbose:
        display.clear_output()

    if args.dry_run and args.destination_var:
        IPython.get_ipython().push({args.destination_var: query_job})
        return
    elif args.dry_run:
        print("Query validated. This query will process {} bytes.".format(
            query_job.total_bytes_processed))
        return query_job

    if max_results:
        result = query_job.result(max_results=max_results).to_dataframe(
            bqstorage_client=bqstorage_client)
    else:
        result = query_job.to_dataframe(bqstorage_client=bqstorage_client)

    if args.destination_var:
        IPython.get_ipython().push({args.destination_var: result})
    else:
        return result
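
A short sketch of the whitespace-free "table id" shortcut in isolation (the table is a public sample; max_results is illustrative), assuming a client version whose list_rows accepts a string table id:

import re
from google.cloud import bigquery

client = bigquery.Client()
cell_body = "bigquery-public-data.samples.shakespeare"

if not re.search(r"\s", cell_body):
    # No whitespace: treat the cell as a table id and page rows directly.
    rows = client.list_rows(cell_body, max_results=5)
    print(rows.to_dataframe())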
Example #14
    def execute(self, operation, parameters=None):
        """Prepare and execute a database operation.

        .. note::
            When setting query parameters, values which are "text"
            (``unicode`` in Python2, ``str`` in Python3) will use
            the 'STRING' BigQuery type. Values which are "bytes" (``str`` in
            Python2, ``bytes`` in Python3) will use the 'BYTES' type.

            A `~datetime.datetime` parameter without timezone information uses
            the 'DATETIME' BigQuery type (example: Global Pi Day Celebration
            March 14, 2017 at 1:59pm). A `~datetime.datetime` parameter with
            timezone information uses the 'TIMESTAMP' BigQuery type (example:
            a wedding on April 29, 2011 at 11am, British Summer Time).

            For more information about BigQuery data types, see:
            https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types

            ``STRUCT``/``RECORD`` and ``REPEATED`` query parameters are not
            yet supported. See:
            https://github.com/GoogleCloudPlatform/google-cloud-python/issues/3524

        :type operation: str
        :param operation: A Google BigQuery query string.

        :type parameters: Mapping[str, Any] or Sequence[Any]
        :param parameters:
            (Optional) dictionary or sequence of parameter values.
        """
        self._query_results = None
        self._page_token = None
        self._has_fetched_all_rows = False
        client = self.connection._client
        job_id = str(uuid.uuid4())

        # The DB-API uses the pyformat formatting, since the way BigQuery does
        # query parameters was not one of the standard options. Convert both
        # the query and the parameters to the format expected by the client
        # libraries.
        formatted_operation = _format_operation(operation,
                                                parameters=parameters)
        query_parameters = _helpers.to_query_parameters(parameters)

        query_job = client.run_async_query(job_id,
                                           formatted_operation,
                                           query_parameters=query_parameters)
        query_job.use_legacy_sql = False
        query_job.begin()
        _helpers.wait_for_job(query_job)
        query_results = query_job.results()

        # Force the iterator to run because the query_results doesn't
        # have the total_rows populated. See:
        # https://github.com/GoogleCloudPlatform/google-cloud-python/issues/3506
        query_iterator = query_results.fetch_data()
        try:
            six.next(iter(query_iterator))
        except StopIteration:
            pass

        self._query_data = iter(
            query_results.fetch_data(max_results=self.arraysize))
        self._set_rowcount(query_results)
        self._set_description(query_results.schema)
Example #15
def _cell_magic(line, query):
    """Underlying function for bigquery cell magic

    Note:
        This function contains the underlying logic for the 'bigquery' cell
        magic. This function is not meant to be called directly.

    Args:
        line (str): "%%bigquery" followed by arguments as required
        query (str): SQL query to run

    Returns:
        pandas.DataFrame: the query results.
    """
    args = magic_arguments.parse_argstring(_cell_magic, line)

    params = []
    if args.params is not None:
        try:
            params = _helpers.to_query_parameters(
                ast.literal_eval("".join(args.params))
            )
        except Exception:
            raise SyntaxError(
                "--params is not a correctly formatted JSON string or a JSON "
                "serializable dictionary"
            )

    project = args.project or context.project
    client = bigquery.Client(
        project=project,
        credentials=context.credentials,
        default_query_job_config=context.default_query_job_config,
        client_info=client_info.ClientInfo(user_agent=IPYTHON_USER_AGENT),
    )
    if context._connection:
        client._connection = context._connection
    bqstorage_client = _make_bqstorage_client(
        args.use_bqstorage_api or context.use_bqstorage_api, context.credentials
    )
    job_config = bigquery.job.QueryJobConfig()
    job_config.query_parameters = params
    job_config.use_legacy_sql = args.use_legacy_sql
    job_config.dry_run = args.dry_run

    if args.maximum_bytes_billed == "None":
        job_config.maximum_bytes_billed = 0
    elif args.maximum_bytes_billed is not None:
        value = int(args.maximum_bytes_billed)
        job_config.maximum_bytes_billed = value

    error = None
    try:
        query_job = _run_query(client, query, job_config)
    except Exception as ex:
        error = str(ex)

    if not args.verbose:
        display.clear_output()

    if error:
        if args.destination_var:
            print(
                "Could not save output to variable '{}'.".format(args.destination_var),
                file=sys.stderr,
            )
        print("\nERROR:\n", error, file=sys.stderr)
        return

    if args.dry_run and args.destination_var:
        IPython.get_ipython().push({args.destination_var: query_job})
        return
    elif args.dry_run:
        print(
            "Query validated. This query will process {} bytes.".format(
                query_job.total_bytes_processed
            )
        )
        return query_job

    result = query_job.to_dataframe(bqstorage_client=bqstorage_client)
    if args.destination_var:
        IPython.get_ipython().push({args.destination_var: result})
    else:
        return result
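
The maximum_bytes_billed handling is easy to misread; a standalone sketch of the mapping the magic applies (the helper name is hypothetical):

def _parse_maximum_bytes_billed(raw):
    # Hypothetical helper mirroring the branches above: option absent ->
    # keep the job default; the literal string "None" -> 0; else an int cap.
    if raw is None:
        return None
    if raw == "None":
        return 0
    return int(raw)

assert _parse_maximum_bytes_billed(None) is None
assert _parse_maximum_bytes_billed("None") == 0
assert _parse_maximum_bytes_billed("10000000") == 10000000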
Example #16
def _cell_magic(line, query):
    """Underlying function for bigquery cell magic

    Note:
        This function contains the underlying logic for the 'bigquery' cell
        magic. This function is not meant to be called directly.

    Args:
        line (str): "%%bigquery" followed by arguments as required
        query (str): SQL query to run

    Returns:
        pandas.DataFrame: the query results.
    """
    args = magic_arguments.parse_argstring(_cell_magic, line)

    if args.use_bqstorage_api is not None:
        warnings.warn(
            "Deprecated option --use_bqstorage_api, the BigQuery "
            "Storage API is already used by default.",
            category=DeprecationWarning,
        )
    use_bqstorage_api = not args.use_rest_api

    params = []
    if args.params is not None:
        try:
            params = _helpers.to_query_parameters(
                ast.literal_eval("".join(args.params)))
        except Exception:
            raise SyntaxError(
                "--params is not a correctly formatted JSON string or a JSON "
                "serializable dictionary")

    project = args.project or context.project
    client = bigquery.Client(
        project=project,
        credentials=context.credentials,
        default_query_job_config=context.default_query_job_config,
        client_info=client_info.ClientInfo(user_agent=IPYTHON_USER_AGENT),
    )
    if context._connection:
        client._connection = context._connection
    bqstorage_client = _make_bqstorage_client(use_bqstorage_api,
                                              context.credentials)

    close_transports = functools.partial(_close_transports, client,
                                         bqstorage_client)

    try:
        if args.max_results:
            max_results = int(args.max_results)
        else:
            max_results = None

        query = query.strip()

        if not query:
            error = ValueError("Query is missing.")
            _handle_error(error, args.destination_var)
            return

        # Any query that does not contain whitespace (aside from leading and trailing whitespace)
        # is assumed to be a table id
        if not re.search(r"\s", query):
            try:
                rows = client.list_rows(query, max_results=max_results)
            except Exception as ex:
                _handle_error(ex, args.destination_var)
                return

            result = rows.to_dataframe(bqstorage_client=bqstorage_client)
            if args.destination_var:
                IPython.get_ipython().push({args.destination_var: result})
                return
            else:
                return result

        job_config = bigquery.job.QueryJobConfig()
        job_config.query_parameters = params
        job_config.use_legacy_sql = args.use_legacy_sql
        job_config.dry_run = args.dry_run

        if args.destination_table:
            split = args.destination_table.split(".")
            if len(split) != 2:
                raise ValueError(
                    "--destination_table should be in a <dataset_id>.<table_id> format."
                )
            dataset_id, table_id = split
            job_config.allow_large_results = True
            dataset_ref = bigquery.dataset.DatasetReference(
                client.project, dataset_id)
            destination_table_ref = dataset_ref.table(table_id)
            job_config.destination = destination_table_ref
            job_config.create_disposition = "CREATE_IF_NEEDED"
            job_config.write_disposition = "WRITE_TRUNCATE"
            _create_dataset_if_necessary(client, dataset_id)

        if args.maximum_bytes_billed == "None":
            job_config.maximum_bytes_billed = 0
        elif args.maximum_bytes_billed is not None:
            value = int(args.maximum_bytes_billed)
            job_config.maximum_bytes_billed = value

        try:
            query_job = _run_query(client, query, job_config=job_config)
        except Exception as ex:
            _handle_error(ex, args.destination_var)
            return

        if not args.verbose:
            display.clear_output()

        if args.dry_run and args.destination_var:
            IPython.get_ipython().push({args.destination_var: query_job})
            return
        elif args.dry_run:
            print("Query validated. This query will process {} bytes.".format(
                query_job.total_bytes_processed))
            return query_job

        if max_results:
            result = query_job.result(max_results=max_results).to_dataframe(
                bqstorage_client=bqstorage_client)
        else:
            result = query_job.to_dataframe(bqstorage_client=bqstorage_client)

        if args.destination_var:
            IPython.get_ipython().push({args.destination_var: result})
        else:
            return result
    finally:
        close_transports()
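
The --destination_table branch composes a few public pieces; a hedged sketch of the same wiring (dataset and table names are illustrative):

from google.cloud import bigquery

client = bigquery.Client()
dataset_id, table_id = "my_dataset", "my_table"  # illustrative

job_config = bigquery.job.QueryJobConfig()
job_config.allow_large_results = True
job_config.destination = bigquery.dataset.DatasetReference(
    client.project, dataset_id).table(table_id)
job_config.create_disposition = "CREATE_IF_NEEDED"
job_config.write_disposition = "WRITE_TRUNCATE"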
Example #17
def _cell_magic(line, query):
    """Underlying function for bigquery cell magic

    Note:
        This function contains the underlying logic for the 'bigquery' cell
        magic. This function is not meant to be called directly.

    Args:
        line (str): "%%bigquery" followed by arguments as required
        query (str): SQL query to run

    Returns:
        pandas.DataFrame: the query results.
    """
    # The built-in parser does not recognize Python structures such as dicts,
    # so we extract the "--params" option and interpret it separately.
    try:
        params_option_value, rest_of_args = _split_args_line(line)
    except lap.exceptions.QueryParamsParseError as exc:
        rebranded_error = SyntaxError(
            "--params is not a correctly formatted JSON string or a JSON "
            "serializable dictionary")
        six.raise_from(rebranded_error, exc)
    except lap.exceptions.DuplicateQueryParamsError as exc:
        rebranded_error = ValueError("Duplicate --params option.")
        six.raise_from(rebranded_error, exc)
    except lap.exceptions.ParseError as exc:
        rebranded_error = ValueError(
            "Unrecognized input, are option values correct? "
            "Error details: {}".format(exc.args[0]))
        six.raise_from(rebranded_error, exc)

    args = magic_arguments.parse_argstring(_cell_magic, rest_of_args)

    if args.use_bqstorage_api is not None:
        warnings.warn(
            "Deprecated option --use_bqstorage_api, the BigQuery "
            "Storage API is already used by default.",
            category=DeprecationWarning,
        )
    use_bqstorage_api = not args.use_rest_api

    params = []
    if params_option_value:
        # A non-existing params variable is not expanded and ends up in the input
        # in its raw form, e.g. "$query_params".
        if params_option_value.startswith("$"):
            msg = 'Parameter expansion failed, undefined variable "{}".'.format(
                params_option_value[1:])
            raise NameError(msg)

        params = _helpers.to_query_parameters(
            ast.literal_eval(params_option_value))

    project = args.project or context.project

    bigquery_client_options = copy.deepcopy(context.bigquery_client_options)
    if args.bigquery_api_endpoint:
        if isinstance(bigquery_client_options, dict):
            bigquery_client_options[
                "api_endpoint"] = args.bigquery_api_endpoint
        else:
            bigquery_client_options.api_endpoint = args.bigquery_api_endpoint

    client = bigquery.Client(
        project=project,
        credentials=context.credentials,
        default_query_job_config=context.default_query_job_config,
        client_info=client_info.ClientInfo(user_agent=IPYTHON_USER_AGENT),
        client_options=bigquery_client_options,
    )
    if context._connection:
        client._connection = context._connection

    bqstorage_client_options = copy.deepcopy(context.bqstorage_client_options)
    if args.bqstorage_api_endpoint:
        if isinstance(bqstorage_client_options, dict):
            bqstorage_client_options[
                "api_endpoint"] = args.bqstorage_api_endpoint
        else:
            bqstorage_client_options.api_endpoint = args.bqstorage_api_endpoint

    bqstorage_client = _make_bqstorage_client(
        use_bqstorage_api,
        context.credentials,
        bqstorage_client_options,
    )

    close_transports = functools.partial(_close_transports, client,
                                         bqstorage_client)

    try:
        if args.max_results:
            max_results = int(args.max_results)
        else:
            max_results = None

        query = query.strip()

        if not query:
            error = ValueError("Query is missing.")
            _handle_error(error, args.destination_var)
            return

        # Any query that does not contain whitespace (aside from leading and trailing whitespace)
        # is assumed to be a table id
        if not re.search(r"\s", query):
            try:
                rows = client.list_rows(query, max_results=max_results)
            except Exception as ex:
                _handle_error(ex, args.destination_var)
                return

            result = rows.to_dataframe(bqstorage_client=bqstorage_client)
            if args.destination_var:
                IPython.get_ipython().push({args.destination_var: result})
                return
            else:
                return result

        job_config = bigquery.job.QueryJobConfig()
        job_config.query_parameters = params
        job_config.use_legacy_sql = args.use_legacy_sql
        job_config.dry_run = args.dry_run

        if args.destination_table:
            split = args.destination_table.split(".")
            if len(split) != 2:
                raise ValueError(
                    "--destination_table should be in a <dataset_id>.<table_id> format."
                )
            dataset_id, table_id = split
            job_config.allow_large_results = True
            dataset_ref = bigquery.dataset.DatasetReference(
                client.project, dataset_id)
            destination_table_ref = dataset_ref.table(table_id)
            job_config.destination = destination_table_ref
            job_config.create_disposition = "CREATE_IF_NEEDED"
            job_config.write_disposition = "WRITE_TRUNCATE"
            _create_dataset_if_necessary(client, dataset_id)

        if args.maximum_bytes_billed == "None":
            job_config.maximum_bytes_billed = 0
        elif args.maximum_bytes_billed is not None:
            value = int(args.maximum_bytes_billed)
            job_config.maximum_bytes_billed = value

        try:
            query_job = _run_query(client, query, job_config=job_config)
        except Exception as ex:
            _handle_error(ex, args.destination_var)
            return

        if not args.verbose:
            display.clear_output()

        if args.dry_run and args.destination_var:
            IPython.get_ipython().push({args.destination_var: query_job})
            return
        elif args.dry_run:
            print("Query validated. This query will process {} bytes.".format(
                query_job.total_bytes_processed))
            return query_job

        if max_results:
            result = query_job.result(max_results=max_results).to_dataframe(
                bqstorage_client=bqstorage_client)
        else:
            result = query_job.to_dataframe(bqstorage_client=bqstorage_client)

        if args.destination_var:
            IPython.get_ipython().push({args.destination_var: result})
        else:
            return result
    finally:
        close_transports()
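
The endpoint plumbing above accepts either a dict or a ClientOptions instance; a minimal sketch of the object form (the endpoint URL shown is the public default, used purely as an illustration):

from google.api_core.client_options import ClientOptions
from google.cloud import bigquery

options = ClientOptions(api_endpoint="https://bigquery.googleapis.com")
client = bigquery.Client(client_options=options)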