Code example #1
    def test_from_query_job(self):
        from google.cloud.bigquery.dataset import Dataset
        from google.cloud.bigquery.job import QueryJob
        from google.cloud.bigquery._helpers import UDFResource
        DS_NAME = 'DATASET'
        RESOURCE_URI = 'gs://some-bucket/js/lib.js'
        client = _Client(self.PROJECT)
        job = QueryJob(
            self.JOB_NAME,
            self.QUERY,
            client,
            udf_resources=[UDFResource("resourceUri", RESOURCE_URI)])
        dataset = job.default_dataset = Dataset(DS_NAME, client)
        job.use_query_cache = True
        job.use_legacy_sql = True
        klass = self._getTargetClass()

        query = klass.from_query_job(job)

        self.assertEqual(query.name, self.JOB_NAME)
        self.assertEqual(query.query, self.QUERY)
        self.assertIs(query._client, client)
        self.assertIs(query._job, job)
        self.assertEqual(query.udf_resources, job.udf_resources)
        self.assertIs(query.default_dataset, dataset)
        self.assertTrue(query.use_query_cache)
        self.assertTrue(query.use_legacy_sql)
Code example #3
def test_to_dataframe_with_progress_bar(tqdm_mock):
    from google.cloud.bigquery.job import QueryJob as target_class

    begun_resource = _make_job_resource(job_type="query")
    query_resource = {
        "jobComplete": True,
        "jobReference": begun_resource["jobReference"],
        "totalRows": "4",
        "schema": {
            "fields": [{
                "name": "name",
                "type": "STRING",
                "mode": "NULLABLE"
            }]
        },
    }
    done_resource = copy.deepcopy(begun_resource)
    done_resource["status"] = {"state": "DONE"}
    connection = _make_connection(
        begun_resource,
        query_resource,
        done_resource,
        query_resource,
        query_resource,
    )
    client = _make_client(connection=connection)
    job = target_class.from_api_repr(begun_resource, client)

    job.to_dataframe(progress_bar_type=None, create_bqstorage_client=False)
    tqdm_mock.assert_not_called()

    job.to_dataframe(progress_bar_type="tqdm", create_bqstorage_client=False)
    tqdm_mock.assert_called()
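
For context, a minimal sketch of requesting the same tqdm progress bar against a live client (the public table and the presence of the tqdm package are assumptions):

from google.cloud import bigquery

client = bigquery.Client()
job = client.query(
    "SELECT name FROM `bigquery-public-data.usa_names.usa_1910_2013` LIMIT 10"
)
# progress_bar_type="tqdm" requires the tqdm package; passing None disables the bar.
df = job.to_dataframe(progress_bar_type="tqdm", create_bqstorage_client=False)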
Code example #4
File: bq.py Project: colpal/dataEng-container-tools
    def send_to_gcs(self, query, project_id, output_uri, delimiter=","):
        job_results = {}

        client = self.bq_client

        queryJob = QueryJob(self.__create_job_id(project_id, "queryJob"),
                            query, client)
        job_results["queryJob"] = self.__get_results(queryJob)

        output_type = output_uri.split(".")[-1]
        dest_format = self.__get_file_type(output_type)

        if dest_format == SourceFormat.CSV:
            config = ExtractJobConfig(destination_format=dest_format,
                                      field_delimiter=delimiter)
        else:
            config = ExtractJobConfig(destination_format=dest_format)

        extractJob = ExtractJob(self.__create_job_id(project_id, "extractJob"),
                                queryJob.destination,
                                output_uri,
                                client,
                                job_config=config)

        job_results["extractJob"] = self.__get_results(extractJob)

        return job_results
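
A hypothetical call into the helper above; `bq_wrapper`, the table, bucket, and project id are all placeholders:

results = bq_wrapper.send_to_gcs(
    query="SELECT * FROM `my-project.my_dataset.my_table`",
    project_id="my-project",
    output_uri="gs://my-bucket/exports/result.csv",
    delimiter="|",
)
# results maps "queryJob" and "extractJob" to whatever __get_results returns for each job.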
Code example #5
    def run_async_query(self,
                        job_name,
                        query,
                        udf_resources=(),
                        query_parameters=()):
        """Construct a job for running a SQL query asynchronously.

        See
        https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs#configuration.query

        :type job_name: str
        :param job_name: Name of the job.

        :type query: str
        :param query: SQL query to be executed

        :type udf_resources: tuple
        :param udf_resources: An iterable of
                            :class:`google.cloud.bigquery._helpers.UDFResource`
                            (empty by default)

        :type query_parameters: tuple
        :param query_parameters:
            An iterable of
            :class:`google.cloud.bigquery._helpers.AbstractQueryParameter`
            (empty by default)

        :rtype: :class:`google.cloud.bigquery.job.QueryJob`
        :returns: a new ``QueryJob`` instance
        """
        return QueryJob(job_name,
                        query,
                        client=self,
                        udf_resources=udf_resources,
                        query_parameters=query_parameters)
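
A rough usage sketch for this legacy (pre-1.0 google-cloud-bigquery) method; the project, job id, and query are placeholders, and the exact polling helpers varied by release:

from google.cloud import bigquery

client = bigquery.Client(project="my-project")
job = client.run_async_query("my-job-2017-06-01", "SELECT COUNT(*) FROM `my_dataset.my_table`")
job.begin()    # submit the job; completion was then polled, e.g. via job.reload() and job.state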
Code example #6
def test_to_arrow_max_results_no_progress_bar():
    from google.cloud.bigquery import table
    from google.cloud.bigquery.job import QueryJob as target_class
    from google.cloud.bigquery.schema import SchemaField

    connection = _make_connection({})
    client = _make_client(connection=connection)
    begun_resource = _make_job_resource(job_type="query")
    job = target_class.from_api_repr(begun_resource, client)

    schema = [
        SchemaField("name", "STRING", mode="REQUIRED"),
        SchemaField("age", "INTEGER", mode="REQUIRED"),
    ]
    rows = [
        {"f": [{"v": "Bharney Rhubble"}, {"v": "33"}]},
        {"f": [{"v": "Wylma Phlyntstone"}, {"v": "29"}]},
    ]
    path = "/foo"
    api_request = mock.Mock(return_value={"rows": rows})
    row_iterator = table.RowIterator(client, api_request, path, schema)

    result_patch = mock.patch(
        "google.cloud.bigquery.job.QueryJob.result", return_value=row_iterator,
    )
    with result_patch as result_patch_tqdm:
        tbl = job.to_arrow(create_bqstorage_client=False, max_results=123)

    result_patch_tqdm.assert_called_once_with(max_results=123)

    assert isinstance(tbl, pyarrow.Table)
    assert tbl.num_rows == 2
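
Outside the test harness, the equivalent call on a real job looks roughly like this (table name assumed):

job = client.query("SELECT name, age FROM `my_dataset.people`")
tbl = job.to_arrow(max_results=123, create_bqstorage_client=False)
print(tbl.schema, tbl.num_rows)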
Code example #7
def test_to_dataframe_column_date_dtypes():
    from google.cloud.bigquery.job import QueryJob as target_class

    begun_resource = _make_job_resource(job_type="query")
    query_resource = {
        "jobComplete": True,
        "jobReference": begun_resource["jobReference"],
        "totalRows": "1",
        "schema": {
            "fields": [{
                "name": "date",
                "type": "DATE"
            }]
        },
    }
    row_data = [
        ["1999-12-01"],
    ]
    rows = [{"f": [{"v": field} for field in row]} for row in row_data]
    query_resource["rows"] = rows
    done_resource = copy.deepcopy(begun_resource)
    done_resource["status"] = {"state": "DONE"}
    connection = _make_connection(begun_resource, query_resource,
                                  done_resource, query_resource)
    client = _make_client(connection=connection)
    job = target_class.from_api_repr(begun_resource, client)
    df = job.to_dataframe(date_as_object=False, create_bqstorage_client=False)

    assert isinstance(df, pandas.DataFrame)
    assert len(df) == 1  # verify the number of rows
    exp_columns = [
        field["name"] for field in query_resource["schema"]["fields"]
    ]
    assert list(df) == exp_columns  # verify the column names
    assert df.date.dtype.name == "datetime64[ns]"
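
The same dtype behavior on a live query, as a brief sketch (the query text is an assumption):

df = client.query("SELECT DATE '1999-12-01' AS date").to_dataframe(
    date_as_object=False, create_bqstorage_client=False
)
# With date_as_object=False, DATE columns load as datetime64[ns] instead of Python date objects.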
Code example #8
def test_to_dataframe_column_dtypes():
    from google.cloud.bigquery.job import QueryJob as target_class

    begun_resource = _make_job_resource(job_type="query")
    query_resource = {
        "jobComplete": True,
        "jobReference": begun_resource["jobReference"],
        "totalRows": "4",
        "schema": {
            "fields": [
                {"name": "start_timestamp", "type": "TIMESTAMP"},
                {"name": "seconds", "type": "INT64"},
                {"name": "miles", "type": "FLOAT64"},
                {"name": "km", "type": "FLOAT64"},
                {"name": "payment_type", "type": "STRING"},
                {"name": "complete", "type": "BOOL"},
                {"name": "date", "type": "DATE"},
            ]
        },
    }
    row_data = [
        [
            "1433836800000000",
            "420",
            "1.1",
            "1.77",
            "Cto_dataframeash",
            "true",
            "1999-12-01",
        ],
        ["1387811700000000", "2580", "17.7", "28.5", "Cash", "false", "1953-06-14"],
        ["1385565300000000", "2280", "4.4", "7.1", "Credit", "true", "1981-11-04"],
    ]
    rows = [{"f": [{"v": field} for field in row]} for row in row_data]
    query_resource["rows"] = rows
    done_resource = copy.deepcopy(begun_resource)
    done_resource["status"] = {"state": "DONE"}
    connection = _make_connection(
        begun_resource, query_resource, done_resource, query_resource
    )
    client = _make_client(connection=connection)
    job = target_class.from_api_repr(begun_resource, client)

    df = job.to_dataframe(dtypes={"km": "float16"}, create_bqstorage_client=False)

    assert isinstance(df, pandas.DataFrame)
    assert len(df) == 3  # verify the number of rows
    exp_columns = [field["name"] for field in query_resource["schema"]["fields"]]
    assert list(df) == exp_columns  # verify the column names

    assert df.start_timestamp.dtype.name == "datetime64[ns, UTC]"
    assert df.seconds.dtype.name == "int64"
    assert df.miles.dtype.name == "float64"
    assert df.km.dtype.name == "float16"
    assert df.payment_type.dtype.name == "object"
    assert df.complete.dtype.name == "bool"
    assert df.date.dtype.name == "object"
Code example #9
def test_to_dataframe_w_tqdm():
    from google.cloud.bigquery import table
    from google.cloud.bigquery.job import QueryJob as target_class
    from google.cloud.bigquery.schema import SchemaField
    from google.cloud.bigquery._tqdm_helpers import _PROGRESS_BAR_UPDATE_INTERVAL

    begun_resource = _make_job_resource(job_type="query")
    schema = [
        SchemaField("name", "STRING", mode="NULLABLE"),
        SchemaField("age", "INTEGER", mode="NULLABLE"),
    ]
    rows = [
        {"f": [{"v": "Phred Phlyntstone"}, {"v": "32"}]},
        {"f": [{"v": "Bharney Rhubble"}, {"v": "33"}]},
        {"f": [{"v": "Wylma Phlyntstone"}, {"v": "29"}]},
        {"f": [{"v": "Bhettye Rhubble"}, {"v": "27"}]},
    ]

    connection = _make_connection({})
    client = _make_client(connection=connection)
    job = target_class.from_api_repr(begun_resource, client)

    path = "/foo"
    api_request = mock.Mock(return_value={"rows": rows})
    row_iterator = table.RowIterator(client, api_request, path, schema)

    job._properties["statistics"] = {
        "query": {
            "queryPlan": [
                {"name": "S00: Input", "id": "0", "status": "COMPLETE"},
                {"name": "S01: Output", "id": "1", "status": "COMPLETE"},
            ]
        },
    }
    reload_patch = mock.patch(
        "google.cloud.bigquery.job._AsyncJob.reload", autospec=True
    )
    result_patch = mock.patch(
        "google.cloud.bigquery.job.QueryJob.result",
        side_effect=[
            concurrent.futures.TimeoutError,
            concurrent.futures.TimeoutError,
            row_iterator,
        ],
    )

    with result_patch as result_patch_tqdm, reload_patch:
        df = job.to_dataframe(progress_bar_type="tqdm", create_bqstorage_client=False)

    assert result_patch_tqdm.call_count == 3
    assert isinstance(df, pandas.DataFrame)
    assert len(df) == 4  # verify the number of rows
    assert list(df) == ["name", "age"]  # verify the column names
    result_patch_tqdm.assert_called_with(
        timeout=_PROGRESS_BAR_UPDATE_INTERVAL, max_results=None
    )
Code example #10
    def job(self):
        """Job instance used to run the query.

        :rtype: :class:`google.cloud.bigquery.job.QueryJob`, or ``NoneType``
        :returns: Job instance used to run the query (None until
                  ``jobReference`` property is set by the server).
        """
        if self._job is None:
            job_ref = self._properties.get('jobReference')
            if job_ref is not None:
                self._job = QueryJob(job_ref['jobId'], self.query,
                                     self._client)
        return self._job
Code example #11
def test_to_dataframe_bqstorage_preserve_order(query, table_read_options_kwarg):
    from google.cloud.bigquery.job import QueryJob as target_class

    job_resource = _make_job_resource(
        project_id="test-project", job_type="query", ended=True
    )
    job_resource["configuration"]["query"]["query"] = query
    job_resource["status"] = {"state": "DONE"}
    get_query_results_resource = {
        "jobComplete": True,
        "jobReference": {"projectId": "test-project", "jobId": "test-job"},
        "schema": {
            "fields": [
                {"name": "name", "type": "STRING", "mode": "NULLABLE"},
                {"name": "age", "type": "INTEGER", "mode": "NULLABLE"},
            ]
        },
        "totalRows": "4",
    }
    connection = _make_connection(get_query_results_resource, job_resource)
    client = _make_client(connection=connection)
    job = target_class.from_api_repr(job_resource, client)
    bqstorage_client = mock.create_autospec(bigquery_storage.BigQueryReadClient)
    session = bigquery_storage.types.ReadSession()
    session.avro_schema.schema = json.dumps(
        {
            "type": "record",
            "name": "__root__",
            "fields": [
                {"name": "name", "type": ["null", "string"]},
                {"name": "age", "type": ["null", "long"]},
            ],
        }
    )
    bqstorage_client.create_read_session.return_value = session

    job.to_dataframe(bqstorage_client=bqstorage_client)

    destination_table = "projects/{projectId}/datasets/{datasetId}/tables/{tableId}".format(
        **job_resource["configuration"]["query"]["destinationTable"]
    )
    expected_session = bigquery_storage.ReadSession(
        table=destination_table,
        data_format=bigquery_storage.DataFormat.ARROW,
        **table_read_options_kwarg,
    )
    bqstorage_client.create_read_session.assert_called_once_with(
        parent="projects/test-project",
        read_session=expected_session,
        max_stream_count=1,  # Use a single stream to preserve row order.
    )
Code example #12
def test_to_arrow_w_tqdm_w_query_plan():
    from google.cloud.bigquery import table
    from google.cloud.bigquery.job import QueryJob as target_class
    from google.cloud.bigquery.schema import SchemaField
    from google.cloud.bigquery._tqdm_helpers import _PROGRESS_BAR_UPDATE_INTERVAL

    begun_resource = _make_job_resource(job_type="query")
    rows = [
        {"f": [{"v": "Bharney Rhubble"}, {"v": "33"}]},
        {"f": [{"v": "Wylma Phlyntstone"}, {"v": "29"}]},
    ]

    schema = [
        SchemaField("name", "STRING", mode="REQUIRED"),
        SchemaField("age", "INTEGER", mode="REQUIRED"),
    ]
    connection = _make_connection({})
    client = _make_client(connection=connection)
    job = target_class.from_api_repr(begun_resource, client)

    path = "/foo"
    api_request = mock.Mock(return_value={"rows": rows})
    row_iterator = table.RowIterator(client, api_request, path, schema)

    job._properties["statistics"] = {
        "query": {
            "queryPlan": [
                {"name": "S00: Input", "id": "0", "status": "COMPLETE"},
                {"name": "S01: Output", "id": "1", "status": "COMPLETE"},
            ]
        },
    }
    reload_patch = mock.patch(
        "google.cloud.bigquery.job._AsyncJob.reload", autospec=True
    )
    result_patch = mock.patch(
        "google.cloud.bigquery.job.QueryJob.result",
        side_effect=[
            concurrent.futures.TimeoutError,
            concurrent.futures.TimeoutError,
            row_iterator,
        ],
    )

    with result_patch as result_patch_tqdm, reload_patch:
        tbl = job.to_arrow(progress_bar_type="tqdm", create_bqstorage_client=False)

    assert result_patch_tqdm.call_count == 3
    assert isinstance(tbl, pyarrow.Table)
    assert tbl.num_rows == 2
    result_patch_tqdm.assert_called_with(timeout=_PROGRESS_BAR_UPDATE_INTERVAL)
Code example #13
def test_to_dataframe_bqstorage_no_pyarrow_compression():
    from google.cloud.bigquery.job import QueryJob as target_class

    resource = _make_job_resource(job_type="query", ended=True)
    query_resource = {
        "jobComplete": True,
        "jobReference": resource["jobReference"],
        "totalRows": "4",
        "schema": {
            "fields": [{
                "name": "name",
                "type": "STRING",
                "mode": "NULLABLE"
            }]
        },
    }
    connection = _make_connection(query_resource)
    client = _make_client(connection=connection)
    job = target_class.from_api_repr(resource, client)
    bqstorage_client = mock.create_autospec(
        bigquery_storage.BigQueryReadClient)
    session = bigquery_storage.types.ReadSession()
    session.avro_schema.schema = json.dumps(
        {
            "type": "record",
            "name": "__root__",
            "fields": [{"name": "name", "type": ["null", "string"]}],
        }
    )
    bqstorage_client.create_read_session.return_value = session

    with mock.patch(
            "google.cloud.bigquery._pandas_helpers._ARROW_COMPRESSION_SUPPORT",
            new=False):
        job.to_dataframe(bqstorage_client=bqstorage_client)

    destination_table = "projects/{projectId}/datasets/{datasetId}/tables/{tableId}".format(
        **resource["configuration"]["query"]["destinationTable"])
    expected_session = bigquery_storage.ReadSession(
        table=destination_table,
        data_format=bigquery_storage.DataFormat.ARROW,
    )
    bqstorage_client.create_read_session.assert_called_once_with(
        parent=f"projects/{client.project}",
        read_session=expected_session,
        max_stream_count=0,
    )
Code example #14
    def run_async_query(self, job_name, query):
        """Construct a job for running a SQL query asynchronously.

        See:
        https://cloud.google.com/bigquery/docs/reference/v2/jobs#configuration.query

        :type job_name: str
        :param job_name: Name of the job.

        :type query: str
        :param query: SQL query to be executed

        :rtype: :class:`google.cloud.bigquery.job.QueryJob`
        :returns: a new ``QueryJob`` instance
        """
        return QueryJob(job_name, query, client=self)
Code example #15
def test_to_dataframe_bqstorage(table_read_options_kwarg):
    from google.cloud.bigquery.job import QueryJob as target_class

    resource = _make_job_resource(job_type="query", ended=True)
    query_resource = {
        "jobComplete": True,
        "jobReference": resource["jobReference"],
        "totalRows": "4",
        "schema": {
            "fields": [
                {"name": "name", "type": "STRING", "mode": "NULLABLE"},
                {"name": "age", "type": "INTEGER", "mode": "NULLABLE"},
            ]
        },
    }
    connection = _make_connection(query_resource)
    client = _make_client(connection=connection)
    job = target_class.from_api_repr(resource, client)
    bqstorage_client = mock.create_autospec(bigquery_storage.BigQueryReadClient)
    session = bigquery_storage.types.ReadSession()
    session.avro_schema.schema = json.dumps(
        {
            "type": "record",
            "name": "__root__",
            "fields": [
                {"name": "name", "type": ["null", "string"]},
                {"name": "age", "type": ["null", "long"]},
            ],
        }
    )
    bqstorage_client.create_read_session.return_value = session

    job.to_dataframe(bqstorage_client=bqstorage_client)

    destination_table = "projects/{projectId}/datasets/{datasetId}/tables/{tableId}".format(
        **resource["configuration"]["query"]["destinationTable"]
    )
    expected_session = bigquery_storage.ReadSession(
        table=destination_table,
        data_format=bigquery_storage.DataFormat.ARROW,
        **table_read_options_kwarg,
    )
    bqstorage_client.create_read_session.assert_called_once_with(
        parent=f"projects/{client.project}",
        read_session=expected_session,
        max_stream_count=0,  # Use default number of streams for best performance.
    )
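
A sketch of supplying a real BigQuery Storage API client for faster downloads (table name assumed; requires the google-cloud-bigquery-storage package):

from google.cloud import bigquery_storage

bqstorage_client = bigquery_storage.BigQueryReadClient()
df = client.query("SELECT name, age FROM `my_dataset.people`").to_dataframe(
    bqstorage_client=bqstorage_client
)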
Code example #16
def test_to_dataframe_ddl_query():
    from google.cloud.bigquery.job import QueryJob as target_class

    # Destination table may have no schema for some DDL and DML queries.
    resource = _make_job_resource(job_type="query", ended=True)
    query_resource = {
        "jobComplete": True,
        "jobReference": resource["jobReference"],
        "schema": {"fields": []},
    }
    connection = _make_connection(query_resource)
    client = _make_client(connection=connection)
    job = target_class.from_api_repr(resource, client)

    df = job.to_dataframe()

    assert len(df) == 0
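
A hedged sketch of the same behavior against a live client; the DDL statement and dataset are assumptions:

ddl_job = client.query("CREATE TABLE `my_dataset.new_table` (name STRING)")
ddl_job.result()             # wait for the DDL statement to finish
df = ddl_job.to_dataframe()  # empty DataFrame, as in the test above
assert len(df) == 0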
Code example #17
    def test_from_query_job_wo_default_dataset(self):
        from google.cloud.bigquery.job import QueryJob
        from google.cloud.bigquery._helpers import UDFResource
        RESOURCE_URI = 'gs://some-bucket/js/lib.js'
        client = _Client(self.PROJECT)
        job = QueryJob(
            self.JOB_NAME, self.QUERY, client,
            udf_resources=[UDFResource("resourceUri", RESOURCE_URI)])
        klass = self._getTargetClass()

        query = klass.from_query_job(job)

        self.assertEqual(query.query, self.QUERY)
        self.assertTrue(query._client is client)
        self.assertTrue(query._job is job)
        self.assertEqual(query.udf_resources, job.udf_resources)
        self.assertIsNone(query.default_dataset)
        self.assertIsNone(query.use_query_cache)
        self.assertIsNone(query.use_legacy_sql)
Code example #18
def _make_job(schema=(), rows=()):
    from google.cloud.bigquery.job import QueryJob as target_class

    begun_resource = _make_job_resource(job_type="query")
    query_resource = {
        "jobComplete": True,
        "jobReference": begun_resource["jobReference"],
        "totalRows": str(len(rows)),
        "schema": {
            "fields": [
                dict(name=field[0], type=field[1], mode=field[2]) for field in schema
            ]
        },
    }
    tabledata_resource = {"rows": [{"f": [{"v": v} for v in row]} for row in rows]}
    done_resource = copy.deepcopy(begun_resource)
    done_resource["status"] = {"state": "DONE"}
    connection = _make_connection(
        begun_resource, query_resource, done_resource, tabledata_resource
    )
    client = _make_client(connection=connection)
    return target_class.from_api_repr(begun_resource, client)
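
A sketch of how this helper might be exercised in another test; the schema tuples and row values are illustrative:

job = _make_job(
    schema=[("name", "STRING", "NULLABLE"), ("age", "INTEGER", "NULLABLE")],
    rows=[["Phred Phlyntstone", "32"], ["Bharney Rhubble", "33"]],
)
df = job.to_dataframe(create_bqstorage_client=False)
assert list(df) == ["name", "age"]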
Code example #19
    def job_from_resource(self, resource):
        """Detect correct job type from resource and instantiate.

        :type resource: dict
        :param resource: one job resource from API response

        :rtype: One of:
                :class:`google.cloud.bigquery.job.LoadTableFromStorageJob`,
                :class:`google.cloud.bigquery.job.CopyJob`,
                :class:`google.cloud.bigquery.job.ExtractTableToStorageJob`,
                :class:`google.cloud.bigquery.job.QueryJob`,
                :class:`google.cloud.bigquery.job.RunSyncQueryJob`
        :returns: the job instance, constructed via the resource
        """
        config = resource['configuration']
        if 'load' in config:
            return LoadTableFromStorageJob.from_api_repr(resource, self)
        elif 'copy' in config:
            return CopyJob.from_api_repr(resource, self)
        elif 'extract' in config:
            return ExtractTableToStorageJob.from_api_repr(resource, self)
        elif 'query' in config:
            return QueryJob.from_api_repr(resource, self)
        raise ValueError('Cannot parse job resource')
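
A minimal sketch of the dispatch above; the resource shape is an assumption of the smallest payload that would route to QueryJob.from_api_repr, with placeholder values:

resource = {
    "jobReference": {"projectId": "my-project", "jobId": "job-123"},
    "configuration": {"query": {"query": "SELECT 1"}},
}
job = client.job_from_resource(resource)  # 'query' in configuration -> QueryJob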
Code example #21
def test_to_dataframe():
    from google.cloud.bigquery.job import QueryJob as target_class

    begun_resource = _make_job_resource(job_type="query")
    query_resource = {
        "jobComplete": True,
        "jobReference": begun_resource["jobReference"],
        "totalRows": "4",
        "schema": {
            "fields": [
                {"name": "name", "type": "STRING", "mode": "NULLABLE"},
                {"name": "age", "type": "INTEGER", "mode": "NULLABLE"},
            ]
        },
    }
    tabledata_resource = {
        "rows": [
            {"f": [{"v": "Phred Phlyntstone"}, {"v": "32"}]},
            {"f": [{"v": "Bharney Rhubble"}, {"v": "33"}]},
            {"f": [{"v": "Wylma Phlyntstone"}, {"v": "29"}]},
            {"f": [{"v": "Bhettye Rhubble"}, {"v": "27"}]},
        ]
    }
    done_resource = copy.deepcopy(begun_resource)
    done_resource["status"] = {"state": "DONE"}
    connection = _make_connection(
        begun_resource, query_resource, done_resource, tabledata_resource
    )
    client = _make_client(connection=connection)
    job = target_class.from_api_repr(begun_resource, client)

    df = job.to_dataframe(create_bqstorage_client=False)

    assert isinstance(df, pandas.DataFrame)
    assert len(df) == 4  # verify the number of rows
    assert list(df) == ["name", "age"]  # verify the column names
Code example #22
def parse_query_result(query_job: QueryJob, query_rows: RowIterator) -> Rows:
    rows = [[field.name for field in query_job.result().schema]]
    rows.extend([str(item) for item in row] for row in query_rows)
    return rows
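
A usage sketch for parse_query_result; the client and query are assumptions:

job = client.query("SELECT name, age FROM `my_dataset.people`")
rows = parse_query_result(job, job.result())
# rows[0] is the header built from the schema; the remaining rows are stringified cell values.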
Code example #23
def test_to_arrow(method_kwargs):
    from google.cloud.bigquery.job import QueryJob as target_class

    begun_resource = _make_job_resource(job_type="query")
    query_resource = {
        "jobComplete":
        True,
        "jobReference":
        begun_resource["jobReference"],
        "totalRows":
        "4",
        "schema": {
            "fields": [
                {
                    "name":
                    "spouse_1",
                    "type":
                    "RECORD",
                    "fields": [
                        {
                            "name": "name",
                            "type": "STRING",
                            "mode": "NULLABLE"
                        },
                        {
                            "name": "age",
                            "type": "INTEGER",
                            "mode": "NULLABLE"
                        },
                    ],
                },
                {
                    "name":
                    "spouse_2",
                    "type":
                    "RECORD",
                    "fields": [
                        {
                            "name": "name",
                            "type": "STRING",
                            "mode": "NULLABLE"
                        },
                        {
                            "name": "age",
                            "type": "INTEGER",
                            "mode": "NULLABLE"
                        },
                    ],
                },
            ]
        },
        "rows": [
            {
                "f": [
                    {
                        "v": {
                            "f": [{
                                "v": "Phred Phlyntstone"
                            }, {
                                "v": "32"
                            }]
                        }
                    },
                    {
                        "v": {
                            "f": [{
                                "v": "Wylma Phlyntstone"
                            }, {
                                "v": "29"
                            }]
                        }
                    },
                ]
            },
            {
                "f": [
                    {
                        "v": {
                            "f": [{
                                "v": "Bhettye Rhubble"
                            }, {
                                "v": "27"
                            }]
                        }
                    },
                    {
                        "v": {
                            "f": [{
                                "v": "Bharney Rhubble"
                            }, {
                                "v": "33"
                            }]
                        }
                    },
                ]
            },
        ],
    }
    done_resource = copy.deepcopy(begun_resource)
    done_resource["status"] = {"state": "DONE"}
    connection = _make_connection(begun_resource, query_resource,
                                  done_resource)
    client = _make_client(connection=connection)
    job = target_class.from_api_repr(begun_resource, client)

    tbl = job.to_arrow(**method_kwargs)

    assert isinstance(tbl, pyarrow.Table)
    assert tbl.num_rows == 2

    # Check the schema.
    assert tbl.schema[0].name == "spouse_1"
    assert tbl.schema[0].type[0].name == "name"
    assert tbl.schema[0].type[1].name == "age"
    assert pyarrow.types.is_struct(tbl.schema[0].type)
    assert pyarrow.types.is_string(tbl.schema[0].type[0].type)
    assert pyarrow.types.is_int64(tbl.schema[0].type[1].type)
    assert tbl.schema[1].name == "spouse_2"
    assert tbl.schema[1].type[0].name == "name"
    assert tbl.schema[1].type[1].name == "age"
    assert pyarrow.types.is_struct(tbl.schema[1].type)
    assert pyarrow.types.is_string(tbl.schema[1].type[0].type)
    assert pyarrow.types.is_int64(tbl.schema[1].type[1].type)

    # Check the data.
    tbl_data = tbl.to_pydict()
    spouse_1 = tbl_data["spouse_1"]
    assert spouse_1 == [
        {"name": "Phred Phlyntstone", "age": 32},
        {"name": "Bhettye Rhubble", "age": 27},
    ]
    spouse_2 = tbl_data["spouse_2"]
    assert spouse_2 == [
        {"name": "Wylma Phlyntstone", "age": 29},
        {"name": "Bharney Rhubble", "age": 33},
    ]