Example 1
def test_to_dataframe_bqstorage_preserve_order(query, table_read_options_kwarg):
    from google.cloud.bigquery.job import QueryJob as target_class

    job_resource = _make_job_resource(
        project_id="test-project", job_type="query", ended=True
    )
    job_resource["configuration"]["query"]["query"] = query
    job_resource["status"] = {"state": "DONE"}
    get_query_results_resource = {
        "jobComplete": True,
        "jobReference": {"projectId": "test-project", "jobId": "test-job"},
        "schema": {
            "fields": [
                {"name": "name", "type": "STRING", "mode": "NULLABLE"},
                {"name": "age", "type": "INTEGER", "mode": "NULLABLE"},
            ]
        },
        "totalRows": "4",
    }
    connection = _make_connection(get_query_results_resource, job_resource)
    client = _make_client(connection=connection)
    job = target_class.from_api_repr(job_resource, client)
    bqstorage_client = mock.create_autospec(bigquery_storage.BigQueryReadClient)
    session = bigquery_storage.types.ReadSession()
    session.avro_schema.schema = json.dumps(
        {
            "type": "record",
            "name": "__root__",
            "fields": [
                {"name": "name", "type": ["null", "string"]},
                {"name": "age", "type": ["null", "long"]},
            ],
        }
    )
    bqstorage_client.create_read_session.return_value = session

    job.to_dataframe(bqstorage_client=bqstorage_client)

    destination_table = "projects/{projectId}/datasets/{datasetId}/tables/{tableId}".format(
        **job_resource["configuration"]["query"]["destinationTable"]
    )
    expected_session = bigquery_storage.ReadSession(
        table=destination_table,
        data_format=bigquery_storage.DataFormat.ARROW,
        **table_read_options_kwarg,
    )
    bqstorage_client.create_read_session.assert_called_once_with(
        parent="projects/test-project",
        read_session=expected_session,
        max_stream_count=1,  # Use a single stream to preserve row order.
    )
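Examples 1 and 4 splice a `table_read_options_kwarg` argument into the expected `ReadSession`, but its definition is not part of these excerpts. A minimal sketch of a pytest fixture that could supply it, assuming the installed `google-cloud-bigquery-storage` exposes `ArrowSerializationOptions` (the names and behavior below are an illustration, not the library's own fixture):

import pytest

from google.cloud import bigquery_storage


@pytest.fixture
def table_read_options_kwarg():
    # Hypothetical fixture: request LZ4-compressed Arrow batches when the
    # installed BQ Storage client supports serialization options; otherwise
    # contribute no extra ReadSession arguments.
    if not hasattr(bigquery_storage, "ArrowSerializationOptions"):
        return {}
    read_options = bigquery_storage.ReadSession.TableReadOptions(
        arrow_serialization_options=bigquery_storage.ArrowSerializationOptions(
            buffer_compression=bigquery_storage.ArrowSerializationOptions.CompressionCodec.LZ4_FRAME
        )
    )
    return {"read_options": read_options}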
Example 2
    def test_fetchall_w_bqstorage_client_no_arrow_compression(self):
        from google.cloud.bigquery import dbapi
        from google.cloud.bigquery import table

        # Use unordered data to also test any non-deterministic key order in dicts.
        row_data = [table.Row([1.2, 1.1], {"bar": 1, "foo": 0})]
        bqstorage_streamed_rows = [{
            "bar": _to_pyarrow(1.2),
            "foo": _to_pyarrow(1.1)
        }]

        mock_client = self._mock_client(rows=row_data)
        mock_bqstorage_client = self._mock_bqstorage_client(
            stream_count=1,
            rows=bqstorage_streamed_rows,
        )

        connection = dbapi.connect(
            client=mock_client,
            bqstorage_client=mock_bqstorage_client,
        )
        cursor = connection.cursor()
        cursor.execute("SELECT foo, bar FROM some_table")

        with mock.patch(
                "google.cloud.bigquery.dbapi.cursor._ARROW_COMPRESSION_SUPPORT",
                new=False):
            rows = cursor.fetchall()

        # The default client was not used.
        mock_client.list_rows.assert_not_called()

        # Check the BQ Storage session config.
        expected_session = bigquery_storage.ReadSession(
            table="projects/P/datasets/DS/tables/T",
            data_format=bigquery_storage.DataFormat.ARROW,
        )
        mock_bqstorage_client.create_read_session.assert_called_once_with(
            parent="projects/P",
            read_session=expected_session,
            max_stream_count=1)

        # Check the data returned.
        field_value = op.itemgetter(1)
        sorted_row_data = [
            sorted(row.items(), key=field_value) for row in rows
        ]
        expected_row_data = [[("foo", 1.1), ("bar", 1.2)]]

        self.assertEqual(sorted_row_data, expected_row_data)
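The `_to_pyarrow` helper used to build `bqstorage_streamed_rows` is not shown in this excerpt. One plausible implementation, assuming it only needs to wrap a plain Python value as the pyarrow scalar that an Arrow read stream would yield:

import pyarrow


def _to_pyarrow(value):
    # Hypothetical helper: wrap the value in a one-element Arrow array and
    # return the resulting pyarrow scalar.
    return pyarrow.array([value])[0]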
Example 3
def test_to_dataframe_bqstorage_no_pyarrow_compression():
    from google.cloud.bigquery.job import QueryJob as target_class

    resource = _make_job_resource(job_type="query", ended=True)
    query_resource = {
        "jobComplete": True,
        "jobReference": resource["jobReference"],
        "totalRows": "4",
        "schema": {
            "fields": [{
                "name": "name",
                "type": "STRING",
                "mode": "NULLABLE"
            }]
        },
    }
    connection = _make_connection(query_resource)
    client = _make_client(connection=connection)
    job = target_class.from_api_repr(resource, client)
    bqstorage_client = mock.create_autospec(bigquery_storage.BigQueryReadClient)
    session = bigquery_storage.types.ReadSession()
    session.avro_schema.schema = json.dumps(
        {
            "type": "record",
            "name": "__root__",
            "fields": [{"name": "name", "type": ["null", "string"]}],
        }
    )
    bqstorage_client.create_read_session.return_value = session

    with mock.patch(
        "google.cloud.bigquery._pandas_helpers._ARROW_COMPRESSION_SUPPORT", new=False
    ):
        job.to_dataframe(bqstorage_client=bqstorage_client)

    destination_table = "projects/{projectId}/datasets/{datasetId}/tables/{tableId}".format(
        **resource["configuration"]["query"]["destinationTable"]
    )
    expected_session = bigquery_storage.ReadSession(
        table=destination_table,
        data_format=bigquery_storage.DataFormat.ARROW,
    )
    bqstorage_client.create_read_session.assert_called_once_with(
        parent=f"projects/{client.project}",
        read_session=expected_session,
        max_stream_count=0,
    )
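The QueryJob examples (1, 3, and 4) stub the REST layer through `_make_connection`, whose definition is likewise outside these excerpts. A rough sketch, assuming it queues canned API responses on an autospecced BigQuery `Connection`:

from unittest import mock

import google.cloud.bigquery._http


def _make_connection(*responses):
    # Hypothetical helper: each api_request call returns the next canned
    # response, so consecutive REST calls see the prepared payloads in order.
    conn = mock.create_autospec(google.cloud.bigquery._http.Connection, instance=True)
    conn.api_request.side_effect = list(responses)
    return conn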
Example 4
def test_to_dataframe_bqstorage(table_read_options_kwarg):
    from google.cloud.bigquery.job import QueryJob as target_class

    resource = _make_job_resource(job_type="query", ended=True)
    query_resource = {
        "jobComplete": True,
        "jobReference": resource["jobReference"],
        "totalRows": "4",
        "schema": {
            "fields": [
                {"name": "name", "type": "STRING", "mode": "NULLABLE"},
                {"name": "age", "type": "INTEGER", "mode": "NULLABLE"},
            ]
        },
    }
    connection = _make_connection(query_resource)
    client = _make_client(connection=connection)
    job = target_class.from_api_repr(resource, client)
    bqstorage_client = mock.create_autospec(bigquery_storage.BigQueryReadClient)
    session = bigquery_storage.types.ReadSession()
    session.avro_schema.schema = json.dumps(
        {
            "type": "record",
            "name": "__root__",
            "fields": [
                {"name": "name", "type": ["null", "string"]},
                {"name": "age", "type": ["null", "long"]},
            ],
        }
    )
    bqstorage_client.create_read_session.return_value = session

    job.to_dataframe(bqstorage_client=bqstorage_client)

    destination_table = "projects/{projectId}/datasets/{datasetId}/tables/{tableId}".format(
        **resource["configuration"]["query"]["destinationTable"]
    )
    expected_session = bigquery_storage.ReadSession(
        table=destination_table,
        data_format=bigquery_storage.DataFormat.ARROW,
        **table_read_options_kwarg,
    )
    bqstorage_client.create_read_session.assert_called_once_with(
        parent=f"projects/{client.project}",
        read_session=expected_session,
        max_stream_count=0,  # Use default number of streams for best performance.
    )
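Example 4 differs from Example 1 mainly in the `max_stream_count` assertion: `1` forces a single read stream so row order is preserved, while `0` lets the backend choose the number of streams for throughput. Both rely on a `_make_client` helper that is not shown here; a minimal sketch, assuming it wraps a real `bigquery.Client` with mock credentials and the canned connection (names and defaults are assumptions):

from unittest import mock

import google.auth.credentials
from google.cloud import bigquery


def _make_client(connection=None, project="test-project"):
    # Hypothetical helper: a real Client built with mock credentials, with the
    # canned connection swapped in so no network requests are issued.
    credentials = mock.create_autospec(
        google.auth.credentials.Credentials, instance=True
    )
    client = bigquery.Client(project=project, credentials=credentials)
    if connection is not None:
        client._connection = connection
    return client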