def test_to_dataframe_bqstorage_preserve_order(query, table_read_options_kwarg):
    from google.cloud.bigquery.job import QueryJob as target_class

    job_resource = _make_job_resource(
        project_id="test-project", job_type="query", ended=True
    )
    job_resource["configuration"]["query"]["query"] = query
    job_resource["status"] = {"state": "DONE"}
    get_query_results_resource = {
        "jobComplete": True,
        "jobReference": {"projectId": "test-project", "jobId": "test-job"},
        "schema": {
            "fields": [
                {"name": "name", "type": "STRING", "mode": "NULLABLE"},
                {"name": "age", "type": "INTEGER", "mode": "NULLABLE"},
            ]
        },
        "totalRows": "4",
    }
    connection = _make_connection(get_query_results_resource, job_resource)
    client = _make_client(connection=connection)
    job = target_class.from_api_repr(job_resource, client)
    bqstorage_client = mock.create_autospec(bigquery_storage.BigQueryReadClient)
    session = bigquery_storage.types.ReadSession()
    session.avro_schema.schema = json.dumps(
        {
            "type": "record",
            "name": "__root__",
            "fields": [
                {"name": "name", "type": ["null", "string"]},
                {"name": "age", "type": ["null", "long"]},
            ],
        }
    )
    bqstorage_client.create_read_session.return_value = session

    job.to_dataframe(bqstorage_client=bqstorage_client)

    destination_table = "projects/{projectId}/datasets/{datasetId}/tables/{tableId}".format(
        **job_resource["configuration"]["query"]["destinationTable"]
    )
    expected_session = bigquery_storage.ReadSession(
        table=destination_table,
        data_format=bigquery_storage.DataFormat.ARROW,
        **table_read_options_kwarg,
    )
    bqstorage_client.create_read_session.assert_called_once_with(
        parent="projects/test-project",
        read_session=expected_session,
        max_stream_count=1,  # Use a single stream to preserve row order.
    )
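

# ---------------------------------------------------------------------------
# The tests in this excerpt lean on shared helpers (``_make_connection``,
# ``_make_client``) defined elsewhere in the test suite and not shown here.
# The sketches below are purely illustrative guesses at their shape -- not
# the real implementations -- included only to make the excerpt easier to
# follow.
# ---------------------------------------------------------------------------


def _make_connection(*responses):
    """Illustrative sketch: a mock HTTP connection replaying canned API responses."""
    import google.cloud.bigquery._http

    mock_conn = mock.create_autospec(
        google.cloud.bigquery._http.Connection, instance=True
    )
    mock_conn.api_request.side_effect = list(responses)
    return mock_conn


def _make_client(project="test-project", connection=None):
    """Illustrative sketch: a BigQuery client wired to a mock connection."""
    import google.auth.credentials
    from google.cloud import bigquery

    credentials = mock.create_autospec(
        google.auth.credentials.Credentials, instance=True
    )
    client = bigquery.Client(project=project, credentials=credentials)
    client._connection = connection or _make_connection()
    return client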
def test_fetchall_w_bqstorage_client_no_arrow_compression(self):
    from google.cloud.bigquery import dbapi
    from google.cloud.bigquery import table

    # Use unordered data to also test any non-deterministic key order in dicts.
    row_data = [table.Row([1.2, 1.1], {"bar": 1, "foo": 0})]
    bqstorage_streamed_rows = [{"bar": _to_pyarrow(1.2), "foo": _to_pyarrow(1.1)}]

    mock_client = self._mock_client(rows=row_data)
    mock_bqstorage_client = self._mock_bqstorage_client(
        stream_count=1,
        rows=bqstorage_streamed_rows,
    )

    connection = dbapi.connect(
        client=mock_client,
        bqstorage_client=mock_bqstorage_client,
    )
    cursor = connection.cursor()
    cursor.execute("SELECT foo, bar FROM some_table")

    with mock.patch(
        "google.cloud.bigquery.dbapi.cursor._ARROW_COMPRESSION_SUPPORT", new=False
    ):
        rows = cursor.fetchall()

    mock_client.list_rows.assert_not_called()  # The default client was not used.

    # Check the BQ Storage session config.
    expected_session = bigquery_storage.ReadSession(
        table="projects/P/datasets/DS/tables/T",
        data_format=bigquery_storage.DataFormat.ARROW,
    )
    mock_bqstorage_client.create_read_session.assert_called_once_with(
        parent="projects/P", read_session=expected_session, max_stream_count=1
    )

    # Check the data returned.
    field_value = op.itemgetter(1)
    sorted_row_data = [sorted(row.items(), key=field_value) for row in rows]
    expected_row_data = [[("foo", 1.1), ("bar", 1.2)]]

    self.assertEqual(sorted_row_data, expected_row_data)
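

# ``_to_pyarrow``, ``self._mock_client``, and ``self._mock_bqstorage_client``
# used above are likewise shared test helpers not shown in this excerpt. A
# minimal sketch of ``_to_pyarrow`` is given below, assuming it simply wraps a
# Python value in a pyarrow scalar; the real helper may differ.


def _to_pyarrow(value):
    """Illustrative sketch: convert a Python value to a pyarrow scalar."""
    import pyarrow

    return pyarrow.array([value])[0]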
def test_to_dataframe_bqstorage_no_pyarrow_compression():
    from google.cloud.bigquery.job import QueryJob as target_class

    resource = _make_job_resource(job_type="query", ended=True)
    query_resource = {
        "jobComplete": True,
        "jobReference": resource["jobReference"],
        "totalRows": "4",
        "schema": {"fields": [{"name": "name", "type": "STRING", "mode": "NULLABLE"}]},
    }
    connection = _make_connection(query_resource)
    client = _make_client(connection=connection)
    job = target_class.from_api_repr(resource, client)
    bqstorage_client = mock.create_autospec(bigquery_storage.BigQueryReadClient)
    session = bigquery_storage.types.ReadSession()
    session.avro_schema.schema = json.dumps(
        {
            "type": "record",
            "name": "__root__",
            "fields": [{"name": "name", "type": ["null", "string"]}],
        }
    )
    bqstorage_client.create_read_session.return_value = session

    with mock.patch(
        "google.cloud.bigquery._pandas_helpers._ARROW_COMPRESSION_SUPPORT", new=False
    ):
        job.to_dataframe(bqstorage_client=bqstorage_client)

    destination_table = "projects/{projectId}/datasets/{datasetId}/tables/{tableId}".format(
        **resource["configuration"]["query"]["destinationTable"]
    )
    expected_session = bigquery_storage.ReadSession(
        table=destination_table,
        data_format=bigquery_storage.DataFormat.ARROW,
    )
    bqstorage_client.create_read_session.assert_called_once_with(
        parent=f"projects/{client.project}",
        read_session=expected_session,
        max_stream_count=0,
    )
def test_to_dataframe_bqstorage(table_read_options_kwarg):
    from google.cloud.bigquery.job import QueryJob as target_class

    resource = _make_job_resource(job_type="query", ended=True)
    query_resource = {
        "jobComplete": True,
        "jobReference": resource["jobReference"],
        "totalRows": "4",
        "schema": {
            "fields": [
                {"name": "name", "type": "STRING", "mode": "NULLABLE"},
                {"name": "age", "type": "INTEGER", "mode": "NULLABLE"},
            ]
        },
    }
    connection = _make_connection(query_resource)
    client = _make_client(connection=connection)
    job = target_class.from_api_repr(resource, client)
    bqstorage_client = mock.create_autospec(bigquery_storage.BigQueryReadClient)
    session = bigquery_storage.types.ReadSession()
    session.avro_schema.schema = json.dumps(
        {
            "type": "record",
            "name": "__root__",
            "fields": [
                {"name": "name", "type": ["null", "string"]},
                {"name": "age", "type": ["null", "long"]},
            ],
        }
    )
    bqstorage_client.create_read_session.return_value = session

    job.to_dataframe(bqstorage_client=bqstorage_client)

    destination_table = "projects/{projectId}/datasets/{datasetId}/tables/{tableId}".format(
        **resource["configuration"]["query"]["destinationTable"]
    )
    expected_session = bigquery_storage.ReadSession(
        table=destination_table,
        data_format=bigquery_storage.DataFormat.ARROW,
        **table_read_options_kwarg,
    )
    bqstorage_client.create_read_session.assert_called_once_with(
        parent=f"projects/{client.project}",
        read_session=expected_session,
        max_stream_count=0,  # Use default number of streams for best performance.
    )
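

# ---------------------------------------------------------------------------
# ``_make_job_resource`` (used throughout these tests) and the
# ``table_read_options_kwarg`` fixture also come from the shared test helpers
# and are not part of this excerpt. The fixture is assumed to supply either an
# empty dict or a ``read_options`` mapping to merge into the expected
# ``ReadSession``. The helper sketch below is a hypothetical approximation of
# a minimal query-job REST resource with a destination table -- not the real
# implementation.
# ---------------------------------------------------------------------------


def _make_job_resource(project_id="test-project", job_type="query", ended=False):
    """Illustrative sketch: build a minimal job REST resource of ``job_type``."""
    resource = {
        "jobReference": {"projectId": project_id, "jobId": "test-job"},
        "configuration": {
            job_type: {
                "destinationTable": {
                    "projectId": project_id,
                    "datasetId": "test-dataset",
                    "tableId": "test-table",
                }
            }
        },
        "statistics": {"creationTime": "1"},
        "status": {"state": "PENDING"},
    }
    if ended:
        resource["status"] = {"state": "DONE"}
        resource["statistics"]["endTime"] = "2"
    return resource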