def test_from_query_job(self):
    from google.cloud.bigquery.dataset import Dataset
    from google.cloud.bigquery.job import QueryJob
    from google.cloud.bigquery._helpers import UDFResource

    DS_NAME = 'DATASET'
    RESOURCE_URI = 'gs://some-bucket/js/lib.js'
    client = _Client(self.PROJECT)
    job = QueryJob(
        self.JOB_NAME, self.QUERY, client,
        udf_resources=[UDFResource("resourceUri", RESOURCE_URI)])
    dataset = job.default_dataset = Dataset(DS_NAME, client)
    job.use_query_cache = True
    job.use_legacy_sql = True
    klass = self._getTargetClass()

    query = klass.from_query_job(job)

    self.assertEqual(query.name, self.JOB_NAME)
    self.assertEqual(query.query, self.QUERY)
    self.assertIs(query._client, client)
    self.assertIs(query._job, job)
    self.assertEqual(query.udf_resources, job.udf_resources)
    self.assertIs(query.default_dataset, dataset)
    self.assertTrue(query.use_query_cache)
    self.assertTrue(query.use_legacy_sql)
def test_to_dataframe_with_progress_bar(tqdm_mock):
    from google.cloud.bigquery.job import QueryJob as target_class

    begun_resource = _make_job_resource(job_type="query")
    query_resource = {
        "jobComplete": True,
        "jobReference": begun_resource["jobReference"],
        "totalRows": "4",
        "schema": {
            "fields": [{"name": "name", "type": "STRING", "mode": "NULLABLE"}]
        },
    }
    done_resource = copy.deepcopy(begun_resource)
    done_resource["status"] = {"state": "DONE"}
    connection = _make_connection(
        begun_resource,
        query_resource,
        done_resource,
        query_resource,
        query_resource,
    )
    client = _make_client(connection=connection)
    job = target_class.from_api_repr(begun_resource, client)

    job.to_dataframe(progress_bar_type=None, create_bqstorage_client=False)
    tqdm_mock.assert_not_called()

    job.to_dataframe(progress_bar_type="tqdm", create_bqstorage_client=False)
    tqdm_mock.assert_called()
def send_to_gcs(self, query, project_id, output_uri, delimiter=","):
    job_results = {}
    client = self.bq_client

    query_job = QueryJob(
        self.__create_job_id(project_id, "queryJob"), query, client)
    job_results["queryJob"] = self.__get_results(query_job)

    output_type = output_uri.split(".")[-1]
    dest_format = self.__get_file_type(output_type)
    if dest_format == SourceFormat.CSV:
        config = ExtractJobConfig(
            destination_format=dest_format, field_delimiter=delimiter)
    else:
        config = ExtractJobConfig(destination_format=dest_format)

    extract_job = ExtractJob(
        self.__create_job_id(project_id, "extractJob"),
        query_job.destination,
        output_uri,
        client,
        job_config=config,
    )
    job_results["extractJob"] = self.__get_results(extract_job)
    return job_results
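# Hedged usage sketch for send_to_gcs above; ``exporter`` is assumed to be an
# instance of the (unshown) enclosing class that owns ``bq_client``. Only the
# method signature and the returned dict keys come from the code above.
def export_names_to_csv(exporter, project_id):
    results = exporter.send_to_gcs(
        "SELECT name FROM `my-project.my_dataset.my_table`",
        project_id,
        "gs://my-bucket/export.csv",
        delimiter="|",
    )
    return results["queryJob"], results["extractJob"]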
def run_async_query(self, job_name, query,
                    udf_resources=(), query_parameters=()):
    """Construct a job for running a SQL query asynchronously.

    See
    https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs#configuration.query

    :type job_name: str
    :param job_name: Name of the job.

    :type query: str
    :param query: SQL query to be executed.

    :type udf_resources: tuple
    :param udf_resources: An iterable of
                          :class:`google.cloud.bigquery._helpers.UDFResource`
                          (empty by default)

    :type query_parameters: tuple
    :param query_parameters: An iterable of
                             :class:`google.cloud.bigquery._helpers.AbstractQueryParameter`
                             (empty by default)

    :rtype: :class:`google.cloud.bigquery.job.QueryJob`
    :returns: a new ``QueryJob`` instance
    """
    return QueryJob(job_name, query, client=self,
                    udf_resources=udf_resources,
                    query_parameters=query_parameters)
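# A usage sketch for run_async_query above: ``client`` is assumed to be an
# instance of the enclosing class, and the legacy job API is assumed to
# expose begin()/reload() for submission and polling. Job name and SQL are
# illustrative only.
def start_example_udf_query(client):
    from google.cloud.bigquery._helpers import UDFResource

    job = client.run_async_query(
        'example-job',
        'SELECT COUNT(*) FROM my_dataset.my_table',
        udf_resources=[UDFResource('resourceUri', 'gs://some-bucket/js/lib.js')])
    job.begin()   # submit the job; poll job.state via job.reload() until DONE
    return job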
def test_to_arrow_max_results_no_progress_bar():
    from google.cloud.bigquery import table
    from google.cloud.bigquery.job import QueryJob as target_class
    from google.cloud.bigquery.schema import SchemaField

    connection = _make_connection({})
    client = _make_client(connection=connection)
    begun_resource = _make_job_resource(job_type="query")
    job = target_class.from_api_repr(begun_resource, client)

    schema = [
        SchemaField("name", "STRING", mode="REQUIRED"),
        SchemaField("age", "INTEGER", mode="REQUIRED"),
    ]
    rows = [
        {"f": [{"v": "Bharney Rhubble"}, {"v": "33"}]},
        {"f": [{"v": "Wylma Phlyntstone"}, {"v": "29"}]},
    ]
    path = "/foo"
    api_request = mock.Mock(return_value={"rows": rows})
    row_iterator = table.RowIterator(client, api_request, path, schema)

    result_patch = mock.patch(
        "google.cloud.bigquery.job.QueryJob.result",
        return_value=row_iterator,
    )
    with result_patch as result_patch_tqdm:
        tbl = job.to_arrow(create_bqstorage_client=False, max_results=123)

    result_patch_tqdm.assert_called_once_with(max_results=123)
    assert isinstance(tbl, pyarrow.Table)
    assert tbl.num_rows == 2
def test_to_dataframe_column_date_dtypes():
    from google.cloud.bigquery.job import QueryJob as target_class

    begun_resource = _make_job_resource(job_type="query")
    query_resource = {
        "jobComplete": True,
        "jobReference": begun_resource["jobReference"],
        "totalRows": "1",
        "schema": {"fields": [{"name": "date", "type": "DATE"}]},
    }
    row_data = [
        ["1999-12-01"],
    ]
    rows = [{"f": [{"v": field} for field in row]} for row in row_data]
    query_resource["rows"] = rows
    done_resource = copy.deepcopy(begun_resource)
    done_resource["status"] = {"state": "DONE"}
    connection = _make_connection(
        begun_resource, query_resource, done_resource, query_resource
    )
    client = _make_client(connection=connection)
    job = target_class.from_api_repr(begun_resource, client)

    df = job.to_dataframe(date_as_object=False, create_bqstorage_client=False)

    assert isinstance(df, pandas.DataFrame)
    assert len(df) == 1  # verify the number of rows
    exp_columns = [field["name"] for field in query_resource["schema"]["fields"]]
    assert list(df) == exp_columns  # verify the column names
    assert df.date.dtype.name == "datetime64[ns]"
def test_to_dataframe_column_dtypes():
    from google.cloud.bigquery.job import QueryJob as target_class

    begun_resource = _make_job_resource(job_type="query")
    query_resource = {
        "jobComplete": True,
        "jobReference": begun_resource["jobReference"],
        "totalRows": "4",
        "schema": {
            "fields": [
                {"name": "start_timestamp", "type": "TIMESTAMP"},
                {"name": "seconds", "type": "INT64"},
                {"name": "miles", "type": "FLOAT64"},
                {"name": "km", "type": "FLOAT64"},
                {"name": "payment_type", "type": "STRING"},
                {"name": "complete", "type": "BOOL"},
                {"name": "date", "type": "DATE"},
            ]
        },
    }
    row_data = [
        ["1433836800000000", "420", "1.1", "1.77", "Cash", "true", "1999-12-01"],
        ["1387811700000000", "2580", "17.7", "28.5", "Cash", "false", "1953-06-14"],
        ["1385565300000000", "2280", "4.4", "7.1", "Credit", "true", "1981-11-04"],
    ]
    rows = [{"f": [{"v": field} for field in row]} for row in row_data]
    query_resource["rows"] = rows
    done_resource = copy.deepcopy(begun_resource)
    done_resource["status"] = {"state": "DONE"}
    connection = _make_connection(
        begun_resource, query_resource, done_resource, query_resource
    )
    client = _make_client(connection=connection)
    job = target_class.from_api_repr(begun_resource, client)

    df = job.to_dataframe(dtypes={"km": "float16"}, create_bqstorage_client=False)

    assert isinstance(df, pandas.DataFrame)
    assert len(df) == 3  # verify the number of rows
    exp_columns = [field["name"] for field in query_resource["schema"]["fields"]]
    assert list(df) == exp_columns  # verify the column names
    assert df.start_timestamp.dtype.name == "datetime64[ns, UTC]"
    assert df.seconds.dtype.name == "int64"
    assert df.miles.dtype.name == "float64"
    assert df.km.dtype.name == "float16"
    assert df.payment_type.dtype.name == "object"
    assert df.complete.dtype.name == "bool"
    assert df.date.dtype.name == "object"
def test_to_dataframe_w_tqdm():
    from google.cloud.bigquery import table
    from google.cloud.bigquery.job import QueryJob as target_class
    from google.cloud.bigquery.schema import SchemaField
    from google.cloud.bigquery._tqdm_helpers import _PROGRESS_BAR_UPDATE_INTERVAL

    begun_resource = _make_job_resource(job_type="query")
    schema = [
        SchemaField("name", "STRING", mode="NULLABLE"),
        SchemaField("age", "INTEGER", mode="NULLABLE"),
    ]
    rows = [
        {"f": [{"v": "Phred Phlyntstone"}, {"v": "32"}]},
        {"f": [{"v": "Bharney Rhubble"}, {"v": "33"}]},
        {"f": [{"v": "Wylma Phlyntstone"}, {"v": "29"}]},
        {"f": [{"v": "Bhettye Rhubble"}, {"v": "27"}]},
    ]
    connection = _make_connection({})
    client = _make_client(connection=connection)
    job = target_class.from_api_repr(begun_resource, client)

    path = "/foo"
    api_request = mock.Mock(return_value={"rows": rows})
    row_iterator = table.RowIterator(client, api_request, path, schema)

    job._properties["statistics"] = {
        "query": {
            "queryPlan": [
                {"name": "S00: Input", "id": "0", "status": "COMPLETE"},
                {"name": "S01: Output", "id": "1", "status": "COMPLETE"},
            ]
        },
    }
    reload_patch = mock.patch(
        "google.cloud.bigquery.job._AsyncJob.reload", autospec=True
    )
    result_patch = mock.patch(
        "google.cloud.bigquery.job.QueryJob.result",
        side_effect=[
            concurrent.futures.TimeoutError,
            concurrent.futures.TimeoutError,
            row_iterator,
        ],
    )

    with result_patch as result_patch_tqdm, reload_patch:
        df = job.to_dataframe(progress_bar_type="tqdm", create_bqstorage_client=False)

    assert result_patch_tqdm.call_count == 3
    assert isinstance(df, pandas.DataFrame)
    assert len(df) == 4  # verify the number of rows
    assert list(df) == ["name", "age"]  # verify the column names
    result_patch_tqdm.assert_called_with(
        timeout=_PROGRESS_BAR_UPDATE_INTERVAL, max_results=None
    )
def job(self):
    """Job instance used to run the query.

    :rtype: :class:`google.cloud.bigquery.job.QueryJob`, or ``NoneType``
    :returns: Job instance used to run the query (None until
              ``jobReference`` property is set by the server).
    """
    if self._job is None:
        job_ref = self._properties.get('jobReference')
        if job_ref is not None:
            self._job = QueryJob(job_ref['jobId'], self.query, self._client)
    return self._job
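# Behavior sketch for the lazy ``job`` accessor above, grounded only in the
# code shown: it returns None until the server populates ``jobReference``,
# then builds and caches a QueryJob. ``query`` is assumed to be an instance
# of the enclosing class, with ``job`` exposed as a property (the decorator
# is not shown above).
def demonstrate_lazy_job(query):
    assert query.job is None  # no jobReference assigned yet
    query._properties['jobReference'] = {'jobId': 'some-job'}  # server-set
    assert query.job is query.job  # same cached QueryJob on repeat access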
def test_to_dataframe_bqstorage_preserve_order(query, table_read_options_kwarg):
    from google.cloud.bigquery.job import QueryJob as target_class

    job_resource = _make_job_resource(
        project_id="test-project", job_type="query", ended=True
    )
    job_resource["configuration"]["query"]["query"] = query
    job_resource["status"] = {"state": "DONE"}
    get_query_results_resource = {
        "jobComplete": True,
        "jobReference": {"projectId": "test-project", "jobId": "test-job"},
        "schema": {
            "fields": [
                {"name": "name", "type": "STRING", "mode": "NULLABLE"},
                {"name": "age", "type": "INTEGER", "mode": "NULLABLE"},
            ]
        },
        "totalRows": "4",
    }
    connection = _make_connection(get_query_results_resource, job_resource)
    client = _make_client(connection=connection)
    job = target_class.from_api_repr(job_resource, client)
    bqstorage_client = mock.create_autospec(bigquery_storage.BigQueryReadClient)
    session = bigquery_storage.types.ReadSession()
    session.avro_schema.schema = json.dumps(
        {
            "type": "record",
            "name": "__root__",
            "fields": [
                {"name": "name", "type": ["null", "string"]},
                {"name": "age", "type": ["null", "long"]},
            ],
        }
    )
    bqstorage_client.create_read_session.return_value = session

    job.to_dataframe(bqstorage_client=bqstorage_client)

    destination_table = "projects/{projectId}/datasets/{datasetId}/tables/{tableId}".format(
        **job_resource["configuration"]["query"]["destinationTable"]
    )
    expected_session = bigquery_storage.ReadSession(
        table=destination_table,
        data_format=bigquery_storage.DataFormat.ARROW,
        **table_read_options_kwarg,
    )
    bqstorage_client.create_read_session.assert_called_once_with(
        parent="projects/test-project",
        read_session=expected_session,
        max_stream_count=1,  # Use a single stream to preserve row order.
    )
def test_to_arrow_w_tqdm_w_query_plan():
    from google.cloud.bigquery import table
    from google.cloud.bigquery.job import QueryJob as target_class
    from google.cloud.bigquery.schema import SchemaField
    from google.cloud.bigquery._tqdm_helpers import _PROGRESS_BAR_UPDATE_INTERVAL

    begun_resource = _make_job_resource(job_type="query")
    rows = [
        {"f": [{"v": "Bharney Rhubble"}, {"v": "33"}]},
        {"f": [{"v": "Wylma Phlyntstone"}, {"v": "29"}]},
    ]
    schema = [
        SchemaField("name", "STRING", mode="REQUIRED"),
        SchemaField("age", "INTEGER", mode="REQUIRED"),
    ]
    connection = _make_connection({})
    client = _make_client(connection=connection)
    job = target_class.from_api_repr(begun_resource, client)

    path = "/foo"
    api_request = mock.Mock(return_value={"rows": rows})
    row_iterator = table.RowIterator(client, api_request, path, schema)

    job._properties["statistics"] = {
        "query": {
            "queryPlan": [
                {"name": "S00: Input", "id": "0", "status": "COMPLETE"},
                {"name": "S01: Output", "id": "1", "status": "COMPLETE"},
            ]
        },
    }
    reload_patch = mock.patch(
        "google.cloud.bigquery.job._AsyncJob.reload", autospec=True
    )
    result_patch = mock.patch(
        "google.cloud.bigquery.job.QueryJob.result",
        side_effect=[
            concurrent.futures.TimeoutError,
            concurrent.futures.TimeoutError,
            row_iterator,
        ],
    )

    with result_patch as result_patch_tqdm, reload_patch:
        tbl = job.to_arrow(progress_bar_type="tqdm", create_bqstorage_client=False)

    assert result_patch_tqdm.call_count == 3
    assert isinstance(tbl, pyarrow.Table)
    assert tbl.num_rows == 2
    result_patch_tqdm.assert_called_with(timeout=_PROGRESS_BAR_UPDATE_INTERVAL)
def test_to_dataframe_bqstorage_no_pyarrow_compression():
    from google.cloud.bigquery.job import QueryJob as target_class

    resource = _make_job_resource(job_type="query", ended=True)
    query_resource = {
        "jobComplete": True,
        "jobReference": resource["jobReference"],
        "totalRows": "4",
        "schema": {
            "fields": [{"name": "name", "type": "STRING", "mode": "NULLABLE"}]
        },
    }
    connection = _make_connection(query_resource)
    client = _make_client(connection=connection)
    job = target_class.from_api_repr(resource, client)
    bqstorage_client = mock.create_autospec(bigquery_storage.BigQueryReadClient)
    session = bigquery_storage.types.ReadSession()
    session.avro_schema.schema = json.dumps(
        {
            "type": "record",
            "name": "__root__",
            "fields": [{"name": "name", "type": ["null", "string"]}],
        }
    )
    bqstorage_client.create_read_session.return_value = session

    with mock.patch(
        "google.cloud.bigquery._pandas_helpers._ARROW_COMPRESSION_SUPPORT", new=False
    ):
        job.to_dataframe(bqstorage_client=bqstorage_client)

    destination_table = "projects/{projectId}/datasets/{datasetId}/tables/{tableId}".format(
        **resource["configuration"]["query"]["destinationTable"]
    )
    expected_session = bigquery_storage.ReadSession(
        table=destination_table,
        data_format=bigquery_storage.DataFormat.ARROW,
    )
    bqstorage_client.create_read_session.assert_called_once_with(
        parent=f"projects/{client.project}",
        read_session=expected_session,
        max_stream_count=0,
    )
def run_async_query(self, job_name, query):
    """Construct a job for running a SQL query asynchronously.

    See
    https://cloud.google.com/bigquery/docs/reference/v2/jobs#configuration.query

    :type job_name: str
    :param job_name: Name of the job.

    :type query: str
    :param query: SQL query to be executed.

    :rtype: :class:`google.cloud.bigquery.job.QueryJob`
    :returns: a new ``QueryJob`` instance
    """
    return QueryJob(job_name, query, client=self)
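# Minimal sketch for the simpler overload above. Constructing the job is
# purely local; submission is assumed to be a separate step via the legacy
# ``begin()`` API, and ``client`` an instance of the enclosing class.
def start_simple_query(client):
    job = client.run_async_query('example-job', 'SELECT 1')
    job.begin()  # nothing is sent to the API until this call
    return job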
def test_to_dataframe_bqstorage(table_read_options_kwarg):
    from google.cloud.bigquery.job import QueryJob as target_class

    resource = _make_job_resource(job_type="query", ended=True)
    query_resource = {
        "jobComplete": True,
        "jobReference": resource["jobReference"],
        "totalRows": "4",
        "schema": {
            "fields": [
                {"name": "name", "type": "STRING", "mode": "NULLABLE"},
                {"name": "age", "type": "INTEGER", "mode": "NULLABLE"},
            ]
        },
    }
    connection = _make_connection(query_resource)
    client = _make_client(connection=connection)
    job = target_class.from_api_repr(resource, client)
    bqstorage_client = mock.create_autospec(bigquery_storage.BigQueryReadClient)
    session = bigquery_storage.types.ReadSession()
    session.avro_schema.schema = json.dumps(
        {
            "type": "record",
            "name": "__root__",
            "fields": [
                {"name": "name", "type": ["null", "string"]},
                {"name": "age", "type": ["null", "long"]},
            ],
        }
    )
    bqstorage_client.create_read_session.return_value = session

    job.to_dataframe(bqstorage_client=bqstorage_client)

    destination_table = "projects/{projectId}/datasets/{datasetId}/tables/{tableId}".format(
        **resource["configuration"]["query"]["destinationTable"]
    )
    expected_session = bigquery_storage.ReadSession(
        table=destination_table,
        data_format=bigquery_storage.DataFormat.ARROW,
        **table_read_options_kwarg,
    )
    bqstorage_client.create_read_session.assert_called_once_with(
        parent=f"projects/{client.project}",
        read_session=expected_session,
        max_stream_count=0,  # Use default number of streams for best performance.
    )
def test_to_dataframe_ddl_query():
    from google.cloud.bigquery.job import QueryJob as target_class

    # Destination table may have no schema for some DDL and DML queries.
    resource = _make_job_resource(job_type="query", ended=True)
    query_resource = {
        "jobComplete": True,
        "jobReference": resource["jobReference"],
        "schema": {"fields": []},
    }
    connection = _make_connection(query_resource)
    client = _make_client(connection=connection)
    job = target_class.from_api_repr(resource, client)

    df = job.to_dataframe()

    assert len(df) == 0
def test_from_query_job_wo_default_dataset(self):
    from google.cloud.bigquery.job import QueryJob
    from google.cloud.bigquery._helpers import UDFResource

    RESOURCE_URI = 'gs://some-bucket/js/lib.js'
    client = _Client(self.PROJECT)
    job = QueryJob(
        self.JOB_NAME, self.QUERY, client,
        udf_resources=[UDFResource("resourceUri", RESOURCE_URI)])
    klass = self._getTargetClass()

    query = klass.from_query_job(job)

    self.assertEqual(query.query, self.QUERY)
    self.assertIs(query._client, client)
    self.assertIs(query._job, job)
    self.assertEqual(query.udf_resources, job.udf_resources)
    self.assertIsNone(query.default_dataset)
    self.assertIsNone(query.use_query_cache)
    self.assertIsNone(query.use_legacy_sql)
def _make_job(schema=(), rows=()):
    from google.cloud.bigquery.job import QueryJob as target_class

    begun_resource = _make_job_resource(job_type="query")
    query_resource = {
        "jobComplete": True,
        "jobReference": begun_resource["jobReference"],
        "totalRows": str(len(rows)),
        "schema": {
            "fields": [
                dict(name=field[0], type=field[1], mode=field[2])
                for field in schema
            ]
        },
    }
    tabledata_resource = {"rows": [{"f": [{"v": v} for v in row]} for row in rows]}
    done_resource = copy.deepcopy(begun_resource)
    done_resource["status"] = {"state": "DONE"}
    connection = _make_connection(
        begun_resource, query_resource, done_resource, tabledata_resource
    )
    client = _make_client(connection=connection)
    return target_class.from_api_repr(begun_resource, client)
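# Illustrative wrapper (not from the original suite) showing how tests call
# the factory above: schema entries are (name, type, mode) triples and rows
# are tuples of REST-style string values.
def _make_two_row_job():
    return _make_job(
        schema=[("name", "STRING", "NULLABLE"), ("age", "INTEGER", "NULLABLE")],
        rows=[("Phred Phlyntstone", "32"), ("Bharney Rhubble", "33")],
    )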
def job_from_resource(self, resource):
    """Detect correct job type from resource and instantiate.

    :type resource: dict
    :param resource: one job resource from API response

    :rtype: One of:
            :class:`google.cloud.bigquery.job.LoadTableFromStorageJob`,
            :class:`google.cloud.bigquery.job.CopyJob`,
            :class:`google.cloud.bigquery.job.ExtractTableToStorageJob`,
            :class:`google.cloud.bigquery.job.QueryJob`
    :returns: the job instance, constructed via the resource
    """
    config = resource['configuration']
    if 'load' in config:
        return LoadTableFromStorageJob.from_api_repr(resource, self)
    elif 'copy' in config:
        return CopyJob.from_api_repr(resource, self)
    elif 'extract' in config:
        return ExtractTableToStorageJob.from_api_repr(resource, self)
    elif 'query' in config:
        return QueryJob.from_api_repr(resource, self)
    raise ValueError('Cannot parse job resource')
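# Dispatch sketch for job_from_resource above; ``client`` is assumed to be an
# instance of the enclosing class, and the dict mirrors the v2 jobs REST
# resource shape the method inspects.
def job_from_query_resource(client):
    resource = {
        'jobReference': {'projectId': 'my-project', 'jobId': 'some-job'},
        'configuration': {'query': {'query': 'SELECT 1'}},
    }
    return client.job_from_resource(resource)  # a QueryJob, per the 'query' key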
def test_to_dataframe():
    from google.cloud.bigquery.job import QueryJob as target_class

    begun_resource = _make_job_resource(job_type="query")
    query_resource = {
        "jobComplete": True,
        "jobReference": begun_resource["jobReference"],
        "totalRows": "4",
        "schema": {
            "fields": [
                {"name": "name", "type": "STRING", "mode": "NULLABLE"},
                {"name": "age", "type": "INTEGER", "mode": "NULLABLE"},
            ]
        },
    }
    tabledata_resource = {
        "rows": [
            {"f": [{"v": "Phred Phlyntstone"}, {"v": "32"}]},
            {"f": [{"v": "Bharney Rhubble"}, {"v": "33"}]},
            {"f": [{"v": "Wylma Phlyntstone"}, {"v": "29"}]},
            {"f": [{"v": "Bhettye Rhubble"}, {"v": "27"}]},
        ]
    }
    done_resource = copy.deepcopy(begun_resource)
    done_resource["status"] = {"state": "DONE"}
    connection = _make_connection(
        begun_resource, query_resource, done_resource, tabledata_resource
    )
    client = _make_client(connection=connection)
    job = target_class.from_api_repr(begun_resource, client)

    df = job.to_dataframe(create_bqstorage_client=False)

    assert isinstance(df, pandas.DataFrame)
    assert len(df) == 4  # verify the number of rows
    assert list(df) == ["name", "age"]  # verify the column names
def parse_query_result(query_job: QueryJob, query_rows: RowIterator) -> Rows:
    # First row holds the column names taken from the result schema.
    rows = [[field.name for field in query_job.result().schema]]
    # Remaining rows are the query results, stringified cell by cell.
    rows.extend([str(item) for item in row] for row in query_rows)
    return rows
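# Hypothetical convenience wrapper for parse_query_result above: the first
# returned row is the header (schema field names), the rest are stringified
# data rows.
def parse_finished_job(query_job: QueryJob) -> Rows:
    return parse_query_result(query_job, query_job.result())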
def test_to_arrow(method_kwargs):
    from google.cloud.bigquery.job import QueryJob as target_class

    begun_resource = _make_job_resource(job_type="query")
    query_resource = {
        "jobComplete": True,
        "jobReference": begun_resource["jobReference"],
        "totalRows": "4",
        "schema": {
            "fields": [
                {
                    "name": "spouse_1",
                    "type": "RECORD",
                    "fields": [
                        {"name": "name", "type": "STRING", "mode": "NULLABLE"},
                        {"name": "age", "type": "INTEGER", "mode": "NULLABLE"},
                    ],
                },
                {
                    "name": "spouse_2",
                    "type": "RECORD",
                    "fields": [
                        {"name": "name", "type": "STRING", "mode": "NULLABLE"},
                        {"name": "age", "type": "INTEGER", "mode": "NULLABLE"},
                    ],
                },
            ]
        },
        "rows": [
            {
                "f": [
                    {"v": {"f": [{"v": "Phred Phlyntstone"}, {"v": "32"}]}},
                    {"v": {"f": [{"v": "Wylma Phlyntstone"}, {"v": "29"}]}},
                ]
            },
            {
                "f": [
                    {"v": {"f": [{"v": "Bhettye Rhubble"}, {"v": "27"}]}},
                    {"v": {"f": [{"v": "Bharney Rhubble"}, {"v": "33"}]}},
                ]
            },
        ],
    }
    done_resource = copy.deepcopy(begun_resource)
    done_resource["status"] = {"state": "DONE"}
    connection = _make_connection(begun_resource, query_resource, done_resource)
    client = _make_client(connection=connection)
    job = target_class.from_api_repr(begun_resource, client)

    tbl = job.to_arrow(**method_kwargs)

    assert isinstance(tbl, pyarrow.Table)
    assert tbl.num_rows == 2

    # Check the schema.
    assert tbl.schema[0].name == "spouse_1"
    assert tbl.schema[0].type[0].name == "name"
    assert tbl.schema[0].type[1].name == "age"
    assert pyarrow.types.is_struct(tbl.schema[0].type)
    assert pyarrow.types.is_string(tbl.schema[0].type[0].type)
    assert pyarrow.types.is_int64(tbl.schema[0].type[1].type)
    assert tbl.schema[1].name == "spouse_2"
    assert tbl.schema[1].type[0].name == "name"
    assert tbl.schema[1].type[1].name == "age"
    assert pyarrow.types.is_struct(tbl.schema[1].type)
    assert pyarrow.types.is_string(tbl.schema[1].type[0].type)
    assert pyarrow.types.is_int64(tbl.schema[1].type[1].type)

    # Check the data.
    tbl_data = tbl.to_pydict()
    spouse_1 = tbl_data["spouse_1"]
    assert spouse_1 == [
        {"name": "Phred Phlyntstone", "age": 32},
        {"name": "Bhettye Rhubble", "age": 27},
    ]
    spouse_2 = tbl_data["spouse_2"]
    assert spouse_2 == [
        {"name": "Wylma Phlyntstone", "age": 29},
        {"name": "Bharney Rhubble", "age": 33},
    ]