def test_table_lookup(self):
    # type: () -> None
    dataset_ref1 = DatasetReference('my_project', 'dataset1')
    table_ref1 = TableReference(dataset_ref1, 'table1')
    table1 = Table(table_ref1, _TEST_SCHEMA)

    # Trying to get the same dataset/table in another project doesn't work.
    with self.assertRaisesRegexp(NotFound, 'other_project'):
        self.bq_client.get_table(
            TableReference(DatasetReference('other_project', 'dataset1'), 'table1'))

    # Trying to get the table before the dataset exists doesn't work.
    with self.assertRaisesRegexp(NotFound, 'dataset1'):
        self.bq_client.get_table(table_ref1)
    self.bq_client.create_dataset(Dataset(dataset_ref1))

    # Trying to get the table before the table exists doesn't work.
    with self.assertRaises(NotFound):
        self.bq_client.get_table(table_ref1)
    self.bq_client.create_table(table1)

    # Assert the created table has the expected properties.
    table_found = self.bq_client.get_table(table_ref1)
    self.assertEqual(table_found.project, "my_project")
    self.assertEqual(table_found.dataset_id, "dataset1")
    self.assertEqual(table_found.table_id, "table1")
    six.assertCountEqual(self, table_found.schema, _TEST_SCHEMA)
def test_listing_tables(self):
    # type: () -> None
    dataset_ref1 = DatasetReference('my_project', 'dataset1')
    self.bq_client.create_dataset(Dataset(dataset_ref1))
    self.bq_client.create_table(Table(TableReference(dataset_ref1, 'table1'), _TEST_SCHEMA))
    self.bq_client.create_table(Table(TableReference(dataset_ref1, 'table2'), []))
    six.assertCountEqual(
        self,
        [table_ref.table_id for table_ref in self.bq_client.list_tables(dataset_ref1)],
        ['table1', 'table2'])
def setUp(self):
    super(ClientWriteFromQueryTest, self).setUp()
    dataset_ref = DatasetReference(self.bq_client.project, 'my_dataset')
    schema = [
        SchemaField(name="a", field_type='INT64'),
        SchemaField(name="b", field_type='FLOAT64'),
    ]
    self.source_table = Table(TableReference(dataset_ref, 'source_table'), schema)
    self.destination_table = Table(TableReference(dataset_ref, 'destination_table'), schema)
    self.bq_client.create_dataset(Dataset(dataset_ref))
    self.bq_client.create_table(self.source_table)
    # We don't create the destination table here; some tests do not want it created.

    # Stick two rows into source_table.
    self.assertFalse(self.bq_client.insert_rows(
        self.source_table, [{'a': 1, 'b': 2.5}, {'a': 3, 'b': 4.25}]))
def __create_job_config(self, ems_query_job_config: EmsQueryJobConfig) -> QueryJobConfig:
    job_config = QueryJobConfig()
    job_config.priority = ems_query_job_config.priority.value
    job_config.use_legacy_sql = False
    job_config.use_query_cache = ems_query_job_config.use_query_cache
    job_config.labels = ems_query_job_config.labels
    if ems_query_job_config.destination_table is not None:
        job_config.time_partitioning = TimePartitioning("DAY")
        table_reference = TableReference(
            DatasetReference(
                ems_query_job_config.destination_project_id or self.__project_id,
                ems_query_job_config.destination_dataset),
            ems_query_job_config.destination_table)
        job_config.destination = table_reference
        job_config.write_disposition = ems_query_job_config.write_disposition.value
        job_config.create_disposition = ems_query_job_config.create_disposition.value
    partitioning = ems_query_job_config.time_partitioning
    if partitioning is not None:
        job_config.time_partitioning = TimePartitioning(
            partitioning.type.value,
            partitioning.field,
            partitioning.expiration_ms,
            partitioning.require_partition_filter)
    if ems_query_job_config.table_definitions is not None:
        job_config.table_definitions = ems_query_job_config.table_definitions
    return job_config
def table_exists(self, table: str) -> bool:
    try:
        self.__bigquery_client.get_table(table=TableReference.from_string(table))
        return True
    except NotFound:
        return False
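
# Minimal usage sketch (not from the original source): assumes a client wrapper
# instance `ems_client` exposing the table_exists() method above, and a
# hypothetical fully-qualified table ID.
if not ems_client.table_exists("my-project.my_dataset.events"):
    print("Table does not exist yet; skipping the load step.")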
def log_bigquery_job(job: Union[bigquery.LoadJob, bigquery.QueryJob],
                     table: bigquery.TableReference,
                     message: Optional[str] = None,
                     severity: Optional[str] = 'NOTICE'):
    """
    Creates a structured log which includes a BigQuery job and table reference

    :param job: the load or query job to log
    :param table: the table the job reads from or writes to
    :param message: optional override for the log message
    :param severity: log severity; forced to ERROR when the job has errors
    """
    if job.error_result:
        severity = "ERROR"
        message = message or "BigQuery Job had errors."
    elif severity == "ERROR":
        message = message or ("BigQuery Job completed"
                              " but is considered an error.")
    else:
        severity = "NOTICE"
        message = message or "BigQuery Job completed without errors."
    print(
        json.dumps(
            dict(
                message=message,
                severity=severity,
                job=job.to_api_repr(),
                table=table.to_api_repr(),
                errors=job.error_result,
            )))
def make_table(project, dataset_id, table_id, friendly_name=None, description=None,
               expires=None, partitioning_type=None, view_use_legacy_sql=None,
               view_query=None, schema=None, labels=None):
    dataset_ref = DatasetReference(project, dataset_id)
    table_ref = TableReference(dataset_ref, table_id)
    table = Table(table_ref)
    table.friendly_name = friendly_name
    table.description = description
    table.expires = expires
    table.partitioning_type = partitioning_type
    if view_use_legacy_sql is not None:
        table.view_use_legacy_sql = view_use_legacy_sql
    if view_query is not None:
        table.view_query = view_query
    table.schema = schema
    if labels is not None:
        table.labels = labels
    return table
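
# Minimal usage sketch (not from the original source): the project, dataset,
# table, and schema below are placeholders. create_table() is the standard
# google.cloud.bigquery.Client method.
from google.cloud import bigquery
from google.cloud.bigquery import SchemaField

client = bigquery.Client(project='my-project')
table = make_table('my-project', 'analytics', 'daily_events',
                   friendly_name='Daily events',
                   partitioning_type='DAY',
                   schema=[SchemaField('event_id', 'STRING')])
client.create_table(table)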
def _table_ref(self, table_id):
    """Return a BigQuery client library table reference"""
    from google.cloud.bigquery import DatasetReference
    from google.cloud.bigquery import TableReference

    return TableReference(
        DatasetReference(self.project_id, self.dataset_id), table_id)
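
# For reference, a sketch (not from the original source) of the equivalent
# construction via TableReference.from_string; the names are placeholders.
from google.cloud.bigquery import DatasetReference, TableReference

ref = TableReference.from_string("my-project.my_dataset.my_table")
assert ref == TableReference(DatasetReference("my-project", "my_dataset"), "my_table")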
def test_generate_queries(self, mock_table_cols, mock_table_dates, mock_sandbox):
    table_cols_dict = {
        "condition_occurrence": [
            "condition_start_date", "condition_start_datetime",
            "condition_end_date", "condition_end_datetime"
        ],
        "measurement": ["measurement_date", "measurement_datetime"],
        'drug_exposure': [
            'drug_exposure_start_date', 'drug_exposure_start_datetime',
            'drug_exposure_end_date', 'drug_exposure_end_datetime',
            'verbatim_end_date'
        ]
    }
    table_cols_df = pd.DataFrame.from_dict({
        "table_name": [
            "condition_occurrence", "_mapping_condition_occurrence",
            "measurement", "measurement_ext", "drug_exposure_ext",
            "drug_exposure"
        ]
    })
    mock_table_cols.return_value = table_cols_df
    mock_table_dates.return_value = table_cols_dict
    mock_sandbox.return_value = self.dataset_id + '_sandbox'
    pid_rid_table_ref = TableReference.from_string(
        f"{self.project_id}.{self.pids_dataset_id}.{self.pids_table}")
    deactivated_pids_table_ref = TableReference.from_string(
        f"{self.project_id}.{self.deactivated_pids_dataset_id}.{self.deactivated_pids_table}"
    )
    queries = rdp.generate_queries(
        client=self.mock_bq_client,
        project_id=self.project_id,
        dataset_id=self.dataset_id,
        sandbox_dataset_id=self.dataset_id,
        deact_pids_table_ref=deactivated_pids_table_ref,
        pid_rid_table_ref=pid_rid_table_ref,
    )
    mock_table_cols.assert_called_once_with(self.mock_bq_client, self.project_id,
                                            self.dataset_id)
    mock_table_dates.assert_called_once_with(table_cols_df)
    # count sandbox and clean queries
    self.assertEqual(len(table_cols_dict) * 2, len(queries))
def test_listing_tables_with_max(self):
    # type: () -> None
    dataset_ref1 = DatasetReference('my_project', 'dataset1')
    self.bq_client.create_dataset(Dataset(dataset_ref1))
    for i in range(10):
        self.bq_client.create_table(
            Table(TableReference(dataset_ref1, 'table{}'.format(i)), _TEST_SCHEMA))
    self.assertEqual(5, len(self.bq_client.list_tables(dataset_ref1, max_results=5)))
    self.assertEqual(10, len(self.bq_client.list_tables(dataset_ref1, max_results=20)))
    self.assertEqual(10, len(self.bq_client.list_tables(dataset_ref1)))
def __create_test_table(self, table_name, dataset_id):
    table_schema = [
        SchemaField("int_data", "INT64"),
        SchemaField("str_data", "STRING")
    ]
    table_reference = TableReference(dataset_id, table_name)
    test_table = Table(table_reference, table_schema)
    test_table.time_partitioning = TimePartitioning("DAY")
    self.__delete_if_exists(test_table)
    self.GCP_BIGQUERY_CLIENT.create_table(test_table)
    return test_table
def run_async_load_job(self, job_id_prefix: str, config: EmsLoadJobConfig) -> str:
    return self.__bigquery_client.load_table_from_uri(
        source_uris=config.source_uri_template,
        destination=TableReference(
            DatasetReference(config.destination_project_id,
                             config.destination_dataset),
            config.destination_table),
        job_id_prefix=job_id_prefix,
        location=self.__location,
        job_config=self.__create_load_job_config(config)).job_id
def run_async_extract_job(self, job_id_prefix: str, table: str,
                          destination_uris: List[str],
                          job_config: EmsExtractJobConfig) -> str:
    extract_job_config = self.__create_extract_job_config(job_config)
    return self.__bigquery_client.extract_table(
        source=TableReference.from_string(table_id=table),
        destination_uris=destination_uris,
        job_id_prefix=job_id_prefix,
        location=self.__location,
        job_config=extract_job_config).job_id
def list_item_from_table_id(table_id: str) -> TableListItem:
    """
    Get a table list item as returned by :meth:`bigquery.Client.list_tables`
    from a table ID

    :param table_id: A table ID including project ID, dataset ID, and table ID,
      each separated by ``.``.
    :return: a table list item
    """
    resource = {
        "tableReference": TableReference.from_string(table_id).to_api_repr()
    }
    return TableListItem(resource)
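
# Minimal usage sketch (not from the original source); the table ID below is
# a placeholder.
item = list_item_from_table_id("my-project.my_dataset.person")
print(item.project, item.dataset_id, item.table_id)  # my-project my_dataset person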
def get_raw_details(self, dataset_id, view_id):
    if self.did_full_update:
        return self.raw_details[dataset_id][view_id]
    if dataset_id not in self.raw_details:
        self.raw_details[dataset_id] = dict()
    if view_id not in self.raw_details[dataset_id]:
        try:
            self.raw_details[dataset_id][view_id] = self.bq_client._client.get_table(
                TableReference(
                    DatasetReference(self.bq_conductor_conf.GOOGLE_CLOUD_PROJECT,
                                     dataset_id.split('.')[-1]),
                    view_id)).to_api_repr()
        except:
            self.raw_details[dataset_id][view_id] = None
    return self.raw_details[dataset_id][view_id]
def log_api_error(table: bigquery.TableReference, message: str,
                  error: Union[GoogleAPIError, ClientError]):
    """
    Creates a structured api error log which includes a BigQuery table reference

    :param table:
    :param message:
    :param error:
    :return:
    """
    print(
        json.dumps(
            dict(message=message or error.message,
                 severity='ERROR',
                 table=table.to_api_repr(),
                 errors=error.errors or error.message or message)))
def handle_bq_lock(gcs_client: storage.Client, lock_blob: storage.Blob,
                   next_job_id: Optional[str], table: bigquery.TableReference,
                   retry_attempt_cnt: Optional[int] = None):
    """Reclaim the lock blob for the new job id (in-place) or delete the lock
    blob if next_job_id is None."""
    try:
        if next_job_id:
            lock_blob_contents = json.dumps(
                dict(job_id=next_job_id,
                     table=table.to_api_repr(),
                     retry_attempt_cnt=retry_attempt_cnt))
            logging.log_with_table(
                table,
                f"Writing the following content to lock_blob {lock_blob.name}:"
                f" {dict(job_id=next_job_id, table=table.to_api_repr(), retry_attempt_cnt=retry_attempt_cnt)}"
            )
            if lock_blob.exists(client=gcs_client):
                lock_blob.upload_from_string(
                    lock_blob_contents,
                    if_generation_match=lock_blob.generation,
                    client=gcs_client)
            else:
                # This happens when submitting the first job in the backlog
                lock_blob.upload_from_string(
                    lock_blob_contents,
                    if_generation_match=0,  # noqa: E126
                    client=gcs_client)
        else:
            logging.log_with_table(
                table, "releasing lock at: "
                f"gs://{lock_blob.bucket.name}/{lock_blob.name}")
            lock_blob.delete(
                if_generation_match=lock_blob.generation,
                client=gcs_client,
            )
    except (google.api_core.exceptions.PreconditionFailed,
            google.api_core.exceptions.NotFound) as err:
        if isinstance(err, google.api_core.exceptions.PreconditionFailed):
            raise exceptions.BacklogException(
                f"The lock at gs://{lock_blob.bucket.name}/{lock_blob.name} "
                f"was changed by another process.") from err
        logging.log_with_table(
            table,
            "Tried deleting a lock blob that was either already deleted "
            "or never existed.")
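
# Minimal usage sketch (not from the original source): the bucket name, lock
# object path, and job id are placeholders; handle_bq_lock() comes from the
# snippet above.
from google.cloud import bigquery, storage

gcs = storage.Client()
lock_blob = gcs.bucket("my-ingest-bucket").blob("my_dataset/my_table/_bqlock")
dest = bigquery.TableReference.from_string("my-project.my_dataset.my_table")
handle_bq_lock(gcs, lock_blob, next_job_id="load-job-0001", table=dest)
# Passing next_job_id=None instead releases (deletes) the lock blob.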
def send_to_bq(dataset, table, row):
    bigquery_client = bigquery.Client(project='theo-home')
    table_ref = TableReference(
        dataset_ref=DatasetReference(dataset_id=dataset, project='theo-home'),
        table_id=table,
    )
    schema = [SchemaField(name=field, field_type=bq_types[type(data)])
              for field, data in row.items()]
    table = bigquery_client.create_table(
        Table(table_ref, schema=schema), exists_ok=True
    )
    errors = bigquery_client.insert_rows(table, [row])
    if errors:
        print(errors, file=sys.stderr)
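
# Minimal usage sketch (not from the original source): assumes a module-level
# bq_types mapping such as {int: 'INT64', float: 'FLOAT64', str: 'STRING'},
# which the original snippet uses but does not show.
send_to_bq('sensor_data', 'temperature', {'room': 'office', 'celsius': 21.5})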
def to_table(dataset_ref, model):
    schema = model.schema
    if schema:
        schema = tuple(BigQuerySchemaField.to_schema_field(s) for s in schema)
    else:
        schema = None
    table_ref = TableReference(dataset_ref, model.table_id)
    table = Table(table_ref, schema)
    table.friendly_name = model.friendly_name
    table.description = model.description
    table.expires = model.expires
    table.partitioning_type = model.partitioning_type
    if model.view_use_legacy_sql is not None:
        table.view_use_legacy_sql = model.view_use_legacy_sql
    if model.view_query is not None:
        table.view_query = model.view_query
    table.labels = model.labels if model.labels is not None else dict()
    return table
def test_load_table_from_file(self, column_to_null):
    # type: (int) -> None
    dataset_ref = DatasetReference('my_project', 'my_dataset')
    dataset = Dataset(dataset_ref)
    table = Table(TableReference(dataset_ref, 'table1'), _TEST_SCHEMA)
    self.bq_client.create_dataset(dataset)
    self.bq_client.create_table(table)
    output = cStringIO()
    csv_out = csv.writer(output)
    input_row = list(self.INPUT_ROW)
    expected_row = list(self.EXPECTED_ROW)
    if column_to_null is not None:
        input_row[column_to_null] = 'NULL'
        expected_row[column_to_null] = None
    csv_out.writerow(input_row)
    self.bq_client.load_table_from_file(output, table.reference, job_config=None,
                                        rewind=True)
    self.assertRowsExpected(
        self.bq_client.query('SELECT * FROM `my_project.my_dataset.table1`',
                             QueryJobConfig()),
        [expected_row])
def log_with_table(
    table: bigquery.TableReference,
    message: str,
    severity: Optional[str] = 'NOTICE',
):
    """
    Creates a structured log which includes a BigQuery table reference

    :param table:
    :param message:
    :param severity:
    :return:
    """
    table_json = None
    if table is not None:
        table_json = table.to_api_repr()
    print(
        json.dumps(dict(
            message=message,
            severity=severity,
            table=table_json,
        )))
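
# Minimal usage sketch (not from the original source); the table ID is a
# placeholder.
log_with_table(
    bigquery.TableReference.from_string("my-project.my_dataset.events"),
    "Load batch submitted",
    severity="NOTICE")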
def test_delete_table(self):
    # type: () -> None
    dataset_ref1 = DatasetReference('my_project', 'dataset1')
    table_ref1 = TableReference(dataset_ref1, 'table1')

    # Can't delete table, dataset not created yet.
    with self.assertRaisesRegexp(NotFound, 'dataset1'):
        self.bq_client.delete_table(table_ref1)
    self.bq_client.create_dataset(Dataset(dataset_ref1))

    # Can't delete table, table not created yet.
    with self.assertRaisesRegexp(NotFound, 'table1'):
        self.bq_client.delete_table(table_ref1)

    table1 = Table(table_ref1, _TEST_SCHEMA)
    self.bq_client.create_table(table1)
    self.bq_client.delete_table(table_ref1)
    with self.assertRaisesRegexp(NotFound, 'table1'):
        self.bq_client.get_table(table_ref1)
def test_insert_rows(self):
    # type: () -> None
    dataset_ref = DatasetReference('my_project', 'my_dataset')
    dataset = Dataset(dataset_ref)
    table1_ref = TableReference(dataset_ref, 'table1')
    schema = [
        SchemaField(name="a", field_type='INT64'),
        SchemaField(name="b", field_type='FLOAT64'),
    ]
    table = Table(table1_ref, schema)
    self.bq_client.create_dataset(dataset)
    self.bq_client.create_table(table)

    # Insert two rows, check that they landed.
    self.assertFalse(
        self.bq_client.insert_rows(table, [{'a': 1, 'b': 2.5}, {'a': 3, 'b': 4.25}]))
    self.assertRowsExpected(
        self.bq_client.query('SELECT * FROM `my_project.my_dataset.table1`',
                             QueryJobConfig()),
        [[1, 2.5], [3, 4.25]])

    # Insert two more rows, check that all four rows are now present.
    self.assertFalse(
        self.bq_client.insert_rows(table, [{'a': 5, 'b': 6.5}, {'a': 7, 'b': 8.25}]))
    self.assertRowsExpected(
        self.bq_client.query('SELECT * FROM `my_project.my_dataset.table1`',
                             QueryJobConfig()),
        [[1, 2.5], [3, 4.25], [5, 6.5], [7, 8.25]])
def write_to_table_from_json_file(path: Path) -> LoadJob:
    table = TableReference(DatasetReference(PROJECT_NAME, BQ_DATASET), BQ_TABLE)
    bq = BQ()
    job_config = job.LoadJobConfig()
    # job_config.autodetect = True
    job_config.source_format = job.SourceFormat.NEWLINE_DELIMITED_JSON
    job_config.write_disposition = job.WriteDisposition.WRITE_APPEND
    job_config.create_disposition = job.CreateDisposition.CREATE_NEVER
    try:
        with path.open("rb") as f:
            load_job: LoadJob = bq.load_table_from_file(f, destination=table,
                                                        job_config=job_config)
        load_job.result()
        os.remove(str(path))
        logging.info(f"{load_job.job_id} completed at {load_job.ended}")
    except Exception as ex:
        logging.info(f"{load_job.job_id} crashed with {load_job.errors}")
    return load_job
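
# Minimal usage sketch (not from the original source): PROJECT_NAME, BQ_DATASET,
# BQ_TABLE and the BQ client wrapper are module-level names assumed by the
# snippet above; the file path is a placeholder.
result = write_to_table_from_json_file(Path("/tmp/events.ndjson"))
print(result.job_id, result.state)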
def create(self, table_id, schema):
    """Create a table in Google BigQuery given a table ID and schema

    Parameters
    ----------
    table_id : str
        Name of the table to be created
    schema : dict
        Use generate_bq_schema to generate your table schema from a
        dataframe.
    """
    from google.cloud.bigquery import DatasetReference
    from google.cloud.bigquery import Table
    from google.cloud.bigquery import TableReference

    if self.exists(table_id):
        raise TableCreationError(
            "Table {0} already exists".format(table_id)
        )

    if not _Dataset(self.project_id, credentials=self.credentials).exists(
        self.dataset_id
    ):
        _Dataset(
            self.project_id,
            credentials=self.credentials,
            location=self.location,
        ).create(self.dataset_id)

    table_ref = TableReference(
        DatasetReference(self.project_id, self.dataset_id), table_id
    )
    table = Table(table_ref)
    table.schema = pandas_gbq.schema.to_google_cloud_bigquery(schema)

    try:
        self.client.create_table(table)
    except self.http_error as ex:
        self.process_http_error(ex)
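
# Minimal usage sketch (not from the original source): assumes a table
# connector instance named `tables` from the surrounding module; the schema
# follows the pandas-gbq {"fields": [...]} convention.
tables.create("new_table", {"fields": [{"name": "id", "type": "INTEGER"},
                                       {"name": "name", "type": "STRING"}]})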
def test_dataset_delete(self):
    # type: () -> None
    dataset1 = Dataset(DatasetReference('my_project', 'dataset1'))

    # Can't delete dataset, doesn't exist yet.
    with self.assertRaisesRegexp(NotFound, 'dataset1'):
        self.bq_client.delete_dataset(dataset1.reference)

    self.bq_client.create_dataset(dataset1)
    self.bq_client.create_table(
        Table(TableReference(dataset1.reference, 'table1'), _TEST_SCHEMA))

    # Can't delete dataset, not empty.
    with self.assertRaises(BadRequest):
        self.bq_client.delete_dataset(dataset1.reference)

    # Okay to delete, specifically requesting to delete contents.
    self.bq_client.delete_dataset(dataset1.reference, delete_contents=True)

    # And now dataset is gone again.
    with self.assertRaisesRegexp(NotFound, 'dataset1'):
        self.bq_client.get_dataset(DatasetReference('my_project', 'dataset1'))
def __inner(s, i, client):
    # check if weather station (data) exists in BQ
    # not all stations are active now/data may not be available
    # Supply NaN's for missing weather columns
    log.info(f"Download data for station: {i}, site: {s}")
    this_station_ref = TableReference(dataset_ref_, str(s))
    if client.table_exists(this_station_ref):
        df = client.bq_to_df(this_station_ref)
        # remove any null year/month/day, sometimes we get dirty data
        df = df[(~df['year'].isnull()) & (~df['month'].isnull()) &
                (~df['day'].isnull())]
        df.index = df.apply(
            lambda x: pd.datetime(int(x['year']), int(x['month']), int(x['day'])),
            axis=1)
        for dt in weather_data_types:
            if dt not in df.columns:
                df[dt] = np.nan
        df = df_index.join(df)
        return df[weather_data_types]
    return pd.DataFrame()  # else return empty dataframe
def load_from_gcs(self, table_id, input_uri):
    job_results = {}
    client = self.bq_client
    project_id, ds_id, table_name = table_id.split(".")
    dataset = DatasetReference(project_id, ds_id)
    output_table = TableReference(dataset, table_name)
    ending = input_uri.split(".")[-1]
    file_type = self.__get_file_type(ending)
    config = LoadJobConfig(autodetect=True,
                           write_disposition=WriteDisposition.WRITE_APPEND,
                           source_format=file_type)
    load_job = LoadJob(self.__create_job_id(project_id, "loadJob"),
                       input_uri, output_table, client, job_config=config)
    job_results["loadJob"] = self.__get_results(load_job)
    return job_results
def external_query(  # pylint: disable=too-many-arguments
        gcs_client: storage.Client, bq_client: bigquery.Client, gsurl: str,
        query: str, job_id: str, table: bigquery.TableReference):
    """Load from query over external table from GCS.

    This hinges on a SQL query defined in GCS at _config/*.sql and
    an external table definition
    _config/{constants.BQ_EXTERNAL_TABLE_CONFIG_FILENAME}
    (otherwise will assume PARQUET external table)
    """
    external_table_config = read_gcs_file_if_exists(
        gcs_client,
        f"{gsurl}_config/{constants.BQ_EXTERNAL_TABLE_CONFIG_FILENAME}")
    if not external_table_config:
        external_table_config = look_for_config_in_parents(
            gcs_client, gsurl, constants.BQ_EXTERNAL_TABLE_CONFIG_FILENAME)

    if external_table_config:
        external_table_def = json.loads(external_table_config)
    else:
        print(f" {gsurl}_config/{constants.BQ_EXTERNAL_TABLE_CONFIG_FILENAME} "
              f"not found in parents of {gsurl}. "
              "Falling back to default PARQUET external table: "
              f"{json.dumps(constants.DEFAULT_EXTERNAL_TABLE_DEFINITION)}")
        external_table_def = constants.DEFAULT_EXTERNAL_TABLE_DEFINITION

    print(
        json.dumps(
            dict(message="Found external table definition.",
                 table=table.to_api_repr(),
                 external_table_def=external_table_def)))

    # Reduce the amount of sourceUris by using wildcards with common
    # prefixes. This is done to keep the cloud logging audit metadata
    # below 100k in size, otherwise the metadata is omitted in the event.
    source_uris_with_wildcards = compact_source_uris_with_wildcards(
        flatten2dlist(get_batches_for_gsurl(gcs_client, gsurl)))

    # This may cause an issue if >10,000 files.
    external_table_def["sourceUris"] = source_uris_with_wildcards

    # Check if hivePartitioningOptions
    # https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#hivepartitioningoptions
    # is set in external.json file
    if external_table_def.get("hivePartitioningOptions"):
        external_table_def["hivePartitioningOptions"] = {
            "mode":
                external_table_def["hivePartitioningOptions"].get("mode")
                or "AUTO",
            "sourceUriPrefix":
                get_hive_partitioning_source_uri_prefix(
                    external_table_def["sourceUris"][0])
        }

    external_config = bigquery.ExternalConfig.from_api_repr(external_table_def)
    job_config = bigquery.QueryJobConfig(
        table_definitions={"temp_ext": external_config}, use_legacy_sql=False)

    # drop partition decorator if present.
    table_id = table.table_id.split("$")[0]

    # similar syntax to str.format but doesn't require escaping braces
    # elsewhere in query (e.g. in a regex)
    rendered_query = query.replace(
        "{dest_dataset}", f"`{table.project}`.{table.dataset_id}").replace(
            "{dest_table}", table_id)

    job: bigquery.QueryJob = bq_client.query(rendered_query,
                                             job_config=job_config,
                                             job_id=job_id)

    logging.log_bigquery_job(job, table,
                             f"Submitted asynchronous query job: {job.job_id}")

    start_poll_for_errors = time.monotonic()
    # Check if job failed quickly
    while time.monotonic() - start_poll_for_errors < constants.WAIT_FOR_JOB_SECONDS:
        job.reload(client=bq_client)
        if job.state == "DONE":
            check_for_bq_job_and_children_errors(bq_client, job, table)
            return
        time.sleep(constants.JOB_POLL_INTERVAL_SECONDS)
    ch.setFormatter(formatter)
    # add ch to logger
    log.addHandler(ch)
    return log


if __name__ == '__main__':
    log = _setup_logging()
    bqclient = BQClient()
    dataset_id = 'weather'
    dataset_ref_ = bqclient.client.dataset(dataset_id)
    stations_ref = TableReference(dataset_ref_, 'stations_site_list')
    # https://www.matthewproctor.com/australian_postcodes
    postcodes_ref = TableReference(dataset_ref_, 'australian_postcodes')
    stations = bqclient.bq_to_df(stations_ref)
    postcodes = bqclient.bq_to_df(postcodes_ref)
    kdtree = cKDTree(stations[['lat', 'lon']])

    weather_file_on_disc = 'historical_weather.pk'
    import pickle
    if Path(weather_file_on_disc).exists():
        historical_weather = pickle.load(open(weather_file_on_disc, 'rb'))
    else: