from google.api_core.exceptions import NotFound
from google.cloud import bigquery as bq


def exist_dataset(client: bq.Client, dataset_id: str) -> bool:
    """Returns True if the dataset exists, False otherwise."""
    dataset_full_id = get_full_dataset_name(client, dataset_id)
    try:
        client.get_dataset(dataset_full_id)
    except NotFound:
        return False
    return True
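# A minimal usage sketch for exist_dataset, assuming a client on the default
# project and a hypothetical dataset id "analytics"; get_full_dataset_name is
# the external helper referenced above, not defined in this snippet.
def _example_exist_dataset() -> None:
    client = bq.Client()
    if not exist_dataset(client, "analytics"):
        client.create_dataset("analytics")  # create only when missing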
import contextlib

from google.api_core.exceptions import NotFound
from google.cloud import bigquery


@contextlib.contextmanager
def dataset(bq: bigquery.Client, dataset_id: str):
    """Context manager for creating and deleting the BigQuery dataset for a test."""
    try:
        bq.get_dataset(dataset_id)
    except NotFound:
        bq.create_dataset(dataset_id)
    try:
        # Client.dataset() returns a DatasetReference (deprecated in newer
        # client versions; DatasetReference can also be built directly).
        yield bq.dataset(dataset_id)
    finally:
        # Always clean up, including any tables created during the test.
        bq.delete_dataset(dataset_id, delete_contents=True)
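# A usage sketch for the dataset() context manager, assuming a default client
# and a hypothetical test dataset id; the dataset and everything in it are
# deleted when the block exits, even if the test body raises.
def _example_dataset_fixture() -> None:
    client = bigquery.Client()
    with dataset(client, "temp_test_dataset") as dataset_ref:
        table_ref = dataset_ref.table("results")  # scratch table for the test
        client.create_table(
            bigquery.Table(table_ref,
                           schema=[bigquery.SchemaField("name", "STRING")]))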
import time

from google.api_core.exceptions import NotFound
from google.cloud import bigquery
from google.cloud.bigquery import Client


def _get_table_id_for_new_entity(client: Client, project: str, dataset_name: str) -> str:
    """Gets the table_id for the new entity to be uploaded."""
    # First create the BigQuery dataset if it doesn't exist.
    dataset = bigquery.Dataset(f"{client.project}.{dataset_name}")
    dataset.location = "US"
    try:
        client.get_dataset(dataset)
    except NotFound:
        # Only create the dataset if it does not exist.
        client.create_dataset(dataset, exists_ok=True)
    return f"{client.project}.{dataset_name}.entity_df_{project}_{int(time.time())}"
import logging
from typing import Dict

from google.cloud import bigquery


def execute_query(bq_client: bigquery.Client,
                  env_vars: Dict[str, str],
                  query_path: object,
                  output_table_name: str,
                  time_partition: bool) -> None:
    """Executes transformation query to a new destination table.

    Args:
        bq_client: bigquery.Client object.
        env_vars: Dictionary of key: value, where value is environment variable.
        query_path: Object representing location of SQL query to execute.
        output_table_name: String representing name of table that holds output.
        time_partition: Boolean indicating whether to time-partition output.
    """
    dataset_ref = bq_client.get_dataset(
        bigquery.DatasetReference(project=bq_client.project,
                                  dataset_id=env_vars['corrected_dataset_id']))
    table_ref = dataset_ref.table(output_table_name)
    job_config = bigquery.QueryJobConfig()
    job_config.destination = table_ref
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE

    # Time-partitioning the table is only needed for the final output query.
    if time_partition:
        job_config.time_partitioning = bigquery.TimePartitioning(
            field='usage_start_time', expiration_ms=None)

    logging.info('Attempting query...')
    # Execute the query and wait for it to finish.
    query_job = bq_client.query(query=render_template(query_path, env_vars),
                                job_config=job_config)
    query_job.result()
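# A hedged invocation sketch for execute_query; the dataset id, query path, and
# table name below are hypothetical, and render_template is the external
# templating helper the function relies on.
def _example_execute_query() -> None:
    client = bigquery.Client()
    env_vars = {'corrected_dataset_id': 'billing_corrected'}
    execute_query(client, env_vars,
                  query_path='queries/final_output.sql',
                  output_table_name='corrected_usage',
                  time_partition=True)  # partition final output on usage_start_time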
from google.api_core.exceptions import NotFound
from google.cloud import bigquery
from google.cloud.bigquery import Client


def _get_table_reference_for_new_entity(client: Client,
                                        dataset_project: str,
                                        dataset_name: str) -> str:
    """Gets the table_id for the new entity to be uploaded."""
    # First create the BigQuery dataset if it doesn't exist.
    dataset = bigquery.Dataset(f"{dataset_project}.{dataset_name}")
    dataset.location = "US"
    try:
        client.get_dataset(dataset)
    except NotFound:
        # Only create the dataset if it does not exist.
        client.create_dataset(dataset, exists_ok=True)

    # offline_utils is Feast's offline-store helper module (external dependency).
    table_name = offline_utils.get_temp_entity_table_name()
    return f"{dataset_project}.{dataset_name}.{table_name}"
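# A sketch of how the helper above is typically called (the dataset name is a
# placeholder): the returned id appears intended as the destination table for
# uploading a temporary entity dataframe.
def _example_temp_entity_table() -> None:
    client = Client()
    table_id = _get_table_reference_for_new_entity(
        client, client.project, "feast_temp")  # hypothetical dataset name
    print(f"upload entity dataframe to {table_id}")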
from typing import Optional

from google.cloud import bigquery


def get_bq_dataset(
    client: bigquery.Client,
    dataset_id: str,
    project_id: Optional[str] = None,
) -> bigquery.Dataset:
    # If `project_id` is None, the default project of `client` is used.
    dataset_ref = client.dataset(
        dataset_id, project=project_id)  # type: bigquery.DatasetReference
    # API request
    return client.get_dataset(dataset_ref)  # type: bigquery.Dataset
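# A usage sketch, assuming credentials resolve a default project; passing an
# explicit project_id instead reads a dataset owned by another project. The
# dataset id is a placeholder.
def _example_get_bq_dataset() -> None:
    client = bigquery.Client()
    dataset = get_bq_dataset(client, "my_dataset")
    print(dataset.full_dataset_id, dataset.location)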
from google.api_core.exceptions import NotFound
from google.cloud import bigquery


def get_or_create_table(client: bigquery.Client) -> bigquery.Table:
    try:
        dataset = client.get_dataset("sensors")
    except NotFound:
        dataset = client.create_dataset("sensors")

    # The default project ID is not set and hence a fully-qualified ID is required.
    table_ref = bigquery.TableReference(dataset.reference, table_id="particulate_matter")
    try:
        return client.get_table(table_ref)
    except NotFound:
        return client.create_table(
            bigquery.Table(
                table_ref,
                schema=[
                    bigquery.SchemaField("humidity", "NUMERIC",
                                         description="Sensor DHT22 humidity in %"),
                    bigquery.SchemaField("max_micro", "NUMERIC", description=""),
                    bigquery.SchemaField("min_micro", "NUMERIC", description=""),
                    bigquery.SchemaField("samples", "NUMERIC", description=""),
                    bigquery.SchemaField("sds_p1", "NUMERIC",
                                         description="Sensor SDS011 PM10 in µg/m³"),
                    bigquery.SchemaField("sds_p2", "NUMERIC",
                                         description="Sensor SDS011 PM2.5 in µg/m³"),
                    bigquery.SchemaField("signal", "NUMERIC",
                                         description="WiFi signal strength in dBm"),
                    bigquery.SchemaField("temperature", "NUMERIC",
                                         description="Sensor DHT22 temperature in °C"),
                    bigquery.SchemaField("datetime", "DATETIME",
                                         description="Datetime of measurement",
                                         mode="REQUIRED"),
                ],
            ))
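# A sketch of writing one measurement to the table above; the row values are
# made up, and insert_rows_json reports per-row errors rather than raising.
def _example_insert_measurement() -> None:
    client = bigquery.Client()
    table = get_or_create_table(client)
    errors = client.insert_rows_json(table, [{
        "humidity": 54.2, "sds_p1": 12.4, "sds_p2": 7.9,
        "datetime": "2024-01-01T12:00:00",  # REQUIRED field in the schema
    }])
    if errors:
        raise RuntimeError(f"insert failed: {errors}")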
from typing import Iterator, Optional

from google.cloud.bigquery import Client, Table


def get_tables(project_id: str,
               client: Client,
               dataset_id: Optional[str] = None) -> Iterator[Table]:
    """Gets BigQuery tables from a Google Cloud project.

    Args:
        project_id (str): ID of the project.
        client (Client): A Google Cloud Client instance.
        dataset_id (Optional[str]): The ID of the dataset. If None, tables are
            retrieved from all datasets in the project.

    Yields:
        Table: A BigQuery table.
    """
    dataset_refs = ([f"{project_id}.{dataset_id}"] if dataset_id else
                    (dataset.reference
                     for dataset in client.list_datasets(project=project_id)))
    datasets = (client.get_dataset(dataset_ref) for dataset_ref in dataset_refs)
    for dataset in datasets:
        for table in client.list_tables(dataset):
            yield client.get_table(table)
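# A usage sketch iterating tables in one dataset (project and dataset ids are
# placeholders); omitting dataset_id walks every dataset in the project instead.
def _example_list_tables() -> None:
    client = Client()
    for table in get_tables("my-project", client, dataset_id="analytics"):
        print(table.full_table_id, table.num_rows)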