def bigquery_dataset(bigquery_client: bigquery.Client,
                     bigquery_schema: List[bigquery.SchemaField]):
    """Ensure the ``test_pybigquery`` dataset and its sample objects exist.

    Creates the dataset (tolerating an existing one), loads the ``sample``
    and ``sample_one_row`` tables when a lookup reports them missing, and
    creates the ``sample_view`` view unless it already exists.

    Args:
        bigquery_client: Client used for every API call; the dataset lives in
            its project.
        bigquery_schema: Schema forwarded to ``load_sample_data``.

    Returns:
        The dataset id, ``"test_pybigquery"``.
    """
    project_id = bigquery_client.project
    dataset_id = "test_pybigquery"
    qualified_dataset = f"{project_id}.{dataset_id}"
    bigquery_client.create_dataset(
        bigquery.Dataset(qualified_dataset), exists_ok=True
    )

    # The data changes rarely and the tests are mostly read-only, so a table
    # is only loaded when it does not exist yet.
    # TODO: Create shared sample data tables in bigquery-public-data that
    # include test values for all data types.
    sample_tables = (
        ("sample", {}),
        ("sample_one_row", {"filename": "sample_one_row.json"}),
    )
    for table_name, load_kwargs in sample_tables:
        table_id = f"{qualified_dataset}.{table_name}"
        try:
            bigquery_client.get_table(table_id)
        except google.api_core.exceptions.NotFound:
            load_job = load_sample_data(
                table_id, bigquery_client, bigquery_schema, **load_kwargs
            )
            load_job.result()

    view = bigquery.Table(f"{qualified_dataset}.sample_view")
    view.view_query = f"SELECT string FROM `{dataset_id}.sample`"
    bigquery_client.create_table(view, exists_ok=True)
    return dataset_id
def _table_exists(self, bq: bigquery.Client,
                  table_ref: bigquery.TableReference) -> bool:
    """Report whether ``table_ref`` can be fetched through ``bq``.

    Args:
        bq: Client whose ``get_table`` performs the lookup.
        table_ref: Reference of the table to look up.

    Returns:
        ``False`` when the lookup raises ``NotFound``, ``True`` when it
        succeeds. Any other exception propagates.
    """
    try:
        bq.get_table(table_ref)
    except NotFound:
        return False
    else:
        return True
def exist_table(client: bq.Client, dataset_id: str, table_id: str) -> bool:
    """Tell whether the given table can be fetched with ``client``.

    The table name is built by ``get_full_table_name`` from the client,
    dataset id and table id.

    Returns:
        ``True`` if ``client.get_table`` succeeds, ``False`` if it raises
        ``NotFound``.
    """
    qualified_name = get_full_table_name(client, dataset_id, table_id)
    try:
        client.get_table(qualified_name)
        found = True
    except NotFound:
        found = False
    return found
def clean_up_bq_tables(client: cloud_bigquery.Client,
                       table_names: List[str]) -> None:
    """Delete every table in ``table_names``, ignoring tables that are absent.

    Args:
        client: BigQuery client used to issue the deletions.
        table_names: Table identifiers, each passed as-is to
            ``client.delete_table``.
    """
    for table_name in table_names:
        # ``not_found_ok`` makes the delete itself tolerate a missing table,
        # so no preliminary ``get_table`` round trip is needed and there is no
        # window between the existence check and the delete.
        client.delete_table(table_name, not_found_ok=True)
def does_bigquery_table_exist(client: bigquery.Client, dataset_name: str,
                              table_name: str):
    """Return whether ``dataset_name.table_name`` can be fetched by ``client``.

    Args:
        client: BigQuery client used to build the reference and look it up.
        dataset_name: Name of the dataset expected to hold the table.
        table_name: Name of the table to look for.

    Returns:
        ``True`` when ``client.get_table`` succeeds, ``False`` when it raises
        ``NotFound``.
    """
    table_ref = client.dataset(dataset_name).table(table_name)
    try:
        client.get_table(table_ref)
    except NotFound:
        return False
    return True
def bigquery_alt_dataset(bigquery_client: bigquery.Client,
                         bigquery_schema: List[bigquery.SchemaField]):
    """Ensure the ``test_pybigquery_alt`` dataset and its sample table exist.

    Creates the dataset (tolerating an existing one) and loads the
    ``sample_alt`` table when a lookup reports it missing.

    Args:
        bigquery_client: Client used for every API call; the dataset lives in
            its project.
        bigquery_schema: Schema forwarded to ``load_sample_data``.

    Returns:
        The dataset id, ``"test_pybigquery_alt"``.
    """
    project_id = bigquery_client.project
    dataset_id = "test_pybigquery_alt"
    qualified_dataset = f"{project_id}.{dataset_id}"
    bigquery_client.create_dataset(
        bigquery.Dataset(qualified_dataset), exists_ok=True
    )

    table_id = f"{qualified_dataset}.sample_alt"
    try:
        bigquery_client.get_table(table_id)
    except google.api_core.exceptions.NotFound:
        # Only load the sample data when the table is not there yet.
        load_job = load_sample_data(table_id, bigquery_client, bigquery_schema)
        load_job.result()
    return dataset_id
def load_bigquery_table_via_bq_apis(bq_client: bigquery.Client, dataset_id,
                                    table_name, imported_data_info, src_uris):
    """
    Start a BigQuery load job for ``src_uris``, configured from a BQ DTS
    ImportedDataInfo, creating the target table first when it is missing.

    :param bq_client: client used for the table lookup/creation and the load
    :param dataset_id: dataset holding the target table
    :param table_name: target table name; when it contains ``$`` a newly
        created table is day-partitioned and named by the part before ``$``
    :param imported_data_info: mapping read for ``table_defs`` (first entry
        only) and ``destination_table_description``
    :param src_uris: source URIs handed to ``load_table_from_uri``
    :return: the job returned by ``load_table_from_uri``; it is not waited on
    """
    # https://googlecloudplatform.github.io/google-cloud-python/latest/_modules/google/cloud/bigquery/client.html#Client.load_table_from_uri

    # Only the first table definition is used.
    table_def = imported_data_info['table_defs'][0]

    # Create the destination table when the lookup says it does not exist.
    destination = bq_client.dataset(dataset_id).table(table_name)
    try:
        bq_client.get_table(destination)
    except exceptions.NotFound:
        new_table = bigquery.Table(
            destination,
            schema=RPCRecordSchema_to_GCloudSchema(table_def['schema']))
        new_table.description = imported_data_info[
            'destination_table_description']

        if '$' in table_name:
            new_table.partitioning_type = 'DAY'
            # Create the table under the name that precedes the ``$``.
            base_table_id = table_name.partition('$')[0]
            new_table._properties['tableReference']['tableId'] = base_table_id

        bq_client.create_table(new_table)

    # Job id: table name plus current UTC timestamp, passed through
    # BQ_JOB_ID_MATCHER with matches replaced by '___'.
    timestamp = datetime.datetime.utcnow().isoformat()
    job_id = BQ_JOB_ID_MATCHER.sub('___', f'{table_name}_{timestamp}')

    load_config = DTSTableDefinition_to_BQLoadJobConfig(table_def)

    return bq_client.load_table_from_uri(source_uris=src_uris,
                                         destination=destination,
                                         job_id=job_id,
                                         job_config=load_config)
def load_data_to_bq(df=None, table_name='CRY', dataset='price_data', project=None):
    """Append new rows from ``df`` to a BigQuery table.

    When the target table is empty the frame is appended as-is. Otherwise the
    latest stored row of every symbol present in ``df`` is deleted, and then
    only the rows of ``df`` dated after the remaining stored maximum for their
    symbol (or whose symbol is not stored at all) are appended.

    Args:
        df: DataFrame with ``symbol`` and ``date`` columns; ``date`` is
            localized to UTC for the comparison. ``None`` means there is
            nothing to load and the call returns immediately.
        table_name: Name of the target table.
        dataset: Dataset containing the target table.
        project: Project id forwarded to ``DataFrame.to_gbq``; ``None``
            leaves the project choice to ``to_gbq``.

    Returns:
        None.
    """
    if df is None:
        # Nothing to upload. Without this guard the call always failed with
        # AttributeError on ``df.symbol``.
        return

    client = Client()
    table_ref = ".".join([dataset, table_name])
    table = client.get_table(".".join([client.project, table_ref]))

    if table.num_rows == 0:
        # Pass ``project`` here too so both upload paths target the same
        # project.
        df.to_gbq(table_ref, if_exists='append', project_id=project)
        return

    # NOTE(review): symbols are spliced into the SQL text unescaped; confirm
    # they can never contain quotes, or switch to query parameters.
    symbol_list = '("' + '","'.join(df.symbol.unique()) + '")'
    delete_qry = (
        f'DELETE FROM `{table_ref}` AS t2 '
        f'WHERE concat(symbol, cast(date as string)) IN '
        f'(SELECT concat(symbol, cast(MAX(date) as string)) '
        f'FROM `{table_ref}` GROUP BY symbol) '
        f'AND symbol IN {symbol_list}'
    )
    client.query(delete_qry).result()

    existing = pd.read_gbq(
        f'select symbol, max(date) as max_date from {table_ref} group by symbol',
        dialect="legacy")

    # Keep only rows newer than what is stored (or for unseen symbols).
    df = df.merge(existing, on='symbol', how='left')
    is_new = (df.date.dt.tz_localize('UTC') > df.max_date) | df.max_date.isnull()
    # Non-inplace drop: the filtered frame is a slice of the merged frame.
    df = df.loc[is_new, :].drop('max_date', axis=1)
    df.to_gbq(table_ref, if_exists='append', project_id=project)
def dest_partitioned_table(request, bq: bigquery.Client, mock_env,
                           dest_dataset) -> bigquery.Table:
    """Create a uniquely named, hour-partitioned table and register its removal.

    The schema is copied from the public
    ``bigquery-public-data.new_york_311.311_service_requests`` table. The new
    table is partitioned by hour on ``created_date`` and is placed in the
    project named by the ``GCP_PROJECT`` environment variable, inside
    ``dest_dataset``.

    Args:
        request: Object whose ``addfinalizer`` registers the teardown.
        bq: Client used to read the public table and create/delete the new one.
        mock_env: Not referenced in the body.
        dest_dataset: Object providing the ``dataset_id`` for the new table.

    Returns:
        The table as returned by ``bq.create_table``.
    """
    source_ref = bigquery.TableReference.from_string(
        "bigquery-public-data.new_york_311.311_service_requests")
    source_schema = bq.get_table(source_ref).schema

    project = os.environ.get('GCP_PROJECT')
    unique_suffix = str(uuid.uuid4()).replace('-', '_')
    table_id = (f"{project}.{dest_dataset.dataset_id}"
                f".cf_test_nyc_311_{unique_suffix}")

    partitioning = bigquery.TimePartitioning()
    partitioning.type_ = bigquery.TimePartitioningType.HOUR
    partitioning.field = "created_date"

    table: bigquery.Table = bigquery.Table(table_id, schema=source_schema)
    table.time_partitioning = partitioning
    table = bq.create_table(table)

    # Remove the table again at teardown; a missing table is not an error.
    request.addfinalizer(lambda: bq.delete_table(table, not_found_ok=True))
    return table
def _upload_entity_df(
    client: Client,
    table_name: str,
    entity_df: Union[pd.DataFrame, str],
) -> Table:
    """Materialise an entity dataframe or SQL query as a BigQuery table.

    A string is run as ``CREATE TABLE {table_name} AS (...)``; a DataFrame has
    its index reset in place and is loaded with ``load_table_from_dataframe``.
    After the job finishes, the table's expiry is set to 30 minutes from now.

    Returns:
        The table as fetched before the expiry update was sent.

    Raises:
        InvalidEntityType: if ``entity_df`` is neither a ``str`` nor a
            ``pd.DataFrame``.
    """
    if not isinstance(entity_df, (str, pd.DataFrame)):
        raise InvalidEntityType(type(entity_df))

    if isinstance(entity_df, str):
        job = client.query(f"CREATE TABLE {table_name} AS ({entity_df})")
    else:
        # Drop the index so that we don't have unnecessary columns
        entity_df.reset_index(drop=True, inplace=True)
        job = client.load_table_from_dataframe(entity_df, table_name)

    block_until_done(client, job)

    # Ensure that the table expires after some time
    uploaded = client.get_table(table=table_name)
    uploaded.expires = datetime.utcnow() + timedelta(minutes=30)
    client.update_table(uploaded, ["expires"])

    return uploaded
def _upload_entity_df_into_bigquery(
    client: Client,
    project: str,
    dataset_name: str,
    entity_df: Union[pandas.DataFrame, str],
) -> Table:
    """Upload an entity dataframe or SQL query into a new BigQuery table.

    The table id comes from ``_get_table_id_for_new_entity``. A string is run
    as ``CREATE TABLE {table_id} AS (...)``; a DataFrame has its index reset
    in place and is loaded with ``load_table_from_dataframe``. Once the job
    has finished, the table's expiry is set to 30 minutes from now.

    Args:
        client: BigQuery client used for all requests.
        project: Passed to ``_get_table_id_for_new_entity``.
        dataset_name: Passed to ``_get_table_id_for_new_entity``.
        entity_df: SQL query text or pandas DataFrame.

    Returns:
        The table as fetched before the expiry update was sent.

    Raises:
        ValueError: if ``entity_df`` is neither a string nor a DataFrame.
    """
    table_id = _get_table_id_for_new_entity(client, project, dataset_name)

    # isinstance (rather than an exact type comparison) also accepts str
    # subclasses as queries.
    if isinstance(entity_df, str):
        job = client.query(f"CREATE TABLE {table_id} AS ({entity_df})")
    elif isinstance(entity_df, pandas.DataFrame):
        # Drop the index so that we dont have unnecessary columns
        entity_df.reset_index(drop=True, inplace=True)

        # Upload the dataframe into BigQuery, creating a temporary table
        job_config = bigquery.LoadJobConfig()
        job = client.load_table_from_dataframe(entity_df,
                                               table_id,
                                               job_config=job_config)
    else:
        raise ValueError(
            f"The entity dataframe you have provided must be a Pandas DataFrame or BigQuery SQL query, "
            f"but we found: {type(entity_df)} ")

    # Wait for whichever job was started.
    job.result()

    # Ensure that the table expires after some time
    table = client.get_table(table=table_id)
    table.expires = datetime.utcnow() + timedelta(minutes=30)
    client.update_table(table, ["expires"])

    return table
def _add_new_columns(client: bigquery.Client, table_id: str,
                     columns: List[str]) -> List[bigquery.SchemaField]:
    """Adds any new columns if they are missing.

    Creates a new STRING column for every name in ``columns`` that the table
    does not have yet. Columns are matched by name (case-insensitively), so an
    existing column is never re-added just because its type, mode or
    description differs from a plain STRING field. Names repeated inside
    ``columns`` are added once.

    Args:
      client: The BigQuery client.
      table_id: Table id.
      columns: List of columns.

    Returns:
      The table schema.

    Raises:
      google.api_core.exceptions.NotFound: if the table does not exist (an
        error is logged before re-raising).
    """
    try:
        table = client.get_table(table_id)
    except google.api_core.exceptions.NotFound:
        logging.error(
            "Table: '%s' not found - please create the table. It is okay to create it with no columns.",
            table_id)
        raise

    known_names = {field.name.lower() for field in table.schema}
    new_fields = []
    for column in columns:
        key = column.lower()
        if key in known_names:
            continue
        known_names.add(key)
        new_fields.append(bigquery.SchemaField(column, 'STRING'))

    if new_fields:
        logging.info('Found new fields: %s', new_fields)
        table.schema = list(table.schema) + new_fields
        client.update_table(table, ['schema'])

    return table.schema
def _upload_entity_df_and_get_entity_schema(
    client: Client,
    table_name: str,
    entity_df: Union[pd.DataFrame, str],
) -> Dict[str, np.dtype]:
    """Upload an entity dataframe or SQL query to BigQuery; return its dtypes.

    A string is run as ``CREATE TABLE {table_name} AS (...)`` and the schema
    is read from a one-row sample of the created table. A DataFrame has its
    index reset in place, is loaded with ``load_table_from_dataframe``, and
    the schema is taken from the frame itself. In both cases the table's
    expiry is then set to 30 minutes from now.

    Returns:
        Mapping of column name to pandas dtype.

    Raises:
        InvalidEntityType: if ``entity_df`` is neither a string nor a
            DataFrame.
    """
    # isinstance (rather than an exact type comparison) also accepts str
    # subclasses as queries.
    if isinstance(entity_df, str):
        job = client.query(f"CREATE TABLE {table_name} AS ({entity_df})")
        block_until_done(client, job)

        limited_entity_df = (
            client.query(f"SELECT * FROM {table_name} LIMIT 1")
            .result()
            .to_dataframe()
        )
        entity_schema = dict(
            zip(limited_entity_df.columns, limited_entity_df.dtypes))
    elif isinstance(entity_df, pd.DataFrame):
        # Drop the index so that we dont have unnecessary columns
        entity_df.reset_index(drop=True, inplace=True)

        job = client.load_table_from_dataframe(entity_df, table_name)
        block_until_done(client, job)
        entity_schema = dict(zip(entity_df.columns, entity_df.dtypes))
    else:
        raise InvalidEntityType(type(entity_df))

    # Ensure that the table expires after some time
    table = client.get_table(table=table_name)
    table.expires = datetime.utcnow() + timedelta(minutes=30)
    client.update_table(table, ["expires"])

    return entity_schema
def iter_results(
    bigquery_client: bigquery.Client,
    query: str,
    job_config: QueryJobConfig,
    df_cleaner: Callable[[pd.DataFrame], pd.DataFrame] = None,
) -> Generator[pd.Series, None, None]:
    """
    Run a query and yield its result rows one at a time as pandas Series.

    The query's destination table is listed in pages of 10000 rows; each page
    is turned into a DataFrame, optionally passed through ``df_cleaner``, and
    iterated row by row.

    Args:
        bigquery_client (bigquery.Client): The BigQuery client
        query (str): The query to run
        job_config (QueryJobConfig): The BigQuery job config
        df_cleaner (Callable): Optional function applied to every page
            DataFrame before its rows are yielded

    Returns:
        Generator[pd.Series, None, None]: A generator of pandas Series
    """
    query_job = bigquery_client.query(query, job_config=job_config)
    query_job.result()  # wait for the query to finish

    # Read the results back from the table the query wrote to.
    destination = bigquery_client.get_table(query_job.destination)
    pages = bigquery_client.list_rows(
        destination, page_size=10000).to_dataframe_iterable()

    for page_df in pages:
        cleaned = page_df if df_cleaner is None else df_cleaner(page_df)
        for _, row in cleaned.iterrows():
            yield row
def bq_wait_for_rows(bq_client: bigquery.Client, table: bigquery.Table,
                     expected_num_rows: int):
    """
    Poll the table's metadata until it reports ``expected_num_rows`` rows.

    Requests are issued back-to-back (no sleep between polls) until either
    the expected count is seen or ``LOAD_JOB_POLLING_TIMEOUT`` seconds have
    elapsed. This is mostly an optimization to speed up the test suite
    without making it flaky.

    Raises:
        AssertionError: if the table has more rows than expected, or the
            timeout elapses before the expected count is reached.
    """
    table_name = f"{table.project}.{table.dataset_id}.{table.table_id}"
    start_poll = time.monotonic()
    actual_num_rows = 0
    while time.monotonic() - start_poll < LOAD_JOB_POLLING_TIMEOUT:
        bq_table: bigquery.Table = bq_client.get_table(table)
        actual_num_rows = bq_table.num_rows
        if actual_num_rows == expected_num_rows:
            return
        if actual_num_rows > expected_num_rows:
            # Overshoot can never recover, so fail immediately.
            raise AssertionError(
                f"{table_name} has {actual_num_rows} rows. "
                f"expected {expected_num_rows} rows.")
    raise AssertionError(
        f"Timed out after {LOAD_JOB_POLLING_TIMEOUT} seconds waiting for "
        f"{table_name} to reach {expected_num_rows} rows. "
        f"last poll returned {actual_num_rows} rows.")
def find_schema_differences(
    module_path: str,
    bigquery_client: BigQueryClient,
    global_project: Optional[str],
    global_dataset: Optional[str],
) -> _SchemaDiffs:
    """Compare locally declared tables against their BigQuery counterparts.

    For every table yielded by ``find_tables(module_path)`` the remote table
    is fetched. A table missing remotely is recorded as ``MissingTable``; a
    table whose schema check reports differences is recorded as
    ``ExistingTable`` carrying those differences. Tables without differences
    are not included.

    Args:
        module_path: Passed to ``find_tables``.
        bigquery_client: Client used to fetch remote tables.
        global_project: Project that overrides each table's own project when
            truthy.
        global_dataset: Dataset that overrides each table's own dataset when
            truthy.

    Returns:
        Mapping of ``project.dataset.table`` identifier to the recorded
        difference.

    Raises:
        AssertionError: if neither the override nor the table provides a
            project or dataset.
    """
    schema_diffs: _SchemaDiffs = {}
    for local_table in find_tables(module_path):
        project = global_project or local_table.project
        assert project, "Project has not been set."
        dataset = global_dataset or local_table.dataset
        assert dataset, "Dataset has not been set."

        table_identifier = f"{project}.{dataset}.{local_table.full_table_name()}"
        # Only the lookup can legitimately report a missing table, so keep
        # the try body limited to it.
        try:
            remote_table = bigquery_client.get_table(table_identifier)
        except NotFound:
            schema_diffs[table_identifier] = MissingTable(
                local_table=local_table)
            continue

        # Run the schema comparison once and reuse the result.
        diffs = list(
            check_schemas(local_table.get_schema_fields(),
                          remote_table.schema))
        if diffs:
            schema_diffs[table_identifier] = ExistingTable(
                local_table=local_table,
                remote_table=remote_table,
                schema_diffs=diffs,
            )
    return schema_diffs
def dest_partitioned_table_allow_jagged(bq: bigquery.Client, dest_dataset,
                                        monkeypatch) -> bigquery.Table:
    """Create an hour-partitioned table whose schema has one extra column.

    The schema is copied from the public
    ``bigquery-public-data.new_york_311.311_service_requests`` table and
    extended with a STRING column named ``extra_jagged_row_test_column``.
    If ``GCP_PROJECT`` is unset it is set to ``bq.project`` via
    ``monkeypatch`` before the table id is built.

    Args:
        bq: Client used to read the public table and create the new one.
        dest_dataset: Object providing the ``dataset_id`` for the new table.
        monkeypatch: Object whose ``setenv`` sets the environment variable.

    Returns:
        The table as returned by ``bq.create_table``.
    """
    source_ref = bigquery.TableReference.from_string(
        "bigquery-public-data.new_york_311.311_service_requests")
    schema = bq.get_table(source_ref).schema

    if os.getenv('GCP_PROJECT') is None:
        monkeypatch.setenv("GCP_PROJECT", bq.project)

    schema.append(
        bigquery.schema.SchemaField("extra_jagged_row_test_column", "STRING"))

    project = os.getenv('GCP_PROJECT')
    unique_suffix = str(uuid.uuid4()).replace('-', '_')
    table_id = (f"{project}.{dest_dataset.dataset_id}"
                f".cf_test_nyc_311_{unique_suffix}")

    partitioning = bigquery.TimePartitioning()
    partitioning.type_ = bigquery.TimePartitioningType.HOUR
    partitioning.field = "created_date"

    table: bigquery.Table = bigquery.Table(table_id, schema=schema)
    table.time_partitioning = partitioning
    return bq.create_table(table)
def main():
    """Dump the schemas of the required tables to ``/vol/schema.json``.

    Table names are read (whitespace-separated) from
    ``/vol/required_tables.txt``. A name with three dot-separated parts is
    treated as ``project.dataset.table``; otherwise the first part is the
    dataset and the last part the table. Names ending in ``*`` get an extra
    ``_TABLE_SUFFIX`` field appended to their schema.
    """
    from google.cloud.bigquery import Client

    with open("/vol/required_tables.txt") as required_tables_file:
        table_names = required_tables_file.read().split()

    bq_client = Client()
    schemas = {}
    for table_name in table_names:
        parts = table_name.split(".")
        if len(parts) == 3:
            dataset_ref = bq_client.dataset(parts[1], project=parts[0])
        else:
            dataset_ref = bq_client.dataset(parts[0])

        table = bq_client.get_table(dataset_ref.table(parts[-1]))
        fields = [field.to_api_repr() for field in table.schema]
        if table_name.endswith("*"):
            fields.append({
                "name": "_TABLE_SUFFIX",
                "type": "STRING",
                "mode": "REQUIRED"
            })
        schemas[table_name] = fields

    with open("/vol/schema.json", mode="w") as schema_file:
        schema_file.write(json.dumps(schemas))
def update_recently_unixtime(client: bigquery.Client, df_unixtime):
    """Insert ``df_unixtime`` into the unixtime table, then de-duplicate it.

    After the insert, the table is overwritten with a query that keeps, for
    every ``TABLE_NAME``, only the row with the highest ``UNIX_TIME``.

    Args:
        client: BigQuery client used for the insert and the query.
        df_unixtime: DataFrame of rows to insert.

    Raises:
        RuntimeError: if the insert reports errors for any row.
    """
    table_id = f'{project_id}.{dataset}.{recently_unixtime_table}'

    # Insert the unixtime DataFrame into the unixtime management table.
    insert_errors = client.insert_rows_from_dataframe(
        client.get_table(table_id), df_unixtime)
    # One (possibly empty) error list is returned per inserted chunk; fail
    # loudly instead of silently dropping rows.
    if any(insert_errors):
        raise RuntimeError(
            f'Failed to insert rows into {table_id}: {insert_errors}')

    # Delete rows of the unixtime management table whose TABLE_NAME column is
    # duplicated.
    duplicate_query = f"""
        SELECT * EXCEPT(rowNumber)
        FROM (
            SELECT *, ROW_NUMBER() OVER (
                PARTITION BY TABLE_NAME ORDER BY UNIX_TIME DESC
            ) as rowNumber
            FROM {table_id}
        )
        WHERE rowNumber = 1;
    """

    # Write the de-duplicated result back over the same table.
    job_config = bigquery.QueryJobConfig()
    job_config.destination = table_id
    job_config.write_disposition = 'WRITE_TRUNCATE'
    job = client.query(duplicate_query, job_config=job_config)
    job.result()
def does_table_exist(bigquery_client: bigquery.Client, table: str,
                     dataset: str = 'analytics') -> bool:
    """Check if given table from given Dataset exists in BigQuery, return True if so.

    Args:
        bigquery_client: Client used to build the reference and look it up.
        table: Name of the table.
        dataset: Name of the dataset expected to hold the table.

    Returns:
        ``True`` when the lookup succeeds, ``False`` when it raises
        ``NotFound``. Always a bool.
    """
    table_reference = bigquery_client.dataset(dataset).table(table)
    try:
        # A successful call is the existence check; the returned object's
        # truthiness is irrelevant.
        bigquery_client.get_table(table_reference)
    except NotFound as error:
        logging.warning(
            'Table "%s" does not exist in BigQuery Dataset "%s". Ref: %s.',
            table, dataset, error)
        return False
    logging.info('Table "%s" in Dataset "%s" already exists in BigQuery.',
                 table, dataset)
    return True
def get_bq_table(
    client: bigquery.Client,
    dataset_id: str,
    table_id: str,
    project_id: str = None,
) -> bigquery.Table:
    """Fetch a table by dataset and table id.

    Args:
        client: BigQuery client performing the request.
        dataset_id: Id of the dataset holding the table.
        table_id: Id of the table.
        project_id: Project of the dataset. If ``None`` then the default
            project of ``client`` will be used.

    Returns:
        The table returned by ``client.get_table``.
    """
    dataset_ref = client.dataset(dataset_id, project=project_id)
    table_ref: bigquery.TableReference = dataset_ref.table(table_id)

    # API request
    table: bigquery.Table = client.get_table(table_ref)
    return table
def update_or_create_view(client: bigquery.Client, view_name: str,
                          view_query: str, dataset: str):
    """Create or replace a view and log the resulting table.

    The statement is produced by ``get_create_or_replace_view_query`` for a
    table object carrying ``view_query``; the function waits for that query
    to finish and then fetches the view to log its id and schema.

    Args:
        client: BigQuery client used to run the query and fetch the view.
        view_name: Name of the view inside ``dataset``.
        view_query: SQL text defining the view.
        dataset: Name of the dataset holding the view.
    """
    LOGGER.debug("update_view: %s=%s", view_name, [view_query])

    view = bigquery.Table(client.dataset(dataset).table(view_name))
    view.view_query = view_query

    # Block until the create-or-replace statement has completed.
    client.query(get_create_or_replace_view_query(view)).result()

    refreshed = client.get_table(view)
    full_id = refreshed.full_table_id
    LOGGER.info("updated or replaced view: %s", full_id)
    LOGGER.debug("view schema (%s): %s", full_id, refreshed.schema)
def get_or_create_table(client: bigquery.Client) -> bigquery.Table:
    """Return the ``sensors.particulate_matter`` table, creating what is missing.

    The ``sensors`` dataset is created when fetching it raises ``NotFound``;
    likewise the table is created with the sensor schema below when fetching
    it raises ``NotFound``.

    Args:
        client: BigQuery client used for all lookups and creations.

    Returns:
        The existing or newly created table.
    """
    try:
        dataset = client.get_dataset("sensors")
    except NotFound:
        dataset = client.create_dataset("sensors")

    # The default project ID is not set and hence a fully-qualified ID is required.
    table_ref = bigquery.TableReference(dataset, table_id="particulate_matter")
    try:
        return client.get_table(table_ref)
    except NotFound:
        # (column name, description) for every NUMERIC column, in schema order.
        numeric_columns = (
            ("humidity", "Sensor DHT22humidity in %"),
            ("max_micro", ""),
            ("min_micro", ""),
            ("samples", ""),
            ("sds_p1", "Sensor SDS011 PM10 in µg/m³"),
            ("sds_p2", "Sensor SDS011 PM2.5 in µg/m³"),
            ("signal", "WiFi signal strength in dBm"),
            ("temperature", "Sensor DHT22 temperature in °C"),
        )
        schema = [
            bigquery.SchemaField(name, "NUMERIC", description=description)
            for name, description in numeric_columns
        ]
        schema.append(
            bigquery.SchemaField("datetime",
                                 "DATETIME",
                                 description="Datetime of measurement",
                                 mode="REQUIRED"))
        return client.create_table(bigquery.Table(table_ref, schema=schema))
def scrape(
    self,
    bq_client: bigquery.Client,
    table_path: str,
    timestamp: datetime.datetime,
    dry_run: bool = False,
):
    """Fetch spaces and insert them as rows into a BigQuery table.

    Each item from ``self.fetch_spaces()`` becomes a row made of the
    provider name (``self.name``), ``timestamp`` and the item's dataclass
    fields. The table is looked up and the rows are built even when
    ``dry_run`` is set; only the insert is skipped.

    Args:
        bq_client: Client used to fetch the table and insert rows.
        table_path: Identifier of the destination table.
        timestamp: Value stored in every row's ``timestamp`` field.
        dry_run: When true, nothing is inserted.

    Raises:
        ValueError: if the insert reports any errors; they are passed as the
            exception argument.
    """
    table = bq_client.get_table(table_path)

    rows = []
    for space in self.fetch_spaces():
        rows.append({
            "provider": self.name,
            "timestamp": timestamp,
            **asdict(space),
        })

    if dry_run:
        return

    errors = bq_client.insert_rows(table, rows)
    if errors:
        raise ValueError(errors)
def _generate_dimensions(client: bigquery.Client,
                         table: str) -> List[Dict[str, Any]]:
    """Build the dimensions and dimension groups for a BigQuery table.

    The table's schema is passed to ``_generate_dimensions_helper`` and the
    results are collected by name. A repeated ``"submission"`` entry replaces
    the earlier one, so the last one produced wins (per the original note:
    submission_timestamp, when the schema also has submission_date).

    Raises:
        click.ClickException: if any other dimension name occurs twice.
    """
    schema = client.get_table(table).schema
    by_name: Dict[str, Dict[str, Any]] = {}
    for dimension in _generate_dimensions_helper(schema):
        name = dimension["name"]
        already_seen = name in by_name
        # Only the "submission" dimension group may be overwritten.
        if already_seen and name != "submission":
            raise click.ClickException(
                f"duplicate dimension {name!r} for table {table!r}"
            )
        by_name[name] = dimension
    return list(by_name.values())
def dest_table(request, bq: bigquery.Client, dest_dataset) -> bigquery.Table:
    """Create a destination table and register its removal.

    The schema is copied from the public
    ``bigquery-public-data.new_york_311.311_service_requests`` table. The
    table id uses ``TF_VAR_project_id`` (default ``bqutil``) as project and
    ``SHORT_SHA`` (default ``manual``) as name suffix.

    Args:
        request: Object whose ``addfinalizer`` registers the teardown.
        bq: Client used to read the public table and create/delete the new one.
        dest_dataset: Object providing the ``dataset_id`` for the new table.

    Returns:
        The table as returned by ``bq.create_table``.
    """
    source_ref = bigquery.TableReference.from_string(
        "bigquery-public-data.new_york_311.311_service_requests")
    source_schema = bq.get_table(source_ref).schema

    project = os.environ.get('TF_VAR_project_id', 'bqutil')
    suffix = os.getenv('SHORT_SHA', 'manual')
    table_id = (f"{project}.{dest_dataset.dataset_id}"
                f".cf_e2e_test_nyc_311_{suffix}")

    table: bigquery.Table = bq.create_table(
        bigquery.Table(table_id, schema=source_schema))

    # Remove the table again at teardown; a missing table is not an error.
    request.addfinalizer(lambda: bq.delete_table(table, not_found_ok=True))
    return table
def validate_rule(self, client: bigquery.Client, *args, **keyword_args):
    """
    Raise an error if the backup table is absent or rows remain to delete

    The backup table is fetched and must report a creation time; then
    BACKUP_ROWS_QUERY is rendered for the lookup and observation tables and
    must return no rows.

    :param client: active BigQuery client object
    :param args: unused
    :param keyword_args: unused
    :return: None
    :raises RuntimeError: if the fetched backup table has no creation time,
        or the query still returns rows
    """
    backup = client.get_table(self.backup_table)
    if not backup.created:
        raise RuntimeError(
            f'Backup table {backup.table_id} for branching cleaning rule was not '
            f'found on the server')

    remaining_query = BACKUP_ROWS_QUERY.render(
        lookup_table=self.lookup_table, src_table=self.observation_table)
    remaining_rows = client.query(remaining_query).result().total_rows
    if remaining_rows > 0:
        raise RuntimeError(
            f'Branching cleaning rule was run but still identifies {remaining_rows} '
            f'rows from the observation table to drop')
def _get_table(self, table: str, client: bigquery.Client) -> DbTableSchema:
    """Describe a BigQuery table as a ``DbTableSchema``.

    Reads the raw properties of the table returned by ``client.get_table``
    and turns every schema field into a ``DbColumn`` whose ordinal position
    is the field's index.

    Returns:
        The schema description, or ``None`` when the table has no
        properties, no schema, or no schema fields.
    """
    properties = client.get_table(table)._properties
    if not properties:
        return None

    schema = properties.get('schema')
    if not schema or not schema.get('fields'):
        return None

    columns = [
        DbColumn(name=field.get('name'),
                 type=field.get('type'),
                 description=field.get('description'),
                 ordinal_position=position)
        for position, field in enumerate(schema.get('fields'))
    ]

    table_reference = properties.get('tableReference')
    table_id = table_reference.get('tableId')
    self.log.info(DbTableName(table_id))

    # The schema name is "<project id>.<dataset id>".
    schema_name = (table_reference.get('projectId') + '.' +
                   table_reference.get('datasetId'))
    return DbTableSchema(schema_name=schema_name,
                         table_name=DbTableName(table_id),
                         columns=columns)
def get_tables(project_id: str,
               client: Client,
               dataset_id: Optional[str] = None) -> Iterator[Table]:
    """
    Yields the BigQuery tables of a Google Cloud project.

    Args:
        project_id (str): ID of the project.
        dataset_id (Optional[str]): The ID of the dataset. If falsy, tables
            from every dataset listed for the project are retrieved.
        client (Client): A Google Cloud Client instance.

    Yields:
        Table: A BigQuery table, fetched individually with `get_table`.
    """
    if dataset_id:
        dataset_refs = [f"{project_id}.{dataset_id}"]
    else:
        dataset_refs = (listed.reference
                        for listed in client.list_datasets(project=project_id))

    for dataset_ref in dataset_refs:
        dataset = client.get_dataset(dataset_ref)
        for table_item in client.list_tables(dataset):
            yield client.get_table(table_item)
def clean_up_bq_table(client: cloud_bigquery.Client, table_name: str) -> None:
    """Delete ``table_name``, doing nothing if the table does not exist.

    Args:
        client: BigQuery client used to issue the deletion.
        table_name: Table identifier, passed as-is to ``client.delete_table``.
    """
    # ``not_found_ok`` makes the delete itself tolerate a missing table, so no
    # preliminary ``get_table`` round trip is needed and there is no window
    # between the existence check and the delete.
    client.delete_table(table_name, not_found_ok=True)