def _upload_entity_df_and_get_entity_schema(
    client: Client,
    table_name: str,
    entity_df: Union[pd.DataFrame, str],
) -> Dict[str, np.dtype]:
    """Uploads a Pandas entity dataframe into a BigQuery table and returns the schema of the resulting table"""
    if isinstance(entity_df, str):
        job = client.query(f"CREATE TABLE {table_name} AS ({entity_df})")
        block_until_done(client, job)

        limited_entity_df = (
            client.query(f"SELECT * FROM {table_name} LIMIT 1").result().to_dataframe()
        )
        entity_schema = dict(zip(limited_entity_df.columns, limited_entity_df.dtypes))
    elif isinstance(entity_df, pd.DataFrame):
        # Drop the index so that we don't have unnecessary columns
        entity_df.reset_index(drop=True, inplace=True)
        job = client.load_table_from_dataframe(entity_df, table_name)
        block_until_done(client, job)

        entity_schema = dict(zip(entity_df.columns, entity_df.dtypes))
    else:
        raise InvalidEntityType(type(entity_df))

    # Ensure that the table expires after some time
    table = client.get_table(table=table_name)
    table.expires = datetime.utcnow() + timedelta(minutes=30)
    client.update_table(table, ["expires"])

    return entity_schema
def _mark_existing_metadata_row_as_processed_helper(
    *,
    table_name: str,
    project_id: str,
    dry_run: bool,
    client: bigquery.Client,
    file_id: int,
    processed_time: datetime.datetime,
) -> None:
    query = f"""
        UPDATE `{project_id}.direct_ingest_processing_metadata.{table_name}`
        SET processed_time = DATETIME "{processed_time}"
        WHERE file_id = {file_id}
    """
    if dry_run:
        logging.info("[DRY RUN] Would have run query to mark as processed: %s", query)
        return

    query_job = client.query(query)
    query_job.result()
    logging.info("Ran query to mark as processed: %s", query)
def _upload_entity_df(
    client: Client,
    table_name: str,
    entity_df: Union[pd.DataFrame, str],
) -> Table:
    """Uploads a Pandas entity dataframe into a BigQuery table and returns the resulting table"""
    if isinstance(entity_df, str):
        job = client.query(f"CREATE TABLE {table_name} AS ({entity_df})")
    elif isinstance(entity_df, pd.DataFrame):
        # Drop the index so that we don't have unnecessary columns
        entity_df.reset_index(drop=True, inplace=True)
        job = client.load_table_from_dataframe(entity_df, table_name)
    else:
        raise InvalidEntityType(type(entity_df))

    block_until_done(client, job)

    # Ensure that the table expires after some time
    table = client.get_table(table=table_name)
    table.expires = datetime.utcnow() + timedelta(minutes=30)
    client.update_table(table, ["expires"])

    return table
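# A minimal usage sketch for _upload_entity_df (not from the source); the
# project, dataset, and table names are hypothetical and the dataset is
# assumed to already exist. Both accepted entity_df forms are shown.
import pandas as pd
from google.cloud.bigquery import Client

client = Client()

# From an in-memory DataFrame:
entity_frame = pd.DataFrame({"driver_id": [1001, 1002]})
table = _upload_entity_df(client, "my_project.feast_tmp.entity_df", entity_frame)

# Or from a SQL query string, materialized via CREATE TABLE ... AS:
table = _upload_entity_df(
    client,
    "my_project.feast_tmp.entity_df_sql",
    "SELECT driver_id, event_timestamp FROM my_project.feast.driver_events",
)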
def update_recently_unixtime(client: bigquery.Client, df_unixtime):
    table_id = f'{project_id}.{dataset}.{recently_unixtime_table}'

    # Insert the unixtime dataframe into the unixtime management table
    client.insert_rows_from_dataframe(client.get_table(table_id), df_unixtime)

    # Deduplicate rows sharing a TABLE_NAME in the unixtime management table,
    # keeping only the row with the most recent UNIX_TIME per table, and
    # overwrite the table with the result (WRITE_TRUNCATE)
    duplicate_query = f"""
        SELECT * EXCEPT(rowNumber)
        FROM (
            SELECT *,
                   ROW_NUMBER() OVER (
                       PARTITION BY TABLE_NAME
                       ORDER BY UNIX_TIME DESC
                   ) AS rowNumber
            FROM {table_id}
        )
        WHERE rowNumber = 1;
    """

    job_config = bigquery.QueryJobConfig()
    job_config.destination = table_id
    job_config.write_disposition = 'WRITE_TRUNCATE'

    job = client.query(duplicate_query, job_config=job_config)
    job.result()
def get_file_id_and_processed_status_for_file(
        *,
        metadata_type: MetadataType,
        project_id: str,
        region_code: str,
        client: bigquery.Client,
        normalized_file_name: str) -> Tuple[Optional[int], bool]:
    """Checks whether the provided |normalized_file_name| has been registered in the raw_data_metadata table.

    If it has, returns the file's file_id and whether or not the file has already
    been processed. If it has not, returns None, False.
    """
    table_name = get_table_name_for_type(metadata_type=metadata_type)
    table_id = f'{project_id}.direct_ingest_processing_metadata.{table_name}'
    query = f"""SELECT file_id, processed_time FROM `{table_id}`
        WHERE region_code = '{region_code}' AND normalized_file_name = '{normalized_file_name}'"""
    query_job = client.query(query)
    rows = query_job.result()

    if rows.total_rows > 1:
        raise ValueError(
            f'Expected there to only be one row per combination of {region_code} and {normalized_file_name}'
        )

    if not rows.total_rows:
        # TODO(3020): Once metadata tables are in postgres (and we don't have any limits on UPDATE queries),
        #  insert a row here that will have the processed_time filled in later.
        logging.info('No row found for %s and %s in %s.',
                     normalized_file_name, region_code, table_id)
        return None, False

    row = one(rows)
    file_id = row.get('file_id')
    processed_time = row.get('processed_time')
    logging.info(
        'Found row for %s and %s with values file_id: %s and processed_time: %s',
        normalized_file_name, region_code, file_id, processed_time)
    # Return a bool to match the declared signature rather than the raw timestamp
    return file_id, processed_time is not None
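# Hedged usage sketch for get_file_id_and_processed_status_for_file; the
# MetadataType member, project, region, and file name below are all
# illustrative assumptions, not values from the source.
from google.cloud import bigquery

client = bigquery.Client()
file_id, processed = get_file_id_and_processed_status_for_file(
    metadata_type=MetadataType.RAW,  # hypothetical enum member
    project_id="my-project",
    region_code="us_nd",
    client=client,
    normalized_file_name="processed_2021-01-01T00:00:00:000000_raw_file_tag.csv",
)
if file_id is None:
    pass  # file has never been registered in the metadata table
elif not processed:
    pass  # registered, but processed_time has not been set yet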
def execute_query(bq_client: bigquery.Client, env_vars: Dict[str, str],
                  query_path: object, output_table_name: str,
                  time_partition: bool) -> None:
    """Executes transformation query to a new destination table.

    Args:
        bq_client: bigquery.Client object
        env_vars: Dictionary of key: value, where value is environment variable
        query_path: Object representing location of SQL query to execute
        output_table_name: String representing name of table that holds output
        time_partition: Boolean indicating whether to time-partition output
    """
    dataset_ref = bq_client.get_dataset(
        bigquery.DatasetReference(project=bq_client.project,
                                  dataset_id=env_vars['corrected_dataset_id']))
    table_ref = dataset_ref.table(output_table_name)
    job_config = bigquery.QueryJobConfig()
    job_config.destination = table_ref
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE

    # Time partitioning is only needed for the final output query
    if time_partition:
        job_config.time_partitioning = bigquery.TimePartitioning(
            field='usage_start_time', expiration_ms=None)

    logging.info('Attempting query...')
    # Execute the query
    query_job = bq_client.query(query=render_template(query_path, env_vars),
                                job_config=job_config)
    query_job.result()  # Waits for the query to finish
def iter_results(
    bigquery_client: bigquery.Client,
    query: str,
    job_config: QueryJobConfig,
    df_cleaner: Optional[Callable[[pd.DataFrame], pd.DataFrame]] = None,
) -> Generator[pd.Series, None, None]:
    """
    Page through the results of a query and yield each row as a pandas Series

    Args:
        bigquery_client (bigquery.Client): The BigQuery client
        query (str): The query to run
        job_config (QueryJobConfig): The BigQuery job config
        df_cleaner (Callable, optional): A function applied to each page's DataFrame before its rows are yielded

    Returns:
        Generator[pd.Series, None, None]: A generator of pandas Series
    """
    query_job = bigquery_client.query(query, job_config=job_config)
    query_job.result()

    # Get a reference to the destination table
    destination = bigquery_client.get_table(query_job.destination)
    rows = bigquery_client.list_rows(destination, page_size=10000)

    dfs = rows.to_dataframe_iterable()
    for df in dfs:
        if df_cleaner is not None:
            df = df_cleaner(df)
        for _, row in df.iterrows():
            yield row
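# Illustrative consumer for iter_results; the query, column names, and the
# per-page cleaning function are assumptions, not part of the source.
import pandas as pd
from google.cloud import bigquery
from google.cloud.bigquery import QueryJobConfig

def drop_null_emails(df: pd.DataFrame) -> pd.DataFrame:
    # Applied to each 10,000-row page before its rows are yielded
    return df.dropna(subset=["email"])

client = bigquery.Client()
for row in iter_results(
    client,
    "SELECT email, name FROM `my_project.my_dataset.users`",
    QueryJobConfig(),
    df_cleaner=drop_null_emails,
):
    print(row["email"])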
def get_all_rows(table_name: str = table_name,
                 dest_data_project: str = dest_data_project,
                 dest_dataset_name: str = dest_dataset_name,
                 repo_data_project: str = repo_data_project,
                 repo_dataset_name: str = repo_dataset_name,
                 release_date: str = release_date,
                 version_col: str = version_col,
                 client: bigquery.Client = repo_client):
    query = f"""
        SELECT * EXCEPT (datarepo_row_id, release_date)
        FROM `{repo_data_project}.{repo_dataset_name}.{table_name}`
        WHERE {version_col} = "{release_date}"
    """

    dest_table_name = f"{table_name}_{DiffType.CREATE.value}"
    table_id = f"{dest_data_project}.{dest_dataset_name}.{dest_table_name}"
    job_config = bigquery.QueryJobConfig(destination=table_id)
    query_job = client.query(query, job_config=job_config)

    print("The query submitted is:")
    print(query)
    print(f"The destination table is: {table_id}")

    try:
        query_job.result()
    except GoogleCloudError as err:
        print(f"There was a {type(err)}")
        print(err)

    # return the destination table name
    return dest_table_name
def retry_query(gcs_client: storage.Client, bq_client: bigquery.Client,
                lock_blob: storage.Blob, failed_job_id: str,
                table: bigquery.TableReference, retry_attempt_cnt: int):
    """Retry a query that failed"""
    if retry_attempt_cnt > 1:
        # if this is not the first retry, truncate the previous job_id's
        # retry attempt suffix '_xx' (3 chars) before appending the new one
        retry_job_id = f"{failed_job_id[:-3]}_{retry_attempt_cnt:02}"  # pad with zero
    else:
        retry_job_id = f"{failed_job_id}_{retry_attempt_cnt:02}"  # pad with zero

    failed_job: bigquery.QueryJob = bq_client.get_job(failed_job_id)
    job_config: bigquery.QueryJobConfig = bigquery.QueryJobConfig(
        table_definitions=failed_job.table_definitions, use_legacy_sql=False)

    retry_job = bq_client.query(failed_job.query,
                                job_config=job_config,
                                job_id=retry_job_id)

    # To keep track of retry attempts between cloud function invocations,
    # the retry count state is kept in the _bqlock lock file.
    utils.handle_bq_lock(gcs_client, lock_blob, retry_job_id, table,
                         retry_attempt_cnt=retry_attempt_cnt)
    logging.log_bigquery_job(
        retry_job, table, f"Submitted asynchronous query job: {retry_job_id}")
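# A small illustration (an assumption, not source code) of how the job-id
# suffix logic above keeps ids from growing across retries: attempts after
# the first strip the previous "_NN" suffix before appending a new one.
def _illustrate_retry_job_id(failed_job_id: str, retry_attempt_cnt: int) -> str:
    # Mirrors the suffix arithmetic in retry_query for demonstration
    base = failed_job_id[:-3] if retry_attempt_cnt > 1 else failed_job_id
    return f"{base}_{retry_attempt_cnt:02}"

assert _illustrate_retry_job_id("load_abc123", 1) == "load_abc123_01"
assert _illustrate_retry_job_id("load_abc123_01", 2) == "load_abc123_02"
assert _illustrate_retry_job_id("load_abc123_02", 3) == "load_abc123_03"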
def _upload_entity_df_into_bigquery(
    client: Client,
    project: str,
    dataset_name: str,
    entity_df: Union[pandas.DataFrame, str],
) -> Table:
    """Uploads a Pandas entity dataframe into a BigQuery table and returns the resulting table"""
    table_id = _get_table_id_for_new_entity(client, project, dataset_name)

    if isinstance(entity_df, str):
        job = client.query(f"CREATE TABLE {table_id} AS ({entity_df})")
        job.result()
    elif isinstance(entity_df, pandas.DataFrame):
        # Drop the index so that we don't have unnecessary columns
        entity_df.reset_index(drop=True, inplace=True)

        # Upload the dataframe into BigQuery, creating a temporary table
        job_config = bigquery.LoadJobConfig()
        job = client.load_table_from_dataframe(entity_df, table_id,
                                               job_config=job_config)
        job.result()
    else:
        raise ValueError(
            f"The entity dataframe you have provided must be a Pandas DataFrame or BigQuery SQL query, "
            f"but we found: {type(entity_df)}")

    # Ensure that the table expires after some time
    table = client.get_table(table=table_id)
    table.expires = datetime.utcnow() + timedelta(minutes=30)
    client.update_table(table, ["expires"])

    return table
def load_data_to_bq(df=None, table_name='CRY', dataset='price_data', project=None):
    client = Client()
    full_table = f'{dataset}.{table_name}'
    table = client.get_table(f'{client.project}.{full_table}')

    if table.num_rows == 0 and df is not None:
        df.to_gbq(full_table, if_exists='append')
    else:
        # Remove each incoming symbol's most recent row so it can be re-appended with fresh data
        symbols = '("' + '","'.join(df.symbol.unique()) + '")'
        delete_qry = f'''DELETE FROM `{full_table}` AS t2
            WHERE concat(symbol, cast(date as string)) IN
                (SELECT concat(symbol, cast(MAX(date) as string))
                 FROM `{full_table}`
                 GROUP BY symbol)
            AND symbol IN {symbols}'''
        delete_DML = client.query(delete_qry)
        delete_DML.result()

        # Only append rows newer than what already exists for each symbol
        existing = pd.read_gbq(
            f'select symbol, max(date) as max_date from {full_table} group by symbol',
            dialect="legacy")
        df = df.merge(existing, on='symbol', how='left')
        df = df.loc[(df.date.dt.tz_localize('UTC') > df.max_date) | df.max_date.isnull(), :]
        df.drop('max_date', axis=1, inplace=True)
        df.to_gbq(full_table, if_exists='append', project_id=project)
def _get_entity_df_event_timestamp_range(
    entity_df: Union[pd.DataFrame, str],
    entity_df_event_timestamp_col: str,
    client: Client,
) -> Tuple[datetime, datetime]:
    if isinstance(entity_df, str):
        job = client.query(
            f"SELECT MIN({entity_df_event_timestamp_col}) AS min, "
            f"MAX({entity_df_event_timestamp_col}) AS max "
            f"FROM ({entity_df})")
        res = next(job.result())
        entity_df_event_timestamp_range = (
            res.get("min"),
            res.get("max"),
        )
    elif isinstance(entity_df, pd.DataFrame):
        entity_df_event_timestamp = entity_df.loc[
            :, entity_df_event_timestamp_col].infer_objects()
        if pd.api.types.is_string_dtype(entity_df_event_timestamp):
            entity_df_event_timestamp = pd.to_datetime(
                entity_df_event_timestamp, utc=True)
        entity_df_event_timestamp_range = (
            entity_df_event_timestamp.min().to_pydatetime(),
            entity_df_event_timestamp.max().to_pydatetime(),
        )
    else:
        raise InvalidEntityType(type(entity_df))

    return entity_df_event_timestamp_range
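# Quick sketch of the DataFrame branch (an assumption for illustration; the
# client is only consulted for the SQL branch, so None is passed here).
import pandas as pd

frame = pd.DataFrame(
    {"event_timestamp": pd.to_datetime(["2021-01-01", "2021-03-15", "2021-02-10"])}
)
start, end = _get_entity_df_event_timestamp_range(frame, "event_timestamp", client=None)
# start == datetime(2021, 1, 1, 0, 0), end == datetime(2021, 3, 15, 0, 0)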
def _export_table(
    client: bigquery.Client,
    project_id: str,
    dataset_id: str,
    table: str,
    bucket: str,
    storage_client: storage.Client,
):
    """Export a single table or view to GCS as JSON."""
    # since views cannot get exported directly, write data into a temporary table
    job = client.query(
        f"""
        SELECT *
        FROM {dataset_id}.{table}
        """
    )
    job.result()

    destination_uri = f"gs://{bucket}/{table}.ndjson"
    dataset_ref = bigquery.DatasetReference(project_id, job.destination.dataset_id)
    table_ref = dataset_ref.table(job.destination.table_id)

    logger.info(f"Export table {table} to {destination_uri}")

    job_config = bigquery.ExtractJobConfig()
    job_config.destination_format = "NEWLINE_DELIMITED_JSON"
    extract_job = client.extract_table(
        table_ref, destination_uri, location="US", job_config=job_config
    )
    extract_job.result()

    # convert ndjson to json
    _convert_ndjson_to_json(bucket, table, storage_client)
def exclude_site_submission(client: bigquery.Client,
                            dataset: bigquery.DatasetReference,
                            hpo_ids: List[str]) -> bigquery.QueryJob:
    """
    Empty all CDM tables associated with one or more HPO sites

    :param client: Active bigquery client object
    :param dataset: the dataset to exclude site data from
    :param hpo_ids: Identifies the HPO sites whose data should be excluded
    :return: Query job associated with removing all the records
    :raises RuntimeError if CDM tables associated with a site are not found in the dataset
    """
    LOGGER.debug(
        f'exclude_site_submission called with dataset={dataset.dataset_id} and hpo_ids={hpo_ids}'
    )
    all_tables = list(bq.list_tables(client, dataset))
    tables_to_empty = []
    for hpo_id in hpo_ids:
        hpo_tables = _filter_hpo_tables(all_tables, hpo_id)
        if not hpo_tables:
            raise RuntimeError(
                f'No tables found for {hpo_id} in dataset {dataset.dataset_id}. '
                f'Ensure the specified arguments are correct.')
        tables_to_empty.extend(hpo_tables)
    script = DELETE_QUERY_TPL.render(tables_to_empty=tables_to_empty)
    LOGGER.debug(f'exclude_site_submission about to start script:\n {script}')
    return client.query(script)
def bq_reader(
    client: bigquery.Client,
    table: str,
    modifier: Callable[[Dict[str, Any]], BaseModel],
    table_index: int,
    total_tables: int,
    report_frequency: int,
):
    # TODO: Probably want some sort of ordering here, a report of where
    # we are in that ordering, and a way to resume from that place if
    # things crash (LOGGING AND QUERYING?)
    query = f"SELECT * FROM `mozilla-cdp-prod.sfdc_exports.{table}`"
    query_job_rows = client.query(query).result()
    total_rows = query_job_rows.total_rows
    report_prefix = f"{table} (table: {table_index}/{total_tables})"
    start = monotonic()
    for i, row in enumerate(query_job_rows, start=1):
        if i % report_frequency == 0 or i == total_rows:
            percent_done = int(i / total_rows * 100)
            time_since_start = monotonic() - start
            per_second = int(i / time_since_start)
            print(
                f"{report_prefix}: {percent_done}% Complete ({per_second} rows/s) "
                f"({int(time_since_start)}s since query)"
            )
        newrow = {}
        for key, value in row.items():
            if value != "":
                newrow[key] = value
        try:
            yield modifier(newrow)
        except ValidationError as e:
            # TODO: Write this to a table so we know what didn't work
            print(newrow["email_id"], str(e), file=sys.stderr)
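# Hedged example of driving bq_reader with a pydantic model as the modifier;
# the Contact model and table name are hypothetical. Note that the reader
# drops empty-string values before the row dict reaches the modifier.
from pydantic import BaseModel
from google.cloud import bigquery

class Contact(BaseModel):
    email_id: str
    first_name: str = ""

client = bigquery.Client()
for contact in bq_reader(
    client,
    "contact",
    lambda row: Contact(**row),
    table_index=1,
    total_tables=1,
    report_frequency=10000,
):
    pass  # e.g. push each validated Contact to a downstream system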
def execute_bq_query(
    client: bigquery.Client,
    query_sql: str,
    query_params: Optional[list] = None,
    dry_run: bool = False,
) -> bigquery.table.RowIterator:
    """
    Execute a BigQuery query and return the results as a row iterator.
    """
    if query_params is not None:
        # TODO: implement. See
        # https://cloud.google.com/bigquery/docs/parameterized-queries
        # https://googleapis.github.io/google-cloud-python/latest/bigquery/usage/queries.html#run-a-query-using-a-named-query-parameter
        raise NotImplementedError
    if dry_run:
        # TODO: implement. See
        # https://googleapis.github.io/google-cloud-python/latest/bigquery/usage/queries.html#run-a-dry-run-query
        raise NotImplementedError

    query_job = client.query(query_sql)  # type: bigquery.QueryJob

    # API request; waits for job to complete.
    # TODO: is all results' data retrieved here? Or will some be retrieved later, when the iterator
    # is consumed?
    return query_job.result()  # type: bigquery.table.RowIterator
def get_tables_in_dataset(client: Client, project_id, dataset_id,
                          table_names) -> List[str]:
    """
    Retrieves the tables that exist in the dataset for an initial list of table_names.
    Raises GoogleCloudError if the query throws an error.

    :param client:
    :param project_id:
    :param dataset_id:
    :param table_names:
    :return: a list of tables that exist in the given dataset
    """
    # The following makes sure the tables exist in the dataset
    query_job = client.query(
        GET_ALL_TABLES_QUERY_TEMPLATE.render(project=project_id,
                                             dataset=dataset_id,
                                             table_names=table_names))
    try:
        result = query_job.result()

        # Raise a RuntimeError for job-level errors that are neither
        # GoogleCloudError nor TimeoutError
        if query_job.errors:
            raise RuntimeError(query_job.errors)

        return [dict(row.items())[TABLE_ID] for row in result]
    except (GoogleCloudError, TimeoutError, RuntimeError) as e:
        # Catch GoogleCloudError and TimeoutError that could be raised by query_job.result()
        # Also catch the RuntimeError raised from the try block
        # Log the error and raise it again
        LOGGER.error(f"Error running job {query_job.job_id}: {e}")
        raise
def get_bucket(client: BQClient, hpo_id: str) -> str:
    """
    Retrieves bucket name for site

    :param client: Bigquery Client object
    :param hpo_id: Identifies the HPO site
    :return: bucket name for the HPO site as a string
    :raises GoogleCloudError/TimeoutError
    """
    bucket_name_query = BUCKET_NAME_QUERY.render(
        project=client.project,
        dataset=bq_consts.LOOKUP_TABLES_DATASET_ID,
        bucket_names_table=bq_consts.HPO_ID_BUCKET_NAME_TABLE_ID,
        hpo_id=hpo_id.upper())
    try:
        bucket_names_result = client.query(bucket_name_query).result()
        bucket_names = bucket_names_result.to_dataframe()['bucket_name'].to_list()
        if len(bucket_names) > 1:
            LOGGER.warning(
                f'Found more than one bucket name for site {hpo_id}: {bucket_names}'
            )
        bucket_name = bucket_names[0]
    except (GoogleCloudError, TOError) as e:
        LOGGER.error(f'Job failed with error {str(e)}')
        raise e
    return bucket_name
def _execute_query(client: bigquery.Client, query: str,
                   parameters: List) -> pandas.DataFrame:
    try:
        start = timer()
        result = (
            client.query(
                query,
                job_config=bigquery.QueryJobConfig(query_parameters=parameters))
            .result()
            .to_dataframe(
                create_bqstorage_client=True,
            )  # generates a pandas DataFrame directly
        )
        end = timer()
        logging.getLogger(__name__).info(
            f'[benchmark][google_big_query] - execute {end - start} seconds',
            extra={
                'benchmark': {
                    'operation': 'execute',
                    'execution_time': end - start,
                    'connector': 'google_big_query',
                }
            },
        )
        return result
    except TypeError as e:
        logging.getLogger(__name__).error(
            f'Failed to execute request {query} - {e}')
        raise e
class QueryPublications(DoFn):
    '''Apache Beam DoFn class for querying patent publications
    from BigQuery by application numbers.
    '''

    def __init__(self):
        super().__init__()
        self.bigquery_client = None

    def setup(self):
        logging.info('Initializing BigQuery client')
        self.bigquery_client = Client()

    def process(self, app_numbers):
        sql = """
            SELECT
                application_number,
                application_kind,
                grant_date
            FROM `patents-public-data.patents.publications`
            WHERE country_code = @us_country_code
                AND application_number IN UNNEST(@application_numbers)
                AND IF (
                    publication_date >= @wipo_kind_codes_from,
                    kind_code IN UNNEST(@wipo_patent_publication_codes),
                    kind_code = @uspto_patent_publication_code
                );
        """
        job_config = QueryJobConfig(query_parameters=[
            ScalarQueryParameter('us_country_code', 'STRING', US_COUNTRY_CODE),
            ArrayQueryParameter('application_numbers', 'STRING', app_numbers),
            ScalarQueryParameter('wipo_kind_codes_from', 'INT64',
                                 WIPO_KIND_CODES_FROM),
            ArrayQueryParameter('wipo_patent_publication_codes', 'STRING',
                                WIPO_PATENT_PUBLICATION_CODES),
            ScalarQueryParameter('uspto_patent_publication_code', 'STRING',
                                 USPTO_PATENT_PUBLICATION_CODE),
        ])
        query = self.bigquery_client.query(sql, job_config=job_config)
        logging.info('Executing query for publications')
        return query.result()
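# Illustrative Beam wiring for QueryPublications (an assumption, not part of
# the source): upstream elements are batches of application numbers, and each
# batch fans out to the publication rows returned by the query.
import apache_beam as beam

with beam.Pipeline() as pipeline:
    (
        pipeline
        | "CreateBatches" >> beam.Create([["US-201715123456-A", "US-201815654321-A"]])
        | "QueryPublications" >> beam.ParDo(QueryPublications())
        | "PrintRows" >> beam.Map(print)
    )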
def test_tables_correct(client: Client, bq_tables: List[Table]):
    counts = set()
    for table in bq_tables:
        job = client.query(
            f"SELECT COUNT(*) FROM `{table.project}.{table.dataset_id}.{table.table_id}`"
        )
        counts.add(Driver.get_scalar_result(job))

    assert counts == {1000, 100, 37, 6}
def external_query(  # pylint: disable=too-many-arguments
        gcs_client: storage.Client, bq_client: bigquery.Client, gsurl: str,
        query: str, dest_table_ref: bigquery.TableReference, job_id: str):
    """Load from query over external table from GCS.

    This hinges on a SQL query defined in GCS at _config/*.sql and
    an external table definition _config/external.json (otherwise will assume
    a PARQUET external table)
    """
    external_table_config = read_gcs_file_if_exists(
        gcs_client, f"{gsurl}_config/external.json")
    if not external_table_config:
        external_table_config = look_for_config_in_parents(
            gcs_client, gsurl, "external.json")

    if external_table_config:
        external_table_def = json.loads(external_table_config)
    else:
        print(
            f" {gsurl}_config/external.json not found in parents of {gsurl}. "
            "Falling back to default PARQUET external table: "
            f"{json.dumps(constants.DEFAULT_EXTERNAL_TABLE_DEFINITION)}")
        external_table_def = constants.DEFAULT_EXTERNAL_TABLE_DEFINITION

    # This may cause an issue if >10,000 files.
    external_table_def["sourceUris"] = flatten2dlist(
        get_batches_for_prefix(gcs_client, gsurl))
    print(f"external table def = {json.dumps(external_table_def, indent=2)}")
    external_config = bigquery.ExternalConfig.from_api_repr(external_table_def)
    job_config = bigquery.QueryJobConfig(
        table_definitions={"temp_ext": external_config}, use_legacy_sql=False)

    # drop partition decorator if present.
    table_id = dest_table_ref.table_id.split("$")[0]

    # similar syntax to str.format but doesn't require escaping braces
    # elsewhere in query (e.g. in a regex)
    rendered_query = query\
        .replace(
            "{dest_dataset}",
            f"`{dest_table_ref.project}`.{dest_table_ref.dataset_id}")\
        .replace("{dest_table}", table_id)

    job: bigquery.QueryJob = bq_client.query(rendered_query,
                                             job_config=job_config,
                                             job_id=job_id)

    print(f"started asynchronous query job: {job.job_id}")

    start_poll_for_errors = time.monotonic()
    # Check if job failed quickly
    while (time.monotonic() - start_poll_for_errors
           < constants.WAIT_FOR_JOB_SECONDS):
        job.reload(client=bq_client)
        if job.state == "DONE":
            check_for_bq_job_and_children_errors(bq_client, job)
            return
        time.sleep(constants.JOB_POLL_INTERVAL_SECONDS)
def get_table_refs(bq_client: bigquery.Client,
                   bq_storage_client: bigquery_storage.BigQueryReadClient,
                   dataset: bigquery.Dataset):
    query = f"select table_id from `{dataset.project}.{dataset.dataset_id}.__TABLES__`"
    df: pd.DataFrame = bq_client.query(query).to_dataframe(
        bqstorage_client=bq_storage_client)
    for table_id in df.table_id:
        if table_id is not None:
            yield bigquery.TableReference(dataset.reference, table_id)
def _delete_rows(client: bigquery.Client, table_id: str) -> None:
    """Delete all rows in a table."""
    query = f'DELETE FROM `{table_id}` WHERE TRUE'
    job = client.query(query)
    try:
        job.result()
    except google.cloud.exceptions.GoogleCloudError as e:
        logging.error(e)
        raise e
def get_last_update_status(client: gbigquery.Client):
    check_environment_variable()
    query = make_oanda_prices_query()
    results = client.query(query).result()
    last_datetime = datetime_process.extract_datetime(results)
    is_valid = datetime_process.is_update_time_valid(last_datetime)
    status = make_status(last_datetime, is_valid)
    return status
def external_query(  # pylint: disable=too-many-arguments
        gcs_client: storage.Client, bq_client: bigquery.Client, gsurl: str,
        query: str, dest_table_ref: bigquery.TableReference,
        job_id_prefix: str):
    """Load from query over external table from GCS.

    This hinges on a SQL query defined in GCS at _config/bq_transform.sql and
    an external table definition _config/external.json (otherwise will assume
    CSV external table)
    """
    external_table_config = read_gcs_file_if_exists(
        gcs_client, f"{gsurl}_config/external.json")
    if external_table_config:
        external_table_def = json.loads(external_table_config)
    else:
        print(f"Falling back to default CSV external table."
              f" {gsurl}/_config/external.json not found.")
        external_table_def = DEFAULT_EXTERNAL_TABLE_DEFINITION

    external_table_def["sourceUris"] = flatten2dlist(
        get_batches_for_prefix(gcs_client, gsurl))
    external_config = bigquery.ExternalConfig.from_api_repr(external_table_def)
    job_config = bigquery.QueryJobConfig(
        table_definitions={"temp_ext": external_config}, use_legacy_sql=False)

    # Note, dest_table might include a partition decorator.
    rendered_query = query.format(
        dest_dataset=dest_table_ref.dataset_id,
        dest_table=dest_table_ref.table_id,
    )

    job: bigquery.QueryJob = bq_client.query(
        rendered_query,
        job_config=job_config,
        job_id_prefix=job_id_prefix,
    )

    print(f"started asynchronous query job: {job.job_id}")

    start_poll_for_errors = time.monotonic()
    # Check if job failed quickly
    while time.monotonic() - start_poll_for_errors < WAIT_FOR_JOB_SECONDS:
        job.reload()
        if job.errors:
            msg = f"query job {job.job_id} failed quickly: {job.errors}"
            for err in job.errors:
                # BQ gives a confusing warning about a missing dataset if the
                # external query refers to the wrong external table name.
                # In this case we can give the end user a little more context.
                if "missing dataset" in err.get("message", ""):
                    raise RuntimeError(
                        "External queries must select from the external table "
                        "named 'temp_ext'. This error may be due to specifying "
                        "the wrong name for the external table. " + msg)
            raise RuntimeError(msg)
        time.sleep(JOB_POLL_INTERVAL_SECONDS)
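# A sketch of what _config/bq_transform.sql might contain (an assumption;
# the source only requires that the query select from the external table
# named temp_ext, and query.format above fills in the {dest_dataset} and
# {dest_table} placeholders):
EXAMPLE_TRANSFORM_SQL = """
INSERT INTO `{dest_dataset}.{dest_table}`
SELECT * FROM temp_ext  -- the external table alias must be exactly temp_ext
"""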
def get_temporary_dataset(client: bigquery.Client):
    """Get a cached reference to the dataset used for server-assigned destinations."""
    global temporary_dataset
    if temporary_dataset is None:
        # look up the dataset used for query results without a destination
        dry_run = bigquery.QueryJobConfig(dry_run=True)
        destination = client.query("SELECT NULL", dry_run).destination
        temporary_dataset = client.dataset(destination.dataset_id,
                                           destination.project)
    return temporary_dataset
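# Usage sketch (an assumption): repeated calls reuse the module-level cache,
# so only the first call pays for the dry-run query.
from google.cloud import bigquery

temporary_dataset = None  # module-level cache consulted by the function
client = bigquery.Client()
first = get_temporary_dataset(client)
second = get_temporary_dataset(client)
assert first is second  # cached after the first lookup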
def test_burnham(
    client: bigquery.Client,
    query_job_config: bigquery.QueryJobConfig,
    query: str,
    want: List[Any],
):
    """Test that the Glean telemetry in BigQuery matches what we expect."""
    query_job = client.query(query, job_config=query_job_config)
    got = [dict(row.items()) for row in query_job.result()]
    assert got == want
def get_non_pk_cols(table_name: str, repo_data_project: str,
                    repo_dataset_name: str,
                    client: bigquery.Client) -> Dict[str, str]:
    # should use the jade client
    query = f"""
        SELECT column_name, data_type
        FROM `{repo_data_project}.{repo_dataset_name}.INFORMATION_SCHEMA.COLUMNS`
        WHERE table_name = "{table_name}"
            AND column_name NOT IN ("id", "release_date", "datarepo_row_id")
    """
    query_job = client.query(query)
    return {row[0]: row[1] for row in query_job}
def get_next_available_file_id(metadata_type: MetadataType, project_id: str,
                               client: bigquery.Client) -> int:
    """Retrieves the next available file_id in the raw_file_metadata table."""
    table_name = get_table_name_for_type(metadata_type)
    query = f"""SELECT MAX(file_id) AS max_file_id
        FROM `{project_id}.direct_ingest_processing_metadata.{table_name}`"""
    query_job = client.query(query)
    rows = query_job.result()
    max_file_id = one(rows).get('max_file_id')
    if max_file_id is None:
        return 1
    return max_file_id + 1