def select_into(self, query: str, output_dataset: str, output_table: str) -> bool:
    logging.info(f"DataWarehouse.select_into -> {output_dataset}.{output_table} ...")
    client = self._get_client()
    config = QueryJobConfig()
    config.allow_large_results = True
    config.destination = f"{self.config.gcp_project}.{output_dataset}.{output_table}"
    config.create_disposition = CreateDisposition.CREATE_IF_NEEDED
    config.write_disposition = WriteDisposition.WRITE_TRUNCATE
    query_job = client.query(query, job_config=config)
    # Execute the query and check the result.
    try:
        query_job.result()
        mb = int(query_job.total_bytes_processed / (1024 * 1024))
        logging.info(
            f"DataWarehouse.select_into -> {output_dataset}.{output_table}: "
            f"{query_job.state} (processed {mb} MB)"
        )
        return True
    except Exception:
        # error_result is a dict (a serialized ErrorProto), so index it
        # rather than accessing a .message attribute.
        logging.error(
            "DataWarehouse.select_into -> Exception:\n\t"
            f"{query_job.error_result['message']}"
        )
        return False
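# A minimal usage sketch for select_into above, kept as a comment because the
# enclosing DataWarehouse class is not shown here; the dataset and table names
# are hypothetical stand-ins.
#
# warehouse = DataWarehouse(config)
# ok = warehouse.select_into(
#     query="SELECT id, created_at FROM `my_project.raw.events`",
#     output_dataset="analytics",
#     output_table="events_snapshot",
# )
# if not ok:
#     raise RuntimeError("select_into failed; details are in the error log")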
def select_insert(self, source_table_id, destination_table_id, query_field,
                  prefix=' ', fg='yellow'):
    query = 'SELECT {query_field} FROM {dataset_id}.{source_table_id}'.format(
        query_field=query_field,
        dataset_id=self._dataset_ref.dataset_id,
        source_table_id=source_table_id)
    destination_table = self.dataset.table(destination_table_id)
    job_config = QueryJobConfig()
    job_config.use_legacy_sql = False
    job_config.use_query_cache = False
    job_config.write_disposition = WriteDisposition.WRITE_TRUNCATE
    job_config.destination = destination_table
    job = self._client.query(query, job_config=job_config)
    echo('Inserting... {0}'.format(job.job_id),
         prefix=prefix, fg=fg, no_color=self.no_color)
    echo('  {0}'.format(job.query),
         prefix=prefix, fg=fg, no_color=self.no_color)
    job.result()
    assert job.state == 'DONE'
    if job.error_result:
        raise RuntimeError(job.errors)
def execute(self, query, destination_table, write_disposition="WRITE_TRUNCATE",
            allow_large_results=True):
    """
    :param query: query file path
    :param destination_table: target table
    :param write_disposition: default is to replace the existing table;
        pass WRITE_APPEND to append instead
    :param allow_large_results: defaults to True
    :return:
    """
    query_configuration = QueryJobConfig()
    query_configuration.use_legacy_sql = False
    if destination_table:
        ref = TableReferenceBuilder(destination_table, self._dataset, self._project)
        query_configuration.write_disposition = write_disposition
        query_configuration.default_dataset = ref.dataset_reference
        query_configuration.destination = ref.table_reference
        # allow_large_results only applies when a destination table is set
        query_configuration.allow_large_results = allow_large_results
    sql_query = self.__get_query(query)
    if not self._quiet:
        print("-- #### {}\n{}\n".format(destination_table or "", sql_query))
    self._query_job = bigquery.Client(project=self._project).query(
        sql_query, job_config=query_configuration)
    if self._query_job.errors:
        raise Exception(self._query_job.errors)
def run_query_job(
        querytext: str,
        temp_table: str = None,
        query_job_config: QueryJobConfig = None,
) -> QueryJob:
    """
    Set up and run a query job.

    Arguments:
        querytext {str} -- The querytext for the job.

    Keyword Arguments:
        temp_table {str} -- A temporary table in which to materialize results.
            The results will be streamed from this table when done. This is
            required for all large queries, and strongly recommended.
            (default: {None})
        query_job_config {QueryJobConfig} -- A QueryJobConfig to start from.
            (default: {None}, which means a fresh QueryJobConfig)

    Returns:
        QueryJob -- The resulting job.
    """
    LOG.debug("Running query: %s", querytext)
    client = get_bq_client()
    # Build the default config inside the function: a QueryJobConfig() used as
    # a default argument would be shared (and mutated) across calls.
    if query_job_config is None:
        query_job_config = QueryJobConfig()
    if temp_table:
        query_job_config.destination = temp_table
        query_job_config.write_disposition = WriteDisposition.WRITE_TRUNCATE
    return client.query(query=querytext, job_config=query_job_config)
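# A minimal sketch of calling run_query_job, assuming get_bq_client() returns
# an authenticated bigquery.Client; the scratch table ID is hypothetical. In
# recent versions of google-cloud-bigquery, QueryJobConfig.destination accepts
# a table ID string and converts it to a TableReference internally.
#
# job = run_query_job(
#     "SELECT word, COUNT(*) AS n "
#     "FROM `bigquery-public-data.samples.shakespeare` GROUP BY word",
#     temp_table="my_project.scratch.tmp_word_counts",
# )
# for row in job.result():  # result() blocks until the job finishes
#     print(row["word"], row["n"])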
def write_to_table_with_query(self, write_disposition):
    # type: (str) -> None
    """Query all rows from source table, write to destination table w/ requested disposition.

    Args:
        write_disposition: Whether to require the destination table to be
            empty, to append to it, or to overwrite (truncate) it.
    """
    job_config = QueryJobConfig()
    job_config.destination = self.destination_table.reference
    job_config.write_disposition = write_disposition
    self.bq_client.query(
        'SELECT * FROM `{}.{}.{}`'.format(self.source_table.project,
                                          self.source_table.dataset_id,
                                          self.source_table.table_id),
        job_config=job_config)
def create_table_from_query(
        self,
        query,  # type: str
        table_path,  # type: str
        write_disposition='WRITE_EMPTY',  # type: Optional[str]
        use_legacy_sql=False,  # type: Optional[bool]
        max_wait_secs=None,  # type: Optional[int]
        expected_schema=None  # type: Optional[List[SchemaField]]
):
    # type: (...) -> None
    """Creates a table in BigQuery from a specified query.

    Args:
        query: The query to run.
        table_path: The path to the table (in the client's project) to write
            the results to.
        write_disposition: Specifies behavior if table already exists. See options here:
            https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs
            under configuration.query.writeDisposition
        use_legacy_sql: Whether the query is written in standard or legacy sql.
        max_wait_secs: Seconds to wait for the query before timing out. If not
            set, the class default will be used.
        expected_schema: The expected schema of the resulting table; unused in
            this implementation
    """
    if write_disposition not in ['WRITE_TRUNCATE', 'WRITE_APPEND', 'WRITE_EMPTY']:
        raise ValueError('write_disposition must be one of WRITE_TRUNCATE, '
                         'WRITE_APPEND, or WRITE_EMPTY')

    config = QueryJobConfig()
    if self.maximum_billing_tier:
        config.maximum_billing_tier = self.maximum_billing_tier
    config.use_legacy_sql = use_legacy_sql
    config.write_disposition = write_disposition
    config.allow_large_results = True
    config.destination = self.get_table_reference_from_path(table_path)

    query_job = self._run_async_query(query, job_config=config)
    return self._wait_for_job(query_job, query,
                              max_wait_secs=max_wait_secs or self.max_wait_secs)
def execute(self, query, tbl_ref=None, append=False, preview=True):
    sql_query = self.__get_query(query)
    if tbl_ref:
        print("-- ## " + str(tbl_ref))
    print("{}{}".format("-- preview: \n" if preview else "", sql_query))
    if preview:
        return
    job_conf = QueryJobConfig()
    job_conf.use_legacy_sql = False
    if tbl_ref:
        job_conf.write_disposition = "WRITE_APPEND" if append else "WRITE_TRUNCATE"
        job_conf.default_dataset = tbl_ref.dataset_ref
        job_conf.destination = tbl_ref.table_ref
        job_conf.allow_large_results = True
    query_job = self.connect(tbl_ref.project if tbl_ref else None).query(
        sql_query, job_config=job_conf)
    if query_job.errors:
        raise Exception(query_job.errors)
def start_async_job(self, query, dest_path=None):
    # type: (str, Optional[str]) -> QueryJob
    """Makes a QueryJob for the given query to be written to the dest_path,
    and starts it, returning the job.

    Args:
        query: The query string to run.
        dest_path: String of the path to the destination table. It's None if
            the query is a Data Definition Language (DDL) statement
            (CREATE/ALTER/DROP tables), because the destination table is
            already specified in a DDL query.

    Returns:
        A QueryJob instance for the job.

    Raises:
        ValueError: If dest_path is specified for a DDL query, or dest_path
            is missing for a non-DDL query.
    """
    is_ddl_query = any(query.strip().upper().startswith(ddl_op)
                       for ddl_op in ['CREATE', 'ALTER', 'DROP'])

    # Make an asynchronous job and start it.
    config = QueryJobConfig()
    if dest_path:
        if is_ddl_query:
            raise ValueError(
                'Cannot specify destination path for the DDL query below:\n ' + query)
        # allow_large_results requires destination to be specified
        config.allow_large_results = True
        config.destination = self.get_table_reference_from_path(dest_path)
    elif not is_ddl_query:
        raise ValueError('Destination table is not specified, '
                         'but the query below is not a DDL statement:\n ' + query)
    return self._run_async_query(query, config)
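# A sketch of the two call paths of start_async_job, assuming `bq` is an
# instance of the enclosing class; the project, dataset, and table names are
# hypothetical.
#
# # Non-DDL query: a destination path is required.
# job = bq.start_async_job('SELECT 1 AS x',
#                          dest_path='my_project.scratch.one_row')
#
# # DDL query: the statement names its own destination, so dest_path stays None.
# job = bq.start_async_job(
#     'CREATE TABLE `my_project.scratch.empty_table` (x INT64)')
# job.result()  # block until the job completes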
def create_table_from_query(self,
                            query,  # type: str
                            table_path,  # type: str
                            write_disposition='WRITE_EMPTY',  # type: Optional[str]
                            use_legacy_sql=False,  # type: Optional[bool]
                            max_wait_secs=None,  # type: Optional[int]
                            expected_schema=None  # type: Optional[List[SchemaField]]
                            ):
    # type: (...) -> None
    """Creates a table in BigQuery from a specified query.

    Args:
        query: The query to run.
        table_path: The path to the table (in the client's project) to write
            the results to.
        write_disposition: One of 'WRITE_TRUNCATE', 'WRITE_APPEND',
            'WRITE_EMPTY'. Default is WRITE_EMPTY.
        use_legacy_sql: Whether the query is written in standard or legacy sql.
        max_wait_secs: Seconds to wait for the query before timing out. If not
            set, the class default will be used.
        expected_schema: The expected schema of the resulting table; unused in
            this implementation
    """
    if write_disposition not in ['WRITE_TRUNCATE', 'WRITE_APPEND', 'WRITE_EMPTY']:
        raise ValueError('write_disposition must be one of WRITE_TRUNCATE, '
                         'WRITE_APPEND, or WRITE_EMPTY')

    config = QueryJobConfig()
    if self.maximum_billing_tier:
        config.maximum_billing_tier = self.maximum_billing_tier
    config.use_legacy_sql = use_legacy_sql
    config.write_disposition = write_disposition
    config.allow_large_results = True
    config.destination = self.get_table_reference_from_path(table_path)

    query_job = self.gclient.query(query, job_config=config, retry=self.default_retry)
    return query_job.result(timeout=max_wait_secs or self.max_wait_secs)
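# The three dispositions accepted by create_table_from_query map to BigQuery's
# documented writeDisposition semantics:
#   WRITE_EMPTY    -- fail if the destination table already contains data
#   WRITE_TRUNCATE -- overwrite the destination table
#   WRITE_APPEND   -- append to the destination table
# Any other value raises ValueError before a job is created; for example,
# passing write_disposition='REPLACE' fails immediately rather than at the API.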
def materialize_view(  # pylint: disable=too-many-arguments, too-many-locals
        client: bigquery.Client,
        source_view_name: str,
        destination_table_name: str,
        project: str,
        source_dataset: str,
        destination_dataset: str,
) -> MaterializeViewResult:
    query = get_select_all_from_query(source_view_name, project=project,
                                      dataset=source_dataset)
    LOGGER.info(
        "materializing view: %s.%s -> %s.%s",
        source_dataset, source_view_name,
        destination_dataset, destination_table_name)
    LOGGER.debug("materialize_view: %s=%s", destination_table_name, [query])
    start = time.perf_counter()

    dataset_ref = client.dataset(destination_dataset)
    destination_table_ref = dataset_ref.table(destination_table_name)

    job_config = QueryJobConfig()
    job_config.destination = destination_table_ref
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE

    query_job = client.query(query, job_config=job_config)
    # getting the result will make sure that the query ran successfully
    result: bigquery.table.RowIterator = query_job.result()
    duration = time.perf_counter() - start
    total_bytes_processed = query_job.total_bytes_processed
    LOGGER.info(
        'materialized view: %s.%s, total rows: %s, %s bytes processed, took: %.3fs',
        source_dataset, source_view_name, result.total_rows,
        total_bytes_processed, duration)
    if LOGGER.isEnabledFor(logging.DEBUG):
        sample_result = list(islice(result, 3))
        LOGGER.debug("sample_result: %s", sample_result)
    return MaterializeViewResult(
        total_bytes_processed=total_bytes_processed,
        total_rows=result.total_rows)
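# A minimal sketch of calling materialize_view, assuming an authenticated
# client; the project, dataset, and view names are hypothetical.
#
# client = bigquery.Client()
# result = materialize_view(
#     client,
#     source_view_name='daily_stats_view',
#     destination_table_name='daily_stats',
#     project='my_project',
#     source_dataset='views',
#     destination_dataset='views_mat',
# )
# print(result.total_rows, result.total_bytes_processed)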