Example no. 1
def wait_on_bq_job_id(bq_client: bigquery.Client,
                      job_id: str,
                      polling_timeout: int,
                      polling_interval: int = 1) -> bool:
    """"
    Wait for a BigQuery Job ID to complete.

    Args:
        bq_client: bigquery.Client
        job_id: str the BQ job ID to wait on
        polling_timeout: int number of seconds to poll this job ID
        polling_interval: frequency to query the job state during polling
    Returns:
        bool: True if the job reached the DONE state without errors, False if
        it was still RUNNING or PENDING when the polling timeout elapsed.
    Raises:
        exceptions.BigQueryJobFailure if the job failed.
        google.api_core.exceptions.NotFound if the job id cannot be found.
    """
    start_poll = time.monotonic()
    while time.monotonic() - start_poll < (polling_timeout - polling_interval):
        job: Union[bigquery.LoadJob,
                   bigquery.QueryJob] = bq_client.get_job(job_id)
        if job.state == "DONE":
            check_for_bq_job_and_children_errors(bq_client, job)
            return True
        if job.state in {"RUNNING", "PENDING"}:
            print(f"waiting on BigQuery Job {job.job_id}")
            time.sleep(polling_interval)
    print(f"reached polling timeout waiting for bigquery job {job_id}")
    return False
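A minimal usage sketch for the helper above; the query text is illustrative and the client is assumed to be authenticated against a project:

from google.cloud import bigquery

bq_client = bigquery.Client()
query_job = bq_client.query("SELECT 1")  # submit an asynchronous query job
# Poll the job for up to 60 seconds, checking its state once per second.
if not wait_on_bq_job_id(bq_client, query_job.job_id, polling_timeout=60):
    print(f"job {query_job.job_id} did not finish before the polling timeout")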
Example no. 2
def retry_query(gcs_client: storage.Client, bq_client: bigquery.Client,
                lock_blob: storage.Blob, failed_job_id: str,
                table: bigquery.TableReference, retry_attempt_cnt: int):
    """Retry a query that failed"""
    if retry_attempt_cnt > 1:
        # if this is not the first retry, strip the previous retry attempt
        # suffix '_xx' (3 chars) from the job_id before appending the new one
        retry_job_id = f"{failed_job_id[:-3]}_{retry_attempt_cnt:02}"  # pad with zero
    else:
        retry_job_id = f"{failed_job_id}_{retry_attempt_cnt:02}"  # pad with zero
    failed_job: bigquery.QueryJob = bq_client.get_job(failed_job_id)
    job_config: bigquery.QueryJobConfig = bigquery.QueryJobConfig(
        table_definitions=failed_job.table_definitions, use_legacy_sql=False)
    retry_job = bq_client.query(failed_job.query,
                                job_config=job_config,
                                job_id=retry_job_id)
    # To keep track of retry attempts between cloud
    # function invocations, the retry count state is
    # kept in the _bqlock lock file.
    utils.handle_bq_lock(gcs_client,
                         lock_blob,
                         retry_job_id,
                         table,
                         retry_attempt_cnt=retry_attempt_cnt)
    logging.log_bigquery_job(
        retry_job, table, f"Submitted asynchronous query job: {retry_job_id}")
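A hedged invocation sketch: the bucket path, table, and failed job ID below are placeholders, and retry_query is assumed to run inside the same cloud function that keeps the retry count in the _bqlock blob described in the comment above:

from google.cloud import bigquery, storage

gcs_client = storage.Client()
bq_client = bigquery.Client()
# _bqlock blob that stores the retry count between invocations (illustrative path).
lock_blob = gcs_client.bucket("my-ingest-bucket").blob("my_dataset/my_table/_bqlock")
table = bigquery.TableReference.from_string("my-project.my_dataset.my_table")
# Second retry attempt for a previously submitted query job.
retry_query(gcs_client, bq_client, lock_blob, "bqjob_failed_01", table, retry_attempt_cnt=2)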
Example no. 3
def block_until_done(
    client: Client,
    bq_job: Union[bigquery.job.query.QueryJob, bigquery.job.load.LoadJob],
    timeout: int = 1800,
    retry_cadence: float = 1,
):
    """
    Waits for bq_job to finish running, up to a maximum amount of time specified by the timeout parameter (defaulting to 30 minutes).

    Args:
        client: A bigquery.client.Client to monitor the bq_job.
        bq_job: The bigquery.job.QueryJob or bigquery.job.LoadJob to block on until it has finished running.
        timeout: An optional number of seconds for setting the time limit of the job.
        retry_cadence: An optional number of seconds between successive checks of the job's state.

    Raises:
        BigQueryJobStillRunning exception if the job is still running after the timeout has elapsed.
        BigQueryJobCancelled exception to signify that the job has been cancelled (i.e. from timeout or KeyboardInterrupt).
    """

    # For test environments, retry more aggressively
    if flags_helper.is_test():
        retry_cadence = 0.1

    def _wait_until_done(bq_job):
        if client.get_job(bq_job).state in ["PENDING", "RUNNING"]:
            raise BigQueryJobStillRunning(job_id=bq_job.job_id)

    try:
        retryer = Retrying(
            wait=wait_fixed(retry_cadence),
            stop=stop_after_delay(timeout),
            retry=retry_if_exception_type(BigQueryJobStillRunning),
            reraise=True,
        )
        retryer(_wait_until_done, bq_job)

    finally:
        if client.get_job(bq_job).state in ["PENDING", "RUNNING"]:
            client.cancel_job(bq_job)
            raise BigQueryJobCancelled(job_id=bq_job.job_id)

        if bq_job.exception():
            raise bq_job.exception()
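For reference, Retrying, wait_fixed, stop_after_delay, and retry_if_exception_type in the snippet above come from the tenacity library. A hedged usage sketch, with an illustrative query:

from google.cloud import bigquery

client = bigquery.Client()
bq_job = client.query("SELECT COUNT(*) FROM `bigquery-public-data.samples.shakespeare`")
# Block for at most 10 minutes, re-checking the job state every 2 seconds.
block_until_done(client, bq_job, timeout=600, retry_cadence=2)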
Example no. 4
class BigQueryDatasetsProvider:
    def __init__(self,
                 client: Optional[Client] = None,
                 logger: Optional[logging.Logger] = None):
        self.client = client
        if client is None:
            self.client = Client()
        self.logger = logger
        if logger is None:
            self.logger = logging.getLogger(__name__)

    def get_facets(self, job_id: str) -> BigQueryFacets:
        inputs = []
        output = None
        run_facets = {}
        try:
            try:
                job = self.client.get_job(job_id=job_id)
                props = job._properties

                run_stat_facet, dataset_stat_facet = self._get_output_statistics(
                    props)

                run_facets.update({"bigQuery_job": run_stat_facet})
                inputs = self._get_input_from_bq(props)
                output = self._get_output_from_bq(props)
                if output and dataset_stat_facet:
                    output.custom_facets.update({"stats": dataset_stat_facet})
                    output.output_facets.update({
                        'outputStatistics':
                        dataset_stat_facet.to_openlineage()
                    })

            finally:
                # Ensure client has close() defined, otherwise ignore.
                # NOTE: close() was introduced in python-bigquery v1.23.0
                if hasattr(self.client, "close"):
                    self.client.close()
        except Exception as e:
            self.logger.error(
                f"Cannot retrieve job details from BigQuery.Client. {e}",
                exc_info=True)
            run_facets.update({
                "bigQuery_error":
                BigQueryErrorRunFacet(
                    clientError=f"{e}: {traceback.format_exc()}", )
            })
        return BigQueryFacets(run_facets, inputs, output)

    def _get_output_statistics(self, properties) \
            -> Tuple[BigQueryJobRunFacet, Optional[BigQueryStatisticsDatasetFacet]]:
        stages = get_from_nullable_chain(properties,
                                         ['statistics', 'query', 'queryPlan'])
        json_props = json.dumps(properties)

        if not stages:
            if get_from_nullable_chain(properties, ['statistics', 'query', 'statementType']) \
                    in ['CREATE_VIEW', 'CREATE_TABLE', 'ALTER_TABLE']:
                return BigQueryJobRunFacet(cached=False), None

            # we're probably getting cached results
            if get_from_nullable_chain(properties,
                                       ['statistics', 'query', 'cacheHit']):
                return BigQueryJobRunFacet(cached=True), None
            if get_from_nullable_chain(properties,
                                       ['status', 'state']) != "DONE":
                raise ValueError(
                    "Trying to extract data from running bigquery job")
            raise ValueError(
                f"BigQuery properties did not have required data: queryPlan - {json_props}"
            )

        out_stage = stages[-1]
        out_rows = out_stage.get("recordsWritten", None)
        out_bytes = out_stage.get("shuffleOutputBytes", None)
        billed_bytes = get_from_nullable_chain(
            properties, ['statistics', 'query', 'totalBytesBilled'])
        run_facet = BigQueryJobRunFacet(
            cached=False,
            billedBytes=int(billed_bytes) if billed_bytes else None,
            properties=json_props)
        stats_facet = BigQueryStatisticsDatasetFacet(
            rowCount=int(out_rows),
            size=int(out_bytes)) if out_bytes and out_rows else None
        return run_facet, stats_facet

    def _get_input_from_bq(self, properties):
        bq_input_tables = get_from_nullable_chain(
            properties, ['statistics', 'query', 'referencedTables'])
        if not bq_input_tables:
            return []

        input_table_names = [
            self._bq_table_name(bq_t) for bq_t in bq_input_tables
        ]
        sources = [self._source() for bq_t in bq_input_tables]
        try:
            return [
                Dataset.from_table_schema(source=source,
                                          table_schema=table_schema)
                for table_schema, source in zip(
                    self._get_table_schemas(input_table_names), sources)
            ]
        except Exception as e:
            self.logger.warning(f'Could not extract schema from bigquery. {e}')
            return [
                Dataset.from_table(source, table)
                for table, source in zip(input_table_names, sources)
            ]

    def _get_output_from_bq(self, properties) -> Optional[Dataset]:
        bq_output_table = get_from_nullable_chain(
            properties, ['configuration', 'query', 'destinationTable'])
        if not bq_output_table:
            return None

        output_table_name = self._bq_table_name(bq_output_table)
        source = self._source()

        table_schema = self._get_table_safely(output_table_name)
        if table_schema:
            return Dataset.from_table_schema(
                source=source,
                table_schema=table_schema,
            )
        else:
            self.logger.warning("Could not resolve output table from bq")
            return Dataset.from_table(source, output_table_name)

    def _get_table_safely(self, output_table_name):
        try:
            return self._get_table(output_table_name)
        except Exception as e:
            self.logger.warning(
                f'Could not extract output schema from bigquery. {e}')
        return None

    def _get_table_schemas(self, tables: List[str]) \
            -> List[DbTableSchema]:
        # Avoid querying BigQuery by returning an empty array
        # if no tables have been provided.
        if not tables:
            return []

        return [self._get_table(table) for table in tables]

    def _get_table(self, table: str) -> Optional[DbTableSchema]:
        bq_table = self.client.get_table(table)
        if not bq_table._properties:
            return
        table = bq_table._properties

        fields = get_from_nullable_chain(table, ['schema', 'fields'])
        if not fields:
            return

        columns = [
            DbColumn(name=field.get('name'),
                     type=field.get('type'),
                     description=field.get('description'),
                     ordinal_position=i)
            for i, field in enumerate(fields)
        ]

        return DbTableSchema(
            schema_name=table.get('tableReference').get('projectId') + '.' +
            table.get('tableReference').get('datasetId'),
            table_name=DbTableName(table.get('tableReference').get('tableId')),
            columns=columns)

    def _source(self) -> Source:
        return Source(scheme='bigquery', connection_url='bigquery')

    def _bq_table_name(self, bq_table):
        project = bq_table.get('projectId')
        dataset = bq_table.get('datasetId')
        table = bq_table.get('tableId')
        return f"{project}.{dataset}.{table}"