  def _gcs_blob_chunk_generator(self, blob_name: str
                                ) -> Generator[bytes, None, None]:
    """Downloads and generates chunks from given blob.

    The base GoogleCloudStorageHook only allows downloading an entire file.
    To enable handling large files this class provides a chunk-wise download of
    bytes within the blob.

    Args:
      blob_name: Unique location within the bucket for the target blob.

    Yields:
      Chunks of the given blob, formatted as bytes.

    Raises:
      DataInConnectorError: When the download fails.
    """
    outio = io.BytesIO()
    try:
      bucket = self.get_conn().bucket(self.bucket)
      file_blob = bucket.get_blob(blob_name)
    except NotFound as error:
      raise errors.DataInConnectorError(
          error=error, msg='Failed to download the blob.',
          error_num=errors.ErrorNameIDMap.GCS_HOOK_ERROR_MISSING_BLOB)

    if file_blob is None:
      raise errors.DataInConnectorError(
          msg='Failed to download the blob.',
          error_num=errors.ErrorNameIDMap.GCS_HOOK_ERROR_MISSING_BLOB)

    # download_to_file treats `end` as an inclusive byte offset, so each
    # request covers _DEFAULT_CHUNK_SIZE + 1 bytes and consecutive ranges
    # advance by that stride.
    chunk_stride = _DEFAULT_CHUNK_SIZE + 1
    chunks = (file_blob.size + chunk_stride - 1) // chunk_stride
    for i in range(chunks):
      outio.truncate(0)
      outio.seek(0)

      start = i * chunk_stride
      end = min(start + _DEFAULT_CHUNK_SIZE, file_blob.size)

      try:
        file_blob.download_to_file(outio, start=start, end=end)
      except NotFound as error:
        raise errors.DataInConnectorError(
            error=error, msg='Failed to download the blob.',
            error_num=errors.ErrorNameIDMap.GCS_HOOK_ERROR_MISSING_BLOB)

      self.log.debug('Blob loading: {}%'.format(int((i + 1) / chunks * 100)))
      yield outio.getvalue()
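
  # Usage sketch (not part of the original hook): because the generator yields
  # raw byte chunks, a caller can stream a large blob to disk without holding
  # the whole file in memory, e.g.:
  #
  #   with open(local_path, 'wb') as output_file:
  #     for chunk in hook._gcs_blob_chunk_generator(blob_name='events.json'):
  #       output_file.write(chunk)
  #
  # `hook`, `local_path` and 'events.json' are illustrative placeholders.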

  def events_blobs_generator(
      self,
      processed_blobs_generator: Optional[
          Generator[Tuple[str, str], None, None]] = None
      ) -> Generator[blob.Blob, None, None]:
    """Generates all blobs from the bucket's prefix location.

    Args:
      processed_blobs_generator: A generator that yields information about
        already-processed blobs, used to skip them on re-runs.

    Yields:
      Blob objects built from the contents of blobs within the prefix
      location in the bucket.

    Raises:
      DataInConnectorError: When listing blobs in the bucket raises an
        HttpError.
    """
    try:
      blob_names = self.list(bucket=self.bucket, prefix=self.prefix)
    except googleapiclient_errors.HttpError as error:
      raise errors.DataInConnectorError(
          error=error, msg='Failed to get list of blobs from bucket.',
          error_num=errors.ErrorNameIDMap.RETRIABLE_GCS_HOOK_ERROR_HTTP_ERROR)

    if processed_blobs_generator is not None:
      for processed_file, _ in processed_blobs_generator:
        if processed_file in blob_names:
          blob_names.remove(processed_file)

    for blob_name in blob_names:
      # Skip folder placeholders (zero-byte objects whose names end in '/').
      if not blob_name.endswith('/'):
        try:
          events = self.get_blob_events(blob_name)
          yield blob.Blob(events=events, location=self.get_location(),
                          position=blob_name)
        except (errors.DataInConnectorBlobParseError,
                errors.DataInConnectorError):
          # Skip blobs whose events cannot be retrieved or parsed.
          continue
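
# A minimal driver sketch, not from the original project: it assumes `hook` is
# an initialized instance of the hook above, and that the processed-blobs
# generator yields (blob name, status) tuples, as the `(processed_file, _)`
# unpacking implies. `events` and `position` are the fields passed to
# blob.Blob(...) above.
def print_unprocessed_event_counts(hook, processed_names):
  processed = ((name, 'OK') for name in processed_names)
  for event_blob in hook.events_blobs_generator(
      processed_blobs_generator=processed):
    print(event_blob.position, len(event_blob.events))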
Example #3
    def events_blobs_generator(
        self,
        processed_blobs_generator: Optional[
            Generator[Tuple[str, str], None, None]] = None
    ) -> Generator[blob.Blob, None, None]:
        """Generates pages of specified BigQuery table as blobs.

    Args:
      processed_blobs_generator: A generator that provides the processed blob
        information that helps skip read ranges.

    Yields:
      blob: A blob object containing events from a page with length of
      _DEFAULT_PAGE_SIZE from the specified BigQuery table.

    Raises:
      DataInConnectorError: Raised when BigQuery table data cannot be accessed.
    """
        start_index = 0
        total_rows = -1
        bq_cursor = self.get_conn().cursor()

        # Fetch the first row to ensure the table is accessible.
        try:
            query_results = self._get_tabledata_with_retries(
                bq_cursor=bq_cursor, start_index=start_index, max_results=1)
        except googleapiclient_errors.HttpError as error:
            raise errors.DataInConnectorError(
                error=error,
                msg=str(error),
                error_num=errors.ErrorNameIDMap.
                RETRIABLE_BQ_HOOK_ERROR_HTTP_ERROR)
        else:
            if query_results is None:
                raise errors.DataInConnectorError(
                    msg='Unable to get any blobs in {}.'.format(self.url),
                    error_num=errors.ErrorNameIDMap.BQ_HOOK_ERROR_NO_BLOBS)
            try:
                total_rows = int(query_results.get('totalRows'))
            except (AttributeError, TypeError, ValueError):
                raise errors.DataInConnectorError(
                    msg='Unable to get total rows in {}.'.format(self.url),
                    error_num=errors.ErrorNameIDMap.
                    RETRIABLE_BQ_HOOK_ERROR_NO_TOTAL_ROWS)

        processed_start, processed_end = self._get_next_range(
            processed_blobs_generator)

        # Get the pages of the requested table.
        while start_index < total_rows:
            num_rows = min(total_rows - start_index, _DEFAULT_PAGE_SIZE)
            end_index = start_index + num_rows

            if processed_start != -1 and processed_start < end_index:
                num_rows = processed_start - start_index
                if num_rows == 0:
                    start_index = processed_end
                    processed_start, processed_end = self._get_next_range(
                        processed_blobs_generator)
                    continue

            try:
                query_results = self._get_tabledata_with_retries(
                    bq_cursor=bq_cursor,
                    start_index=start_index,
                    max_results=num_rows)
            except googleapiclient_errors.HttpError:
                # Skip pages that fail to download; the `finally` clause still
                # advances start_index so the loop moves on to the next page.
                pass
            else:
                yield self._query_results_to_blob(query_results, start_index,
                                                  num_rows)
            finally:
                start_index = start_index + num_rows
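
# Standalone sketch (not from the original project) of the range-skipping loop
# above, so the paging logic can be checked without BigQuery. Consistent with
# the `!= -1` checks, it assumes _get_next_range returns (start, end) row
# ranges with an exclusive end, and (-1, -1) once the processed generator is
# exhausted.
def page_ranges(total_rows, processed, page_size=3):
    def next_range():
        return next(processed, (-1, -1))

    processed_start, processed_end = next_range()
    start_index = 0
    while start_index < total_rows:
        num_rows = min(total_rows - start_index, page_size)
        end_index = start_index + num_rows
        if processed_start != -1 and processed_start < end_index:
            num_rows = processed_start - start_index
            if num_rows == 0:
                start_index = processed_end
                processed_start, processed_end = next_range()
                continue
        yield (start_index, num_rows)
        start_index += num_rows

# For example, skipping already-processed rows 3-4 in a 10-row table:
# list(page_ranges(10, iter([(3, 5)]))) == [(0, 3), (5, 3), (8, 2)]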

  def test_events_blobs_generator_raises_data_in_connector_error(self):
    self.mocked_list.side_effect = errors.DataInConnectorError()

    with self.assertRaises(errors.DataInConnectorError):
      self.gcs_hook.events_blobs_generator().__next__()
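
  # A companion test sketch along the same lines (assumptions: the fixture's
  # `self.mocked_list` patches the hook's list method, as used above, and the
  # test module imports `mock` from `unittest`). An HttpError raised while
  # listing blobs should be wrapped in a DataInConnectorError by the hook.
  def test_events_blobs_generator_wraps_http_error(self):
    self.mocked_list.side_effect = googleapiclient_errors.HttpError(
        resp=mock.MagicMock(), content=b'listing failed')
    with self.assertRaises(errors.DataInConnectorError):
      self.gcs_hook.events_blobs_generator().__next__()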