Example #1
    def _gcs_blob_chunk_generator(
            self, blob_name: str) -> Generator[bytes, None, None]:
        """Downloads and generates chunks from given blob.

    The base GoogleCloudStorageHook only allows downloading an entire file.
    To enable handling large files this class provides a chunk-wise download of
    bytes within the blob.

    Args:
      blob_name: Unique location within the bucket for the target blob.

    Yields:
      Chunks of the given blob, formatted as bytes.

    Raises:
      DataInConnectorError: When download failed.
    """
        outio = io.BytesIO()
        try:
            bucket = self.get_conn().bucket(self.bucket)
            file_blob = bucket.get_blob(blob_name)
        except NotFound as error:
            raise errors.DataInConnectorError(
                error=error,
                msg='Failed to download the blob.',
                error_num=errors.ErrorNameIDMap.GCS_HOOK_ERROR_MISSING_BLOB)

        if file_blob is None:
            raise errors.DataInConnectorError(
                msg='Failed to download the blob.',
                error_num=errors.ErrorNameIDMap.GCS_HOOK_ERROR_MISSING_BLOB)

        # Floor division avoids float precision issues for very large blobs.
        chunks = file_blob.size // _DEFAULT_CHUNK_SIZE + 1
        for i in range(0, chunks):
            outio.truncate(0)
            outio.seek(0)

            # download_to_file treats `end` as the last byte to download
            # (inclusive), so each chunk spans _DEFAULT_CHUNK_SIZE + 1 bytes.
            start = i * (_DEFAULT_CHUNK_SIZE + 1)
            end = i * (_DEFAULT_CHUNK_SIZE + 1) + _DEFAULT_CHUNK_SIZE
            if end > file_blob.size:
                end = file_blob.size

            try:
                file_blob.download_to_file(outio, start=start, end=end)
            except NotFound as error:
                raise errors.DataInConnectorError(
                    error=error,
                    msg='Failed to download the blob.',
                    error_num=errors.ErrorNameIDMap.GCS_HOOK_ERROR_MISSING_BLOB
                )

            self.log.debug('Blob loading: {}%'.format(int(i / chunks * 100)))
            yield outio.getvalue()
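
A minimal usage sketch for the generator above. The hook instance name (gcs_hook) and the blob path are assumptions for illustration, not part of the original source:

    # Reassemble a large blob chunk by chunk instead of downloading it in one
    # call. `gcs_hook` is an assumed, already-configured hook instance and the
    # blob name is hypothetical.
    payload = bytearray()
    for chunk in gcs_hook._gcs_blob_chunk_generator('exports/events.json'):
        payload.extend(chunk)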
Example #2
    def events_blobs_generator(
        self,
        processed_blobs_generator: Optional[Generator[Tuple[str, str], None,
                                                      None]] = None
    ) -> Generator[blob.Blob, None, None]:
        """Generates all blobs from the bucket's prefix location.

    Args:
      processed_blobs_generator: A generator that provides the processed blob
        information that helps skip read ranges.

    Yields:
      A generator that generates Blob objects from blob contents within a
      prefix location in the bucket.

    Raises:
      DataInConnectorError: When listing blob in bucket returns a HttpError.
    """
        try:
            blob_names = self.list(bucket=self.bucket, prefix=self.prefix)
        except googleapiclient_errors.HttpError as error:
            raise errors.DataInConnectorError(
                error=error,
                msg='Failed to get list of blobs from bucket.',
                error_num=errors.ErrorNameIDMap.
                RETRIABLE_GCS_HOOK_ERROR_HTTP_ERROR)

        if processed_blobs_generator is not None:
            for processed_file, _ in processed_blobs_generator:
                if processed_file in blob_names:
                    blob_names.remove(processed_file)

        for blob_name in blob_names:
            if not blob_name.endswith('/'):
                try:
                    events = self.get_blob_events(blob_name)
                    yield blob.Blob(events=events,
                                    location=self.get_location(),
                                    position=blob_name)
                except (errors.DataInConnectorBlobParseError,
                        errors.DataInConnectorError):
                    # Skip blobs that cannot be read or parsed.
                    continue
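
A hedged usage sketch; the hook instance and the exact shape of previously processed entries are assumptions for illustration:

    # Iterate blobs under the configured bucket/prefix, skipping any blob a
    # progress generator has already reported. `gcs_hook` and the tuple
    # contents are hypothetical.
    def already_processed():
        yield ('exports/2023-01-01.json', 'done')

    for event_blob in gcs_hook.events_blobs_generator(
            processed_blobs_generator=already_processed()):
        print(event_blob.position, len(event_blob.events))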
Example #3
    def test_events_blobs_generator_raises_data_in_connector_error(self):
        self.mocked_list.side_effect = errors.DataInConnectorError()

        with self.assertRaises(errors.DataInConnectorError):
            next(self.gcs_hook.events_blobs_generator())
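
Note that the generator must be advanced once for the assertion to hold: a generator function's body does not run until the first iteration, so calling events_blobs_generator() alone would never reach the mocked list call that raises the error.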
Example #4
    def events_blobs_generator(
        self,
        processed_blobs_generator: Optional[Generator[Tuple[str, str], None,
                                                      None]] = None
    ) -> Generator[blob.Blob, None, None]:
        """Generates pages of specified BigQuery table as blobs.

    Args:
      processed_blobs_generator: A generator that provides the processed blob
        information that helps skip read ranges.

    Yields:
      blob: A blob object containing events from a page with length of
      _DEFAULT_PAGE_SIZE from the specified BigQuery table.

    Raises:
      DataInConnectorError: Raised when BigQuery table data cannot be accessed.
    """
        start_index = 0
        total_rows = -1
        bq_cursor = self.get_conn().cursor()

        # Get the first row to ensure the table is accessible.
        try:
            query_results = self._get_tabledata_with_retries(
                bq_cursor=bq_cursor, start_index=start_index, max_results=1)
        except googleapiclient_errors.HttpError as error:
            raise errors.DataInConnectorError(
                error=error,
                msg=str(error),
                error_num=errors.ErrorNameIDMap.
                RETRIABLE_BQ_HOOK_ERROR_HTTP_ERROR)
        else:
            if query_results is None:
                raise errors.DataInConnectorError(
                    msg='Unable to get any blobs in {}.'.format(self.url),
                    error_num=errors.ErrorNameIDMap.BQ_HOOK_ERROR_NO_BLOBS)
            try:
                total_rows = int(query_results.get('totalRows'))
            except (AttributeError, TypeError, ValueError):
                raise errors.DataInConnectorError(
                    msg='Unable to get total rows in {}.'.format(self.url),
                    error_num=errors.ErrorNameIDMap.
                    RETRIABLE_BQ_HOOK_ERROR_NO_TOTAL_ROWS)

        processed_start, processed_end = self._get_next_range(
            processed_blobs_generator)

        # Get the pages of the requested table.
        while start_index < total_rows:
            num_rows = min(total_rows - start_index, _DEFAULT_PAGE_SIZE)
            end_index = start_index + num_rows

            # If an already-processed range starts inside this page, shrink
            # the page so it ends where that range begins. A zero-length page
            # means we are at the range itself, so jump past it and fetch the
            # next processed range.
            if processed_start != -1 and processed_start < end_index:
                num_rows = processed_start - start_index
                if num_rows == 0:
                    start_index = processed_end
                    processed_start, processed_end = self._get_next_range(
                        processed_blobs_generator)
                    continue

            try:
                query_results = self._get_tabledata_with_retries(
                    bq_cursor=bq_cursor,
                    start_index=start_index,
                    max_results=num_rows)
            except googleapiclient_errors.HttpError:
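                # A failed page is skipped rather than aborting the run; the
                # finally clause below still advances start_index.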
                pass
            else:
                yield self._query_results_to_blob(query_results, start_index,
                                                  num_rows)
            finally:
                start_index = start_index + num_rows
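
A minimal usage sketch under stated assumptions: bq_hook is an already-configured instance of this hook, and handle() is a placeholder for downstream processing:

    # Stream a BigQuery table page by page; each yielded blob carries up to
    # _DEFAULT_PAGE_SIZE rows of events.
    for page_blob in bq_hook.events_blobs_generator():
        handle(page_blob.events)  # handle() is a hypothetical consumer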