def events_blobs_generator(self) -> Generator[blob.Blob, None, None]:
    """Yields one Blob per non-folder object under the bucket's prefix.

    Object names are listed once up front through self.list. Names ending in
    '/' are skipped. When reading the events of a single object fails, a Blob
    with ERROR status and no events is yielded in its place so that the
    remaining objects are still processed.

    Yields:
      Blob objects whose blob_id is the object's gs:// URL. A successfully read
      object carries its events; a failed one carries an empty event list,
      BlobStatus.ERROR and the error text as status description.

    Raises:
      DataInConnectorError: When listing blobs in the bucket raises HttpError.
    """
    try:
      blob_names = self.list(bucket=self.bucket, prefix=self.prefix)
    except googleapiclient_errors.HttpError as error:
      # Chain explicitly so the original HTTP failure stays attached as cause.
      raise errors.DataInConnectorError(
          error=error,
          msg='Failed to get list of blobs from bucket.') from error

    for blob_name in blob_names:
      # Exclude folders from uploading to Datastore.
      if blob_name.endswith('/'):
        continue

      url = 'gs://{}/{}'.format(self.bucket, blob_name)
      # Only the read itself is guarded. The yield stays outside the try so an
      # exception raised into the generator while it is suspended is never
      # mistaken for a failure to read this blob.
      try:
        events = self.get_blob_events(blob_name)
      except (errors.DataInConnectorBlobParseError,
              errors.DataInConnectorError) as error:
        result = blob.Blob(events=[], blob_id=url, platform=_PLATFORM,
                           source=self.bucket, location=blob_name,
                           position=_START_POSITION_IN_BLOB,
                           status=blob.BlobStatus.ERROR,
                           status_desc=str(error))
      else:
        result = blob.Blob(events=events, blob_id=url, platform=_PLATFORM,
                           source=self.bucket, location=blob_name,
                           position=_START_POSITION_IN_BLOB)
      yield result
Beispiel #2
0
    def setUp(self):
        """Patches both hooks and builds the operators and test fixtures."""
        super(DataConnectorOperatorTest, self).setUp()
        # Registered first so patchers started below are stopped at teardown.
        self.addCleanup(mock.patch.stopall)

        gcs_patcher = mock.patch.object(
            gcs_hook, 'GoogleCloudStorageHook', autospec=True)
        self.mock_gcs_hook = gcs_patcher.start()

        ga_patcher = mock.patch.object(
            ga_hook, 'GoogleAnalyticsHook', autospec=True)
        self.mock_ga_hook = ga_patcher.start()

        # Two operators over the same hooks: one returning reports, one not.
        self.test_operator_kwargs = {'task_id': 'test_task_id'}
        operator_cls = data_connector_operator.DataConnectorOperator
        self.dc_operator = operator_cls(
            self.mock_gcs_hook,
            self.mock_ga_hook,
            return_report=True,
            **self.test_operator_kwargs)
        self.dc_operator_no_report = operator_cls(
            self.mock_gcs_hook, self.mock_ga_hook, **self.test_operator_kwargs)

        # Sample event payload shared by the tests.
        self.event = {
            'cid': '12345.67890',
            'ec': 'ClientID',
            'ea': 'PredictedPayer',
            'el': '20190423',
            'ev': 1,
            'z': '1558517072202080',
        }
        # Blob fixture holding the sample event twice.
        self.blob = blob.Blob(
            events=[self.event, self.event],
            blob_id='id',
            platform='GCS',
            source='bucket',
            location='blob')
Beispiel #3
0
    def test_execute_appends_empty_reports_when_no_events_to_send(self):
        """Checks that each blob without events produces an empty report."""
        empty_blob = blob.Blob(
            events=[],
            blob_id='id',
            platform='GCS',
            source='bucket',
            location='blob')
        # Feed the operator two event-less blobs through its input hook.
        fake_blobs = fake_events_generator([empty_blob, empty_blob])
        input_hook = self.dc_operator.input_hook
        input_hook.events_blobs_generator.return_value = fake_blobs

        actual_reports = self.dc_operator.execute({})

        self.assertListEqual(actual_reports, [(), ()])
Beispiel #4
0
    def test_init(self):
        """Checks the attributes of a Blob built from positional arguments."""
        blob_instance = blob.Blob(
            [{'': ''}], 'id', 'GCP', 'Source', 'Location', 0)

        actual_fields = (
            blob_instance.events,
            blob_instance.blob_id,
            blob_instance.platform,
            blob_instance.source,
            blob_instance.location,
            blob_instance.position,
            blob_instance.status,
            blob_instance.status_desc,
            blob_instance.unsent_events_indexes,
        )
        # The last three values were not passed to the constructor above.
        expected_fields = (
            [{'': ''}],
            'id',
            'GCP',
            'Source',
            'Location',
            0,
            blob.BlobStatus.UNPROCESSED,
            '',
            [],
        )

        self.assertTupleEqual(actual_fields, expected_fields)
Beispiel #5
0
  def events_blobs_generator(self) -> Generator[blob.Blob, None, None]:
    """Generates pages of the specified BigQuery table as blobs.

    The first page is fetched eagerly and any failure on it aborts the whole
    generation, because it is also used to learn the table's total row count.
    Failures on later pages are reported as ERROR blobs so the remaining pages
    are still attempted.

    Yields:
      blob: A blob object for each page, with positions advancing in steps of
      _DEFAULT_PAGE_SIZE. A page that fails with HttpError after the first one
      is yielded as a blob with no events and BlobStatus.ERROR.

    Raises:
      DataInConnectorError: Raised when the first page cannot be fetched, is
        None, or does not carry a usable 'totalRows' value.
    """
    start_index = 0
    bq_cursor = self.get_conn().cursor()

    # Get the first page to ensure the accessibility.
    try:
      query_results = self._get_tabledata_with_retries(bq_cursor=bq_cursor,
                                                       start_index=start_index)
    except googleapiclient_errors.HttpError as error:
      raise errors.DataInConnectorError(error=error, msg=str(error)) from error

    if query_results is None:
      raise errors.DataInConnectorError(
          msg='Unable to get any blobs in {}.'.format(self.url))

    try:
      total_rows = int(query_results.get('totalRows'))
    except (AttributeError, TypeError, ValueError) as error:
      # Keep the low-level failure attached as the cause for debugging.
      raise errors.DataInConnectorError(
          msg='Unable to get total rows in {}.'.format(self.url)) from error

    yield self._query_results_to_blob(query_results, start_index)
    start_index += _DEFAULT_PAGE_SIZE

    # Get the remaining pages of the requested table.
    while start_index < total_rows:
      try:
        query_results = self._get_tabledata_with_retries(
            bq_cursor=bq_cursor, start_index=start_index)
      except googleapiclient_errors.HttpError as error:
        # Generate a blob with error status.
        page_blob = blob.Blob(
            events=[], blob_id='{}/{}'.format(self.url, start_index),
            platform=_PLATFORM, source=self.dataset_id,
            location=self.table_id, position=start_index,
            status=blob.BlobStatus.ERROR, status_desc=str(error))
      else:
        page_blob = self._query_results_to_blob(query_results, start_index)

      # Yield outside the try so only the fetch itself is guarded, then move
      # on to the next page regardless of whether this one succeeded.
      yield page_blob
      start_index += _DEFAULT_PAGE_SIZE
Beispiel #6
0
  def _query_results_to_blob(
      self, query_results: Dict[Text, Any], start_index: int) -> blob.Blob:
    """Wraps one page of BigQuery query results in an event blob.

    Args:
      query_results: Raw query results for the page, or None when no results
        are available for it.
      start_index: Start index of the page within the BigQuery table rows.

    Returns:
      blob: A blob identified by '<url>/<start_index>'. It carries the
      converted events when query_results is present; otherwise it has no
      events and an ERROR status describing the missing page.
    """
    # Fields that are identical for the success and the error blob.
    shared_fields = dict(
        blob_id='{}/{}'.format(self.url, start_index),
        platform=_PLATFORM,
        source=self.dataset_id,
        location=self.table_id,
        position=start_index)

    if query_results is not None:
      page_events = self._query_results_to_maps_list(query_results)
      return blob.Blob(events=page_events, **shared_fields)

    failure_reason = 'Unable to get the blob at {} in {}.'.format(
        start_index, self.url)
    return blob.Blob(events=[], status=blob.BlobStatus.ERROR,
                     status_desc=failure_reason, **shared_fields)