def events_blobs_generator(self) -> Generator[blob.Blob, None, None]:
    """Generates a Blob for every non-folder object under the bucket prefix.

    Blob names are listed once through ``self.list`` when iteration starts.
    Names ending with '/' are treated as folders and skipped. If the events
    of a blob cannot be retrieved, a Blob with ERROR status and no events is
    yielded for it and iteration continues with the next name.

    Yields:
      A Blob object built from the contents of each blob within the prefix
      location in the bucket.

    Raises:
      DataInConnectorError: When listing blobs in the bucket raises an
        HttpError. The original HttpError is attached as the cause.
    """
    try:
        blob_names = self.list(bucket=self.bucket, prefix=self.prefix)
    except googleapiclient_errors.HttpError as error:
        raise errors.DataInConnectorError(
            error=error,
            msg='Failed to get list of blobs from bucket.') from error

    for blob_name in blob_names:
        # Exclude folders from uploading to Datastore.
        if blob_name.endswith('/'):
            continue

        url = 'gs://{}/{}'.format(self.bucket, blob_name)
        # Only the fetch is guarded, so that exceptions raised at the yield
        # point are not mistaken for a failure to read this blob.
        try:
            events = self.get_blob_events(blob_name)
        except (errors.DataInConnectorBlobParseError,
                errors.DataInConnectorError) as error:
            current_blob = blob.Blob(events=[],
                                     blob_id=url,
                                     platform=_PLATFORM,
                                     source=self.bucket,
                                     location=blob_name,
                                     position=_START_POSITION_IN_BLOB,
                                     status=blob.BlobStatus.ERROR,
                                     status_desc=str(error))
        else:
            current_blob = blob.Blob(events=events,
                                     blob_id=url,
                                     platform=_PLATFORM,
                                     source=self.bucket,
                                     location=blob_name,
                                     position=_START_POSITION_IN_BLOB)
        yield current_blob
def setUp(self):
    """Patches both hook classes and builds the operators and sample data."""
    super(DataConnectorOperatorTest, self).setUp()
    # Every patcher started below is stopped automatically after each test.
    self.addCleanup(mock.patch.stopall)

    gcs_patcher = mock.patch.object(
        gcs_hook, 'GoogleCloudStorageHook', autospec=True)
    self.mock_gcs_hook = gcs_patcher.start()
    ga_patcher = mock.patch.object(
        ga_hook, 'GoogleAnalyticsHook', autospec=True)
    self.mock_ga_hook = ga_patcher.start()

    self.test_operator_kwargs = {'task_id': 'test_task_id'}
    operator_class = data_connector_operator.DataConnectorOperator
    # One operator that is asked to return reports, one left at the default.
    self.dc_operator = operator_class(self.mock_gcs_hook,
                                      self.mock_ga_hook,
                                      return_report=True,
                                      **self.test_operator_kwargs)
    self.dc_operator_no_report = operator_class(self.mock_gcs_hook,
                                                self.mock_ga_hook,
                                                **self.test_operator_kwargs)

    self.event = {
        'cid': '12345.67890',
        'ec': 'ClientID',
        'ea': 'PredictedPayer',
        'el': '20190423',
        'ev': 1,
        'z': '1558517072202080'
    }
    # Sample blob holding the same event twice.
    self.blob = blob.Blob(events=[self.event, self.event],
                          blob_id='id',
                          platform='GCS',
                          source='bucket',
                          location='blob')
def test_execute_appends_empty_reports_when_no_events_to_send(self):
    """Checks that each blob without events contributes an empty report."""
    empty_blob = blob.Blob(events=[],
                           blob_id='id',
                           platform='GCS',
                           source='bucket',
                           location='blob')
    generator_mock = self.dc_operator.input_hook.events_blobs_generator
    generator_mock.return_value = fake_events_generator(
        [empty_blob, empty_blob])

    reports = self.dc_operator.execute({})

    self.assertListEqual(reports, [(), ()])
def test_init(self):
    """Checks the fields of a Blob built from positional arguments."""
    blob_instance = blob.Blob(
        [{'': ''}], 'id', 'GCP', 'Source', 'Location', 0)

    actual_fields = (
        blob_instance.events,
        blob_instance.blob_id,
        blob_instance.platform,
        blob_instance.source,
        blob_instance.location,
        blob_instance.position,
        blob_instance.status,
        blob_instance.status_desc,
        blob_instance.unsent_events_indexes,
    )
    # The last three values were not passed to the constructor above.
    expected_fields = (
        [{'': ''}],
        'id',
        'GCP',
        'Source',
        'Location',
        0,
        blob.BlobStatus.UNPROCESSED,
        '',
        [],
    )

    self.assertTupleEqual(actual_fields, expected_fields)
def events_blobs_generator(self) -> Generator[blob.Blob, None, None]:
    """Generates pages of specified BigQuery table as blobs.

    The first page is fetched before anything is yielded; any failure there
    aborts the generator with a DataInConnectorError. Failures to fetch a
    later page do not abort: a Blob with ERROR status is yielded for that
    page and the generator moves on to the next start index.

    Yields:
      blob: A blob object containing events from one page of the specified
        BigQuery table. The start index advances by _DEFAULT_PAGE_SIZE rows
        after every page.

    Raises:
      DataInConnectorError: Raised when the first page cannot be fetched, is
        None, or does not provide a 'totalRows' value convertible to int.
    """
    start_index = 0
    bq_cursor = self.get_conn().cursor()

    # Get the first page to ensure the accessibility.
    try:
        query_results = self._get_tabledata_with_retries(
            bq_cursor=bq_cursor, start_index=start_index)
    except googleapiclient_errors.HttpError as error:
        raise errors.DataInConnectorError(
            error=error, msg=str(error)) from error

    if query_results is None:
        raise errors.DataInConnectorError(
            msg='Unable to get any blobs in {}.'.format(self.url))

    try:
        total_rows = int(query_results.get('totalRows'))
    except (AttributeError, TypeError, ValueError) as error:
        raise errors.DataInConnectorError(
            msg='Unable to get total rows in {}.'.format(self.url)) from error

    yield self._query_results_to_blob(query_results, start_index)
    start_index += _DEFAULT_PAGE_SIZE

    # Get the remaining pages of the requested table.
    while start_index < total_rows:
        try:
            query_results = self._get_tabledata_with_retries(
                bq_cursor=bq_cursor, start_index=start_index)
        except googleapiclient_errors.HttpError as error:
            # Generate a blob with error status.
            blob_unique_id = '{}/{}'.format(self.url, start_index)
            page_blob = blob.Blob(events=[],
                                  blob_id=blob_unique_id,
                                  platform=_PLATFORM,
                                  source=self.dataset_id,
                                  location=self.table_id,
                                  position=start_index,
                                  status=blob.BlobStatus.ERROR,
                                  status_desc=str(error))
        else:
            page_blob = self._query_results_to_blob(query_results,
                                                    start_index)
        yield page_blob
        # Advance whether or not the page could be fetched.
        start_index += _DEFAULT_PAGE_SIZE
def _query_results_to_blob(
    self, query_results: Dict[Text, Any], start_index: int) -> blob.Blob:
    """Builds the event blob for one page of BigQuery query results.

    Args:
      query_results: Raw query results of the page. When this is None, an
        error blob is returned instead of an event blob.
      start_index: Start index of the page within the BigQuery table rows.

    Returns:
      blob: Event blob containing the event list of the page, or a blob with
        ERROR status and an empty event list when query_results is None.
    """
    page_blob_id = '{}/{}'.format(self.url, start_index)

    if query_results is not None:
        events = self._query_results_to_maps_list(query_results)
        return blob.Blob(events=events,
                         blob_id=page_blob_id,
                         platform=_PLATFORM,
                         source=self.dataset_id,
                         location=self.table_id,
                         position=start_index)

    failure_description = 'Unable to get the blob at {} in {}.'.format(
        start_index, self.url)
    return blob.Blob(events=[],
                     blob_id=page_blob_id,
                     platform=_PLATFORM,
                     source=self.dataset_id,
                     location=self.table_id,
                     position=start_index,
                     status=blob.BlobStatus.ERROR,
                     status_desc=failure_description)