async def _consume(
    self,
    payload_queue: asyncio.Queue,
    request_meta: Optional[RequestMeta] = None,
    add_start_dispatch: Optional[List[str]] = None,
) -> None:
    while True:
        try:
            task = await payload_queue.get()
            # Determine whether the provider has returned a `Payload`, or a task.
            # If it is a task, load the defined archiver plugin to load the
            # `Payload`, otherwise, simply continue on with the scanning.
            if isinstance(task, Payload):
                request = Request([task], request_meta)
                await self.scan_request(request, add_start_dispatch)
            else:
                for source_archiver, task_meta in task.items():
                    self.log.debug(
                        f'Provider task received: source_archiver: {source_archiver}, '
                        f'task_meta: {task_meta}'
                    )
                    try:
                        ar = ArchiverResponse(task_meta)
                        payload = await self._loaded_source_archiver_plugins[
                            source_archiver
                        ].get(ar)
                        if payload:
                            request = Request([payload], request_meta)
                            await self.scan_request(request, add_start_dispatch)
                    except Exception as e:
                        self.log.warning(
                            f'"{task_meta}" failed with archiver "{source_archiver}": {str(e)}'
                        )
            payload_queue.task_done()
        except asyncio.QueueEmpty:
            pass
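# A minimal sketch (not part of the source above) of the two queue-item shapes
# `_consume` accepts: a `Payload` scanned as-is, or a dict mapping a source
# archiver plugin name to the metadata its get() needs. The plugin name
# 'filedir' and the 'path' field are assumptions for illustration.
import asyncio

from stoq import Payload


async def _enqueue_examples(payload_queue: asyncio.Queue) -> None:
    # Shape 1: a Payload, passed straight to scan_request()
    await payload_queue.put(Payload(b'scan this content directly'))
    # Shape 2: {source_archiver_name: task_meta}; _consume asks the named
    # source archiver to retrieve the Payload before scanning
    await payload_queue.put({'filedir': {'path': '/tmp/sample.bin'}})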
async def archive(self, payload: Payload, request: Request) -> ArchiverResponse:
    """
    Archive payload to Azure Blob Storage
    """
    if self.use_sha:
        filename = hashlib.sha1(payload.content).hexdigest()
        filename = f'{"/".join(list(filename[:5]))}/{filename}'
    elif self.use_datetime:
        datetime_path = datetime.now().strftime('%Y/%m/%d')
        filename = f'{datetime_path}/{payload.payload_id}'
    else:
        filename = payload.payload_id
    blob_client: BlobClient = BlobClient.from_connection_string(
        conn_str=self.conn_str,
        container_name=self.archive_container,
        blob_name=filename,
    )
    try:
        await blob_client.upload_blob(payload.content)
    except ResourceExistsError:
        pass
    await blob_client.close()
    return ArchiverResponse(
        {'container_name': self.archive_container, 'blob_name': filename}
    )
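# A small illustration (not part of the plugin) of the `use_sha` sharding
# scheme above: the first five hex characters of the SHA-1 digest become
# nested directory levels, spreading blobs across prefixes so no single
# prefix accumulates too many objects.
import hashlib

digest = hashlib.sha1(b'example content').hexdigest()
sharded = f'{"/".join(list(digest[:5]))}/{digest}'
# A digest beginning 'd4c2e...' would yield 'd/4/c/2/e/d4c2e...'
print(sharded)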
async def archive(
    self, payload: Payload, request: Request
) -> Optional[ArchiverResponse]:
    """
    Archive payload to Redis
    """
    self.conn.set(f'{payload.payload_id}_meta', str(payload.payload_meta))
    self.conn.set(f'{payload.payload_id}_buf', payload.content)
    self.conn.rpush(self.redis_queue, payload.payload_id)
    return ArchiverResponse({'msg_id': payload.payload_id})
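# A hedged retrieval sketch for the Redis key layout above, using redis-py.
# The host, port, and queue name ('stoq') are assumptions standing in for
# `self.conn` and `self.redis_queue`.
import redis

conn = redis.Redis(host='localhost', port=6379)
queued_id = conn.lpop('stoq')
if queued_id:
    payload_id = queued_id.decode()
    meta = conn.get(f'{payload_id}_meta')    # stringified PayloadMeta
    content = conn.get(f'{payload_id}_buf')  # raw payload bytes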
async def archive(
    self, payload: Payload, request: Request
) -> Optional[ArchiverResponse]:
    """
    Archive payload to a Google Cloud Pub/Sub topic
    """
    topic = f'projects/{self.project_id}/topics/{self.topic}'
    self._publish_connect(topic)
    # publish() returns a future that resolves to the published message ID
    future = self.publish_client.publish(
        topic, payload.content, meta=payload.payload_meta
    )
    return ArchiverResponse({'msg_id': future.result()})
def archive(
    self, payload: Payload, request_meta: RequestMeta
) -> Optional[ArchiverResponse]:
    if self.RAISE_EXCEPTION:
        raise Exception('Test exception please ignore')
    ar = ArchiverResponse({'file_save_id': 12345})
    if self.RETURN_ERRORS:
        ar.errors += ['Test error please ignore']
    return ar
async def test_source_archive(self):
    s = Stoq(base_dir=utils.get_data_dir(), source_archivers=['simple_archiver'])
    simple_archiver = s.load_plugin('simple_archiver')
    simple_archiver.PAYLOAD = b'This is a payload'
    task = ArchiverResponse(results={'path': '/tmp/123'})
    payload = await simple_archiver.get(task)
    self.assertEqual('/tmp/123', payload.results.payload_meta.extra_data['path'])
    self.assertEqual(payload.content, simple_archiver.PAYLOAD)
def archive(self, payload: Payload, request_meta: RequestMeta) -> ArchiverResponse:
    """
    Archive payload to S3
    """
    if self.use_sha:
        filename = hashlib.sha1(payload.content).hexdigest()
        filename = f'{"/".join(list(filename[:5]))}/{filename}'
    else:
        filename = payload.payload_id
    self._upload(payload.content, filename, self.archive_bucket)
    return ArchiverResponse({'bucket': self.archive_bucket, 'path': filename})
def archive(
    self, payload: Payload, request_meta: RequestMeta
) -> Optional[ArchiverResponse]:
    """
    Archive payload to a Kafka topic
    """
    self._connect()
    msg = {
        '_is_payload': True,
        '_content': payload.content,
        '_request_meta': request_meta,
    }
    self.producer.send(self.topic, helpers.dumps(msg).encode())
    self.producer.flush()
    return ArchiverResponse()
def archive(self, payload: Payload, request_meta: RequestMeta) -> ArchiverResponse:
    """
    Archive a payload to MongoDB
    """
    self._connect_gridfs()
    sha1 = helpers.get_sha1(payload.content)
    meta = payload.payload_meta.extra_data
    meta['_id'] = sha1
    try:
        with self.gridfs_db.new_file(**meta) as fp:
            fp.write(payload.content)
    except (DuplicateKeyError, FileExists):
        pass
    return ArchiverResponse(meta)
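# A hedged retrieval sketch for the GridFS layout above: because `_id` is the
# SHA-1 of the content, the hash alone addresses a file. The connection string
# and database name are assumptions.
import hashlib

import gridfs
from pymongo import MongoClient

db = MongoClient('mongodb://localhost:27017')['stoq']
fs = gridfs.GridFS(db)
sha1 = hashlib.sha1(b'example content').hexdigest()
if fs.exists(sha1):
    content = fs.get(sha1).read()  # original payload bytes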
def archive(
    self, payload: Payload, request_meta: RequestMeta
) -> Optional[ArchiverResponse]:
    """
    Archive Payload object to Kafka queue
    """
    self._connect()
    msg = {
        '_is_payload': True,
        '_content': b64encode(payload.content),
        '_payload_meta': payload.payload_meta.extra_data,
        '_request_meta': request_meta,
    }
    self.producer.send(self.topic, helpers.dumps(msg).encode())
    self.producer.flush()
    return ArchiverResponse()
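# A hedged consumer-side sketch for the message shape produced above, using
# kafka-python. The topic and bootstrap server are placeholders; because the
# producer serialized with helpers.dumps() (JSON), plain json.loads suffices.
import json
from base64 import b64decode

from kafka import KafkaConsumer

consumer = KafkaConsumer('stoq-topic', bootstrap_servers='localhost:9092')
for record in consumer:
    msg = json.loads(record.value)
    if msg.get('_is_payload'):
        content = b64decode(msg['_content'])  # original payload bytes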
def archive(self, payload: Payload, request_meta: RequestMeta) -> ArchiverResponse:
    """
    Archive payload to GCS
    """
    if self.use_sha:
        filename = hashlib.sha1(payload.content).hexdigest()
        filename = f'{"/".join(list(filename[:5]))}/{filename}'
    elif self.use_datetime:
        datetime_path = datetime.now().strftime('%Y/%m/%d')
        filename = f'{datetime_path}/{payload.payload_id}'
    else:
        filename = payload.payload_id
    self._upload(payload.content, filename, self.archive_bucket)
    return ArchiverResponse(
        {
            'bucketId': self.archive_bucket,
            'objectId': filename,
            'projectId': self.project_id,
        }
    )
def run(
    self,
    request_meta: Optional[RequestMeta] = None,
    add_start_dispatch: Optional[List[str]] = None,
    add_start_deep_dispatch: Optional[List[str]] = None,
) -> None:
    """
    Run stoQ using a provider plugin to scan multiple files until exhaustion

    :param request_meta: Metadata pertaining to the originating request
    :param add_start_dispatch: Force first round of scanning to use specified plugins
    :param add_start_deep_dispatch: Force second round of scanning to use specified plugins
    """
    # Don't initialize any (provider) plugins here! They should be
    # initialized on stoq start-up or via load_plugin()
    if not self._loaded_provider_plugins:
        raise StoqException('No activated provider plugins')
    payload_queue: queue.Queue = queue.Queue(self.max_queue)
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # Start the ingest operations and mark each future with its plugin name
        future_to_name = {
            executor.submit(plugin.ingest, payload_queue): name
            for name, plugin in self._loaded_provider_plugins.items()
        }
        while len(future_to_name) > 0 or payload_queue.qsize() > 0:
            try:
                # Using get_nowait results in high CPU churn
                task = payload_queue.get(timeout=0.1)
                # Determine whether the provider has returned a `Payload`, or a task.
                # If it is a task, load the defined archiver plugin to load the
                # `Payload`, otherwise, simply continue on with the scanning.
                if isinstance(task, Payload):
                    self.scan_payload(
                        task,
                        request_meta=request_meta,
                        add_start_dispatch=add_start_dispatch,
                        add_start_deep_dispatch=add_start_deep_dispatch,
                    )
                else:
                    for source_archiver, task_meta in task.items():
                        try:
                            ar = ArchiverResponse(task_meta)
                            payload = self._loaded_source_archiver_plugins[
                                source_archiver
                            ].get(ar)
                            if payload:
                                self.scan_payload(
                                    payload,
                                    request_meta=request_meta,
                                    add_start_dispatch=add_start_dispatch,
                                    add_start_deep_dispatch=add_start_deep_dispatch,
                                )
                        except Exception as e:
                            self.log.warning(
                                f'"{task_meta}" failed with archiver "{source_archiver}": {str(e)}'
                            )
            except queue.Empty:
                pass
            for future in [fut for fut in future_to_name if fut.done()]:
                try:
                    future.result()
                    self.log.info(
                        f'Provider plugin {future_to_name[future]} successfully completed'
                    )
                    del future_to_name[future]
                except Exception as e:
                    msg = f'provider:{future_to_name[future]} failed'
                    self.log.exception(msg)
                    raise StoqException(msg) from e
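# A minimal usage sketch for run() above. The provider plugin name ('dirmon')
# and the always_dispatch worker ('hash') are assumptions, not part of the
# source; any installed provider and worker plugins would do.
from stoq import Stoq

s = Stoq(providers=['dirmon'], always_dispatch=['hash'])
s.run()  # blocks until every provider's ingest() is exhausted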
def test_archiverresponse_to_str(self):
    response = ArchiverResponse()
    response_str = str(response)
    response_dict = json.loads(response_str)
    self.assertIsInstance(response_str, str)
    self.assertIsInstance(response_dict, dict)