Example #1
    def _get_deep_dispatches(
        self,
        payload: Payload,
        add_deep_dispatches: List[str],
        request_meta: RequestMeta,
    ) -> Tuple[Set[str], DefaultDict[str, List[str]]]:

        errors: DefaultDict[str, List[str]] = defaultdict(list)
        deep_dispatches = set(add_deep_dispatches)

        for (
                deep_dispatcher_name,
                deep_dispatcher,
        ) in self._loaded_deep_dispatcher_plugins.items():
            try:
                deep_dispatcher_result = deep_dispatcher.get_deep_dispatches(
                    payload, request_meta)
                deep_dispatches.update(deep_dispatcher_result.plugin_names)
                if deep_dispatcher_result.meta is not None:
                    payload.deep_dispatch_meta[
                        deep_dispatcher_name] = deep_dispatcher_result.meta
            except Exception as e:
                msg = 'deep dispatcher:failed to deep dispatch'
                self.log.exception(msg)
                errors[deep_dispatcher_name].append(
                    helpers.format_exc(e, msg=msg))

        return (deep_dispatches, errors)
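For reference, a minimal sketch of the deep dispatcher interface this loop relies on; only get_deep_dispatches(), .plugin_names, and .meta are implied by the caller above, while the class names and the ZIP-magic routing rule are hypothetical.

    from typing import Dict, List, Optional


    class FakeDeepDispatcherResponse:
        def __init__(self, plugin_names: List[str], meta: Optional[Dict] = None) -> None:
            self.plugin_names = plugin_names  # workers to run on the next dispatch pass
            self.meta = meta  # stored on payload.deep_dispatch_meta by the caller


    class FakeDeepDispatcher:
        def get_deep_dispatches(self, payload, request_meta) -> FakeDeepDispatcherResponse:
            # Hypothetical routing rule: send ZIP-looking payloads to a 'decompress' worker
            if payload.content.startswith(b'PK'):
                return FakeDeepDispatcherResponse(['decompress'], meta={'reason': 'zip magic'})
            return FakeDeepDispatcherResponse([])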
Example #2
    async def scan(
        self,
        content: bytes,
        payload_meta: Optional[PayloadMeta] = None,
        request_meta: Optional[RequestMeta] = None,
        add_start_dispatch: Optional[List[str]] = None,
        ratelimit: Optional[str] = None,
    ) -> StoqResponse:
        """

        Wrapper for `scan_request` that creates a `Payload` object from bytes

        :param content: Raw bytes to be scanned
        :param payload_meta: Metadata pertaining to originating source
        :param request_meta: Metadata pertaining to the originating request
        :param add_start_dispatch: Force first round of scanning to use specified plugins
        :param ratelimit: Rate limit calls to scan

        :return: Complete scan results
        :rtype: StoqResponse

        """
        self.log.debug(
            f'Content received ({len(content)} bytes): '
            f'PayloadMeta: {helpers.dumps(payload_meta, indent=0)}, '
            f'RequestMeta: {helpers.dumps(request_meta, indent=0)}'
        )
        payload_meta = payload_meta or PayloadMeta()
        payload = Payload(content, payload_meta)
        request_meta = request_meta or RequestMeta()
        request = Request(payloads=[payload], request_meta=request_meta)
        return await self.scan_request(request, add_start_dispatch)
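A hedged usage sketch of the wrapper above; the import path, the default Stoq() construction, and the 'yara' plugin name are assumptions for illustration, not claims about a particular deployment.

    import asyncio

    from stoq import Stoq, PayloadMeta  # assumed import path


    async def main() -> None:
        s = Stoq()  # assumes the desired plugins are discoverable with default settings
        response = await s.scan(
            b'example content',
            payload_meta=PayloadMeta(extra_data={'filename': 'example.txt'}),
            add_start_dispatch=['yara'],  # placeholder plugin name
        )
        print(response)

    asyncio.run(main())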
Example #3
    def scan(
        self,
        content: bytes,
        payload_meta: Optional[PayloadMeta] = None,
        request_meta: Optional[RequestMeta] = None,
        add_start_dispatch: Optional[List[str]] = None,
        add_start_deep_dispatch: Optional[List[str]] = None,
        ratelimit: Optional[str] = None,
    ) -> StoqResponse:
        """

        Wrapper for `scan_payload` that creates a `Payload` object from bytes

        :param content: Raw bytes to be scanned
        :param payload_meta: Metadata pertaining to originating source
        :param request_meta: Metadata pertaining to the originating request
        :param add_start_dispatch: Force first round of scanning to use specified plugins
        :param add_start_deep_dispatch: Force second round of scanning to use specified plugins
        :param ratelimit: Rate limit calls to scan

        :return: Complete scan results
        :rtype: StoqResponse

        """
        payload_meta = PayloadMeta() if payload_meta is None else payload_meta
        payload = Payload(content, payload_meta)
        return self.scan_payload(payload, request_meta, add_start_dispatch,
                                 add_start_deep_dispatch)
Example #4
    async def ingest(self, queue: Queue) -> None:
        consumer = AIOKafkaConsumer(
            self.topic,
            group_id=self.group,
            auto_offset_reset='earliest',
            bootstrap_servers=self.servers,
            heartbeat_interval_ms=self.heartbeat_interval_ms,
            session_timeout_ms=self.session_timeout_ms,
            loop=get_event_loop(),
        )
        await consumer.start()
        self.log.info(f'Monitoring {self.topic} topic for messages...')

        async for message in consumer:
            msg = json.loads(message.value)
            if msg.get('_is_payload'):
                # This message is a payload that was placed on the queue
                # from the kafka-queue archiver plugin
                extra_data = msg['_payload_meta']
                extra_data['request_meta'] = msg['_request_meta']
                meta = PayloadMeta(extra_data=extra_data)
                payload = Payload(content=b64decode(msg['_content']), payload_meta=meta)
                await queue.put(payload)
            else:
                await queue.put(msg)
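For context, a sketch of a producer that would satisfy the _is_payload branch above; the field names mirror what the consumer reads, while the topic name and broker address are placeholders.

    import json
    from base64 import b64encode

    from kafka import KafkaProducer

    producer = KafkaProducer(bootstrap_servers=['localhost:9092'])  # placeholder broker
    message = {
        '_is_payload': True,
        '_content': b64encode(b'example payload').decode(),  # consumer b64decodes this
        '_payload_meta': {'filename': 'example.txt'},
        '_request_meta': {'source': 'example'},
    }
    producer.send('stoq', json.dumps(message).encode())  # placeholder topic name
    producer.flush()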
Example #5
    def get(self, task: ArchiverResponse) -> Optional[Payload]:
        """
        Retrieve archived payload from MongoDB

        """
        self._connect_gridfs()
        result = self.gridfs_db.get(task.results['_id'])
        if result:
            payload = result.read()
            return Payload(payload, PayloadMeta(extra_data=task.results))
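A hedged sketch of the storing side this retrieval pairs with; the archive() signature and the use of GridFS put() here are assumptions, and only the '_id' key is implied by get() above.

    def archive(self, payload: Payload, request_meta: RequestMeta) -> ArchiverResponse:
        # Hypothetical counterpart: store the raw bytes in GridFS and return the
        # generated _id, which get() above later passes back to gridfs_db.get()
        self._connect_gridfs()
        object_id = self.gridfs_db.put(payload.content)
        return ArchiverResponse({'_id': object_id})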
Example #6
    async def _apply_worker(
        self, payload: Payload, plugin: WorkerPlugin, request: Request
    ) -> Tuple[Set[Tuple[Payload, str]], List[Payload]]:
        self.log.debug(
            f'Scanning Payload {payload.results.payload_id} with WorkerPlugin {plugin.plugin_name}'
        )
        try:
            worker_response: Optional[WorkerResponse] = await plugin.scan(
                payload, request
            )
        except Exception as e:
            worker_response = None
            msg = 'worker:failed to scan'
            self.log.exception(msg)
            request.errors.append(
                Error(
                    payload_id=payload.results.payload_id,
                    plugin_name=plugin.plugin_name,
                    error=helpers.format_exc(e, msg=msg),
                )
            )
        payload.results.plugins_run['workers'].append(plugin.plugin_name)

        if not worker_response:
            return set(), []

        if worker_response.results is not None:
            payload.results.workers[plugin.plugin_name] = worker_response.results
        request.errors.extend(worker_response.errors)

        additional_dispatches: Set[Tuple[Payload, str]] = {
            (payload, plugin_name) for plugin_name in worker_response.dispatch_to
        }

        extracted_payloads: List[Payload] = [
            Payload(
                content=extracted_payload.content,
                payload_meta=extracted_payload.payload_meta,
                extracted_by=plugin.plugin_name,
                extracted_from=payload.results.payload_id,
            )
            for extracted_payload in worker_response.extracted
        ]

        self.log.debug(
            f'Completed scan of {payload.results.payload_id} with '
            f'{len(worker_response.results) if worker_response.results else 0} result keys, '  # type: ignore
            f'{len(additional_dispatches)} additional dispatches, and '
            f'{len(extracted_payloads)} extracted payloads'
        )
        return additional_dispatches, extracted_payloads
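A minimal sketch of a worker whose return value exercises the fields _apply_worker reads (results, dispatch_to, extracted); the class, the size heuristic, the 'decompress' plugin name, and the WorkerResponse/ExtractedPayload keyword arguments shown are assumptions drawn from the attribute access above.

    class ExampleSizeWorker(WorkerPlugin):
        async def scan(self, payload: Payload, request: Request) -> WorkerResponse:
            results = {'size': len(payload.content)}
            if payload.content.startswith(b'PK'):
                # Hand the bytes back as an extracted payload and request a
                # follow-up dispatch to a hypothetical 'decompress' worker
                return WorkerResponse(
                    results,
                    dispatch_to=['decompress'],
                    extracted=[ExtractedPayload(payload.content)],
                )
            return WorkerResponse(results)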
Example #7
    async def get(self, task: ArchiverResponse) -> Payload:
        """
        Retrieve archived payload from Azure Blob Storage

        """
        blob_client: BlobClient = BlobClient.from_connection_string(
            conn_str=self.conn_str,
            container_name=task.results['container_name'],
            blob_name=task.results['blob_name'],
        )
        # Read the blob before closing the client; readall() is a coroutine on
        # the async BlobClient's downloader
        content = await blob_client.download_blob()
        data = await content.readall()
        await blob_client.close()
        meta = PayloadMeta(extra_data=task.results)
        return Payload(data, meta)
Example #8
    def get(self, task: ArchiverResponse) -> Payload:
        """
        Retrieve archived payload from S3

        """
        if not self.client:
            self._get_client()
        meta = PayloadMeta(
            extra_data={'bucket': task.results['bucket'], 'path': task.results['path']}
        )
        content = self.client.get_object(
            Bucket=task.results['bucket'], Key=task.results['path']
        )['Body']
        return Payload(content.read(), meta)
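A hedged sketch of driving this retrieval directly; s3_archiver stands for a loaded instance of the plugin, the bucket and key values are placeholders, and ArchiverResponse is assumed to accept the results dict whose 'bucket' and 'path' keys get() reads.

    task = ArchiverResponse({'bucket': 'stoq-archive', 'path': 'ab/cd/abcd1234'})  # placeholder values
    payload = s3_archiver.get(task)
    print(len(payload.content))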
Example #9
    def ingest(self, queue: Queue) -> None:
        consumer = KafkaConsumer(
            self.topic,
            group_id=self.group,
            auto_offset_reset='earliest',
            bootstrap_servers=self.servers,
        )
        print(f'Monitoring {self.topic} topic for messages...')
        for message in consumer:
            msg = json.loads(message.value)
            if msg.get('_is_payload'):
                meta = PayloadMeta(extra_data=msg['_request_meta'])
                payload = Payload(content=msg['_content'], payload_meta=meta)
                queue.put(payload)
            else:
                queue.put(msg)
Example #10
    def get(self, task: ArchiverResponse) -> Payload:
        """
        Retrieve archived payload from gcs

        """
        meta = PayloadMeta(
            extra_data={
                'bucket': task.results['archive_bucket'],
                'path': task.results['path'],
                'project_id': task.results['project_id'],
            })
        client = Client(project=task.results['project_id'])
        bucket = client.get_bucket(task.results['archive_bucket'])
        blob = Blob(task.results['path'], bucket)
        content = BytesIO()
        blob.download_to_file(content)
        content.seek(0)
        return Payload(content.read(), meta)
Example #11
    async def ingest(self, queue: Queue) -> None:
        self.log.info(f'Monitoring redis queue {self.redis_queue}')
        while True:
            msg = self.conn.blpop(self.redis_queue, timeout=0)
            if not msg:
                # Avoid blocking the event loop between polls (assumes asyncio is imported)
                await asyncio.sleep(0.1)
                continue
            data = msg[1].decode()
            payload = self.conn.get(f'{data}_buf')
            meta = self.conn.get(f'{data}_meta')
            if meta and payload:
                meta = json.loads(meta.decode())
                await queue.put(
                    Payload(payload, payload_meta=PayloadMeta(extra_data=meta))
                )
                # Clean up using the original queue key, not the parsed metadata dict
                self.conn.delete(f'{data}_buf')
                self.conn.delete(f'{data}_meta')
            else:
                await queue.put(json.loads(data))
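A sketch of the producer side implied by the keys this consumer reads (the queue name plus <key>_buf and <key>_meta); the connection details, key, and queue name are illustrative.

    import json

    import redis

    conn = redis.Redis(host='localhost', port=6379)  # placeholder connection
    key = 'abcd1234'
    conn.set(f'{key}_buf', b'example payload')
    conn.set(f'{key}_meta', json.dumps({'filename': 'example.txt'}))
    conn.rpush('stoq', key)  # the consumer BLPOPs this queue name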
Example #12
    def get(self, task: ArchiverResponse) -> Payload:
        """
        Retrieve archived payload from gcs

        """
        meta = PayloadMeta(
            extra_data={
                'bucketId': task.results['bucketId'],
                'objectId': task.results['objectId'],
                'projectId': task.results['projectId'],
            }
        )
        count = 0
        client = Client(project=task.results['projectId'])
        while count < self.max_retries:
            try:
                bucket = client.get_bucket(task.results['bucketId'])
                blob = Blob(task.results['objectId'], bucket)
                content = BytesIO()
                blob.download_to_file(content)
                break
            except (
                InvalidResponse,
                GoogleAPICallError,
                InternalServerError,
                SSLError,
            ) as e:
                count += 1
                # Raise once the final retry has failed; otherwise the loop can
                # exit with `content` never having been assigned
                if count >= self.max_retries:
                    raise StoqPluginException(
                        f'Failed to download {task.results["bucketId"]}/{task.results["objectId"]} from GCS: {str(e)}'
                    )
                sleep(randrange(0, 4))
        content.seek(0)
        data = content.read()
        if self.use_encryption:
            data = self._decrypt(data)
        return Payload(data, meta)
Example #13
    async def reconstruct_all_subresponses(
            self,
            stoq_response: StoqResponse) -> AsyncGenerator[StoqResponse, None]:
        """

        Generate a new `StoqResponse` object for each `Payload` within
        the `Request`

        """

        for i, new_root_payload_result in enumerate(stoq_response.results):
            parent_payload_ids = {stoq_response.results[i].payload_id}
            # Construct a new root Payload object since StoqResponse only has
            # the PayloadResults object
            new_root_payload = Payload(b'')
            new_root_payload.results = new_root_payload_result
            relevant_payloads: List[Payload] = [new_root_payload]

            for payload_result in stoq_response.results[i:]:
                for extracted_from in payload_result.extracted_from:
                    if extracted_from in parent_payload_ids:
                        parent_payload_ids.add(payload_result.payload_id)
                        new_payload = Payload(b'')
                        new_payload.results = payload_result
                        relevant_payloads.append(new_payload)

            new_request = Request(payloads=relevant_payloads,
                                  request_meta=stoq_response.request_meta)
            new_response = StoqResponse(
                request=new_request,
                time=stoq_response.time,
                scan_id=stoq_response.scan_id,
            )
            decorator_tasks = []
            for plugin_name, decorator in self._loaded_decorator_plugins.items():
                decorator_tasks.append(
                    self._apply_decorator(decorator, new_response))
            await asyncio.gather(*decorator_tasks)
            yield new_response
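Consumption is via async for, as the test in the next example also shows; this sketch assumes a configured Stoq instance and an existing StoqResponse.

    async def split_responses(s: Stoq, stoq_response: StoqResponse) -> None:
        # One reconstructed response per payload acting as the root
        async for sub_response in s.reconstruct_all_subresponses(stoq_response):
            print(sub_response.scan_id, len(sub_response.results))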
Example #14
    async def test_reconstruct_all_subresponses(self):
        # Construct a fake stoq_response as if it were generated from a file
        # A.zip that contains two files, B.txt and C.zip, where C.zip contains D.txt
        results = [
            Payload(content=b'', payload_id='A.zip', payload_meta=PayloadMeta()),
            Payload(
                content=b'',
                payload_id='B.txt',
                payload_meta=PayloadMeta(),
                extracted_from='A.zip',
                extracted_by='fake',
            ),
            Payload(
                content=b'',
                payload_id='C.zip',
                payload_meta=PayloadMeta(),
                extracted_from='A.zip',
                extracted_by='fake',
            ),
            Payload(
                content=b'',
                payload_id='D.txt',
                payload_meta=PayloadMeta(),
                extracted_from='C.zip',
                extracted_by='fake',
            ),
        ]
        request = Request(request_meta=RequestMeta(extra_data={'check': 'me'}))
        payload_count = 1
        for result in results:
            result.results.workers['fake'] = f'result-{payload_count}'
            result.results.plugins_run['workers'].append('fake')
            request.payloads.append(result)
            payload_count += 1

        initial_response = StoqResponse(request)
        s = Stoq(base_dir=utils.get_data_dir(), decorators=['simple_decorator'])
        all_subresponses = [
            r async for r in s.reconstruct_all_subresponses(initial_response)
        ]
        # We expect there to be four "artificial" responses generated, one for
        # each payload as the root.
        self.assertEqual(len(all_subresponses), 4)
        # We expect the first response to have all 4 payloads, the second response
        # to have just the second payload, the third response to have the third
        # and fourth payload, and the fourth response to have just the fourth payload
        self.assertEqual(
            [len(stoq_response.results) for stoq_response in all_subresponses],
            [4, 1, 2, 1],
        )
        self.assertEqual(
            [
                stoq_response.results[0].workers['fake']
                for stoq_response in all_subresponses
            ],
            ['result-1', 'result-2', 'result-3', 'result-4'],
        )
        self.assertTrue(
            all(
                'simple_decorator' in stoq_response.decorators
                for stoq_response in all_subresponses
            )
        )
        # Assert that they all have the same scan ID
        self.assertEqual(
            len({stoq_response.scan_id for stoq_response in all_subresponses}), 1
        )
Example #15
    def _single_scan(
        self,
        payload: Payload,
        add_dispatch: List[str],
        add_deep_dispatch: List[str],
        request_meta: RequestMeta,
    ) -> Tuple[PayloadResults, List[Payload], DefaultDict[str, List[str]]]:

        extracted = []
        errors: DefaultDict[str, List[str]] = defaultdict(list)
        dispatch_pass = 0

        dispatches, dispatch_errors = self._get_dispatches(
            payload, add_dispatch, request_meta)
        if dispatch_errors:
            errors = helpers.merge_dicts(errors, dispatch_errors)
        for plugin_name in dispatches:
            try:
                plugin = self.load_plugin(plugin_name)
            except Exception as e:
                msg = 'worker:failed to load'
                self.log.exception(msg)
                errors[plugin_name].append(helpers.format_exc(e, msg=msg))
                continue
            # Normal dispatches are the "1st round" of scanning
            payload.plugins_run['workers'][0].append(plugin_name)
            try:
                worker_response = plugin.scan(payload,
                                              request_meta)  # pyre-ignore[16]
            except Exception as e:
                msg = 'worker:failed to scan'
                self.log.exception(msg)
                errors[plugin_name].append(helpers.format_exc(e, msg=msg))
                continue
            if worker_response is None:
                continue
            if worker_response.results is not None:
                # Normal dispatches are the "1st round" of scanning
                payload.worker_results[0][
                    plugin_name] = worker_response.results
            extracted.extend([
                Payload(ex.content, ex.payload_meta, plugin_name,
                        payload.payload_id) for ex in worker_response.extracted
            ])
            if worker_response.errors:
                errors[plugin_name].extend(worker_response.errors)

        while dispatch_pass < self.max_dispatch_passes:
            dispatch_pass += 1
            deep_dispatches, deep_dispatch_errors = self._get_deep_dispatches(
                payload, add_deep_dispatch, request_meta)
            if deep_dispatch_errors:
                errors = helpers.merge_dicts(errors, deep_dispatch_errors)
            if deep_dispatches:
                # Add another entry for this round
                payload.plugins_run['workers'].append([])
                payload.worker_results.append({})
            else:
                break
            for plugin_name in deep_dispatches:
                try:
                    plugin = self.load_plugin(plugin_name)
                except Exception as e:
                    msg = f'deep dispatch:failed to load (pass {dispatch_pass}/{self.max_dispatch_passes})'
                    self.log.exception(msg)
                    errors[plugin_name].append(helpers.format_exc(e, msg=msg))
                    continue
                payload.plugins_run['workers'][dispatch_pass].append(
                    plugin_name)
                try:
                    worker_response = plugin.scan(  # pyre-ignore[16]
                        payload, request_meta)
                except Exception as e:
                    msg = f'deep dispatch:failed to scan (pass {dispatch_pass}/{self.max_dispatch_passes})'
                    self.log.exception(msg)
                    errors[plugin_name].append(helpers.format_exc(e, msg=msg))
                    continue
                if worker_response is None:
                    continue
                if worker_response.results is not None:
                    payload.worker_results[dispatch_pass][
                        plugin_name] = worker_response.results
                extracted.extend([
                    Payload(ex.content, ex.payload_meta, plugin_name,
                            payload.payload_id)
                    for ex in worker_response.extracted
                ])
                if worker_response.errors:
                    errors[plugin_name].extend(worker_response.errors)

        payload_results = PayloadResults.from_payload(payload)
        if request_meta.archive_payloads and payload.payload_meta.should_archive:
            for plugin_name, archiver in self._loaded_dest_archiver_plugins.items():
                payload.plugins_run['archivers'].append(plugin_name)
                try:
                    archiver_response = archiver.archive(payload, request_meta)
                except Exception as e:
                    msg = 'archiver:failed to archive'
                    self.log.exception(msg)
                    errors[plugin_name].append(helpers.format_exc(e, msg=msg))
                    continue
                if archiver_response is None:
                    continue
                if archiver_response.results is not None:
                    payload_results.archivers[
                        plugin_name] = archiver_response.results
                if archiver_response.errors:
                    errors[plugin_name].extend(archiver_response.errors)

        return (payload_results, extracted, errors)
Example #16
    def get(self, task: ArchiverResponse) -> Optional[Payload]:
        if self.RAISE_EXCEPTION:
            raise Exception('Test exception please ignore')
        return Payload(self.PAYLOAD, PayloadMeta(extra_data=task.results))
Example #17
    def test_payloadresults_to_str(self):
        payload = Payload(self.generic_content)
        response_str = str(payload.results)
        response_dict = json.loads(response_str)
        self.assertIsInstance(response_str, str)
        self.assertIsInstance(response_dict, dict)