Example #1
0
    def launch(self) -> None:
        """Start the storage controller in a separate daemon process.

        The object currently held in ``self.storage_controller`` is handed to
        ``StorageController.run`` as the process argument, and the attribute
        is then rebound to the ``Process`` that runs it. Afterwards the first
        item taken from the status queue is stored as the listener address.
        """
        controller_process = Process(
            target=StorageController.run,
            args=(self.storage_controller, ),
            name="StorageController",
        )
        controller_process.daemon = True
        # From here on the attribute refers to the process, not the controller.
        self.storage_controller = controller_process
        controller_process.start()

        # Plain get(): waits for the first item put on the status queue.
        first_status_item = self.status_queue.get()
        self.listener_address = first_status_item
Example #2
0
 def test_child_process_logging(self, tmpdir):
     """Check that an exception logged from a child process reaches the log file."""
     log_file = self.get_logfile_path(str(tmpdir))
     openwpm_logger = mp_logger.MPLogger(log_file)
     # Pass the function itself as the target. Calling it here would run it
     # in the parent process and hand its return value to Process instead.
     child_process = Process(target=child_proc_logging_exception)
     child_process.daemon = True
     child_process.start()
     # Let the child finish logging before the logger is closed, otherwise
     # its message may never be written to the file.
     child_process.join()
     openwpm_logger.close()
     log_content = self.get_logfile_contents(log_file)
     assert "I'm logging an exception" in log_content
Example #3
0
    def __init__(
        self,
        structured_storage: StructuredStorageProvider,
        unstructured_storage: Optional[UnstructuredStorageProvider],
    ) -> None:
        """Create the queues and the StorageController that is wired to them.

        Args:
            structured_storage: Passed through to the StorageController.
            unstructured_storage: Passed through to the StorageController;
                may be None.
        """
        self.logger = logging.getLogger("openwpm")

        # Neither of these is known at construction time.
        self.listener_address: Optional[Tuple[str, int]] = None
        self.listener_process: Optional[Process] = None

        # No status update has been received yet.
        self._last_status = None
        self._last_status_received: Optional[float] = None

        # The same three queue objects are kept on the handle and handed to
        # the controller.
        status_queue = Queue()
        completion_queue = Queue()
        shutdown_queue = Queue()
        self.status_queue = status_queue
        self.completion_queue = completion_queue
        self.shutdown_queue = shutdown_queue

        self.storage_controller = StorageController(
            structured_storage,
            unstructured_storage,
            status_queue=status_queue,
            completion_queue=completion_queue,
            shutdown_queue=shutdown_queue,
        )
Example #4
0
    def test_multiprocess(self, tmpdir):
        """Check that parent and child log messages each appear once in the log file.

        Two daemon child processes run ``child_proc`` while the parent emits
        its own messages; afterwards every expected message must occur
        exactly once in the log file contents.
        """
        # Set up the logging server
        log_path = self.get_logfile_path(str(tmpdir))
        mp_log = mp_logger.MPLogger(log_path)

        # Start the children one after the other (index 0, then index 1)
        workers = []
        for index in range(2):
            worker = Process(target=child_proc, args=(index, ))
            worker.daemon = True
            worker.start()
            workers.append(worker)

        # Send some sample logs from the parent
        logger.info(PARENT_INFO_STR_1)
        logger.error(PARENT_ERROR_STR)
        logger.critical(PARENT_CRITICAL_STR)
        logger.debug(PARENT_DEBUG_STR)
        logger.warning(PARENT_WARNING_STR)

        # ...and from two named loggers
        named_logger_1 = logging.getLogger("test1")
        named_logger_2 = logging.getLogger("test2")
        named_logger_1.info(NAMED_LOGGER_INFO_1)
        named_logger_2.info(NAMED_LOGGER_INFO_2)

        # Close the logging server
        time.sleep(2)  # give some time for logs to be sent
        mp_log.close()
        for worker in workers:
            worker.join()
        print("Child processes joined...")

        contents = self.get_logfile_contents(log_path)

        # Templates that are formatted with the child index
        child_templates = (
            CHILD_INFO_STR_1,
            CHILD_INFO_STR_2,
            CHILD_ERROR_STR,
            CHILD_CRITICAL_STR,
            CHILD_DEBUG_STR,
            CHILD_WARNING_STR,
        )
        for child in range(2):
            for template in child_templates:
                assert contents.count(template % child) == 1

        parent_messages = (
            PARENT_INFO_STR_1,
            PARENT_ERROR_STR,
            PARENT_CRITICAL_STR,
            PARENT_DEBUG_STR,
            PARENT_WARNING_STR,
        )
        for message in parent_messages:
            assert contents.count(message) == 1
Example #5
0
    def test_child_process_with_exception(self, tmpdir):
        """Check that messages from children running ``child_proc_with_exception`` are logged once.

        Two daemon child processes are started and joined before the logger
        is closed; each child's info and exception messages must then occur
        exactly once in the log file contents.
        """
        log_path = self.get_logfile_path(str(tmpdir))
        mp_log = mp_logger.MPLogger(log_path)

        # Start the children one after the other (index 0, then index 1)
        workers = []
        for index in range(2):
            worker = Process(target=child_proc_with_exception, args=(index, ))
            worker.daemon = True
            worker.start()
            workers.append(worker)

        time.sleep(2)  # give some time for logs to be sent
        for worker in workers:
            worker.join()
        print("Child processes joined...")
        # Close the logging server
        mp_log.close()

        contents = self.get_logfile_contents(log_path)
        expected_templates = (
            CHILD_INFO_STR_1,
            CHILD_INFO_STR_2,
            CHILD_EXCEPTION_STR,
        )
        for child in range(2):
            for template in expected_templates:
                assert contents.count(template % child) == 1
Example #6
0
class StorageControllerHandle:
    """This class contains all methods relevant for the TaskManager
    to interact with the StorageController

    The handle keeps three queues (status, completion, shutdown) that are
    also passed to the StorageController when it is constructed.
    """

    def __init__(
        self,
        structured_storage: StructuredStorageProvider,
        unstructured_storage: Optional[UnstructuredStorageProvider],
    ) -> None:
        """Create the queues and the StorageController wired to them.

        Args:
            structured_storage: Passed through to the StorageController.
            unstructured_storage: Passed through to the StorageController;
                may be None.
        """
        # Set by launch() from the first item on the status queue
        self.listener_address: Optional[Tuple[str, int]] = None
        self.listener_process: Optional[Process] = None
        self.status_queue = Queue()
        self.completion_queue = Queue()
        self.shutdown_queue = Queue()
        # Last status value and the time.time() at which it was received
        self._last_status = None
        self._last_status_received: Optional[float] = None
        self.logger = logging.getLogger("openwpm")
        # NOTE: launch() rebinds this attribute to the Process that runs the
        # controller
        self.storage_controller = StorageController(
            structured_storage,
            unstructured_storage,
            status_queue=self.status_queue,
            completion_queue=self.completion_queue,
            shutdown_queue=self.shutdown_queue,
        )

    def get_next_visit_id(self) -> VisitId:
        """Generate visit id as randomly generated positive integer less than 2^53.

        Parquet can support integers up to 64 bits, but Javascript can only
        represent integers up to 53 bits:
        https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Number/MAX_SAFE_INTEGER
        Thus, we cap these values at 53 bits.
        """
        return VisitId(random.getrandbits(53))

    def get_next_browser_id(self) -> BrowserId:
        """Generate browser id as randomly generated positive 32bit integer

        Note: Parquet's partitioned dataset reader only supports integer
        partition columns up to 32 bits.
        """
        return BrowserId(random.getrandbits(32))

    def save_configuration(
        self,
        manager_params: ManagerParamsInternal,
        browser_params: List[BrowserParamsInternal],
        openwpm_version: str,
        browser_version: str,
    ) -> None:
        """Store the task and per-browser configuration records.

        One record is written to the "task" table and one record per entry
        of ``browser_params`` to the "crawl" table, all under
        INVALID_VISIT_ID and sharing a random 32-bit task id. The visit id is
        then finalized as successful.

        Requires ``listener_address`` to be set, which launch() does.
        """
        assert self.listener_address is not None
        # NOTE(review): the socket is not explicitly closed here - confirm
        # whether DataSocket requires that.
        sock = DataSocket(self.listener_address)
        task_id = random.getrandbits(32)
        sock.store_record(
            TableName("task"),
            INVALID_VISIT_ID,
            {
                "task_id": task_id,
                "manager_params": manager_params.to_json(),
                "openwpm_version": openwpm_version,
                "browser_version": browser_version,
            },
        )
        # Record browser details for each browser
        for browser_param in browser_params:
            sock.store_record(
                TableName("crawl"),
                INVALID_VISIT_ID,
                {
                    "browser_id": browser_param.browser_id,
                    "task_id": task_id,
                    "browser_params": browser_param.to_json(),
                },
            )
        sock.finalize_visit_id(INVALID_VISIT_ID, success=True)

    def launch(self) -> None:
        """Starts the storage controller

        The StorageController held in ``self.storage_controller`` is run via
        ``StorageController.run`` in a daemon process, and the attribute is
        rebound to that process. The first item taken from the status queue
        is stored as the listener address.
        """
        self.storage_controller = Process(
            name="StorageController",
            target=StorageController.run,
            args=(self.storage_controller, ),
        )
        self.storage_controller.daemon = True
        self.storage_controller.start()

        self.listener_address = self.status_queue.get()

    def get_new_completed_visits(self) -> List[Tuple[int, bool]]:
        """
        Returns a list of all visit ids that have been processed since
        the last time the method was called and whether or not they
        ran successfully.

        This method will return an empty list in case no visit ids have
        been processed since the last time this method was called
        """
        finished_visit_ids: List[Tuple[int, bool]] = []
        while not self.completion_queue.empty():
            finished_visit_ids.append(self.completion_queue.get())
        return finished_visit_ids

    def shutdown(self, relaxed: bool = True) -> None:
        """Ask the storage controller process to shut down and wait for it.

        Puts ``(SHUTDOWN_SIGNAL, relaxed)`` on the shutdown queue and then
        waits up to 300 seconds for the process to exit. Must only be called
        after launch().
        """
        assert isinstance(self.storage_controller, Process)
        self.logger.debug(
            "Sending the shutdown signal to the Storage Controller...")
        self.shutdown_queue.put((SHUTDOWN_SIGNAL, relaxed))
        start_time = time.time()
        self.storage_controller.join(300)
        elapsed = time.time() - start_time
        # join() returns silently on timeout, so check explicitly
        if self.storage_controller.is_alive():
            self.logger.warning(
                "The Storage Controller process is still running "
                "after waiting %s seconds for it to close.", elapsed)
        self.logger.debug("%s took %s seconds to close.",
                          type(self).__name__, str(elapsed))

    def get_most_recent_status(self) -> int:
        """Return the most recent queue size sent from the Storage Controller process

        Raises:
            RuntimeError: if the last status update is older than
                STATUS_TIMEOUT seconds.
        """

        # Block until we receive the first status update
        if self._last_status is None:
            return self.get_status()

        # Drain status queue until we receive most recent update
        while not self.status_queue.empty():
            self._last_status = self.status_queue.get()
            self._last_status_received = time.time()

        # Check last status signal
        if (time.time() - self._last_status_received) > STATUS_TIMEOUT:
            raise RuntimeError(
                "No status update from the storage controller process "
                "for %d seconds." % (time.time() - self._last_status_received))

        return self._last_status

    def get_status(self) -> int:
        """Get listener process status. If the status queue is empty, block.

        Waits up to STATUS_TIMEOUT seconds for a status update.

        Raises:
            RuntimeError: if no status update arrives within the timeout,
                including when no status has ever been received.
        """
        wait_started = time.time()
        try:
            self._last_status = self.status_queue.get(block=True,
                                                      timeout=STATUS_TIMEOUT)
            self._last_status_received = time.time()
        except queue.Empty:
            # If nothing was ever received, measure the silence from the
            # start of this wait instead of failing on a missing timestamp.
            silent_since = (wait_started
                            if self._last_status_received is None else
                            self._last_status_received)
            raise RuntimeError(
                "No status update from the storage controller process "
                "for %d seconds." % (time.time() - silent_since))
        assert isinstance(self._last_status, int)
        return self._last_status