Exemple #1
0
def test_output_format():
    """Check that ``output_format`` is validated by ``validate_manager_params``.

    An arbitrary string must be rejected with ``ConfigError``, while
    ``"s3"`` must pass validation without raising.
    """
    params = ManagerParams()

    # Rejected: a string that is not one of the recognised formats.
    params.output_format = "not None and not int"
    with pytest.raises(ConfigError):
        validate_manager_params(params)

    # Accepted: validation is expected to return without raising.
    params.output_format = "s3"
    validate_manager_params(params)
Exemple #2
0
def test_log_file_extension():
    """Check that invalid ``log_file`` values are rejected.

    Both a file name with an unsupported extension and a non-string value
    (an empty list) must make ``validate_manager_params`` raise
    ``ConfigError``.
    """
    params = ManagerParams()

    # Each value is applied to the same params object, in order.
    for rejected_value in ("something.unsupported", []):
        params.log_file = rejected_value
        with pytest.raises(ConfigError):
            validate_manager_params(params)
Exemple #3
0
def test_failure_limit():
    """Check that ``failure_limit`` only accepts ``None`` or an ``int``.

    A string must be rejected with ``ConfigError``; ``None`` and ``2`` must
    both pass validation without raising.
    """
    params = ManagerParams()

    # Rejected: neither None nor an int.
    params.failure_limit = "not None and not int"
    with pytest.raises(ConfigError):
        validate_manager_params(params)

    # Accepted: first None, then an int.
    for accepted_value in (None, 2):
        params.failure_limit = accepted_value
        validate_manager_params(params)
Exemple #4
0
def test_database_file_extension():
    """Check that a ``database_name`` with an unsupported extension is rejected.

    ``validate_manager_params`` must raise ``ConfigError`` for such a name.
    """
    params = ManagerParams()
    params.database_name = "something.unsupported"

    with pytest.raises(ConfigError):
        validate_manager_params(params)
    def __init__(
        self,
        manager_params_temp: ManagerParams,
        browser_params_temp: List[BrowserParams],
        logger_kwargs: Optional[Dict[Any, Any]] = None,
    ) -> None:
        """Initialize the TaskManager with browser and manager config params

        The constructor validates the supplied configuration, converts it to
        the internal parameter classes, resolves the output paths, and then
        starts (in this order) the logging server, the data aggregators, the
        browsers, the watchdog thread and the completion-handler thread.

        Parameters
        ----------
        manager_params_temp : ManagerParams
            TaskManager configuration parameters
        browser_params_temp : list of BrowserParams
            Browser configuration parameters. It is a list which
            includes individual configurations for each browser.
        logger_kwargs : dict, optional
            Keyword arguments to pass to MPLogger on initialization.
            ``None`` (the default) means no extra keyword arguments.

        Notes
        -----
        Configuration problems are reported by the ``validate_*`` helpers
        called first; presumably they raise ``ConfigError`` -- confirm
        against their definitions.
        """
        # A fresh dict per call: a ``{}`` default would be a single object
        # shared by every instance created without explicit logger kwargs.
        if logger_kwargs is None:
            logger_kwargs = {}

        # Validate before doing any work that has side effects.
        validate_manager_params(manager_params_temp)
        for bp in browser_params_temp:
            validate_browser_params(bp)
        validate_crawl_configs(manager_params_temp, browser_params_temp)

        # Convert the user-facing params into their internal counterparts.
        manager_params = ManagerParamsInternal(**manager_params_temp.to_dict())
        browser_params = [
            BrowserParamsInternal(**bp.to_dict()) for bp in browser_params_temp
        ]

        # Expand a leading "~" in the configured directories. Note that this
        # does not turn a relative path into an absolute one.
        if manager_params.data_directory:
            manager_params.data_directory = os.path.expanduser(
                manager_params.data_directory
            )
        if manager_params.log_directory:
            manager_params.log_directory = os.path.expanduser(
                manager_params.log_directory
            )

        # Anchor the database, log file and dump folders in their directories.
        manager_params.database_name = os.path.join(
            manager_params.data_directory, manager_params.database_name
        )
        manager_params.log_file = os.path.join(
            manager_params.log_directory, manager_params.log_file
        )
        manager_params.screenshot_path = os.path.join(
            manager_params.data_directory, "screenshots"
        )
        manager_params.source_dump_path = os.path.join(
            manager_params.data_directory, "sources"
        )
        self.manager_params = manager_params
        self.browser_params = browser_params
        self._logger_kwargs = logger_kwargs

        # Create data directories if they do not exist. ``exist_ok=True``
        # avoids the race between a separate existence check and the creation.
        for data_dir in (
            manager_params.screenshot_path,
            manager_params.source_dump_path,
        ):
            os.makedirs(data_dir, exist_ok=True)

        # Number of browsers as configured in the manager params
        self.num_browsers = manager_params.num_browsers

        # Parse and flesh out js_instrument_settings
        for a_browsers_params in self.browser_params:
            a_browsers_params.js_instrument_settings = (
                clean_js_instrumentation_settings(
                    a_browsers_params.js_instrument_settings
                )
            )

        # Flow control
        self.closing = False
        self.failure_status: Optional[Dict[str, Any]] = None
        self.threadlock = threading.Lock()
        self.failurecount = 0

        # Only ``None`` means "not configured"; an explicit 0 is honoured
        # rather than being silently replaced by the computed default.
        if manager_params.failure_limit is not None:
            self.failure_limit = manager_params.failure_limit
        else:
            self.failure_limit = self.num_browsers * 2 + 10

        # Start logging server thread
        self.logging_server = MPLogger(
            self.manager_params.log_file, self.manager_params, **self._logger_kwargs
        )
        self.manager_params.logger_address = self.logging_server.logger_address
        self.logger = logging.getLogger("openwpm")

        # Initialize the data aggregators
        self._launch_aggregators()

        # Sets up the BrowserManager(s) + associated queues
        self.browsers = self._initialize_browsers(browser_params)
        self._launch_browsers()

        # Start the manager watchdog as a daemon thread
        watchdog = threading.Thread(
            target=self._manager_watchdog,
            args=(),
            name="OpenWPM-watchdog",
            daemon=True,
        )
        watchdog.start()

        # Save crawl config information to database
        openwpm_v, browser_v = get_version()
        self.data_aggregator.save_configuration(openwpm_v, browser_v)
        self.logger.info(
            get_configuration_string(
                self.manager_params, browser_params, (openwpm_v, browser_v)
            )
        )

        # This must exist before the completion handler thread starts.
        # NOTE(review): presumably `_mark_command_sequences_complete` reads
        # it -- confirm against that method.
        self.unsaved_command_sequences: Dict[int, CommandSequence] = {}
        self.callback_thread = threading.Thread(
            target=self._mark_command_sequences_complete,
            args=(),
            name="OpenWPM-completion_handler",
        )
        self.callback_thread.start()