Esempio n. 1
0
def test_output_format():
    """Only supported output formats pass manager-param validation."""
    params = ManagerParams()

    # An arbitrary unsupported value must be rejected.
    params.output_format = "not None and not int"
    with pytest.raises(ConfigError):
        validate_manager_params(params)

    # "s3" is a supported format and validates cleanly.
    params.output_format = "s3"
    validate_manager_params(params)
Esempio n. 2
0
def test_log_file_extension():
    """log_file values with a bad extension or a non-string type are rejected."""
    params = ManagerParams()

    # Both an unsupported extension and a non-string value must raise.
    for bad_value in ("something.unsupported", []):
        params.log_file = bad_value
        with pytest.raises(ConfigError):
            validate_manager_params(params)
Esempio n. 3
0
def test_failure_limit():
    """failure_limit accepts None or an int and rejects everything else."""
    params = ManagerParams()

    # A string is neither None nor int, so validation must fail.
    params.failure_limit = "not None and not int"
    with pytest.raises(ConfigError):
        validate_manager_params(params)

    # Both None and a plain int are accepted without raising.
    for ok_value in (None, 2):
        params.failure_limit = ok_value
        validate_manager_params(params)
Esempio n. 4
0
 def get_test_config(
     self,
     data_dir: Optional[Path] = None,
     num_browsers: int = NUM_BROWSERS,
     display_mode: str = "headless",
 ) -> Tuple[ManagerParams, List[BrowserParams]]:
     """Build and return the default test parameters."""
     # Fall back to the fixture's temporary directory when none is given.
     if not data_dir:
         data_dir = self.tmpdir
     assert data_dir is not None  # Mypy doesn't understand this without help
     manager_params = ManagerParams(num_browsers=num_browsers)
     manager_params.log_path = data_dir / "openwpm.log"
     manager_params.num_browsers = num_browsers
     # One BrowserParams per browser, all using the requested display mode.
     browser_params = []
     for _ in range(num_browsers):
         one_browser = BrowserParams()
         one_browser.display_mode = display_mode
         browser_params.append(one_browser)
     return manager_params, browser_params
Esempio n. 5
0
def default_params(
    tmp_path: Path, num_browsers: int = NUM_BROWSERS
) -> Tuple[ManagerParams, List[BrowserParams]]:
    """Just a simple wrapper around task_manager.load_default_params

    Parameters
    ----------
    tmp_path
        Directory used for crawl data and the openwpm.log file.
    num_browsers
        Number of BrowserParams to create (defaults to NUM_BROWSERS).
    """
    # Use the *argument*, not the NUM_BROWSERS constant, so callers that
    # override num_browsers actually get that many browsers spawned.
    manager_params = ManagerParams(num_browsers=num_browsers)

    # display_mode is set via the constructor, so no per-item fix-up loop
    # is needed afterwards.
    browser_params = [
        BrowserParams(display_mode="headless") for _ in range(num_browsers)
    ]
    manager_params.data_directory = tmp_path
    manager_params.log_path = tmp_path / "openwpm.log"
    return manager_params, browser_params
Esempio n. 6
0
def test_num_browser_crawl_config():
    """validate_crawl_configs requires one BrowserParams per browser."""
    manager_params = ManagerParams(num_browsers=2)

    # A single config for two browsers is a mismatch and must raise.
    browser_params = [BrowserParams()]
    with pytest.raises(ConfigError):
        validate_crawl_configs(manager_params, browser_params)

    # Adding a second config makes the counts match.
    browser_params.append(BrowserParams())
    validate_crawl_configs(manager_params, browser_params)
Esempio n. 7
0
 def get_test_config(
     self, data_dir="", num_browsers=NUM_BROWSERS, display_mode="headless"
 ):
     """Load and return the default test parameters."""
     # An empty/falsy data_dir means "use the fixture's tmpdir".
     if not data_dir:
         data_dir = self.tmpdir
     manager_params = ManagerParams(num_browsers=num_browsers)
     manager_params.data_directory = data_dir
     manager_params.log_directory = data_dir
     manager_params.num_browsers = num_browsers
     # One BrowserParams per browser, all in the requested display mode.
     browser_params = []
     for _ in range(num_browsers):
         one_browser = BrowserParams()
         one_browser.display_mode = display_mode
         browser_params.append(one_browser)
     # Place the database inside the data directory.
     manager_params.database_name = join(
         manager_params.data_directory, manager_params.database_name
     )
     return manager_params, browser_params
Esempio n. 8
0
    def __init__(
        self,
        manager_params_temp: ManagerParams,
        browser_params_temp: List[BrowserParams],
        structured_storage_provider: StructuredStorageProvider,
        unstructured_storage_provider: Optional[UnstructuredStorageProvider],
        logger_kwargs: Optional[Dict[Any, Any]] = None,
    ) -> None:
        """Initialize the TaskManager with browser and manager config params

        Parameters
        ----------
        manager_params_temp : ManagerParams
            TaskManager configuration parameters
        browser_params_temp : list of BrowserParams
            Browser configuration parameters. It is a list which
            includes individual configurations for each browser.
        structured_storage_provider : StructuredStorageProvider
            Backend for structured (tabular) crawl records.
        unstructured_storage_provider : UnstructuredStorageProvider, optional
            Backend for unstructured blobs; may be None.
        logger_kwargs : dict, optional
            Keyword arguments to pass to MPLogger on initialization.
            Defaults to an empty dict; a None sentinel is used instead of a
            mutable ``{}`` default so no dict is shared across instances.
        """
        # Replace the None sentinel with a fresh dict per call (avoids the
        # shared-mutable-default-argument pitfall).
        if logger_kwargs is None:
            logger_kwargs = {}

        validate_crawl_configs(manager_params_temp, browser_params_temp)
        # Convert the public config objects into their internal counterparts.
        manager_params = ManagerParamsInternal.from_dict(manager_params_temp.to_dict())
        browser_params = [
            BrowserParamsInternal.from_dict(bp.to_dict()) for bp in browser_params_temp
        ]

        # Derived output locations under the crawl's data directory.
        manager_params.screenshot_path = manager_params.data_directory / "screenshots"
        manager_params.source_dump_path = manager_params.data_directory / "sources"

        self.manager_params = manager_params
        self.browser_params = browser_params
        self._logger_kwargs = logger_kwargs

        # Create data directories if they do not exist (exist_ok avoids a
        # check-then-create race with other processes).
        os.makedirs(manager_params.screenshot_path, exist_ok=True)
        os.makedirs(manager_params.source_dump_path, exist_ok=True)

        # Check size of parameter dictionary
        self.num_browsers = manager_params.num_browsers

        # Parse and flesh out js_instrument_settings
        for a_browsers_params in self.browser_params:
            js_settings = a_browsers_params.js_instrument_settings
            cleaned_js_settings = clean_js_instrumentation_settings(js_settings)
            a_browsers_params.cleaned_js_instrument_settings = cleaned_js_settings

        # Flow control
        self.closing = False
        self.failure_status: Optional[Dict[str, Any]] = None
        self.threadlock = threading.Lock()
        self.failure_count = 0
        self.failure_limit = manager_params.failure_limit

        # Start logging server thread
        self.logging_server = MPLogger(
            self.manager_params.log_path,
            str(structured_storage_provider),
            **self._logger_kwargs
        )
        self.manager_params.logger_address = self.logging_server.logger_address
        self.logger = logging.getLogger("openwpm")

        # Initialize the storage controller
        self._launch_storage_controller(
            structured_storage_provider, unstructured_storage_provider
        )

        # Sets up the BrowserManager(s) + associated queues
        self.browsers = self._initialize_browsers(browser_params)
        self._launch_browsers()

        # Start the manager watchdog (daemon so it never blocks shutdown)
        thread = threading.Thread(target=self._manager_watchdog, args=())
        thread.daemon = True
        thread.name = "OpenWPM-watchdog"
        thread.start()

        # Save crawl config information to database
        openwpm_v, browser_v = get_version()
        self.storage_controller_handle.save_configuration(
            manager_params, browser_params, openwpm_v, browser_v
        )
        self.logger.info(
            get_configuration_string(
                self.manager_params, browser_params, (openwpm_v, browser_v)
            )
        )

        # Completion callbacks run on a dedicated (non-daemon) thread.
        self.unsaved_command_sequences: Dict[int, CommandSequence] = dict()
        self.callback_thread = threading.Thread(
            target=self._mark_command_sequences_complete, args=()
        )
        self.callback_thread.name = "OpenWPM-completion_handler"
        self.callback_thread.start()
Esempio n. 9
0
def test_database_file_extension():
    """A database_name with an unsupported extension must fail validation."""
    params = ManagerParams()
    params.database_name = "something.unsupported"
    with pytest.raises(ConfigError):
        validate_manager_params(params)
from openwpm.task_manager import TaskManager

# Code taken from the OpenWPM project with slight modifications.

# The list of sites that we wish to crawl
NUM_BROWSERS = 1
sites = []

# Read one crawl target per line from the input file.
# NOTE(review): `file` shadows the builtin and the handle is never closed;
# prefer `with open(...) as f:` — left as-is in this documentation pass.
file = open("SubPagesLess.txt", "r")
for page in file:
    # Drop the trailing newline so each entry is a clean URL.
    page = page.strip('\n')
    sites.append(page)
# Loads the default ManagerParams
# and NUM_BROWSERS copies of the default BrowserParams
manager_params = ManagerParams(
    num_browsers=NUM_BROWSERS
)  # num_browsers is necessary to let TaskManager know how many browsers to spawn

# "native" display mode opens a visible (non-headless) browser window.
browser_params = [
    BrowserParams(display_mode="native") for _ in range(NUM_BROWSERS)
]

# Update browser configuration (use this for per-browser settings)
for i in range(NUM_BROWSERS):
    # Record HTTP Requests and Responses
    browser_params[i].http_instrument = True
    # Record cookie changes
    browser_params[i].cookie_instrument = True
    # Record Navigations
    browser_params[i].navigation_instrument = True
    # Record JS Web API calls
Esempio n. 11
0
def processSite(site):
    """Crawl a single site with one headless browser.

    Enables the full set of OpenWPM instruments and records everything into
    ./datadir/crawl-data.sqlite.

    Parameters
    ----------
    site : str
        URL to visit.

    Returns
    -------
    None
    """
    # One browser is enough for a single-site crawl.
    NUM_BROWSERS = 1
    sites = [site]

    # Loads the default ManagerParams
    # and NUM_BROWSERS copies of the default BrowserParams
    manager_params = ManagerParams(num_browsers=NUM_BROWSERS)
    browser_params = [
        BrowserParams(display_mode="headless") for _ in range(NUM_BROWSERS)
    ]

    # Update browser configuration (use this for per-browser settings)
    for i in range(NUM_BROWSERS):
        browser_params[i].http_instrument = True        # HTTP requests/responses
        browser_params[i].cookie_instrument = True      # cookie changes
        browser_params[i].navigation_instrument = True  # navigations
        browser_params[i].js_instrument = True          # JS Web API calls
        browser_params[i].callstack_instrument = True   # WebRequest callstacks
        browser_params[i].dns_instrument = True         # DNS resolution

    # Update TaskManager configuration (use this for crawl-wide settings)
    manager_params.data_directory = Path("./datadir/")
    manager_params.log_directory = Path("./datadir/")

    # memory_watchdog and process_watchdog are useful for large scale cloud crawls.
    # Please refer to docs/Configuration.md#platform-configuration-options for more information
    # manager_params.memory_watchdog = True
    # manager_params.process_watchdog = True

    # Commands time out by default after 60 seconds
    with TaskManager(
            manager_params,
            browser_params,
            SQLiteStorageProvider(Path("./datadir/crawl-data.sqlite")),
            None,
    ) as manager:
        # Visits the sites
        for index, site in enumerate(sites):

            # Bind `site` as a default argument so each closure reports the
            # URL it was created for, not the loop's final value.
            def callback(success: bool, val: str = site) -> None:
                print(
                    f"CommandSequence for {val} ran {'successfully' if success else 'unsuccessfully'}"
                )

            # Parallelize sites over all number of browsers set above.
            command_sequence = CommandSequence(
                site,
                site_rank=index,
                callback=callback,
            )
            # Start by visiting the page
            command_sequence.append_command(GetCommand(url=site, sleep=3),
                                            timeout=60)
            # Have a look at custom_command.py to see how to implement your own command
            command_sequence.append_command(LinkCountingCommand())
            # Run commands across all browsers (simple parallelization)
            manager.execute_command_sequence(command_sequence)
    # Falling off the end returns None implicitly; the explicit
    # `return (None)` from the original added nothing.
Esempio n. 12
0
SENTRY_DSN = os.getenv("SENTRY_DSN", None)
LOGGER_SETTINGS = mp_logger.parse_config_from_env()

# The callstack instrument only works when the JS instrument is active,
# so force JS_INSTRUMENT on in that case.
if CALLSTACK_INSTRUMENT is True:
    JS_INSTRUMENT = True

# Twice the worst-case duration of a single visit (load + dwell + slack).
EXTENDED_LEASE_TIME = 2 * (TIMEOUT + DWELL_TIME + 30)

# Loads the default manager params
# We can't use more than one browser per instance because the job management
# code below requires blocking commands. For more context see:
# https://github.com/openwpm/OpenWPM/issues/470
NUM_BROWSERS = 1
manager_params = ManagerParams()
browser_params = [BrowserParams() for _ in range(NUM_BROWSERS)]

# Browser configuration: apply the env-driven settings to every browser.
for i in range(NUM_BROWSERS):
    one_browser = browser_params[i]
    one_browser.display_mode = DISPLAY_MODE
    one_browser.http_instrument = HTTP_INSTRUMENT
    one_browser.cookie_instrument = COOKIE_INSTRUMENT
    one_browser.navigation_instrument = NAVIGATION_INSTRUMENT
    one_browser.callstack_instrument = CALLSTACK_INSTRUMENT
    one_browser.js_instrument = JS_INSTRUMENT
    one_browser.js_instrument_settings = JS_INSTRUMENT_SETTINGS
    # SAVE_CONTENT: "1" forces on, "0" forces off, anything else keeps
    # the BrowserParams default.
    if SAVE_CONTENT == "1":
        one_browser.save_content = True
    elif SAVE_CONTENT == "0":
        one_browser.save_content = False
Esempio n. 13
0
from openwpm.config import BrowserParams, ManagerParams
from openwpm.storage.sql_provider import SQLiteStorageProvider
from openwpm.task_manager import TaskManager

# Demo crawl setup: a single headless browser visiting three example sites.

# The list of sites that we wish to crawl
NUM_BROWSERS = 1
sites = [
    "http://www.example.com",
    "http://www.princeton.edu",
    "http://citp.princeton.edu/",
]

# Loads the default ManagerParams
# and NUM_BROWSERS copies of the default BrowserParams

manager_params = ManagerParams(num_browsers=NUM_BROWSERS)
browser_params = [
    BrowserParams(display_mode="headless") for _ in range(NUM_BROWSERS)
]

# Update browser configuration (use this for per-browser settings)
for i in range(NUM_BROWSERS):
    # Record HTTP Requests and Responses
    browser_params[i].http_instrument = True
    # Record cookie changes
    browser_params[i].cookie_instrument = True
    # Record Navigations
    browser_params[i].navigation_instrument = True
    # Record JS Web API calls
    browser_params[i].js_instrument = True
    # Record the callstack of all WebRequests made
Esempio n. 14
0
MAX_JOB_RETRIES = int(os.getenv("MAX_JOB_RETRIES", "2"))

# The env var holds JSON text; decode it once into the settings structure.
JS_INSTRUMENT_SETTINGS = json.loads(JS_INSTRUMENT_SETTINGS)

# The callstack instrument depends on the JS instrument being enabled,
# so force JS_INSTRUMENT on in that case.
if CALLSTACK_INSTRUMENT is True:
    JS_INSTRUMENT = True

# Twice the worst-case duration of a single visit (load + dwell + slack).
EXTENDED_LEASE_TIME = 2 * (TIMEOUT + DWELL_TIME + 30)

# Loads the default manager params
# We can't use more than one browser per instance because the job management
# code below requires blocking commands. For more context see:
# https://github.com/mozilla/OpenWPM/issues/470
NUM_BROWSERS = 1
manager_params = ManagerParams()
browser_params = [BrowserParams() for _ in range(NUM_BROWSERS)]

# Browser configuration: apply the env-driven settings to every browser.
for i in range(NUM_BROWSERS):
    one_browser = browser_params[i]
    one_browser.display_mode = DISPLAY_MODE
    one_browser.http_instrument = HTTP_INSTRUMENT
    one_browser.cookie_instrument = COOKIE_INSTRUMENT
    one_browser.navigation_instrument = NAVIGATION_INSTRUMENT
    one_browser.callstack_instrument = CALLSTACK_INSTRUMENT
    one_browser.js_instrument = JS_INSTRUMENT
    one_browser.js_instrument_settings = JS_INSTRUMENT_SETTINGS
    # SAVE_CONTENT: "1" forces on, "0" forces off, anything else keeps
    # the BrowserParams default.
    if SAVE_CONTENT == "1":
        one_browser.save_content = True
    elif SAVE_CONTENT == "0":
        one_browser.save_content = False
Esempio n. 15
0
def crawl(sites, db_filename):
    """Crawl *sites* in parallel and write the results to an SQLite database.

    Parameters
    ----------
    sites : list of str
        The list of sites that we wish to crawl.
    db_filename : str
        File name (without extension) of the output database; the data is
        written to ./datadir/<db_filename>.sqlite.
    """

    # Loads the default ManagerParams
    # and NUM_BROWSERS copies of the default BrowserParams
    NUM_BROWSERS = 12

    manager_params = ManagerParams(num_browsers=NUM_BROWSERS)
    browser_params = [
        BrowserParams(display_mode="headless") for _ in range(NUM_BROWSERS)
    ]

    # Update browser configuration (use this for per-browser settings)
    for i in range(NUM_BROWSERS):
        browser_params[i].http_instrument = True        # HTTP requests/responses
        browser_params[i].cookie_instrument = True      # cookie changes
        browser_params[i].navigation_instrument = True  # navigations
        browser_params[i].js_instrument = True          # JS Web API calls
        browser_params[i].callstack_instrument = True   # WebRequest callstacks
        browser_params[i].dns_instrument = True         # DNS resolution
        # Randomized scrolling/timing to reduce bot detection.
        browser_params[i].bot_mitigation = True

    # Update TaskManager configuration (use this for crawl-wide settings)
    manager_params.data_directory = Path("./datadir/")
    manager_params.log_directory = Path("./datadir/")

    # Commands time out by default after 60 seconds
    with TaskManager(
            manager_params,
            browser_params,
            SQLiteStorageProvider(Path(f"./datadir/{db_filename}.sqlite")),
            None,
    ) as manager:
        # Visits the sites
        for index, site in enumerate(sites):

            # Bind `site` as a default argument so each closure reports the
            # URL it was created for, not the loop's final value.
            def callback(success: bool, val: str = site) -> None:
                print(
                    f"CommandSequence for {val} ran {'successfully' if success else 'unsuccessfully'}"
                )

            # Parallelize sites over all number of browsers set above.
            # reset=True gives every visit a fresh browser profile.
            command_sequence = CommandSequence(
                site,
                site_rank=index,
                reset=True,
                callback=callback,
            )

            # Start by visiting the page
            command_sequence.append_command(GetCommand(url=site, sleep=3),
                                            timeout=60)

            # Run commands across the browsers (simple parallelization)
            manager.execute_command_sequence(command_sequence)