Esempio n. 1
0
def test_seed_persistence(default_params, task_manager_creator):
    """Verify that a seed profile tarball yields the expected pref on every visit.

    Assigns the same seed tar to all browsers, runs two identical command
    sequences, and asserts every recorded command completed with status "ok".
    """
    manager_params, browser_params = default_params
    seed_path = Path("profile.tar.gz")
    for bp in browser_params:
        bp.seed_tar = seed_path
    manager, db = task_manager_creator(default_params)

    # Build two identical sequences: visit the test page, then assert the pref.
    sequences = []
    for _ in range(2):
        sequence = CommandSequence(url=BASE_TEST_URL)
        sequence.get()
        sequence.append_command(AssertConfigSetCommand("test_pref", True))
        sequences.append(sequence)

    for sequence in sequences:
        manager.execute_command_sequence(sequence)
    manager.close()

    records = db_utils.query_db(db, "SELECT * FROM crawl_history;")
    assert len(records) > 0
    for record in records:
        status = record["command_status"]
        assert status == "ok", f"Command {tuple(record)} was not ok"
Esempio n. 2
0
def test_profile_recovery(monkeypatch, default_params, task_manager_creator,
                          testcase, stateful, seed_tar):
    """Test browser profile recovery in various scenarios.

    Parametrized (by the surrounding test module) over:
      testcase -- "normal_operation", "on_crash", "on_crash_during_launch",
        or "on_timeout"; selects how the browser is forced to restart
        before the final checks.
      stateful -- when True, commands run without reset, so browsing
        history must survive the restart; when False it must not.
      seed_tar -- optional seed profile tarball; when provided,
        "test_pref" is expected to be set after every (re)launch.
    """
    manager_params, browser_params = default_params
    manager_params.num_browsers = 1
    browser_params[0].seed_tar = seed_tar
    # Single-browser crawl; the initial visit seeds the browsing history.
    manager, db = task_manager_creator((manager_params, browser_params[:1]))
    manager.get(BASE_TEST_URL, reset=not stateful)

    if testcase == "normal_operation":
        pass
    elif testcase == "on_crash":
        # Cause a selenium crash to force browser to restart
        manager.get("example.com", reset=not stateful)
    elif testcase == "on_crash_during_launch":
        # Cause a selenium crash to force browser to restart
        manager.get("example.com", reset=not stateful)
        # This will cause browser restarts to fail
        monkeypatch.setenv("FIREFOX_BINARY", "/tmp/NOTREAL")

        # Let the launch succeed after some failed launch attempts
        def undo_monkeypatch():
            time.sleep(5)  # This should be smaller than _SPAWN_TIMEOUT
            monkeypatch.undo()

        Thread(target=undo_monkeypatch).start()
    elif testcase == "on_timeout":
        # Set a very low timeout to cause a restart
        manager.get("about:config", reset=not stateful, timeout=0.1)

    cs = CommandSequence("about:config", reset=not stateful)
    # The pref should be present iff a seed profile was provided, since the
    # seed tar is (re)loaded whenever the browser launches.
    expected_value = True if seed_tar else False
    cs.append_command(AssertConfigSetCommand("test_pref", expected_value))
    tar_directory = manager_params.data_directory / "browser_profile"
    tar_path = tar_directory / "profile.tar.gz"
    # Dump the full profile so places.sqlite can be inspected below.
    cs.dump_profile(tar_path, True)
    manager.execute_command_sequence(cs)
    manager.close()

    # Check that a consistent profile is used for stateful crawls but
    # not for stateless crawls
    with tarfile.open(tar_path) as tar:
        tar.extractall(tar_directory)
    ff_db = tar_directory / "places.sqlite"
    rows = db_utils.query_db(ff_db, "SELECT url FROM moz_places")
    places = [url for (url, ) in rows]
    if stateful:
        assert BASE_TEST_URL in places
    else:
        assert BASE_TEST_URL not in places

    # Check if seed_tar was loaded on restart
    rows = db_utils.query_db(
        db,
        "SELECT command_status FROM crawl_history WHERE"
        " command='AssertConfigSetCommand'",
    )
    assert rows[0][0] == "ok"
Esempio n. 3
0
    def test_display_shutdown(self):
        """Check the virtual display lockfile is removed after manager.close().

        Runs a sequence that raises (via ExceptionCommand) to exercise the
        shutdown path, then asserts the X display lockfile is gone.
        """
        manager_params, browser_params = self.get_config()
        TEST_SITE = BASE_TEST_URL + "/test_pages/simple_a.html"
        manager = TaskManager(manager_params, browser_params)
        port = manager.browsers[0].display_port

        sequence = CommandSequence(TEST_SITE)
        sequence.get()
        # Append an instance, not the class, consistent with how
        # append_command is called elsewhere (ExceptionCommand()).
        sequence.append_command(ExceptionCommand())
        manager.execute_command_sequence(sequence)
        manager.close()
        assert not os.path.exists("/tmp/.X%s-lock" % port)
Esempio n. 4
0
def test_assertion_error_propagation(
    task_manager_creator, default_params, testing, expectation
):
    """Test that assertion errors bubble up through the TaskManager when running tests"""
    manager_params, browser_params = default_params
    manager_params.num_browsers = 1
    manager_params.testing = testing
    manager, _ = task_manager_creator((manager_params, browser_params[:1]))
    sequence = CommandSequence("http://example.com", blocking=True)
    sequence.append_command(CrashingAssertionCommand())
    # The expectation context decides whether the error is expected to escape.
    with expectation, manager:
        manager.execute_command_sequence(sequence)
def test_display_shutdown(task_manager_creator, default_params):
    """Test the XVFB display option to see if it runs and deletes the lockfile upon shutdown"""
    manager_params, browser_params = default_params
    for bp in browser_params:
        bp.display_mode = "xvfb"
    target_url = BASE_TEST_URL + "/test_pages/simple_a.html"
    manager, db = task_manager_creator((manager_params, browser_params))
    display_port = manager.browsers[0].display_port

    # Run a sequence that raises, then shut down and check the lockfile.
    cs = CommandSequence(target_url)
    cs.get()
    cs.append_command(ExceptionCommand())
    manager.execute_command_sequence(cs)
    manager.close()
    lockfile = "/tmp/.X%s-lock" % display_port
    assert not os.path.exists(lockfile)
Esempio n. 6
0
 def test_seed_persistance(self):
     """Verify a seed profile keeps "test_pref" set across reset visits.

     NOTE(review): "persistance" is a typo in the public test name; kept
     as-is so external references to this test keep working.
     """
     manager_params, browser_params = self.get_test_config(num_browsers=1)
     # "." as seed_tar — presumably resolved relative to the working
     # directory by the browser launcher; verify against get_test_config.
     browser_params[0].seed_tar = "."
     command_sequences = []
     # Two reset visits: the pref must be restored from the seed each time.
     for _ in range(2):
         cs = CommandSequence(url="https://example.com", reset=True)
         cs.get()
         cs.append_command(TestConfigSetCommand("test_pref", True))
         command_sequences.append(cs)
     manager = TaskManager(manager_params, browser_params)
     for cs in command_sequences:
         manager.execute_command_sequence(cs)
     manager.close()
     # Every recorded command must have completed successfully.
     query_result = db_utils.query_db(
         manager_params.database_name,
         "SELECT * FROM crawl_history;",
     )
     assert len(query_result) > 0
     for row in query_result:
         assert row[
             "command_status"] == "ok", f"Command {tuple(row)} was not ok"
# memory_watchdog and process_watchdog are useful for large scale cloud crawls.
# Please refer to docs/Configuration.md#platform-configuration-options for more information
# manager_params.memory_watchdog = True
# manager_params.process_watchdog = True

# Instantiate the measurement platform.
# Commands time out by default after 60 seconds.
manager = TaskManager(manager_params, browser_params)

# Visit each site, parallelized over the browsers configured above.
for site in sites:

    def _report_done(success, val=site):
        # Default argument binds the current site (avoids late-binding closures).
        print("CommandSequence {} done".format(val))

    command_sequence = CommandSequence(
        site,
        reset=True,
        callback=_report_done,
    )

    # Start by visiting the page.
    command_sequence.append_command(GetCommand(url=site, sleep=10), timeout=60)
    # Have a look at custom_command.py to see how to implement your own command.
    command_sequence.append_command(LinkCountingCommand())

    # Run the sequence (simple parallelization across browsers).
    manager.execute_command_sequence(command_sequence)

# Shut down the browsers and wait for the data to finish logging.
manager.close()
Esempio n. 8
0
def processSite(site):
    """Crawl a single site with one headless browser, recording full instrumentation.

    Data is written to ./datadir/crawl-data.sqlite. Always returns None.
    """
    NUM_BROWSERS = 1
    sites = [site]

    # Default manager parameters plus NUM_BROWSERS headless browser configs.
    manager_params = ManagerParams(num_browsers=NUM_BROWSERS)
    browser_params = [
        BrowserParams(display_mode="headless") for _ in range(NUM_BROWSERS)
    ]

    # Per-browser settings: enable every instrument we record.
    for bp in browser_params:
        bp.http_instrument = True        # HTTP requests and responses
        bp.cookie_instrument = True      # cookie changes
        bp.navigation_instrument = True  # navigations
        bp.js_instrument = True          # JS Web API calls
        bp.callstack_instrument = True   # callstacks of all WebRequests
        bp.dns_instrument = True         # DNS resolution

    # Crawl-wide settings.
    manager_params.data_directory = Path("./datadir/")
    manager_params.log_directory = Path("./datadir/")

    # memory_watchdog and process_watchdog are useful for large scale cloud crawls.
    # Please refer to docs/Configuration.md#platform-configuration-options for more information
    # manager_params.memory_watchdog = True
    # manager_params.process_watchdog = True

    # Commands time out by default after 60 seconds.
    with TaskManager(
            manager_params,
            browser_params,
            SQLiteStorageProvider(Path("./datadir/crawl-data.sqlite")),
            None,
    ) as manager:
        for index, site in enumerate(sites):

            def callback(success: bool, val: str = site) -> None:
                # Default argument binds the current site for the closure.
                print(
                    f"CommandSequence for {val} ran {'successfully' if success else 'unsuccessfully'}"
                )

            command_sequence = CommandSequence(
                site,
                site_rank=index,
                callback=callback,
            )
            # Visit the page first, then count links.
            command_sequence.append_command(GetCommand(url=site, sleep=3),
                                            timeout=60)
            # Have a look at custom_command.py to see how to implement your own command.
            command_sequence.append_command(LinkCountingCommand())
            manager.execute_command_sequence(command_sequence)
    return None
Esempio n. 9
0
def crawl(sites, db_filename):
    """Crawl *sites* with 12 headless browsers, recording full instrumentation.

    sites: the list of sites that we wish to crawl
    db_filename: base name (without extension) of the output sqlite database,
        written to ./datadir/<db_filename>.sqlite
    """
    # Default manager parameters plus NUM_BROWSERS headless browser configs.
    NUM_BROWSERS = 12

    manager_params = ManagerParams(num_browsers=NUM_BROWSERS)
    browser_params = [
        BrowserParams(display_mode="headless") for _ in range(NUM_BROWSERS)
    ]

    # Per-browser settings: enable every instrument we record.
    for bp in browser_params:
        bp.http_instrument = True        # HTTP requests and responses
        bp.cookie_instrument = True      # cookie changes
        bp.navigation_instrument = True  # navigations
        bp.js_instrument = True          # JS Web API calls
        bp.callstack_instrument = True   # callstacks of all WebRequests
        bp.dns_instrument = True         # DNS resolution
        bp.bot_mitigation = True         # bot-detection mitigations

    # Crawl-wide settings.
    manager_params.data_directory = Path("./datadir/")
    manager_params.log_directory = Path("./datadir/")

    # Commands time out by default after 60 seconds.
    db_path = Path("./datadir/{}.sqlite".format(db_filename))
    with TaskManager(
            manager_params,
            browser_params,
            SQLiteStorageProvider(db_path),
            None,
    ) as manager:
        for index, site in enumerate(sites):

            def callback(success: bool, val: str = site) -> None:
                # Default argument binds the current site for the closure.
                print(
                    f"CommandSequence for {val} ran {'successfully' if success else 'unsuccessfully'}"
                )

            command_sequence = CommandSequence(
                site,
                site_rank=index,
                reset=True,
                callback=callback,
            )
            # Visit the page; parallelized across the browsers by the manager.
            command_sequence.append_command(GetCommand(url=site, sleep=3),
                                            timeout=60)
            manager.execute_command_sequence(command_sequence)