Exemple #1
0
    def test_extension_gets_correct_visit_id(self) -> None:
        """Check that javascript records carry the visit_id of their page.

        Visits ``simple_a.html`` and then ``simple_b.html``, then compares
        the ``visit_id`` that ``site_visits`` holds for each URL with the
        ``visit_id`` of the ``javascript`` row for one symbol per page.
        """
        base_url = utilities.BASE_TEST_URL
        url_a = base_url + "/simple_a.html"
        url_b = base_url + "/simple_b.html"
        self.visit(url_a)
        # The value returned by the second visit is what gets queried below
        db = self.visit(url_b)

        # Map each visited site_url to its visit_id
        site_rows = db_utils.query_db(
            db, "SELECT visit_id, site_url FROM site_visits")
        visit_ids = {row[1]: row[0] for row in site_rows}

        symbol_query = "SELECT visit_id FROM javascript WHERE symbol=?"
        # presumably simple_a.html reads userAgent and simple_b.html reads
        # platform -- confirm against the test pages
        user_agent_rows = db_utils.query_db(
            db, symbol_query, ("window.navigator.userAgent", ))
        platform_rows = db_utils.query_db(
            db, symbol_query, ("window.navigator.platform", ))

        assert visit_ids[url_a] == user_agent_rows[0][0]
        assert visit_ids[url_b] == platform_rows[0][0]
Exemple #2
0
    def test_extension_gets_correct_visit_id(self):
        """Check that javascript records carry the visit_id of their page.

        Drives a TaskManager through ``simple_a.html`` and ``simple_b.html``
        and compares the ``visit_id`` in ``site_visits`` for each URL with
        the ``visit_id`` of the ``javascript`` row for one symbol per page.
        """
        manager_params, browser_params = self.get_config()
        manager = task_manager.TaskManager(manager_params, browser_params)

        url_a = utilities.BASE_TEST_URL + "/simple_a.html"
        url_b = utilities.BASE_TEST_URL + "/simple_b.html"

        for url in (url_a, url_b):
            manager.get(url)
        manager.close()

        db = manager_params.database_name

        # Map each visited site_url to its visit_id
        site_rows = db_utils.query_db(
            db, "SELECT visit_id, site_url FROM site_visits")
        visit_ids = {row[1]: row[0] for row in site_rows}

        symbol_query = "SELECT visit_id FROM javascript WHERE symbol=?"
        # presumably simple_a.html reads userAgent and simple_b.html reads
        # platform -- confirm against the test pages
        user_agent_rows = db_utils.query_db(
            db, symbol_query, ("window.navigator.userAgent", ))
        platform_rows = db_utils.query_db(
            db, symbol_query, ("window.navigator.platform", ))

        assert visit_ids[url_a] == user_agent_rows[0][0]
        assert visit_ids[url_b] == platform_rows[0][0]
Exemple #3
0
    def test_page_visit(self):
        """Compare the recorded HTTP tables for one page with the fixtures.

        After visiting ``http_test_page.html`` the rows of ``http_requests``,
        ``http_responses`` and ``http_redirects`` are reduced to tuples and
        compared with ``HTTP_REQUESTS``, ``HTTP_RESPONSES`` and
        ``HTTP_REDIRECTS``. Query strings are stripped from every URL first.
        """
        test_url = utilities.BASE_TEST_URL + "/http_test_page.html"
        db = self.visit(test_url)

        # Filled from the requests table, checked against the responses table
        request_id_to_url = {}

        # HTTP Requests
        request_columns = (
            "top_level_url",
            "triggering_origin",
            "loading_origin",
            "loading_href",
            "is_XHR",
            "is_third_party_channel",
            "is_third_party_to_top_window",
            "resource_type",
        )
        observed_requests = set()
        for row in db_utils.query_db(db, "SELECT * FROM http_requests"):
            stripped_url = row["url"].split("?")[0]
            other_fields = tuple(row[column] for column in request_columns)
            observed_requests.add((stripped_url, ) + other_fields)
            request_id_to_url[row["request_id"]] = row["url"]
        assert HTTP_REQUESTS == observed_requests

        # HTTP Responses
        observed_responses: Set[Tuple[str, str]] = set()
        for row in db_utils.query_db(db, "SELECT * FROM http_responses"):
            # TODO: webext-instrumentation doesn't support referrer yet,
            # so row['referrer'] is left out of the record
            observed_responses.add((row["url"].split("?")[0], row["location"]))
            # Every response must belong to a request seen above, same URL
            request_id = row["request_id"]
            assert request_id in request_id_to_url
            assert request_id_to_url[request_id] == row["url"]
        assert HTTP_RESPONSES == observed_responses

        # HTTP Redirects
        observed_redirects = set()
        for row in db_utils.query_db(db, "SELECT * FROM http_redirects"):
            # TODO: webext instrumentation doesn't support new_request_id
            # yet, so the URLs are read from the redirect row itself rather
            # than looked up through request_id_to_url
            src = row["old_request_url"].split("?")[0]
            dst = row["new_request_url"].split("?")[0]
            # First header named "location" (case-insensitive), else None
            location = next(
                (value for header, value in json.loads(row["headers"])
                 if header.lower() == "location"),
                None,
            )
            observed_redirects.add((src, dst, location))
        assert HTTP_REDIRECTS == observed_redirects
Exemple #4
0
def test_profile_recovery(monkeypatch, default_params, task_manager_creator,
                          testcase, stateful, seed_tar):
    """Test browser profile recovery in various scenarios.

    One browser visits ``BASE_TEST_URL``, a disruption selected by
    ``testcase`` is applied, and a final command sequence asserts the
    ``test_pref`` preference and dumps the profile. The dumped profile and
    the crawl history are then inspected.
    """
    manager_params, browser_params = default_params
    manager_params.num_browsers = 1
    browser_params[0].seed_tar = seed_tar
    manager, db = task_manager_creator((manager_params, browser_params[:1]))
    reset = not stateful
    manager.get(BASE_TEST_URL, reset=reset)

    # "normal_operation" needs no extra disruption, so it has no branch here
    if testcase in ("on_crash", "on_crash_during_launch"):
        # Cause a selenium crash to force browser to restart
        manager.get("example.com", reset=reset)
        if testcase == "on_crash_during_launch":
            # This will cause browser restarts to fail
            monkeypatch.setenv("FIREFOX_BINARY", "/tmp/NOTREAL")

            # Let the launch succeed after some failed launch attempts
            def restore_environment():
                time.sleep(5)  # This should be smaller than _SPAWN_TIMEOUT
                monkeypatch.undo()

            Thread(target=restore_environment).start()
    elif testcase == "on_timeout":
        # Set a very low timeout to cause a restart
        manager.get("about:config", reset=reset, timeout=0.1)

    tar_directory = manager_params.data_directory / "browser_profile"
    tar_path = tar_directory / "profile.tar.gz"

    sequence = CommandSequence("about:config", reset=reset)
    # The pref is expected to be set exactly when a seed tar was supplied
    sequence.append_command(
        AssertConfigSetCommand("test_pref", bool(seed_tar)))
    sequence.dump_profile(tar_path, True)
    manager.execute_command_sequence(sequence)
    manager.close()

    # Check that a consistent profile is used for stateful crawls but
    # not for stateless crawls
    with tarfile.open(tar_path) as archive:
        archive.extractall(tar_directory)
    place_rows = db_utils.query_db(
        tar_directory / "places.sqlite", "SELECT url FROM moz_places")
    visited_urls = [url for (url, ) in place_rows]
    if stateful:
        assert BASE_TEST_URL in visited_urls
    else:
        assert BASE_TEST_URL not in visited_urls

    # Check if seed_tar was loaded on restart
    status_rows = db_utils.query_db(
        db,
        "SELECT command_status FROM crawl_history WHERE"
        " command='AssertConfigSetCommand'",
    )
    assert status_rows[0][0] == "ok"
    def test_get_http_tables_valid(self, display_mode):
        """Check that get works and populates http tables correctly.

        After getting ``url_a`` and ``url_b`` in separate command sequences,
        the request and response rows for each URL must carry the
        ``visit_id`` that ``site_visits`` assigned to that URL.
        """
        # Run the test crawl
        manager_params, browser_params = self.get_config(display_mode)
        manager = task_manager.TaskManager(manager_params, browser_params)

        # Build both get sequences first, then run them one after the other
        sequences = []
        for url in (url_a, url_b):
            sequence = command_sequence.CommandSequence(url)
            sequence.get(sleep=1)
            sequences.append(sequence)
        for sequence in sequences:
            manager.execute_command_sequence(sequence)
        manager.close()

        db = manager_params.database_name

        # Map each visited site_url to its visit_id
        site_rows = db_utils.query_db(
            db, "SELECT visit_id, site_url FROM site_visits")
        visit_ids = {row[1]: row[0] for row in site_rows}

        table_queries = (
            "SELECT visit_id FROM http_requests WHERE url = ?",
            "SELECT visit_id FROM http_responses WHERE url = ?",
        )
        for table_query in table_queries:
            for url in (url_a, url_b):
                rows = db_utils.query_db(db, table_query, (url, ))
                assert rows[0][0] == visit_ids[url]
def test_get_site_visits_table_valid(http_params, task_manager_creator,
                                     display_mode):
    """Check that get works and populates db correctly.

    Two get sequences (``url_a`` then ``url_b``) must leave exactly two rows
    in ``site_visits``, one per URL.
    """
    # Run the test crawl
    manager_params, browser_params = http_params(display_mode)
    manager, db = task_manager_creator((manager_params, browser_params))

    # Build both get sequences first, then run them one after the other
    sequences = []
    for url in (url_a, url_b):
        sequence = command_sequence.CommandSequence(url)
        sequence.get(sleep=1)
        sequences.append(sequence)
    for sequence in sequences:
        manager.execute_command_sequence(sequence)
    manager.close()

    rows = db_utils.query_db(
        db,
        "SELECT site_url FROM site_visits ORDER BY site_url",
    )

    # We had two separate page visits
    assert len(rows) == 2

    for position, expected_url in enumerate((url_a, url_b)):
        assert rows[position][0] == expected_url
Exemple #7
0
    def test_browse_site_visits_table_valid(self, display_mode):
        """Check that CommandSequence.browse() populates db correctly.

        Two browse sequences are run (``url_a`` with ``site_rank=0`` and
        ``url_b`` with ``site_rank=1``); ``site_visits`` must then hold
        exactly one row per sequence with the matching URL and rank.
        """
        # Run the test crawl
        manager_params, browser_params = self.get_config(display_mode)
        manager = task_manager.TaskManager(manager_params, browser_params)

        # Set up two sequential browse commands to two URLS
        cs_a = command_sequence.CommandSequence(url_a, site_rank=0)
        cs_a.browse(num_links=1, sleep=1)
        cs_b = command_sequence.CommandSequence(url_b, site_rank=1)
        cs_b.browse(num_links=1, sleep=1)

        manager.execute_command_sequence(cs_a)
        manager.execute_command_sequence(cs_b)
        manager.close()

        # ORDER BY makes the positional assertions below independent of the
        # order in which the rows happen to be stored or returned
        qry_res = db_utils.query_db(
            manager_params["db"],
            "SELECT site_url, site_rank FROM site_visits ORDER BY site_rank",
        )

        # We had two separate page visits
        assert len(qry_res) == 2

        assert qry_res[0][0] == url_a
        assert qry_res[0][1] == 0
        assert qry_res[1][0] == url_b
        assert qry_res[1][1] == 1
Exemple #8
0
    def test_get_site_visits_table_valid(self, display_mode):
        """Check that get works and populates db correctly.

        Two get sequences (``url_a`` then ``url_b``) must leave exactly two
        rows in ``site_visits``, one per URL.
        """
        # Run the test crawl
        manager_params, browser_params = self.get_config(display_mode)
        manager = task_manager.TaskManager(manager_params, browser_params)

        # Set up two sequential get commands to two URLS
        cs_a = command_sequence.CommandSequence(url_a)
        cs_a.get(sleep=1)
        cs_b = command_sequence.CommandSequence(url_b)
        cs_b.get(sleep=1)

        # Perform the get commands
        manager.execute_command_sequence(cs_a)
        manager.execute_command_sequence(cs_b)
        manager.close()

        # ORDER BY makes the positional assertions below independent of the
        # order in which the rows happen to be stored or returned
        qry_res = db_utils.query_db(
            manager_params["db"],
            "SELECT site_url FROM site_visits ORDER BY site_url",
        )

        # We had two separate page visits
        assert len(qry_res) == 2

        assert qry_res[0][0] == url_a
        assert qry_res[1][0] == url_b
Exemple #9
0
 def get_post_requests_from_db(self, db):
     """Query the crawl database and return the POST requests.

     ``db`` is passed straight through to ``db_utils.query_db``; the result
     of that call (all ``http_requests`` rows whose method is ``POST``) is
     returned unchanged.
     """
     # Single-line literal: a backslash continuation inside the string would
     # copy the source indentation into the SQL text
     return db_utils.query_db(
         db,
         "SELECT * FROM http_requests WHERE method = 'POST'",
     )
def test_browse_site_visits_table_valid(http_params, task_manager_creator,
                                        display_mode):
    """Check that CommandSequence.browse() populates db correctly.

    Two browse sequences are run (``url_a`` with ``site_rank=0`` and
    ``url_b`` with ``site_rank=1``); ``site_visits`` must then hold exactly
    one row per sequence with the matching URL and rank.
    """
    # Run the test crawl
    manager_params, browser_params = http_params(display_mode)
    manager, db = task_manager_creator((manager_params, browser_params))

    ranked_urls = (url_a, url_b)

    # Build both browse sequences first, then run them one after the other
    sequences = []
    for rank, url in enumerate(ranked_urls):
        sequence = command_sequence.CommandSequence(url, site_rank=rank)
        sequence.browse(num_links=1, sleep=1)
        sequences.append(sequence)
    for sequence in sequences:
        manager.execute_command_sequence(sequence)
    manager.close()

    rows = db_utils.query_db(
        db,
        "SELECT site_url, site_rank FROM site_visits ORDER BY site_rank",
    )

    # We had two separate page visits
    assert len(rows) == 2

    for rank, url in enumerate(ranked_urls):
        assert rows[rank][0] == url
        assert rows[rank][1] == rank
Exemple #11
0
 def test_http_stacktrace(self):
     """Compare recorded call stacks for selected requests with the fixture.

     Visits ``http_stacktrace.html``, joins ``callstacks`` with
     ``http_requests`` and collects the call stacks of requests whose URL
     ends with one of the tracked resource names.
     """
     test_url = utilities.BASE_TEST_URL + "/http_stacktrace.html"
     manager_params, browser_params = self.get_config()
     manager = task_manager.TaskManager(manager_params, browser_params)
     manager.get(test_url, sleep=10)
     db = manager_params["db"]
     manager.close()

     callstack_query = (
         "SELECT hr.url, c.call_stack"
         "   FROM callstacks c"
         "   JOIN http_requests hr"
         "   ON c.request_id=hr.request_id"
         "      AND c.visit_id= hr.visit_id"
         "      AND c.browser_id = hr.browser_id;"
     )
     rows = db_utils.query_db(db, callstack_query)

     # Only requests for these resources are compared with the fixture
     tracked_suffixes = (
         "inject_pixel.js",
         "test_image.png",
         "Blank.gif",
     )
     print("Printing callstacks contents")
     observed_stacks = set()
     for row in rows:
         print(row["call_stack"])
         request_url, stack = row
         if not request_url.endswith(tracked_suffixes):
             continue
         observed_stacks.add(stack)
     assert HTTP_STACKTRACES == observed_stacks
Exemple #12
0
def test_seed_persistence(default_params, task_manager_creator):
    """Check that every command succeeds when browsers use a seed tar.

    Each browser gets ``profile.tar.gz`` as its ``seed_tar``; two sequences
    that load ``BASE_TEST_URL`` and assert ``test_pref`` are run, and every
    ``crawl_history`` row must report ``ok``.
    """
    manager_params, browser_params = default_params
    seed_tar = Path("profile.tar.gz")
    for params in browser_params:
        params.seed_tar = seed_tar
    manager, db = task_manager_creator(default_params)

    def build_sequence():
        """Return a sequence that loads the page and asserts the pref."""
        sequence = CommandSequence(url=BASE_TEST_URL)
        sequence.get()
        sequence.append_command(AssertConfigSetCommand("test_pref", True))
        return sequence

    command_sequences = [build_sequence() for _ in range(2)]

    for sequence in command_sequences:
        manager.execute_command_sequence(sequence)
    manager.close()

    history = db_utils.query_db(
        db,
        "SELECT * FROM crawl_history;",
    )
    assert len(history) > 0
    for row in history:
        status = row["command_status"]
        assert status == "ok", f"Command {tuple(row)} was not ok"
Exemple #13
0
def test_custom_function(default_params, xpi, server):
    """ Test `custom_function` with an inline func that collects links

    The ``page_links`` table is created directly through sqlite3 before the
    crawl; the crawl then runs ``CollectLinksCommand`` on ``url_a`` and the
    stored ``(top_url, link)`` pairs are compared with ``PAGE_LINKS``.
    """
    table_name = TableName("page_links")

    manager_params, browser_params = default_params
    db_path = manager_params.data_directory / "crawl-data.sqlite"

    # Create the target table up front, then release the connection before
    # the storage provider is constructed on the same file
    create_statement = (
        """CREATE TABLE IF NOT EXISTS %s (
            top_url TEXT, link TEXT,
            visit_id INTEGER, browser_id INTEGER);"""
        % table_name
    )
    connection = sqlite3.connect(db_path)
    cursor = connection.cursor()
    cursor.execute(create_statement)
    cursor.close()
    connection.close()

    storage_provider = SQLiteStorageProvider(db_path)
    manager = TaskManager(manager_params, browser_params, storage_provider, None)

    sequence = command_sequence.CommandSequence(url_a)
    sequence.get(sleep=0, timeout=60)
    sequence.append_command(CollectLinksCommand(table_name, "http"))
    manager.execute_command_sequence(sequence)
    manager.close()

    stored_links = db_utils.query_db(
        db_path,
        "SELECT top_url, link FROM page_links;",
        as_tuple=True,
    )
    assert PAGE_LINKS == set(stored_links)
Exemple #14
0
    def test_content_saving(self, tmpdir):
        """ check that content is saved and hashed correctly

        For every recorded response (redirects and 404s excepted) the file
        served from ``BASE_PATH`` is hashed and compared with the stored
        ``content_hash``; the same bytes must also be returned by
        ``db_utils.get_content`` under that hash.
        """
        test_url = utilities.BASE_TEST_URL + "/http_test_page.html"
        manager_params, browser_params = self.get_test_config(str(tmpdir))
        first_browser = browser_params[0]
        first_browser["http_instrument"] = True
        first_browser["save_content"] = True
        manager = task_manager.TaskManager(manager_params, browser_params)
        manager.get(url=test_url, sleep=1)
        manager.close()

        responses = db_utils.query_db(
            manager_params["db"], "SELECT * FROM http_responses;")

        # Hash -> bytes of the files on disk behind each response
        disk_content = {}
        for response in responses:
            response_url = response["url"]
            # Responses with these markers in the URL are not checked
            if "MAGIC_REDIRECT" in response_url or "404" in response_url:
                continue
            # Drop the leading "/" so the URL path joins under BASE_PATH
            relative_path = urlparse(response_url).path[1:]
            with open(os.path.join(BASE_PATH, relative_path), "rb") as f:
                file_bytes = f.read()
            digest = sha256(file_bytes).hexdigest()
            assert digest == response["content_hash"]
            disk_content[digest] = file_bytes

        # Hash (decoded from ASCII bytes) -> bytes stored by the crawl
        ldb_content = {
            raw_hash.decode("ascii"): stored_bytes
            for raw_hash, stored_bytes in db_utils.get_content(str(tmpdir))
        }

        for digest, file_bytes in disk_content.items():
            assert file_bytes == ldb_content[digest]
Exemple #15
0
    def test_service_worker_requests(self):
        """Check correct URL attribution for requests made by service worker

        Every ``http_requests`` row recorded for
        ``http_service_worker_page.html`` is reduced to a tuple (URL without
        query string plus attribution columns) and the resulting set is
        compared with ``HTTP_SERVICE_WORKER_REQUESTS``.
        """
        test_url = utilities.BASE_TEST_URL + "/http_service_worker_page.html"
        db = self.visit(test_url)

        request_id_to_url = {}

        # HTTP Requests
        attribution_columns = (
            "top_level_url",
            "triggering_origin",
            "loading_origin",
            "loading_href",
            "is_XHR",
            "is_third_party_channel",
            "is_third_party_to_top_window",
            "resource_type",
        )
        observed_requests = set()
        for row in db_utils.query_db(db, "SELECT * FROM http_requests"):
            stripped_url = row["url"].split("?")[0]
            attribution = tuple(row[column] for column in attribution_columns)
            observed_requests.add((stripped_url, ) + attribution)
            request_id_to_url[row["request_id"]] = row["url"]

        assert HTTP_SERVICE_WORKER_REQUESTS == observed_requests
Exemple #16
0
 def test_name_resolution(self):
     """Check the first ``dns_responses`` row recorded for localtest.me."""
     db = self.visit("http://localtest.me:8000")
     first_response = db_utils.query_db(db, "SELECT * FROM dns_responses")[0]
     print(first_response.keys())

     # Checked in this order, one assertion per column
     expected_columns = {
         "used_address": "127.0.0.1",
         "addresses": "127.0.0.1",
         "hostname": "localtest.me",
         "canonical_name": "localtest.me",
     }
     for column, expected in expected_columns.items():
         assert first_response[column] == expected
Exemple #17
0
 def test_property_enumeration(self) -> None:
     """Check the symbols recorded for ``property_enumeration.html``.

     Every ``javascript`` row must name the test page as its script URL,
     and the set of recorded symbols must equal ``PROPERTIES``.
     """
     test_url = utilities.BASE_TEST_URL + "/property_enumeration.html"
     db = self.visit(test_url)
     js_rows = db_utils.query_db(
         db, "SELECT script_url, symbol FROM javascript")

     observed_symbols = set()
     for row in js_rows:
         script_url, symbol = row
         assert script_url == test_url
         observed_symbols.add(symbol)
     assert PROPERTIES == observed_symbols
    def test_custom_function(self):
        """ Test `custom_function` with an inline func that collects links

        ``collect_links`` is run as a custom function after getting
        ``url_a``; the ``(top_url, link)`` pairs it stores in ``page_links``
        are compared with ``PAGE_LINKS``.
        """

        from openwpm.socket_interface import ClientSocket

        def collect_links(table_name, scheme, **kwargs):
            """ Collect links with `scheme` and save in table `table_name`

            Reads ``driver``, ``manager_params`` and ``command`` from
            ``kwargs`` and sends one record per matching link over a
            ``ClientSocket`` connected to the aggregator address.
            """
            driver = kwargs["driver"]
            manager_params = kwargs["manager_params"]
            browser_id = kwargs["command"].browser_id
            visit_id = kwargs["command"].visit_id

            prefix = scheme + "://"
            hrefs = (element.get_attribute("href")
                     for element in driver.find_elements_by_tag_name("a"))
            # get_attribute returns None for anchors without an href, so
            # those are skipped instead of failing on None.startswith
            link_urls = [
                href for href in hrefs
                if href is not None and href.startswith(prefix)
            ]
            current_url = driver.current_url

            sock = ClientSocket()
            sock.connect(*manager_params.aggregator_address)
            try:
                query = ("CREATE TABLE IF NOT EXISTS %s ("
                         "top_url TEXT, link TEXT, "
                         "visit_id INTEGER, browser_id INTEGER);" % table_name)
                sock.send(("create_table", query))

                for link in link_urls:
                    record = (
                        table_name,
                        {
                            "top_url": current_url,
                            "link": link,
                            "visit_id": visit_id,
                            "browser_id": browser_id,
                        },
                    )
                    sock.send(record)
            finally:
                # Close the socket even if one of the sends raises
                sock.close()

        manager_params, browser_params = self.get_config()
        manager = task_manager.TaskManager(manager_params, browser_params)
        cs = command_sequence.CommandSequence(url_a)
        cs.get(sleep=0, timeout=60)
        cs.run_custom_function(collect_links, ("page_links", "http"))
        manager.execute_command_sequence(cs)
        manager.close()
        query_result = db_utils.query_db(
            manager_params.database_name,
            "SELECT top_url, link FROM page_links;",
            as_tuple=True,
        )
        assert PAGE_LINKS == set(query_result)
Exemple #19
0
def test_parse_neterror_integration(default_params, task_manager_creator):
    """Check the status and error recorded when a get hits a network error.

    The first ``GetCommand`` row in ``crawl_history`` for
    ``http://website.invalid`` must have status ``neterror`` and error
    ``dnsNotFound``.
    """
    manager, db = task_manager_creator(default_params)
    manager.get("http://website.invalid")
    manager.close()

    history_rows = db_utils.query_db(
        db,
        "SELECT command_status, error FROM crawl_history WHERE command ='GetCommand'",
        as_tuple=True,
    )
    command_status, error = history_rows[0]

    assert command_status == "neterror"
    assert error == "dnsNotFound"
Exemple #20
0
    def test_parse_neterror_integration(self):
        """Check the status and error recorded when a get hits a network error.

        The first ``crawl_history`` row for the GetCommand class run against
        ``http://website.invalid`` must have status ``neterror`` and error
        ``dnsNotFound``.
        """
        manager_params, browser_params = self.get_config()
        manager = task_manager.TaskManager(manager_params, browser_params)
        manager.get("http://website.invalid")
        manager.close()

        # The command name is bound as a parameter: written inline it needed
        # a double-quoted SQL string, which SQLite only accepts as a legacy
        # quirk (double quotes denote identifiers in standard SQL)
        get_command = db_utils.query_db(
            manager_params["db"],
            "SELECT command_status, error FROM crawl_history WHERE command = ?",
            ("<class 'openwpm.commands.types.GetCommand'>", ),
            as_tuple=True,
        )[0]

        assert get_command[0] == "neterror"
        assert get_command[1] == "dnsNotFound"
Exemple #21
0
def test_name_resolution(default_params, task_manager_creator):
    """Check the first ``dns_responses`` row recorded for localtest.me.

    DNS instrumentation is switched on for every browser before the crawl.
    """
    manager_params, browser_params = default_params
    for params in browser_params:
        params.dns_instrument = True

    manager, db = task_manager_creator((manager_params, browser_params))
    manager.get("http://localtest.me:8000")
    manager.close()

    first_response = db_utils.query_db(db, "SELECT * FROM dns_responses")[0]

    # Checked in this order, one assertion per column
    expected_columns = {
        "used_address": "127.0.0.1",
        "addresses": "127.0.0.1",
        "hostname": "localtest.me",
        "canonical_name": "localtest.me",
    }
    for column, expected in expected_columns.items():
        assert first_response[column] == expected
Exemple #22
0
    def test_custom_function(self):
        """ Test `custom_function` with an inline func that collects links

        Runs ``CollectLinksCommand`` after getting ``url_a`` and compares
        the ``(top_url, link)`` pairs stored in ``page_links`` with
        ``PAGE_LINKS``.
        """

        manager_params, browser_params = self.get_config()
        manager = task_manager.TaskManager(manager_params, browser_params)

        sequence = command_sequence.CommandSequence(url_a)
        sequence.get(sleep=0, timeout=60)
        sequence.append_command(CollectLinksCommand("http", "page_links"))
        manager.execute_command_sequence(sequence)
        manager.close()

        stored_links = db_utils.query_db(
            manager_params.database_name,
            "SELECT top_url, link FROM page_links;",
            as_tuple=True,
        )
        assert PAGE_LINKS == set(stored_links)
Exemple #23
0
def test_command_duration(default_params, task_manager_creator):
    """Check that the recorded GetCommand duration brackets the sleep time.

    The get sleeps for 5 seconds; the stored duration is compared in
    milliseconds and must exceed the sleep time by at most 2 seconds.
    """
    manager, db = task_manager_creator(default_params)
    manager.get(url=TEST_URL, sleep=5)
    manager.close()

    duration_rows = db_utils.query_db(
        db,
        "SELECT duration FROM crawl_history WHERE command = 'GetCommand'",
        as_tuple=True,
    )
    duration = duration_rows[0][0]

    # milliseconds conversion for sleep time
    sleep_ms = 5 * 1000
    # upper bound on the time the command itself took (milliseconds)
    overhead_ms = 2 * 1000

    assert duration > sleep_ms
    assert duration <= sleep_ms + overhead_ms
Exemple #24
0
    def test_command_duration(self):
        """Check that the recorded GetCommand duration brackets the sleep time.

        The get sleeps for 5 seconds; the stored duration is compared in
        milliseconds and must exceed the sleep time by at most 2 seconds.
        """
        manager_params, browser_params = self.get_config()
        manager = task_manager.TaskManager(manager_params, browser_params)
        manager.get(url=TEST_URL, sleep=5)
        manager.close()

        # The command name is bound as a parameter: written inline it needed
        # a double-quoted SQL string, which SQLite only accepts as a legacy
        # quirk (double quotes denote identifiers in standard SQL)
        get_command = db_utils.query_db(
            manager_params.database_name,
            "SELECT duration FROM crawl_history WHERE command = ?",
            ("<class 'openwpm.commands.types.GetCommand'>", ),
            as_tuple=True,
        )[0]

        # milliseconds conversion for sleep time
        assert get_command[0] > (5 * 1000)
        # sleep time plus the time the command itself took (milliseconds)
        assert get_command[0] <= ((5 * 1000) + 2 * 1000)
Exemple #25
0
 def test_seed_persistance(self):
     """Check that every command succeeds when the browser uses a seed tar.

     Two resetting sequences load ``https://example.com`` and run
     ``TestConfigSetCommand`` for ``test_pref``; every ``crawl_history``
     row must report ``ok``.
     """
     manager_params, browser_params = self.get_test_config(num_browsers=1)
     browser_params[0].seed_tar = "."

     def build_sequence():
         """Return a sequence that loads the page and checks the pref."""
         sequence = CommandSequence(url="https://example.com", reset=True)
         sequence.get()
         sequence.append_command(TestConfigSetCommand("test_pref", True))
         return sequence

     # The sequences are built before the manager is created
     command_sequences = [build_sequence() for _ in range(2)]

     manager = TaskManager(manager_params, browser_params)
     for sequence in command_sequences:
         manager.execute_command_sequence(sequence)
     manager.close()

     history = db_utils.query_db(
         manager_params.database_name,
         "SELECT * FROM crawl_history;",
     )
     assert len(history) > 0
     for row in history:
         status = row["command_status"]
         assert status == "ok", f"Command {tuple(row)} was not ok"
 def test_js_profile_cookies(self):
     """ Check that profile cookies set by JS are saved

     With cookie instrumentation enabled on the first browser, visiting
     ``js_cookie.html`` must leave exactly one ``javascript_cookies`` row,
     equal to ``expected_js_cookie``.
     """
     # Run the test crawl
     manager_params, browser_params = self.get_config()
     browser_params[0].cookie_instrument = True
     manager = task_manager.TaskManager(manager_params, browser_params)

     cookie_page = utilities.BASE_TEST_URL + "/js_cookie.html"
     sequence = command_sequence.CommandSequence(cookie_page)
     sequence.get(sleep=3, timeout=120)
     manager.execute_command_sequence(sequence)
     manager.close()

     # Check that the JS cookie we stored is recorded
     cookie_query = (
         "SELECT visit_id, record_type, change_cause, "
         "is_http_only, is_host_only, is_session, "
         "host, is_secure, name, path, value, same_site "
         "FROM javascript_cookies"
     )
     cookie_rows = db_utils.query_db(
         manager_params.database_name,
         cookie_query,
         as_tuple=True,
     )
     assert len(cookie_rows) == 1  # we store only one cookie

     # Compare every selected column of that single cookie
     recorded_cookie = cookie_rows[0]
     assert recorded_cookie == expected_js_cookie
def test_http_stacktrace(default_params, task_manager_creator):
    """Compare recorded call stacks for selected requests with the fixture.

    HTTP, JS and callstack instrumentation are enabled on every browser.
    After visiting ``http_stacktrace.html``, ``callstacks`` is joined with
    ``http_requests`` and the call stacks of requests whose URL ends with
    one of the tracked resource names are collected.
    """
    manager_params, browser_params = default_params
    for params in browser_params:
        # Record HTTP Requests and Responses
        params.http_instrument = True
        # Record JS Web API calls
        params.js_instrument = True
        # Record the callstack of all WebRequests made
        params.callstack_instrument = True

    test_url = utilities.BASE_TEST_URL + "/http_stacktrace.html"
    manager, db = task_manager_creator((manager_params, browser_params))
    manager.get(test_url, sleep=10)
    manager.close()

    callstack_query = (
        "SELECT hr.url, c.call_stack"
        "   FROM callstacks c"
        "   JOIN http_requests hr"
        "   ON c.request_id=hr.request_id"
        "      AND c.visit_id= hr.visit_id"
        "      AND c.browser_id = hr.browser_id;"
    )
    rows = db_utils.query_db(db, callstack_query)

    # Only requests for these resources are compared with the fixture
    tracked_suffixes = (
        "inject_pixel.js",
        "test_image.png",
        "Blank.gif",
    )
    print("Printing callstacks contents")
    observed_stacks = set()
    for row in rows:
        print(row["call_stack"])
        request_url, stack = row
        if not request_url.endswith(tracked_suffixes):
            continue
        observed_stacks.add(stack)
    assert HTTP_STACKTRACES == observed_stacks
Exemple #28
0
def test_content_saving(http_params, xpi, server):
    """ check that content is saved and hashed correctly

    For every recorded response (redirects and 404s excepted) the file
    served from ``BASE_PATH`` is hashed and compared with the stored
    ``content_hash``; the same bytes must also be returned by
    ``db_utils.get_content`` under that hash.
    """
    test_url = utilities.BASE_TEST_URL + "/http_test_page.html"
    manager_params, browser_params = http_params()
    for params in browser_params:
        params.http_instrument = True
        params.save_content = True

    # Structured records go to SQLite, saved content goes to LevelDB
    db = manager_params.data_directory / "crawl-data.sqlite"
    structured_storage = SQLiteStorageProvider(db_path=db)
    ldb_path = Path(manager_params.data_directory) / "content.ldb"
    unstructured_storage = LevelDbProvider(db_path=ldb_path)

    manager = task_manager.TaskManager(
        manager_params,
        browser_params,
        structured_storage,
        unstructured_storage,
    )
    manager.get(url=test_url, sleep=1)
    manager.close()

    responses = db_utils.query_db(db, "SELECT * FROM http_responses;")

    # Hash -> bytes of the files on disk behind each response
    disk_content = {}
    for response in responses:
        response_url = response["url"]
        # Responses with these markers in the URL are not checked
        if "MAGIC_REDIRECT" in response_url or "404" in response_url:
            continue
        # Drop the leading "/" so the URL path joins under BASE_PATH
        relative_path = urlparse(response_url).path[1:]
        with open(os.path.join(BASE_PATH, relative_path), "rb") as f:
            file_bytes = f.read()
        digest = sha256(file_bytes).hexdigest()
        assert digest == response["content_hash"]
        disk_content[digest] = file_bytes

    # Hash (decoded from ASCII bytes) -> bytes stored by the crawl
    ldb_content = {
        raw_hash.decode("ascii"): stored_bytes
        for raw_hash, stored_bytes in db_utils.get_content(ldb_path)
    }

    for digest, file_bytes in disk_content.items():
        assert file_bytes == ldb_content[digest]
Exemple #29
0
    def test_seed_persistance(self):
        """Check that every command succeeds when the browser uses a seed tar.

        Two resetting sequences load ``https://example.com`` and run an
        inline custom function that asserts ``test_pref`` is set; every
        ``crawl_history`` row must report ``ok``.
        """

        def test_config_is_set(*args, **kwargs):
            """Assert that the boolean pref ``test_pref`` reads as true."""
            driver = kwargs["driver"]
            driver.get("about:config")
            # The script yields false when reading the pref throws
            pref_is_set = driver.execute_script("""
                var prefs = Components
                            .classes["@mozilla.org/preferences-service;1"]
                            .getService(Components.interfaces.nsIPrefBranch);
                try {
                    return prefs.getBoolPref("test_pref")
                } catch (e) {
                    return false;
                }
            """)
            assert pref_is_set

        manager_params, browser_params = self.get_test_config(num_browsers=1)
        browser_params[0]["seed_tar"] = "."

        def build_sequence():
            """Return a sequence that loads the page and checks the pref."""
            sequence = CommandSequence(url="https://example.com", reset=True)
            sequence.get()
            sequence.run_custom_function(test_config_is_set)
            return sequence

        # The sequences are built before the manager is created
        command_sequences = [build_sequence() for _ in range(2)]

        manager = task_manager.TaskManager(manager_params, browser_params)
        for sequence in command_sequences:
            manager.execute_command_sequence(sequence)
        manager.close()

        history = db_utils.query_db(
            manager_params["db"],
            "SELECT * FROM crawl_history;",
        )
        assert len(history) > 0
        for row in history:
            status = row["command_status"]
            assert status == "ok", f"Command {tuple(row)} was not ok"
def test_browse_wrapper_http_table_valid(http_params, task_manager_creator,
                                         display_mode):
    """Check that TaskManager.browse() wrapper works and populates
    http tables correctly.

    NOTE: Since the browse command is choosing links randomly, there is a
          (very small -- 2*0.5^20) chance this test will fail with valid
          code.
    """
    # Run the test crawl
    manager_params, browser_params = http_params(display_mode)
    manager, db = task_manager_creator((manager_params, browser_params))

    # Two sequential browse commands to two URLS
    manager.browse(url_a, num_links=20, sleep=1)
    manager.browse(url_b, num_links=1, sleep=1)
    manager.close()

    # Map each visited site_url to its visit_id
    site_rows = db_utils.query_db(
        db, "SELECT visit_id, site_url FROM site_visits")
    visit_ids = {row[1]: row[0] for row in site_rows}

    requests_query = "SELECT visit_id FROM http_requests WHERE url = ?"
    responses_query = "SELECT visit_id FROM http_responses WHERE url = ?"

    # (query, requested url, url of the site visit that must own the row)
    #
    # Page simple_a.html has five links:
    # 1) An absolute link to simple_c.html
    # 2) A relative link to simple_d.html
    # 3) A javascript: link
    # 4) A link to www.google.com
    # 5) A link to example.com?localhost
    # We should see page visits for 1 and 2, but not 3-5, so the responses
    # for simple_c and simple_d must belong to the visit of simple_a.
    ownership_checks = (
        (requests_query, url_a, url_a),
        (requests_query, url_b, url_b),
        (responses_query, url_a, url_a),
        (responses_query, url_b, url_b),
        (responses_query, url_c, url_a),
        (responses_query, url_d, url_a),
    )
    for query, requested_url, owning_site in ownership_checks:
        rows = db_utils.query_db(db, query, (requested_url, ))
        assert rows[0][0] == visit_ids[owning_site]

    # We expect 4 urls: a,c,d and a favicon request
    count_rows = db_utils.query_db(
        db,
        "SELECT COUNT(DISTINCT url) FROM http_responses WHERE visit_id = ?",
        (visit_ids[url_a], ),
    )
    assert count_rows[0][0] == 4