Example No. 1
 def test_saving(self):
     manager_params, browser_params = self.get_config()
     manager = task_manager.TaskManager(manager_params, browser_params)
     manager.get("http://example.com")
     manager.close()
     assert isfile(
         join(browser_params[0]["profile_archive_dir"], "profile.tar.gz"))
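A minimal follow-on sketch, assuming the archive is an ordinary gzipped tarball: the standard-library tarfile module can open it, and a Firefox profile normally contains a prefs.js (that particular check is illustrative, not part of the test above).

    import tarfile
    from os.path import join

    archive = join(browser_params[0]["profile_archive_dir"], "profile.tar.gz")
    with tarfile.open(archive, "r:gz") as tar:
        # Firefox profiles normally carry a prefs.js with the browser prefs
        assert any(name.endswith("prefs.js") for name in tar.getnames())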
Example No. 2
 def test_http_stacktrace(self):
     test_url = utilities.BASE_TEST_URL + "/http_stacktrace.html"
     manager_params, browser_params = self.get_config()
     manager = task_manager.TaskManager(manager_params, browser_params)
     manager.get(test_url, sleep=10)
     db = manager_params["db"]
     manager.close()
     rows = db_utils.query_db(
         db,
         (
             "SELECT hr.url, c.call_stack"
             "   FROM callstacks c"
             "   JOIN http_requests hr"
             "   ON c.request_id=hr.request_id"
             "      AND c.visit_id= hr.visit_id"
             "      AND c.browser_id = hr.browser_id;"
         ),
     )
     print("Printing callstacks contents")
     observed_records = set()
     for row in rows:
         print(row["call_stack"])
         url, call_stack = row
         test_urls = (
             "inject_pixel.js",
             "test_image.png",
             "Blank.gif",
         )
         if url.endswith(test_urls):
             observed_records.add(call_stack)
     assert HTTP_STACKTRACES == observed_records
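The suffix filter above works because str.endswith accepts a tuple and matches if any suffix matches; a standalone sketch (URLs illustrative):

    test_urls = ("inject_pixel.js", "test_image.png", "Blank.gif")
    assert "http://example.test/static/inject_pixel.js".endswith(test_urls)
    assert not "http://example.test/static/other.js".endswith(test_urls)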
Example No. 3
    def test_extension_gets_correct_visit_id(self):
        manager_params, browser_params = self.get_config()
        manager = task_manager.TaskManager(manager_params, browser_params)

        url_a = utilities.BASE_TEST_URL + "/simple_a.html"
        url_b = utilities.BASE_TEST_URL + "/simple_b.html"

        manager.get(url_a)
        manager.get(url_b)
        manager.close()
        qry_res = db_utils.query_db(
            manager_params.database_name,
            "SELECT visit_id, site_url FROM site_visits")

        # Construct dict mapping site_url to visit_id
        visit_ids = dict()
        for row in qry_res:
            visit_ids[row[1]] = row[0]

        simple_a_visit_id = db_utils.query_db(
            manager_params.database_name,
            "SELECT visit_id FROM javascript WHERE "
            "symbol=?",
            ("window.navigator.userAgent", ),
        )

        simple_b_visit_id = db_utils.query_db(
            manager_params.database_name,
            "SELECT visit_id FROM javascript WHERE "
            "symbol=?",
            ("window.navigator.platform", ),
        )

        assert visit_ids[url_a] == simple_a_visit_id[0][0]
        assert visit_ids[url_b] == simple_b_visit_id[0][0]
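The visit_ids dict built above can be expressed as a comprehension, assuming each row unpacks as (visit_id, site_url) in the order the query selects them:

    visit_ids = {site_url: visit_id for visit_id, site_url in qry_res}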
Example No. 4
    def test_content_saving(self, tmpdir):
        """ check that content is saved and hashed correctly """
        test_url = utilities.BASE_TEST_URL + "/http_test_page.html"
        manager_params, browser_params = self.get_test_config(str(tmpdir))
        browser_params[0]["http_instrument"] = True
        browser_params[0]["save_content"] = True
        manager = task_manager.TaskManager(manager_params, browser_params)
        manager.get(url=test_url, sleep=1)
        manager.close()
        db = manager_params["db"]
        rows = db_utils.query_db(db, "SELECT * FROM http_responses;")
        disk_content = dict()
        for row in rows:
            if "MAGIC_REDIRECT" in row["url"] or "404" in row["url"]:
                continue
            path = urlparse(row["url"]).path
            with open(os.path.join(BASE_PATH, path[1:]), "rb") as f:
                content = f.read()
            chash = sha256(content).hexdigest()
            assert chash == row["content_hash"]
            disk_content[chash] = content

        ldb_content = dict()
        for chash, content in db_utils.get_content(str(tmpdir)):
            chash = chash.decode("ascii")
            ldb_content[chash] = content

        for k, v in disk_content.items():
            assert v == ldb_content[k]
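The content_hash values compared above are plain SHA-256 hex digests of the response body; the hashing step in isolation:

    from hashlib import sha256

    content = b"<html><body>hello</body></html>"  # stand-in response body
    chash = sha256(content).hexdigest()
    assert len(chash) == 64  # hex encoding of a 256-bit digest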
Example No. 5
def test_document_saving(http_params, xpi, server):
    """ check that document content is saved and hashed correctly """
    test_url = utilities.BASE_TEST_URL + "/http_test_page.html"
    expected_hashes = {
        "2390eceab422db15bc45940b7e042e83e6cbd5f279f57e714bc4ad6cded7f966",
        "25343f42d9ffa5c082745f775b172db87d6e14dfbc3160b48669e06d727bfc8d",
    }
    manager_params, browser_params = http_params()
    for browser_param in browser_params:
        browser_param.http_instrument = True
        browser_param.save_content = "main_frame,sub_frame"

    structured_storage = SQLiteStorageProvider(
        db_path=manager_params.data_directory / "crawl-data.sqlite")
    ldb_path = Path(manager_params.data_directory) / "content.ldb"
    unstructured_storage = LevelDbProvider(db_path=ldb_path)
    manager = task_manager.TaskManager(manager_params, browser_params,
                                       structured_storage,
                                       unstructured_storage)

    manager.get(url=test_url, sleep=1)
    manager.close()
    for chash, content in db_utils.get_content(ldb_path):
        chash = chash.decode("ascii").lower()
        pyhash = sha256(content).hexdigest().lower()
        assert pyhash == chash  # Verify expected key (sha256 of content)
        assert chash in expected_hashes
        expected_hashes.remove(chash)
    assert len(expected_hashes) == 0  # All expected hashes have been seen
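The loop above is a drain-the-set check: every observed hash must be expected, duplicates fail because each hash is removed once seen, and the final assertion catches anything expected but never observed. The pattern in isolation:

    expected = {"a", "b"}
    for observed in ("b", "a"):
        assert observed in expected  # no unexpected items
        expected.remove(observed)    # no duplicates
    assert not expected              # nothing expected went unseen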
Example No. 6
def test_javascript_saving(http_params, xpi, server):
    """ check that javascript content is saved and hashed correctly """
    test_url = utilities.BASE_TEST_URL + "/http_test_page.html"
    manager_params, browser_params = http_params()

    for browser_param in browser_params:
        browser_param.http_instrument = True
        browser_param.save_content = "script"

    structured_storage = SQLiteStorageProvider(
        db_path=manager_params.data_directory / "crawl-data.sqlite")
    ldb_path = Path(manager_params.data_directory) / "content.ldb"
    unstructured_storage = LevelDbProvider(db_path=ldb_path)
    manager = task_manager.TaskManager(manager_params, browser_params,
                                       structured_storage,
                                       unstructured_storage)
    manager.get(url=test_url, sleep=1)
    manager.close()
    expected_hashes = {
        "0110c0521088c74f179615cd7c404816816126fa657550032f75ede67a66c7cc",
        "b34744034cd61e139f85f6c4c92464927bed8343a7ac08acf9fb3c6796f80f08",
    }
    for chash, content in db_utils.get_content(ldb_path):
        chash = chash.decode("ascii").lower()
        pyhash = sha256(content).hexdigest().lower()
        assert pyhash == chash  # Verify expected key (sha256 of content)
        assert chash in expected_hashes
        expected_hashes.remove(chash)
    assert len(expected_hashes) == 0  # All expected hashes have been seen
Example No. 7
    def test_browse_site_visits_table_valid(self, display_mode):
        """Check that CommandSequence.browse() populates db correctly."""
        # Run the test crawl
        manager_params, browser_params = self.get_config(display_mode)
        manager = task_manager.TaskManager(manager_params, browser_params)

        # Set up two sequential browse commands to two URLS
        cs_a = command_sequence.CommandSequence(url_a, site_rank=0)
        cs_a.browse(num_links=1, sleep=1)
        cs_b = command_sequence.CommandSequence(url_b, site_rank=1)
        cs_b.browse(num_links=1, sleep=1)

        manager.execute_command_sequence(cs_a)
        manager.execute_command_sequence(cs_b)
        manager.close()

        qry_res = db_utils.query_db(
            manager_params["db"], "SELECT site_url, site_rank"
            " FROM site_visits")

        # We had two separate page visits
        assert len(qry_res) == 2

        assert qry_res[0][0] == url_a
        assert qry_res[0][1] == 0
        assert qry_res[1][0] == url_b
        assert qry_res[1][1] == 1
Example No. 8
    def test_save_screenshot_valid(self, display_mode):
        """Check that 'save_screenshot' works"""
        # Run the test crawl
        manager_params, browser_params = self.get_config(display_mode)
        manager = task_manager.TaskManager(manager_params, browser_params)
        cs = command_sequence.CommandSequence(url_a)
        cs.get(sleep=1)
        cs.save_screenshot("test")
        cs.screenshot_full_page("test_full")
        manager.execute_command_sequence(cs)
        manager.close()

        # Check that viewport image is not blank
        pattern = os.path.join(str(self.tmpdir), "screenshots", "1-*-test.png")
        screenshot = glob.glob(pattern)[0]
        im = Image.open(screenshot)
        bands = im.split()
        is_blank = all(band.getextrema() == (255, 255) for band in bands)
        assert not is_blank

        # Check that full page screenshot is not blank
        pattern = os.path.join(str(self.tmpdir), "screenshots",
                               "1-*-test_full.png")
        screenshot = glob.glob(pattern)[0]
        im = Image.open(screenshot)
        bands = im.split()
        is_blank = all(band.getextrema() == (255, 255) for band in bands)
        assert not is_blank
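The blankness test works because Image.split returns one single-band image per channel and getextrema returns that band's (min, max) pixel values, so a fully white image yields (255, 255) for every band. A minimal Pillow sketch:

    from PIL import Image

    im = Image.new("RGB", (4, 4), color="white")
    assert all(band.getextrema() == (255, 255) for band in im.split())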
Example No. 9
    def test_get_site_visits_table_valid(self, display_mode):
        """Check that get works and populates db correctly."""
        # Run the test crawl
        manager_params, browser_params = self.get_config(display_mode)
        manager = task_manager.TaskManager(manager_params, browser_params)

        # Set up two sequential get commands to two URLS
        cs_a = command_sequence.CommandSequence(url_a)
        cs_a.get(sleep=1)
        cs_b = command_sequence.CommandSequence(url_b)
        cs_b.get(sleep=1)

        # Perform the get commands
        manager.execute_command_sequence(cs_a)
        manager.execute_command_sequence(cs_b)
        manager.close()

        qry_res = db_utils.query_db(manager_params["db"],
                                    "SELECT site_url FROM site_visits")

        # We had two separate page visits
        assert len(qry_res) == 2

        assert qry_res[0][0] == url_a
        assert qry_res[1][0] == url_b
Example No. 10
 def test_crash(self):
     manager_params, browser_params = self.get_config()
     manager_params["failure_limit"] = 0
     manager = task_manager.TaskManager(manager_params, browser_params)
     with pytest.raises(CommandExecutionError):
         manager.get("http://example.com")  # So we have a profile
         manager.get("example.com")  # Selenium requires scheme prefix
         manager.get("example.com")  # Requires two commands to shut down
Example No. 11
 def visit(self, page_url, data_dir="", sleep_after=0):
     """Visit a test page with the given parameters."""
     manager_params, browser_params = self.get_config(data_dir)
     manager = task_manager.TaskManager(manager_params, browser_params)
     if not page_url.startswith("http"):
         page_url = utilities.BASE_TEST_URL + page_url
     manager.get(url=page_url, sleep=sleep_after)
     manager.close()
     return manager_params.database_name
Example No. 12
    def test_get_http_tables_valid(self, display_mode):
        """Check that get works and populates http tables correctly."""
        # Run the test crawl
        manager_params, browser_params = self.get_config(display_mode)
        manager = task_manager.TaskManager(manager_params, browser_params)

        # Set up two sequential get commands to two URLS
        cs_a = command_sequence.CommandSequence(url_a)
        cs_a.get(sleep=1)
        cs_b = command_sequence.CommandSequence(url_b)
        cs_b.get(sleep=1)

        manager.execute_command_sequence(cs_a)
        manager.execute_command_sequence(cs_b)
        manager.close()

        qry_res = db_utils.query_db(
            manager_params.database_name,
            "SELECT visit_id, site_url FROM site_visits")

        # Construct dict mapping site_url to visit_id
        visit_ids = dict()
        for row in qry_res:
            visit_ids[row[1]] = row[0]

        qry_res = db_utils.query_db(
            manager_params.database_name,
            "SELECT visit_id FROM http_requests"
            " WHERE url = ?",
            (url_a, ),
        )
        assert qry_res[0][0] == visit_ids[url_a]

        qry_res = db_utils.query_db(
            manager_params.database_name,
            "SELECT visit_id FROM http_requests"
            " WHERE url = ?",
            (url_b, ),
        )
        assert qry_res[0][0] == visit_ids[url_b]

        qry_res = db_utils.query_db(
            manager_params.database_name,
            "SELECT visit_id FROM http_responses"
            " WHERE url = ?",
            (url_a, ),
        )
        assert qry_res[0][0] == visit_ids[url_a]

        qry_res = db_utils.query_db(
            manager_params.database_name,
            "SELECT visit_id FROM http_responses"
            " WHERE url = ?",
            (url_b, ),
        )
        assert qry_res[0][0] == visit_ids[url_b]
Example No. 13
    def test_record_file_upload(self):
        """Test that we correctly capture the uploaded file contents.

        We upload a CSS file and a PNG file to test both text based and
        binary files.

        File uploads are not expected in the crawl data, but we make sure we
        correctly parse the POST data in this very common scenario.

        Firefox is currently not able to return the FormData with the file
        contents, currently only the filenames are returned. This is due to
        a limitation in the current API implementation:

        https://searchfox.org/mozilla-central/rev/b3b401254229f0a26f7ee625ef5f09c6c31e3949/toolkit/components/extensions/webrequest/WebRequestUpload.jsm#339

        Therefore, the test is currently skipped.
        """
        img_file_path = os.path.abspath("test_pages/shared/test_image.png")
        css_file_path = os.path.abspath("test_pages/shared/test_style.css")

        def type_filenames_into_form(**kwargs):
            """Simulate typing into the file upload input fields."""
            driver = kwargs["driver"]
            img_file_upload_element = driver.find_element_by_id("upload-img")
            css_file_upload_element = driver.find_element_by_id("upload-css")
            img_file_upload_element.send_keys(img_file_path)
            css_file_upload_element.send_keys(css_file_path)
            sleep(5)  # wait for the form submission (3 sec after onload)

        manager_params, browser_params = self.get_config()
        manager = task_manager.TaskManager(manager_params, browser_params)
        test_url = utilities.BASE_TEST_URL + "/post_file_upload.html"
        cs = command_sequence.CommandSequence(test_url)
        cs.get(sleep=0, timeout=60)
        cs.run_custom_function(type_filenames_into_form, ())
        manager.execute_command_sequence(cs)
        manager.close()

        post_body = self.get_post_request_body_from_db(
            manager_params.database_name)
        # Binary strings get put into the database as if they were latin-1.
        with open(img_file_path, "rb") as f:
            img_file_content = f.read().strip().decode("latin-1")
        with open(css_file_path, "rt") as f:
            css_file_content = f.read().strip()
        # POST data is stored as JSON in the DB
        post_body_decoded = json.loads(post_body)
        expected_body = {
            u"username": u"name surname+",
            u"upload-css": css_file_content,
            u"upload-img": img_file_content,
        }
        assert expected_body == post_body_decoded
Example No. 14
    def test_custom_function(self):
        """ Test `custom_function` with an inline func that collects links """

        from openwpm.socket_interface import ClientSocket

        def collect_links(table_name, scheme, **kwargs):
            """ Collect links with `scheme` and save in table `table_name` """
            driver = kwargs["driver"]
            manager_params = kwargs["manager_params"]
            browser_id = kwargs["command"].browser_id
            visit_id = kwargs["command"].visit_id
            link_urls = [
                x
                for x in (element.get_attribute("href")
                          for element in driver.find_elements_by_tag_name("a"))
                if x.startswith(scheme + "://")
            ]
            current_url = driver.current_url

            sock = ClientSocket()
            sock.connect(*manager_params.aggregator_address)

            query = ("CREATE TABLE IF NOT EXISTS %s ("
                     "top_url TEXT, link TEXT, "
                     "visit_id INTEGER, browser_id INTEGER);" % table_name)
            sock.send(("create_table", query))

            for link in link_urls:
                query = (
                    table_name,
                    {
                        "top_url": current_url,
                        "link": link,
                        "visit_id": visit_id,
                        "browser_id": browser_id,
                    },
                )
                sock.send(query)
            sock.close()

        manager_params, browser_params = self.get_config()
        manager = task_manager.TaskManager(manager_params, browser_params)
        cs = command_sequence.CommandSequence(url_a)
        cs.get(sleep=0, timeout=60)
        cs.run_custom_function(collect_links, ("page_links", "http"))
        manager.execute_command_sequence(cs)
        manager.close()
        query_result = db_utils.query_db(
            manager_params.database_name,
            "SELECT top_url, link FROM page_links;",
            as_tuple=True,
        )
        assert PAGE_LINKS == set(query_result)
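A side note on the queries above: the table name is interpolated with % because SQL placeholders bind values, never identifiers. A minimal sqlite3 sketch of that distinction (names illustrative; the table name must come from a trusted source):

    import sqlite3

    table_name = "page_links"  # identifiers cannot be bound as parameters
    conn = sqlite3.connect(":memory:")
    conn.execute("CREATE TABLE IF NOT EXISTS %s "
                 "(top_url TEXT, link TEXT);" % table_name)
    # Values, by contrast, should always be bound:
    conn.execute("INSERT INTO %s (top_url, link) VALUES (?, ?);" % table_name,
                 ("http://example.test/", "http://example.test/a"))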
Example No. 15
 def test_crash_profile(self):
     manager_params, browser_params = self.get_config()
     manager_params["failure_limit"] = 2
     manager = task_manager.TaskManager(manager_params, browser_params)
     try:
         manager.get("http://example.com")  # So we have a profile
         manager.get("example.com")  # Selenium requires scheme prefix
         manager.get("example.com")  # Selenium requires scheme prefix
         manager.get("example.com")  # Selenium requires scheme prefix
         manager.get("example.com")  # Requires two commands to shut down
     except CommandExecutionError:
         pass
     assert isfile(
         join(browser_params[0]["profile_archive_dir"], "profile.tar.gz"))
Example No. 16
 def test_commit_on_timeout(self):
     TEST_SITE = "%s/s3_aggregator.html" % BASE_TEST_URL
     manager_params, browser_params = self.get_config(num_browsers=1)
     manager_params.s3_directory = "s3-aggregator-tests-2"
     manager = task_manager.TaskManager(manager_params, browser_params)
     manager.get(TEST_SITE, sleep=1)
     dataset = LocalS3Dataset(manager_params.s3_bucket, manager_params.s3_directory)
     with pytest.raises((FileNotFoundError, OSError)):
         requests = dataset.load_table("http_requests")
     time.sleep(45)  # Current timeout
     dataset2 = LocalS3Dataset(manager_params.s3_bucket, manager_params.s3_directory)
     requests = dataset2.load_table("http_requests")
     assert TEST_SITE in requests.top_level_url.unique()
     manager.close()
Example No. 17
    def test_parse_neterror_integration(self):
        manager_params, browser_params = self.get_config()
        manager = task_manager.TaskManager(manager_params, browser_params)
        manager.get("http://website.invalid")
        manager.close()

        get_command = db_utils.query_db(
            manager_params["db"],
            "SELECT command_status, error FROM crawl_history WHERE command = \"<class 'openwpm.commands.types.GetCommand'>\"",
            as_tuple=True,
        )[0]

        assert get_command[0] == "neterror"
        assert get_command[1] == "dnsNotFound"
Example No. 18
    def test_custom_function(self):
        """ Test `custom_function` with an inline func that collects links """

        manager_params, browser_params = self.get_config()
        manager = task_manager.TaskManager(manager_params, browser_params)
        cs = command_sequence.CommandSequence(url_a)
        cs.get(sleep=0, timeout=60)
        cs.append_command(CollectLinksCommand("http", "page_links"))
        manager.execute_command_sequence(cs)
        manager.close()
        query_result = db_utils.query_db(
            manager_params.database_name,
            "SELECT top_url, link FROM page_links;",
            as_tuple=True,
        )
        assert PAGE_LINKS == set(query_result)
Example No. 19
    def test_command_duration(self):
        manager_params, browser_params = self.get_config()
        manager = task_manager.TaskManager(manager_params, browser_params)
        manager.get(url=TEST_URL, sleep=5)
        manager.close()

        get_command = db_utils.query_db(
            manager_params.database_name,
            "SELECT duration FROM crawl_history WHERE command = \"<class 'openwpm.commands.types.GetCommand'>\"",
            as_tuple=True,
        )[0]

        # Duration is recorded in milliseconds: at least the 5 s sleep,
        # plus up to 2 s of allowance for the command's own execution time.
        assert get_command[0] > 5 * 1000
        assert get_command[0] <= (5 * 1000) + (2 * 1000)
Example No. 20
 def visit(self,
           page_url: str,
           data_dir: Optional[Path] = None,
           sleep_after: int = 0) -> Path:
     """Visit a test page with the given parameters."""
     manager_params, browser_params = self.get_config(data_dir)
     if data_dir:
         db_path = data_dir / "crawl-data.sqlite"
     else:
         db_path = self.tmpdir / "crawl-data.sqlite"
     structured_provider = SQLiteStorageProvider(db_path)
     manager = task_manager.TaskManager(manager_params, browser_params,
                                        structured_provider, None)
     if not page_url.startswith("http"):
         page_url = utilities.BASE_TEST_URL + page_url
     manager.get(url=page_url, sleep=sleep_after)
     manager.close()
     return db_path
Example No. 21
    def test_recursive_dump_page_source_valid(self, display_mode):
        """Check that 'recursive_dump_page_source' works"""
        # Run the test crawl
        manager_params, browser_params = self.get_config(display_mode)
        manager = task_manager.TaskManager(manager_params, browser_params)
        cs = command_sequence.CommandSequence(NESTED_FRAMES_URL)
        cs.get(sleep=1)
        cs.recursive_dump_page_source()
        manager.execute_command_sequence(cs)
        manager.close()

        outfile = os.path.join(str(self.tmpdir), "sources", "1-*.json.gz")
        src_file = glob.glob(outfile)[0]
        with gzip.GzipFile(src_file, "rb") as f:
            visit_source = json.loads(f.read().decode("utf-8"))

        observed_parents = dict()

        def verify_frame(frame, parent_frames=[]):
            # Verify structure
            observed_parents[frame["doc_url"]] = list(parent_frames)  # copy

            # Verify source
            path = urlparse(frame["doc_url"]).path
            expected_source = ""
            with open("." + path, "r") as f:
                expected_source = re.sub(r"\s", "", f.read().lower())
                if expected_source.startswith("<!doctypehtml>"):
                    expected_source = expected_source[14:]
            observed_source = re.sub(r"\s", "", frame["source"].lower())
            if observed_source.startswith("<!doctypehtml>"):
                observed_source = observed_source[14:]
            assert observed_source == expected_source

            # Verify children
            parent_frames.append(frame["doc_url"])
            for key, child_frame in frame["iframes"].items():
                verify_frame(child_frame, parent_frames)
            parent_frames.pop()

        verify_frame(visit_source)
        assert EXPECTED_PARENTS == observed_parents
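The source comparison above normalizes both sides before comparing: lowercase, strip all whitespace, then drop the optional doctype prefix, which is 14 characters once whitespace is removed. The normalization in isolation:

    import re

    def normalize(html):
        s = re.sub(r"\s", "", html.lower())
        return s[14:] if s.startswith("<!doctypehtml>") else s

    assert normalize("<!DOCTYPE html>\n<p>Hi</p>") == normalize("<p>hi</p>")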
Example No. 22
    def test_basic_properties(self):
        TEST_SITE = "%s/s3_aggregator.html" % BASE_TEST_URL
        NUM_VISITS = 2
        NUM_BROWSERS = 4
        manager_params, browser_params = self.get_config(num_browsers=NUM_BROWSERS)
        manager = task_manager.TaskManager(manager_params, browser_params)
        for _ in range(NUM_VISITS * NUM_BROWSERS):
            manager.get(TEST_SITE, sleep=1)
        manager.close()

        dataset = LocalS3Dataset(
            manager_params["s3_bucket"], manager_params["s3_directory"]
        )

        # Test visit_id consistency
        visit_ids = defaultdict(set)
        expected_tables = dict(PQ_SCHEMAS)
        # We don't expect incomplete visits to exist
        # since the visit shouldn't be interrupted
        expected_tables.pop("incomplete_visits")
        for table_name in expected_tables:
            table = dataset.load_table(table_name)
            visit_ids[table_name] = table.visit_id.unique()
            actual = len(visit_ids[table_name])
            expected = NUM_VISITS * NUM_BROWSERS
            assert actual == expected, (
                f"Table {table_name} had {actual} " f"visit_ids, we expected {expected}"
            )
            for vid in visit_ids[table_name]:
                assert (vid >= 0) and (vid < (1 << 53))
        for table_name, ids in visit_ids.items():
            assert set(ids) == set(visit_ids["site_visits"])

        # Ensure http table is created
        assert TEST_SITE in dataset.load_table("http_requests").top_level_url.unique()

        # Ensure config directory is created and contains the correct number
        # of configuration files
        config_file = dataset.list_files("config", prepend_root=True)
        assert len(config_file) == 1  # only one instance started in test
        config = json.loads(str(dataset.get_file(config_file[0]), "utf-8"))
        assert len(config["browser_params"]) == NUM_BROWSERS
Example No. 23
 def test_document_saving(self, tmpdir):
     """ check that document content is saved and hashed correctly """
     test_url = utilities.BASE_TEST_URL + "/http_test_page.html"
     expected_hashes = {
         "2390eceab422db15bc45940b7e042e83e6cbd5f279f57e714bc4ad6cded7f966",
         "25343f42d9ffa5c082745f775b172db87d6e14dfbc3160b48669e06d727bfc8d",
     }
     manager_params, browser_params = self.get_test_config(str(tmpdir))
     browser_params[0]["http_instrument"] = True
     browser_params[0]["save_content"] = "main_frame,sub_frame"
     manager = task_manager.TaskManager(manager_params, browser_params)
     manager.get(url=test_url, sleep=1)
     manager.close()
     for chash, content in db_utils.get_content(str(tmpdir)):
         chash = chash.decode("ascii").lower()
         pyhash = sha256(content).hexdigest().lower()
         assert pyhash == chash  # Verify expected key (sha256 of content)
         assert chash in expected_hashes
         expected_hashes.remove(chash)
     assert len(expected_hashes) == 0  # All expected hashes have been seen
Example No. 24
 def test_javascript_saving(self, tmpdir):
     """ check that javascript content is saved and hashed correctly """
     test_url = utilities.BASE_TEST_URL + "/http_test_page.html"
     manager_params, browser_params = self.get_test_config(str(tmpdir))
     browser_params[0]["http_instrument"] = True
     browser_params[0]["save_content"] = "script"
     manager = task_manager.TaskManager(manager_params, browser_params)
     manager.get(url=test_url, sleep=1)
     manager.close()
     expected_hashes = {
         "0110c0521088c74f179615cd7c404816816126fa657550032f75ede67a66c7cc",
         "b34744034cd61e139f85f6c4c92464927bed8343a7ac08acf9fb3c6796f80f08",
     }
     for chash, content in db_utils.get_content(str(tmpdir)):
         chash = chash.decode("ascii").lower()
         pyhash = sha256(content).hexdigest().lower()
         assert pyhash == chash  # Verify expected key (sha256 of content)
         assert chash in expected_hashes
         expected_hashes.remove(chash)
     assert len(expected_hashes) == 0  # All expected hashes have been seen
Example No. 25
    def test_s3_callbacks(self):
        TEST_SITE = BASE_TEST_URL + "/test_pages/simple_a.html"
        manager_params, browser_params = self.get_config()
        dataset = LocalS3Dataset(manager_params.s3_bucket, manager_params.s3_directory)
        manager = task_manager.TaskManager(manager_params, browser_params)
        queue = Queue()

        def ensure_site_in_s3(success: bool):
            # Ensure http table is created
            queue.put(
                TEST_SITE in dataset.load_table("http_requests").top_level_url.unique()
            )

        sequence = CommandSequence(
            TEST_SITE, reset=True, blocking=True, callback=ensure_site_in_s3
        )
        sequence.get()
        manager.execute_command_sequence(sequence)
        manager.close()

        assert queue.get()
Example No. 26
    def test_dump_page_source_valid(self, display_mode):
        """Check that 'dump_page_source' works and source is saved properly."""
        # Run the test crawl
        manager_params, browser_params = self.get_config(display_mode)
        manager = task_manager.TaskManager(manager_params, browser_params)
        cs = command_sequence.CommandSequence(url_a)
        cs.get(sleep=1)
        cs.dump_page_source(suffix="test")
        manager.execute_command_sequence(cs)
        manager.close()

        # Source filename is of the following structure:
        # `sources/<visit_id>-<md5_of_url>(-suffix).html`
        # thus for this test we expect `sources/1-<md5_of_test_url>-test.html`.
        outfile = os.path.join(str(self.tmpdir), "sources", "1-*-test.html")
        source_file = glob.glob(outfile)[0]
        with open(source_file, "rb") as f:
            actual_source = f.read()
        with open("./test_pages/expected_source.html", "rb") as f:
            expected_source = f.read()

        assert actual_source == expected_source
Example No. 27
    def test_profile_saved_when_launch_crashes(self):
        manager_params, browser_params = self.get_config()
        browser_params[0]["proxy"] = True
        browser_params[0]["save_content"] = "script"
        manager = task_manager.TaskManager(manager_params, browser_params)
        manager.get("http://example.com")

        # Kill the LevelDBAggregator
        # This will cause the proxy launch to crash
        manager.ldb_status_queue.put("DIE")
        manager.browsers[0]._SPAWN_TIMEOUT = 2  # Have timeout occur quickly
        manager.browsers[0]._UNSUCCESSFUL_SPAWN_LIMIT = 2  # Quick timeout
        manager.get("example.com")  # Cause a selenium crasht

        # The browser will fail to launch due to the proxy crashes
        try:
            manager.get("http://example.com")
        except CommandExecutionError:
            pass
        manager.close()
        assert isfile(
            join(browser_params[0]["profile_archive_dir"], "profile.tar.gz"))
Example No. 28
 def test_js_profile_cookies(self):
     """ Check that profile cookies set by JS are saved """
     # Run the test crawl
     manager_params, browser_params = self.get_config()
     browser_params[0].cookie_instrument = True
     manager = task_manager.TaskManager(manager_params, browser_params)
     url = utilities.BASE_TEST_URL + "/js_cookie.html"
     cs = command_sequence.CommandSequence(url)
     cs.get(sleep=3, timeout=120)
     manager.execute_command_sequence(cs)
     manager.close()
     # Check that the JS cookie we stored is recorded
     qry_res = db_utils.query_db(
         manager_params.database_name,
         ("SELECT visit_id, record_type, change_cause, is_http_only, "
          "is_host_only, is_session, host, is_secure, name, path, "
          "value, same_site FROM javascript_cookies"),
         as_tuple=True,
     )
     assert len(qry_res) == 1  # we store only one cookie
     cookies = qry_res[0]  # take the first cookie
     # compare URL, domain, name, value, origin, path
     assert cookies == expected_js_cookie
Example No. 29
def test_content_saving(http_params, xpi, server):
    """ check that content is saved and hashed correctly """
    test_url = utilities.BASE_TEST_URL + "/http_test_page.html"
    manager_params, browser_params = http_params()
    for browser_param in browser_params:
        browser_param.http_instrument = True
        browser_param.save_content = True
    db = manager_params.data_directory / "crawl-data.sqlite"
    structured_storage = SQLiteStorageProvider(db_path=db)
    ldb_path = Path(manager_params.data_directory) / "content.ldb"
    unstructured_storage = LevelDbProvider(db_path=ldb_path)
    manager = task_manager.TaskManager(manager_params, browser_params,
                                       structured_storage,
                                       unstructured_storage)
    manager.get(url=test_url, sleep=1)
    manager.close()

    rows = db_utils.query_db(db, "SELECT * FROM http_responses;")
    disk_content = dict()
    for row in rows:
        if "MAGIC_REDIRECT" in row["url"] or "404" in row["url"]:
            continue
        path = urlparse(row["url"]).path
        with open(os.path.join(BASE_PATH, path[1:]), "rb") as f:
            content = f.read()
        chash = sha256(content).hexdigest()
        assert chash == row["content_hash"]
        disk_content[chash] = content

    ldb_content = dict()
    for chash, content in db_utils.get_content(ldb_path):
        chash = chash.decode("ascii")
        ldb_content[chash] = content

    for k, v in disk_content.items():
        assert v == ldb_content[k]
Example No. 30
    def test_seed_persistance(self):
        def test_config_is_set(*args, **kwargs):
            driver = kwargs["driver"]
            driver.get("about:config")
            result = driver.execute_script("""
                var prefs = Components
                            .classes["@mozilla.org/preferences-service;1"]
                            .getService(Components.interfaces.nsIPrefBranch);
                try {
                    return prefs.getBoolPref("test_pref")
                } catch (e) {
                    return false;
                }
            """)
            assert result

        manager_params, browser_params = self.get_test_config(num_browsers=1)
        browser_params[0]["seed_tar"] = "."
        command_sequences = []
        for _ in range(2):
            cs = CommandSequence(url="https://example.com", reset=True)
            cs.get()
            cs.run_custom_function(test_config_is_set)
            command_sequences.append(cs)
        manager = task_manager.TaskManager(manager_params, browser_params)
        for cs in command_sequences:
            manager.execute_command_sequence(cs)
        manager.close()
        query_result = db_utils.query_db(
            manager_params["db"],
            "SELECT * FROM crawl_history;",
        )
        assert len(query_result) > 0
        for row in query_result:
            assert row["command_status"] == "ok", \
                f"Command {tuple(row)} was not ok"