def test_get_site_visits_table_valid(http_params, task_manager_creator,
                                     display_mode):
    """Check that get works and populates db correctly."""
    # Build a crawl configuration and a manager backed by a fresh database.
    manager_params, browser_params = http_params(display_mode)
    manager, db = task_manager_creator((manager_params, browser_params))

    # Queue one get() visit for each of the two test URLs.
    sequences = []
    for target in (url_a, url_b):
        seq = command_sequence.CommandSequence(target)
        seq.get(sleep=1)
        sequences.append(seq)

    # Execute both visits, then shut down so all data is flushed to the db.
    for seq in sequences:
        manager.execute_command_sequence(seq)
    manager.close()

    rows = db_utils.query_db(
        db,
        "SELECT site_url FROM site_visits ORDER BY site_url",
    )

    # Exactly one row per visited page, sorted by site_url.
    assert len(rows) == 2
    assert rows[0][0] == url_a
    assert rows[1][0] == url_b
# Beispiel #2  (scraped example-site separator; vote count "0" below -- not code)
# 0
    def test_browse_site_visits_table_valid(self, display_mode):
        """Check that CommandSequence.browse() populates db correctly."""
        # Run the test crawl
        manager_params, browser_params = self.get_config(display_mode)
        manager = task_manager.TaskManager(manager_params, browser_params)

        # Set up two sequential browse commands to two URLS
        # site_rank is a caller-supplied ordering value that should be
        # persisted verbatim into the site_visits table.
        cs_a = command_sequence.CommandSequence(url_a, site_rank=0)
        cs_a.browse(num_links=1, sleep=1)
        cs_b = command_sequence.CommandSequence(url_b, site_rank=1)
        cs_b.browse(num_links=1, sleep=1)

        manager.execute_command_sequence(cs_a)
        manager.execute_command_sequence(cs_b)
        # close() shuts the browsers down and flushes collected data.
        manager.close()

        # NOTE(review): no ORDER BY here -- the positional asserts below rely
        # on row order matching visit order; confirm if flakiness is seen.
        qry_res = db_utils.query_db(
            manager_params["db"], "SELECT site_url, site_rank"
            " FROM site_visits")

        # We had two separate page visits
        assert len(qry_res) == 2

        assert qry_res[0][0] == url_a
        assert qry_res[0][1] == 0
        assert qry_res[1][0] == url_b
        assert qry_res[1][1] == 1
def test_browse_site_visits_table_valid(http_params, task_manager_creator,
                                        display_mode):
    """Check that CommandSequence.browse() populates db correctly."""
    # Build the crawl configuration and a manager with a fresh database.
    manager_params, browser_params = http_params(display_mode)
    manager, db = task_manager_creator((manager_params, browser_params))

    # Queue and run one browse() visit (single link click) per URL,
    # assigning site ranks 0 and 1 in visit order.
    for rank, site in enumerate((url_a, url_b)):
        seq = command_sequence.CommandSequence(site, site_rank=rank)
        seq.browse(num_links=1, sleep=1)
        manager.execute_command_sequence(seq)
    manager.close()

    rows = db_utils.query_db(
        db,
        "SELECT site_url, site_rank FROM site_visits ORDER BY site_rank",
    )

    # One row per page visit, ordered by the rank we assigned.
    assert len(rows) == 2
    assert rows[0][0] == url_a
    assert rows[0][1] == 0
    assert rows[1][0] == url_b
    assert rows[1][1] == 1
# Beispiel #4  (scraped example-site separator; vote count "0" below -- not code)
# 0
    def test_get_site_visits_table_valid(self, display_mode):
        """Check that get works and populates db correctly."""
        # Run the test crawl
        manager_params, browser_params = self.get_config(display_mode)
        manager = task_manager.TaskManager(manager_params, browser_params)

        # Set up two sequential get commands to two URLS
        cs_a = command_sequence.CommandSequence(url_a)
        cs_a.get(sleep=1)
        cs_b = command_sequence.CommandSequence(url_b)
        cs_b.get(sleep=1)

        # Perform the get commands
        manager.execute_command_sequence(cs_a)
        manager.execute_command_sequence(cs_b)
        # close() blocks until the browsers exit and data is flushed.
        manager.close()

        # NOTE(review): no ORDER BY -- the positional asserts below assume
        # rows come back in visit order; confirm if this test flakes.
        qry_res = db_utils.query_db(manager_params["db"],
                                    "SELECT site_url FROM site_visits")

        # We had two separate page visits
        assert len(qry_res) == 2

        assert qry_res[0][0] == url_a
        assert qry_res[1][0] == url_b
    def test_get_http_tables_valid(self, display_mode):
        """Check that get works and populates http tables correctly."""
        # Run the test crawl
        manager_params, browser_params = self.get_config(display_mode)
        manager = task_manager.TaskManager(manager_params, browser_params)

        # Set up two sequential get commands to two URLS
        cs_a = command_sequence.CommandSequence(url_a)
        cs_a.get(sleep=1)
        cs_b = command_sequence.CommandSequence(url_b)
        cs_b.get(sleep=1)

        manager.execute_command_sequence(cs_a)
        manager.execute_command_sequence(cs_b)
        manager.close()

        qry_res = db_utils.query_db(
            manager_params.database_name,
            "SELECT visit_id, site_url FROM site_visits")

        # Construct dict mapping site_url to visit_id
        visit_ids = dict()
        for row in qry_res:
            visit_ids[row[1]] = row[0]

        # Every request/response row for a page must carry the visit_id of
        # the site visit that produced it; check both URLs in both tables.
        qry_res = db_utils.query_db(
            manager_params.database_name,
            "SELECT visit_id FROM http_requests"
            " WHERE url = ?",
            (url_a, ),
        )
        assert qry_res[0][0] == visit_ids[url_a]

        qry_res = db_utils.query_db(
            manager_params.database_name,
            "SELECT visit_id FROM http_requests"
            " WHERE url = ?",
            (url_b, ),
        )
        assert qry_res[0][0] == visit_ids[url_b]

        qry_res = db_utils.query_db(
            manager_params.database_name,
            "SELECT visit_id FROM http_responses"
            " WHERE url = ?",
            (url_a, ),
        )
        assert qry_res[0][0] == visit_ids[url_a]

        qry_res = db_utils.query_db(
            manager_params.database_name,
            "SELECT visit_id FROM http_responses"
            " WHERE url = ?",
            (url_b, ),
        )
        assert qry_res[0][0] == visit_ids[url_b]
def test_dump_page_source_valid(http_params, task_manager_creator,
                                display_mode):
    """Check that 'dump_page_source' works and source is saved properly."""
    # Build the crawl configuration and a manager with a fresh database.
    manager_params, browser_params = http_params(display_mode)
    manager, db = task_manager_creator((manager_params, browser_params))

    # Visit the page, then dump its source with the "test" suffix.
    seq = command_sequence.CommandSequence(url_a)
    seq.get(sleep=1)
    seq.dump_page_source(suffix="test")
    manager.execute_command_sequence(seq)
    manager.close()

    # Source filename is of the follow structure:
    # `sources/<visit_id>-<md5_of_url>(-suffix).html`
    # thus for this test we expect `sources/1-<md5_of_test_url>-test.html`.
    pattern = os.path.join(manager_params.data_directory, "sources",
                           "*-*-test.html")
    dumped_file = glob.glob(pattern)[0]
    with open(dumped_file, "rb") as fh:
        actual_source = fh.read()
    with open("./test_pages/expected_source.html", "rb") as fh:
        expected_source = fh.read()

    assert actual_source == expected_source
def test_save_screenshot_valid(http_params, task_manager_creator,
                               display_mode):
    """Check that 'save_screenshot' works

    Verifies that both the viewport screenshot and the full-page
    screenshot are written to disk and are not entirely blank.
    """

    def _screenshot_is_blank(path):
        """Return True iff every band of the image at `path` is pure white."""
        im = Image.open(path)
        return all(band.getextrema() == (255, 255) for band in im.split())

    # Run the test crawl
    manager_params, browser_params = http_params(display_mode)
    manager, _ = task_manager_creator((manager_params, browser_params))

    cs = command_sequence.CommandSequence(url_a)
    cs.get(sleep=1)
    cs.save_screenshot("test")
    cs.screenshot_full_page("test_full")
    manager.execute_command_sequence(cs)
    manager.close()

    # Screenshots are written to
    # `screenshots/<visit_id>-<md5_of_url>-<suffix>.png`; check that
    # neither the viewport nor the full-page capture is blank.
    # (DRY: the original duplicated this check verbatim for both files.)
    for suffix in ("test", "test_full"):
        pattern = os.path.join(manager_params.data_directory, "screenshots",
                               "*-*-{}.png".format(suffix))
        screenshot = glob.glob(pattern)[0]
        assert not _screenshot_is_blank(screenshot)
# Beispiel #8  (scraped example-site separator; vote count "0" below -- not code)
# 0
def test_custom_function(default_params, xpi, server):
    """ Test `custom_function` with an inline func that collects links """
    table_name = TableName("page_links")

    # Pre-create the results table directly in the crawl database so the
    # custom command has somewhere to write its rows.
    manager_params, browser_params = default_params
    path = manager_params.data_directory / "crawl-data.sqlite"
    db = sqlite3.connect(path)
    cur = db.cursor()

    # NOTE: %-interpolating an identifier into SQL is injection-prone in
    # general; acceptable here only because table_name is a test constant.
    cur.execute(
        """CREATE TABLE IF NOT EXISTS %s (
            top_url TEXT, link TEXT,
            visit_id INTEGER, browser_id INTEGER);"""
        % table_name
    )
    cur.close()
    db.close()

    # Run a single visit that appends the link-collecting custom command.
    storage_provider = SQLiteStorageProvider(path)
    manager = TaskManager(manager_params, browser_params, storage_provider, None)
    cs = command_sequence.CommandSequence(url_a)
    cs.get(sleep=0, timeout=60)
    cs.append_command(CollectLinksCommand(table_name, "http"))
    manager.execute_command_sequence(cs)
    manager.close()
    # Every link found on the page must have been written to the table.
    query_result = db_utils.query_db(
        path,
        "SELECT top_url, link FROM page_links;",
        as_tuple=True,
    )
    assert PAGE_LINKS == set(query_result)
    def test_record_file_upload(self):
        """Test that we correctly capture the uploaded file contents.

        We upload a CSS file and a PNG file to test both text based and
        binary files.

        File uploads are not expected in the crawl data, but we make sure we
        correctly parse the POST data in this very common scenario.

        Firefox is currently not able to return the FormData with the file
        contents, currently only the filenames are returned. This is due to
        a limitation in the current API implementation:

        https://searchfox.org/mozilla-central/rev/b3b401254229f0a26f7ee625ef5f09c6c31e3949/toolkit/components/extensions/webrequest/WebRequestUpload.jsm#339

        Therefore, the test is currently skipped.
        """
        img_file_path = os.path.abspath("test_pages/shared/test_image.png")
        css_file_path = os.path.abspath("test_pages/shared/test_style.css")

        def type_filenames_into_form(**kwargs):
            """Simulate typing into the file upload input fields."""
            # The custom-function protocol passes the live WebDriver in kwargs.
            driver = kwargs["driver"]
            img_file_upload_element = driver.find_element_by_id("upload-img")
            css_file_upload_element = driver.find_element_by_id("upload-css")
            # send_keys() with an absolute path selects the file for upload.
            img_file_upload_element.send_keys(img_file_path)
            css_file_upload_element.send_keys(css_file_path)
            sleep(5)  # wait for the form submission (3 sec after onload)

        manager_params, browser_params = self.get_config()
        manager = task_manager.TaskManager(manager_params, browser_params)
        test_url = utilities.BASE_TEST_URL + "/post_file_upload.html"
        cs = command_sequence.CommandSequence(test_url)
        cs.get(sleep=0, timeout=60)
        cs.run_custom_function(type_filenames_into_form, ())
        manager.execute_command_sequence(cs)
        manager.close()

        post_body = self.get_post_request_body_from_db(
            manager_params.database_name)
        # Binary strings get put into the database as-if they were latin-1.
        with open(img_file_path, "rb") as f:
            img_file_content = f.read().strip().decode("latin-1")
        with open(css_file_path, "rt") as f:
            css_file_content = f.read().strip()
        # POST data is stored as JSON in the DB
        post_body_decoded = json.loads(post_body)
        expected_body = {
            u"username": u"name surname+",
            u"upload-css": css_file_content,
            u"upload-img": img_file_content,
        }
        assert expected_body == post_body_decoded
    def test_custom_function(self):
        """ Test `custom_function` with an inline func that collects links """

        from openwpm.socket_interface import ClientSocket

        def collect_links(table_name, scheme, **kwargs):
            """ Collect links with `scheme` and save in table `table_name` """
            # The custom-function protocol passes the live driver plus
            # crawl metadata (manager params, current command) via kwargs.
            driver = kwargs["driver"]
            manager_params = kwargs["manager_params"]
            browser_id = kwargs["command"].browser_id
            visit_id = kwargs["command"].visit_id
            # All anchor hrefs on the page that use the requested scheme.
            link_urls = [
                x
                for x in (element.get_attribute("href")
                          for element in driver.find_elements_by_tag_name("a"))
                if x.startswith(scheme + "://")
            ]
            current_url = driver.current_url

            # Results are written through the data aggregator's socket,
            # not directly to the database.
            sock = ClientSocket()
            sock.connect(*manager_params.aggregator_address)

            # NOTE: %-interpolated identifier -- safe only because
            # table_name is a test constant.
            query = ("CREATE TABLE IF NOT EXISTS %s ("
                     "top_url TEXT, link TEXT, "
                     "visit_id INTEGER, browser_id INTEGER);" % table_name)
            sock.send(("create_table", query))

            # One (table_name, record_dict) message per discovered link.
            for link in link_urls:
                query = (
                    table_name,
                    {
                        "top_url": current_url,
                        "link": link,
                        "visit_id": visit_id,
                        "browser_id": browser_id,
                    },
                )
                sock.send(query)
            sock.close()

        manager_params, browser_params = self.get_config()
        manager = task_manager.TaskManager(manager_params, browser_params)
        cs = command_sequence.CommandSequence(url_a)
        cs.get(sleep=0, timeout=60)
        cs.run_custom_function(collect_links, ("page_links", "http"))
        manager.execute_command_sequence(cs)
        manager.close()
        # Every link found on the page must have been persisted.
        query_result = db_utils.query_db(
            manager_params.database_name,
            "SELECT top_url, link FROM page_links;",
            as_tuple=True,
        )
        assert PAGE_LINKS == set(query_result)
# Beispiel #11  (scraped example-site separator; vote count "0" below -- not code)
# 0
    def test_custom_function(self):
        """ Test `custom_function` with an inline func that collects links """

        manager_params, browser_params = self.get_config()
        manager = task_manager.TaskManager(manager_params, browser_params)

        # Visit the page, then run the link-collecting custom command on it.
        sequence = command_sequence.CommandSequence(url_a)
        sequence.get(sleep=0, timeout=60)
        sequence.append_command(CollectLinksCommand("http", "page_links"))
        manager.execute_command_sequence(sequence)
        manager.close()

        # Every link found on the page must have been written to the table.
        rows = db_utils.query_db(
            manager_params.database_name,
            "SELECT top_url, link FROM page_links;",
            as_tuple=True,
        )
        assert set(rows) == PAGE_LINKS
# Beispiel #12  (scraped example-site separator; vote count "0" below -- not code)
# 0
    def test_record_file_upload(self, task_manager_creator):
        """Test that we correctly capture the uploaded file contents.

        We upload a CSS file and a PNG file to test both text based and
        binary files.

        File uploads are not expected in the crawl data, but we make sure we
        correctly parse the POST data in this very common scenario.

        Firefox is currently not able to return the FormData with the file
        contents, currently only the filenames are returned. This is due to
        a limitation in the current API implementation:

        https://searchfox.org/mozilla-central/rev/b3b401254229f0a26f7ee625ef5f09c6c31e3949/toolkit/components/extensions/webrequest/WebRequestUpload.jsm#339

        Therefore, the test is currently skipped.
        """
        img_file_path = os.path.abspath("test_pages/shared/test_image.png")
        css_file_path = os.path.abspath("test_pages/shared/test_style.css")

        manager_params, browser_params = self.get_config()
        manager, db_path = task_manager_creator(
            (manager_params, browser_params))
        test_url = utilities.BASE_TEST_URL + "/post_file_upload.html"
        cs = command_sequence.CommandSequence(test_url)
        cs.get(sleep=0, timeout=60)
        # FilenamesIntoFormCommand types the paths into the upload inputs.
        cs.append_command(
            FilenamesIntoFormCommand(img_file_path, css_file_path))
        manager.execute_command_sequence(cs)
        manager.close()

        post_body = self.get_post_request_body_from_db(db_path)
        # Binary strings get put into the database as-if they were latin-1.
        with open(img_file_path, "rb") as f:
            img_file_content = f.read().strip().decode("latin-1")
        with open(css_file_path, "rt") as f:
            css_file_content = f.read().strip()
        # POST data is stored as JSON in the DB
        post_body_decoded = json.loads(post_body)
        # NOTE(review): "******" looks like a value redacted by the source
        # this snippet was scraped from -- confirm the expected username.
        expected_body = {
            "username": "******",
            "upload-css": css_file_content,
            "upload-img": img_file_content,
        }
        assert expected_body == post_body_decoded
def test_recursive_dump_page_source_valid(http_params, task_manager_creator,
                                          display_mode):
    """Check that 'recursive_dump_page_source' works"""
    # Run the test crawl
    manager_params, browser_params = http_params(display_mode)
    manager, db = task_manager_creator((manager_params, browser_params))
    cs = command_sequence.CommandSequence(NESTED_FRAMES_URL)
    cs.get(sleep=1)
    cs.recursive_dump_page_source()
    manager.execute_command_sequence(cs)
    manager.close()

    # Recursive dumps are saved as gzipped JSON: `sources/<visit>-<hash>.json.gz`
    outfile = os.path.join(manager_params.data_directory, "sources",
                           "*-*.json.gz")
    src_file = glob.glob(outfile)[0]
    with gzip.GzipFile(src_file, "rb") as f:
        visit_source = json.loads(f.read().decode("utf-8"))

    observed_parents = dict()

    def verify_frame(frame, parent_frames=None):
        """Recursively verify one frame's source and record its parent chain.

        BUG FIX: the original used a mutable default (`parent_frames=[]`),
        which is shared across calls; use the None-sentinel idiom instead.
        """
        if parent_frames is None:
            parent_frames = []
        # Verify structure
        observed_parents[frame["doc_url"]] = list(parent_frames)  # copy

        # Verify source: compare whitespace-stripped, lowercased HTML
        # against the on-disk fixture, ignoring an optional leading doctype.
        path = urlparse(frame["doc_url"]).path
        with open("." + path, "r") as f:
            expected_source = re.sub(r"\s", "", f.read().lower())
        if expected_source.startswith("<!doctypehtml>"):
            expected_source = expected_source[14:]
        observed_source = re.sub(r"\s", "", frame["source"].lower())
        if observed_source.startswith("<!doctypehtml>"):
            observed_source = observed_source[14:]
        assert observed_source == expected_source

        # Verify children, tracking the ancestor chain for nested iframes.
        parent_frames.append(frame["doc_url"])
        for child_frame in frame["iframes"].values():
            verify_frame(child_frame, parent_frames)
        parent_frames.pop()

    verify_frame(visit_source)
    assert EXPECTED_PARENTS == observed_parents
 def test_js_profile_cookies(self):
     """ Check that profile cookies set by JS are saved """
     # Run the test crawl
     manager_params, browser_params = self.get_config()
     # Enable cookie instrumentation on the (single) browser under test.
     browser_params[0].cookie_instrument = True
     manager = task_manager.TaskManager(manager_params, browser_params)
     url = utilities.BASE_TEST_URL + "/js_cookie.html"
     cs = command_sequence.CommandSequence(url)
     cs.get(sleep=3, timeout=120)
     manager.execute_command_sequence(cs)
     manager.close()
     # Check that the JS cookie we stored is recorded
     qry_res = db_utils.query_db(
         manager_params.database_name,
         ("SELECT visit_id, record_type, change_cause, is_http_only, "
          "is_host_only, is_session, host, is_secure, name, path, "
          "value, same_site FROM javascript_cookies"),
         as_tuple=True,
     )
     assert len(qry_res) == 1  # we store only one cookie
     cookies = qry_res[0]  # take the first cookie
     # compare URL, domain, name, value, origin, path
     assert cookies == expected_js_cookie
def test_browse_http_table_valid(http_params, task_manager_creator,
                                 display_mode):
    """Check CommandSequence.browse() works and populates http tables correctly.

    NOTE: Since the browse command is choosing links randomly, there is a
          (very small -- 2*0.5^20) chance this test will fail with valid
          code.
    """
    # Run the test crawl
    manager_params, browser_params = http_params(display_mode)
    manager, db = task_manager_creator((manager_params, browser_params))

    # Set up two sequential browse commands to two URLS
    cs_a = command_sequence.CommandSequence(url_a)
    cs_a.browse(num_links=20, sleep=1)
    cs_b = command_sequence.CommandSequence(url_b)
    cs_b.browse(num_links=1, sleep=1)

    manager.execute_command_sequence(cs_a)
    manager.execute_command_sequence(cs_b)
    manager.close()

    qry_res = db_utils.query_db(db,
                                "SELECT visit_id, site_url FROM site_visits")

    # Construct dict mapping site_url to visit_id
    visit_ids = dict()
    for row in qry_res:
        visit_ids[row[1]] = row[0]

    # Each request/response row must carry the visit_id of the page visit
    # that produced it.
    qry_res = db_utils.query_db(
        db,
        "SELECT visit_id FROM http_requests WHERE url = ?",
        (url_a, ),
    )
    assert qry_res[0][0] == visit_ids[url_a]

    qry_res = db_utils.query_db(
        db,
        "SELECT visit_id FROM http_requests WHERE url = ?",
        (url_b, ),
    )
    assert qry_res[0][0] == visit_ids[url_b]

    qry_res = db_utils.query_db(
        db,
        "SELECT visit_id FROM http_responses WHERE url = ?",
        (url_a, ),
    )
    assert qry_res[0][0] == visit_ids[url_a]

    qry_res = db_utils.query_db(
        db,
        "SELECT visit_id FROM http_responses WHERE url = ?",
        (url_b, ),
    )
    assert qry_res[0][0] == visit_ids[url_b]

    # Page simple_a.html has five links:
    # 1) An absolute link to simple_c.html
    # 2) A relative link to simple_d.html
    # 3) A javascript: link
    # 4) A link to www.google.com
    # 5) A link to example.com?localhost
    # We should see page visits for 1 and 2, but not 3-5.
    qry_res = db_utils.query_db(
        db,
        "SELECT visit_id FROM http_responses WHERE url = ?",
        (url_c, ),
    )
    assert qry_res[0][0] == visit_ids[url_a]
    qry_res = db_utils.query_db(
        db,
        "SELECT visit_id FROM http_responses WHERE url = ?",
        (url_d, ),
    )
    assert qry_res[0][0] == visit_ids[url_a]

    # We expect 4 urls: a,c,d and a favicon request
    qry_res = db_utils.query_db(
        db,
        "SELECT COUNT(DISTINCT url) FROM http_responses WHERE visit_id = ?",
        (visit_ids[url_a], ),
    )
    assert qry_res[0][0] == 4
# Beispiel #16  (scraped example-site separator; vote count "0" below -- not code)
# 0
# Update TaskManager configuration (use this for crawl-wide settings)
manager_params["data_directory"] = "~/Desktop/"
manager_params["log_directory"] = "~/Desktop/"
manager_params["memory_watchdog"] = True
manager_params["process_watchdog"] = True

# Instantiates the measurement platform
# Commands time out by default after 60 seconds
manager = task_manager.TaskManager(manager_params, browser_params)

# Visits the sites
for site in sites:

    # Parallelize sites over all number of browsers set above.
    # BUG FIX: the original rebound the name `command_sequence` (the module)
    # to the CommandSequence instance, so every iteration after the first
    # raised AttributeError; use a distinct local name instead.
    cs = command_sequence.CommandSequence(
        site,
        reset=True,
        # `val=site` binds the current URL at definition time so each
        # callback reports its own site (avoids the late-binding pitfall).
        callback=lambda success, val=site: print("CommandSequence {} done".
                                                 format(val)),
    )

    # Start by visiting the page
    cs.get(sleep=3, timeout=60)

    # Run commands across the three browsers (simple parallelization)
    manager.execute_command_sequence(cs)

# Shuts down the browsers and waits for the data to finish logging
manager.close()