Code Example #1
File: test_extension.py   Project: mihaiqc/soen321
    def test_extension_gets_correct_visit_id(self):
        manager_params, browser_params = self.get_config()
        manager = TaskManager.TaskManager(manager_params, browser_params)

        url_a = utilities.BASE_TEST_URL + '/simple_a.html'
        url_b = utilities.BASE_TEST_URL + '/simple_b.html'

        manager.get(url_a)
        manager.get(url_b)
        manager.close()
        qry_res = db_utils.query_db(
            manager_params['db'], "SELECT visit_id, site_url FROM site_visits")

        # Construct dict mapping site_url to visit_id
        visit_ids = dict()
        for row in qry_res:
            visit_ids[row[1]] = row[0]

        simple_a_visit_id = db_utils.query_db(
            manager_params['db'], "SELECT visit_id FROM javascript WHERE "
            "symbol=?", ("window.navigator.userAgent", ))

        simple_b_visit_id = db_utils.query_db(
            manager_params['db'], "SELECT visit_id FROM javascript WHERE "
            "symbol=?", ("window.navigator.platform", ))

        assert visit_ids[url_a] == simple_a_visit_id[0][0]
        assert visit_ids[url_b] == simple_b_visit_id[0][0]
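
The site_url-to-visit_id mapping built above reappears in the later examples (#3 and #10). As a minimal sketch of the same idiom, assuming query_db returns (visit_id, site_url) tuples as in the query above, the loop can be collapsed into a dict comprehension:

    # Sketch: same mapping as above, assuming rows of (visit_id, site_url)
    rows = db_utils.query_db(
        manager_params['db'], "SELECT visit_id, site_url FROM site_visits")
    visit_ids = {site_url: visit_id for visit_id, site_url in rows}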
Code Example #2
    def test_get_site_visits_table_valid(self):
        """Check that get works and populates db correctly."""
        # Run the test crawl
        manager_params, browser_params = self.get_config()
        manager = TaskManager.TaskManager(manager_params, browser_params)

        # Set up two sequential get commands to two URLS
        cs_a = CommandSequence.CommandSequence(url_a)
        cs_a.get(sleep=1)
        cs_b = CommandSequence.CommandSequence(url_b)
        cs_b.get(sleep=1)

        # Perform the get commands
        manager.execute_command_sequence(cs_a)
        manager.execute_command_sequence(cs_b)
        manager.close()

        qry_res = db_utils.query_db(manager_params['db'],
                                    "SELECT site_url FROM site_visits")

        # We had two separate page visits
        assert len(qry_res) == 2

        assert qry_res[0][0] == url_a
        assert qry_res[1][0] == url_b
Code Example #3
    def test_get_http_tables_valid(self):
        """Check that get works and populates http tables correctly."""
        # Run the test crawl
        manager_params, browser_params = self.get_config()
        manager = TaskManager.TaskManager(manager_params, browser_params)

        # Set up two sequential get commands to two URLS
        cs_a = CommandSequence.CommandSequence(url_a)
        cs_a.get(sleep=1)
        cs_b = CommandSequence.CommandSequence(url_b)
        cs_b.get(sleep=1)

        manager.execute_command_sequence(cs_a)
        manager.execute_command_sequence(cs_b)
        manager.close()

        qry_res = db_utils.query_db(
            manager_params['db'],
            "SELECT visit_id, site_url FROM site_visits")

        # Construct dict mapping site_url to visit_id
        visit_ids = dict()
        for row in qry_res:
            visit_ids[row[1]] = row[0]

        qry_res = db_utils.query_db(manager_params['db'],
                                    "SELECT visit_id FROM http_requests"
                                    " WHERE url = ?", (url_a,))
        assert qry_res[0][0] == visit_ids[url_a]

        qry_res = db_utils.query_db(manager_params['db'],
                                    "SELECT visit_id FROM http_requests"
                                    " WHERE url = ?", (url_b,))
        assert qry_res[0][0] == visit_ids[url_b]

        qry_res = db_utils.query_db(manager_params['db'],
                                    "SELECT visit_id FROM http_responses"
                                    " WHERE url = ?", (url_a,))
        assert qry_res[0][0] == visit_ids[url_a]

        qry_res = db_utils.query_db(manager_params['db'],
                                    "SELECT visit_id FROM http_responses"
                                    " WHERE url = ?", (url_b,))
        assert qry_res[0][0] == visit_ids[url_b]
Code Example #4
File: test_extension.py   Project: mihaiqc/soen321
    def test_property_enumeration(self):
        test_url = utilities.BASE_TEST_URL + '/property_enumeration.html'
        db = self.visit(test_url)
        rows = db_utils.query_db(db,
                                 "SELECT script_url, symbol FROM javascript")
        observed_symbols = set()
        for script_url, symbol in rows:
            assert script_url == test_url
            observed_symbols.add(symbol)
        assert PROPERTIES == observed_symbols
Code Example #5
    def test_flash_cookies(self):
        """ Check that some Flash LSOs are saved and
        are properly keyed in db."""
        # Run the test crawl
        manager_params, browser_params = self.get_config()
        browser_params[0]['disable_flash'] = False
        manager = TaskManager.TaskManager(manager_params, browser_params)

        # Get a site we know sets Flash cookies and visit it twice
        lso_value_a = utilities.rand_str(8)
        expected_lso_content_a[5] = lso_value_a  # expected to be present
        qry_str = '?lso_test_key=%s&lso_test_value=%s' % ("test_key",
                                                          lso_value_a)
        test_url_a = utilities.BASE_TEST_URL + '/lso/setlso.html' + qry_str
        cs = CommandSequence.CommandSequence(test_url_a)
        cs.get(sleep=3, timeout=120)
        cs.dump_flash_cookies()
        manager.execute_command_sequence(cs)

        lso_value_b = utilities.rand_str(8)
        expected_lso_content_b[5] = lso_value_b  # expected to be present
        qry_str = '?lso_test_key=%s&lso_test_value=%s' % ("test_key",
                                                          lso_value_b)
        test_url_b = utilities.BASE_TEST_URL + '/lso/setlso.html' + qry_str
        cs = CommandSequence.CommandSequence(test_url_b)
        cs.get(sleep=3, timeout=120)
        cs.dump_flash_cookies()
        manager.execute_command_sequence(cs)

        manager.close()

        #  Check that some flash cookies are recorded
        qry_res = db_utils.query_db(manager_params['db'],
                                    "SELECT * FROM flash_cookies",
                                    as_tuple=True)
        lso_count = len(qry_res)
        assert lso_count == 2
        lso_content_a = list(qry_res[0][2:])  # Remove first two items
        lso_content_b = list(qry_res[1][2:])  # Remove first two items
        # remove randomly generated LSO directory name
        # e.g. TY2FOJUG/localtest.me/Flash.sol -> localtest.me/Flash.sol
        lso_content_a[3] = lso_content_a[3].split("/", 1)[-1]  # rm LSO dirname
        lso_content_b[3] = lso_content_b[3].split("/", 1)[-1]  # rm LSO dirname
        assert lso_content_a == expected_lso_content_a
        assert lso_content_b == expected_lso_content_b
Code Example #6
    def test_profile_cookies(self):
        """ Check that some profile cookies are saved """
        # Run the test crawl
        manager_params, browser_params = self.get_config()
        manager = TaskManager.TaskManager(manager_params, browser_params)
        # TODO update this to local test site
        url = 'http://www.yahoo.com'
        cs = CommandSequence.CommandSequence(url)
        cs.get(sleep=3, timeout=120)
        cs.dump_profile_cookies()
        manager.execute_command_sequence(cs)
        manager.close()

        # Check that some profile cookies are recorded
        qry_res = db_utils.query_db(manager_params['db'],
                                    "SELECT COUNT(*) FROM profile_cookies")
        prof_cookie_count = qry_res[0][0]
        assert prof_cookie_count > 0
Code Example #7
    def test_js_profile_cookies(self):
        """ Check that profile cookies set by JS are saved """
        # Run the test crawl
        manager_params, browser_params = self.get_config()
        manager = TaskManager.TaskManager(manager_params, browser_params)
        url = utilities.BASE_TEST_URL + "/js_cookie.html"
        cs = CommandSequence.CommandSequence(url)
        cs.get(sleep=3, timeout=120)
        cs.dump_profile_cookies()
        manager.execute_command_sequence(cs)
        manager.close()
        # Check that the JS cookie we stored is recorded
        qry_res = db_utils.query_db(manager_params['db'],
                                    "SELECT * FROM profile_cookies",
                                    as_tuple=True)
        assert len(qry_res) == 1  # we store only one cookie
        cookies = qry_res[0]  # take the first cookie
        # compare URL, domain, name, value, origin, path
        assert cookies[2:8] == expected_js_cookie
Code Example #8
    def test_custom_function(self):
        """ Test `custom_function` with an inline func that collects links """

        from src.open_wpm.automation import clientsocket

        def collect_links(table_name, scheme, **kwargs):
            """ Collect links with `scheme` and save in table `table_name` """
            driver = kwargs['driver']
            manager_params = kwargs['manager_params']
            link_urls = [
                x
                for x in (element.get_attribute("href")
                          for element in driver.find_elements_by_tag_name('a'))
                if x.startswith(scheme + '://')
            ]
            current_url = driver.current_url

            sock = clientsocket()
            sock.connect(*manager_params['aggregator_address'])

            query = ("CREATE TABLE IF NOT EXISTS %s ("
                     "top_url TEXT, link TEXT);" % table_name)
            sock.send((query, ()))

            for link in link_urls:
                query = ("INSERT INTO %s (top_url, link) "
                         "VALUES (?, ?)" % table_name)
                sock.send((query, (current_url, link)))
            sock.close()

        manager_params, browser_params = self.get_config()
        manager = TaskManager.TaskManager(manager_params, browser_params)
        cs = CommandSequence.CommandSequence(url_a)
        cs.get(sleep=0, timeout=60)
        cs.run_custom_function(collect_links, ('page_links', 'http'))
        manager.execute_command_sequence(cs)
        manager.close()
        query_result = db_utils.query_db(
            manager_params['db'],
            "SELECT top_url, link FROM page_links;",
            as_tuple=True)
        assert PAGE_LINKS == set(query_result)
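
Code Example #8 illustrates the run_custom_function interface: the callable receives the positional arguments given in the command sequence plus keyword arguments such as driver (the Selenium WebDriver) and manager_params, and it writes to the crawl database by sending (query, params) tuples over a clientsocket to the data aggregator. The following is a minimal sketch of another custom function under that same assumed interface; the collect_title function and the page_titles table are hypothetical:

    # Sketch only; clientsocket imported as in Code Example #8
    def collect_title(table_name, **kwargs):
        """ Record the current page title in `table_name` (hypothetical table) """
        driver = kwargs['driver']
        manager_params = kwargs['manager_params']

        sock = clientsocket()
        sock.connect(*manager_params['aggregator_address'])
        sock.send(("CREATE TABLE IF NOT EXISTS %s ("
                   "top_url TEXT, title TEXT);" % table_name, ()))
        sock.send(("INSERT INTO %s (top_url, title) "
                   "VALUES (?, ?)" % table_name,
                   (driver.current_url, driver.title)))
        sock.close()

    # Usage, mirroring the test above:
    # cs.run_custom_function(collect_title, ('page_titles',))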
Code Example #9
File: test_crawl.py   Project: mihaiqc/soen321
    def test_browser_profile_coverage(self, tmpdir):
        """ Test the coverage of the browser's profile

        This verifies that Firefox's places.sqlite database contains
        all visited sites (with a few exceptions). If it does not,
        it is likely the profile is lost at some point during the crawl
        """
        # Run the test crawl
        data_dir = os.path.join(str(tmpdir), 'data_dir')
        manager_params, browser_params = self.get_config(data_dir)
        manager = TaskManager.TaskManager(manager_params, browser_params)
        for site in TEST_SITES:
            manager.get(site)
        ff_db_tar = os.path.join(browser_params[0]['profile_archive_dir'],
                                 'profile.tar.gz')
        manager.close()

        # Extract crawl profile
        with tarfile.open(ff_db_tar) as tar:
            tar.extractall(browser_params[0]['profile_archive_dir'])

        # Output databases
        ff_db = os.path.join(browser_params[0]['profile_archive_dir'],
                             'places.sqlite')
        crawl_db = manager_params['db']

        # Grab urls from crawl database
        rows = db_utils.query_db(crawl_db, "SELECT url FROM http_requests")
        req_ps = set()  # visited domains from http_requests table
        for url, in rows:
            req_ps.add(psl.get_public_suffix(urlparse(url).hostname))

        hist_ps = set()  # visited domains from CrawlHistory Table
        successes = dict()
        rows = db_utils.query_db(
            crawl_db, "SELECT arguments, bool_success "
            "FROM CrawlHistory WHERE command='GET'")
        for url, success in rows:
            ps = psl.get_public_suffix(urlparse(url).hostname)
            hist_ps.add(ps)
            successes[ps] = success

        # Grab urls from Firefox database
        profile_ps = set()  # visited domains from firefox profile
        rows = db_utils.query_db(ff_db, "SELECT url FROM moz_places")
        for host, in rows:
            try:
                profile_ps.add(psl.get_public_suffix(urlparse(host).hostname))
            except AttributeError:
                pass

        # We expect urls to be in the Firefox profile if:
        # 1. We've made requests to it
        # 2. The url is a top_url we entered into the address bar
        # 3. The url successfully loaded (see: Issue #40)
        # 4. The site does not respond to the initial request with a 204 (won't show in FF DB)
        missing_urls = req_ps.intersection(hist_ps).difference(profile_ps)
        unexpected_missing_urls = set()
        for url in missing_urls:
            if successes[url] == 0 or successes[url] == -1:
                continue

            # Get the visit id for the url
            rows = db_utils.query_db(
                crawl_db, "SELECT visit_id FROM site_visits "
                "WHERE site_url = ?", ('http://' + url, ))
            visit_id = rows[0][0]

            rows = db_utils.query_db(
                crawl_db, "SELECT COUNT(*) FROM http_responses "
                "WHERE visit_id = ?", (visit_id, ))
            if rows[0][0] > 1:
                continue

            rows = db_utils.query_db(
                crawl_db, "SELECT response_status, location FROM "
                "http_responses WHERE visit_id = ?", (visit_id, ))
            response_status, location = rows[0]
            if response_status == 204:
                continue
            if location == 'http://':  # site returned a blank redirect
                continue
            unexpected_missing_urls.add(url)

        assert len(unexpected_missing_urls) == 0
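
The public-suffix extraction in Code Example #9 is repeated for three different tables, once wrapped in an AttributeError guard for rows whose URL has no hostname (urlparse returns None there, and get_public_suffix then fails). A small helper sketch under the same assumptions, where psl is the module-level public suffix list object the test already uses and the helper name is hypothetical:

    def url_to_registered_domain(url):
        """ Map a URL to its registered domain, or None if it has no hostname """
        hostname = urlparse(url).hostname
        if hostname is None:
            return None  # e.g. about: entries in moz_places carry no hostname
        return psl.get_public_suffix(hostname)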
Code Example #10
    def test_browse_wrapper_http_table_valid(self):
        """Check that TaskManager.browse() wrapper works and populates
        http tables correctly.

        NOTE: Since the browse command is choosing links randomly, there is a
              (very small -- 2*0.5^20) chance this test will fail with valid
              code.
        """
        # Run the test crawl
        manager_params, browser_params = self.get_config()
        manager = TaskManager.TaskManager(manager_params, browser_params)

        # Set up two sequential browse commands to two URLS
        manager.browse(url_a, num_links=20, sleep=1)
        manager.browse(url_b, num_links=1, sleep=1)
        manager.close()

        qry_res = db_utils.query_db(
            manager_params['db'],
            "SELECT visit_id, site_url FROM site_visits"
        )

        # Construct dict mapping site_url to visit_id
        visit_ids = dict()
        for row in qry_res:
            visit_ids[row[1]] = row[0]

        qry_res = db_utils.query_db(manager_params['db'],
                                    "SELECT visit_id FROM http_requests"
                                    " WHERE url = ?", (url_a,))
        assert qry_res[0][0] == visit_ids[url_a]

        qry_res = db_utils.query_db(manager_params['db'],
                                    "SELECT visit_id FROM http_requests"
                                    " WHERE url = ?", (url_b,))
        assert qry_res[0][0] == visit_ids[url_b]

        qry_res = db_utils.query_db(manager_params['db'],
                                    "SELECT visit_id FROM http_responses"
                                    " WHERE url = ?", (url_a,))
        assert qry_res[0][0] == visit_ids[url_a]

        qry_res = db_utils.query_db(manager_params['db'],
                                    "SELECT visit_id FROM http_responses"
                                    " WHERE url = ?", (url_b,))
        assert qry_res[0][0] == visit_ids[url_b]

        # Page simple_a.html has five links:
        # 1) An absolute link to simple_c.html
        # 2) A relative link to simple_d.html
        # 3) A javascript: link
        # 4) A link to www.google.com
        # 5) A link to example.com?localtest.me
        # We should see page visits for 1 and 2, but not 3-5.
        qry_res = db_utils.query_db(manager_params['db'],
                                    "SELECT visit_id FROM http_responses"
                                    " WHERE url = ?", (url_c,))
        assert qry_res[0][0] == visit_ids[url_a]
        qry_res = db_utils.query_db(manager_params['db'],
                                    "SELECT visit_id FROM http_responses"
                                    " WHERE url = ?", (url_d,))
        assert qry_res[0][0] == visit_ids[url_a]

        # We expect 4 urls: a,c,d and a favicon request
        qry_res = db_utils.query_db(
            manager_params['db'],
            "SELECT COUNT(DISTINCT url) FROM http_responses"
            " WHERE visit_id = ?", (visit_ids[url_a],))
        assert qry_res[0][0] == 4