Example #1
0
    def test_extension_gets_correct_visit_id(self):
        """Ensure javascript-table rows carry the visit_id of their page."""
        manager_params, browser_params = self.get_config()
        manager = TaskManager.TaskManager(manager_params, browser_params)

        page_a = utilities.BASE_TEST_URL + '/simple_a.html'
        page_b = utilities.BASE_TEST_URL + '/simple_b.html'

        manager.get(page_a)
        manager.get(page_b)
        manager.close()

        rows = db_utils.query_db(
            manager_params['db'], "SELECT visit_id, site_url FROM site_visits")

        # Map each site_url to the visit_id recorded for it
        visit_ids = {site_url: visit_id for visit_id, site_url in rows}

        # Each page accesses a distinct symbol, so the symbol identifies
        # which page a javascript record came from.
        visit_id_a = db_utils.query_db(
            manager_params['db'], "SELECT visit_id FROM javascript WHERE "
            "symbol=?", ("window.navigator.userAgent", ))

        visit_id_b = db_utils.query_db(
            manager_params['db'], "SELECT visit_id FROM javascript WHERE "
            "symbol=?", ("window.navigator.platform", ))

        assert visit_id_a[0][0] == visit_ids[page_a]
        assert visit_id_b[0][0] == visit_ids[page_b]
Example #2
0
    def test_content_saving(self, tmpdir):
        """ check that content is saved and hashed correctly """
        test_url = utilities.BASE_TEST_URL + '/http_test_page.html'
        manager_params, browser_params = self.get_test_config(str(tmpdir))
        browser_params[0]['http_instrument'] = True
        browser_params[0]['save_all_content'] = True
        manager = TaskManager.TaskManager(manager_params, browser_params)
        manager.get(url=test_url, sleep=1)
        manager.close()

        # Hash each locally-served file the crawl should have fetched
        expected_content = dict()
        responses = db_utils.query_db(
            manager_params['db'], "SELECT * FROM http_responses;")
        for response in responses:
            if 'MAGIC_REDIRECT' in response['url'] or '404' in response['url']:
                continue
            rel_path = urlparse(response['url']).path
            with open(os.path.join(BASE_PATH, rel_path[1:]), 'rb') as infile:
                body = infile.read()
            digest = sha256(body).hexdigest()
            # TODO: webext instrumentation doesn't save the content_hash yet.
            # assert digest == response['content_hash']
            expected_content[digest] = body

        # Content saved by the crawl, keyed by its hash
        saved_content = {
            key.decode('ascii'): value
            for key, value in db_utils.get_javascript_content(str(tmpdir))
        }

        for digest, body in expected_content.items():
            assert saved_content[digest] == body
Example #3
0
    def test_get_site_visits_table_valid(self):
        """Check that get works and populates db correctly."""
        # Run the test crawl
        manager_params, browser_params = self.get_config()
        manager = TaskManager.TaskManager(manager_params, browser_params)

        # Visit each test URL with a sequential get command
        for site in (url_a, url_b):
            sequence = CommandSequence.CommandSequence(site)
            sequence.get(sleep=1)
            manager.execute_command_sequence(sequence)
        manager.close()

        rows = db_utils.query_db(manager_params['db'],
                                 "SELECT site_url FROM site_visits")

        # One row per page visit, recorded in visit order
        assert len(rows) == 2
        assert rows[0][0] == url_a
        assert rows[1][0] == url_b
Example #4
0
    def test_get_http_tables_valid(self):
        """Check that get works and populates http tables correctly."""
        # Run the test crawl
        manager_params, browser_params = self.get_config()
        manager = TaskManager.TaskManager(manager_params, browser_params)

        # Visit each test URL with a sequential get command
        for site in (url_a, url_b):
            sequence = CommandSequence.CommandSequence(site)
            sequence.get(sleep=1)
            manager.execute_command_sequence(sequence)
        manager.close()

        rows = db_utils.query_db(
            manager_params['db'],
            "SELECT visit_id, site_url FROM site_visits")

        # Map each site_url to the visit_id recorded for it
        visit_ids = {site_url: visit_id for visit_id, site_url in rows}

        # The request and response rows for each page must carry the
        # visit_id assigned to that page's visit.
        for table in ("http_requests", "http_responses"):
            for site in (url_a, url_b):
                rows = db_utils.query_db(
                    manager_params['db'],
                    "SELECT visit_id FROM " + table + " WHERE url = ?",
                    (site,))
                assert rows[0][0] == visit_ids[site]
Example #5
0
    def test_page_visit(self):
        """Verify the http_* tables after a single visit to the test page."""
        test_url = utilities.BASE_TEST_URL + '/http_test_page.html'
        db = self.visit(test_url)

        # request_id -> url, used to join responses back to their requests
        request_id_to_url = dict()

        # HTTP Requests
        seen = set()
        for record in db_utils.query_db(db, "SELECT * FROM http_requests"):
            # Strip query strings so records match the expected fixtures
            seen.add(
                (record['url'].split('?')[0],
                 fix_about_page_url(record['top_level_url']),
                 record['triggering_origin'], record['loading_origin'],
                 record['loading_href'], record['is_XHR'],
                 record['is_frame_load'], record['is_full_page'],
                 record['is_third_party_channel'],
                 record['is_third_party_to_top_window'],
                 record['resource_type']))
            request_id_to_url[record['request_id']] = record['url']
        assert HTTP_REQUESTS == seen

        # HTTP Responses
        seen = set()
        for record in db_utils.query_db(db, "SELECT * FROM http_responses"):
            seen.add((
                record['url'].split('?')[0],
                # TODO: webext-instrumentation doesn't support referrer
                # yet | record['referrer'],
                record['location']))
            # Every response must pair with a recorded request
            assert record['request_id'] in request_id_to_url
            assert request_id_to_url[record['request_id']] == record['url']
        assert HTTP_RESPONSES == seen

        # HTTP Redirects
        seen = set()
        for record in db_utils.query_db(db, "SELECT * FROM http_redirects"):
            # TODO: webext instrumentation doesn't support new_request_id yet
            # src = request_id_to_url[record['old_request_id']].split('?')[0]
            # dst = request_id_to_url[record['new_request_id']].split('?')[0]
            seen.add((record['old_request_url'].split('?')[0],
                      record['new_request_url'].split('?')[0]))
        assert HTTP_REDIRECTS == seen
Example #6
0
 def test_property_enumeration(self):
     """Check enumerated properties recorded in the javascript table."""
     test_url = utilities.BASE_TEST_URL + '/property_enumeration.html'
     db = self.visit(test_url)
     rows = db_utils.query_db(db,
                              "SELECT script_url, symbol FROM javascript")
     # Every record must originate from the test page itself
     for script_url, _ in rows:
         assert script_url == test_url
     # The set of observed symbols must match the expected fixture exactly
     assert PROPERTIES == {symbol for _, symbol in rows}
Example #7
0
    def test_flash_cookies(self):
        """ Check that some Flash LSOs are saved and
        are properly keyed in db."""
        # Run the test crawl
        manager_params, browser_params = self.get_config()
        browser_params[0]['disable_flash'] = False
        manager = TaskManager.TaskManager(manager_params, browser_params)

        def set_lso_and_dump(lso_value):
            # Visit the LSO-setting page with the given value, then
            # dump the resulting Flash cookies to the database.
            qry_str = '?lso_test_key=%s&lso_test_value=%s' % ("test_key",
                                                              lso_value)
            seq = CommandSequence.CommandSequence(
                utilities.BASE_TEST_URL + '/lso/setlso.html' + qry_str)
            seq.get(sleep=3, timeout=120)
            seq.dump_flash_cookies()
            manager.execute_command_sequence(seq)

        # Visit a site we know sets Flash cookies, twice with random values
        value_a = utilities.rand_str(8)
        expected_lso_content_a[5] = value_a  # expected to be present
        set_lso_and_dump(value_a)

        value_b = utilities.rand_str(8)
        expected_lso_content_b[5] = value_b  # expected to be present
        set_lso_and_dump(value_b)

        manager.close()

        #  Check that some flash cookies are recorded
        records = db_utils.query_db(manager_params['db'],
                                    "SELECT * FROM flash_cookies",
                                    as_tuple=True)
        assert len(records) == 2
        content_a = list(records[0][2:])  # Remove first two items
        content_b = list(records[1][2:])  # Remove first two items
        # remove randomly generated LSO directory name
        # e.g. TY2FOJUG/localtest.me/Flash.sol -> localtest.me/Flash.sol
        content_a[3] = content_a[3].split("/", 1)[-1]  # rm LSO dirname
        content_b[3] = content_b[3].split("/", 1)[-1]  # rm LSO dirname
        assert content_a == expected_lso_content_a
        assert content_b == expected_lso_content_b
Example #8
0
    def test_profile_cookies(self):
        """ Check that some profile cookies are saved """
        # Run the test crawl
        manager_params, browser_params = self.get_config()
        manager = TaskManager.TaskManager(manager_params, browser_params)
        # TODO update this to local test site
        url = 'http://www.yahoo.com'
        sequence = CommandSequence.CommandSequence(url)
        sequence.get(sleep=3, timeout=120)
        sequence.dump_profile_cookies()
        manager.execute_command_sequence(sequence)
        manager.close()

        # At least one profile cookie should have been recorded
        rows = db_utils.query_db(manager_params['db'],
                                 "SELECT COUNT(*) FROM profile_cookies")
        assert rows[0][0] > 0
Example #9
0
    def test_custom_function(self):
        """ Test `custom_function` with an inline func that collects links """

        from SocketInterface import clientsocket

        def collect_links(table_name, scheme, **kwargs):
            """ Collect links with `scheme` and save in table `table_name` """
            driver = kwargs['driver']
            manager_params = kwargs['manager_params']
            # Keep only anchor hrefs that use the requested scheme
            hrefs = (anchor.get_attribute("href")
                     for anchor in driver.find_elements_by_tag_name('a'))
            link_urls = [href for href in hrefs
                         if href.startswith(scheme + '://')]
            current_url = driver.current_url

            # Forward results to the data aggregator over its socket
            sock = clientsocket()
            sock.connect(*manager_params['aggregator_address'])

            query = ("CREATE TABLE IF NOT EXISTS %s ("
                     "top_url TEXT, link TEXT);" % table_name)
            sock.send(("create_table", query))

            for link in link_urls:
                sock.send((table_name, {"top_url": current_url,
                                        "link": link}))
            sock.close()

        manager_params, browser_params = self.get_config()
        manager = TaskManager.TaskManager(manager_params, browser_params)
        sequence = CommandSequence.CommandSequence(url_a)
        sequence.get(sleep=0, timeout=60)
        sequence.run_custom_function(collect_links, ('page_links', 'http'))
        manager.execute_command_sequence(sequence)
        manager.close()
        rows = db_utils.query_db(
            manager_params['db'],
            "SELECT top_url, link FROM page_links;",
            as_tuple=True)
        assert PAGE_LINKS == set(rows)
Example #10
0
 def test_js_profile_cookies(self):
     """ Check that profile cookies set by JS are saved """
     # Run the test crawl
     manager_params, browser_params = self.get_config()
     manager = TaskManager.TaskManager(manager_params, browser_params)
     url = utilities.BASE_TEST_URL + "/js_cookie.html"
     sequence = CommandSequence.CommandSequence(url)
     sequence.get(sleep=3, timeout=120)
     sequence.dump_profile_cookies()
     manager.execute_command_sequence(sequence)
     manager.close()
     # Check that the JS cookie we stored is recorded
     rows = db_utils.query_db(
         manager_params['db'],
         "SELECT * FROM profile_cookies",
         as_tuple=True
     )
     assert len(rows) == 1  # we store only one cookie
     # compare URL, domain, name, value, origin, path
     assert rows[0][2:8] == expected_js_cookie
Example #11
0
    def test_browse_wrapper_http_table_valid(self):
        """Check that TaskManager.browse() wrapper works and populates
        http tables correctly.

        NOTE: Since the browse command is choosing links randomly, there is a
              (very small -- 2*0.5^20) chance this test will fail with valid
              code.
        """
        # Run the test crawl
        manager_params, browser_params = self.get_config()
        manager = TaskManager.TaskManager(manager_params, browser_params)

        # Set up two sequential browse commands to two URLS
        manager.browse(url_a, num_links=20, sleep=1)
        manager.browse(url_b, num_links=1, sleep=1)
        manager.close()

        qry_res = db_utils.query_db(
            manager_params['db'],
            "SELECT visit_id, site_url FROM site_visits"
        )

        # Construct dict mapping site_url to visit_id
        visit_ids = dict()
        for row in qry_res:
            visit_ids[row[1]] = row[0]

        # Requests/responses for each top-level page must carry the
        # visit_id of that page's visit.
        qry_res = db_utils.query_db(manager_params['db'],
                                    "SELECT visit_id FROM http_requests"
                                    " WHERE url = ?", (url_a,))
        assert qry_res[0][0] == visit_ids[url_a]

        qry_res = db_utils.query_db(manager_params['db'],
                                    "SELECT visit_id FROM http_requests"
                                    " WHERE url = ?", (url_b,))
        assert qry_res[0][0] == visit_ids[url_b]

        qry_res = db_utils.query_db(manager_params['db'],
                                    "SELECT visit_id FROM http_responses"
                                    " WHERE url = ?", (url_a,))
        assert qry_res[0][0] == visit_ids[url_a]

        qry_res = db_utils.query_db(manager_params['db'],
                                    "SELECT visit_id FROM http_responses"
                                    " WHERE url = ?", (url_b,))
        assert qry_res[0][0] == visit_ids[url_b]

        # Page simple_a.html has five links:
        # 1) An absolute link to simple_c.html
        # 2) A relative link to simple_d.html
        # 3) A javascript: link
        # 4) A link to www.google.com
        # 5) A link to example.com?localtest.me
        # We should see page visits for 1 and 2, but not 3-5.
        # Link-click traffic is attributed to the visit of the page the
        # link was clicked on (url_a), not given a fresh visit_id.
        qry_res = db_utils.query_db(manager_params['db'],
                                    "SELECT visit_id FROM http_responses"
                                    " WHERE url = ?", (url_c,))
        assert qry_res[0][0] == visit_ids[url_a]
        qry_res = db_utils.query_db(manager_params['db'],
                                    "SELECT visit_id FROM http_responses"
                                    " WHERE url = ?", (url_d,))
        assert qry_res[0][0] == visit_ids[url_a]

        # We expect 4 urls: a,c,d and a favicon request
        qry_res = db_utils.query_db(
            manager_params['db'],
            "SELECT COUNT(DISTINCT url) FROM http_responses"
            " WHERE visit_id = ?", (visit_ids[url_a],))
        assert qry_res[0][0] == 4
Example #12
0
    def test_browser_profile_coverage(self, tmpdir):
        """ Test the coverage of the browser's profile

        This verifies that Firefox's places.sqlite database contains
        all visited sites (with a few exceptions). If it does not,
        it is likely the profile is lost at some point during the crawl
        """
        # Run the test crawl
        data_dir = os.path.join(str(tmpdir), 'data_dir')
        manager_params, browser_params = self.get_config(data_dir)
        manager = TaskManager.TaskManager(manager_params, browser_params)
        for site in TEST_SITES:
            manager.get(site)
        ff_db_tar = os.path.join(browser_params[0]['profile_archive_dir'],
                                 'profile.tar.gz')
        manager.close()

        # Extract crawl profile
        with tarfile.open(ff_db_tar) as tar:
            tar.extractall(browser_params[0]['profile_archive_dir'])

        # Output databases
        ff_db = os.path.join(browser_params[0]['profile_archive_dir'],
                             'places.sqlite')
        crawl_db = manager_params['db']

        # Grab urls from crawl database
        rows = db_utils.query_db(crawl_db, "SELECT url FROM http_requests")
        req_ps = set()  # visited domains from http_requests table
        for url, in rows:
            req_ps.add(psl.get_public_suffix(urlparse(url).hostname))

        hist_ps = set()  # visited domains from crawl_history Table
        successes = dict()
        rows = db_utils.query_db(
            crawl_db, "SELECT arguments, bool_success "
            "FROM crawl_history WHERE command='GET'")
        for url, success in rows:
            ps = psl.get_public_suffix(urlparse(url).hostname)
            hist_ps.add(ps)
            successes[ps] = success

        # Grab urls from Firefox database
        profile_ps = set()  # visited domains from firefox profile
        rows = db_utils.query_db(ff_db, "SELECT url FROM moz_places")
        for host, in rows:
            try:
                profile_ps.add(psl.get_public_suffix(urlparse(host).hostname))
            except AttributeError:
                pass

        # We expect urls to be in the Firefox profile if:
        # 1. We've made requests to it
        # 2. The url is a top_url we entered into the address bar
        # 3. The url successfully loaded (see: Issue #40)
        # 4. The site does not respond to the initial request with a 204
        #    (won't show in FF DB)
        missing_urls = req_ps.intersection(hist_ps).difference(profile_ps)
        unexpected_missing_urls = set()
        for url in missing_urls:
            if successes[url] == 0 or successes[url] == -1:
                continue

            # Get the visit id for the url
            rows = db_utils.query_db(
                crawl_db, "SELECT visit_id FROM site_visits "
                "WHERE site_url = ?", ('http://' + url, ))
            # BUG FIX: query_db returns a list of row tuples; unwrap the
            # scalar so we bind an int (not a tuple) as the SQL parameter.
            visit_id = rows[0][0]

            rows = db_utils.query_db(
                crawl_db, "SELECT COUNT(*) FROM http_responses "
                "WHERE visit_id = ?", (visit_id, ))
            # BUG FIX: compare the COUNT(*) value itself; comparing the
            # row tuple to an int is a TypeError on Python 3.
            if rows[0][0] > 1:
                continue

            rows = db_utils.query_db(
                crawl_db, "SELECT response_status, location FROM "
                "http_responses WHERE visit_id = ?", (visit_id, ))
            response_status, location = rows[0]
            if response_status == 204:
                continue
            if location == 'http://':  # site returned a blank redirect
                continue
            unexpected_missing_urls.add(url)

        assert len(unexpected_missing_urls) == 0
Example #13
0
 def get_post_requests_from_db(self, db):
     """Query the crawl database and return the POST requests.

     Fix: the original embedded a backslash line-continuation inside the
     string literal, which baked ~37 spaces of source indentation into the
     SQL text. The query result is unchanged (SQL ignores whitespace), but
     the statement is now a clean single-line literal.
     """
     return db_utils.query_db(
         db, "SELECT * FROM http_requests WHERE method = 'POST'")
Example #14
0
    def test_cache_hits_recorded(self):
        """Verify all http responses are recorded, including cached responses

        Note that we expect to see all of the same requests and responses
        during the second vist (even if cached) except for images. Cached
        images do not trigger Observer Notification events.
        See Bug 634073: https://bugzilla.mozilla.org/show_bug.cgi?id=634073

        The test page includes an image which does several permanent redirects
        before returning a 404. We expect to see new requests and responses
        for this image when the page is reloaded. Additionally, the redirects
        should be cached.
        """
        test_url = utilities.BASE_TEST_URL + '/http_test_page.html'
        manager_params, browser_params = self.get_config()
        manager = TaskManager.TaskManager(manager_params, browser_params)
        # Visit the same page twice; the second visit should be served
        # (at least partly) from cache.
        manager.get(test_url, sleep=5)
        manager.get(test_url, sleep=5)
        manager.close()
        db = manager_params['db']

        # request_id -> url, used to join responses back to their requests
        request_id_to_url = dict()

        # HTTP Requests (visit_id = 2, i.e. the repeat visit only)
        rows = db_utils.query_db(
            db, "SELECT * FROM http_requests WHERE visit_id = 2")
        observed_records = set()
        for row in rows:
            # HACK: favicon caching is unpredictable, don't bother checking it
            if row['url'].split('?')[0].endswith('favicon.ico'):
                continue
            # Query strings are stripped so records match the fixtures
            observed_records.add(
                (row['url'].split('?')[0],
                 fix_about_page_url(row['top_level_url']),
                 row['triggering_origin'], row['loading_origin'],
                 row['loading_href'], row['is_XHR'], row['is_frame_load'],
                 row['is_full_page'], row['is_third_party_channel'],
                 row['is_third_party_to_top_window'], row['resource_type']))
            request_id_to_url[row['request_id']] = row['url']
        assert HTTP_CACHED_REQUESTS == observed_records

        # HTTP Responses (repeat visit only)
        rows = db_utils.query_db(
            db, "SELECT * FROM http_responses WHERE visit_id = 2")
        observed_records = set()
        for row in rows:
            # HACK: favicon caching is unpredictable, don't bother checking it
            if row['url'].split('?')[0].endswith('favicon.ico'):
                continue
            observed_records.add((
                row['url'].split('?')[0],
                # TODO: referrer isn't available yet in the
                # webext instrumentation | row['referrer'],
                row['is_cached']))
            # Every response must pair with a request from the same visit
            assert row['request_id'] in request_id_to_url
            assert request_id_to_url[row['request_id']] == row['url']
        assert HTTP_CACHED_RESPONSES == observed_records

        # HTTP Redirects (repeat visit only)
        rows = db_utils.query_db(
            db, "SELECT * FROM http_redirects WHERE visit_id = 2")
        observed_records = set()
        for row in rows:
            # TODO: new_request_id isn't supported yet
            # src = request_id_to_url[row['old_request_id']].split('?')[0]
            # dst = request_id_to_url[row['new_request_id']].split('?')[0]
            src = row['old_request_url'].split('?')[0]
            dst = row['new_request_url'].split('?')[0]
            observed_records.add((src, dst))
        assert HTTP_CACHED_REDIRECTS == observed_records