def test_extension_gets_correct_visit_id(self):
    manager_params, browser_params = self.get_config()
    manager = TaskManager.TaskManager(manager_params, browser_params)

    url_a = utilities.BASE_TEST_URL + '/simple_a.html'
    url_b = utilities.BASE_TEST_URL + '/simple_b.html'

    manager.get(url_a)
    manager.get(url_b)
    manager.close()
    qry_res = db_utils.query_db(
        manager_params['db'],
        "SELECT visit_id, site_url FROM site_visits")

    # Construct dict mapping site_url to visit_id
    visit_ids = dict()
    for row in qry_res:
        visit_ids[row[1]] = row[0]

    simple_a_visit_id = db_utils.query_db(
        manager_params['db'],
        "SELECT visit_id FROM javascript WHERE symbol=?",
        ("window.navigator.userAgent",))

    simple_b_visit_id = db_utils.query_db(
        manager_params['db'],
        "SELECT visit_id FROM javascript WHERE symbol=?",
        ("window.navigator.platform",))

    assert visit_ids[url_a] == simple_a_visit_id[0][0]
    assert visit_ids[url_b] == simple_b_visit_id[0][0]
def test_content_saving(self, tmpdir):
    """Check that content is saved and hashed correctly."""
    test_url = utilities.BASE_TEST_URL + '/http_test_page.html'
    manager_params, browser_params = self.get_test_config(str(tmpdir))
    browser_params[0]['http_instrument'] = True
    browser_params[0]['save_all_content'] = True
    manager = TaskManager.TaskManager(manager_params, browser_params)
    manager.get(url=test_url, sleep=1)
    manager.close()
    db = manager_params['db']

    rows = db_utils.query_db(db, "SELECT * FROM http_responses;")
    disk_content = dict()
    for row in rows:
        if 'MAGIC_REDIRECT' in row['url'] or '404' in row['url']:
            continue
        path = urlparse(row['url']).path
        with open(os.path.join(BASE_PATH, path[1:]), 'rb') as f:
            content = f.read()
        chash = sha256(content).hexdigest()
        # TODO: webext instrumentation doesn't save the content_hash yet.
        # assert chash == row['content_hash']
        disk_content[chash] = content

    ldb_content = dict()
    for chash, content in db_utils.get_javascript_content(str(tmpdir)):
        chash = chash.decode('ascii')
        ldb_content[chash] = content

    for k, v in disk_content.items():
        assert v == ldb_content[k]
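
# For reference: a minimal sketch of what `db_utils.get_javascript_content`
# (used above) might do, assuming saved response bodies are deduplicated
# into a LevelDB keyed by content hash. The DB name ('content.ldb') and the
# use of plyvel are assumptions for illustration; the real helper lives in
# the db_utils test utilities.
def _get_javascript_content_sketch(data_directory):
    """Yield (content_hash, content) pairs from the content LevelDB."""
    import os
    import plyvel
    db = plyvel.DB(os.path.join(data_directory, 'content.ldb'),
                   create_if_missing=False)
    try:
        for content_hash, content in db.iterator():
            yield content_hash, content
    finally:
        db.close()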
def test_get_site_visits_table_valid(self):
    """Check that get works and populates db correctly."""
    # Run the test crawl
    manager_params, browser_params = self.get_config()
    manager = TaskManager.TaskManager(manager_params, browser_params)

    # Set up two sequential get commands to two URLs
    cs_a = CommandSequence.CommandSequence(url_a)
    cs_a.get(sleep=1)
    cs_b = CommandSequence.CommandSequence(url_b)
    cs_b.get(sleep=1)

    # Perform the get commands
    manager.execute_command_sequence(cs_a)
    manager.execute_command_sequence(cs_b)
    manager.close()

    qry_res = db_utils.query_db(manager_params['db'],
                                "SELECT site_url FROM site_visits")

    # We had two separate page visits
    assert len(qry_res) == 2

    assert qry_res[0][0] == url_a
    assert qry_res[1][0] == url_b
def test_get_http_tables_valid(self):
    """Check that get works and populates http tables correctly."""
    # Run the test crawl
    manager_params, browser_params = self.get_config()
    manager = TaskManager.TaskManager(manager_params, browser_params)

    # Set up two sequential get commands to two URLs
    cs_a = CommandSequence.CommandSequence(url_a)
    cs_a.get(sleep=1)
    cs_b = CommandSequence.CommandSequence(url_b)
    cs_b.get(sleep=1)

    manager.execute_command_sequence(cs_a)
    manager.execute_command_sequence(cs_b)
    manager.close()

    qry_res = db_utils.query_db(
        manager_params['db'],
        "SELECT visit_id, site_url FROM site_visits")

    # Construct dict mapping site_url to visit_id
    visit_ids = dict()
    for row in qry_res:
        visit_ids[row[1]] = row[0]

    qry_res = db_utils.query_db(manager_params['db'],
                                "SELECT visit_id FROM http_requests"
                                " WHERE url = ?", (url_a,))
    assert qry_res[0][0] == visit_ids[url_a]

    qry_res = db_utils.query_db(manager_params['db'],
                                "SELECT visit_id FROM http_requests"
                                " WHERE url = ?", (url_b,))
    assert qry_res[0][0] == visit_ids[url_b]

    qry_res = db_utils.query_db(manager_params['db'],
                                "SELECT visit_id FROM http_responses"
                                " WHERE url = ?", (url_a,))
    assert qry_res[0][0] == visit_ids[url_a]

    qry_res = db_utils.query_db(manager_params['db'],
                                "SELECT visit_id FROM http_responses"
                                " WHERE url = ?", (url_b,))
    assert qry_res[0][0] == visit_ids[url_b]
def test_page_visit(self):
    test_url = utilities.BASE_TEST_URL + '/http_test_page.html'
    db = self.visit(test_url)

    request_id_to_url = dict()

    # HTTP Requests
    rows = db_utils.query_db(db, "SELECT * FROM http_requests")
    observed_records = set()
    for row in rows:
        observed_records.add((
            row['url'].split('?')[0],
            fix_about_page_url(row['top_level_url']),
            row['triggering_origin'], row['loading_origin'],
            row['loading_href'], row['is_XHR'],
            row['is_frame_load'], row['is_full_page'],
            row['is_third_party_channel'],
            row['is_third_party_to_top_window'],
            row['resource_type']))
        request_id_to_url[row['request_id']] = row['url']
    assert HTTP_REQUESTS == observed_records

    # HTTP Responses
    rows = db_utils.query_db(db, "SELECT * FROM http_responses")
    observed_records = set()
    for row in rows:
        observed_records.add((
            row['url'].split('?')[0],
            # TODO: webext-instrumentation doesn't support referrer yet
            # row['referrer'],
            row['location']))
        assert row['request_id'] in request_id_to_url
        assert request_id_to_url[row['request_id']] == row['url']
    assert HTTP_RESPONSES == observed_records

    # HTTP Redirects
    rows = db_utils.query_db(db, "SELECT * FROM http_redirects")
    observed_records = set()
    for row in rows:
        # TODO: webext instrumentation doesn't support new_request_id yet
        # src = request_id_to_url[row['old_request_id']].split('?')[0]
        # dst = request_id_to_url[row['new_request_id']].split('?')[0]
        src = row['old_request_url'].split('?')[0]
        dst = row['new_request_url'].split('?')[0]
        observed_records.add((src, dst))
    assert HTTP_REDIRECTS == observed_records
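
# `fix_about_page_url` is defined elsewhere in this module. A minimal sketch
# of the idea, assuming its purpose is to collapse Firefox's various about:
# pages (e.g. about:blank, about:newtab), which can appear as the
# top_level_url of requests fired before navigation commits, into a single
# stable token so the expected-record set comparisons are deterministic
# (the exact normalization rule below is an assumption):
def _fix_about_page_url_sketch(url):
    if url is None or url.startswith('about:'):
        return 'about:newtab'
    return url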
def test_property_enumeration(self):
    test_url = utilities.BASE_TEST_URL + '/property_enumeration.html'
    db = self.visit(test_url)
    rows = db_utils.query_db(db,
                             "SELECT script_url, symbol FROM javascript")
    observed_symbols = set()
    for script_url, symbol in rows:
        assert script_url == test_url
        observed_symbols.add(symbol)
    assert PROPERTIES == observed_symbols
def test_flash_cookies(self):
    """Check that some Flash LSOs are saved and are properly keyed in db."""
    # Run the test crawl
    manager_params, browser_params = self.get_config()
    browser_params[0]['disable_flash'] = False
    manager = TaskManager.TaskManager(manager_params, browser_params)

    # Get a site we know sets Flash cookies and visit it twice
    lso_value_a = utilities.rand_str(8)
    expected_lso_content_a[5] = lso_value_a  # expected to be present
    qry_str = '?lso_test_key=%s&lso_test_value=%s' % ("test_key",
                                                      lso_value_a)
    test_url_a = utilities.BASE_TEST_URL + '/lso/setlso.html' + qry_str
    cs = CommandSequence.CommandSequence(test_url_a)
    cs.get(sleep=3, timeout=120)
    cs.dump_flash_cookies()
    manager.execute_command_sequence(cs)

    lso_value_b = utilities.rand_str(8)
    expected_lso_content_b[5] = lso_value_b  # expected to be present
    qry_str = '?lso_test_key=%s&lso_test_value=%s' % ("test_key",
                                                      lso_value_b)
    test_url_b = utilities.BASE_TEST_URL + '/lso/setlso.html' + qry_str
    cs = CommandSequence.CommandSequence(test_url_b)
    cs.get(sleep=3, timeout=120)
    cs.dump_flash_cookies()
    manager.execute_command_sequence(cs)

    manager.close()

    # Check that some Flash cookies are recorded
    qry_res = db_utils.query_db(manager_params['db'],
                                "SELECT * FROM flash_cookies",
                                as_tuple=True)
    lso_count = len(qry_res)
    assert lso_count == 2

    lso_content_a = list(qry_res[0][2:])  # Remove first two items
    lso_content_b = list(qry_res[1][2:])  # Remove first two items

    # Remove the randomly generated LSO directory name,
    # e.g. TY2FOJUG/localtest.me/Flash.sol -> localtest.me/Flash.sol
    lso_content_a[3] = lso_content_a[3].split("/", 1)[-1]
    lso_content_b[3] = lso_content_b[3].split("/", 1)[-1]

    assert lso_content_a == expected_lso_content_a
    assert lso_content_b == expected_lso_content_b
def test_profile_cookies(self):
    """Check that some profile cookies are saved."""
    # Run the test crawl
    manager_params, browser_params = self.get_config()
    manager = TaskManager.TaskManager(manager_params, browser_params)
    # TODO update this to local test site
    url = 'http://www.yahoo.com'
    cs = CommandSequence.CommandSequence(url)
    cs.get(sleep=3, timeout=120)
    cs.dump_profile_cookies()
    manager.execute_command_sequence(cs)
    manager.close()

    # Check that some profile cookies are recorded
    qry_res = db_utils.query_db(manager_params['db'],
                                "SELECT COUNT(*) FROM profile_cookies")
    prof_cookie_count = qry_res[0][0]
    assert prof_cookie_count > 0
def test_custom_function(self):
    """Test `custom_function` with an inline func that collects links."""
    from SocketInterface import clientsocket

    def collect_links(table_name, scheme, **kwargs):
        """Collect links with `scheme` and save in table `table_name`."""
        driver = kwargs['driver']
        manager_params = kwargs['manager_params']
        link_urls = [
            x for x in (
                element.get_attribute("href")
                for element in driver.find_elements_by_tag_name('a'))
            if x.startswith(scheme + '://')
        ]
        current_url = driver.current_url

        sock = clientsocket()
        sock.connect(*manager_params['aggregator_address'])

        query = ("CREATE TABLE IF NOT EXISTS %s ("
                 "top_url TEXT, link TEXT);" % table_name)
        sock.send(("create_table", query))

        for link in link_urls:
            query = (table_name, {"top_url": current_url, "link": link})
            sock.send(query)
        sock.close()

    manager_params, browser_params = self.get_config()
    manager = TaskManager.TaskManager(manager_params, browser_params)
    cs = CommandSequence.CommandSequence(url_a)
    cs.get(sleep=0, timeout=60)
    cs.run_custom_function(collect_links, ('page_links', 'http'))
    manager.execute_command_sequence(cs)
    manager.close()

    query_result = db_utils.query_db(
        manager_params['db'],
        "SELECT top_url, link FROM page_links;",
        as_tuple=True)
    assert PAGE_LINKS == set(query_result)
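
# A minimal sketch of reusing the same aggregator socket protocol from
# another custom command. The command name, table name, and schema below
# are assumptions for illustration; the two message shapes -- a
# ("create_table", sql) tuple and a (table_name, record_dict) tuple --
# mirror `collect_links` above.
def save_page_title(**kwargs):
    from SocketInterface import clientsocket
    driver = kwargs['driver']
    manager_params = kwargs['manager_params']
    sock = clientsocket()
    sock.connect(*manager_params['aggregator_address'])
    # Ensure the destination table exists, then insert one record
    sock.send(("create_table",
               "CREATE TABLE IF NOT EXISTS page_titles ("
               "top_url TEXT, title TEXT);"))
    sock.send(("page_titles",
               {"top_url": driver.current_url, "title": driver.title}))
    sock.close()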
def test_js_profile_cookies(self):
    """Check that profile cookies set by JS are saved."""
    # Run the test crawl
    manager_params, browser_params = self.get_config()
    manager = TaskManager.TaskManager(manager_params, browser_params)
    url = utilities.BASE_TEST_URL + "/js_cookie.html"
    cs = CommandSequence.CommandSequence(url)
    cs.get(sleep=3, timeout=120)
    cs.dump_profile_cookies()
    manager.execute_command_sequence(cs)
    manager.close()

    # Check that the JS cookie we stored is recorded
    qry_res = db_utils.query_db(manager_params['db'],
                                "SELECT * FROM profile_cookies",
                                as_tuple=True)
    assert len(qry_res) == 1  # we store only one cookie
    cookies = qry_res[0]  # take the first cookie
    # Compare URL, domain, name, value, origin, path
    assert cookies[2:8] == expected_js_cookie
def test_browse_wrapper_http_table_valid(self):
    """Check that the TaskManager.browse() wrapper works and populates
    http tables correctly.

    NOTE: Since the browse command chooses links at random, there is a
    (very small -- 2*0.5^20) chance this test will fail with valid code.
    """
    # Run the test crawl
    manager_params, browser_params = self.get_config()
    manager = TaskManager.TaskManager(manager_params, browser_params)

    # Set up two sequential browse commands to two URLs
    manager.browse(url_a, num_links=20, sleep=1)
    manager.browse(url_b, num_links=1, sleep=1)
    manager.close()

    qry_res = db_utils.query_db(
        manager_params['db'],
        "SELECT visit_id, site_url FROM site_visits")

    # Construct dict mapping site_url to visit_id
    visit_ids = dict()
    for row in qry_res:
        visit_ids[row[1]] = row[0]

    qry_res = db_utils.query_db(manager_params['db'],
                                "SELECT visit_id FROM http_requests"
                                " WHERE url = ?", (url_a,))
    assert qry_res[0][0] == visit_ids[url_a]

    qry_res = db_utils.query_db(manager_params['db'],
                                "SELECT visit_id FROM http_requests"
                                " WHERE url = ?", (url_b,))
    assert qry_res[0][0] == visit_ids[url_b]

    qry_res = db_utils.query_db(manager_params['db'],
                                "SELECT visit_id FROM http_responses"
                                " WHERE url = ?", (url_a,))
    assert qry_res[0][0] == visit_ids[url_a]

    qry_res = db_utils.query_db(manager_params['db'],
                                "SELECT visit_id FROM http_responses"
                                " WHERE url = ?", (url_b,))
    assert qry_res[0][0] == visit_ids[url_b]

    # Page simple_a.html has five links:
    # 1) An absolute link to simple_c.html
    # 2) A relative link to simple_d.html
    # 3) A javascript: link
    # 4) A link to www.google.com
    # 5) A link to example.com?localtest.me
    # We should see page visits for 1 and 2, but not 3-5.
    qry_res = db_utils.query_db(manager_params['db'],
                                "SELECT visit_id FROM http_responses"
                                " WHERE url = ?", (url_c,))
    assert qry_res[0][0] == visit_ids[url_a]
    qry_res = db_utils.query_db(manager_params['db'],
                                "SELECT visit_id FROM http_responses"
                                " WHERE url = ?", (url_d,))
    assert qry_res[0][0] == visit_ids[url_a]

    # We expect 4 urls: a, c, d, and a favicon request
    qry_res = db_utils.query_db(
        manager_params['db'],
        "SELECT COUNT(DISTINCT url) FROM http_responses"
        " WHERE visit_id = ?", (visit_ids[url_a],))
    assert qry_res[0][0] == 4
def test_browser_profile_coverage(self, tmpdir):
    """Test the coverage of the browser's profile.

    This verifies that Firefox's places.sqlite database contains all
    visited sites (with a few exceptions). If it does not, it is likely
    the profile is lost at some point during the crawl.
    """
    # Run the test crawl
    data_dir = os.path.join(str(tmpdir), 'data_dir')
    manager_params, browser_params = self.get_config(data_dir)
    manager = TaskManager.TaskManager(manager_params, browser_params)
    for site in TEST_SITES:
        manager.get(site)
    ff_db_tar = os.path.join(browser_params[0]['profile_archive_dir'],
                             'profile.tar.gz')
    manager.close()

    # Extract crawl profile
    with tarfile.open(ff_db_tar) as tar:
        tar.extractall(browser_params[0]['profile_archive_dir'])

    # Output databases
    ff_db = os.path.join(browser_params[0]['profile_archive_dir'],
                         'places.sqlite')
    crawl_db = manager_params['db']

    # Grab urls from crawl database
    rows = db_utils.query_db(crawl_db, "SELECT url FROM http_requests")
    req_ps = set()  # visited domains from http_requests table
    for url, in rows:
        req_ps.add(psl.get_public_suffix(urlparse(url).hostname))

    hist_ps = set()  # visited domains from crawl_history table
    successes = dict()
    rows = db_utils.query_db(
        crawl_db, "SELECT arguments, bool_success "
        "FROM crawl_history WHERE command='GET'")
    for url, success in rows:
        ps = psl.get_public_suffix(urlparse(url).hostname)
        hist_ps.add(ps)
        successes[ps] = success

    # Grab urls from Firefox database
    profile_ps = set()  # visited domains from Firefox profile
    rows = db_utils.query_db(ff_db, "SELECT url FROM moz_places")
    for host, in rows:
        try:
            profile_ps.add(
                psl.get_public_suffix(urlparse(host).hostname))
        except AttributeError:
            pass

    # We expect urls to be in the Firefox profile if:
    # 1. We've made requests to it
    # 2. The url is a top_url we entered into the address bar
    # 3. The url successfully loaded (see: Issue #40)
    # 4. The site does not respond to the initial request with a 204
    #    (won't show in FF DB)
    missing_urls = req_ps.intersection(hist_ps).difference(profile_ps)
    unexpected_missing_urls = set()
    for url in missing_urls:
        if successes[url] == 0 or successes[url] == -1:
            continue

        # Get the visit id for the url
        rows = db_utils.query_db(
            crawl_db, "SELECT visit_id FROM site_visits "
            "WHERE site_url = ?", ('http://' + url,))
        visit_id = rows[0][0]

        rows = db_utils.query_db(
            crawl_db, "SELECT COUNT(*) FROM http_responses "
            "WHERE visit_id = ?", (visit_id,))
        if rows[0][0] > 1:
            continue

        rows = db_utils.query_db(
            crawl_db, "SELECT response_status, location FROM "
            "http_responses WHERE visit_id = ?", (visit_id,))
        response_status, location = rows[0]
        if response_status == 204:
            continue
        if location == 'http://':  # site returned a blank redirect
            continue
        unexpected_missing_urls.add(url)

    assert len(unexpected_missing_urls) == 0
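
# For context: `psl` above is assumed to be a publicsuffix.PublicSuffixList
# instance constructed elsewhere in this module. get_public_suffix maps a
# hostname to its registered (eTLD+1) domain, which is why the coverage
# comparison above is done on registered domains rather than full URLs.
# A minimal self-contained illustration, assuming the `publicsuffix`
# package:
def _public_suffix_example():
    from publicsuffix import PublicSuffixList
    _psl = PublicSuffixList()
    assert _psl.get_public_suffix('www.example.com') == 'example.com'
    assert _psl.get_public_suffix('a.b.co.uk') == 'b.co.uk'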
def get_post_requests_from_db(self, db):
    """Query the crawl database and return the POST requests."""
    return db_utils.query_db(
        db, "SELECT * FROM http_requests WHERE method = 'POST'")
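
# A hypothetical usage of the helper above: crawl a page that issues a POST
# and then assert on the recorded requests. The page name
# '/post_request.html' and the expectation of at least one POST are
# assumptions for illustration, not part of the real suite.
def test_post_requests_recorded_example(self):
    manager_params, browser_params = self.get_config()
    manager = TaskManager.TaskManager(manager_params, browser_params)
    # Hypothetical test page that submits a form via POST on load
    manager.get(utilities.BASE_TEST_URL + '/post_request.html', sleep=3)
    manager.close()
    post_requests = self.get_post_requests_from_db(manager_params['db'])
    assert len(post_requests) > 0  # at least the form submission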
def test_cache_hits_recorded(self):
    """Verify all http responses are recorded, including cached responses.

    Note that we expect to see all of the same requests and responses
    during the second visit (even if cached) except for images. Cached
    images do not trigger Observer Notification events. See Bug 634073:
    https://bugzilla.mozilla.org/show_bug.cgi?id=634073

    The test page includes an image which does several permanent redirects
    before returning a 404. We expect to see new requests and responses
    for this image when the page is reloaded. Additionally, the redirects
    should be cached.
    """
    test_url = utilities.BASE_TEST_URL + '/http_test_page.html'
    manager_params, browser_params = self.get_config()
    manager = TaskManager.TaskManager(manager_params, browser_params)
    manager.get(test_url, sleep=5)
    manager.get(test_url, sleep=5)
    manager.close()
    db = manager_params['db']

    request_id_to_url = dict()

    # HTTP Requests
    rows = db_utils.query_db(
        db, "SELECT * FROM http_requests WHERE visit_id = 2")
    observed_records = set()
    for row in rows:
        # HACK: favicon caching is unpredictable, don't bother checking it
        if row['url'].split('?')[0].endswith('favicon.ico'):
            continue
        observed_records.add((
            row['url'].split('?')[0],
            fix_about_page_url(row['top_level_url']),
            row['triggering_origin'], row['loading_origin'],
            row['loading_href'], row['is_XHR'],
            row['is_frame_load'], row['is_full_page'],
            row['is_third_party_channel'],
            row['is_third_party_to_top_window'],
            row['resource_type']))
        request_id_to_url[row['request_id']] = row['url']
    assert HTTP_CACHED_REQUESTS == observed_records

    # HTTP Responses
    rows = db_utils.query_db(
        db, "SELECT * FROM http_responses WHERE visit_id = 2")
    observed_records = set()
    for row in rows:
        # HACK: favicon caching is unpredictable, don't bother checking it
        if row['url'].split('?')[0].endswith('favicon.ico'):
            continue
        observed_records.add((
            row['url'].split('?')[0],
            # TODO: referrer isn't available yet in the webext
            # instrumentation
            # row['referrer'],
            row['is_cached']))
        assert row['request_id'] in request_id_to_url
        assert request_id_to_url[row['request_id']] == row['url']
    assert HTTP_CACHED_RESPONSES == observed_records

    # HTTP Redirects
    rows = db_utils.query_db(
        db, "SELECT * FROM http_redirects WHERE visit_id = 2")
    observed_records = set()
    for row in rows:
        # TODO: new_request_id isn't supported yet
        # src = request_id_to_url[row['old_request_id']].split('?')[0]
        # dst = request_id_to_url[row['new_request_id']].split('?')[0]
        src = row['old_request_url'].split('?')[0]
        dst = row['new_request_url'].split('?')[0]
        observed_records.add((src, dst))
    assert HTTP_CACHED_REDIRECTS == observed_records