def test_extension_gets_correct_visit_id(self):
    """Verify the JS instrumentation attributes records to the correct visit.

    Visits two pages that each touch a distinct navigator property, then
    checks that each javascript-table row carries the visit_id of the page
    that triggered it.
    """
    manager_params, browser_params = self.get_config()
    manager = TaskManager.TaskManager(manager_params, browser_params)

    url_a = utilities.BASE_TEST_URL + '/simple_a.html'
    url_b = utilities.BASE_TEST_URL + '/simple_b.html'
    manager.get(url_a)
    manager.get(url_b)
    manager.close()

    qry_res = db_utils.query_db(
        manager_params['db'],
        "SELECT visit_id, site_url FROM site_visits")

    # Map each site_url to its visit_id
    visit_ids = {site_url: visit_id for visit_id, site_url in qry_res}

    # simple_a.html touches userAgent; simple_b.html touches platform
    simple_a_visit_id = db_utils.query_db(
        manager_params['db'],
        "SELECT visit_id FROM javascript WHERE symbol=?",
        ("window.navigator.userAgent",))
    simple_b_visit_id = db_utils.query_db(
        manager_params['db'],
        "SELECT visit_id FROM javascript WHERE symbol=?",
        ("window.navigator.platform",))

    assert visit_ids[url_a] == simple_a_visit_id[0][0]
    assert visit_ids[url_b] == simple_b_visit_id[0][0]
def test_get_site_visits_table_valid(self):
    """Check that get works and populates db correctly."""
    # Run the test crawl
    manager_params, browser_params = self.get_config()
    manager = TaskManager.TaskManager(manager_params, browser_params)

    # Perform two sequential get commands to two URLs
    for target in (url_a, url_b):
        cs = CommandSequence.CommandSequence(target)
        cs.get(sleep=1)
        manager.execute_command_sequence(cs)
    manager.close()

    qry_res = db_utils.query_db(manager_params['db'],
                                "SELECT site_url FROM site_visits")

    # We had two separate page visits, recorded in order
    assert len(qry_res) == 2
    assert qry_res[0][0] == url_a
    assert qry_res[1][0] == url_b
def test_get_http_tables_valid(self):
    """Check that get works and populates http tables correctly."""
    # Run the test crawl
    manager_params, browser_params = self.get_config()
    manager = TaskManager.TaskManager(manager_params, browser_params)

    # Perform two sequential get commands to two URLs
    for target in (url_a, url_b):
        cs = CommandSequence.CommandSequence(target)
        cs.get(sleep=1)
        manager.execute_command_sequence(cs)
    manager.close()

    qry_res = db_utils.query_db(
        manager_params['db'],
        "SELECT visit_id, site_url FROM site_visits")

    # Map each site_url to its visit_id
    visit_ids = {site_url: visit_id for visit_id, site_url in qry_res}

    # Both the request and response rows for each top-level page must be
    # tagged with that page's visit_id.
    for table in ('http_requests', 'http_responses'):
        for target in (url_a, url_b):
            qry_res = db_utils.query_db(
                manager_params['db'],
                "SELECT visit_id FROM " + table + " WHERE url = ?",
                (target,))
            assert qry_res[0][0] == visit_ids[target]
def test_property_enumeration(self):
    """Visit the enumeration test page and verify the logged symbols.

    Every javascript row must originate from the test page itself, and
    the set of observed symbols must exactly match ``PROPERTIES``.
    """
    test_url = utilities.BASE_TEST_URL + '/property_enumeration.html'
    db = self.visit(test_url)
    rows = db_utils.query_db(db,
                             "SELECT script_url, symbol FROM javascript")
    for script_url, _ in rows:
        assert script_url == test_url
    observed_symbols = {symbol for _, symbol in rows}
    assert PROPERTIES == observed_symbols
def test_flash_cookies(self):
    """ Check that some Flash LSOs are saved and are properly keyed in db."""
    # Run the test crawl with Flash enabled
    manager_params, browser_params = self.get_config()
    browser_params[0]['disable_flash'] = False
    manager = TaskManager.TaskManager(manager_params, browser_params)

    # Visit the LSO-setting page twice, each time with a fresh random
    # value that we also record in the expected-content fixture.
    for expected in (expected_lso_content_a, expected_lso_content_b):
        lso_value = utilities.rand_str(8)
        expected[5] = lso_value  # expected to be present
        qry_str = '?lso_test_key=%s&lso_test_value=%s' % ("test_key",
                                                          lso_value)
        cs = CommandSequence.CommandSequence(
            utilities.BASE_TEST_URL + '/lso/setlso.html' + qry_str)
        cs.get(sleep=3, timeout=120)
        cs.dump_flash_cookies()
        manager.execute_command_sequence(cs)
    manager.close()

    # Check that both flash cookies are recorded
    qry_res = db_utils.query_db(manager_params['db'],
                                "SELECT * FROM flash_cookies",
                                as_tuple=True)
    assert len(qry_res) == 2

    for row, expected in zip(qry_res, (expected_lso_content_a,
                                       expected_lso_content_b)):
        lso_content = list(row[2:])  # drop the first two columns
        # Strip the randomly generated LSO directory name,
        # e.g. TY2FOJUG/localtest.me/Flash.sol -> localtest.me/Flash.sol
        lso_content[3] = lso_content[3].split("/", 1)[-1]
        assert lso_content == expected
def test_profile_cookies(self):
    """ Check that some profile cookies are saved """
    # Run the test crawl
    manager_params, browser_params = self.get_config()
    manager = TaskManager.TaskManager(manager_params, browser_params)

    # TODO update this to local test site
    url = 'http://www.yahoo.com'
    cs = CommandSequence.CommandSequence(url)
    cs.get(sleep=3, timeout=120)
    cs.dump_profile_cookies()
    manager.execute_command_sequence(cs)
    manager.close()

    # At least one profile cookie should have been recorded
    qry_res = db_utils.query_db(manager_params['db'],
                                "SELECT COUNT(*) FROM profile_cookies")
    assert qry_res[0][0] > 0
def test_js_profile_cookies(self):
    """ Check that profile cookies set by JS are saved """
    # Run the test crawl
    manager_params, browser_params = self.get_config()
    manager = TaskManager.TaskManager(manager_params, browser_params)

    cs = CommandSequence.CommandSequence(
        utilities.BASE_TEST_URL + "/js_cookie.html")
    cs.get(sleep=3, timeout=120)
    cs.dump_profile_cookies()
    manager.execute_command_sequence(cs)
    manager.close()

    # Check that the JS cookie we stored is recorded
    qry_res = db_utils.query_db(manager_params['db'],
                                "SELECT * FROM profile_cookies",
                                as_tuple=True)
    assert len(qry_res) == 1  # we store only one cookie
    cookies = qry_res[0]  # take the first cookie
    # compare URL, domain, name, value, origin, path
    assert cookies[2:8] == expected_js_cookie
def test_custom_function(self):
    """ Test `custom_function` with an inline func that collects links """
    from src.open_wpm.automation import clientsocket

    def collect_links(table_name, scheme, **kwargs):
        """ Collect links with `scheme` and save in table `table_name` """
        driver = kwargs['driver']
        manager_params = kwargs['manager_params']

        # Gather href attributes of all anchors matching the scheme
        hrefs = (anchor.get_attribute("href")
                 for anchor in driver.find_elements_by_tag_name('a'))
        link_urls = [href for href in hrefs
                     if href.startswith(scheme + '://')]
        current_url = driver.current_url

        # Write the (top_url, link) pairs through the data aggregator
        sock = clientsocket()
        sock.connect(*manager_params['aggregator_address'])
        sock.send(("CREATE TABLE IF NOT EXISTS %s ("
                   "top_url TEXT, link TEXT);" % table_name, ()))
        for link in link_urls:
            sock.send(("INSERT INTO %s (top_url, link) "
                       "VALUES (?, ?)" % table_name,
                       (current_url, link)))
        sock.close()

    manager_params, browser_params = self.get_config()
    manager = TaskManager.TaskManager(manager_params, browser_params)
    cs = CommandSequence.CommandSequence(url_a)
    cs.get(sleep=0, timeout=60)
    cs.run_custom_function(collect_links, ('page_links', 'http'))
    manager.execute_command_sequence(cs)
    manager.close()

    query_result = db_utils.query_db(
        manager_params['db'],
        "SELECT top_url, link FROM page_links;",
        as_tuple=True)
    assert PAGE_LINKS == set(query_result)
def test_browser_profile_coverage(self, tmpdir):
    """ Test the coverage of the browser's profile

    This verifies that Firefox's places.sqlite database contains
    all visited sites (with a few exceptions). If it does not, it is
    likely the profile is lost at some point during the crawl.
    """
    # Run the test crawl
    data_dir = os.path.join(str(tmpdir), 'data_dir')
    manager_params, browser_params = self.get_config(data_dir)
    manager = TaskManager.TaskManager(manager_params, browser_params)
    for site in TEST_SITES:
        manager.get(site)
    ff_db_tar = os.path.join(browser_params[0]['profile_archive_dir'],
                             'profile.tar.gz')
    manager.close()

    # Extract crawl profile
    with tarfile.open(ff_db_tar) as tar:
        tar.extractall(browser_params[0]['profile_archive_dir'])

    # Output databases
    ff_db = os.path.join(browser_params[0]['profile_archive_dir'],
                         'places.sqlite')
    crawl_db = manager_params['db']

    # Grab urls from crawl database
    rows = db_utils.query_db(crawl_db, "SELECT url FROM http_requests")
    req_ps = set()  # visited domains from http_requests table
    for url, in rows:
        req_ps.add(psl.get_public_suffix(urlparse(url).hostname))

    hist_ps = set()  # visited domains from CrawlHistory Table
    successes = dict()
    rows = db_utils.query_db(
        crawl_db, "SELECT arguments, bool_success "
        "FROM CrawlHistory WHERE command='GET'")
    for url, success in rows:
        ps = psl.get_public_suffix(urlparse(url).hostname)
        hist_ps.add(ps)
        successes[ps] = success

    # Grab urls from Firefox database
    profile_ps = set()  # visited domains from firefox profile
    rows = db_utils.query_db(ff_db, "SELECT url FROM moz_places")
    for host, in rows:
        try:
            profile_ps.add(psl.get_public_suffix(urlparse(host).hostname))
        except AttributeError:
            # hostname is None for non-network urls (e.g. about: pages)
            pass

    # We expect urls to be in the Firefox profile if:
    # 1. We've made requests to it
    # 2. The url is a top_url we entered into the address bar
    # 3. The url successfully loaded (see: Issue #40)
    # 4. The site does not respond to the initial request with a 204
    #    (won't show in FF DB)
    missing_urls = req_ps.intersection(hist_ps).difference(profile_ps)
    unexpected_missing_urls = set()
    for url in missing_urls:
        if successes[url] == 0 or successes[url] == -1:
            continue

        # Get the visit id for the url
        rows = db_utils.query_db(
            crawl_db, "SELECT visit_id FROM site_visits "
            "WHERE site_url = ?", ('http://' + url, ))
        # BUGFIX: query_db returns a sequence of rows (as used elsewhere
        # in this file, e.g. qry_res[0][0]); index into the first row to
        # get the scalar visit_id. Previously the whole row tuple was
        # bound as the SQL parameter below.
        visit_id = rows[0][0]

        # Skip sites that produced more than one response (redirect
        # chains etc. are expected to land in the profile)
        rows = db_utils.query_db(
            crawl_db, "SELECT COUNT(*) FROM http_responses "
            "WHERE visit_id = ?", (visit_id, ))
        # BUGFIX: compare the COUNT(*) value, not the row tuple, against 1
        if rows[0][0] > 1:
            continue

        rows = db_utils.query_db(
            crawl_db, "SELECT response_status, location FROM "
            "http_responses WHERE visit_id = ?", (visit_id, ))
        response_status, location = rows[0]
        if response_status == 204:
            continue  # no content shown in the browser (see point 4)
        if location == 'http://':  # site returned a blank redirect
            continue
        unexpected_missing_urls.add(url)

    assert len(unexpected_missing_urls) == 0
def test_browse_wrapper_http_table_valid(self):
    """Check that TaskManager.browse() wrapper works and populates
    http tables correctly.

    NOTE: Since the browse command is choosing links randomly, there is a
    (very small -- 2*0.5^20) chance this test will fail with valid code.
    """
    # Run the test crawl
    manager_params, browser_params = self.get_config()
    manager = TaskManager.TaskManager(manager_params, browser_params)

    # Set up two sequential browse commands to two URLs
    manager.browse(url_a, num_links=20, sleep=1)
    manager.browse(url_b, num_links=1, sleep=1)
    manager.close()

    qry_res = db_utils.query_db(
        manager_params['db'],
        "SELECT visit_id, site_url FROM site_visits")

    # Map each site_url to its visit_id
    visit_ids = {site_url: visit_id for visit_id, site_url in qry_res}

    # Requests and responses for each top-level page must carry that
    # page's visit_id.
    for table in ('http_requests', 'http_responses'):
        for target in (url_a, url_b):
            qry_res = db_utils.query_db(
                manager_params['db'],
                "SELECT visit_id FROM " + table + " WHERE url = ?",
                (target,))
            assert qry_res[0][0] == visit_ids[target]

    # Page simple_a.html contains the following links:
    # 1) An absolute link to simple_c.html
    # 2) A relative link to simple_d.html
    # 3) A javascript: link
    # 4) A link to www.google.com
    # 5) A link to example.com?localtest.me
    # browse() should follow 1 and 2 but not 3-5, and those loads are
    # attributed to url_a's visit.
    for target in (url_c, url_d):
        qry_res = db_utils.query_db(
            manager_params['db'],
            "SELECT visit_id FROM http_responses WHERE url = ?",
            (target,))
        assert qry_res[0][0] == visit_ids[url_a]

    # We expect 4 urls: a, c, d and a favicon request
    qry_res = db_utils.query_db(
        manager_params['db'],
        "SELECT COUNT(DISTINCT url) FROM http_responses"
        " WHERE visit_id = ?",
        (visit_ids[url_a],))
    assert qry_res[0][0] == 4