def should_whitelist(self, url, top_url):
    """Check if `url` is whitelisted on `top_url` due to the entitylist

    Parameters
    ----------
    url : string
        The URL or hostname to classify.
    top_url : string
        The URL or hostname of the top-level page on which `url` was loaded

    Returns
    -------
    boolean : True if the url would have been whitelisted by the entitylist
    """
    if not url.startswith('http'):
        url = 'http://' + url
    if not top_url.startswith('http'):
        top_url = 'http://' + top_url

    top_host = urlparse(top_url).hostname
    top_ps1 = du.get_ps_plus_1(top_url)
    url_host = urlparse(url).hostname
    url_ps1 = du.get_ps_plus_1(url)

    if top_host in self._entitylist:
        resources = self._entitylist[top_host]
    elif top_ps1 in self._entitylist:
        resources = self._entitylist[top_ps1]
    else:
        return False
    return url_host in resources or url_ps1 in resources
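# Hedged usage sketch (not part of the original source): the method above is
# assumed to live on a classifier that stores `_entitylist` as a dict mapping
# an entity's property hostnames/PS+1s to the set of resource hostnames/PS+1s
# that entity owns. The class name and entitylist contents are illustrative.
import domain_utils as du
from urllib.parse import urlparse

class EntityWhitelist:
    def __init__(self, entitylist):
        self._entitylist = entitylist

    # bind the module-level function defined above as a method
    should_whitelist = should_whitelist

wl = EntityWhitelist({'facebook.com': {'facebook.com', 'fbcdn.net'}})
wl.should_whitelist('https://static.fbcdn.net/img.png', 'facebook.com')  # True
wl.should_whitelist('https://tracker.example/p.gif', 'facebook.com')     # False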
def get_intra_links(webdriver, url):
    ps1 = du.get_ps_plus_1(url)
    links = filter(
        lambda x: (x.get_attribute("href") and du.get_ps_plus_1(
            urljoin(url, x.get_attribute("href"))) == ps1),
        webdriver.find_elements_by_tag_name("a"))
    return links
def get_intra_links(webdriver, url):
    ps1 = du.get_ps_plus_1(url)
    links = list()
    for elem in webdriver.find_elements_by_tag_name("a"):
        try:
            href = elem.get_attribute("href")
        except StaleElementReferenceException:
            continue
        if href is None:
            continue
        full_href = urlparse.urljoin(url, href)
        if not full_href.startswith("http"):
            continue
        if du.get_ps_plus_1(full_href) == ps1:
            links.append(elem)
    return links
def get_intra_links(webdriver: WebDriver, url: str) -> List[WebElement]:
    """
    Get all links on the page that stay within the same PS+1 as `url`.
    Ignores StaleElementReferenceExceptions.
    """
    ps1 = du.get_ps_plus_1(url)
    links = list()
    for elem in webdriver.find_elements_by_tag_name("a"):
        try:
            href = elem.get_attribute('href')
        except StaleElementReferenceException:
            continue
        if href is None or type(href) is not str:
            continue
        full_href = urlparse.urljoin(url, href)
        if not full_href.startswith('http'):
            continue
        if du.get_ps_plus_1(full_href) == ps1:
            links.append(elem)
    return links
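# Hedged usage sketch (not part of the original source): drive one of the
# get_intra_links variants above with Selenium. Assumes Selenium 3.x (the
# find_elements_by_tag_name API), `import domain_utils as du`, and a local
# chromedriver on PATH; the URL is illustrative.
from selenium import webdriver as selenium_webdriver

driver = selenium_webdriver.Chrome()
driver.get("https://example.com/")
for elem in get_intra_links(driver, driver.current_url):
    print(elem.get_attribute("href"))
driver.quit()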
def get_option_dict(url, top_level_url, resource_type=None):
    """Build an options dict for BlockListParser.

    These options are checked here:
    * https://github.com/englehardt/abp-blocklist-parser/blob/40f6bb5b91ea403b7b9852a16d6c57d5ec26cf7f/abp_blocklist_parser/RegexParser.py#L104-L117
    * https://github.com/englehardt/abp-blocklist-parser/blob/40f6bb5b91ea403b7b9852a16d6c57d5ec26cf7f/abp_blocklist_parser/RegexParser.py#L240-L248

    Parameters
    ----------
    url : string
        The URL of the requested resource.
    top_level_url : string
        The URL of the top-level frame of the requested resource
    resource_type : string
        All possible values are listed here:
        https://developer.mozilla.org/en-US/docs/Mozilla/Add-ons/WebExtensions/API/webRequest/ResourceType

    Returns
    -------
    dict
        An "options" dictionary for use with BlockListParser
    """
    options = {}

    # Add type option. Value doesn't matter.
    if resource_type:
        try:
            options[type_to_option[resource_type]] = True
        except KeyError:
            raise ValueError(
                "Argument %s given for `resource_type` not found in map."
                % resource_type)

    options["domain"] = urlparse(top_level_url).hostname
    if options["domain"] is None:
        # The top_level_url could not be parsed into a hostname.
        return None

    # Add third-party option if third party. Value doesn't matter.
    if du.get_ps_plus_1(url) != du.get_ps_plus_1(top_level_url):
        options["third-party"] = True
    return options
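# Hedged usage sketch (not part of the original source): build an options dict
# for a third-party script request. Assumes the module-level `type_to_option`
# dict maps the webRequest resource type "script" to the ABP option name
# "script" and that domain_utils is imported as `du`; the URLs are illustrative.
opts = get_option_dict(
    url="https://cdn.tracker.com/lib.js",
    top_level_url="https://news.example.com/article",
    resource_type="script",
)
# Expected shape: {'script': True, 'domain': 'news.example.com', 'third-party': True}
print(opts)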
def test_get_ps_plus_1_on_vanilla_public_suffix():
    assert get_ps_plus_1('http://www.google.com') == 'google.com'
def test_when_anchor():
    assert get_ps_plus_1('http://www.google.com#anchor') == 'google.com'
def test_get_ps_plus_one_on_relative_url():
    assert get_ps_plus_1('/my/path/is.html') == ''
def test_get_ps_plus_one_on_about_blank():
    result = get_ps_plus_1('about:blank')
    assert result == ''
def test_get_ps_plus_one_no_https():
    result = get_ps_plus_1('my.domain.cloudfront.net')
    assert result == 'domain.cloudfront.net'
jsondata = []
for f in files:
    jsonfile = open(f)
    jsonstr = jsonfile.read()
    jsondata.append(json.loads(jsonstr))

finaltld = {}

def parse(string):
    return string.split("/")[2]
    # For another interpretation of TLD+1, use the return statement below instead:
    # return "".join([string.split("/")[2], "/", string.split("/")[3]])

results = [set() for i in range(4)]
frequency_dict = {}

for k in range(len(jsondata)):
    for i in range(len(jsondata[k])):
        for j in jsondata[k][i][1]:
            if j.startswith("http"):
                tld = du.get_ps_plus_1(j)
                results[k].add(tld)

intersect = (results[0].intersection(results[1])).intersection(
    results[2]).intersection(results[3])

f = open('intersect.json', 'w')
print(len(intersect))
json.dump(list(intersect), f)
f.close()
def get_option_dict(request):
    """Build an options dict for BlockListParser

    Parameters
    ----------
    request : sqlite3.Row
        A single HTTP request record pulled from OpenWPM's http_requests table

    BINARY_OPTIONS = [
        "script", "image", "stylesheet", "object", "xmlhttprequest",
        "object-subrequest", "subdocument", "document", "elemhide", "other",
        "background", "xbl", "ping", "dtd", "media", "third-party",
        "match-case", "collapse", "donottrack",
    ]

    Returns
    -------
    dict
        An "options" dictionary for use with BlockListParser

    refs:
    [1] https://github.com/MoonchildProductions/UXP/blob/master/dom/base/nsIContentPolicyBase.idl
    [2] https://adblockplus.org/en/filters#options
    [3] Englehardt, S., & Narayanan, A. (2016, October). Online tracking:
        A 1-million-site measurement and analysis. In Proceedings of the 2016
        ACM SIGSAC Conference on Computer and Communications Security
        (pp. 1388-1401). ACM.
    """
    OPTIONS = {
        "other": {'enabled': False, 'content_policy_type': 1},
        "script": {'enabled': True, 'content_policy_type': 2},
        "image": {'enabled': True, 'content_policy_type': 3},
        "stylesheet": {'enabled': True, 'content_policy_type': 4},
        "object": {'enabled': True, 'content_policy_type': 5},
        "document": {'enabled': False, 'content_policy_type': 6},
        "subdocument": {'enabled': True, 'content_policy_type': 7},
        "xbl": {'enabled': False, 'content_policy_type': 9},
        "ping": {'enabled': False, 'content_policy_type': 10},
        "xmlhttprequest": {'enabled': True, 'content_policy_type': 11},
        "object-subrequest": {'enabled': True, 'content_policy_type': 12},
        "dtd": {'enabled': False, 'content_policy_type': 13},
        "media": {'enabled': False, 'content_policy_type': 15},
        "elemhide": {'enabled': False},
        "background": {'enabled': False},
        "third-party": {'enabled': False},
        "match-case": {'enabled': False},
        "collapse": {'enabled': False},
        "donottrack": {'enabled': False},
        "domain": {'enabled': False},
    }
    options = {}
    try:
        for name in OPTIONS:
            if OPTIONS[name]['enabled']:
                if name == 'third-party':
                    options["third-party"] = du.get_ps_plus_1(
                        request['url']) != du.get_ps_plus_1(
                            request['top_level_url'])
                if name == 'domain':
                    options["domain"] = urlparse(
                        request['top_level_url']).hostname
                else:
                    if 'content_policy_type' in OPTIONS[name]:
                        options[name] = (request['content_policy_type'] ==
                                         OPTIONS[name]["content_policy_type"])
    except Exception as e:
        # print("exception {}".format(e))
        pass
    return options
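# Hedged usage sketch (not part of the original source): sqlite3.Row supports
# string indexing, so a plain dict stands in for a row here. The column values
# are illustrative; content_policy_type 2 corresponds to a script load in the
# mapping above.
fake_request = {
    'url': 'https://cdn.tracker.com/lib.js',
    'top_level_url': 'https://news.example.com/article',
    'content_policy_type': 2,
}
print(get_option_dict(fake_request))
# e.g. {'script': True, 'image': False, 'stylesheet': False, ...}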
def test_get_ps_plus_1_on_fbsbx_example():
    # apps.fbsbx.com is on the public suffix list (Apr 2, 2020)
    assert get_ps_plus_1(
        'http://foo.blah.apps.fbsbx.com') == 'blah.apps.fbsbx.com'
    assert get_ps_plus_1('http://foo.blah.www.fbsbx.com') == 'fbsbx.com'
def fill_forms(email_producer, num_links, page_timeout, debug, visit_id,
               webdriver, proxy_queue, browser_params, manager_params,
               extension_socket, failfile, furl):
    """Finds a newsletter form on the page. If not found, visits <num_links>
    internal links and scans those pages for a form. Submits the form if found.
    """
    # skipping: load the site
    # skipping: connecting to logger

    # try to find a newsletter form on the landing page
    if _find_and_fill_form(webdriver, email_producer, visit_id, debug,
                           browser_params, manager_params):
        return

    # otherwise, scan more pages
    print("couldn't find form, going to click around")
    main_handle = webdriver.current_window_handle
    visited_links = set()
    for i in range(num_links):
        # get all links on the page
        links = webdriver.find_elements_by_tag_name('a')
        random.shuffle(links)
        current_url = webdriver.current_url
        current_ps1 = domain_utils.get_ps_plus_1(current_url)

        # find links to click
        match_links = []
        start_time = timeit.default_timer()
        for link in links:
            try:
                if not link.is_displayed():
                    continue

                # check if link is valid and not already visited
                href = link.get_attribute('href')
                if href is None or href in visited_links:
                    continue

                # check if this is an internal link
                if not _is_internal_link(href, current_url, current_ps1):
                    continue

                link_text = link.text.lower()

                # skip links with blacklisted text
                blacklisted = False
                for bl_text in _LINK_TEXT_BLACKLIST:
                    if bl_text in link_text:
                        blacklisted = True
                        break
                if blacklisted:
                    continue

                # should we click this link?
                link_rank = 0
                for type, s, rank, flags in _LINK_TEXT_RANK:
                    if (type == _TYPE_TEXT and s in link_text) or (
                            type == _TYPE_HREF and s in href):
                        if flags & _FLAG_IN_NEW_URL_ONLY:
                            # don't use this link if the current page URL
                            # already matches too
                            if type == _TYPE_HREF and s in current_url:
                                continue

                        # link matches!
                        link_rank = rank
                        match_links.append(
                            (link, rank, link_text, href, flags))
                        break
                if link_rank >= _LINK_RANK_SKIP:
                    # good enough, stop looking
                    break
            except Exception:
                print("ERROR while looping through links...")
                sys.exit(1)

            # quit if too much time passed (for some reason, this is really slow...)
            if match_links and timeit.default_timer(
                    ) - start_time > _LINK_MATCH_TIMEOUT:
                break

        # find the best link to click
        if not match_links:
            break  # no more links to click
        match_links.sort(key=lambda l: l[1])
        next_link = match_links[-1]
        visited_links.add(next_link[3])

        # click the link
        try:
            # load the page
            print("clicking on link '%s' - %s" % (next_link[2], next_link[3]))
            next_link[0].click()
            time.sleep(_PAGE_LOAD_TIME)
            wait_until_loaded(webdriver, _PAGE_LOAD_TIME)
            # if browser_params['bot_mitigation']:
            #     bot_mitigation(webdriver)

            # find newsletter form
            if _find_and_fill_form(webdriver, email_producer, visit_id, debug,
                                   browser_params, manager_params):
                return

            # should we stay on this page?
            if next_link[4] & _FLAG_STAY_ON_PAGE:
                continue

            # go back
            webdriver.back()
            wait_until_loaded(webdriver, _PAGE_LOAD_TIME)

            # check other windows (ex. pop-ups)
            windows = webdriver.window_handles
            if len(windows) > 1:
                form_found_in_popup = False
                for window in windows:
                    if window != main_handle:
                        webdriver.switch_to_window(window)
                        wait_until_loaded(webdriver, _PAGE_LOAD_TIME)

                        # find newsletter form
                        if _find_and_fill_form(webdriver, email_producer,
                                               visit_id, debug, browser_params,
                                               manager_params):
                            form_found_in_popup = True
                        webdriver.close()
                webdriver.switch_to_window(main_handle)
                time.sleep(1)
                if form_found_in_popup:
                    return
        except Exception:
            pass

    # if we reach here, signup wasn't successful -- record the failed URL
    with open(failfile, 'a') as wh:
        wh.write(furl + '\n')
def test_get_ps_plus_1_on_exotic_public_suffix():
    assert get_ps_plus_1(
        'http://foo.bar.website.apartments') == 'website.apartments'
def _is_internal_link(href, url, ps1=None):
    """Returns whether the given link is an internal link."""
    if ps1 is None:
        ps1 = domain_utils.get_ps_plus_1(url)
    return domain_utils.get_ps_plus_1(urljoin(url, href)) == ps1
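# Hedged usage sketch (not part of the original source): assumes
# `import domain_utils` and `from urllib.parse import urljoin`, as the helper
# above does; the URLs are illustrative.
print(_is_internal_link('/about', 'https://www.example.com/'))                # True
print(_is_internal_link('https://cdn.tracker.com/x', 'https://example.com'))  # False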
if __name__ == '__main__':
    # to switch tabs: driver.switch_to_window(driver.window_handles[0])
    port = sys.argv[1]
    chrome_options = Options()
    chrome_options.debugger_address = "127.0.0.1:" + port
    exe_path = "./chromedriver"
    driver = webdriver.Chrome(executable_path=exe_path,
                              options=chrome_options)

    # config for fill_forms
    # TODO: give a larger num_links and a longer wait time to js in the final crawl
    num_links = 3
    page_timeout = 8  # not actually used anywhere
    debug = True
    webdriver = driver
    # email_producer is a function, modify according to use-case
    proxy_queue = None
    browser_params = None
    manager_params = None
    # the URL of the website is passed in from js and used for the screenshot
    full_url = sys.argv[2]
    # we don't want slashes or other special characters in file names
    visit_id = domain_utils.get_ps_plus_1(full_url)
    extension_socket = chrome_options.debugger_address  # not used by fill_forms
    # the file to save the names of all unsuccessful signups
    failfile = 'signup_fails.txt'

    # Here we go...
    fill_forms(email_producer, num_links, page_timeout, debug, visit_id,
               webdriver, proxy_queue, browser_params, manager_params,
               extension_socket, failfile, full_url)
def test_browser_profile_coverage(default_params, task_manager_creator):
    """Test the coverage of the browser's profile.

    This verifies that Firefox's places.sqlite database contains all visited
    sites. If it does not, it is likely the profile is lost at some point
    during the crawl.
    """
    # Run the test crawl
    manager_params, browser_params = default_params
    manager_params.num_browsers = 1
    manager_params.testing = False
    browser_params[0].profile_archive_dir = (
        manager_params.data_directory / "browser_profile")
    browser_params[0].http_instrument = True
    manager, crawl_db = task_manager_creator(
        (manager_params, browser_params[:1]))
    for site in TEST_SITES:
        manager.get(site)
    manager.close()

    # Extract crawl profile
    ff_db_tar = browser_params[0].profile_archive_dir / "profile.tar.gz"
    with tarfile.open(ff_db_tar) as tar:
        tar.extractall(browser_params[0].profile_archive_dir)

    # Output databases
    ff_db = browser_params[0].profile_archive_dir / "places.sqlite"

    # Grab urls from crawl database
    rows = db_utils.query_db(crawl_db, "SELECT url FROM http_requests")
    req_ps = set()  # visited domains from http_requests table
    for (url, ) in rows:
        req_ps.add(du.get_ps_plus_1(url))

    hist_ps = set()  # visited domains from crawl_history table
    rows = db_utils.query_db(
        crawl_db,
        "SELECT arguments FROM crawl_history WHERE command='GetCommand'",
    )
    for (arguments, ) in rows:
        url = json.loads(arguments)["url"]
        ps = du.get_ps_plus_1(url)
        hist_ps.add(ps)

    # Grab urls from Firefox database
    profile_ps = set()  # visited domains from firefox profile
    rows = db_utils.query_db(ff_db, "SELECT url FROM moz_places")
    for (host, ) in rows:
        try:
            profile_ps.add(du.get_ps_plus_1(host))
        except AttributeError:
            pass

    # We expect a url to be in the Firefox profile if:
    # 1. We've made requests to it
    # 2. The url is a top_url we entered into the address bar
    #
    # Previously, we expected some missing urls if the following
    # conditions were not met, but this is no longer the case:
    # 3. The url successfully loaded (see: Issue #40)
    # 4. The site does not respond to the initial request with a 204
    #    (won't show in FF DB)
    # See PR #893 to restore this behavior in case this test fails.
    assert req_ps.intersection(hist_ps).difference(profile_ps) == set()
counter += 1
rows = cur.fetchmany(CHUNKSIZE)
if len(rows) == 0:
    break

tp_rows = list()
if ONLY_SEARCH_IN_THIRD_PARTY_HTTP:
    # This is a major change: we start to look for leaks in the first
    # party requests/responses.
    # This was the case for the form filling crawl, we now extend it to
    # other crawls.
    # TODO: Filter 3rd party requests and responses at the analysis stage.
    # Having first party leaks is necessary when detecting the origin of
    # the leaks or "first leaks".
    for row in rows:
        if (row['top_level_url'] is not None
                and row['top_level_url'] != ''
                and du.get_ps_plus_1(row['site_url']) != du.get_ps_plus_1(
                    row['url'])  # noqa
                and du.get_ps_plus_1(row['site_url']) == du.get_ps_plus_1(
                    row['top_level_url'])):  # noqa
            tp_rows.append(row)
    rows = tp_rows

if where_to_search == "requests":
    results = pool.map(check_row_for_leaks,
                       [(x['url'], x['headers'], x['post_body'])
                        for x in rows])
else:
    results = pool.map(check_resp_row_for_leaks,
                       [(x['url'], x['headers'], x['location'])
                        for x in rows])
def contains_ps1(self, hostname):
    """Return True if the Disconnect list contains the PS+1 of `hostname`."""
    if not hostname.startswith('http'):
        hostname = 'http://' + hostname
    return du.get_ps_plus_1(hostname) in self._blocklist
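# Hedged usage sketch (not part of the original source): the method above is
# assumed to live on a Disconnect-list wrapper that stores `_blocklist` as a
# set of PS+1 strings; assumes `import domain_utils as du`. The class name and
# blocklist contents are illustrative.
class DisconnectList:
    def __init__(self, blocklist):
        self._blocklist = blocklist

    # bind the module-level function defined above as a method
    contains_ps1 = contains_ps1

dl = DisconnectList({'doubleclick.net', 'scorecardresearch.com'})
dl.contains_ps1('stats.g.doubleclick.net')  # True
dl.contains_ps1('example.com')              # False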
def test_get_ps_plus_1_on_data_url():
    assert get_ps_plus_1(
        "data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAA") == ''
def get_internal_links_depth(site, depth):
    """Request and parse internal links from `site`"""
    headers = requests.utils.default_headers()
    headers.update({'User-Agent': USER_AGENT})
    if depth == 0:
        result = list()
        result.append(site)
        return None, result
    try:
        try:
            if depth == DEPTH:
                resp = requests.get('http://' + site, headers=headers,
                                    timeout=60)
            else:
                resp = requests.get(site, headers=headers, timeout=60)
        except Exception as e:
            if depth == DEPTH:
                resp = requests.get('http://www.' + site, headers=headers,
                                    timeout=60)
            else:
                resp = requests.get(site, headers=headers, timeout=60)
        if resp.status_code != 200:
            print("Non-200 response code %i for site %s" %
                  (resp.status_code, site))
            return (site, list())
        if resp.content is None:
            print("No content returned for site %s" % site)
            return (site, list())

        # Current URL after HTTP redirects
        current_url = resp.url
        top_ps1 = du.get_ps_plus_1(current_url)

        # Find all internal a tags
        soup = BeautifulSoup(resp.content, 'lxml')
        links = set()
        for tag in soup.find_all('a'):
            href = tag.get('href')
            if href is None:
                continue
            href = urlparse.urljoin(current_url, href)
            if (not href.startswith('http')
                    or du.get_ps_plus_1(href) == top_ps1):
                # if (not href.startswith('http')):
                continue
            links.add(urlparse.urldefrag(href)[0])

        # Crawl the next level
        links_next_layer = set()
        for link in links:
            links_next_layer |= set(
                get_internal_links_depth(link, depth - 1)[1])
        links |= links_next_layer
        return site, list(links)
    except (KeyboardInterrupt, SystemExit):
        raise
    except Exception as e:
        print("Exception while requesting %s\n%s" % (site, str(e)))
        return (site, list())
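# Hedged usage sketch (not part of the original source): DEPTH and USER_AGENT
# stand in for the module-level constants the function above expects, and the
# seed domain is illustrative. Assumes requests, BeautifulSoup, the urlparse
# module, and domain_utils (as du) are imported as in the function.
DEPTH = 2
USER_AGENT = 'Mozilla/5.0 (compatible; research-crawler)'
seed, links = get_internal_links_depth('example.com', DEPTH)
print(seed, len(links))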
def test_get_ps_plus_1_on_ip_addresses():
    assert get_ps_plus_1('http://192.168.1.1') == '192.168.1.1'
    assert get_ps_plus_1('http://127.0.0.1/foo.html') == '127.0.0.1'
def get_set_of_script_ps1s_from_call_stack(script_urls):
    if len(script_urls):
        return ", ".join(
            set((du.get_ps_plus_1(x) or "") for x in script_urls.split(", ")))
    else:
        return ""
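# Hedged usage sketch (not part of the original source): the input mirrors a
# comma-separated list of script URLs from a call stack; the URLs are
# illustrative, `du` is assumed to be domain_utils, and the output order is
# unspecified because a set is joined.
get_set_of_script_ps1s_from_call_stack(
    "https://a.cdn.tracker.com/x.js, https://www.tracker.com/y.js, "
    "https://static.example.com/z.js")
# e.g. "tracker.com, example.com"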