Example #1
    def __init__(self, browser="firefox", log_file="log.txt", unit_id=0, treatment_id=0, headless=False, proxy=None, rules=None):

        # if rules are not passed in, then treat this as a bare unit that
        # should only be used to fetch easylist and parse it into
        # adblockplus rules for use with adblockparser.
        if rules is None:
            self._fetch_easylist()
            self.filterlist = self._load_easylist()
            self.rules = AdblockRules(self.filterlist)
        else:
            logging.basicConfig(filename="adb_"+log_file,level=logging.INFO)
            self.logger = logging.getLogger(__name__)

            # call parent constructor
            browser_unit.BrowserUnit.__init__(self, browser, log_file, unit_id, treatment_id, headless, proxy=proxy)

            self.session = self.driver.session_id
            print("Running adblock unit session: {}".format(self.session))
            
            # set rules to those that were passed in
            self.rules = rules
            self.all_options = {opt:True for opt in AdblockRule.BINARY_OPTIONS}

            # internal ad data structure 
            self.data = []

            self.Ad = namedtuple('Ad',['url','outerhtml','tag','link_text','link_location','on_site', 'reloads'])

            # dictionary to memoize url checks
            self.memo = {}

            # store current context where we are collecting ads
            self.site = ""
            self.reloads = 0
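
The two branches above give two construction modes. A sketch (assuming this __init__ belongs to the AdBlockUnit class shown in full in a later example):

AdBlockUnit()               # bare unit: only fetches and parses easylist
AdBlockUnit(rules=rules)    # full browser unit reusing pre-parsed rules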
Example #2
def test_rule_with_options(rule_text, results, use_re2):
    rule = AdblockRule(rule_text)
    rules = AdblockRules([rule_text], use_re2=use_re2)

    for url, params, match in results:
        assert rule.match_url(url, params) == match
        assert rules.should_block(url, params) == match
Example #3
def test_rule_exceptions(rules, results, use_re2):
    rules = AdblockRules(rules, use_re2=use_re2)

    for url in results["blocks"]:
        assert rules.should_block(url)

    for url in results["doesn't block"]:
        assert not rules.should_block(url)
Example #4
def test_documented_examples(rule_text, results, use_re2):
    rule = AdblockRule(rule_text)
    rules = AdblockRules([rule_text], use_re2=use_re2)

    for url in results["blocks"]:
        assert rule.match_url(url)
        assert rules.should_block(url)

    for url in results["doesn't block"]:
        assert not rule.match_url(url)
        assert not rules.should_block(url)
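
The test functions in Examples #2-#4 take their rule/result data as arguments, presumably supplied via pytest parametrization. A sketch of how such a test could be driven (the rule and URLs below are illustrative, following documented Adblock Plus semantics):

import pytest

@pytest.mark.parametrize('use_re2', [False, True])
@pytest.mark.parametrize(('rule_text', 'results'), [
    ("||ads.example.com^", {
        "blocks": ["http://ads.example.com/i.gif"],
        "doesn't block": ["http://example.com/ads.html"],
    }),
])
def test_documented_examples(rule_text, results, use_re2):
    ...  # body as in Example #4 above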
Example #5
	def __init__(self, filename):
		self.rules = []
		with open(filename, "r") as blacklist:
			for line in blacklist:
				if line.startswith('!'):  # comment
					continue
				if '##' in line:  # HTML element-hiding rule, not a URL rule
					continue
				self.rules.append(line)
		self.adblock = AdblockRules(self.rules, supported_options=['script', 'domain'])
Example #6
def main(harfile_path, domain, blockfile):
    harfile = open(harfile_path, "r")
    harfile_json = json.loads(harfile.read())
    harfile.close()
    bfile = open(blockfile, "r")
    block_list = bfile.readlines()
    bfile.close()
    rules = AdblockRules(block_list)
    adblock_db = {
        "url_data": {},
        "stats": {
            "domain": domain,
            "req": 0,
            "succ": 0,
            "block": 0
        }
    }
    options = ('image', 'xmlhttprequest', 'document', 'font', 'script', 'stylesheet', 'other')
    for entry in harfile_json['log']['entries']:
        url = entry['request']['url']
        urlparts = urlparse(url)
        print("Processing {} ...".format(url))
        try:
            fld = get_fld(url, fail_silently=True)
            adblock_db["stats"]["req"] += 1
            # the two cases differ only in the third-party flag
            third_party = fld != domain
            if entry["_resourceType"] == "xhr":
                entry["_resourceType"] = "xmlhttprequest"
            if entry["_resourceType"] not in options:
                d = {"third-party": third_party, "domain": urlparts.hostname}
            else:
                d = {entry["_resourceType"]: True, "third-party": third_party, "domain": urlparts.hostname}

            if rules.should_block(url, d):
                adblock_db["stats"]["block"] += 1
            else:
                adblock_db["stats"]["succ"] += 1
        except Exception:
            continue
    return adblock_db
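
A hypothetical invocation of main() (assumes a HAR capture for example.com and a local copy of an adblock filter list):

db = main("example.com.har", "example.com", "easylist.txt")
print(db["stats"])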
Example #7
def UrlBlock():
    global rules, adpath
    if sets.ADBLOCKER == 1:
        rules = AdblockRules(open('hosts3.txt'))
        adpath = ['zoneid=', 'auction_id=', 'campaign_id', 'offer?']

        sys.stdout.write("Adblocking is ON\r\n")
    elif sets.ADBLOCKER == 0:
        rules = AdblockRules([])
        sys.stdout.write("Adblocking is OFF\r\n")
    else:
        print("Invalid value for Blocking, try 0 or 1, will use 0")
        sets.ADBLOCKER = 0
Example #8
def get_adblock_rules():
    raw_easylist_rules = read_ab_rules_from_file("blocklists/easylist.txt")
    raw_easyprivacy_rules = read_ab_rules_from_file(
        "blocklists/easyprivacy.txt")
    # raw_ublock_rules = read_ab_rules_from_file("blocklists/adblock_blacklist_white.txt")

    print("Loaded %s from EasyList, %s rules from EasyPrivacy" %
          (len(raw_easylist_rules), len(raw_easyprivacy_rules)))
    #        len(raw_ublock_rules)))

    easylist_rules = AdblockRules(raw_easylist_rules)
    easyprivacy_rules = AdblockRules(raw_easyprivacy_rules)
    # ublock_rules = AdblockRules(raw_ublock_rules)
    # return easylist_rules, easyprivacy_rules, ublock_rules
    return easylist_rules, easyprivacy_rules
Example #9
def main(urlfile=None):
    rules = AdblockRules(open(rule_file).read().splitlines())

    urls = UrlList(max=max_url_list_size, filter=url_filter)
    saved = UrlList()

    if urlfile:
        urls.list = open(urlfile, "r").read().splitlines()

    else:
        auth = OAuthHandler(consumer_key, consumer_secret)
        auth.set_access_token(access_token, access_token_secret)

        gt = Thread(target=gather_urls, args=(auth, urls))
        gt.start()

    cts = []
    for i in range(6):
        ct = Thread(target=crawl_urls, args=(urls, saved, rules))
        cts.append(ct)
        ct.start()

    global fetched_pages, fetched_jses
    while True:
        print("[pages] queued:%d fetched:%d [js] fetched:%d saved:%d" %
              (len(urls.list), fetched_pages, fetched_jses, len(saved.list)),
              end="\033[K\r")
        sleep(1)
Example #10
def load_easylist():
    """Fetches the most recent easylist and returns an AdblockRules object
    using it."""
    r = requests.get("https://easylist.to/easylist/easylist.txt")
    r.raise_for_status()
    easy = r.text
    return AdblockRules(easy.splitlines())
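
A usage sketch for load_easylist() (the URL below is illustrative):

rules = load_easylist()
print(rules.should_block("http://example.com/ads/banner.gif"))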
Example #11
def get_rules():
    raw_rules = [
        x.replace('\r', '').replace('\n', '')
        for x in codecs.open(EASYLIST_FILE, 'r', 'utf8').readlines()
    ]

    rules = AdblockRules(raw_rules)
    return rules
Example #12
def get_adblock_rules():
    raw_easylist_rules = read_ab_rules_from_file("adblock/easylist.txt")
    raw_easyprivacy_rules = read_ab_rules_from_file("adblock/easyprivacy.txt")
    if ENABLE_UBLOCK:
        raw_ublock_rules = read_ab_rules_from_file(
            "adblock/adblock_blacklist_white.txt")
    else:
        raw_ublock_rules = []
    print(
        "Loaded %s from EasyList, %s rules from EasyPrivacy"
        " and %s rules from UBlockOrigin" %
        (len(raw_easylist_rules), len(raw_easyprivacy_rules),
         len(raw_ublock_rules)))
    easylist_rules = AdblockRules(raw_easylist_rules)
    easyprivacy_rules = AdblockRules(raw_easyprivacy_rules)
    ublock_rules = AdblockRules(raw_ublock_rules)
    return easylist_rules, easyprivacy_rules, ublock_rules
Example #13
def load_rules(blocklists=None):
    rules = AdblockRules(
        combined(blocklists),
        use_re2=True,
        max_mem=512 * 1024 * 1024,
        # supported_options=['script', 'domain', 'image', 'stylesheet', 'object']
    )
    return rules
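
The combined() helper used above is not shown in this example; a minimal sketch, assuming it simply concatenates the lines of each blocklist file:

def combined(blocklists):
    lines = []
    for path in blocklists or []:
        with open(path) as f:
            lines.extend(line.strip() for line in f)
    return lines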
Example #14
class EasyListHandler:
    with open('easylist.txt', 'r', encoding='UTF-8') as f:
        rules = AdblockRules(list(f))
    #def __init__(self):
    #with open('easylist.txt', 'r', encoding='UTF-8') as f:
    #self.rules = AdblockRules(list(f))
    def is_harmful_url(self, url):
        return EasyListHandler.rules.should_block(url)
Example #15
def main(harfile_path):
    """Reads a har file from the filesystem, converts to CSV, then dumps to
    stdout.
    """
    txt_file = 'easylist.txt'
    raw_rules = readfile(txt_file)

    harfile = open(harfile_path, encoding='UTF-8')
    harfile_json = json.loads(harfile.read())
    i = 0

    first_party = harfile_path.split('.')[1]+'.'+harfile_path.split('.')[2]
    rules = AdblockRules(raw_rules)
    blocked = 0
    blocked_domains = set()
    opt = {'script': True, 'image': True, 'stylesheet': True, 'object': True,
           'subdocument': True, 'xmlhttprequest': True, 'websocket': True,
           'webrtc': True, 'popup': True, 'generichide': True,
           'genericblock': True}

    for entry in harfile_json['log']['entries']:
        i = i + 1
        url = entry['request']['url']
        urlparts = urlparse(entry['request']['url'])
        size_bytes = entry['response']['bodySize']
        size_kilobytes = float(entry['response']['bodySize'])/1024
        mimetype = 'unknown'
        if 'mimeType' in entry['response']['content']:
            mimetype = entry['response']['content']['mimeType']
        
        option = ''
        res = get_tld(url, as_object=True)
        mime_opt = mimetype.split('/')[0]

        if mime_opt in opt:
            option = mime_opt

        if res.fld != first_party and option in opt and rules.should_block(url, {option: opt[option]}):
            blocked += 1
            blocked_domains.add(res.fld)

    blocked_domains = sorted(blocked_domains) if blocked_domains else 'No domains blocked'

    print(f'\nSite: {first_party}\n# of total HTTP requests: {i}\n# of HTTP requests blocked: {blocked}\nBlocked domains: {blocked_domains}\n')
Example #16
def test_rules_supported_options():
    rules = AdblockRules(["adv", "@@advice.$~script"])
    assert not rules.should_block("http://example.com/advice.html", {'script': False})

    # exception rule should be discarded if "script" option is not supported
    rules2 = AdblockRules(["adv", "@@advice.$~script"], supported_options=[])
    assert rules2.should_block("http://example.com/advice.html", {'script': False})
Example #17
	def get_rules():
		"Loads Adblock filter rules from file."
		from adblockparser import AdblockRules

		raw_rules = []
		raw_rules.extend(ADBLOCK_RULES)
		with closing(requests.get(ADBLOCK_EASYLIST, stream=True)) as response:
			# decode_unicode=True yields str lines instead of bytes
			for rule in response.iter_lines(decode_unicode=True):
				raw_rules.append(rule.strip())
		rules = AdblockRules(raw_rules)
		return rules
Example #18
def load_gfwlist():
    global gfwlist_loaded
    with open('gfwlist.txt.gzipped', 'rb') as f:
        rules = zlib.decompress(f.read()).decode().split('\n')
    print('utils: loading GFWList rule')
    if const.USE_GFWLIST_ANYWAY:
        filt = AdblockRules(rules)
        if not filt.uses_re2:
            print(
                'utils: warning: GFWList is not using re2. THE EFFICIENCY IS NOT GUARANTEED!'
            )
        gfwlist_loaded = True
        return filt.should_block
    else:
        try:
            filt = AdblockRules(rules, use_re2=True, max_mem=const.RE2_MAX_MEM)
        except ImportError:
            print(
                'utils: warning: GFWList is disabled unless you have pyre2 installed'
            )
            return lambda _: False
        else:
            gfwlist_loaded = True
            return filt.should_block
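
A usage sketch for load_gfwlist() (URL illustrative):

should_block = load_gfwlist()
if gfwlist_loaded and should_block("http://example.com/"):
    print('utils: blocked by GFWList')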
Example #19
def load_rules(blocklists=[
    "easylist.txt", "easyprivacy.txt", "fanboy-annoyance.txt",
    "fanboy-social.txt"
]):
    print("Loading rules:", blocklists)

    # rules = AdblockRules( combined(blocklists), use_re2=True, max_mem=512*1024*1024, supported_options=['script', 'domain'] )
    rules = AdblockRules(combined(blocklists),
                         use_re2=True,
                         supported_options=[
                             'script', 'domain', 'image', 'stylesheet',
                             'object'
                         ])
    # rules = AdblockRules( combined(blocklists), use_re2=True )

    return rules
Example #20
class Filter(object):
	def __init__(self, filename):
		self.rules = []
		with open(filename, "r") as blacklist:
			for line in blacklist:
				if line.startswith('!'):  # comment
					continue
				if '##' in line:  # HTML element-hiding rule, not a URL rule
					continue
				self.rules.append(line)
		self.adblock = AdblockRules(self.rules, supported_options=['script', 'domain'])
		
	def match(self, url, options=None):
		return self.adblock.should_block(url, options)
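
A usage sketch for the Filter class (file name hypothetical):

flt = Filter("easylist.txt")
print(flt.match("http://ads.example.com/banner.js", {'script': True}))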
Example #21
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        dest='input',
                        required=True,
                        help='BigQuery request input table.')
    parser.add_argument('--output',
                        dest='output',
                        help='BigQuery output table.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    output_table = known_args.output
    input_query = """
    SELECT
      page, url,
      DOMAIN(page) AS domain,
      IF (DOMAIN(page) = DOMAIN(url), false, true) AS third_party
    FROM [%s]
  """ % known_args.input

    classifiers = {}
    for name in ['ad', 'tracker', 'social']:
        with open('local/' + name + '.txt') as f:
            rules = [line.rstrip('\n') for line in f]
        classifier = AdblockRules(rules,
                                  supported_options=['domain', 'third-party'],
                                  skip_unsupported_rules=False,
                                  use_re2=True)
        del rules
        classifiers[name] = classifier

    p = df.Pipeline(argv=pipeline_args)

    (p
     | df.Read('read', df.io.BigQuerySource(query=input_query))
     | df.ParDo('classify', EasylistClassifyDoFn(), classifiers)
     # | df.io.Write('write', df.io.TextFileSink('out')))
     | df.Write(
         'write',
         df.io.BigQuerySink(
             output_table,
             schema='page:STRING, url:STRING, type:STRING',
             create_disposition=df.io.BigQueryDisposition.CREATE_IF_NEEDED,
             write_disposition=df.io.BigQueryDisposition.WRITE_TRUNCATE)))

    p.run()
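
EasylistClassifyDoFn is referenced but not defined in this example. A minimal sketch, assuming the old google.cloud.dataflow DoFn API (process receives a context whose .element is the BigQuery row, and extra ParDo arguments are passed through); in the original module this class would be defined before run():

class EasylistClassifyDoFn(df.DoFn):
    def process(self, context, classifiers):
        row = context.element
        options = {'domain': row['domain'], 'third-party': row['third_party']}
        for list_name, rules in classifiers.items():
            if rules.should_block(row['url'], options):
                yield {'page': row['page'], 'url': row['url'], 'type': list_name}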
Example #22
    def _load_rules(self):
        global _adblock_rules_cache

        easylist_files = [
            EASYLIST_PATH / filename for filename in EASYLIST_FILES
        ]

        mtime = max(filename.stat().st_mtime for filename in easylist_files)
        if _adblock_rules_cache is not None and _adblock_rules_cache[
                'mtime'] >= mtime:
            self.rules = _adblock_rules_cache['rules']
            return

        cache_file = self.options.get('adblockrules_cache')
        if cache_file:
            cache_file = Path(cache_file)
        if cache_file and cache_file.exists(
        ) and cache_file.stat().st_mtime >= mtime:
            with cache_file.open('rb') as f:
                rules = pickle.load(f)
        else:
            lines = []
            for easylist_file in easylist_files:
                # easylist_files entries are already full paths
                for line in easylist_file.open():
                    # Lines with @@ are exceptions which are not blocked
                    # even if other adblocking rules match. This is done
                    # to fix a few sites. We do not need those exceptions.
                    if line.startswith('@@'):
                        continue
                    lines.append(line)
            rules = AdblockRules(lines)
            if cache_file:
                with cache_file.open('wb') as f:
                    pickle.dump(rules, f, pickle.HIGHEST_PROTOCOL)

        _adblock_rules_cache = {'mtime': mtime, 'rules': rules}
        self.rules = rules

"""Check each path against filter for maliciousness, ads, tracking, ads and tracking, bitcoin, or pornography.
    (number of n at the end file determines the version for the file. The more the number of n, the newer the version of the file)
"""
for filt in filterlist[1:]:
    print('Start checking against: ' + filt[0] + ': ' + filt[1])
    if 'host' not in filt[1]:
        try:
            req = Request(filt[1], headers={'User-Agent': 'Mozilla/5.0'})
            raw_rules = urlopen(req).readlines()
            raw_rules2 = [x.decode('utf8') for x in raw_rules if x.decode('utf8') != '\r\n']
            raw_rules3 = []
            for raw in raw_rules2:
                raw_rules3.append(raw.replace('\n', '').replace('\r', ''))
            rules = AdblockRules(raw_rules3)
        except KeyboardInterrupt:
            raise
        except:
            print('====cannot read filter====')
            raw_rules3 = ''
        if raw_rules3 != '':
            #print(raw_rules3)
            for path in array:
                if rules.should_block(path):
                    print(path + ' : Yes')
                    dictionary[path][filt[2]] = True
                else:
                    print(path + ' : No')
        print('---------------------------------')
Example #24
def test_regex_rules():
    rules = AdblockRules(["/banner\d+/"])
    assert rules.should_block("banner123")
    assert not rules.should_block("banners")
Example #25
def test_rules_with_options(rules, results, use_re2):
    rules = AdblockRules(rules, use_re2=use_re2)
    for url, params, should_block in results:
        assert rules.should_block(url, params) == should_block
Example #26
import sys
from adblockparser import AdblockRules

with open('easylist.txt', 'r') as f:
    rules = AdblockRules(f.read().splitlines())

# read URLs from stdin, one per line, printing a verdict for each
while True:
    line = sys.stdin.readline()
    if not line:  # EOF
        break
    print(rules.should_block(line.strip()), flush=True)
Example #27
class AdBlockUnit(browser_unit.BrowserUnit):

    EASYLIST = 'easylist.txt'
    EASYLIST_URL = "https://easylist-downloads.adblockplus.org/easylist.txt"

    def _easylist_version(self, path=EASYLIST):
        '''
        Reads the version from the current easylist, or a file that is passed in
        '''
        if os.path.isfile(path):
            with open(path) as f:
                lines = f.read().splitlines()
                return lines[2].split(':')[1].strip()
        else:
            return -1

    def _fetch_easylist(self):
        '''
        Downloads the latest version of easylist, and if newer replaces any
        existing one.
        '''
        tmp_easylist = "tmp_"+self.EASYLIST
        cur_version = self._easylist_version()

        # download latest easylist from the Internet
        urllib.request.urlretrieve(self.EASYLIST_URL, tmp_easylist)
        tmp_version = self._easylist_version(path=tmp_easylist)
        
        # if necessary update
        if tmp_version > cur_version and cur_version != -1:
            os.remove(self.EASYLIST)
            shutil.move(tmp_easylist,self.EASYLIST)
            print ("Updated easylist from {} to {}".format(cur_version,tmp_version))
        elif cur_version == -1:
            shutil.move(tmp_easylist,self.EASYLIST)
            print("New easylist {}".format(tmp_version))
        else:
            os.remove(tmp_easylist)
            print("Easylist already up to date at: {}".format(tmp_version))

    def _load_easylist(self):
        '''
        Reads in easylist from a file and parses it into lines to be passed to
        adblockparser.
        '''
        with open(self.EASYLIST) as f:
            lines = f.read().splitlines()
        print("Loaded easylist version: {} with : {} items".format(self._easylist_version(),len(lines)))
        return lines


    def __init__(self, browser="firefox", log_file="log.txt", unit_id=0, treatment_id=0, headless=False, proxy=None, rules=None):

        # if rules are not passed in, then treat this as a bare unit that
        # should only be used to fetch easylist and parse it into
        # adblockplus rules for use with adblockparser.
        if rules is None:
            self._fetch_easylist()
            self.filterlist = self._load_easylist()
            self.rules = AdblockRules(self.filterlist)
        else:
            logging.basicConfig(filename="adb_"+log_file,level=logging.INFO)
            self.logger = logging.getLogger(__name__)

            # call parent constructor
            browser_unit.BrowserUnit.__init__(self, browser, log_file, unit_id, treatment_id, headless, proxy=proxy)

            self.session = self.driver.session_id
            print("Running adblock unit session: {}".format(self.session))
            
            # set rules to those that were passed in
            self.rules = rules
            self.all_options = {opt:True for opt in AdblockRule.BINARY_OPTIONS}

            # internal ad data structure 
            self.data = []

            self.Ad = namedtuple('Ad',['url','outerhtml','tag','link_text','link_location','on_site', 'reloads'])

            # dictionary to memoize url checks
            self.memo = {}

            # store current context where we are collecting ads
            self.site = ""
            self.reloads = 0

    def save_data(self):
        json_file = os.path.splitext(self.log_file)[0]+"."+self.session+".json"
        with open(json_file, 'w') as outfile:
            json.dump(self.data, outfile)

        # This is the log line adblock_analysis will parse to identify data files
        self.logger.info("save_data:{}:{}:{}".format(self.unit_id,self.treatment_id,self.session))

    def log_element(self,element,source):
        '''
        Input: An element that has been identified as an ad and how it was identified
        Result: Inserts appropriate information into the log
        '''
        url = element.get_attribute(source)
        html = element.get_attribute('outerHTML').encode('utf-8')
        tag = element.tag_name
        link_text = element.text
        link_location = element.location
         
        # update internal datastore
        ad_data = self.Ad(url=url, outerhtml=html, tag=tag, link_text=link_text, link_location=link_location, on_site=self.site, reloads=self.reloads)
        
        # store to internal data structure
        self.data.append(ad_data)

        # log to plaintext log
        self.logger.debug("Ad:Data:{}".format(ad_data))

    def check_elements(self, elements, source, options=None):
        '''
        Input: Given an element in the currently active page and an attribute to query on
        Result: Queries the given attribute (source) and checks the url against the 
        filterlist. Logs any identified elements and returns the count.
        '''
        count = 0
        for e in elements:
            try:
                url = e.get_attribute(source)
                if url is not None:
                    self.logger.debug("Checking:{}:{}".format(source, url))
                    # check if we have evaluated this ad before
                    if url not in self.memo:
                        # actually check the url against the filter list
                        self.memo[url] = self.rules.should_block(url, options)

                    if self.memo[url]:
                        self.log_element(e,source)
                        count += 1

            # occurs with stale elements that no longer exist in the DOM
            except selenium.common.exceptions.StaleElementReferenceException as e:
                self.logger.error(e)
        return count


    def check_href(self):
        '''
        Identifies and captures ads based on HTML hyperlink tags.
        These are considered "text" ads.
        '''
        driver = self.driver
        ### xpath could be less performant than other find_* methods
        # common tags: <a>,<link>
        elements = driver.find_elements_by_xpath("//*[@href]")
        count = self.check_elements(elements,"href", self.all_options)
        self.logger.debug("href search found: {}".format(count))
    

    def check_src(self):
        '''
        Identifies and captures ads based on tags with a 'src' attribute
        These are considered "media" ads and are often img, iframe,script
        tags
        '''
        driver = self.driver
        ### xpath could be less performant than other find_* methods
        # common tags: <img>, <iframe>, <frame>, <embed>, <script>
        elements = driver.find_elements_by_xpath("//*[@src]")
        count = self.check_elements(elements, "src", self.all_options)
        self.logger.debug("src search found: {}".format(count))


    def check_iframe(self,parents=()):
        '''
        Functionality to check within nested iframes for ad related resources.
        Invariants: expects webdriver to enter at the level defined by parents
        resets webdriver to top level contents prior to leaving
        Input: a tuple describing the iframe name attribute of parent levels
        '''

        driver = self.driver
        children = driver.find_elements_by_tag_name('iframe')

        for child in children:

            try:
                driver.switch_to.frame(child)

                # check in the iframe for ads
                self.check_href()
                self.check_src()

                # set parent for children we check
                nesting = parents + (child,)
                self.check_iframe(parents=nesting)

            except selenium.common.exceptions.StaleElementReferenceException as e:
                self.logger.error(e)

            # return to correct level of nesting
            driver.switch_to.default_content()

            for p in parents:
                try:
                    driver.switch_to.frame(p)
                except selenium.common.exceptions.NoSuchElementException as e:
                    # this should not occur but just in case, preserve invariant
                    # of function leaving at top level
                    self.logger.error("resetting level in iframe recursion")
                    driver.switch_to.default_content()


        # always reset to top level content prior to exiting
        driver.switch_to.default_content()

    def find_ads(self):
        '''
        Primary convenience function to use all ad identification mechanisms
        '''
        self.check_href()
        self.check_src()
        self.check_iframe()

    def visit_url(self,url):
        driver = self.driver
        try:
            driver.get(url)
            self.logger.debug("Visited: {}".format(url))
            self.site = url
            return True
        except selenium.common.exceptions.TimeoutException as e:
            print("Timeout Visiting: {} : {}".format(url,self.session))
            print(e)
            return False


    def collect_ads(self,url, reloads=1, delay=0, file_name=None):
        '''
        Visits a specified url and runs ad collection functions
        Result: 
        '''
        print("collecting ads on: {}".format(url))
        if file_name is None:
            file_name = self.log_file

        # number of reloads on site to capture all ads
        for r in range(reloads):
            time.sleep(delay)

            # if a successful visit
            if self.visit_url(url):
                # collect ads
                self.reloads=r
                self.find_ads()
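
A usage sketch for the collection workflow above (assumes the browser_unit module and a working Selenium driver):

# a bare unit only fetches and parses easylist; its parsed rules are
# then reused so real units skip the download and parse step
bare = AdBlockUnit()
unit = AdBlockUnit(browser="firefox", rules=bare.rules)
unit.collect_ads("http://example.com", reloads=2, delay=5)
unit.save_data()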
Example #28
def test_empty_regexp_rules():
    with pytest.raises(AdblockParsingError):
        AdblockRules(['adv', '/', '//'])
Example #29
def test_rules_instantiation():
    rule = AdblockRule("adv")
    rules = AdblockRules([rule])
    assert rule.match_url("http://example.com/adv")
    assert rules.should_block("http://example.com/adv")
Example #30
def load_rules(blocklists=None):
    rules = AdblockRules(combined(blocklists),
                         use_re2=True,
                         max_mem=512 * 1024 * 1024)

    return rules
Example #31
def test_empty_rules():
    rules = AdblockRules(["adv", "", " \t", AdblockRule("adv2")])
    assert len(rules.rules) == 2
Example #32
def test_regex_rules():
    # rules wrapped in slashes are treated as regular expressions
    rules = AdblockRules(["/banner\d+/"])
    assert rules.should_block("banner123")
    assert not rules.should_block("banners")
Example #33
import sqlite3 as lite
import tldextract

from tracking_rules import TrackingRules
from ad_rules import AdRules
from adblockparser import AdblockRules
import json

BLOCKLIST = "../../assets/disconnect_blocklist.json"

ad_rules_instance = AdRules()
tracking_rules_instance = TrackingRules()
raw_rules = ad_rules_instance.rules
raw_rules += tracking_rules_instance.rules
rules = AdblockRules(raw_rules, use_re2=True)
# NON CUMULATIVE
# MAIN CONFIG
wpm_db = '/media/tobi/Daten/Workspace/OpenWPM/Output/1000_2.sqlite'
selected_crawl = 1
display_index = 0 # 0 is landing page, 1-4 subsites
show_tracking_and_third_parties = True  # True: show tracking percentage as part of third-party percentage in diagram
                                        # False: show only tracking percentage in diagram

def _load_json(path):
    '''Reads json file ignoring comments'''
    ignore = ["__comment", "license"]
Example #34
def process(self, instance, parameters=None, commit=True, **kwargs):
    """ See source code. """

    instance_name = instance._meta.verbose_name
    instance_id = instance.id

    urls = URL_MATCH_REGEX.findall(instance.content)

    if not urls:
        LOGGER.info(u'url-crawler: nothing to crawl in %s %s.',
                    instance_name, instance_id)
        return

    # Start with EasyList
    adblock_rules_list = requests_get(
        # WARNING: do not .split() with no parameters, else
        # adblock will block everything due to empty rules.
        'https://easylist-downloads.adblockplus.org/easylist.txt').split('\n')

    # Append our eventual specific exclusions
    adblock_rules_list.extend(
        parameters.get(
            'integration', {}).get(
                'fetch_content_urls',
                {}).get(
                    'adblock_rules',
                    []))

    if re2 is None:
        # Things will be dog-slow…
        adblock_rules = AdblockRules(
            adblock_rules_list,
            max_mem=config.PROCESSING_ADBLOCK_MAXIMUM_MEMORY)

    else:
        # Things will go faster
        adblock_rules = AdblockRules(
            adblock_rules_list, use_re2=True,
            max_mem=config.PROCESSING_ADBLOCK_MAXIMUM_MEMORY)

    if isinstance(instance, models.Email):
        origin = models.ORIGINS.EMAIL

        # NOTE: there will be at least one here, else
        # accepts() would have rejected the email.
        feeds = instance.feeds.exclude(
            MailFeed___match_action=MAIL_MATCH_ACTIONS.STORE)

    else:
        origin = models.ORIGINS.CRAWLING
        feeds = instance.feeds.all()

    dupes = 0
    blocked = 0

    # LOGGER.debug('URLS: %s %s', len(urls), urls)

    for url in urls:
        if url.startswith('('):
            url = url[1:]

            if url.endswith(')'):
                # Skip Markdown's enclosing parenthesis
                # that we explicitly matched manually.
                url = url[:-1]

            # In case we've got garbage at the end of the RE.
            splitted = url.split(')')

            if len(splitted) == 1:
                pass

            elif len(splitted) == 2 and len(splitted[1]) < 4:
                # Highly probable that we got some garbage at the end.
                url = splitted[0]

            else:
                LOGGER.error(u'url-crawler: probable nasty unhandled '
                             u'URL “%s” too-greedily matched by RE.',
                             url)

        if adblock_rules.should_block(url):
            LOGGER.info(u'url-crawler: URL %s skipped, in adblocked rules.',
                        url)
            blocked += 1
            continue

        LOGGER.info('url-crawler: importing from %s.', url)

        try:
            item, created = create_item_from_url(
                url=clean_url(url), feeds=feeds, origin=origin,
            )

        except Exception:
            LOGGER.exception(u'Could not create item from URL “%s”', url)

        else:
            if created:
                LOGGER.info(u'url-crawler: successfully imported %s from '
                            u'%s %s.', item, instance_name, instance_id)

            else:
                dupes += 1
                LOGGER.warning(u'url-crawler: %s already in database.', item)

            # link newly created item to the item it was found into.
            item.sources.add(instance)

    LOGGER.info(u'url-crawler: crawled %s items (%s new) from %s %s.',
                len(urls) - blocked, len(urls) - blocked - dupes,
                instance_name, instance_id)