Example #1
def test_rules_supported_options():
    rules = AdblockRules(["adv", "@@advice.$~script"])
    assert not rules.should_block("http://example.com/advice.html", {'script': False})

    # exception rule should be discarded if "script" option is not supported
    rules2 = AdblockRules(["adv", "@@advice.$~script"], supported_options=[])
    assert rules2.should_block("http://example.com/advice.html", {'script': False})
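The options dict is how request context reaches `$`-suffixed rule options. Flipping the `script` flag flips the result here, per adblockparser's documented behaviour:

from adblockparser import AdblockRules

rules = AdblockRules(["adv", "@@advice.$~script"])
# The @@ exception rule applies only when the request is NOT a script:
print(rules.should_block("http://example.com/advice.html", {'script': False}))  # False
print(rules.should_block("http://example.com/advice.html", {'script': True}))   # True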
Example #2
def test_rule_exceptions(rules, results, use_re2):
    rules = AdblockRules(rules, use_re2=use_re2)

    for url in results["blocks"]:
        assert rules.should_block(url)

    for url in results["doesn't block"]:
        assert not rules.should_block(url)
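Tests like this one take `rules`, `results`, and `use_re2` as arguments, which suggests pytest parametrization; the fixtures themselves are not part of this excerpt. A minimal self-contained sketch of the shape of that data (the rule and URLs below are illustrative, not the originals):

import pytest
from adblockparser import AdblockRules

@pytest.mark.parametrize("use_re2", [False])
@pytest.mark.parametrize("rules, results", [
    (["||ads.example.com^"],
     {"blocks": ["http://ads.example.com/banner.gif"],
      "doesn't block": ["http://example.com/banner.gif"]}),
])
def test_rule_exceptions_sketch(rules, results, use_re2):
    matcher = AdblockRules(rules, use_re2=use_re2)
    for url in results["blocks"]:
        assert matcher.should_block(url)
    for url in results["doesn't block"]:
        assert not matcher.should_block(url)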
Example #3
def test_documented_examples(rule_text, results, use_re2):
    rule = AdblockRule(rule_text)
    rules = AdblockRules([rule_text], use_re2=use_re2)

    for url in results["blocks"]:
        assert rule.match_url(url)
        assert rules.should_block(url)

    for url in results["doesn't block"]:
        assert not rule.match_url(url)
        assert not rules.should_block(url)
Example #4
def main(harfile_path, domain, blockfile):
    with open(harfile_path, "r") as harfile:
        harfile_json = json.loads(harfile.read())
    with open(blockfile, "r") as bfile:
        block_list = bfile.readlines()
    rules = AdblockRules(block_list)
    adblock_db = {
        "url_data": {},
        "stats": {
            "domain": domain,
            "req": 0,
            "succ": 0,
            "block": 0
        }
    }
    options = ('image', 'xmlhttprequest', 'document', 'font', 'script', 'stylesheet', 'other')
    for entry in harfile_json['log']['entries']:
        url = entry['request']['url']
        urlparts = urlparse(url)
        print("Processing {} ...".format(url))
        try:
            fld = get_fld(url, fail_silently=True)
            adblock_db["stats"]["req"] += 1
            if fld != domain:
                d = {}
                if entry["_resourceType"] == "xhr":
                    entry["_resourceType"] = "xmlhttprequest"
                if entry["_resourceType"] not in options:
                    d = {"third-party": True, "domain": urlparts.hostname}
                else:
                    d = {entry["_resourceType"]: True, "third-party": True, "domain": urlparts.hostname}

                if rules.should_block(url, d):
                    adblock_db["stats"]["block"] += 1
                else:
                    adblock_db["stats"]["succ"] += 1
            else:
                if entry["_resourceType"] == "xhr":
                    entry["_resourceType"] = "xmlhttprequest"
                if entry["_resourceType"] not in options:
                    d = {"third-party": False, "domain": urlparts.hostname}
                else:
                    d = {entry["_resourceType"]: True, "third-party": False, "domain": urlparts.hostname}

                if rules.should_block(url, d):
                    adblock_db["stats"]["block"] += 1
                else:
                    adblock_db["stats"]["succ"] += 1
        except Exception:
            # skip entries whose URL cannot be parsed into a registrable domain
            continue
    return adblock_db
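The two branches of the loop above differ only in the `third-party` flag; a compact equivalent is sketched below (the helper name `build_options` is ours, not the original author's):

def build_options(entry, urlparts, is_third_party, supported_types):
    # HAR calls the XHR resource type "xhr"; Adblock Plus calls it "xmlhttprequest".
    rtype = entry["_resourceType"]
    if rtype == "xhr":
        rtype = "xmlhttprequest"
    d = {"third-party": is_third_party, "domain": urlparts.hostname}
    if rtype in supported_types:
        d[rtype] = True
    return d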
Example #5
def test_rule_with_options(rule_text, results, use_re2):
    rule = AdblockRule(rule_text)
    rules = AdblockRules([rule_text], use_re2=use_re2)

    for url, params, match in results:
        assert rule.match_url(url, params) == match
        assert rules.should_block(url, params) == match
Example #6
class Filter(object):
    def __init__(self, filename):
        self.rules = []
        with open(filename, "r") as blacklist:
            for line in blacklist:  # xreadlines() is Python 2 only; iterate the file directly
                if line.startswith('!'):  # comment line
                    continue
                if '##' in line:  # element-hiding (HTML) rule, not URL-based
                    continue
                self.rules.append(line)
        self.adblock = AdblockRules(self.rules, supported_options=['script', 'domain'])

    def match(self, url, options=None):
        # pass the caller's options through instead of silently dropping them
        return self.adblock.should_block(url, options)
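Hypothetical usage of the class above, assuming easylist.txt is a local copy of an Adblock Plus filter list:

flt = Filter("easylist.txt")
print(flt.match("http://example.com/banner.js", {'script': True}))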
Example #7
def main(harfile_path):
    """Reads a har file from the filesystem, converts to CSV, then dumps to
    stdout.
    """
    txt_file = 'easylist.txt'
    raw_rules = readfile(txt_file)

    with open(harfile_path, encoding='UTF-8') as harfile:
        harfile_json = json.loads(harfile.read())
    i = 0

    first_party = harfile_path.split('.')[1]+'.'+harfile_path.split('.')[2]
    rules = AdblockRules(raw_rules)
    blocked = 0
    blocked_domains = set()
    opt = {'script': True, 'image': True, 'stylesheet': True, 'object': True,
           'subdocument': True, 'xmlhttprequest': True, 'websocket': True,
           'webrtc': True, 'popup': True, 'generichide': True, 'genericblock': True}
    
    for entry in harfile_json['log']['entries']:
        i = i + 1
        url = entry['request']['url']
        urlparts = urlparse(entry['request']['url'])
        size_bytes = entry['response']['bodySize']
        size_kilobytes = float(entry['response']['bodySize'])/1024
        mimetype = 'unknown'
        if 'mimeType' in entry['response']['content']:
            mimetype = entry['response']['content']['mimeType']
        
        option = ''
        res = get_tld(url, as_object=True)
        mime_opt = mimetype.split('/')[0]

        if mime_opt in opt:
            option = mime_opt

        if res.fld != first_party and option in opt and rules.should_block(url, {option: opt[option]}):
            blocked += 1
            blocked_domains.add(res.fld)
    
    blocked_domains = list(blocked_domains) if blocked_domains else 'No domains blocked'

    print(f'\nSite: {first_party}\n# of total HTTP requests: {i}\n# of HTTP requests blocked: {blocked}\nBlocked domains: {blocked_domains}\n')
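Note that `mimetype.split('/')[0]` yields MIME top-level types such as `image`, `text`, or `application`; of these, only `image` coincides with a key of `opt`, so most entries leave `option` empty, fail the `option in opt` test, and are never checked against the rules at all.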
Example #8
def test_regex_rules():
    # Rules of the form /.../ are treated as regular expressions.
    # (No such rules appear in the EasyList filters.)
    rules = AdblockRules([r"/banner\d+/"])
    assert rules.should_block("banner123")
    assert not rules.should_block("banners")
Example #9
"""Check each path against filter for maliciousness, ads, tracking, ads and tracking, bitcoin, or pornography.
    (number of n at the end file determines the version for the file. The more the number of n, the newer the version of the file)
"""
for filt in filterlist[1:]:
    print('Start checking against: ' + filt[0] + ': ' + filt[1])
    if 'host' not in filt[1]:
        try:
            req = Request(filt[1], headers={'User-Agent': 'Mozilla/5.0'})
            raw_rules = urlopen(req).readlines()
            raw_rules2 = [x.decode('utf8') for x in raw_rules if x.decode('utf8') != '\r\n']
            raw_rules3 = []
            for raw in raw_rules2:
                raw_rules3.append(raw.replace('\n', '').replace('\r', ''))
            rules = AdblockRules(raw_rules3)
        except KeyboardInterrupt:
            raise
        except Exception:
            print('====cannot read filter====')
            raw_rules3 = ''
        if raw_rules3 != '':
            #print(raw_rules3)
            for path in array:
                if rules.should_block(path):
                    print(path + ' : Yes')
                    dictionary[path][filt[2]] = True
                else:
                    print(path + ' : No')
        print('---------------------------------')

jsonify.export_json('filterChecked_nn.json', dictionary)
Example #10
import sys
from adblockparser import AdblockRules

with open('easylist.txt', 'r') as f:
    raw_rules = f.read().splitlines()
rules = AdblockRules(raw_rules)

# Read URLs from stdin, one per line, and print a blocking decision for each.
# strip() removes the trailing newline, which would otherwise be matched too;
# flush=True replaces the original's close-and-reopen-/dev/stdout hack.
for line in sys.stdin:
    url = line.strip()
    if url:
        print(rules.should_block(url), flush=True)
Example #11
def test_rules_instantiation():
    rule = AdblockRule("adv")
    rules = AdblockRules([rule])
    assert rule.match_url("http://example.com/adv")
    assert rules.should_block("http://example.com/adv")
Example #12
    df = pd.read_csv(csvfile)
    domains = df[df.columns[1]]
    mime = df[df.columns[5]]
    for domain, m in zip(domains, mime):
        options = {}
        total += 1
        # str.find() returns -1 (which is truthy) when absent, so test membership instead
        options['image'] = 'image' in m
        options['script'] = 'javascript' in m

        if rules.should_block(domain, options):
            res = get_tld(domain, as_object=True)

            if site not in res.fld:
                if res.fld not in l:
                    l.append(res.fld)  # keep as str so '\n'.join(l) works on Python 3

            blocked += 1
        else:
            unblocked += 1

    strdom = '\n'.join(l)

    x.add_row([site, total, blocked, strdom])
print(x)
Example #13
def detect_trackers(third_parties):
    """
    Detect 3rd party trackers and return a list of them.

    :param third_parties: List of third-party requests (not: hosts) to analyze
    :return: a list of unique hosts in the form domain.tld
    """
    if len(third_parties) == 0:
        return []

    blacklist = [re.compile(r'^[|]*http[s]*[:/]*$'),  # match http[s]:// in all variations
                 re.compile(r'^[|]*ws[:/]*$'),  # match ws:// in all variations
                 re.compile(r'^\.'),  # match rules like .com
                 re.compile(r'^/'),  # match rules like /stuff
                 re.compile(r'^#'),  # match rules beginning with #
                 re.compile(r'^:'),  # match rules beginning with :
                 re.compile(r'^\?'),  # match rules beginning with ?
                 ]

    def is_acceptable_rule(rule):
        if '@' in rule:
            return False
        for exp in blacklist:
            if exp.match(rule) is not None:
                return False
        return True

    lines = []
    rules = []
    result = []
    
    start_time = timeit.default_timer()
    
    # Generate paths to files
    easylist_path = os.path.join(
        settings.SCAN_TEST_BASEPATH, 'vendor/EasyList', 'easylist.txt')
    easyprivacy_path = os.path.join(
        settings.SCAN_TEST_BASEPATH, 'vendor/EasyList', 'easyprivacy.txt')
    fanboy_path = os.path.join(
        settings.SCAN_TEST_BASEPATH, 'vendor/EasyList', 'fanboy-annoyance.txt')

    # Read in files:
    for line in open(easylist_path, 'r', encoding="utf-8"):
        lines.append(line)
    for line in open(easyprivacy_path, 'r', encoding="utf-8"):
        lines.append(line)
    for line in open(fanboy_path, 'r', encoding="utf-8"):
        lines.append(line)

    # Clean up lines:
    for line in lines:
        try:
            rule = line.split('$')[0]
            if is_acceptable_rule(rule):
                rules.append(rule)
        except Exception:
            print("Unexpected error:", sys.exc_info()[0])

    abr = AdblockRules(rules)
    
    elapsed = timeit.default_timer() - start_time
    print("Elapsed: %i secs" % elapsed)
    
    i = 0
    
    for url in third_parties:
        if abr.should_block(url):
            ext = tldextract.extract(url)
            result.append("{}.{}".format(ext.domain, ext.suffix))
        i = i + 1
        if i % 20 == 0:
            elapsed = timeit.default_timer() - start_time
            print("Checked %i domains, %i secs elapsed..." % (i, elapsed))

    return list(set(result))
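Constructing AdblockRules from the combined EasyList, EasyPrivacy and Fanboy rules is the slow step being timed above. If the pyre2 package is installed, adblockparser can compile the rule set with re2 instead of Python's re module, which is much faster at this scale (a sketch; the max_mem value is illustrative):

abr = AdblockRules(rules, use_re2=True, max_mem=512 * 1024 * 1024)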
Example #14
class AdBlockUnit(browser_unit.BrowserUnit):

    EASYLIST = 'easylist.txt'
    EASYLIST_URL = "https://easylist-downloads.adblockplus.org/easylist.txt"

    def _easylist_version(self, path=EASYLIST):
        '''
        Reads the version from the current easylist, or a file that is passed in
        '''
        if os.path.isfile(path):
            with open(path) as f:
                lines = f.read().splitlines()
                return lines[2].split(':')[1].strip()
        else:
            return -1

    def _fetch_easylist(self):
        '''
        Downloads the latest version of easylist, and if newer replaces any
        existing one.
        '''
        tmp_easylist = "tmp_" + self.EASYLIST
        cur_version = self._easylist_version()

        # download latest easylist from the Internet
        urllib.request.urlretrieve(self.EASYLIST_URL, tmp_easylist)
        tmp_version = self._easylist_version(path=tmp_easylist)

        # if necessary update
        # test cur_version first: comparing str > int raises TypeError on Python 3
        if cur_version != -1 and tmp_version > cur_version:
            os.remove(self.EASYLIST)
            shutil.move(tmp_easylist, self.EASYLIST)
            print(("Updated easylist from {} to {}".format(
                cur_version, tmp_version)))
        elif cur_version == -1:
            shutil.move(tmp_easylist, self.EASYLIST)
            print(("New easylist {}".format(tmp_version)))
        else:
            os.remove(tmp_easylist)
            print(("Easylist already up to date at: {}".format(tmp_version)))

    def _load_easylist(self):
        '''
        Reads in easylist from a file and parses it into lines to be passed to
        adblockparser.
        '''
        with open(self.EASYLIST) as f:
            lines = f.read().splitlines()
        print(("Loaded easylist version: {} with : {} items".format(
            self._easylist_version(), len(lines))))
        return lines

    def __init__(self,
                 browser="firefox",
                 log_file="log.txt",
                 unit_id=0,
                 treatment_id=0,
                 headless=False,
                 proxy=None,
                 rules=None):

        # if rules are not passed in, treat this as a bare unit that should
        # only be used to fetch easylist and parse it into adblockplus rules
        # for use with adblockparser.
        if rules is None:
            self._fetch_easylist()
            self.filterlist = self._load_easylist()
            self.rules = AdblockRules(self.filterlist)
        else:
            logging.basicConfig(filename="adb_" + log_file, level=logging.INFO)
            self.logger = logging.getLogger(__name__)

            # call parent constructor
            browser_unit.BrowserUnit.__init__(self,
                                              browser,
                                              log_file,
                                              unit_id,
                                              treatment_id,
                                              headless,
                                              proxy=proxy)

            self.session = self.driver.session_id
            print(("Running adblock unit session: {}".format(self.session)))

            # set rules to those that were passed in
            self.rules = rules
            self.all_options = {
                opt: True
                for opt in AdblockRule.BINARY_OPTIONS
            }

            # internal ad data structure
            self.data = []

            self.Ad = namedtuple('Ad', [
                'url', 'outerhtml', 'tag', 'link_text', 'link_location',
                'on_site', 'reloads'
            ])

            # dictionary to memoize url checks
            self.memo = {}

            # store current context where we are collecting ads
            self.site = ""
            self.reloads = 0

    def save_data(self):
        json_file = os.path.splitext(
            self.log_file)[0] + "." + self.session + ".json"
        with open(json_file, 'w') as outfile:
            json.dump(self.data, outfile)

        # This is the log line adblock_analysis will parse to identify data files
        self.logger.info("save_data:{}:{}:{}".format(self.unit_id,
                                                     self.treatment_id,
                                                     self.session))

    def log_element(self, element, source):
        '''
        Input: An element that has been identified as an ad and how it was identified
        Result: Inserts appropriate information into the log
        '''

        url = element.get_attribute(source)
        html = element.get_attribute('outerHTML').encode('utf-8')
        tag = element.tag_name
        link_text = element.text
        link_location = element.location

        # update internal datastore
        ad_data = self.Ad(url=url,
                          outerhtml=html,
                          tag=tag,
                          link_text=link_text,
                          link_location=link_location,
                          on_site=self.site,
                          reloads=self.reloads)

        # store to internal data structure
        self.data.append(ad_data)

        # log to plaintext log
        self.logger.debug("Ad:Data:{}".format(ad_data))

    def check_elements(self, elements, source, options=None):
        '''
        Input: Given an element in the currently active page and an attribute to query on
        Result: Queries the given attribute (source) and checks the url against the 
        filterlist. Logs any identified elements and returns the count.
        '''
        count = 0
        for e in elements:
            try:
                url = e.get_attribute(source)
                if url is not None:
                    self.logger.debug("Checking:{}:{}".format(source, url))
                    # check if we have evaluated this ad before
                    if url not in self.memo:
                        # actually check the url against the filter list
                        self.memo[url] = self.rules.should_block(url, options)

                    if self.memo[url]:
                        self.log_element(e, source)
                        count += 1

            # occurs with stale elements that no longer exist in the DOM
            except selenium.common.exceptions.StaleElementReferenceException as e:
                self.logger.error(e)
        return count

    def check_href(self):
        '''
        Identifies and captures ads based on HTML hyperlink tags.
        These are considered "text" ads.
        '''
        driver = self.driver
        ### xpath could be less performant than other find_* methods
        # common tags: <a>,<link>
        elements = driver.find_elements_by_xpath("//*[@href]")
        count = self.check_elements(elements, "href", self.all_options)
        self.logger.debug("href search found: {}".format(count))

    def check_src(self):
        '''
        Identifies and captures ads based on tags with a 'src' attribute
        These are considered "media" ads and are often img, iframe,script
        tags
        '''
        driver = self.driver
        ### xpath could be less performant than other find_* methods
        # common tags: <img>, <iframe>, <frame>, <embed>, <script>
        elements = driver.find_elements_by_xpath("//*[@src]")
        count = self.check_elements(elements, "src", self.all_options)
        self.logger.debug("src search found: {}".format(count))

    def check_iframe(self, parents=()):
        '''
        Functionality to check within nested iframes for ad related resources.
        Invariants: expects webdriver to enter at the level defined by parents
        resets webdriver to top level contents prior to leaving
        Input: a tuple describing the iframe name attribute of parent levels
        '''

        driver = self.driver
        children = driver.find_elements_by_tag_name('iframe')

        for child in children:

            try:
                driver.switch_to.frame(child)

                # check in the iframe for ads
                self.check_href()
                self.check_src()

                # set parent for children we check
                nesting = parents + (child, )
                self.check_iframe(parents=nesting)

            except selenium.common.exceptions.StaleElementReferenceException as e:
                self.logger.error(e)

            # return to correct level of nesting
            driver.switch_to_default_content()

            for p in parents:
                try:
                    driver.switch_to.frame(p)
                except selenium.common.exceptions.NoSuchElementException as e:
                    # this should not occur but just in case, preserve invariant
                    # of function leaving at top level
                    self.logger.error("resetting level in iframe recursion")
                    driver.switch_to_default_content()

        # always reset to top level content prior to exiting
        driver.switch_to_default_content()

    def find_ads(self):
        '''
        Primary convenience function to use all ad identification mechanisms
        '''
        self.check_href()
        self.check_src()
        self.check_iframe()

    def visit_url(self, url):
        driver = self.driver
        try:
            driver.get(url)
            self.logger.debug("Visited: {}".format(url))
            self.site = url
            return True
        except selenium.common.exceptions.TimeoutException as e:
            print(("Timeout Visiting: {} : {}".format(url, self.session)))
            print(e)
            return False

    def collect_ads(self, url, reloads=1, delay=0, file_name=None):
        '''
        Visits a specified url and runs ad collection functions
        Result: logs any identified ads into the internal data structure
        '''
        print(("collecting ads on: {}".format(url)))
        if file_name is None:
            file_name = self.log_file

        # number of reloads on site to capture all ads
        for r in range(reloads):
            time.sleep(delay)

            # if a successful visit
            if self.visit_url(url):
                # collect ads
                self.reloads = r
                self.find_ads()
Example #15
def process(self, instance, parameters=None, commit=True, **kwargs):
    """ See source code. """

    instance_name = instance._meta.verbose_name
    instance_id = instance.id

    urls = URL_MATCH_REGEX.findall(instance.content)

    if not urls:
        LOGGER.info(u'url-crawler: nothing to crawl in %s %s.', instance_name,
                    instance_id)
        return

    # Start with EasyList
    adblock_rules_list = requests_get(
        # WARNING: do not .split() with no parameters, else
        # adblock will block everything due to empty rules.
        'https://easylist-downloads.adblockplus.org/easylist.txt').split('\n')

    # Append our eventual specific exclusions
    adblock_rules_list.extend(
        parameters.get('integration', {}).get('fetch_content_urls',
                                              {}).get('adblock_rules', []))

    if re2 is None:
        # Things will be dog-slow…
        adblock_rules = AdblockRules(
            adblock_rules_list,
            max_mem=config.PROCESSING_ADBLOCK_MAXIMUM_MEMORY)

    else:
        # Things will go faster
        adblock_rules = AdblockRules(
            adblock_rules_list,
            use_re2=True,
            max_mem=config.PROCESSING_ADBLOCK_MAXIMUM_MEMORY)

    if isinstance(instance, models.Email):
        origin = models.ORIGINS.EMAIL

        # NOTE: there will be at least one here, else
        # accepts() would have rejected the email.
        feeds = instance.feeds.exclude(
            MailFeed___match_action=MAIL_MATCH_ACTIONS.STORE)

    else:
        origin = models.ORIGINS.CRAWLING
        feeds = instance.feeds.all()

    dupes = 0
    blocked = 0

    # LOGGER.debug('URLS: %s %s', len(urls), urls)

    for url in urls:
        if url.startswith('('):
            url = url[1:]

            if url.endswith(')'):
                # Skip Markdown's enclosing parenthesis
                # that we explicitly matched manually.
                url = url[:-1]

            # In case we've got garbage at the end of the RE.
            splitted = url.split(')')

            if len(splitted) == 1:
                pass

            elif len(splitted) == 2 and len(splitted[1]) < 4:
                # Highly probable that we got some garbage at the end.
                url = splitted[0]

            else:
                LOGGER.error(
                    u'url-crawler: probable nasty unhandled '
                    u'URL “%s” too-greedily matched by RE.', url)

        if adblock_rules.should_block(url):
            LOGGER.info(u'url-crawler: URL %s skipped, in adblocked rules.',
                        url)
            blocked += 1
            continue

        LOGGER.info('url-crawler: importing from %s.', url)

        try:
            item, created = create_item_from_url(
                url=clean_url(url),
                feeds=feeds,
                origin=origin,
            )

        except Exception:
            LOGGER.exception(u'Could not create item from URL “%s”', url)

        else:
            if created:
                LOGGER.info(
                    u'url-crawler: successfully imported %s from '
                    u'%s %s.', item, instance_name, instance_id)

            else:
                dupes += 1
                LOGGER.warning(u'url-crawler: %s already in database.', item)

            # link newly created item to the item it was found into.
            item.sources.add(instance)

    LOGGER.info(u'url-crawler: crawled %s items (%s new) from %s %s.',
                len(urls) - blocked,
                len(urls) - blocked - dupes, instance_name, instance_id)
Example #16
class AdBlockUnit(browser_unit.BrowserUnit):

    EASYLIST = 'easylist.txt'
    EASYLIST_URL = "https://easylist-downloads.adblockplus.org/easylist.txt"

    def _easylist_version(self,path=EASYLIST):
        '''
        Reads the version from the current easylist, or a file that is passed in
        '''
        if os.path.isfile(path):
            with open(path) as f:
                lines = f.read().splitlines()
                return lines[2].split(':')[1].strip()
        else:
            return -1

    def _fetch_easylist(self):
        '''
        Downloads the latest version of easylist, and if newer replaces any
        existing one.
        '''
        tmp_easylist = "tmp_"+self.EASYLIST
        cur_version = self._easylist_version()

        # download latest easylist from the Internet
        urllib.urlretrieve(self.EASYLIST_URL,tmp_easylist)
        tmp_version = self._easylist_version(path=tmp_easylist)
        
        # if necessary update
        if cur_version != -1 and tmp_version > cur_version:
            os.remove(self.EASYLIST)
            shutil.move(tmp_easylist,self.EASYLIST)
            print ("Updated easylist from {} to {}".format(cur_version,tmp_version))
        elif cur_version == -1:
            shutil.move(tmp_easylist,self.EASYLIST)
            print("New easylist {}".format(tmp_version))
        else:
            os.remove(tmp_easylist)
            print("Easylist already up to date at: {}".format(tmp_version))

    def _load_easylist(self):
        '''
        Reads in easylist from a file and parses it into lines to be passed to
        adblockparser.
        '''
        with open(self.EASYLIST) as f:
            lines = f.read().splitlines()
        print("Loaded easylist version: {} with : {} items".format(self._easylist_version(),len(lines)))
        return lines


    def __init__(self, browser="firefox", log_file="log.txt", unit_id=0, treatment_id=0, headless=False, proxy=None,rules=None):
        

        # if rules are not passed in, treat this as a bare unit that should
        # only be used to fetch easylist and parse it into adblockplus rules
        # for use with adblockparser.
        if rules is None:
            self._fetch_easylist()
            self.filterlist = self._load_easylist()
            self.rules = AdblockRules(self.filterlist)
        else:
            logging.basicConfig(filename="adb_"+log_file,level=logging.INFO)
            self.logger = logging.getLogger(__name__)

            # call parent constructor
            browser_unit.BrowserUnit.__init__(self, browser, log_file, unit_id, treatment_id, headless, proxy=proxy)

            self.session = self.driver.session_id
            print("Running adblock unit session: {}".format(self.session))
            
            # set rules to those that were passed in
            self.rules = rules
            self.all_options = {opt:True for opt in AdblockRule.BINARY_OPTIONS}

            # internal ad data structure 
            self.data = []

            self.Ad = namedtuple('Ad',['url','outerhtml','tag','link_text','link_location','on_site', 'reloads'])

            # dictionary to memoize url checks
            self.memo = {}

            # store current context where we are collecting ads
            self.site = ""
            self.reloads= 0

    def save_data(self):
        json_file = os.path.splitext(self.log_file)[0]+"."+self.session+".json"
        with open(json_file, 'w') as outfile:
            json.dump(self.data, outfile)

        # This is the log line adblock_analysis will parse to identify data files
        self.logger.info("save_data:{}:{}:{}".format(self.unit_id,self.treatment_id,self.session))

    def log_element(self,element,source):
        '''
        Input: An element that has been identified as an ad and how it was identified
        Result: Inserts appropriate information into the log
        '''
        url = element.get_attribute(source)
        html = element.get_attribute('outerHTML').encode('utf-8')
        tag = element.tag_name
        link_text = element.text
        link_location = element.location
         
        # update internal datastore
        ad_data = self.Ad(url=url, outerhtml=html, tag=tag, link_text=link_text, link_location=link_location, on_site=self.site, reloads=self.reloads)
        
        # store to internal data structure
        self.data.append(ad_data)

        # log to plaintext log
        self.logger.debug("Ad:Data:{}".format(ad_data))

    def check_elements(self, elements, source, options=None):
        '''
        Input: Given an element in the currently active page and an attribute to query on
        Result: Queries the given attribute (source) and checks the url against the 
        filterlist. Logs any identified elements and returns the count.
        '''
        count = 0
        for e in elements:
            try:
                url = e.get_attribute(source)
                if url is not None:
                    self.logger.debug("Checking:{}:{}".format(source, url))
                    # check if we have evaluated this ad before
                    if url not in self.memo:
                        # actually check the url against the filter list
                        self.memo[url] = self.rules.should_block(url, options)

                    if self.memo[url]:
                        self.log_element(e,source)
                        count+=1

            # occurs with stale elements that no longer exist in the DOM
            except selenium.common.exceptions.StaleElementReferenceException as e:
                self.logger.error(e)
        return count


    def check_href(self):
        '''
        Identifies and captures ads based on HTML hyperlink tags.
        These are considered "text" ads.
        '''
        driver = self.driver
        ### xpath could be less performant than other find_* methods
        # common tags: <a>,<link>
        elements = driver.find_elements_by_xpath("//*[@href]")
        count = self.check_elements(elements,"href", self.all_options)
        self.logger.debug("href search found: {}".format(count))
    

    def check_src(self):
        '''
        Identifies and captures ads based on tags with a 'src' attribute
        These are considered "media" ads and are often img, iframe,script
        tags
        '''
        driver = self.driver
        ### xpath could be less performant than other find_* methods
        # common tags: <img>, <iframe>, <frame>, <embed>, <script>
        elements = driver.find_elements_by_xpath("//*[@src]")
        count = self.check_elements(elements, "src", self.all_options)
        self.logger.debug("src search found: {}".format(count))


    def check_iframe(self,parents=()):
        '''
        Functionality to check within nested iframes for ad related resources.
        Invariants: expects webdriver to enter at the level defined by parents
        resets webdriver to top level contents prior to leaving
        Input: a tuple describing the iframe name attribute of parent levels
        '''

        driver = self.driver
        children = driver.find_elements_by_tag_name('iframe')

        for child in children:

            try:
                driver.switch_to.frame(child)

                # check in the iframe for ads
                self.check_href()
                self.check_src()

                # set parent for children we check
                nesting = parents + (child,)
                self.check_iframe(parents=nesting)

            except selenium.common.exceptions.StaleElementReferenceException as e:
                self.logger.error(e)

            # return to correct level of nesting
            driver.switch_to_default_content()

            for p in parents:
                try:
                    driver.switch_to.frame(p)
                except selenium.common.exceptions.NoSuchElementException as e:
                    # this should not occur but just in case, preserve invariant
                    # of function leaving at top level
                    self.logger.error("resetting level in iframe recursion")
                    driver.switch_to_default_content()


        # always reset to top level content prior to exiting
        driver.switch_to_default_content()

    def find_ads(self):
        '''
        Primary convenience function to use all ad identification mechanisms
        '''
        self.check_href()
        self.check_src()
        self.check_iframe()

    def visit_url(self,url):
        driver = self.driver
        try:
            driver.get(url)
            self.logger.debug("Visited: {}".format(url))
            self.site = url
            return True
        except selenium.common.exceptions.TimeoutException as e:
            print("Timeout Visiting: {} : {}".format(url,self.session))
            print(e)
            return False


    def collect_ads(self,url, reloads=1, delay=0, file_name=None):
        '''
        Visits a specified url and runs ad collection functions
        Result: logs any identified ads into the internal data structure
        '''
        print("collecting ads on: {}".format(url))
        if file_name is None:
            file_name = self.log_file

        # number of reloads on site to capture all ads
        for r in range(reloads):
            time.sleep(delay)

            # if a successful visit
            if self.visit_url(url):
                # collect ads
                self.reloads=r
                self.find_ads()
Example #17
def test_regex_rules():
    rules = AdblockRules(["/banner\d+/"])
    assert rules.should_block("banner123")
    assert not rules.should_block("banners")
Example #18
def process(self, instance, parameters=None, commit=True, **kwargs):
    """ See source code. """

    instance_name = instance._meta.verbose_name
    instance_id = instance.id

    urls = URL_MATCH_REGEX.findall(instance.content)

    if not urls:
        LOGGER.info(u'url-crawler: nothing to crawl in %s %s.',
                    instance_name, instance_id)
        return

    # Start with EasyList
    adblock_rules_list = requests_get(
        # WARNING: do not .split() with no parameters, else
        # adblock will block everything due to empty rules.
        'https://easylist-downloads.adblockplus.org/easylist.txt').split('\n')

    # Append our eventual specific exclusions
    adblock_rules_list.extend(
        parameters.get('integration', {})
                  .get('fetch_content_urls', {})
                  .get('adblock_rules', []))

    if re2 is None:
        # Things will be dog-slow…
        adblock_rules = AdblockRules(
            adblock_rules_list,
            max_mem=config.PROCESSING_ADBLOCK_MAXIMUM_MEMORY)

    else:
        # Things will go faster
        adblock_rules = AdblockRules(
            adblock_rules_list, use_re2=True,
            max_mem=config.PROCESSING_ADBLOCK_MAXIMUM_MEMORY)

    if isinstance(instance, models.Email):
        origin = models.ORIGINS.EMAIL

        # NOTE: there will be at least one here, else
        # accepts() would have rejected the email.
        feeds = instance.feeds.exclude(
            MailFeed___match_action=MAIL_MATCH_ACTIONS.STORE)

    else:
        origin = models.ORIGINS.CRAWLING
        feeds = instance.feeds.all()

    dupes = 0
    blocked = 0

    # LOGGER.debug('URLS: %s %s', len(urls), urls)

    for url in urls:
        if url.startswith('('):
            url = url[1:]

            if url.endswith(')'):
                # Skip Markdown's enclosing parenthesis
                # that we explicitly matched manually.
                url = url[:-1]

            # In case we've got garbage at the end of the RE.
            splitted = url.split(')')

            if len(splitted) == 1:
                pass

            elif len(splitted) == 2 and len(splitted[1]) < 4:
                # Highly probable that we got some garbage at the end.
                url = splitted[0]

            else:
                LOGGER.error(u'url-crawler: probable nasty unhandled '
                             u'URL “%s” too-greedily matched by RE.',
                             url)

        if adblock_rules.should_block(url):
            LOGGER.info(u'url-crawler: URL %s skipped, in adblocked rules.',
                        url)
            blocked += 1
            continue

        LOGGER.info('url-crawler: importing from %s.', url)

        try:
            item, created = create_item_from_url(
                url=clean_url(url), feeds=feeds, origin=origin,
            )

        except Exception:
            LOGGER.exception(u'Could not create item from URL “%s”', url)

        else:
            if created:
                LOGGER.info(u'url-crawler: successfully imported %s from '
                            u'%s %s.', item, instance_name, instance_id)

            else:
                dupes += 1
                LOGGER.warning(u'url-crawler: %s already in database.', item)

            # link newly created item to the item it was found into.
            item.sources.add(instance)

    LOGGER.info(u'url-crawler: crawled %s items (%s new) from %s %s.',
                len(urls) - blocked, len(urls) - blocked - dupes,
                instance_name, instance_id)
Example #19
def check_if_ad(url):
    with open('easylist.txt', 'r') as file:
        raw_rules = file.readlines()
    rules = AdblockRules(raw_rules)
    return rules.should_block(url)
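Rebuilding the rule set from easylist.txt on every call is the expensive part of the function above. A cached variant (a sketch; the helper names are ours):

from functools import lru_cache

from adblockparser import AdblockRules


@lru_cache(maxsize=1)
def _easylist_rules():
    # Parse easylist.txt once; reuse the compiled rules on later calls.
    with open('easylist.txt', 'r') as file:
        return AdblockRules(file.readlines())


def check_if_ad_cached(url):
    return _easylist_rules().should_block(url)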
Example #20
def test_rules_with_options(rules, results, use_re2):
    rules = AdblockRules(rules, use_re2=use_re2)
    for url, params, should_block in results:
        assert rules.should_block(url, params) == should_block
Example #21
        content = entry['response']['content']
        if content.get('mimeType') is not None:
            mimt[url] = content.get('mimeType')

        
if __name__ == '__main__':
    argparser = argparse.ArgumentParser(
        prog='parsehar',
        description='Parse .har files into comma separated values (csv).')
    argparser.add_argument('harfile', type=str, nargs=2,
                        help='path to harfile to be processed.')
    args = argparser.parse_args()
    current_site=args.harfile[1]
    main(args.harfile[0])

read_files()
rules = AdblockRules(raw_rules,
                     supported_options=['third-party', 'script', 'image', 'stylesheet',
                                        'domain', 'object', 'subdocument', 'xmlhttprequest',
                                        'websocket', 'webrtc', 'popup', 'generichide',
                                        'genericblock'],
                     skip_unsupported_rules=False)
print(rules)
no_sites_blocked=0
for dm in domains:
    options = write_options()
    if rules.should_block(dm, options):
        no_sites_blocked += 1
        get_blocked_sites_domain(dm)

print(tabulate([[current_site, total_no_sites, no_sites_blocked, blocked_sites]],
               headers=['Site',
                        '# of total HTTP requests',
                        '# of HTTP requests blocked',
                        'Third-party domains (not URL) blocked']))

Example #22
        raw_rules.append(line)

with open("/user/ifouad/home/PycharmProjects/OpenWpm/addblock/easyprivacy",
          'r') as f:
    for line in f:
        raw_rules_easyp.append(line)
rules = AdblockRules(raw_rules)
rules_priv = AdblockRules(raw_rules_easyp)
db2 = sys.argv[1]
conn = sqlite3.connect(db2)
curr = conn.cursor()
for site_id, link_id, response_id, url in curr.execute(
        'select site_id, link_id, response_id, url from http_responses where link_id = 0 order by site_id ASC'
).fetchall():
    print(site_id, link_id, url)
    if rules.should_block(url):
        cur2.execute(
            "insert into blocked (site_id , link_id  , resp_id , url, list ) Values (?,?,?,?,?)",
            (site_id, link_id, response_id, url, "easylist"))
    elif rules_priv.should_block(url):
        cur2.execute(
            "insert into blocked (site_id , link_id  , resp_id , url, list ) Values (?,?,?,?,?)",
            (site_id, link_id, response_id, url, "easyprivacy"))
    else:
        cur2.execute(
            "insert into blocked (site_id , link_id  , resp_id , url ) Values (?,?,?,?)",
            (site_id, link_id, response_id, url))
    '''        
    print rules.should_block("http://search.ch/htmlbanner.html")
    print rules.should_block("g.doubleclick.net")
    print rules.should_block("http://ads.example.com/notbanner", {'script': False})
Example #23
# Load url dataset

print("Loading URLs...")

urls = []

with open(URL_PATH, 'r') as infile:
    for line in infile:
        url, _ = line.strip().split("\t")
        url = url[1:-2]
        urls.append(url)

print(len(urls), "URLs loaded!")

# Make a mapping from urls to whether they should be blocked

print("Parsing URLs...")

block_map = Counter()

for url in urls:
    block_map[url] = rules.should_block(url)

print("Finished!")

# Save mapping to pickle

with open(URL_MAP_PKL_PATH, 'wb') as output:  # Overwrites any existing file.
    pickle.dump(block_map, output, pickle.HIGHEST_PROTOCOL)
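Because the mapping stores booleans, aggregate counts fall out of it directly (a small follow-on sketch):

blocked_total = sum(1 for is_blocked in block_map.values() if is_blocked)
print("{} of {} URLs would be blocked".format(blocked_total, len(block_map)))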

Example #24
            if "http" in url:
                third_party_tld = getDomain(url)
                if visited_tld != third_party_tld:
                    # check if third-party
                    temp_orga = get_organisation(blocklist_json, third_party_tld)
                    if temp_orga is not None:
                        if temp_orga not in all_third_party_orgas:
                            all_third_party_orgas.append(temp_orga)
                        if temp_orga not in third_party_sites:
                            third_party_sites.append(temp_orga)
                    else:
                        if third_party_tld not in all_third_party_orgas:
                            all_third_party_orgas.append(third_party_tld)
                        if third_party_tld not in third_party_sites:
                            third_party_sites.append(third_party_tld)
                    if rules.should_block(url):
                        # check if tracking site
                        if temp_orga is not None:
                            if temp_orga not in all_ad_and_tracking_orgas:
                                all_ad_and_tracking_orgas.append(temp_orga)
                            if temp_orga not in tracking_sites:
                                tracking_sites.append(temp_orga)
                        else:
                            if third_party_tld not in all_ad_and_tracking_orgas:
                                all_ad_and_tracking_orgas.append(third_party_tld)
                            if third_party_tld not in tracking_sites:
                                tracking_sites.append(third_party_tld)
            else:
                raise ValueError('http is not in url!', url)

        resObject["third_party_sites"] = third_party_sites
Example #25
    soup = BeautifulSoup(body, features='html.parser')
    scripts = soup.find_all('script')
    srcs = [link['src'] for link in scripts if 'src' in link.attrs]
    # Test with known dodgy URL
    # srcs.append('//pushlat.com/ntfc.php?p=1273711139')

    # Set up caching
    sess = CacheControl(requests.Session(), FileCache(args.cachedir))

    response = sess.get(args.blacklist)
    rules = AdblockRules(response.text.splitlines(),
                         supported_options=['third-party'],
                         skip_unsupported_rules=False)
    options = {'third-party': True}
    for src in srcs:
        if rules.should_block(src, options):
            crit_msg.append(args.url +
                            " contains dodgy 'script src' parameter: " + src)
        else:
            scanned_srcs.append(src)

    ok_msg.append("None of the " + str(len(scanned_srcs)) +
                  " found 'script src' URLs on " + args.url +
                  " are listed in " + args.blacklist)
    if args.verbose:
        ok_msg.append("\n".join(scanned_srcs))

except Exception as e:
    nagios_exit("UNKNOWN: Unknown error: {0}.".format(e), 3)

# Exit with accumulated message(s)