def __init__(self, browser="firefox", log_file="log.txt", unit_id=0,
             treatment_id=0, headless=False, proxy=None, rules=None):
    """Create an ad-blocking browser unit.

    If *rules* is None this is a "bare" unit: it only fetches EasyList and
    parses it into adblockparser rules (no browser is started).  Otherwise
    the parent ``browser_unit.BrowserUnit`` is initialised and the passed
    *rules* are used as-is.

    Fix: compare against None with ``is`` (identity), not ``==``.
    """
    if rules is None:
        # Bare unit: fetch and parse easylist only; no browser session.
        self._fetch_easylist()
        self.filterlist = self._load_easylist()
        self.rules = AdblockRules(self.filterlist)
    else:
        logging.basicConfig(filename="adb_" + log_file, level=logging.INFO)
        self.logger = logging.getLogger(__name__)
        # call parent constructor (starts the browser/driver)
        browser_unit.BrowserUnit.__init__(self, browser, log_file, unit_id,
                                          treatment_id, headless, proxy=proxy)
        self.session = self.driver.session_id
        print("Running adblock unit session: {}".format(self.session))
        # set rules to those that were passed in
        self.rules = rules
        # enable every binary adblock option when matching
        self.all_options = {opt: True for opt in AdblockRule.BINARY_OPTIONS}
        # internal ad data structure
        self.data = []
        self.Ad = namedtuple('Ad', ['url', 'outerhtml', 'tag', 'link_text',
                                    'link_location', 'on_site', 'reloads'])
        # dictionary to memoize url checks
        self.memo = {}
        # current context (site / reload counter) where ads are collected
        self.site = ""
        self.reloads = 0
def test_rule_with_options(rule_text, results, use_re2):
    """A single rule and a one-rule rule set must agree on every case."""
    single = AdblockRule(rule_text)
    ruleset = AdblockRules([rule_text], use_re2=use_re2)
    for url, params, expected in results:
        assert single.match_url(url, params) == expected
        assert ruleset.should_block(url, params) == expected
def test_rule_exceptions(rules, results, use_re2):
    """URLs listed under 'blocks' are blocked; the others are not."""
    ruleset = AdblockRules(rules, use_re2=use_re2)
    assert all(ruleset.should_block(u) for u in results["blocks"])
    assert not any(ruleset.should_block(u) for u in results["doesn't block"])
def test_documented_examples(rule_text, results, use_re2):
    """Examples from the filter documentation behave as documented."""
    single = AdblockRule(rule_text)
    ruleset = AdblockRules([rule_text], use_re2=use_re2)
    for blocked_url in results["blocks"]:
        assert single.match_url(blocked_url)
        assert ruleset.should_block(blocked_url)
    for allowed_url in results["doesn't block"]:
        assert not single.match_url(allowed_url)
        assert not ruleset.should_block(allowed_url)
def __init__(self, filename):
    """Load adblock filter rules from *filename*.

    Comment lines (starting with '!') and element-hiding rules
    (containing '##') are skipped; the rest is fed to AdblockRules
    with the 'script' and 'domain' options enabled.
    """
    self.rules = []
    with open(filename, "r") as blacklist:
        # Iterate the file object directly: ``xreadlines()`` is
        # Python-2-only and was removed in Python 3; plain iteration
        # over the file is the equivalent, portable form.
        for line in blacklist:
            if line.startswith('!'):
                # comment line
                continue
            if '##' in line:
                # HTML (element-hiding) rule -- not supported here
                continue
            self.rules.append(line)
    self.adblock = AdblockRules(self.rules,
                                supported_options=['script', 'domain'])
def main(harfile_path, domain, blockfile):
    """Classify every request in a HAR file against an adblock block list.

    Parameters:
        harfile_path: path to a HAR capture (JSON).
        domain: the first-party registered domain of the capture.
        blockfile: path to a file of adblock filter rules, one per line.

    Returns a dict with a (currently unused) per-url map and aggregate
    stats: total requests seen, requests that would succeed, and requests
    the rules would block.
    """
    # Close file handles deterministically (the original leaked both).
    with open(harfile_path, "r") as harfile:
        harfile_json = json.loads(harfile.read())
    with open(blockfile, "r") as bfile:
        block_list = bfile.readlines()

    rules = AdblockRules(block_list)
    adblock_db = {
        "url_data": {},
        "stats": {"domain": domain, "req": 0, "succ": 0, "block": 0},
    }
    # resource types adblock rules understand as binary options
    options = ('image', 'xmlhttprequest', 'document', 'font', 'script',
               'stylesheet', 'other')

    for entry in harfile_json['log']['entries']:
        url = entry['request']['url']
        urlparts = urlparse(url)
        print("Processing {} ...".format(url))
        try:
            fld = get_fld(url, fail_silently=True)
            adblock_db["stats"]["req"] += 1
            third_party = fld != domain
            # HAR names it "xhr"; the adblock option is "xmlhttprequest".
            if entry["_resourceType"] == "xhr":
                entry["_resourceType"] = "xmlhttprequest"
            # The first-/third-party branches only differed in this flag,
            # so the duplicated logic is merged here.
            d = {"third-party": third_party, "domain": urlparts.hostname}
            if entry["_resourceType"] in options:
                d[entry["_resourceType"]] = True
            if rules.should_block(url, d):
                adblock_db["stats"]["block"] += 1
            else:
                adblock_db["stats"]["succ"] += 1
        except Exception:
            # Best effort: skip malformed entries (e.g. missing
            # _resourceType) instead of aborting the whole run.  The
            # original bare ``except:`` also swallowed KeyboardInterrupt.
            continue
    return adblock_db
def UrlBlock():
    """Initialise the global adblock rule set from sets.ADBLOCKER.

    ADBLOCKER == 1 enables blocking (rules read from hosts3.txt),
    ADBLOCKER == 0 disables it.  Any other value is reported and
    normalised to 0.
    """
    global rules, adpath
    if sets.ADBLOCKER == 1:
        # ``file()`` is Python-2-only; use open() and close the handle
        # (AdblockRules consumes the iterable in its constructor).
        with open('hosts3.txt') as hosts:
            rules = AdblockRules(hosts)
        adpath = ['zoneid=', 'auction_id=', 'campaign_id', 'offer?']
        sys.stdout.write("Adblocking is ON\r\n")
    elif sets.ADBLOCKER == 0:
        rules = AdblockRules([])
        sys.stdout.write("Adblocking is OFF\r\n")
    else:
        print("Invalid value for Blocking, try 0 or 1, will use 0")
        # Bug fix: the original used ``==`` (a no-op comparison) where an
        # assignment was intended, so the invalid value was never reset.
        # NOTE(review): ``rules`` is still left unset on this path, as in
        # the original -- confirm callers re-invoke UrlBlock() afterwards.
        sets.ADBLOCKER = 0
def get_adblock_rules():
    """Build AdblockRules objects from the bundled EasyList and EasyPrivacy files."""
    easylist_lines = read_ab_rules_from_file("blocklists/easylist.txt")
    easyprivacy_lines = read_ab_rules_from_file("blocklists/easyprivacy.txt")
    print("Loaded %s from EasyList, %s rules from EasyPrivacy"
          % (len(easylist_lines), len(easyprivacy_lines)))
    return AdblockRules(easylist_lines), AdblockRules(easyprivacy_lines)
def main(urlfile=None):
    """Crawl URLs (from *urlfile* or a Twitter stream) and report progress.

    Spawns one gatherer thread (when streaming) plus six crawler threads,
    then loops forever printing queue/fetch statistics once per second.
    """
    # Close file handles deterministically (the original leaked them).
    with open(rule_file) as f:
        rules = AdblockRules(f.read().splitlines())
    urls = UrlList(max=max_url_list_size, filter=url_filter)
    saved = UrlList()
    if urlfile:
        with open(urlfile, "r") as f:
            urls.list = f.read().splitlines()
    else:
        auth = OAuthHandler(consumer_key, consumer_secret)
        auth.set_access_token(access_token, access_token_secret)
        gt = Thread(target=gather_urls, args=(auth, urls))
        gt.start()
    cts = []
    for _ in range(6):
        ct = Thread(target=crawl_urls, args=(urls, saved, rules))
        # Bug fix: the original did ``cts.append(cts)`` -- appending the
        # list to itself instead of keeping the thread handle.
        cts.append(ct)
        ct.start()
    global fetched_pages, fetched_jses
    while True:
        print("[pages] queued:%d fetched:%d [js] fetched:%d saved:%d"
              % (len(urls.list), fetched_pages, fetched_jses, len(saved.list)),
              end="\033[K\r")
        sleep(1)
def load_easylist():
    """Fetches the most recent easylist and returns an AdblockRules
    object using it."""
    response = requests.get("https://easylist.to/easylist/easylist.txt")
    response.raise_for_status()
    return AdblockRules(response.text.splitlines())
def get_rules():
    """Load EasyList rules from EASYLIST_FILE and return an AdblockRules object."""
    # Use a context manager so the handle is closed (the original leaked
    # it), and strip trailing CR/LF in one pass instead of two chained
    # replace() calls (readlines() only leaves them at the line end).
    with codecs.open(EASYLIST_FILE, 'r', 'utf8') as f:
        raw_rules = [line.rstrip('\r\n') for line in f]
    return AdblockRules(raw_rules)
def get_adblock_rules():
    """Parse EasyList, EasyPrivacy and (when enabled) uBlock filter files
    into three AdblockRules objects."""
    easylist_lines = read_ab_rules_from_file("adblock/easylist.txt")
    easyprivacy_lines = read_ab_rules_from_file("adblock/easyprivacy.txt")
    ublock_lines = (read_ab_rules_from_file("adblock/adblock_blacklist_white.txt")
                    if ENABLE_UBLOCK else [])
    print(
        "Loaded %s from EasyList, %s rules from EasyPrivacy"
        " and %s rules from UBlockOrigin"
        % (len(easylist_lines), len(easyprivacy_lines), len(ublock_lines)))
    return (AdblockRules(easylist_lines),
            AdblockRules(easyprivacy_lines),
            AdblockRules(ublock_lines))
def load_rules(blocklists=None):
    """Compile the combined blocklists into an AdblockRules object
    (re2 backend, 512 MiB regex memory cap)."""
    max_mem_bytes = 512 * 1024 * 1024
    return AdblockRules(combined(blocklists),
                        use_re2=True,
                        max_mem=max_mem_bytes)
class EasyListHandler:
    """Checks URLs against the bundled easylist.txt filter rules.

    The rule set is parsed once at class-definition time and shared by
    all instances.
    """
    with open('easylist.txt', 'r', encoding='UTF-8') as f:
        rules = AdblockRules(list(f))

    def is_harmful_url(self, url):
        """Return True if *url* matches the easylist rules.

        Bug fix: the original ignored its *url* argument and instead
        read a line from stdin via ``input()``.
        """
        return EasyListHandler.rules.should_block(url)
def main(harfile_path):
    """Read a HAR file and report how many third-party requests EasyList blocks.

    Prints the first-party site, the total number of HTTP requests, the
    number of requests the rules block, and the blocked registered domains.
    """
    raw_rules = readfile('easylist.txt')
    # Close the HAR file handle (the original leaked it).
    with open(harfile_path, encoding='UTF-8') as harfile:
        harfile_json = json.loads(harfile.read())
    # e.g. "www.example.com.har" -> first party "example.com"
    first_party = harfile_path.split('.')[1] + '.' + harfile_path.split('.')[2]
    rules = AdblockRules(raw_rules)
    total = 0
    blocked = 0
    blocked_domains = set()
    # adblock binary options we are willing to derive from mime types
    opt = {'script': True, 'image': True, 'stylesheet': True, 'object': True,
           'subdocument': True, 'xmlhttprequest': True, 'websocket': True,
           'webrtc': True, 'popup': True, 'generichide': True,
           'genericblock': True}
    for entry in harfile_json['log']['entries']:
        total += 1
        url = entry['request']['url']
        res = get_tld(url, as_object=True)
        # map the response mime type's major part onto an adblock option
        mimetype = 'unknown'
        if 'mimeType' in entry['response']['content']:
            mimetype = entry['response']['content']['mimeType']
        mime_opt = mimetype.split('/')[0]
        option = mime_opt if mime_opt in opt else ''
        # '' is not a key of opt, so unknown types never match -- same
        # behaviour as the original.  (Unused size/urlparse locals removed.)
        if (res.fld != first_party and option in opt
                and rules.should_block(url, {option: opt[option]})):
            blocked += 1
            blocked_domains.add(res.fld)
    blocked_domains = [dom for dom in blocked_domains] if blocked_domains else 'No domains blocked'
    print(f'\nSite: {first_party}\n# of total HTTP requests: {total}\n'
          f'# of HTTP requests blocked: {blocked}\nBlocked domains: {blocked_domains}\n')
def test_rules_supported_options():
    """Exception rules relying on unsupported options must be dropped."""
    with_options = AdblockRules(["adv", "@@advice.$~script"])
    assert not with_options.should_block("http://example.com/advice.html",
                                         {'script': False})
    # With no supported options, the "@@advice.$~script" exception rule is
    # discarded, so the generic "adv" rule blocks the URL.
    without_options = AdblockRules(["adv", "@@advice.$~script"],
                                   supported_options=[])
    assert without_options.should_block("http://example.com/advice.html",
                                        {'script': False})
def get_rules():
    "Loads Adblock filter rules from file."
    from adblockparser import AdblockRules
    # start from the locally configured extra rules, then stream EasyList
    raw_rules = list(ADBLOCK_RULES)
    with closing(requests.get(ADBLOCK_EASYLIST, stream=True)) as response:
        for rule in response.iter_lines():
            raw_rules.append(rule.strip())
    return AdblockRules(raw_rules)
def load_gfwlist():
    """Decompress the bundled GFWList and return a URL-blocking predicate.

    Returns ``filt.should_block`` when the list can be compiled; when re2
    is unavailable (and USE_GFWLIST_ANYWAY is off) returns a predicate
    that blocks nothing.
    """
    global gfwlist_loaded
    with open('gfwlist.txt.gzipped', 'rb') as f:
        rules = zlib.decompress(f.read()).decode().split('\n')
    print('utils: loading GFWList rule')
    if const.USE_GFWLIST_ANYWAY:
        filt = AdblockRules(rules)
        if not filt.uses_re2:
            print('utils: warning: GFWList is not using re2. '
                  'THE EFFICIENCY IS NOT GUARANTEED!')
        gfwlist_loaded = True
        return filt.should_block
    try:
        filt = AdblockRules(rules, use_re2=True, max_mem=const.RE2_MAX_MEM)
    except ImportError:
        print('utils: warning: GFWList is disabled unless you have pyre2 installed')
        return lambda _: False
    gfwlist_loaded = True
    return filt.should_block
def load_rules(blocklists=("easylist.txt", "easyprivacy.txt",
                           "fanboy-annoyance.txt", "fanboy-social.txt")):
    """Load adblock rules from the given blocklist files.

    The default is a tuple rather than a list to avoid the shared
    mutable-default-argument pitfall (assumes ``combined`` only iterates
    its argument -- TODO confirm).  The single-argument ``print(...)``
    call works under both Python 2 and Python 3, replacing the
    Python-2-only print statement.
    """
    print("Loading rules: %s" % (blocklists,))
    rules = AdblockRules(combined(blocklists),
                         use_re2=True,
                         supported_options=['script', 'domain', 'image',
                                            'stylesheet', 'object'])
    return rules
class Filter(object):
    """Adblock filter built from a blacklist file.

    Comment lines (starting with '!') and element-hiding rules
    (containing '##') are skipped; everything else is fed to
    AdblockRules with the 'script' and 'domain' options enabled.
    """

    def __init__(self, filename):
        self.rules = []
        with open(filename, "r") as blacklist:
            # Iterate the file directly: ``xreadlines()`` is
            # Python-2-only and was removed in Python 3.
            for line in blacklist:
                if line.startswith('!'):
                    # comment line
                    continue
                if '##' in line:
                    # HTML (element-hiding) rule -- not supported here
                    continue
                self.rules.append(line)
        self.adblock = AdblockRules(self.rules,
                                    supported_options=['script', 'domain'])

    def match(self, url, options=None):
        """Return True if *url* should be blocked.

        NOTE(review): *options* is accepted but currently ignored -- it
        is not forwarded to ``should_block``.  Kept as-is to preserve the
        existing behaviour for callers.
        """
        return self.adblock.should_block(url)
def run(argv=None):
    """Dataflow pipeline: classify request URLs against EasyList-style filters.

    Reads (page, url) rows from the BigQuery *input* table, tags each URL
    as ad / tracker / social via adblock rule sets, and writes the results
    to the *output* table.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', dest='input', required=True,
                        help='BigQuery request input table.')
    parser.add_argument('--output', dest='output',
                        help='BigQuery output table.')
    known_args, pipeline_args = parser.parse_known_args(argv)
    output_table = '%s' % known_args.output
    input_query = """
        SELECT page, url, DOMAIN(page) as domain,
               IF (DOMAIN(page) == DOMAIN(url), false, true) AS third_party,
        FROM [%s]
    """ % known_args.input

    classifiers = {}
    # Renamed loop variable: the original shadowed the builtin ``file``.
    for list_name in ['ad', 'tracker', 'social']:
        # Close each rule file promptly instead of leaking the handle.
        with open('local/' + list_name + '.txt') as f:
            rules = [line.rstrip('\n') for line in f]
        classifiers[list_name] = AdblockRules(
            rules,
            supported_options=['domain', 'third-party'],
            skip_unsupported_rules=False,
            use_re2=True)
        # free the raw rule lines early; only the compiled object is kept
        del rules

    p = df.Pipeline(argv=pipeline_args)
    (p
     | df.Read('read', df.io.BigQuerySource(query=input_query))
     | df.ParDo('classify', EasylistClassifyDoFn(), classifiers)
     | df.Write('write', df.io.BigQuerySink(
         output_table,
         schema='page:STRING, url:STRING, type:STRING',
         create_disposition=df.io.BigQueryDisposition.CREATE_IF_NEEDED,
         write_disposition=df.io.BigQueryDisposition.WRITE_TRUNCATE)))
    p.run()
def _load_rules(self):
    """Load (and cache) the AdblockRules built from the EasyList files.

    Three caching layers, checked in order:
      1. a process-global cache keyed on the newest mtime of the files,
      2. an optional pickle cache file (``adblockrules_cache`` option),
      3. a full parse of the rule files.
    Sets ``self.rules`` and refreshes the global cache.
    """
    global _adblock_rules_cache
    easylist_files = [
        EASYLIST_PATH / filename for filename in EASYLIST_FILES
    ]
    mtime = max(path.stat().st_mtime for path in easylist_files)

    # Fast path: reuse the in-process cache when it is new enough.
    if (_adblock_rules_cache is not None
            and _adblock_rules_cache['mtime'] >= mtime):
        self.rules = _adblock_rules_cache['rules']
        return

    cache_file = self.options.get('adblockrules_cache')
    if cache_file:
        cache_file = Path(cache_file)
    if cache_file and cache_file.exists() and cache_file.stat().st_mtime >= mtime:
        with cache_file.open('rb') as f:
            rules = pickle.load(f)
    else:
        lines = []
        for easylist_file in easylist_files:
            # Bug fix: ``easylist_file`` is already EASYLIST_PATH/filename;
            # the original re-joined it with EASYLIST_PATH a second time
            # (harmless only when EASYLIST_PATH is absolute).  Also close
            # the file handle via a context manager.
            with easylist_file.open() as f:
                for line in f:
                    # Lines with @@ are exceptions which are not blocked
                    # even if other adblocking rules match. This is done
                    # to fix a few sites. We do not need those exceptions.
                    if line.startswith('@@'):
                        continue
                    lines.append(line)
        rules = AdblockRules(lines)
        if cache_file:
            with cache_file.open('wb') as f:
                pickle.dump(rules, f, pickle.HIGHEST_PROTOCOL)

    _adblock_rules_cache = {'mtime': mtime, 'rules': rules}
    self.rules = rules
} """Check each path against filter for maliciousness, ads, tracking, ads and tracking, bitcoin, or pornography. (number of n at the end file determines the version for the file. The more the number of n, the newer the version of the file) """ for filt in filterlist[1:]: print('Start checking against: ' + filt[0] + ': ' + filt[1]) if 'host' not in filt[1]: try: req = Request(filt[1], headers={'User-Agent': 'Mozilla/5.0'}) raw_rules = urlopen(req).readlines() raw_rules2 = [x.decode('utf8') for x in raw_rules if x.decode('utf8') != '\r\n'] raw_rules3 = [] for raw in raw_rules2: raw_rules3.append(raw.replace('\n', '').replace('\r', '')) rules = AdblockRules(raw_rules3) except KeyboardInterrupt: raise except: print('====cannot read filter====') raw_rules3 = '' if raw_rules3 != '': #print(raw_rules3) for path in array: if rules.should_block(path) is True: print(path + ' : Yes') dictionary[path][filt[2]] = True else: print(path + ' : No') print('---------------------------------')
def test_regex_rules():
    """A /regex/ rule matches per its pattern, not as a literal substring."""
    ruleset = AdblockRules([r"/banner\d+/"])
    assert ruleset.should_block("banner123")
    assert not ruleset.should_block("banners")
def test_rules_with_options(rules, results, use_re2):
    """Each (url, params) case must match its expected blocking decision."""
    ruleset = AdblockRules(rules, use_re2=use_re2)
    for url, params, expected in results:
        assert ruleset.should_block(url, params) == expected
import adblockparser
import sys
from adblockparser import AdblockRules

# Build the rule set once from the local EasyList copy.
# (Context manager closes the handle; ``all`` renamed -- it shadowed the
# builtin.)
with open('easylist.txt', 'r') as f:
    rule_lines = f.read().splitlines()
rules = AdblockRules(rule_lines)

# Answer one should-block query per stdin line.  flush() after each
# answer so a parent process reading our stdout sees it immediately
# (replaces the original close-and-reopen-/dev/stdout hack, which was
# non-portable).
while True:
    line = sys.stdin.readline()
    if not line:
        # EOF: readline() returns '' forever -- the original spun in an
        # infinite loop here.
        break
    # NOTE(review): the trailing newline from readline() is passed to
    # should_block, as in the original -- confirm whether it should be
    # stripped.
    print(rules.should_block(line))
    sys.stdout.flush()
class AdBlockUnit(browser_unit.BrowserUnit):
    """Browser unit that identifies ads on visited pages using EasyList.

    Fetches/parses the EasyList filter file, then drives a Selenium
    browser and records page elements whose href/src URLs match the
    filter rules.  (Python-2-era code: ``urllib.urlretrieve``,
    ``print e``.)
    """

    # local cache filename for EasyList and its upstream download URL
    EASYLIST = 'easylist.txt'
    EASYLIST_URL = "https://easylist-downloads.adblockplus.org/easylist.txt"

    def _easylist_version(self,path=EASYLIST):
        '''
        Reads the version from the current easylist, or a file that is
        passed in.  Returns -1 when the file does not exist.
        '''
        if os.path.isfile(path):
            with open(path) as f:
                lines = f.read().splitlines()
                # the third line of EasyList carries "! Version: NNN";
                # keep only the value after the colon
                return lines[2].split(':')[1].strip()
        else:
            return -1

    def _fetch_easylist(self):
        '''
        Downloads the latest version of easylist, and if newer replaces
        any existing one.
        '''
        tmp_easylist = "tmp_"+self.EASYLIST
        cur_version = self._easylist_version()
        # download latest easylist from the Internet
        urllib.urlretrieve(self.EASYLIST_URL,tmp_easylist)
        tmp_version = self._easylist_version(path=tmp_easylist)
        # if necessary update
        # NOTE(review): versions are compared with ``>`` as returned by
        # _easylist_version (strings) -- confirm lexicographic comparison
        # is adequate for EasyList version stamps.
        if tmp_version > cur_version and cur_version != -1:
            os.remove(self.EASYLIST)
            shutil.move(tmp_easylist,self.EASYLIST)
            print ("Updated easylist from {} to {}".format(cur_version,tmp_version))
        elif cur_version == -1:
            # no local copy existed yet; keep the downloaded one
            shutil.move(tmp_easylist,self.EASYLIST)
            print("New easylist {}".format(tmp_version))
        else:
            # local copy is current; discard the download
            os.remove(tmp_easylist)
            print("Easylist already up to date at: {}".format(tmp_version))

    def _load_easylist(self):
        '''
        Reads in easylist from a file and parses it into lines to be
        passed to adblockparser.
        '''
        with open(self.EASYLIST) as f:
            lines = f.read().splitlines()
        print("Loaded easylist version: {} with : {} items".format(self._easylist_version(),len(lines)))
        return lines

    def __init__(self, browser="firefox", log_file="log.txt", unit_id=0, treatment_id=0, headless=False, proxy=None,rules=None):
        # if easylist is not passed in, then consider this is a bare unit
        # that should only be used to fetch easylist and then parse into
        # adblockplus rules for use with adblockparser.
        if rules == None:
            self._fetch_easylist()
            self.filterlist = self._load_easylist()
            self.rules = AdblockRules(self.filterlist)
        else:
            logging.basicConfig(filename="adb_"+log_file,level=logging.INFO)
            self.logger = logging.getLogger(__name__)
            # call parent constructor (starts the browser/driver)
            browser_unit.BrowserUnit.__init__(self, browser, log_file, unit_id, treatment_id, headless, proxy=proxy)
            self.session = self.driver.session_id
            print("Running adblock unit session: {}".format(self.session))
            # set rules to those that were passed in
            self.rules = rules
            # enable every binary adblock option when matching
            self.all_options = {opt:True for opt in AdblockRule.BINARY_OPTIONS}
            # internal ad data structure
            self.data = []
            self.Ad = namedtuple('Ad',['url','outerhtml','tag','link_text','link_location','on_site', 'reloads'])
            # dictionary to memoize url checks
            self.memo = {}
            # store current context where we are collecting ads
            self.site = ""
            self.reloads= 0

    def save_data(self):
        # dump the collected ads to "<log base>.<session id>.json"
        json_file = os.path.splitext(self.log_file)[0]+"."+self.session+".json"
        with open(json_file, 'w') as outfile:
            json.dump(self.data, outfile)
        # This is the log line adblock_analysis will parse to identify data files
        self.logger.info("save_data:{}:{}:{}".format(self.unit_id,self.treatment_id,self.session))

    def log_element(self,element,source):
        '''
        Input: An element that has been identified as an ad and how it
        was identified (the attribute it matched on)
        Result: Inserts appropriate information into the log
        '''
        url = element.get_attribute(source)
        html = element.get_attribute('outerHTML').encode('utf-8')
        tag = element.tag_name
        link_text = element.text
        link_location = element.location
        # update internal datastore
        ad_data = self.Ad(url=url, outerhtml=html, tag=tag, link_text=link_text, link_location=link_location, on_site=self.site, reloads=self.reloads)
        # store to internal data structure
        self.data.append(ad_data)
        # log to plaintext log
        self.logger.debug("Ad:Data:{}".format(ad_data))

    def check_elements(self, elements, source, options=None):
        '''
        Input: Given elements in the currently active page and an
        attribute to query on
        Result: Queries the given attribute (source) and checks the url
        against the filterlist. Logs any identified elements and returns
        the count.
        '''
        count = 0
        for e in elements:
            try:
                url = e.get_attribute(source)
                if url != None:
                    self.logger.debug("Checking:{}:{}".format(source, url))
                    # check if we have evaluated this ad before
                    if url not in self.memo:
                        # actually check the url against the filter list
                        self.memo[url] = self.rules.should_block(url, options)
                    if self.memo[url]:
                        self.log_element(e,source)
                        count+=1
            # occurs with stale elements that no longer exist in the DOM
            except selenium.common.exceptions.StaleElementReferenceException as e:
                self.logger.error(e)
        return count

    def check_href(self):
        '''
        Identifies and captures ads based on HTML hyperlink tags.
        These are considered "text" ads.
        '''
        driver = self.driver
        ### xpath could be less performant than other find_* methods
        # common tags: <a>,<link>
        elements = driver.find_elements_by_xpath("//*[@href]")
        count = self.check_elements(elements,"href", self.all_options)
        self.logger.debug("href search found: {}".format(count))

    def check_src(self):
        '''
        Identifies and captures ads based on tags with a 'src' attribute
        These are considered "media" ads and are often img, iframe,script
        tags
        '''
        driver = self.driver
        ### xpath could be less performant than other find_* methods
        # common tags: <img>, <iframe>, <frame>, <embed>, <script>
        elements = driver.find_elements_by_xpath("//*[@src]")
        count = self.check_elements(elements, "src", self.all_options)
        self.logger.debug("src search found: {}".format(count))

    def check_iframe(self,parents=()):
        '''
        Functionality to check within nested iframes for ad related
        resources.
        Invariants: expects webdriver to enter at the level defined by
        parents; resets webdriver to top level contents prior to leaving
        Input: a tuple describing the iframe name attribute of parent
        levels
        '''
        driver = self.driver
        children = driver.find_elements_by_tag_name('iframe')
        for child in children:
            try:
                driver.switch_to.frame(child)
                # check in the iframe for ads
                self.check_href()
                self.check_src()
                # set parent for children we check (recurse one level down)
                nesting = parents + (child,)
                self.check_iframe(parents=nesting)
            except selenium.common.exceptions.StaleElementReferenceException as e:
                self.logger.error(e)
            # return to correct level of nesting: go to the top, then
            # descend back through the recorded parent frames
            driver.switch_to_default_content()
            for p in parents:
                try:
                    driver.switch_to.frame(p)
                except selenium.common.exceptions.NoSuchElementException as e:
                    # this should not occur but just in case, preserve
                    # invariant of function leaving at top level
                    self.logger.error("resetting level in iframe recursion")
                    driver.switch_to_default_content()
        # always reset to top level content prior to exiting
        driver.switch_to_default_content()

    def find_ads(self):
        '''
        Primary convenience function to use all ad identification
        mechanisms
        '''
        self.check_href()
        self.check_src()
        self.check_iframe()

    def visit_url(self,url):
        # Navigate the driver to *url*; returns True on success, False on
        # timeout.  Records the url as the current collection context.
        driver = self.driver
        try:
            driver.get(url)
            self.logger.debug("Visited: {}".format(url))
            self.site = url
            return True
        except selenium.common.exceptions.TimeoutException as e:
            print("Timeout Visiting: {} : {}".format(url,self.session))
            print e
            return False

    def collect_ads(self,url, reloads=1, delay=0, file_name=None):
        '''
        Visits a specified url and runs ad collection functions.
        *reloads* controls how many times the page is revisited to
        capture rotating ads; *delay* is the sleep before each visit.
        '''
        print("collecting ads on: {}".format(url))
        if file_name == None:
            file_name = self.log_file
        # number of reloads on site to capture all ads
        for r in range(reloads):
            time.sleep(delay)
            # if a successful visit
            if self.visit_url(url):
                # collect ads
                self.reloads=r
                self.find_ads()
def test_empty_regexp_rules():
    """Rules that reduce to an empty regexp must raise AdblockParsingError."""
    bad_rules = ['adv', '/', '//']
    with pytest.raises(AdblockParsingError):
        AdblockRules(bad_rules)
def test_rules_instantiation():
    """AdblockRules accepts pre-built AdblockRule objects, not only strings."""
    single = AdblockRule("adv")
    ruleset = AdblockRules([single])
    url = "http://example.com/adv"
    assert single.match_url(url)
    assert ruleset.should_block(url)
def load_rules(blocklists=None):
    """Build AdblockRules from the combined blocklists using the re2
    backend with a 512 MiB regex memory cap."""
    max_mem_bytes = 512 * 1024 * 1024
    return AdblockRules(combined(blocklists),
                        use_re2=True,
                        max_mem=max_mem_bytes)
def test_empty_rules():
    """Blank and whitespace-only entries are dropped during parsing."""
    ruleset = AdblockRules(["adv", "", " \t", AdblockRule("adv2")])
    assert len(ruleset.rules) == 2
def test_regex_rules():
    # Regex rules are not officially supported (EasyList contains none),
    # but simple /.../ patterns do work as regular expressions.
    ruleset = AdblockRules([r"/banner\d+/"])
    assert ruleset.should_block("banner123")
    assert not ruleset.should_block("banners")
import sqlite3 as lite
import tldextract
from tracking_rules import TrackingRules
from ad_rules import AdRules
from adblockparser import AdblockRules
import json

# Disconnect block list used for classifying third parties.
BLOCKLIST = "../../assets/disconnect_blocklist.json"
# NOTE(review): ``global`` at module level is a no-op in Python.
global rules_instance
# Build one combined rule set from the ad and tracking rule providers.
ad_rules_instance = AdRules()
tracking_rules_instance = TrackingRules()
raw_rules = ad_rules_instance.rules
raw_rules += tracking_rules_instance.rules
rules = AdblockRules(raw_rules, use_re2=True)
# # # # # NON CUMULATIVE
# MAIN CONFIG
# Path to the OpenWPM crawl database to analyse.
wpm_db = '/media/tobi/Daten/Workspace/OpenWPM/Output/1000_2.sqlite'
selected_crawl = 1
display_index = 0  # 0 is landing page, 1-4 subsites
show_tracking_and_third_parties = True
# True: show tracking-percentage as part of third-party percentage in diagram
# False: show only tracking percentage in diagram


def _load_json(path):
    '''Reads json file ignoring comments'''
    # keys to skip when reading the block list
    ignore = ["__comment", "license"]
def process(self, instance, parameters=None, commit=True, **kwargs):
    """Extract URLs from *instance*'s content and import the non-ad ones.

    Downloads EasyList, filters out URLs matching adblock rules, then
    creates items for the remaining URLs and links them back to
    *instance*.  Logs counts of blocked and duplicate URLs.
    """
    instance_name = instance._meta.verbose_name
    instance_id = instance.id
    urls = URL_MATCH_REGEX.findall(instance.content)
    if not urls:
        LOGGER.info(u'url-crawler: nothing to crawl in %s %s.',
                    instance_name, instance_id)
        return
    # Start with EasyList
    adblock_rules_list = requests_get(
        # WARNING: do not .split() with no parameters, else
        # adblock will block everything due to empty rules.
        'https://easylist-downloads.adblockplus.org/easylist.txt').split('\n')
    # Append our eventual specific exclusions
    adblock_rules_list.extend(
        parameters.get(
            'integration', {}).get(
            'fetch_content_urls', {}).get(
            'adblock_rules', []))
    if re2 is None:
        # Things will be dogly slow…
        adblock_rules = AdblockRules(
            adblock_rules_list,
            max_mem=config.PROCESSING_ADBLOCK_MAXIMUM_MEMORY)
    else:
        # Things will go faster
        adblock_rules = AdblockRules(
            adblock_rules_list,
            use_re2=True,
            max_mem=config.PROCESSING_ADBLOCK_MAXIMUM_MEMORY)
    if isinstance(instance, models.Email):
        origin = models.ORIGINS.EMAIL
        # NOTE: there will be at least one here, else
        # accepts() would have rejected the email.
        feeds = instance.feeds.exclude(
            MailFeed___match_action=MAIL_MATCH_ACTIONS.STORE)
    else:
        origin = models.ORIGINS.CRAWLING
        feeds = instance.feeds.all()
    dupes = 0
    blocked = 0
    # LOGGER.debug('URLS: %s %s', len(urls), urls)
    for url in urls:
        if url.startswith('('):
            url = url[1:]
        if url.endswith(')'):
            # Skip Markdown's enclosing parenthesis
            # that we explicitely matched manually.
            url = url[:-1]
        # In case we've got garbage at the end of the RE.
        splitted = url.split(')')
        if len(splitted) == 1:
            pass
        # NOTE(review): this second ``if`` is probably meant to be
        # ``elif`` -- as written, a URL with no ')' (len == 1) falls
        # through to the error branch below.  Confirm before changing.
        if len(splitted) == 2 and len(splitted[1]) < 4:
            # Highly probable that we got some garbage at the end.
            url = splitted[0]
        else:
            LOGGER.error(u'url-crawler: probable nasty unhandled '
                         u'URL “%s” too-greedily matched by RE.', url)
        if adblock_rules.should_block(url):
            LOGGER.info(u'url-crawler: URL %s skipped, in adblocked rules.',
                        url)
            blocked += 1
            continue
        LOGGER.info('url-crawler: importing from %s.', url)
        try:
            item, created = create_item_from_url(
                url=clean_url(url),
                feeds=feeds,
                origin=origin,
            )
        # NOTE(review): bare ``except:`` also catches SystemExit /
        # KeyboardInterrupt; it does at least log via LOGGER.exception.
        except:
            LOGGER.exception(u'Could not create item from URL “%s”', url)
        else:
            if created:
                LOGGER.info(u'url-crawler: successfully imported %s from '
                            u'%s %s.', item, instance_name, instance_id)
            else:
                dupes += 1
                LOGGER.warning(u'url-crawler: %s already in database.', item)
            # link newly created item to the item it was found into.
            item.sources.add(instance)
    LOGGER.info(u'url-crawler: crawled %s items (%s new) from %s %s.',
                len(urls) - blocked, len(urls) - blocked - dupes,
                instance_name, instance_id)