def load_nl_refetch_interval_mapping():
    """Build a ``{netloc: interval}`` lookup from the loaded rulesets.

    Every entry in a ruleset's ``'netlocs'`` is mapped to that ruleset's
    ``'rewalk_interval_days'`` value, or to ``0`` when the ruleset's
    ``'rewalk_disabled'`` flag is truthy. Rulesets whose ``'netlocs'`` is
    empty or ``None`` contribute nothing. When the same netloc appears in
    several rulesets, the last ruleset processed wins.

    Returns:
        dict: netloc -> interval value taken from the owning ruleset.
    """
    mapping = {}
    for rule in load_rules():
        configured_interval = rule['rewalk_interval_days']
        disabled = rule['rewalk_disabled']
        # A disabled ruleset is recorded with an interval of zero.
        effective_interval = 0 if disabled else configured_interval

        netlocs = rule['netlocs']
        if not netlocs:
            continue
        mapping.update(dict.fromkeys(netlocs, effective_interval))
    return mapping
def get_random_url_group(num_items):
    """Sample URLs from ``web_pages`` and report whether each one is filtered.

    Args:
        num_items: Value bound to the ``:percentage`` placeholder of the
            ``TABLESAMPLE SYSTEM`` clause.
            NOTE(review): despite the name, this is passed as the sampling
            percentage, not as a row count -- confirm against callers.

    Returns:
        list[tuple]: ``(url, filtered)`` pairs, one per sampled row, where
        ``filtered`` is whatever ``isFiltered`` returns for that URL.
    """
    result = g.session.execute(
        '''SELECT url FROM web_pages TABLESAMPLE SYSTEM(:percentage);''',
        {'percentage' : num_items},
    )
    # Pull every row out of the result before the rules are (re)loaded.
    sampled_rows = list(result)
    active_rules = rules.load_rules(override=True)

    def _classify(url):
        # Look up the bad-word lists for this URL's netloc, then test the URL.
        netloc = urllib.parse.urlparse(url).netloc
        badwords, badcompounds = getBadWords(active_rules, netloc)
        return (url, isFiltered(url, badwords, badcompounds))

    # Each row is a one-element sequence holding the URL.
    return [_classify(url) for (url,) in sampled_rows]
def get_urls(self):
    """Reload the rules and return every feed URL they list.

    Stores the freshly loaded rules on ``self.rules`` as a side effect, then
    concatenates each rule's ``'feedurls'`` entries in rule order.

    Returns:
        list: all feed URLs, flattened into a single new list.
    """
    self.rules = load_rules()
    return [
        feed_url
        for rule in self.rules
        for feed_url in rule['feedurls']
    ]