def processFeedData(self, feedDat, tx_raw=True, tx_parse=True):
    """
    Dispatch a single RSS feed item onto the message queue.

    Drops items whose link URL matches the skip filter, resolves a
    human-readable source name for the item (falling back to the bare
    netloc), then emits the raw and/or parsed representations via AMQP.

    Args:
        feedDat: dict describing the feed item; must contain 'linkUrl'.
            'srcname' is written into it as a side effect.
        tx_raw: when True, send the raw feed message.
        tx_parse: when True, send the parsed release info.
    """
    link_url = feedDat['linkUrl']

    # Anything matching the global skip list is ignored outright.
    if any(filtered in link_url for filtered in skip_filter):
        print("LinkURL '%s' contains a filtered string. Not fetching!" % link_url)
        return

    netloc = urllib.parse.urlparse(link_url).netloc

    # Prefer the curated nice-name; fall back to the bare netloc.
    nicename = feedNameLut.getNiceName(link_url)
    feedDat['srcname'] = nicename if nicename else netloc

    # print("ProcessFeedData! ", netloc)
    if not WebMirror.rules.netloc_send_feed(netloc):
        print("Not sending data for netloc: ", netloc)
        return

    if tx_raw:
        raw_msg = self.getRawFeedMessage(feedDat)
        if raw_msg:
            self.amqp_put_item(raw_msg)

    if tx_parse:
        parsed_msg = self.getProcessedReleaseInfo(feedDat)
        if parsed_msg:
            self.amqp_put_item(parsed_msg)
def processFeedData(self, session, feedDat, tx_raw=True, tx_parse=True):
    """
    Validate and dispatch a single RSS feed item onto the message queue.

    Applies URL and title filters, resolves a human-readable source name
    via the DB-backed name LUT (falling back to the bare netloc), and
    emits the parsed release and/or raw feed message over AMQP.

    Args:
        session: DB session used for the nice-name lookup.
        feedDat: dict describing the feed item; must contain at least
            'linkUrl' and 'title'. 'srcname' is written into it.
        tx_raw: when True, send the raw feed message.
        tx_parse: when True, send the parsed release info.
    """
    link_url = feedDat['linkUrl']

    if any(item in link_url for item in common.global_constants.RSS_SKIP_FILTER):
        print("LinkURL '%s' contains a filtered string. Not fetching!" % link_url)
        return

    title_lower = feedDat['title'].lower()
    # Fixed: this branch previously reused the linkUrl-filter message,
    # which was misleading — the *title* matched, not the URL.
    if any(title_lower.startswith(item) for item in common.global_constants.RSS_TITLE_FILTER):
        print("Title '%s' contains a filtered prefix. Not fetching!" % feedDat['title'])
        return

    # print("Feed item title: ", feedDat['title'], feedDat)
    if title_lower.startswith("by: "):
        self.log.warning("Skipping due to title: '%s'", feedDat['title'])
        return

    netloc = urllib.parse.urlparse(link_url).netloc
    nicename = feedNameLut.getNiceName(session, link_url)
    if not nicename:
        nicename = netloc
    feedDat['srcname'] = nicename

    if should_ignore_feed_post(feedDat):
        self.log.warning("Skipping due to should_ignore_feed_post")
        return

    # print("ProcessFeedData! ", netloc)
    # A bunch of crap is aggregated through the "feedproxy.google.com" netloc.
    if "feedproxy.google.com" in netloc:
        print("Not sending data for feedproxy netloc: ", netloc)
        return

    # Only do the (potentially expensive) parse when we will actually
    # send it — consistent with the older variant of this method, which
    # guards both calls on their tx_* flag.
    if tx_parse:
        try:
            new = self.getProcessedReleaseInfo(feedDat)
        except AssertionError:
            self.log.error("Exception when processing release!")
            for line in traceback.format_exc().split("\n"):
                self.log.error(line.rstrip())
            return
        if new:
            self.log.info("Sending parsed release!")
            self.amqp_put_item(new)

    # Raw feed forwarding is additionally gated on the per-netloc rules.
    if not WebMirror.rules.netloc_send_feed(netloc):
        print("Not sending raw feed for netloc due to rules: ", netloc)
        return

    if tx_raw:
        raw = self.getRawFeedMessage(feedDat)
        if raw:
            self.amqp_put_item(raw)
def missing_lut():
    """
    Print the netloc of every configured feed URL that has no
    nice-name entry in the feed-name lookup table.
    """
    import WebMirror.OutputFilters.util.feedNameLut as fnl

    ruleset = WebMirror.rules.load_rules()
    # Flatten the per-rule feed-URL lists into one list.
    all_feeds = [url for rule in ruleset for url in rule['feedurls']]

    for feed_url in all_feeds:
        if not fnl.getNiceName(feed_url):
            print("Missing: ", urllib.parse.urlsplit(feed_url).netloc)
def missing_lut():
    """
    Report configured feed URLs that are absent from the feed-name LUT,
    printing each missing feed's netloc to the console.
    """
    import WebMirror.OutputFilters.util.feedNameLut as fnl

    rules = WebMirror.rules.load_rules()
    nested = (rule['feedurls'] for rule in rules)
    flattened = [feed for group in nested for feed in group]

    for feed in flattened:
        nice = fnl.getNiceName(feed)
        if not nice:
            print("Missing: ", urllib.parse.urlsplit(feed).netloc)
def exposed_nu_new():
    '''
    Parse outbound netlocs from NovelUpdates releases, extracting any
    sites that are not known in the feednamelut.
    '''
    import WebMirror.OutputFilters.util.feedNameLut as fnl

    sess = db.get_db_session()
    rows = sess.query(db.NuOutboundWrapperMap) \
        .filter(db.NuOutboundWrapperMap.validated == True) \
        .filter(db.NuOutboundWrapperMap.actual_target != None) \
        .all()

    target_netlocs = [urllib.parse.urlsplit(row.actual_target).netloc for row in rows]
    print("Nu outbound items: ", len(target_netlocs))

    unique_netlocs = set(target_netlocs)
    for loc in unique_netlocs:
        if not fnl.getNiceName(None, loc):
            # Re-run the lookup with debug output so the failure is traceable.
            fnl.getNiceName(None, loc, debug=True)
            print("Missing: ", loc)

    print("Nu outbound items: ", len(unique_netlocs))
def exposed_missing_lut():
    '''
    Iterate over distinct RSS feed sources in database, and print
    any for which there is not an entry in feedDataLut.py to the
    console.
    '''
    import WebMirror.OutputFilters.util.feedNameLut as fnl

    loaded_rules = WebMirror.rules.load_rules()
    feed_urls = []
    for rule in loaded_rules:
        feed_urls.extend(rule['feedurls'])

    for url in feed_urls:
        if fnl.getNiceName(url):
            continue
        print("Missing: ", urllib.parse.urlsplit(url).netloc)
def get_dotted(self):
    """
    Collect NU release rows whose series name or group/author name was
    truncated with a trailing "...", grouped by source site.

    Returns:
        (dseries, dauths): two dicts mapping nice source name ->
        {truncated name -> release URL}. The pair is also written to
        'dotted_nu_items.pyson' as a pretty-printed tuple.
    """
    self.fix_names()

    dotted_series = []
    dotted_authors = []

    with db.session_context() as db_sess:
        # NOTE: the old code also ran an unused `.count()` of the same
        # query (an extra DB round-trip whose result was never read);
        # it has been removed.
        print("Loading")
        query = db_sess.query(db.NuReleaseItem) \
            .filter(db.NuReleaseItem.reviewed == 'valid') \
            .filter(db.NuReleaseItem.validated == True)

        validated = list(query.yield_per(1000))
        print("Found %s releases" % len(validated))

        for row in validated:
            if row.seriesname.endswith("..."):
                dotted_series.append((row.seriesname.strip(), row.actual_target))
            if row.groupinfo.endswith("..."):
                dotted_authors.append((row.groupinfo, row.actual_target))

    def _group_by_source(pairs, db_sess):
        # Bucket (name, url) pairs by the site's nice-name, falling
        # back to the URL's netloc when no LUT entry exists.
        grouped = {}
        for name, url in pairs:
            nl = urllib.parse.urlparse(url).netloc
            nlname = feedNameLut.getNiceName(db_sess, url)
            if not nlname:
                nlname = nl
            grouped.setdefault(nlname, {})[name] = url
        return grouped

    with db.session_context() as db_sess:
        dseries = _group_by_source(dotted_series, db_sess)
        dauths = _group_by_source(dotted_authors, db_sess)

    self.log.info("Found %s dotted series, %s dotted authors", len(dseries), len(dauths))

    with open("dotted_nu_items.pyson", "w") as fp:
        fp.write(pprint.pformat((dseries, dauths), indent=4))

    return (dseries, dauths)