def retrieve_RSSfeed(URL, date, RSS=True):
    """Yield articles published on a given day, from an RSS feed or a crawl.

    Parameters:
        URL  -- feed or site address.
        date -- (year, month, day) sequence matched against each entry's
                `updated_parsed` timestamp.
        RSS  -- when True parse `URL` with feedparser; otherwise crawl the
                page with htmlnewsp (for papers without usable RSS).

    Yields:
        dicts with 'text', 'title' and 'link' keys.

    Raises:
        Exception -- if the feed could not be fetched or has no entries.
    """
    if RSS:
        content = feedparser.parse(URL)
        # BUG FIX: feedparser always returns a populated result dict, so the
        # old `len(content) == 0` guard never fired; test the entry list.
        if len(content.entries) == 0:
            print("Tsy nahitana na tsy nahasokatra ilay topaka RSS")
            raise Exception()
        for entry in content.entries:
            stamp = entry.updated_parsed
            # Keep only entries whose timestamp matches the requested day.
            if (stamp.tm_year == date[0]
                    and stamp.tm_mon == date[1]
                    and stamp.tm_mday == date[2]):
                text = NewWordFinder.HTMLEntitiesToUnicode(entry.content[0].value)
                text = NewWordFinder.strip_tags(text)
                text = text.replace(u'’', u"'")
                yield {
                    'text': text,
                    'title': entry.title,
                    'link': entry.links[0].href,
                }
    else:
        # For newspapers that cannot handle RSS or do not use it.
        reader = htmlnewsp.HTMLcorpusretriever()
        for data in reader.crawllinks(URL):
            yield {'title': data[0], 'text': data[1], 'link': data[2]}
            print(data[1])
def processFeed(self, feed):
    """Poll one RSS feed and dispatch new, filter-matching entries over RPC.

    The SHA-256 of the newest entry's link is compared against
    feed["lasthash"] to decide whether anything changed since the last
    poll; if so, every entry newer than the remembered one is run through
    the feed's filters and matches are handed to the torrent fetcher.

    feed -- dict with at least "id", "alias", "url", "lasthash" and
            "filters" (a list of (positive, negative, sizelim) tuples of
            regex pattern strings plus a size limit).
    """
    try:
        f = feedparser.parse(feed["url"])
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # still propagate; any parse failure disables the feed.
        self.RPC.RPCCommand("log", "error", "Invalid RSS feed (id: %s, alias: %s), disabling", feed["id"], feed["alias"])
        self.RPC.RPCCommand("disable_rss", feed["id"])
    else:
        if _DEBUG:
            # Dump the parsed feed so failures can be replayed offline.
            _debugstore = shelve.open("feed.debug.rss")
            for key, value in f.items():  # .items(): works on py2 and py3
                _debugstore[key] = value
            _debugstore.sync()
            _debugstore.close()
        if len(f.entries) == 0:
            self.RPC.RPCCommand("log", "warning", "RSS feed (id: %s, alias: %s) is empty", feed["id"], feed["alias"])
            return
        lasthash = hashlib.sha256(f.entries[0].link).hexdigest()
        if lasthash == feed["lasthash"]:
            # No new entries since the previous poll.
            return
        self.RPC.RPCCommand("updatehash_rss", feed["id"], lasthash)
        # Collect entries newer than the one hashed last time.
        newentries = [f.entries[0]]
        for e in f.entries[1:]:
            if hashlib.sha256(e.link).hexdigest() == feed["lasthash"]:
                break
            newentries.append(e)
        # PERF: compile every filter regex once per poll instead of once
        # per entry, as the old code did.
        compiled_filters = [
            ([re.compile(p, re.I) for p in positive],
             [re.compile(n, re.I) for n in negative],
             sizelim)
            for positive, negative, sizelim in feed["filters"]
        ]
        for e in newentries:
            for positives, negatives, sizelim in compiled_filters:
                # Every positive pattern must match the title...
                if not all(rx.search(e.title) for rx in positives):
                    continue
                # ...and no negative pattern may match it.
                if any(rx.search(e.title) for rx in negatives):
                    continue
                self.RPC.RPCCommand("fetch_torrent_rss", ID=feed["id"], alias=feed["alias"], link=e.link, sizelim=sizelim)
def processFeed(self, feed):
    """Poll one RSS feed and dispatch new, filter-matching entries over RPC.

    Compares the SHA-256 of the newest entry's link with feed["lasthash"];
    when it differs, every entry newer than the remembered one is checked
    against the feed's filters and matches are sent to the torrent fetcher.

    feed -- dict with at least "id", "alias", "url", "lasthash" and
            "filters" (a list of (positive, negative, sizelim) tuples of
            regex pattern strings plus a size limit).
    """
    try:
        f = feedparser.parse(feed["url"])
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # still propagate; any parse failure disables the feed.
        self.RPC.RPCCommand("log", "error", "Invalid RSS feed (id: %s, alias: %s), disabling", feed["id"], feed["alias"])
        self.RPC.RPCCommand("disable_rss", feed["id"])
    else:
        if len(f.entries) == 0:
            self.RPC.RPCCommand("log", "warning", "RSS feed (id: %s, alias: %s) is empty", feed["id"], feed["alias"])
            return
        lasthash = hashlib.sha256(f.entries[0].link).hexdigest()
        if lasthash == feed["lasthash"]:
            # No new entries since the previous poll.
            return
        self.RPC.RPCCommand("updatehash_rss", feed["id"], lasthash)
        # Everything above the previously-hashed entry is new.
        newentries = [f.entries[0]]
        for e in f.entries[1:]:
            if hashlib.sha256(e.link).hexdigest() == feed["lasthash"]:
                break
            newentries.append(e)
        # PERF: compile every filter regex once per poll instead of once
        # per entry, as the old code did.
        compiled_filters = [
            ([re.compile(p, re.I) for p in positive],
             [re.compile(n, re.I) for n in negative],
             sizelim)
            for positive, negative, sizelim in feed["filters"]
        ]
        for e in newentries:
            for positives, negatives, sizelim in compiled_filters:
                # All positive patterns must match and no negative may.
                if not all(rx.search(e.title) for rx in positives):
                    continue
                if any(rx.search(e.title) for rx in negatives):
                    continue
                self.RPC.RPCCommand("fetch_torrent_rss", ID=feed["id"], alias=feed["alias"], link=e.link, sizelim=sizelim)
def addRSSFeed(self, url, ttl, alias=None):
    """Validate, register and persist a new RSS feed.

    url   -- feed address; must contain a network location.
    ttl   -- poll interval; anything float() accepts.
    alias -- optional display name; defaults to the URL's host.

    Raises:
        UndefinedError -- bad ttl, malformed URL, or unparseable/empty feed.
    """
    try:
        ttl = float(ttl)
    except ValueError:
        raise UndefinedError("TTL must be a number")
    url_chk = urlparse.urlparse(url)
    if not url_chk.netloc:
        raise UndefinedError("URL malformed")
    # Check the feed is parseable before storing anything.
    try:
        feed_chk = feedparser.parse(url)
    except Exception:
        # Narrowed from a bare `except:`.
        raise UndefinedError("URL is not an RSS feed")
    # BUG FIX: an empty (or non-feed) parse result used to escape as an
    # IndexError on `entries[0]`; report it as an invalid feed instead.
    if not feed_chk.entries:
        raise UndefinedError("URL is not an RSS feed")
    lasthash = hashlib.sha256(feed_chk.entries[0].link).hexdigest()
    if not alias:
        alias = url_chk.netloc
    rand_id = self._randomID()
    newRSS = {
        "ID": rand_id,
        "url": url,
        "ttl": ttl,
        "alias": alias,
        "enabled": False,  # new feeds start disabled until enabled explicitly
        "filters": [],
        "updated": 0,
        "lasthash": lasthash,
    }
    self.RSS[rand_id] = newRSS
    self._flushRSS()
def addRSSFeed(self, url, ttl, alias=None):
    """Validate, register and persist a new RSS feed.

    url   -- feed address; must contain a network location.
    ttl   -- poll interval; anything float() accepts.
    alias -- optional display name; defaults to the URL's host.

    Raises:
        UndefinedError -- bad ttl, malformed URL, or unparseable/empty feed.
    """
    try:
        ttl = float(ttl)
    except ValueError:
        raise UndefinedError("TTL must be a number")
    url_chk = urlparse.urlparse(url)
    if not url_chk.netloc:
        raise UndefinedError("URL malformed")
    # Check the feed is parseable before storing anything.
    # BUG FIX: an empty (or non-feed) parse result used to escape as an
    # IndexError on `entries[0]`; it is now reported as an invalid feed.
    # The bare `except:` is also narrowed so KeyboardInterrupt propagates.
    try:
        feed_chk = feedparser.parse(url)
        last_item = feed_chk.entries[0]
    except (Exception, IndexError):
        raise UndefinedError("URL is not an RSS feed")
    lasthash = hashlib.sha256(last_item.link).hexdigest()
    if not alias:
        alias = url_chk.netloc
    rand_id = self._randomID()
    newRSS = {
        "ID": rand_id,
        "url": url,
        "ttl": ttl,
        "alias": alias,
        "enabled": False,  # new feeds start disabled until enabled explicitly
        "filters": [],
        "updated": 0,
        "lasthash": lasthash,
    }
    self.RSS[rand_id] = newRSS
    self._flushRSS()