Example #1
0
def retrieve_RSSfeed(URL, date, RSS=True):
    """Yield articles from a news source as ``text``/``title``/``link`` dicts.

    When ``RSS`` is true, ``URL`` is parsed with ``feedparser`` and only the
    entries whose ``updated_parsed`` date equals ``date`` are yielded.
    Otherwise the page is crawled with ``htmlnewsp.HTMLcorpusretriever`` and
    every crawled item is yielded; ``date`` is not consulted on that path.

    Args:
        URL: Address of the RSS feed, or of the page to crawl.
        date: Indexable as ``(year, month, day)``.
        RSS: Selects the feed parser (default) or the HTML crawler.

    Yields:
        dict: Keys ``'text'``, ``'title'`` and ``'link'``.

    Raises:
        Exception: If the object returned by ``feedparser.parse`` is empty.
    """
    if RSS:
        content = feedparser.parse(URL)

        # NOTE(review): this measures the parse result itself, not
        # content.entries -- confirm it really detects an unreadable feed.
        if len(content) == 0:
            message = "Tsy nahitana na tsy nahasokatra ilay topaka RSS"
            print(message)
            raise Exception(message)

        wanted = (date[0], date[1], date[2])
        for entry in content.entries:
            # Entries without a parsed update date cannot be matched against
            # the requested day; skip them instead of aborting the whole run.
            stamp = getattr(entry, 'updated_parsed', None)
            if stamp is None:
                continue
            if (stamp.tm_year, stamp.tm_mon, stamp.tm_mday) != wanted:
                continue

            text = NewWordFinder.HTMLEntitiesToUnicode(entry.content[0].value)
            text = NewWordFinder.strip_tags(text)
            # Normalise the typographic apostrophe to the plain ASCII one.
            text = text.replace(u'’', u"'")
            yield {
                'text': text,
                'title': entry.title,
                'link': entry.links[0].href,
            }
    else:
        # For newspapers that do not offer RSS or do not use it.
        reader = htmlnewsp.HTMLcorpusretriever()
        for data in reader.crawllinks(URL):
            yield {
                'title': data[0],
                'text': data[1],
                'link': data[2],
            }

            # Runs when the consumer resumes the generator after the yield.
            print(data[1])
Example #2
0
    def processFeed(self, feed):
        """Poll one RSS feed and request a fetch for each new matching entry.

        ``feed`` is read through the keys ``url``, ``id``, ``alias``,
        ``lasthash`` and ``filters``. Each filter is a
        ``(positive, negative, sizelim)`` triple: an entry is selected when
        every ``positive`` pattern and no ``negative`` pattern matches its
        title (case-insensitively). ``sizelim`` is forwarded unchanged.

        All outcomes are reported through ``self.RPC.RPCCommand``; nothing is
        returned. If parsing raises, the feed is logged and disabled.
        """
        def linkhash(link):
            # hashlib needs bytes. For an ASCII-only link the UTF-8 encoding
            # is the same byte sequence, so such digests are unaffected.
            if not isinstance(link, bytes):
                link = link.encode("utf-8")
            return hashlib.sha256(link).hexdigest()

        try:
            f = feedparser.parse(feed["url"])
        except Exception:
            # Exception (not a bare except) so that KeyboardInterrupt and
            # SystemExit are not mistaken for a broken feed.
            self.RPC.RPCCommand("log", "error", "Invalid RSS feed (id: %s, alias: %s), disabling", feed["id"], feed["alias"])
            self.RPC.RPCCommand("disable_rss", feed["id"])
            return

        if _DEBUG:
            # Dump the parsed feed for offline inspection; the finally clause
            # guarantees the shelf is closed even if a value cannot be stored.
            _debugstore = shelve.open("feed.debug.rss")
            try:
                for key, value in f.items():
                    _debugstore[key] = value
                _debugstore.sync()
            finally:
                _debugstore.close()

        entries = f.entries
        if not entries:
            self.RPC.RPCCommand("log", "warning", "RSS feed (id: %s, alias: %s) is empty", feed["id"], feed["alias"])
            return

        lasthash = linkhash(entries[0].link)
        if lasthash == feed["lasthash"]:
            # First entry is the one already seen: no new entries.
            return

        self.RPC.RPCCommand("updatehash_rss", feed["id"], lasthash)

        # Everything ahead of the previously seen entry is new.
        newentries = [entries[0]]
        for e in entries[1:]:
            if linkhash(e.link) == feed["lasthash"]:
                break
            newentries.append(e)

        # Compile each filter once instead of once per entry.
        filters = [
            (
                [re.compile(x, re.I) for x in positive],
                [re.compile(y, re.I) for y in negative],
                sizelim,
            )
            for positive, negative, sizelim in feed["filters"]
        ]

        for e in newentries:
            for positive, negative, sizelim in filters:
                if not all(regex.search(e.title) for regex in positive):
                    continue
                if any(regex.search(e.title) for regex in negative):
                    continue
                self.RPC.RPCCommand("fetch_torrent_rss", ID=feed["id"], alias=feed["alias"], link=e.link, sizelim=sizelim)
Example #3
0
    def processFeed(self, feed):
        """Poll one RSS feed and request a fetch for each new matching entry.

        ``feed`` is read through the keys ``url``, ``id``, ``alias``,
        ``lasthash`` and ``filters``. Each filter is a
        ``(positive, negative, sizelim)`` triple: an entry is selected when
        every ``positive`` pattern and no ``negative`` pattern matches its
        title (case-insensitively). ``sizelim`` is forwarded unchanged.

        All outcomes are reported through ``self.RPC.RPCCommand``; nothing is
        returned. If parsing raises, the feed is logged and disabled.
        """
        def linkhash(link):
            # hashlib needs bytes. For an ASCII-only link the UTF-8 encoding
            # is the same byte sequence, so such digests are unaffected.
            if not isinstance(link, bytes):
                link = link.encode("utf-8")
            return hashlib.sha256(link).hexdigest()

        try:
            f = feedparser.parse(feed["url"])
        except Exception:
            # Exception (not a bare except) so that KeyboardInterrupt and
            # SystemExit are not mistaken for a broken feed.
            self.RPC.RPCCommand("log","error","Invalid RSS feed (id: %s, alias: %s), disabling", feed["id"], feed["alias"])
            self.RPC.RPCCommand("disable_rss", feed["id"])
            return

        entries = f.entries
        if not entries:
            self.RPC.RPCCommand("log","warning", "RSS feed (id: %s, alias: %s) is empty", feed["id"], feed["alias"])
            return

        lasthash = linkhash(entries[0].link)
        if lasthash == feed["lasthash"]:
            # First entry is the one already seen: no new entries.
            return

        self.RPC.RPCCommand("updatehash_rss", feed["id"], lasthash)

        # Everything ahead of the previously seen entry is new.
        newentries = [entries[0]]
        for e in entries[1:]:
            if linkhash(e.link) == feed["lasthash"]:
                break
            newentries.append(e)

        # Compile each filter once instead of once per entry.
        filters = [
            (
                [re.compile(x, re.I) for x in positive],
                [re.compile(y, re.I) for y in negative],
                sizelim,
            )
            for positive, negative, sizelim in feed["filters"]
        ]

        for e in newentries:
            for positive, negative, sizelim in filters:
                if not all(regex.search(e.title) for regex in positive):
                    continue
                if any(regex.search(e.title) for regex in negative):
                    continue
                self.RPC.RPCCommand("fetch_torrent_rss", ID=feed["id"], alias=feed["alias"], link=e.link, sizelim=sizelim)
Example #4
0
    def addRSSFeed(self, url, ttl, alias=None):
        """Validate an RSS feed URL and store it as a new, disabled feed.

        The feed is parsed once so that the SHA-256 of its first entry's link
        can be recorded as ``lasthash``. The record is stored in ``self.RSS``
        under a fresh ``self._randomID()`` and ``self._flushRSS()`` is called.

        Args:
            url: Feed address; must contain a network location.
            ttl: Anything ``float()`` accepts.
            alias: Display name; defaults to the URL's network location.

        Raises:
            UndefinedError: If ``ttl`` is not numeric, the URL has no network
                location, parsing raises, or the feed has no entries.
        """
        try:
            ttl = float(ttl)
        except (TypeError, ValueError):
            # TypeError covers non-string, non-numeric input such as None.
            raise UndefinedError("TTL must be a number")
        url_chk = urlparse.urlparse(url)
        if not url_chk.netloc:
            raise UndefinedError("URL malformed")

        # check feed is parseable
        try:
            feed_chk = feedparser.parse(url)
        except Exception:
            raise UndefinedError("URL is not an RSS feed")
        if not feed_chk.entries:
            # Without this guard an entry-less result surfaced as IndexError.
            raise UndefinedError("URL is not an RSS feed or has no entries")

        # hashlib needs bytes. For an ASCII-only link the UTF-8 encoding is
        # the same byte sequence, so such digests are unaffected.
        last_link = feed_chk.entries[0].link
        if not isinstance(last_link, bytes):
            last_link = last_link.encode("utf-8")
        lasthash = hashlib.sha256(last_link).hexdigest()

        if not alias:
            alias = url_chk.netloc
        rand_id = self._randomID()

        self.RSS[rand_id] = {
            "ID": rand_id,
            "url": url,
            "ttl": ttl,
            "alias": alias,
            "enabled": False,
            "filters": [],
            "updated": 0,
            "lasthash": lasthash,
        }
        self._flushRSS()
Example #5
0
    def addRSSFeed(self, url, ttl, alias=None):
        """Validate an RSS feed URL and store it as a new, disabled feed.

        The feed is parsed once so that the SHA-256 of its first entry's link
        can be recorded as ``lasthash``. The record is stored in ``self.RSS``
        under a fresh ``self._randomID()`` and ``self._flushRSS()`` is called.

        Args:
            url: Feed address; must contain a network location.
            ttl: Anything ``float()`` accepts.
            alias: Display name; defaults to the URL's network location.

        Raises:
            UndefinedError: If ``ttl`` is not numeric, the URL has no network
                location, parsing raises, or the feed has no entries.
        """
        try:
            ttl = float(ttl)
        except (TypeError, ValueError):
            # TypeError covers non-string, non-numeric input such as None.
            raise UndefinedError("TTL must be a number")
        url_chk = urlparse.urlparse(url)
        if not url_chk.netloc:
            raise UndefinedError("URL malformed")

        # check feed is parseable
        try:
            feed_chk = feedparser.parse(url)
        except Exception:
            raise UndefinedError("URL is not an RSS feed")
        if not feed_chk.entries:
            # Without this guard an entry-less result surfaced as IndexError.
            raise UndefinedError("URL is not an RSS feed or has no entries")

        # hashlib needs bytes. For an ASCII-only link the UTF-8 encoding is
        # the same byte sequence, so such digests are unaffected.
        last_link = feed_chk.entries[0].link
        if not isinstance(last_link, bytes):
            last_link = last_link.encode("utf-8")
        lasthash = hashlib.sha256(last_link).hexdigest()

        if not alias:
            alias = url_chk.netloc
        rand_id = self._randomID()

        self.RSS[rand_id] = {
            "ID": rand_id, "url": url, "ttl": ttl, "alias": alias,
            "enabled": False, "filters": [], "updated": 0, "lasthash": lasthash,
        }
        self._flushRSS()