Example #1
 def Items(self, opts=None):
     """
     生成器,返回一个元组
     对于HTML:section,url,title,content,brief
     对于图片,mime,url,filename,content,brief
     """
     urls = self.ParseFeedUrls()
     readability = self.readability if self.fulltext_by_readability else self.readability_by_soup
     prevsection = ''
     decoder = AutoDecoder(False)
     if USE_ASYNC_URLFETCH:
         #start the asynchronous downloads
         asyncopener = AsyncURLOpener(self.log)
         rpcs = [asyncopener.fetch(url,self.timeout,sec,title) 
                 for sec,title,url,desc in urls if not desc]
         
         #for efficiency, handle the full-text RSS items first
         #while they are processed, the other RSS items keep downloading in the background...
         for section, ftitle, url, desc in urls:
             if not desc:
                 continue
             
             article = self.FragToXhtml(desc, ftitle)
             #for images, title holds the MIME type
             for title, imgurl, imgfn, content, brief in readability(article,url,opts):
                 if title.startswith(r'image/'): #image
                     yield (title, imgurl, imgfn, content, brief)
                 else:
                     if not title: title = ftitle
                     content = self.postprocess(content)
                     yield (section, url, title, content, brief)
         
         #now handle the summary-only RSS items
         for result,url,(section,ftitle) in asyncopener.get_result():
             if section != prevsection or prevsection == '':
                 decoder.encoding = '' #re-detect the encoding for each section
                 prevsection = section
                 
             status_code, content = result.status_code, result.content
             if status_code != 200 or not content:
                 self.log.warn('async fetch article failed(%d):%s.' % (status_code,url))
                 continue
             
             if self.page_encoding:
                 article = content.decode(self.page_encoding)
             else:
                 article = decoder.decode(content,url)
             
             #for images, title holds the MIME type
             for title, imgurl, imgfn, content, brief in readability(article,url,opts):
                 if title.startswith(r'image/'): #image
                     yield (title, imgurl, imgfn, content, brief)
                 else:
                     if not title: title = ftitle
                     content = self.postprocess(content)
                     yield (section, url, title, content, brief)
     else: #synchronous UrlFetch mode
         for section, ftitle, url, desc in urls:
             if not desc: #not a full-text RSS item
                 if section != prevsection or prevsection == '':
                     decoder.encoding = '' #re-detect the encoding for each section
                     prevsection = section
                 
                 article = self.fetcharticle(url, decoder)
                 if not article:
                     continue
             else:
                 article = self.FragToXhtml(desc, ftitle)
             
             #for images, title holds the MIME type
             for title, imgurl, imgfn, content, brief in readability(article,url,opts):
                 if title.startswith(r'image/'): #image
                     yield (title, imgurl, imgfn, content, brief)
                 else:
                     if not title: title = ftitle
                     content = self.postprocess(content)
                     yield (section, url, title, content, brief)
Example #2
    def Items(self, opts=None):
        """
        生成器,返回一个元组
        对于HTML:section,url,title,content,brief
        对于图片,mime,url,filename,content,brief
        """
        urls = self.ParseFeedUrls()
        readability = self.readability if self.fulltext_by_readability else self.readability_by_soup
        prevsection = ''
        decoder = AutoDecoder(False)
        if USE_ASYNC_URLFETCH:
            #start the asynchronous downloads
            asyncopener = AsyncURLOpener(self.log)
            rpcs = [
                asyncopener.fetch(url, self.timeout, sec, title)
                for sec, title, url, desc in urls if not desc
            ]

            #for efficiency, handle the full-text RSS items first
            #while they are processed, the other RSS items keep downloading in the background...
            for section, ftitle, url, desc in urls:
                if not desc:
                    continue

                article = self.FragToXhtml(desc, ftitle)
                #for images, title holds the MIME type
                for title, imgurl, imgfn, content, brief in readability(
                        article, url, opts):
                    if title.startswith(r'image/'):  #image
                        yield (title, imgurl, imgfn, content, brief)
                    else:
                        if not title: title = ftitle
                        content = self.postprocess(content)
                        yield (section, url, title, content, brief)

            #now handle the summary-only RSS items
            for result, url, (section, ftitle) in asyncopener.get_result():
                if section != prevsection or prevsection == '':
                    decoder.encoding = ''  #re-detect the encoding for each section
                    prevsection = section

                status_code, content = result.status_code, result.content
                if status_code != 200 or not content:
                    self.log.warn('async fetch article failed(%d):%s.' %
                                  (status_code, url))
                    continue

                if self.page_encoding:
                    article = content.decode(self.page_encoding)
                else:
                    article = decoder.decode(content, url)

                #for images, title holds the MIME type
                for title, imgurl, imgfn, content, brief in readability(
                        article, url, opts):
                    if title.startswith(r'image/'):  #image
                        yield (title, imgurl, imgfn, content, brief)
                    else:
                        if not title: title = ftitle
                        content = self.postprocess(content)
                        yield (section, url, title, content, brief)
        else:  #synchronous UrlFetch mode
            for section, ftitle, url, desc in urls:
                if not desc:  #not a full-text RSS item
                    if section != prevsection or prevsection == '':
                        decoder.encoding = ''  #re-detect the encoding for each section
                        prevsection = section

                    article = self.fetcharticle(url, decoder)
                    if not article:
                        continue
                else:
                    article = self.FragToXhtml(desc, ftitle)

                #for images, title holds the MIME type
                for title, imgurl, imgfn, content, brief in readability(
                        article, url, opts):
                    if title.startswith(r'image/'):  #image
                        yield (title, imgurl, imgfn, content, brief)
                    else:
                        if not title: title = ftitle
                        content = self.postprocess(content)
                        yield (section, url, title, content, brief)
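
Both Items() versions above yield the same tuple shapes described in the docstring. A minimal sketch of how a caller might consume the generator; 'book', 'save_image' and 'save_html' are hypothetical names used only for illustration, not part of the examples:

#hypothetical consumer of the Items() generator
for data in book.Items(opts):
    if data[0].startswith('image/'):               #first field is the MIME type for images
        mime, url, filename, content, brief = data
        save_image(filename, mime, content)        #assumed helper
    else:                                          #otherwise the first field is the section name
        section, url, title, content, brief = data
        save_html(section, title, content, brief)  #assumed helper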
Example #3
 def ParseFeedUrls(self):
     """ return list like [(section,title,url,desc),..] """
     urls = []
     tnow = datetime.utcnow()
     urladded = set()
     asyncopener = AsyncURLOpener(self.log)
     
     for feed in self.feeds:
         section, url = feed[0], feed[1]
         isfulltext = feed[2] if len(feed) > 2 else False
         timeout = self.timeout+10 if isfulltext else self.timeout
         if USE_ASYNC_URLFETCH_IN_FEEDS:
             asyncopener.fetch(url,timeout,section,isfulltext)
             continue
         opener = URLOpener(self.host, timeout=timeout)
         result = opener.open(url)
         if result.status_code == 200 and result.content:
             if self.feed_encoding:
                 content = result.content.decode(self.feed_encoding)
             else:
                 content = AutoDecoder(True).decode(result.content,url)    
             feed = feedparser.parse(content)
             
             for e in feed['entries'][:self.max_articles_per_feed]:
                 if self.oldest_article > 0 and hasattr(e, 'updated_parsed'):
                     updated = e.updated_parsed
                     if updated:
                         delta = tnow - datetime(*(updated[0:6]))
                         if delta.days*86400+delta.seconds > 86400*self.oldest_article:
                             self.log.debug("article '%s' is too old"%e.title)
                             continue
                 #support HTTPS
                 urlfeed = e.link.replace('http://','https://') if url.startswith('https://') else e.link
                 if urlfeed in urladded:
                     continue
                     
                 desc = None
                 if isfulltext:
                     if hasattr(e, 'content') and e.content[0]['value']:
                         desc = e.content[0]['value']
                     elif hasattr(e, 'description'):
                         desc = e.description
                     else:
                         self.log.warn('feed item invalid,link to webpage for article.(%s)'%e.title)
                 urls.append((section, e.title, urlfeed, desc))
                 urladded.add(urlfeed)
         else:
             self.log.warn('fetch rss failed(%d):%s'%(result.status_code,url))
     
     #collect results of the asynchronous downloads
     if USE_ASYNC_URLFETCH_IN_FEEDS:
         for result,url,(section,isfulltext) in asyncopener.get_result():
             if result.status_code == 200 and result.content:
                 if self.feed_encoding:
                     content = result.content.decode(self.feed_encoding)
                 else:
                     content = AutoDecoder(True).decode(result.content,url)    
                 feed = feedparser.parse(content)
                 
                 for e in feed['entries'][:self.max_articles_per_feed]:
                     if self.oldest_article > 0 and hasattr(e, 'updated_parsed'):
                         updated = e.updated_parsed
                         if updated:
                             delta = tnow - datetime(*(updated[0:6]))
                             if delta.days*86400+delta.seconds > 86400*self.oldest_article:
                                 self.log.debug("article '%s' is too old"%e.title)
                                 continue
                     #support HTTPS
                     urlfeed = e.link.replace('http://','https://') if url.startswith('https://') else e.link
                     if urlfeed in urladded:
                         continue
                         
                     desc = None
                     if isfulltext:
                         if hasattr(e, 'content') and e.content[0]['value']:
                             desc = e.content[0]['value']
                         elif hasattr(e, 'description'):
                             desc = e.description
                         else:
                             self.log.warn('feed item invalid,link to webpage for article.(%s)'%e.title)
                     urls.append((section, e.title, urlfeed, desc))
                     urladded.add(urlfeed)
             else:
                 self.log.warn('async fetch rss failed(%d):%s'%(result.status_code,url))
     return urls
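
The freshness test above (delta.days*86400+delta.seconds > 86400*self.oldest_article) compares the entry age in seconds against oldest_article expressed in days. A standalone sketch of the same arithmetic, with made-up values:

from datetime import datetime

#same age check as in the example, with hypothetical inputs
tnow = datetime.utcnow()
updated = (2013, 5, 1, 12, 0, 0, 0, 0, 0)  #a feedparser updated_parsed-style tuple (made up)
oldest_article = 7                         #days
delta = tnow - datetime(*(updated[0:6]))
if delta.days * 86400 + delta.seconds > 86400 * oldest_article:
    print('article is too old, skipped')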
Example #4
    def ParseFeedUrls(self):
        """ return list like [(section,title,url,desc),..] """
        urls = []
        tnow = datetime.utcnow()
        urladded = set()
        asyncopener = AsyncURLOpener(self.log)

        for feed in self.feeds:
            section, url = feed[0], feed[1]
            isfulltext = feed[2] if len(feed) > 2 else False
            timeout = self.timeout + 10 if isfulltext else self.timeout
            if USE_ASYNC_URLFETCH_IN_FEEDS:
                asyncopener.fetch(url, timeout, section, isfulltext)
                continue
            opener = URLOpener(self.host, timeout=timeout)
            result = opener.open(url)
            if result.status_code == 200 and result.content:
                if self.feed_encoding:
                    content = result.content.decode(self.feed_encoding)
                else:
                    content = AutoDecoder(True).decode(result.content, url)
                feed = feedparser.parse(content)

                for e in feed['entries'][:self.max_articles_per_feed]:
                    if self.oldest_article > 0 and hasattr(
                            e, 'updated_parsed'):
                        updated = e.updated_parsed
                        if updated:
                            delta = tnow - datetime(*(updated[0:6]))
                            if delta.days * 86400 + delta.seconds > 86400 * self.oldest_article:
                                self.log.debug("article '%s' is too old" %
                                               e.title)
                                continue
                    #support HTTPS
                    urlfeed = e.link.replace(
                        'http://',
                        'https://') if url.startswith('https://') else e.link
                    if urlfeed in urladded:
                        continue

                    desc = None
                    if isfulltext:
                        if hasattr(e, 'content') and e.content[0]['value']:
                            desc = e.content[0]['value']
                        elif hasattr(e, 'description'):
                            desc = e.description
                        else:
                            self.log.warn(
                                'feed item invalid,link to webpage for article.(%s)'
                                % e.title)
                    urls.append((section, e.title, urlfeed, desc))
                    urladded.add(urlfeed)
            else:
                self.log.warn('fetch rss failed(%d):%s' %
                              (result.status_code, url))

        #collect results of the asynchronous downloads
        if USE_ASYNC_URLFETCH_IN_FEEDS:
            for result, url, (section, isfulltext) in asyncopener.get_result():
                if result.status_code == 200 and result.content:
                    if self.feed_encoding:
                        content = result.content.decode(self.feed_encoding)
                    else:
                        content = AutoDecoder(True).decode(result.content, url)
                    feed = feedparser.parse(content)

                    for e in feed['entries'][:self.max_articles_per_feed]:
                        if self.oldest_article > 0 and hasattr(
                                e, 'updated_parsed'):
                            updated = e.updated_parsed
                            if updated:
                                delta = tnow - datetime(*(updated[0:6]))
                                if delta.days * 86400 + delta.seconds > 86400 * self.oldest_article:
                                    self.log.debug("article '%s' is too old" %
                                                   e.title)
                                    continue
                        #support HTTPS
                        urlfeed = e.link.replace('http://',
                                                 'https://') if url.startswith(
                                                     'https://') else e.link
                        if urlfeed in urladded:
                            continue

                        desc = None
                        if isfulltext:
                            if hasattr(e, 'content') and e.content[0]['value']:
                                desc = e.content[0]['value']
                            elif hasattr(e, 'description'):
                                desc = e.description
                            else:
                                self.log.warn(
                                    'feed item invalid,link to webpage for article.(%s)'
                                    % e.title)
                        urls.append((section, e.title, urlfeed, desc))
                        urladded.add(urlfeed)
                else:
                    self.log.warn('async fetch rss failed(%d):%s' %
                                  (result.status_code, url))
        return urls
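
For reference, the list returned by ParseFeedUrls() can be inspected on its own before it is consumed by Items(); a minimal sketch ('book' is an assumed BaseFeedBook-like instance):

#hypothetical inspection of the (section, title, url, desc) tuples from ParseFeedUrls()
urls = book.ParseFeedUrls()
for section, title, url, desc in urls:
    if desc:                                  #full-text item: desc already holds the article HTML
        print('full-text: %s [%s]' % (title, section))
    else:                                     #summary-only item: the article page must be fetched
        print('link-only: %s -> %s' % (title, url))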