Example No. 1
 def _parse_rss(self, urltype):
     """Fetch and parse every RSS feed registered under *urltype*.

     For each URL in the module-level ``urllist[urltype]`` mapping, fetch
     the feed, take the first ``POST_LIMIT`` ``<item>`` elements, and build
     a ``Result`` namedtuple per item with fields: title, description,
     link, content, image, pubDate, src, newstype, urlkey.

     Feeds that time out (``urlfetch.DeadlineExceededError``) are skipped.
     Items without a ``<link>`` are dropped (logged).  Returns the list of
     ``Result`` tuples across all feeds.
     """
     Result = namedtuple('Result', ['title', 'description', 'link',
                                    'content', 'image', 'pubDate',
                                    'src', 'newstype', 'urlkey'])

     def _extract_image(post, xml, desc_html):
         # Image fallback chain, in priority order:
         #   1. first <img src=...> embedded in the description HTML
         #   2. item-level <image> text
         #   3. item-level <ipimage> text
         #   4. channel-level <image><url> (logged, as in the original)
         # Returns '' when nothing matches.
         try:
             # BS3 quirks: .find('img') returns None when absent
             # (None['src'] -> TypeError), and tag['src'] raises KeyError
             # when the attribute is missing.  The original only caught
             # TypeError, so a src-less <img> crashed the parse.
             return BeautifulSoup(
                 desc_html,
                 convertEntities=BeautifulSoup.HTML_ENTITIES).find('img')['src']
         except (TypeError, KeyError):
             pass
         for tagname in ("image", "ipimage"):
             node = post.find(tagname)
             if node is not None:
                 return node.getText()
         try:
             channel_image = xml.find("image").find('url').getText()
         except AttributeError:
             return ''
         # Runtime log string kept byte-identical to the original.
         logging.error('bokllu resim')
         return channel_image

     results = []
     for url in urllist[urltype]:
         try:
             fetched = urlfetch.fetch(url)
         except urlfetch.DeadlineExceededError:
             logging.error("DeadlineExceededError")
             continue  # skip this feed, keep the rest
         xml = BeautifulStoneSoup(
             fetched.content,
             convertEntities=BeautifulSoup.HTML_ENTITIES)
         # Loop-invariant per feed; the original recomputed urlkey per post.
         urlkey = urllist[urltype][url]
         src = srcmap[urlkey]
         for post in xml.findAll("item")[:POST_LIMIT]:
             # NOTE(review): assumes every <item> carries <title>,
             # <description> and <link>; a missing tag would raise
             # AttributeError, as in the original — confirm feeds are sane.
             title = post.find("title").getText()
             _desc = post.find("description").getText()
             link = post.find("link").getText()

             image = _extract_image(post, xml, _desc)
             # Thumbnail URLs use "htufak"; swap for the detail-size image.
             image = image.replace("htufak", "detay")

             # First text node of the description HTML = plain-text summary.
             desc = BeautifulSoup(_desc).find(text=True)
             if not desc:
                 desc = ""
                 logging.info("Empty Desc")

             # Feeds disagree on tag capitalization; try both spellings.
             pub_tag = post.find("pubDate") or post.find("pubdate")
             try:
                 pubDate = parser.parse(pub_tag.getText())
             except (ValueError, AttributeError):
                 # AttributeError: no pubDate tag at all (pub_tag is None).
                 # The original let this propagate and aborted the parse.
                 logging.info("Auto pubDate added")
                 pubDate = datetime.now()

             content = "no content yet"  # add parsing for content later
             if link:
                 results.append(Result(title, desc, link, content, image,
                                       pubDate, src, urltype, urlkey))
             else:
                 logging.info("no link")
     return results