Example #1
 # Needs: import re; from hashlib import md5. guess_baseurl,
 # AddToAnalyzerCandidates, and logger are defined elsewhere in the crawler.
 def accept_url(self, url, tree):
     """Decide whether a crawled URL should be parsed as an entry."""
     if self.pac and isinstance(self.pac, dict):
         # A PAC (page analysis config) was supplied explicitly.
         pac = self.pac
         identifier = self.identifier
     else:
         # Derive a site identifier from the base URL, then build and
         # cache the PAC for that site on first sight.
         baseurl, rssurl, rssbody = guess_baseurl(url, tree)
         identifier = md5(baseurl).hexdigest()
         if identifier not in self.pacs:
             pac = self.get_full_pac(identifier, tree)
             self.pacs[identifier] = pac
         else:
             pac = self.pacs[identifier]
     if not pac:
         # No PAC is available; queue the URL for manual analysis.
         AddToAnalyzerCandidates(identifier, url)
         return None
     if self.rss:
         # URLs discovered through an RSS feed are accepted as-is.
         return True
     try:
         # Otherwise accept only URLs matching the PAC's URL pattern.
         return re.match(pac['url'], url)
     except (KeyError, TypeError, re.error):
         logger.debug('no rss and no usable pac url pattern')
         #AddToAnalyzerCandidates(identifier, url)
         return None
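
Neither example shows what a PAC actually looks like, but its use implies the shape: a plain dict whose 'url' value is a regular expression for entry URLs (accept_url feeds it to re.match), and whose remaining keys, apart from 'url', 'effective_url', and 'type', name the fields that parse() extracts. A minimal sketch under those assumptions; the 'title' and 'content' keys are grounded in the code, but the selector-style rule values are purely illustrative:

import re

# Hypothetical PAC for one site. Only the 'url' regex is consumed by
# accept_url; 'title' and 'content' are field keys that parse() passes
# to self.extract (the rule format shown here is an assumption).
pac = {
    'url': r'^http://example\.com/posts/\d+$',
    'title': 'h1.entry-title',
    'content': 'div.entry-body',
}

print(bool(re.match(pac['url'], 'http://example.com/posts/42')))  # True
print(bool(re.match(pac['url'], 'http://example.com/about')))     # False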
Example #2
 # Needs: from datetime import datetime; from hashlib import md5. guess_baseurl,
 # AddToAnalyzerCandidates, Entry, New, and logger are defined elsewhere.
 def parse(self, page):
     """Extract entry fields from a fetched page and insert or update an Entry."""
     def to_unicode(s):
         # Python 2: normalize byte strings (assumed UTF-8) to unicode.
         return s if isinstance(s, unicode) else s.decode('utf-8')

     if self.pac and isinstance(self.pac, dict):
         pac = self.pac
         identifier = self.identifier
     else:
         baseurl, rssurl, rssbody = guess_baseurl(page.effective_url, page.tree)
         identifier = md5(baseurl).hexdigest()
         pac = self.pacs.get(identifier)
         # A cached PAC may still be raw text; parse it on first use.
         if isinstance(pac, unicode):
             pac = self.parse_pac(pac, page.tree)
     if not pac:
         AddToAnalyzerCandidates(identifier, page.effective_url)
         return
     logger.debug('parse: %s, %s' % (page.name, page.effective_url))
     data = {}
     rss_entry = None
     if self.rss:
         rss_entry = self.extract_entry_from_rss(page.effective_url)
     for k in pac:
         if k not in ('url', 'effective_url', 'type'):
             if rss_entry and k == 'title':
                 # Prefer the RSS title over the scraped one.
                 data[k] = rss_entry.title
                 continue
             data[k] = self.extract(page, k, pac)
     if not data.get('title'):
         try:
             # Fall back to the page's HTML <title>.
             data['title'] = page.tree.title.string
         except AttributeError:
             pass
     # Use .get() below: the PAC may define neither 'title' nor 'content',
     # and plain indexing would raise KeyError.
     if not (data.get('title') and data.get('content')):
         AddToAnalyzerCandidates(identifier, page.effective_url)
     if (not data.get('content') or len(data['content']) < 50) and rss_entry:
         # Content is missing or suspiciously short; use the RSS body instead.
         data['content'] = rss_entry.description
     if not data.get('title') or not data.get('content'):
         logger.debug('Ignore item because of no title or content')
         return
     data['url'] = page.effective_url
     data['identifier'] = identifier
     if rss_entry:
         try:
             data['published_at'] = datetime(*rss_entry.updated_parsed[:6])
         except (AttributeError, TypeError):
             pass
         try:
             data['author'] = rss_entry.author
         except AttributeError:
             pass
     url_hash = md5(data['url']).hexdigest()
     entry = Entry().one({'url_hash': url_hash})
     if not entry:
         entry = New(Entry())
         entry.url = to_unicode(data['url'])
         entry.url_hash = to_unicode(url_hash)
         entry.identifier = to_unicode(identifier)
     entry.title = to_unicode(data['title'])
     entry.content = to_unicode(data['content'])
     if data.get('author'):
         entry.author = to_unicode(data['author'])
     if data.get('published_at'):
         entry.published_at = data['published_at']
     entry.updated_at = datetime.utcnow()
     entry.save()
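
Taken together, the two methods form a filter-then-extract pipeline: accept_url gates each discovered link against the site's PAC, and parse turns an accepted page into a saved Entry. A rough sketch of a fetch loop that could drive them; fetch, Page, and the crawler instance are stand-ins, since the surrounding crawler class is not shown in the examples:

# Hypothetical driver; fetch() is assumed to return a page object exposing
# .effective_url, .tree (parsed HTML), and .name, as used by parse() above.
for url in discovered_urls:
    page = fetch(url)
    if not crawler.accept_url(page.effective_url, page.tree):
        continue  # the URL does not match this site's PAC pattern
    crawler.parse(page)  # extract fields, then insert or update the Entry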