def accept_url(self, url, tree):
    # Resolve the PAC (parsing config) for this site: either the one bound
    # to this parser instance, or a cached/freshly built one keyed by the
    # MD5 of the site's base URL.
    if self.pac and isinstance(self.pac, dict):
        pac = self.pac
        identifier = self.identifier
    else:
        baseurl, rssurl, rssbody = guess_baseurl(url, tree)
        identifier = md5(baseurl).hexdigest()
        if identifier not in self.pacs:
            pac = self.get_full_pac(identifier, tree)
            self.pacs[identifier] = pac
        else:
            pac = self.pacs[identifier]
    if not pac:
        # No PAC yet: queue the site for the analyzer and reject the URL.
        AddToAnalyzerCandidates(identifier, url)
        return None
    if self.rss:
        # URLs coming from an RSS feed are accepted as-is.
        return True
    try:
        # The PAC's 'url' field is a regex that article URLs must match.
        return re.match(pac['url'], url)
    except Exception:
        logger.debug('without rss and pac url')
        # AddToAnalyzerCandidates(identifier, url)
        return None
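# --- Hedged usage sketch, not part of the original module ---
# Shows the PAC URL gate from accept_url() in isolation: the PAC's 'url'
# field is treated as a regex that article URLs must match. The PAC dict,
# the URLs, and pac_accepts() below are made-up illustrations, not real
# configs or helpers from this project.
import re

example_pac = {'url': r'https?://example\.com/article/\d+'}

def pac_accepts(pac, url):
    # Mirrors the `re.match(pac['url'], url)` check in accept_url().
    return bool(re.match(pac['url'], url))

assert pac_accepts(example_pac, 'http://example.com/article/123')
assert not pac_accepts(example_pac, 'http://example.com/about')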
def parse(self, page):
    # Resolve the PAC for this page, mirroring accept_url(); here a cache
    # miss yields None instead of building a new PAC.
    if self.pac and isinstance(self.pac, dict):
        pac = self.pac
        identifier = self.identifier
    else:
        baseurl, rssurl, rssbody = guess_baseurl(page.effective_url, page.tree)
        identifier = md5(baseurl).hexdigest()
        pac = self.pacs.get(identifier)  # replaces deprecated dict.has_key()
    if isinstance(pac, unicode):
        # PAC cached as raw text: compile it against the page tree first.
        pac = self.parse_pac(pac, page.tree)
    if not pac:
        AddToAnalyzerCandidates(identifier, page.effective_url)
        return

    logger.debug('parse: %s, %s' % (page.name, page.effective_url))
    data = {}
    rss_entry = None
    if self.rss:
        rss_entry = self.extract_entry_from_rss(page.effective_url)

    # Extract every field the PAC describes, preferring the RSS title
    # when an RSS entry is available.
    for k in pac:
        if k not in ('url', 'effective_url', 'type'):
            if rss_entry and k == 'title':
                data[k] = rss_entry.title
                continue
            data[k] = self.extract(page, k, pac)

    # Fall back to the page's <title> tag when the PAC yields no title.
    if not data.get('title'):
        try:
            data['title'] = page.tree.title.string
        except AttributeError:
            pass

    if not (data.get('title') and data.get('content')):
        AddToAnalyzerCandidates(identifier, page.effective_url)
    # Empty or very short content: fall back to the RSS description.
    if (not data.get('content') or len(data['content']) < 50) and rss_entry:
        data['content'] = rss_entry.description
    if not data.get('title') or not data.get('content'):
        logger.debug('Ignore item because of no title or content')
        return

    data['url'] = page.effective_url
    data['identifier'] = identifier
    if rss_entry:
        try:
            data['published_at'] = datetime(*rss_entry.updated_parsed[:6])
        except (TypeError, AttributeError):
            pass
        try:
            data['author'] = rss_entry.author
        except AttributeError:
            pass

    # Upsert the entry, keyed by the MD5 of its URL; coerce byte strings
    # to unicode before saving (Python 2).
    url_hash = md5(data['url']).hexdigest()
    entry = Entry().one({'url_hash': url_hash})
    if not entry:
        entry = New(Entry())
    entry.url = data['url'] if isinstance(data['url'], unicode) else data['url'].decode('utf-8')
    entry.url_hash = url_hash if isinstance(url_hash, unicode) else url_hash.decode('utf-8')
    entry.identifier = identifier if isinstance(identifier, unicode) else identifier.decode('utf-8')
    entry.title = data['title'] if isinstance(data['title'], unicode) else data['title'].decode('utf-8')
    entry.content = data['content'] if isinstance(data['content'], unicode) else data['content'].decode('utf-8')
    if data.get('author'):
        entry.author = data['author'] if isinstance(data['author'], unicode) else data['author'].decode('utf-8')
    if data.get('published_at'):
        entry.published_at = data['published_at']
    entry.updated_at = datetime.utcnow()
    entry.save()
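# --- Hedged sketch, not part of the original module ---
# Isolates two idioms parse() relies on: the MD5-of-URL dedup key used to
# upsert entries, and the Python 2 byte-string-to-unicode coercion applied
# to every Entry field. `to_unicode` and the sample values are illustrative
# only; the original inlines this coercion per field.
from hashlib import md5

def to_unicode(value):
    # Leave unicode untouched; decode UTF-8 byte strings (Python 2).
    return value if isinstance(value, unicode) else value.decode('utf-8')

url = 'http://example.com/article/123'
url_hash = md5(url).hexdigest()  # same dedup key parse() looks Entry up by
title = to_unicode('\xe6\xa0\x87\xe9\xa2\x98')  # UTF-8 bytes -> u'\u6807\u9898'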