        # tag listing pages: collect pagination links and the books listed under the tag
        if 'tag' in root:
            related = []
            # links to further pages of the same tag (pattern defined on the Douban class)
            for x in re.findall(Douban.tag_from_tag_anchor, content):
                url = x.encode('utf8')
                title = root.split('/')[-2]           # the tag name, taken from the current URL
                url = root.partition('?')[0] + url    # absolute URL of the next listing page
                data = {'url': url, 'title': title}
                yield ('save', data['url'], json.dumps(data, ensure_ascii=False))
                yield ('static', url, 2)
            # book entries listed on this tag page (pattern defined on the Douban class)
            for x in re.findall(Douban.book_from_tag_anchor, content):
                url, title = x
                title = title.strip()
                if not title:
                    continue
                related.append([url.encode('utf8'), title.encode('utf8')])
                #print(url, title)
                data = {'url': url.encode('utf8'), 'title': title.encode('utf8')}
                yield ('save', data['url'], json.dumps(data, ensure_ascii=False))
                yield ('static', url, 2)
            if related:
                # store the tag -> books mapping under a separate key
                data = {'url': root, 'related': related}
                yield ('save', 'parsed:' + data['url'], json.dumps(data, ensure_ascii=False))
            return
        if False:
            yield None    # never executed; keeps the method a generator in every code path


if __name__ == '__main__':
    papa.quickstart(Douban(), 'douban')
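# NOTE: Douban.tag_from_tag_anchor and Douban.book_from_tag_anchor are class-level
# regexes defined in the part of the file not shown above. Judging only from how the
# re.findall() results are consumed (a single string that is appended to the tag page's
# base URL, and a (url, title) pair per book), they need roughly the shapes sketched
# below. These patterns are hypothetical placeholders, not the originals.
import re

# each match: one string, e.g. a pagination query such as '?start=20&type=T'
tag_from_tag_anchor = re.compile(r'href="(\?start=\d+[^"]*)"')

# each match: an (url, title) pair for a book listed under the tag
book_from_tag_anchor = re.compile(
    r'<a href="(https?://book\.douban\.com/subject/\d+/)"[^>]*>\s*([^<]+?)\s*</a>')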
#!/usr/bin/env python2
#coding:utf8
from __future__ import print_function
import papa
import json
import datetime


class BaiduNews():
    def gen_seeds(self):
        yield ('sleep', 300)
        yield ('forget',)
        yield ('dynamic', 'http://news.baidu.com/', 1)

    def parse(self, url, content, tree):
        for e in tree.xpath('//a[@href]'):
            url = e.get('href')
            url = url.partition('?')[0]     # strip query string
            url = url.partition('#')[0]     # strip fragment
            # keep links that look like headlines: non-trivial anchor text and an http URL
            if e.text and len(e.text) > 10 and url.startswith('http://'):
                data = {'url': url.encode('utf8'),
                        'title': e.text.encode('utf8'),
                        'extract_date': str(datetime.date.today())}
                yield ('save', url, json.dumps(data, ensure_ascii=False))


if __name__ == '__main__':
    papa.quickstart(BaiduNews(), 'baidu_news')
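# A quick way to exercise BaiduNews.parse outside the papa scheduler, e.g. while
# adjusting the XPath or the headline filter. Two assumptions that are not stated
# above: the file is saved as baidu_news.py (hypothetical name), and the `tree`
# argument papa passes to parse() is an lxml document, which is what the
# tree.xpath()/e.get()/e.text usage suggests.
import lxml.html
from baidu_news import BaiduNews

html = '<div><a href="http://example.com/story?from=feed">A headline longer than ten characters</a></div>'
for action in BaiduNews().parse('http://news.baidu.com/', html, lxml.html.fromstring(html)):
    print(action)   # expected: one ('save', 'http://example.com/story', '{...}') tuple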
#coding:utf8
# file head (imports, class declaration) reconstructed to match the other crawlers in this repo
from __future__ import print_function
import papa
import json
import datetime


class NetEase():
    def gen_seeds(self):
        yield ('sleep', 300)     # rest for 300 seconds
        yield ('forget',)        # forget all the dynamic
        yield ('dynamic', 'http://news.163.com/', 2)

    def parse(self, url, content, tree):
        print(url)
        try:
            for e in tree.xpath('//a[@href]'):
                url = e.get('href')
                if not url.startswith('http://news.163.com/'):
                    continue
                url = url.partition('?')[0]
                url = url.partition('#')[0]
                if 'photo' in url:
                    continue
                if url.endswith('.html') and e.text is not None:
                    # article page: save the headline, then crawl the article itself
                    data = {'url': url.encode('utf8'),
                            'title': e.text.encode('utf8'),
                            'extract_date': str(datetime.date.today())}
                    yield ('save', url, json.dumps(data, ensure_ascii=False))
                    if 'editor' in url:
                        continue
                    yield ('static', url, 2)
                else:
                    # anything else is treated as a listing page and re-crawled dynamically
                    yield ('dynamic', url)
        except:
            return    # swallow parse errors on malformed pages


if __name__ == '__main__':
    papa.quickstart(NetEase(), 'netease_news')
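# The three crawlers above share one contract, which appears to be what
# papa.quickstart() drives: a class exposing gen_seeds() and parse(url, content, tree),
# both written as generators that yield action tuples. The skeleton below only
# restates that observed shape as a starting point for a new crawler; the action
# names are the ones seen in these examples, not taken from papa's documentation.
from __future__ import print_function
import json
import datetime
import papa


class MyCrawler():
    def gen_seeds(self):
        # scheduling actions used above:
        #   ('sleep', seconds)       rest between crawl rounds
        #   ('forget',)              forget previously queued dynamic URLs
        #   ('dynamic', url, depth)  used for the frequently changing list pages
        #   ('static', url, depth)   used for article/book pages fetched once
        yield ('dynamic', 'http://example.com/', 1)

    def parse(self, url, content, tree):
        # extraction actions used above:
        #   ('save', key, value)              store an extracted record
        #   ('static', ...)/('dynamic', ...)  enqueue links found on the page
        for e in tree.xpath('//a[@href]'):
            link = e.get('href')
            if not link.startswith('http://'):
                continue
            data = {'url': link.encode('utf8'),
                    'extract_date': str(datetime.date.today())}
            yield ('save', link, json.dumps(data, ensure_ascii=False))


if __name__ == '__main__':
    papa.quickstart(MyCrawler(), 'my_crawler')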