Exemple #1
0
        # NOTE(review): tail of a generator method — the enclosing `def` line
        # is outside this excerpt; `root`, `content`, `re`, `json` and the
        # Douban class attributes are bound elsewhere. TODO confirm upstream.
        if 'tag' in root:
            related = []
            # Tag-index page: emit every linked tag page as a crawl target.
            # `Douban.tag_from_tag_anchor` is presumably a regex over anchor
            # hrefs — verify against the class definition.
            for x in re.findall(Douban.tag_from_tag_anchor, content):
                url = x.encode('utf8')
                # Tag title is taken from the current page URL's path segment.
                title = root.split('/')[-2]
                # Re-anchor the relative href onto the query-stripped page URL.
                url = root.partition('?')[0] + url
                data = {'url':url, 'title':title}
                yield ('save', data['url'], json.dumps(data, ensure_ascii = False))
                # Schedule the tag page as a static fetch at depth 2.
                yield ('static', url, 2)
            # Book anchors on the tag page: save each one and remember it so
            # the page-level summary below can list all related books.
            for x in re.findall(Douban.book_from_tag_anchor, content):
                url, title = x
                title = title.strip()
                if not title : continue
                related.append([url.encode('utf8'), title.encode('utf8')])
                #print(url, title)
                data = {'url':url.encode('utf8'), 'title':title.encode('utf8')}
                yield ('save', data['url'], json.dumps(data, ensure_ascii = False))
                yield ('static', url, 2)
            if related :
                # Summary record keyed 'parsed:<page-url>' listing every book
                # found on this tag page.
                data = {'url':root, 'related':related}
                yield ('save', 'parsed:'+data['url'], json.dumps(data, ensure_ascii = False))
            return

        # Dead branch: never executes; keeps this function a generator even
        # when the branch above is skipped.
        if False:
            yield None

if __name__ == '__main__':
    # Entry point: hand the spider instance to the papa framework,
    # presumably registering it under the job name 'douban' — confirm.
    papa.quickstart(Douban(), 'douban')
Exemple #2
0
                # NOTE(review): fragment begins mid-loop — `url`, `title`,
                # `related`, `root` and `content` are bound earlier in the
                # enclosing (unseen) method. TODO confirm against full file.
                data = {'url': url, 'title': title}
                yield ('save', data['url'], json.dumps(data,
                                                       ensure_ascii=False))
                # Schedule the tag page as a static fetch at depth 2.
                yield ('static', url, 2)
            # Book anchors on the tag page: save each one and remember it so
            # the page-level summary below can list all related books.
            for x in re.findall(Douban.book_from_tag_anchor, content):
                url, title = x
                title = title.strip()
                if not title: continue
                related.append([url.encode('utf8'), title.encode('utf8')])
                #print(url, title)
                data = {
                    'url': url.encode('utf8'),
                    'title': title.encode('utf8')
                }
                yield ('save', data['url'], json.dumps(data,
                                                       ensure_ascii=False))
                yield ('static', url, 2)
            if related:
                # Summary record keyed 'parsed:<page-url>' listing every book
                # found on this tag page.
                data = {'url': root, 'related': related}
                yield ('save', 'parsed:' + data['url'],
                       json.dumps(data, ensure_ascii=False))
            return

        # Dead branch: never executes; keeps this function a generator even
        # when the branch above is skipped.
        if False:
            yield None


if __name__ == '__main__':
    # Entry point: hand the spider instance to the papa framework,
    # presumably registering it under the job name 'douban' — confirm.
    papa.quickstart(Douban(), 'douban')
Exemple #3
0
#coding:utf8
from __future__ import print_function
import papa
import json
import datetime


class BaiduNews():
    """Spider definition for the Baidu news portal (papa framework)."""

    def gen_seeds(self):
        """Yield the crawl schedule: pause, reset dynamic state, seed URL."""
        for directive in (
            ('sleep', 300),
            ('forget', ),
            ('dynamic', 'http://news.baidu.com/', 1),
        ):
            yield directive

    def parse(self, url, content, tree):
        """Scan every <a href> in the parsed tree and yield save records
        for absolute http links whose anchor text is longer than 10 chars."""
        for anchor in tree.xpath('//a[@href]'):
            # Strip the query string and fragment from the link target.
            href = anchor.get('href').partition('?')[0].partition('#')[0]

            # Guard clauses: require a sufficiently long title and an
            # absolute http URL before emitting a record.
            if not anchor.text or len(anchor.text) <= 10:
                continue
            if not href.startswith('http://'):
                continue

            record = {
                'url': href.encode('utf8'),
                'title': anchor.text.encode('utf8'),
                'extract_date': str(datetime.date.today())
            }
            yield ('save', href, json.dumps(record, ensure_ascii=False))


if __name__ == '__main__':
    # Entry point: hand the spider instance to the papa framework,
    # presumably registering it under the job name 'baidu_news' — confirm.
    papa.quickstart(BaiduNews(), 'baidu_news')
Exemple #4
0
#!/usr/bin/env python2
#coding:utf8
from __future__ import print_function
import papa
import json
import datetime

class BaiduNews():
    """Spider for Baidu News, plugged into the papa crawl framework."""

    def gen_seeds(self):
        """Emit the startup directives for the scheduler."""
        yield ('sleep', 300)        # rest between crawl rounds
        yield ('forget',)           # drop previously seen dynamic URLs
        yield ('dynamic', 'http://news.baidu.com/', 1)

    def parse(self, url, content, tree):
        """Walk all <a href> nodes and save headline links as JSON records."""
        for node in tree.xpath('//a[@href]'):
            target = node.get('href')
            # Discard everything after '?' and '#'.
            target = target.split('?', 1)[0]
            target = target.split('#', 1)[0]

            title = node.text
            if title and len(title) > 10 and target.startswith('http://'):
                payload = {'url': target.encode('utf8'),
                           'title': title.encode('utf8'),
                           'extract_date': str(datetime.date.today())}
                yield ('save', target, json.dumps(payload, ensure_ascii=False))

if __name__ == '__main__':
    # Entry point: hand the spider instance to the papa framework,
    # presumably registering it under the job name 'baidu_news' — confirm.
    papa.quickstart(BaiduNews(), 'baidu_news')
Exemple #5
0
    def gen_seeds(self):
        """Seed generator: rest, clear the dynamic queue, enqueue the portal."""
        directives = (
            ('sleep', 300),                         # rest for 300 seconds
            ('forget',),                            # forget all the dynamic URLs
            ('dynamic', 'http://news.163.com/', 2), # seed page, depth 2
        )
        for directive in directives:
            yield directive

    def parse(self, url, content, tree):
        """Extract article links from a news.163.com page.

        Yields ('save', url, json_record) for article pages (*.html with
        anchor text), ('static', url, 2) to schedule the article fetch, and
        ('dynamic', url) for non-article (index) pages. Parsing is
        best-effort: any parsing error quietly ends the generator.
        """
        print(url)
        try:
            for e in tree.xpath('//a[@href]'):
                url = e.get('href')
                # Only follow links that stay on the news site.
                if not url.startswith('http://news.163.com/'):
                    continue
                # Strip the query string and fragment.
                url = url.partition('?')[0]
                url = url.partition('#')[0]

                if 'photo' in url:
                    continue  # skip photo-gallery pages
                if url.endswith('.html') and e.text is not None:
                    data = {'url': url.encode('utf8'),
                            'title': e.text.encode('utf8'),
                            'extract_date': str(datetime.date.today())}
                    yield ('save', url, json.dumps(data, ensure_ascii=False))

                    if 'editor' in url:
                        continue  # editor pages: saved but not refetched
                    yield ('static', url, 2)
                else:
                    # Non-article page: schedule a dynamic (re-crawled) fetch.
                    yield ('dynamic', url)
        except Exception:
            # Best-effort parse. Narrowed from a bare `except:` so that
            # SystemExit / KeyboardInterrupt are no longer swallowed;
            # malformed pages still just end the generator silently.
            return

if __name__ == '__main__':
    # Entry point: hand the spider instance to the papa framework,
    # presumably registering it under the job name 'netease_news' — confirm.
    papa.quickstart(NetEase(), 'netease_news')