Example #1
class SpiderMan:
    def __init__(self):
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        content = self.downloader.download(root_url)
        urls = self.parser.parser_url(root_url, content)
        # build the request URL that fetches the rating and box-office data for each movie
        for url in urls:
            try:
                t = time.strftime('%Y%m%d%H%M%S3282', time.localtime())
                rank_url = ('http://service.library.mtime.com/Movie.api'
                            '?Ajax_CallBack=true'
                            '&Ajax_CallBackType=Mtime.Library.Service'
                            '&Ajax_CallBackMethod=GetMovieOverviewRating'
                            '&Ajax_CrossDomain=1'
                            '&Ajax_RequestUrl=%s'
                            '&t=%s'
                            '&Ajax_CallBackArgument0=%s' % (url[0], t, url[1]))
                rank_content = self.downloader.download(rank_url)
                data = self.parser.parser_json(rank_url, rank_content)
                self.output.store_data(data)
            except Exception as e:
                print(e, 'Crawl failed')
        self.output.output_end()
        print('Crawl finished')
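A minimal driver sketch for this example (an assumption, not part of the snippet above): it presumes time, HtmlDownloader, HtmlParser and DataOutput are importable in the same module, and uses a hypothetical Mtime listing page as the root URL.

if __name__ == '__main__':
    spider = SpiderMan()
    # hypothetical root URL: any Mtime listing page whose movie links parser_url() can extract
    spider.crawl('http://theater.mtime.com/China_Beijing/')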
Example #2
def getPOI2(q,region):
    q = q.encode('utf-8')
    region = region.encode('utf-8')
    L = []
    ak = u'BjZFyCBFktfZmdj7SVP98fEFx78KzFn4'
    #ak=u'skS8wg9wP1VVFk2iuDuQATzoWKMb8FuY'
    #ak=u'AKsr88dgGDK8d74q7wTRbhiSb567HVmA'
    q = urllib2.quote(q)
    region = urllib2.quote(region)
    #hd = HtmlDownloader.HtmlDownloader()
    #http://api.map.baidu.com/place/v2/search?query=购物中心&region=天津&city_limit=true&output=json&ak=BjZFyCBFktfZmdj7SVP98fEFx78KzFn4&page_num=0
    baseUrl = 'http://api.map.baidu.com/place/v2/search?query=%s&region=%s&city_limit=true&output=json&ak=%s&page_num=' %(q,region,ak)
    page = 0
    total= 1
    while page*10 < total:
        url = baseUrl+unicode(str(page),'utf-8')
        print url
        res = HtmlDownloader.download(url)
        while(res is None):
            res = HtmlDownloader.download(url)
            print "retrying...",url
        res = unicode(res,'utf-8')
        data = JsonUtils.readStr(res)
        status = data[u'status']
        message = data[u'message']
        if status==0 and message=='ok':
            # success: collect this page of results and advance to the next page
            L.extend(data[u'results'])
            total = data[u'total']
            page = page + 1
        else:
            # failure: report the error message and return what has been collected so far
            print u"query failed:", message
            return L
    return L
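A hypothetical call, assuming the module-level imports this function relies on (urllib2, HtmlDownloader, JsonUtils) are in place and the hard-coded ak is still accepted by the Baidu Place API; the query and region mirror the sample URL in the comment above.

pois = getPOI2(u'购物中心', u'天津')
print('fetched %d POIs' % len(pois))
for poi in pois[:3]:
    # each result is a dict returned by the API; a 'name' field is assumed
    print(poi[u'name'])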
Example #3
class SpiderMan(object):
    """docstring for SpiderMan"""
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # seed the URL manager with the entry URL
        self.manager.add_new_url(root_url)
        # keep crawling while the URL manager holds new URLs and fewer than 100 have been fetched
        while (self.manager.has_new_url()
               and self.manager.old_url_size() < 100):
            try:
                # take the next new URL from the URL manager
                new_url = self.manager.get_new_url()
                # download the page with the HTML downloader
                html = self.downloader.download(new_url)
                # extract new links and page data with the HTML parser
                new_urls, data = self.parser.parser(new_url, html)
                # feed the extracted URLs back into the URL manager
                self.manager.add_new_urls(new_urls)
                # hand the extracted data to the data store
                self.output.store_data(data)
                print("crawled %s links so far" % self.manager.old_url_size())
            except Exception as e:
                print("crawl failed:", e)
        # write the collected data out in the target format
        self.output.output_html()
Example #4
def crawl():
    try:
        global count, mutex
        # guard the shared counter and URL manager while picking the next URL;
        # the context manager also releases the lock if an exception is raised
        with mutex:
            count += 1
            new_url = url.get_new_url()
            print('crawling link #' + str(count) + ': ' + new_url)
        html = downloader.download(new_url)
        url_list = parser.parser(html)
        url.add_new_urls(url_list)
    except Exception as e:
        print('unexpected exception:', e)
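A threaded driver sketch for the function above. The globals it relies on (count, mutex, url, downloader, parser) are not shown in the snippet, so the setup below is an assumption that reuses the UrlManager/HtmlDownloader/HtmlParser names from the other examples.

import threading

count = 0
mutex = threading.Lock()
url = UrlManager()             # assumed URL manager with get_new_url()/add_new_urls()
downloader = HtmlDownloader()  # assumed downloader
parser = HtmlParser()          # assumed parser

def worker(n_pages=25):
    # each crawl() call handles a single URL, so loop a fixed number of times
    for _ in range(n_pages):
        crawl()

threads = [threading.Thread(target=worker) for _ in range(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()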
Example #5
class SpiderWork:
    def __init__(self):
        # set up the worker node's connection in the distributed crawl
        # step 1: register the method names used to fetch the queues via BaseManager
        BaseManager.register('get_task_queue')
        BaseManager.register('get_result_queue')
        # step 2: connect to the control-node server
        server_addr = '127.0.0.1'
        print('Connect to server %s...' % server_addr)
        # the port and authkey must match the server process exactly
        self.m = BaseManager(address=(server_addr, 8001), authkey=b'baike')
        # establish the network connection
        self.m.connect()
        # step 3: obtain the Queue proxies
        self.task = self.m.get_task_queue()
        self.result = self.m.get_result_queue()
        # initialise the HTML downloader and parser
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        print('init finish.')

    def crawl(self):
        while True:
            try:
                if not self.task.empty():
                    url = self.task.get()
                    if url == 'end':
                        print('The control node told this worker to stop...')
                        # pass the stop signal on so the other nodes shut down too
                        self.result.put({'new_urls': 'end', 'data': 'end'})
                        return
                    print('Worker node is parsing: %s' % url)
                    content = self.downloader.download(url)
                    new_urls, data = self.parser.parser(url, content)
                    self.result.put({'new_urls': new_urls, 'data': data})
            except EOFError:
                print('Connection to the control node was lost')
                return
            except Exception as e:
                print(e)
                print('Crawl failed.')
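A sketch of the control-node side this worker expects, based only on the address, port, authkey and the two registered queue names above; everything else (the seed URL, the fork-based start method) is an assumption.

from multiprocessing.managers import BaseManager
import queue

task_q = queue.Queue()
result_q = queue.Queue()

# expose the two queues under the names SpiderWork registers on its side
BaseManager.register('get_task_queue', callable=lambda: task_q)
BaseManager.register('get_result_queue', callable=lambda: result_q)

manager = BaseManager(address=('127.0.0.1', 8001), authkey=b'baike')
manager.start()  # assumes a fork-based start method; otherwise serve via get_server().serve_forever()

task = manager.get_task_queue()
task.put('http://example.com/seed-page')  # hypothetical seed URL
task.put('end')                           # sentinel that tells workers to stop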
Example #6
class SpiderMan(object):
    def __init__(self):
        self.manager = URLManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        self.manager.add_new_url(root_url)
        while (self.manager.has_new_url()
               and self.manager.old_url_size() < 100):
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print('crawled {} links so far'.format(self.manager.old_url_size()))
            except Exception as e:
                print('crawl failed:', e)
        self.output.output_html()