import time

class SpiderMan:
    def __init__(self):
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        content = self.downloader.download(root_url)
        urls = self.parser.parser_url(root_url, content)
        # Build the API link that fetches rating and box-office data for each movie
        for url in urls:
            try:
                t = time.strftime('%Y%m%d%H%M%S3282', time.localtime())
                rank_url = ('http://service.library.mtime.com/Movie.api'
                            '?Ajax_CallBack=true'
                            '&Ajax_CallBackType=Mtime.Library.Service'
                            '&Ajax_CallBackMethod=GetMovieOverviewRating'
                            '&Ajax_CrossDomain=1'
                            '&Ajax_RequestUrl=%s'
                            '&t=%s'
                            '&Ajax_CallBackArgument0=%s' % (url[0], t, url[1]))
                rank_content = self.downloader.download(rank_url)
                data = self.parser.parser_json(rank_url, rank_content)
                self.output.store_data(data)
            except Exception as e:
                print(e, 'Crawl failed')
        self.output.output_end()
        print('Crawl finish')
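# The snippets in this section all rely on an HtmlDownloader exposing a
# single download(url) method (instantiated as a class here, called as a
# module function in the POI snippet below). Its implementation is not
# shown in this section; a minimal sketch, assuming a requests-based
# fetch, might look like this:
import requests

class HtmlDownloader:
    def download(self, url):
        """Fetch url and return the response body as text, or None on failure."""
        if url is None:
            return None
        headers = {'User-Agent': 'Mozilla/5.0'}
        try:
            r = requests.get(url, headers=headers, timeout=10)
        except requests.RequestException:
            return None
        if r.status_code == 200:
            r.encoding = 'utf-8'
            return r.text
        return None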
# -*- coding: utf-8 -*-
# Python 2 code: urllib2, unicode strings, print statements
import urllib2

def getPOI2(q, region):
    q = q.encode('utf-8')
    region = region.encode('utf-8')
    L = []
    ak = u'BjZFyCBFktfZmdj7SVP98fEFx78KzFn4'
    # ak = u'skS8wg9wP1VVFk2iuDuQATzoWKMb8FuY'
    # ak = u'AKsr88dgGDK8d74q7wTRbhiSb567HVmA'
    q = urllib2.quote(q)
    region = urllib2.quote(region)
    # hd = HtmlDownloader.HtmlDownloader()
    # http://api.map.baidu.com/place/v2/search?query=购物中心&region=天津&city_limit=true&output=json&ak=BjZFyCBFktfZmdj7SVP98fEFx78KzFn4&page_num=0
    baseUrl = ('http://api.map.baidu.com/place/v2/search'
               '?query=%s&region=%s&city_limit=true&output=json&ak=%s&page_num='
               % (q, region, ak))
    page = 0
    total = 1
    while page * 10 < total:  # the Place API returns 10 results per page by default
        url = baseUrl + unicode(str(page), 'utf-8')
        print url
        res = HtmlDownloader.download(url)
        while res is None:
            res = HtmlDownloader.download(url)
            print "retrying...", url
        res = unicode(res, 'utf-8')
        data = JsonUtils.readStr(res)
        status = data[u'status']
        message = data[u'message']
        if status == 0 and message == 'ok':  # request succeeded
            L.extend(data[u'results'])
            total = data[u'total']
            page = page + 1
        else:  # request failed
            print u"query failed", message
            return L
    return L
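# Hypothetical usage (HtmlDownloader and JsonUtils are the project's own
# helper modules, not shown in this section; the query and region values
# are the ones from the commented example URL above):
if __name__ == '__main__':
    pois = getPOI2(u'购物中心', u'天津')
    print u'%d POIs fetched' % len(pois)
    for poi in pois[:5]:
        print poi[u'name'], poi.get(u'address', u'')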
class SpiderMan(object):
    """Coordinator for the basic crawler."""
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Seed the URL manager with the entry URL
        self.manager.add_new_url(root_url)
        # Keep going while the URL manager has unseen URLs and
        # fewer than 100 URLs have been crawled
        while (self.manager.has_new_url() and
               self.manager.old_url_size() < 100):
            try:
                # Take a fresh URL from the URL manager
                new_url = self.manager.get_new_url()
                # Download the page with the HTML downloader
                html = self.downloader.download(new_url)
                # Extract links and data with the HTML parser
                new_urls, data = self.parser.parser(new_url, html)
                # Feed the extracted URLs back into the URL manager
                self.manager.add_new_urls(new_urls)
                # Persist the extracted data
                self.output.store_data(data)
                print("%s links crawled so far" % self.manager.old_url_size())
            except Exception as e:
                print("crawl failed:", e)
        # Write the stored data out in the target format
        self.output.output_html()
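# A minimal sketch of the UrlManager interface the crawler calls
# (add_new_url, add_new_urls, has_new_url, get_new_url, old_url_size),
# assuming the usual two-set design; the original implementation is not
# shown in this section:
class UrlManager:
    def __init__(self):
        self.new_urls = set()  # URLs waiting to be crawled
        self.old_urls = set()  # URLs already crawled

    def has_new_url(self):
        return len(self.new_urls) != 0

    def get_new_url(self):
        # Move one URL from the unvisited set to the visited set
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url

    def add_new_url(self, url):
        if url is not None and url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        for url in (urls or []):
            self.add_new_url(url)

    def old_url_size(self):
        return len(self.old_urls)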
def crawl():
    global count
    try:
        # Take a URL under the lock so count and the URL manager stay consistent;
        # the context manager releases the lock even if an exception is raised
        with mutex:
            count += 1
            new_url = url.get_new_url()
            print('Crawling #' + str(count) + ': ' + new_url)
        html = downloader.download(new_url)
        url_list = parser.parser(html)
        url.add_new_urls(url_list)
    except Exception as e:
        print('Unexpected error:', e)
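# A hedged sketch of driving crawl() from several threads. It assumes the
# module-level url, downloader and parser objects used above are already
# constructed, and that url exposes a has_new_url() check (an assumption;
# that setup code is not shown in this section):
import threading

count = 0
mutex = threading.Lock()

while url.has_new_url():
    # each crawl() call processes exactly one URL, so spawn a batch per round
    threads = [threading.Thread(target=crawl) for _ in range(4)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()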
from multiprocessing.managers import BaseManager
import time

class SpiderWork:
    def __init__(self):
        # Set up this worker node's connection to the distributed scheduler.
        # Step 1: register the method names used to fetch the queues
        BaseManager.register('get_task_queue')
        BaseManager.register('get_result_queue')
        # Step 2: connect to the server
        server_addr = '127.0.0.1'
        print('Connect to server %s...' % server_addr)
        # The port and authkey must match the server process exactly
        self.m = BaseManager(address=(server_addr, 8001), authkey=b'baike')
        # Connect over the network
        self.m.connect()
        # Step 3: obtain the Queue proxies
        self.task = self.m.get_task_queue()
        self.result = self.m.get_result_queue()
        # Initialize the downloader and parser
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        print('init finish.')

    def crawl(self):
        while True:
            try:
                if not self.task.empty():
                    url = self.task.get()
                    if url == 'end':
                        print('The control node asked this worker to stop...')
                        # Pass the stop signal on to the other nodes
                        self.result.put({'new_urls': 'end', 'data': 'end'})
                        return
                    print('Worker node is parsing: %s' % url)
                    content = self.downloader.download(url)
                    new_urls, data = self.parser.parser(url, content)
                    self.result.put({'new_urls': new_urls, 'data': data})
                else:
                    time.sleep(0.1)  # avoid busy-waiting on an empty queue
            except EOFError:
                print('Lost connection to the control node')
                return
            except Exception as e:
                print(e)
                print('Crawl fail.')
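# For context, a minimal sketch of the matching control-node side: it must
# register the same two queue names and serve them on the same address,
# port and authkey. This is a sketch under those assumptions, not the
# original control-node code, which this section does not show:
from multiprocessing.managers import BaseManager
import queue

task_q = queue.Queue()
result_q = queue.Queue()

# Expose the two queues under the names the worker registers
BaseManager.register('get_task_queue', callable=lambda: task_q)
BaseManager.register('get_result_queue', callable=lambda: result_q)

manager = BaseManager(address=('127.0.0.1', 8001), authkey=b'baike')
manager.start()  # lambda callables require a fork-based platform (not Windows)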
class SpiderMan(object):
    def __init__(self):
        self.manager = URLManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        self.manager.add_new_url(root_url)
        while (self.manager.has_new_url() and
               self.manager.old_url_size() < 100):
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print('{} links crawled so far'.format(self.manager.old_url_size()))
            except Exception as e:
                print('crawl failed:', e)
        self.output.output_html()
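# A minimal sketch of the DataOutput interface used above (store_data /
# output_html), assuming each data item is a dict; the original class and
# its output filename are not shown in this section:
import codecs

class DataOutput:
    def __init__(self):
        self.datas = []

    def store_data(self, data):
        if data is not None:
            self.datas.append(data)

    def output_html(self):
        # Dump the collected items into a simple HTML table
        with codecs.open('output.html', 'w', encoding='utf-8') as fout:
            fout.write('<html><body><table>')
            for data in self.datas:
                fout.write('<tr>')
                for value in data.values():
                    fout.write('<td>%s</td>' % value)
                fout.write('</tr>')
            fout.write('</table></body></html>')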