class SpiderWork(object):
    def __init__(self):
        self.dataout = DataOut()
        self.parser = HtmlParser()
        self.downloader = HtmlDownloader()

    def crawl(self, root_url):
        content = self.downloader.download(root_url)
        urls = self.parser.parse_url(root_url, content)
        for url in urls:
            t = time.strftime('%Y%m%d%H%M', time.localtime())
            try:
                movie_id = ''
                mobj = re.match(r'.*?/(\d+)/.*?', url)
                if mobj:
                    movie_id = mobj.group(1)
                # print(movie_id)
                # Build the Ajax URL; implicit string concatenation keeps the URL free of
                # the stray whitespace a multi-line triple-quoted literal would embed
                ajax_url = ('http://service.library.mtime.com/Movie.api'
                            '?Ajax_CallBack=true'
                            '&Ajax_CallBackType=Mtime.Library.Services'
                            '&Ajax_CallBackMethod=GetMovieOverviewRating'
                            '&Ajax_CrossDomain=1'
                            '&Ajax_RequestUrl={0}&t={1}&Ajax_CallBackArgument0={2}').format(url, t, movie_id)
                # Fetch the Ajax response, parse out the data, and store it
                ajax_content = self.downloader.download(ajax_url)
                data = self.parser.parse_ajax(ajax_url, ajax_content)
                self.dataout.store_data(data)
                print('crawling: ', ajax_url)
            except Exception as e:
                print('crawl failed: ', url, e)
        self.dataout.output_end()
        print('crawl finish!')
class SpiderMan(object):
    def __init__(self):
        self.downloader = HtmlDownloader()
        self.parser = HtmlParse()
        self.output = DataOutput()

    def crawl(self, root_url):
        content = self.downloader.downloader(root_url)
        urls = self.parser.parser_url(root_url, content)  # Build the URLs that return the box-office data
        for url in urls:
            print '---------->URL', url, url[0], url[1]
            try:
                t = time.strftime("%Y%m%d%H%M%S3282", time.localtime())
                rank_url = 'http://service.library.mtime.com/Movie.api' \
                           '?Ajax_CallBack=true' \
                           '&Ajax_CallBackType=Mtime.Library.Services' \
                           '&Ajax_CallBackMethod=GetMovieOverviewRating' \
                           '&Ajax_CrossDomain=1' \
                           '&Ajax_RequestUrl=%s' \
                           '&t=%s' \
                           '&Ajax_CallBackArgument0=%s' % (url[0], t, url[1])
                rank_content = self.downloader.downloader(rank_url)
                print 'Ajax response (Chinese characters display correctly) -------->', rank_content
                print 'Ajax response type is unicode -------->', type(rank_content)
                data = self.parser.parser_json(rank_url, rank_content)
                self.output.store_data(data)
            except Exception, e:
                print 'failed to fetch the Ajax data', e
        self.output.output_end()
        print '=======end========='
class SpiderMan(object):
    def __init__(self):
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        content = self.downloader.download(root_url)
        urls = self.parser.parser_url(root_url, content)
        for url in urls:
            try:
                time.sleep(0.1)
                t = time.strftime("%Y%m%d%H%M%S", time.localtime())
                rank_url = 'http://service.library.mtime.com/Movie.api' \
                           '?Ajax_CallBack=true' \
                           '&Ajax_CallBackType=Mtime.Library.Services' \
                           '&Ajax_CallBackMethod=GetMovieOverviewRating' \
                           '&Ajax_CrossDomain=1' \
                           '&Ajax_RequestUrl=%s' \
                           '&t=%s' \
                           '&Ajax_CallBackArgument0=%s' % (url[0], t, url[1])
                rank_content = self.downloader.download(rank_url)
                data = self.parser.parser_json(rank_url, rank_content)
                self.output.store_data(data)
            except Exception, e:
                print 'Crawl failed'
        self.output.output_end()
        print "Crawl finish"
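# A minimal, self-contained sketch (not taken from the projects above) of the technique
# the Mtime spiders rely on: build the hidden Movie.api Ajax URL for one movie page and
# fetch it directly. The movie URL below is only an illustrative placeholder, and the
# response-shape handling (a "var result_... = {...};" JS wrapper) is an assumption
# based on how the parsers above consume the content; requests is a third-party module.
import re
import time
import requests


def fetch_mtime_rating(movie_url):
    movie_id = re.search(r'/(\d+)/', movie_url).group(1)
    t = time.strftime('%Y%m%d%H%M%S', time.localtime())
    ajax_url = ('http://service.library.mtime.com/Movie.api'
                '?Ajax_CallBack=true'
                '&Ajax_CallBackType=Mtime.Library.Services'
                '&Ajax_CallBackMethod=GetMovieOverviewRating'
                '&Ajax_CrossDomain=1'
                '&Ajax_RequestUrl=%s&t=%s&Ajax_CallBackArgument0=%s'
                % (movie_url, t, movie_id))
    resp = requests.get(ajax_url, timeout=10)
    resp.encoding = 'utf-8'
    # Strip the assumed JSONP-style wrapper: keep what sits between '=' and the final ';'
    body = resp.text.strip()
    start, end = body.find('='), body.rfind(';')
    return body[start + 1:end].strip() if start != -1 and end != -1 else body


if __name__ == '__main__':
    # Hypothetical movie page URL, used only to exercise the sketch
    print(fetch_mtime_rating('http://movie.mtime.com/217896/'))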
class EySpider(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()

    def urlsCrawl(self, root_url):
        # Collect links only
        self.manager.add_new_url(root_url)
        # Loop while the URL manager still holds new URLs; a crawl limit could be added,
        # e.g. self.manager.old_url_size() < ***
        while (self.manager.has_new_url()):
            try:
                # Take an uncrawled link from the URL manager
                new_url = self.manager.get_new_url()
                # Download the page
                html = self.downloader.staticPageDownload(new_url)
                # Extract the new URLs
                urls = self.parser.urlsparser(html)
                self.manager.add_new_urls_to_old(new_url)
            except:
                print("failed to crawl link")

    def keywordsCrawl(self):
        while (self.manager.has_new_url()):
            try:
                # Take an uncrawled link from the URL manager
                new_url = self.manager.get_new_url()
                # Download the page
                html = self.downloader.staticPageDownload(new_url)
                # Extract the keywords
                keywords = self.parser.Parser(html)
                self.manager.add_new_urls_to_old(new_url)
            except:
                print("failed to crawl keywords")
def __init__(self): self.manager = UrlManager() self.downloader = HtmlDownloader() self.parser = HtmlParser() self.output = DataOutput() pass
def __init__(self):
    # Initialise this worker node's connection for the distributed crawl
    class QueueManager(BaseManager):
        pass
    # Step 1: register the names of the Queue getters on BaseManager
    QueueManager.register('get_task_queue')
    QueueManager.register('get_result_queue')
    # Step 2: connect to the server
    server_addr = ('192.168.10.128', 8004)
    print('Connect to server {}...'.format(server_addr))
    # The port and authkey must match the server process exactly
    self.m = QueueManager(address=server_addr, authkey='janson'.encode())
    # Connect over the network
    self.m.connect()
    # Step 3: obtain the Queue objects
    self.task = self.m.get_task_queue()
    self.result = self.m.get_result_queue()
    # Initialise the downloader and parser
    self.downloader = HtmlDownloader()
    self.parser = HtmlParser()
    print('init finish')
def __init__(self):
    self.manager = URLManager()
    self.downloader = HtmlDownloader()
    self.parser = HtmlParser()
    self.output = DataOutput()
    self.pageUrl = []
    for num in range(1, 29):
        self.pageUrl.append(
            f'https://cl.887x.xyz/thread0806.php?fid=20&search=&page={num}'
        )
def __init__(self, sort, sort_url, sortFilename):
    threading.Thread.__init__(self)
    self.sort = sort
    self.sort_url = sort_url
    self.sortFilename = sortFilename
    self.manager = UrlManager(self.sort)
    self.downloader = HtmlDownloader()
    self.parser = HtmlParser()
    self.output = DataOutput()
def __init__(self, bind_domain):
    # URL manager that records which URLs have already been crawled
    self.urlManager = UrlManager(enable_external_link=False, bind_domain=bind_domain)
    # Downloader that requests each link
    self.downloader = HtmlDownloader()
    # Parser that turns the HTML source into an lxml.html object and extracts new links
    self.parser = HtmlParser()
def main():
    idi = 1405150114
    urlmanager = UrlManager()
    pageurl = urlmanager.url_login(idi)
    infourl = urlmanager.url_userinfo(idi)
    htmldownloader = HtmlDownloader()
    htmlf, htmli = htmldownloader.download(
        'http://ygcp.njtech.edu.cn/User/LoginInSampleAction.aspx', idi, pageurl, infourl)
    parse = HtmlParser()
    parse.parser(infourl, pageurl, htmli, htmlf)
def __init__(self):
    BaseManager.register("get_task_queue")
    BaseManager.register("get_result_queue")
    server_addr = "127.0.0.1"
    logging.info('Connect to server %s ...' % server_addr)
    self.m = BaseManager(address=(server_addr, 8001), authkey="baike".encode())
    self.m.connect()
    self.task = self.m.get_task_queue()
    self.result = self.m.get_result_queue()
    self.downloader = HtmlDownloader()
    self.parser = HtmlParser()
    logging.info("init finish")
def __init__(self):
    BaseManager.register('get_task_queue')
    BaseManager.register('get_result_queue')
    server_addr = '127.0.0.1'
    print('Connect to server %s' % server_addr)
    self.m = BaseManager(address=(server_addr, 8001), authkey=b'baike')
    self.m.connect()
    self.task = self.m.get_task_queue()
    self.result = self.m.get_result_queue()
    self.downloader = HtmlDownloader()
    self.parser = HtmlParser()
    print('init finish')
class SpiderWork(object):
    def __init__(self):
        # Initialise this worker node's connection for the distributed crawl
        # Step 1: register the names of the Queue getters on BaseManager
        BaseManager.register('get_task_queue')
        BaseManager.register('get_result_queue')
        # Step 2: connect to the server
        server_addr = '127.0.0.1'
        print(('Connect to server %s...' % server_addr))
        # The port and authkey must match the server process exactly
        self.m = BaseManager(address=(server_addr, 8002), authkey='lagou'.encode('utf-8'))
        # Connect over the network
        self.m.connect()
        # Step 3: obtain the Queue objects
        self.task = self.m.get_task_queue()
        self.result = self.m.get_result_queue()
        # Initialise the downloader and parser
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        print('init finish')

    def crawl(self):
        while True:
            try:
                if not self.task.empty():
                    url = self.task.get()
                    if url == 'end':
                        print('control node told this worker to stop...')
                        # Pass the stop signal on to the other nodes
                        self.result.put('end')
                        return
                    # print('got task number %d' % (316 - self.task.qsize()))
                    print('this worker node is parsing: %s' % url)
                    # Download the first page to learn the total page count
                    html = self.downloader.download_job(url, 1)
                    tal_page = self.parser.get_page(html)
                    print("%d pages of job listings in total" % tal_page)
                    for page in range(1, tal_page + 1):
                        print("crawling page %d of %d" % (page, tal_page))
                        html = self.downloader.download_job(url, page)
                        data = self.parser.get_job(html)
                        self.result.put(data)
            except EOFError as e:
                print("failed to connect to the control node")
                return
            except Exception as e:
                print(e)
                print('crawl fail')
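# A minimal sketch of the control-node side that worker classes like the SpiderWork
# snippets above connect to. It is an assumption modelled on the standard
# multiprocessing.managers pattern, not code from the original projects: it exposes a
# task queue and a result queue on 127.0.0.1:8001 with authkey b'baike'; adjust the
# port, authkey and seed URLs to match whichever worker snippet you pair it with.
import queue
from multiprocessing.managers import BaseManager

task_queue = queue.Queue()
result_queue = queue.Queue()


class QueueManager(BaseManager):
    pass


# Register callables that hand out the two queues to connecting workers
QueueManager.register('get_task_queue', callable=lambda: task_queue)
QueueManager.register('get_result_queue', callable=lambda: result_queue)

if __name__ == '__main__':
    # Pre-load a seed URL plus the 'end' sentinel the workers above check for,
    # then serve the queues from this process.
    for seed in ['https://baike.baidu.com/item/Python', 'end']:
        task_queue.put(seed)
    manager = QueueManager(address=('127.0.0.1', 8001), authkey=b'baike')
    server = manager.get_server()
    print('control node listening on 127.0.0.1:8001 ...')
    server.serve_forever()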
class HtmlParser(object):
    def __init__(self):
        self.downloader = HtmlDownloader()
        self.page_num = self._get_page_num()

    def _get_page_num(self):
        data = self.downloader.get_page(1)
        if data and 'value' in data.keys():
            count = re.search(r',"wareCount":(\d*?),"', data['value']).groups()[0]
            if count:
                count = int(count)
                # Ten items per page: round the page count up
                if count % 10 == 0:
                    return count // 10
                else:
                    return count // 10 + 1

    def get_page_urls(self, page):
        urls = []
        url = ''
        data = self.downloader.get_page(page)
        pattern = re.compile(r',\"eBookFlag\":(.*?),\".*?,"wareId":"(\d*?)"')
        result = re.findall(pattern, data['value'])
        if result:
            for item in result:
                if item[0] == 'true':
                    url = 'https://e.m.jd.com/ebook/' + item[1] + '.html'
                if item[0] == 'false':
                    url = 'https://item.m.jd.com/product/' + item[1] + '.html'
                urls.append(url)
        return urls

    def get_data(self, url):
        html = self.downloader.get_page_data(url)
        soup = BeautifulSoup(html, 'html.parser')
        title = ''
        price = '0'
        if url[0:12] == 'https://e.m.':
            title = soup.find('p', class_='db-title').text
            price = soup.find('span', class_='db-price-num').text
        if url[0:12] == 'https://item':
            title = soup.find('span', class_='title-text').text
            price = float(soup.find('span', class_='big-price').text) + float(
                soup.find('span', class_='small-price').text)
        if price is None:
            price = '0'
        data = {'title': title, 'price': price, 'url': url}
        return data
def __init__(self):
    BaseManager.register('get_task_queue')
    BaseManager.register('get_result_queue')
    server_addr = '127.0.0.1'
    print('connect to %s...' % server_addr)
    self.m = BaseManager(address=(server_addr, 8001), authkey=b'qiye')
    self.m.connect()
    self.task = self.m.get_task_queue()
    self.result = self.m.get_result_queue()
    self.downloader = HtmlDownloader()
    self.htmlparser = HtmlParser()
    self.dataoutput = DataOutput()
def __init__(self):
    # The spider scheduler first connects to the control node, then takes URLs from the
    # task queue, downloads and parses the pages, and puts the extracted data on the
    # result queue to hand it back to the control node.
    BaseManager.register('get_task_queue')
    BaseManager.register('get_result_queue')
    # Connect to the server
    server_addr = '127.0.0.1'
    print('connect to server %s....' % server_addr)
    self.m = BaseManager(address=(server_addr, 8001), authkey=b'baike')
    self.m.connect()
    # Obtain the Queue objects
    self.task = self.m.get_task_queue()
    self.result = self.m.get_result_queue()
    self.downloader = HtmlDownloader()
    self.parser = HtmlParser()
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawlOneTag(self, book_tag):
        page_num = 0
        book_list = []
        while page_num <= 2:
            try:
                new_url = self.manager.get_new_url(page_num, book_tag)
                html = self.downloader.download(new_url)
                book_list += self.parser.parser(html)
            except Exception as e:
                print("crawl failed")
            page_num += 1
        return book_list

    def crawlAllTags(self, book_tag_lists, topath):
        book_lists = []
        for book_tag in book_tag_lists:
            book_list = self.crawlOneTag(book_tag)
            book_list = sorted(book_list, key=lambda x: x[1], reverse=True)
            book_lists.append(book_list)
        self.output.output(book_lists, book_tag_lists, topath)
class Spiderman(object):
    def __init__(self):
        self.manager = Urlmanager()
        self.downloader = HtmlDownloader()
        self.parser = Htmlparser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Keep going while the URL manager has new URLs and the crawl limit is not reached
        while (self.manager.has_new_url() and self.manager.old_url_size() < 100):
            try:
                # Take a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # Download the page with the HTML downloader
                html = self.downloader.download(new_url)
                # Extract data with the HTML parser
                new_urls, data = self.parser.parse(new_url, html)
                # Add the extracted URLs to the URL manager
                self.manager.add_new_urls(new_urls)
                # Store the data
                self.output.store_data(data)
                print('%s links crawled so far' % self.manager.old_url_size())
            except Exception as e:
                print('crawl failed')
                raise
        # Write the collected data out in the target format
        self.output.output_html()
class SpiderMain(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Keep going while the URL manager has new URLs and the crawl limit is not reached
        while (self.manager.has_new_url() and self.manager.old_url_size() < 100):
            try:
                # Take a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # Download the page with the HTML downloader
                html = self.downloader.download(new_url)
                # Extract data with the HTML parser
                new_urls, data = self.parser.parser(new_url, html)
                # Add the extracted URLs to the URL manager
                self.manager.add_new_urls(new_urls)
                # Store the data
                self.output.store_data(data)
                print("%s links crawled so far" % self.manager.old_url_size())
                # print(new_url)
            except Exception as e:
                print("crawl failed")
        # Write the collected data out in the target format
        self.output.output_html()
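# A minimal sketch of the UrlManager interface that the schedulers above rely on
# (add_new_url / add_new_urls / has_new_url / get_new_url / old_url_size; some snippets
# spell the last one old_urls_size). This plain two-set implementation is written for
# illustration only; the original projects may back it with Redis or a database.
class UrlManager(object):
    def __init__(self):
        self.new_urls = set()   # URLs waiting to be crawled
        self.old_urls = set()   # URLs already crawled

    def has_new_url(self):
        return len(self.new_urls) != 0

    def get_new_url(self):
        # Move one URL from the pending set to the crawled set and return it
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url

    def add_new_url(self, url):
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        if not urls:
            return
        for url in urls:
            self.add_new_url(url)

    def new_url_size(self):
        return len(self.new_urls)

    def old_url_size(self):
        return len(self.old_urls)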
class SpiderWork():
    def __init__(self):
        BaseManager.register('get_task_queue')
        BaseManager.register('get_result_queue')
        server_addr = '127.0.0.1'
        print('Connect to server %s' % server_addr)
        self.m = BaseManager(address=(server_addr, 8001), authkey=b'baike')
        self.m.connect()
        self.task = self.m.get_task_queue()
        self.result = self.m.get_result_queue()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        print('init finish')

    def crawl(self):
        while True:
            try:
                if not self.task.empty():
                    url = self.task.get()
                    if url == 'end':
                        print('control node told this worker to stop')
                        self.result.put({'new_urls': 'end', 'data': 'end'})
                        return
                    print('worker is parsing %s' % url)
                    content = self.downloader.download(url)
                    new_urls, data = self.parser.parser(url, content)
                    self.result.put({'new_urls': new_urls, 'data': data})
            except EOFError:
                print('failed to connect to the control node')
                return
            except Exception as e:
                print(e)
                print('Crawl fail')
class Spider_Scheduler(object):
    def __init__(self):
        self.urlmanager = UrlQueue()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Seed the URL queue with the entry URL
        self.urlmanager.add_new_url(root_url)
        # Keep going while the URL manager has new URLs and the crawl limit is not reached
        while (self.urlmanager.has_new_url() and self.urlmanager.old_url_size() < 100):
            try:
                # Take a new URL from the URL manager
                new_url = self.urlmanager.get_new_url()
                # Download the page with the HTML downloader
                html = self.downloader.download(new_url)
                # Extract data with the HTML parser
                new_urls, data = self.parser.parser(new_url, html)
                # Add the extracted URLs to the URL manager
                self.urlmanager.add_new_urls(new_urls)
                # Serialise the data in the output component
                self.output.data_to_list(data)
                print("%s links crawled so far" % self.urlmanager.old_url_size())
            except Exception as e:
                print("crawl failed")
        # Write the collected data out in the target format
        self.output.output_html()
class SpiderSchedule(object):
    '''
    Spider scheduler: initialises each module, receives the entry URL through crawl,
    and drives the modules through the crawl workflow.
    '''
    def __init__(self):
        self.manager = URLManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Keep going while there are new URLs and the crawl limit is not reached
        while self.manager.has_new_url() and self.manager.old_urls_size() < 10:
            try:
                # 1. Take a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # 2. Hand the URL to HtmlDownloader
                html = self.downloader.download(new_url)
                # 3. Hand the downloaded page to HtmlParser
                urls, data = self.parser.parser(new_url, html)
                # 4. Store the parsed data and push the newly extracted URLs back to URLManager
                self.output.store_data(data)
                for url in urls:
                    self.manager.add_new_url(url)
                print('{0} links crawled so far:'.format(self.manager.old_urls_size()), new_url)
            except Exception as e:
                print(e.args)
                print('crawl failed:', url)
        self.output.output_html()
class SpiderMan(object):
    def __init__(self):
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = HtmlOutput()

    def crawl(self, root_url):
        album_response = self.downloader.download(root_url)
        self.output.output_head()
        for album in self.parser.get_kw_album(album_response):
            self.output.output_album(album)
            track_url = ('http://mobile.ximalaya.com/mobile/v1/album/ts-1552364593682'
                         '?ac=WIFI&albumId=%d&device=android&isAsc=true'
                         '&isQueryInvitationBrand=true&pageId=1&pageSize=20'
                         '&pre_page=0&source=0&supportWebp=true' % album['albumId'])
            track_response = self.downloader.download(track_url)
            track_info = self.parser.get_kw_track(track_response)
            self.output.output_track(track_info)
        self.output.output_end()
class HtmlParser(object):
    def __init__(self):
        self.downloader = HtmlDownloader()

    def get_all_patents_info(self, url_cont):
        html = self.downloader.download(url_cont)
        text = pq(html).text()
        soup = BeautifulSoup(html, 'html5lib')
        Patent_name = soup.find('font', size="+1").text.strip().lower()
        Patent_num = int(
            soup.select(
                'body > table:nth-of-type(1) > tbody > tr:nth-of-type(1) > td:nth-of-type(2) > b'
            )[0].text.strip())
        Description = soup.find_all('i', text="Description")[0].text
        Patent_description = re.findall(
            r"(?<=%s)[\w\W]*?(?=\* \* \* \* \*)" % Description, text)[0]
        # Patent_abstract = soup.select('body > p:nth-of-type(2)')[0].text.strip()
        # Patent_claim = re.findall(r"(?<=Claims)[\w\W]*?(?=Description)", text)[0].replace("\n", "").replace(' ', '')
        pat_dict = {
            'Patent_name': Patent_name,
            'Patent_num': Patent_num,
            # 'Patent_text': text,
            # 'Patent_claim': Patent_claim,
            'Patent_description': Patent_description,
            'Patent_url': url_cont,
        }
        print(pat_dict)
        return pat_dict
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Keep going while the URL manager has new URLs and the crawl limit is not reached
        while (self.manager.has_new_url() and self.manager.old_url_size() < 100):
            try:
                # Take a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # Download the page and extract its data with the HTML parser
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                # Add the extracted URLs to the URL manager
                self.manager.add_new_urls(new_urls)
                # Store the data
                self.output.store_data(data)
                print("%s links crawled so far" % self.manager.old_url_size())
            except Exception as e:
                print("crawl failed")
        # Write the collected data out in the target format
        self.output.output_html()
class SpiderMan(object):
    """Spider scheduler"""

    def __init__(self):
        self.urlManager = UrlManager()
        self.htmlDownloader = HtmlDownloader()
        self.htmlParser = HtmlParser()
        self.htmlOutput = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.urlManager.add_new_url(root_url)
        # Keep going while the URL manager has new URLs and the crawl limit is not reached
        while (self.urlManager.has_new_url() and self.urlManager.old_url_size() < 100):
            try:
                # Take a new URL from the URL manager
                new_url = self.urlManager.get_new_url()
                # Download the page with the HTML downloader
                html = self.htmlDownloader.download(new_url)
                # Extract data with the HTML parser
                new_urls, data = self.htmlParser.parser(new_url, html)
                # Add the extracted URLs to the URL manager
                self.urlManager.add_new_urls(new_urls)
                # Store the data
                self.htmlOutput.store_data(data)
            except Exception as e:
                print(traceback.format_exc())
        # Write the collected data out in the target format
        self.htmlOutput.output_html()
class SpiderWork(object):
    def __init__(self):
        BaseManager.register('get_task_queue')
        BaseManager.register('get_result_queue')
        server_addr = '127.0.0.1'
        print('Connect to server %s...' % server_addr)
        self.m = BaseManager(address=(server_addr, 8001), authkey=('baike'.encode('utf-8')))
        self.m.connect()
        self.task = self.m.get_task_queue()
        self.result = self.m.get_result_queue()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        print('init finish')

    def crawl(self):
        while (True):
            try:
                if not self.task.empty():
                    url = self.task.get()
                    print(url)
                    if url == 'end':
                        print('control node told this worker to stop...')
                        self.result.put({'new_urls': 'end', 'data': 'end'})
                        return
                    print('worker node is parsing: %s' % url)
                    content = self.downloader.download(url)
                    new_urls, data = self.parser.parser(url, content)
                    self.result.put({"new_urls": new_urls, "data": data})
            except EOFError as e:
                print("failed to connect to the control node")
                return
            except Exception as e:
                print(e)
                print('Crawl fail')
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def start(self, url, numMax=50):
        self.manager.addUrl(url)
        num = 0
        errorsNum = 0
        while self.manager.sizeofNew() != 0 and num < numMax:
            try:
                num = num + 1
                url = self.manager.getUrl()
                print('%d\n %s' % (num, url))
                html = self.downloader.download(url)
                newUrls, data = self.parser.parser(url, html)
                self.output.addData(data)
                if self.manager.sizeofNew() + self.manager.sizeofOld() < numMax:
                    self.manager.addUrls(newUrls)
                print(data['title'])
            except:
                num = num - 1
                errorsNum = errorsNum + 1
                print('crawl failed %d' % errorsNum)
        self.output.outputData()
def __init__(self):
    self.manager = UrlManager()
    self.downloader = HtmlDownloader()
    self.parser = HtmlParser()
    self.dataoutput = DataOutput()
    self.mongoengine = Use_MongoEngine()
    self.urloutput = Url_info_Output()
class SpiderMan:
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Keep going while the URL manager has new URLs and the crawl limit is not reached
        while self.manager.has_new_url() and self.manager.old_url_size() < 100:
            try:
                # Take a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # Download the page with the HTML downloader
                html = self.downloader.download(new_url)
                # print(html)
                # Extract data with the HTML parser
                new_urls, data = self.parser.parser(new_url, html)
                # Add the extracted URLs to the URL manager
                self.manager.add_new_urls(new_urls)
                # Store the data
                self.output.store_data(data)
                print("%s links crawled so far" % self.manager.old_url_size())
            except Exception as e:
                print("crawl failed")
        self.output.output_html()
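# A minimal sketch of the HtmlDownloader.download(url) call used throughout these
# schedulers, based on requests. The User-Agent string, timeout and encoding fallback
# are illustrative assumptions; the original downloaders may add cookies, retries or
# proxies that are not shown here.
import requests


class HtmlDownloader(object):
    def download(self, url):
        if url is None:
            return None
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
        try:
            response = requests.get(url, headers=headers, timeout=10)
        except requests.RequestException:
            return None
        if response.status_code != 200:
            return None
        # Let requests guess the page encoding before handing back the text
        response.encoding = response.apparent_encoding or 'utf-8'
        return response.text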
def __init__(self):
    BaseManager.register('get_task_queue')
    BaseManager.register('get_result_queue')
    server_addr = '127.0.0.1'
    print ('Connect to server %s...' % server_addr)
    self.m = BaseManager(address=(server_addr, 8001), authkey='qiye'.encode('utf-8'))
    print 'connecting...'
    self.m.connect()
    print 'connected'
    self.task = self.m.get_task_queue()
    self.result = self.m.get_result_queue()
    self.downloader = HtmlDownloader()
    self.parser = HtmlParser()
    print 'spider init finish'
class SpiderWork(object):
    def __init__(self):
        BaseManager.register('get_task_queue')
        BaseManager.register('get_result_queue')
        server_addr = '127.0.0.1'
        print ('Connect to server %s...' % server_addr)
        self.m = BaseManager(address=(server_addr, 8001), authkey='qiye'.encode('utf-8'))
        print 'connecting...'
        self.m.connect()
        print 'connected'
        self.task = self.m.get_task_queue()
        self.result = self.m.get_result_queue()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        print 'spider init finish'

    def crawl(self):
        while True:
            try:
                # print self.task
                if not self.task.empty():
                    url = self.task.get()
                    if url == 'end':
                        print ('stop...')
                        # Tell the other nodes to stop as well
                        self.result.put({'new_urls': 'end', 'data': 'end'})
                        return
                    print ('spider is working on %s' % url)
                    content = self.downloader.download(url)
                    new_urls, data = self.parser.parser(url, content)
                    self.result.put({'new_urls': new_urls, 'data': data})
            except EOFError as e:
                print 'cannot connect other'
                return
            except Exception as e:
                print e
                print 'crawl fail'
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        self.manager.add_new_url(root_url)
        while (self.manager.has_new_url() and self.manager.old_url_size() < 100):
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print "already get %s url" % self.manager.old_url_size()
            except Exception, e:
                print "crawl failed"
        self.output.output_html()
class DataHandler():
    searchData = None
    downloader = None
    failureList = []

    def __init__(self, user, passwd):
        self.searchData = SearchData()
        self.downloader = HtmlDownloader(user, passwd)

    def isOffLine(self):
        return self.downloader.isOffLine()

    def download(self, url):
        return self.downloader.downLoad(url)

    def printLog(self):
        print "failure records: "
        for failure in self.failureList:
            print "  " + failure
        self.failureList = []

    def addSearchItem(self, alias, name, category, quality):
        newItem = SearchItem()
        newItem.alias = alias
        newItem.name = name
        newItem.category = category
        newItem.quality = quality
        for item in self.searchData.searchItems:
            if cmp(newItem.alias, item.alias) == 0 or \
                    (cmp(newItem.name, item.name) == 0 and
                     cmp(newItem.category, item.category) and
                     cmp(newItem.quality, item.quality)):
                print "exist"
                return item
        self.searchData.searchItems.append(newItem)
        return newItem

    def getSearchItemURL(self, searchItem, page):
        url = "http://www.zhaoonline.com/search/"
        url += urllib.pathname2url(searchItem.name)
        url += "-8-3-trade-"
        url += urllib.pathname2url(categoryDic[searchItem.category])
        url += "-"
        url += urllib.pathname2url(qualityDic[searchItem.quality])
        url += "-00-N-0-N-1-"
        url += str(page)
        url += ".htm"
        return url

    def getHistoryItemURL(self, ref):
        url = "http://www.zhaoonline.com"
        url += ref
        return url

    def updateSearchItem(self, searchItem):
        page = 1
        historyItemListParser = HistoryItemListParser()
        while historyItemListParser.hasNextPage(page):
            url = self.getSearchItemURL(searchItem, page)
            print "parsing search list: " + url
            html = self.downloader.getHtml(url)
            if html == None:
                break
            #html = self.downloader.download(url)
            if historyItemListParser.parse(html) == False:
                self.failureList.append(url)
            self.saveToListFile(searchItem.name + "_" + str(page), html)
            page += 1
        historyItemList = historyItemListParser.getHistoryItemList()
        historyItemListParser.clean()
        searchItem.historyItems = historyItemList
        # now every HistoryItem has id only
        for i in range(0, len(searchItem.historyItems)):
            historyItem = searchItem.historyItems[i]
            url = self.getHistoryItemURL(historyItem.ref)
            print "(" + str(i) + "/" + str(len(searchItem.historyItems)) + ") downloading page: " + url
            html = self.downloader.getHtml(url)
            if html == None:
                continue
            #html = self.downloader.download(url)
            historyItemParser = HistoryItemParser()
            historyItemParser.parse(html)
            tmpItem = historyItemParser.getHistoryItem()
            #historyItem.ref = tmpItem.ref
            historyItem.id = tmpItem.id
            historyItem.name = tmpItem.name
            historyItem.comments = tmpItem.comments
            historyItem.quality = tmpItem.quality
            historyItem.date = tmpItem.date
            historyItem.price = tmpItem.price
            historyItem.auctionText = tmpItem.auctionText
            historyItem.auctionData = tmpItem.auctionData
            # save the html content to tmp directory
            self.saveToTmpFile(historyItem, html)
        return

    def loadAllSearchItemsFromXml(self):
        searchResultXmlLoader = SearchResultXmlLoader()
        self.searchData = searchResultXmlLoader.loadAllXmlFiles()
        # debug
        #for searchItem in self.searchData.searchItems:
        #    self.dumpSearchItem(searchItem)
        return

    def saveAllSearchItemsToXml(self):
        for searchItem in self.searchData.searchItems:
            self.saveSearchItemToXml(searchItem)
        return

    def saveSearchItemToXml(self, searchItem):
        searchResultXmlGenerator = SearchResultXmlGenerator(searchItem)
        searchResultXmlGenerator.generateXml()
        return

    def getSearchItemByAlias(self, alias):
        if alias == None:
            return None
        for searchItem in self.searchData.searchItems:
            if cmp(searchItem.alias, alias) == 0:
                return searchItem
        return None

    def getAllSearchItems(self):
        return self.searchData.searchItems

    def saveToTmpFile(self, historyItem, html):
        fileName = 'tmp/' + historyItem.id + ".shtml"
        f = open(fileName, 'w')
        f.write(html)
        f.close()

    def saveToListFile(self, name, html):
        fileName = 'list/' + name + ".html"
        f = open(fileName, 'w')
        f.write(html)
        f.close()

    # debug function
    def dumpSearchItem(self, searchItem):
        print "Dumping SearchItem: " + searchItem.name
        print "  alias: " + searchItem.alias
        print "  category: " + searchItem.category
        print "  quality: " + searchItem.quality
        for historyItem in searchItem.historyItems:
            print "    name = " + historyItem.name
            print "    comments = " + historyItem.comments
def __init__(self, user, passwd):
    self.searchData = SearchData()
    self.downloader = HtmlDownloader(user, passwd)
def __init__(self):
    self.manager = UrlManager()
    self.downloader = HtmlDownloader()
    self.parser = HtmlParser()
    self.output = DataOutput()
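# A minimal sketch of the DataOutput component referenced above: store_data() buffers
# parsed records (assumed to be dicts) and output_html() dumps them into a simple HTML
# table. The default filename 'output.html' is an illustrative assumption; the real
# projects write to their own targets such as CSV, Excel or a database.
import codecs


class DataOutput(object):
    def __init__(self):
        self.datas = []

    def store_data(self, data):
        if data is None:
            return
        self.datas.append(data)

    def output_html(self, path='output.html'):
        with codecs.open(path, 'w', encoding='utf-8') as fout:
            fout.write('<html><body><table>')
            for data in self.datas:
                fout.write('<tr>')
                for value in data.values():
                    fout.write('<td>%s</td>' % value)
                fout.write('</tr>')
            fout.write('</table></body></html>')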