def __init__(self):
    self.manager = UrlManager()
    self.downloader = HtmlDownloader()
    self.parser = HtmlParser()
    self.output = DataOutput()
def __init__(self):
    # Set up the worker node's connection in the distributed process
    class QueueManager(BaseManager):
        pass

    # Step 1: register the names of the methods that expose the queues
    QueueManager.register('get_task_queue')
    QueueManager.register('get_result_queue')
    # Step 2: connect to the server
    server_addr = ('192.168.10.128', 8004)
    print('Connect to server {}...'.format(server_addr))
    # The port and authkey must match the server process exactly
    self.m = QueueManager(address=server_addr, authkey='janson'.encode())
    # Connect over the network
    self.m.connect()
    # Step 3: obtain the Queue objects
    self.task = self.m.get_task_queue()
    self.result = self.m.get_result_queue()
    # Initialize the downloader and parser
    self.downloader = HtmlDownloader()
    self.parser = HtmlParser()
    print('init finish')
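The worker above assumes a control node is already serving the task and result queues at 192.168.10.128:8004 with the authkey 'janson'. That server side is not included in these snippets; the following is a minimal sketch of what it could look like, with a placeholder seed URL.

# Minimal control-node sketch (assumption: not part of the original snippets).
# It serves the two queues the worker's QueueManager connects to.
from multiprocessing.managers import BaseManager
from queue import Queue

task_queue = Queue()
result_queue = Queue()


class ServerQueueManager(BaseManager):
    pass


# Register the same method names the worker looks up, bound to the local queues.
ServerQueueManager.register('get_task_queue', callable=lambda: task_queue)
ServerQueueManager.register('get_result_queue', callable=lambda: result_queue)

if __name__ == '__main__':
    # Seed the task queue before serving; the URL is a placeholder.
    task_queue.put('https://example.com/start')
    # Address and authkey must match the worker exactly; '' binds all interfaces.
    manager = ServerQueueManager(address=('', 8004), authkey='janson'.encode())
    server = manager.get_server()
    server.serve_forever()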
class SpiderMain(object):
    def __init__(self):
        self.urls = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.outputer = HtmlOutputer()

    def craw(self, my_root_url):
        count = 1
        self.urls.add_new_url(my_root_url)
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                print("craw %d : %s" % (count, new_url))
                # Download the page
                html_cont = self.downloader.download(new_url)
                # Parse the page
                self.parser.parse_test(new_url, html_cont)
                """
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                self.urls.add_new_urls(new_urls)
                # The outputer collects the data
                self.outputer.collect_data(new_data)
                if count == 10:
                    break
                count += 1
                """
            except:
                print("craw failed")
        self.outputer.output_html()
class SpiderMan(object):
    def __init__(self):
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        content = self.downloader.download(root_url)
        urls = self.parser.parser_url(root_url, content)
        for url in urls:
            try:
                time.sleep(0.1)
                t = time.strftime("%Y%m%d%H%M%S", time.localtime())
                rank_url = 'http://service.library.mtime.com/Movie.api' \
                           '?Ajax_CallBack=true' \
                           '&Ajax_CallBackType=Mtime.Library.Services' \
                           '&Ajax_CallBackMethod=GetMovieOverviewRating' \
                           '&Ajax_CrossDomain=1' \
                           '&Ajax_RequestUrl=%s' \
                           '&t=%s' \
                           '&Ajax_CallBackArgument0=%s' % (url[0], t, url[1])
                rank_content = self.downloader.download(rank_url)
                data = self.parser.parser_json(rank_url, rank_content)
                self.output.store_data(data)
            except Exception as e:
                print('Crawl failed')
        self.output.output_end()
        print("Crawl finish")
class EySpider(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()

    def urlsCrawl(self, root_url):
        # Collect links starting from the root URL
        self.manager.add_new_url(root_url)
        # Loop while the URL manager has new URLs; a crawl limit could also be
        # enforced here, e.g. self.manager.old_url_size() < ***
        while self.manager.has_new_url():
            try:
                # Take an uncrawled URL from the URL manager
                new_url = self.manager.get_new_url()
                # Download the page
                html = self.downloader.staticPageDownload(new_url)
                # Extract the new URLs
                urls = self.parser.urlsparser(html)
                self.manager.add_new_urls_to_old(new_url)
            except:
                print("Failed to crawl link")

    def keywordsCrawl(self):
        while self.manager.has_new_url():
            try:
                # Take an uncrawled URL from the URL manager
                new_url = self.manager.get_new_url()
                # Download the page
                html = self.downloader.staticPageDownload(new_url)
                # Extract the keywords
                keywords = self.parser.Parser(html)
                self.manager.add_new_urls_to_old(new_url)
            except:
                print("Failed to crawl keywords")
def __init__(self, conf): self.url1 = "https://www.mk.co.kr/news/economy" self.conf = conf self.html = HtmlParser(conf) self.news = CrawlerNewspaper(conf) self.pulsar = PulsarStore(conf['pulsar']['ip'], conf['pulsar']['port']) self.util = Utils()
def get_params(self, url, content):
    urlparams = urlparse(url)
    html = HtmlParser(content, urlparams.scheme + '://' + urlparams.netloc)
    errors = []
    results = {}
    for i in range(len(self.search_params)):
        param = self.search_params[i]
        if 'eval' in param:
            eval_param = param['eval'].split('.')
            if len(eval_param) == 2:
                param[eval_param[0]][eval_param[1]] = eval(param['eval_string'])
        res = html.get(param['possible'], param['params'])
        results[param['name']] = res
        if res is None:
            errors.append(param['name'])
            if 'critical' in param and param['critical']:
                return {
                    'parsed': False,
                    'errors': errors,
                }
    results['media'] = {}
    results['media']['image'] = html.get_image()
    return {
        'parsed': True,
        'results': results,
        'errors': errors,
    }
def __init__(self, conf): self.url1 = "http://www.sisanews.kr/news/articleList.html?sc_section_code=S1N17&view_type=sm" self.url2 = "http://www.sisanews.kr/news/articleList.html?sc_section_code=S1N16&view_type=sm" self.burl = "http://www.sisanews.kr" self.html = HtmlParser(conf) self.news = CrawlerNewspaper(conf) self.pulsar = PulsarStore(conf['pulsar']['ip'], conf['pulsar']['port'])
def __init__(self, conf): self.url1 = "https://www.hankyung.com/finance/0104" self.url2 = "https://www.hankyung.com/finance/0103" self.url3 = "https://www.hankyung.com/finance/0102" self.conf = conf self.html = HtmlParser(conf) self.news = CrawlerNewspaper(conf) self.pulsar = PulsarStore(conf['pulsar']['ip'], conf['pulsar']['port'])
def __init__(self):
    self.G_STATE_OK = 200
    self.crawMaxNum = -1
    self.crawCountNum = 0
    self.urlManager = UrlManager()
    self.dispatch = Dispatch()
    self.htmlParser = HtmlParser("http://baike.baidu.com")
    self.applicationShow = ApplicationShow()
def __init__(self):
    self.manager = URLManager()
    self.downloader = HtmlDownloader()
    self.parser = HtmlParser()
    self.output = DataOutput()
    self.pageUrl = []
    for num in range(1, 29):
        self.pageUrl.append(
            f'https://cl.887x.xyz/thread0806.php?fid=20&search=&page={num}'
        )
def __init__(self, sort, sort_url, sortFilename):
    threading.Thread.__init__(self)
    self.sort = sort
    self.sort_url = sort_url
    self.sortFilename = sortFilename
    self.manager = UrlManager(self.sort)
    self.downloader = HtmlDownloader()
    self.parser = HtmlParser()
    self.output = DataOutput()
def __init__(self, bind_domain):
    # URL manager object used to record URLs that have already been crawled
    self.urlManager = UrlManager(enable_external_link=False, bind_domain=bind_domain)
    # Downloader object used to request the pages
    self.downloader = HtmlDownloader()
    # Parser that turns HTML source into an lxml.html object and extracts new links
    self.parser = HtmlParser()
def __init__(self):
    BaseManager.register('get_task_queue')
    BaseManager.register('get_result_queue')
    server_addr = '127.0.0.1'
    print('Connect to server %s' % server_addr)
    self.m = BaseManager(address=(server_addr, 8001), authkey=b'baike')
    self.m.connect()
    self.task = self.m.get_task_queue()
    self.result = self.m.get_result_queue()
    self.downloader = HtmlDownloader()
    self.parser = HtmlParser()
    print('init finish')
class SpiderWork(object):
    def __init__(self):
        # Set up the worker node's connection in the distributed process
        # Step 1: use BaseManager to register the names of the queue accessor methods
        BaseManager.register('get_task_queue')
        BaseManager.register('get_result_queue')
        # Step 2: connect to the server
        server_addr = '127.0.0.1'
        print('Connect to server %s...' % server_addr)
        # The port and authkey must match the server process exactly
        self.m = BaseManager(address=(server_addr, 8002), authkey='lagou'.encode('utf-8'))
        # Connect over the network
        self.m.connect()
        # Step 3: obtain the Queue objects
        self.task = self.m.get_task_queue()
        self.result = self.m.get_result_queue()
        # Initialize the downloader and parser
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        print('init finish')

    def crawl(self):
        while True:
            try:
                if not self.task.empty():
                    url = self.task.get()
                    if url == 'end':
                        print('The control node told the worker node to stop...')
                        # Notify the other nodes to stop as well
                        self.result.put('end')
                        return
                    # print('Got task number %d' % (316 - self.task.qsize()))
                    print('This worker node is parsing: %s' % url)
                    # Download the first page to find the total number of pages
                    html = self.downloader.download_job(url, 1)
                    tal_page = self.parser.get_page(html)
                    print('%d pages of job postings in total' % tal_page)
                    for page in range(1, tal_page + 1):
                        print('Crawling page %d of %d' % (page, tal_page))
                        html = self.downloader.download_job(url, page)
                        data = self.parser.get_job(html)
                        self.result.put(data)
            except EOFError as e:
                print('Failed to connect to the worker node')
                return
            except Exception as e:
                print(e)
                print('crawl fail')
def __init__(self):
    BaseManager.register('get_task_queue')
    BaseManager.register('get_result_queue')
    server_adrr = '127.0.0.1'
    print('connect to %s...' % server_adrr)
    self.m = BaseManager(address=(server_adrr, 8001), authkey=b'qiye')
    self.m.connect()
    self.task = self.m.get_task_queue()
    self.result = self.m.get_result_queue()
    self.downloader = HtmlDownloader()
    self.htmlparser = HtmlParser()
    self.dataoutput = DataOutput()
def __init__(self):
    # The spider scheduler first connects to the control node, takes URLs from the
    # url_q queue, downloads and parses the pages, and then submits the extracted
    # data to the result_q queue, which returns it to the control node.
    BaseManager.register('get_task_queue')
    BaseManager.register('get_result_queue')
    # Connect to the server
    server_addr = '127.0.0.1'
    print('connect to server %s....' % server_addr)
    self.m = BaseManager(address=(server_addr, 8001), authkey=b'baike')
    self.m.connect()
    # Get the Queue objects
    self.task = self.m.get_task_queue()
    self.result = self.m.get_result_queue()
    self.downloader = HtmlDownloader()
    self.parser = HtmlParser()
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def start(self, url, numMax=50):
        self.manager.addUrl(url)
        num = 0
        errorsNum = 0
        while self.manager.sizeofNew() != 0 and num < numMax:
            try:
                num = num + 1
                url = self.manager.getUrl()
                print('%d\n %s' % (num, url))
                html = self.downloader.download(url)
                newUrls, data = self.parser.parser(url, html)
                self.output.addData(data)
                if self.manager.sizeofNew() + self.manager.sizeofOld() < numMax:
                    self.manager.addUrls(newUrls)
                print(data['title'])
            except:
                num = num - 1
                errorsNum = errorsNum + 1
                print('crawl failed %d' % errorsNum)
        self.output.outputData()
class SpiderSchedule(object):
    '''
    Spider scheduler: initializes each module, receives the entry URL through
    crawl(), and internally drives each module through the crawl workflow.
    '''
    def __init__(self):
        self.manager = URLManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Loop while there are new URLs and the crawl limit has not been reached
        while self.manager.has_new_url() and self.manager.old_urls_size() < 10:
            try:
                # 1. Get a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # 2. Hand the URL to HtmlDownloader to download
                html = self.downloader.download(new_url)
                # 3. Hand the downloaded page to HtmlParser to parse
                urls, data = self.parser.parser(new_url, html)
                # 4. Store the parsed data and feed the extracted URLs back to URLManager
                self.output.store_data(data)
                for url in urls:
                    self.manager.add_new_url(url)
                print('Crawled {0} links:'.format(self.manager.old_urls_size()), new_url)
            except Exception as e:
                print(e.args)
                print('crawl failed:', url)
        self.output.output_html()
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownLoader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Loop while the URL manager has new URLs and fewer than 100 have been crawled
        while self.manager.has_new_url() and self.manager.old_url_size() < 100:
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print(self.manager.old_url_size())
                print(data)
            except Exception as e:
                print('crawl failed')
        self.output.output_question()
        self.output.output_answer()
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        self.manager.add_new_url(root_url)
        while self.manager.has_new_url() and self.manager.old_url_size() < 50:
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                if html is None:
                    print('failed to get pages')
                new_urls, data = self.parser.parser(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print('has scraped %s links' % self.manager.old_url_size())
            except Exception as e:
                print('crawl failed')
        self.output.output_html()
class SpiderWork(object):
    def __init__(self):
        BaseManager.register('get_task_queue')
        BaseManager.register('get_result_queue')
        server_addr = '127.0.0.1'
        print('Connect to server %s...' % server_addr)
        self.m = BaseManager(address=(server_addr, 8001), authkey='baike'.encode('utf-8'))
        self.m.connect()
        self.task = self.m.get_task_queue()
        self.result = self.m.get_result_queue()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        print('init finish')

    def crawl(self):
        while True:
            try:
                if not self.task.empty():
                    url = self.task.get()
                    print(url)
                    if url == 'end':
                        print('The control node told the worker node to stop...')
                        self.result.put({'new_urls': 'end', 'data': 'end'})
                        return
                    print('The worker node is parsing: %s' % url.encode('utf-8'))
                    content = self.downloader.download(url)
                    new_urls, data = self.parser.parser(url, content)
                    self.result.put({"new_urls": new_urls, "data": data})
            except EOFError as e:
                print('Failed to connect to the worker node')
                return
            except Exception as e:
                print(e)
                print('Crawl fail')
class SpiderWorker():
    def __init__(self):
        BaseManager.register('get_task_queue')
        BaseManager.register('get_result_queue')
        server_adrr = '127.0.0.1'
        print('connect to %s...' % server_adrr)
        self.m = BaseManager(address=(server_adrr, 8001), authkey=b'qiye')
        self.m.connect()
        self.task = self.m.get_task_queue()
        self.result = self.m.get_result_queue()
        self.downloader = HtmlDownloader()
        self.htmlparser = HtmlParser()
        self.dataoutput = DataOutput()

    def crawl(self):
        while True:
            try:
                # empty() must be called; referencing the method is always truthy
                if not self.task.empty():
                    url = self.task.get()
                    if url == 'end':
                        return None
                    print('Parsing %s' % url.encode('utf-8'))
                    content = self.downloader.download(url)
                    new_urls, data = self.htmlparser.parser(url, content)
                    self.result.put({'new_urls': new_urls})
                    self.dataoutput.output_mongo({'data': data})
            except Exception as e:
                print(e)
class SpiderMan(object): """爬虫调度器""" def __init__(self): self.urlManager = UrlManager() self.htmlDownloader = HtmlDownloader() self.htmlParser = HtmlParser() self.htmlOutput = DataOutput() def crawl(self, root_url): # 添加入口URL self.urlManager.add_new_url(root_url) # 判断url管理器中是否有新的url,同时判断抓取了多少个url while (self.urlManager.has_new_url() and self.urlManager.old_url_size() < 100): try: # 从URL管理器获取新的url new_url = self.urlManager.get_new_url() # HTML下载器下载网页 html = self.htmlDownloader.download(new_url) # HTML解析器抽取网页数据 new_urls, data = self.htmlParser.parser(new_url, html) # 将抽取的url添加到URL管理器中 self.urlManager.add_new_urls(new_urls) # 数据存储器存储数据 self.htmlOutput.store_data(data) except Exception as e: print(traceback.format_exc()) # 数据存储器将文件输出成指定格式 self.htmlOutput.output_html()
class SpiderMan:
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Loop while the URL manager has new URLs and fewer than 100 have been crawled
        while self.manager.has_new_url() and self.manager.old_url_size() < 100:
            try:
                # Get a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # Download the page with the HTML downloader
                html = self.downloader.download(new_url)
                # print(html)
                # Extract data from the page with the HTML parser
                new_urls, data = self.parser.parser(new_url, html)
                # Add the extracted URLs to the URL manager
                self.manager.add_new_urls(new_urls)
                # Store the data with the data output module
                self.output.store_data(data)
                print("Crawled %s links so far" % self.manager.old_url_size())
            except Exception as e:
                print("crawl failed")
        self.output.output_html()
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawlOneTag(self, book_tag):
        page_num = 0
        book_list = []
        while page_num <= 2:
            try:
                new_url = self.manager.get_new_url(page_num, book_tag)
                html = self.downloader.download(new_url)
                book_list += self.parser.parser(html)
            except Exception as e:
                print("crawl failed")
            page_num += 1
        return book_list

    def crawlAllTags(self, book_tag_lists, topath):
        book_lists = []
        for book_tag in book_tag_lists:
            book_list = self.crawlOneTag(book_tag)
            book_list = sorted(book_list, key=lambda x: x[1], reverse=True)
            book_lists.append(book_list)
        self.output.output(book_lists, book_tag_lists, topath)
class SpiderWork():
    def __init__(self):
        BaseManager.register('get_task_queue')
        BaseManager.register('get_result_queue')
        server_addr = '127.0.0.1'
        print('Connect to server %s' % server_addr)
        self.m = BaseManager(address=(server_addr, 8001), authkey=b'baike')
        self.m.connect()
        self.task = self.m.get_task_queue()
        self.result = self.m.get_result_queue()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        print('init finish')

    def crawl(self):
        while True:
            try:
                if not self.task.empty():
                    url = self.task.get()
                    if url == 'end':
                        print('The control node told the spider to stop working')
                        self.result.put({'new_urls': 'end', 'data': 'end'})
                        return
                    print('The spider is parsing %s' % url.encode('utf-8'))
                    content = self.downloader.download(url)
                    new_urls, data = self.parser.parser(url, content)
                    self.result.put({'new_urls': new_urls, 'data': data})
            except EOFError:
                print('Failed to connect to the worker node')
                return
            except Exception as e:
                # Print the exception instance, not the Exception class
                print(e)
                print('Crawl fail')
class SpiderMain(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Add the entry URL
        self.manager.add_new_url(root_url)
        # Loop while the URL manager has new URLs and fewer than 100 have been crawled
        while self.manager.has_new_url() and self.manager.old_url_size() < 100:
            try:
                # Get a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # Download the page with the HTML downloader
                html = self.downloader.download(new_url)
                # Extract data from the page with the HTML parser
                new_urls, data = self.parser.parser(new_url, html)
                # Add the extracted URLs to the URL manager
                self.manager.add_new_urls(new_urls)
                # Store the data with the data output module
                self.output.store_data(data)
                print("Crawled %s links so far" % self.manager.old_url_size())
                # print(new_url)
            except Exception as e:
                print("crawl failed")
        # Write the stored data out in the target format
        self.output.output_html()
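Schedulers like the SpiderMain above are normally driven by a small entry point that supplies the seed URL. A minimal, hypothetical launcher might look like this; the start URL is a placeholder, not one used by the original code.

# Hypothetical entry point for the scheduler above; the seed URL is a placeholder.
if __name__ == '__main__':
    spider = SpiderMain()
    spider.crawl('https://example.com/start-page')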
def __init__(self):
    self.manager = UrlManager()
    self.downloader = HtmlDownloader()
    self.parser = HtmlParser()
    self.dataoutput = DataOutput()
    self.mongoengine = Use_MongoEngine()
    self.urloutput = Url_info_Output()
class SpiderMan(object):
    def __init__(self):
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = HtmlOutput()

    def crawl(self, root_url):
        album_response = self.downloader.download(root_url)
        self.output.output_head()
        for album in self.parser.get_kw_album(album_response):
            self.output.output_album(album)
            track_url = ('http://mobile.ximalaya.com/mobile/v1/album/ts-1552364593682'
                         '?ac=WIFI&albumId=%d&device=android&isAsc=true'
                         '&isQueryInvitationBrand=true&pageId=1&pageSize=20'
                         '&pre_page=0&source=0&supportWebp=true' % album['albumId'])
            track_response = self.downloader.download(track_url)
            track_info = self.parser.get_kw_track(track_response)
            self.output.output_track(track_info)
        self.output.output_end()
class Spider_Scheduler(object):
    def __init__(self):
        self.urlmanager = UrlQueue()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Seed the URL manager with the entry URL
        self.urlmanager.add_new_url(root_url)
        # Loop while the URL manager has new URLs and fewer than 100 have been crawled
        while self.urlmanager.has_new_url() and self.urlmanager.old_url_size() < 100:
            try:
                # Get a new URL from the URL manager
                new_url = self.urlmanager.get_new_url()
                # Download the page with the HTML downloader
                html = self.downloader.download(new_url)
                # Extract data from the page with the HTML parser
                new_urls, data = self.parser.parser(new_url, html)
                # Add the extracted URLs to the URL manager
                self.urlmanager.add_new_urls(new_urls)
                # Serialize the data into the output list
                self.output.data_to_list(data)
                print("Crawled %s links so far" % self.urlmanager.old_url_size())
            except Exception as e:
                print("crawl failed")
        # Write the collected data out in the target format
        self.output.output_html()
def __init__(self):
    BaseManager.register('get_task_queue')
    BaseManager.register('get_result_queue')
    server_addr = '127.0.0.1'
    print('Connect to server %s...' % server_addr)
    self.m = BaseManager(address=(server_addr, 8001), authkey='qiye'.encode('utf-8'))
    print('connecting...')
    self.m.connect()
    print('connected')
    self.task = self.m.get_task_queue()
    self.result = self.m.get_result_queue()
    self.downloader = HtmlDownloader()
    self.parser = HtmlParser()
    print('spider init finish')
class SpiderWork(object):
    def __init__(self):
        BaseManager.register('get_task_queue')
        BaseManager.register('get_result_queue')
        server_addr = '127.0.0.1'
        print('Connect to server %s...' % server_addr)
        self.m = BaseManager(address=(server_addr, 8001), authkey='qiye'.encode('utf-8'))
        print('connecting...')
        self.m.connect()
        print('connected')
        self.task = self.m.get_task_queue()
        self.result = self.m.get_result_queue()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        print('spider init finish')

    def crawl(self):
        while True:
            try:
                # print(self.task)
                if not self.task.empty():
                    url = self.task.get()
                    if url == 'end':
                        print('stop...')
                        # Notify the other nodes to stop
                        self.result.put({'new_urls': 'end', 'data': 'end'})
                        return
                    print('spider is working on %s' % url)
                    content = self.downloader.download(url)
                    new_urls, data = self.parser.parser(url, content)
                    self.result.put({'new_urls': new_urls, 'data': data})
            except EOFError as e:
                print('cannot connect other')
                return
            except Exception as e:
                print(e)
                print('crawl fail')
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        self.manager.add_new_url(root_url)
        while self.manager.has_new_url() and self.manager.old_url_size() < 100:
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print("already get %s url" % self.manager.old_url_size())
            except Exception as e:
                print("crawl failed")
        self.output.output_html()
def __init__(self):
    self.manager = UrlManager()
    self.downloader = HtmlDownloader()
    self.parser = HtmlParser()
    self.output = DataOutput()
class collector():
    '''
    Extract the relevant tag contents from the HTML.
    '''
    def __init__(self, html):
        self.html = html
        self.d = pq(html)
        self.d('script').remove()
        self.d('style').remove()
        self.html_parser = HtmlParser(self.html)

    def clear_other_node(self):
        '''
        Remove tags that are not needed.
        '''
        self.d('head').remove()
        self.d('h1').remove()
        self.d('h2').remove()
        self.d('h3').remove()
        self.d('b').remove()
        self.d('a').remove()

    def get_title(self):
        '''
        Extract the title.
        '''
        return self.d('title').text()

    def get_node(self, node):
        '''
        Extract the text of character-type nodes as a string.
        '''
        nodes = self.html_parser.get_node(node)
        text = ''
        for i in nodes:
            text += i
        return text

    def get_urls(self):
        '''
        Return the URLs; used together with get_as.
        '''

    def xml(self, docID):
        '''Return the XML source.'''
        # Use docID to find tem_home_url in sortedurls
        self.transurl.setTemHomeUrl(docID)  # determine tem_home_url
        str = '<html></html>'
        titleText = self.d('title').text()
        self.dd = dom.parseString(str)
        # print self.dd
        html = self.dd.firstChild
        # Build the title node
        htmlCtrl = htmlctrl(self.d.html())
        title = self.dd.createElement('title')
        html.appendChild(title)
        title.setAttribute('text', titleText)
        # Build the b nodes
        bb = htmlCtrl.gNode('b')
        b = self.dd.createElement('b')
        for i in bb:
            ii = self.dd.createElement('item')
            ii.setAttribute('text', i)
            b.appendChild(ii)
        html.appendChild(b)
        # Build the h1 nodes
        bb = htmlCtrl.gNode('h1')
        b = self.dd.createElement('h1')
        for i in bb:
            ii = self.dd.createElement('item')
            ii.setAttribute('text', i)
            b.appendChild(ii)
        html.appendChild(b)
        # Build the h2 nodes
        bb = htmlCtrl.gNode('h2')
        b = self.dd.createElement('h2')
        for i in bb:
            ii = self.dd.createElement('item')
            ii.setAttribute('text', i)
            b.appendChild(ii)
        html.appendChild(b)
        # Build the h3 nodes
        bb = htmlCtrl.gNode('h3')
        b = self.dd.createElement('h3')
        for i in bb:
            ii = self.dd.createElement('item')
            ii.setAttribute('text', i)
            b.appendChild(ii)
        html.appendChild(b)
        # Build the a nodes
        aa = htmlCtrl.gA()
        a = self.dd.createElement('a')
        for i in aa:
            # i = self.transurl.trans_d(i)  # convert the URL to a standard absolute address
            aindex = self.dd.createElement('item')
            aindex.setAttribute('name', i)
            # aindex.setAttribute('href', self.a_trav(aa[i]))
            aindex.setAttribute('href', self.transurl.trans_d(aa[i]))
            a.appendChild(aindex)
        html.appendChild(a)
        # Add the content node
        htmltext = self.d.html().decode('gbk', 'ignore').encode('utf-8')
        ht = pq(htmltext)
        # Note: the HTML may contain special character entities such as &# and so on;
        # these are handled separately during word segmentation.
        content = ht.text()
        cc = self.dd.createElement('content')
        ctext = self.dd.createTextNode(content)
        cc.appendChild(ctext)
        html.appendChild(cc)
        # print self.dd.toprettyxml()
        return self.dd
def __init__(self, html):
    self.html = html
    self.d = pq(html)
    self.d('script').remove()
    self.d('style').remove()
    self.html_parser = HtmlParser(self.html)