def crawler_list(self, url, conf, type_p='rp', charset='utf8', row=None):
    """Crawl one list page, extract rows per *conf*, and return the next-page URL.

    Args:
        url: Page URL to fetch.
        conf: Rule dict with 'group' (row xpath), 'columns' (per-field rules)
            and optionally 'nextPage' (xpath selecting the next-page link).
        type_p: Fetch mode forwarded to HtmlSource.get_html.
        charset: Page charset forwarded to HtmlSource.get_html.
        row: Seed data merged into every extracted row (default: empty dict).

    Returns:
        A tuple ``(rows, next_url)``; ``next_url`` is resolved to an absolute
        URL against *url*, or ``None`` when no next page is configured/found.

    Raises:
        Exception: ``(1001, ...)`` when the page stays empty after retries;
            fetch errors from HtmlSource.get_html propagate unchanged.
    """
    # `row=None` instead of `row={}`: avoid the shared-mutable-default pitfall.
    if row is None:
        row = {}
    htmlSource = HtmlSource()
    # Fetch the page source. The original wrapped this in
    # `except Exception as e: raise e`, which is a no-op — let it propagate.
    html_context = htmlSource.get_html(url_p=url, type_p=type_p, charset_p=charset)
    # Retry up to twice on suspiciously short responses. Bug fix: the original
    # retried with get_html(url_p=url) only, silently dropping the caller's
    # type_p/charset — retries now use the same fetch parameters.
    attempt = 0
    while len(html_context) < 128 and attempt < 2:
        html_context = htmlSource.get_html(url_p=url, type_p=type_p, charset_p=charset)
        attempt += 1
    if len(html_context) < 128:
        raise Exception(1001, '网页访问失败,无内容!')
    # Parse the document and extract one record per 'group' match.
    # NOTE: the `list=` keyword is part of _analysis_list's interface and
    # must stay even though it shadows the builtin name there.
    tree = html.fromstring(html_context)
    result_list = tree.xpath(conf['group'])
    result_list_context = self._analysis_list(
        list=result_list, columns=conf['columns'], url=url, row=row)
    # Resolve the next-page link (if configured) to an absolute URL.
    if 'nextPage' in conf:
        next_page = tree.xpath(conf['nextPage'])
        if next_page:
            return result_list_context, parse.urljoin(url, next_page[0])
    return result_list_context, None
def openUrl(self):
    """Fetch the configured URL's raw HTML into the browser pane and pre-fill
    the default row / next-page XPath inputs."""
    # Show the raw-HTML view and hide the result table.
    self.textBrowser.setVisible(True)
    self.tableView.setVisible(False)
    # Take the target URL from the input field and remember it in the config.
    self.conf["url"] = self.urlLineEdit.text()
    # Download the page source and display it.
    source = HtmlSource()
    self.html_context = source.get_html(url_p=self.conf["url"], type_p='rg')
    self.textBrowser.setText(self.html_context)
    # Seed the row-selection XPath...
    self.lineEdit.setText(
        "//div[@class =\"padd w645\"]/div[@class=\"list_left\"]"
        "/div[@class=\"topic-list\"]/ul/li")
    # ...and the next-page-link XPath.
    self.lineEdit_4.setText(
        "//div[@class =\"padd w645\"]/div[@class=\"list_left\"]"
        "/div[@class=\"show-page\"]/a[@class=\"next\"]/@href")
def crawler(self, nextPageUrl='', times=1):
    """Crawl a list page, render the extracted rows into the table view,
    then follow the next-page link recursively (at most 5 pages).

    Args:
        nextPageUrl: URL of the page to crawl.
        times: Recursion depth counter; crawling stops once it exceeds 5.
    """
    # Depth guard: stop after 5 pages (dropped the redundant C-style parens).
    if times > 5:
        return
    times += 1
    # Column headers come from each rule's '名称' (display name) field.
    title = [column['名称'] for column in self.conf['columns']]
    # Show the raw HTML while fetching.
    self.textBrowser.setVisible(True)
    self.tableView.setVisible(False)
    htmlSource = HtmlSource()
    html_context = htmlSource.get_html(url_p=nextPageUrl, type_p='rg')
    self.textBrowser.setText(html_context)
    time.sleep(1)
    tree = html.fromstring(html_context)
    result_list = tree.xpath(self.conf['body_content_xpath'])
    if len(result_list) > 0:
        # Extract one dict per row. Renamed from `list`, which shadowed the
        # builtin; the `list=` keyword itself is _analysis_list's interface.
        rule = Rule()
        rows = rule._analysis_list(list=result_list, columns=self.conf["columns"])
        print(rows)  # TODO: replace debug output with proper logging
        # Build a fresh model sized rows x columns and fill it cell by cell.
        self.tableView.model = QStandardItemModel(len(rows), len(self.conf['columns']))
        self.tableView.model.setHorizontalHeaderLabels(title)
        for r, record in enumerate(rows):
            for c, name in enumerate(title):
                self.tableView.model.setItem(r, c, QStandardItem(record[name]))
        self.tableView.setModel(self.tableView.model)
        # Swap from the raw-HTML pane to the table view.
        self.textBrowser.setVisible(False)
        self.tableView.setVisible(True)
        time.sleep(1)
        # Follow the configured next-page xpath, if it matched anything.
        nextpageurl = tree.xpath(self.conf['nextpage'])
        if len(nextpageurl) > 0:
            print(nextpageurl[0])
            self.crawler(nextPageUrl=nextpageurl[0], times=times)
def crawler_detail(self, conf, url='', type_p='rp', charset='utf8', row=None):
    """Crawl one detail page and extract a single record per *conf*.

    Args:
        conf: Rule dict with 'group' (context-root xpath) and 'columns'
            (per-field extraction rules).
        url: Page URL to fetch.
        type_p: Fetch mode forwarded to HtmlSource.get_html.
        charset: Page charset forwarded to HtmlSource.get_html.
        row: Seed data merged into the extracted record (default: empty dict).

    Returns:
        The extracted record, or ``None`` when 'group' matches nothing.

    Raises:
        Exception: ``(1001, ...)`` when the page stays empty after retries;
            fetch errors from HtmlSource.get_html propagate unchanged.
    """
    # `row=None` instead of `row={}`: avoid the shared-mutable-default pitfall.
    if row is None:
        row = {}
    htmlSource = HtmlSource()
    # Fetch the page source. The original's `except Exception as e: raise e`
    # wrapper was a no-op — let fetch errors propagate directly.
    html_context = htmlSource.get_html(url_p=url, type_p=type_p, charset_p=charset)
    # Retry up to twice on suspiciously short responses. Bug fix: retries now
    # keep the caller's type_p/charset instead of falling back to defaults.
    attempt = 0
    while len(html_context) < 128 and attempt < 2:
        html_context = htmlSource.get_html(url_p=url, type_p=type_p, charset_p=charset)
        attempt += 1
    if len(html_context) < 128:
        raise Exception(1001, '网页访问失败,无内容!')
    # Parse and extract against the first 'group' match only.
    tree = html.fromstring(html_context)
    result_list = tree.xpath(conf['group'])
    if result_list is not None and len(result_list) > 0:
        return self._analysis_context(
            tree=result_list[0], columns=conf['columns'], url=url, row=row)
    return None
# -- coding: UTF-8 --
# Smoke-test script: fetch one known recipe page with HtmlSource in both
# the default fetch mode and the 'rg' mode, dumping each result to stdout.
from common.HtmlSource import HtmlSource

TARGET_URL = 'https://www.meishij.net/zuofa/hubeixianroutangyuan.html'

source = HtmlSource()
# Default fetch mode.
print(source.get_html(url_p=TARGET_URL))
# 'rg' fetch mode.
print(source.get_html(url_p=TARGET_URL, type_p='rg'))
#!/usr/bin/env python # -*- coding: UTF-8 -*- from common.HtmlSource import HtmlSource from common.Rule import Rule # from common.inc_conn import Conn_mysql from common.inc_file import File_file, File_floder from common.inc_csv import Csv_base import time htmlSource = HtmlSource() rule = Rule() path = 'D:/newpro/6.1' # 多线程 def read_detial(url, i): detial_html = htmlSource.get_html(url_p=url, type_p='rg') #print(detial_html) # 写html files = File_file() names = url.split('/') file_name = names[len(names) - 1] files.save_source(path=path, file=file_name, all_the_text=detial_html, encoding_='utf-8') colum = [ ('title', '//h1[@class="articleHead"]//text()', 'l'), ('pushDate', '//div[@class="info"]//span[@class="infoA"][@id="pubtime_baidu"]//text()',
def openUrl(self):
    """Read the URL list (one per line) from the text box, fetch the first
    URL, and show its raw HTML in the browser pane."""
    # toPlainText() is the correct accessor for a QTextEdit's content.
    self.conf["urlList"] = self.textEdit.toPlainText().split("\n")
    self.textBrowser.setVisible(True)
    # Only the first URL is fetched here; the rest are kept for later use.
    fetcher = HtmlSource()
    page_source = fetcher.get_html(url_p=self.conf["urlList"][0], type_p='rg')
    self.textBrowser.setText(page_source)