def __init__(self):
    self.urls = url_manager.UrlManager()                # initialize the URL manager
    self.downloader = html_downloader.HtmlDownloader()  # initialize the page downloader
    self.parser = html_parser.HtmlParser()              # initialize the page parser
    self.outputer = html_outputer.HtmlOutputer()        # initialize the result outputer
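# The initializers in this collection share the same four-part split: a URL
# manager, a downloader, a parser, and an outputer. A minimal crawl loop tying
# them together might look like the sketch below; the method names
# (add_new_url, has_new_url, download, parse, collect_data, output_html) are
# assumptions based on this common crawler pattern, not taken from the
# snippets themselves.
def craw(self, root_url):
    count = 1
    self.urls.add_new_url(root_url)                 # seed the queue
    while self.urls.has_new_url():
        try:
            new_url = self.urls.get_new_url()       # take one pending URL
            html_cont = self.downloader.download(new_url)
            new_urls, new_data = self.parser.parse(new_url, html_cont)
            self.urls.add_new_urls(new_urls)        # queue discovered links
            self.outputer.collect_data(new_data)    # buffer the parsed record
            if count == 100:                        # cap the crawl
                break
            count += 1
        except Exception:
            print('craw %d failed' % count)
    self.outputer.output_html()                     # flush buffered records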
def __init__(self):
    self.urls = url_manager.UrlManager()                # initialize the URL scheduler
    self.downloader = html_downloader.HtmlDownloader()  # initialize the downloader (requests)
    self.parser = html_parser.HtmlParser()              # initialize the HTML parser
    self.outputer = html_outputer.HtmlOutputer()        # initialize the HTML outputer
def __init__(self):
    self.urls = url_manager.Url_Manager()
    self.downloader = html_downLoader.HtmlDownLoader()
    self.parser = html_parser.HtmlParser()
    self.outputer = html_outputer.OutPuter()
    self.cache = disk_cache.DiskCache()
def get_cookies(self):
    # Open a browser, wait for a manual login, then collect the session cookies.
    driver = webdriver.Chrome()
    driver.get("http://weixin.sogou.com/")
    time.sleep(5)
    driver.find_element_by_xpath('//*[@id="loginBtn"]').click()
    time.sleep(10)
    cookies = driver.get_cookies()
    cookie = {}
    for items in cookies:
        cookie[items.get('name')] = items.get('value')
    return cookie

# Class attributes shared by the methods (referenced as self.b / self.f below).
b = html_parser.HtmlParser()
f = open('category2.csv', 'a')

def task(self, link):
    # Retry the download until the article parses successfully.
    data = None
    while data is None:
        html = self.download_articles_ph(link)
        data = self.b.parse_article(html)
    self.f.write(data[1] + '#' + data[0] + '#' + data[2] + '#' + data[3])
    self.f.write('\n')

if __name__ == "__main__":
    a = HtmlDownloader()
    b = html_parser.HtmlParser()
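# The dict returned by get_cookies maps cookie names to values, which is the
# shape requests accepts directly. A possible follow-up use, assuming
# get_cookies is a method of HtmlDownloader as its self parameter suggests
# (the search URL here is illustrative, not from the source):
import requests

cookie = a.get_cookies()  # dict of name -> value pairs from the browser session
resp = requests.get('http://weixin.sogou.com/weixin?query=python',
                    cookies=cookie)  # reuse the logged-in session
print(resp.status_code)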
def __init__(self):
    self.urls = url_manager.UrlManager()                # URL manager
    self.downloader = html_downloader.HtmlDownloader()  # web page downloader
    self.parser = html_parser.HtmlParser()              # parser
    self.output = html_output.HtmlOutput()              # result outputer
def __init__(self):
    self.maxcount = 100  # maximum number of records to crawl
    self.urls = url_manager.UrlManager()
    self.downloader = html_downloader.HtmlDownloader()
    self.parser = html_parser.HtmlParser()
    self.outputer = html_outputer.HtmlOutputer()
def __init__(self):  # initialization
    self.urls = url_manager.UrlManager()                # URL manager
    self.downloader = html_downloader.HtmlDownloader()  # downloader
    self.parser = html_parser.HtmlParser()              # parser
    self.outputer = html_outputer.HtmlOutputer()        # outputer
def __init__(self):
    self.parser = html_parser.HtmlParser()
    self.cookies = {}
def __init__(self):  # initialization
    self.downloader = html_downloader.HtmlDownloader()  # downloader
    self.parser = html_parser.HtmlParser()              # parser
    self.outputer = html_outputer.HtmlOutputer()        # outputer
try:
    for data in self.datas:
        try:
            sql = """INSERT INTO book(书籍ID,书名,作者,出版社,出版年,页数,定价,ISBN,评分,评价人数,推荐书籍ID,简介) \
                VALUES ("%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s")""" % (
                data['书籍ID'], data['书名'], data['作者'], data['出版社'], data['出版年'],
                int(data['页数']), data['定价'], data['ISBN'], float(data['评分']),
                data['评价人数'], data['推荐书籍ID'], data['简介'])
            with self.connect.cursor() as cursor:
                cursor.execute(sql)
            self.connect.commit()
            self.saved += 1
        except:
            print("Failed to store book ID %s" % (data["书籍ID"]))
finally:
    print("Stored %d book records this run, %d in total" % (self.data_size(), self.saved))
    self.datas.clear()
    self.connect.close()

if __name__ == '__main__':
    outputer = HtmlOutputer()
    data = html_parser.HtmlParser().parse(
        "https://book.douban.com/subject/25862578/",
        requests.get("https://book.douban.com/subject/25862578/").content)
    sql = "INSERT INTO `book` VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)" % (
        data['书籍ID'], data['书名'], data['作者'], data['出版社'], data['出版年'], data['页数'],
        data['定价'], data['ISBN'], data['评分'], data['评价人数'], data['推荐书籍ID'], data['简介'])
    print(sql)
    outputer.college(data)
    outputer.output_database()
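# Interpolating values straight into the SQL string breaks whenever a field
# contains a quote character and leaves the code open to SQL injection. A
# parameterized variant of the same insert, sketched here assuming
# self.connect is a PyMySQL-style connection, lets the driver do the quoting:
sql = ("INSERT INTO book(书籍ID,书名,作者,出版社,出版年,页数,定价,ISBN,"
       "评分,评价人数,推荐书籍ID,简介) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")
params = (data['书籍ID'], data['书名'], data['作者'], data['出版社'], data['出版年'],
          int(data['页数']), data['定价'], data['ISBN'], float(data['评分']),
          data['评价人数'], data['推荐书籍ID'], data['简介'])
with self.connect.cursor() as cursor:
    cursor.execute(sql, params)  # values passed separately, never string-formatted
self.connect.commit()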
def __init__(self):
    self.url_manager = url_manager.Url_Manager()
    self.html_downloader = html_downLoader.HtmlDownLoader()
    self.html_parser = html_parser.HtmlParser()
    self.outputer = html_outputer.OutPuter()
def __init__(self):
    self.urls = url_manager.UrlManager()
    self.download = pic_download.PicDownload()
    self.parser = html_parser.HtmlParser()
    self.ORC = pic_ORC.PicORC()
    self.pics = pic_manager.PicManager()
def __init__(self):
    """Binds attributes to the instance; the first parameter is always self,
    the instance being created (roughly the equivalent of `this`)."""
    self.urls = url_manager.UrlManager()
    self.downloader = html_downloader.HtmlDownloader()
    self.parser = html_parser.HtmlParser()
    self.outputer = html_outputer.HtmlOutputer()
def __init__(self):
    self.urls = url_manager.UrlManager()
    self.downloader = html_downloader.HtmlDownloader()
    self.parser = html_parser.HtmlParser()
    self.outputer = save_result.DataBaseOutputer()
def __init__(self):
    self.urls = url_manager.UrlManager()
    self.downloader = html_downloader.HtmlDownLoader()
    self.parser = html_parser.HtmlParser()
    self.out_put = html_output.HtmlOutput()
    self.conn_mysql = connectmysql.ConnectMysql()
def __init__(self):
    self.datas = []
    self.downloader_image = html_downloader.HtmlDownloader()
    self.parser = html_parser.HtmlParser()
def __init__(self):
    self.urls = url_manager.UrlManager()                # initialize the URL manager
    self.downloader = html_downloader.HtmlDownloader()  # initialize the downloader
    self.parser = html_parser.HtmlParser()              # initialize the parser
    self.outputer = html_outputer.HtmlOutputer()        # initialize the outputer
def __init__(self):
    self.urls = url_manage.UrlManage()
    self.html_download = html_download.HtmlDownload()
    self.html_parser = html_parser.HtmlParser()
    self.output = output.OutPut()
import html_downloader, html_parser, html_outputer
from bs4 import BeautifulSoup

downloader = html_downloader.HtmlDownloader()
parser = html_parser.HtmlParser()
outputer = html_outputer.HtmlOutputer()

num_list = [i for i in range(10, 20)]
url_list = []
for num in num_list:
    url = 'http://jib.xywy.com/il_sii/gaishu/' + str(num) + '.htm'
    url_list.append(url)

def spyder(url):
    html_cont = downloader.download(url)
    soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
    # new_urls, new_data = parser.parse(url, html_cont)
    # print(new_data)
    # print(new_urls)
    res_data = {}
    res_data['url'] = url
    title_node = soup.find('div', class_="jb-name fYaHei gre")
    res_data['title'] = title_node.get_text()
    summary_node = soup.find('div', class_='jib-articl-con jib-lh-articl').find('p')
    gaishu = summary_node.get_text()
    gaishu = gaishu.replace('\r\n\t', '')
    res_data['summary'] = gaishu.strip()
    outputer.collect_data(res_data)
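# The snippet builds url_list but never calls spyder. A minimal driver loop
# would be the following sketch; output_html is an assumed name for whatever
# HtmlOutputer method writes out the collected records:
if __name__ == '__main__':
    for url in url_list:
        try:
            spyder(url)  # scrape one disease-overview page
        except Exception as e:  # soup.find returns None for missing nodes
            print('craw failed for %s: %s' % (url, e))
    outputer.output_html()  # assumed output method on HtmlOutputer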
def __init__(self, file_path):
    self.urls = url_manager.UrlManager()  # URL manager
    self.downloader = html_downloader.HtmlDownloader()
    self.parser = html_parser.HtmlParser(file_path)
    self.outputer = html_outputer.HtmlOutputer()
def __init__(self):
    self.parser = html_parser.HtmlParser()
    self.outputer = mus_output.MusOutPut()
    self.filePath = "/spider-work/mus-spider/music/"
def __init__(self):
    self.downloader = html_download.HtmlDownloader()
    self.parser = html_parser.HtmlParser()
def __init__(self):
    self.downloader = html_downloader.HtmlDownloader()
    self.parser = html_parser.HtmlParser()
    self.outputer = html_outputer.HtmlOutputer()
def __init__(self, config):
    self.config = config
    self.urls = url_manager.UrlManager()
    self.downloader = html_downloader.HtmlDownloader()
    self.parser = html_parser.HtmlParser(root_url=config['url'])
    self.outputer = html_outputer.HtmlOutputer()
def __init__(self):
    self.urls = url_manager.UrlManager()                # URL manager
    self.downloader = html_downloader.HtmlDownloader()  # HTML page downloader
    self.parser = html_parser.HtmlParser()              # HTML parser
    self.outputer = html_outputer.HtmlOutputer()        # HTML outputer
def __init__(self):
    self.urls = url_manager.UrlManager()                # URL manager
    self.downloader = html_downloader.HtmlDownloader()  # downloader
    self.parser = html_parser.HtmlParser()              # parser
    self.outputer = html_outputer.HtmlOutputer()        # outputer
    self.outputer2 = html_db.HtmlDb()                   # database outputer
def __init__(self):
    self.urlManager = url_manager.UrlManager()
    self.parser = html_parser.HtmlParser()
    self.downloader = html_downloader.HtmlDownloader()
    self.collector = data_collector.DataCollector()
def __init__(self):
    # self.keyword = keyword_manager.KeywordManager()
    self.downloader = html_downloader.HtmlDownloader()
    self.parser = html_parser.HtmlParser()
def __init__(self):
    self.urls = url_manager.UrlManager()
    self.downloader = html_downloader.HtmlDownloader()
    self.parser = html_parser.HtmlParser()
    self.outputer = html_outputer.HtmlOutputer()
def __init__(self):
    self.url_list = url_list.UrlList()
    self.processer = html_processer.HtmlProcesser()
    self.parser = html_parser.HtmlParser()
    self.dataSet = dataset.DataSet()