def __init__(self):
    """Wire up the crawler's four components.

    Each attribute holds an instance built from its module; the qualified
    names read as module first, then the class defined inside that module
    (e.g. ``html_download3.HtmlDownload``).
    """
    self.html_download = html_download3.HtmlDownload()
    self.html_parser = HtmlParse.HtmlParse()
    self.urlmanage = UrlManage.UrlManage()
    self.html_output = html_outputer.HtmlOutputer()
def populateFrame_2(self):
    """Switch the UI from frame_3 to frame_2 and load the selected page for editing.

    Hides frame_3, enables frame_2, swaps the action button for a
    "Confirm Edit and View" button, and pre-fills the filename and body
    widgets from the combobox selection.
    """
    self.frame_3.grid_forget()
    self.enableFrame_2()
    # Remove the previous button before creating its replacement.
    self.insertBtn.grid_forget()
    # BUG FIX: the original did `self.insertBtn = ttk.Button(...).grid(...)`,
    # which stores None (grid() returns None), so the next call to this
    # method would crash on self.insertBtn.grid_forget(). Create the widget
    # first, keep the reference, then grid it.
    self.insertBtn = ttk.Button(self.frame_2, text='Confirm Edit and View',
                                command=self.confirmEdit)
    self.insertBtn.grid(row=6, column=0, sticky=N + S + E + W)
    self.filename.insert(0, self.combox.get())
    self.body.insert(1.0, hp.getBodyText('Pages/' + self.combox.get()))
    # Lock the filename field once it has been populated.
    self.filename.configure(state='disable')
def GetDataFromWebsite(L, dbType, siteType, sleepTime):
    """Crawl every site in *L* and persist the parsed results.

    Args:
        L: iterable of site URLs to crawl.
        dbType: database type code passed to qbdb.init_database_type and
            to InsertIntoDataBase.
        siteType: site-type code forwarded to the parser's mySite().
        sleepTime: seconds to pause between sites (politeness delay).
    """
    spiderdb = qbdb()
    spider = htParse.URLPARSE()
    spiderdb.init_database_type(dbType)
    curpage = 0
    for site in L:
        # Point the spider at the next site, with its type code.
        spider.mySite(site, siteType)
        # Persist whatever the spider collected for this site.
        InsertIntoDataBase(spider, spiderdb, dbType)
        curpage += 1
        # BUG FIX: the original formatted '%d' % str(curpage), which raises
        # TypeError ("%d format: a number is required"); pass the int directly.
        print('''当前处理页数:%d''' % curpage)
        time.sleep(sleepTime)
def run(self, http_address):
    """Analyse the page at *http_address* in three cancellable steps.

    Marks the worker as running, then extracts keywords, heading tags and
    paragraph tags in order. Before each step the ``is_running`` flag is
    re-checked so an external stop request aborts the remaining work.

    Returns:
        1 when all three steps completed, 0 when stopped early.
    """
    self.is_running = True
    analyse = HtmlParse(http_address)
    for step in (analyse.get_keywords, analyse.get_tag_hs, analyse.get_tag_p):
        if not self.is_running:
            return 0
        step()
    return 1
def InsertData(startindex, endindex):
    """Crawl listing pages startindex..endindex (inclusive) of ixxzy22.com
    and store the parsed results in the database.

    For each listing page, every collected entry's second field is assumed
    to hold a detail-page URL; a child spider fetches that page and the URL
    is replaced with the parsed detail content before insertion.
    """
    spiderdb = qbdb()
    spider = htParse.URLPARSE()
    # (url template, db type, ?, ?) — layout mirrors the task tuples in
    # getQiuBai; the meaning of fields 2 and 3 is not visible here.
    website = ('http://www.ixxzy22.com/?m=vod-index-pg-%s.html', 2, 3, 14)
    # Child spider: two kinds of URL need parsing (listing page + detail page).
    spiderson = htParse.URLPARSE()
    spiderdb.init_database_type(website[1])
    curpage = startindex
    while curpage <= endindex:
        print(u'''当前处理页数:%s''' % (str(curpage)))
        # Reset the result buffer, then crawl the current listing page.
        spider.pagestory = []
        spider.mySite(website[0] % (str(curpage)), website[1])
        for index in range(len(spider.pagestory)):
            # print(u'''当前处理编号:%s''' % (str(index)))
            spiderson.pagestory = []
            # Type code 256 presumably selects the detail-page parsing
            # branch in URLPARSE — TODO confirm against htParse.
            spiderson.mySite(spider.pagestory[index][1], 256)
            if (len(spiderson.pagestory) > 0):
                # Swap the detail-page URL for the parsed detail content.
                spider.pagestory[index][1] = spiderson.pagestory[0]
            # Politeness delay between detail-page fetches.
            time.sleep(1.5)
        curpage = (curpage + 1)
        # Persist everything collected for this listing page.
        InsertIntoDataBase(spider, spiderdb, website[1])
    spiderdb.Showwebsite()
def getQiuBai():
    """Crawl 35 pages of each qiushibaike section and store the results.

    Iterates over three task tuples of the form
    (url template, type code, insert arg, ?) — the same layout used by
    InsertData — crawling pages 1..35 of each and inserting what was
    collected, then prints the top-ten summary.
    """
    spiderdb = qbdb()
    spider = htParse.URLPARSE()
    qiubaiType = [('http://www.qiushibaike.com/textnew/page/%s', 1, 3, 14),
                  ('http://www.qiushibaike.com/text/page/%s', 1, 3, 14),
                  ('http://www.qiushibaike.com/hot/page/%s', 1, 3, 14)]
    for sinTask in qiubaiType:
        spiderdb.init_database_type(sinTask[1])
        curpage = 1
        while curpage <= 35:
            # Use the task's own type code instead of the hard-coded literal 1
            # the original had (behavior-identical here — every tuple carries 1 —
            # and consistent with InsertData, which passes website[1]).
            spider.mySite(sinTask[0] % (str(curpage)), sinTask[1])
            # NOTE(review): InsertData passes the db type (index 1) here, while
            # this passes sinTask[2] — confirm which argument
            # InsertIntoDataBase actually expects.
            InsertIntoDataBase(spider, spiderdb, sinTask[2])
            curpage += 1
            print('''当前处理页数:''')
            print(curpage)
            # Politeness delay between pages.
            time.sleep(0.5)
    spiderdb.ShowTopTen()
    return
def populateFrame_2(self):
    """Enable frame_2 and pre-fill it from the current combobox selection.

    The selected page's name goes into the filename entry and its body text
    (loaded from the Pages/ directory) into the body widget; the filename
    field is then locked against editing.
    """
    self.enableFrame_2()
    selected = self.combox.get()
    self.filename.insert(0, selected)
    self.body.insert(1.0, hp.getBodyText('Pages/' + selected))
    # Lock the filename field once it has been populated.
    self.filename.configure(state='disable')