Ejemplo n.º 1
0
 def __init__(self):
     """Instantiate HtmlDownload, HtmlParse, UrlManage and HtmlOutputer and keep them on the instance."""
     # In each call below the first name is the module and the second is the
     # name of the object defined inside that module.
     # Alternative left commented out by the author: html_download.HtmlDownload()
     self.html_download = html_download3.HtmlDownload()
     self.html_parser = HtmlParse.HtmlParse()
     self.urlmanage = UrlManage.UrlManage()
     self.html_output = html_outputer.HtmlOutputer()
Ejemplo n.º 2
0
 def populateFrame_2(self):
     """Switch frame 2 into edit mode for the entry selected in the combo box.

     Removes frame 3 and the current insert button from the grid, enables
     frame 2, installs a 'Confirm Edit and View' button bound to
     ``confirmEdit``, fills the filename and body widgets from the combo box
     selection, and then sets the filename widget's state to 'disable'.
     """
     self.frame_3.grid_forget()
     self.enableFrame_2()
     self.insertBtn.grid_forget()
     # grid() returns None, so the widget has to be stored first and laid out
     # in a separate call. Chaining both left self.insertBtn set to None,
     # which made the grid_forget() above fail on the next invocation.
     self.insertBtn = ttk.Button(self.frame_2, text='Confirm Edit and View', command=self.confirmEdit)
     self.insertBtn.grid(row=6, column=0, sticky=N+S+E+W)
     self.filename.insert(0, self.combox.get())
     # The body text is loaded from 'Pages/<selected name>' through hp.getBodyText.
     self.body.insert(1.0, hp.getBodyText('Pages/' + self.combox.get()))
     self.filename.configure(state='disable')
Ejemplo n.º 3
0
def GetDataFromWebsite(L, dbType, siteType, sleepTime):
    """Crawl every site in ``L`` and insert what was read into the database.

    Args:
        L: iterable of sites; each item is handed to ``spider.mySite``.
        dbType: value passed to ``init_database_type`` and to
            ``InsertIntoDataBase``.
        siteType: site type passed to the spider together with each site.
        sleepTime: seconds to pause (``time.sleep``) after each site.
    """
    spiderdb = qbdb()
    spider = htParse.URLPARSE()
    spiderdb.init_database_type(dbType)
    for curpage, site in enumerate(L, start=1):
        # Tell the spider which URL to crawl and what type of URL it is.
        spider.mySite(site, siteType)
        # Insert the data that was just read into the database.
        InsertIntoDataBase(spider, spiderdb, dbType)
        # %d requires a number; the previous str(curpage) argument raised
        # TypeError as soon as the first page had been processed.
        print('''当前处理页数:%d''' % curpage)
        time.sleep(sleepTime)
Ejemplo n.º 4
0
 def run(self, http_address):
     """Run the three analysis passes for ``http_address``.

     Sets ``self.is_running`` to True, builds an ``HtmlParse`` for the
     address and calls ``get_keywords``, ``get_tag_hs`` and ``get_tag_p`` in
     that order, re-checking ``self.is_running`` before each pass.

     Returns:
         1 when all three passes were called, 0 as soon as
         ``self.is_running`` is found falsy before a pass.
     """
     self.is_running = True
     analyse = HtmlParse(http_address)
     for step_name in ('get_keywords', 'get_tag_hs', 'get_tag_p'):
         if not self.is_running:
             return 0
         # Look the method up only when its turn comes, then call it.
         getattr(analyse, step_name)()
     return 1
Ejemplo n.º 5
0
def InsertData(startindex, endindex):
    """Crawl listing pages ``startindex``..``endindex`` (inclusive) and store them.

    For every page the main spider fills ``spider.pagestory``. A second
    spider is then pointed at element 1 of each entry with site type 256 and,
    if it collected anything, element 1 is overwritten with its first result.
    After each page the spider's data is inserted into the database, and
    ``Showwebsite`` is called once after the last page.
    """
    spiderdb = qbdb()
    spider = htParse.URLPARSE()
    website = ('http://www.ixxzy22.com/?m=vod-index-pg-%s.html', 2, 3, 14)
    url_template, db_type = website[0], website[1]
    # Child spider, because two kinds of URL have to be parsed.
    spiderson = htParse.URLPARSE()
    spiderdb.init_database_type(db_type)

    curpage = startindex
    while curpage <= endindex:
        page_label = str(curpage)
        print(u'''当前处理页数:%s''' % page_label)
        # Tell the spider which URL to crawl and what type of URL it is.
        spider.pagestory = []
        spider.mySite(url_template % page_label, db_type)
        for index, entry in enumerate(spider.pagestory):
            print(u'''当前处理编号:%s''' % str(index))
            spiderson.pagestory = []
            spiderson.mySite(entry[1], 256)
            if spiderson.pagestory:
                entry[1] = spiderson.pagestory[0]
            time.sleep(1.5)
        # Insert the data that was just read into the database.
        InsertIntoDataBase(spider, spiderdb, db_type)
        curpage += 1
    spiderdb.Showwebsite()
Ejemplo n.º 6
0
def getQiuBai():
    """Crawl the configured qiushibaike listings and store their pages.

    Three listing URL patterns are configured (textnew, text, hot). For each
    of them the database type is initialised from the task tuple, pages
    1..35 are crawled and inserted with a 0.5 s pause between pages, and
    ``ShowTopTen`` is called once the listing is finished.
    """
    spiderdb = qbdb()
    spider = htParse.URLPARSE()
    qiubaiType = [('http://www.qiushibaike.com/textnew/page/%s', 1, 3, 14),
                  ('http://www.qiushibaike.com/text/page/%s', 1, 3, 14),
                  ('http://www.qiushibaike.com/hot/page/%s', 1, 3, 14)]
    last_page = 35
    for sinTask in qiubaiType:
        spiderdb.init_database_type(sinTask[1])
        curpage = 1
        while curpage <= last_page:
            # Tell the spider which URL to crawl and what type of URL it is.
            spider.mySite(sinTask[0] % (str(curpage)), 1)
            # Insert the data that was just read into the database.
            # NOTE(review): the third tuple field is passed here while
            # init_database_type receives the second one -- confirm intended.
            InsertIntoDataBase(spider, spiderdb, sinTask[2])
            curpage += 1
            print('''当前处理页数:''')
            print(curpage)
            time.sleep(0.5)
        spiderdb.ShowTopTen()
        # Deliberately no `return` here: the previous version returned at the
        # end of the first iteration, so the second and third listings in
        # qiubaiType were never crawled.
Ejemplo n.º 7
0
 def populateFrame_2(self):
     """Fill frame 2 from the entry currently selected in the combo box.

     Enables frame 2, inserts the selected name into the filename widget,
     inserts the text returned by ``hp.getBodyText`` for
     'Pages/<selected name>' into the body widget, and then sets the
     filename widget's state to 'disable'.
     """
     self.enableFrame_2()
     selected_name = self.combox.get()
     self.filename.insert(0, selected_name)
     # The combo box is queried again for the path, exactly as before.
     page_path = 'Pages/' + self.combox.get()
     body_text = hp.getBodyText(page_path)
     self.body.insert(1.0, body_text)
     self.filename.configure(state='disable')