Ejemplo n.º 1
0
 def __init__(self, name, urlQueue, urlist, urlInQueue, Flock, homeUrls, curSiteID=[0], continueRun=[True]):
     '''
     Set up the thread and the parser/storage helpers it works with.

     name        -- thread name, forwarded to threading.Thread.
     urlQueue, urlist, urlInQueue, Flock
                 -- collaborating objects; stored on the instance
                    unchanged, not touched here.
     homeUrls    -- stored on the instance and also handed to UrlParser.
     curSiteID   -- list stored by reference (default [0]).
     continueRun -- list stored by reference (default [True]).

     NOTE: both list defaults are created once, when the function is
     defined, so instances built without these arguments share the very
     same list objects.
     '''
     threading.Thread.__init__(self, name=name)

     # Objects handed in by the caller; kept by reference, never copied.
     self.__urlQueue = urlQueue
     self.__urlInQueue = urlInQueue
     self.__urlist = urlist
     self.__Flock = Flock
     self.__homeUrls = homeUrls
     self.__curSiteID = curSiteID
     self.continueRun = continueRun

     # Per-thread state; -1 / None mean "nothing selected yet".
     self.__temSiteID = -1
     self.__homeurl = None
     self.__conn = None

     # Helper chain: each helper is built on top of the previous one.
     self.urlparser = UrlParser(homeUrls)
     self.htmlparser = HtmlParser(self.urlparser)
     self.htmldb = HtmlDB(self.htmlparser)
Ejemplo n.º 2
0
class Reptile(threading.Thread):
    '''
    单个线程
    '''
    def __init__(self, name, urlQueue, urlist, urlInQueue, Flock, homeUrls ,curSiteID = [0], continueRun = [True]):
        threading.Thread.__init__(self, name = name )  
        #own data
        self.__homeUrls = homeUrls
        self.__urlist = urlist
        self.__urlQueue = urlQueue
        self.__urlInQueue = urlInQueue
        self.__Flock = Flock
        self.__curSiteID = curSiteID
        self.__temSiteID = -1
        self.__conn = None
        self.__homeurl = None
        self.continueRun = continueRun
        #---------------------
        self.urlparser = UrlParser(homeUrls)
        self.htmlparser = HtmlParser(self.urlparser)
        self.htmldb = HtmlDB(self.htmlparser)

    @dec
    def conn(self):
        '''
        DNS缓存
        '''
        if self.__curSiteID[0] != self.__temSiteID:
            '''
            更新DNS
            '''
            self.__temSiteID = self.__curSiteID[0]
            self.__homeurl = self.__homeUrls[self.__temSiteID]
            netloc = self.urlparser.transNetloc(self.__homeUrls[self.__temSiteID])
            print netloc
            self.__conn = httplib.HTTPConnection(netloc, 80, timeout = 10)
        return self.__conn
            
    @dec
    def requestSource(self, path):
        conn = self.conn()
        conn.request("GET", path)
        r1 = conn.getresponse()
        data = r1.read()
        #需要对data的返回转台进行解析
        return data

    @dec
    def getPage(self, path):
        return self.requestSource(path)

    @dec
    def run(self):
        '''
        运行主程序 
        '''
        self.conn()
        while(True):
            #外界控制是否继续运行
            if not self.continueRun[0]:
                return
            newPathInfo = self.__urlQueue.get(timeout=5)
            pageStdUrl = self.urlparser.transToStdUrl(self.__homeurl, newPathInfo[1])
            print 'start download: ',pageStdUrl
            #[title, path]
            source = self.getPage(newPathInfo[1])
            #判断是否为html源码
            if not self.htmlparser.init(source):
                '''
                图片或其他文件
                '''
                continue
            #解析和存储
            self.addNewInQueue(pageStdUrl)
            self.htmldb.saveHtml(newPathInfo[0], pageStdUrl, source)

    @dec
    def addNewInQueue(self, pageStdUrl):
        '''
        直接从html source中提取出path列表
        直接添加到各自的inqueue
        '''
        urlist = self.htmlparser.getLinks()
        for urlInfor in urlist:
            #[title, path]
            stdUrl = self.urlparser.transToStdUrl(pageStdUrl, urlInfor[1])
            siteId = self.urlparser.judgeUrl(pageStdUrl, urlInfor[1])
            path = self.urlparser.transPathByStd(stdUrl)
            #判断是否为本平台url
            if siteId != -1:
                if not self.__urlist.find(path):
                    self.__urlInQueue.put(siteId, urlInfor[1], path)