Example #1

import threading
import re
import datetime
import httplib
import urlparse
import chardet
#project-local helpers (HtmlParser, PicParser, Judger, DBSource, Collector,
#UrlQueue, dec, console) are assumed to be importable from the surrounding package
class Reptile(threading.Thread):
    '''
    A single crawler thread.
    '''
    def __init__(self, name, url_queue, url_list, url_in_queue, Flock, home_urls, tem_siteID=[0], continue_run=[True]):
        '''
        name            thread name
        url_queue       urls assigned by the master server
        url_list        local duplicate check
        url_in_queue    newly parsed urls; one UrlQueue is allocated per site
        Flock           shared lock
        home_urls       used to test whether a url belongs to the crawl set
        tem_siteID      passed by reference; a change signals that the DNS cache must be refreshed
        continue_run[]  flag telling the thread whether to keep running

        The mutable defaults ([0], [True]) are deliberate: callers share these
        lists with the thread and mutate them to signal state changes.
        '''
        threading.Thread.__init__(self, name=name)
        #local url dedup: a url already seen locally is discarded outright;
        #otherwise it joins a temporary queue and is later sent to the central
        #server for the global check. One list object per site keeps the
        #bookkeeping separate.
        self.__url_list = url_list
        self.__url_queue = url_queue
        #one in-queue per site by default: a local staging queue (Queue()).
        #Urls that pass the url_list check are put here and shipped to the
        #central server once enough have accumulated.
        self.__url_in_queue = url_in_queue
        #----------------------------------------------------------------
        self.__Flock = Flock
        self.__home_urls = home_urls
        #passed by reference so it can be compared against self.siteID in
        #conn() to decide when to force a DNS refresh
        self.__tem_siteID = tem_siteID
        #----------------------------------------------------------------
        self.__htmlparser = HtmlParser()
        self.__picparser = PicParser()
        self.__judger = Judger(self.__home_urls)
        #temporary home_url and siteID both determine whether to refresh the DNS cache
        self.__dbsource = DBSource()
        self.__collector = Collector(home_urls)
        #continue run
        self.__continue_run = continue_run
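    # The continue_run=[True] / tem_siteID=[0] parameters rely on Python's
    # shared mutable lists to act as by-reference flags. A minimal standalone
    # sketch of the stop-flag pattern (illustrative, not part of this class):
    #
    #   import threading, time
    #
    #   def worker(continue_run):
    #       while continue_run[0]:
    #           time.sleep(0.1)
    #
    #   flag = [True]
    #   t = threading.Thread(target=worker, args=(flag,))
    #   t.start()
    #   flag[0] = False   # controller flips the shared flag; worker exits
    #   t.join()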
    #------------------------------------------------------
    @dec
    def init(self, siteID):
        console('self.init()')
        #leave self.siteID at -1 so the first conn() call detects a change and reconnects
        self.siteID = -1
        self.__tem_siteID[0] = siteID
        self.__dbsource.init(siteID)
        self.__url_queue.init(siteID)
        netloc = self.transNetloc(self.__home_urls[siteID])
        print 'get netloc',netloc
        self.__conn = httplib.HTTPConnection(netloc, 80, timeout = 10)
    @dec
    def conn(self):
        '''
        Returns the current connection, refreshing DNS when needed.
        siteID is passed in by reference, so a mismatch means the target
        site (and hence the DNS entry) has changed.
        '''
        if self.siteID != self.__tem_siteID[0]:
            #site changed: refresh DNS by opening a fresh connection
            self.siteID = self.__tem_siteID[0]
            netloc = self.transNetloc(self.__home_urls[self.__tem_siteID[0]])
            print 'netloc', netloc
            self.__conn = httplib.HTTPConnection(netloc, 80, timeout=10)
        return self.__conn
    
    def transcode(self, source):
        '''
        Detect the source encoding with chardet and strip numeric character
        entities. Returns the text unchanged if it is already utf-8, a
        decoded unicode string otherwise, or False when detection is
        unreliable.
        '''
        res = chardet.detect(source)
        confidence = res['confidence']
        encoding = res['encoding']
        #drop numeric character references such as &#12345;
        p = re.compile(r"&#(\S+);")
        source = p.sub("", source)
        print 'transcode', res
        if encoding == 'utf-8':
            return source
        if confidence < 0.6:
            return False
        else:
            return unicode(source, encoding, 'ignore')
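    # A standalone sketch of the detect-then-decode flow used by transcode()
    # above (assumes the chardet package is installed; the detected encoding
    # can vary with input length):
    #
    #   import chardet
    #   raw = (u'\u4e2d\u6587' * 50).encode('gbk')
    #   res = chardet.detect(raw)          # e.g. {'encoding': 'GB2312', ...}
    #   if res['confidence'] >= 0.6:
    #       text = unicode(raw, res['encoding'], 'ignore')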
    @dec
    def transPath(self, page_url, path):
        '''
        Converts any link into a path.
        '''
        url = self.__judger.transToStdUrl(page_url, path)
        return urlparse.urlsplit(url).path
    @dec
    def transNetloc(self, url):
        '''
        Takes an absolute url and returns its netloc.
        '''
        return urlparse.urlsplit(url).netloc
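    # How transPath()/transNetloc() split a url with urlparse (standard
    # library in Python 2; example.com is a placeholder):
    #
    #   import urlparse
    #   parts = urlparse.urlsplit('http://example.com/news/index.html?p=1')
    #   parts.netloc   # 'example.com'
    #   parts.path     # '/news/index.html'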
    #-------------------------------------------------------------
    @dec       
    def run(self):
        '''
        Main worker loop.
        '''
        console('self.run()')
        self.conn()
        home_url = self.__home_urls[self.siteID]
        print 'home_url', home_url
        while True:
            #externally controlled flag: lets the controller interrupt or stop us
            if not self.__continue_run[0]:
                return
            #urlinfo is [title, path]
            urlinfo = self.getAUrl()
            print 'get urlinfo ', urlinfo
            if not urlinfo:
                print "No Task\nqueue is empty!"
                return
            #page info for this iteration
            page_path = urlinfo[1]
            page_url = self.__judger.transToStdUrl(home_url, page_path)
            print 'page_path', page_path
            source = self.getPage(home_url, page_path)
            #check whether the payload is html source
            if not self.__htmlparser.init(source):
                #images and other files are handled separately; no parsing here
                continue
            #urls are stored uniformly as absolute addresses
            #save html source
            print 'saveHtml' + '-' * 200
            self.saveHtml(page_url, urlinfo[0])
            imgsrcs = self.getImgSrcs()
            #save images
            self.saveImgList(page_url, imgsrcs)
            newurls = self.__htmlparser.getALinkText_List()
            self.addNewInQueue(page_url, newurls)
    @dec
    def requestSource(self, path):
        '''
        Issues a GET for the given path (e.g. '/index.html') on the current
        connection and returns the raw response body, or False on failure.
        '''
        conn = self.conn()
        conn.request("GET", path)
        r1 = conn.getresponse()
        print r1.status
        #r1.status is an int; 200 means OK (an earlier draft compared it to 'OK')
        if r1.status != 200:
            print 'status is ', r1.status, r1.reason
            return False
        data = r1.read()
        if not len(data):
            print 'length of data is 0'
            return False
        return data
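    # A minimal standalone httplib GET, mirroring what requestSource() does
    # on the shared connection (host is a placeholder):
    #
    #   import httplib
    #   conn = httplib.HTTPConnection('example.com', 80, timeout=10)
    #   conn.request('GET', '/')
    #   resp = conn.getresponse()
    #   print resp.status, resp.reason    # e.g. 200 OK
    #   body = resp.read()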
    @dec
    def getPage(self, page_url, url):
        '''
        Accepts any url, converts it to a path automatically, then fetches
        it through the low-level requestSource().
        '''
        console('self.getPage()')
        path = self.transPath(page_url, url)
        data = self.requestSource(path)
        print 'page_url: url', page_url, url
        if data:
            data = self.transcode(data)
            #transcode() returns False on undetectable encodings
            if not data:
                return False
            if not self.__collector.init(data):
                print 'collector.init failed'
                return False
            self.__htmlparser = self.__collector.htmlparser
        return data
    @dec
    def getImg(self, page_url, url):
        '''
        url is an image reference such as './img/1.jpg';
        returns [absolute url, source].
        '''
        abs_url = self.__judger.transToStdUrl(page_url, url)
        path = urlparse.urlsplit(abs_url).path
        return [abs_url, self.requestSource(path)]
    @dec    
    def getAUrl(self):
        return self.__url_queue.get(timeout = 3)
    @dec 
    def getUrls(self):
        '''
        Fetch the urls parsed from the page.
        '''
        return self.__htmlparser.getALink_list()
    @dec 
    def getImgSrcs(self):
        '''
        parse html source and return src_list
        '''
        return self.__htmlparser.getPicSrcs_List()
    @dec
    def addNewQueue(self, path_list):
        '''
        New paths handed down by the control server:
        path_list = [
            ['cau', 'path'],
        ]
        '''
        #refresh is driven by the controller
        for url in path_list:
            self.__url_queue.put(url)
    @dec
    def addNewInQueue(self, page_url, url_list):
        '''
        urls here are the raw urls from the page; no extra preprocessing
        is needed. Adds each new url to the in-queue of its site.
        '''
        for urlinfo in url_list:
            #normalize to an absolute url
            url = self.__judger.transToStdUrl(page_url, urlinfo[1])
            siteID = self.__judger.judgeUrl(page_url, url)
            path = urlparse.urlsplit(url).path
            #siteID == -1 means the url does not belong to this platform
            if siteID != -1:
                if not self.__url_list.find(siteID, path):
                    #not a duplicate in url_list: stage it for the central server
                    self.__url_in_queue.put(siteID, urlinfo[0], path)
        self.__url_in_queue.show()
    @dec
    def saveHtml(self, url, title):
        '''
        Store the raw source and the parsed source in the database.
        '''
        #url is already absolute at this point
        assert self.siteID != -1
        today = datetime.date.today()
        info = {
            'title' :   title,
            'url':      url,
            'date':     datetime.date.isoformat(today)
        }
        self.__dbsource.saveHtml(info, self.__collector.html, self.__collector.transXml_Str(url))

    def saveImg(self, url, source):
        #the source argument is superseded by the compressed picture held by
        #__picparser (initialized in saveImgList)
        imgsource = self.__picparser.getCompressedPic()
        size = imgsource['size']
        source = imgsource['source']
        info = {
            'url':url,
            'width':size[0],
            'height':size[1]
        }
        self.__dbsource.saveImg(info, source)
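    # PicParser is not shown in this listing. A hypothetical sketch of what a
    # getCompressedPic()-style helper could look like with PIL (names and
    # parameters here are illustrative assumptions, not the real class):
    #
    #   from PIL import Image
    #   import StringIO
    #
    #   def compress_pic(raw_bytes, max_width=300):   # hypothetical helper
    #       img = Image.open(StringIO.StringIO(raw_bytes))
    #       w, h = img.size
    #       if w > max_width:
    #           w, h = max_width, h * max_width // w
    #           img = img.resize((w, h))
    #       buf = StringIO.StringIO()
    #       img.convert('RGB').save(buf, 'JPEG', quality=70)
    #       return {'size': (w, h), 'source': buf.getvalue()}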

    def saveImgList(self, page_url, srcs):
        '''
        Takes a list of srcs and stores each image in turn.
        '''
        for src in srcs:
            imgsource = self.getImg(page_url, src)
            url = imgsource[0]
            source = imgsource[1]
            self.__picparser.init(source)
            self.saveImg(url, source)