Example #1
0
class _Reptile:
    '''
    Minimal single-threaded crawler: keeps a queue of matching URLs and
    downloads pages until ``pageNum`` pages have been fetched.

    Collaborators ``Urlist``, ``Q`` and ``urllib2``/``StringIO``/``gzip``
    are expected to be provided by the enclosing module.
    '''

    def __init__(self, pageNum=200):
        self._urlist = Urlist()      # URLs already seen, to avoid re-queueing
        self._queue = Q.Queue()      # URLs waiting to be downloaded
        self.pageNum = pageNum       # maximum number of pages to download
        self.downloadedPageNum = 0   # pages downloaded so far

    def matchUrl(self, url):
        '''
        Decide whether *url* (an absolute URL) should be crawled.

        Base implementation is a stub: it returns None (falsy), so nothing
        is ever queued until a subclass overrides this.
        '''
        pass

    def inQueue(self, url):
        '''Queue *url* if the page limit is not reached, it matches, and it is unseen.'''
        if not self.outPageRange():
            if self.matchUrl(url) and not self._urlist.find(url):
                self._queue.put(url)

    def outPageRange(self):
        '''
        Return True when more than ``pageNum`` pages have already been
        downloaded, i.e. the crawler should stop queueing new URLs.
        '''
        return self.pageNum < self.downloadedPageNum

    def requestSource(self, url):
        '''
        Download *url* and return its (gzip-decompressed) body.

        Returns the page source on success, False when the body length falls
        outside the accepted 300..3,000,000 byte window, and None on a
        non-200 status or any network error (best-effort: errors are only
        reported with a message, matching the original behaviour).
        '''
        self.downloadedPageNum += 1
        self.opener = urllib2.build_opener()
        request = urllib2.Request(url)
        request.add_header('Accept-encoding', 'gzip')
        request.add_header('User-Agent', 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6')
        try:
            page = self.opener.open(request, timeout=2)  # 2-second timeout
            try:
                if page.code != 200:
                    return None
                predata = page.read()
                # The server may ignore Accept-encoding; fall back to the
                # raw bytes when the payload is not actually gzipped.
                try:
                    data = gzip.GzipFile(fileobj=StringIO.StringIO(predata)).read()
                except IOError:
                    data = predata

                length = len(data)
                # Reject pages that are suspiciously small or too large.
                if length < 300 or length > 3000000:
                    return False
                # Begin to parse the page.
                return data
            finally:
                # Fix: the original placed page.close() after the returns,
                # leaking the connection whenever data was returned.
                page.close()
        except Exception:
            # Narrowed from a bare except so SystemExit/KeyboardInterrupt
            # are no longer swallowed; message kept byte-identical.
            print('time out')
            return None
Example #2
0
 def __init__(self, pageNum=200):
     '''Initialise the crawler with a download limit of *pageNum* pages.'''
     # Work queue and seen-URL list come from the enclosing module.
     self._queue = Q.Queue()
     self._urlist = Urlist()
     # Download bookkeeping: limit and running count.
     self.downloadedPageNum = 0
     self.pageNum = pageNum