import gzip
import StringIO
import urllib2
import Queue as Q


class _Reptile:
    ''' single tutorial crawler '''

    def __init__(self, pageNum=200):
        self._urlist = Urlist()          # Urlist (seen-url record) is defined elsewhere in the tutorial
        self._queue = Q.Queue()          # FIFO queue of urls waiting to be downloaded
        self.pageNum = pageNum           # maximum number of pages to download
        self.downloadedPageNum = 0

    def matchUrl(self, url):
        '''
        @in: absolute url
        return True/False
        '''
        pass

    def inQueue(self, url):
        # enqueue a url only while within the page limit, and only if it
        # matches the target pattern and has not been seen before
        if not self.outPageRange():
            if self.matchUrl(url) and not self._urlist.find(url):
                self._queue.put(url)

    def outPageRange(self):
        '''
        has the number of downloaded pages exceeded the limit?
        return True/False
        '''
        return self.pageNum < self.downloadedPageNum

    def requestSource(self, url):
        self.downloadedPageNum += 1
        self.opener = urllib2.build_opener()
        request = urllib2.Request(url)
        request.add_header('Accept-encoding', 'gzip')
        request.add_header('User-Agent',
                           'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; '
                           'rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6')
        try:
            page = self.opener.open(request, timeout=2)  # set the timeout to 2s
            if page.code == 200:
                predata = page.read()
                page.close()
                pdata = StringIO.StringIO(predata)
                gzipper = gzip.GzipFile(fileobj=pdata)
                try:
                    data = gzipper.read()
                except IOError:
                    # the response was not actually gzip-compressed
                    data = predata
                length = len(data)
                if length < 300 or length > 3000000:
                    return False
                # begin to parse the page
                return data
        except:
            print 'time out'
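To see how the class fits together, here is a minimal driving sketch. It is an assumption, not part of the original snippet: the tiny Urlist stub and the example.com seed URL are placeholders, and since matchUrl is still an empty stub above, the seed is pushed onto the queue directly instead of going through inQueue.

# hypothetical Urlist stub: a simple record of urls already seen
class Urlist:
    def __init__(self):
        self._seen = set()

    def find(self, url):
        return url in self._seen

    def add(self, url):
        self._seen.add(url)


if __name__ == '__main__':
    reptile = _Reptile(pageNum=10)
    # matchUrl is not implemented yet, so seed the queue directly
    reptile._queue.put('http://example.com/')   # placeholder seed URL
    while not reptile._queue.empty() and not reptile.outPageRange():
        url = reptile._queue.get()
        source = reptile.requestSource(url)     # returns page data, False, or None
        if source:
            print 'fetched %d bytes from %s' % (len(source), url)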