def fetchFrame(self, url, path, usecookie):
    """
    Fetch one frame document, its resources and its nested frames.

    The frame at ``url`` is downloaded; every resource URL found by
    ``parserlib.parseSrcs`` / ``parserlib.parseStyleImgs`` is downloaded
    and handed to ``self.saveResource``; the frame HTML (passed through
    ``parserlib.filtUrl`` and prefixed with a "saved from" comment) is
    handed to ``self.saveText``; a ``ProcessEvent(content=-1)`` is pushed;
    finally every URL found by ``parserlib.parseFrames`` is fetched
    recursively.

    pars:
        url(string)         URL of the frame document
        path(string)        save path prefix used for the frame's files
        usecookie(boolean)  not used here, only forwarded to nested calls
    returns:
        None. The method returns early as soon as ``self.alive`` is false.
    raises:
        NoConnectionError when the frame itself cannot be opened
        (any URLError, which includes HTTPError).
    """
    global processLock, resourceUrlPool, processEventBus

    try:
        response = urllib2.urlopen(url, timeout=self.timeout)
    except URLError:
        raise NoConnectionError
    try:
        html = response.read()
    finally:
        # Always release the connection, even if the read fails.
        response.close()

    framename = parserlib.getFrameName(url)
    resourceUrls = parserlib.parseSrcs(html) | parserlib.parseStyleImgs(html)

    if processLock:
        processLock.acquire()
        try:
            resourceUrlPool |= resourceUrls
        finally:
            # Never leave the shared lock held if the update raises.
            processLock.release()

    frameUrls = parserlib.parseFrames(html)

    # NOTE(review): this directory is created under self.path, while the
    # resources below are written under `path`; the two differ for nested
    # frames (recursive calls pass path + framename). Left unchanged --
    # confirm the intended on-disk layout.
    framedir = (self.path + framename + '/').decode('utf-8')
    if not os.path.exists(framedir):
        os.makedirs(framedir)

    for resourceurl in resourceUrls:
        if not self.alive:
            return
        resourceurl = parserlib.getAbsUrl(resourceurl, url)
        try:
            response = urllib2.urlopen(resourceurl)
        except HTTPError:
            # The server answered with an error status: skip silently.
            continue
        except URLError:
            # The resource could not be reached: report it and move on.
            print(resourceurl)
            continue
        try:
            target = path + framename + '/' + parserlib.getFileName(resourceurl)
            content = response.read()
        finally:
            response.close()
        if resourceurl[-3:] == "css":
            # Stylesheets are passed through filtUrl before being saved.
            content = parserlib.filtUrl(content, resourceurl)
        self.saveResource(target, content)

    self.saveText(("<!-- saved from %s-->\n" % url) + parserlib.filtUrl(html, url),
                  path + parserlib.getFrameName(url))

    if processLock:
        processLock.acquire()
        try:
            processEventBus.pushEvent(events.ProcessEvent(content=-1))
        finally:
            processLock.release()

    for frameurl in frameUrls:
        if not self.alive:
            return
        # Only fetch nested frames whose file does not exist yet.
        if not os.path.exists(self.path + self.title + '/' + parserlib.getFileName(frameurl)):
            try:
                # NOTE(review): path + framename has no '/' separator
                # (the original author marked this line "xxx").
                self.fetchFrame(frameurl, path + framename, usecookie)
            except HTTPError:
                continue
def fetchPage(self, url, usecookie=False):
    """
    Fetch one page, its resources and its frames, and save them to disk.

    The page at ``url`` is downloaded and its title becomes the file name
    (``<title>.html``) and the resource directory name (``<title>/``)
    under ``self.path``. If a saved page with that name already exists
    and its first line records the same source URL, nothing is done;
    otherwise ``(1)``, ``(2)``, ... is appended to the title until the
    name is free.

    pars:
        url(string)         URL of the page
        usecookie(boolean)  install ``self.opener`` as the global urllib2
                            opener when true, reset to the default otherwise
    returns:
        None. The method returns early when the page was already saved or
        as soon as ``self.alive`` is false.
    raises:
        NoConnectionError when the page itself cannot be opened
        (any URLError, which includes HTTPError).
    """
    global processLock, resourceUrlPool, processEventBus

    urllib2.install_opener(self.opener if usecookie else None)

    try:
        response = urllib2.urlopen(url, timeout=self.timeout)
    except URLError:
        print(url)
        raise NoConnectionError
    try:
        html = response.read()
    finally:
        # Always release the connection, even if the read fails.
        response.close()

    try:
        title = parserlib.getTitle(html)
    except NoTitleError:
        title = "Untitled-%d" % self.notitleid
        self.notitleid += 1
    self.title = title

    resourceUrls = parserlib.parseSrcs(html) | parserlib.parseStyleImgs(html)

    if processLock:
        processLock.acquire()
        try:
            resourceUrlPool |= resourceUrls
        finally:
            # Never leave the shared lock held if the update raises.
            processLock.release()

    frameUrls = parserlib.parseFrames(html)

    # Encode only once: calling .encode() on an already encoded non-ASCII
    # byte string raises UnicodeDecodeError on Python 2, which broke the
    # second call on the same instance.
    if not isinstance(self.path, str):
        self.path = self.path.encode('u8')

    sameNameNum = 0
    while os.path.exists((self.path + title + ".html").decode('u8')):
        # Read the original URL back from the first line of the saved page.
        with open((self.path + title + ".html").decode('u8'), 'r') as saved:
            text = saved.readline()
        text = text[text.find('from ') + 5:-4]
        if text in url or url in text:
            # NOTE(review): this early return pushes no ProcessEvent --
            # confirm listeners do not wait for one.
            return
        sameNameNum += 1
        title = '%s(%d)' % (self.title, sameNameNum)

    # Resources live in the directory named after the final (possibly
    # de-duplicated) title: it is the directory created here and the title
    # given to filtUrl below. They were previously written under
    # self.title, i.e. into the directory of the earlier same-titled page.
    pagedir = self.path + title + '/'
    if not os.path.exists(pagedir.decode('u8')):
        os.makedirs(pagedir.decode('u8'))

    for resourceurl in resourceUrls:
        if not self.alive:
            return
        resourceurl = parserlib.getAbsUrl(resourceurl, url)
        try:
            response = urllib2.urlopen(resourceurl)
        except HTTPError:
            # The server answered with an error status: skip silently.
            continue
        except URLError:
            # The resource could not be reached: report it and move on.
            print(resourceurl)
            continue
        try:
            target = pagedir + parserlib.getFileName(resourceurl)
            content = response.read()
        finally:
            response.close()
        if resourceurl[-3:] == "css":
            # Stylesheets are passed through filtUrl before being saved.
            content = parserlib.filtUrl(content, resourceurl)
        self.saveResource(target, content)

    self.saveText(("<!-- saved from %s-->\n" % url) + parserlib.filtUrl(html, url, title),
                  self.path + title + ".html")

    for frameurl in frameUrls:
        if not self.alive:
            return
        # Only fetch frames whose file is not already in the page directory.
        if not os.path.exists(pagedir + parserlib.getFileName(frameurl)):
            try:
                self.fetchFrame(frameurl, self.path, usecookie)
            except HTTPError:
                continue

    if processLock:
        processLock.acquire()
        try:
            processEventBus.pushEvent(events.ProcessEvent(content=-1))
        finally:
            processLock.release()
def fetchFrame(self, url, path, usecookie):
    """
    Fetch one frame document, its resources and its nested frames.

    The frame at ``url`` is downloaded; every resource URL found by
    ``parserlib.parseSrcs`` / ``parserlib.parseStyleImgs`` is downloaded
    and handed to ``self.saveResource``; the frame HTML (passed through
    ``parserlib.filtUrl`` and prefixed with a "saved from" comment) is
    handed to ``self.saveText``; a ``ProcessEvent(content=-1)`` is pushed;
    finally every URL found by ``parserlib.parseFrames`` is fetched
    recursively.

    pars:
        url(string)         URL of the frame document
        path(string)        save path prefix used for the frame's files
        usecookie(boolean)  not used here, only forwarded to nested calls
    returns:
        None. The method returns early as soon as ``self.alive`` is false.
    raises:
        NoConnectionError when the frame itself cannot be opened
        (any URLError, which includes HTTPError).
    """
    global processLock, resourceUrlPool, processEventBus

    try:
        response = urllib2.urlopen(url, timeout=self.timeout)
    except URLError:
        raise NoConnectionError
    try:
        html = response.read()
    finally:
        # Always release the connection, even if the read fails.
        response.close()

    framename = parserlib.getFrameName(url)
    resourceUrls = parserlib.parseSrcs(html) | parserlib.parseStyleImgs(html)

    if processLock:
        processLock.acquire()
        try:
            resourceUrlPool |= resourceUrls
        finally:
            # Never leave the shared lock held if the update raises.
            processLock.release()

    frameUrls = parserlib.parseFrames(html)

    # NOTE(review): this directory is created under self.path, while the
    # resources below are written under `path`; the two differ for nested
    # frames (recursive calls pass path + framename). Left unchanged --
    # confirm the intended on-disk layout.
    framedir = (self.path + framename + '/').decode('utf-8')
    if not os.path.exists(framedir):
        os.makedirs(framedir)

    for resourceurl in resourceUrls:
        if not self.alive:
            return
        resourceurl = parserlib.getAbsUrl(resourceurl, url)
        try:
            response = urllib2.urlopen(resourceurl)
        except HTTPError:
            # The server answered with an error status: skip silently.
            continue
        except URLError:
            # The resource could not be reached: report it and move on.
            print(resourceurl)
            continue
        try:
            target = path + framename + '/' + parserlib.getFileName(resourceurl)
            content = response.read()
        finally:
            response.close()
        if resourceurl[-3:] == "css":
            # Stylesheets are passed through filtUrl before being saved.
            content = parserlib.filtUrl(content, resourceurl)
        self.saveResource(target, content)

    self.saveText(("<!-- saved from %s-->\n" % url) + parserlib.filtUrl(html, url),
                  path + parserlib.getFrameName(url))

    if processLock:
        processLock.acquire()
        try:
            processEventBus.pushEvent(events.ProcessEvent(content=-1))
        finally:
            processLock.release()

    for frameurl in frameUrls:
        if not self.alive:
            return
        # Only fetch nested frames whose file does not exist yet.
        if not os.path.exists(self.path + self.title + '/' + parserlib.getFileName(frameurl)):
            try:
                # NOTE(review): path + framename has no '/' separator
                # (the original author marked this line "xxx").
                self.fetchFrame(frameurl, path + framename, usecookie)
            except HTTPError:
                continue
def fetchPage(self, url, usecookie=False):
    """
    Fetch one page, its resources and its frames, and save them to disk.

    The page at ``url`` is downloaded and its title becomes the file name
    (``<title>.html``) and the resource directory name (``<title>/``)
    under ``self.path``. If a saved page with that name already exists
    and its first line records the same source URL, nothing is done;
    otherwise ``(1)``, ``(2)``, ... is appended to the title until the
    name is free.

    pars:
        url(string)         URL of the page
        usecookie(boolean)  install ``self.opener`` as the global urllib2
                            opener when true, reset to the default otherwise
    returns:
        None. The method returns early when the page was already saved or
        as soon as ``self.alive`` is false.
    raises:
        NoConnectionError when the page itself cannot be opened
        (any URLError, which includes HTTPError).
    """
    global processLock, resourceUrlPool, processEventBus

    urllib2.install_opener(self.opener if usecookie else None)

    try:
        response = urllib2.urlopen(url, timeout=self.timeout)
    except URLError:
        print(url)
        raise NoConnectionError
    try:
        html = response.read()
    finally:
        # Always release the connection, even if the read fails.
        response.close()

    try:
        title = parserlib.getTitle(html)
    except NoTitleError:
        title = "Untitled-%d" % self.notitleid
        self.notitleid += 1
    self.title = title

    resourceUrls = parserlib.parseSrcs(html) | parserlib.parseStyleImgs(html)

    if processLock:
        processLock.acquire()
        try:
            resourceUrlPool |= resourceUrls
        finally:
            # Never leave the shared lock held if the update raises.
            processLock.release()

    frameUrls = parserlib.parseFrames(html)

    # Encode only once: calling .encode() on an already encoded non-ASCII
    # byte string raises UnicodeDecodeError on Python 2, which broke the
    # second call on the same instance.
    if not isinstance(self.path, str):
        self.path = self.path.encode('u8')

    sameNameNum = 0
    while os.path.exists((self.path + title + ".html").decode('u8')):
        # Read the original URL back from the first line of the saved page.
        with open((self.path + title + ".html").decode('u8'), 'r') as saved:
            text = saved.readline()
        text = text[text.find('from ') + 5:-4]
        if text in url or url in text:
            # NOTE(review): this early return pushes no ProcessEvent --
            # confirm listeners do not wait for one.
            return
        sameNameNum += 1
        title = '%s(%d)' % (self.title, sameNameNum)

    # Resources live in the directory named after the final (possibly
    # de-duplicated) title: it is the directory created here and the title
    # given to filtUrl below. They were previously written under
    # self.title, i.e. into the directory of the earlier same-titled page.
    pagedir = self.path + title + '/'
    if not os.path.exists(pagedir.decode('u8')):
        os.makedirs(pagedir.decode('u8'))

    for resourceurl in resourceUrls:
        if not self.alive:
            return
        resourceurl = parserlib.getAbsUrl(resourceurl, url)
        try:
            response = urllib2.urlopen(resourceurl)
        except HTTPError:
            # The server answered with an error status: skip silently.
            continue
        except URLError:
            # The resource could not be reached: report it and move on.
            print(resourceurl)
            continue
        try:
            target = pagedir + parserlib.getFileName(resourceurl)
            content = response.read()
        finally:
            response.close()
        if resourceurl[-3:] == "css":
            # Stylesheets are passed through filtUrl before being saved.
            content = parserlib.filtUrl(content, resourceurl)
        self.saveResource(target, content)

    self.saveText(("<!-- saved from %s-->\n" % url) + parserlib.filtUrl(html, url, title),
                  self.path + title + ".html")

    for frameurl in frameUrls:
        if not self.alive:
            return
        # Only fetch frames whose file is not already in the page directory.
        if not os.path.exists(pagedir + parserlib.getFileName(frameurl)):
            try:
                self.fetchFrame(frameurl, self.path, usecookie)
            except HTTPError:
                continue

    if processLock:
        processLock.acquire()
        try:
            processEventBus.pushEvent(events.ProcessEvent(content=-1))
        finally:
            processLock.release()