Beispiel #1
0
 def fetchFrame(self,url,path,usecookie):
     """
     Fetch a single frame document together with its resources.

     Downloads ``url``, saves every resource it references under
     ``path + framename + '/'``, writes the URL-filtered HTML to
     ``path + framename`` and then recurses into the nested frames.

     pars:
         url(string) address of the frame document
         path(string) save path prefix
         usecookie(boolean) only passed on to nested fetchFrame calls
     returns:
         None (also when the fetch stops early because self.alive is false)
     raises:
         HTTPError if the frame request is answered with an HTTP error
         NoConnectionError for any other URLError on the frame request
     """
     global processLock,resourceUrlPool,processEventBus

     try:
         response=urllib2.urlopen(url,timeout=self.timeout)
     except HTTPError:
         # HTTPError is a subclass of URLError.  Re-raise it unchanged so
         # the ``except HTTPError`` handlers wrapped around fetchFrame
         # calls can skip just this frame instead of aborting the page.
         raise
     except URLError:
         raise NoConnectionError
     try:
         html=response.read()
     finally:
         response.close()
     framename=parserlib.getFrameName(url)
     resourceUrls=parserlib.parseSrcs(html)|parserlib.parseStyleImgs(html)
     if processLock:
         processLock.acquire()
         try:
             resourceUrlPool|=resourceUrls
         finally:
             # Release even if the update raises, so other workers do not
             # block forever on the lock.
             processLock.release()
     frameUrls=parserlib.parseFrames(html)
     framedir=(self.path+framename+'/').decode('utf-8')
     if not os.path.exists(framedir):
         os.makedirs(framedir)
     resourcedir=path+framename+'/'
     for resourceurl in resourceUrls:
         if not self.alive:
             return
         resourceurl=parserlib.getAbsUrl(resourceurl, url)
         try:
             response=urllib2.urlopen(resourceurl)
         except HTTPError:
             continue
         except URLError:
             print(resourceurl)
             continue
         try:
             content=response.read()
         finally:
             response.close()
         if resourceurl[-3:]=="css":
             # Stylesheets get their embedded URLs filtered before saving.
             content=parserlib.filtUrl(content,resourceurl)
         self.saveResource(resourcedir+parserlib.getFileName(resourceurl),content)
     self.saveText(("<!-- saved from %s-->\n"%url)+parserlib.filtUrl(html,url),path+framename)
     if processLock:
         processLock.acquire()
         try:
             processEventBus.pushEvent(events.ProcessEvent(content=-1))
         finally:
             processLock.release()
     for frameurl in frameUrls:
         if not self.alive:
             return
         if not os.path.exists(self.path+self.title+'/'+parserlib.getFileName(frameurl)):
             try:
                 # NOTE(review): path+framename carries no trailing
                 # separator here -- confirm this is the intended prefix.
                 self.fetchFrame(frameurl,path+framename,usecookie)
             except HTTPError:
                 continue
Beispiel #2
0
    def fetchPage(self, url, usecookie=False):
        """
        Fetch a single page together with its resources and frames.

        The page is saved as ``self.path + title + ".html"`` with a
        ``<!-- saved from URL-->`` first line, and its resources go into
        the ``self.path + title + '/'`` directory.  When a saved file with
        the same title already exists, its first line is compared with
        ``url``: on a match nothing is fetched again, otherwise a
        ``(n)`` suffix is appended to the title.

        pars:
            url(string) address of the page
            usecookie(boolean) install self.opener as the global urllib2
                opener when true, otherwise reset the global opener
        returns:
            None (also when the fetch stops early because the page is
            already saved or self.alive is false)
        raises:
            NoConnectionError if the page itself cannot be opened
        """
        global processLock, resourceUrlPool, processEventBus

        if usecookie:
            urllib2.install_opener(self.opener)
        else:
            urllib2.install_opener(None)

        try:
            response = urllib2.urlopen(url, timeout=self.timeout)
        except URLError:
            print(url)
            raise NoConnectionError
        try:
            html = response.read()
        finally:
            response.close()
        try:
            title = parserlib.getTitle(html)
        except NoTitleError:
            title = "Untitled-%d" % self.notitleid
            self.notitleid += 1
        self.title = title
        resourceUrls = (parserlib.parseSrcs(html) |
                        parserlib.parseStyleImgs(html))
        if processLock:
            processLock.acquire()
            try:
                resourceUrlPool |= resourceUrls
            finally:
                # Release even if the update raises, so other workers do
                # not block forever on the lock.
                processLock.release()
        frameUrls = parserlib.parseFrames(html)
        # Encode only while the path is still a unicode object: calling
        # .encode() on an already encoded byte string makes Python 2 decode
        # it as ASCII first, which fails for non-ASCII paths on the second
        # and every later call.
        if not isinstance(self.path, str):
            self.path = self.path.encode('u8')
        sameNameNum = 0
        while os.path.exists((self.path + title + ".html").decode('u8')):
            # The first line of a saved page records its original url.
            with open((self.path + title + ".html").decode('u8'), 'r') as f:
                text = f.readline()
            text = text[text.find('from ') + 5:-4]
            if text in url or url in text:
                return
            sameNameNum += 1
            title = '%s(%d)' % (self.title, sameNameNum)
        # Resources belong in the directory of the final (possibly
        # de-duplicated) title, i.e. the one created right here, not in the
        # directory of another page that happens to share self.title.
        pagedir = self.path + title + '/'
        if not os.path.exists(pagedir.decode('u8')):
            os.makedirs(pagedir.decode('utf-8'))
        for resourceurl in resourceUrls:
            if not self.alive:
                return
            resourceurl = parserlib.getAbsUrl(resourceurl, url)
            try:
                response = urllib2.urlopen(resourceurl)
            except HTTPError:
                continue
            except URLError:
                print(resourceurl)
                continue
            try:
                content = response.read()
            finally:
                response.close()
            if resourceurl[-3:] == "css":
                # Stylesheets get their embedded URLs filtered before saving.
                content = parserlib.filtUrl(content, resourceurl)
            self.saveResource(pagedir + parserlib.getFileName(resourceurl),
                              content)
        self.saveText(("<!-- saved from %s-->\n" % url) +
                      parserlib.filtUrl(html, url, title),
                      self.path + title + ".html")
        for frameurl in frameUrls:
            if not self.alive:
                return
            if not os.path.exists(self.path + self.title + '/' +
                                  parserlib.getFileName(frameurl)):
                try:
                    self.fetchFrame(frameurl, self.path, usecookie)
                except HTTPError:
                    continue
        if processLock:
            processLock.acquire()
            try:
                processEventBus.pushEvent(events.ProcessEvent(content=-1))
            finally:
                processLock.release()
Beispiel #3
0
    def fetchFrame(self, url, path, usecookie):
        """
        Fetch a single frame document together with its resources.

        Downloads ``url``, saves every resource it references under
        ``path + framename + '/'``, writes the URL-filtered HTML to
        ``path + framename`` and then recurses into the nested frames.

        pars:
            url(string) address of the frame document
            path(string) save path prefix
            usecookie(boolean) only passed on to nested fetchFrame calls
        returns:
            None (also when the fetch stops early because self.alive is
            false)
        raises:
            HTTPError if the frame request is answered with an HTTP error
            NoConnectionError for any other URLError on the frame request
        """
        global processLock, resourceUrlPool, processEventBus

        try:
            response = urllib2.urlopen(url, timeout=self.timeout)
        except HTTPError:
            # HTTPError is a subclass of URLError.  Re-raise it unchanged so
            # the ``except HTTPError`` handlers wrapped around fetchFrame
            # calls can skip just this frame instead of aborting the page.
            raise
        except URLError:
            raise NoConnectionError
        try:
            html = response.read()
        finally:
            response.close()
        framename = parserlib.getFrameName(url)
        resourceUrls = (parserlib.parseSrcs(html) |
                        parserlib.parseStyleImgs(html))
        if processLock:
            processLock.acquire()
            try:
                resourceUrlPool |= resourceUrls
            finally:
                # Release even if the update raises, so other workers do
                # not block forever on the lock.
                processLock.release()
        frameUrls = parserlib.parseFrames(html)
        framedir = (self.path + framename + '/').decode('utf-8')
        if not os.path.exists(framedir):
            os.makedirs(framedir)
        resourcedir = path + framename + '/'
        for resourceurl in resourceUrls:
            if not self.alive:
                return
            resourceurl = parserlib.getAbsUrl(resourceurl, url)
            try:
                response = urllib2.urlopen(resourceurl)
            except HTTPError:
                continue
            except URLError:
                print(resourceurl)
                continue
            try:
                content = response.read()
            finally:
                response.close()
            if resourceurl[-3:] == "css":
                # Stylesheets get their embedded URLs filtered before saving.
                content = parserlib.filtUrl(content, resourceurl)
            self.saveResource(
                resourcedir + parserlib.getFileName(resourceurl), content)
        self.saveText(
            ("<!-- saved from %s-->\n" % url) + parserlib.filtUrl(html, url),
            path + framename)
        if processLock:
            processLock.acquire()
            try:
                processEventBus.pushEvent(events.ProcessEvent(content=-1))
            finally:
                processLock.release()
        for frameurl in frameUrls:
            if not self.alive:
                return
            if not os.path.exists(self.path + self.title + '/' +
                                  parserlib.getFileName(frameurl)):
                try:
                    # NOTE(review): path + framename carries no trailing
                    # separator here -- confirm this is the intended prefix.
                    self.fetchFrame(frameurl, path + framename, usecookie)
                except HTTPError:
                    continue
Beispiel #4
0
    def fetchPage(self,url,usecookie=False):
        """
        Fetch a single page together with its resources and frames.

        The page is saved as ``self.path + title + ".html"`` with a
        ``<!-- saved from URL-->`` first line, and its resources go into
        the ``self.path + title + '/'`` directory.  When a saved file with
        the same title already exists, its first line is compared with
        ``url``: on a match nothing is fetched again, otherwise a
        ``(n)`` suffix is appended to the title.

        pars:
            url(string) address of the page
            usecookie(boolean) install self.opener as the global urllib2
                opener when true, otherwise reset the global opener
        returns:
            None (also when the fetch stops early because the page is
            already saved or self.alive is false)
        raises:
            NoConnectionError if the page itself cannot be opened
        """
        global processLock,resourceUrlPool,processEventBus

        if usecookie:
            urllib2.install_opener(self.opener)
        else:
            urllib2.install_opener(None)

        try:
            response=urllib2.urlopen(url,timeout=self.timeout)
        except URLError:
            print(url)
            raise NoConnectionError
        try:
            html=response.read()
        finally:
            response.close()
        try:
            title=parserlib.getTitle(html)
        except NoTitleError:
            title=("Untitled-%d"%self.notitleid)
            self.notitleid+=1
        self.title=title
        resourceUrls=parserlib.parseSrcs(html)|parserlib.parseStyleImgs(html)
        if processLock:
            processLock.acquire()
            try:
                resourceUrlPool|=resourceUrls
            finally:
                # Release even if the update raises, so other workers do
                # not block forever on the lock.
                processLock.release()
        frameUrls=parserlib.parseFrames(html)
        # Encode only while the path is still a unicode object: calling
        # .encode() on an already encoded byte string makes Python 2 decode
        # it as ASCII first, which fails for non-ASCII paths on the second
        # and every later call.
        if not isinstance(self.path,str):
            self.path=self.path.encode('u8')
        sameNameNum=0
        while os.path.exists((self.path+title+".html").decode('u8')):
            # The first line of a saved page records its original url.
            with open((self.path+title+".html").decode('u8'),'r') as f:
                text=f.readline()
            text=text[text.find('from ')+5:-4]
            if text in url or url in text:
                return
            sameNameNum+=1
            title='%s(%d)'%(self.title,sameNameNum)
        # Resources belong in the directory of the final (possibly
        # de-duplicated) title, i.e. the one created right here, not in the
        # directory of another page that happens to share self.title.
        pagedir=self.path+title+'/'
        if not os.path.exists(pagedir.decode('u8')):
            os.makedirs(pagedir.decode('utf-8'))
        for resourceurl in resourceUrls:
            if not self.alive:
                return
            resourceurl=parserlib.getAbsUrl(resourceurl, url)
            try:
                response=urllib2.urlopen(resourceurl)
            except HTTPError:
                continue
            except URLError:
                print(resourceurl)
                continue
            try:
                content=response.read()
            finally:
                response.close()
            if resourceurl[-3:]=="css":
                # Stylesheets get their embedded URLs filtered before saving.
                content=parserlib.filtUrl(content,resourceurl)
            self.saveResource(pagedir+parserlib.getFileName(resourceurl),content)
        self.saveText(("<!-- saved from %s-->\n"%url)+parserlib.filtUrl(html,url,title),self.path+title+".html")
        for frameurl in frameUrls:
            if not self.alive:
                return
            if not os.path.exists(self.path+self.title+'/'+parserlib.getFileName(frameurl)):
                try:
                    self.fetchFrame(frameurl,self.path,usecookie)
                except HTTPError:
                    continue
        if processLock:
            processLock.acquire()
            try:
                processEventBus.pushEvent(events.ProcessEvent(content=-1))
            finally:
                processLock.release()