Esempio n. 1
0
    def _process_html(self, path, url, task, site):
        """Crawl the HTML file at ``path`` and queue the relative links it uses.

        ``LinkCrawler.crawling`` is given ``_process_link`` as its callback.
        Every link the callback receives that passes
        ``self._accept_download`` is recorded in a mapping. Afterwards each
        recorded link that contains no ``:`` is resolved against ``url`` and
        registered on ``task`` as a ``"<link>==><local path>"`` action.

        Args:
            path: Path of the HTML file; handed to ``site.real_path`` and used
                as the first argument of ``utils.absolute_path``.
            url: URL used as the first argument of ``utils.absolute_url`` and
                passed on to the crawler.
            task: Object whose ``add_action`` receives the queued actions.
            site: Object whose ``real_path`` turns ``path`` into the argument
                given to the crawler.
        """
        url_mapping = {}

        def _process_link(l):
            # Links rejected by _accept_download are handed back untouched
            # and never recorded.
            if not self._accept_download(l):
                return l

            # Membership test instead of dict.has_key(), which was removed
            # in Python 3; `in` behaves the same on Python 2.
            if l not in url_mapping:
                if ":" in l:  # mailto:, javascript:, http:
                    url_mapping[l] = l
                else:
                    url_mapping[l] = utils.absolute_path(path, l)

            return url_mapping[l]

        crawler = LinkCrawler()
        crawler.crawling(site.real_path(path), url, _process_link)

        # items() exists on both Python 2 and 3 (iteritems() is Python 2
        # only). The loop target is called local_path so that it does not
        # rebind the `path` argument that _process_link closes over.
        for link, local_path in url_mapping.items():
            if ":" in link:
                # Links containing ':' were mapped to themselves above;
                # there is nothing to queue for them.
                continue
            absolute_link = utils.absolute_url(url, link)

            self.logger.info("add spider:%s==>%s" % (absolute_link, local_path))
            task.add_action("%s==>%s" % (absolute_link, local_path))
Esempio n. 2
0
        def _process_link(l):
            """Return the recorded replacement for link ``l``.

            ``self``, ``url_mapping``, ``path`` and ``utils`` are not defined
            in this block; they must be supplied by the surrounding scope.

            A link rejected by ``self._accept_download`` is returned as-is
            and is not recorded. Otherwise the link is recorded in
            ``url_mapping`` on first sight: a link containing ``:`` maps to
            itself, any other link maps to ``utils.absolute_path(path, l)``.
            """
            if not self._accept_download(l):
                return l

            # Membership test instead of dict.has_key(), which was removed
            # in Python 3; `in` behaves the same on Python 2.
            if l not in url_mapping:
                if ":" in l:  # mailto:, javascript:, http:
                    url_mapping[l] = l
                else:
                    url_mapping[l] = utils.absolute_path(path, l)

            return url_mapping[l]
Esempio n. 3
0
 def _process_link(l):
     """Return the recorded replacement for link ``l``.

     ``self``, ``url_mapping``, ``path`` and ``utils`` are not defined in
     this block; they must be supplied by the surrounding scope.

     A link rejected by ``self._accept_download`` is returned as-is and is
     not recorded. Otherwise the link is recorded in ``url_mapping`` on
     first sight: a link containing ``:`` maps to itself, any other link
     maps to ``utils.absolute_path(path, l)``.
     """
     if not self._accept_download(l):
         return l

     # Membership test instead of dict.has_key(), which was removed in
     # Python 3; `in` behaves the same on Python 2.
     if l not in url_mapping:
         if ":" in l:  # mailto:, javascript:, http:
             url_mapping[l] = l
         else:
             url_mapping[l] = utils.absolute_path(path, l)

     return url_mapping[l]
Esempio n. 4
0
 def repl(m):
     """Build the replacement text for one match object ``m``.

     Group 3 of the match is the link, with surrounding single and then
     double quotes stripped; group 2 is the text emitted before ``=``.
     ``action``, ``url`` and ``utils`` are not defined in this block and must
     come from the surrounding scope.
     """
     raw_value = m.group(3)
     target = raw_value.strip("'").strip('"')
     mapped_path = action(target)
     # A falsy result from action() means the link is emitted unchanged.
     rewritten = utils.relative_url(url, mapped_path) if mapped_path else target
     return "%s='%s'" % (m.group(2), rewritten)
Esempio n. 5
0
        def repl(m):
            """Build the replacement text for one match object ``m``.

            Group 3 of the match is the link, with surrounding single and
            then double quotes stripped; group 2 is the text emitted before
            ``=``. ``action``, ``url`` and ``utils`` are not defined in this
            block and must come from the surrounding scope.
            """
            raw_value = m.group(3)
            target = raw_value.strip("'").strip('"')
            mapped_path = action(target)
            # A falsy result from action() means the link is emitted unchanged.
            rewritten = utils.relative_url(url, mapped_path) if mapped_path else target
            return "%s='%s'" % (m.group(2), rewritten)
Esempio n. 6
0
 def _process_html(self, path, url, task, site):
     """Crawl the HTML file at ``path`` and queue the relative links it uses.

     ``LinkCrawler.crawling`` is given ``_process_link`` as its callback.
     Every link the callback receives that passes ``self._accept_download``
     is recorded in a mapping. Afterwards each recorded link that contains
     no ``:`` is resolved against ``url`` and registered on ``task`` as a
     ``"<link>==><local path>"`` action.

     Args:
         path: Path of the HTML file; handed to ``site.real_path`` and used
             as the first argument of ``utils.absolute_path``.
         url: URL used as the first argument of ``utils.absolute_url`` and
             passed on to the crawler.
         task: Object whose ``add_action`` receives the queued actions.
         site: Object whose ``real_path`` turns ``path`` into the argument
             given to the crawler.
     """
     url_mapping = {}

     def _process_link(l):
         # Links rejected by _accept_download are handed back untouched and
         # never recorded.
         if not self._accept_download(l):
             return l

         # Membership test instead of dict.has_key(), which was removed in
         # Python 3; `in` behaves the same on Python 2.
         if l not in url_mapping:
             if ":" in l:  # mailto:, javascript:, http:
                 url_mapping[l] = l
             else:
                 url_mapping[l] = utils.absolute_path(path, l)

         return url_mapping[l]

     crawler = LinkCrawler()
     crawler.crawling(site.real_path(path), url, _process_link)

     # items() exists on both Python 2 and 3 (iteritems() is Python 2 only).
     # The loop target is called local_path so that it does not rebind the
     # `path` argument that _process_link closes over.
     for link, local_path in url_mapping.items():
         if ":" in link:
             # Links containing ':' were mapped to themselves above; there
             # is nothing to queue for them.
             continue
         absolute_link = utils.absolute_url(url, link)

         self.logger.info("add spider:%s==>%s" % (absolute_link, local_path))
         task.add_action("%s==>%s" % (absolute_link, local_path))