def _process_html(self, path, url, task, site):
    url_mapping = {}

    def _process_link(l):
        # Links we are not going to download stay exactly as they are.
        if not self._accept_download(l):
            return l
        if l not in url_mapping:
            if ":" in l:  # scheme links (mailto:, javascript:, http:) are kept verbatim
                url_mapping[l] = l
            else:
                url_mapping[l] = utils.absolute_path(path, l)
        return url_mapping[l]

    crawler = LinkCrawler()
    crawler.crawling(site.real_path(path), url, _process_link)
    # Use 'local_path', not 'path': the parameter is still referenced by
    # _process_link's closure and must not be shadowed here.
    for link, local_path in url_mapping.items():
        if ":" in link:  # scheme links were never remapped; nothing to spider
            continue
        link = utils.absolute_url(url, link)
        self.logger.info("add spider:%s==>%s" % (link, local_path))
        task.add_action("%s==>%s" % (link, local_path))
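# NOTE: utils.absolute_path / absolute_url / relative_url are project
# helpers that are not shown in this excerpt. Below is a minimal sketch
# of the semantics the code above appears to rely on, built on the
# standard library -- an assumption, not the project's implementation.
import posixpath
from urllib.parse import urljoin, urlparse

def absolute_path(page_path, link):
    # Resolve a relative href against the directory of the current page:
    # absolute_path("/docs/index.html", "img/a.png") -> "/docs/img/a.png"
    return posixpath.normpath(posixpath.join(posixpath.dirname(page_path), link))

def absolute_url(page_url, link):
    # Resolve a relative href against the page URL:
    # absolute_url("http://example.com/docs/index.html", "a.html")
    #   -> "http://example.com/docs/a.html"
    return urljoin(page_url, link)

def relative_url(page_url, local_path):
    # Turn a saved local path back into an href relative to the page:
    # relative_url("http://example.com/docs/index.html", "/docs/img/a.png")
    #   -> "img/a.png"
    return posixpath.relpath(local_path, posixpath.dirname(urlparse(page_url).path))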
def repl(m):
    # Group 3 is the quoted attribute value; strip the surrounding quotes.
    l = m.group(3).strip("'").strip('"')
    local_path = action(l)
    if local_path:
        # The link was downloaded: point the attribute at the local copy.
        local_url = utils.relative_url(url, local_path)
    else:
        local_url = l
    return "%s='%s'" % (m.group(2), local_url)
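# NOTE: repl is a callback for re.sub, but the pattern and driver it is
# paired with are not part of this excerpt. A hypothetical, self-contained
# sketch, assuming a pattern whose group 2 is the attribute name and
# group 3 the quoted value (matching the m.group(2)/m.group(3) accesses
# above):
import re

def rewrite_links(html, url, action, relative_url):
    # group 1: whole attribute, group 2: name (href/src), group 3: quoted value
    link_re = re.compile(r"((href|src)\s*=\s*('[^']*'|\"[^\"]*\"))", re.IGNORECASE)

    def repl(m):
        l = m.group(3).strip("'").strip('"')
        local_path = action(l)
        local_url = relative_url(url, local_path) if local_path else l
        return "%s='%s'" % (m.group(2), local_url)

    return link_re.sub(repl, html)

# Usage sketch: action maps nothing, so every link passes through unchanged.
# rewrite_links('<a href="a.html">a</a>', "http://example.com/",
#               lambda l: None, lambda u, p: p)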