def geturls(self):
    """Return the unique ``.html`` links found in this page's ``<a>`` tags.

    If ``self.html`` is empty, ``self.fetch()`` is called first (presumably
    it fills ``self.html`` -- confirm against the enclosing class).

    Each ``href`` is processed as follows:

    * the ``#fragment`` part is stripped;
    * links not ending in ``.html``, containing ``..`` or starting with
      ``mailto`` are skipped;
    * links that do not start with ``http://`` or ``https://`` are made
      absolute: a leading ``/`` is prefixed with ``self.root``, anything
      else is appended to ``self.base`` with a ``/`` separator;
    * links that do not contain ``self.root`` are logged and skipped.

    Returns a list of URL strings in first-seen order, without duplicates.
    """
    if not self.html:
        self.fetch()

    # Kept function-local so this method does not rely on module-level state.
    from jsb.imports import getBeautifulSoup
    soup = getBeautifulSoup()
    anchors = soup.BeautifulSoup(self.html)('a')

    urls = []
    seen = set()  # O(1) duplicate check; `urls` preserves discovery order
    for tag in anchors:
        href = tag.get("href")
        if not href:
            continue
        href = href.split("#")[0]
        # An href that was only a fragment is now "" and fails this test too.
        if not href.endswith(".html"):
            continue
        if ".." in href:
            continue
        if href.startswith("mailto"):
            continue
        # Test the scheme prefix rather than a bare "http" substring, so a
        # relative link such as "docs/http.html" is still resolved.
        if not href.startswith(("http://", "https://")):
            if href.startswith("/"):
                href = self.root + href
            else:
                href = self.base + "/" + href
        if self.root not in href:
            logging.warning("%s not in %s" % (self.root, href))
            continue
        if href not in seen:
            seen.add(href)
            urls.append(href)
    logging.warning("found %s urls" % len(urls))
    return urls
## jsb imports from jsb.utils.name import stripname from jsb.utils.exception import handle_exception from jsb.utils.urldata import UrlData from jsb.utils.generic import waitforqueue from jsb.utils.url import geturl2, striphtml, Url from jsb.lib.datadir import getdatadir from jsb.lib.persist import PersistCollection from jsb.lib.commands import cmnds from jsb.lib.examples import examples from jsb.lib.threadloop import ThreadLoop from jsb.lib.callbacks import callbacks from jsb.imports import getBeautifulSoup soup = getBeautifulSoup() ## basic imports from collections import deque import os import logging import re import sys import time import math import urllib2 import urlparse import optparse from cgi import escape from traceback import format_exc