Exemple #1
0
 def geturls(self):
     """Return the unique ``.html`` links found in this object's HTML.

     If ``self.html`` is empty, ``self.fetch()`` is called first
     (presumably it populates ``self.html`` -- confirm against the class).

     Every ``<a>`` tag's ``href`` is processed as follows:

     * the ``#fragment`` part is stripped;
     * links that are empty, do not end in ``.html``, contain ``..`` or
       start with ``mailto`` are skipped;
     * links without an ``http://`` / ``https://`` scheme are made
       absolute: a leading ``/`` is joined to ``self.root``, anything
       else to ``self.base``;
     * links that do not contain ``self.root`` are logged and skipped.

     Returns:
         list of URL strings, de-duplicated, in first-seen order.
     """
     if not self.html: self.fetch()
     # Kept as a function-scope import, as in the original code.
     from jsb.imports import getBeautifulSoup
     soup = getBeautifulSoup()
     parsed = soup.BeautifulSoup(self.html)
     urls = []
     seen = set()  # O(1) duplicate check; ``urls`` keeps the order
     for tag in parsed('a'):
         href = tag.get("href")
         if not href: continue
         href = href.split("#")[0]
         if not href: continue
         if not href.endswith(".html"): continue
         if ".." in href: continue
         if href.startswith("mailto"): continue
         # Test the scheme prefix rather than a bare "http" substring, so a
         # relative link such as "http_intro.html" is still resolved below.
         if not href.lower().startswith(("http://", "https://")):
             if href.startswith("/"): href = self.root + href
             else: href = self.base + "/" + href
         # NOTE(review): substring test kept from the original; a
         # startswith() check may be intended -- confirm what self.root holds.
         if self.root not in href:
             logging.warning("%s not in %s", self.root, href)
             continue
         if href in seen: continue
         seen.add(href)
         urls.append(href)
     logging.warning("found %s urls", len(urls))
     return urls
Exemple #2
0
## jsb imports

from jsb.utils.name import stripname
from jsb.utils.exception import handle_exception
from jsb.utils.urldata import UrlData
from jsb.utils.generic import waitforqueue
from jsb.utils.url import geturl2, striphtml, Url
from jsb.lib.datadir import getdatadir
from jsb.lib.persist import PersistCollection
from jsb.lib.commands import cmnds
from jsb.lib.examples import examples
from jsb.lib.threadloop import ThreadLoop
from jsb.lib.callbacks import callbacks
from jsb.imports import getBeautifulSoup

# Module-level handle obtained from jsb's getBeautifulSoup() helper; the call
# runs once, when this module is imported.
soup = getBeautifulSoup()

## basic imports

from collections import deque
import os
import logging
import re
import sys
import time
import math
import urllib2
import urlparse
import optparse
from cgi import escape
from traceback import format_exc
Exemple #3
0
## jsb imports

from jsb.utils.name import stripname
from jsb.utils.exception import handle_exception
from jsb.utils.urldata import UrlData
from jsb.utils.generic import waitforqueue
from jsb.utils.url import geturl2, striphtml, Url
from jsb.lib.datadir import getdatadir
from jsb.lib.persist import PersistCollection
from jsb.lib.commands import cmnds
from jsb.lib.examples import examples
from jsb.lib.threadloop import ThreadLoop
from jsb.lib.callbacks import callbacks
from jsb.imports import getBeautifulSoup
# Module-level handle obtained from jsb's getBeautifulSoup() helper; the call
# runs once, when this module is imported.
soup = getBeautifulSoup()

## basic imports

from collections import deque 
import os
import logging
import re
import sys
import time
import math
import urllib2
import urlparse
import optparse
from cgi import escape
from traceback import format_exc