class Crawler(object):
    """
    This class defines methods used to perform crawling (command
    line option '--crawl')
    """

    def getTargetUrls(self):
        try:
            threadData = getCurrentThreadData()
            threadData.shared.outputs = oset()

            def crawlThread():
                threadData = getCurrentThreadData()

                while kb.threadContinue:
                    with kb.locks.limits:
                        if threadData.shared.unprocessed:
                            current = threadData.shared.unprocessed.pop()
                        else:
                            break

                    content = None
                    try:
                        if current:
                            content = Request.getPage(url=current, crawling=True, raise404=False)[0]
                    except SqlmapConnectionException as e:
                        errMsg = "connection exception detected (%s). skipping " % e
                        errMsg += "url '%s'" % current
                        logger.critical(errMsg)
                    except httplib.InvalidURL as e:
                        errMsg = "invalid url detected (%s). skipping " % e
                        errMsg += "url '%s'" % current
                        logger.critical(errMsg)

                    if not kb.threadContinue:
                        break

                    if isinstance(content, unicode):
                        try:
                            soup = BeautifulSoup(content)
                            for tag in soup('a'):
                                if tag.get("href"):
                                    url = urlparse.urljoin(conf.url, tag.get("href"))

                                    # flag to know if we are dealing with the same target host
                                    _ = reduce(lambda x, y: x == y, map(lambda x: urlparse.urlparse(x).netloc.split(':')[0], (url, conf.url)))

                                    if conf.scope:
                                        if not re.search(conf.scope, url, re.I):
                                            continue
                                    elif not _:
                                        continue

                                    if url.split('.')[-1].lower() not in CRAWL_EXCLUDE_EXTENSIONS:
                                        with kb.locks.outputs:
                                            threadData.shared.deeper.add(url)
                                            if re.search(r"(.*?)\?(.+)", url):
                                                threadData.shared.outputs.add(url)
                        except UnicodeEncodeError:  # for non-HTML files
                            pass
                        finally:
                            if conf.forms:
                                findPageForms(content, current, False, True)

                    if conf.verbose in (1, 2):
                        threadData.shared.count += 1
                        status = '%d/%d links visited (%d%%)' % (threadData.shared.count, threadData.shared.length, round(100.0 * threadData.shared.count / threadData.shared.length))
                        dataToStdout("\r[%s] [INFO] %s" % (time.strftime("%X"), status), True)
            threadData.shared.deeper = set()
            threadData.shared.unprocessed = set([conf.url])

            logger.info("starting crawler")

            for i in xrange(conf.crawlDepth):
                if i > 0 and conf.threads == 1:
                    singleTimeWarnMessage("running in a single-thread mode. This could take a while.")

                threadData.shared.count = 0
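                # What follows is a minimal sketch, not verbatim source: the snippet
                # above is truncated right after the counter reset, and this is how
                # the per-depth loop typically continues in sqlmap, assuming
                # runThreads() from lib.core.threads and clearConsoleLine() from
                # lib.core.common are imported:
                threadData.shared.length = len(threadData.shared.unprocessed)
                numThreads = min(conf.threads, len(threadData.shared.unprocessed))
                logger.info("searching for links with depth %d" % (i + 1))
                runThreads(numThreads, crawlThread)
                clearConsoleLine(True)

                # links collected at this depth become the work queue for the next one
                if threadData.shared.deeper:
                    threadData.shared.unprocessed = set(threadData.shared.deeper)
                else:
                    break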
def crawlThread():
    threadData = getCurrentThreadData()

    while kb.threadContinue:
        with kb.locks.limit:
            if threadData.shared.unprocessed:
                current = threadData.shared.unprocessed.pop()
                if current in visited:
                    continue
                elif conf.crawlExclude and re.search(conf.crawlExclude, current):
                    dbgMsg = "skipping '%s'" % current
                    logger.debug(dbgMsg)
                    continue
                else:
                    visited.add(current)
            else:
                break

        content = None
        try:
            if current:
                content = Request.getPage(url=current, crawling=True, raise404=False)[0]
        except SqlmapConnectionException as ex:
            errMsg = "connection exception detected ('%s'). skipping " % getSafeExString(ex)
            errMsg += "URL '%s'" % current
            logger.critical(errMsg)
        except SqlmapSyntaxException:
            errMsg = "invalid URL detected. skipping '%s'" % current
            logger.critical(errMsg)
        except _http_client.InvalidURL as ex:
            errMsg = "invalid URL detected ('%s'). skipping " % getSafeExString(ex)
            errMsg += "URL '%s'" % current
            logger.critical(errMsg)

        if not kb.threadContinue:
            break

        if isinstance(content, six.text_type):
            try:
                match = re.search(r"(?si)<html[^>]*>(.+)</html>", content)
                if match:
                    content = "<html>%s</html>" % match.group(1)

                soup = BeautifulSoup(content)
                tags = soup('a')

                tags += re.finditer(r'(?i)\s(href|src)=["\'](?P<href>[^>"\']+)', content)
                tags += re.finditer(r'(?i)window\.open\(["\'](?P<href>[^)"\']+)["\']', content)

                for tag in tags:
                    href = tag.get("href") if hasattr(tag, "get") else tag.group("href")

                    if href:
                        if threadData.lastRedirectURL and threadData.lastRedirectURL[0] == threadData.lastRequestUID:
                            current = threadData.lastRedirectURL[1]
                        url = _urllib.parse.urljoin(current, htmlUnescape(href))

                        # flag to know if we are dealing with the same target host
                        _ = checkSameHost(url, target)

                        if conf.scope:
                            if not re.search(conf.scope, url, re.I):
                                continue
                        elif not _:
                            continue

                        if (extractRegexResult(r"\A[^?]+\.(?P<result>\w+)(\?|\Z)", url) or "").lower() not in CRAWL_EXCLUDE_EXTENSIONS:
                            with kb.locks.value:
                                threadData.shared.deeper.add(url)
                                if re.search(r"(.*?)\?(.+)", url) and not re.search(r"\?(v=)?\d+\Z", url) and not re.search(r"(?i)\.(js|css)(\?|\Z)", url):
                                    threadData.shared.value.add(url)
            except UnicodeEncodeError:  # for non-HTML files
                pass
            except ValueError:  # for non-valid links
                pass
            finally:
                if conf.forms:
                    threadData.shared.formsFound |= len(findPageForms(content, current, False, True)) > 0

        if conf.verbose in (1, 2):
            threadData.shared.count += 1
            status = '%d/%d links visited (%d%%)' % (threadData.shared.count, threadData.shared.length, round(100.0 * threadData.shared.count / threadData.shared.length))
            dataToStdout("\r[%s] [INFO] %s" % (time.strftime("%X"), status), True)
threadData.shared.deeper = set()
threadData.shared.unprocessed = set([target])

if not conf.sitemapUrl:
    message = "do you want to check for the existence of site's sitemap(.xml) [y/N] "

    if readInput(message, default='N', boolean=True):
        found = True
        items = None
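        # Hedged sketch: the snippet above is truncated right after 'items = None'.
        # In sqlmap itself the sitemap check continues roughly like this, with
        # parseSitemap() and getSafeExString() assumed to come from sqlmap's own
        # modules:
        url = _urllib.parse.urljoin(target, "/sitemap.xml")

        try:
            items = parseSitemap(url)
        except SqlmapConnectionException as ex:
            if "page not found" in getSafeExString(ex):
                found = False
                logger.warn("'sitemap.xml' not found")
        except:
            pass
        finally:
            if found:
                if items:
                    # parameterized sitemap entries become scan targets, the rest
                    # seeds the crawl queue when deeper crawling was requested
                    for item in items:
                        if re.search(r"(.*?)\?(.+)", item):
                            threadData.shared.value.add(item)

                    if conf.crawlDepth > 1:
                        threadData.shared.unprocessed.update(items)

                logger.info("%s links found" % ("no" if not items else len(items)))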
            # print traceback.print_exc()
            return

        if isinstance(content, unicode):
            try:
                match = re.search(r"(?si)<html[^>]*>(.+)</html>", content)
                if match:
                    content = "<html>%s</html>" % match.group(1)

                soup = BeautifulSoup(content, 'lxml')
                tags = soup('a')

                if not tags:
                    tags = re.finditer(r'(?si)<a[^>]+href="(?P<href>[^>"]+)"', content)

                for tag in tags:
                    href = tag.get("href") if hasattr(tag, "get") else tag.group("href")

                    if href and 'javascript:' not in href:
                        href = urlparse.urljoin(conf.CRAWL_SITE, href)

                        if conf.CRAWL_SITE in href:
                            redisCon.lpush('tmpVisit', href)
                            # logger.log(CUSTOM_LOGGING.ERROR, href)
            except Exception as ex:  # for non-HTML files
                logger.log(CUSTOM_LOGGING.ERROR, ex)
            finally:
                forms = findPageForms(content, url, False)
                for form in forms:
                    formMsg = '%s has form, url: %s method: %s data: %s' % (url, form[0], form[1], form[2])
                    logger.log(CUSTOM_LOGGING.WARNING, formMsg)
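# Hedged setup sketch: 'redisCon' and 'conf.CRAWL_SITE' above are assumed to be
# initialized elsewhere in this fork; both names come from the snippet itself,
# not from stock sqlmap. A minimal, hypothetical wiring with redis-py could
# look like this:
import redis

redisCon = redis.StrictRedis(host='127.0.0.1', port=6379, db=0)  # hypothetical connection parameters
conf.CRAWL_SITE = "http://www.example.com/"  # hypothetical base URL of the crawl target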