soup = BeautifulSoup(content) tags = soup('a') if not tags: tags = re.finditer(r'(?i)<a[^>]+href="(?P<href>[^>"]+)"', content) for tag in tags: href = tag.get("href") if hasattr(tag, "get") else tag.group("href") if href: if threadData.lastRedirectURL and threadData.lastRedirectURL[0] == threadData.lastRequestUID: current = threadData.lastRedirectURL[1] url = urlparse.urljoin(current, href) # flag to know if we are dealing with the same target host _ = checkSameHost(url, target) if conf.scope: if not re.search(conf.scope, url, re.I): continue elif not _: continue if url.split('.')[-1].lower() not in CRAWL_EXCLUDE_EXTENSIONS: with kb.locks.value: threadData.shared.deeper.add(url) if re.search(r"(.*?)\?(.+)", url): threadData.shared.value.add(url) except UnicodeEncodeError: # for non-HTML files pass except ValueError: # for non-valid links
soup = BeautifulSoup(content) tags = soup('a') if not tags: tags = re.finditer(r'(?i)<a[^>]+href="(?P<href>[^>"]+)"', content) for tag in tags: href = tag.get("href") if hasattr(tag, "get") else tag.group("href") if href: if threadData.lastRedirectURL and threadData.lastRedirectURL[0] == threadData.lastRequestUID: current = threadData.lastRedirectURL[1] url = urlparse.urljoin(current, href) # 通过标记来判断我们是否在处理同一个目标主机 _ = checkSameHost(url, target) if conf.scope: if not re.search(conf.scope, url, re.I): continue elif not _: continue if url.split('.')[-1].lower() not in CRAWL_EXCLUDE_EXTENSIONS: with kb.locks.value: threadData.shared.deeper.add(url) if re.search(r"(.*?)\?(.+)", url): threadData.shared.value.add(url) except UnicodeEncodeError: # 用于非HTML文件 pass except ValueError: # 用于非有效链接
def crawlThread(): threadData = getCurrentThreadData() while kb.threadContinue: with kb.locks.limit: if threadData.shared.unprocessed: current = threadData.shared.unprocessed.pop() if current in visited: continue elif conf.crawlExclude and re.search(conf.crawlExclude, current): dbgMsg = "skipping '%s'" % current logger.debug(dbgMsg) continue else: visited.add(current) else: break content = None try: if current: content = Request.getPage(url=current, crawling=True, raise404=False)[0] except SqlmapConnectionException as ex: errMsg = "connection exception detected ('%s'). skipping " % getSafeExString(ex) errMsg += "URL '%s'" % current logger.critical(errMsg) except SqlmapSyntaxException: errMsg = "invalid URL detected. skipping '%s'" % current logger.critical(errMsg) except _http_client.InvalidURL as ex: errMsg = "invalid URL detected ('%s'). skipping " % getSafeExString(ex) errMsg += "URL '%s'" % current logger.critical(errMsg) if not kb.threadContinue: break if isinstance(content, six.text_type): try: match = re.search(r"(?si)<html[^>]*>(.+)</html>", content) if match: content = "<html>%s</html>" % match.group(1) soup = BeautifulSoup(content) tags = soup('a') tags += re.finditer(r'(?i)\s(href|src)=["\'](?P<href>[^>"\']+)', content) tags += re.finditer(r'(?i)window\.open\(["\'](?P<href>[^)"\']+)["\']', content) for tag in tags: href = tag.get("href") if hasattr(tag, "get") else tag.group("href") if href: if threadData.lastRedirectURL and threadData.lastRedirectURL[0] == threadData.lastRequestUID: current = threadData.lastRedirectURL[1] url = _urllib.parse.urljoin(current, htmlUnescape(href)) # flag to know if we are dealing with the same target host _ = checkSameHost(url, target) if conf.scope: if not re.search(conf.scope, url, re.I): continue elif not _: continue if (extractRegexResult(r"\A[^?]+\.(?P<result>\w+)(\?|\Z)", url) or "").lower() not in CRAWL_EXCLUDE_EXTENSIONS: with kb.locks.value: threadData.shared.deeper.add(url) if re.search(r"(.*?)\?(.+)", url) and not re.search(r"\?(v=)?\d+\Z", url) and not re.search(r"(?i)\.(js|css)(\?|\Z)", url): threadData.shared.value.add(url) except UnicodeEncodeError: # for non-HTML files pass except ValueError: # for non-valid links pass finally: if conf.forms: threadData.shared.formsFound |= len(findPageForms(content, current, False, True)) > 0 if conf.verbose in (1, 2): threadData.shared.count += 1 status = '%d/%d links visited (%d%%)' % (threadData.shared.count, threadData.shared.length, round(100.0 * threadData.shared.count / threadData.shared.length)) dataToStdout("\r[%s] [INFO] %s" % (time.strftime("%X"), status), True)