def searchGoogle():
    infoMsg = "[INFO] google dorking is running, please wait...\n"
    cetakData(infoMsg)

    dork, page = konf.target
    page = page if page > 1 else 1  # reset the page number
    konf.googleDork = dork

    data = {
        "q": dork,
        "num": 100,
        "hl": "en",
        "complete": 0,
        "safe": "off",
        "filter": 0,
        "btnG": "search",
        "start": page
    }

    url = "https://www.google.com/search?" + urllib.urlencode(data)
    response = UserAgent.open(url)
    htmltext = response.read()

    if re.search("(?i)captcha", htmltext):
        criMsg = "can't get dorking results. "
        criMsg += "captcha challenge detected"
        logger.critical(criMsg)

        raise W3bruteNextStepException

    soup = BeautifulSoup(htmltext)
    h3tags = soup.findAll("h3", attrs={"class": "r"})
    urls = [
        urlparse.parse_qsl(urlparse.urlsplit(tag.a["href"]).query)[0][1]
        for tag in h3tags
    ]

    return urls or None
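# The list comprehension at the end of searchGoogle() recovers the dorked
# target URL from Google's redirect-style result links ("/url?q=..."). A
# minimal standalone sketch of that extraction, using a made-up href purely
# for illustration (Python 2 urlparse, as in the function above):
import urlparse

href = "/url?q=http://testphp.example.com/login.php%3Fid%3D1&sa=U"

query = urlparse.urlsplit(href).query  # "q=http://...&sa=U"
pairs = urlparse.parse_qsl(query)      # [('q', 'http://...?id=1'), ('sa', 'U')]
target_url = pairs[0][1]               # value of the first parameter ("q")

print(target_url)                      # http://testphp.example.com/login.php?id=1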
class Crawler(object):
    """
    This class defines methods used to perform crawling (command
    line option '--crawl')
    """

    def getTargetUrls(self):
        try:
            threadData = getCurrentThreadData()
            threadData.shared.outputs = oset()

            def crawlThread():
                threadData = getCurrentThreadData()

                while kb.threadContinue:
                    with kb.locks.limits:
                        if threadData.shared.unprocessed:
                            current = threadData.shared.unprocessed.pop()
                        else:
                            break

                    content = None
                    try:
                        if current:
                            content = Request.getPage(url=current, crawling=True, raise404=False)[0]
                    except SqlmapConnectionException, e:
                        errMsg = "connection exception detected (%s). skipping " % e
                        errMsg += "url '%s'" % current
                        logger.critical(errMsg)
                    except httplib.InvalidURL, e:
                        errMsg = "invalid url detected (%s). skipping " % e
                        errMsg += "url '%s'" % current
                        logger.critical(errMsg)

                    if not kb.threadContinue:
                        break

                    if isinstance(content, unicode):
                        try:
                            soup = BeautifulSoup(content)
                            for tag in soup('a'):
                                if tag.get("href"):
                                    url = urlparse.urljoin(conf.url, tag.get("href"))

                                    # flag to know if we are dealing with the same target host
                                    _ = reduce(lambda x, y: x == y, map(lambda x: urlparse.urlparse(x).netloc.split(':')[0], (url, conf.url)))

                                    if conf.scope:
                                        if not re.search(conf.scope, url, re.I):
                                            continue
                                    elif not _:
                                        continue

                                    if url.split('.')[-1].lower() not in CRAWL_EXCLUDE_EXTENSIONS:
                                        with kb.locks.outputs:
                                            threadData.shared.deeper.add(url)
                                            if re.search(r"(.*?)\?(.+)", url):
                                                threadData.shared.outputs.add(url)
                        except UnicodeEncodeError:  # for non-HTML files
                            pass
                        finally:
                            if conf.forms:
                                findPageForms(content, current, False, True)

                    if conf.verbose in (1, 2):
                        threadData.shared.count += 1
                        status = '%d/%d links visited (%d%s)' % (threadData.shared.count, threadData.shared.length, round(100.0 * threadData.shared.count / threadData.shared.length), '%')
                        dataToStdout("\r[%s] [INFO] %s" % (time.strftime("%X"), status), True)
    logger.critical(errMsg)
except httplib.InvalidURL, ex:
    errMsg = "invalid URL detected (%s). " % getSafeExString(ex)
    errMsg += "skipping URL '%s'" % current
    logger.critical(errMsg)

if not kb.threadContinue:
    break

if isinstance(content, unicode):
    try:
        match = re.search(r"(?si)<html[^>]*>(.+)</html>", content)
        if match:
            content = "<html>%s</html>" % match.group(1)

        soup = BeautifulSoup(content)
        tags = soup('a')

        if not tags:
            tags = re.finditer(r'(?i)<a[^>]+href="(?P<href>[^>"]+)"', content)

        for tag in tags:
            href = tag.get("href") if hasattr(tag, "get") else tag.group("href")

            if href:
                if threadData.lastRedirectURL and threadData.lastRedirectURL[0] == threadData.lastRequestUID:
                    current = threadData.lastRedirectURL[1]
                url = urlparse.urljoin(current, href)

                # flag to know whether we are dealing with the same target host
                _ = checkSameHost(url, target)
def crawlThread():
    threadData = getCurrentThreadData()

    while kb.threadContinue:
        with kb.locks.limit:
            if threadData.shared.unprocessed:
                current = threadData.shared.unprocessed.pop()
                if current in visited:
                    continue
                elif conf.crawlExclude and re.search(conf.crawlExclude, current):
                    dbgMsg = "skipping '%s'" % current
                    logger.debug(dbgMsg)
                    continue
                else:
                    visited.add(current)
            else:
                break

        content = None
        try:
            if current:
                content = Request.getPage(url=current, crawling=True, raise404=False)[0]
        except SqlmapConnectionException as ex:
            errMsg = "connection exception detected ('%s'). skipping " % getSafeExString(ex)
            errMsg += "URL '%s'" % current
            logger.critical(errMsg)
        except SqlmapSyntaxException:
            errMsg = "invalid URL detected. skipping '%s'" % current
            logger.critical(errMsg)
        except _http_client.InvalidURL as ex:
            errMsg = "invalid URL detected ('%s'). skipping " % getSafeExString(ex)
            errMsg += "URL '%s'" % current
            logger.critical(errMsg)

        if not kb.threadContinue:
            break

        if isinstance(content, six.text_type):
            try:
                match = re.search(r"(?si)<html[^>]*>(.+)</html>", content)
                if match:
                    content = "<html>%s</html>" % match.group(1)

                soup = BeautifulSoup(content)
                tags = soup('a')

                tags += re.finditer(r'(?i)\s(href|src)=["\'](?P<href>[^>"\']+)', content)
                tags += re.finditer(r'(?i)window\.open\(["\'](?P<href>[^)"\']+)["\']', content)

                for tag in tags:
                    href = tag.get("href") if hasattr(tag, "get") else tag.group("href")

                    if href:
                        if threadData.lastRedirectURL and threadData.lastRedirectURL[0] == threadData.lastRequestUID:
                            current = threadData.lastRedirectURL[1]
                        url = _urllib.parse.urljoin(current, htmlUnescape(href))

                        # flag to know if we are dealing with the same target host
                        _ = checkSameHost(url, target)

                        if conf.scope:
                            if not re.search(conf.scope, url, re.I):
                                continue
                        elif not _:
                            continue

                        if (extractRegexResult(r"\A[^?]+\.(?P<result>\w+)(\?|\Z)", url) or "").lower() not in CRAWL_EXCLUDE_EXTENSIONS:
                            with kb.locks.value:
                                threadData.shared.deeper.add(url)
                                if re.search(r"(.*?)\?(.+)", url) and not re.search(r"\?(v=)?\d+\Z", url) and not re.search(r"(?i)\.(js|css)(\?|\Z)", url):
                                    threadData.shared.value.add(url)
            except UnicodeEncodeError:  # for non-HTML files
                pass
            except ValueError:  # for non-valid links
                pass
            finally:
                if conf.forms:
                    threadData.shared.formsFound |= len(findPageForms(content, current, False, True)) > 0

        if conf.verbose in (1, 2):
            threadData.shared.count += 1
            status = '%d/%d links visited (%d%%)' % (threadData.shared.count, threadData.shared.length, round(100.0 * threadData.shared.count / threadData.shared.length))
            dataToStdout("\r[%s] [INFO] %s" % (time.strftime("%X"), status), True)
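# A rough sketch of how crawlThread() is driven: the shared containers it
# reads and writes (unprocessed, deeper, value, count, length, formsFound)
# are prepared on the main thread before the workers start. This is an
# assumption-based outline, not the verbatim sqlmap code; runThreads stands
# for sqlmap's thread-pool helper and numThreads for the configured thread
# count.
visited = set()

threadData = getCurrentThreadData()
threadData.shared.value = oset()
threadData.shared.deeper = set()
threadData.shared.unprocessed = set([target])
threadData.shared.count = 0
threadData.shared.length = len(threadData.shared.unprocessed)
threadData.shared.formsFound = False

runThreads(numThreads, crawlThread)  # run crawlThread() concurrently in numThreads workers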
class ParsePage(object):
    """
    parses the html
    """

    def __init__(self, response):
        htmltext = response.read()

        source = io.BytesIO(htmltext)
        source.geturl = response.geturl

        self.forms = ParseForm(source)
        self.soup = BeautifulSoup(htmltext)

    @property
    def title(self):
        """
        :return: page title
        """

        elem = self.soup.find("title")
        return str(elem.text)

    def getValidForms(self):
        """
        this function gets the form that leads to the website dashboard
        """

        if auth.IS_AUTHORIZATION:
            # skip...
            return

        infoMsg = "[INFO] try searching for form that goes to the website dashboard...\n"
        cetakData(infoMsg)

        try:
            for form in self.forms:
                input_controls = form.controls
                for input_elem in input_controls:
                    input_type = input_elem.type
                    # if an input of type 'password' is found,
                    # it means the form leads to the
                    # website dashboard.
                    if input_type == "password":
                        html.form = form
                        html.soup = self.soup.find("form", attrs=form.attrs)

                        raise W3bruteSkipParsingFormException
        except W3bruteSkipParsingFormException:
            infoMsg = "form that goes to the website dashboard is found"
            logger.info(infoMsg)
        else:
            criMsg = "form that goes to the website dashboard is not found. "
            if not konf.adminScanner:
                criMsg += "try using the '--admin' option to help you "
                criMsg += "find the admin login page."
            logger.critical(criMsg)

            raise W3bruteSkipTargetException

    def getTipeAutentikasi(self):
        """
        gets the target's authentication type
        """

        infoMsg = "[INFO] detecting target authentication type...\n"
        cetakData(infoMsg)

        if auth.IS_AUTHORIZATION:
            infoMsg = "authentication type: %s Authorization" % repr(auth.type.capitalize())
            logger.info(infoMsg)
            return

        soup = html.soup
        if soup.find("input", type="text"):
            if re.search("(?i)email", str(soup)):
                auth_type = "email"
                auth.IS_EMAIL_AUTH = True
            else:
                auth_type = "standard"
                auth.IS_STANDARD_AUTH = True
        elif soup.find("input", type="email"):
            auth_type = "email"
            auth.IS_EMAIL_AUTH = True
        else:
            infoMsg = "page title %s" % repr(self.title)
            logger.info(infoMsg)

            auth_type = "web shell"
            auth.IS_WEBSHELL_AUTH = True

        infoMsg = "authentication type: %s" % repr(auth_type)
        logger.info(infoMsg)

    def getParameterForm(self):
        if auth.IS_AUTHORIZATION:
            # skip again...
            return

        infoMsg = "[INFO] find parameter(s)...\n"
        cetakData(infoMsg)

        soup = html.soup
        html.field = PyDict()

        if auth.IS_WEBSHELL_AUTH is None:
            input_elem = soup.find("input", type="text") \
                or soup.find("input", type="email")
            if not input_elem.has_key("name"):
                errMsg = "parameter(s) not found in %s" % repr(str(input_elem))
                logger.error(errMsg)

                raise W3bruteSkipTargetException

            html.field.username = input_elem.get("name")

            input_elem = soup.find("input", type="password")
            if not input_elem.has_key("name"):
                errMsg = "parameter(s) not found in %s" % repr(str(input_elem))
                logger.error(errMsg)

                raise W3bruteSkipTargetException

            html.field.password = input_elem.get("name")
#!/usr/bin/env python
# coding=utf-8

from thirdparty.beautifulsoup.beautifulsoup import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

soup = BeautifulSoup(html_doc)

# print(soup.prettify())
print(soup.findAll('a'))
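# Building on the demo above: the anchors returned by findAll('a') can be
# reduced to just their link targets, which is the pattern the crawler code
# in this section relies on.
for tag in soup.findAll('a'):
    if tag.get('href'):
        print(tag['href'])  # e.g. http://example.com/elsie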