Code Example #1
File: target.py  Project: hashbrown1013/w3brute
def searchGoogle():
    infoMsg = "[INFO] google dorking is running, please wait...\n"
    cetakData(infoMsg)

    dork, page = konf.target
    page = page if page > 1 else 1
    # store the dork back in the config
    konf.googleDork = dork

    data = {
        "q": dork,
        "num": 100,
        "hl": "en",
        "complete": 0,
        "safe": "off",
        "filter": 0,
        "btnG": "search",
        "start": page
    }

    url = "https://www.google.com/search?" + urllib.urlencode(data)
    response = UserAgent.open(url)
    htmltext = response.read()

    if re.search("(?i)captcha", htmltext):
        criMsg = "can't get dorking results. "
        criMsg += "captcha challenge detected"

        logger.critical(criMsg)
        raise W3bruteNextStepException

    soup = BeautifulSoup(htmltext)
    h3tags = soup.findAll("h3", attrs={"class": "r"})
    urls = [
        urlparse.parse_qsl(urlparse.urlsplit(tag.a["href"]).query)[0][1]
        for tag in h3tags
    ]

    return urls or None
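
Note: the list comprehension at the end only pulls the real target URL out of Google's redirect links (hrefs of the form "/url?q=<target>&sa=U&..."). A minimal standalone sketch of that step, using Python 3's urllib.parse and an invented href value (the excerpt itself is Python 2):

from urllib.parse import urlsplit, parse_qsl

# hypothetical href as it would appear in a Google result page
href = "/url?q=http://example.com/admin/login.php&sa=U&ved=0ahUKEwi"

# parse_qsl() turns the query string into (key, value) pairs; the first
# pair is ("q", "<real target URL>"), so [0][1] is the extracted URL
pairs = parse_qsl(urlsplit(href).query)
print(pairs[0][1])  # -> http://example.com/admin/login.php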
Code Example #2
File: crawler.py  Project: zhouli01/penetration
class Crawler(object):
    """
    This class defines methods used to perform crawling (command
    line option '--crawl')
    """
    def getTargetUrls(self):
        try:
            threadData = getCurrentThreadData()
            threadData.shared.outputs = oset()

            def crawlThread():
                threadData = getCurrentThreadData()

                while kb.threadContinue:
                    with kb.locks.limits:
                        if threadData.shared.unprocessed:
                            current = threadData.shared.unprocessed.pop()
                        else:
                            break

                    content = None
                    try:
                        if current:
                            content = Request.getPage(url=current,
                                                      crawling=True,
                                                      raise404=False)[0]
                    except SqlmapConnectionException, e:
                        errMsg = "connection exception detected (%s). skipping " % e
                        errMsg += "url '%s'" % current
                        logger.critical(errMsg)
                    except httplib.InvalidURL, e:
                        errMsg = "invalid url detected (%s). skipping " % e
                        errMsg += "url '%s'" % current
                        logger.critical(errMsg)

                    if not kb.threadContinue:
                        break

                    if isinstance(content, unicode):
                        try:
                            soup = BeautifulSoup(content)
                            for tag in soup('a'):
                                if tag.get("href"):
                                    url = urlparse.urljoin(
                                        conf.url, tag.get("href"))

                                    # flag to know if we are dealing with the same target host
                                    _ = reduce(
                                        lambda x, y: x == y,
                                        map(
                                            lambda x: urlparse.urlparse(
                                                x).netloc.split(':')[0],
                                            (url, conf.url)))

                                    if conf.scope:
                                        if not re.search(
                                                conf.scope, url, re.I):
                                            continue
                                    elif not _:
                                        continue

                                    if url.split('.')[-1].lower(
                                    ) not in CRAWL_EXCLUDE_EXTENSIONS:
                                        with kb.locks.outputs:
                                            threadData.shared.deeper.add(url)
                                            if re.search(r"(.*?)\?(.+)", url):
                                                threadData.shared.outputs.add(
                                                    url)
                        except UnicodeEncodeError:  # for non-HTML files
                            pass
                        finally:
                            if conf.forms:
                                findPageForms(content, current, False, True)

                    if conf.verbose in (1, 2):
                        threadData.shared.count += 1
                        status = '%d/%d links visited (%d%s)' % (
                            threadData.shared.count, threadData.shared.length,
                            round(100.0 * threadData.shared.count /
                                  threadData.shared.length), '%')
                        dataToStdout(
                            "\r[%s] [INFO] %s" % (time.strftime("%X"), status),
                            True)
Code Example #3
File: crawler.py  Project: taopeng-life/github
                    logger.critical(errMsg)
                except httplib.InvalidURL, ex:
                    errMsg = u"检测到无效的网址(%s) " % getSafeExString(ex)
                    errMsg += u"跳过网址 '%s'" % current
                    logger.critical(errMsg)

                if not kb.threadContinue:
                    break

                if isinstance(content, unicode):
                    try:
                        match = re.search(r"(?si)<html[^>]*>(.+)</html>", content)
                        if match:
                            content = "<html>%s</html>" % match.group(1)

                        soup = BeautifulSoup(content)
                        tags = soup('a')

                        if not tags:
                            tags = re.finditer(r'(?i)<a[^>]+href="(?P<href>[^>"]+)"', content)

                        for tag in tags:
                            href = tag.get("href") if hasattr(tag, "get") else tag.group("href")

                            if href:
                                if threadData.lastRedirectURL and threadData.lastRedirectURL[0] == threadData.lastRequestUID:
                                    current = threadData.lastRedirectURL[1]
                                url = urlparse.urljoin(current, href)

                                # flag to know if we are dealing with the same target host
                                _ = checkSameHost(url, target)
Code Example #4
File: crawler.py  Project: nyuurbome/juice
        def crawlThread():
            threadData = getCurrentThreadData()

            while kb.threadContinue:
                with kb.locks.limit:
                    if threadData.shared.unprocessed:
                        current = threadData.shared.unprocessed.pop()
                        if current in visited:
                            continue
                        elif conf.crawlExclude and re.search(conf.crawlExclude, current):
                            dbgMsg = "skipping '%s'" % current
                            logger.debug(dbgMsg)
                            continue
                        else:
                            visited.add(current)
                    else:
                        break

                content = None
                try:
                    if current:
                        content = Request.getPage(url=current, crawling=True, raise404=False)[0]
                except SqlmapConnectionException as ex:
                    errMsg = "connection exception detected ('%s'). skipping " % getSafeExString(ex)
                    errMsg += "URL '%s'" % current
                    logger.critical(errMsg)
                except SqlmapSyntaxException:
                    errMsg = "invalid URL detected. skipping '%s'" % current
                    logger.critical(errMsg)
                except _http_client.InvalidURL as ex:
                    errMsg = "invalid URL detected ('%s'). skipping " % getSafeExString(ex)
                    errMsg += "URL '%s'" % current
                    logger.critical(errMsg)

                if not kb.threadContinue:
                    break

                if isinstance(content, six.text_type):
                    try:
                        match = re.search(r"(?si)<html[^>]*>(.+)</html>", content)
                        if match:
                            content = "<html>%s</html>" % match.group(1)

                        soup = BeautifulSoup(content)
                        tags = soup('a')

                        tags += re.finditer(r'(?i)\s(href|src)=["\'](?P<href>[^>"\']+)', content)
                        tags += re.finditer(r'(?i)window\.open\(["\'](?P<href>[^)"\']+)["\']', content)

                        for tag in tags:
                            href = tag.get("href") if hasattr(tag, "get") else tag.group("href")

                            if href:
                                if threadData.lastRedirectURL and threadData.lastRedirectURL[0] == threadData.lastRequestUID:
                                    current = threadData.lastRedirectURL[1]
                                url = _urllib.parse.urljoin(current, htmlUnescape(href))

                                # flag to know if we are dealing with the same target host
                                _ = checkSameHost(url, target)

                                if conf.scope:
                                    if not re.search(conf.scope, url, re.I):
                                        continue
                                elif not _:
                                    continue

                                if (extractRegexResult(r"\A[^?]+\.(?P<result>\w+)(\?|\Z)", url) or "").lower() not in CRAWL_EXCLUDE_EXTENSIONS:
                                    with kb.locks.value:
                                        threadData.shared.deeper.add(url)
                                        if re.search(r"(.*?)\?(.+)", url) and not re.search(r"\?(v=)?\d+\Z", url) and not re.search(r"(?i)\.(js|css)(\?|\Z)", url):
                                            threadData.shared.value.add(url)
                    except UnicodeEncodeError:  # for non-HTML files
                        pass
                    except ValueError:          # for non-valid links
                        pass
                    finally:
                        if conf.forms:
                            threadData.shared.formsFound |= len(findPageForms(content, current, False, True)) > 0

                if conf.verbose in (1, 2):
                    threadData.shared.count += 1
                    status = '%d/%d links visited (%d%%)' % (threadData.shared.count, threadData.shared.length, round(100.0 * threadData.shared.count / threadData.shared.length))
                    dataToStdout("\r[%s] [INFO] %s" % (time.strftime("%X"), status), True)
Code Example #5
File: page.py  Project: hashbrown1013/w3brute
def __init__(self, response):
    htmltext = response.read()
    source = io.BytesIO(htmltext)
    source.geturl = response.geturl
    self.forms = ParseForm(source)
    self.soup = BeautifulSoup(htmltext)
Code Example #6
File: page.py  Project: hashbrown1013/w3brute
class ParsePage(object):
    """
    parses the HTML
    """
    def __init__(self, response):
        htmltext = response.read()
        source = io.BytesIO(htmltext)
        source.geturl = response.geturl
        self.forms = ParseForm(source)
        self.soup = BeautifulSoup(htmltext)

    @property
    def title(self):
        """
        :return: page title
        """

        elem = self.soup.find("title")
        return str(elem.text)

    def getValidForms(self):
        """
        this function gets the form that
        leads to the website dashboard
        """

        if auth.IS_AUTHORIZATION:
            # skip...
            return

        infoMsg = "[INFO] try searching for form that goes to the website dashboard...\n"
        cetakData(infoMsg)

        try:
            for form in self.forms:
                input_controls = form.controls
                for input_elem in input_controls:
                    input_type = input_elem.type
                    # if an input of type 'password' is found,
                    # it means that form leads to the
                    # website dashboard.
                    if input_type == "password":
                        html.form = form
                        html.soup = self.soup.find("form", attrs=form.attrs)

                        raise W3bruteSkipParsingFormException

        except W3bruteSkipParsingFormException:
            infoMsg = "form that goes to the website dashboard is found"
            logger.info(infoMsg)

        else:
            criMsg = "form that goes to the website dashboard is not found. "

            if not konf.adminScanner:
                criMsg += "try using the '--admin' option to help you "
                criMsg += "find the admin login page."

            logger.critical(criMsg)
            raise W3bruteSkipTargetException

    def getTipeAutentikasi(self):
        """
        get the target's authentication type
        """

        infoMsg = "[INFO] detecting target authentication type...\n"
        cetakData(infoMsg)

        if auth.IS_AUTHORIZATION:
            infoMsg = "authentication type: %s Authorization" % repr(
                auth.type.capitalize())
            logger.info(infoMsg)

            return

        soup = html.soup

        if soup.find("input", type="text"):
            if re.search("(?i)email", str(soup)):
                auth_type = "email"
                auth.IS_EMAIL_AUTH = True
            else:
                auth_type = "standard"
                auth.IS_STANDARD_AUTH = True

        elif soup.find("input", type="email"):
            auth_type = "email"
            auth.IS_EMAIL_AUTH = True

        else:
            infoMsg = "page title %s" % repr(self.title)
            logger.info(infoMsg)

            auth_type = "web shell"
            auth.IS_WEBSHELL_AUTH = True

        infoMsg = "authentication type: %s" % repr(auth_type)
        logger.info(infoMsg)

    def getParameterForm(self):
        if auth.IS_AUTHORIZATION:
            # skip again...
            return

        infoMsg = "[INFO] find parameter(s)...\n"
        cetakData(infoMsg)

        soup = html.soup
        html.field = PyDict()

        if auth.IS_WEBSHELL_AUTH is None:
            input_elem = soup.find("input", type="text") \
                or soup.find("input", type="email")

            if not input_elem.has_key("name"):
                errMsg = "parameter(s) not found in %s" % repr(str(input_elem))
                logger.error(errMsg)

                raise W3bruteSkipTargetException

            html.field.username = input_elem.get("name")

        input_elem = soup.find("input", type="password")

        if not input_elem.has_key("name"):
            errMsg = "parameter(s) not found in %s" % repr(str(input_elem))
            logger.error(errMsg)

            raise W3bruteSkipTargetException

        html.field.password = input_elem.get("name")
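
Note: getValidForms() relies on a simple heuristic: a form containing an <input type="password"> is treated as the login form. The same idea in isolation, as a sketch with an invented HTML snippet and the bs4 package (the project itself bundles an older BeautifulSoup copy):

from bs4 import BeautifulSoup

html_doc = """
<form action="/search"><input type="text" name="q"></form>
<form action="/login.php">
  <input type="text" name="username">
  <input type="password" name="password">
</form>
"""

soup = BeautifulSoup(html_doc, "html.parser")
# the first form containing a password field is assumed to be the login form
login_form = next(
    (form for form in soup.find_all("form") if form.find("input", type="password")),
    None,
)
print(login_form["action"] if login_form else "no login form found")  # -> /login.php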
Code Example #7
File: invert88888.py  Project: taopeng-life/github
#!/usr/bin/env python
#coding=utf-8

from thirdparty.beautifulsoup.beautifulsoup import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

soup = BeautifulSoup(html_doc)
#print(soup.prettify())
print(soup.findAll('a'))
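
As a small follow-up (using the bs4 package rather than the copy bundled in the project above), the individual link targets can be pulled out of the anchor tags instead of printing the whole tags:

from bs4 import BeautifulSoup

html_doc = '<a href="http://example.com/elsie">Elsie</a> <a id="nolink">no href</a>'
soup = BeautifulSoup(html_doc, "html.parser")
# keep only <a> tags that actually carry an href attribute
links = [a["href"] for a in soup.find_all("a", href=True)]
print(links)  # -> ['http://example.com/elsie']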