Esempio n. 1
0
    # can't do for all responses because we need to support binary files too
    if not isinstance(page, unicode) and "text/" in contentType:
        if kb.heuristicMode:
            kb.pageEncoding = kb.pageEncoding or checkCharEncoding(getHeuristicCharEncoding(page))
            page = getUnicode(page, kb.pageEncoding)
        else:
            # e.g. &#195;&#235;&#224;&#226;&#224;
            if "&#" in page:
                page = re.sub(r"&#(\d{1,3});", lambda _: chr(int(_.group(1))) if int(_.group(1)) < 256 else _.group(0), page)

            # e.g. %20%28%29
            if "%" in page:
                page = re.sub(r"%([0-9a-fA-F]{2})", lambda _: _.group(1).decode("hex"), page)

            # e.g. &amp;
            page = re.sub(r"&([^;]+);", lambda _: chr(htmlEntities[_.group(1)]) if htmlEntities.get(_.group(1), 256) < 256 else _.group(0), page)

            kb.pageEncoding = kb.pageEncoding or checkCharEncoding(getHeuristicCharEncoding(page))

            if (kb.pageEncoding or "").lower() == "utf-8-sig":
                kb.pageEncoding = "utf-8"
                if page and page.startswith("\xef\xbb\xbf"):  # Reference: https://docs.python.org/2/library/codecs.html (Note: noticed problems when "utf-8-sig" is left to Python for handling)
                    page = page[3:]

            page = getUnicode(page, kb.pageEncoding)

            # e.g. &#8217;&#8230;&#8482;
            if "&#" in page:
                def _(match):
                    retVal = match.group(0)
                    try:
Esempio n. 2
0
    # can't do for all responses because we need to support binary files too
    if contentType and not isinstance(page, unicode) and "text/" in contentType.lower():
        if kb.heuristicMode:
            kb.pageEncoding = kb.pageEncoding or checkCharEncoding(getHeuristicCharEncoding(page))
            page = getUnicode(page, kb.pageEncoding)
        else:
            # e.g. &#195;&#235;&#224;&#226;&#224;
            if "&#" in page:
                page = re.sub(r"&#(\d{1,3});", lambda _: chr(int(_.group(1))) if int(_.group(1)) < 256 else _.group(0), page)

            # e.g. %20%28%29
            if "%" in page:
                page = re.sub(r"%([0-9a-fA-F]{2})", lambda _: _.group(1).decode("hex"), page)

            # e.g. &amp;
            page = re.sub(r"&([^;]+);", lambda _: chr(htmlEntities[_.group(1)]) if htmlEntities.get(_.group(1), 256) < 256 else _.group(0), page)

            kb.pageEncoding = kb.pageEncoding or checkCharEncoding(getHeuristicCharEncoding(page))
            page = getUnicode(page, kb.pageEncoding)

            # e.g. &#8217;&#8230;&#8482;
            if "&#" in page:
                def _(match):
                    retVal = match.group(0)
                    try:
                        retVal = unichr(int(match.group(1)))
                    except ValueError:
                        pass
                    return retVal
                page = re.sub(r"&#(\d+);", _, page)
Esempio n. 3
0
        else:
            # e.g. &#195;&#235;&#224;&#226;&#224;
            if "&#" in page:
                page = re.sub(
                    r"&#(\d{1,3});", lambda _: chr(int(_.group(1)))
                    if int(_.group(1)) < 256 else _.group(0), page)

            # e.g. %20%28%29
            if "%" in page:
                page = re.sub(r"%([0-9a-fA-F]{2})",
                              lambda _: _.group(1).decode("hex"), page)

            # e.g. &amp;
            page = re.sub(
                r"&([^;]+);", lambda _: chr(htmlEntities[_.group(1)])
                if htmlEntities.get(_.group(1), 256) < 256 else _.group(0),
                page)

            kb.pageEncoding = kb.pageEncoding or checkCharEncoding(
                getHeuristicCharEncoding(page))
            page = getUnicode(page, kb.pageEncoding)

            # e.g. &#8217;&#8230;&#8482;
            if "&#" in page:

                def _(match):
                    retVal = match.group(0)
                    try:
                        retVal = unichr(int(match.group(1)))
                    except ValueError:
                        pass
Esempio n. 4
0
def decodePage(page, contentEncoding, contentType):
    """
    Decode compressed/charset HTTP response

    Steps performed, in order:
      1. An empty page (or, with conf.nullConnection set, a page shorter than
         2 items) is handed straight to getUnicode() and returned.
      2. If the declared content encoding is gzip/x-gzip/deflate the body is
         decompressed (None is returned when kb.pageCompress is switched off).
      3. kb.pageEncoding is set from conf.encoding when given, otherwise from
         the charset declared in the Content-Type header and/or the charset
         found by META_CHARSET_REGEX (used only if they do not disagree).
      4. For binary pages with a "text/" content type, numeric/named HTML
         entities and %XX escapes that map to a single byte are resolved, the
         page is converted with getUnicode(), and the remaining (wide)
         entities are resolved on the unicode result.

    page            -- response body
    contentEncoding -- Content-Encoding header value (anything without a
                       .lower() method is treated as "")
    contentType     -- Content-Type header value (same treatment)

    Returns the decoded page, or None as described in step 2.

    Raises SqlmapCompressionException when decompression fails and the body
    does not contain "<html"; kb.pageCompress is turned off before raising.
    """

    if not page or (conf.nullConnection and len(page) < 2):
        return getUnicode(page)

    if hasattr(contentEncoding, "lower"):
        contentEncoding = contentEncoding.lower()
    else:
        contentEncoding = ""

    if hasattr(contentType, "lower"):
        contentType = contentType.lower()
    else:
        contentType = ""

    if contentEncoding in ("gzip", "x-gzip", "deflate"):
        if not kb.pageCompress:
            return None

        try:
            if contentEncoding == "deflate":
                data = io.BytesIO(zlib.decompress(page, -15))  # Reference: http://stackoverflow.com/questions/1089662/python-inflate-and-deflate-implementations
            else:
                data = gzip.GzipFile("", "rb", 9, io.BytesIO(page))
                # The gzip trailer (ISIZE) is an unsigned 32-bit little-endian value; reading it
                # as signed would turn sizes >= 2 GiB negative and let them slip past the check
                size = struct.unpack("<L", page[-4:])[0]  # Reference: http://pydoc.org/get.cgi/usr/local/lib/python2.5/gzip.py
                if size > MAX_CONNECTION_TOTAL_SIZE:
                    raise Exception("size too large")

            page = data.read()
        except Exception as ex:
            # bytes literal so that the membership test also works on a Python 3 bytes body
            if b"<html" not in page:  # in some cases, invalid "Content-Encoding" appears for plain HTML (should be ignored)
                errMsg = "detected invalid data for declared content "
                errMsg += "encoding '%s' ('%s')" % (contentEncoding, getSafeExString(ex))
                singleTimeLogMessage(errMsg, logging.ERROR)

                warnMsg = "turning off page compression"
                singleTimeWarnMessage(warnMsg)

                kb.pageCompress = False
                raise SqlmapCompressionException

    if not conf.encoding:
        httpCharset, metaCharset = None, None

        # Reference: http://stackoverflow.com/questions/1020892/python-urllib2-read-to-unicode
        if contentType.find("charset=") != -1:
            httpCharset = checkCharEncoding(contentType.split("charset=")[-1])

        metaCharset = checkCharEncoding(extractRegexResult(META_CHARSET_REGEX, page))

        # accept the declared charset when only one source declares it or when both agree
        if (any((httpCharset, metaCharset)) and not all((httpCharset, metaCharset))) or (httpCharset == metaCharset and all((httpCharset, metaCharset))):
            kb.pageEncoding = httpCharset or metaCharset  # Reference: http://bytes.com/topic/html-css/answers/154758-http-equiv-vs-true-header-has-precedence
            debugMsg = "declared web page charset '%s'" % kb.pageEncoding
            singleTimeLogMessage(debugMsg, logging.DEBUG, debugMsg)
        else:
            kb.pageEncoding = None
    else:
        kb.pageEncoding = conf.encoding

    # can't do for all responses because we need to support binary files too
    if isinstance(page, six.binary_type) and "text/" in contentType:
        def _byteEntity(match):
            # Replace a named entity whose code point fits into one byte; leave anything else as is
            name = match.group(1)
            if not isinstance(name, str):
                # Python 3: the regex group is bytes while htmlEntities is presumably keyed by text
                # names (the unicode pass below looks it up with text) -- latin-1 never fails to decode
                name = name.decode("latin-1")
            code = htmlEntities.get(name, 256)
            return six.int2byte(code) if code < 256 else match.group(0)

        # e.g. &#x9;&#195;&#235;&#224;&#226;&#224;
        if b"&#" in page:
            # single hex digit is left-padded with a (bytes) zero before decoding
            page = re.sub(br"&#x([0-9a-f]{1,2});", lambda _: decodeHex(_.group(1).rjust(2, b"0")), page)
            # replacement has to be bytes because the pattern is bytes
            page = re.sub(br"&#(\d{1,3});", lambda _: six.int2byte(int(_.group(1))) if int(_.group(1)) < 256 else _.group(0), page)

        # e.g. %20%28%29
        if b"%" in page:
            page = re.sub(br"%([0-9a-fA-F]{2})", lambda _: decodeHex(_.group(1)), page)

        # e.g. &amp;
        page = re.sub(br"&([^;]+);", _byteEntity, page)

        kb.pageEncoding = kb.pageEncoding or checkCharEncoding(getHeuristicCharEncoding(page))

        if (kb.pageEncoding or "").lower() == "utf-8-sig":
            kb.pageEncoding = "utf-8"
            if page and page.startswith(b"\xef\xbb\xbf"):  # Reference: https://docs.python.org/2/library/codecs.html (Note: noticed problems when "utf-8-sig" is left to Python for handling)
                page = page[3:]

        page = getUnicode(page, kb.pageEncoding)

        # e.g. &#8217;&#8230;&#8482;
        if "&#" in page:
            def _(match):
                retVal = match.group(0)
                try:
                    # six.unichr is the builtin unichr on Python 2 and chr on Python 3
                    retVal = six.unichr(int(match.group(1)))
                except (ValueError, OverflowError):
                    pass
                return retVal
            page = re.sub(r"&#(\d+);", _, page)

        # e.g. &zeta;
        page = re.sub(r"&([^;]+);", lambda _: six.unichr(htmlEntities[_.group(1)]) if htmlEntities.get(_.group(1), 0) > 255 else _.group(0), page)

    return page
Esempio n. 5
0
            page = getUnicode(page, kb.pageEncoding)
        else:
            # e.g. &#195;&#235;&#224;&#226;&#224;
            if "&#" in page:
                page = re.sub(
                    r"&#(\d{1,3});", lambda _: chr(int(_.group(1))) if int(_.group(1)) < 256 else _.group(0), page
                )

            # e.g. %20%28%29
            if "%" in page:
                page = re.sub(r"%([0-9a-fA-F]{2})", lambda _: _.group(1).decode("hex"), page)

            # e.g. &amp;
            page = re.sub(
                r"&([^;]+);",
                lambda _: chr(htmlEntities[_.group(1)]) if htmlEntities.get(_.group(1), 256) < 256 else _.group(0),
                page,
            )

            kb.pageEncoding = kb.pageEncoding or checkCharEncoding(getHeuristicCharEncoding(page))
            page = getUnicode(page, kb.pageEncoding)

            # e.g. &#8217;&#8230;&#8482;
            if "&#" in page:

                def _(match):
                    retVal = match.group(0)
                    try:
                        retVal = unichr(int(match.group(1)))
                    except ValueError:
                        pass
Esempio n. 6
0
def decodePage(page, contentEncoding, contentType):
    """
    Decode compressed/charset HTTP response

    The body is optionally decompressed (gzip/x-gzip/deflate), the page
    charset is determined and stored in kb.pageEncoding, and binary pages
    served with a "text/" content type get their single-byte HTML entities
    and %XX escapes resolved before being converted with getUnicode();
    entities above code point 255 are resolved afterwards on the unicode text.

    page            -- response body
    contentEncoding -- Content-Encoding header value ("" is assumed when the
                       value has no .lower() method)
    contentType     -- Content-Type header value (same treatment)

    Returns the decoded page. Returns getUnicode(page) right away for an
    empty page (or, with conf.nullConnection set, a page shorter than 2
    items) and None for a compressed body while kb.pageCompress is off.

    Raises SqlmapCompressionException when decompression fails and the body
    does not contain "<html"; kb.pageCompress is turned off before raising.
    """

    # local import: the original block relied on the Python 2 only "hex" str codec
    import binascii

    if not page or (conf.nullConnection and len(page) < 2):
        return getUnicode(page)

    contentEncoding = contentEncoding.lower() if hasattr(contentEncoding, "lower") else ""
    contentType = contentType.lower() if hasattr(contentType, "lower") else ""

    if contentEncoding in ("gzip", "x-gzip", "deflate"):
        if not kb.pageCompress:
            return None

        try:
            if contentEncoding == "deflate":
                data = io.BytesIO(zlib.decompress(page, -15))  # Reference: http://stackoverflow.com/questions/1089662/python-inflate-and-deflate-implementations
            else:
                data = gzip.GzipFile("", "rb", 9, io.BytesIO(page))
                # ISIZE in the gzip trailer is unsigned; a signed read would make declared
                # sizes >= 2 GiB negative and therefore bypass the limit below
                size = struct.unpack("<L", page[-4:])[0]  # Reference: http://pydoc.org/get.cgi/usr/local/lib/python2.5/gzip.py
                if size > MAX_CONNECTION_TOTAL_SIZE:
                    raise Exception("size too large")

            page = data.read()
        except Exception as ex:
            # bytes literal keeps the check valid for a Python 3 bytes body
            if b"<html" not in page:  # in some cases, invalid "Content-Encoding" appears for plain HTML (should be ignored)
                errMsg = "detected invalid data for declared content "
                errMsg += "encoding '%s' ('%s')" % (contentEncoding, getSafeExString(ex))
                singleTimeLogMessage(errMsg, logging.ERROR)

                warnMsg = "turning off page compression"
                singleTimeWarnMessage(warnMsg)

                kb.pageCompress = False
                raise SqlmapCompressionException

    if conf.encoding:
        kb.pageEncoding = conf.encoding
    else:
        httpCharset, metaCharset = None, None

        # Reference: http://stackoverflow.com/questions/1020892/python-urllib2-read-to-unicode
        if contentType.find("charset=") != -1:
            httpCharset = checkCharEncoding(contentType.split("charset=")[-1])

        metaCharset = checkCharEncoding(extractRegexResult(META_CHARSET_REGEX, page))

        # the declaration is trusted when exactly one source provides it or both provide the same one
        if (any((httpCharset, metaCharset)) and not all((httpCharset, metaCharset))) or (httpCharset == metaCharset and all((httpCharset, metaCharset))):
            kb.pageEncoding = httpCharset or metaCharset  # Reference: http://bytes.com/topic/html-css/answers/154758-http-equiv-vs-true-header-has-precedence
            debugMsg = "declared web page charset '%s'" % kb.pageEncoding
            singleTimeLogMessage(debugMsg, logging.DEBUG, debugMsg)
        else:
            kb.pageEncoding = None

    # can't do for all responses because we need to support binary files too
    if isinstance(page, six.binary_type) and "text/" in contentType:
        def _numericToByte(match):
            # decimal character reference -> single byte (only when it fits into one)
            code = int(match.group(1))
            return six.int2byte(code) if code < 256 else match.group(0)

        def _namedToByte(match):
            # named entity -> single byte (only when its code point fits into one)
            name = match.group(1)
            if not isinstance(name, str):
                # Python 3: group is bytes; htmlEntities is presumably keyed by text names
                # (the unicode pass below queries it with text) -- latin-1 decoding cannot fail
                name = name.decode("latin-1")
            code = htmlEntities.get(name, 256)
            return six.int2byte(code) if code < 256 else match.group(0)

        # e.g. &#x9;&#195;&#235;&#224;&#226;&#224;
        if b"&#" in page:
            # a lone hex digit is zero-padded to a full byte before unhexlify
            page = re.sub(br"&#x([0-9a-f]{1,2});", lambda _: binascii.unhexlify(_.group(1).rjust(2, b"0")), page)
            page = re.sub(br"&#(\d{1,3});", _numericToByte, page)

        # e.g. %20%28%29
        if b"%" in page:
            page = re.sub(br"%([0-9a-fA-F]{2})", lambda _: binascii.unhexlify(_.group(1)), page)

        # e.g. &amp;
        page = re.sub(br"&([^;]+);", _namedToByte, page)

        kb.pageEncoding = kb.pageEncoding or checkCharEncoding(getHeuristicCharEncoding(page))

        if (kb.pageEncoding or "").lower() == "utf-8-sig":
            kb.pageEncoding = "utf-8"
            if page and page.startswith(b"\xef\xbb\xbf"):  # Reference: https://docs.python.org/2/library/codecs.html (Note: noticed problems when "utf-8-sig" is left to Python for handling)
                page = page[3:]

        page = getUnicode(page, kb.pageEncoding)

        # e.g. &#8217;&#8230;&#8482;
        if "&#" in page:
            def _(match):
                retVal = match.group(0)
                try:
                    # six.unichr maps to unichr on Python 2 and chr on Python 3
                    retVal = six.unichr(int(match.group(1)))
                except (ValueError, OverflowError):
                    pass
                return retVal
            page = re.sub(r"&#(\d+);", _, page)

        # e.g. &zeta;
        page = re.sub(r"&([^;]+);", lambda _: six.unichr(htmlEntities[_.group(1)]) if htmlEntities.get(_.group(1), 0) > 255 else _.group(0), page)

    return page