def removeDynamicContent(self, page, dynamicMarks):
        """
        Remove dynamic content from the supplied page, basing the removal on
        precalculated dynamic markings.

        page is expected to be a bytes object and dynamicMarks an iterable of
        (prefix, suffix) pairs whose members are bytes or None. Everything
        between a prefix and a suffix is dropped while the markers themselves
        are kept; a None prefix anchors the cut at the start of the page and a
        None suffix anchors it at the end.

        Returns the page unchanged when it is falsy or when there are no
        markings; otherwise returns the cleaned text re-encoded with
        str.encode() defaults.
        """
        if page and len(dynamicMarks) > 0:
            # chardet reports an encoding of None when it cannot classify the
            # input; decoding with None would raise TypeError, so fall back to
            # UTF-8 (undecodable bytes are replaced anyway).
            encoding = chardet.detect(page)['encoding'] or 'utf-8'
            page = page.decode(encoding, errors='replace')
            for prefix, suffix in dynamicMarks:
                if prefix is None and suffix is None:
                    continue

                if prefix is not None:
                    prefix = prefix.decode(encoding, errors='replace')
                if suffix is not None:
                    suffix = suffix.decode(encoding, errors='replace')

                if prefix is None:
                    pattern = r'(?s)^.+{0}'.format(re.escape(suffix))
                    kept = suffix
                elif suffix is None:
                    pattern = r'(?s){0}.+$'.format(re.escape(prefix))
                    kept = prefix
                else:
                    pattern = r'(?s){0}.+{1}'.format(re.escape(prefix), re.escape(suffix))
                    kept = prefix + suffix

                # Backslashes are doubled so that re.sub inserts the markers
                # literally instead of reading them as group references.
                page = re.sub(pattern, kept.replace('\\', r'\\'), page)

            page = page.encode()

        return page
    def removeDynamicContent(self, page, dynamicMarks):
        """
        Remove dynamic content from the supplied page, basing the removal on
        precalculated dynamic markings.

        page is expected to be a bytes object and dynamicMarks an iterable of
        (prefix, suffix) pairs whose members are bytes or None. Everything
        between a prefix and a suffix is dropped while the markers themselves
        are kept; a None prefix anchors the cut at the start of the page and a
        None suffix anchors it at the end.

        Returns the page unchanged when it is falsy. Otherwise the result is
        the decoded (text) page; this variant does not encode it back to
        bytes.
        """
        if page:
            # chardet reports an encoding of None when it cannot classify the
            # input; decoding with None would raise TypeError, so fall back to
            # UTF-8 (undecodable bytes are replaced anyway).
            encoding = chardet.detect(page)['encoding'] or 'utf-8'
            page = page.decode(encoding, errors='replace')
            for prefix, suffix in dynamicMarks:
                if prefix is None and suffix is None:
                    continue

                if prefix is not None:
                    prefix = prefix.decode(encoding, errors='replace')
                if suffix is not None:
                    suffix = suffix.decode(encoding, errors='replace')

                if prefix is None:
                    pattern = r'(?s)^.+{0}'.format(re.escape(suffix))
                    kept = suffix
                elif suffix is None:
                    pattern = r'(?s){0}.+$'.format(re.escape(prefix))
                    kept = prefix
                else:
                    pattern = r'(?s){0}.+{1}'.format(re.escape(prefix),
                                                     re.escape(suffix))
                    kept = prefix + suffix

                # Backslashes are doubled so that re.sub inserts the markers
                # literally instead of reading them as group references.
                page = re.sub(pattern, kept.replace('\\', r'\\'), page)

        return page
Exemple #3
0
def getHeuristicCharEncoding(page):
    """
    Returns page encoding charset detected by usage of heuristics
    Reference: http://chardet.feedparser.org/docs/

    The value is the "encoding" entry of the mapping returned by detect(),
    which may be falsy (e.g. None) when nothing could be recognised.
    """
    retVal = detect(page)["encoding"]

    # Only announce a charset when one was actually found; otherwise the
    # message would misleadingly read "detected web page charset 'None'".
    if retVal:
        infoMsg = "heuristics detected web page charset '%s'" % retVal
        singleTimeLogMessage(infoMsg, logging.INFO, retVal)

    return retVal
Exemple #4
0
def char_convert(content, in_enc=["ASCII", "GB2312", "GBK", "gb18030"], out_enc="UTF-8"):
    """
    Re-encode content to out_enc when its detected charset is listed in in_enc.

    The charset is taken from chardet.detect(content) and compared
    case-insensitively against every entry of in_enc. On a match the content
    is decoded with the detected charset and encoded to out_enc; otherwise
    the content is returned as given. If an IOError is raised along the way
    the empty string is returned.

    NOTE: the in_enc default is a list shared between calls; it is only read
    here, never mutated.
    """
    rs_content = ""
    try:
        coding = chardet.detect(content).get("encoding")
        if coding and any(k and k.upper() == coding.upper() for k in in_enc):
            rs_content = content.decode(coding).encode(out_enc)
        # Fall back to the untouched input when no conversion happened (or
        # when the conversion produced an empty result).
        rs_content = content if not rs_content else rs_content
    except IOError:
        # Deliberate best effort: I/O problems leave the result empty.
        pass
    # The computed value was previously discarded; hand it to the caller.
    return rs_content
Exemple #5
0
def getHeuristicCharEncoding(page):
    """
    Returns page encoding charset detected by usage of heuristics
    Reference: http://chardet.feedparser.org/docs/

    The returned value is detect(page)["encoding"]; it can be falsy when the
    detector does not recognise the content.
    """
    charset = detect(page)["encoding"]

    # Skip the log entry when detection failed, so that no
    # "detected web page charset 'None'" message is produced.
    if charset:
        singleTimeLogMessage(
            "heuristics detected web page charset '%s'" % charset,
            logging.INFO,
            charset,
        )

    return charset
Exemple #6
0
    def _detectEncodeType(self, content):
        result = {}

        for key, value in self._bomList.iteritems():
            if content.startswith(value):
                result['encoding'] = key + "-bom"
                result['confidence'] = 0.80
                break
        else:
            result = chardet.detect(content)

        return result
Exemple #7
0
    def _detectEncodeType(self, content):
        result = {}

        for key,value in self._bomList.iteritems():
            if content.startswith(value):
                result['encoding'] = key + "-bom"
                result['confidence'] = 0.80
                break
        else:
            result = chardet.detect(content)

        return result
def getHeuristicCharEncoding(page):
    """
    Returns page encoding charset detected by usage of heuristics
    Reference: http://chardet.feedparser.org/docs/

    Results are memoised in kb.cache.encoding under hash(page); a falsy cached
    value is treated as missing and triggers a fresh detect() call.
    """

    cacheKey = hash(page)

    charset = kb.cache.encoding.get(cacheKey)
    if not charset:
        charset = detect(page)["encoding"]
    kb.cache.encoding[cacheKey] = charset

    if not charset:
        return charset

    singleTimeLogMessage("heuristics detected web page charset '%s'" % charset, logging.INFO, charset)
    return charset
Exemple #9
0
def getHeuristicCharEncoding(page):
    """
    Returns page encoding charset detected by usage of heuristics
    Reference: http://chardet.feedparser.org/docs/

    The detected charset is stored in kb.cache.encoding keyed by hash(page)
    and reused on later calls as long as the stored value is truthy.
    """

    pageHash = hash(page)
    cached = kb.cache.encoding.get(pageHash)

    encoding = cached if cached else detect(page)["encoding"]
    kb.cache.encoding[pageHash] = encoding

    if encoding:
        message = "heuristics detected web page charset '%s'" % encoding
        singleTimeLogMessage(message, logging.INFO, encoding)

    return encoding
Exemple #10
0
def getHeuristicCharEncoding(page):
    """
    Returns page encoding charset detected by usage of heuristics
    Reference: http://chardet.feedparser.org/docs/

    The charset is cached in kb.cache.encoding under hash(page); a truthy
    cached entry short-circuits the detect() call.
    """

    cacheKey = hash(page)

    encoding = kb.cache.encoding.get(cacheKey)
    if not encoding:
        encoding = detect(page)["encoding"]
    kb.cache.encoding[cacheKey] = encoding

    if encoding:
        singleTimeLogMessage(u"启发式检测网页字符集是'%s'" % encoding, logging.INFO, encoding)

    return encoding
Exemple #11
0
    def detect(self, size=2048):
        """
        Infer the encoding of the file named by self.fileName.

        Reads at most size bytes from the start of the file in binary mode.
        If the data begins with one of the byte-order marks in self._bomList
        (a mapping of encoding name to BOM prefix) the result is
        {'encoding': '<name>-bom', 'confidence': 0.80}; otherwise the result
        of chardet.detect() on the data read is returned.
        """
        # The context manager closes the handle even if read() raises; the
        # file was previously left open until garbage collection.
        with open(self.fileName, "rb") as handle:
            content = handle.read(size)

        # .items() instead of the Python 2-only .iteritems().
        for key, value in self._bomList.items():
            if content.startswith(value):
                return {'encoding': key + "-bom", 'confidence': 0.80}

        return chardet.detect(content)
Exemple #12
0
    def detect(self, size=2048):
        """
        Guess the encoding of the file at self.fileName from its first bytes.

        Up to size bytes are read in binary mode. A leading byte-order mark
        listed in self._bomList (encoding name -> BOM prefix) produces a
        result with encoding '<name>-bom' and confidence 0.80; without a BOM
        match the decision is delegated to chardet.detect().
        """
        # Close the file deterministically instead of leaking the handle.
        with open(self.fileName, "rb") as stream:
            content = stream.read(size)

        result = dict()
        # dict.iteritems() does not exist on Python 3; items() is portable.
        for key, value in self._bomList.items():
            if content.startswith(value):
                result['encoding'] = key + "-bom"
                result['confidence'] = 0.80
                break
        else:
            result = chardet.detect(content)

        return result
Exemple #13
0
def getHeuristicCharEncoding(page):
    """
    Returns page encoding charset detected by usage of heuristics
    Reference: https://chardet.readthedocs.io/en/latest/usage.html

    Only the first HEURISTIC_PAGE_SIZE_THRESHOLD items of the page are passed
    to detect(). The result is cached in kb.cache.encoding under hash(page),
    and a log message is emitted only when the charset equals
    UNICODE_ENCODING, ignoring case and hyphens.

    >>> getHeuristicCharEncoding(b"<html></html>")
    'ascii'
    """

    cacheKey = hash(page)

    charset = kb.cache.encoding.get(cacheKey)
    if not charset:
        charset = detect(page[:HEURISTIC_PAGE_SIZE_THRESHOLD])["encoding"]
    kb.cache.encoding[cacheKey] = charset

    if charset:
        normalized = charset.lower().replace('-', "")
        if normalized == UNICODE_ENCODING.lower().replace('-', ""):
            singleTimeLogMessage("heuristics detected web page charset '%s'" % charset, logging.INFO, charset)

    return charset
Exemple #14
0
def getHeuristicCharEncoding(page):
    """
    Returns page encoding charset detected by usage of heuristics

    The outcome is cached in kb.cache.encoding keyed by hash(page); a truthy
    cached value is returned without calling detect() again.

    Reference: https://chardet.readthedocs.io/en/latest/usage.html

    >>> getHeuristicCharEncoding(b"<html></html>")
    'ascii'
    """

    pageHash = hash(page)
    known = kb.cache.encoding.get(pageHash)

    encoding = known if known else detect(page)["encoding"]
    kb.cache.encoding[pageHash] = encoding

    if not encoding:
        return encoding

    message = "heuristics detected web page charset '%s'" % encoding
    singleTimeLogMessage(message, logging.INFO, encoding)
    return encoding
Exemple #15
0
    def removeDynamicContent(self, page, dynamicMarks):
        """
        Remove dynamic content from the supplied page, basing the removal on
        precalculated dynamic markings.

        page is expected to be a bytes object and dynamicMarks an iterable of
        (prefix, suffix) pairs whose members are bytes or None. Everything
        between a prefix and a suffix is dropped while the markers themselves
        are kept; a None prefix anchors the cut at the start of the page and a
        None suffix anchors it at the end.

        Returns the page unchanged when it is falsy or when there are no
        markings; otherwise returns the cleaned text re-encoded with
        str.encode() defaults.
        """
        if page and len(dynamicMarks) > 0:
            # chardet reports an encoding of None when it cannot classify the
            # input; decoding with None would raise TypeError, so fall back to
            # UTF-8 (undecodable bytes are replaced anyway).
            encoding = chardet.detect(page)["encoding"] or "utf-8"
            page = page.decode(encoding, errors="replace")
            for prefix, suffix in dynamicMarks:
                if prefix is None and suffix is None:
                    continue

                if prefix is not None:
                    prefix = prefix.decode(encoding, errors="replace")
                if suffix is not None:
                    suffix = suffix.decode(encoding, errors="replace")

                if prefix is None:
                    pattern = r"(?s)^.+{0}".format(re.escape(suffix))
                    kept = suffix
                elif suffix is None:
                    pattern = r"(?s){0}.+$".format(re.escape(prefix))
                    kept = prefix
                else:
                    pattern = r"(?s){0}.+{1}".format(
                        re.escape(prefix), re.escape(suffix)
                    )
                    kept = prefix + suffix

                # Backslashes are doubled so that re.sub inserts the markers
                # literally instead of reading them as group references.
                page = re.sub(pattern, kept.replace("\\", r"\\"), page)

            page = page.encode()

        return page
Exemple #16
0
 def detect(self):
     """
     Run chardet over the concatenation of the second element of every item
     produced by self._autoPreDecode() and return chardet's result.
     """
     pieces = (entry[1] for entry in self._autoPreDecode())
     return chardet.detect("".join(pieces))
Exemple #17
0
 def detect(self):
     '''
     Infer the encoding type of the non-ASCII string.

     Joins the second element of each item from self._autoPreDecode() and
     returns what chardet.detect() reports for the joined value.
     '''
     fragments = (item[1] for item in self._autoPreDecode())
     joined = "".join(fragments)
     return chardet.detect(joined)
Exemple #18
0
 def detect(self):
     """
     Return chardet.detect() for the string built by joining element [1] of
     each item yielded by self._autoPreDecode().
     """
     collected = []
     for part in self._autoPreDecode():
         collected.append(part[1])
     return chardet.detect("".join(collected))
Exemple #19
0
 def detect(self):
     '''
     Infer the encoding type of the non-ASCII string.

     The input to chardet.detect() is the concatenation of element [1] of
     every item returned by self._autoPreDecode().
     '''
     decoded = self._autoPreDecode()
     rawText = "".join(chunk[1] for chunk in decoded)
     return chardet.detect(rawText)
Exemple #20
0
 def chardet_dammit(s):
     """
     Return the encoding chardet detects for s, or None when s is already a
     str (text needs no charset detection).
     """
     return None if isinstance(s, str) else chardet.detect(s)['encoding']