def removeDynamicContent(self, page, dynamicMarks):
    """
    Remove dynamic content from the supplied page, based on precalculated
    dynamic markings.

    Each marking is a ``(prefix, suffix)`` pair of byte strings, either of
    which may be ``None``. The text between a prefix and a suffix is dropped
    while the markers themselves are kept. A missing prefix means "from the
    start of the page"; a missing suffix means "up to the end of the page".

    :param page: raw page content as bytes; may be empty or ``None``
    :param dynamicMarks: sequence of ``(prefix, suffix)`` pairs; may be empty
        or ``None``
    :return: the cleaned page re-encoded to bytes, or ``page`` itself
        (untouched) when there is no page or no markings
    """
    if not page or not dynamicMarks:
        return page

    # chardet reports an encoding of None when it cannot make a guess; fall
    # back to UTF-8 so decode() is never handed None (which is a TypeError).
    encoding = chardet.detect(page)['encoding'] or 'utf-8'
    page = page.decode(encoding, errors='replace')

    for prefix, suffix in dynamicMarks:
        if prefix is None and suffix is None:
            continue

        if prefix is not None:
            prefix = prefix.decode(encoding, errors='replace')
        if suffix is not None:
            suffix = suffix.decode(encoding, errors='replace')

        # (?s) lets "." span newlines; ".+" is greedy, so one substitution
        # reaches from the first prefix to the last suffix.
        if prefix is None:
            pattern = r'(?s)^.+' + re.escape(suffix)
            kept = suffix
        elif suffix is None:
            pattern = r'(?s)' + re.escape(prefix) + r'.+$'
            kept = prefix
        else:
            pattern = r'(?s)' + re.escape(prefix) + r'.+' + re.escape(suffix)
            kept = prefix + suffix

        # A callable replacement is inserted verbatim, so backslashes in the
        # markers need no manual escaping.
        page = re.sub(pattern, lambda _match, kept=kept: kept, page)

    # NOTE(review): encode() without arguments uses the default codec (UTF-8
    # on Python 3), not the detected one - kept as-is for callers.
    page = page.encode()
    return page
def removeDynamicContent(self, page, dynamicMarks):
    """
    Remove dynamic content from the supplied page, based on precalculated
    dynamic markings.

    Each marking is a ``(prefix, suffix)`` pair of byte strings, either of
    which may be ``None``. The text between a prefix and a suffix is dropped
    while the markers themselves are kept. A missing prefix means "from the
    start of the page"; a missing suffix means "up to the end of the page".

    :param page: raw page content as bytes; may be empty or ``None``
    :param dynamicMarks: iterable of ``(prefix, suffix)`` pairs; may be empty
        or ``None``
    :return: the decoded (text) page with dynamic regions removed, or
        ``page`` itself when it is empty or ``None``
    """
    if not page:
        return page

    # chardet reports an encoding of None when it cannot make a guess; fall
    # back to UTF-8 so decode() is never handed None (which is a TypeError).
    encoding = chardet.detect(page)['encoding'] or 'utf-8'
    page = page.decode(encoding, errors='replace')

    # "or ()" tolerates a missing markings collection instead of crashing
    for prefix, suffix in dynamicMarks or ():
        if prefix is None and suffix is None:
            continue

        if prefix is not None:
            prefix = prefix.decode(encoding, errors='replace')
        if suffix is not None:
            suffix = suffix.decode(encoding, errors='replace')

        # (?s) lets "." span newlines; ".+" is greedy, so one substitution
        # reaches from the first prefix to the last suffix.
        if prefix is None:
            pattern = r'(?s)^.+' + re.escape(suffix)
            kept = suffix
        elif suffix is None:
            pattern = r'(?s)' + re.escape(prefix) + r'.+$'
            kept = prefix
        else:
            pattern = r'(?s)' + re.escape(prefix) + r'.+' + re.escape(suffix)
            kept = prefix + suffix

        # A callable replacement is inserted verbatim, so backslashes in the
        # markers need no manual escaping.
        page = re.sub(pattern, lambda _match, kept=kept: kept, page)

    return page
def getHeuristicCharEncoding(page):
    """
    Returns page encoding charset detected by usage of heuristics

    The detected charset is reported once through singleTimeLogMessage().

    Reference: http://chardet.feedparser.org/docs/

    :param page: raw page content handed to detect()
    :return: value of the "encoding" entry of the detection result
    """

    retVal = detect(page)["encoding"]

    # Only report an actual detection; an empty result would otherwise be
    # logged as the misleading "detected web page charset 'None'"
    if retVal:
        infoMsg = "heuristics detected web page charset '%s'" % retVal
        singleTimeLogMessage(infoMsg, logging.INFO, retVal)

    return retVal
def char_convert(content, in_enc=("ASCII", "GB2312", "GBK", "gb18030"), out_enc="UTF-8"):
    """
    Re-encode ``content`` to ``out_enc`` when its detected charset is listed
    in ``in_enc``.

    The charset is guessed with ``chardet.detect``; the comparison against
    ``in_enc`` is case-insensitive. Content whose charset is not detected or
    not listed is returned unchanged.

    :param content: raw byte string to convert; empty / ``None`` is returned
        as-is
    :param in_enc: charset names eligible for conversion
    :param out_enc: target encoding
    :return: the converted bytes, or the original ``content`` when no
        conversion applies
    """
    if not content:
        # Nothing to detect or convert
        return content

    try:
        coding = chardet.detect(content).get("encoding")
        if coding:
            detected = coding.upper()
            if any(name and name.upper() == detected for name in in_enc):
                return content.decode(coding).encode(out_enc)
    except IOError:
        # Best effort, as in the original: on I/O failure hand back the input
        pass

    return content
def _detectEncodeType(self, content): result = {} for key, value in self._bomList.iteritems(): if content.startswith(value): result['encoding'] = key + "-bom" result['confidence'] = 0.80 break else: result = chardet.detect(content) return result
def _detectEncodeType(self, content): result = {} for key,value in self._bomList.iteritems(): if content.startswith(value): result['encoding'] = key + "-bom" result['confidence'] = 0.80 break else: result = chardet.detect(content) return result
def getHeuristicCharEncoding(page):
    """
    Heuristically determine the charset of a page

    Results are remembered in kb.cache.encoding under hash(page); a truthy
    cached value is reused instead of running detect() again. A truthy
    result is reported through singleTimeLogMessage().

    Reference: http://chardet.feedparser.org/docs/
    """

    pageHash = hash(page)

    charset = kb.cache.encoding.get(pageHash)
    if not charset:
        # Nothing usable cached for this page - run the detection
        charset = detect(page)["encoding"]

    kb.cache.encoding[pageHash] = charset

    if charset:
        singleTimeLogMessage("heuristics detected web page charset '%s'" % charset, logging.INFO, charset)

    return charset
def getHeuristicCharEncoding(page):
    """
    Returns the page encoding charset detected by usage of heuristics

    The result is stored in kb.cache.encoding under hash(page); a truthy
    cached value short-circuits the call to detect(). A truthy result is
    reported through singleTimeLogMessage().

    Reference: http://chardet.feedparser.org/docs/
    """

    cacheKey = hash(page)

    encoding = kb.cache.encoding.get(cacheKey)
    if not encoding:
        # No usable cached value - fall back to detection
        encoding = detect(page)["encoding"]

    kb.cache.encoding[cacheKey] = encoding

    if encoding:
        message = u"启发式检测网页字符集是'%s'" % encoding
        singleTimeLogMessage(message, logging.INFO, encoding)

    return encoding
def detect(self, size=2048):
    """
    Infer the encoding of the file at ``self.fileName``.

    The first ``size`` bytes are read. A byte-order mark from
    ``self._bomList`` found at the start takes precedence; otherwise the
    guess is delegated to ``chardet.detect``.

    :param size: number of leading bytes to read from the file
    :return: ``{'encoding': <name> + "-bom", 'confidence': 0.80}`` on a BOM
        match, else the dict returned by ``chardet.detect``
    """
    # Context manager so the handle is closed even if read() fails
    with open(self.fileName, "rb") as handle:
        content = handle.read(size)

    # Longest marks first: a short BOM can be a prefix of a longer one
    # (e.g. UTF-16 LE "FF FE" vs UTF-32 LE "FF FE 00 00"). items() replaces
    # the Python-2-only iteritems().
    candidates = sorted(self._bomList.items(), key=lambda pair: len(pair[1]), reverse=True)

    for name, mark in candidates:
        if content.startswith(mark):
            return {'encoding': name + "-bom", 'confidence': 0.80}

    return chardet.detect(content)
def detect(self, size=2048):
    """
    Infer the encoding type of the file named by ``self.fileName``.

    Only the first ``size`` bytes are examined. If they start with one of the
    byte-order marks in ``self._bomList`` that mark decides the result;
    otherwise ``chardet.detect`` makes the guess.

    :param size: number of leading bytes to read from the file
    :return: dict with ``encoding`` (BOM name plus "-bom") and ``confidence``
        (0.80) on a BOM match, else the dict returned by ``chardet.detect``
    """
    # "with" guarantees the file handle is released
    with open(self.fileName, "rb") as stream:
        content = stream.read(size)

    # items() works on Python 2 and 3 (iteritems() is Python-2-only).
    # Longest BOM first, so a mark that merely prefixes a longer one
    # (UTF-16 LE "FF FE" inside UTF-32 LE "FF FE 00 00") cannot shadow it.
    byLength = sorted(self._bomList.items(), key=lambda entry: len(entry[1]), reverse=True)

    result = None
    for key, value in byLength:
        if content.startswith(value):
            result = {'encoding': key + "-bom", 'confidence': 0.80}
            break

    if result is None:
        result = chardet.detect(content)

    return result
def getHeuristicCharEncoding(page):
    """
    Heuristically determine the charset of a page

    Only the first HEURISTIC_PAGE_SIZE_THRESHOLD items of the page are handed
    to detect(). Results are remembered in kb.cache.encoding under hash(page)
    and a truthy cached value is reused. The detection is logged only when it
    matches UNICODE_ENCODING (compared case-insensitively, ignoring dashes).

    Reference: https://chardet.readthedocs.io/en/latest/usage.html

    >>> getHeuristicCharEncoding(b"<html></html>")
    'ascii'
    """

    pageHash = hash(page)

    charset = kb.cache.encoding.get(pageHash)
    if not charset:
        # Nothing usable cached for this page - run the detection
        charset = detect(page[:HEURISTIC_PAGE_SIZE_THRESHOLD])["encoding"]

    kb.cache.encoding[pageHash] = charset

    if charset:
        detected = charset.lower().replace('-', "")
        expected = UNICODE_ENCODING.lower().replace('-', "")

        if detected == expected:
            singleTimeLogMessage("heuristics detected web page charset '%s'" % charset, logging.INFO, charset)

    return charset
def getHeuristicCharEncoding(page):
    """
    Heuristically determine the charset of a page

    Results are remembered in kb.cache.encoding under hash(page); a truthy
    cached value is reused instead of running detect() again. A truthy
    result is reported through singleTimeLogMessage().

    Reference: https://chardet.readthedocs.io/en/latest/usage.html

    >>> getHeuristicCharEncoding(b"<html></html>")
    'ascii'
    """

    cacheKey = hash(page)

    encoding = kb.cache.encoding.get(cacheKey)
    if not encoding:
        # No usable cached value - fall back to detection
        encoding = detect(page)["encoding"]

    kb.cache.encoding[cacheKey] = encoding

    if encoding:
        message = "heuristics detected web page charset '%s'" % encoding
        singleTimeLogMessage(message, logging.INFO, encoding)

    return encoding
def removeDynamicContent(self, page, dynamicMarks):
    """
    Remove dynamic content from the supplied page, based on precalculated
    dynamic markings.

    Each marking is a ``(prefix, suffix)`` pair of byte strings, either of
    which may be ``None``. The text between a prefix and a suffix is dropped
    while the markers themselves are kept. A missing prefix means "from the
    start of the page"; a missing suffix means "up to the end of the page".

    :param page: raw page content as bytes; may be empty or ``None``
    :param dynamicMarks: sequence of ``(prefix, suffix)`` pairs; may be empty
        or ``None``
    :return: the cleaned page re-encoded to bytes, or ``page`` itself
        (untouched) when there is no page or no markings
    """
    if not page or not dynamicMarks:
        return page

    # chardet reports an encoding of None when it cannot make a guess; fall
    # back to UTF-8 so decode() is never handed None (which is a TypeError).
    encoding = chardet.detect(page)["encoding"] or "utf-8"
    page = page.decode(encoding, errors="replace")

    for prefix, suffix in dynamicMarks:
        if prefix is None and suffix is None:
            continue

        if prefix is not None:
            prefix = prefix.decode(encoding, errors="replace")
        if suffix is not None:
            suffix = suffix.decode(encoding, errors="replace")

        # (?s) lets "." span newlines; ".+" is greedy, so one substitution
        # reaches from the first prefix to the last suffix.
        if prefix is None:
            pattern = r"(?s)^.+" + re.escape(suffix)
            kept = suffix
        elif suffix is None:
            pattern = r"(?s)" + re.escape(prefix) + r".+$"
            kept = prefix
        else:
            pattern = r"(?s)" + re.escape(prefix) + r".+" + re.escape(suffix)
            kept = prefix + suffix

        # A callable replacement is inserted verbatim, so backslashes in the
        # markers need no manual escaping.
        page = re.sub(pattern, lambda _match, kept=kept: kept, page)

    # NOTE(review): encode() without arguments uses the default codec (UTF-8
    # on Python 3), not the detected one - kept as-is for callers.
    page = page.encode()
    return page
def detect(self):
    """
    Run chardet over the pre-decoded fragments of this object.

    The second element of every item yielded by ``self._autoPreDecode()`` is
    concatenated into one string, which is then passed to ``chardet.detect``.

    :return: the value returned by ``chardet.detect``
    """
    fragments = [item[1] for item in self._autoPreDecode()]
    joined = "".join(fragments)
    return chardet.detect(joined)
def detect(self):
    '''
    Infer the encoding type of the non-ASCII string.

    The second element of every item yielded by self._autoPreDecode() is
    joined into a single string, which chardet.detect then analyses.
    '''
    pieces = []
    for entry in self._autoPreDecode():
        pieces.append(entry[1])

    combined = "".join(pieces)
    return chardet.detect(combined)
def chardet_dammit(s):
    """
    Return chardet's encoding guess for ``s``.

    Input that is already a ``str`` is not analysed and yields ``None``;
    anything else is handed to ``chardet.detect`` and the "encoding" entry of
    its result is returned.
    """
    already_text = isinstance(s, str)
    return None if already_text else chardet.detect(s)['encoding']