Ejemplo n.º 1
0
 def __init__(self):
     """Initialise the group prober and populate its list of probers.

     Runs the CharSetGroupProber initialiser, stores one fresh instance of
     each prober class below in ``self._mProbers`` (in the order listed)
     and finally calls ``self.reset()``.
     """
     CharSetGroupProber.__init__(self)
     # Classes are listed once and instantiated in order.
     prober_classes = (
         UTF8Prober,
         SJISProber,
         EUCJPProber,
         GB2312Prober,
         EUCKRProber,
         Big5Prober,
         EUCTWProber,
     )
     self._mProbers = [prober_class() for prober_class in prober_classes]
     self.reset()
Ejemplo n.º 2
0
 def __init__(self):
     """Initialise the group prober and populate its list of probers.

     After the CharSetGroupProber initialiser has run, a new instance of
     every prober class named below is created, the instances are stored
     in ``self._mProbers`` in that order, and ``self.reset()`` is called.
     """
     CharSetGroupProber.__init__(self)
     members = []
     for make_prober in (UTF8Prober, SJISProber, EUCJPProber,
                         GB18030Prober, CP949Prober, Big5Prober,
                         EUCTWProber):
         members.append(make_prober())
     self._mProbers = members
     self.reset()
Ejemplo n.º 3
0
    def feed(self, aBuf):
        """Feed one chunk of input to the detector and update its state.

        Nothing is returned; the outcome is recorded on the instance:

        * ``self.result`` -- dict with 'encoding' and 'confidence' keys.
        * ``self.done``   -- set to True once a verdict has been reached.

        Processing order:

        1. A text (unicode) string is reported immediately as "unicode".
        2. If the detector is already done, or the chunk is empty, the call
           is a no-op.
        3. While ``self._mGotData`` is still false the chunk is checked for
           a leading byte order mark.
        4. Otherwise the input state is advanced using the high-bit and
           escape detectors and, for high-byte input, the chunk is handed
           to the charset probers until one reports ``constants.eFoundIt``.

        aBuf -- the next chunk of the document (a byte string; a text
                string short-circuits detection as described above).
        """
        # type(u"") is ``unicode`` on Python 2 and ``str`` on Python 3, so
        # the text check does not depend on a Python-2-only builtin name.
        if isinstance(aBuf, type(u"")):
            self.result = {'encoding': "unicode", 'confidence': 1.0}
            self.done = True
            return

        if self.done:
            return

        if not aBuf:
            return

        if not self._mGotData:
            # If the data starts with BOM, we know it is UTF.  The literals
            # are byte strings so the comparison is also correct where
            # bytes and str are distinct types.
            if aBuf[:3] == b'\xEF\xBB\xBF':
                # EF BB BF  UTF-8 with BOM
                self.result = {'encoding': "utf_8", 'confidence': 1.0}
            elif (aBuf[:4] in (b'\xFF\xFE\x00\x00',
                               b'\x00\x00\xFE\xFF',
                               b'\xFE\xFF\x00\x00',
                               b'\x00\x00\xFF\xFE')
                  or aBuf[:2] in (b'\xFF\xFE', b'\xFE\xFF')):
                # Every 4-byte or 2-byte BOM pattern is reported under the
                # single label "utf_n".
                self.result = {'encoding': "utf_n", 'confidence': 1.0}

        self._mGotData = True
        if self.result['encoding'] and (self.result['confidence'] > 0.0):
            self.done = True
            return

        if self._mInputState == ePureAscii:
            if self._highBitDetector.search(aBuf):
                self._mInputState = eHighbyte
            elif self._escDetector.search(self._mLastChar + aBuf):
                # The previous chunk's last byte is prepended so that a
                # sequence split across two chunks is still seen.
                self._mInputState = eEscAscii

        # Slice instead of indexing: indexing bytes/bytearray can yield an
        # int, which could not be concatenated with the next chunk above.
        self._mLastChar = aBuf[-1:]

        if self._mInputState == eEscAscii:
            self.result = {'encoding': "escaped", 'confidence': 1.0}
            self.done = True
        elif self._mInputState == eHighbyte:
            # Probers are created lazily, on the first high-byte chunk.
            if not self._mCharSetProbers:
                self._mCharSetProbers = [UTF8Prober(), Latin1Prober()]
            for prober in self._mCharSetProbers:
                if prober.feed(aBuf) == constants.eFoundIt:
                    self.result = {
                        'encoding': prober.get_charset_name(),
                        'confidence': prober.get_confidence()
                    }
                    self.done = True
                    break