def feed(self, aBuf):
        """Feed one chunk of input to the detector and update its state.

        Nothing happens when detection has already finished (``self.done``)
        or when *aBuf* is empty.  For the first non-empty chunk the leading
        bytes are compared with the known byte order marks (BOMs); a match
        stores the encoding in ``self.result`` with confidence 1.0 and marks
        detection as done.  Otherwise the chunk updates ``self._mInputState``
        and is handed to the prober(s) for that state; the first prober that
        returns ``constants.eFoundIt`` supplies ``self.result``.

        aBuf -- chunk of raw data as a byte string (``str`` on Python 2,
                ``bytes`` on Python 3).

        Returns None; the outcome is published through ``self.result`` and
        ``self.done``.
        """
        if self.done:
            return
        if not aBuf:
            return

        if not self._mGotData:
            # If the data starts with BOM, we know it is UTF.
            # The 4-byte marks must be tried before the 2-byte UTF-16 marks,
            # because FF FE / FE FF are prefixes of two of them.
            # Byte literals keep the comparison meaningful on Python 3 and
            # are plain str on Python 2.
            boms = (
                (b'\xEF\xBB\xBF', "UTF-8"),          # EF BB BF
                (b'\xFF\xFE\x00\x00', "UTF-32LE"),   # FF FE 00 00
                (b'\x00\x00\xFE\xFF', "UTF-32BE"),   # 00 00 FE FF
                # UCS-4 with unusual octet orders (3412 and 2143)
                (b'\xFE\xFF\x00\x00', "X-ISO-10646-UCS-4-3412"),
                (b'\x00\x00\xFF\xFE', "X-ISO-10646-UCS-4-2143"),
                (b'\xFF\xFE', "UTF-16LE"),           # FF FE
                (b'\xFE\xFF', "UTF-16BE"),           # FE FF
            )
            for bom, encoding in boms:
                if aBuf[:len(bom)] == bom:
                    self.result = {'encoding': encoding, 'confidence': 1.0}
                    break

        # Plain True instead of constants.True: the attribute form is a
        # syntax error on Python 3, where True is a keyword.
        self._mGotData = True
        if self.result['encoding'] and (self.result['confidence'] > 0.0):
            self.done = True
            return

        if self._mInputState == ePureAscii:
            if self._highBitDetector.search(aBuf):
                self._mInputState = eHighbyte
            elif self._escDetector.search(self._mLastChar + aBuf):
                # The previous chunk's last character is prepended so the
                # search also sees a match that straddles two chunks.
                self._mInputState = eEscAscii

        # Slice rather than index: indexing bytes on Python 3 yields an int,
        # which could not be concatenated with the next chunk.
        self._mLastChar = aBuf[-1:]

        if self._mInputState == eEscAscii:
            if not self._mEscCharSetProber:
                self._mEscCharSetProber = EscCharSetProber()
            if self._mEscCharSetProber.feed(aBuf) == constants.eFoundIt:
                self.result = {'encoding': self._mEscCharSetProber.get_charset_name(),
                               'confidence': self._mEscCharSetProber.get_confidence()}
                self.done = True
        elif self._mInputState == eHighbyte:
            if not self._mCharSetProbers:
                self._mCharSetProbers = [MBCSGroupProber(), SBCSGroupProber(), Latin1Prober()]
            for prober in self._mCharSetProbers:
                if prober.feed(aBuf) == constants.eFoundIt:
                    self.result = {'encoding': prober.get_charset_name(),
                                   'confidence': prober.get_confidence()}
                    self.done = True
                    break
Exemple #2
0
	def feed(self, aBuf):
		"""Feed one chunk of input to the detector and update its state.

		Nothing happens when detection has already finished (``self.done``)
		or when *aBuf* is empty.  For the first non-empty chunk the leading
		bytes are rendered as hex and compared with the known byte order
		marks (BOMs); a match stores the encoding in ``self.result`` with
		confidence 1.0 and marks detection as done.  Otherwise the chunk
		updates ``self._mInputState`` and is handed to the prober(s) for that
		state; the first prober that returns ``constants.eFoundIt`` supplies
		``self.result``.

		aBuf -- chunk of raw data as a byte string (``str`` on Python 2,
		        ``bytes`` on Python 3).

		Returns None; the outcome is published through ``self.result`` and
		``self.done``.
		"""
		if self.done:
			return
		if not aBuf:
			return

		if not self._mGotData:
			# Hex signature of the first (up to) four bytes, exactly two
			# digits per byte.  Without the zero padding a 0x00 byte would
			# render as a single "0", so the 4-byte marks containing zero
			# bytes could never match and shifted digits could fake a match.
			# One-element slices keep ord() usable for both str and bytes.
			signature = ''.join(['%02X' % ord(aBuf[i:i + 1]) for i in range(min(len(aBuf), 4))])
			# 4-byte marks are listed before the 2-byte UTF-16 marks that
			# are prefixes of two of them.
			boms = (
				('EFBBBF', "UTF-8"),
				('FFFE0000', "UTF-32LE"),
				('0000FEFF', "UTF-32BE"),
				('FEFF0000', "X-ISO-10646-UCS-4-3412"),
				('0000FFFE', "X-ISO-10646-UCS-4-2143"),
				('FFFE', "UTF-16LE"),
				('FEFF', "UTF-16BE"),
			)
			for prefix, encoding in boms:
				if signature.startswith(prefix):
					self.result = {'encoding': encoding, 'confidence': 1.0}
					break

		# Plain True instead of constants.True: the attribute form is a
		# syntax error on Python 3, where True is a keyword.
		self._mGotData = True
		if self.result['encoding'] and (self.result['confidence'] > 0.0):
			self.done = True
			return

		if self._mInputState == ePureAscii:
			if self._highBitDetector.search(aBuf):
				self._mInputState = eHighbyte
			elif self._escDetector.search(self._mLastChar + aBuf):
				# The previous chunk's last character is prepended so the
				# search also sees a match that straddles two chunks.
				self._mInputState = eEscAscii

		# Slice rather than index: indexing bytes on Python 3 yields an int,
		# which could not be concatenated with the next chunk.
		self._mLastChar = aBuf[-1:]

		if self._mInputState == eEscAscii:
			if not self._mEscCharSetProber:
				self._mEscCharSetProber = EscCharSetProber()
			if self._mEscCharSetProber.feed(aBuf) == constants.eFoundIt:
				self.result = {
					'encoding': self._mEscCharSetProber.get_charset_name(),
					'confidence': self._mEscCharSetProber.get_confidence(),
				}
				self.done = True
		elif self._mInputState == eHighbyte:
			if not self._mCharSetProbers:
				self._mCharSetProbers = [MBCSGroupProber(), SBCSGroupProber(), Latin1Prober()]
			for prober in self._mCharSetProbers:
				if prober.feed(aBuf) == constants.eFoundIt:
					self.result = {
						'encoding': prober.get_charset_name(),
						'confidence': prober.get_confidence(),
					}
					self.done = True
					break
Exemple #3
0
    def feed(self, aBuf):
        """Feed one chunk of input to the detector and update its state.

        Nothing happens when detection has already finished (``self.done``)
        or when *aBuf* is empty.  For the first non-empty chunk the leading
        bytes are compared with the known byte order marks (BOMs); a match
        stores the encoding in ``self.result`` with confidence 1.0 and marks
        detection as done.  Otherwise the chunk updates ``self._mInputState``
        and is handed to the prober(s) for that state; the first prober that
        returns ``constants.eFoundIt`` supplies ``self.result``.  A
        ``UnicodeDecodeError``/``UnicodeEncodeError`` raised while consulting
        a prober is logged and the remaining probers are still tried.

        aBuf -- chunk of raw data as a byte string (``str`` on Python 2,
                ``bytes`` on Python 3).

        Returns None; the outcome is published through ``self.result`` and
        ``self.done``.
        """
        if self.done:
            return
        if not aBuf:
            return

        if not self._mGotData:
            # If the data starts with BOM, we know it is UTF.
            # Every key is a byte literal: a unicode key never compares equal
            # to a byte string, which would hide the two UCS-4 marks.
            # The 4-byte marks come before the 2-byte UTF-16 marks that are
            # prefixes of two of them.
            charmap = (
                # EF BB BF  UTF-8 with BOM
                (b'\xEF\xBB\xBF', "UTF-8"),
                # FF FE 00 00  UTF-32, little-endian BOM
                (b'\xFF\xFE\x00\x00', "UTF-32LE"),
                # 00 00 FE FF  UTF-32, big-endian BOM
                (b'\x00\x00\xFE\xFF', "UTF-32BE"),
                # FE FF 00 00  UCS-4, unusual octet order BOM (3412)
                (b'\xFE\xFF\x00\x00', "X-ISO-10646-UCS-4-3412"),
                # 00 00 FF FE  UCS-4, unusual octet order BOM (2143)
                (b'\x00\x00\xFF\xFE', "X-ISO-10646-UCS-4-2143"),
                # FF FE  UTF-16, little endian BOM
                (b'\xFF\xFE', "UTF-16LE"),
                # FE FF  UTF-16, big endian BOM
                (b'\xFE\xFF', "UTF-16BE"),
            )
            for chunk, encoding in charmap:
                if aBuf[:len(chunk)] == chunk:
                    self.result = {'encoding': encoding, 'confidence': 1.0}
                    break

        # Plain True instead of constants.True: the attribute form is a
        # syntax error on Python 3, where True is a keyword.
        self._mGotData = True
        if self.result['encoding'] and (self.result['confidence'] > 0.0):
            self.done = True
            return

        if self._mInputState == ePureAscii:
            if self._highBitDetector.search(aBuf):
                self._mInputState = eHighbyte
            elif self._escDetector.search(self._mLastChar + aBuf):
                # The previous chunk's last character is prepended so the
                # search also sees a match that straddles two chunks.
                self._mInputState = eEscAscii

        # Slice rather than index: indexing bytes on Python 3 yields an int,
        # which could not be concatenated with the next chunk.
        self._mLastChar = aBuf[-1:]

        if self._mInputState == eEscAscii:
            if not self._mEscCharSetProber:
                self._mEscCharSetProber = EscCharSetProber()
            if self._mEscCharSetProber.feed(aBuf) == constants.eFoundIt:
                self.result = {
                    'encoding': self._mEscCharSetProber.get_charset_name(),
                    'confidence': self._mEscCharSetProber.get_confidence(),
                }
                self.done = True
        elif self._mInputState == eHighbyte:
            if not self._mCharSetProbers:
                self._mCharSetProbers = [
                    MBCSGroupProber(),
                    SBCSGroupProber(),
                    Latin1Prober(),
                ]
            for prober in self._mCharSetProbers:
                try:
                    if prober.feed(aBuf) == constants.eFoundIt:
                        self.result = {
                            'encoding': prober.get_charset_name(),
                            'confidence': prober.get_confidence(),
                        }
                        self.done = True
                        break
                except (UnicodeDecodeError, UnicodeEncodeError) as e:
                    # Best effort: record the failure and move on to the
                    # next prober instead of aborting detection.
                    logger.exception(e)