def __init__(self): CharSetProber.__init__(self) self._mCodingSM = [ \ CodingStateMachine(HZSMModel), CodingStateMachine(ISO2022CNSMModel), CodingStateMachine(ISO2022JPSMModel), CodingStateMachine(ISO2022KRSMModel) ] self.reset()
class SJISProber(MultiByteCharSetProber): def __init__(self): MultiByteCharSetProber.__init__(self) self._mCodingSM = CodingStateMachine(SJISSMModel) self._mDistributionAnalyzer = SJISDistributionAnalysis() self._mContextAnalyzer = SJISContextAnalysis() self.reset() def reset(self): MultiByteCharSetProber.reset(self) self._mContextAnalyzer.reset() def get_charset_name(self): return "SHIFT_JIS" def feed(self, aBuf): aLen = len(aBuf) for i in range(0, aLen): codingState = self._mCodingSM.next_state(aBuf[i]) if codingState == eError: if constants._debug: sys.stderr.write(self.get_charset_name() + ' prober hit error at byte ' + str(i) + '\n') self._mState = constants.eNotMe break elif codingState == eItsMe: self._mState = constants.eFoundIt break elif codingState == eStart: charLen = self._mCodingSM.get_current_charlen() if i == 0: self._mLastChar[1] = aBuf[0] self._mContextAnalyzer.feed(self._mLastChar[2 - charLen:], charLen) self._mDistributionAnalyzer.feed(self._mLastChar, charLen) else: self._mContextAnalyzer.feed( aBuf[i + 1 - charLen:i + 3 - charLen], charLen) self._mDistributionAnalyzer.feed(aBuf[i - 1:i + 1], charLen) self._mLastChar[0] = aBuf[aLen - 1] if self.get_state() == constants.eDetecting: if self._mContextAnalyzer.got_enough_data() and \ (self.get_confidence() > constants.SHORTCUT_THRESHOLD): self._mState = constants.eFoundIt return self.get_state() def get_confidence(self): contxtCf = self._mContextAnalyzer.get_confidence() distribCf = self._mDistributionAnalyzer.get_confidence() return max(contxtCf, distribCf)
class SJISProber(MultiByteCharSetProber): def __init__(self): MultiByteCharSetProber.__init__(self) self._mCodingSM = CodingStateMachine(SJISSMModel) self._mDistributionAnalyzer = SJISDistributionAnalysis() self._mContextAnalyzer = SJISContextAnalysis() self.reset() def reset(self): MultiByteCharSetProber.reset(self) self._mContextAnalyzer.reset() def get_charset_name(self): return "SHIFT_JIS" def feed(self, aBuf): aLen = len(aBuf) for i in range(0, aLen): codingState = self._mCodingSM.next_state(aBuf[i]) if codingState == eError: if constants._debug: sys.stderr.write(self.get_charset_name() + ' prober hit error at byte ' + str(i) + '\n') self._mState = constants.eNotMe break elif codingState == eItsMe: self._mState = constants.eFoundIt break elif codingState == eStart: charLen = self._mCodingSM.get_current_charlen() if i == 0: self._mLastChar[1] = aBuf[0] self._mContextAnalyzer.feed(self._mLastChar[2 - charLen :], charLen) self._mDistributionAnalyzer.feed(self._mLastChar, charLen) else: self._mContextAnalyzer.feed(aBuf[i + 1 - charLen : i + 3 - charLen], charLen) self._mDistributionAnalyzer.feed(aBuf[i - 1 : i + 1], charLen) self._mLastChar[0] = aBuf[aLen - 1] if self.get_state() == constants.eDetecting: if self._mContextAnalyzer.got_enough_data() and \ (self.get_confidence() > constants.SHORTCUT_THRESHOLD): self._mState = constants.eFoundIt return self.get_state() def get_confidence(self): contxtCf = self._mContextAnalyzer.get_confidence() distribCf = self._mDistributionAnalyzer.get_confidence() return max(contxtCf, distribCf)
class UTF8Prober(CharSetProber): def __init__(self): CharSetProber.__init__(self) self._mCodingSM = CodingStateMachine(UTF8SMModel) self.reset() def reset(self): CharSetProber.reset(self) self._mCodingSM.reset() self._mNumOfMBChar = 0 def get_charset_name(self): return "utf-8" def feed(self, aBuf): for c in aBuf: codingState = self._mCodingSM.next_state(c) if codingState == eError: self._mState = constants.eNotMe break elif codingState == eItsMe: self._mState = constants.eFoundIt break elif codingState == eStart: if self._mCodingSM.get_current_charlen() >= 2: self._mNumOfMBChar += 1 if self.get_state() == constants.eDetecting: if self.get_confidence() > constants.SHORTCUT_THRESHOLD: self._mState = constants.eFoundIt return self.get_state() def get_confidence(self): unlike = 0.99 if self._mNumOfMBChar < 6: for i in range(0, self._mNumOfMBChar): unlike = unlike * ONE_CHAR_PROB return 1.0 - unlike else: return unlike
def __init__(self): MultiByteCharSetProber.__init__(self) self._mCodingSM = CodingStateMachine(SJISSMModel) self._mDistributionAnalyzer = SJISDistributionAnalysis() self._mContextAnalyzer = SJISContextAnalysis() self.reset()
def __init__(self): MultiByteCharSetProber.__init__(self) self._mCodingSM = CodingStateMachine(EUCKRSMModel) self._mDistributionAnalyzer = EUCKRDistributionAnalysis() self.reset()
def __init__(self): CharSetProber.__init__(self) self._mCodingSM = CodingStateMachine(UTF8SMModel) self.reset()