def testNormalizeUtf8(self):
    """Check ``_normalizeUtf8`` against ``UTF8MAP``, one language id at a time.

    Each ``UTF8MAP`` row is unpacked as ``(match, sub, comment, languageId)``.
    For every id in ``languages`` the ``match`` values of the rows carrying
    that id form the input text and the ``sub`` values form the expected
    text. The expected text is stripped and has ``SPACEPATTERN`` replaced by
    a single space before being compared with the normalized input.
    """
    languages = ['0', '1', '2']

    testList = {lang: [] for lang in languages}
    gtList = {lang: [] for lang in languages}

    for match, sub, comment, languageId in UTF8MAP:
        for lang in languages:
            # Both sides go through int(): the ids in `languages` are
            # strings, and comparing a str to int(languageId) is never
            # true, which used to leave every bucket empty and made the
            # test compare two empty strings.
            if int(lang) == int(languageId):
                testList[lang].append(match)
                gtList[lang].append(sub)

    for lang in languages:
        strGt = u" ".join(gtList[lang]).strip()
        strGt = re.sub(SPACEPATTERN, u" ", strGt, flags=re.UNICODE)

        # NOTE(review): the formula is built with its defaults for every
        # language id; confirm whether it has to be told which language
        # is under test before normalizing.
        f = LMPreparationFormula()
        f.setText(u" ".join(testList[lang]))
        f._normalizeUtf8()
        strResult = f.getText()

        self.assertEqual(strGt.encode('utf-8'), strResult.encode('utf-8'))
def testNormalizeCharacters(self): strTest = ur"a b c \uff1b , % œ" strGt = ur"a b c % oe" f = LMPreparationFormula() f.setText(strTest) f._normalizeUtf8() f._normalizePunctuation(self.allPunctList) self.assertEquals(strGt, f.getText())
def testNormalizeUtf8(self):
    """Check ``_normalizeUtf8`` against every ``UTF8MAP`` row at once.

    Each row is unpacked as ``(match, sub, comment, languageId)``. All
    ``match`` values joined by spaces are the input; all ``sub`` values
    joined by spaces, stripped and with ``SPACEPATTERN`` replaced by a
    single space, are the expected output.
    """
    inputs = [match for match, sub, comment, languageId in UTF8MAP]
    outputs = [sub for match, sub, comment, languageId in UTF8MAP]

    # Expected text: joined substitutions with whitespace tidied up.
    expected = re.sub(
        SPACEPATTERN, u" ", u" ".join(outputs).strip(), flags=re.UNICODE
    )

    formula = LMPreparationFormula()
    formula.setText(u" ".join(inputs))
    formula._normalizeUtf8()
    actual = formula.getText()

    self.assertEquals(expected.encode("utf-8"), actual.encode("utf-8"))
def testNormalizeUtf8(self):
    """Verify that ``_normalizeUtf8`` maps all ``UTF8MAP`` matches to their subs.

    Rows of ``UTF8MAP`` are unpacked as ``(match, sub, comment,
    languageId)``. The space-joined ``match`` column is normalized and
    compared, as UTF-8 bytes, with the space-joined ``sub`` column after
    stripping it and replacing ``SPACEPATTERN`` with one space.
    """
    # Text handed to the formula: every pattern to be replaced.
    rawText = u" ".join(
        [match for match, sub, comment, languageId in UTF8MAP]
    )

    # Ground truth: every replacement, whitespace-normalized.
    groundTruth = u" ".join(
        [sub for match, sub, comment, languageId in UTF8MAP]
    )
    groundTruth = groundTruth.strip()
    groundTruth = re.sub(SPACEPATTERN, u" ", groundTruth, flags=re.UNICODE)

    prep = LMPreparationFormula()
    prep.setText(rawText)
    prep._normalizeUtf8()

    self.assertEquals(
        groundTruth.encode('utf-8'), prep.getText().encode('utf-8')
    )