def test_unidecode_bmp(self):
    """Smoke-test ``unidecode`` on every code point U+0000..U+FFFF.

    No output value is checked; the test only requires that transliterating
    each single-character string completes without raising.  Offending code
    points are collected so one run reports all of them, and the test then
    fails instead of printing a message and passing.
    """
    failures = []
    for n in range(0, 0x10000):
        # Built outside the ``try`` so a broken ``unichr`` surfaces directly
        # instead of being recorded once per code point.
        t = unichr(n)
        try:
            unidecode(t)
        except Exception as exc:
            failures.append("catch error at %02x (%r)" % (n, exc))
    if failures:
        self.fail("\n".join(failures))
def test_unidecode_kana(self):
    """Smoke-test ``unidecode`` on the code points U+3000..U+30FE.

    No output value is checked; the test only requires that transliterating
    each single-character string completes without raising.  Offending code
    points are collected so one run reports all of them, and the test then
    fails instead of printing a message and passing.
    """
    failures = []
    for n in range(0x3000, 0x30ff):
        # Built outside the ``try`` so a conversion problem surfaces directly
        # instead of being recorded once per code point.
        t = six.unichr(n)
        try:
            unidecode(t)
        except Exception as exc:
            failures.append("catch error at %02x (%r)" % (n, exc))
    if failures:
        self.fail("\n".join(failures))
def test_unidecode_kana(self):
    """Smoke-test ``unidecode`` on the code points U+3000..U+30FE.

    No output value is checked; the test only requires that transliterating
    each single-character string completes without raising.  Offending code
    points are collected so one run reports all of them, and the test then
    fails instead of printing a message and passing.
    """
    failures = []
    for n in range(0x3000, 0x30ff):
        # Built outside the ``try`` so a conversion problem surfaces directly
        # instead of being recorded once per code point.
        t = six.unichr(n)
        try:
            unidecode(t)
        except Exception as exc:
            failures.append("catch error at %02x (%r)" % (n, exc))
    if failures:
        self.fail("\n".join(failures))
def test_unidecode_compatibility_composite(self):
    """Compatibility characters must come out as their ASCII components."""
    cases = (
        ("\ufb01", "fi"),  # U+FB01, the "fi" ligature
        ("\u0032\u2075", "25"),  # "2" followed by U+2075, superscript five
    )
    for source, expected in cases:
        self.assertEqual(unidecode(source), expected)
def test_unidecode_specific_bmp(self):
    """Check exact transliterations for a hand-picked set of BMP strings."""
    cases = (
        ("Hello, World!", "Hello, World!"),
        ("'\"\r\n", "'\"\r\n"),
        ("ČŽŠčžš", "CZSczs"),
        ("\u00a0\u00a1\u00a2\u00a3\u00a4\u00a5\u00a6\u00a7",
         " !C/PS\u005c$?Y=|SS"),
        ("\u00a8\u00a9\u00aa\u00ab\u00ac\u00ad\u00ae\u00af",
         "\u0022(c)a<<!(r)-"),
        ("ア", "a"),
        ("α", "a"),
        ("а", "a"),
        ('ch\xe2teau', "chateau"),
        ('vi\xf1edos', "vinedos"),
        ("\u5317\u4EB0", "Bei Jing "),
        ("Efficient", "Efficient"),
        # A code point whose table does not exist.
        ('\ua500', ''),
        # A code point in a table that has fewer than 256 entries.
        ('\u1eff', ''),
        # Mark area: the gram mark.
        ("\u210a", "g"),
    )
    for source, expected in cases:
        self.assertEqual(unidecode(source), expected)
def test_unidecode_compatibility_composite(self):
    """Compatibility characters must come out as their ASCII components."""
    cases = (
        (u"\ufb01", "fi"),  # U+FB01, the "fi" ligature
        (u"\u0032\u2075", "25"),  # "2" followed by U+2075, superscript five
    )
    for source, expected in cases:
        self.assertEqual(unidecode(source), expected)
def test_unidecode_combining_chars(self):
    """An enclosing combining mark must vanish, leaving the base character."""
    cases = (
        # Digit "1" followed by U+20DE, the combining enclosing square.
        ("\u0031\u20de", "1"),
    )
    for source, expected in cases:
        self.assertEqual(unidecode(source), expected)
def _standardized_fsn(self, fsn):
    """Standardize FSN values and write the table to ``self._fsn_path``.

    Rows whose ``FSN`` matches the English/numeric pattern keep their value
    unchanged in the new ``FSN_std`` column.  All other rows ("FSN-Latin")
    are transliterated with ``unihandecode.unidecode`` and then stripped of
    the characters ``()/'`!- _``.  The two parts are concatenated (Latin rows
    first) and saved as CSV without the index.

    Args:
        fsn: DataFrame with an ``FSN`` column and, for the Latin rows, the
            columns ``continent_code``, ``country_iso_code``,
            ``subdivision_1_iso_code`` and ``city``.  The frame itself is
            left unmodified.

    Returns:
        None.  The result is written to ``self._fsn_path``.
    """
    # NOTE(review): the pattern is anchored only at the end of the string, so
    # a value counts as "English" when its tail consists of upper-case ASCII
    # letters, digits or apostrophes -- confirm whether the whole value was
    # meant to be checked.
    english_tail = re.compile(r"[0-9A-Z\']+$")
    is_english = fsn['FSN'].astype(str).apply(
        lambda s: english_tail.search(s) is not None).astype(bool)

    print("FSN_English&Num shape=", int(is_english.sum()))
    print("FSN_Latin shape=", int((~is_english).sum()))

    # Work on copies: assigning columns to a filtered slice triggers pandas'
    # SettingWithCopyWarning and is not guaranteed to write through.
    fsn_en = fsn[is_english].copy()
    fsn_latin = fsn[~is_english].copy()

    # Convert FSN_Latin into FSN_English, then delete the separator and
    # punctuation characters literally in a single pass (no regex involved).
    strip_table = str.maketrans("", "", "()/'`!- _")
    fsn_latin['FSN_std'] = fsn_latin['FSN'].apply(
        lambda x: unihandecode.unidecode(x)).str.translate(strip_table)
    fsn_latin = fsn_latin[[
        'FSN', 'FSN_std', 'continent_code', 'country_iso_code',
        'subdivision_1_iso_code', 'city'
    ]]

    fsn_en['FSN_std'] = fsn_en['FSN']
    fsn_std = pd.concat([fsn_latin, fsn_en],
                        axis=0,
                        ignore_index=True,
                        sort=False)
    fsn_std.to_csv(self._fsn_path, index=None)
def test_unidecode_combining_chars(self):
    """An enclosing combining mark must vanish, leaving the base character."""
    cases = (
        # Digit "1" followed by U+20DE, the combining enclosing square.
        (u"\u0031\u20de", "1"),
    )
    for source, expected in cases:
        self.assertEqual(unidecode(source), expected)
def test_unidecode_zh(self):
    """Han characters are romanized, each reading followed by one space."""
    cases = (
        (u"\u660e\u5929\u660e\u5929\u7684\u98ce\u5439",
         'Ming Tian Ming Tian De Feng Chui '),
        (u"馮", "Feng "),
    )
    for source, expected in cases:
        self.assertEqual(unidecode(source), expected)
def test_unidecode_zh(self):
    """Han characters are romanized, each reading followed by one space."""
    cases = (
        ("\u660e\u5929\u660e\u5929\u7684\u98ce\u5439",
         'Ming Tian Ming Tian De Feng Chui '),
        ("馮", "Feng "),
    )
    for source, expected in cases:
        self.assertEqual(unidecode(source), expected)
def test_unidecode_decomposed_form(self):
    """Base character plus combining marks must yield one transliteration."""
    cases = (
        # "A" followed by a combining accent.
        ("\u0041\u0301", "A"),
        # "a" followed by two combining accents.
        ("\u0061\u0323\u0302", "a"),
        # Katakana "ガ" written in decomposed form as "カ" + U+3099.
        ("\u30AB\u3099", "ga"),
        # Hiragana "が" written in decomposed form as "か" + U+3099.
        ("\u304B\u3099", "ga"),
    )
    for source, expected in cases:
        self.assertEqual(unidecode(source), expected)
def test_unidecode_decomposed_form(self):
    """Base character plus combining marks must yield one transliteration."""
    cases = (
        # "A" followed by a combining accent.
        (u"\u0041\u0301", "A"),
        # "a" followed by two combining accents.
        (u"\u0061\u0323\u0302", "a"),
        # Katakana "ガ" written in decomposed form as "カ" + U+3099.
        (u"\u30AB\u3099", "ga"),
        # Hiragana "が" written in decomposed form as "か" + U+3099.
        (u"\u304B\u3099", "ga"),
    )
    for source, expected in cases:
        self.assertEqual(unidecode(source), expected)
def test_unidecode_mac_japanese_pua(self):
    """Private-use and enclosing marks are dropped around Han characters.

    The trailing comments record the reading that the original author notes
    for the Japanese variant, unihandecode(ja), for comparison.
    """
    cases = (
        # Adobe CID 8321; "yuugengaisha" in unihandecode(ja).
        (u"\uF862\u6709\u9650\u4F1A\u793E", "You Xian Hui She "),
        # "大" with circle; "Dai " in unihandecode(ja).
        (u"\u5927\u20dd", "Da "),
        # "小" with circle; "Shou " in unihandecode(ja).
        (u"\u5c0f\u20dd", "Xiao "),
        # "控" with circle; "Hikae " in unihandecode(ja).
        (u"\u63a7\u20dd", "Kong "),
    )
    for source, expected in cases:
        self.assertEqual(unidecode(source), expected)
def test_unidecode_mathematical_digits(self):
    """Code points U+1D7CE..U+1D7FF must map to the matching ASCII digit.

    The range holds 5 consecutive sequences of 0-9, so the expected digit is
    the offset from the start of the range modulo 10.  On builds whose
    ``sys.maxunicode`` is below the range the test is skipped.
    """
    if sys.maxunicode < 0x1d800:
        # Reported as skipped rather than silently counted as a pass.
        self.skipTest("skip test because of Narrow Python")

    first = 0x1d7ce
    for n in range(first, 0x1d800):
        expected = chr(ord('0') + (n - first) % 10)
        self.assertEqual(unidecode(six.unichr(n)), expected)
def test_unidecode_mathematical_digits(self):
    """Code points U+1D7CE..U+1D7FF must map to the matching ASCII digit.

    The range holds 5 consecutive sequences of 0-9, so the expected digit is
    the offset from the start of the range modulo 10.  On builds whose
    ``sys.maxunicode`` is below the range the test is skipped.
    """
    if sys.maxunicode < 0x1d800:
        # Reported as skipped rather than silently counted as a pass.
        self.skipTest("skip test because of Narrow Python")

    first = 0x1d7ce
    for n in range(first, 0x1d800):
        expected = chr(ord('0') + (n - first) % 10)
        self.assertEqual(unidecode(unichr(n)), expected)
def test_unidecode_specific_bmp(self):
    """Check exact transliterations for a hand-picked set of BMP strings."""
    cases = (
        (u"Hello, World!", "Hello, World!"),
        (u"'\"\r\n", "'\"\r\n"),
        (u"ČŽŠčžš", "CZSczs"),
        (u"\u00a0\u00a1\u00a2\u00a3\u00a4\u00a5\u00a6\u00a7",
         u" !C/PS\u005c$?Y=|SS"),
        (u"\u00a8\u00a9\u00aa\u00ab\u00ac\u00ad\u00ae\u00af",
         u"\u0022(c)a<<!(r)-"),
        (u"ア", "a"),
        (u"α", "a"),
        (u"а", "a"),
        (u'ch\xe2teau', "chateau"),
        (u'vi\xf1edos', "vinedos"),
        (u"\u5317\u4EB0", "Bei Jing "),
        (u"Efficient", "Efficient"),
        # A code point whose table does not exist.
        (u'\ua500', ''),
        # A code point in a table that has fewer than 256 entries.
        (u'\u1eff', ''),
        # Mark area: the gram mark.
        (u"\u210a", "g"),
    )
    for source, expected in cases:
        self.assertEqual(unidecode(source), expected)
def createImage(postContent):
    """Render *postContent* as wrapped text on an image saved as image.jpg.

    The text is transliterated with ``unihandecode.unidecode``, wrapped at
    120 characters per line and drawn in yellow at (100, 100) on a 1280x720
    RGB canvas, using the "Andale Mono.ttf" font at size 15.

    Args:
        postContent: The text to render.

    Returns:
        None.  The picture is written to "image.jpg" in the working directory.
    """
    font = ImageFont.truetype("Andale Mono.ttf", 15)
    im = Image.new(mode="RGB", size=(1280, 720), color=(153, 153, 255))
    draw = ImageDraw.Draw(im)

    # Transliterate to ASCII before wrapping.
    ascii_text = unihandecode.unidecode(postContent)
    wrapped = textwrap.fill(ascii_text, 120)

    # Pass the loaded font explicitly; without it Pillow falls back to its
    # built-in default font and the font loaded above is never used.
    draw.text((100, 100), wrapped, fill=(255, 255, 0), font=font)
    im.save("image.jpg")
def text_clean(x):
    """Clean a text string and return its stemmed form.

    Steps, in order: delete ``<...>`` tags, delete every run of characters
    that are neither whitespace nor word characters (underscores are deleted
    too), transliterate with ``unihandecode.unidecode``, lower-case, delete
    punctuation and digits, strip surrounding whitespace, and finally pass
    the result to ``PorterStemmer().stem``.
    """
    without_tags = re.sub("<.*?>", "", x)
    word_chars_only = re.sub(r'([^\s\w]|_)+', '', without_tags)
    lowered = unihandecode.unidecode(word_chars_only).lower()

    # One deletion table covers both the punctuation and the digit removal.
    unwanted = str.maketrans("", "", string.punctuation + digits)
    trimmed = lowered.translate(unwanted).strip()

    # NOTE(review): the stemmer receives the whole cleaned string, not the
    # individual words -- confirm that this is intended.
    return PorterStemmer().stem(trimmed)
def test_unidecode_specific_supplementary(self):
    """Check exact transliterations of characters outside the BMP.

    On builds whose ``sys.maxunicode`` is below 0x1d6a4 the test is skipped.
    """
    if sys.maxunicode < 0x1d6a4:
        # Reported as skipped rather than silently counted as a pass.
        self.skipTest("skip test because of Narrow Python")

    cases = (
        # Non-BMP character
        ('\U0001d5a0', 'A'),
        # Mathematical
        ('\U0001d5c4\U0001d5c6/\U0001d5c1', 'km/h'),
    )
    for source, expected in cases:
        self.assertEqual(unidecode(source), expected)
def test_unidecode_mac_japanese_pua(self):
    """Private-use and enclosing marks are dropped around Han characters.

    The trailing comments record the reading that the original author notes
    for the Japanese variant, unihandecode(ja), for comparison.
    """
    cases = (
        # Adobe CID 8321; "yuugengaisha" in unihandecode(ja).
        ("\uF862\u6709\u9650\u4F1A\u793E", "You Xian Hui She "),
        # "大" with circle; "Dai " in unihandecode(ja).
        ("\u5927\u20dd", "Da "),
        # "小" with circle; "Shou " in unihandecode(ja).
        ("\u5c0f\u20dd", "Xiao "),
        # "控" with circle; "Hikae " in unihandecode(ja).
        ("\u63a7\u20dd", "Kong "),
    )
    for source, expected in cases:
        self.assertEqual(unidecode(source), expected)
def test_unidecode_mathematical_latin():
    """Mathematical alphanumerics must map to plain A-Z / a-z.

    The range U+1D400..U+1D6A3 holds 13 consecutive sequences of A-Z, a-z
    with some code points undefined.  Undefined ones transliterate to an
    empty string; they are only counted, their positions are not checked.
    """
    undefined = 0
    for n in range(0x1d400, 0x1d6a4):
        base = 'A' if n % 52 < 26 else 'a'
        expected = chr(ord(base) + n % 26)
        actual = unihandecode.unidecode(chr(n))
        if actual:
            assert actual == expected
        else:
            undefined += 1
    assert undefined == 24
def test_unidecode_specific_supplementary(self):
    """Check exact transliterations of characters outside the BMP.

    On builds whose ``sys.maxunicode`` is below 0x1d6a4 the test is skipped.
    """
    if sys.maxunicode < 0x1d6a4:
        # Reported as skipped rather than silently counted as a pass.
        self.skipTest("skip test because of Narrow Python")

    cases = (
        # Non-BMP character
        (u'\U0001d5a0', 'A'),
        # Mathematical
        (u'\U0001d5c4\U0001d5c6/\U0001d5c1', 'km/h'),
    )
    for source, expected in cases:
        self.assertEqual(unidecode(source), expected)
def test_unidecode_mathematical_latin(self):
    """Mathematical alphanumerics must map to plain A-Z / a-z.

    The range U+1D400..U+1D6A3 holds 13 consecutive sequences of A-Z, a-z
    with some code points undefined.  Undefined ones transliterate to an
    empty string; they are only counted, their positions are not checked.
    On builds whose ``sys.maxunicode`` is below the range the test is
    skipped.
    """
    if sys.maxunicode < 0x1d6a4:
        # Reported as skipped rather than silently counted as a pass.
        self.skipTest("skip test because of Narrow Python")

    undefined = 0
    for n in range(0x1d400, 0x1d6a4):
        base = 'A' if n % 52 < 26 else 'a'
        expected = chr(ord(base) + n % 26)
        actual = unidecode(six.unichr(n))
        if actual:
            self.assertEqual(actual, expected)
        else:
            undefined += 1
    self.assertEqual(undefined, 24)
def test_unidecode_mathematical_latin(self):
    """Mathematical alphanumerics must map to plain A-Z / a-z.

    The range U+1D400..U+1D6A3 holds 13 consecutive sequences of A-Z, a-z
    with some code points undefined.  Undefined ones transliterate to an
    empty string; they are only counted, their positions are not checked.
    On builds whose ``sys.maxunicode`` is below the range the test is
    skipped.
    """
    if sys.maxunicode < 0x1d6a4:
        # Reported as skipped rather than silently counted as a pass.
        self.skipTest("skip test because of Narrow Python")

    undefined = 0
    for n in range(0x1d400, 0x1d6a4):
        base = 'A' if n % 52 < 26 else 'a'
        expected = chr(ord(base) + n % 26)
        actual = unidecode(unichr(n))
        if actual:
            self.assertEqual(actual, expected)
        else:
            undefined += 1
    self.assertEqual(undefined, 24)
def test_unidecode_squared_chars(self):
    """Squared unit/word characters must expand to the word they stand for."""
    cases = (
        (u"\u3301", "alpha"),      # combined Alpha
        (u"\u3302", "ampere"),     # combined Ampere
        (u"\u3304", "inning"),
        (u"\u3306", "won"),        # combined Won
        (u"\u3307", "escudo"),
        (u"\u3308", "acre"),       # combined Acre
        (u"\u3309", "ounce"),      # combined ounce
        (u"\u330a", "ohm"),        # combined Ohm
        (u"\u3349", "milli"),      # milli
        (u"\u3314", "kilo"),       # kilo
        (u"\u3315", "kilogram"),   # kilo gram
        (u"\u3316", "kilometer"),  # kilo metre
        (u"\u3322", "centi"),      # centi
        (u"\u334d", "meter"),      # metre
        (u"\u3318", "gram"),       # gram
        (u"\u3327", "ton"),        # ton
        (u"\u3303", "are"),        # are
        (u"\u3336", "hectare"),    # hect-are
        (u"\u337f", "Inc."),       # kabusiki kaisha
    )
    for source, expected in cases:
        self.assertEqual(unidecode(source), expected)
def test_unidecode_squared_chars(self):
    """Squared unit/word characters must expand to the word they stand for."""
    cases = (
        ("\u3301", "alpha"),      # combined Alpha
        ("\u3302", "ampere"),     # combined Ampere
        ("\u3304", "inning"),
        ("\u3306", "won"),        # combined Won
        ("\u3307", "escudo"),
        ("\u3308", "acre"),       # combined Acre
        ("\u3309", "ounce"),      # combined ounce
        ("\u330a", "ohm"),        # combined Ohm
        ("\u3349", "milli"),      # milli
        ("\u3314", "kilo"),       # kilo
        ("\u3315", "kilogram"),   # kilo gram
        ("\u3316", "kilometer"),  # kilo metre
        ("\u3322", "centi"),      # centi
        ("\u334d", "meter"),      # metre
        ("\u3318", "gram"),       # gram
        ("\u3327", "ton"),        # ton
        ("\u3303", "are"),        # are
        ("\u3336", "hectare"),    # hect-are
        ("\u337f", "Inc."),       # kabusiki kaisha
    )
    for source, expected in cases:
        self.assertEqual(unidecode(source), expected)
def test_unidecode_kana():
    """Smoke test: transliterating U+3000..U+30FE must not raise.

    The returned strings are not inspected.
    """
    for code_point in range(0x3000, 0x30ff):
        unihandecode.unidecode(chr(code_point))
def test_unidecode_mathematical_digits():
    """Code points U+1D7CE..U+1D7FF must map to the matching ASCII digit.

    The range holds 5 consecutive sequences of 0-9, so the expected digit is
    the offset from the start of the range modulo 10.
    """
    first = 0x1d7ce
    for code_point in range(first, 0x1d800):
        expected = chr(ord('0') + (code_point - first) % 10)
        actual = unihandecode.unidecode(chr(code_point))
        assert actual == expected
def test_unidecode_ascii():
    """Every ASCII character must be returned unchanged."""
    for code_point in range(128):
        char = chr(code_point)
        converted = unihandecode.unidecode(char)
        # The code point is part of both tuples so that a failure message
        # shows which character was affected.
        assert (code_point, converted) == (code_point, char)
def test_unidecode_ascii(self):
    """Every ASCII character must be returned unchanged."""
    for code_point in range(128):
        char = chr(code_point)
        self.assertEqual(unidecode(char), char)
def test_unidecode_ascii(self):
    """Every ASCII character must be returned unchanged."""
    for code_point in range(128):
        char = chr(code_point)
        self.assertEqual(unidecode(char), char)
def test_unidecode_bmp():
    """Smoke test: transliterating U+0000..U+FFFF must not raise.

    The returned strings are not inspected.
    """
    for code_point in range(0x10000):
        unihandecode.unidecode(chr(code_point))