Ejemplo n.º 1
0
 def test_unidecode_bmp(self):
     """Transliterate every BMP code point and fail if any of them raises.

     Offending code points are collected so that one run reports all of
     them in a single failure message.
     """
     failed = []
     for n in range(0, 0x10000):
         try:
             unidecode(unichr(n))
         except Exception:
             # A bare ``except:`` that only printed could never fail the
             # test and also swallowed KeyboardInterrupt/SystemExit.
             failed.append("%04x" % n)
     self.assertFalse(
         failed,
         "unidecode raised for %d code point(s), first ones: %s"
         % (len(failed), ", ".join(failed[:10])))
Ejemplo n.º 2
0
 def test_unidecode_kana(self):
     """Transliterate code points 0x3000..0x30fe and fail if any raises.

     Offending code points are collected so that one run reports all of
     them in a single failure message.
     """
     # NOTE(review): the range end is exclusive, so U+30FF itself is not
     # exercised -- confirm whether that is intended.
     failed = []
     for n in range(0x3000, 0x30ff):
         try:
             unidecode(six.unichr(n))
         except Exception:
             # A bare ``except:`` that only printed could never fail the
             # test and also swallowed KeyboardInterrupt/SystemExit.
             failed.append("%04x" % n)
     self.assertFalse(
         failed,
         "unidecode raised for %d code point(s), first ones: %s"
         % (len(failed), ", ".join(failed[:10])))
Ejemplo n.º 3
0
 def test_unidecode_kana(self):
     """Transliterate code points 0x3000..0x30fe and fail if any raises.

     Offending code points are collected so that one run reports all of
     them in a single failure message.
     """
     # NOTE(review): the range end is exclusive, so U+30FF itself is not
     # exercised -- confirm whether that is intended.
     failed = []
     for n in range(0x3000, 0x30ff):
         try:
             unidecode(six.unichr(n))
         except Exception:
             # A bare ``except:`` that only printed could never fail the
             # test and also swallowed KeyboardInterrupt/SystemExit.
             failed.append("%04x" % n)
     self.assertFalse(
         failed,
         "unidecode raised for %d code point(s), first ones: %s"
         % (len(failed), ", ".join(failed[:10])))
Ejemplo n.º 4
0
 def test_unidecode_compatibility_composite(self):
     """Check unidecode() on the compatibility/composite samples below."""
     # Each entry is (source text, expected transliteration).
     cases = (
         ("\ufb01", "fi"),
         ("\u0032\u2075", "25"),
     )
     for source, expected in cases:
         actual = unidecode(source)
         self.assertEqual(actual, expected)
Ejemplo n.º 5
0
    def test_unidecode_specific_bmp(self):
        """Check unidecode() against hand-picked BMP samples."""
        # Each entry is (source text, expected transliteration).
        cases = (
            ("Hello, World!", "Hello, World!"),
            ("'\"\r\n", "'\"\r\n"),
            ("ČŽŠčžš", "CZSczs"),
            ("\u00a0\u00a1\u00a2\u00a3\u00a4\u00a5\u00a6\u00a7",
             " !C/PS\u005c$?Y=|SS"),
            ("\u00a8\u00a9\u00aa\u00ab\u00ac\u00ad\u00ae\u00af",
             "\u0022(c)a<<!(r)-"),
            ("ア", "a"),
            ("α", "a"),
            ("а", "a"),
            ('ch\xe2teau', "chateau"),
            ('vi\xf1edos', "vinedos"),
            ("\u5317\u4EB0", "Bei Jing "),
            ("Efficient", "Efficient"),
            # Code point whose table does not exist
            ('\ua500', ''),
            # Code point in a table that has fewer than 256 entries
            ('\u1eff', ''),
            # Mark area: gram mark
            ("\u210a", "g"),
        )
        for source, expected in cases:
            actual = unidecode(source)
            self.assertEqual(actual, expected)
Ejemplo n.º 6
0
 def test_unidecode_compatibility_composite(self):
     """Check unidecode() on the compatibility/composite samples below."""
     # Each entry is (source text, expected transliteration).
     cases = (
         (u"\ufb01", "fi"),
         (u"\u0032\u2075", "25"),
     )
     for source, expected in cases:
         actual = unidecode(source)
         self.assertEqual(actual, expected)
Ejemplo n.º 7
0
 def test_unidecode_combining_chars(self):
     """Check that a trailing enclosing mark is dropped from the output."""
     # Each entry is (source text, expected transliteration).
     cases = (
         # digit "1" wrapped with a solid square (U+20DE)
         ("\u0031\u20de", "1"),
     )
     for source, expected in cases:
         actual = unidecode(source)
         self.assertEqual(actual, expected)
    def _standardized_fsn(self, fsn):
        """Write a standardized copy of the FSN table to ``self._fsn_path``.

        Rows whose ``FSN`` ends in a run of digits, upper-case ASCII letters
        or apostrophes are treated as "English or numeric" and copied to
        ``FSN_std`` unchanged.  Every other row is transliterated with
        ``unihandecode.unidecode`` and then stripped of the characters in
        ``()/'`!- _``.

        Args:
            fsn: DataFrame providing the columns ``FSN``, ``continent_code``,
                ``country_iso_code``, ``subdivision_1_iso_code`` and
                ``city``.  An ``isEnglish`` column is added to it in place.

        Returns:
            None.  The combined table is written as CSV without the index.
        """
        # NOTE(review): re.search with only a trailing "$" tests the *end*
        # of the value, so a value with a non-ASCII prefix and an ASCII
        # upper-case tail counts as English -- confirm that is intended.
        english_tail = re.compile(r"[0-9A-Z\']+$")

        def is_English(s):
            return english_tail.search(s) is not None

        fsn['isEnglish'] = fsn['FSN'].astype(str).apply(is_English)
        # astype(bool) keeps the mask usable with ``~`` on an empty frame.
        english_mask = fsn['isEnglish'].astype(bool)
        print("FSN_English&Num shape=", int(english_mask.sum()))
        print("FSN_Latin shape=", int((~english_mask).sum()))

        # drop() returns new frames, so nothing below writes into a slice
        # of ``fsn`` (in-place edits on slices trigger pandas'
        # SettingWithCopyWarning).
        fsn_en = fsn[english_mask].drop(['isEnglish'], axis=1)
        fsn_latin = fsn[~english_mask].drop(['isEnglish'], axis=1)

        # Convert FSN_Latin into FSN_English
        latin_std = fsn_latin['FSN'].apply(unihandecode.unidecode)
        for ch in "()/'`!- _":
            # regex=False keeps "(" / ")" literal on every pandas version
            # instead of depending on the changing ``regex`` default.
            latin_std = latin_std.str.replace(ch, '', regex=False)
        fsn_latin = fsn_latin.assign(FSN_std=latin_std)[[
            'FSN', 'FSN_std', 'continent_code', 'country_iso_code',
            'subdivision_1_iso_code', 'city'
        ]]

        # English/numeric names are already in their standard form.
        fsn_en = fsn_en.assign(FSN_std=fsn_en['FSN'])

        fsn_std = pd.concat([fsn_latin, fsn_en],
                            axis=0,
                            ignore_index=True,
                            sort=False)
        fsn_std.to_csv(self._fsn_path, index=False)
Ejemplo n.º 9
0
 def test_unidecode_combining_chars(self):
     """Check that a trailing enclosing mark is dropped from the output."""
     # Each entry is (source text, expected transliteration).
     cases = (
         # digit "1" wrapped with a solid square (U+20DE)
         (u"\u0031\u20de", "1"),
     )
     for source, expected in cases:
         actual = unidecode(source)
         self.assertEqual(actual, expected)
Ejemplo n.º 10
0
 def test_unidecode_zh(self):
     """Check unidecode() on the Chinese samples below."""
     # Each entry is (source text, expected transliteration).
     cases = (
         (u"\u660e\u5929\u660e\u5929\u7684\u98ce\u5439",
          'Ming Tian Ming Tian De Feng Chui '),
         (u"馮", "Feng "),
     )
     for source, expected in cases:
         actual = unidecode(source)
         self.assertEqual(actual, expected)
Ejemplo n.º 11
0
 def test_unidecode_zh(self):
     """Check unidecode() on the Chinese samples below."""
     # Each entry is (source text, expected transliteration).
     cases = (
         ("\u660e\u5929\u660e\u5929\u7684\u98ce\u5439",
          'Ming Tian Ming Tian De Feng Chui '),
         ("馮", "Feng "),
     )
     for source, expected in cases:
         actual = unidecode(source)
         self.assertEqual(actual, expected)
Ejemplo n.º 12
0
 def test_unidecode_decomposed_form(self):
     """Check unidecode() on base characters followed by combining marks."""
     # Each entry is (source text, expected transliteration).
     cases = (
         # "A" followed by one accent mark
         ("\u0041\u0301", "A"),
         # "a" followed by two accent marks
         ("\u0061\u0323\u0302", "a"),
         # "ガ" written in decomposed form: カ + voiced sound mark
         ("\u30AB\u3099", "ga"),
         # "が" written in decomposed form: か + voiced sound mark
         ("\u304B\u3099", "ga"),
     )
     for source, expected in cases:
         actual = unidecode(source)
         self.assertEqual(actual, expected)
Ejemplo n.º 13
0
 def test_unidecode_decomposed_form(self):
     """Check unidecode() on base characters followed by combining marks."""
     # Each entry is (source text, expected transliteration).
     cases = (
         # "A" followed by one accent mark
         (u"\u0041\u0301", "A"),
         # "a" followed by two accent marks
         (u"\u0061\u0323\u0302", "a"),
         # "ガ" written in decomposed form: カ + voiced sound mark
         (u"\u30AB\u3099", "ga"),
         # "が" written in decomposed form: か + voiced sound mark
         (u"\u304B\u3099", "ga"),
     )
     for source, expected in cases:
         actual = unidecode(source)
         self.assertEqual(actual, expected)
Ejemplo n.º 14
0
 def test_unidecode_mac_japanese_pua(self):
     """Check unidecode() on private-use and circled-kanji samples.

     The trailing remarks record what the original author noted the
     Japanese variant, unihandecode(ja), would return instead.
     """
     # Each entry is (source text, expected transliteration).
     cases = (
         # Adobe CID 8321; "yuugengaisha" in unihandecode(ja)
         (u"\uF862\u6709\u9650\u4F1A\u793E", "You Xian Hui She "),
         # "大" with circle; "Dai " in unihandecode(ja)
         (u"\u5927\u20dd", "Da "),
         # "小" with circle; "Shou " in unihandecode(ja)
         (u"\u5c0f\u20dd", "Xiao "),
         # "控" with circle; "Hikae " in unihandecode(ja)
         (u"\u63a7\u20dd", "Kong "),
     )
     for source, expected in cases:
         actual = unidecode(source)
         self.assertEqual(actual, expected)
Ejemplo n.º 15
0
    def test_unidecode_mathematical_digits(self):
        """Check that code points 0x1d7ce..0x1d7ff map to ASCII digits.

        The range holds 5 consecutive sequences of 0-9, so the expected
        digit is the offset from 0x1d7ce modulo 10.
        """
        if sys.maxunicode < 0x1d800:
            # Report a real skip; print-and-return made the test count as
            # passed on narrow builds although nothing was checked.
            self.skipTest("skip test because of Narrow Python")

        for n in range(0x1d7ce, 0x1d800):
            expected = chr(ord('0') + (n - 0x1d7ce) % 10)
            actual = unidecode(six.unichr(n))

            self.assertEqual(actual, expected)
Ejemplo n.º 16
0
    def test_unidecode_mathematical_digits(self):
        """Check that code points 0x1d7ce..0x1d7ff map to ASCII digits.

        The range holds 5 consecutive sequences of 0-9, so the expected
        digit is the offset from 0x1d7ce modulo 10.
        """
        if sys.maxunicode < 0x1d800:
            # Report a real skip; print-and-return made the test count as
            # passed on narrow builds although nothing was checked.
            self.skipTest("skip test because of Narrow Python")

        for n in range(0x1d7ce, 0x1d800):
            expected = chr(ord('0') + (n - 0x1d7ce) % 10)
            actual = unidecode(unichr(n))

            self.assertEqual(actual, expected)
Ejemplo n.º 17
0
    def test_unidecode_specific_bmp(self):
        """Check unidecode() against hand-picked BMP samples."""
        # Each entry is (source text, expected transliteration).
        cases = (
            (u"Hello, World!", "Hello, World!"),
            (u"'\"\r\n", "'\"\r\n"),
            (u"ČŽŠčžš", "CZSczs"),
            (u"\u00a0\u00a1\u00a2\u00a3\u00a4\u00a5\u00a6\u00a7",
             u" !C/PS\u005c$?Y=|SS"),
            (u"\u00a8\u00a9\u00aa\u00ab\u00ac\u00ad\u00ae\u00af",
             u"\u0022(c)a<<!(r)-"),
            (u"ア", "a"),
            (u"α", "a"),
            (u"а", "a"),
            (u'ch\xe2teau', "chateau"),
            (u'vi\xf1edos', "vinedos"),
            (u"\u5317\u4EB0", "Bei Jing "),
            (u"Efficient", "Efficient"),
            # Code point whose table does not exist
            (u'\ua500', ''),
            # Code point in a table that has fewer than 256 entries
            (u'\u1eff', ''),
            # Mark area: gram mark
            (u"\u210a", "g"),
        )
        for source, expected in cases:
            actual = unidecode(source)
            self.assertEqual(actual, expected)
Ejemplo n.º 18
0
def createImage(postContent):
    """Render ``postContent`` as yellow text on a 1280x720 image.

    The text is transliterated with ``unihandecode.unidecode``, wrapped at
    120 columns and drawn at (100, 100).  The result is saved as
    ``image.jpg`` relative to the current working directory.

    Args:
        postContent: the text to render.
    """
    font = ImageFont.truetype("Andale Mono.ttf", 15)

    im = Image.new(mode="RGB", size=(1280, 720), color=(153, 153, 255))
    draw = ImageDraw.Draw(im)
    # transliterate non-ASCII characters to ASCII
    postContent = unihandecode.unidecode(postContent)
    # lines are only 120 characters
    text = textwrap.fill(postContent, 120)
    # Pass the loaded font explicitly: it used to be created and then
    # ignored, so the text was drawn with Pillow's default font.
    draw.text((100, 100), text, fill=(255, 255, 0), font=font)
    im.save("image.jpg")
Ejemplo n.º 19
0
def text_clean(x):
    """Normalize a text snippet and return its Porter stem.

    Steps, in order: drop ``<...>`` tags, drop every run of characters
    that are neither whitespace nor word characters (underscores are
    dropped too), transliterate with ``unihandecode.unidecode``,
    lower-case, delete ASCII punctuation, delete digits, trim surrounding
    whitespace, and finally pass the whole string to
    ``PorterStemmer().stem``.
    """
    without_tags = re.sub("<.*?>", "", x)
    words_only = re.sub(r'([^\s\w]|_)+', '', without_tags)
    lowered = unihandecode.unidecode(words_only).lower()

    # Transliteration can produce punctuation or digits again, so both
    # are deleted after it.
    drop_punctuation = str.maketrans("", "", string.punctuation)
    drop_digits = str.maketrans("", "", digits)
    cleaned = lowered.translate(drop_punctuation).translate(drop_digits)

    return PorterStemmer().stem(cleaned.strip())
Ejemplo n.º 20
0
    def test_unidecode_specific_supplementary(self):
        """Check unidecode() on samples from outside the BMP."""
        if sys.maxunicode < 0x1d6a4:
            # Report a real skip; print-and-return made the test count as
            # passed on narrow builds although nothing was checked.
            self.skipTest("skip test because of Narrow Python")

        # Each entry is (source text, expected transliteration).
        cases = (
            # Non-BMP character
            ('\U0001d5a0', 'A'),
            # Mathematical letters spelling a unit
            ('\U0001d5c4\U0001d5c6/\U0001d5c1', 'km/h'),
        )
        for source, expected in cases:
            self.assertEqual(unidecode(source), expected)
Ejemplo n.º 21
0
 def test_unidecode_mac_japanese_pua(self):
     """Check unidecode() on private-use and circled-kanji samples.

     The trailing remarks record what the original author noted the
     Japanese variant, unihandecode(ja), would return instead.
     """
     # Each entry is (source text, expected transliteration).
     cases = (
         # Adobe CID 8321; "yuugengaisha" in unihandecode(ja)
         ("\uF862\u6709\u9650\u4F1A\u793E", "You Xian Hui She "),
         # "大" with circle; "Dai " in unihandecode(ja)
         ("\u5927\u20dd", "Da "),
         # "小" with circle; "Shou " in unihandecode(ja)
         ("\u5c0f\u20dd", "Xiao "),
         # "控" with circle; "Hikae " in unihandecode(ja)
         ("\u63a7\u20dd", "Kong "),
     )
     for source, expected in cases:
         actual = unidecode(source)
         self.assertEqual(actual, expected)
Ejemplo n.º 22
0
def test_unidecode_mathematical_latin():
    """Check code points 0x1d400..0x1d6a3 against the A-Z / a-z cycle.

    The range holds 13 consecutive sequences of A-Z, a-z in which some
    code points are undefined.  Undefined ones (empty result) are only
    counted, not located; exactly 24 are expected.
    """
    undefined = 0
    for code in range(0x1d400, 0x1d6a4):
        # First half of every 52-wide cycle is upper case, second is lower.
        first_letter = 'A' if code % 52 < 26 else 'a'
        expected = chr(ord(first_letter) + code % 26)
        actual = unihandecode.unidecode(chr(code))
        if actual:
            assert actual == expected
        else:
            undefined += 1
    assert undefined == 24
Ejemplo n.º 23
0
    def test_unidecode_specific_supplementary(self):
        """Check unidecode() on samples from outside the BMP."""
        if sys.maxunicode < 0x1d6a4:
            # Report a real skip; print-and-return made the test count as
            # passed on narrow builds although nothing was checked.
            self.skipTest("skip test because of Narrow Python")

        # Each entry is (source text, expected transliteration).
        cases = (
            # Non-BMP character
            (u'\U0001d5a0', 'A'),
            # Mathematical letters spelling a unit
            (u'\U0001d5c4\U0001d5c6/\U0001d5c1', 'km/h'),
        )
        for source, expected in cases:
            self.assertEqual(unidecode(source), expected)
Ejemplo n.º 24
0
    def test_unidecode_mathematical_latin(self):
        """Check code points 0x1d400..0x1d6a3 against the A-Z / a-z cycle.

        The range holds 13 consecutive sequences of A-Z, a-z in which some
        code points are undefined.  Undefined ones (empty result) are only
        counted, not located; exactly 24 are expected.
        """
        if sys.maxunicode < 0x1d6a4:
            # Report a real skip; print-and-return made the test count as
            # passed on narrow builds although nothing was checked.
            self.skipTest("skip test because of Narrow Python")

        empty = 0
        for n in range(0x1d400, 0x1d6a4):
            # First half of every 52-wide cycle is upper case.
            first_letter = 'A' if n % 52 < 26 else 'a'
            expected = chr(ord(first_letter) + n % 26)
            actual = unidecode(six.unichr(n))

            if actual:
                self.assertEqual(actual, expected)
            else:
                empty += 1

        self.assertEqual(empty, 24)
Ejemplo n.º 25
0
    def test_unidecode_mathematical_latin(self):
        """Check code points 0x1d400..0x1d6a3 against the A-Z / a-z cycle.

        The range holds 13 consecutive sequences of A-Z, a-z in which some
        code points are undefined.  Undefined ones (empty result) are only
        counted, not located; exactly 24 are expected.
        """
        if sys.maxunicode < 0x1d6a4:
            # Report a real skip; print-and-return made the test count as
            # passed on narrow builds although nothing was checked.
            self.skipTest("skip test because of Narrow Python")

        empty = 0
        for n in range(0x1d400, 0x1d6a4):
            # First half of every 52-wide cycle is upper case.
            first_letter = 'A' if n % 52 < 26 else 'a'
            expected = chr(ord(first_letter) + n % 26)
            actual = unidecode(unichr(n))

            if actual:
                self.assertEqual(actual, expected)
            else:
                empty += 1

        self.assertEqual(empty, 24)
Ejemplo n.º 26
0
 def test_unidecode_squared_chars(self):
     """Check unidecode() on squared (single-cell) katakana words."""
     # Each entry is (source text, expected transliteration); the remark
     # names the word the katakana spells.
     cases = (
         (u"\u3301", "alpha"),  # Alpha
         (u"\u3302", "ampere"),  # Ampere
         (u"\u3304", "inning"),
         (u"\u3306", "won"),  # Won
         (u"\u3307", "escudo"),
         (u"\u3308", "acre"),  # Acre
         (u"\u3309", "ounce"),  # ounce
         (u"\u330a", "ohm"),  # Ohm
         (u"\u3349", "milli"),  # milli
         (u"\u3314", "kilo"),  # kilo
         (u"\u3315", "kilogram"),  # kilo gram
         (u"\u3316", "kilometer"),  # kilo metre
         (u"\u3322", "centi"),  # centi
         (u"\u334d", "meter"),  # metre
         (u"\u3318", "gram"),  # gram
         (u"\u3327", "ton"),  # ton
         (u"\u3303", "are"),  # are
         (u"\u3336", "hectare"),  # hect-are
         (u"\u337f", "Inc."),  # kabusiki kaisha
     )
     for source, expected in cases:
         actual = unidecode(source)
         self.assertEqual(actual, expected)
Ejemplo n.º 27
0
 def test_unidecode_squared_chars(self):
     """Check unidecode() on squared (single-cell) katakana words."""
     # Each entry is (source text, expected transliteration); the remark
     # names the word the katakana spells.
     cases = (
         ("\u3301", "alpha"),  # Alpha
         ("\u3302", "ampere"),  # Ampere
         ("\u3304", "inning"),
         ("\u3306", "won"),  # Won
         ("\u3307", "escudo"),
         ("\u3308", "acre"),  # Acre
         ("\u3309", "ounce"),  # ounce
         ("\u330a", "ohm"),  # Ohm
         ("\u3349", "milli"),  # milli
         ("\u3314", "kilo"),  # kilo
         ("\u3315", "kilogram"),  # kilo gram
         ("\u3316", "kilometer"),  # kilo metre
         ("\u3322", "centi"),  # centi
         ("\u334d", "meter"),  # metre
         ("\u3318", "gram"),  # gram
         ("\u3327", "ton"),  # ton
         ("\u3303", "are"),  # are
         ("\u3336", "hectare"),  # hect-are
         ("\u337f", "Inc."),  # kabusiki kaisha
     )
     for source, expected in cases:
         actual = unidecode(source)
         self.assertEqual(actual, expected)
Ejemplo n.º 28
0
def test_unidecode_kana():
    """Smoke test: transliterating code points 0x3000..0x30fe must not raise."""
    first, stop = 0x3000, 0x30ff
    for code in range(first, stop):
        # Only the absence of an exception matters; the result is ignored.
        unihandecode.unidecode(chr(code))
Ejemplo n.º 29
0
def test_unidecode_mathematical_digits():
    """Check that code points 0x1d7ce..0x1d7ff map to ASCII digits.

    The range holds 5 consecutive sequences of 0-9.
    """
    start = 0x1d7ce
    for code in range(start, 0x1d800):
        expected = chr(ord('0') + (code - start) % 10)
        actual = unihandecode.unidecode(chr(code))
        assert actual == expected
Ejemplo n.º 30
0
def test_unidecode_ascii():
    """Check that every ASCII character (0..127) is returned unchanged."""
    for code in range(0, 128):
        char = chr(code)
        actual = unihandecode.unidecode(char)
        # The code point is part of both tuples so a failure names it.
        assert (code, actual) == (code, char)
Ejemplo n.º 31
0
 def test_unidecode_ascii(self):
     """Check that every ASCII character (0..127) is returned unchanged."""
     for code in range(0, 128):
         char = chr(code)
         actual = unidecode(char)
         self.assertEqual(actual, char)
Ejemplo n.º 32
0
 def test_unidecode_ascii(self):
     """Check that every ASCII character (0..127) is returned unchanged."""
     for code in range(0, 128):
         char = chr(code)
         actual = unidecode(char)
         self.assertEqual(actual, char)
Ejemplo n.º 33
0
def test_unidecode_bmp():
    """Smoke test: transliterating any BMP code point must not raise."""
    bmp_size = 0x10000
    for code in range(0, bmp_size):
        # Only the absence of an exception matters; the result is ignored.
        unihandecode.unidecode(chr(code))