Example #1
def calculateToSimplified(toSimplified, noAutoConvert, title):
    # Explicit flags take precedence over auto-detection
    if toSimplified:
        return True
    if noAutoConvert:
        return False
    # Otherwise convert if the title contains any Chinese character that is
    # not already simplified
    for c in title:
        if isCN(c) and not hanzidentifier.is_simplified(c):
            return True
    return False
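
isCN is not defined in the snippet above; a minimal sketch consistent with how it is called (one character in, boolean out), assuming the CJK Unified Ideographs block is what counts as Chinese:

def isCN(c):
    # Hypothetical helper: True if c falls in the CJK Unified Ideographs
    # block (U+4E00..U+9FFF); the original may cover more blocks
    return '\u4e00' <= c <= '\u9fff'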
Example #2
def isSimplified(text):
    # Each character scores one point for being Chinese plus one point for
    # being simplified, so cn counts up to 2 points per character
    cn = sum([isCN(c) + hanzidentifier.is_simplified(c) for c in text])
    # Any Chinese character that is not simplified disqualifies the text
    for c in text:
        if isCN(c) and not hanzidentifier.is_simplified(c):
            return False
    # cn * 2 >= len(text) requires roughly a quarter of the characters to be
    # simplified Chinese
    return cn * 2 >= len(text)
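
For reference, how this heuristic behaves on a few hypothetical strings (reusing the isCN sketch from Example #1):

print(isSimplified('汉语'))   # True: both characters are simplified Chinese
print(isSimplified('漢語'))   # False: '漢' is Chinese but not simplified
print(isSimplified('abc'))    # False: cn is 0 and 0 * 2 >= 3 fails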
Example #3
class LangCharacters(object):

    encoding = 'utf-8'

    def __init__(self, encoding='utf-8'):
        self.encoding = encoding
        return

    #
    # Latin
    #

    # Latin Unicode Block as 'int' list
    UNICODE_BLOCK_ORDINAL_LATIN_BASIC = tuple( range(0x0041, 0x005A+1, 1) ) +\
                                        tuple( range(0x0061, 0x007A+1, 1) )
    # Convert to Python Unicode Type list
    UNICODE_BLOCK_LATIN_BASIC = tuple(
        [chr(ordinal) for ordinal in UNICODE_BLOCK_ORDINAL_LATIN_BASIC])
    # Can be used interchangeably
    UNICODE_BLOCK_LATIN_AZ = UNICODE_BLOCK_LATIN_BASIC

    # Latin Extended
    UNICODE_BLOCK_ORDINAL_LATIN_EXTENDED = tuple( range(0x00C0, 0x00F6+1, 1) ) +\
                                           tuple( range(0x00F8, 0x01BF+1, 1) ) +\
                                           tuple( range(0x01C4, 0x024F+1, 1) )
    UNICODE_BLOCK_LATIN_EXTENDED = tuple(
        [chr(ordinal) for ordinal in UNICODE_BLOCK_ORDINAL_LATIN_EXTENDED])

    # All Latin
    UNICODE_BLOCK_ORDINAL_LATIN_ALL = UNICODE_BLOCK_ORDINAL_LATIN_BASIC + UNICODE_BLOCK_ORDINAL_LATIN_EXTENDED
    UNICODE_BLOCK_LATIN_ALL = UNICODE_BLOCK_LATIN_BASIC + UNICODE_BLOCK_LATIN_EXTENDED

    # Latin characters specific to Vietnamese (many also appear in French,
    # Spanish, etc.); a subset of Latin Extended
    UNICODE_BLOCK_LATIN_VIETNAMESE =\
        tuple('ăâàằầảẳẩãẵẫáắấạặậêèềẻểẽễéếẹệìỉĩíịôơòồờỏổởõỗỡóốớọộợưùừủửũữúứụựđýỳỷỹỵ')
    # Can be used interchangeably
    UNICODE_BLOCK_LATIN_VI = UNICODE_BLOCK_LATIN_VIETNAMESE
    UNICODE_BLOCK_LATIN_VI_AZ = UNICODE_BLOCK_LATIN_VI + UNICODE_BLOCK_LATIN_AZ

    #
    # CJK
    #
    UNICODE_BLOCK_ORDINAL_CJK_UNIFIED_IDEOGRAPHS = tuple(
        range(0x4E00, 0x9FFF + 1, 1))
    UNICODE_BLOCK_CJK_UNIFIED_IDEOGRAPHS =\
        tuple( [chr(ordinal) for ordinal in UNICODE_BLOCK_ORDINAL_CJK_UNIFIED_IDEOGRAPHS] )

    UNICODE_BLOCK_ORDINAL_CJK_UNIFIED_IDEOGRAPHS_EXT_A = tuple(
        range(0x3400, 0x4DBF + 1, 1))
    UNICODE_BLOCK_CJK_UNIFIED_IDEOGRAPHS_EXT_A =\
        tuple( [chr(ordinal) for ordinal in UNICODE_BLOCK_ORDINAL_CJK_UNIFIED_IDEOGRAPHS_EXT_A] )

    UNICODE_BLOCK_ORDINAL_CJK_UNIFIED_IDEOGRAPHS_EXT_B = tuple(
        range(0x20000, 0x2A6DF + 1, 1))
    UNICODE_BLOCK_CJK_UNIFIED_IDEOGRAPHS_EXT_B =\
        tuple( [chr(ordinal) for ordinal in UNICODE_BLOCK_ORDINAL_CJK_UNIFIED_IDEOGRAPHS_EXT_B] )

    UNICODE_BLOCK_ORDINAL_CJK_UNIFIED_IDEOGRAPHS_EXT_C = tuple(
        range(0x2A700, 0x2B73F + 1, 1))
    UNICODE_BLOCK_CJK_UNIFIED_IDEOGRAPHS_EXT_C =\
        tuple( [chr(ordinal) for ordinal in UNICODE_BLOCK_ORDINAL_CJK_UNIFIED_IDEOGRAPHS_EXT_C] )

    UNICODE_BLOCK_ORDINAL_CJK_UNIFIED_IDEOGRAPHS_EXT_D = tuple(
        range(0x2B740, 0x2B81F + 1, 1))
    UNICODE_BLOCK_CJK_UNIFIED_IDEOGRAPHS_EXT_D =\
        tuple( [chr(ordinal) for ordinal in UNICODE_BLOCK_ORDINAL_CJK_UNIFIED_IDEOGRAPHS_EXT_D] )

    UNICODE_BLOCK_ORDINAL_CJK_UNIFIED_IDEOGRAPHS_EXT_E = tuple(
        range(0x2B820, 0x2CEAF + 1, 1))
    UNICODE_BLOCK_CJK_UNIFIED_IDEOGRAPHS_EXT_E = \
        tuple( [chr(ordinal) for ordinal in UNICODE_BLOCK_ORDINAL_CJK_UNIFIED_IDEOGRAPHS_EXT_E] )

    UNICODE_BLOCK_ORDINAL_CJK_COMPATIBILITY_IDEOGRAPHS = tuple(
        range(0xF900, 0xFAFF + 1, 1))
    UNICODE_BLOCK_CJK_COMPATIBILITY_IDEOGRAPHS = \
        tuple( [chr(ordinal) for ordinal in UNICODE_BLOCK_ORDINAL_CJK_COMPATIBILITY_IDEOGRAPHS] )

    UNICODE_BLOCK_ORDINAL_CJK_COMPATIBILITY_IDEOGRAPHS_SUPP = tuple(
        range(0x2F800, 0x2FA1F + 1, 1))
    UNICODE_BLOCK_CJK_COMPATIBILITY_IDEOGRAPHS_SUPP = \
        tuple( [chr(ordinal) for ordinal in UNICODE_BLOCK_ORDINAL_CJK_COMPATIBILITY_IDEOGRAPHS_SUPP] )

    UNICODE_BLOCK_ORDINAL_CJK = UNICODE_BLOCK_ORDINAL_CJK_UNIFIED_IDEOGRAPHS +\
                        UNICODE_BLOCK_ORDINAL_CJK_UNIFIED_IDEOGRAPHS_EXT_A +\
                        UNICODE_BLOCK_ORDINAL_CJK_COMPATIBILITY_IDEOGRAPHS +\
                        UNICODE_BLOCK_ORDINAL_CJK_UNIFIED_IDEOGRAPHS_EXT_B +\
                        UNICODE_BLOCK_ORDINAL_CJK_UNIFIED_IDEOGRAPHS_EXT_C +\
                        UNICODE_BLOCK_ORDINAL_CJK_UNIFIED_IDEOGRAPHS_EXT_D +\
                        UNICODE_BLOCK_ORDINAL_CJK_UNIFIED_IDEOGRAPHS_EXT_E +\
                        UNICODE_BLOCK_ORDINAL_CJK_COMPATIBILITY_IDEOGRAPHS_SUPP
    UNICODE_BLOCK_CJK = tuple(
        [chr(ordinal) for ordinal in UNICODE_BLOCK_ORDINAL_CJK])
    # The blocks above are disjoint Unicode ranges, so UNICODE_BLOCK_CJK
    # should contain no repeated characters. To verify:
    # import collections
    # c = collections.Counter(UNICODE_BLOCK_CJK)
    # char_repeats = [x for x in c.keys() if c[x] > 1]
    # char_repeats_unicode = [hex(ord(x)) for x in char_repeats]
    # print(char_repeats)          # expected: []
    # print(char_repeats_unicode)  # expected: []

    # TODO
    #    Some interesting notes below
    #     Case 1: Simplified Chinese takes Precedence (all characters in Simplified Chinese are surely "simplified")
    #       Historically Never Simplified Characters still in "Traditional" Hanja/Kanji/Chinese
    #          hanzidentifier.is_simplified('入') = True
    #          hanzidentifier.is_simplified('口') = True
    #       meaning there is no traditional version to these characters at all
    #       For example, in Japan you will see this Kanji '入口' (entrance) everywhere, which is the same in
    #       simplified Chinese (China/Malaysia/Singapore), traditional Chinese (Taiwan/HK) and Hanja (Hangul 입구),
    #       with exactly the same meanings.
    #       This means we cannot use these Unicode blocks alone to decide the language, as Japanese Kanji,
    #       Hanja, and simplified/traditional Chinese all point to "simplified"
    #     Case 2: Combination vs Individual Characters
    #       Take the traditional word '辭退' (citui), and the simplified version '辞退'. If this is fed into code,
    #          hanzidentifier.is_simplified('辭退') = False
    #          hanzidentifier.is_simplified('辞退') = True
    #       But the interesting thing is the 2nd character '退' is the same in both traditional and simplified
    #          hanzidentifier.is_simplified('退') = True
    #       So this means that without first "tokenizing" the sentence, there is no way to tell.
    #       But this is chicken & egg, how to tokenize without first knowing the language?
    #       However if every character in a text is labeled "simplified", then it should be simplified Chinese
    #       & nothing else. But even this is not applicable to very short sentences like '入口'.
    # Looping over 80k symbols is fast enough, no need to worry
    # hz is assumed to be hanzidentifier, imported at module top as "import hanzidentifier as hz"
    try:
        UNICODE_BLOCK_ORDINAL_CJK_SIMPLIFIED = tuple(
            [u for u in UNICODE_BLOCK_ORDINAL_CJK if hz.is_simplified(chr(u))])
        UNICODE_BLOCK_CJK_SIMPLIFIED = tuple(
            [chr(ordinal) for ordinal in UNICODE_BLOCK_ORDINAL_CJK_SIMPLIFIED])

        UNICODE_BLOCK_ORDINAL_CJK_TRADITIONAL = tuple([
            u for u in UNICODE_BLOCK_ORDINAL_CJK
            if not hz.is_simplified(chr(u))
        ])
        UNICODE_BLOCK_CJK_TRADITIONAL = tuple([
            chr(ordinal) for ordinal in UNICODE_BLOCK_ORDINAL_CJK_TRADITIONAL
        ])
        # A set difference over UNICODE_BLOCK_CJK would also work, but does
        # not preserve the ordering of the tuple
        # UNICODE_BLOCK_CJK_TRADITIONAL = tuple( set(UNICODE_BLOCK_CJK) - set(UNICODE_BLOCK_CJK_SIMPLIFIED) )
    except Exception as ex:
        Log.warning(
            str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno) +
            ': Cannot get CJK simplified/traditional: ' + str(ex))
        UNICODE_BLOCK_ORDINAL_CJK_SIMPLIFIED = None
        UNICODE_BLOCK_CJK_SIMPLIFIED = None

        UNICODE_BLOCK_ORDINAL_CJK_TRADITIONAL = None
        UNICODE_BLOCK_CJK_TRADITIONAL = None

    #
    # Cyrillic
    UNICODE_BLOCK_ORDINAL_CYRILLIC = tuple(range(0x0400, 0x04FF + 1, 1))
    UNICODE_BLOCK_CYRILLIC = tuple(
        [chr(ordinal) for ordinal in UNICODE_BLOCK_ORDINAL_CYRILLIC])

    # Cyrillic Supplement
    # (Cyrillic letters for writing several minority languages,
    # including Abkhaz, Kurdish, Komi, Mordvin, Aleut, Azerbaijani,
    # and Jakovlev's Chuvash orthography)
    UNICODE_BLOCK_SUPPL_CYRILLIC = tuple(range(0x0500, 0x052F + 1, 1))
    UNICODE_BLOCK_SUPPL_CYR = tuple(
        [chr(supl) for supl in UNICODE_BLOCK_SUPPL_CYRILLIC])

    # Cyrillic Extended-A
    # (Cyrillic letters used in Old Church Slavonic texts)
    UNICODE_BLOCK_EXT_A_CYRILLIC = tuple(range(0x2DE0, 0x2DFF + 1, 1))
    UNICODE_BLOCK_EXT_A_CYR = tuple(
        [chr(supl) for supl in UNICODE_BLOCK_EXT_A_CYRILLIC])

    # Cyrillic Extended-B
    # (Cyrillic characters for writing Old Cyrillic and Old Abkhazian,
    # and combining numeric signs)
    UNICODE_BLOCK_EXT_B_CYRILLIC = tuple(range(0xA640, 0xA69F + 1, 1))
    UNICODE_BLOCK_EXT_B_CYR = tuple(
        [chr(supl) for supl in UNICODE_BLOCK_EXT_B_CYRILLIC])

    # Cyrillic Extended-C
    # (Cyrillic numerals)
    UNICODE_BLOCK_EXT_C_CYRILLIC = tuple(range(0x1C80, 0x1C8F + 1, 1))
    UNICODE_BLOCK_EXT_C_CYR = tuple(
        [chr(supl) for supl in UNICODE_BLOCK_EXT_C_CYRILLIC])

    # Cyrillic Phonetic Extensions
    UNICODE_BLOCK_PHON_CYRILLIC = tuple(range(0x1D2B, 0x1D78 + 1, 1))
    UNICODE_BLOCK_EXT_PHON_CYR = tuple(
        [chr(supl) for supl in UNICODE_BLOCK_PHON_CYRILLIC])

    # Cyrillic Combining Half Marks
    # (Unicode block containing diacritic mark parts for spanning multiple characters)
    UNICODE_BLOCK_HALF_MARKS_CYRILLIC = tuple(range(0xFE2E, 0xFE2F + 1, 1))
    UNICODE_BLOCK_HALF_MARKS_CYR = tuple(
        [chr(supl) for supl in UNICODE_BLOCK_HALF_MARKS_CYRILLIC])

    # UNICODE block for ALL cyrillic characters
    UNICODE_BLOCK_CYRILLIC_ALL = UNICODE_BLOCK_CYRILLIC + UNICODE_BLOCK_HALF_MARKS_CYR + \
                                 UNICODE_BLOCK_EXT_PHON_CYR + UNICODE_BLOCK_EXT_C_CYR + \
                                 UNICODE_BLOCK_EXT_B_CYR + UNICODE_BLOCK_EXT_A_CYR + \
                                 UNICODE_BLOCK_SUPPL_CYR

    #
    # Hangul
    #
    # This is the 11xx jamo code block; when a renderer sees a sequence of these
    # jamo, it combines them into the Hangul syllables of the block further below.
    # print(chr(0x110c) + chr(0x1161) + chr(0x1106) + chr(0x1169))
    UNICODE_BLOCK_ORDINAL_HANGUL_JAMO = tuple(range(0x1100, 0x11FF + 1, 1))
    UNICODE_BLOCK_HANGUL_JAMO = tuple(
        [chr(ordinal) for ordinal in UNICODE_BLOCK_ORDINAL_HANGUL_JAMO])
    # This is the 31xx Hangul compatibility jamo block; these jamo are printed
    # individually, without combining into Hangul syllables.
    # print(chr(0x3148) + chr(0x314f) + chr(0x3141) + chr(0x3157))
    UNICODE_BLOCK_ORDINAL_HANGUL_COMPATIBILITY_JAMO = tuple(
        range(0x3130, 0x318F + 1, 1))
    UNICODE_BLOCK_HANGUL_COMPATIBILITY_JAMO = tuple([
        chr(ordinal)
        for ordinal in UNICODE_BLOCK_ORDINAL_HANGUL_COMPATIBILITY_JAMO
    ])
    # This block is for Hangul syllables (or just Hangul), e.g. '한', '굴', '자', '모',
    # whereas the blocks above are for single jamo (자모, 字母, i.e. letters).
    UNICODE_BLOCK_ORDINAL_HANGUL_SYLLABLE = tuple(range(0xAC00, 0xD7AF + 1, 1))
    UNICODE_BLOCK_HANGUL_SYLLABLE = tuple(
        [chr(ordinal) for ordinal in UNICODE_BLOCK_ORDINAL_HANGUL_SYLLABLE])

    UNICODE_BLOCK_HANGUL_ALL_INCLUDING_SYLLABLE = \
        UNICODE_BLOCK_HANGUL_JAMO + UNICODE_BLOCK_HANGUL_COMPATIBILITY_JAMO + UNICODE_BLOCK_HANGUL_SYLLABLE

    #
    # Japanese Hiragana/Katakana
    #
    UNICODE_BLOCK_ORDINAL_HIRAGANA = tuple(range(0x3040, 0x309F + 1, 1))
    UNICODE_BLOCK_HIRAGANA = tuple(
        [chr(ordinal) for ordinal in UNICODE_BLOCK_ORDINAL_HIRAGANA])
    UNICODE_BLOCK_ORDINAL_KATAKANA = tuple(range(0x30A0, 0x30FF + 1, 1))
    UNICODE_BLOCK_KATAKANA = tuple(
        [chr(ordinal) for ordinal in UNICODE_BLOCK_ORDINAL_KATAKANA])

    UNICODE_BLOCK_HIRAGANA_KATAKANA = UNICODE_BLOCK_HIRAGANA + UNICODE_BLOCK_KATAKANA
    UNICODE_BLOCK_HIRAGANA_KATAKANA_KANJI = \
        UNICODE_BLOCK_HIRAGANA + UNICODE_BLOCK_KATAKANA + UNICODE_BLOCK_CJK

    #
    # Thai
    # From http://sites.psu.edu/symbolcodes/languages/asia/thai/thaichart/
    #
    UNICODE_BLOCK_ORDINAL_THAI_CONSONANTS = tuple(range(0x0E01, 0x0E2E + 1, 1))
    UNICODE_BLOCK_THAI_CONSONANTS = tuple(
        [chr(ordinal) for ordinal in UNICODE_BLOCK_ORDINAL_THAI_CONSONANTS])

    # The character ' ็' or chr(0x0E47) is unique: a consonant must appear before it and another consonant after it
    # ['ะ', 'ั', 'า', 'ำ', 'ิ', 'ี', 'ึ', 'ื', 'ุ', 'ู', 'ฺ', '็']
    UNICODE_BLOCK_ORDINAL_THAI_VOWELS_AFTER_CONSONANT = \
        tuple( range(0x0E30, 0x0E3A+1, 1) ) + tuple( range(0x0E47, 0x0E47+1, 1) )
    UNICODE_BLOCK_THAI_VOWELS_AFTER_CONSONANT =\
        tuple( [chr(ordinal) for ordinal in UNICODE_BLOCK_ORDINAL_THAI_VOWELS_AFTER_CONSONANT] )

    # The character ' ็' or chr(0x0E47) is unique: a consonant must appear before it and another consonant after it
    # ['เ', 'แ', 'โ', 'ใ', 'ไ', '็']
    UNICODE_BLOCK_ORDINAL_THAI_VOWELS_BEFORE_CONSONANT = \
        tuple( range(0x0E40, 0x0E44+1, 1) ) + tuple( range(0x0E47, 0x0E47+1, 1) )
    UNICODE_BLOCK_THAI_VOWELS_BEFORE_CONSONANT = \
        tuple( [chr(ordinal) for ordinal in UNICODE_BLOCK_ORDINAL_THAI_VOWELS_BEFORE_CONSONANT] )

    # Tone marks cannot start a word (same as the "vowels-after-consonant" above)
    # ['่', '้', '๊', '๋']
    UNICODE_BLOCK_ORDINAL_THAI_TONEMARKS = tuple(range(0x0E48, 0x0E4B + 1, 1))
    UNICODE_BLOCK_THAI_TONEMARKS = tuple(
        [chr(ordinal) for ordinal in UNICODE_BLOCK_ORDINAL_THAI_TONEMARKS])

    UNICODE_BLOCK_ORDINAL_THAI_NUMBERS = tuple(range(0x0E50, 0x0E59 + 1, 1))
    UNICODE_BLOCK_THAI_NUMBERS = tuple(
        [chr(ordinal) for ordinal in UNICODE_BLOCK_ORDINAL_THAI_NUMBERS])

    UNICODE_BLOCK_ORDINAL_THAI_SIGNS_PUNCTUATIONS = tuple( range(0x0E2F, 0x0E2F+1, 1) ) +\
                                                    tuple( range(0x0E45, 0x0E46+1, 1) ) +\
                                                    tuple( range(0x0E4C, 0x0E4F+1, 1) ) +\
                                                    tuple( range(0x0E5A, 0x0E5B+1, 1) )
    UNICODE_BLOCK_THAI_SIGNS_PUNCTUATIONS = tuple([
        chr(ordinal)
        for ordinal in UNICODE_BLOCK_ORDINAL_THAI_SIGNS_PUNCTUATIONS
    ])

    UNICODE_BLOCK_THAI = UNICODE_BLOCK_THAI_CONSONANTS +\
                         UNICODE_BLOCK_THAI_VOWELS_AFTER_CONSONANT +\
                         UNICODE_BLOCK_THAI_VOWELS_BEFORE_CONSONANT +\
                         UNICODE_BLOCK_THAI_TONEMARKS +\
                         UNICODE_BLOCK_THAI_NUMBERS +\
                         UNICODE_BLOCK_THAI_SIGNS_PUNCTUATIONS

    #
    # Punctuations, etc.
    #
    UNICODE_BLOCK_WORD_SEPARATORS =\
        tuple(u' ,!.?()[]:;"«»\'') + tuple(u'?。,()') + tuple([chr(0xFF0C),chr(0xFF01),chr(0xFF0E),chr(0xFF1F)])

    UNICODE_BLOCK_SENTENCE_SEPARATORS =\
        tuple(u' !.?') + tuple([chr(0xFF01),chr(0xFF0E),chr(0xFF1F)])
    #
    # Numbers: normal Latin and CJK halfwidth/fullwidth
    #
    UNICODE_BLOCK_ORDINAL_NUMBERS = tuple(range(
        0x0030, 0x0039 + 1, 1)) + tuple(range(0xFF10, 0xFF19 + 1, 1))
    UNICODE_BLOCK_NUMBERS = tuple(
        [chr(ordinal) for ordinal in UNICODE_BLOCK_ORDINAL_NUMBERS])

    #
    # Punctuations Only (Half-Width & Full-Width Forms)
    #
    UNICODE_BLOCK_ORDINAL_PUNCTUATIONS = tuple(range(0x0000, 0x007F+1, 1)) +\
                                         tuple(range(0x2000, 0x206F+1, 1)) +\
                                         tuple(range(0x3000, 0x303F+1, 1)) +\
                                         tuple(range(0xFF00, 0xFF0F+1, 1)) +\
                                         tuple(range(0xFF1A, 0xFF20+1, 1)) +\
                                         tuple(range(0xFF3B, 0xFF40+1, 1)) +\
                                         tuple(range(0xFF5B, 0xFF65+1, 1))
    UNICODE_BLOCK_PUNCTUATIONS = tuple(
        [chr(ordinal) for ordinal in UNICODE_BLOCK_ORDINAL_PUNCTUATIONS])
    # Remove non-punctuations from original list of punctuations
    UNICODE_BLOCK_PUNCTUATIONS = tuple(
        set(UNICODE_BLOCK_PUNCTUATIONS) - set(UNICODE_BLOCK_LATIN_ALL))
    UNICODE_BLOCK_PUNCTUATIONS = tuple(
        set(UNICODE_BLOCK_PUNCTUATIONS) - set(UNICODE_BLOCK_WORD_SEPARATORS))
    UNICODE_BLOCK_PUNCTUATIONS = tuple(
        set(UNICODE_BLOCK_PUNCTUATIONS) -
        set(UNICODE_BLOCK_SENTENCE_SEPARATORS))
    UNICODE_BLOCK_PUNCTUATIONS = tuple(
        set(UNICODE_BLOCK_PUNCTUATIONS) - set(UNICODE_BLOCK_NUMBERS))

    #
    # Get the valid Unicode Block for a given language
    #
    @staticmethod
    def get_language_charset(lang):
        lang_std = lf.LangFeatures.map_to_correct_lang_code_iso_639_1_or_3(
            lang_code=lang)

        # Compare against the standardized code throughout, otherwise the
        # mapping above is bypassed
        if lang_std in [lf.LangFeatures.LANG_EN, lf.LangFeatures.LANG_VI]:
            return LangCharacters.UNICODE_BLOCK_LATIN_ALL
        elif lang_std == lf.LangFeatures.LANG_ZH:
            return LangCharacters.UNICODE_BLOCK_CJK
        elif lang_std == lf.LangFeatures.LANG_TH:
            return LangCharacters.UNICODE_BLOCK_THAI
        elif lang_std == lf.LangFeatures.LANG_KO:
            return LangCharacters.UNICODE_BLOCK_HANGUL_ALL_INCLUDING_SYLLABLE
        elif lang_std == lf.LangFeatures.LANG_JA:
            return LangCharacters.UNICODE_BLOCK_HIRAGANA_KATAKANA_KANJI
        else:
            return []

    @staticmethod
    def get_alphabet_charset(alphabet):
        #
        # Latin Type Blocks (English, Spanish, French, Vietnamese, etc.)
        # TODO Break into other language variants (done)
        #
        if alphabet == lf.LangFeatures.ALPHABET_LATIN_AZ:
            return LangCharacters.UNICODE_BLOCK_LATIN_AZ
        elif alphabet == lf.LangFeatures.ALPHABET_LATIN_VI:
            # UNICODE_BLOCK_LATIN_VI is an alias of UNICODE_BLOCK_LATIN_VIETNAMESE,
            # so return it once instead of concatenating the same tuple twice
            return LangCharacters.UNICODE_BLOCK_LATIN_VI
        elif alphabet == lf.LangFeatures.ALPHABET_LATIN_VI_AZ:
            return LangCharacters.UNICODE_BLOCK_LATIN_VI + LangCharacters.UNICODE_BLOCK_LATIN_AZ
        elif alphabet == lf.LangFeatures.ALPHABET_LATIN:
            return LangCharacters.UNICODE_BLOCK_LATIN_ALL

        # Latin type blocks: French, Czech, German, Spanish, English
        elif alphabet in (
                lf.LangFeatures.ALPHABET_LATIN_FR,
                lf.LangFeatures.ALPHABET_LATIN_CZECH,
                lf.LangFeatures.ALPHABET_LATIN_GERMAN,
                lf.LangFeatures.ALPHABET_LATIN_SPANISH,
                lf.LangFeatures.ALPHABET_LATIN_ENG,
        ):
            return LangCharacters.UNICODE_BLOCK_LATIN_ALL

        # CJK Type Blocks (Korean, Chinese, Japanese)
        # TODO Break into Chinese variants (simplified, traditional, etc.),
        #   Japanese, Hanja, etc. (done)

        elif alphabet == lf.LangFeatures.ALPHABET_HANGUL:
            return LangCharacters.UNICODE_BLOCK_HANGUL_ALL_INCLUDING_SYLLABLE
        elif alphabet == lf.LangFeatures.ALPHABET_CJK:
            return LangCharacters.UNICODE_BLOCK_CJK
        elif alphabet == lf.LangFeatures.ALPHABET_CJK_SIMPLIFIED:
            return LangCharacters.UNICODE_BLOCK_CJK_SIMPLIFIED
        elif alphabet == lf.LangFeatures.ALPHABET_CJK_TRADITIONAL:
            return LangCharacters.UNICODE_BLOCK_CJK_TRADITIONAL
        elif alphabet == lf.LangFeatures.ALPHABET_HIRAGANA_KATAKANA:
            return LangCharacters.UNICODE_BLOCK_HIRAGANA_KATAKANA
        elif alphabet == lf.LangFeatures.ALPHABET_JAPANESE:
            return LangCharacters.UNICODE_BLOCK_HIRAGANA_KATAKANA_KANJI
        #
        # Cyrillic Blocks (Russian, Belarusian, Ukrainian, etc.)
        #
        elif alphabet == lf.LangFeatures.ALPHABET_CYRILLIC:
            return LangCharacters.UNICODE_BLOCK_CYRILLIC_ALL
        #
        # Other Blocks
        #
        elif alphabet == lf.LangFeatures.ALPHABET_THAI:
            return LangCharacters.UNICODE_BLOCK_THAI
        # Unknown alphabets fall through to an explicit None
        return None

    @staticmethod
    def get_alphabet_charset_all():
        alphabet_dict = {}
        for alp in lf.LangFeatures.ALPHABETS_ALL:
            alphabet_dict[alp] = LangCharacters.get_alphabet_charset(
                alphabet=alp)
        return alphabet_dict

    #
    # Given a list of allowed Unicode characters and a string, returns the
    # string with only the allowed characters kept
    #
    def filter_allowed_characters(self,
                                  unicode_list,
                                  s,
                                  include_word_separators=True,
                                  include_sentence_separators=True,
                                  include_numbers=True,
                                  include_punctuations=True):
        # Just in case user passes in the immutable tuples
        allowed_list = list(unicode_list).copy()

        if include_word_separators:
            allowed_list += LangCharacters.UNICODE_BLOCK_WORD_SEPARATORS
        if include_sentence_separators:
            allowed_list += LangCharacters.UNICODE_BLOCK_SENTENCE_SEPARATORS
        if include_numbers:
            allowed_list += [c for c in '0123456789']
        if include_punctuations:
            allowed_list += LangCharacters.UNICODE_BLOCK_PUNCTUATIONS

        str_new = [c for c in s if (c in allowed_list)]
        return ''.join(str_new)

    #
    # Returns whether a string is written as normal Vietnamese (a mix of basic
    # and extended Latin) or purely in basic Latin (Vietnamese is commonly
    # typed without diacritics, using basic Latin only)
    #
    def get_vietnamese_type(self, s):
        # In Python 3 the input string is already Unicode, so no conversion is needed

        # First we remove the punctuations, numbers, etc.
        remove_block = LangCharacters.UNICODE_BLOCK_PUNCTUATIONS + LangCharacters.UNICODE_BLOCK_NUMBERS + \
                       LangCharacters.UNICODE_BLOCK_WORD_SEPARATORS + LangCharacters.UNICODE_BLOCK_SENTENCE_SEPARATORS
        ss = u''
        for i in range(0, len(s), 1):
            if s[i] not in remove_block:
                ss = ss + s[i]

        is_latin_basic_count = 0
        is_latin_extended_viet_count = 0
        for c in ss:
            latin_basic = c in LangCharacters.UNICODE_BLOCK_LATIN_BASIC
            latin_extended = c in LangCharacters.UNICODE_BLOCK_LATIN_EXTENDED
            # print ( c + " Latin Basic = " + str(latin_basic) + ", Latin Extended = " + str(latin_extended) )
            is_latin_basic_count += 1 * latin_basic
            is_latin_extended_viet_count += 1 * latin_extended

        # Guard against division by zero when nothing remains after filtering
        if len(ss) == 0:
            return "other"

        latin_basic_percent = is_latin_basic_count / float(len(ss))
        latin_extended_viet_percent = is_latin_extended_viet_count / float(len(ss))

        if (latin_basic_percent + latin_extended_viet_percent) > 0.5:
            if latin_basic_percent > 0.98:
                return "latin.basic"
            else:
                if latin_extended_viet_percent > 0.1:
                    return "latin.viet"
                else:
                    return "latin.mix"
        else:
            return "other"

    #
    # Converts a string into a single number, for cases where working with
    # numbers is more convenient. The number is not necessarily unique.
    #
    def convert_string_to_number(self, s, verbose=0):

        lang = None
        syllable_end = [False] * len(s)

        if len(s) > 1 and s[0] in self.UNICODE_BLOCK_THAI:
            # For Thai, we don't calculate the last syllable character, since that character is highly prone
            # to error. E.g. ส-วัด-ดี (สวัสดี) or ปัน-หา (ปัญหา). This is also our method of Thai spelling correction.
            # Characters that can never be start of syllable
            not_start_syllable_char = self.UNICODE_BLOCK_THAI_VOWELS_AFTER_CONSONANT + \
                                      self.UNICODE_BLOCK_THAI_TONEMARKS
            lang = lf.LangFeatures.LANG_TH
            char_prev = s[0]
            for i in range(1, len(s) - 1, 1):
                char_prev = s[i - 1]
                char_cur = s[i]

                # This character can never be start of syllable
                if char_cur not in not_start_syllable_char:
                    continue

                char_next = s[i + 1]
                # Case of 'เดือน', 'เมื่อ', 'เลข', etc.
                if char_cur in self.UNICODE_BLOCK_THAI_VOWELS_BEFORE_CONSONANT:
                    syllable_end[i - 1] = True
                elif char_cur in self.UNICODE_BLOCK_THAI_CONSONANTS:
                    # Case of 'การ', 'เดือน', 'ดารา' etc.
                    if (char_next in self.UNICODE_BLOCK_THAI_VOWELS_AFTER_CONSONANT) and \
                            (char_prev not in self.UNICODE_BLOCK_THAI_VOWELS_BEFORE_CONSONANT):
                        syllable_end[i - 1] = True
                    # Case of 'งง', 'สด', etc.
                    # elif ( char_prev in LangCharacters.UNICODE_BLOCK_THAI_TONEMARKS ):
                    #    syllable_end[i-1] = True

            # Last character is always end of syllable
            syllable_end[len(s) - 1] = True

            if verbose >= 1:
                sylsepstring = ''
                for i in range(0, len(s), 1):
                    sylsepstring = sylsepstring + s[i]
                    if syllable_end[i]:
                        sylsepstring = sylsepstring + ' '
                print(sylsepstring)

        x = 0
        index = 1
        # A string "abc" will be calculated as (97 + 2*98 + 3*99), Unicode for 'a' is 97, 'b' is 98, 'c' is 99
        for i in range(0, len(s), 1):
            # We don't include a syllable ending consonant for Thai in the measure, since this character is prone
            # to spelling mistakes
            ignore = False
            if lang == lf.LangFeatures.LANG_TH:
                if s[i] in LangCharacters.UNICODE_BLOCK_THAI_CONSONANTS and syllable_end[i]:
                    # print('Ignore ' + s[i])
                    ignore = True
            if not ignore:
                un = ord(s[i])
                # print('Index ' + str(index) + ', ' + s[i] + ', ' + str(un))
                x = x + index * un
                index = index + 1

        return x
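
A hedged usage sketch for the class above (the 'th' code and the expected outputs are illustrative; the lf module providing LangFeatures is assumed, as in the source):

lc = LangCharacters()

# Look up the charset for a language, then keep only allowed characters
charset_th = LangCharacters.get_language_charset('th')
clean = lc.filter_allowed_characters(charset_th, 'สวัสดี hello 123')

# Rough Vietnamese-orthography classification
print(lc.get_vietnamese_type('xin chào'))   # expected: 'latin.viet'

# Positional hash from the comment above: 'abc' -> 97 + 2*98 + 3*99 = 590
print(lc.convert_string_to_number('abc'))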
Example #4
def parse_sentence_file(
    filename,
    source,
    target,
    sentences,
    nonchinese_sentences,
    intermediate_ids,
    enable_jyutping,
    enable_pinyin,
):
    print("Parsing sentence file...")
    with open(filename, "r", encoding="utf8") as f:
        for index, line in enumerate(f):
            if len(line.strip()) == 0 or line[0] == "#":
                continue

            split = line.split()
            lang = split[1]
            sentence_start = line.index("\t", line.index("\t") + 1) + 1
            sentence = line[sentence_start:].strip()

            sentence_id = split[0]

            if lang == source:
                if hanzidentifier.is_simplified(sentence):
                    trad = HanziConv.toTraditional(sentence)
                    simp = sentence
                else:
                    trad = sentence
                    simp = HanziConv.toSimplified(sentence)
                pin = ""
                if enable_pinyin:
                    pin = " ".join(
                        lazy_pinyin(
                            trad, style=Style.TONE3, neutral_tone_with_five=True
                        )
                    ).lower()
                    pin = pin.strip().replace("v", "u:")
                jyut = ""
                if enable_jyutping:
                    jyut = pinyin_jyutping_sentence.jyutping(
                        trad, tone_numbers=True, spaces=True
                    )
                sentence_row = objects.ChineseSentence(
                    sentence_id,
                    trad,
                    simp,
                    pin,
                    jyut,
                    lang,
                )

                sentences[sentence_id] = sentence_row
                continue

            if lang == target:
                # sentence was already stripped above
                sentence_translation = objects.NonChineseSentence(
                    sentence_id, sentence, lang
                )
                nonchinese_sentences[sentence_id] = sentence_translation
                continue

            intermediate_ids.add(sentence_id)
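
The slicing above implies a Tatoeba-style layout where each line is "id<TAB>lang<TAB>sentence". A minimal calling sketch (the file name and language codes are hypothetical; objects, HanziConv and the pinyin/jyutping imports are assumed from the surrounding module):

sentences = {}
nonchinese_sentences = {}
intermediate_ids = set()

parse_sentence_file(
    "sentences.csv",            # hypothetical Tatoeba sentence export
    source="cmn",               # lines in this language become ChineseSentence rows
    target="eng",               # lines in this language become NonChineseSentence rows
    sentences=sentences,
    nonchinese_sentences=nonchinese_sentences,
    intermediate_ids=intermediate_ids,
    enable_jyutping=False,
    enable_pinyin=True,
)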
Example #5
def test_is_simplified(self):
    self.assertTrue(hanzidentifier.is_simplified(SIMPLIFIED))
    self.assertFalse(hanzidentifier.is_simplified(MIXED))
    self.assertTrue(hanzidentifier.is_simplified(BOTH))
    self.assertFalse(hanzidentifier.is_simplified(TRADITIONAL))
    self.assertFalse(hanzidentifier.is_simplified(UNKNOWN))
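
The fixtures are not shown in this snippet; hypothetical values consistent with hanzidentifier's categories (SIMPLIFIED, TRADITIONAL, BOTH, MIXED, UNKNOWN) might be:

import hanzidentifier

SIMPLIFIED = '汉语'    # simplified-only characters
TRADITIONAL = '漢語'   # traditional-only characters
BOTH = '你好'          # identical in both scripts, so identify() reports BOTH
MIXED = '汉語'         # simplified '汉' next to traditional '語'
UNKNOWN = 'hello'      # no Chinese characters at all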
Example #6
import sys
import csv
import hanzidentifier
import re
import random

tsv_file = open("Chinese.txt")
read_tsv = csv.reader(tsv_file, delimiter="\t")

words = {}
characters = {}

for row in read_tsv:
    w = re.sub(r"\s+", "", row[1])
    # w = row[1]
    if len(w) > 0 and hanzidentifier.is_simplified(w):
        if len(w) == 1:
            characters[w] = 0
        elif len(w) < 5:
            words[w] = 0

tsv_file = open("skritter-export.tsv")
read_tsv = csv.reader(tsv_file, delimiter="\t")

skritter = {}

for row in read_tsv:
    if len(row) < 2:
        continue
    w = re.sub(r"\s+", "", row[1])
    # w = row[1]