Exemple #1
0
def get_simp_chars(char_data, Char, DTradChars=None):
    """
    Collect candidate simplified/variant forms of `Char`.

    `Char` may be an int (converted with `w_unichr`) or a string.

    When `DTradChars` is truthy, the Unihan variant properties listed
    below are looked up through `char_data.raw_data` and every returned
    code point is added as a candidate. The result of the
    'Chinese Traditional-Simplified' transliteration engine is always
    appended afterwards. Finally, again only when `DTradChars` is truthy,
    any candidate contained in `DTradChars` is dropped.

    Returns `rem_dupes()` applied to the candidate list.
    """
    if type(Char) is int:
        Char = w_unichr(Char)

    candidates = []

    if DTradChars:
        # convert the converted characters/radicals Traditional-Simplified
        variant_keys = (
            'unihan.simplifiedvariant',
            'unihan.semanticvariant',  # CHECK ME!
            'unihan.specializedsemanticvariant',  # CHECK ME!
            'unihan.zvariant',
        )

        for variant_key in variant_keys:
            codepoints = char_data.raw_data(variant_key, w_ord(Char))
            if not codepoints:
                continue

            candidates.extend(w_unichr(cp) for cp in codepoints)
            # Debug trace of what has been gathered so far
            print(('Char:', Char.encode('utf-8'), 'LSubChars:', ''.join(candidates).encode('utf-8'), 'V:', codepoints))

    # Use the transliteration system to convert from T-S.
    # (Imported lazily, inside the function.)
    from multi_translit.translit.my_engine.TranslitEngine import get_engine
    engine = get_engine('Chinese Traditional-Simplified')
    candidates.append(engine.convert(Char))

    # HACK: Make sure characters that are already
    # Traditional aren't converted so information isn't lost!
    # NOTE: This shouldn't be performed for RADICAL conversion,
    # just the CHARS as it can stuff up the radical lists :-(
    if DTradChars:
        candidates = [c for c in candidates if c not in DTradChars]

    return rem_dupes(candidates)
Exemple #2
0
def get_L_words(fISOCode, fVariant, Word, Deinflect=False):
    """
    Build the tuple of search forms to try for `Word` in a given language.

    The first element of the returned tuple is always `Word` itself; the
    remaining elements are alternative spellings, chosen by `fISOCode`:

    * 'cmn' / 'yue': when `Deinflect` is true or `Word` is longer than
      five characters, a second form produced by an ordered chain of
      letter substitutions; otherwise just `(Word,)`.
    * 'tha': `filter_thai_accents(Word)`.
    * 'jpn': `Word` with underscores removed, plus extra candidate forms
      (see the inline comments). This branch also prints the ';'-joined
      result, UTF-8 encoded, before returning.
    * anything else: `filter_accents(Word)`.

    `fVariant` is accepted but not used anywhere in this function.
    """
    if fISOCode in ('cmn', 'yue'):
        # NOTE: Filtering Chinese accents is probably a bad idea
        # as say "pin" could have multiple headers, so I think
        # it's best to filter the accents at e.g. a CEDict level
        # and sort by character frequency :-P

        # Replace commonly confused PinYin
        # combinations by Westerners :-P
        # TODO: Should this be in "Deinflect" (or a separate mode?)
        if Deinflect or len(Word) > 5:
            # TODO: What about "similar" mode? ---------------------------------------------
            # The replacements below run sequentially, so each one sees the
            # output of the previous ones (e.g. 'y' -> 'i' and then that
            # 'i' -> 'u'). Reordering them changes the result.
            R = Word.replace('r', 'l')
            # The two suffix rewrites happen before the global 'd' and 'z'
            # replacements further down.
            if R.endswith('dz'): R = R[:-2] + 'zi'  # Yale Handz (Hanzi) HACK!
            if R.endswith('z'): R = R[:-1] + 'i'
            R = R.replace('e', 'a')
            R = R.replace('o', 'u')
            R = R.replace('d', 't')
            R = R.replace('y', 'i')
            # OPEN ISSUE: Should this be -> ts?
            R = R.replace(
                'i', 'u')  # CONTROVERSIAL - PinYin "i" often sounds like "u"!

            # replace "ch"-related sounds
            R = R.replace('ch', 'q')
            R = R.replace('j', 'q')
            R = R.replace('zh', 'q')
            R = R.replace('sh', 'q')  # CONTROVERSIAL!
            R = R.replace('x', 'q')  # OPEN ISSUE: Should this be "s" or "c"?

            # replace "ts"-related sounds
            #R = R.replace('c', 's') # CONTROVERSIAL!
            R = R.replace('z',
                          'c')  # Sometimes sounds like "ts" as in "Hanzi"?
            R = R.replace('ts', 'c')

            # Runs last, so only an 's' that survived the 'sh' and 'ts'
            # replacements above is affected here.
            R = R.replace('s', 'q')
            return (Word, R)  #remove_tones(Word))
        else:
            return (Word, )

    #elif fISOCode == 'vie':
    #return (Word, filter_vie_accents(Word, False), filter_vie_accents(Word, True))
    elif fISOCode == 'tha':
        return (Word, filter_thai_accents(Word))
    elif fISOCode == 'jpn':
        # Add Katakana -> Hiragana in no accents mode
        # (the conversion itself is currently disabled; iConv is just Word)
        iConv = Word  # ''.join([Conv(i) for i in Word])
        # Use romaji for more accurate similar results
        #Latin = HiraToRomaji(KataToRomaji(iConv))
        # TODO: What if the Latin is WRONG?
        #Latin = '^%s^' % Latin # Make sure Latin titles aren't confused!

        # Underscores are removed to fix startswith/endswith queries
        # (presumably '_' stands in for a space here - TODO confirm)
        NoSpaces = iConv.replace('_', '')
        LRtn = (
            Word,
            NoSpaces,
            #Latin.lower().replace('_', '')
        )

        # Keep the unmodified form; the Deinflect branch below uses it.
        oNoSpaces = NoSpaces
        if not Deinflect and NoSpaces:
            # Masculine Japanese HACK -
            # Converts "uzee" into "uzai" etc
            #print 'NoSpaces:', NoSpaces.encode('utf-8')

            NoSpaces = unicodedata.normalize('NFC', str(NoSpaces))  # HACK!
            # Override is computed before the replacements below rewrite
            # the characters it tests for.
            Override = NoSpaces[-1] in 'ーぇ' or 'しぇ' in NoSpaces

            # Fix shenshei (sensei) as used in Lucky Star (if I recall correctly) :-P
            NoSpaces = NoSpaces.replace('しぇ', 'せ')

            NoSpaces = NoSpaces.replace('ー', 'え')
            NoSpaces = NoSpaces.replace('ぇ', 'え')
            # Koeeee (Kowai) HACK!
            # Trim trailing characters until at most two 'え' remain at the end
            while NoSpaces[-3:] == 'えええ':
                NoSpaces = NoSpaces[:-1]
            # Fix Sugeee (Sugoi) etc
            # NOTE(review): NoSpaces[1] is the second character counted from
            # the START of the string, not from the end - confirm intended.
            while NoSpaces[-2:] == 'ええ' and NoSpaces[1] != 'え':
                NoSpaces = NoSpaces[:-1]

            if len(NoSpaces) > 1 and (
                (NoSpaces[-1] == 'え' and NoSpaces[-2] in DAlt) or Override):
                # DAlt is defined outside this function; it is indexed by a
                # single character and the value is iterated below
                # (presumably alternative characters - TODO confirm).
                if NoSpaces[-2] in DAlt:
                    LAltChars = DAlt[NoSpaces[-2]]
                else:
                    # Only reachable via Override: reuse the character as-is
                    LAltChars = NoSpaces[-2]

                for AltChar in LAltChars:
                    # Drop the last two characters and put AltChar in their place
                    Masculine = '%s%s' % (NoSpaces[:-2], AltChar)
                    Masculine = unicodedata.normalize('NFD',
                                                      Masculine)  # HACK!
                    Masculine = Masculine.replace(
                        'っ', '')  # Get rid of 'dekke-' (dekai) etc :-P

                    # NOTE(review): Masculine can be empty at this point
                    # (two-character NoSpaces whose AltChar is 'っ'), in
                    # which case Masculine[0] raises IndexError - confirm
                    # whether such input can occur.
                    if is_hanzi(Masculine[0]):
                        # Fix [Ko]wai -> [Kowa]i when first character Kanji
                        # KanjiForm is Masculine without its second character
                        KanjiForm = Masculine[0] + Masculine[2:]
                        LRtn += ('%sい' % Masculine, '%sい' % KanjiForm)
                    else:
                        LRtn += ('%sい' % Masculine, )

        if Deinflect and NoSpaces:
            # In deinflect mode, look up possible stems in the character data
            from mscSentenceJpn import IsKana  # HACK!
            # Only applies when the first character passes is_hanzi() and
            # the remainder passes IsKana()
            if is_hanzi(oNoSpaces[0]) and IsKana(oNoSpaces[1:]):
                LKun = CharData.raw_data('Japanese Kun', w_ord(oNoSpaces[0]))
                if LKun:
                    # Only the first entry is used: NFD-normalized, '-'
                    # removed, then split on spaces into individual readings
                    LKun = unicodedata.normalize('NFD', LKun[0]).replace(
                        '-', '').split(' ')
                    # For readings containing '.', append the segment after
                    # the first '.' to the leading character
                    LExtend = [
                        '%s%s' % (oNoSpaces[0], i.split('.')[1]) for i in LKun
                        if '.' in i
                    ]
                    # Pair each candidate with fastdist.distance() against
                    # the input and sort by that value, largest first
                    LExtend = [(fastdist.distance(oNoSpaces, i), i)
                               for i in LExtend]
                    LExtend.sort(key=lambda x: -x[0])
                    LRtn += tuple([i[1] for i in LExtend])

        # Debug output of every generated form
        print((';'.join(LRtn).encode('utf-8')))
        return LRtn
    else:
        Rtn = (Word, filter_accents(Word))
        #print 'get_L_words RTN:', Rtn
        return Rtn
Exemple #3
0
def is_hanzi(S):
    """
    Return 1 if the code point of `S` (as given by `w_ord`) lies in
    U+4E00..U+9FFF or U+3400..U+4DBF, otherwise return 0.
    """
    codepoint = w_ord(S)
    in_main_range = 0x4E00 <= codepoint <= 0x9FFF
    in_second_range = 0x3400 <= codepoint <= 0x4DBF
    return 1 if (in_main_range or in_second_range) else 0
Exemple #4
0
 def _format_data(self, ord_, data):
     if len(data) > 1:
         i_ord = ' '.join([get_uni_point(w_ord(i)) for i in data])
         return '%s (%s)' % (data, i_ord)
     else:
         return None
Exemple #5
0
    def html_formatted(self, key, ord_):
        """
        Return the HTML-formatted value of property `key` for `ord_`.

        A string `ord_` is first converted with `w_ord`; the call is then
        delegated to the object returned by `get_class_by_property(key)`.
        """
        codepoint = w_ord(ord_) if isinstance(ord_, str) else ord_
        return self.get_class_by_property(key).html_formatted(codepoint)
Exemple #6
0
    def raw_data(self, key, ord_):
        """
        Return the raw value of property `key` for `ord_`.

        A string `ord_` is first converted with `w_ord`; the call is then
        delegated to the object returned by `get_class_by_property(key)`.
        """
        codepoint = w_ord(ord_) if isinstance(ord_, str) else ord_
        return self.get_class_by_property(key).raw_data(codepoint)
Exemple #7
0
    def open_radkfile(self):
        """
        Yield `('multi_radicals', kanji_ord, [radical_ord, ...])` for every
        entry of `rad_k_file.DKanji`, converting characters with `w_ord`.
        """
        unused_rads = rad_k_file.DRads  # ???? What about indexing????
        kanji_to_rads = rad_k_file.DKanji

        # Walk a snapshot of the items so the mapping is not iterated
        # directly while the generator is suspended between yields.
        for kanji, rad_chars in tuple(kanji_to_rads.items()):
            kanji_ord = w_ord(kanji)
            rad_ords = [w_ord(rad) for rad in rad_chars]
            yield 'multi_radicals', kanji_ord, rad_ords
Exemple #8
0
def open_kanjidic_2(path):
    """
    Stream a KANJIDIC2 XML file and yield one dict per character.

    `path` is opened with `gzip.open` when its extension is 'gz'
    (case-insensitive), otherwise as a plain binary file. The XML is
    consumed incrementally with `iterparse` on "end" events.

    A new record starts at every non-empty <literal> element; the record
    collected so far is yielded at that point, and the final record is
    yielded once the input is exhausted. Each record has:

    * 'codepoint': `w_ord()` of the stripped <literal> text
    * list-valued keys filled from the child elements: 'dicref_<dr_type>',
      'freq', 'grade', 'meaning' / 'meaning_<m_lang>', 'reading_nanori',
      'querycode_<qc_type>' (SKIP codes are dropped), 'rad_name',
      'rad_<rad_type>', 'reading_<r_type>', 'stroke_count',
      'crossref_<var_type>' and 'jlpt' (ints).

    Version/date header elements and unknown tags are only printed.

    The file is closed when parsing finishes, when parsing raises, or
    when the generator is closed before being exhausted.

    Raises ValueError if a <jlpt> element holds non-numeric text.
    """
    is_gzipped = path.split('.')[-1].lower() == 'gz'
    opener = gzip.open if is_gzipped else open

    D = {}
    # `with` (rather than a trailing f.close()) releases the handle even
    # if the consumer abandons the generator early or parsing fails.
    with opener(path, 'rb') as f:
        for _event, elem in iterparse(f, events=("end", )):
            tag = elem.tag
            text = elem.text

            if tag == 'literal':
                # Yield the existing character and change it
                if text:
                    if D:
                        yield D
                    D = {'codepoint': w_ord(text.strip())}

            elif tag in SIgnored:
                # Only children useful - ignored
                # But make sure it actually IS blank!
                if list(elem.keys()) or (text and text.strip()):
                    print(
                        ('tag Ignore Warning:', tag, list(elem.keys()), text))

            elif tag == 'database_version':
                # The Kanjidic database version
                # May as well print it
                print(('Kanjidic2 DB Version:', text))

            elif tag == 'date_of_creation':
                # Likewise
                print(('Kanjidic2 Date of Creation:', text))

            elif tag == 'file_version':
                # Likewise
                print(('f Version:', text))

            elif tag == 'cp_value':
                # Codepoint values can be easily grabbed by
                # str.encode('utf-8') or str.encode('shift-jis')
                # so they aren't recorded
                pass

            elif tag == 'dic_ref':
                # e.g. Morohashi references
                if not text or not text.strip():
                    # Blank references don't even create the key
                    continue

                dic_ref_type = 'dicref_%s' % elem.get('dr_type')
                value = text.strip()

                if dic_ref_type == 'dicref_moro':
                    # HACK: Convert to a string - see also `dicref_moro`
                    # in the `Indicies` variable in `IndiceBuilder`
                    value = '%s.%s.%s' % (
                        value, elem.get('m_vol', '0'), elem.get('m_page', '0')
                    )
                elif dic_ref_type == 'dicref_busy_people':
                    # HACK: a trailing '.A' is rewritten to '.0'
                    if value.endswith('.A'):
                        value = '%s.0' % (value[:-2])

                D.setdefault(dic_ref_type, []).append(value)

            elif tag == 'freq':
                # Record the Japanese frequency
                if text:
                    D.setdefault('freq', []).append(text.strip())

            elif tag == 'grade':
                # Record the Japanese grade
                if text:
                    D.setdefault('grade', []).append(text.strip())

            elif tag == 'meaning':
                # WARNING: Specific meanings might be grouped with specific
                # readings with rmgroup!
                # I don't think Kanjidic differentiates between them *yet*
                if 'm_lang' in elem.keys():
                    # Explicit target language
                    key = 'meaning_%s' % elem.get('m_lang')
                else:
                    # No m_lang attribute: per the KANJIDIC2 DTD this
                    # means English
                    key = 'meaning'

                # The list is created even when the element has no text
                meanings = D.setdefault(key, [])
                if text:
                    meanings.append(text.strip())

            elif tag == 'nanori':
                # Record the Japanese name readings
                nanori = D.setdefault('reading_nanori', [])
                if text:
                    nanori.append(text.strip())

            elif tag == 'q_code':
                # Input codes, e.g. SKIP and Four Corners
                # SKIP is removed for now for licensing reasons
                query_code_type = elem.get('qc_type')
                if query_code_type == 'skip':
                    continue

                codes = D.setdefault('querycode_%s' % query_code_type, [])
                if text:
                    codes.append(text.strip())

            elif tag == 'rad_name':
                # Record the radical's name (if the character used
                # as a radical) in Japanese
                # TODO: This should be a StringData!
                rad_names = D.setdefault('rad_name', [])
                if text:
                    # NOTE: unlike the other fields, this one is not stripped
                    rad_names.append(text)

            elif tag == 'rad_value':
                rad_values = D.setdefault('rad_%s' % elem.get('rad_type'), [])
                if text:
                    rad_values.append(text.strip())

            elif tag == 'reading':
                # The reading, e.g. pinyin/ja_on etc
                readings = D.setdefault('reading_%s' % elem.get('r_type'), [])
                if text:
                    readings.append(text.strip())

            elif tag == 'stroke_count':
                # The total Japanese stroke count
                if text:
                    D.setdefault('stroke_count', []).append(text.strip())

            elif tag == 'variant':
                # Variant forms of this character
                if text:
                    key = 'crossref_%s' % elem.get('var_type')
                    D.setdefault(key, []).append(text.strip())

            elif tag == 'jlpt':
                # JLPT level. An empty element has text None, which
                # would otherwise crash on .strip(), so guard against it.
                levels = D.setdefault('jlpt', [])
                if text and text.strip():
                    levels.append(int(text.strip()))

            else:
                print(('WARNING:', tag, elem))

    # A record is only emitted when the *next* <literal> is seen, so the
    # last character in the file has to be flushed here.
    if D:
        yield D