Ejemplo n.º 1
0
        def _resolve_lexemes(lexeme: str) -> (List[str], List[str], bool):
            """Resolve raw lexeme markup into the entry's kanji and its writings.

            Uses ``k1``..``k4`` (kanji codes from the enclosing scope) and
            ``self._get_kanji`` to turn codes into characters.  Digit
            placeholders in *lexeme* refer to those kanji; ``#NNN#`` pulls in
            an additional kanji by code; ``0`` is dropped and ``'`` becomes っ;
            ``^...`` segments are transliterated to katakana and the remainder
            to hiragana; ``[...]`` / ``(...)`` mark optional spans that yield
            two writings (with and without the span).

            Returns a tuple ``(kanji used by the entry, list of writings,
            flag)``; the flag is True only for the ``(...)`` branch.  The
            ``(List[str], List[str], bool)`` annotation is informal shorthand
            for that tuple.
            """
            # Numbered slots for the entry's four possible kanji.
            kanji = [('1', self._get_kanji(k1)), ('2', self._get_kanji(k2)), ('3', self._get_kanji(k3)),
                     ('4', self._get_kanji(k4))]
            if not lexeme:
                # No markup at all: each kanji separately plus their concatenation.
                return [k[1] for k in kanji if k[1]], [''.join([k[1] for k in kanji])], False
            if not [kj for kj in lexeme if kj.isnumeric() and kj != '0']:
                # No kanji placeholders present: implicitly reference all four.
                lexeme += '1234'

            # Literal replacements applied after placeholder substitution below.
            other = {'0': '', "'": 'っ'}

            additional_kanji = re.findall(r'#(\d+)#', lexeme)
            for add_kj in additional_kanji:
                # NOTE(review): len(kanji) is an int while the seeded slot keys
                # are str digits, so the equality test against lexeme[cur_pos]
                # below can never match these extra slots — confirm intent.
                kanji.append((len(kanji), self._get_kanji(add_kj)))
            kanji_for_entry = [k[1] for k in kanji if k[1]]
            lexeme = re.sub(r'(#\d+#)', '', lexeme)

            # Walk the lexeme, replacing digit placeholders with queued kanji
            # (consumed front to back); unconsumed kanji are appended at the end.
            lex_temp = ''
            cur_pos = 0
            while True:
                if cur_pos == len(lexeme):
                    lexeme = lex_temp + ''.join([kj[1] for kj in kanji])
                    break
                if lexeme[cur_pos].isnumeric():
                    if kanji and int(lexeme[cur_pos]) > int(kanji[0][0]):
                        # Placeholder skips ahead: flush every queued kanji up
                        # to and including the referenced slot.
                        for i in range(int(lexeme[cur_pos]), int(kanji[0][0]) - 1, -1):
                            if not kanji:
                                break
                            lex_temp += kanji[0][1]
                            kanji = kanji[1:]
                        cur_pos += 1
                    elif kanji and lexeme[cur_pos] == kanji[0][0]:
                        lex_temp += kanji[0][1]
                        kanji = kanji[1:]
                        cur_pos += 1
                    else:
                        cur_pos += 1
                        continue
                else:
                    lex_temp += lexeme[cur_pos]
                    cur_pos += 1

            for key, value in other.items():
                lexeme = lexeme.replace(key, value)

            if '^' in lexeme:
                # '^'-marked segments become katakana; everything else hiragana.
                lexeme = re.sub(r'\^([\w:]+)@?', lambda x: _latin_to_katakana(x.group(1)), lexeme)
                lexeme = re.sub(r'([\w:]*)', lambda x: _latin_to_hiragana(x.group(1)), lexeme)
            else:
                lexeme = _latin_to_hiragana(lexeme)

            if '[' in lexeme:
                # Optional span: one writing without it, one with it inlined.
                return kanji_for_entry, [re.sub(r'(\[(.*?)\])', '', lexeme).strip(),
                                         re.sub(r'\[(.*?)\]', r'\1', lexeme).strip()], False
            elif '(' in lexeme:
                return kanji_for_entry, [re.sub(r'(\((.*?)\))', '', lexeme).strip(),
                                         re.sub(r'\((.*?)\)', r'\1', lexeme).strip()], True

            return kanji_for_entry, [lexeme.strip()], False
Ejemplo n.º 2
0
 def _normalize_kana(self, key: str, mode: str) -> str:
     if mode == 'right':
         if self._transliterate_collocations:
             return self._collocations_right[key]
         else:
             return _latin_to_hiragana(self._collocations_right[key].replace('wa', 'ha').replace(' ', ''))
     else:
         if self._transliterate_collocations:
             return self._collocations_left[key]
         else:
             return _latin_to_hiragana(self._collocations_left[key].replace('wa', 'ha').replace(' ', ''))
Ejemplo n.º 3
0
        def _resolve_readings(reading: str, variable: bool, hyphens: str) -> List[str]:
            def _resolve_hh(reading: str, hyphens: str) -> str:
                for h_pos in re.findall(r'\d+', hyphens):
                    if 'hh' not in reading[:int(h_pos)] + reading[int(h_pos) + 1:]:
                        reading = reading[:int(h_pos)] + reading[int(h_pos) + 1:]
                return reading

            reading = re.sub(r'(Q\d)', '', reading)
            reading = reading.replace('$', '')
            reading = reading.replace('L1', '').replace('L2', '')
            reading = reading.replace('=', '')
            reading = reading.replace(' ', '')

            if variable:
                split_reading = reading.split('*(*')
                full_reading = [r for r in split_reading[0].split('*') if r]
                if len(split_reading) > 1:
                    base_reading = [r for r in split_reading[1].split('*') if r][-len(full_reading):]
                else:
                    base_reading = []
                res = full_reading + base_reading
            else:
                res = [r for r in reading.split('*(*')[0].split('*')]

            if 'hh' in res[0]:
                for i in range(0, len(res)):
                    res[i] = _resolve_hh(res[i], hyphens)

            return sorted([_latin_to_hiragana(rd.lower()).strip() for rd in res if rd], key=len)
Ejemplo n.º 4
0
            def _extract_reference(translation: str) -> (str, str):
                """Extract word/kanji cross-references embedded in *translation*.

                Returns ``(references, cleaned_translation)``: a list of
                YarxiReference objects plus the translation with reference
                markup stripped.  NOTE(review): the ``(str, str)`` annotation
                does not match the actual return value; left untouched to
                avoid introducing names that may not be in scope.
                """
                res = []

                # References to whole entries: ^^1NNN, ^0NNN and ^2NNN forms.
                word_refs = re.findall(r'\^{2}10*(\d+)(?:_)?', translation)
                word_refs.extend(re.findall(r'\^0+(\d+)(?:\\)?(?:_)?', translation))
                word_refs.extend(re.findall(r'\^20*(\d+)(?:_)?', translation))

                if word_refs:
                    # The reference may carry a "mode" prefix: either a #...#
                    # annotation (cleaned via self._clean_text) or a
                    # collocation marker (*=N right / *-N left) normalized to
                    # kana and wrapped in 〈~...〉 / 〈-...〉.
                    if re.search(r'^\\?#\\?.*?\\?#\\?\^\^?\d+\\?_?', translation) is not None:
                        mode = self._clean_text(re.search(r'^(\\?#\\?.*?\\?#\\?)\^\^?\d+\\?_?', translation).group(1))
                        translation = re.sub(r'^\\?#\\?.*?\\?#\\?\^\^?\d+\\?_?', '', translation).strip()
                    elif re.search(r'^\*+=?\d+\^\^?\d+', translation) is not None:
                        mode = '〈~' + self._normalize_kana(re.search(r'^(\*+=?\d+)\^\^?\d+', translation).group(1), 'right') + '〉'
                        translation = re.sub(r'^\*+=?\d+\^\^?\d+', '', translation).strip()
                    elif re.search(r'^\*-\d+\\?\^\^?\d+', translation) is not None:
                        mode = '〈-' + self._normalize_kana(re.search(r'^(\*-\d+)\\?\^\^?\d+', translation).group(1), 'left') + '〉'
                        translation = re.sub(r'^\*-\d+\\?\^\^?\d+', '', translation).strip()
                    else:
                        mode = ''

                    for word_ref in word_refs:
                        # Leading zeros are stripped from the entry id.
                        res.append(YarxiReference(eid=re.search(r'0*(\d+)', word_ref).group(1), mode=mode))

                # References to kanji: ^0-NNN / ^2-NNN, optionally followed by
                # a latin tail in \''...'' that is transliterated to hiragana.
                kanji_refs = []
                kanji_refs.extend(re.findall(r'\^0-0*(\d+)-?(?:\\\'\'\\([\w|\s]+)\\\'\')?', translation))
                kanji_refs.extend(re.findall(r'\^2-0*(\d+)-?(?:\\\'\'\\([\w|\s]+)\\\'\')?', translation))

                if kanji_refs:
                    if re.search(r'^\\?#\\?.*?\\?#\\?\^[-\d]+\\?_?', translation) is not None:
                        mode = self._clean_text(re.search(r'^(\\?#\\?.*?\\?#\\?)\^[-\d]+\\?_?', translation).group(1))
                        translation = ''
                    else:
                        mode = ''

                    for kanji_ref in kanji_refs:
                        res.append(
                            YarxiReference(
                                lexeme=[self._get_kanji(kanji_ref[0]) + _latin_to_hiragana(kanji_ref[1].strip())],
                                eid='', mode=mode))

                # Strip any remaining reference markup from the translation.
                return res, re.sub(r'(\^+\d-?\d+)(-\\\'\'\\([\w|\s]+)\\\'\'\\)?\\?_?', '', translation)
Ejemplo n.º 5
0
    def _extract_compound_values(self, kanji: {str: _Kanji}, show_progress: bool):
        """Extend the entry database with per-kanji "in compounds" values.

        For each kanji record, derive the readings and translations that
        apply when the kanji occurs in compounds, then either append a new
        YarxiEntry to a local extension list or merge into an existing entry
        (found via ``self._in_container``).

        :param kanji: mapping of kanji code -> _Kanji record; the
            ``{str: _Kanji}`` annotation is informal shorthand for that.
        :param show_progress: when True, show a tqdm progress bar.
        """
        def _split_and_clean_compound_translations(translations: str, rus_nick: str):
            """Split raw compound-translation markup into (reading numbers, text) pairs.

            Strips Yarxi service markup, expands ``@...`` shorthand codes into
            canned Russian phrases (or *rus_nick*), and returns a list of
            ``(reading_numbers, cleaned_translation)`` tuples.
            """
            translations = re.sub(r'(\^[io]\d+)', '', translations)
            translations = re.sub(r"\\\^\'\'\\([a-z]+)\\\'\'",
                                  lambda m: f'\\{_latin_to_hiragana(m.group(1))}', translations)
            translations = re.sub(r'(\^[78563]\d+)', '', translations)
            translations = re.sub(r'({\^\^[164035iz]\d+})', '', translations)
            translations = re.sub(r'\s?{!?=?\d?\$?\+?_?\\?([^}]+?)\.?}', '', translations)
            # Repeat until no ^NNNN kanji references remain; re.subn reports
            # how many substitutions were made on each pass.
            times = 1
            while times:
                translations, times = re.subn(r'[^\^]\^0*(\d{1,4})([^\d])', lambda m: f'{self._get_kanji(m.group(1))}{m.group(2)}', translations)
            translations = re.sub(r't0*(\d+)', lambda m: f'теперь {self._get_kanji(m.group(1))}', translations)
            translations = re.sub(r'^~+', '', translations)
            translations = translations.replace('{^^^}', '')
            translations = translations.replace('^#', '')

            if translations == '-' or '@\\7' in translations:
                return [([], '')]

            reading_numbers = []
            final_trs = []
            translations = re.sub(r'\{\^*\w\d+\}', '', translations)
            translations = re.sub(r'(\[!.*?\])', '', translations)
            translations = re.sub(r'(\+{\(.*\)})', '', translations)
            translations = re.sub(r'(\^8\d+)', '', translations)

            # Canned Russian expansions for @-shorthand codes; empty string
            # means "just remove the code".
            generic_translations = {'@9': f'{self._highlighting[0]}тж. счетный суффикс{self._highlighting[1]}',
                                    '@3': '', '@7': '', '@6': rus_nick, '@4': rus_nick,
                                    '@2': rus_nick, '@1': rus_nick,
                                    '@5': f'{self._highlighting[0]}встречается в географических названиях{self._highlighting[1]}',
                                    '@8': f'{self._highlighting[0]}в сочетаниях идиоматичен{self._highlighting[1]}',
                                    '@l': f'{self._highlighting[0]}употребляется в летоисчислении{self._highlighting[1]}',
                                    '@0': f'{self._highlighting[0]}употребляется фонетически{self._highlighting[1]}',
                                    '@\\0': f'{self._highlighting[0]}употребляется фонетически{self._highlighting[1]}',
                                    '@\\8': rus_nick + f' {self._highlighting[0]}в сочетаниях идиоматичен{self._highlighting[1]}',
                                    '@\\1': rus_nick + f' {self._highlighting[0]}сочетания неупотребительны{self._highlighting[1]}',
                                    '@\\4': rus_nick,
                                    '@\\5': rus_nick + f' {self._highlighting[0]}встречается в географических названиях{self._highlighting[1]}',
                                    '@\\2': rus_nick + f' {self._highlighting[0]}сочетания малоупотребительны{self._highlighting[1]}'}

            for translation in [tr for tr in translations.split('/') if tr]:
                # A (N) group names which reading the translation belongs to;
                # '0' is the default bucket.
                nums_present = re.search(r'\((\d*)\)', translation)
                if nums_present is None:
                    reading_numbers.append(['0'])
                else:
                    reading_numbers.append(list(nums_present.groups()))

                for g_tr in generic_translations.keys():
                    if g_tr in translation:
                        if generic_translations[g_tr]:
                            translation = translation.replace(g_tr, f' {generic_translations[g_tr]} ').strip()
                        else:
                            translation = translation.replace(g_tr, '')

                final_trs.append(self._clean_text_kanji(re.sub(r'(\(\d*\))', '', translation)))

            # NOTE(review): filtering empty strings inside zip() can shift the
            # pairing between reading_numbers and final_trs whenever a
            # translation cleans down to '' — confirm this is intended.
            return list(zip(reading_numbers, [fin_tr for fin_tr in final_trs if fin_tr]))

        extension = []

        # Markup patterns whose first group names another kanji that this one
        # shares an entry with (handled in the "complex" branch below).
        complex_cases = [r'\$\\?40*(\d+)', r'\*60*(\d+)', r'^\$\\70*(\d+)', r'\$\\?10*(\d+)',
                         r'\$\\?00*(\d+)', r'\$\\50*(\d+)', r'\$\\20*(\d+)', r'\$\\30*(\d+)',
                         r'\^90*(\d+)', r'\*20*(\d+)', r'^\^0+(\d+)']

        for kj in tqdm(list(kanji.values())[2:], desc="[Yarxi] Updating kanji database".ljust(34), disable=not show_progress):
            # Bucket '0' holds the on-readings; further buckets may be added
            # from the kun field below.
            comp_readings = {'0': [_latin_to_hiragana(on) for on in re.split(r'[*,;)(-]', kj.on) if on]}

            # Skip kanji with no usable readings, hyphen-initial on-readings,
            # a hard-coded exclusion list, or no Russian nickname.
            if not comp_readings['0'] \
                    or kj.on.startswith('-') \
                    or kj.kanji in ['壱', '勺', '玖', '凰', '呶', '夊',
                                    '寉', '寔', '螽'] \
                    or not kj.rus_nick:
                continue

            cleaned_kun = kj.kun.split('||$')[0]

            # Extra compound readings may follow a '|' in the kun field.
            comp_readings_temp = re.search(r'\|(.*)', cleaned_kun)
            if comp_readings_temp:
                for sp_c_r in comp_readings_temp.group(1).split('/'):
                    readings = re.split(r'[_|,]', re.sub(r'q\d', 'い', _latin_to_hiragana(
                        sp_c_r.replace('-', '').replace(' ', '').lower())))
                    comp_readings[str(len(comp_readings))] = readings

            comp_translations = kj.rus.split('|')

            if len(comp_translations) > 1:
                # Dedicated compound-translation section after the '|'.
                comp_translations = _split_and_clean_compound_translations(comp_translations[1], kj.rus_nick)
                for tr in comp_translations:
                    if not tr[0]:
                        # No reading numbers: derive the translation from the
                        # nickname markup and attach it to the '0' bucket.
                        translation = re.sub(r'(.*\*#\*)(.*)(\*)', r'\2', kj.rus_nick)
                        translation = re.sub(r'\'\'(.*)\'\'', lambda match: f'«{_latin_to_hiragana(match.group(1))}»',
                                             translation)
                        already_there = self._in_container(extension, comp_readings['0'], [kj.kanji])
                        if already_there == -1:
                            extension.append(
                                YarxiEntry(reading=[cr for cr in comp_readings['0'] if cr],
                                           lexeme=[kj.kanji],
                                           translation=[self._in_compounds_pref + translation],
                                           eid=str(self._get_next_eid()),
                                           references=[],
                                           kanji=[kj.kanji]))
                        else:
                            extension[already_there].translation.extend([self._in_compounds_pref + translation])
                    else:
                        # Attach the translation to every reading bucket it names.
                        for tr_r in tr[0]:
                            already_there = self._in_container(extension, comp_readings[tr_r], [kj.kanji])
                            if already_there == -1:
                                extension.append(
                                    YarxiEntry(reading=[cr for cr in comp_readings[tr_r] if cr],
                                               lexeme=[kj.kanji],
                                               translation=[self._in_compounds_pref + tr[1]],
                                               eid=str(self._get_next_eid()),
                                               references=[],
                                               kanji=[kj.kanji]))
                            else:
                                extension[already_there].translation.extend([self._in_compounds_pref + tr[1]])
            else:
                # No dedicated section: first check the "complex" patterns that
                # link this kanji to another kanji's entry.
                cont = False
                for case in complex_cases:
                    if re.search(case, kj.rus) is not None:
                        already_there = self._in_container(extension, comp_readings['0'],
                                                           [self._get_kanji(re.search(case, kj.rus).group(1))])
                        if already_there == -1:
                            already_there = self._in_container(self._entries, comp_readings['0'],
                                                               [self._get_kanji(
                                                                   re.search(case, kj.rus).group(1))])
                            if already_there == -1:
                                extension.append(YarxiEntry(reading=[cr for cr in comp_readings['0'] if cr],
                                                            lexeme=[
                                                                self._get_kanji(
                                                                    re.search(case, kj.rus).group(1)),
                                                                kj.kanji],
                                                            translation=[],
                                                            eid=str(self._get_next_eid()),
                                                            references=[],
                                                            kanji=[
                                                                self._get_kanji(
                                                                    re.search(case, kj.rus).group(1)),
                                                                kj.kanji]))
                            else:
                                self._entries[already_there].lexeme.append(kj.kanji)
                                self._entries[already_there].kanji.append(kj.kanji)
                        else:
                            extension[already_there].lexeme.append(kj.kanji)
                            extension[already_there].kanji.append(kj.kanji)
                        cont = True
                        break
                if cont or any(only_comp in kj.rus for only_comp in ['@\\7', '@7']):
                    continue
                # Otherwise pick a single compound translation source, in
                # priority order, from the rus / rus_nick / kun markup.
                if re.search(r'\$\\60*(\d+)', kj.rus) is not None:
                    single_comp_translation = [kj.rus_nick]
                elif re.search(r'=\\([а-я].+)', kj.rus) is not None:
                    single_comp_translation = [self._clean_text_kanji(re.search(r'=\\([а-я].+)', kj.rus).group(1))]
                elif any(special in kj.rus for special in ['@\\5', '@\\8']) \
                        or re.search(r'^([^а-яА-Я]*?\d+)\*', kj.kun) is None:
                    single_comp_translation = [re.sub(r'^~', '', tr[1]).strip()
                                               for tr in
                                               _split_and_clean_compound_translations(kj.rus, kj.rus_nick)
                                               if tr[1]]
                elif len([reading for reading in re.split(r'[^a-zA-Z]', kj.kun) if reading]) < len(
                        kj.rus.split('/')):
                    # More translations than kun readings: the tail belongs to
                    # the compound readings.
                    start_id = len([reading for reading in re.split(r'[^a-zA-Z]', kj.kun) if reading])
                    single_comp_translation = [re.sub(r'^~', '', tr[1]).strip() for tr in _split_and_clean_compound_translations(''.join(kj.rus.split('/')[start_id:]), kj.rus_nick) if tr[1]]
                    if len(comp_readings) > 1:
                        comp_readings['0'] = comp_readings['1']
                elif '|' in kj.kun.split('||$')[0]:
                    start_id = len([reading for reading in re.split(r'[^a-z]', kj.kun.split('|')[0]) if reading])
                    single_comp_translation = [re.sub(r'^~', '', tr[1]).strip() for tr in _split_and_clean_compound_translations(''.join(kj.rus.split('/')[start_id:]), kj.rus_nick) if tr[1]]
                    if len(comp_readings) > 1:
                        comp_readings['0'] = comp_readings['1']
                else:
                    single_comp_translation = [n for n in kj.rus_nick.split('#')[0].split('*') if n]

                if not single_comp_translation:
                    continue

                # Merge into an existing extension entry or append a new one.
                already_there = self._in_container(extension, comp_readings['0'], [kj.kanji])
                if already_there == -1:
                    extension.append(YarxiEntry(reading=[cr for cr in comp_readings['0'] if cr],
                                                lexeme=[kj.kanji],
                                                translation=[self._in_compounds_pref + tr for tr in
                                                             single_comp_translation],
                                                eid=str(self._get_next_eid()),
                                                references=[],
                                                kanji=[kj.kanji]))
                else:
                    extension[already_there].translation.extend(
                        [self._in_compounds_pref + tr for tr in single_comp_translation])