def reading_form(self) -> str:
    """Return the katakana reading of this token.

    Prefers Sudachi's per-morpheme reading; when a JMdict lookup of neither
    the Sudachi surface reading nor its dictionary-form reading succeeds but
    fugashi's dictionary-form reading does look up, fugashi's surface
    reading is returned instead.
    """
    sudachi_reading = "".join(m.reading_form() for m in self.morphemes)
    surface = self.surface()
    # A purely-katakana surface with no Sudachi reading is its own reading.
    if re.match(rf"{kata_re}+", surface) and not sudachi_reading:
        return surface
    sudachi_dict_reading = "".join(
        m.reading_form() for m in parse(self.dictionary_form()))
    surface_lForms = [m.feature.lForm for m in fugashi_parse(self.surface())]
    dict_lForms = [m.feature.lForm
                   for m in fugashi_parse(self.dictionary_form())]
    # fugashi readings are only usable when every morpheme has a lemma form.
    fugashi_reading = "".join(surface_lForms) if all(surface_lForms) else ""
    fugashi_dict_reading = "".join(dict_lForms) if all(dict_lForms) else ""
    # JMdict lookups are keyed on hiragana, hence the kata2hira conversion.
    sudachi_lookup = jmdict_lookup(
        jaconv.kata2hira(sudachi_reading)).entries
    sudachi_dict_lookup = jmdict_lookup(
        jaconv.kata2hira(sudachi_dict_reading)).entries
    fugashi_dict_lookup = (fugashi_dict_reading and jmdict_lookup(
        jaconv.kata2hira(fugashi_dict_reading)).entries)
    # Fall back to fugashi only when Sudachi's readings find nothing.
    if not (sudachi_lookup or sudachi_dict_lookup) and fugashi_dict_lookup:
        return fugashi_reading
    return sudachi_reading
def split_into_words(text):
    """Tokenize *text* with MeCab and return reading/POS pairs.

    Each element is ``[pos-prefix + ':' + hiragana-reading, conj-prefix]``
    for fully analyzed tokens, or ``[surface + ':' + surface, '']`` for
    lines without enough feature fields (EOS etc.).

    Fix: the original also built a ``-Owakati`` tagger and collected node
    surfaces into a ``words`` list that was never used — dead code removed.
    The printed list is now the same object that is appended, instead of
    being constructed twice.
    """
    text = text.upper()
    import MeCab
    tagger = MeCab.Tagger()
    for_lines = tagger.parse(text).splitlines()
    data = []
    for line in for_lines:
        fields = line.split('\t')
        if len(fields) > 3:
            # fields[1] is the katakana reading; fields[3]/fields[4] are
            # hyphen-joined POS / conjugation info (keep the first part).
            entry = [
                fields[3].split('-')[0] + ':' + jaconv.kata2hira(fields[1]),
                fields[4].split('-')[0],
            ]
            data.append(entry)
            print(entry)
        else:
            data.append([fields[0] + ':' + fields[0], ''])
    return data
def extract(self, file_name):
    """Build word/id vocabularies from *file_name* and one-hot sequences.

    Returns:
        (word2id_dic, id2word_dic, text_onehot_lists) where each text is a
        padded list of one-hot vectors.
    """
    with open(file_name) as sentences:
        # First pass: build the word2id / id2word dictionaries.
        for line in sentences:
            ids = []
            for i in self.mecab.parse(line).splitlines():
                # The last comma-separated feature field is the katakana
                # reading; normalize to hiragana.  '*' marks "no reading".
                word = jaconv.kata2hira(i.split(',')[-1])
                if (word not in self.word2id_dic.keys()) and (word != '*'):
                    self.word2id_dic[word] = len(self.word2id_dic)
                    self.id2word_dic[len(self.id2word_dic)] = word
        # Reserve a dedicated id for the padding token ' '.
        self.word2id_dic[' '] = len(self.word2id_dic)
        self.id2word_dic[len(self.id2word_dic)] = ' '
    with open(file_name) as sentences:
        # Second pass: give each word in each text its id and
        # convert the id sequences into one-hot sequences.
        for line in sentences:
            ids = []
            for i in self.mecab.parse(line).splitlines():
                word = jaconv.kata2hira(i.split(',')[-1])
                if word != '*':
                    ids.append(self.word2id_dic[word])
            self.text_onehot_lists.append(ids)
    # Pad every sequence with the ' ' id up to the longest sequence.
    self.text_max_len = max([len(i) for i in self.text_onehot_lists])
    for id, id_list in enumerate(self.text_onehot_lists):
        pad_list = [self.word2id_dic[' ']
                    for i in range(self.text_max_len-len(id_list))]
        self.text_onehot_lists[id] += pad_list
    # Turn each padded id sequence into a list of one-hot vectors.
    self.text_onehot_lists = [self.make_onehot(np.array(i)).tolist()
                              for i in self.text_onehot_lists]
    return self.word2id_dic, self.id2word_dic, self.text_onehot_lists
def find(cosmetic):
    """Append *cosmetic* to the closed-over ``result`` when it matches the
    closed-over ``mode``/``text``/``item`` search criteria."""
    # Skip items filtered out by type, and items with no name at all.
    if (item and cosmetic['type']['backendValue'] not in item.split(',')
            or cosmetic['name'] is None):
        return
    if mode == 'name':
        candidate = cosmetic['name']
        if self.case_insensitive:
            candidate = jaconv.kata2hira(cosmetic['name'].casefold())
        if self.convert_kanji:
            candidate = self.bot.converter.do(candidate)
        if text in candidate:
            result.append(cosmetic)
    elif mode == 'id':
        if text in cosmetic['id'].casefold():
            result.append(cosmetic)
    elif mode == 'set':
        # Only cosmetics that actually belong to a set can match here.
        if cosmetic.get('set') is None:
            return
        candidate = cosmetic['name']
        if self.case_insensitive:
            candidate = jaconv.kata2hira(candidate.casefold())
        if self.convert_kanji:
            candidate = self.bot.converter.do(candidate)
        if text in candidate:
            result.append(cosmetic)
def get_song_meta(song):
    """Assemble the localized metadata dict for *song*.

    Adds a normalized ``search`` term set (localized, kana, latin and
    romanized-kana variants) and a ``sort`` key; non-latin sort keys are
    re-derived via kana-to-romaji when possible.

    Fix: guard the ``ord`` check against an empty title — ``ord('')``
    raises ``TypeError`` in the original.
    """
    d = {}
    search = set()
    for k, v in song.meta.items():
        if request.latin:
            d[k] = v[(request.lc, "l")]
        else:
            d[k] = v[request.lc]
    for k in ("title", "artist", "seenon", "album"):
        if k in song.meta:
            v = song.meta[k]
            search.add(normalize(v[request.lc]))
            search.add(normalize(v["k"]))
            search.add(normalize(v["l"]))
            # Romanized kana variant so latin-alphabet queries match too;
            # drop the prolonged-sound mark which has no romaji equivalent.
            search.add(
                normalize(jaconv.kana2alphabet(jaconv.kata2hira(
                    v["k"]))).replace("ー", ""))
    for k in ("genre", ):
        if k in song.meta:
            v = song.meta[k]
            search.add(normalize(v[request.lc]))
    d["search"] = list(search)
    if request.latin:
        d["sort"] = song.meta["title"][(request.lc, "l")]
        # Empty-title guard: ord('') would raise TypeError.
        if d["sort"] and ord(d["sort"][0]) > 0x100:
            # Try again with kana-to-romaji, might help manufacture some
            # sensible sort order
            d["sort"] = jaconv.kana2alphabet(
                jaconv.kata2hira(song.meta["title"][(request.lc, "l", "k")]))
    else:
        d["sort"] = song.meta["title"][(request.lc, "k")]
    return d
def run(input, output):
    """Parse the tab-separated city-code table and emit it as UTF-8 JSON."""
    rows = []
    for raw_line in input.read().split('\n'):
        fields = raw_line.split('\t')
        # Expected columns: 団体コード "都道府県名(漢字)" "市区町村名(漢字)"
        # "都道府県名(カナ)" "市区町村名(カナ)"
        if len(fields) != 5 or not fields[0].isdigit():
            continue
        code, pref, city, pref_kana, city_kana = fields
        rows.append({
            'code': unicodedata.normalize('NFKC', code).strip(),
            'pref': unicodedata.normalize('NFKC', pref).strip(),
            'city': unicodedata.normalize('NFKC', city).strip(),
            'pref_k': jaconv.h2z(pref_kana).strip(),
            'city_k': jaconv.h2z(city_kana).strip(),
            'pref_h': jaconv.kata2hira(jaconv.h2z(pref_kana)).strip(),
            'city_h': jaconv.kata2hira(jaconv.h2z(city_kana)).strip()
        })
    payload = {
        'title': 'jp_citycode',
        'version': DATA_VERSION,
        'table': rows
    }
    output.write(json.dumps(payload, ensure_ascii=False).encode("utf-8"))
    click.echo('%d件処理しました' % len(rows))
def qu(path_1):
    # Walk each directory under *path* (e.g. the C064L / C064R data dirs),
    # read its ChaSen output and rewrite it as "surface hiragana-reading"
    # token lines in a new text file.
    path = path_1
    for mulu in os.listdir(path):  # one iteration per data directory
        jushiqi = 0
        file_dir = os.path.join(path, mulu)
        # Directory holding the ChaSen output and the cleaned result.
        file_dir_2 = os.path.join(file_dir, 'keka')
        # Names of the original .out-derived files; only used to build paths.
        feature = 'chasen.txt'
        feature_1 = 'chasen.ref'
        feature_2 = mulu + '_' + 'chasen_1.txt'
        files_dir = os.path.join(file_dir, feature)
        save_dir = os.path.join(file_dir, feature_2)
        with open(files_dir, 'r', encoding='utf-8') as csvfile:
            reader = csv.reader(csvfile)
            column = [row for row in reader]
            column_2 = []
            for xiang in column:
                column_2.append(xiang[0].split())
            column_1 = []
            banyun = []
            for xiang in column_2:
                if len(xiang) == 1:
                    # Single-field row: flush the accumulator as one group.
                    banyun.append(xiang[0])
                    column_1.append(banyun)
                    banyun = []
                else:
                    if xiang[0] != '、' and xiang[0] != '。':
                        if xiang[1] == '未知語':
                            # Unknown word: use the surface itself as reading.
                            banyun.append(xiang[0] + ' '
                                          + jaconv.kata2hira(xiang[0]))
                        else:
                            banyun.append(xiang[0] + ' '
                                          + jaconv.kata2hira(xiang[1]))
                        # NOTE(review): flushing after every token makes each
                        # token its own group, which makes the single-field
                        # flush above redundant — confirm this is intended
                        # rather than accumulating a whole sentence.
                        column_1.append(banyun)
                        banyun = []
            print(column_1)
        with open(save_dir, 'w', encoding='utf-8') as f:
            # Write the cleaned tokens, one per line, into the new txt file.
            for xieru in column_1:
                f.writelines(xieru[0] + '\n')
def search_inside_sentence(data, ono_lis_st, ono_counter, option, n):
    """Scan *data* line by line and record regex matches in *ono_counter*.

    *option* decides two things: whether a line is first normalized from
    katakana to hiragana, and whether a match increments a per-word count
    or appends the full line.  Unknown options raise ``Exception`` (on the
    first processed line, matching the original control flow).
    """
    # Options that convert katakana into hiragana before matching.
    hira_options = {"mid", "line_info_n", "line_info", "find_ono_hira"}
    # Options that count matches; the remaining valid ones collect lines.
    count_options = {"mid", "find_ono_hira", "find_ono_kata"}
    line_options = {"line_info_n_k", "line_info_n", "line_info_k",
                    "line_info"}
    with open(data, "r") as f:
        for line in tqdm(f):
            if n == True:
                line = line.rstrip("\n")
            if option not in count_options and option not in line_options:
                raise Exception
            if option in hira_options:
                line = jaconv.kata2hira(line)
            for match in re.findall(ono_lis_st, line):
                if option in count_options:
                    ono_counter[match] += 1   # e.g. {"pachipachi": 1, ...}
                else:
                    ono_counter[match].append(line)
    return ono_counter
def unify_text(texts):
    """Unify hiragana/katakana spellings across *texts* (mutates in place).

    The first script variant encountered for a word wins; later
    occurrences in the other kana script are rewritten to match it.
    """
    seen = {}
    for tokens in texts:
        for idx, word in enumerate(tokens):
            as_hira = jaconv.kata2hira(word)
            if as_hira in seen:
                tokens[idx] = as_hira
                continue
            as_kata = jaconv.hira2kata(word)
            if as_kata in seen:
                tokens[idx] = as_kata
            else:
                seen[word] = True
    return texts
def romaji_word(self, word):
    """Word is a fugashi node, return a string (its romaji rendering)."""
    # Explicit exception table takes priority over everything.
    if word.surface in self.exceptions:
        return self.exceptions[word.surface]
    if word.surface.isdigit():
        return word.surface
    if is_ascii(word.surface):
        return word.surface
    # deal with unks first
    if word.is_unk:
        # at this point it is presumably an unk
        # Check character type using the values defined in char.def.
        # This is constant across unidic versions so far but not guaranteed.
        if word.char_type == 6 or word.char_type == 7:
            # hiragana/katakana: the surface itself is the reading
            kana = jaconv.kata2hira(word.surface)
            return self.map_kana(kana)
        # At this point this is an unknown word and not kana. Could be
        # unknown kanji, could be hangul, cyrillic, something else.
        # By default ensure ascii by replacing with ?, but allow pass-through.
        if self.ensure_ascii:
            out = '?' * len(word.surface)
            return out
        else:
            return word.surface
    if word.feature.pos1 == '補助記号':
        # If it's punctuation we don't recognize, just discard it
        return self.table.get(word.surface, '')
    elif (self.use_wa and word.feature.pos1 == '助詞'
            and word.feature.pron == 'ワ'):
        # Topic particle は pronounced "wa"
        return 'wa'
    elif (not self.use_he and word.feature.pos1 == '助詞'
            and word.feature.pron == 'エ'):
        # Direction particle へ pronounced "e"
        return 'e'
    elif (not self.use_wo and word.feature.pos1 == '助詞'
            and word.feature.pron == 'オ'):
        # Object particle を pronounced "o"
        return 'o'
    elif (self.use_foreign_spelling and has_foreign_lemma(word)):
        # this is a foreign word with known spelling
        return word.feature.lemma.split('-')[-1]
    elif word.feature.kana:
        # for known words, romanize the kana reading
        kana = jaconv.kata2hira(word.feature.kana)
        return self.map_kana(kana)
    else:
        # unclear when we would actually get here
        return word.surface
def delete_both(res):
    """Walk a MeCab node chain and join hiragana readings into one string,
    dropping case-particles (格助詞) and the word 私.

    Uses feature field 7 (the katakana reading) when present, otherwise
    the surface form.

    Fixes: removed Python-2-only ``.decode('utf-8')``/``.encode('utf-8')``
    calls, which raise ``AttributeError`` on Python 3 ``str``; replaced
    quadratic ``+=`` string building with a list + ``join``.
    """
    parts = []
    while res:
        if res.surface:
            ft = res.feature.split(",")
            if ft[1] != "格助詞" and res.surface != "私":
                if len(ft) > 7:
                    # Field 7 is the katakana reading.
                    parts.append(jaconv.kata2hira(ft[7]))
                else:
                    parts.append(jaconv.kata2hira(res.surface))
        res = res.next
    return "".join(parts)
def reverse_hirakana(string):
    """Swap the kana script of *string*: hiragana becomes katakana and
    katakana becomes hiragana; anything else is returned unchanged."""
    import jaconv
    if is_hiragana(string):
        return jaconv.hira2kata(string)
    if is_katakana(string):
        return jaconv.kata2hira(string)
    return string
def search_playlist(self, mode: str, text: str) -> List[dict]:
    """Search playlists by name or id substring.

    Main playlists are searched first; sub playlists are consulted only
    when nothing matched in the main set.
    """
    if self.case_insensitive:
        text = jaconv.kata2hira(text.casefold())
    if self.convert_kanji:
        text = self.bot.converter.do(text)
    result = []

    def matches(playlist) -> bool:
        # Normalize the candidate name the same way the query was.
        if mode == 'name':
            name = playlist['name']
            if self.case_insensitive:
                name = jaconv.kata2hira(name.casefold())
            if self.convert_kanji:
                name = self.bot.converter.do(name)
            return text in name
        if mode == 'id':
            return text in playlist['id'].casefold()
        return False

    for playlist in self.main_playlists.values():
        if matches(playlist):
            result.append(playlist)
    if not result:
        for playlist in self.sub_playlists.values():
            if matches(playlist):
                result.append(playlist)
    return result
def search_members(q_info): _text = q_info['text'] # 全角 ⇒ 半角 & ノーマライズ. ex) 'kAげヤmay' => 'kAげヤmay' cleaned_text = jaconv.normalize(_text, 'NFKC') # カタカナ => ひらがな. ex) 'kAげヤmay' => 'kAげやmay' cleaned_text = jaconv.kata2hira(cleaned_text) # 大文字 => 小文字. ex) 'kAげやmay' => 'kaげやmay' cleaned_text = cleaned_text.lower() # 英語 => ひらがな. ex) 'kaげやmay' => {'is_success': False, 'text': 'かげやま'} result = otapick.alphabet2kana(cleaned_text) if result['is_success']: # 全てひらがなの状態 cleaned_text = result['text'] else: # ひらがな変換が失敗し、 cleaned_text = result['text'] # メタ文字(* \ | ? +)をエスケープ meta_char_tuple = ('\\', '*', '+', '.', '?', '{', '}', '(', ')', '[', ']', '^', '$', '-', '|', '/') for meta_char in meta_char_tuple: if meta_char in cleaned_text: cleaned_text = cleaned_text.replace(meta_char, '\\{}'.format(meta_char)) matched_members = Member.objects.filter( Q(full_kana__iregex=r'^%s' % cleaned_text) | Q(first_kana__iregex=r'^%s' % cleaned_text) | Q(full_kanji__iregex=r'^%s' % cleaned_text) | Q(first_kanji__iregex=r'^%s' % cleaned_text) | Q(full_eng__iregex=r'^%s' % cleaned_text) | Q(first_eng__iregex=r'^%s' % cleaned_text)) matched_member_keywords = MemberKeyword.objects.filter( keyword__iregex=r'^%s' % cleaned_text) # keywordもマッチした場合 if matched_member_keywords.count() > 0: matched_member_pk_list = [ matched_member.pk for matched_member in matched_members ] matched_keyword_member_pk_list = [ matched_member_keyword.member.pk for matched_member_keyword in matched_member_keywords ] member_pk_list = list( set(matched_member_pk_list + matched_keyword_member_pk_list)) # 重複を削除 members = Member.objects.filter(pk__in=member_pk_list) else: members = matched_members if members.exists(): return members else: return
def test_jaconv():
    """Time the basic jaconv conversions over every sample test case and
    log each result at debug level."""
    logging.info("=========================================")
    logging.info("= jaconv =")
    logging.info("=========================================")
    test_cases = get_test_cases()
    for tc in test_cases:
        title = tc['title']
        body = tc['body']
        logging.info("ひらがな(全角) to カタカナ(全角) for %s" % title)
        calc_time(jaconv.hira2kata, body)
        # FIX: previously logged jaconv.hira2hkata(body) (the half-width
        # variant) here, while timing and announcing hira2kata above.
        logging.debug("result: %s" % jaconv.hira2kata(body))
        logging.info("カタカナ(全角) to ひらがな(全角) for %s" % title)
        calc_time(jaconv.kata2hira, body)
        logging.debug("result: %s" % jaconv.kata2hira(body))
        logging.info("ひらがな(全角) to カタカナ(半角) for %s" % title)
        calc_time(jaconv.hira2hkata, body)
        logging.debug("result: %s" % jaconv.hira2hkata(body))
        logging.info("半角 to 全角 for %s" % title)
        calc_time(jaconv.h2z, body)
        logging.debug("result: %s" % jaconv.h2z(body))
        logging.info("全角 to 半角 for %s" % title)
        calc_time(jaconv.z2h, body)
        logging.debug("result: %s" % jaconv.z2h(body))
def make(circle, lost):
    """Fetch brand furigana/name rows from ErogameScape and render them as
    MeCab user-dictionary lines (furigana<TAB>name<TAB>固有名詞).

    *circle* == False restricts to corporations; *lost* == False excludes
    defunct brands.
    """
    query = "SELECT brandfurigana,brandname FROM brandlist "
    if not (circle and lost):
        query += "WHERE "
    if circle == False:
        query += "kind = 'CORPORATION' "
    if lost == False:
        if circle == False:
            query += "AND "
        query += "lost = 'FALSE' "
    query += "ORDER BY brandfurigana"
    print(query)
    url = "http://erogamescape.dyndns.org/~ap2/ero/toukei_kaiseki/sql_for_erogamer_form.php"
    session = requests.session()
    response = session.post(url, data={'sql': query})
    soup = BeautifulSoup(response.text, 'html.parser')
    table = soup.find('table')
    lines = []
    # Skip the header row, then emit one dictionary line per result row.
    for row in table.find_all('tr')[1:]:
        cells = row.find_all('td')
        lines.append("{furigana}\t{name}\t固有名詞\n".format(
            furigana=jaconv.kata2hira(cells[0].string),
            name=cells[1].string))
    return "".join(lines)
def text2hiragana(text):
    """Convert arbitrary Japanese text into a hiragana-only reading string.

    The result is restricted to hiragana and simple punctuation, always
    starts with a space and always ends with '.'.
    """
    text = unicodedata.normalize("NFKC", text)
    text = normalize_neologd(text)
    # NOTE(review): both replace arguments render as a plain space here —
    # presumably full-width space -> ASCII space; confirm the actual bytes.
    text = text.replace(' ', ' ')
    # MeCab output: one "surface<TAB>features..." line per token.
    parsed = mecab.parse(text).split('\n')
    parsed = [p.split('\t') for p in parsed]
    way_of_readings = [_special_char_convert(p[1], p[3], idx==0)
                       for idx, p in enumerate(parsed) if len(p) >= 2]
    way_of_reading = "".join(way_of_readings)
    # Spell out numbers, romanize leftovers via kakasi, force hiragana.
    way_of_reading = _num2word(way_of_reading)
    way_of_reading = kakasi_converter.do(way_of_reading)
    way_of_reading = jaconv.kata2hira(way_of_reading)
    way_of_reading = way_of_reading.replace(' ', ' ')
    # Strip everything that is not hiragana or simple punctuation, then
    # collapse runs of whitespace and drop a leading space.
    way_of_reading = re.sub(r'[^ぁ-ゔ。、!?ー\.\!\?,\s]', '', way_of_reading)
    way_of_reading = re.sub(r'\s{2,}', ' ', way_of_reading)
    way_of_reading = re.sub(r'^\s', '', way_of_reading)
    # Normalize the envelope: non-empty, '.'-terminated, space-prefixed.
    if len(way_of_reading) == 0:
        way_of_reading = '.'
    if way_of_reading[-1] != '.':
        way_of_reading = way_of_reading + '.'
    if way_of_reading[0] != ' ':
        way_of_reading = ' ' + way_of_reading
    return way_of_reading
def _get_date_range_from_jp_era(self, jp_era: str) -> dict:
    """Find the era record in ``self.data_dic`` matching *jp_era*.

    Args:
        jp_era: An era name written in kanji, katakana/hiragana, or
            English (script detected via ``self._check_language``).

    Returns:
        The matching era-data dict, or ``None`` when no entry matches.
        (Despite the annotation, ``None`` is a possible return value,
        as in the original.)

    Improvements over the original: real docstring instead of the
    "[summary]" placeholder; the katakana->hiragana conversion is done
    once instead of on every loop iteration; a kanji match returns
    immediately instead of scanning the remaining entries.
    """
    l_type = self._check_language(jp_era)
    if l_type == "katakana" or l_type == "hiragana":
        # Kana readings are stored in hiragana; convert once up front.
        jp_era = jaconv.kata2hira(jp_era)
    elif l_type not in ("kanji", "english"):
        # Unknown script: nothing can match (original broke out of the
        # loop on the first iteration and returned None).
        return None
    for key, value in self.data_dic.items():
        reading = value["reading"]
        if l_type == "kanji":
            if jp_era == key:
                return value
        elif l_type in ("katakana", "hiragana"):
            if jp_era == reading["jp"]:
                return value
        else:  # english
            if jp_era == reading["en"]:
                return value
    return None
def __validate_with_janome(self, kanji, yomi):
    """Cross-check *yomi* against Janome's reading of *kanji*.

    Returns an error-message string when the kanji appears to contain
    extra characters, or None when the reading is plausible (or when the
    check does not apply).
    """
    # Special-case exclusions where Janome over-reads single characters:
    # しくらちよまる /志倉千代丸/ would become 「こころざしくらちよまる」,
    # きしなみかお /岸波香桜/ -> *きしなみかお*りさくら,
    # くらちれお /倉知玲鳳/ -> *くらちれお*おとり.
    for c in ['志', '香', '鳳']:
        if c in kanji:
            return None
    # Janome's reading of the whole string ('*' means unknown; fall back
    # to the token's base form), normalized to hiragana.
    janome_yomi = jaconv.kata2hira(''.join(
        [n.reading if str(n.reading) != '*' else n.base_form
         for n in self.tokenizer.tokenize(kanji)]))
    normalized_janome_yomi = normalize_hiragana(janome_yomi)
    normalized_yomi = normalize_hiragana(yomi)
    self.logger.debug(f"yomi={yomi} normalized_yomi={normalized_yomi}, janome_yomi={janome_yomi},"
                      f" normalized_janome_yomi={normalized_janome_yomi}")
    if normalized_yomi in normalized_janome_yomi:
        extra = len(re.sub(normalized_yomi, '', normalized_janome_yomi, 1))
        # The threshold itself is arbitrary; it rescues cases like
        # 愛植男=あいうえお which Janome reads as あいうえおとこ.
        if extra > 3:
            return f"kanji may contain extra chars(janome): janome_yomi={janome_yomi}"
        else:
            return None
    return None
def romaji_word(self, word):
    """Romanize a single fugashi node and return the romaji string.

    Order of checks: exception table, digits, ASCII passthrough,
    punctuation, particle pronunciation rules (wa/e/o), foreign lemma
    spelling, kana reading, raw surface.

    Fix: unknown punctuation previously raised ``KeyError`` via
    ``self.table[word.surface]``; it is now discarded (mapped to '').
    """
    if word.surface in self.exceptions:
        return self.exceptions[word.surface]
    if word.surface.isdigit():
        return word.surface
    if isascii(word.surface):
        return word.surface
    if word.feature.pos1 == '補助記号':
        # Punctuation we don't recognize is dropped instead of raising.
        return self.table.get(word.surface, '')
    elif (self.use_wa and word.feature.pos1 == '助詞'
            and word.feature.pron == 'ワ'):
        # Topic particle は pronounced "wa".
        return 'wa'
    elif (not self.use_he and word.feature.pos1 == '助詞'
            and word.feature.pron == 'エ'):
        # Direction particle へ pronounced "e".
        return 'e'
    elif (not self.use_wo and word.feature.pos1 == '助詞'
            and word.feature.pron == 'オ'):
        # Object particle を pronounced "o".
        return 'o'
    elif (self.use_foreign_spelling and has_foreign_lemma(word)):
        # this is a foreign word with known spelling
        return word.feature.lemma.split('-')[-1]
    elif word.feature.kana:
        # for known words, romanize the kana reading
        kana = jaconv.kata2hira(word.feature.kana)
        return self.map_kana(kana)
    else:
        return word.surface
def split_furigana(text):
    """
    MeCab has a problem if used inside a generator ( use yield instead of return )
    The error message is:
    ```
    SystemError: <built-in function delete_Tagger> returned a result with an error set
    ```
    It seems like MeCab has bug in releasing resource
    """
    mecab = MeCab.Tagger("-Ochasen")
    mecab.parse('')  # an empty parse is required before parseToNode
    node = mecab.parseToNode(text)
    ret = []
    while node is not None:
        origin = node.surface  # the token's original surface form
        if not origin:
            node = node.next
            continue
        # When origin is non-empty and contains kanji, attach furigana;
        # non-kanji tokens need no reading and are emitted as-is.
        if origin != "" and any(is_kanji(_) for _ in origin):
            kana = node.feature.split(",")[7]  # the katakana reading field
            hiragana = jaconv.kata2hira(kana)
            for pair in split_okurigana(origin, hiragana):
                ret += [pair]
        else:
            if origin:
                ret += [(origin, )]
        node = node.next
    return ret
def get_song_meta(song):
    """Assemble the localized metadata dict for *song*, including a
    normalized ``search`` term set and a ``sort`` key."""
    meta = {}
    for key, variants in song.meta.items():
        if request.latin:
            meta[key] = variants[(request.lc, "l")]
        else:
            meta[key] = variants[request.lc]
    terms = set()
    for field in ("title", "artist", "seenon", "album"):
        if field not in song.meta:
            continue
        variants = song.meta[field]
        terms.add(normalize(variants[request.lc]))
        terms.add(normalize(variants["k"]))
        terms.add(normalize(variants["l"]))
        # Romanized form of the kana variant, prolonged-sound marks removed.
        romaji = jaconv.kana2alphabet(jaconv.kata2hira(variants["k"]))
        terms.add(normalize(romaji).replace("ー", ""))
    for field in ("genre", ):
        if field in song.meta:
            terms.add(normalize(song.meta[field][request.lc]))
    meta["search"] = list(terms)
    if request.latin:
        meta["sort"] = song.meta["title"][(request.lc, "l")]
    else:
        meta["sort"] = song.meta["title"][(request.lc, "k")]
    return meta
def write_out_textures(row, filename):
    """Count texture-word frequencies of a recipe and append them as CSV.

    Katakana words are folded into their hiragana spelling when the
    hiragana form already exists, then all (word, count) pairs with a
    positive count are appended to *filename* under a 'word,count' header.
    """
    counts = {pair[0]: pair[1] for pair in row}
    katakana_pat = re.compile(r'[\u30A1-\u30F4]+')
    for word, freq in counts.items():
        if not katakana_pat.fullmatch(word):
            continue
        hira = jaconv.kata2hira(word)
        # Merge the katakana count into the existing hiragana entry and
        # zero the katakana entry so it is filtered out below.
        if hira in counts:
            counts[hira] += freq
            counts[word] = 0
    counts = {word: freq for word, freq in counts.items() if freq > 0}
    with codecs.open(filename, 'a', 'utf-8') as out:
        writer = csv.writer(out, delimiter=',')
        writer.writerow(['word', 'count'])
        for word, freq in counts.items():
            writer.writerow([word, freq])
def regularize_text(text):
    """Normalize digits and punctuation to half-width and reduce the text
    to a regularized reading.

    Args:
        text (Unicode): string that may contain Japanese

    Returns:
        Unicode: string that may contain Japanese
    """
    # Replacement table applied both before and after the MeCab/yomi pass.
    # NOTE(review): the pair ["「", "」"] maps an opening bracket to a
    # closing one, so together with ["『", "「"] every 『 ends up as 」 —
    # confirm this is intentional and not a typo.
    pairs = [
        ["0", "0"],
        ["1", "1"],
        ["2", "2"],
        ["3", "3"],
        ["4", "4"],
        ["5", "5"],
        ["6", "6"],
        ["7", "7"],
        ["8", "8"],
        ["9", "9"],
        [",", ","],
        ["、", ","],
        [".", "."],
        ["。", "."],
        [":", ":"],
        # ["'", " "],
        # ['"', " "],
        ["払金", ""],
        ["試着", ""],
        # ["金", ""],
        ["々", ""],
        ["”", " "],
        ["“", " "],
        ["(", " "],
        [")", " "],
        ["『", "「"],
        ["』", "」"],
        ["「", "」"],
        ["」", "」"],
        ["/", "/"],
        ["!", "!"],
        ["?", "?"],
        ["●", "まる"],
        ["~", ""],
        ["〜", ""],
        ["…", ""],
        ["…", ""],
        ["《", ""],
        ["》", ""],
    ]
    for p in pairs:
        text = text.replace(p[0], p[1])
    # -Oyomi emits the katakana reading of the whole text.
    mecab = MeCab.Tagger("-Oyomi")
    text = mecab.parse(text).rstrip()
    text = kata2hira(text)
    # Collapse remaining full-width characters to half-width.
    text = mojimoji.zen_to_han(text)
    for p in pairs:
        text = text.replace(p[0], p[1])
    return text
def type_by_user(self, user_word):
    """Accept the user's shiritori word when it chains correctly.

    The word (normalized to hiragana) must start with the computer word's
    linking character and pass the last-character check; otherwise the
    stored user word is cleared.
    """
    user_word = jaconv.kata2hira(user_word)
    expected_first = self.correct(self.com_word)
    if expected_first == user_word[0] and self.judge_last_char(user_word):
        self.user_word = user_word
        self.used_words.append(user_word)
    else:
        self.user_word = ""
def basic_preprocess(text):
    """Normalize Japanese text: half-width digits/latin, full-width kana,
    hiragana only, lowercased."""
    # Digits and latin letters to half-width (kana left alone here).
    text = jaconv.z2h(text, kana=False, digit=True, ascii=True)
    # Kana to full-width (digits/ascii left alone here).
    text = jaconv.h2z(text, kana=True, digit=False, ascii=False)
    # Katakana to hiragana, then lowercase any latin letters.
    return jaconv.kata2hira(text).lower()
def get_hiragana(word: str) -> str:
    """Convert any katakana in *word* to hiragana.

    Args:
        word: the string to convert

    Returns:
        the hiragana string
    """
    return jaconv.kata2hira(word)
def _get_normalized_value(target_value, rule):
    """Apply the normalizations enabled by *rule* to *target_value*.

    Each rule flag is optional and defaults to off: full/half-width
    folding, katakana->hiragana, english->japanese reading, lowercasing.
    """
    normalized = target_value
    if rule.get('match_zen_han', False):
        normalized = _normalize2zen(normalized)
    if rule.get('match_kata_hira', False):
        normalized = jaconv.kata2hira(normalized)
    if rule.get('match_eng_jpn', False):
        normalized = _normalize2kun(normalized)
    if rule.get('case_insensitive', False):
        normalized = normalized.lower()
    return normalized
def search_style(self, id: str, text: str) -> List[dict]:
    """Return the styles of cosmetic *id* whose normalized name contains
    the normalized *text*."""
    def canon(value: str) -> str:
        # Apply the same normalization to the query and each candidate.
        if self.case_insensitive:
            value = jaconv.kata2hira(value.casefold())
        if self.convert_kanji:
            value = self.bot.converter.do(value)
        return value

    needle = canon(text)
    matches = []
    for style in self.get_style(id):
        if needle in canon(style['name']):
            matches.append(style)
    return matches
def convert_file(src, dst):
    """Convert a tab-separated word list into an IME-style dictionary file.

    Each input line is ``word[<TAB>reading...]``; each output line is
    ``reading<TAB>word<TAB>固有名詞`` written as UTF-16 with CRLF endings.
    When no reading is supplied, the word itself converted to hiragana is
    used as the reading.
    """
    with open(src, encoding="utf-8") as fin, \
            open(dst, "w", encoding="utf-16", newline="\r\n") as fout:
        for raw in fin:
            fields = raw.rstrip().split("\t")
            word = fields[0]
            if len(fields) > 1:
                readings = fields[1:]
            else:
                readings = [jaconv.kata2hira(word)]
            for reading in readings:
                fout.write(f"{reading}\t{word}\t固有名詞\n")
def normalize_word(word):
    """Normalize *word* so spelling variants compare equal: katakana is
    folded to hiragana and latin letters are lowercased."""
    return jaconv.kata2hira(word).lower()
def test_kata2hira():
    # Basic conversion: katakana folds to hiragana, kanji left untouched.
    assert_equal(jaconv.kata2hira('巴マミ'), '巴まみ')
    # Characters listed in `ignore` must pass through unchanged.
    assert_equal(jaconv.kata2hira('マミサン', ignore='ン'), 'まみさン')
    # Full-table check against the module-level kana constants.
    _compare(jaconv.kata2hira, FULL_KANA, HIRAGANA)
def reading_hira(self):
    """Return ``self.reading`` rendered in hiragana."""
    hira_reading = kata2hira(self.reading)
    return hira_reading