def run(input, output):
    data = input.read()
    result = []
    for line in data.split('\n'):
        cols = line.split('\t')
        if len(cols) == 5 and cols[0].isdigit():
            # Columns: municipality code, prefecture name (kanji),
            # city name (kanji), prefecture name (kana), city name (kana)
            result.append({
                'code': unicodedata.normalize('NFKC', cols[0]).strip(),
                'pref': unicodedata.normalize('NFKC', cols[1]).strip(),
                'city': unicodedata.normalize('NFKC', cols[2]).strip(),
                'pref_k': jaconv.h2z(cols[3]).strip(),
                'city_k': jaconv.h2z(cols[4]).strip(),
                'pref_h': jaconv.kata2hira(jaconv.h2z(cols[3])).strip(),
                'city_h': jaconv.kata2hira(jaconv.h2z(cols[4])).strip()
            })
    output.write(
        json.dumps(
            {
                'title': 'jp_citycode',
                'version': DATA_VERSION,
                'table': result
            },
            ensure_ascii=False).encode("utf-8"))
    click.echo('%d件処理しました' % len(result))

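# A minimal sketch (not from the original project; assumes jaconv is
# installed) of why the fields above mix two normalizers: NFKC widens
# half-width kana and narrows full-width digits in one pass, while
# jaconv.h2z by default only widens kana, and kata2hira then derives a
# hiragana reading from the widened katakana.
import unicodedata

import jaconv

print(unicodedata.normalize('NFKC', 'ﾄｳｷｮｳ１２３'))  # トウキョウ123
print(jaconv.h2z('ﾄｳｷｮｳ１２３'))                     # トウキョウ１２３
print(jaconv.kata2hira(jaconv.h2z('ﾄｳｷｮｳ')))         # とうきょう
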
def run(self):
    data = self.load()
    jumanpp = Juman()
    output = []
    for _, row in data.iterrows():
        zenkaku = jaconv.h2z(row["sentence"], ascii=True, digit=True)
        splited = [
            mrph.midasi for mrph in jumanpp.analysis(zenkaku).mrph_list()
        ]
        if self.task_name == 'QA_B':
            qa_zenkaku = jaconv.h2z(
                f"{row['target']}の{row['aspect']}は{row['sentiment']}",
                ascii=True,
                digit=True,
            )
        else:
            qa_zenkaku = " "
        qa_splited = [
            mrph.midasi for mrph in jumanpp.analysis(qa_zenkaku).mrph_list()
        ]
        output.append({
            "context": " ".join(splited),
            "qa": " ".join(qa_splited),
            "label": 1
        })
    self.dump(pd.DataFrame(output))

def encode_plus(mecab, tokenizer, text_question, text_ending, text_context,
                max_length=512):
    # Question
    text_question = jaconv.h2z(text_question, kana=True, digit=True, ascii=True)
    tokens_question = tokenize_with_mecab(mecab, text_question)
    # Ending
    text_ending = jaconv.h2z(text_ending, kana=True, digit=True, ascii=True)
    tokens_ending = tokenize_with_mecab(mecab, text_ending)
    # Context
    text_context = jaconv.h2z(text_context, kana=True, digit=True, ascii=True)
    tokens_context = tokenize_with_mecab(mecab, text_context)

    tokens_a = ["[CLS]"] + tokens_question + ["[SEP]"] + tokens_ending + ["[SEP]"]
    tokens_b = tokens_context + ["[SEP]"]
    input_ids_a = tokenizer.convert_tokens_to_ids(tokens_a)
    input_ids_b = tokenizer.convert_tokens_to_ids(tokens_b)
    len_a = len(input_ids_a)
    len_b = len(input_ids_b)
    if len_a + len_b > max_length:
        input_ids_b = input_ids_b[:max_length - len_a]
        input_ids_b[max_length - len_a - 1] = 3  # [SEP]
    elif len_a + len_b < max_length:
        padding_length = max_length - (len_a + len_b)
        input_ids_b = input_ids_b + [0 for i in range(padding_length)]

    # Input IDs
    input_ids = input_ids_a + input_ids_b
    input_ids = torch.tensor(input_ids)
    # Attention mask
    attention_mask = torch.ones(max_length, dtype=torch.long)
    for i in range(len_a + len_b, max_length):
        attention_mask[i] = 0
    # Token type IDs
    token_type_ids = torch.ones(max_length, dtype=torch.long)
    for i in range(len_a):
        token_type_ids[i] = 0
    encoding = {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "token_type_ids": token_type_ids
    }
    return encoding

def handleM4A(path):
    # ./GENRE/Compilations/ARTIST/ALBUM/SONG.m4a
    temp = path.replace("\\", "/")
    # ------------------------------------------------------------------
    song = temp[temp.rfind("/") + 1:]
    song = jaconv.z2h(song, kana=False, digit=True, ascii=True)
    song = jaconv.h2z(song, kana=True, digit=False, ascii=False)
    temp = temp[:temp.rfind("/")]
    # ------------------------------------------------------------------
    album = temp[temp.rfind("/") + 1:]
    album = jaconv.z2h(album, kana=False, digit=True, ascii=True)
    album = jaconv.h2z(album, kana=True, digit=False, ascii=False)
    temp = temp[:temp.rfind("/")]
    # ------------------------------------------------------------------
    artist = temp[temp.rfind("/") + 1:]
    artist = jaconv.z2h(artist, kana=False, digit=True, ascii=True)
    artist = jaconv.h2z(artist, kana=True, digit=False, ascii=False)
    temp = temp[:temp.rfind("/")]
    # ------------------------------------------------------------------
    genre = temp[temp.rfind("/") + 1:]
    genre = jaconv.z2h(genre, kana=False, digit=True, ascii=True)
    genre = jaconv.h2z(genre, kana=True, digit=False, ascii=False)
    temp = temp[:temp.rfind("/")]
    # ------------------------------------------------------------------
    # take artist as Compilations
    category = temp[temp.rfind("/") + 1:]
    temp = temp[:temp.rfind("/")]
    if category == "__02_Compilations__":
        artist = "__Compilations__"
    elif category == "__01_Favorites__":
        pass
    # ------------------------------------------------------------------
    mp4 = MP4(path)
    # ------------------------------------------------------------------
    MyLogger.info(path)
    mp4.tags[TRACK_TITLE] = song
    mp4.tags[ALBUM] = album
    mp4.tags[ALBUM_ARTIST] = artist
    mp4.tags[ALBUM_SORT_ORDER] = conv.do(album)
    mp4.tags[ARTIST] = artist
    mp4.tags[ARTIST_SORT_ORDER] = conv.do(artist)
    mp4.tags[GENRE] = genre
    MyLogger.info("mp4.tags[TRACK_TITLE]", str(mp4.tags[TRACK_TITLE]))
    MyLogger.info("mp4.tags[ALBUM]", str(mp4.tags[ALBUM]))
    MyLogger.info("mp4.tags[ALBUM_ARTIST]", str(mp4.tags[ALBUM_ARTIST]))
    MyLogger.info("mp4.tags[ALBUM_SORT_ORDER]", str(mp4.tags[ALBUM_SORT_ORDER]))
    MyLogger.info("mp4.tags[ARTIST]", str(mp4.tags[ARTIST]))
    MyLogger.info("mp4.tags[ARTIST_SORT_ORDER]", str(mp4.tags[ARTIST_SORT_ORDER]))
    MyLogger.info("mp4.tags[GENRE]", str(mp4.tags[GENRE]))

def genEnSpeech(text, gender):
    kakasi = pykakasi.kakasi()
    kakasi.setMode("H", "a")
    kakasi.setMode("K", "a")
    kakasi.setMode("J", "a")
    kakasi.setMode("r", "Hepburn")
    conv = kakasi.getConverter()
    t2s_client = texttospeech.TextToSpeechClient()
    # h2z is assumed to be jaconv.h2z imported at module level.
    synthesis_input = texttospeech.types.SynthesisInput(
        text=conv.do(h2z(text)))
    ssml_gende = texttospeech.enums.SsmlVoiceGender.FEMALE
    if gender == 'male':
        ssml_gende = texttospeech.enums.SsmlVoiceGender.MALE
    voice = texttospeech.types.VoiceSelectionParams(language_code='en-US',
                                                    ssml_gender=ssml_gende)
    audio_config = texttospeech.types.AudioConfig(
        audio_encoding=texttospeech.enums.AudioEncoding.MP3)
    response = t2s_client.synthesize_speech(synthesis_input, voice,
                                            audio_config)
    return response.audio_content

def g2p(input_yomi):
    # Convert everything to full-width katakana.
    input_yomi = jaconv.h2z(input_yomi)
    input_yomi = jaconv.hira2kata(input_yomi)
    output_yomi = []
    for i, item in enumerate(input_yomi):
        # A long-vowel mark at the head of the string is not read.
        if i == 0 and (item == "ー" or item == "〜"):
            pass
        # If not at the end of the string, check whether the next
        # character is a small kana (sutegana).
        elif i < len(input_yomi) - 1:
            if input_yomi[i + 1] in sutegana:
                youon = item + input_yomi[i + 1]
                # Emit the phoneme of the contracted sound (youon).
                if youon in g2p_list:
                    output_yomi.append(g2p_list[youon])
                # Not a contracted sound: emit the phonemes of the plain kana.
                else:
                    output_yomi += nonyouon_before_st(input_yomi, i)
                    output_yomi += nonyouon_before_st(input_yomi, i + 1)
            else:
                output_yomi += nonyouon(input_yomi, i, item)
        # End of string.
        else:
            output_yomi += nonyouon(input_yomi, i, item)
    output_str = " ".join(output_yomi)
    output_yomi = output_str.split()
    # Return the phonemes.
    return output_yomi

def str_cleanUp(st):
    st = st.replace(" ", "")
    st = st.replace("・", "")
    st = st.replace("&", "アンド")
    st = jaconv.h2z(st, kana=True)
    st = jaconv.hira2kata(st)
    return st

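# A quick behavior sketch for str_cleanUp above (not from the original code;
# assumes the function is in scope): spaces and '・' are dropped, '&' becomes
# 'アンド', half-width kana is widened, and hiragana becomes katakana.
print(str_cleanUp('ぎんざ みつこし&ﾀﾜｰ'))  # 'ギンザミツコシアンドタワー'
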
def re_cellstr(str):
    # Strip leading/trailing whitespace.
    str = str.strip()
    # Remove in-cell line breaks.
    str = str.replace('\n', '▽')
    # Convert half-width katakana to full-width.
    str = jaconv.h2z(str)
    # Convert full-width ASCII to half-width (spaces become ASCII too),
    # protecting characters that must stay full-width with placeholders.
    # '〜' (kara)
    str = re.sub('[〜~]', '〓から〓', str)
    # '（）' and '［］' brackets
    str = re.sub('(（)(.+?)(）)', r'〓Rカッコ〓\2〓Rカッコ〓', str)
    str = re.sub('(［)(.+?)(］)', r'〓Bカッコ〓\2〓Bカッコ〓', str)
    str = jaconv.z2h(str, kana=False, ascii=True, digit=False)
    # Restore '〜'.
    str = str.replace('〓から〓', '〜')
    # Restore the brackets.
    str = re.sub('(〓Rカッコ〓)(.+?)(〓Rカッコ〓)', r'（\2）', str)
    str = re.sub('(〓Bカッコ〓)(.+?)(〓Bカッコ〓)', r'［\2］', str)
    # Collapse whitespace runs into a single space.
    str = re.sub(r'\s+', ' ', str)
    # Replace commas so the CSV does not gain an extra column.
    str = re.sub(',', '/', str)
    return str

def search(text):
    RequestURL = 'http://usjinfo.com/wait/realtime.php'
    html = urllib.request.urlopen(RequestURL).read()
    # Keyword(s) after '#USJ', or False when no keyword was given.
    # (att was previously left unbound for 5-character inputs.)
    att = False
    if text.find('#USJ') == 0 and len(text) > 5:
        att = text[4:].split()
    soup = BeautifulSoup(html, "html.parser")
    ReturnText = []
    attractions = []
    item = {}
    li = soup.find_all('ul', class_='list')[1].find_all('li')
    for tag in li:
        if tag.text.find('現在') != -1:
            pass
        elif tag.text.find('分') != -1 or tag.text.find('休止') != -1 \
                or tag.text.find('終了') != -1:
            item.update({'info': tag.text.replace(' ', '').replace('\n', '')})
            attractions.append(item)
        else:
            item = {
                'title': jaconv.h2z(tag.text.replace(' ', '').replace('\n', ''))
            }
    for se in attractions:
        if att is not False:
            if se['title'].find(att[0]) != -1:
                ReturnText.append(se['title'] + ': ' + se['info'])
        else:
            ReturnText.append(se['title'] + ': ' + se['info'])
    if not ReturnText:
        ReturnText.append('見つかりませんでした。検索キーワードを見直すか、管理者に問い合わせてください。')
    return '\n'.join(ReturnText)

def _analyze_pas(self) -> None:
    """Extract predicate-argument structures from <述語項構造:> tags in the KNP string."""
    sid2idx = {sid: idx for idx, sid in enumerate(self.sid2sentence.keys())}
    for tag in self.tag_list():
        if tag.pas is None:
            continue
        pas = Pas(
            BasePhrase(tag, self.tag2dtid[tag], tag.pas.sid, self.mrph2dmid),
            self.mrph2dmid)
        for case, arguments in tag.pas.arguments.items():
            if self.relax_cases:
                if case in ALL_CASES and case.endswith('≒'):
                    case = case.rstrip('≒')  # ガ≒ -> ガ
            for arg in arguments:
                arg.midasi = jaconv.h2z(arg.midasi, digit=True)  # 不特定:人1 -> 不特定:人1
                # exophor
                if arg.flag == 'E':
                    entity = self._create_entity(exophor=arg.midasi, eid=arg.eid)
                    pas.add_special_argument(case, arg.midasi, entity.eid, '')
                else:
                    sid = self.sentences[sid2idx[arg.sid] - arg.sdist].sid
                    arg_bp = self._get_bp(sid, arg.tid)
                    mention = self._create_mention(arg_bp)
                    pas.add_argument(case, mention, '', self.mrph2dmid)
        if pas.arguments:
            self._pas[pas.dtid] = pas

def test_h2z():
    assert_equal(jaconv.h2z('ティロフィナーレ'), 'ティロフィナーレ')
    assert_equal(jaconv.h2z('ティロフィナーレ', ignore='ィ'), 'ティロフィナーレ')
    _compare(jaconv.h2z, HALF_KANA, FULL_KANA)
    _compare(partial(jaconv.h2z, ascii=True), HALF_ASCII, FULL_ASCII)
    _compare(partial(jaconv.h2z, digit=True), HALF_DIGIT, FULL_DIGIT)
    for ascii in (True, False):
        for digit in (True, False):
            for kana in (True, False):
                assert_equal(
                    jaconv.h2z(_concat(HALF_KANA if kana else FULL_KANA,
                                       HALF_ASCII if ascii else FULL_ASCII,
                                       HALF_DIGIT if digit else FULL_DIGIT),
                               ascii=ascii,
                               digit=digit,
                               kana=kana),
                    _concat(FULL_KANA, FULL_ASCII, FULL_DIGIT))

def template_specific(self, doc, tag, txt):
    with tag("div", style=css_position(self.template["nickname"]),
             klass="text"):
        with tag("p"):
            doc.asis(
                jaconv.h2z(self.author[:4].upper(), ascii=True, digit=True))
    with tag("div", style=css_position(self.template["stats"]), klass="text"):
        with tag("p"):
            doc.asis("""
<span style="clear: both; float: left">レベル</span><span style="float: right">%s</span>
<span style="clear: both; float: left">HP</span><span style="float: right">%s</span>
<span style="clear: both; float: left">MP</span><span style="float: right">%s</span>
<span style="clear: both; float: left">G</span><span style="float: right">%s</span>
<span style="clear: both; float: left">E</span><span style="float: right">%s</span>
            """.strip() % (
                random_wide(1, 99),
                random_wide(1, 999),
                random_wide(1, 999),
                random_wide(1, 9999),
                random_wide(1, 9999),
            ))

def make_table(etl_filename, csv_filename):
    i = 0
    etl_filename = os.path.join(os.path.abspath(__file__), "..", etl_filename)
    with open(etl_filename, 'rb') as f:
        while True:
            # Read one record from a file like 'ETL9G_01'.
            s = f.read(RECORD_SIZE)
            if s is None or len(s) < RECORD_SIZE:
                break
            # Unpack the binary record; refer to
            # 'https://techacademy.jp/magazine/19058' and
            # 'http://etlcdb.db.aist.go.jp/specification-of-etl-8'.
            r = struct.unpack(">H2sHBBBBBBIHHHHBBBBHH2016s4x", s)
            img = Image.frombytes('F', (64, 63), r[20], 'bit', (4, 0))
            img = np.array(img.convert('L'))  # 0..15
            # Character code of the label, widened to full-width.
            lbl = r[3].to_bytes(1, "big").decode('shift_jis')
            lbl = jaconv.h2z(lbl, digit=True, ascii=True)
            # Append index and char to 'table.csv'.
            with open(csv_filename, mode='a', newline='',
                      encoding='utf-8') as csvf:
                writer = csv.writer(csvf)
                writer.writerow([str(r[0] - 1), str(lbl), "ok"])
            i += 1

def insert_str(IN_STR, STR_NUM):
    # Wrap each '★'-separated segment to STR_NUM full-width characters
    # per line.
    IN_STR_ARR = IN_STR.split('★')
    outstr_arr = []
    for ELEM_STR in IN_STR_ARR:
        if ELEM_STR == "":
            continue
        IN_STR2 = jaconv.h2z(ELEM_STR, digit=False, ascii=True)
        str_sz = len(IN_STR2)
        str_setnum = int(len(IN_STR2) / STR_NUM)
        for j in range(0, str_setnum + 1):
            linedata = []
            for i in range(0, STR_NUM):
                idx = j * STR_NUM + i
                if idx >= str_sz:
                    continue
                else:
                    linedata.append(IN_STR2[idx])
            linedata2 = ''.join(linedata)
            outstr_arr.append(linedata2)
    outstr = '\n'.join(outstr_arr)
    return outstr

def _analyze_pas(self) -> None:
    """Extract predicate-argument structures represented in <述語項構造: > tags."""
    sid2idx = {sid: idx for idx, sid in enumerate(self.sid2sentence.keys())}
    for bp in self.bp_list():
        if bp.tag.pas is None:
            continue
        pas = Pas(bp)
        for case, arguments in bp.tag.pas.arguments.items():
            if self.relax_cases:
                if case in ALL_CASES and case.endswith('≒'):
                    case = case.rstrip('≒')  # ガ≒ -> ガ
            for arg in arguments:
                arg.midasi = jaconv.h2z(arg.midasi, digit=True)  # 不特定:人1 -> 不特定:人1
                # exophor
                if arg.flag == 'E':
                    entity = self._create_entity(exophor=arg.midasi, eid=arg.eid)
                    pas.add_special_argument(case, arg.midasi, entity.eid, '')
                else:
                    sid = self.sentences[sid2idx[arg.sid] - arg.sdist].sid
                    arg_bp = self._get_bp(sid, arg.tid)
                    _ = self._create_mention(arg_bp)
                    pas.add_argument(case, arg_bp, '')
        if pas.arguments:
            self._pas[pas.dtid] = pas

def chunk_srt(text):
    chunks = []
    # Filter out subs with no useful chars, like '♬~'.
    subs = [sub for sub in srt.parse(text)
            if remove_spaces_punctuation(sub.content)]
    grouped_subs = group_subs_list(subs)
    for group in grouped_subs:
        text_pieces = []
        html_pieces = []
        for sub in group:
            start_time = sub.start.total_seconds()
            end_time = sub.end.total_seconds()
            # h2z only affects kana by default, which is what we want.
            cleaned_content = jaconv.h2z(sub.content).strip()
            text_pieces.append(cleaned_content)
            html_pieces.append(
                f'<p t0="{start_time:.3f}" t1="{end_time:.3f}">' +
                html.escape(cleaned_content).replace('\n', '<br>') + '</p>')
        chunks.append({
            'text': '\n'.join(text_pieces),
            'html': '\n'.join(html_pieces),
        })
    return chunks

def orig_XML_to_doc_obj(tree):
    """Convert an orig XML tree into a document tree. Under construction."""
    doc = Document('dummy')
    root = tree.getroot()
    for sent in root.findall('.//Sentence'):
        if sent.text == '' or sent.text is None:
            continue
        text = jaconv.h2z(sent.text, digit=True, ascii=True)
        doc.sentences.append(Sentence(text))
    return doc

def convert(self, sent):
    sent = jaconv.z2h(sent, kana=False, ascii=True, digit=True)
    iters = re.finditer(r'([a-zA-Z][a-zA-Z\s]*)$', sent)
    output_word = ""
    pos = 0
    for i in iters:
        s_pos, e_pos = i.span()
        word = i.groups()[0]
        word = re.sub(r'^\s', '', word)
        word = re.sub(r'\s$', '', word)
        s_word = ""
        while pos < s_pos:
            output_word += sent[pos]
            pos += 1
        if word in self.med_dic:
            s_word = self.med_dic[word]
        elif word.lower() in self.med_dic:
            s_word = self.med_dic[word.lower()]
        else:
            s_word = word
        if s_word == '':
            s_word = word
        output_word += s_word
        pos = e_pos
    while pos < len(sent):
        output_word += sent[pos]
        pos += 1
    return jaconv.h2z(output_word, kana=True, ascii=True, digit=True)

async def on_message(self, ctx):
    if ctx.author.bot:
        return
    if all([
            ctx.channel.id != cs.Zyanken_room,
            ctx.channel.id != cs.Test_room
    ]):
        return
    # Check for グー, チョキ, パー (rock, scissors, paper), in that order.
    for hand in ["グー", "チョキ", "パー"]:
        if hand not in jaconv.hira2kata(jaconv.h2z(ctx.content)):
            continue
        # img, hand, msg, emoji1, emoji2 = zf.honda_to_zyanken(hand, ctx.author.id)
        img, hand, msg, emoji1, emoji2 = zf.honda_to_zyanken_breaktime(
            hand, ctx.author.id)
        if str(ctx.author.id) not in zf.No_reply:
            await ctx.add_reaction(emoji1)
            await ctx.add_reaction(emoji2)
            await ctx.channel.send(f"{ctx.author.mention} {hand}\n{msg}",
                                   file=discord.File(img),
                                   delete_after=5)
        if cs.Zyanken not in [roles.id for roles in ctx.author.roles]:
            guild = self.bot.get_guild(ctx.guild.id)
            await guild.get_member(ctx.author.id).add_roles(
                get_role(guild, cs.Zyanken))
        """
        if emoji2 == "🎉" and len(zf.Former_winner) <= 5:
            guild = self.bot.get_guild(ctx.guild.id)
            await guild.get_member(ctx.author.id).add_roles(get_role(guild, cs.Winner))
            if ctx.author.id not in zf.Former_winner:
                zf.Former_winner.append(ctx.author.id)
        """
        break

def processing(pconf, address):
    # Filters: skip the record unless/if a column matches the given value.
    for p in pconf.get('print_only_then', []):
        if address[p['col']] != p['value']:
            return None
    for p in pconf.get('dont_print_then', []):
        if address[p['col']] == p['value']:
            return None
    # Per-column string replacement.
    for p in pconf.get('replace', []):
        address[p['col']] = address[p['col']].replace(p['from'], p['to'])
    # Split a column into numbered per-character columns (col_00, col_01, ...).
    for p in pconf.get('split', []):
        for i, v in enumerate(address[p['col']]):
            address[f'{p["col"]}_{i:02}'] = v
    # Fill in defaults for empty columns.
    for p in pconf.get('default', []):
        if address.get(p['col'], '') == '':
            address[p['col']] = p['value']
    # Blank a column when the column it depends on is empty.
    for p in pconf.get('show_only_if_exists', []):
        if address.get(p['exists'], '') == '':
            address[p['col']] = ''
    # Finally, widen every value (ASCII and digits included) for printing.
    for k, v in address.items():
        address[k] = jaconv.h2z(v, ascii=True, digit=True)
    return address

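# A hypothetical configuration sketch (names and values invented, not from
# the original project) showing the shape of `pconf` and `address` that
# processing above expects:
pconf = {
    'dont_print_then': [{'col': 'status', 'value': '欠席'}],
    'replace': [{'col': 'zip', 'from': '-', 'to': 'ー'}],
    'default': [{'col': 'honorific', 'value': '様'}],
}
address = {'name': 'Yamada Taro', 'zip': '123-4567',
           'status': '出席', 'honorific': ''}
print(processing(pconf, address))
# Every value comes back widened, e.g. 'zip' becomes '１２３ー４５６７'.
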
def text_ins_reg(ins):
    # Start of the search-and-replace pass.
    # ### Convert full-width digits to half-width.
    ins = jaconv.z2h(ins, kana=False, ascii=False, digit=True)
    # ### Strip leading/trailing spaces.
    ins = re.sub('^[ +]|[ +]$', '', ins)
    # ### If a digit grouping separator is full-width, convert it to
    # ### half-width. (Assumed reconstruction: the original replacement
    # ### referenced capture groups that the lookaround pattern never defined.)
    ins = re.sub(r'(?<=\d)，(?=\d)', ',', ins)
    # ### Unify commas and periods: general-prose style.
    ins = ins.replace('，', '、')  # scientific style: '，'
    ins = ins.replace('．', '。')  # scientific style: '．'
    # ### Convert full-width ASCII to half-width:
    # swap full-width spaces for a placeholder, convert, restore the spaces.
    ins = ins.replace('　', '〓')
    ins = jaconv.z2h(ins, kana=False, ascii=True, digit=False)
    ins = ins.replace('〓', '　')
    # ### Convert half-width katakana to full-width.
    ins = jaconv.h2z(ins)
    # ### Convert ASCII '()' and '[]' to full-width.
    ins = re.sub(r'\((.+?)\)', r'（\1）', ins)
    ins = re.sub(r'\[(.+?)\]', r'［\1］', ins)
    # ### Convert the ':' in times of day to full-width.
    ins = re.sub(r'(\d{1,2}):(\d{2})', r'\1：\2', ins)
    # ### Keep a full-width period after numbers that start list items.
    # ### ### When it was an ideographic full stop:
    ins = re.sub(r'^(\d{1,3})。', r'\1．', ins, flags=re.MULTILINE)
    # ### ### When it was a period:
    ins = re.sub(r'^(\d{1,3})\.\s', r'\1．', ins, flags=re.MULTILINE)
    # ### Known issue: stray spaces around the whole string are not removed.
    # ins = ins.strip()
    ins = re.sub(r'^\s+', '', ins, flags=re.MULTILINE)
    return ins

def read_file(in_path, han2han=False, encode='cp932'):
    '''
    Read the input data.

    Args:
        in_path: path to the input file
        han2han: if True, leave half-width katakana as-is
        encode : encoding of the input file; defaults to cp932
                 (the Windows variant of Shift_JIS)

    Returns:
        lines: the file contents as a list of strings, one element per line
    '''
    lines = []
    with codecs.open(in_path, 'r', encode) as f:
        # Process line by line.
        for line in f:
            str = line.rstrip()
            # Skip empty lines.
            if len(str) == 0:
                continue
            if not han2han:
                # Half-width kana -> full-width.
                str = jaconv.h2z(str)
            lines.append(str)
    return lines

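# A minimal usage sketch (not from the original project): write a small
# cp932 file and read it back, showing the blank-line skip and the kana
# widening.
import os
import tempfile

with tempfile.NamedTemporaryFile('w', encoding='cp932', suffix='.txt',
                                 delete=False) as f:
    f.write('ｱｲｳ\n\nｴｵ\n')
    tmp_path = f.name
print(read_file(tmp_path))  # ['アイウ', 'エオ']
os.remove(tmp_path)
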
def test_jaconv():
    logging.info("=========================================")
    logging.info("=                jaconv                 =")
    logging.info("=========================================")
    test_cases = get_test_cases()
    for tc in test_cases:
        title = tc['title']
        body = tc['body']

        logging.info("ひらがな(全角) to カタカナ(全角) for %s" % title)
        calc_time(jaconv.hira2kata, body)
        logging.debug("result: %s" % jaconv.hira2kata(body))

        logging.info("カタカナ(全角) to ひらがな(全角) for %s" % title)
        calc_time(jaconv.kata2hira, body)
        logging.debug("result: %s" % jaconv.kata2hira(body))

        logging.info("ひらがな(全角) to カタカナ(半角) for %s" % title)
        calc_time(jaconv.hira2hkata, body)
        logging.debug("result: %s" % jaconv.hira2hkata(body))

        logging.info("半角 to 全角 for %s" % title)
        calc_time(jaconv.h2z, body)
        logging.debug("result: %s" % jaconv.h2z(body))

        logging.info("全角 to 半角 for %s" % title)
        calc_time(jaconv.z2h, body)
        logging.debug("result: %s" % jaconv.z2h(body))

def normalize_text_normal_ipadic(input_text, kana=True, ascii=True, digit=True):
    """
    * All hankaku katakana is converted into zenkaku katakana
    * All hankaku English alphabet and numeric strings are converted into zenkaku ones
    """
    # type: (text_type,bool,bool,bool)->text_type
    return jaconv.h2z(input_text, kana=kana, ascii=ascii, digit=digit)

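# A behavior sketch for the normalizer above (assumes jaconv is installed):
# with all three flags on, kana, ASCII letters, and digits are all widened.
print(normalize_text_normal_ipadic('ﾃｽﾄabc123'))  # 'テストａｂｃ１２３'
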
def func_normalize_text(text):
    # type: (str)->str
    """* What you can do
    - Normalizes the input text into a form suitable for KNP analysis.
    """
    if six.PY2:
        if isinstance(text, str):
            text = text.decode('utf-8')
        return jaconv.h2z(text=re.sub(r'\s', '', string=text),
                          kana=True,
                          ascii=True,
                          digit=True)
    else:
        return jaconv.h2z(text=re.sub(r'\s', '', string=text),
                          kana=True,
                          ascii=True,
                          digit=True)

def normalize_txt(text):
    return jaconv.h2z(jaconv.z2h(text.strip(), kana=False, digit=True,
                                 ascii=True),
                      kana=True, digit=False, ascii=False).lower()

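# A behavior sketch for normalize_txt above (not from the original code):
# digits and ASCII are narrowed, kana is widened, and the result is
# lower-cased.
print(normalize_txt(' ｱｲｳＡＢＣ１２３ '))  # 'アイウabc123'
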
def prepare(self, text):
    text = normalize.shorten_repeat(text, 3)
    text = jaconv.h2z(text)
    text = re_a_tag.sub('', text)
    text = kigou.sub('', text)
    for (old, new) in self.paraphrases['before'].items():
        text = text.replace(old, new)
    return text

def _value_to_zenkaku(value: str) -> Optional[str]:
    null_pattern = re.compile(r'null')
    match = re.search(null_pattern, value)
    if match:
        # 'null' strings become None instead of being widened.
        return None
    zenkaku = jaconv.h2z(value, kana=True, digit=True, ascii=True)
    return zenkaku

def normalize_jpn(text):
    text = suuji(text)
    text = jaconv.h2z(text.upper(), ignore='', kana=True, ascii=True,
                      digit=True)
    text = text.replace("ー", "ー")
    return text

def normalize_char_width(string: str) -> str:
    """Normalize character widths in string to a set standard.

    Converts all katakana to full-width, and converts all latin alphabet
    and numeric characters to half-width.
    """
    out_str = jaconv.h2z(string, kana=True, ascii=False, digit=False)
    out_str = jaconv.z2h(out_str, kana=False, ascii=True, digit=True)
    return out_str

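# A behavior sketch for normalize_char_width above (assumes jaconv is
# installed): mixed-width input comes out with full-width kana and
# half-width alphanumerics.
print(normalize_char_width('ｱｲｳＡＢＣ１２３'))  # 'アイウABC123'
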
def normalize(text, emoticon=False, repeat=None):
    text = HTMLParser().unescape(text)
    text = text.replace('\r', '\n')
    if emoticon is False:
        text = remove_emoticon(text)
    text = jaconv.h2z(text)
    text = text.replace('よぉ', 'よ').replace('よぉ', 'よ')
    text = text.replace('よお', 'よ').replace('よお', 'よ')
    if repeat:
        text = shorten_repeat(text, repeat)
    return text

def normalize_text_normal_ipadic(input_text):
    """
    * All hankaku katakana is converted into zenkaku katakana
    * All hankaku English alphabet and numeric strings are converted into zenkaku ones
    """
    return jaconv.h2z(input_text, kana=True, ascii=True, digit=True)

MECAB_ARGS_KEYS = 'rdulDOapmMFUBESxbPCtco'

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    for key in MECAB_ARGS_KEYS:
        parser.add_argument('-%s' % key)
    parser.add_argument('-N', type=int)
    args = parser.parse_args()

    mecab_arg = ''
    for key in MECAB_ARGS_KEYS:
        arg = getattr(args, key)
        if arg:
            mecab_arg += ' -%s%s' % (key, arg)
    if not args.F:
        mecab_arg += DEFALUT_FORMAT
    mecab = MeCab.Tagger(mecab_arg)

    while True:
        sentence = input()
        sentence = jaconv.h2z(sentence)
        if args.N:
            mecab.parseNBestInit(sentence)
            for i in range(args.N):
                result = mecab.next()
                if result:
                    print(result)
                else:
                    break
        else:
            result = mecab.parse(sentence)
            print(result)