def calc_speak_time(df, kakasi, speaker_id=None):
    """Attach a per-row speaking-speed score, optionally filtered by speaker.

    :param df: Pandas DataFrame with start_time, end_time, transcript, speaker
    :param kakasi: Kakasi instance (legacy setMode/getConverter API)
    :param speaker_id: optional speaker to keep; raises if not present
    :return: Pandas DataFrame with a new 'speak_scores' column
    """
    # Prepare a kanji -> hiragana converter so transcript length is
    # measured in kana characters rather than raw kanji.
    kakasi.setMode('J', 'H')
    converter = kakasi.getConverter()

    def _score(idx):
        # seconds spoken divided by number of kana characters
        duration = float(df.end_time[idx]) - float(df.start_time[idx])
        kana_length = len(converter.do(df.transcript[idx]))
        return duration / kana_length

    df['speak_scores'] = [_score(idx) for idx in df.index]

    if speaker_id is not None:
        if speaker_id not in df.speaker.unique():
            raise Exception(
                str(speaker_id) + ' does not exist in speaker column ')
        df = df[df.speaker == str(speaker_id)]
        df.reset_index(drop=True, inplace=True)
    return df
def run_kakasi(text_input):
    """Romanize Japanese text: mecab handles spacing, kakasi the romaji.

    Lazily builds the module-level converter `conv` on first use.
    """
    global conv
    if not conv:
        # kakasi set up from the pip page for this module.
        # using only setMode(J, a), (r, Hepburn) (s, False) opts
        # is buggy ¯\_(ツ)_/¯
        from pykakasi import kakasi
        kakasi = kakasi()
        for mode, value in (("H", "a"),        # Hiragana to ascii
                            ("K", "a"),        # Katakana to ascii
                            ("J", "a"),        # Japanese to ascii
                            ("r", "Hepburn"),  # Hepburn Roman table
                            ("s", False)):     # no separator
            kakasi.setMode(mode, value)
        conv = kakasi.getConverter()
    # Use mecab instead of kakasi's wakati feature to space Japanese
    # kanji, because mecab has better spacing prediction.
    tokens = mecab(text_input).split(' ')
    # The topic particle は is pronounced "wa"; rewrite it as わ so the
    # romanizer produces the spoken form.
    normalized = ' '.join('わ' if token == 'は' else token for token in tokens)
    return conv.do(normalized)
def text2InstaDmSmiling(text):
    """Convert *text* to hiragana, replacing every voiced, semi-voiced,
    or small kana with its plain form followed by '^_^'."""
    EMOJI = '^_^'
    # marked-kana strings paired with their plain replacements
    little_dashes = "ゔがぎぐげござじずぜぞだぢづでどばびぶべぼ"
    little_dashes_conv = "うかきくけこさしすせそたちつてとはひふへほ"
    little_circle = "ぱぴぷぺぽ"
    little_circle_conv = "はひふへほ"
    small_letter = "ぁぃぅぇぉゃゅょっ"
    small_letter_conv = "あいうえおやゆよつ"

    # Kanji and katakana both go to hiragana first.
    kakasi.setMode('J', 'H')
    kakasi.setMode('K', 'H')
    conv = kakasi.getConverter()
    result = conv.do(text)

    # Apply the three substitution tables in the same order as before.
    for marked, plain in ((little_dashes, little_dashes_conv),
                          (little_circle, little_circle_conv),
                          (small_letter, small_letter_conv)):
        table = {src: dst + EMOJI for src, dst in zip(marked, plain)}
        result = result.translate(str.maketrans(table))
    return result
def readDict():
    """Load the Japanese dictionary ("./dictionary/nihongolist.xlsx").

    After the first load the frame is pickled to
    dictionary/nihongolist.binaryfile and later calls read the pickle
    instead. Delete the binary file after updating the dictionary.
    """
    cache_path = './dictionary/nihongolist.binaryfile'
    if os.path.isfile(cache_path):
        with open(cache_path, 'rb') as cache:
            return pickle.load(cache)

    df = pd.read_excel("./dictionary/nihongolist.xlsx")
    df["romaji"] = "_"
    df["score"] = 0.000
    from pykakasi import kakasi
    kakasi = kakasi()
    kakasi.setMode('H', 'a')  # hiragana -> ascii romaji
    conv = kakasi.getConverter()
    for row in range(len(df)):
        romaji = conv.do(df.iat[row, 1])
        # Triple every vowel (a -> aaa, i -> iii, ...) in the order
        # a, i, u, e, o — same effect as the original chained replaces.
        for vowel in "aiueo":
            romaji = romaji.replace(vowel, vowel * 3)
        # column 5 is the 'romaji' column added above
        df.iat[row, 5] = romaji
    with open(cache_path, 'wb') as cache:
        pickle.dump(df, cache)
    return df
def WordConvert(word):
    """Return *word* with kanji and katakana converted to hiragana."""
    from pykakasi import kakasi
    converter_factory = kakasi()
    # Kanji -> hiragana and katakana -> hiragana.
    converter_factory.setMode('J', 'H')
    converter_factory.setMode("K", "H")
    return converter_factory.getConverter().do(word)
def to_katakana(self, text):
    """Convert *text* (UTF-8 encoded bytes) to katakana via pykakasi.

    NOTE(review): text.decode('utf-8') implies bytes input / Python 2 —
    on a Python 3 str this raises AttributeError; confirm the callers.
    """
    from pykakasi import kakasi
    kks = kakasi()
    # Force every script — katakana, hiragana, English symbols, kanji,
    # ascii — to katakana output (same mode order as before).
    for script in ('K', 'H', 'E', 'J', 'a'):
        kks.setMode(script, 'K')
    return kks.getConverter().do(text.decode('utf-8'))
class Phrase(pygame.sprite.Sprite):
    """A scrolling typing-game phrase: Japanese text on top, its romaji
    below; typed romaji characters are blanked out one by one."""

    # Shared class-level resources: font plus a kakasi converter that
    # romanizes every script using the Kunrei table.
    font = pygame.font.Font('migu-1m-regular.ttf', 32)
    kakasi = kakasi()
    kakasi.setMode("H", "a")
    kakasi.setMode("K", "a")
    kakasi.setMode("J", "a")
    kakasi.setMode("r", "Kunrei")
    conv = kakasi.getConverter()

    def __init__(self, y, string):
        """Create the phrase sprite at vertical position *y*."""
        pygame.sprite.Sprite.__init__(self, self.containers)
        # One list slot per romaji character; typed slots become ''.
        self.characters_roman = [c for c in self.conv.do(string)]
        # Size the image to whichever is wider: the Japanese text or
        # its romaji rendering.
        character_size = self.font.size(
            max(self.conv.do(string), string, key=lambda x: len(x)))
        surface = pygame.Surface((character_size[0], character_size[1] * 2))
        surface.set_colorkey((0, 0, 0))
        self.characters = self.font.render(string, True, (1, 1, 1),
                                           (255, 255, 255))
        self.image = surface
        self.rect = self.image.get_rect()
        self.rect.midleft = (640, y)
        self.speed = -2.0  # scrolls right-to-left
        # Index of the next character to be typed.
        self.next_character_pos = 0

    def update(self):
        """Scroll left, redraw the text and remaining romaji; kill the
        sprite once every romaji character has been typed."""
        self.rect.move_ip(self.speed, 0)
        self.image.fill((255, 255, 255))
        self.image.blit(self.characters, (0, 0))
        if len(self.characters_roman) == self.next_character_pos:
            self.kill()
        # Draw the untyped romaji on the lower half; '' slots (already
        # typed) are skipped.
        for i, c in enumerate(self.characters_roman):
            if not c:
                continue
            self.image.blit(
                self.font.render(c, True, (1, 1, 1), (255, 255, 255)),
                (i * 16, 32))
        # Wrap around once fully off the left edge.
        if self.rect.right < 0:
            self.rect.left = 640

    def input(self, character):
        """Consume *character* if it matches the next expected romaji
        letter, spawning an Explosion at the typed position."""
        if self.characters_roman[self.next_character_pos] == character:
            self.characters_roman[self.next_character_pos] = ''
            self.next_character_pos += 1
            Explosion((self.rect.left + (self.next_character_pos * 16),
                       self.rect.centery))
def to_romaji(text_jpn):
    """Tokenize Japanese text and return its Hepburn romaji."""
    # Segment first so kakasi sees space-separated words.
    segmented = ' '.join(tinysegmenter.tokenize(text_jpn))
    # Route every script to ascii romaji.
    kakasi.setMode("H", "a")  # Hiragana ke romaji
    kakasi.setMode("K", "a")  # Katakana ke romaji
    kakasi.setMode("J", "a")  # Japanese ke romaji
    kakasi.setMode("r", "Hepburn")  # default: Hepburn Roman table
    converter = kakasi.getConverter()
    return converter.do(segmented)
def VoiceRecodeAndRecongnize():
    """Record ~3 seconds of microphone audio, save it as a WAV file, run
    Google speech recognition (Japanese), and return the result in
    hiragana.

    Relies on module-level FORMAT, CHANNELS, RATE, chunk,
    WAVE_OUTPUT_FILENAME and a module-level kakasi instance.
    """
    p = pyaudio.PyAudio()
    # Block until the user presses Enter to start recording.
    start = input("録音開始 [Enter]>>")
    print("録音中...")
    stream = p.open(format=FORMAT,
                    channels=CHANNELS,
                    rate=RATE,
                    input=True,
                    frames_per_buffer=chunk)
    sequence = []
    # while True:
    #     data = stream.read(chunk)
    #     sequence.append(data)
    #     if keyboard.is_pressed("esc"):
    #         break
    # Fixed-length capture: 3 seconds worth of chunks.
    for i in range(0, int(RATE / chunk * 3)):
        data = stream.read(chunk)
        sequence.append(data)
    print("録音終了")
    stream.close()
    p.terminate()
    # Persist the raw frames as a WAV file for the recognizer.
    wavFile = wave.open(WAVE_OUTPUT_FILENAME, 'wb')
    wavFile.setnchannels(CHANNELS)
    wavFile.setsampwidth(p.get_sample_size(FORMAT))
    wavFile.setframerate(RATE)
    wavFile.writeframes(b"".join(sequence))
    wavFile.close()
    r = sr.Recognizer()
    with sr.AudioFile(WAVE_OUTPUT_FILENAME) as source:
        audio = r.record(source)
    word = r.recognize_google(audio, language='ja')
    #kakasi = kakasi()
    # Kanji and katakana -> hiragana for the returned reading.
    kakasi.setMode('J', 'H')
    kakasi.setMode('K', 'H')
    conv = kakasi.getConverter()
    word_hiragana = conv.do(word)
    print("----------------------------------------")
    print("認識結果:" + word)
    print("認識結果(ひらがなver):" + word_hiragana)
    print("----------------------------------------")
    return word_hiragana
def change_char(tweet_text, kakasi):
    '''change tweet_text, Kanji -> Hiragana'''
    seperator = "。"
    # Split into sentences, drop the trailing empty piece from the
    # final separator, then re-attach the separator to each sentence.
    pieces = tweet_text.split(seperator)
    pieces.pop()
    sentences = [piece + seperator for piece in pieces]

    # *kakasi* is the kakasi class — instantiate it here.
    kakasi = kakasi()
    kakasi.setMode("J", "H")  # kanji -> hiragana
    conv = kakasi.getConverter()

    # Debug output: each sentence next to its kana rendering.
    for sentence in sentences:
        print(sentence)
        print(conv.do(sentence))
        print()

    kana_text = conv.do(tweet_text)
    with open("prepare_text.txt", mode="w", encoding="utf-8") as f:
        f.write(kana_text)
    return kana_text
def furiganaLineTrans(text):
    """Annotate *text* with furigana-style readings.

    Each segmented word whose reading differs from its surface form is
    emitted as ``word { reading } ``; otherwise the word passes through
    unchanged. Readings come from the module-level transDict when
    present, else from the kakasi converter (kanji -> hiragana).
    """
    kakasi.setMode("J", "H")
    kakasi.setMode("r", "Hepburn")
    conv = kakasi.getConverter()
    stitchStr = ''
    for segWord in segmenter.tokenize(text):
        # `segWord in transDict` works on Python 2 and 3; the original
        # dict.has_key() was removed in Python 3 and raised
        # AttributeError there.
        if segWord in transDict:
            result = transDict[segWord]
        else:
            result = conv.do(segWord)
        if segWord == result:
            stitchStr += segWord
        else:
            result = result.strip().replace("\n", "")
            useStr = segWord + ' { ' + result + ' } '
            stitchStr += useStr
    return stitchStr
def exchange_word(text):
    """Return *text* romanized: hiragana, katakana, and kanji -> ascii."""
    for script in ("H", "K", "J"):
        kakasi.setMode(script, "a")
    converter = kakasi.getConverter()
    return converter.do(text)
def parse_topics(self, response):
    """Parse one bird-photo page into a BirdfanItem.

    The full-width Japanese species name is romanized with pykakasi so
    the item carries an ascii bird name.
    """
    # Converter that turns full-width Japanese into romaji.
    from pykakasi import kakasi
    kks = kakasi()
    kks.setMode('H', 'a')
    kks.setMode('K', 'a')
    kks.setMode('J', 'a')
    conv = kks.getConverter()

    jpgpath = response.xpath(
        '//*[@id="contents"]/div[3]/div/p/img/@src').extract_first()

    birditem = BirdfanItem()
    # URL of the page that holds the image.
    birditem['url'] = response.url
    # Human-readable page title.
    birditem['title'] = response.xpath(
        '//*[@id="contents"]/div[3]/h2/a/text()').extract_first()
    # Species name (full-width), romanized before storing.
    birditem['birdname'] = conv.do(
        response.xpath('//*[@id="contents"]/div[3]/div/div/h3/a/text()').
        extract_first())
    # Absolute URL of the image file itself.
    birditem['jpgurl'] = response.urljoin(jpgpath)
    yield birditem
def dialog_nlp(input_txt, version):
    """Run the dockerized dialogue NLP model on *input_txt*.

    For version == 't5': writes the question file, runs test.py inside
    the container, romanizes the model's answer, and writes a
    TTS-ready line to intermediate/nlp_out_fixed.txt.

    NOTE(review): nesting reconstructed from a collapsed source —
    confirm which statements sit inside the `with` blocks.
    """
    if version == 't5':
        DIALOG_NLP_CONTAINER_NAME = "0e3490a65e84"
        # Make sure the container is running before exec'ing into it.
        proc = subprocess.run(
            f"docker start {DIALOG_NLP_CONTAINER_NAME}", shell=True)
        with open("dialogue/t5/question/question.txt", "w") as question_txt:
            question_txt.write(input_txt)
        with open("intermediate/nlp_out.txt", "w") as output_txt:
            # import os
            # os.system(
            #     f"docker exec -w /t5 {DIALOG_NLP_CONTAINER_NAME} python3 test.py")
            proc_1 = subprocess.run(
                f"docker exec -w /t5 {DIALOG_NLP_CONTAINER_NAME} python3 test.py",
                shell=True, stdout=output_txt, text=True)
            print(proc_1.stdout)
        # TODO: text outprocess(extract only answer in english)
        # Romanizer: every Japanese script -> ascii.
        from pykakasi import kakasi
        kakasi = kakasi()
        kakasi.setMode('H', 'a')
        kakasi.setMode('K', 'a')
        kakasi.setMode('J', 'a')
        conv = kakasi.getConverter()
        with open("intermediate/nlp_out.txt", "r") as f:
            # Second line holds the answer; strip T5 special tokens.
            responce = f.readlines()[1].replace(
                "<pad>", '').replace('</s>', '')
            print(responce)
            responce = conv.do(responce)
            print(responce)
        with open("intermediate/nlp_out_fixed.txt", "w") as f:
            # Pipe-separated line consumed by the TTS stage;
            # responce[1:] drops the leading character (presumably a
            # space left by token stripping — TODO confirm).
            f.write(
                "jsut_ver1.1/onomatopee300/wav/ONOMATOPEE300_300.wav|"+responce[1:].replace('\n', '')+".|1")
def toKatakana(str):
    """Convert *str* to katakana (kanji, hiragana, romaji, katakana all
    mapped to katakana output).

    NOTE(review): the parameter shadows the builtin ``str`` inside this
    function; name kept for caller compatibility.
    """
    # Same four modes, same order as before: J, H, a, K -> 'K'.
    for script in ("J", "H", "a", "K"):
        kakasi.setMode(script, "K")
    conv = kakasi.getConverter()
    return conv.do(str)
def jp_to_romen(text):
    """Romanize *text* and return it with its first letter capitalized."""
    for script in ('H', 'K', 'J'):
        kakasi.setMode(script, 'a')
    conv = kakasi.getConverter()
    romanized = conv.do(text)
    return romanized.capitalize()
def change_word(read_data):
    """Return *read_data* converted to ascii romaji."""
    from pykakasi import kakasi
    kks = kakasi()
    # Hiragana, katakana, and kanji all map to ascii.
    for script in ('H', 'K', 'J'):
        kks.setMode(script, 'a')
    return kks.getConverter().do(read_data)
def get_reading_kakasi(word):
    """Gets reading for a given Japanese word by using kakasi.

    The reading in hiragana is returned by this function.
    """
    # The package __init__ re-exports the kakasi class, so this name
    # resolves to the class rather than the submodule.
    import pykakasi.kakasi as kakasi
    kakasi = kakasi()
    kakasi.setMode("J", "H")   # kanji -> hiragana
    kakasi.setMode("C", True)  # default: Separator
    kakasi.setMode("c", False) # default: no Capitalize
    return kakasi.getConverter().do(word)
def formats(key_world):
    """Normalize a keyword: trim, strip punctuation, then romanize."""
    cleaned = re.sub(r'[^\w\s]', '', key_world.strip())
    from pykakasi import kakasi
    kks = kakasi()
    for script in ('H', 'K', 'J'):
        kks.setMode(script, 'a')
    return kks.getConverter().do(cleaned)
def getConverter(): import sys # reload()之前必须要引入模块 reload(sys) sys.setdefaultencoding('utf-8') # 防止UTF8出问题 from pykakasi import kakasi kakasi = kakasi() kakasi.setMode('H', 'a') kakasi.setMode('K', 'a') kakasi.setMode('J', 'a') conv = kakasi.getConverter() return conv
def make_filename(title):
    """Romanize *title* into a filesystem-friendly name.

    Kanji and kana are converted to ascii romaji, spaces become
    underscores, and full-width characters are narrowed via zen2han.
    """
    # `wakati` was imported here but never used — removed.
    from pykakasi import kakasi
    import zen2han
    kakasi = kakasi()
    kakasi.setMode("H", "a")
    kakasi.setMode("K", "a")
    kakasi.setMode("J", "a")
    conv = kakasi.getConverter()
    title = conv.do(title).replace(" ", "_")
    return zen2han.zen2han(title)
def main():
    """Stream the current iTunes track (romanized title/artist) plus the
    elapsed time as zero-padded MMSS over a serial device, twice a
    second, while mirroring the same info to the terminal."""
    kakasi.setMode('H', 'a')
    kakasi.setMode('K', 'a')
    kakasi.setMode('J', 'a')
    conv = kakasi.getConverter()
    with serial.Serial(port="/dev/tty.usbmodem141141", baudrate=9600,
                       timeout=1) as device:
        while True:
            title = itunes.current_track.name.get()
            artist = itunes.current_track.artist.get()
            elapsed = int(itunes.player_position.get())
            # Zero-padded MMSS. The original if/elif chain left time_str
            # unassigned for tracks of 10+ minutes (NameError) and had
            # an unreachable `minute == 0` branch.
            minute, second = divmod(elapsed, 60)
            time_str = '%02d%02d' % (minute, second)
            sleep(0.5)
            # Encode before writing: pyserial on Python 3 requires
            # bytes, and the original `bytes + '\n'` concatenation
            # raised TypeError there.
            device.write((conv.do(title) + '\n').encode(errors="ignore"))
            device.write((conv.do(artist) + '\n').encode(errors="ignore"))
            device.write((time_str + '\n').encode())
            os.system('clear')
            print("TITLE = " + title)
            print("ARTIST = " + artist)
            print("TIME = " + time_str)
        # The original called device.close() here, but it was
        # unreachable after `while True`; the `with` block closes the
        # port on exit anyway.
def toFurigana(str):
    """Tokenize *str* and annotate it with romaji furigana.

    NOTE(review): the parameter shadows the builtin ``str``; name kept
    for caller compatibility.
    """
    kakasi = pykakasi.kakasi()
    # 'aF' = romaji furigana output for every script (same mode order
    # as before: J, H, K).
    for script in ("J", "H", "K"):
        kakasi.setMode(script, "aF")
    conv = kakasi.getConverter()
    return tokenTranslation(conv, str)
def __init__(self, dictionary_type='alphabet', max_len=1000, length=1024):
    """Select a character dictionary and build a romaji converter.

    :param dictionary_type: 'alphabet', 'katakana', or 'katakana_small'
                            (any other value leaves self.dictionary unset,
                            as before)
    :param max_len: maximum sequence length
    :param length: output vector length
    """
    # Kept as an if/elif chain so only the selected module attribute is
    # ever touched.
    if dictionary_type == 'alphabet':
        self.dictionary = alphabet.alphabet_dict
    elif dictionary_type == 'katakana':
        self.dictionary = katakana.katakana_dict
    elif dictionary_type == 'katakana_small':
        self.dictionary = katakana_small.katakana_small_dict
    self.dictionary_type = dictionary_type
    self.max_len = max_len
    self.length = length

    # Romanizer: every Japanese script -> ascii.
    kakasi = pykakasi.kakasi()
    for script in ('H', 'K', 'J'):
        kakasi.setMode(script, 'a')
    self.kakasi_conv = kakasi.getConverter()
def get_initials(words):
    """Return the upper-cased first romaji letter of each word.

    Words that romanize to an empty string yield ''."""
    for script in ('H', 'K', 'J'):
        kakasi.setMode(script, 'a')
    # kakasi.setMode("C", True)
    conv = kakasi.getConverter()
    return [conv.do(word)[:1].upper() for word in words]
import requests
import sys
import os
import tweepy

# Make the app's packages importable before the local imports below.
sys.path.append('/app')

import models  # noqa
import jaconv  # noqa
from util import morpheme  # noqa
import services  # noqa

# Firestore client used for persistence.
# NOTE(review): `firestore` is not imported in this chunk — presumably
# imported above/elsewhere; confirm.
db = firestore.Client()

# Module-level kanji -> hiragana converter.
# NOTE(review): the `kakasi` class is likewise not imported here.
kakasi = kakasi()
kakasi.setMode("J", "H")
conv = kakasi.getConverter()

# Service singletons shared by the rest of the module.
system_service = services.system_instance
word_service = services.word_instance
user_service = services.user_instance
tag_service = services.tag_instance

# user_service.get_oauth_url()
# user_data = models.UserUpdate()
# user_data.twitter_id = 'user_id'
# user_data.twitter_name= 'screen_name'
# user_data.twitter_key= 'oauth_token'
# user_data.twitter_secret= 'oauth_token_secret'
# user_data.session_id = "aaaa"
# NOTE(review): the indented lines below are the tail of a grading
# function defined above this excerpt — it accumulates a student's
# wrong answers and writes them to one corrections file per student.
# Kept byte-identical; indentation reconstructed.
    incorrect_answers.append(output_string)
    student_output_filepath = os.path.join(output_directory, student_name)
    with open(student_output_filepath, "w+") as student_corrections_file:
        student_corrections_file.write("\n".join(incorrect_answers))

# ------------------------------------------------------------------------------
current_directory = os.path.dirname(os.path.abspath(__file__))

# CHANGE THIS BIT!  Path to this lesson's master answer key.
master_answer_key = os.path.join(current_directory,
                                 "answer_keys/2017_06_08_lesson_03_part_02.txt")
student_answer_sets = os.path.join(current_directory, "answer_sets")

# CHANGE THIS BIT!  Date folder holding the answer sets to grade.
date_string = "2017_06_08"
current_student_answer_sets = os.path.join(student_answer_sets, date_string,
                                           "student_answers")
output_directory = os.path.join(student_answer_sets, date_string,
                                "graded_answers")

# ---
# Normalizer: kanji -> hiragana, so answers compare script-insensitively.
kakasi = kakasi()
kakasi.setMode("J", "H")  # default: Japanese no conversion
japanese_text_normalizer = kakasi.getConverter()

# ---
perform_grading(master_answer_key, current_student_answer_sets,
                output_directory, japanese_text_normalizer)
#!/usr/bin/env python3
# coding: utf-8
"""Demo: romanize a Japanese filename with pykakasi."""
from pykakasi import kakasi

kakasi = kakasi()
# Map hiragana, katakana, and kanji all to ascii romaji.
for _mode in ('H', 'K', 'J'):
    kakasi.setMode(_mode, 'a')
conv = kakasi.getConverter()

filename = '本日は晴天なり.jpg'
print("Base", filename)
print("Base type", type(filename))
print("Conv", conv.do(filename))
# The package __init__ re-exports the kakasi class, so this name binds
# the class, not the submodule.
import pykakasi.kakasi as kakasi

kakasi = kakasi()
# Full romanization setup: every script to ascii, Hepburn table,
# word separators, capitalized words.
for _mode, _value in (("H", "a"),        # Hiragana -> romaji
                      ("K", "a"),        # Katakana -> romaji
                      ("J", "a"),        # Kanji -> romaji
                      ("E", "a"),        # symbols -> ascii
                      ("r", "Hepburn"),  # Hepburn Roman table
                      ("s", True),       # insert separators
                      ("C", True)):      # capitalize words
    kakasi.setMode(_mode, _value)
conv = kakasi.getConverter()
result = conv.do('澱んだ街角で僕らは出会った')
def dabiaoqian(path):
    """Label feature frames per utterance ("dabiaoqian" = attach labels).

    For every subdirectory of *path*: read the recognition marker file
    (symbol.txt), align reference text vs. recognition result using the
    C/D/I/S markers, rescue 'S' substitutions that match after
    romanization, then prefix each feature row of the utterance's
    .wav.csv with a label — '0' correct, '1' wrong, '9' leading
    silence / trailing period (rows tagged 9 are deleted at the end).

    NOTE(review): indentation reconstructed from a collapsed source —
    confirm nesting, especially the final shanchu.shanchuhang call.
    """
    from pykakasi import kakasi
    import csv, os
    name_tezheng = 'mizhichuli_log'  # name of the file holding the feature values
    xinde = 'xinde_mizhichuli'  # folder that receives the new feature values
    name1 = 'align1'
    name2 = 'symbol.txt'  # marker file name; when align1 misbehaves, use symbol.txt (switch the code below accordingly)
    # Romanizer used to compare words script-insensitively.
    kakasi = kakasi()
    kakasi.setMode("H", "a")  # Hiragana to ascii, default: no conversion
    kakasi.setMode("K", "a")  # Katakana to ascii, default: no conversion
    kakasi.setMode("J", "a")  # Japanese to ascii, default: no conversion
    kakasi.setMode("r", "Hepburn")  # default: use Hepburn Roman table
    kakasi.setMode("s", True)  # add space, default: no separator
    conv = kakasi.getConverter()
    for i in os.listdir(path):
        path_1 = os.path.join(path, i)
        path_out = os.path.join(path_1, 'keka')
        path_tezheng = os.path.join(path_1, name_tezheng)
        #biaozhiwenjian = csv.reader(open(os.path.join(path_1, name1), 'r', encoding='EUC-JP'))  # read the marker file
        biaozhiwenjian = csv.reader(
            open(os.path.join(path_1, name2), 'r',
                 encoding='utf-8'))  # when the marker file is a .txt file
        # Materialized as a list of single-element lists, e.g.:
        #[['id: l_8840_9810_T1_F_01'],['REF: そう です か 、 はい 。 '],['HYP: そう です か はい 。 '],['EVAL: C C C D C C '],[],['id: l_10800_13190_T1_F_01']]
        biaozhiwenjian_1 = [i for i in biaozhiwenjian]
        # print(biaozhiwenjian_1)
        # os.system('pause')
        path_xinde = os.path.join(path_1, xinde)
        mulu.mkdir(path_xinde)
        for i in range(0, len(biaozhiwenjian_1)):  # each pass labels one audio file
            try:
                biaozhi = biaozhiwenjian_1[i][0]
            except:
                continue
            if 'id:' in biaozhi:
                ID = ''
                l_biaozhi = []
                l_zhengjie = []
                l_zhengjie_1 = []
                l_jieguo = []
                l_jieguo_1 = []
                # The id row is followed by REF, HYP, EVAL rows in order;
                # drop each row's leading tag token.
                ID = biaozhiwenjian_1[i][0].replace('id: ', '')
                l_zhengjie = biaozhiwenjian_1[i + 1][0].split()
                l_zhengjie.pop(0)
                l_jieguo = biaozhiwenjian_1[i + 2][0].split()
                l_jieguo.pop(0)
                l_biaozhi = biaozhiwenjian_1[i + 3][0].split()
                l_biaozhi.pop(0)
                # Build strictly aligned reference / hypothesis / marker
                # lists; a 'D' (deletion) leaves the hypothesis slot empty.
                jishuqi_jieguo = 0
                jishuqi_zhengjie = 0
                jishuqi_biaozhi = 0
                for i in l_biaozhi:
                    if i == "D":
                        l_zhengjie_1.append(l_zhengjie[jishuqi_zhengjie])
                        l_jieguo_1.append('')
                        jishuqi_zhengjie += 1
                        jishuqi_biaozhi += 1
                    if i == "C":
                        l_zhengjie_1.append(l_zhengjie[jishuqi_zhengjie])
                        l_jieguo_1.append(l_jieguo[jishuqi_jieguo])
                        jishuqi_zhengjie += 1
                        jishuqi_jieguo += 1
                        jishuqi_biaozhi += 1
                    if i == "I":
                        l_jieguo_1.append(l_jieguo[jishuqi_jieguo])
                        l_zhengjie_1.append('')
                        jishuqi_jieguo += 1
                        jishuqi_biaozhi += 1
                    if i == "S":
                        # 'S' (substitution) gets special treatment:
                        # compare both words after romanization and, if
                        # they come out equal, flip the marker to 'C'.
                        l_zhengjie_1.append(l_zhengjie[jishuqi_zhengjie])
                        l_jieguo_1.append(l_jieguo[jishuqi_jieguo])
                        zhengjie_hanzi = l_zhengjie[jishuqi_zhengjie]
                        jieguo_hanzi = l_jieguo[jishuqi_jieguo]
                        # First normalize the recognition result.
                        if conv.do(
                                jieguo_hanzi
                        ) == jieguo_hanzi and jieguo_hanzi != '、':  # already alphabetic?
                            try:
                                zhuanhuan_jieguo = conv.do(
                                    make_kana_convertor._make_kana_convertor(
                                        strQ2B.strQ2B(jieguo_hanzi)))
                            except:
                                zhuanhuan_jieguo = conv.do(
                                    make_kana_convertor._make_kana_convertor(
                                        jieguo_hanzi))
                        else:
                            zhuanhuan_jieguo = conv.do(jieguo_hanzi)
                        # Then normalize the reference word.
                        if conv.do(
                                zhengjie_hanzi
                        ) == zhengjie_hanzi and zhengjie_hanzi != '、':  # already alphabetic?
                            try:
                                zhuanhuan_zhengjie = conv.do(
                                    make_kana_convertor._make_kana_convertor(
                                        strQ2B.strQ2B(zhengjie_hanzi)))
                            except:
                                zhuanhuan_zhengjie = conv.do(
                                    make_kana_convertor._make_kana_convertor(
                                        zhengjie_hanzi))
                        else:
                            zhuanhuan_zhengjie = conv.do(zhengjie_hanzi)
                        if zhuanhuan_jieguo == zhuanhuan_zhengjie:
                            # print("正解list")
                            # print(l_zhengjie_1)
                            #
                            # print("识别结果list")
                            # print(l_jieguo_1)
                            #
                            # print("zhuanhuan_jieguo")
                            # print(zhuanhuan_jieguo)
                            # print("zhuanhuan_zhengjie")
                            # print(zhuanhuan_zhengjie)
                            # print("a marker was flipped")
                            # print(ID)
                            # os.system("pause")
                            l_biaozhi[jishuqi_biaozhi] = 'C'
                        jishuqi_biaozhi += 1
                        jishuqi_zhengjie += 1
                        jishuqi_jieguo += 1
                # print(l_jieguo_1)
                # print(l_zhengjie_1)
                # print(l_biaozhi)
                # os.system('pause')
                path_out_1 = os.path.join(path_out, ID + '.out')  # read the .out file
                dianout = pi.read_out(path_out_1)
                start = dianout.pop(0)[1][1]  # label the leading silent span 9; pop the first element
                start_1 = dianout[-1][1][0]  # label the trailing period 9
                # end_1 = dianout.pop(-1)[1][1]
                # print(dianout)
                # os.system('pause')
                # Resulting shape: [['', [0, 18]], ['お', [19, 24]], ['願い', [25, 49]], ['三', [50, 82]], ['。', [83, 86]]]
                path_tezheng_1 = os.path.join(path_tezheng, ID + '.wav.csv')
                tezhengzhi = csv.reader(
                    open(path_tezheng_1, 'r', encoding='utf-8'))
                t_file_list = [i for i in tezhengzhi]
                end_1 = len(t_file_list) - 1
                for i in range(start + 1):
                    # Tag all leading-silence rows 9 (treated as
                    # correctly recognized).
                    t_file_list[i].insert(0, '9')
                for i in range(start_1, end_1 + 1):
                    t_file_list[i].insert(0, '9')
                l_jieguo_1.pop(-1)  # the final period is already labeled; pop it
                print("ID")
                print(ID)
                print("l_biaozhi")
                print(l_biaozhi)
                print("l_jieguo_1")
                print(l_jieguo_1)
                print("dianout")
                print(dianout)
                # Rebuild the dianout list; everything below relies on it.
                dianout_chongzao = cz.chongzao(l_biaozhi, l_jieguo_1, dianout,
                                               ID)
                print('dianout_chongzao')
                print(dianout_chongzao)
                # Use the rebuilt list to label every frame, e.g.:
                # [['災害', [3, 40], 'C'], ['で', [41, 48], 'C'], ['ござい', [49, 77], 'C'], ['ます', [78, 98], 'C'],
                # ['から', [99, 130], 'C'], ['、', [131, 152], 'C'], ['その', [153, 177], 'C'], ['場', [178, 190], 'C'],
                # ['で', [191, 209], 'C']]
                for i in dianout_chongzao:
                    start, end = i[1]
                    if i[2] == 'C':
                        for i in range(start, end + 1):
                            t_file_list[i].insert(0, '0')
                    else:
                        for i in range(start, end + 1):
                            t_file_list[i].insert(0, '1')
                path_xinde_tezhengzhi = os.path.join(path_xinde, ID + '.csv')
                with open(path_xinde_tezhengzhi, 'w+',
                          encoding='utf-8') as mergen_file:
                    for i in t_file_list:
                        mergen_file.write('%s\n' % ','.join(i))
        shanchu.shanchuhang(path_xinde)  # delete every feature row tagged 9
# NOTE(review): standalone evaluation setup; the listdir loop below is
# truncated in this excerpt and the indentation was reconstructed.
path = r'C:\Users\a7825\Desktop\工作空间\杂物\对比\baseline\baselinetest'
fazhi = 0.9  # decision threshold
# name_tezheng =   # file that holds the feature values
# xinde =          # folder for the new feature values
name1 = 'align1'
name2 = 'symbol.txt'  # marker file name; when align1 misbehaves, use symbol.txt (switch the code below accordingly)
# Romanizer used to compare words script-insensitively.
kakasi = kakasi()
kakasi.setMode("H", "a")  # Hiragana to ascii, default: no conversion
kakasi.setMode("K", "a")  # Katakana to ascii, default: no conversion
kakasi.setMode("J", "a")  # Japanese to ascii, default: no conversion
kakasi.setMode("r", "Hepburn")  # default: use Hepburn Roman table
kakasi.setMode("s", True)  # add space, default: no separator
conv = kakasi.getConverter()
zhenzhi_2 = []  # ground-truth labels fed into the confusion matrix
yucezhi_2 = []  # predicted labels
for i in os.listdir(path):  #
    path_1 = os.path.join(path, i)
    path_out = os.path.join(path_1, 'keka')
"""Demo: print the romaji reading of a Japanese sentence."""
from pykakasi import kakasi

kakasi = kakasi()
# Hiragana, katakana, and kanji all map to ascii romaji.
for _mode in ('H', 'K', 'J'):
    kakasi.setMode(_mode, 'a')
conv = kakasi.getConverter()
print(conv.do('本日は晴天なり'))
import argparse
from pykakasi import kakasi

if __name__ == '__main__':
    # Romanizer: every Japanese script -> ascii.
    kakasi = kakasi()
    kakasi.setMode('H', 'a')  # Hiragana to ascii, default: no conversion
    kakasi.setMode('K', 'a')  # Katakana to ascii, default: no conversion
    kakasi.setMode('J', 'a')  # Japanese to ascii, default: no conversion
    conv = kakasi.getConverter()

    cli = argparse.ArgumentParser()
    cli.add_argument('args', type=str, nargs='*')
    cli.add_argument('--verbose', action='store_true')
    opts = cli.parse_args()

    # Print the romaji for each positional argument; echo the original
    # first when --verbose is set.
    for text in opts.args:
        if opts.verbose:
            print(text)
        print(conv.do(text))