Ejemplo n.º 1
0
def text2InstaDmSmiling(text):
    """Convert *text* to hiragana, inserting a smiling emoticon after every
    kana that originally carried dakuten/handakuten marks or was small.

    :param text: Japanese text; kanji and katakana are first normalised to
                 hiragana via the module-level ``kakasi`` instance.
    :return: hiragana string with '^_^' appended after each affected kana.
    """
    EMOJI = '^_^'
    # voiced (dakuten) kana -> unvoiced base kana
    little_dashes = "ゔがぎぐげござじずぜぞだぢづでどばびぶべぼ"
    little_dashes_conv = "うかきくけこさしすせそたちつてとはひふへほ"
    # semi-voiced (handakuten) kana -> base kana
    little_circle = "ぱぴぷぺぽ"
    little_circle_conv = "はひふへほ"
    # small kana -> full-size kana
    small_letter = "ぁぃぅぇぉゃゅょっ"
    small_letter_conv = "あいうえおやゆよつ"

    # The three source alphabets are disjoint, so one combined translation
    # table in a single str.translate pass is equivalent to three passes.
    table = str.maketrans({
        src: dst + EMOJI
        for pairs in (
            zip(little_dashes, little_dashes_conv),
            zip(little_circle, little_circle_conv),
            zip(small_letter, small_letter_conv),
        )
        for src, dst in pairs
    })

    # Normalise kanji (J) and katakana (K) to hiragana (H) first.
    kakasi.setMode('J', 'H')
    kakasi.setMode('K', 'H')
    conv = kakasi.getConverter()
    text_hiragana = conv.do(text)

    return text_hiragana.translate(table)
Ejemplo n.º 2
0
def calc_speak_time(df, kakasi, speaker_id=None):
    """Score each utterance by seconds spoken per hiragana character.

    :param df: Pandas DataFrame with ``start_time``, ``end_time``,
               ``transcript`` and ``speaker`` columns; a ``speak_scores``
               column is added in place.
    :param kakasi: Kakasi instance (``setMode``/``getConverter`` interface).
    :param speaker_id: optional speaker to filter on after scoring.
    :return: the scored DataFrame (filtered and re-indexed when
             *speaker_id* is given).
    :raises ValueError: if *speaker_id* is not present in ``df.speaker``
        (ValueError is a subclass of Exception, so existing callers that
        catch Exception keep working).
    """
    # Kanji -> hiragana converter so the transcript length approximates
    # the number of spoken syllables.
    kakasi.setMode('J', 'H')
    conv = kakasi.getConverter()

    # seconds per hiragana character for every row
    df['speak_scores'] = [
        (float(df.end_time[i]) - float(df.start_time[i]))
        / len(conv.do(df.transcript[i]))
        for i in df.index
    ]

    if speaker_id is not None:
        if speaker_id not in df.speaker.unique():
            raise ValueError(
                str(speaker_id) + ' does not exist in speaker column ')
        df = df[df.speaker == str(speaker_id)]
        df.reset_index(drop=True, inplace=True)

    return df
Ejemplo n.º 3
0
    def parse_topics(self, response):
        """Yield one BirdfanItem describing the bird photo on this page."""
        # pykakasi converter: romanise hiragana (H), katakana (K), kanji (J).
        from pykakasi import kakasi
        kks = kakasi()
        for src in ('H', 'K', 'J'):
            kks.setMode(src, 'a')
        romanize = kks.getConverter().do

        item = BirdfanItem()
        # URL of the page that hosts the image.
        item['url'] = response.url
        # Human-readable page title.
        item['title'] = response.xpath(
            '//*[@id="contents"]/div[3]/h2/a/text()').extract_first()
        # Bird species name (full-width Japanese) converted to romaji.
        item['birdname'] = romanize(
            response.xpath('//*[@id="contents"]/div[3]/div/div/h3/a/text()').
            extract_first())
        # Absolute URL of the image file itself.
        img_src = response.xpath(
            '//*[@id="contents"]/div[3]/div/p/img/@src').extract_first()
        item['jpgurl'] = response.urljoin(img_src)
        yield item
Ejemplo n.º 4
0
def run_kakasi(text_input):
    """Romanise *text_input*: mecab-based spacing, は->わ particle fix,
    then pykakasi conversion through a lazily-built module-level converter."""
    global conv

    if not conv:
        # Build the converter once; settings follow the pykakasi pip page.
        # using only setMode(J, a), (r, Hepburn) (s, False) opts
        # is buggy  ¯\_(ツ)_/¯
        from pykakasi import kakasi

        kks = kakasi()
        for src in ("H", "K", "J"):
            kks.setMode(src, "a")       # kana/kanji -> ascii romaji
        kks.setMode("r", "Hepburn")     # Hepburn romanisation table
        kks.setMode("s", False)         # no extra separator
        conv = kks.getConverter()

    # mecab predicts spacing between kanji better than kakasi's wakati mode.
    tokens = mecab(text_input).split(' ')

    # Replace the topic particle は with its spoken reading わ so it
    # romanises as "wa" rather than "ha".
    spaced = ' '.join('わ' if tok == 'は' else tok for tok in tokens)

    return conv.do(spaced)
Ejemplo n.º 5
0
def exchange_word(text):
    """Romanise *text* using the module-level kakasi instance."""
    # Map every Japanese script (hiragana, katakana, kanji) onto ascii.
    for script in ("H", "K", "J"):
        kakasi.setMode(script, "a")
    return kakasi.getConverter().do(text)
Ejemplo n.º 6
0
def readDict():
    """Load the Japanese word dictionary, caching it as a pickle.

    On first use this reads ``./dictionary/nihongolist.xlsx``, adds a
    ``romaji`` column (hiragana romanised, with every vowel tripled, e.g.
    "ka" -> "kaaa") and a ``score`` column, then stores the DataFrame in
    ``nihongolist.binaryfile``.  Subsequent calls load the pickle.  Delete
    the binary file after updating the spreadsheet.

    :return: Pandas DataFrame with the dictionary contents.
    """
    cache_path = './dictionary/nihongolist.binaryfile'
    if os.path.isfile(cache_path):
        # NOTE: pickle.load is unsafe on untrusted files; this cache is
        # assumed to be locally generated by this very function.
        with open(cache_path, 'rb') as cache_file:
            df = pickle.load(cache_file)
    else:
        df = pd.read_excel("./dictionary/nihongolist.xlsx")
        df["romaji"] = "_"
        df["score"] = 0.000
        from pykakasi import kakasi
        kks = kakasi()
        kks.setMode('H', 'a')   # hiragana -> romaji
        conv = kks.getConverter()
        # Triple every vowel in a single translate() pass — equivalent to
        # the original chain of five .replace() calls, since each vowel is
        # replaced independently.
        triple_vowels = str.maketrans({v: v * 3 for v in "aiueo"})
        for n in range(len(df)):
            # column 1 holds the hiragana word; column 5 is 'romaji'
            df.iat[n, 5] = conv.do(df.iat[n, 1]).translate(triple_vowels)
        with open(cache_path, 'wb') as cache_file:
            pickle.dump(df, cache_file)
    return df
#readDict()
Ejemplo n.º 7
0
def dialog_nlp(input_txt, version):
    """Run the T5 dialogue model inside a Docker container and post-process
    its answer into a romanised line for the TTS stage.

    :param input_txt: question text, written to the model's input file.
    :param version: model selector; only 't5' is handled here.

    Side effects: starts a fixed Docker container, writes
    dialogue/t5/question/question.txt, intermediate/nlp_out.txt and
    intermediate/nlp_out_fixed.txt.  Returns None.
    """
    if version == 't5':
        # Hard-coded container id of the dialogue-NLP image.
        DIALOG_NLP_CONTAINER_NAME = "0e3490a65e84"
        proc = subprocess.run(
            f"docker start {DIALOG_NLP_CONTAINER_NAME}", shell=True)
        # Hand the question to the model through its input file.
        with open("dialogue/t5/question/question.txt", "w") as question_txt:
            question_txt.write(input_txt)
        with open("intermediate/nlp_out.txt", "w") as output_txt:
            # import os
            # os.system(
            #     f"docker exec -w /t5 {DIALOG_NLP_CONTAINER_NAME}  python3 test.py")
            # Run inference inside the container, capturing stdout to file.
            proc_1 = subprocess.run(
                f"docker exec -w /t5 {DIALOG_NLP_CONTAINER_NAME}  python3 test.py", shell=True, stdout=output_txt, text=True)
            print(proc_1.stdout)

        # TODO: text outprocess(extract only answer in english)
        from pykakasi import kakasi
        kakasi = kakasi()

        # Romanise hiragana/katakana/kanji in the model output.
        kakasi.setMode('H', 'a')
        kakasi.setMode('K', 'a')
        kakasi.setMode('J', 'a')

        conv = kakasi.getConverter()
        with open("intermediate/nlp_out.txt", "r") as f:
            # The second line holds the answer; strip T5 sentinel tokens.
            responce = f.readlines()[1].replace(
                "<pad>", '').replace('</s>', '')
            print(responce)
            responce = conv.do(responce)
            print(responce)
        # Prepend the reference wav path and speaker id in TTS input format.
        with open("intermediate/nlp_out_fixed.txt", "w") as f:
            f.write(
                "jsut_ver1.1/onomatopee300/wav/ONOMATOPEE300_300.wav|"+responce[1:].replace('\n', '')+".|1")
Ejemplo n.º 8
0
def to_romaji(text_jpn):
    """Tokenise Japanese text with tinysegmenter and romanise the result."""
    tokenized = ' '.join(tinysegmenter.tokenize(text_jpn))
    # Romanise hiragana, katakana and kanji with the Hepburn table.
    for script in ("H", "K", "J"):
        kakasi.setMode(script, "a")
    kakasi.setMode("r", "Hepburn")
    return kakasi.getConverter().do(tokenized)
Ejemplo n.º 9
0
def jp_to_romen(text):
    """Romanise *text* and capitalise the first character of the result."""
    # All three Japanese scripts map to ascii romaji.
    for script in ('H', 'K', 'J'):
        kakasi.setMode(script, 'a')
    romaji = kakasi.getConverter().do(text)
    return romaji.capitalize()
Ejemplo n.º 10
0
def WordConvert(word):
    """Convert *word* (kanji/katakana) to hiragana via pykakasi."""
    from pykakasi import kakasi
    kks = kakasi()
    kks.setMode('J', 'H')   # kanji -> hiragana
    kks.setMode("K", "H")   # katakana -> hiragana
    return kks.getConverter().do(word)
Ejemplo n.º 11
0
 def to_katakana(self, text):
     """Convert *text* (str or UTF-8 bytes) to katakana via pykakasi.

     Fix: the original unconditionally called ``text.decode('utf-8')``,
     which raises AttributeError for str input on Python 3; bytes are now
     decoded only when necessary, so str input passes through unchanged.
     """
     from pykakasi import kakasi
     kks = kakasi()
     # Map every input script (katakana, hiragana, English, kanji, ascii)
     # onto katakana.
     for src in ('K', 'H', 'E', 'J', 'a'):
         kks.setMode(src, 'K')
     converter = kks.getConverter()
     if isinstance(text, bytes):
         text = text.decode('utf-8')
     return converter.do(text)
Ejemplo n.º 12
0
def change_word(read_data):
    """Romanise *read_data* with a freshly configured pykakasi converter."""
    from pykakasi import kakasi
    kks = kakasi()
    # Hiragana, katakana and kanji all map to ascii romaji.
    for script in ('H', 'K', 'J'):
        kks.setMode(script, 'a')
    return kks.getConverter().do(read_data)
Ejemplo n.º 13
0
 def formats(key_world):
     """Strip whitespace, drop punctuation, and romanise *key_world*."""
     cleaned = re.sub(r'[^\w\s]', '', key_world.strip())
     from pykakasi import kakasi
     kks = kakasi()
     # Romanise every Japanese script.
     for script in ('H', 'K', 'J'):
         kks.setMode(script, 'a')
     return kks.getConverter().do(cleaned)
Ejemplo n.º 14
0
def get_reading_kakasi(word):
    """Gets reading for a given Japanese word by using kakasi. The reading in
       hiragana is returned by this function."""
    import pykakasi.kakasi as kakasi
    kks = kakasi()
    kks.setMode("J", "H")    # kanji -> hiragana
    kks.setMode("C", True)   # default: Separator
    kks.setMode("c", False)  # default: no Capitalize
    return kks.getConverter().do(word)
Ejemplo n.º 15
0
def getConverter():
    """Return a pykakasi converter that romanises hiragana, katakana, kanji.

    Fix: the original unconditionally ran ``reload(sys)`` +
    ``sys.setdefaultencoding('utf-8')`` — a Python-2-only hack (``reload``
    is not a builtin and ``setdefaultencoding`` does not exist on
    Python 3, so the call raised NameError).  The hack is now applied only
    where it exists.
    """
    import sys
    try:
        reload(sys)  # Python 2: restore setdefaultencoding removed by site.py
        sys.setdefaultencoding('utf-8')  # avoid UTF-8 issues on Python 2
    except NameError:
        pass  # Python 3: default encoding is already UTF-8
    from pykakasi import kakasi
    kks = kakasi()
    kks.setMode('H', 'a')
    kks.setMode('K', 'a')
    kks.setMode('J', 'a')
    return kks.getConverter()
Ejemplo n.º 16
0
class Phrase(pygame.sprite.Sprite):
    """A typing-game phrase sprite: scrolls Japanese text across the screen,
    shows its romaji reading underneath, and consumes one romaji character
    per correct keystroke."""

    # Class-level resources shared by all phrases: the font and a kakasi
    # converter that romanises every script with the Kunrei table.
    font = pygame.font.Font('migu-1m-regular.ttf', 32)
    kakasi = kakasi()
    kakasi.setMode("H", "a")
    kakasi.setMode("K", "a")
    kakasi.setMode("J", "a")
    kakasi.setMode("r", "Kunrei")
    conv = kakasi.getConverter()

    def __init__(self, y, string):
        # NOTE(review): self.containers is presumably assigned on the class
        # by the game setup before instantiation (pygame sprite convention).
        pygame.sprite.Sprite.__init__(self, self.containers)
        # One list entry per romaji character; entries are blanked as typed.
        self.characters_roman = [c for c in self.conv.do(string)]

        # Size the image to whichever is wider: the Japanese text or its
        # romaji rendering.
        character_size = self.font.size(
            max(self.conv.do(string), string, key=lambda x: len(x)))

        surface = pygame.Surface((character_size[0], character_size[1] * 2))
        surface.set_colorkey((0, 0, 0))

        self.characters = self.font.render(string, True, (1, 1, 1),
                                           (255, 255, 255))

        self.image = surface
        self.rect = self.image.get_rect()
        self.rect.midleft = (640, y)
        self.speed = -2.0
        # Index of the next character to be typed.
        self.next_character_pos = 0

    def update(self):
        # Scroll left and redraw the Japanese text on the upper half.
        self.rect.move_ip(self.speed, 0)
        self.image.fill((255, 255, 255))
        self.image.blit(self.characters, (0, 0))

        # All romaji consumed -> phrase completed; remove the sprite.
        if len(self.characters_roman) == self.next_character_pos:
            self.kill()

        # Draw the remaining (untyped) romaji on the lower half.
        for i, c in enumerate(self.characters_roman):
            if not c:
                continue
            self.image.blit(
                self.font.render(c, True, (1, 1, 1), (255, 255, 255)),
                (i * 16, 32))
        # Wrap around when the phrase scrolls off the left edge.
        if self.rect.right < 0:
            self.rect.left = 640

    def input(self, character):
        # Consume the keystroke when it matches the next expected romaji
        # character, spawning an Explosion at the typed position.
        if self.characters_roman[self.next_character_pos] == character:
            self.characters_roman[self.next_character_pos] = ''
            self.next_character_pos += 1
            Explosion((self.rect.left + (self.next_character_pos * 16),
                       self.rect.centery))
Ejemplo n.º 17
0
def make_filename(title):
    """Romanise *title*, replace spaces with underscores, and normalise
    full-width characters to half-width for use as a filename."""
    from pykakasi import kakasi, wakati

    import zen2han

    kks = kakasi()
    # Romanise hiragana, katakana and kanji.
    for script in ("H", "K", "J"):
        kks.setMode(script, "a")
    romanized = kks.getConverter().do(title).replace(" ", "_")
    return zen2han.zen2han(romanized)
Ejemplo n.º 18
0
def main():
    """Stream the current iTunes track title/artist (romanised) and elapsed
    time to a serial device, refreshing twice per second.

    Fixes over the original:
    - the ``elif minute == 0`` branch was unreachable (``minute < 10``
      already covers 0) and ``time_str`` was left unbound for
      ``minute >= 10`` (NameError); replaced by divmod + "%02d%02d".
    - ``conv.do(title).encode(...) + '\\n'`` concatenated bytes with str,
      a TypeError on Python 3; the newline is now appended before encoding.
    """
    # Romanise every Japanese script with the module-level kakasi instance.
    for script in ('H', 'K', 'J'):
        kakasi.setMode(script, 'a')
    conv = kakasi.getConverter()

    with serial.Serial(port="/dev/tty.usbmodem141141", baudrate=9600, timeout=1) as device:

        while True:
            title = itunes.current_track.name.get()
            artist = itunes.current_track.artist.get()
            time = int(itunes.player_position.get())

            # Elapsed time as MMSS, both fields zero-padded.
            minute, second = divmod(time, 60)
            time_str = "%02d%02d" % (minute, second)

            sleep(0.5)
            device.write((conv.do(title) + '\n').encode(errors="ignore"))
            device.write((conv.do(artist) + '\n').encode(errors="ignore"))
            device.write((time_str + '\n').encode())

            os.system('clear')
            print("TITLE = " + title)
            print("ARTIST = " + artist)
            print("TIME = " + time_str)

        device.close()  # unreachable; kept from the original
def toKatakana(str):
    """Convert romaji, hiragana and kanji in *str* to katakana using the
    module-level kakasi instance."""
    # Every source script (kanji, hiragana, ascii, katakana) maps to
    # katakana ('K').
    for script in ("J", "H", "a", "K"):
        kakasi.setMode(script, "K")
    return kakasi.getConverter().do(str)
def toFurigana(str):
    """Convert *str* to furigana-annotated romaji via pykakasi and the
    project's tokenTranslation helper."""
    kks = pykakasi.kakasi()
    # 'aF' = romaji with furigana annotation, for each source script
    # (kanji, hiragana, katakana).
    for script in ("J", "H", "K"):
        kks.setMode(script, "aF")
    return tokenTranslation(kks.getConverter(), str)
Ejemplo n.º 21
0
 def __init__(self, dictionary_type='alphabet', max_len=1000, length=1024):
     """Select the character dictionary and build a romaji converter.

     :param dictionary_type: 'alphabet', 'katakana' or 'katakana_small'.
     :param max_len: maximum sequence length.
     :param length: output vector length.
     :raises ValueError: for an unknown *dictionary_type* — the original
         silently left ``self.dictionary`` unset, only failing later with
         AttributeError.
     """
     if dictionary_type == 'alphabet':
         self.dictionary = alphabet.alphabet_dict
     elif dictionary_type == 'katakana':
         self.dictionary = katakana.katakana_dict
     elif dictionary_type == 'katakana_small':
         self.dictionary = katakana_small.katakana_small_dict
     else:
         raise ValueError('unknown dictionary_type: %r' % (dictionary_type,))
     self.dictionary_type = dictionary_type
     self.max_len = max_len
     self.length = length
     # Romaji converter for hiragana/katakana/kanji input.
     kks = pykakasi.kakasi()
     for script in ('H', 'K', 'J'):
         kks.setMode(script, 'a')
     self.kakasi_conv = kks.getConverter()
Ejemplo n.º 22
0
def get_initials(words):
    """Return the upper-cased romaji initial of each word in *words*.

    An empty word yields an empty string (slicing, not indexing)."""
    # Romanise hiragana, katakana and kanji.
    for script in ('H', 'K', 'J'):
        kakasi.setMode(script, 'a')
    conv = kakasi.getConverter()

    return [conv.do(word)[:1].upper() for word in words]
Ejemplo n.º 23
0
def VoiceRecodeAndRecongnize():
    """Record ~3 seconds from the microphone, save it as a WAV file, run
    Google speech recognition (Japanese), and return the transcript
    converted to hiragana.

    Relies on module-level FORMAT/CHANNELS/RATE/chunk/WAVE_OUTPUT_FILENAME
    constants and a module-level kakasi instance defined elsewhere.
    """
    p = pyaudio.PyAudio()
    # Wait for the user to press Enter before recording.
    start = input("録音開始 [Enter]>>")
    print("録音中...")
    stream = p.open(format=FORMAT,
                    channels=CHANNELS,
                    rate=RATE,
                    input=True,
                    frames_per_buffer=chunk)
    sequence = []
    # while True:
    #     data = stream.read(chunk)
    #     sequence.append(data)
    #     if keyboard.is_pressed("esc"):
    #         break
    # Read RATE/chunk buffers per second for 3 seconds of audio.
    for i in range(0, int(RATE / chunk * 3)):
        data = stream.read(chunk)
        sequence.append(data)
    print("録音終了")

    stream.close()
    p.terminate()
    # Dump the raw frames to a WAV container for the recogniser.
    wavFile = wave.open(WAVE_OUTPUT_FILENAME, 'wb')
    wavFile.setnchannels(CHANNELS)
    wavFile.setsampwidth(p.get_sample_size(FORMAT))
    wavFile.setframerate(RATE)
    wavFile.writeframes(b"".join(sequence))
    wavFile.close()

    # Transcribe with Google's recogniser (network call), Japanese locale.
    r = sr.Recognizer()
    with sr.AudioFile(WAVE_OUTPUT_FILENAME) as source:
        audio = r.record(source)
    word = r.recognize_google(audio, language='ja')
    #kakasi = kakasi()
    # Convert kanji and katakana in the transcript to hiragana.
    kakasi.setMode('J', 'H')
    kakasi.setMode('K', 'H')
    conv = kakasi.getConverter()
    word_hiragana = conv.do(word)
    print("----------------------------------------")
    print("認識結果:" + word)
    print("認識結果(ひらがなver):" + word_hiragana)
    print("----------------------------------------")

    return word_hiragana
def furiganaLineTrans(text):
	"""Annotate *text* with furigana: each segmented word whose kanji->
	hiragana conversion differs from the word itself is followed by its
	reading in braces, e.g. "漢字  { かんじ }  ".

	Fix: ``dict.has_key()`` was removed in Python 3; use the ``in``
	operator (works on Python 2 as well).  Uses the module-level kakasi,
	segmenter and transDict objects.
	"""
	kakasi.setMode("J", "H")
	kakasi.setMode("r", "Hepburn")
	conv = kakasi.getConverter()

	stitchStr = ''
	for segWord in segmenter.tokenize(text):
		# Prefer an explicit dictionary entry over the kakasi reading.
		if segWord in transDict:
			result = transDict[segWord]
		else:
			result = conv.do(segWord)
		if segWord == result:
			# Already kana / unchanged -> no annotation needed.
			stitchStr += segWord
		else:
			result = result.strip().replace("\n", "")
			stitchStr += segWord + '  { ' + result + ' }  '

	return stitchStr
Ejemplo n.º 25
0
def change_char(tweet_text, kakasi):
    '''change tweet_text, Kanji -> Hiragana'''
    seperator = "。"
    # Split into sentences, drop the trailing piece after the last
    # separator, and re-attach the separator to each sentence.
    sentence_list = [s + seperator
                     for s in tweet_text.split(seperator)[:-1]]

    # *kakasi* is the pykakasi class; instantiate and configure it.
    kakasi = kakasi()
    kakasi.setMode("J", "H")  # kanji -> hiragana
    conv = kakasi.getConverter()

    # Print the per-sentence conversion for inspection.
    for sentence in sentence_list:
        print(sentence)
        print(conv.do(sentence))
        print()

    kana_text = conv.do(tweet_text)
    # Persist the converted text for the next pipeline stage.
    with open("prepare_text.txt", mode="w", encoding="utf-8") as f:
        f.write(kana_text)
    return kana_text
Ejemplo n.º 26
0
import sys
import os
import tweepy

# Make the app's packages importable inside the container.
sys.path.append('/app')

import models  # noqa
import jaconv  # noqa
from util import morpheme  # noqa
import services  # noqa

# NOTE(review): `firestore` and the `kakasi` class are not imported above —
# presumably provided by an earlier part of this file; verify.
db = firestore.Client()

# Module-level kanji -> hiragana converter shared by the handlers below.
kakasi = kakasi()
kakasi.setMode("J", "H")
conv = kakasi.getConverter()

# Service singletons.
system_service = services.system_instance
word_service = services.word_instance
user_service = services.user_instance
tag_service = services.tag_instance

# user_service.get_oauth_url()

# user_data = models.UserUpdate()
# user_data.twitter_id = 'user_id'
# user_data.twitter_name= 'screen_name'
# user_data.twitter_key= 'oauth_token'
# user_data.twitter_secret= 'oauth_token_secret'
# user_data.session_id = "aaaa"
# print(user_data)
Ejemplo n.º 27
0
                incorrect_answers.append(output_string)

        student_output_filepath = os.path.join(output_directory, student_name)
        with open(student_output_filepath, "w+") as student_corrections_file:
            student_corrections_file.write("\n".join(incorrect_answers))

# ------------------------------------------------------------------------------
# Script entry: resolve the answer-key and answer-set paths relative to this
# file, build a kanji->hiragana normaliser, and grade the student answers.
current_directory = os.path.dirname(os.path.abspath(__file__))
# CHANGE THIS BIT!
master_answer_key = os.path.join(current_directory,
                                 "answer_keys/2017_06_08_lesson_03_part_02.txt")
student_answer_sets = os.path.join(current_directory,
                                   "answer_sets")
# CHANGE THIS BIT!
date_string = "2017_06_08"
current_student_answer_sets = os.path.join(student_answer_sets,
                                           date_string,
                                           "student_answers")
output_directory = os.path.join(student_answer_sets,
                                date_string,
                                "graded_answers")
# ---
# Normalise kanji to hiragana so answers in either script compare equal.
kakasi = kakasi()
kakasi.setMode("J", "H")  # default: Japanese no conversion
japanese_text_normalizer = kakasi.getConverter()
# ---
perform_grading(master_answer_key,
                current_student_answer_sets,
                output_directory,
                japanese_text_normalizer)
Ejemplo n.º 28
0
def dabiaoqian(path):
    """Label acoustic feature frames from ASR alignment output.

    For every sub-directory of *path* this reads a sclite-style alignment
    file (id / REF / HYP / EVAL lines), re-checks 'S' (substitution) flags
    by comparing reference and hypothesis in romaji, aligns the words with
    the frame ranges from the .out files, and writes per-utterance CSVs in
    which each feature row is prefixed with a label: '0' correct,
    '1' error, '9' silence/ignored (rows labelled 9 are deleted at the end).

    NOTE(review): variable names are romanised Chinese; relies on
    module-level helpers mulu, pi, cz, shanchu, make_kana_convertor and
    strQ2B defined elsewhere in the project.
    """

    from pykakasi import kakasi
    import csv, os

    name_tezheng = 'mizhichuli_log'
    # name of the directory holding the feature-value files

    xinde = 'xinde_mizhichuli'
    # name of the directory the newly labelled feature values go into

    name1 = 'align1'
    name2 = 'symbol.txt'
    # marker-file names: when align1 does not work, switch to symbol.txt
    # (the code below must be changed accordingly)

    kakasi = kakasi()
    kakasi.setMode("H", "a")  # Hiragana to ascii, default: no conversion
    kakasi.setMode("K", "a")  # Katakana to ascii, default: no conversion
    kakasi.setMode("J", "a")  # Japanese to ascii, default: no conversion
    kakasi.setMode("r", "Hepburn")  # default: use Hepburn Roman table
    kakasi.setMode("s", True)  # add space, default: no separator
    conv = kakasi.getConverter()

    for i in os.listdir(path):

        path_1 = os.path.join(path, i)

        path_out = os.path.join(path_1, 'keka')

        path_tezheng = os.path.join(path_1, name_tezheng)

        #biaozhiwenjian = csv.reader(open(os.path.join(path_1, name1), 'r', encoding='EUC-JP'))  # read in the marker file
        biaozhiwenjian = csv.reader(
            open(os.path.join(path_1, name2), 'r',
                 encoding='utf-8'))  # when the marker file is a .txt file

        biaozhiwenjian_1 = [i for i in biaozhiwenjian
                            ]  # materialise as a list of single-element lists
        #[['id: l_8840_9810_T1_F_01'],['REF:  そう です か 、 はい 。 '],['HYP:  そう です か    はい 。 '],['EVAL: C    C    C  D  C    C  '],[],['id: l_10800_13190_T1_F_01']]

        # print(biaozhiwenjian_1)
        # os.system('pause')

        path_xinde = os.path.join(path_1, xinde)
        mulu.mkdir(path_xinde)

        for i in range(0, len(biaozhiwenjian_1)):  # each iteration labels one utterance

            try:
                biaozhi = biaozhiwenjian_1[i][0]

            except:

                continue

            if 'id:' in biaozhi:

                ID = ''
                l_biaozhi = []
                l_zhengjie = []
                l_zhengjie_1 = []
                l_jieguo = []
                l_jieguo_1 = []

                ID = biaozhiwenjian_1[i][0].replace('id: ', '')

                l_zhengjie = biaozhiwenjian_1[i + 1][0].split()
                l_zhengjie.pop(0)

                l_jieguo = biaozhiwenjian_1[i + 2][0].split()
                l_jieguo.pop(0)

                l_biaozhi = biaozhiwenjian_1[i + 3][0].split()
                l_biaozhi.pop(0)

                # Build strictly aligned reference / hypothesis / flag lists;
                # a 'D' flag means the hypothesis slot is empty.
                jishuqi_jieguo = 0
                jishuqi_zhengjie = 0
                jishuqi_biaozhi = 0

                for i in l_biaozhi:

                    if i == "D":
                        l_zhengjie_1.append(l_zhengjie[jishuqi_zhengjie])
                        l_jieguo_1.append('')
                        jishuqi_zhengjie += 1
                        jishuqi_biaozhi += 1

                    if i == "C":
                        l_zhengjie_1.append(l_zhengjie[jishuqi_zhengjie])
                        l_jieguo_1.append(l_jieguo[jishuqi_jieguo])
                        jishuqi_zhengjie += 1
                        jishuqi_jieguo += 1
                        jishuqi_biaozhi += 1

                    if i == "I":
                        l_jieguo_1.append(l_jieguo[jishuqi_jieguo])
                        l_zhengjie_1.append('')
                        jishuqi_jieguo += 1
                        jishuqi_biaozhi += 1

                    if i == "S":  # 'S' (substitution): romanise both sides and compare; if equal, relabel as 'C'
                        l_zhengjie_1.append(l_zhengjie[jishuqi_zhengjie])
                        l_jieguo_1.append(l_jieguo[jishuqi_jieguo])

                        zhengjie_hanzi = l_zhengjie[jishuqi_zhengjie]
                        jieguo_hanzi = l_jieguo[jishuqi_jieguo]

                        # process the recognition result first
                        if conv.do(
                                jieguo_hanzi
                        ) == jieguo_hanzi and jieguo_hanzi != '、':  # check whether it is plain letters

                            try:
                                zhuanhuan_jieguo = conv.do(
                                    make_kana_convertor._make_kana_convertor(
                                        strQ2B.strQ2B(jieguo_hanzi)))

                            except:
                                zhuanhuan_jieguo = conv.do(
                                    make_kana_convertor._make_kana_convertor(
                                        jieguo_hanzi))

                        else:
                            zhuanhuan_jieguo = conv.do(jieguo_hanzi)

                        # then process the reference text
                        if conv.do(
                                zhengjie_hanzi
                        ) == zhengjie_hanzi and zhengjie_hanzi != '、':  # check whether it is plain letters

                            try:
                                zhuanhuan_zhengjie = conv.do(
                                    make_kana_convertor._make_kana_convertor(
                                        strQ2B.strQ2B(zhengjie_hanzi)))

                            except:
                                zhuanhuan_zhengjie = conv.do(
                                    make_kana_convertor._make_kana_convertor(
                                        zhengjie_hanzi))

                        else:
                            zhuanhuan_zhengjie = conv.do(zhengjie_hanzi)

                        if zhuanhuan_jieguo == zhuanhuan_zhengjie:

                            # print("正解list")
                            # print(l_zhengjie_1)
                            #
                            # print("识别结果list")
                            # print(l_jieguo_1)
                            #
                            # print("zhuanhuan_jieguo")
                            # print(zhuanhuan_jieguo)
                            # print("zhuanhuan_zhengjie")
                            # print(zhuanhuan_zhengjie)
                            # print("有标志被改了")
                            # print(ID)
                            # os.system("pause")

                            l_biaozhi[jishuqi_biaozhi] = 'C'

                        jishuqi_biaozhi += 1
                        jishuqi_zhengjie += 1
                        jishuqi_jieguo += 1

                # print(l_jieguo_1)
                # print(l_zhengjie_1)
                # print(l_biaozhi)
                # os.system('pause')

                path_out_1 = os.path.join(path_out, ID + '.out')  # read the .out file
                dianout = pi.read_out(path_out_1)
                start = dianout.pop(0)[1][1]  # label the leading silent interval 9; pop the first element
                start_1 = dianout[-1][1][0]  # label the trailing period 9
                # end_1 = dianout.pop(-1)[1][1]

                # print(dianout)
                # os.system('pause')
                # resulting shape: [['', [0, 18]], ['お', [19, 24]], ['願い', [25, 49]], ['三', [50, 82]], ['。', [83, 86]]]

                path_tezheng_1 = os.path.join(path_tezheng, ID + '.wav.csv')
                tezhengzhi = csv.reader(
                    open(path_tezheng_1, 'r', encoding='utf-8'))
                t_file_list = [i for i in tezhengzhi]

                end_1 = len(t_file_list) - 1

                for i in range(start + 1):
                    t_file_list[i].insert(0,
                                          '9')  # label all leading silent frames 9 (treated as correctly recognised)

                for i in range(start_1, end_1 + 1):
                    t_file_list[i].insert(0, '9')

                l_jieguo_1.pop(-1)  # the final period was already labelled; pop it

                print("ID")
                print(ID)

                print("l_biaozhi")
                print(l_biaozhi)
                print("l_jieguo_1")
                print(l_jieguo_1)

                print("dianout")
                print(dianout)

                dianout_chongzao = cz.chongzao(l_biaozhi, l_jieguo_1, dianout,
                                               ID)  # build the new dianout list used from here on

                print('dianout_chongzao')
                print(dianout_chongzao)

                # Use the rebuilt list to assign labels, e.g.:
                # [['災害', [3, 40], 'C'], ['で', [41, 48], 'C'], ['ござい', [49, 77], 'C'], ['ます', [78, 98], 'C'],
                #  ['から', [99, 130], 'C'], ['、', [131, 152], 'C'], ['その', [153, 177], 'C'], ['場', [178, 190], 'C'],
                #  ['で', [191, 209], 'C']]
                for i in dianout_chongzao:

                    start, end = i[1]
                    if i[2] == 'C':

                        for i in range(start, end + 1):
                            t_file_list[i].insert(0, '0')

                    else:

                        for i in range(start, end + 1):
                            t_file_list[i].insert(0, '1')

                path_xinde_tezhengzhi = os.path.join(path_xinde, ID + '.csv')

                with open(path_xinde_tezhengzhi, 'w+',
                          encoding='utf-8') as mergen_file:
                    for i in t_file_list:
                        mergen_file.write('%s\n' % ','.join(i))

        shanchu.shanchuhang(path_xinde)  # remove all feature rows labelled 9
Ejemplo n.º 29
0
from pykakasi import kakasi

# Demo: build a converter that romanises hiragana (H), katakana (K) and
# kanji (J), then print the romaji reading of a sample sentence.
kakasi = kakasi()
kakasi.setMode('H', 'a')
kakasi.setMode('K', 'a')
kakasi.setMode('J', 'a')
conv = kakasi.getConverter()
print(conv.do('本日は晴天なり'))

Ejemplo n.º 30
0
    def convert(self, inputText):

        input = inputText
        input = input.replace(" ", "**SPACE**")
        lines = input.splitlines()

        ## Prepare response with dict
        romanized = []

        for line in lines:
            text = line
          
            chunklines = mecab_tagger.parse(text).splitlines()[:-1]    
            parsed = [[chunk.split('\t')[0], tuple(chunk.split('\t')[1].split(',')) ] for chunk in chunklines]

            ## Parse
            romanizedLine = []
            for i in parsed:
                #now for each i[0] do romaji
                conv = kakasi.getConverter()
                finalResult = None

                # ignore calculation if initial string is numeric
                if is_number(i[0]):
                    finalResult = ""+i[0]

                # ignore calculation if string has non JP chars
                if finalResult == None and is_japanese(i[0]) == False:
                    finalResult = i[0]                    

                if finalResult == None:    
                    result1 = None
                    if len(i) == 2 and len(i[1]) > 8:
                        result1 = conv.do(i[1][7])

                    result2 = conv.do(i[0])

                    if result1 == None:
                        finalResult = result2+" "
                    elif result1 != None and result2 != result1:
                        finalResult = result2+" "
                    else:
                        finalResult = result2+" "

#                print("r1 "+result1)
#                print("r2 "+result2)
                romanizedLine.append(finalResult)


            pair = {}    
            romanizedLine = "".join(romanizedLine)
                

            romanizedLine = romanizedLine.replace(" ha ", " wa ")

            ## Collapse っ
            #k
            romanizedLine = romanizedLine.replace("tsu ka ", "tsuka")
            romanizedLine = romanizedLine.replace("tsu ke ", "kke")
            romanizedLine = romanizedLine.replace("tsu ki ", "kki")
            romanizedLine = romanizedLine.replace("tsu ko ", "kko")
            romanizedLine = romanizedLine.replace("tsu ku ", "kku")

            ## Collapse っ
            #s
            romanizedLine = romanizedLine.replace("tsu sa ", "ssa")
            romanizedLine = romanizedLine.replace("tsu se ", "sse")
            romanizedLine = romanizedLine.replace("tsu si ", "ssi")
            romanizedLine = romanizedLine.replace("tsu so ", "sso")
            romanizedLine = romanizedLine.replace("tsu su ", "ssu")

            ## Collapse っ
            #t
            romanizedLine = romanizedLine.replace("tsu ta ", "tta")
            romanizedLine = romanizedLine.replace("tsu te ", "tte")
            romanizedLine = romanizedLine.replace("tsu ti ", "tti")
            romanizedLine = romanizedLine.replace("tsu to ", "tto")
            romanizedLine = romanizedLine.replace("tsu tu ", "ttu")

            ## Collapse っ
            #p
            romanizedLine = romanizedLine.replace("tsu pa ", "ppa")
            romanizedLine = romanizedLine.replace("tsu pe ", "ppe")
            romanizedLine = romanizedLine.replace("tsu pi ", "ppi")
            romanizedLine = romanizedLine.replace("tsu po ", "ppo")

            ## Dangling letters
            romanizedLine = romanizedLine.replace(" u ", "u ")
            romanizedLine = romanizedLine.replace(" i ", "i ")

            ## Other fixes, after tsu particle
            romanizedLine = romanizedLine.replace(" nai ", "nai ")
            romanizedLine = romanizedLine.replace(" ta ", "ta ")
            romanizedLine = romanizedLine.replace(" te ", "te ")
            romanizedLine = romanizedLine.replace(" ten ", "ten ")
            romanizedLine = romanizedLine.replace(" ku ", "ku ")
            romanizedLine = romanizedLine.replace(" ba ", "ba ")
            romanizedLine = romanizedLine.replace(" ka ", "ka ")
            romanizedLine = romanizedLine.replace(" ze ", "ze ")
            romanizedLine = romanizedLine.replace(" ga ", "ga ")
            romanizedLine = romanizedLine.replace(" re ", "re ")

            ## Extended letters
            romanizedLine = romanizedLine.replace("a-", "ā")
            romanizedLine = romanizedLine.replace("e-", "ē")
            romanizedLine = romanizedLine.replace("i-", "ī")
            romanizedLine = romanizedLine.replace("o-", "ō")
            romanizedLine = romanizedLine.replace("u-", "ū")
            
            ## Special characters / Punctuation
            ## https://en.wikipedia.org/wiki/List_of_Japanese_typographic_symbols

            romanizedLine = romanizedLine.replace("「", "'")
            romanizedLine = romanizedLine.replace("」", "'")
            romanizedLine = romanizedLine.replace("『", "\"")
            romanizedLine = romanizedLine.replace("』", "\"")
            romanizedLine = romanizedLine.replace("(", "(")
            romanizedLine = romanizedLine.replace(")", ")")
            romanizedLine = romanizedLine.replace("〔", "[")
            romanizedLine = romanizedLine.replace("〕", "]")
            romanizedLine = romanizedLine.replace("[", "[")
            romanizedLine = romanizedLine.replace("]", "]")
            romanizedLine = romanizedLine.replace("{", "{")
            romanizedLine = romanizedLine.replace("}", "}")
            romanizedLine = romanizedLine.replace("⦅", "((")
            romanizedLine = romanizedLine.replace("⦆", "))")
            romanizedLine = romanizedLine.replace("〈", "‹")
            romanizedLine = romanizedLine.replace("〉", "›")
            romanizedLine = romanizedLine.replace("《", "«")
            romanizedLine = romanizedLine.replace("》", "»")
            romanizedLine = romanizedLine.replace("【", "[")
            romanizedLine = romanizedLine.replace("】", "]")
            romanizedLine = romanizedLine.replace("〖", "[")
            romanizedLine = romanizedLine.replace("〗", "]")
            romanizedLine = romanizedLine.replace("〘", "[")
            romanizedLine = romanizedLine.replace("〙", "]")
            romanizedLine = romanizedLine.replace("〚", "[")
            romanizedLine = romanizedLine.replace("〛", "]")
            romanizedLine = romanizedLine.replace("。", ".")
            romanizedLine = romanizedLine.replace("、", ",")
            romanizedLine = romanizedLine.replace("・", "·")
            romanizedLine = romanizedLine.replace("゠", "–")
            romanizedLine = romanizedLine.replace("=", "—")
            romanizedLine = romanizedLine.replace("…", "...")
            romanizedLine = romanizedLine.replace("‥", "..")            
            
            ## Custom tokens and fixes
            romanizedLine = romanizedLine.replace("**SPACE**", " ")
            text = text.replace("**SPACE**", " ")

            ## Remove multiple spaces
            romanizedLine = romanizedLine.strip()
            romanizedLine = " ".join(romanizedLine.split())

            

            pair[text] = romanizedLine.strip()
            romanized.append(pair)

        return romanized
Ejemplo n.º 31
0
def dabiaoqian(path):
    """Label per-frame feature CSVs for every utterance under *path*.

    For each speaker directory (e.g. ``C001L``), the Julius recognition
    output (``keka/<id>.out``) is compared against the reference
    transcript and the C/S/D alignment flags loaded via
    ``zidian.zidian``.  Every row of the feature file
    ``log/<id>.wav.csv`` is then prefixed with a label:

    * ``'0'`` — frame of a correctly recognized word
    * ``'1'`` — frame of a misrecognized word
    * ``'9'`` — leading silence or the trailing full stop

    The labelled rows are written to ``xinde_log/<id>.csv``; rows
    labelled ``'9'`` are finally removed via ``shanchu.shanchuhang``.

    Relies on the module-level helpers ``os``, ``csv``, ``mulu``,
    ``zidian``, ``pi``, ``shanchu``, ``make_kana_convertor`` and
    ``strQ2B``.
    """

    from pykakasi import kakasi

    BASE_DIRS = path  # batch root directory

    name_tezheng = 'log'  # directory holding the feature-value files
    xinde = 'xinde_log'  # directory the labelled feature files go into
    houzhui = '.wav.csv'  # suffix after the utterance id in feature file names

    # Flag dictionary, shaped like:
    #   id: C001L_086 -> ['S', 'S', 'S', 'C', 'S', 'D', 'D', 'D', 'C']
    symbolcidian = {}
    # Reference-transcript dictionary: id -> list of reference words.
    zhengjie = {}

    # Romanizer: everything to ascii, Hepburn table, space-separated.
    # (The rebinding intentionally shadows the imported kakasi class.)
    kakasi = kakasi()
    kakasi.setMode("H", "a")  # Hiragana to ascii
    kakasi.setMode("K", "a")  # Katakana to ascii
    kakasi.setMode("J", "a")  # Japanese to ascii
    kakasi.setMode("r", "Hepburn")  # use the Hepburn Roman table
    kakasi.setMode("s", True)  # add a space separator
    conv = kakasi.getConverter()

    for per_dirs in os.listdir(BASE_DIRS):  # per_dirs = C001L, C001R, ...

        d = os.path.join(BASE_DIRS, per_dirs, xinde)
        mulu.mkdir(d)

        # Load reference words and alignment flags for this speaker.
        zhengjie, symbolcidian = zidian.zidian(per_dirs, BASE_DIRS)

        for id in os.listdir(os.path.join(BASE_DIRS, per_dirs, name_tezheng)):

            id = id.replace(houzhui, '')  # strip '.wav.csv', keep only the id

            banyun_1 = [i for i, x in enumerate(symbolcidian[id]) if x == 'C']  # indices flagged C (correct)
            banyun_3 = [i for i, x in enumerate(symbolcidian[id]) if x == 'S']  # indices flagged S (substitution)

            banyun_2 = []  # correctly recognized words (flag C)
            banyun_4 = []  # romanizations of S-flagged reference words

            t_file = os.path.join(BASE_DIRS, per_dirs, name_tezheng, id + houzhui)
            a = csv.reader(open(t_file, 'r', encoding='utf-8'))
            t_file_list = [i for i in a]

            for u in banyun_1:  # collect the correctly recognized words
                if u + 1 <= len(zhengjie[id]):  # reference may have fewer words than flags
                    banyun_2.append(zhengjie[id][u])
                else:  # flag index beyond the reference words: fix by hand
                    print("手动调一下这个文件吧%s" % id)
                    print("它的正确单词是")
                    print(banyun_2)
                    os.system("pause")

            for w in banyun_3:  # romanize the S-flagged reference words
                if w + 1 <= len(zhengjie[id]):
                    banyun_4.append(conv.do(zhengjie[id][w]))
                else:  # flag index beyond the reference words: fix by hand
                    print("手动调一下这个文件吧%s" % id)
                    print("它的认识出现错误的单词是")
                    print(banyun_4)
                    os.system("pause")

            # Frame table matching the julius result, e.g.:
            # [['', [0, 18]], ['お', [19, 24]], ['願い', [25, 49]], ['三', [50, 82]], ['。', [83, 86]]]
            dir_out = os.path.join(BASE_DIRS, per_dirs, 'keka', id + '.out')
            dianout = pi.read_out(dir_out)

            # Leading silent frames all get label 9 (treated as correct).
            start = dianout.pop(0)[1][1]
            for i in range(start + 1):
                t_file_list[i].insert(0, '9')

            for y in dianout:  # y = [word, [first_frame, last_frame]]

                if y[1][1] + 1 <= len(t_file_list):  # frame range fits in the feature rows

                    if y[0] == '':  # skip silence entries
                        continue

                    if y[0] == dianout[-1][0]:  # trailing full stop: label 9 (mirrored below)
                        start, end = y[1]
                        for i in range(start, end + 1):
                            t_file_list[i].insert(0, '9')
                        continue

                    if y[0] in banyun_2:  # correctly recognized word: label 0
                        start, end = y[1]
                        print("正在为文件 %s 的单词 %s 打标签" % (os.path.split(dir_out)[1], y[0]))
                        for i in range(start, end + 1):
                            t_file_list[i].insert(0, '0')
                        banyun_2.remove(y[0])  # drop the word once labelled

                    elif conv.do(y[0]) == y[0] and y[0] != '、':  # letters come back unchanged

                        print("发现识别结果中的字母%s" % y[0])
                        print("它在文件%s" % dir_out)

                        # Letters: half-width-normalize, convert to kana, then romanize.
                        try:
                            zhuanhuazhi = conv.do(make_kana_convertor._make_kana_convertor(strQ2B.strQ2B(y[0])))
                        except:
                            zhuanhuazhi = conv.do(make_kana_convertor._make_kana_convertor(y[0]))

                        if zhuanhuazhi in banyun_4:
                            print("转化之后的字母为%s" % zhuanhuazhi)
                            start, end = y[1]
                            print("正在为文件 %s 的单词 %s 打标签" % (os.path.split(dir_out)[1], y[0]))
                            for i in range(start, end + 1):
                                t_file_list[i].insert(0, '0')
                            banyun_4.remove(zhuanhuazhi)  # drop the word once labelled
                        else:
                            start, end = y[1]
                            print("正在为文件 %s 的单词 %s 打标签" % (os.path.split(dir_out)[1], y[0]))
                            for i in range(start, end + 1):
                                t_file_list[i].insert(0, '1')

                    elif conv.do(y[0]) in banyun_4:  # romanization matches a reference word: label 0
                        start, end = y[1]
                        print("正在为文件 %s 的单词 %s 打标签" % (os.path.split(dir_out)[1], y[0]))
                        for i in range(start, end + 1):
                            t_file_list[i].insert(0, '0')
                        banyun_4.remove(conv.do(y[0]))  # drop the word once labelled

                    else:  # misrecognized word: label 1
                        start, end = y[1]
                        print("正在为文件 %s 的单词 %s 打标签" % (os.path.split(dir_out)[1], y[0]))
                        for i in range(start, end + 1):
                            t_file_list[i].insert(0, '1')

                else:  # frame range overruns the feature rows: clamp end to len(t_file_list)

                    if y[0] == '':
                        continue

                    if y[0] == dianout[-1][0]:  # trailing full stop: label 9
                        start = y[1][0]
                        end = len(t_file_list)
                        for i in range(start, end):  # end is exclusive here (clamped)
                            t_file_list[i].insert(0, '9')
                        continue

                    if y[0] in banyun_2:
                        start = y[1][0]
                        end = len(t_file_list)
                        print("正在为文件 %s 的单词 %s 打标签" % (os.path.split(dir_out)[1], y[0]))
                        for i in range(start, end):
                            t_file_list[i].insert(0, '0')
                        banyun_2.remove(y[0])

                    elif conv.do(y[0]) == y[0] and y[0] != '、':  # letters come back unchanged

                        if conv.do(make_kana_convertor._make_kana_convertor(y[0])) in banyun_4:
                            start = y[1][0]
                            end = len(t_file_list)
                            print("正在为文件 %s 的单词 %s 打标签" % (os.path.split(dir_out)[1], y[0]))
                            # BUGFIX: was range(start, end + 1); end == len(t_file_list),
                            # so the inclusive range raised IndexError whenever this branch ran.
                            for i in range(start, end):
                                t_file_list[i].insert(0, '0')
                            banyun_4.remove(conv.do(make_kana_convertor._make_kana_convertor(y[0])))
                        else:
                            start = y[1][0]
                            end = len(t_file_list)
                            print("正在为文件 %s 的单词 %s 打标签" % (os.path.split(dir_out)[1], y[0]))
                            # BUGFIX: was range(start, end + 1) — see above.
                            for i in range(start, end):
                                t_file_list[i].insert(0, '1')

                    else:
                        start = y[1][0]
                        end = len(t_file_list)
                        print("正在为文件 %s 的单词 %s 打标签" % (os.path.split(dir_out)[1], y[0]))
                        for i in range(start, end):
                            t_file_list[i].insert(0, '1')

            with open(os.path.join(BASE_DIRS, per_dirs, xinde, id + '.csv'), 'w+', encoding='utf-8') as mergen_file:
                for i in t_file_list:
                    mergen_file.write('%s\n' % ','.join(i))

        shanchu.shanchuhang(d)  # delete every feature row labelled 9
Ejemplo n.º 32
0
def dabiaoqian(path):
    """Label per-frame feature CSVs using an ``align1`` alignment file.

    Variant of the labelling routine that reads the alignment file
    directly (id / REF / HYP / EVAL records, five lines apart), builds
    strictly aligned reference/result word lists, and prefixes each
    feature row with ``'0'`` (correct), ``'1'`` (misrecognized) or
    ``'9'`` (leading silence / trailing full stop; label-9 rows are
    deleted at the end by ``shanchu.shanchuhang``).

    Relies on the module-level helpers ``mulu``, ``pi``, ``shanchu``,
    ``make_kana_convertor`` and ``strQ2B``.
    """

    from pykakasi import kakasi
    import csv, os

    name_tezheng = 'log'
    # directory holding the feature-value files

    xinde = 'xinde_log'
    # directory the newly labelled feature files are written into

    houzhui = '.wav.csv'
    # suffix that follows the utterance id in each feature file name (unused here)

    name = 'align1'
    # file recording the CCCC/SSSS alignment flags

    name1 = 'align1'
    name2 = 'align1.txt'

    shibiejieguo = {}
    # dict meant to hold recognition results (unused here)

    symbolcidian = {}
    # flag dictionary, shaped like:
    # id: C001L_086
    # ['S', 'S', 'S', 'C', 'S', 'D', 'D', 'D', 'C']
    # id: C001L_087
    # ['S', 'D', 'D', 'C']
    # id: C001L_088
    # ['S', 'S', 'S', 'S', 'D', 'D', 'D', 'D', 'C', 'C']
    zhengjie = {}
    # reference-transcript dictionary
    # NOTE(review): this rebinding shadows the imported kakasi class.
    kakasi = kakasi()
    kakasi.setMode("H", "a")  # Hiragana to ascii, default: no conversion
    kakasi.setMode("K", "a")  # Katakana to ascii, default: no conversion
    kakasi.setMode("J", "a")  # Japanese to ascii, default: no conversion
    kakasi.setMode("r", "Hepburn")  # default: use Hepburn Roman table
    kakasi.setMode("s", True)  # add space, default: no separator
    conv = kakasi.getConverter()

    for i in os.listdir(path):
        # NOTE(review): this loop variable ``i`` is reused by several inner
        # loops; harmless because path_1 is derived before they run.

        path_1 = os.path.join(path, i)

        path_out = os.path.join(path_1, 'keka')

        path_tezheng = os.path.join(path_1, name_tezheng)

        biaozhiwenjian = csv.reader(
            open(os.path.join(path_1, name1), 'r',
                 encoding='EUC-JP'))  # read in the alignment flag file
        # biaozhiwenjian = csv.reader(open(os.path.join(path_1, name2), 'r', encoding='utf-8')) # use when the flag file is a .txt file

        biaozhiwenjian_1 = [i for i in biaozhiwenjian
                            ]  # materialize as a list of single-element rows, e.g.:
        #[['id: l_8840_9810_T1_F_01'],['REF:  そう です か 、 はい 。 '],['HYP:  そう です か    はい 。 '],['EVAL: C    C    C  D  C    C  '],[],['id: l_10800_13190_T1_F_01']]

        path_xinde = os.path.join(path_1, xinde)
        mulu.mkdir(path_xinde)

        for i in range(0, len(biaozhiwenjian_1), 5):  # each pass labels one audio file (records are 5 lines apart)

            ID = ''
            l_biaozhi = []
            l_zhengjie = []
            l_zhengjie_1 = []
            l_jieguo = []
            l_jieguo_1 = []

            # Parse the id / REF / HYP / EVAL lines; pop(0) drops the prefix token.
            ID = biaozhiwenjian_1[i][0].replace('id: ', '')

            l_zhengjie = biaozhiwenjian_1[i + 1][0].split()
            l_zhengjie.pop(0)

            l_jieguo = biaozhiwenjian_1[i + 2][0].split()
            l_jieguo.pop(0)

            l_biaozhi = biaozhiwenjian_1[i + 3][0].split()
            l_biaozhi.pop(0)

            # Build strictly aligned reference/result word lists: a 'D'
            # (deletion) leaves an empty result slot, an 'I' (insertion)
            # an empty reference slot.
            jishuqi_jieguo = 0
            jishuqi_zhengjie = 0

            for i in l_biaozhi:

                if i == "D":
                    l_zhengjie_1.append(l_zhengjie[jishuqi_zhengjie])
                    l_jieguo_1.append('')
                    jishuqi_zhengjie += 1

                if i == "C":
                    l_zhengjie_1.append(l_zhengjie[jishuqi_zhengjie])
                    l_jieguo_1.append(l_jieguo[jishuqi_jieguo])
                    jishuqi_zhengjie += 1
                    jishuqi_jieguo += 1

                if i == "I":
                    l_jieguo_1.append(l_jieguo[jishuqi_jieguo])
                    l_zhengjie_1.append('')
                    jishuqi_jieguo += 1

                if i == "S":
                    l_zhengjie_1.append(l_zhengjie[jishuqi_zhengjie])
                    l_jieguo_1.append(l_jieguo[jishuqi_jieguo])
                    jishuqi_zhengjie += 1
                    jishuqi_jieguo += 1

            path_out_1 = os.path.join(path_out, ID + '.out')
            dianout = pi.read_out(path_out_1)
            # expected shape: [['', [0, 18]], ['お', [19, 24]], ['願い', [25, 49]], ['三', [50, 82]], ['。', [83, 86]]]

            path_tezheng_1 = os.path.join(path_tezheng, ID + '.wav.csv')
            tezhengzhi = csv.reader(open(path_tezheng_1, 'r',
                                         encoding='utf-8'))
            t_file_list = [i for i in tezhengzhi]
            dimension = len(t_file_list[0])

            start = dianout.pop(0)[1][1]  # last frame of the opening silent span

            for i in range(start + 1):
                t_file_list[i].insert(0, '9')  # leading silent frames all get label 9 (treated as correct)

            zhenshubiao = {}  # frame table: word -> [start, end] frame range

            for i in dianout:
                zhenshubiao[i[0]] = i[1]  # one dict entry per word of the .out file
            # NOTE(review): duplicate words collapse to the last occurrence — confirm acceptable.

            start, end = zhenshubiao['。']  # label the trailing full stop 9
            # NOTE(review): raises KeyError when the utterance has no '。' entry — confirm.

            for i in range(start, end + 1):
                t_file_list[i].insert(0, '9')

            l_biaozhi_1 = [i for i, x in enumerate(l_biaozhi)
                           if x == 'S']  # indices flagged S (substitution)

            if len(l_biaozhi_1) != 0:  # empty means every word was recognized correctly

                for y in l_biaozhi_1:  # romanize reference and result of each S word and compare again

                    # First the recognition result.
                    if conv.do(
                            l_jieguo_1[y]
                    ) == l_jieguo_1[y] and l_jieguo_1[y] != '、':  # letters come back unchanged

                        try:
                            zhuanhuan_jieguo = conv.do(
                                make_kana_convertor._make_kana_convertor(
                                    strQ2B.strQ2B(l_jieguo_1[y])))

                        except:
                            zhuanhuan_jieguo = conv.do(
                                make_kana_convertor._make_kana_convertor(
                                    l_jieguo_1[y]))

                    else:
                        zhuanhuan_jieguo = conv.do(l_jieguo_1[y])

                    # Then the reference word.
                    if conv.do(
                            l_zhengjie_1[y]
                    ) == l_zhengjie_1[y] and l_zhengjie_1[y] != '、':  # letters come back unchanged

                        try:
                            zhuanhuan_zhengjie = conv.do(
                                make_kana_convertor._make_kana_convertor(
                                    strQ2B.strQ2B(l_zhengjie_1[y])))

                        except:
                            zhuanhuan_zhengjie = conv.do(
                                make_kana_convertor._make_kana_convertor(
                                    l_zhengjie_1[y]))

                    else:
                        zhuanhuan_zhengjie = conv.do(l_zhengjie_1[y])

                    guanjianzi = l_jieguo_1[y]  # the S-flagged recognized word

                    try:
                        start, end = zhenshubiao[guanjianzi]  # its frame range

                    except:
                        # NOTE(review): on KeyError the stale start/end from a
                        # previous word are reused below — confirm intended.
                        print('ID')
                        print(ID)
                        print('zhenshubiao')
                        print(zhenshubiao)
                        print('guanjianzi')
                        print(guanjianzi)
                        os.system('pause')

                    for i in range(start, end + 1):

                        if zhuanhuan_jieguo == zhuanhuan_zhengjie:
                            t_file_list[i].insert(0, '0')

                        else:
                            t_file_list[i].insert(0, '1')

            jishuqi_tezhengzhi = 0

            for i in t_file_list:  # label the remaining (correctly recognized) rows 0

                # NOTE(review): comparing the LENGTH of the first field with the
                # column count looks suspicious; the commented-out check below
                # suggests "row not yet labelled" was the intent — confirm.
                # if i[0] != '0' and i[0] != '1' and i[0] != '9':

                if len(i[0]) == dimension:
                    t_file_list[jishuqi_tezhengzhi].insert(0, '0')

                jishuqi_tezhengzhi += 1

            path_xinde_tezhengzhi = os.path.join(path_xinde, ID + '.csv')

            with open(path_xinde_tezhengzhi, 'w+',
                      encoding='utf-8') as mergen_file:
                for i in t_file_list:
                    mergen_file.write('%s\n' % ','.join(i))

        shanchu.shanchuhang(path_xinde)  # delete every feature row labelled 9
Ejemplo n.º 33
0
import urllib.request
import json

from pykakasi import kakasi, wakati


# Pinned release of the emoji-ja dataset used as the emoji metadata source.
EMOJI_JSON_URL = 'https://raw.githubusercontent.com/yagays/emoji-ja/20190726/data/emoji_ja.json'
# Output path of the generated emoji dictionary.
EMOJI_DICT_PATH = 'tsv/emoji.tsv'
# NOTE(review): this rebinding shadows the imported kakasi class.
kakasi = kakasi()
kakasi.setMode("J","H")  # Japanese (kanji) to hiragana
conv_j2h = kakasi.getConverter()
kakasi.setMode("K","H")  # katakana to hiragana
conv_k2h = kakasi.getConverter()
# NOTE(review): conv_k2h is built after both setMode calls, so it presumably
# converts kanji as well as katakana — confirm that is intended.


def hiraganafy(keyword):
    """Return *keyword* upper-cased and converted to hiragana.

    Applies the module-level converters in order: first the
    Japanese-to-hiragana converter, then the katakana-to-hiragana one.
    """
    upper = keyword.upper()
    return conv_k2h.do(conv_j2h.do(upper))


def add_word_to_dict(emoji, keyword, emoji_dict):
    """Append a tab-separated dictionary line mapping *keyword* to *emoji*.

    'ゔ' in the keyword is rewritten as 'う゛' before the entry is built.
    The entry is appended to the *emoji_dict* list in place.
    """
    sanitized = keyword.replace('ゔ', 'う゛')
    emoji_dict.append(f':{sanitized}\t{emoji}\t記号\t')


class EmojiDict():
    emoji_json = None
    emoji_dict = []