Ejemplo n.º 1
0
def calc_speak_time(df, kakasi, speaker_id=None):
    """
    Score each utterance by seconds spoken per hiragana character.

    A ``speak_scores`` column is added to ``df`` in place. Each value is
    ``(end_time - start_time) / len(hiragana reading of transcript)``.
    Rows whose converted transcript is empty get ``NaN`` instead of raising
    ``ZeroDivisionError``.

    :param df: Pandas DataFrame with ``start_time``, ``end_time`` and
        ``transcript`` columns (plus ``speaker`` when ``speaker_id`` is given)
    :param kakasi: Kakasi Instance; its 'J' mode is set to 'H' here
    :param speaker_id: String; when given, only that speaker's rows are
        returned, re-indexed from 0
    :return: Pandas DataFrame
    :raises ValueError: if ``speaker_id`` does not occur in ``df.speaker``
    """

    # Prepare the converter to hiragana.
    kakasi.setMode('J', 'H')
    conv = kakasi.getConverter()

    speak_scores = []
    for start, end, transcript in zip(df.start_time, df.end_time,
                                      df.transcript):
        speak_time = float(end) - float(start)
        speak_length = len(conv.do(transcript))
        # An empty reading has no meaningful rate; record NaN rather than crash.
        if speak_length:
            speak_scores.append(speak_time / speak_length)
        else:
            speak_scores.append(float('nan'))

    df['speak_scores'] = speak_scores

    if speaker_id is None:
        return df

    # Compare as strings on both sides so the existence check and the filter
    # always agree (previously only the filter stringified speaker_id).
    speaker_mask = df.speaker.astype(str) == str(speaker_id)
    if not speaker_mask.any():
        raise ValueError(
            str(speaker_id) + ' does not exist in speaker column ')

    # reset_index returns a new frame, avoiding in-place edits on a slice.
    return df[speaker_mask].reset_index(drop=True)
Ejemplo n.º 2
0
def run_kakasi(text_input):
    """Romanize ``text_input`` after word-spacing it with ``mecab``.

    The pykakasi converter is created on first use and kept in the
    module-level ``conv``. Spacing comes from ``mecab`` rather than kakasi's
    own wakati feature, which the original author found less accurate.
    """
    global conv

    if not conv:
        # Setup taken from the pip page for pykakasi; the author notes that
        # using only the (J, a), (r, Hepburn), (s, False) options is buggy.
        from pykakasi import kakasi as kakasi_factory

        engine = kakasi_factory()
        mode_settings = (
            ("H", "a"),        # Hiragana to ascii, default: no conversion
            ("K", "a"),        # Katakana to ascii, default: no conversion
            ("J", "a"),        # Japanese to ascii, default: no conversion
            ("r", "Hepburn"),  # default: use Hepburn Roman table
            ("s", False),      # add space, default: no separator
        )
        for flag, value in mode_settings:
            engine.setMode(flag, value)
        conv = engine.getConverter()

    tokens = mecab(text_input).split(' ')
    # A token that is exactly 'は' is rewritten to 'わ' before romanizing.
    normalised = ' '.join('わ' if token == 'は' else token for token in tokens)
    return conv.do(normalised)
Ejemplo n.º 3
0
def text2InstaDmSmiling(text):
    """Convert ``text`` to hiragana and decorate marked kana with ``^_^``.

    Kana carrying the "little dashes" or "little circle" marks, and small
    kana, are each replaced by their plain counterpart followed by the
    emoticon. Uses the module-level ``kakasi`` object and sets its 'J' and
    'K' modes to 'H'.
    """
    EMOJI = '^_^'
    # (marked kana, plain kana) pairs, applied in this order.
    kana_groups = (
        ("ゔがぎぐげござじずぜぞだぢづでどばびぶべぼ",
         "うかきくけこさしすせそたちつてとはひふへほ"),
        ("ぱぴぷぺぽ", "はひふへほ"),
        ("ぁぃぅぇぉゃゅょっ", "あいうえおやゆよつ"),
    )
    tables = [
        str.maketrans(
            {marked: plain + EMOJI for marked, plain in zip(marked_set, plain_set)})
        for marked_set, plain_set in kana_groups
    ]

    kakasi.setMode('J', 'H')
    kakasi.setMode('K', 'H')
    text_hiragana = kakasi.getConverter().do(text)

    for table in tables:
        text_hiragana = text_hiragana.translate(table)
    return text_hiragana
Ejemplo n.º 4
0
def readDict():
    """Load the Japanese dictionary as a DataFrame, using a pickle cache.

    The first call reads "./dictionary/nihongolist.xlsx", fills the romaji
    column and pickles the result to "./dictionary/nihongolist.binaryfile".
    Later calls load that pickle instead. Delete nihongolist.binaryfile after
    updating the spreadsheet so it gets rebuilt.
    """
    cache_path = './dictionary/nihongolist.binaryfile'
    if os.path.isfile(cache_path):
        with open(cache_path, 'rb') as cache_file:
            return pickle.load(cache_file)

    df = pd.read_excel("./dictionary/nihongolist.xlsx")
    df["romaji"] = "_"
    df["score"] = 0.000

    from pykakasi import kakasi as kakasi_factory
    engine = kakasi_factory()
    engine.setMode('H', 'a')
    conv = engine.getConverter()

    # Every vowel is tripled ("a" -> "aaa", ...) in the stored romaji.
    triple_vowels = str.maketrans({vowel: vowel * 3 for vowel in "aiueo"})
    for row in range(len(df)):
        # Column 1 is romanized and the result is written to column 5
        # (positional indices, as in the original layout).
        df.iat[row, 5] = conv.do(df.iat[row, 1]).translate(triple_vowels)

    with open(cache_path, 'wb') as cache_file:
        pickle.dump(df, cache_file)
    return df
#readDict()
Ejemplo n.º 5
0
def WordConvert(word):
    """Return ``word`` with kanji ('J') and katakana ('K') set to convert to hiragana."""
    from pykakasi import kakasi as kakasi_factory

    engine = kakasi_factory()
    for source_script in ('J', "K"):
        engine.setMode(source_script, 'H')
    return engine.getConverter().do(word)
Ejemplo n.º 6
0
 def to_katakana(self, text):
     """Convert ``text`` to katakana with pykakasi.

     ``text`` may be UTF-8 encoded bytes (the original Python 2 usage) or an
     already-decoded string. Only bytes are decoded, so Python 3 ``str``
     input no longer fails on the missing ``.decode`` method.
     """
     from pykakasi import kakasi
     kakasi = kakasi()
     # Every supported source script is set to produce katakana.
     kakasi.setMode('K', 'K')
     kakasi.setMode('H', 'K')
     kakasi.setMode('E', 'K')
     kakasi.setMode('J', 'K')
     kakasi.setMode('a', 'K')
     converter = kakasi.getConverter()
     if isinstance(text, bytes):
         text = text.decode('utf-8')
     return converter.do(text)
Ejemplo n.º 7
0
class Phrase(pygame.sprite.Sprite):
    """Scrolling sprite showing a phrase with its romaji rendered below it.

    The player types the romaji one character at a time through ``input``.
    Typed characters are blanked out, and the sprite removes itself in
    ``update`` once all of them have been typed.
    """
    font = pygame.font.Font('migu-1m-regular.ttf', 32)
    # Converter shared by all instances: hiragana, katakana and kanji are
    # romanized using the "Kunrei" table.
    kakasi = kakasi()
    kakasi.setMode("H", "a")
    kakasi.setMode("K", "a")
    kakasi.setMode("J", "a")
    kakasi.setMode("r", "Kunrei")
    conv = kakasi.getConverter()

    def __init__(self, y, string):
        """Create the sprite for ``string`` at vertical position ``y``."""
        # NOTE(review): self.containers is not defined in this class;
        # presumably assigned by the game setup before instantiation.
        pygame.sprite.Sprite.__init__(self, self.containers)
        romanized = self.conv.do(string)
        # One entry per romaji character; typed characters are set to ''.
        self.characters_roman = list(romanized)

        # Size the image from whichever of the romaji / Japanese text has
        # more characters.
        character_size = self.font.size(max(romanized, string, key=len))

        # Twice the text height: one row for the phrase, one for the romaji.
        surface = pygame.Surface((character_size[0], character_size[1] * 2))
        surface.set_colorkey((0, 0, 0))

        self.characters = self.font.render(string, True, (1, 1, 1),
                                           (255, 255, 255))

        self.image = surface
        self.rect = self.image.get_rect()
        self.rect.midleft = (640, y)
        self.speed = -2.0
        # Index of the next character to be typed in characters_roman
        self.next_character_pos = 0

    def update(self):
        """Scroll the sprite and redraw the phrase and the untyped romaji."""
        self.rect.move_ip(self.speed, 0)
        self.image.fill((255, 255, 255))
        self.image.blit(self.characters, (0, 0))

        # Every character has been typed: remove the sprite from its groups.
        if len(self.characters_roman) == self.next_character_pos:
            self.kill()

        for i, c in enumerate(self.characters_roman):
            if not c:
                continue
            self.image.blit(
                self.font.render(c, True, (1, 1, 1), (255, 255, 255)),
                (i * 16, 32))
        # Wrap around once the sprite has fully left the screen on the left.
        if self.rect.right < 0:
            self.rect.left = 640

    def input(self, character):
        """Consume ``character`` if it matches the next expected romaji."""
        # Nothing left to type (e.g. a key arrives before update() kills the
        # sprite); ignore it instead of raising IndexError.
        if self.next_character_pos >= len(self.characters_roman):
            return
        if self.characters_roman[self.next_character_pos] == character:
            self.characters_roman[self.next_character_pos] = ''
            self.next_character_pos += 1
            Explosion((self.rect.left + (self.next_character_pos * 16),
                       self.rect.centery))
Ejemplo n.º 8
0
def to_romaji(text_jpn):
    """Romanize ``text_jpn`` after splitting it into space-separated tokens.

    Tokens come from ``tinysegmenter``. The module-level ``kakasi`` object is
    configured here for hiragana, katakana and kanji to romaji conversion.
    """
    tokens = tinysegmenter.tokenize(text_jpn)
    mode_settings = (
        ("H", "a"),        # Hiragana to romaji
        ("K", "a"),        # Katakana to romaji
        ("J", "a"),        # Japanese to romaji
        ("r", "Hepburn"),  # default: Hepburn Roman table
    )
    for flag, value in mode_settings:
        kakasi.setMode(flag, value)
    return kakasi.getConverter().do(' '.join(tokens))
Ejemplo n.º 9
0
def VoiceRecodeAndRecongnize():
    """Record about three seconds of audio, recognize it and return hiragana.

    Waits for Enter, records from the default input device, writes the audio
    to ``WAVE_OUTPUT_FILENAME``, sends it to Google speech recognition
    (Japanese) and converts the recognized text to hiragana with the
    module-level ``kakasi`` object.

    :return: the recognized text with kanji and katakana set to convert to
        hiragana
    """
    p = pyaudio.PyAudio()
    try:
        input("録音開始 [Enter]>>")
        print("録音中...")
        stream = p.open(format=FORMAT,
                        channels=CHANNELS,
                        rate=RATE,
                        input=True,
                        frames_per_buffer=chunk)
        try:
            # RATE / chunk buffers make one second; record three seconds.
            sequence = [stream.read(chunk)
                        for _ in range(0, int(RATE / chunk * 3))]
        finally:
            # Release the input stream even if a read fails.
            stream.close()
        print("録音終了")
        # Query the sample width while the PyAudio instance is still alive.
        sample_width = p.get_sample_size(FORMAT)
    finally:
        p.terminate()

    # The context manager closes the file even if a write fails.
    with wave.open(WAVE_OUTPUT_FILENAME, 'wb') as wavFile:
        wavFile.setnchannels(CHANNELS)
        wavFile.setsampwidth(sample_width)
        wavFile.setframerate(RATE)
        wavFile.writeframes(b"".join(sequence))

    r = sr.Recognizer()
    with sr.AudioFile(WAVE_OUTPUT_FILENAME) as source:
        audio = r.record(source)
    word = r.recognize_google(audio, language='ja')

    kakasi.setMode('J', 'H')
    kakasi.setMode('K', 'H')
    conv = kakasi.getConverter()
    word_hiragana = conv.do(word)
    print("----------------------------------------")
    print("認識結果:" + word)
    print("認識結果(ひらがなver):" + word_hiragana)
    print("----------------------------------------")

    return word_hiragana
Ejemplo n.º 10
0
def change_char(tweet_text, kakasi):
    '''Convert the kanji in tweet_text to hiragana.

    ``kakasi`` is the pykakasi class (it is instantiated here). Each sentence
    terminated by "。" is printed with its conversion. The converted full
    text is written to "prepare_text.txt" and returned.
    '''
    delimiter = "。"
    # The fragment after the last delimiter is not a finished sentence and is
    # left out of the per-sentence printout.
    finished_sentences = [part + delimiter
                          for part in tweet_text.split(delimiter)[:-1]]

    engine = kakasi()
    engine.setMode("J", "H")  # J (kanji) to H (hiragana)
    conv = engine.getConverter()

    for sentence in finished_sentences:
        print(sentence)
        print(conv.do(sentence))
        print()

    kana_text = conv.do(tweet_text)
    with open("prepare_text.txt", mode="w", encoding="utf-8") as out_file:
        out_file.write(kana_text)
    return kana_text
def furiganaLineTrans(text):
    """Annotate each token of ``text`` with its reading in braces.

    Tokens come from the module-level ``segmenter``. A token's reading is
    looked up in ``transDict`` first and otherwise produced by the
    module-level ``kakasi`` converter (kanji to hiragana). Tokens whose
    reading equals the token itself are copied through unannotated.

    :return: the concatenated tokens, annotated as ``token  { reading }  ``
    """
    kakasi.setMode("J", "H")
    kakasi.setMode("r", "Hepburn")
    conv = kakasi.getConverter()

    pieces = []
    for segWord in segmenter.tokenize(text):
        # `in` works on both Python 2 and 3; dict.has_key() is Python 2 only.
        if segWord in transDict:
            result = transDict[segWord]
        else:
            result = conv.do(segWord)
        if segWord == result:
            pieces.append(segWord)
        else:
            result = result.strip().replace("\n", "")
            pieces.append(segWord + '  { ' + result + ' }  ')

    return ''.join(pieces)
Ejemplo n.º 12
0
def exchange_word(text):
    """Convert ``text`` to ascii using the module-level ``kakasi`` object."""
    # Hiragana, Katakana and Japanese (kanji) are all set to produce ascii.
    for source_script in ("H", "K", "J"):
        kakasi.setMode(source_script, "a")
    return kakasi.getConverter().do(text)
Ejemplo n.º 13
0
    def parse_topics(self, response):
        """Yield one ``BirdfanItem`` describing the image on a topic page.

        The item holds the page URL, the page title, the romanized species
        name and the absolute image URL. When the species heading is missing,
        ``birdname`` is left as ``None`` instead of passing ``None`` to the
        converter.
        """
        # Prepare the pykakasi converter that turns Japanese text into romaji
        from pykakasi import kakasi
        kakasi = kakasi()
        kakasi.setMode('H', 'a')
        kakasi.setMode('K', 'a')
        kakasi.setMode('J', 'a')
        conv = kakasi.getConverter()

        # Collect the per-image information into the item "birditem"
        jpgpath = response.xpath(
            '//*[@id="contents"]/div[3]/div/p/img/@src').extract_first()
        birditem = BirdfanItem()
        # URL of the page that holds the image
        birditem['url'] = response.url
        # Title of that page (the human-readable one)
        birditem['title'] = response.xpath(
            '//*[@id="contents"]/div[3]/h2/a/text()').extract_first()
        # Species name of the bird in the image, converted to romaji
        birdname = response.xpath(
            '//*[@id="contents"]/div[3]/div/div/h3/a/text()').extract_first()
        birditem['birdname'] = conv.do(birdname) if birdname is not None else None
        # URL of the image file
        birditem['jpgurl'] = response.urljoin(jpgpath)
        yield birditem
Ejemplo n.º 14
0
def dialog_nlp(input_txt, version):
    """Run the dialogue model on ``input_txt`` and write the romanized reply.

    Only ``version == 't5'`` is handled; any other value does nothing. The
    question is written to "dialogue/t5/question/question.txt", ``test.py``
    is run inside the docker container with its stdout captured in
    "intermediate/nlp_out.txt", and the second line of that output is cleaned
    of ``<pad>``/``</s>``, romanized and written to
    "intermediate/nlp_out_fixed.txt".

    :raises IndexError: if the model output has fewer than two lines
    """
    if version != 't5':
        return

    DIALOG_NLP_CONTAINER_NAME = "0e3490a65e84"
    # Argument lists avoid going through a shell for fixed commands.
    subprocess.run(["docker", "start", DIALOG_NLP_CONTAINER_NAME])
    with open("dialogue/t5/question/question.txt", "w") as question_txt:
        question_txt.write(input_txt)
    with open("intermediate/nlp_out.txt", "w") as output_txt:
        # stdout goes straight to the file, so there is nothing to print from
        # the CompletedProcess (its .stdout is None).
        subprocess.run(
            ["docker", "exec", "-w", "/t5", DIALOG_NLP_CONTAINER_NAME,
             "python3", "test.py"],
            stdout=output_txt, text=True)

    # TODO: text outprocess(extract only answer in english)
    from pykakasi import kakasi
    kakasi = kakasi()

    kakasi.setMode('H', 'a')
    kakasi.setMode('K', 'a')
    kakasi.setMode('J', 'a')

    conv = kakasi.getConverter()
    with open("intermediate/nlp_out.txt", "r") as f:
        response = f.readlines()[1].replace(
            "<pad>", '').replace('</s>', '')
        print(response)
        response = conv.do(response)
        print(response)
    with open("intermediate/nlp_out_fixed.txt", "w") as f:
        # The first character of the romanized reply is dropped and newlines
        # are removed before the line is assembled.
        f.write(
            "jsut_ver1.1/onomatopee300/wav/ONOMATOPEE300_300.wav|"
            + response[1:].replace('\n', '') + ".|1")
def toKatakana(str):
    """Convert ``str`` to katakana using the module-level ``kakasi`` object.

    Kanji ('J'), hiragana ('H'), roman letters ('a') and katakana ('K') are
    all set to produce katakana.
    """
    for source_script in ("J", "H", "a", "K"):
        kakasi.setMode(source_script, "K")
    return kakasi.getConverter().do(str)
Ejemplo n.º 16
0
def jp_to_romen(text):
    """Romanize ``text`` with the module-level ``kakasi`` and capitalize it."""
    for source_script in 'HKJ':
        kakasi.setMode(source_script, 'a')
    romanized = kakasi.getConverter().do(text)
    return romanized.capitalize()
Ejemplo n.º 17
0
def change_word(read_data):
    """Return ``read_data`` with hiragana, katakana and kanji set to convert to ascii."""
    from pykakasi import kakasi as kakasi_factory

    engine = kakasi_factory()
    for source_script in ('H', 'K', 'J'):
        engine.setMode(source_script, 'a')
    return engine.getConverter().do(read_data)
Ejemplo n.º 18
0
def get_reading_kakasi(word):
    """Return the reading of a Japanese word via kakasi.

    Kanji ('J') is set to convert to hiragana. The "C" and "c" options are
    also set (to True and False respectively), exactly as before.
    """
    import pykakasi.kakasi as kakasi_factory

    engine = kakasi_factory()
    engine.setMode("J", "H")
    # NOTE(review): "C" is documented elsewhere in this file as the capitalize
    # switch; the meaning of "c" is not shown here - confirm against pykakasi.
    for option, enabled in (("C", True), ("c", False)):
        engine.setMode(option, enabled)
    return engine.getConverter().do(word)
Ejemplo n.º 19
0
 def formats(key_world):
     """Normalize a keyword: trim it, drop punctuation and romanize it.

     Characters that are neither word characters nor whitespace are removed
     before the text is passed through a pykakasi converter.
     """
     from pykakasi import kakasi as kakasi_factory

     cleaned = re.sub(r'[^\w\s]', '', key_world.strip())
     engine = kakasi_factory()
     for source_script in ('H', 'K', 'J'):
         engine.setMode(source_script, 'a')
     return engine.getConverter().do(cleaned)
Ejemplo n.º 20
0
def getConverter():
    """Build and return a pykakasi converter producing ascii output.

    Hiragana, katakana and kanji are all set to convert to ascii. On
    Python 2 the process default encoding is first forced to UTF-8; on
    Python 3 that step is skipped because ``reload`` and
    ``sys.setdefaultencoding`` do not exist there.
    """
    import sys  # the module must be imported before reload()
    if sys.version_info[0] < 3:
        reload(sys)  # noqa: F821 - builtin on Python 2 only
        sys.setdefaultencoding('utf-8')  # guard against UTF-8 problems
    from pykakasi import kakasi
    kakasi = kakasi()
    kakasi.setMode('H', 'a')
    kakasi.setMode('K', 'a')
    kakasi.setMode('J', 'a')
    conv = kakasi.getConverter()
    return conv
Ejemplo n.º 21
0
def make_filename(title):
    """Turn ``title`` into a file name.

    The title is romanized with pykakasi, spaces are replaced by
    underscores, and the result is passed through ``zen2han.zen2han``.
    """
    # Only `kakasi` is needed. The unused `wakati` import was dropped so the
    # function does not fail where pykakasi does not export that name.
    from pykakasi import kakasi

    import zen2han

    kakasi = kakasi()
    kakasi.setMode("H", "a")
    kakasi.setMode("K", "a")
    kakasi.setMode("J", "a")
    conv = kakasi.getConverter()
    title = conv.do(title).replace(" ", "_")
    return zen2han.zen2han(title)
Ejemplo n.º 22
0
def _format_mmss(total_seconds):
    """Return ``total_seconds`` as zero-padded minutes and seconds ("MMSS")."""
    minute, second = divmod(total_seconds, 60)
    return '{:02d}{:02d}'.format(minute, second)


def main():
    """Continuously send the current iTunes track info over a serial port.

    Every 0.5 s the romanized title, romanized artist and the player
    position (as "MMSS") are written to the device, one per line, and echoed
    to the cleared terminal. The loop runs until interrupted; the serial
    port is closed by the ``with`` block.
    """
    kakasi.setMode('H', 'a')
    kakasi.setMode('K', 'a')
    kakasi.setMode('J', 'a')
    conv = kakasi.getConverter()

    with serial.Serial(port="/dev/tty.usbmodem141141", baudrate=9600, timeout=1) as device:

        while True:
            title = itunes.current_track.name.get()
            artist = itunes.current_track.artist.get()
            # Previously time_str was never (re)assigned once the position
            # reached 10 minutes; format it in one place for every value.
            time_str = _format_mmss(int(itunes.player_position.get()))

            sleep(0.5)
            # Send bytes with a bytes newline; characters that cannot be
            # encoded as ASCII are dropped.
            for field in (conv.do(title), conv.do(artist), time_str):
                device.write(field.encode('ascii', errors="ignore") + b'\n')

            os.system('clear')
            print("TITLE = " + title)
            print("ARTIST = " + artist)
            print("TIME = " + time_str)
def toFurigana(str):
    """Translate ``str`` token by token with an "aF"-mode converter.

    A fresh pykakasi object is created with kanji, hiragana and katakana all
    set to "aF" (listed in the original comments as the Furigana option), and
    its converter is handed to ``tokenTranslation``.
    """
    engine = pykakasi.kakasi()
    # Options per source script: a, H, K, aF, None - roman, Hiragana,
    # Katakana, Furigana, or non conversion (default: no conversion).
    for source_script in ("J", "H", "K"):
        engine.setMode(source_script, "aF")
    return tokenTranslation(engine.getConverter(), str)
Ejemplo n.º 24
0
 def __init__(self, dictionary_type='alphabet', max_len=1000, length=1024):
     """Store the settings, pick the dictionary and build the converter.

     ``self.dictionary`` is only assigned for the three known
     ``dictionary_type`` values: 'alphabet', 'katakana' and 'katakana_small'.
     ``self.kakasi_conv`` converts hiragana, katakana and kanji to ascii.
     """
     self.dictionary_type = dictionary_type
     self.max_len = max_len
     self.length = length

     if dictionary_type == 'alphabet':
         self.dictionary = alphabet.alphabet_dict
     elif dictionary_type == 'katakana':
         self.dictionary = katakana.katakana_dict
     elif dictionary_type == 'katakana_small':
         self.dictionary = katakana_small.katakana_small_dict

     engine = pykakasi.kakasi()
     for source_script in ('H', 'K', 'J'):
         engine.setMode(source_script, 'a')
     self.kakasi_conv = engine.getConverter()
Ejemplo n.º 25
0
def get_initials(words):
    """Return the upper-cased first romanized character of each word.

    Uses the module-level ``kakasi`` object, set here to convert hiragana,
    katakana and kanji to ascii. A word that romanizes to an empty string
    contributes an empty string.
    """
    for source_script in ('H', 'K', 'J'):
        kakasi.setMode(source_script, 'a')
    conv = kakasi.getConverter()
    return [conv.do(word)[:1].upper() for word in words]
Ejemplo n.º 26
0
import requests
import sys
import os
import tweepy

# Make the application packages under /app importable before the project
# imports below (hence the `noqa` markers on imports that follow code).
sys.path.append('/app')

import models  # noqa
import jaconv  # noqa
from util import morpheme  # noqa
import services  # noqa

# NOTE(review): `firestore` and `kakasi` are not imported in the visible part
# of this file - confirm they are brought into scope elsewhere.
db = firestore.Client()

# Rebinds the name `kakasi` from the class to an instance, then builds a
# converter with kanji ('J') set to produce hiragana ('H').
kakasi = kakasi()
kakasi.setMode("J", "H")
conv = kakasi.getConverter()

# Shortcuts to the service instances exposed by the `services` module.
system_service = services.system_instance
word_service = services.word_instance
user_service = services.user_instance
tag_service = services.tag_instance

# user_service.get_oauth_url()

# user_data = models.UserUpdate()
# user_data.twitter_id = 'user_id'
# user_data.twitter_name= 'screen_name'
# user_data.twitter_key= 'oauth_token'
# user_data.twitter_secret= 'oauth_token_secret'
# user_data.session_id = "aaaa"
Ejemplo n.º 27
0
                incorrect_answers.append(output_string)

        student_output_filepath = os.path.join(output_directory, student_name)
        with open(student_output_filepath, "w+") as student_corrections_file:
            student_corrections_file.write("\n".join(incorrect_answers))

# ------------------------------------------------------------------------------
# Grading run: resolve the answer key and the student answer folders relative
# to this script, build the text normalizer, then grade.
current_directory = os.path.dirname(os.path.abspath(__file__))

# CHANGE THIS BIT! (answer key for the lesson being graded)
master_answer_key = os.path.join(
    current_directory, "answer_keys/2017_06_08_lesson_03_part_02.txt")

# CHANGE THIS BIT! (dated folder holding this run's answers)
date_string = "2017_06_08"

student_answer_sets = os.path.join(current_directory, "answer_sets")
dated_answer_sets = os.path.join(student_answer_sets, date_string)
current_student_answer_sets = os.path.join(dated_answer_sets,
                                           "student_answers")
output_directory = os.path.join(dated_answer_sets, "graded_answers")

# ---
# Normalizer with kanji ('J') set to convert to hiragana ('H'); the default
# is no conversion.
kakasi = kakasi()
kakasi.setMode("J", "H")
japanese_text_normalizer = kakasi.getConverter()

# ---
perform_grading(
    master_answer_key,
    current_student_answer_sets,
    output_directory,
    japanese_text_normalizer,
)
Ejemplo n.º 28
0
#!/usr/bin/env python3
# coding: utf-8
from pykakasi import kakasi

kakasi = kakasi()

# Hiragana, katakana and kanji are all set to convert to ascii.
for source_script in ('H', 'K', 'J'):
    kakasi.setMode(source_script, 'a')

conv = kakasi.getConverter()

filename = '本日は晴天なり.jpg'

# Show the original name, its type, and the romanized form.
print("Base", filename)
print("Base type", type(filename))
romanized_filename = conv.do(filename)
print("Conv", romanized_filename)

# print(type(filename.decode('utf-8')))
# print(conv.do(filename.decode('utf-8')))
# print(type(filename.decode('utf-8')))
# print(type(filename))
# print(conv.do(filename))
Ejemplo n.º 29
0
import pykakasi.kakasi as kakasi

kakasi = kakasi()
# Each pair is (option, value); the trailing notes give the library defaults.
mode_settings = [
    ("H", "a"),        # default: Hiragana no convert
    ("K", "a"),        # default: Katakana no convert
    ("J", "a"),        # default: Japanese no convert
    ("E", "a"),        # default: Symbols no convert
    ("r", "Hepburn"),  # default: use Hepburn Roman table
    ("s", True),       # separate, default: no Separator
    ("C", True),       # capitalize, default: no Capitalize
]
for option, value in mode_settings:
    kakasi.setMode(option, value)
conv = kakasi.getConverter()
result = conv.do('澱んだ街角で僕らは出会った')
Ejemplo n.º 30
0
def dabiaoqian(path):
    """Label per-frame feature rows for every sub-directory of ``path``.

    For each sub-directory the marker file ``symbol.txt`` is read with
    ``csv.reader``. Every record starting with an ``id:`` row is followed by
    a reference row, a recognition-result row and an evaluation row of
    D/C/I/S markers. An ``S`` marker is changed to ``C`` when both words
    romanize to the same string. The matching ``keka/<ID>.out`` file and the
    feature file ``mizhichuli_log/<ID>.wav.csv`` are then read, and a label
    is inserted at the front of each feature row:

    * ``'9'`` for the rows up to the end of the first ``.out`` entry and from
      the start of the last entry onwards,
    * ``'0'`` for rows inside segments marked ``C``,
    * ``'1'`` for rows inside any other segment.

    The labelled rows are written to ``xinde_mizhichuli/<ID>.csv`` and
    ``shanchu.shanchuhang`` is finally called on that directory.

    NOTE(review): the loop variable ``i`` is reused at every nesting level;
    each inner loop rebinds it, so statement order matters here.
    NOTE(review): ``mulu``, ``pi``, ``cz``, ``shanchu``, ``strQ2B`` and
    ``make_kana_convertor`` are defined outside this block.
    """

    from pykakasi import kakasi
    import csv, os

    name_tezheng = 'mizhichuli_log'
    # Name of the directory that holds the feature-value files

    xinde = 'xinde_mizhichuli'
    # Name of the directory that receives the new (labelled) feature values

    name1 = 'align1'
    name2 = 'symbol.txt'
    # Names of the marker file. When align1 does not work, symbol.txt is used
    # instead; note that the code below has to be switched accordingly.

    kakasi = kakasi()
    kakasi.setMode("H", "a")  # Hiragana to ascii, default: no conversion
    kakasi.setMode("K", "a")  # Katakana to ascii, default: no conversion
    kakasi.setMode("J", "a")  # Japanese to ascii, default: no conversion
    kakasi.setMode("r", "Hepburn")  # default: use Hepburn Roman table
    kakasi.setMode("s", True)  # add space, default: no separator
    conv = kakasi.getConverter()

    for i in os.listdir(path):

        path_1 = os.path.join(path, i)

        path_out = os.path.join(path_1, 'keka')

        path_tezheng = os.path.join(path_1, name_tezheng)

        #biaozhiwenjian = csv.reader(open(os.path.join(path_1, name1), 'r', encoding='EUC-JP'))  # read the marker file (align1 variant)
        biaozhiwenjian = csv.reader(
            open(os.path.join(path_1, name2), 'r',
                 encoding='utf-8'))  # variant for a .txt marker file

        biaozhiwenjian_1 = [i for i in biaozhiwenjian
                            ]  # materialize as a list; each element is itself a list
        # Example:
        #[['id: l_8840_9810_T1_F_01'],['REF:  そう です か 、 はい 。 '],['HYP:  そう です か    はい 。 '],['EVAL: C    C    C  D  C    C  '],[],['id: l_10800_13190_T1_F_01']]

        # print(biaozhiwenjian_1)
        # os.system('pause')

        path_xinde = os.path.join(path_1, xinde)
        mulu.mkdir(path_xinde)

        for i in range(0, len(biaozhiwenjian_1)):  # each round can label one speech file

            try:
                biaozhi = biaozhiwenjian_1[i][0]

            except:
                # Empty rows have no first element; skip them.

                continue

            if 'id:' in biaozhi:

                ID = ''
                l_biaozhi = []
                l_zhengjie = []
                l_zhengjie_1 = []
                l_jieguo = []
                l_jieguo_1 = []

                ID = biaozhiwenjian_1[i][0].replace('id: ', '')

                # The three rows after the id row are split on whitespace and
                # their first token (the row tag) is dropped.
                l_zhengjie = biaozhiwenjian_1[i + 1][0].split()
                l_zhengjie.pop(0)

                l_jieguo = biaozhiwenjian_1[i + 2][0].split()
                l_jieguo.pop(0)

                l_biaozhi = biaozhiwenjian_1[i + 3][0].split()
                l_biaozhi.pop(0)

                # Build strictly aligned lists of reference words, recognized
                # words and markers; for a D marker the recognized word is ''.
                jishuqi_jieguo = 0
                jishuqi_zhengjie = 0
                jishuqi_biaozhi = 0

                for i in l_biaozhi:

                    if i == "D":
                        l_zhengjie_1.append(l_zhengjie[jishuqi_zhengjie])
                        l_jieguo_1.append('')
                        jishuqi_zhengjie += 1
                        jishuqi_biaozhi += 1

                    if i == "C":
                        l_zhengjie_1.append(l_zhengjie[jishuqi_zhengjie])
                        l_jieguo_1.append(l_jieguo[jishuqi_jieguo])
                        jishuqi_zhengjie += 1
                        jishuqi_jieguo += 1
                        jishuqi_biaozhi += 1

                    if i == "I":
                        l_jieguo_1.append(l_jieguo[jishuqi_jieguo])
                        l_zhengjie_1.append('')
                        jishuqi_jieguo += 1
                        jishuqi_biaozhi += 1

                    if i == "S":  # S is handled specially: romanize both words and compare; if they are equal, the marker becomes C
                        l_zhengjie_1.append(l_zhengjie[jishuqi_zhengjie])
                        l_jieguo_1.append(l_jieguo[jishuqi_jieguo])

                        zhengjie_hanzi = l_zhengjie[jishuqi_zhengjie]
                        jieguo_hanzi = l_jieguo[jishuqi_jieguo]

                        # First handle the recognition result
                        if conv.do(
                                jieguo_hanzi
                        ) == jieguo_hanzi and jieguo_hanzi != '、':  # conversion left it unchanged: check whether it is letters

                            try:
                                zhuanhuan_jieguo = conv.do(
                                    make_kana_convertor._make_kana_convertor(
                                        strQ2B.strQ2B(jieguo_hanzi)))

                            except:
                                # Fall back to converting without strQ2B.
                                zhuanhuan_jieguo = conv.do(
                                    make_kana_convertor._make_kana_convertor(
                                        jieguo_hanzi))

                        else:
                            zhuanhuan_jieguo = conv.do(jieguo_hanzi)

                        # Then handle the reference text
                        if conv.do(
                                zhengjie_hanzi
                        ) == zhengjie_hanzi and zhengjie_hanzi != '、':  # conversion left it unchanged: check whether it is letters

                            try:
                                zhuanhuan_zhengjie = conv.do(
                                    make_kana_convertor._make_kana_convertor(
                                        strQ2B.strQ2B(zhengjie_hanzi)))

                            except:
                                # Fall back to converting without strQ2B.
                                zhuanhuan_zhengjie = conv.do(
                                    make_kana_convertor._make_kana_convertor(
                                        zhengjie_hanzi))

                        else:
                            zhuanhuan_zhengjie = conv.do(zhengjie_hanzi)

                        if zhuanhuan_jieguo == zhuanhuan_zhengjie:

                            # Same romanization: treat the substitution as correct.
                            l_biaozhi[jishuqi_biaozhi] = 'C'

                        jishuqi_biaozhi += 1
                        jishuqi_zhengjie += 1
                        jishuqi_jieguo += 1

                # print(l_jieguo_1)
                # print(l_zhengjie_1)
                # print(l_biaozhi)
                # os.system('pause')

                path_out_1 = os.path.join(path_out, ID + '.out')  # read the .out file
                dianout = pi.read_out(path_out_1)
                start = dianout.pop(0)[1][1]  # end of the leading silent interval (labelled 9 below); the first element is popped
                start_1 = dianout[-1][1][0]  # start of the final period, which is labelled 9 as well
                # end_1 = dianout.pop(-1)[1][1]

                # print(dianout)
                # os.system('pause')
                # Resulting shape: [['', [0, 18]], ['お', [19, 24]], ['願い', [25, 49]], ['三', [50, 82]], ['。', [83, 86]]]

                path_tezheng_1 = os.path.join(path_tezheng, ID + '.wav.csv')
                tezhengzhi = csv.reader(
                    open(path_tezheng_1, 'r', encoding='utf-8'))
                t_file_list = [i for i in tezhengzhi]

                end_1 = len(t_file_list) - 1

                for i in range(start + 1):
                    t_file_list[i].insert(0,
                                          '9')  # the whole leading silent interval gets label 9 and is treated as correctly recognized

                for i in range(start_1, end_1 + 1):
                    t_file_list[i].insert(0, '9')

                l_jieguo_1.pop(-1)  # the final period has already been labelled, so pop it

                print("ID")
                print(ID)

                print("l_biaozhi")
                print(l_biaozhi)
                print("l_jieguo_1")
                print(l_jieguo_1)

                print("dianout")
                print(dianout)

                dianout_chongzao = cz.chongzao(l_biaozhi, l_jieguo_1, dianout,
                                               ID)  # build the new dianout list; everything below relies on it

                print('dianout_chongzao')
                print(dianout_chongzao)

                # Use the new list to assign the labels. Example:
                # [['災害', [3, 40], 'C'], ['で', [41, 48], 'C'], ['ござい', [49, 77], 'C'], ['ます', [78, 98], 'C'],
                #  ['から', [99, 130], 'C'], ['、', [131, 152], 'C'], ['その', [153, 177], 'C'], ['場', [178, 190], 'C'],
                #  ['で', [191, 209], 'C']]
                for i in dianout_chongzao:

                    start, end = i[1]
                    if i[2] == 'C':

                        # Rows of a segment marked C get label 0.
                        for i in range(start, end + 1):
                            t_file_list[i].insert(0, '0')

                    else:

                        # Rows of any other segment get label 1.
                        for i in range(start, end + 1):
                            t_file_list[i].insert(0, '1')

                path_xinde_tezhengzhi = os.path.join(path_xinde, ID + '.csv')

                with open(path_xinde_tezhengzhi, 'w+',
                          encoding='utf-8') as mergen_file:
                    for i in t_file_list:
                        mergen_file.write('%s\n' % ','.join(i))

        shanchu.shanchuhang(path_xinde)  # delete every feature row that carries label 9
Ejemplo n.º 31
0
# Root directory whose sub-directories are processed by the loop below.
path = r'C:\Users\a7825\Desktop\工作空间\杂物\对比\baseline\baselinetest'

# NOTE(review): presumably a threshold; it is not used in the visible part of
# the script - confirm against the rest of the file.
fazhi = 0.9

# name_tezheng =
# Name of the directory that holds the feature-value files

# xinde =
# Name of the directory that receives the new feature values

name1 = 'align1'
name2 = 'symbol.txt'
# Names of the marker file. When align1 does not work, symbol.txt is used
# instead; note that the code below has to be switched accordingly.

kakasi = kakasi()
kakasi.setMode("H", "a")  # Hiragana to ascii, default: no conversion
kakasi.setMode("K", "a")  # Katakana to ascii, default: no conversion
kakasi.setMode("J", "a")  # Japanese to ascii, default: no conversion
kakasi.setMode("r", "Hepburn")  # default: use Hepburn Roman table
kakasi.setMode("s", True)  # add space, default: no separator
conv = kakasi.getConverter()

zhenzhi_2 = []  # to be fed into the confusion matrix
yucezhi_2 = []

for i in os.listdir(path):  # one sub-directory of `path` per iteration

    path_1 = os.path.join(path, i)

    path_out = os.path.join(path_1, 'keka')
Ejemplo n.º 32
0
from pykakasi import kakasi

kakasi = kakasi()
# Hiragana, katakana and kanji are all set to convert to ascii.
for source_script in ('H', 'K', 'J'):
    kakasi.setMode(source_script, 'a')
conv = kakasi.getConverter()
romanized_sample = conv.do('本日は晴天なり')
print(romanized_sample)

Ejemplo n.º 33
0
import argparse

from pykakasi import kakasi

if __name__ == '__main__':
    # Command-line romanizer: prints the ascii conversion of each positional
    # argument, preceded by the original text when --verbose is given.
    kakasi = kakasi()
    mode_settings = (
        ('H', 'a'),  # Hiragana to ascii, default: no conversion
        ('K', 'a'),  # Katakana to ascii, default: no conversion
        ('J', 'a'),  # Japanese to ascii, default: no conversion
    )
    for source_script, target in mode_settings:
        kakasi.setMode(source_script, target)
    conv = kakasi.getConverter()

    parser = argparse.ArgumentParser()
    parser.add_argument('args', type=str, nargs='*')
    parser.add_argument('--verbose', action='store_true')
    args = parser.parse_args()

    show_original = args.verbose
    for text in args.args:
        if show_original:
            print(text)
        print(conv.do(text))