Example no. 1
import multiprocessing

import common_utils

def processLine(line,mary,scriptpath):
    #http://stackoverflow.com/questions/10190981/get-a-unique-id-for-worker-in-python-multiprocessing-pool
    tokens = []
    proc_num = -1 #fallback, so the except block below cannot hit an unbound name
    try:
        #pool workers are numbered starting at 1, map them to a 0-based index
        proc_num = multiprocessing.current_process()._identity[0]-1
        tokens,phonemes = common_utils.getCleanTokensAndPhonemes(line,mary,proc_num)
    except Exception as err:
        print '[',proc_num,']','Error, omitting', line
        print err
        if scriptpath != '' and ('Read timed out' in str(err)):
            print 'restarting maryServer'
            restartMaryServer(scriptpath,None) #helper presumably defined elsewhere in the source script
    return ' '.join(tokens)
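
The _identity trick referenced in the comment above only pays off when processLine runs inside a multiprocessing worker pool, so that each worker can address its own MARY server instance. Below is a minimal driver sketch under that assumption; the processLines name, the lines argument and the pool size are hypothetical and not part of the original example.

import multiprocessing

def processLines(lines, mary, scriptpath, num_workers=4):
    #hypothetical driver: each pool worker carries an _identity tuple starting
    #at 1, which processLine above maps to a 0-based MARY server index
    #(assumes the mary client handle is picklable)
    pool = multiprocessing.Pool(num_workers)
    results = [pool.apply_async(processLine, (line, mary, scriptpath))
               for line in lines]
    pool.close()
    pool.join()
    return [result.get() for result in results]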
Example no. 2
import codecs
import itertools

import common_utils
from bs4 import BeautifulSoup #bs4 assumed, matching the explicit "lxml" parser in Example no. 5

def getUtterances(ids, postfix_speaker, cache_cleaned_sentences=True):
    '''Loads the corpus and returns a python structure that can be used to export the corpus to a format KALDI understands'''
    utts = []
    
    cleaned_sentences_cache = {}
    utts_phoneme_dict = {}

    print 'Reading corpus transcriptions and producing automatic corpus phoneme dict for OOV words (you need MARY running in the background!)'

    lastutt = None
    for myid in ids:
        print '.',
        #if 1==1:
        try:
            with codecs.open(myid+'.xml','r','utf-8') as myfile:
                #extract xml meta 
                xml = myfile.read()
                soup = BeautifulSoup(xml)
                sentence = soup.recording.sentence.string
                cleaned_sentence = soup.recording.cleaned_sentence.string
                gender = soup.recording.gender.string
                age = soup.recording.ageclass.string
                corpus = soup.recording.corpus.string
                nativespeaker = soup.recording.muttersprachler.string
                region = soup.recording.bundesland.string
                speakerid= soup.recording.speaker_id.string

                if speakerid is None or speakerid == '':
                    print 'ERROR, speakerid not found for', myid

                date = getDateFromID(myid)

                if cache_cleaned_sentences:
                    if cleaned_sentence not in cleaned_sentences_cache:
                        #note: mary is a module-level MARY client instance (see Example no. 4)
                        clean_sentence_tokens,token_phonemes = common_utils.getCleanTokensAndPhonemes(cleaned_sentence,mary)
                        cleaned_sentences_cache[cleaned_sentence] = (clean_sentence_tokens,token_phonemes)
                        #print 'cleaning ', cleaned_sentence, ' -> ', clean_sentence_tokens , ' phonemes:', token_phonemes
                    else:
                        clean_sentence_tokens,token_phonemes = cleaned_sentences_cache[cleaned_sentence]
                else:
                    clean_sentence_tokens,token_phonemes = common_utils.getCleanTokensAndPhonemes(sentence,mary)

                for token,phoneme_representation in itertools.izip(clean_sentence_tokens,token_phonemes):
                    if token not in utts_phoneme_dict:
                        utts_phoneme_dict[token] = phoneme_representation
                
                #the exported tokens are a plain whitespace split of cleaned_sentence
                clean_sentence_tokens = cleaned_sentence.split(' ')
                utt = {'id':myid.split('/')[-1],'fileid':myid,'sentence':sentence,
                       'clean_sentence_tokens':clean_sentence_tokens,'speakerid':speakerid,
                       'gender':gender,'age':age,'corpus':corpus,
                       'nativespeaker':nativespeaker,'region':region,'date':date}
                utts.append(utt)

        except Exception as err:
            print 'Error in file, omitting', myid
            print err

    #Sort utterances by date
    utts = sorted(utts,key=lambda utt:utt['date'])

    #Unfortunately, the XMLs don't have speaker meta-information, so the disabled heuristic below tried to guess it
    #for i,utt in enumerate(utts):
    #    if lastutt is not None:
    #        delta = utt['date'] - lastutt['date']
    #        diff = abs(delta.total_seconds())
    #        #Heuristic: either enough time passed between this and the last recording, or speaker meta information (gender,age,region) changed
    #        if diff > speakerid_diff_heuristic or lastutt['gender'] != utt['gender'] or lastutt['age'] != utt['age'] or lastutt['region'] != utt['region']:
    #            print 'probable new speaker',speakerid
    #            if diff > speakerid_diff_heuristic:
    #                print 'based on time diff',diff
    #            else:
    #                print 'based on meta', 'diff:',diff, lastutt['gender'],utt['gender'],lastutt['age'],utt['age'],lastutt['region'],utt['region']
    #            speakerid += 1
    #    utt['speakerid'] = 's'+('%04d'%speakerid)+postfix_speaker
        
    for utt in utts:    
        utt['kaldi_id'] = utt['speakerid']+'_'+utt['id']
        
        #utts[i] = utt
        
        #lastutt = utt

    #Filter utterances with repeat in file name (recording was repeated after a wrong utterance)
    #utts = filterRepeatUtterances(utts)

    return utts,utts_phoneme_dict
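
The utts list and the utts_phoneme_dict returned above map naturally onto KALDI's standard data-directory files (text, utt2spk, lexicon.txt). The following is a minimal sketch of a possible consumer, not part of the original script; the writeKaldiFiles name and the outdir layout are assumptions, and joining a phoneme representation with ''.join follows the usage in Example no. 4.

import codecs

def writeKaldiFiles(utts, utts_phoneme_dict, outdir):
    #text: '<utterance-id> <transcription>', one utterance per line
    with codecs.open(outdir+'/text','w','utf-8') as text_file:
        for utt in utts:
            text_file.write(utt['kaldi_id']+' '+' '.join(utt['clean_sentence_tokens'])+'\n')
    #utt2spk: '<utterance-id> <speaker-id>'
    with codecs.open(outdir+'/utt2spk','w','utf-8') as utt2spk_file:
        for utt in utts:
            utt2spk_file.write(utt['kaldi_id']+' '+utt['speakerid']+'\n')
    #lexicon entries from the automatically generated phoneme dict
    with codecs.open(outdir+'/lexicon.txt','w','utf-8') as lexicon_file:
        for word in sorted(utts_phoneme_dict):
            lexicon_file.write(word+' '+''.join(utts_phoneme_dict[word])+'\n')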
Example no. 4
from __future__ import print_function#, unicode_literals

import maryclient
import codecs
import common_utils
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Retrieves pronunciation entries of arbitrary German words (using the TTS software MARY) for a whole word list.')
    parser.add_argument('-i', '--inputfile', dest='inputfile',
                        help='Process this word list (one per line, utf-8)', type=str, default='')
    parser.add_argument('-o', '--outputfile', dest='outputfile',
                        help='Export pronunciation entries to this outputfile (one per line, utf-8)', type=str, default='')
    args = parser.parse_args()
    
    mary = maryclient.maryclient()
    dictionary = {}
    with codecs.open(args.inputfile, 'r', 'utf-8') as inputfile:
        for word in inputfile:
            tokens, phonems = common_utils.getCleanTokensAndPhonemes(
                word, mary)
            if len(phonems) != 1:
                print(
                    'Warning, MARY split this word into more than one token:', word, phonems)
            dictionary[word[:-1]] = ''.join(phonems[0]) #word[:-1] strips the trailing newline
    
    with codecs.open(args.outputfile, 'w', 'utf-8') as outputfile:
        for word in sorted(dictionary):
            outputfile.write(word+' '+dictionary[word]+'\n')
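
The script writes one 'word phonemes' pair per line. To illustrate that format, here is a small hypothetical reader for the files it produces; the loadDictionary name is made up and not part of the original script.

import codecs

def loadDictionary(path):
    #reads back 'word phonemes' pairs, one per line, utf-8
    dictionary = {}
    with codecs.open(path, 'r', 'utf-8') as infile:
        for line in infile:
            word, _, phonemes = line.rstrip('\n').partition(' ')
            dictionary[word] = phonemes
    return dictionary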
Example no. 5
import codecs

import common_utils
from bs4 import BeautifulSoup

def getUtterances(ids, use_mary=False, cache_cleaned_sentences=True):
    '''Loads the corpus and returns a python structure that can be used to export the corpus to a format KALDI understands'''
    utts = []
    
    cleaned_sentences_cache = {}
    utts_phoneme_dict = {}

    print('Reading and parsing TUDA corpus transcriptions',end='',flush=True)

    lastutt = None
    for i,myid in enumerate(ids):
        if i%100 == 0:
            print('.',end='',flush=True)
        try:
            with codecs.open(myid+'.xml','r','utf-8') as myfile:
                #extract xml meta 
                xml = myfile.read()
                soup = BeautifulSoup(xml,"lxml")
                sentence = soup.recording.sentence.string
                cleaned_sentence = soup.recording.cleaned_sentence.string
                gender = soup.recording.gender.string
                age = soup.recording.ageclass.string
                corpus = soup.recording.corpus.string
                nativespeaker = soup.recording.muttersprachler.string
                region = soup.recording.bundesland.string
                speakerid= soup.recording.speaker_id.string

                if speakerid is None or speakerid == '':
                    print('ERROR, speakerid not found for', myid)

                date = getDateFromID(myid)

                if use_mary:
                    if cache_cleaned_sentences:
                        if cleaned_sentence not in cleaned_sentences_cache:
                            clean_sentence_tokens,token_phonemes = common_utils.getCleanTokensAndPhonemes(cleaned_sentence,mary)
                            cleaned_sentences_cache[cleaned_sentence] = (clean_sentence_tokens,token_phonemes)
                            #print 'cleaning ', cleaned_sentence, ' -> ', clean_sentence_tokens , ' phonemes:', token_phonemes
                        else:
                            clean_sentence_tokens,token_phonemes = cleaned_sentences_cache[cleaned_sentence]
                    else:
                        clean_sentence_tokens,token_phonemes = common_utils.getCleanTokensAndPhonemes(sentence,mary)

                    #zip instead of itertools.izip, since the rest of this snippet targets Python 3 (print(..., flush=True))
                    for token,phoneme_representation in zip(clean_sentence_tokens,token_phonemes):
                        if token not in utts_phoneme_dict:
                            utts_phoneme_dict[token] = phoneme_representation
                
                clean_sentence_tokens = cleaned_sentence.split(' ')
                utt = {'id':myid.split('/')[-1],'fileids':ids[myid],'sentence':sentence,'clean_sentence_tokens':clean_sentence_tokens,
                        'speakerid':speakerid,'gender':gender,'age':age,'corpus':corpus,'nativespeaker':nativespeaker,'region':region,'date':date}

                utts.append(utt)

        except Exception as err:
            print('Error in file, omitting', myid)
            print(err)

    #Sort utterances by date
    utts = sorted(utts,key=lambda utt:utt['date'])

    for utt in utts:    
        utt['kaldi_id'] = utt['speakerid']+'_'+utt['id']

    #Filter utterances with repeat in file name (recording was repeated after a wrong utterance)
    #utts = filterRepeatUtterances(utts)

    return utts,utts_phoneme_dict
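
The getDateFromID helper is not shown in any of the examples; sorting by date works only because TUDA corpus file names start with a recording timestamp. A hypothetical implementation under that assumption follows, guessing the exact format from typical TUDA names such as 2015-01-27-11-31-32_Kinect-Beam.

import datetime
import os

def getDateFromID(myid):
    #assumes the basename starts with a 'YYYY-MM-DD-HH-MM-SS' timestamp (19 characters)
    basename = os.path.basename(myid)
    return datetime.datetime.strptime(basename[:19], '%Y-%m-%d-%H-%M-%S')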