# script you can see which phonemes are actually used in your database. from label_func import parse_transcript from data_functions import list_files import codecs datapath="/scratch/danny/CGN/data/annot/text/awd/comp-o/nl" # path to save phones.txt. Your phones should end up in your # kaldi projects data/local/lang folder phones_loc="/scratch/danny/kaldi/egs/myexp/data/local/lang/" # silence phone, change if you want a different silence phoneme ditto for out of vocab sil= 'sil' oov ='<oov>' pattern= "N[0-9]+_FON" # list input files and sort (not necesarry but it might make it easier to manually # look in your files if something goes wrong) input_files = list_files(datapath) input_files.sort() # remove durational info, headers etc. trans=[parse_transcript(pattern,datapath+"/"+y) for y in input_files] # extract all phonetic transcribed words phon_trans=[] # for each transcript in the list trans for tr in trans: # pick with step size 3 (skip durational info) for x in range (2,len(tr),3): # split the transcript (awd transcripts should already be split but somehow in CGN some # sentences are not properly split yet) split_tr = tr[x].replace('_',' ').split() # add the words to the word list for phone in split_tr:
# Ordered (old, new) substitutions applied to every orthographic utterance.
# Order matters: ' & ' must be handled before '&', and '1/2' before '/'.
# The original comment states this list MUST stay the same as the one in
# kaldi_data_train.py.
_TOKEN_REPLACEMENTS = (
    (u'\xb1', u'plusminus'),
    (u'\xd7', u''),
    (u'\xb3', u''),
    (u'–', u' '),
    (u'$', u'dollar'),
    (u'%', u'procent'),
    (u' & ', u' en '),
    (u'&', u'en'),
    (u'\x90', u' '),
    (u'\x91', u' '),
    (u'\x92', u' '),
    (u'\x93', u' '),
    (u'\x94', u' '),
    (u'\x95', u' '),
    (u'\x96', u' '),
    (u'\x97', u' '),
    (u'\x98', u' '),
    (u'\x99', u' '),
    (u'\xbd', u''),
    (u'\xff', u''),
    (u'\u2663', u''),
    (u'\u2666', u''),
    (u'\u2660', u''),
    (u'\u2665', u''),
    (u'\xb9', u''),
    (u'\xb2', u''),
    (u'\u2070', u''),
    (u'\u2079', u''),
    (u'\u2074', u''),
    (u'\u0660', u''),
    (u'\u2075', u''),
    (u'\u2071', u''),
    (u'\u2072', u''),
    (u'\u2073', u''),
    (u'\u2076', u''),
    (u'\u2077', u''),
    (u'\u2078', u''),
    (u'\u2792', u''),
    (u'\u2082', u''),
    (u'1/2', u'half'),
    (u'/', u' '),
    (u'~', u''),
)

# Characters that glue onto the preceding phoneme symbol instead of starting
# a new one (two-character phonemes). Hard coded: extend this when using a
# database with different phoneme symbols.
_PHONE_MODIFIERS = ':+~'


def _normalise_orthography(text):
    """Apply the special-token substitutions to one orthographic utterance."""
    for old, new in _TOKEN_REPLACEMENTS:
        text = text.replace(old, new)
    return text


def _split_phones(phone):
    """Insert a space before every symbol of *phone* except the first one and
    except modifier characters, which stay attached to the previous symbol
    (e.g. 'a:b' -> 'a: b')."""
    return ''.join(
        ch if idx == 0 or ch in _PHONE_MODIFIERS else ' ' + ch
        for idx, ch in enumerate(phone)
    )


def create_lexicon(datapath, oov):
    """Build a pronunciation lexicon from the transcripts in *datapath*.

    Every file listed by ``list_files(datapath)`` is parsed twice with
    ``parse_transcript``: once with the orthographic pattern and once with
    the phonetic pattern. Words and their phonetic transcriptions are paired
    by position, cleaned, and collected.

    Parameters
    ----------
    datapath : str
        Directory containing the transcript files.
    oov : indexable
        ``oov[0]`` is used as the lexicon key and ``oov[1]`` as its
        pronunciation for every word containing 'ggg', 'xxx' or 'Xxx'
        (unintelligible speech according to the original comments).

    Returns
    -------
    dict
        Maps each word (lower-cased) to a list of its distinct
        pronunciations, each a string of space separated phonemes, in
        first-seen order.
    """
    # pattern to retrieve the orthographic transcript part
    pattern = "N[0-9]+"
    # pattern to retrieve the phonetic transcript part
    pattern2 = "N[0-9]+_FON"
    striplist = '._,?!=\"'

    input_files = sorted(list_files(datapath))
    # remove durational info, headers etc.
    word_trans = [parse_transcript(pattern, datapath + "/" + y) for y in input_files]
    phone_trans = [parse_transcript(pattern2, datapath + "/" + y) for y in input_files]

    # Collect (word, phones) pairs. Keeping them together as tuples guarantees
    # that later filtering cannot shift words relative to pronunciations.
    pairs = []
    for word, phone in zip(word_trans, phone_trans):
        # every third entry starting at index 2 holds the transcript text
        for x in range(2, len(word), 3):
            word_tokens = _normalise_orthography(word[x]).split()
            phone_tokens = phone[x].split()
            # if words and phones differ in count, split the phoneme
            # transcript on '_' as well (some are not properly separated)
            if len(word_tokens) != len(phone_tokens):
                phone_tokens = [p for q in phone_tokens for p in q.split('_')]
            for z, token in enumerate(word_tokens):
                # do NOT lower case the phonemes, A and a are different phonemes
                pairs.append((token.strip(striplist).lower(),
                              phone_tokens[z].strip(striplist)))

    ref = {}
    seen = set()
    for entry_word, entry_phones in pairs:
        # Drop silence / underscore-only entries. '_' is part of striplist, so
        # such tokens are already empty here. The pair is dropped as a whole
        # so the remaining words stay aligned with their pronunciations.
        if not entry_word or not entry_phones:
            continue
        # remove * annotations of cgn (e.g. *v for foreign words)
        if '*' in entry_word:
            entry_word = entry_word[:-2]
        # ggg, xxx and Xxx transcribe unintelligible parts; map them to the
        # out of vocab token. Hard coded, add your own exceptions if needed.
        if 'ggg' in entry_word or 'xxx' in entry_word or 'Xxx' in entry_word:
            entry = (oov[0], oov[1])
        else:
            # kaldi wants spaces between each phone
            entry = (entry_word, _split_phones(entry_phones))
        # remove doubles while keeping a deterministic, first-seen order
        if entry not in seen:
            seen.add(entry)
            ref.setdefault(entry[0], []).append(entry[1])
    return ref
@author: danny """ # this script reads the alignment files from kaldi and extracts # phonemes and their start and end times, to create a transcript # simmilar to the CGN awd files. import codecs from data_functions import list_files from label_func import parse_transcript # location of your split alignment files created by splitAlignments.py file_loc = "/scratch/danny/kaldi/egs/myexp/exp/tri4a_ali/split_ali/" # we need to retrieve the end times of the CGN files, as we gave kaldi only non # silence utterances. Most files end in silence so we need to fill the ends of the files. cgntrans_loc = "/scratch/danny/CGN/data/annot/text/awd/comp-o/nl/" input_files = list_files(cgntrans_loc) input_files.sort() end_time = [] for x in input_files: pattern = '"N[0-9]+_SEG"' cgntrans = parse_transcript(pattern, cgntrans_loc + x) end_time.append(round(float(cgntrans[-2]), 4)) # list all input files input_files = list_files(file_loc) input_files.sort() # set a counter to retrieve proper end times for the files count = 0 # read the data we need from the alignment files. for f in input_files: file = codecs.open(file_loc + f)
def proc_data(pattern, f_ex, params, l_path, d_path, conv_table):
    """Extract features and frame labels for every audio/transcript pair and
    append them to an HDF5 file.

    Parameters
    ----------
    pattern :
        Passed unchanged to ``parse_transcript`` for every label file.
    f_ex :
        Passed unchanged to ``check_files`` together with the file lists.
    params : sequence
        ``params[0]``, ``params[1]`` and ``params[4]`` are forwarded to
        ``get_mfcc``; ``params[1]`` also determines the feature width
        ``(params[1] + 1) * 3``; ``params[2]`` and ``params[3]`` are
        multiplied by the sampling frequency to get window and frame shift
        sizes in samples; ``params[5]`` is the output file name without the
        '.h5' extension.
    l_path : str
        Directory containing the transcript (label) files.
    d_path : str
        Directory containing the audio files.
    conv_table :
        Passed to ``phoneme_dict`` to build the conversion dictionary.

    Returns
    -------
    tuple
        ``(mfcc, l_data)`` of the last processed file, or ``(None, None)``
        when no file was processed (file lists do not match or are empty).
    """
    # get sorted lists of audio and transcript files
    audio_files = sorted(d_path + "/" + x for x in list_files(d_path))
    label_files = sorted(l_path + "/" + x for x in list_files(l_path))

    # Defined up front so the return statement is valid even when nothing
    # gets processed.
    mfcc = None
    l_data = None

    # create h5 file for the processed data
    data_file = tables.open_file(params[5] + '.h5', mode='a')
    try:
        # create pytable atoms
        feature_shape = (params[1] + 1) * 3
        f_atom = tables.Float64Atom()
        # N.B. label size is hard coded. It provides phoneme and 7
        # articulatory feature labels
        l_atom = tables.StringAtom(itemsize=5)
        # create a feature and label group branching off the root node
        features = data_file.create_group("/", 'features')
        labels = data_file.create_group("/", 'labels')
        # create a dictionary from the conv table
        cgndict = phoneme_dict(conv_table)

        # check if the audio and transcript files match
        if check_files(audio_files, label_files, f_ex):
            for x, audio_file in enumerate(audio_files):
                print('processing file ' + str(x))
                # Node name is cut from the audio path: the 8 characters in
                # front of the last 4 (presumably the extension - confirm).
                node_name = audio_file[-12:-4]
                # create new leaf nodes in the feature and label groups for
                # every audio file
                f_table = data_file.create_earray(
                    features, node_name, f_atom, (0, feature_shape),
                    expectedrows=100000)
                l_table = data_file.create_earray(
                    labels, node_name, l_atom, (0, 8), expectedrows=100000)

                # read audio samples; first element is the sampling frequency
                input_data = read(audio_file)
                fs = input_data[0]
                # get window and frameshift size in samples
                s_window = int(fs * params[2])
                s_shift = int(fs * params[3])

                # create mfccs
                [mfcc, frame_nrs] = get_mfcc(input_data, params[0], params[1],
                                             s_window, s_shift, params[4])

                # read the transcript and convert the phoneme transcript to
                # an articulatory feature transcript
                trans = parse_transcript(pattern, label_files[x])
                l_trans = label_transcript(trans, fs, cgndict)
                nframes = mfcc.shape[0]
                # label frames using the labelled transcript
                l_data = numpy.array(label_frames(nframes, l_trans, s_shift))

                # append new data to the tables
                f_table.append(mfcc)
                l_table.append(l_data)
        else:
            print('audio and transcript files do not match')
    finally:
        # close the output file exactly once, also when processing fails
        data_file.close()
    return (mfcc, l_data)