# with this script you can see which phonemes are actually used in your database.

from label_func import parse_transcript
from data_functions import list_files
import codecs
datapath="/scratch/danny/CGN/data/annot/text/awd/comp-o/nl"
# path to save phones.txt. Your phones should end up in your kaldi project's
# data/local/lang folder
phones_loc="/scratch/danny/kaldi/egs/myexp/data/local/lang/"
# silence phone; change this if you want a different silence phoneme, ditto
# for the out of vocab token
sil = 'sil'
oov = '<oov>'
pattern= "N[0-9]+_FON"
# list input files and sort (not necessary, but it might make it easier to
# manually look through your files if something goes wrong)
input_files = list_files(datapath)
input_files.sort()
# remove durational info, headers etc.
trans=[parse_transcript(pattern,datapath+"/"+y) for y in input_files]

# extract all phonetic transcribed words
phon_trans=[]
# for each transcript in the list trans
for tr in trans:
    # the parsed transcript is a flat list of (start, end, text) triples, so
    # take every third element (skipping the durational info)
    for x in range(2, len(tr), 3):
        # split the transcript (awd transcripts should already be split, but
        # somehow in CGN some sentences are not properly split yet)
        split_tr = tr[x].replace('_', ' ').split()
        # add the words to the word list
        for phone in split_tr:
            phon_trans.append(phone)
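# hypothetical completion (the original tail of this example is missing):
# collect the unique phonemes and write them to phones.txt, following the goal
# stated at the top of the script. This reuses the hard coded 2-character
# phonemes (+, ~ and :) that create_lexicon below also relies on
unique_phones = set()
for word in phon_trans:
    y = 0
    while y < len(word):
        # a phoneme spans 2 characters when followed by :, + or ~
        if y + 1 < len(word) and word[y + 1] in ':+~':
            unique_phones.add(word[y:y + 2])
            y += 2
        else:
            unique_phones.add(word[y])
            y += 1
# write the phones one per line, with the silence and out of vocab phones
# first (assumed layout)
with codecs.open(phones_loc + 'phones.txt', 'w', 'utf-8') as out:
    out.write(sil + '\n' + oov + '\n')
    for ph in sorted(unique_phones):
        out.write(ph + '\n')
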
def create_lexicon(datapath,oov):
    # pattern to retrieve the orthographic transcript part
    pattern = "N[0-9]+"
    # pattern to retrieve the phonetic transcript part
    pattern2 = "N[0-9]+_FON"
    input_files = list_files(datapath)
    input_files.sort()
    striplist = '._,?!=\"'    
    # remove durational info headers etc.
    word_trans=[parse_transcript(pattern,datapath+"/"+y) for y in input_files]
    phone_trans=[parse_transcript(pattern2,datapath+"/"+y) for y in input_files]  
    phones=[]
    words=[]    
    # extract words and phoneme transcriptions. strip punctuation and convert
    # to lower case.    
    for word, phone in zip(word_trans, phone_trans):
        for x in range (2,len(word),3):
            # replace some special tokens either with words (e.g. % with
            # procent) or with whitespace. This list MUST be the same as in
            # kaldi_data_train.py
            replacements = [(u'\xb1', u'plusminus'), (u'\xd7', u''), (u'\xb3', u''),
                            (u'–', u' '), (u'$', u'dollar'), (u'%', u'procent'),
                            (u' & ', u' en '), (u'&amp', u'en'), (u'&', u'en'),
                            (u'\x90', u' '), (u'\x91', u' '), (u'\x92', u' '),
                            (u'\x93', u' '), (u'\x94', u' '), (u'\x95', u' '),
                            (u'\x96', u' '), (u'\x97', u' '), (u'\x98', u' '),
                            (u'\x99', u' '), (u'\xbd', u''), (u'\xff', u''),
                            (u'\u2663', u''), (u'\u2666', u''), (u'\u2660', u''),
                            (u'\u2665', u''), (u'\xb9', u''), (u'\xb2', u''),
                            (u'\u2070', u''), (u'\u2079', u''), (u'\u2074', u''),
                            (u'\u0660', u''), (u'\u2075', u''), (u'\u2071', u''),
                            (u'\u2072', u''), (u'\u2073', u''), (u'\u2076', u''),
                            (u'\u2077', u''), (u'\u2078', u''), (u'\u2792', u''),
                            (u'\u2082', u''), (u'1/2', u'half'), (u'/', u' '),
                            (u'~', u'')]
            # apply the replacements in order (the order matters, e.g. 1/2
            # must become half before / is turned into a space)
            for old, new in replacements:
                word[x] = word[x].replace(old, new)
            word[x] = word[x].split()
            phone[x] = phone[x].split()
            # if words and phones differ in length, split the phoneme
            # transcript on underscores (some underscores are not properly
            # separated in CGN, it seems)
            if len(word[x]) != len(phone[x]):
                phone[x] = [p for q in phone[x] for p in q.split('_')]
            for z in range(0,len(word[x])):
                words.append(str.lower(word[x][z].strip(striplist)))
                # do NOT lower case the phonemes, A and a are different phonemes
                phones.append(phone[x][z].strip(striplist))
    # remove silence and underscores (an underscore is used when 2 adjacent
    # words share a phoneme and it is unclear where the boundary lies)
    phones = [x for x in phones if len(x) > 0 and x != '_']
    words = [x for x in words if len(x) > 0 and x != '_']
    # remove * annotations of cgn (e.g. *v for foreign words)
    for x in range(0,len(words)):
        if '*' in words[x]:
            words[x] = words[x][:-2]    
    # create a lexicon, that is, tuples of words and their pronunciation in
    # phonemes
    lexicon = []
    for x in range(0, len(phones)):
        # kaldi wants spaces between the phones, so we need to split the
        # phonetic transcripts
        splitphones = ''
        for y in range(0, len(phones[x])):
            # since some phonemes are 2 characters long we cannot simply split
            # between every character; e.g. 'a:' is one phoneme, so 'ma:n'
            # should become 'm a: n'. The exceptions are hard coded (this uses
            # +, ~ and :); when using a database with different phonemes you
            # might need to add your own
            if y == len(phones[x]) - 1:
                # do not check y+1 for the last character, as it would run
                # past the end of the string
                splitphones = splitphones + phones[x][y]
            elif phones[x][y + 1] == ':' or phones[x][y + 1] == '+' or phones[x][y + 1] == '~':
                splitphones = splitphones + phones[x][y]
            else:
                splitphones = splitphones + phones[x][y] + ' '
        # ggg, xxx and Xxx are used to transcribe unintelligible parts; we
        # replace these with an out of vocab token. Again hard coded, you
        # might need to add your own exceptions here
        if not 'ggg' in words[x] and not 'xxx' in words[x] and not 'Xxx' in words[x]:
            lexicon.append((words[x], splitphones))
        else:
            lexicon.append((oov[0], oov[1]))
    # remove duplicates
    lexicon = list(set(lexicon))
    # put the lexicon in dictionary form to map multiple pronunciations onto
    # one orthographic form
    ref = {}
    for x in lexicon:
        try:
            ref[x[0]].append(x[1])
        except KeyError:
            ref[x[0]] = list()
            ref[x[0]].append(x[1])
    return ref
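create_lexicon returns a dictionary that maps each orthographic word to a list
of possible pronunciations. A minimal usage sketch (the output file name is a
placeholder, and since oov is indexed as oov[0] and oov[1] above it is assumed
to be a (word, pronunciation) tuple):

import codecs

# hypothetical call; the path and the oov tuple are placeholders
ref = create_lexicon("/scratch/danny/CGN/data/annot/text/awd/comp-o/nl",
                     ('<oov>', 'oov'))
# write one "word phone phone ..." line per pronunciation, the format kaldi
# expects in data/local/lang/lexicon.txt
with codecs.open("lexicon.txt", 'w', 'utf-8') as out:
    for word in sorted(ref):
        for pron in ref[word]:
            out.write(word + ' ' + pron + '\n')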
Example #3
"""
@author: danny
"""
# this script reads the alignment files from kaldi and extracts phonemes and
# their start and end times, to create a transcript similar to the CGN awd
# files.
import codecs
from data_functions import list_files
from label_func import parse_transcript
# location of your split alignment files created by splitAlignments.py
file_loc = "/scratch/danny/kaldi/egs/myexp/exp/tri4a_ali/split_ali/"

# we need to retrieve the end times of the CGN files, as we gave kaldi only
# non-silence utterances. Most files end in silence, so we need to fill in the
# file ends.
cgntrans_loc = "/scratch/danny/CGN/data/annot/text/awd/comp-o/nl/"
input_files = list_files(cgntrans_loc)
input_files.sort()

end_time = []
for x in input_files:
    pattern = '"N[0-9]+_SEG"'
    cgntrans = parse_transcript(pattern, cgntrans_loc + x)
    end_time.append(round(float(cgntrans[-2]), 4))
# list all input files
input_files = list_files(file_loc)
input_files.sort()
# set a counter to retrieve proper end times for the files
count = 0
# read the data we need from the alignment files.
for f in input_files:
    file = codecs.open(file_loc + f)
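    # hypothetical continuation (the original tail of this example is
    # missing). The exact line format produced by splitAlignments.py is not
    # shown in this excerpt, so only the generic steps are sketched: read and
    # split the alignment lines, close the file and advance the counter so
    # that end_time[count] tracks the current file
    ali_lines = [l.split() for l in file.readlines() if l.strip()]
    file.close()
    count += 1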
Example #4
# imports this function needs; list_files and parse_transcript come from the
# same helper modules as in the examples above, but the modules that provide
# check_files, phoneme_dict, get_mfcc, label_transcript and label_frames are
# not shown in this excerpt
import numpy
import tables
from scipy.io.wavfile import read
from data_functions import list_files
from label_func import parse_transcript


def proc_data(pattern, f_ex, params, l_path, d_path, conv_table):
    # get list of audio and transcript files
    audio_files = [d_path + "/" + x for x in list_files(d_path)]
    audio_files.sort()

    label_files = [l_path + "/" + x for x in list_files(l_path)]
    label_files.sort()

    # create h5 file for the processed data
    data_file = tables.open_file(params[5] + '.h5', mode='a')

    # create pytable atoms
    feature_shape = (params[1] + 1) * 3
    f_atom = tables.Float64Atom()
    # N.B. the label size is hard coded: each frame gets a phoneme label plus
    # 7 articulatory feature labels
    l_atom = tables.StringAtom(itemsize=5)
    # create a feature and a label group branching off the root node
    features = data_file.create_group("/", 'features')
    labels = data_file.create_group("/", 'labels')
    # create a dictionary from the conv table
    cgndict = phoneme_dict(conv_table)

    # check if the audio and transcript files match
    if check_files(audio_files, label_files, f_ex):

        for x in range(0, len(audio_files)):
            print('processing file ' + str(x))
            # create new leaf nodes in the feature and label groups for every
            # audio file
            f_table = data_file.create_earray(features,
                                              audio_files[x][-12:-4],
                                              f_atom, (0, feature_shape),
                                              expectedrows=100000)
            l_table = data_file.create_earray(labels,
                                              audio_files[x][-12:-4],
                                              l_atom, (0, 8),
                                              expectedrows=100000)

            # read audio samples
            input_data = read(audio_files[x])
            # sampling frequency
            fs = input_data[0]
            # get window and frameshift size in samples
            s_window = int(fs * params[2])
            s_shift = int(fs * params[3])

            # create mfccs
            [mfcc, frame_nrs] = get_mfcc(input_data, params[0], params[1],
                                         s_window, s_shift, params[4])

            # read the data transcript
            trans = parse_transcript(pattern, label_files[x])
            # convert phoneme transcript to articulatory feature transcript
            l_trans = label_transcript(trans, fs, cgndict)
            nframes = mfcc.shape[0]
            # label frames using the labelled transcript
            l_data = numpy.array(label_frames(nframes, l_trans, s_shift))

            # append new data to the tables
            f_table.append(mfcc)
            l_table.append(l_data)
    else:
        print('audio and transcript files do not match')
    # close the output file
    data_file.close()
    return (mfcc, l_data)
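A hedged usage sketch of proc_data, inferred purely from how the arguments are
indexed inside the function: params[0] and params[4] are handed straight to
get_mfcc, params[1] looks like the number of cepstral coefficients (the
feature shape is (params[1] + 1) * 3), params[2] and params[3] are the window
and frameshift in seconds, and params[5] names the output .h5 file.

# hypothetical call; every value and path below is a placeholder
params = [40, 12, 0.025, 0.01, 0.97, 'mfcc_features']
pattern = "N[0-9]+_FON"
mfcc, l_data = proc_data(pattern, '.wav', params,
                         '/scratch/danny/CGN/data/annot/text/awd/comp-o/nl',
                         '/scratch/danny/CGN/data/audio/wav/comp-o/nl',
                         'cgn_conversion_table.txt')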