def get_question_sets(context_skeleton, qformat, fit_contexts=False, contexts_to_fit=None, HHEd_fix=False):
  #Should be unnecessary due to argparse, but just to be sure.
  if qformat not in ["Nitech_NN", "HMM", "CSTR_NN"]:
    raise SiReError("Invalid question format ({0})! Must be either HMM, Nitech_NN or CSTR_NN!".format(qformat))
  c = context_skeleton
  c_utt = copy.deepcopy(context_skeleton)
  questions = []
  if fit_contexts == True:
    #First we obtain a dict containing a list of all used values for each context feature.
    for context in contexts_to_fit:
      for key in context.added_contexts:
        c.add_multiple(key, context.added_contexts[key])
        #Check if this should be in the GV context set
        if getattr(c, key) and "utt" in getattr(c, key):
          c_utt.add_multiple(key, context.added_contexts[key])
    #Then we create questions based on these
    qs = make_questions(c, qformat, False, HHEd_fix)
    q_utt = make_questions(c_utt, qformat, False, HHEd_fix, True)
    return (qs, q_utt)
  else:
    raise SiReError("Not implemented yet! (Not fitting contexts for question set.)")
def reduce_word_tuples(words, score_file, reduction_level):
  #Our initial assumption is nothing needs reduction
  w_l = [[word, False] for word in words]
  #If we don't reduce we just return all unreduced
  if reduction_level == 1.0:
    return w_l
  #If the reduction_level is not between 0 and 1 we fail
  elif reduction_level > 1.0 or reduction_level < 0.0:
    raise SiReError("Reduction level must be between 1.0 and 0.0 but was {0}".format(reduction_level))
  #As words may appear more than once we make a dict indexed on word pos.
  scores = {}
  for i, x in enumerate(open(score_file, "r").readlines()):
    scores[i] = x.strip().split()
  if len(scores) != len(words):
    raise SiReError("I seem to have a mismatching set of words ({0}) and LM scores ({1}) for {2}".format(len(words), len(scores), score_file))
  #The number of words to reduce
  n_to_reduce = int(round(len(words) * (1 - reduction_level), 0))
  #A list of dict entry tuples ordered by score in ascending order (lowest LM score first).
  #We compare scores numerically; comparing the raw strings would order e.g. "-10.2" before "-2.1".
  ranked = sorted(scores.items(), key=lambda (k, v): float(v[1]))
  #Now mark the appropriate ones to be reduced
  for i in range(n_to_reduce):
    w_l[ranked[i][0]][1] = True
  return w_l
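#Usage sketch (hypothetical data, not from the repo). With three words and a
#score file holding one "word score" line per word, e.g. scores -1.2, -3.4
#and -2.0, reduction_level=0.5 gives n_to_reduce = round(3 * 0.5) = 2, so the
#two lowest-scoring words come back marked for reduction:
#  >>> reduce_word_tuples(["the", "cat", "sat"], "scores.txt", 0.5)
#  [['the', False], ['cat', True], ['sat', True]]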
def to_relational(pos, mpos, fw, accept_xx=False):
  if accept_xx:
    if pos == "xx" or mpos == "xx":
      return 0.0
  if pos > mpos:
    raise SiReError("Position ({0}) is above max position ({1}). Should not happen!".format(pos, mpos))
  #To avoid dividing with 0
  if mpos == 0:
    #This is technically correct but we could consider using 0.01
    return 1.0
  if fw == True:
    p = 1 - round(float(pos) / float(mpos), 2)
    #0.0 is reserved for pause segments
    if p == 0.0:
      p = 0.01
    return p
  elif fw == False:
    p = round(float(pos) / float(mpos), 2)
    #0.0 is reserved for pause segments
    if p == 0.0:
      p = 0.01
    return p
  else:
    raise SiReError("FW/BW unspecified! Please use bool!")
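#Worked example (the values follow from the arithmetic above):
#  >>> to_relational(1, 4, True)   #forward: 1 - 1/4
#  0.75
#  >>> to_relational(1, 4, False)  #backward: 1/4
#  0.25
#  >>> to_relational(4, 4, True)   #0.0 is reserved for pause segments
#  0.01
#  >>> to_relational("xx", "xx", True, accept_xx=True)
#  0.0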
def check_value(context_skeleton, variable_name, value):
  #Is the value of the correct type?
  attr = getattr(context_skeleton, variable_name)
  if attr == None:
    if variable_name in ["start", "end"]:
      try:
        int(value)
        return True
      except (ValueError, TypeError):
        pass
  elif "float" in attr:
    try:
      float(value)
      return True
    except ValueError:
      if "xx" in attr and value == "xx":
        return True
  elif attr == "bool":
    if isinstance(value, str):
      return True
  elif "int" in attr:
    try:
      int(value)
      return True
    except ValueError:
      if "xx" in attr and value == "xx":
        return True
  else:
    raise SiReError("Unknown attribute type ({0})!".format(attr))
  raise SiReError("Value ({0}) is not valid for variable type ({1}) variable ({2}) in \n {3}".format(value, attr, variable_name, context_skeleton))
def load_stanford_pcfg_parse(utt, parse, comma_is_pause=False):
  if utt.words == None:
    raise SiReError("No words in utterance! Please load an mlf or txt file first!")
  tree = parsetrees.stanfordPcfgTree()
  tree.make_tree(parse)
  if comma_is_pause == True:
    leafs = tree.get_leafs(include_punct=[","])
  else:
    leafs = tree.get_leafs()
  num_w = utt.num_words_no_pau(comma_is_pause)
  if len(leafs) != num_w:
    #First we try to see if this is due to differences in how words are
    #dealt with in parsing and annotation.
    #Prime example is using 's in e.g. there's for transcription instead of there is.
    #Parsing splits there's into two whereas in e.g. combilex there's is one word.
    #If this is the case we split the WORD into two with the 's being a single phoneme
    #single syllable word. In other cases the contraction straddles two words and
    #we add a "phony" word which affects contexts but adds no phonemes.
    utterance_utils.try_split_words(utt)
    #Update num_w
    num_w = utt.num_words_no_pau(comma_is_pause)
    if len(leafs) != num_w:
      for w in utt.words:
        print w.id
      raise SiReError("Number of leaves ({0}) not equal to number of words ({1})! In utt ({2})!".format(len(leafs), num_w, utt.id))
  #Match each word with parse
  words = utt.get_words_no_pau(comma_is_pause)
  for i, word in enumerate(words):
    l = leafs[i].label.split("-")
    word.id = l[1]
    word.pos = l[0]
    #There should always be a parent
    word.parent_phrase = leafs[i].parent
    #But there might not be more than one
    if word.parent_phrase.parent != None:
      word.grandparent_phrase = word.parent_phrase.parent
    else:
      word.grandparent_phrase = parsetrees.get_fake_stanford_pcfg_parse()
    #And certainly we might be done here
    if word.grandparent_phrase.parent in [None, "xx"] or word.grandparent_phrase.parent.label == "xx":
      word.greatgrandparent_phrase = parsetrees.get_fake_stanford_pcfg_parse()
    else:
      word.greatgrandparent_phrase = word.grandparent_phrase.parent
  #Now add fake parse for sil, pau and #
  for word in utt.words:
    if word.id in utt.phoneme_features.get_sil_phonemes():
      word.parent_phrase = parsetrees.get_fake_stanford_pcfg_parse()
      word.grandparent_phrase = parsetrees.get_fake_stanford_pcfg_parse()
      word.greatgrandparent_phrase = parsetrees.get_fake_stanford_pcfg_parse()
      word.pos = "sil"
def add(self, v_name, value):
  if hasattr(self, v_name):
    if context_utils.check_value(self, v_name, value):
      if v_name not in self.added_contexts:
        self.added_contexts[v_name] = value
      else:
        raise SiReError("Tried to add a context ({0} - new value: {1}) which already has a value ({2}).".format(v_name, value, self.added_contexts[v_name]))
  else:
    raise SiReError("Tried to add context ({0}) which does not exist in skeleton!".format(v_name))
def merge_hvite_state_with_sp_align_mlf(state_labs, phone_labs):
  """
  Create a state-alignment based mlf for use in e.g. Neural Network systems
  that rely on HMM state-alignments.

  Input:
    state_labs - MLF from an HVite alignment output at the state-level not
                 containing short pause (SP) and syllable stress/boundary markers.
    phone_labs - MLF suitable as input for a HVite alignment at the phoneme level
                 containing short pause (SP) and syllable stress/boundary markers.
                 This is equivalent to the sp.mlf produced by make_lattices_and_mlfs.py.
  Output:
    merged_mlf - MLF in HVite state-level alignment format as if SP and syllable
                 stress/boundary information had initially existed (given 0 duration).
  """
  if len(state_labs) != len(phone_labs):
    raise SiReError("Number of state align labs ({0}) not equal to the number of phone align labs ({1})!".format(len(state_labs), len(phone_labs)))
  #Sort them by name
  state_labs = sorted(state_labs, key=lambda name: name[0])
  phone_labs = sorted(phone_labs, key=lambda name: name[0])
  #Prep out list
  merged = []
  for i, s_lab in enumerate(state_labs):
    p_lab = phone_labs[i]
    #Check we have the correct labels to merge
    if s_lab[0] != p_lab[0]:
      raise SiReError("The labels are not from the same files! State lab = {0}, context lab = {1}".format(s_lab[0], p_lab[0]))
    #Do the merging
    s_lab.pop(0)
    s_lab_count = 0
    c_merge = [p_lab.pop(0)]
    for line in p_lab:
      c_p = []
      s_phone = s_lab[s_lab_count][0][-1]
      if line[-1] != s_phone:
        if line[-1] in ["#1", ".", "#2", "sp"]:
          c_p.append(["0", "0", "s2", "FAKE", line[-1], "FAKE", line[-1]])
          c_p.append(["0", "0", "s3"])
          c_p.append(["0", "0", "s4"])
          c_p.append(["0", "0", "s5"])
          c_p.append(["0", "0", "s6"])
          c_merge.append(c_p)
        else:
          print s_phone
          print line[-1]
          raise SiReError("Mismatch in phone content! Please check MLFs for lab {0}!".format(c_merge[0]))
      else:
        c_merge.append(s_lab[s_lab_count])
        s_lab_count += 1
    merged.append(c_merge)
  return merged
def parse_mlf(mlf, intype):
  if intype in ["align_mlf", "state_align_mlf"]:
    ext = ".rec"
  elif intype == "hts_mlf":
    ext = ".lab"
  else:
    raise SiReError("Don't know what to do with mlf of type - {0}".format(intype))
  #Remove mlf header
  mlf.pop(0)
  labs = []
  tmp = []
  end = len(mlf) - 1
  for i, l in enumerate(mlf):
    l = l.split()
    if ext in l[0]:
      if tmp == []:
        tmp.append(l[0].split("*/")[1].split(".")[0])
      else:
        if tmp[-1] == ["."]:
          tmp.pop(-1)
        labs.append(tmp)
        tmp = []
        tmp.append(l[0].split("*/")[1].split(".")[0])
    elif i == end:
      labs.append(tmp)
    else:
      tmp.append(l)
  #Collapse states
  if intype == "state_align_mlf":
    new_labs = []
    for lab in labs:
      n_lab = []
      tmp = []
      for i, line in enumerate(lab):
        #print line
        if i == 0:
          n_lab.append(line)
        elif line[2] == "s2":
          tmp.append(line)
        elif line[2] == "s6":
          #Append the state info
          tmp.append(line)
          if len(tmp) != 5:
            raise SiReError("Not enough states in phone! 5 expected but I got {0}! Please check format.\n{1}".format(len(tmp), tmp))
          n_lab.append(tmp)
          tmp = []
        else:
          tmp.append(line)
      new_labs.append(n_lab)
    labs = new_labs
  return labs
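#Input sketch (standard HTK MLF layout; the file content here is hypothetical):
#  #!MLF!#
#  "*/utt1.rec"
#  0 250000 sil
#  250000 450000 dh
#  .
#parsed with intype "align_mlf" becomes
#  [["utt1", ["0", "250000", "sil"], ["250000", "450000", "dh"]]]
#i.e. one list per lab, holding the file stem followed by the split lines.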
def make_words(utt):
  words = []
  word = {"id": "", "syllables": []}
  for i, s in enumerate(utt):
    #The check for len phonemes is necessary as the syll id is composed
    #of phoneme ids and all of "s", "i", "l" and "p" are valid ids.
    #Thus a syllable of the phonemes "s" and "p" has the id "sp".
    if s["id"] in ["sil", "sp"] and len(s["phonemes"]) == 1:
      if word["syllables"] != []:
        word["start"] = word["syllables"][0]["start"]
        word["end"] = word["syllables"][-1]["end"]
        words.append(word)
      #If the silence is of any length it should be kept.
      if s["end"] - s["start"] > 0:
        words.append({"id": s["id"], "syllables": [s], "start": s["start"], "end": s["end"]})
      elif i == 0 or i == len(utt) - 1:
        #Something is likely fishy
        raise SiReError("Boundary silence not of any length in word ({0})!".format(word))
      word = {"id": "", "syllables": []}
    else:
      word["syllables"].append(s)
      word["id"] += s["id"]
  return words
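#Shape sketch (hypothetical syllable dicts as produced by make_sylls below):
#syllables with ids ["sil", "h@", "loU", "sp"] become a "sil" word, one word
#with id "h@loU" spanning the two speech syllables, and an "sp" word (the
#pause entries only survive if they have non-zero duration).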
def open_labdir_line_by_line(path, dur_lab=False):
  l = os.listdir(path)
  labs = []
  if dur_lab == False:
    for i, lab in enumerate(l):
      if ".lab" in lab:
        tmp = [lab.split(".")[0]]
        tmp += [x.split() for x in open(os.path.join(path, lab), "r").readlines()]
        labs.append(tmp)
  elif dur_lab == True:
    for i, lab in enumerate(l):
      if ".dur" in lab:
        c_pos = 0
        tmp = [lab.split(".")[0]]
        for x in open(os.path.join(path, lab), "r").readlines():
          if ".state[" not in x:
            x = x.split()
            frames = int(x[-3].split("=")[1])
            tmp += [[str(c_pos * 50000), str((c_pos + frames) * 50000), x[0]]]
            c_pos += frames
        labs.append(tmp)
  else:
    raise SiReError("dur_lab must be boolean!")
  return labs
def load_stanford_dependency_parse(utt, parse):
  if utt.words == None:
    raise SiReError("No words in utterance! Please load an mlf or txt file first!")
  tree = parsetrees.stanfordDependencyTree()
  tree.make_tree(parse)
  #As each word is at a node not at a leaf we get the nodes.
  nodes = tree.get_nodes(utt_sorted=True)
  if len(nodes) != utt.num_words_no_pau():
    #First we try to see if this is due to differences in how words are
    #dealt with in parsing and annotation.
    #Prime example is using 's in e.g. there's for transcription instead of there is.
    #Parsing splits there's into two whereas in e.g. combilex there's is one word.
    #If this is the case we split the WORD into two with the 's being a single phoneme
    #single syllable word. In other cases the contraction straddles two words and
    #we add a "phony" word which affects contexts but adds no phonemes.
    utterance_utils.try_split_words(utt)
    if len(nodes) != utt.num_words_no_pau():
      for node in nodes:
        print node.label
      raise SiReError("Number of nodes ({0}) not equal to number of words ({1})! In utt ({2})!".format(len(nodes), utt.num_words_no_pau(), utt.id))
  #Match each word with parse
  for i, word in enumerate(utt.get_words_no_pau()):
    #As we may have split words the parse contains the id
    word.id = nodes[i].label
    #But as we may have punctuation the word itself contains the utt_pos
    nodes[i].utt_pos = word.pos_in_utt()
    #There should always be itself
    word.parent_dependency = nodes[i]
    #And there should always be a parent
    word.grandparent_dependency = word.parent_dependency.parent
    #But there might not be more than one
    if word.grandparent_dependency.parent != None:
      word.greatgrandparent_dependency = word.grandparent_dependency.parent
    else:
      word.greatgrandparent_dependency = parsetrees.stanfordDependencyTree()
  #Now add empty parse for sil, pau and #
  for word in utt.words:
    if word.id in utt.phoneme_features.get_sil_phonemes() + [","]:
      word.parent_dependency = parsetrees.stanfordDependencyTree()
      word.grandparent_dependency = parsetrees.stanfordDependencyTree()
      word.greatgrandparent_dependency = parsetrees.stanfordDependencyTree()
def is_phoneme(self, phoneme, fail=False):
  if phoneme in self.phonemes:
    return True
  else:
    if fail:
      raise SiReError("Phoneme ({0}) not a valid phoneme!".format(phoneme))
    return False
def add_multiple(self, v_name, value):
  if hasattr(self, v_name):
    if context_utils.check_value(self, v_name, value):
      if v_name not in self.added_contexts:
        self.added_contexts[v_name] = [value]
      else:
        self.added_contexts[v_name] += [value]
  else:
    raise SiReError("Tried to add context ({0}) which does not exist in skeleton!".format(v_name))
def merge_hvite_state_align_and_full_context_lab(state_align_labs, full_context_labs):
  """
  Create a state-alignment based full-context label for use in e.g. Neural
  Network systems that rely on HMM state-alignments.

  Input:
    state_align_labs - Labels from an HVite alignment output at the state-level.
    full_context_labs - Labels from any full-context method on the phoneme level
                        (any SiRe output contexts or standard HTS full-context labels).
  """
  if len(state_align_labs) != len(full_context_labs):
    raise SiReError("Number of align labs ({0}) not equal to the number of full context labs ({1})!".format(len(state_align_labs), len(full_context_labs)))
  #Prep out list
  merged = []
  #Sort them by name
  state_align_labs = sorted(state_align_labs, key=lambda name: name[0])
  full_context_labs = sorted(full_context_labs, key=lambda name: name[0])
  for i, lab in enumerate(state_align_labs):
    f_lab = full_context_labs[i]
    #Check we have the correct labels to merge
    if f_lab[0] != lab[0]:
      raise SiReError("The labels are not from the same files! State lab = {0}, context lab = {1}".format(lab[0], f_lab[0]))
    #Do the merging
    f_lab.pop(0)
    p = -1
    f_line = None
    c_merge = [lab.pop(0)]
    for l in lab:
      if len(l) == 7:
        p += 1
        f_line = f_lab[p][-1]
        c_merge.append([l[0], l[1], f_line + "[" + l[2][-1] + "]", f_line])
      elif len(l) == 4:
        c_merge.append([l[0], l[1], f_line + "[" + l[2][-1] + "]"])
      else:
        raise SiReError("Error in align lab line: {0}".format(l))
    merged.append(c_merge)
  return merged
def strfloatify(fl):
  #Just to make sure we deal with an int
  if type(fl) is not int:
    raise SiReError("Cannot strfloatify type {0}! Must be int!".format(type(fl)))
  #First do the division
  fl = float(fl) / 100
  #Round
  fl = round(fl, 2)
  #Return the float string
  return str(fl)
def strintify(fl):
  #Just to make sure we deal with a float
  if type(fl) is not float:
    raise SiReError("Cannot strintify type {0}! Must be float!".format(type(fl)))
  #First remove any leftovers and make sure we don't just floor
  fl = round(fl, 2)
  #Do the multiplication. We round due to issues with float arithmetic.
  fl = round(fl * 100, 0)
  #Return the int string
  return str(int(fl))
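#Round-trip sketch (the values follow from the arithmetic above):
#  >>> strintify(0.25)
#  '25'
#  >>> strfloatify(25)
#  '0.25'
#strintify and strfloatify invert each other for two-decimal floats, as used
#when writing float contexts into question sets (see make_hmm_relational_qs).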
def make_hmm_relational_qs(values, key, qtype):
  questions = []
  #Add xx question if appropriate else ignore
  if "xx" in values:
    if "xx" in qtype:
      questions.append("QS \"" + key + "-xx\" {*|" + key + ":xx|*}")
    else:
      raise SiReError("xx in values but not in qtype {0} for key {1} - why?".format(qtype, key))
    values.remove("xx")
  for i, val in enumerate(values):
    if "float" in qtype:
      val = strintify(float(val))
    questions.append("QS \"" + key + "-" + str(val) + "\" {*|" + key + ":" + str(val) + "|*}")
    #If val is more than one we make a less than question
    #If we count 0 then we start at 0
    if "0" in qtype:
      start = 0
    else:
      start = 1
    if int(val) > start:
      #Prep the less than string
      s = "QS \"" + key + "<=" + str(val) + "\" {"
      #Make the less than string
      #Get tens and remainder
      tens = int(val) / 10
      remainder = int(val) % 10
      if tens > 0:
        #Make singles
        for n in range(start, 10):
          s += "*|" + key + ":" + str(n) + "|*,"
        #Make tens
        for n in range(1, tens):
          s += "*|" + key + ":" + str(n) + "?|*,"
        for n in range(remainder + 1):
          if n != remainder:
            s += "*|" + key + ":" + str(tens) + str(n) + "|*,"
          else:
            s += "*|" + key + ":" + str(tens) + str(n) + "|*}"
        questions.append(s)
      else:
        #Just make singles
        for n in range(start, int(val) + 1):
          s += "*|" + key + ":" + str(n) + "|*"
          if n != int(val):
            s += ","
          else:
            s += "}"
        questions.append(s)
  return questions
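#Output sketch for a hypothetical key "sylls_in_word" with values ["2", "xx"]
#and a qtype containing "xx" but not "0" (so counting starts at 1):
#  QS "sylls_in_word-xx" {*|sylls_in_word:xx|*}
#  QS "sylls_in_word-2" {*|sylls_in_word:2|*}
#  QS "sylls_in_word<=2" {*|sylls_in_word:1|*,*|sylls_in_word:2|*}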
def remake_stops(lab):
  remove = []
  for i, l in enumerate(lab):
    if "_cl" in l[-1]:
      if lab[i + 1][-1] + "_cl" != l[-1]:
        raise SiReError("Closure not preceding release! In {0}".format(lab))
      else:
        #Extend the release segment back to the start of its closure.
        lab[i + 1][0] = l[0]
        remove.append(l)
  for r in remove:
    lab.remove(r)
  return lab
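#Merge sketch (hypothetical [start, end, phone] label lines):
#  >>> remake_stops([["0", "10", "t_cl"], ["10", "30", "t"]])
#  [['0', '30', 't']]
#The closure line is dropped and its start time is moved onto the release.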
def get_entries(self, word, punct_as_sil=None):
  try:
    return self.raw_dictionary_entries[word]
  except KeyError:
    #If this has underscores we try to pronounce each letter individually.
    if "_" in word:
      #The total phoneme string
      w_phon = ""
      for w in word.split("_"):
        #Get the entry
        ent = self.get_single_entry(w)
        #Get the phoneme string with syllable stress
        ent = self.get_entry_phonemes(ent, True)
        w_phon += " " + ent
      print "Warning! \"{0}\" looks like it should be pronounced {1} and is a proper noun. I'm doing that. Is it right?".format(word, w_phon)
      return [self.make_entry("nnp", w_phon.strip(), reduced=False)]
    elif punct_as_sil and word in punct_as_sil[0]:
      if punct_as_sil[1] in self.phoneme_feats.get_sil_phonemes():
        return [self.make_entry(punct_as_sil[1], punct_as_sil[1] + " 0")]
      else:
        raise SiReError("Cannot add punctuation {0} as silence as sil phoneme specified ({1}) is not valid! Must be in {2}.".format(word, punct_as_sil[1], self.phoneme_feats.get_sil_phonemes()))
    else:
      raise SiReError("Could not find \"{0}\" in dictionary! Please add it manually.".format(word))
def get_sire_general_pos(word):
  if word.pos in ["cd", "dt", "ex", "fw", "ls", "md", "pos", "rp", "uh", "sym", "sil"]:
    return word.pos
  elif word.id in ["is", "am", "are", "was", "were", "has", "have", "had", "be"]:
    #This is derived from the festival aux set which else would be verb here.
    return "aux"
  elif word.id in ["her", "his", "their", "its", "our", "mine"]:
    #This is derived from the festival pps set which else would be noun here.
    return "pps"
  elif word.pos in ["cc", "in", "to"]:
    return "conj"
  elif word.pos in ["jj", "jjr", "jjs"]:
    return "adj"
  elif word.pos in ["nn", "nns", "nnp", "nnps", "prp", "prp$"]:
    return "noun"
  elif word.pos == "pdt":
    return "dt"
  elif word.pos in ["rb", "rbr", "rbs"]:
    return "adv"
  elif word.pos in ["vb", "vbd", "vbg", "vbn", "vbp", "vbz"]:
    return "verb"
  elif word.pos in ["wdt", "wp", "wp$", "wrb"]:
    return "wh"
  elif word.pos in [".", ",", ":", ";", "\"", "'", "(", "?", ")", "!"]:
    return "punc"
  else:
    if word.pos == "content":
      raise SiReError("To do the SiRe pos tag generalisation you must not be doing simple_festival_pos_predict but use a proper tagger!")
    else:
      raise SiReError("Cannot categorise pos tag ({0})!".format(word.pos))
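#Mapping sketch (assumes Penn Treebank style tags in word.pos): a word with
#pos "nns" maps to "noun", "vbd" to "verb", "wrb" to "wh", and a word with id
#"is" returns "aux" before the verb rule can fire.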
def get_parent_general_relation(self):
  pr = self.parent_relation
  if pr in ["auxpass", "cop"]:
    #aux
    return "aux"
  elif pr in ["agent", "root", "dep", "aux", "arg", "obj", "subj", "cc", "conj", "expl", "mod", "parataxis", "punct", "ref", "sdep", "goeswith", "xsubj", "discourse"]:
    #nonreduced - discourse is not in the manual's hierarchy but should not be reduced
    return pr
  elif pr in ["acomp", "ccomp", "xcomp", "pcomp"]:
    #The stanford manual's hierarchy has forgotten pcomp but this should be the cat
    return "comp"
  elif pr in ["dobj", "iobj", "pobj"]:
    return "obj"
  elif pr in ["nsubj", "nsubjpass", "csubj", "csubjpass"]:
    return "subj"
  elif pr in ["amod", "appos", "advcl", "det", "predet", "preconj", "vmod", "mwe", "mark", "advmod", "neg", "rcmod", "quantmod", "nn", "npadvmod", "tmod", "num", "number", "prep", "poss", "possessive", "prt"]:
    return "mod"
  else:
    raise SiReError("Undefined parent_relation ({0}) for simplification! Are you using the stanford parser?".format(pr))
def get_text_utts(indir, compilexpath):
  txt = load_txt_dir(indir)
  dct = dictionary.Dictionary(compilexpath)
  oov = get_oov_words(txt, dct)
  if len(oov) != 0:
    print "Please remove all sents containing OOV words or add the words to the dictionary before proceeding."
    for w in oov:
      print w
    raise SiReError("OOV words present, cannot continue.")
  args.dictionary = dct
  args.intype = "txt"
  utts = get_utts(txt, args)
  return utts
def simple_festival_pos_predict(utt):
  if utt.txtloaded != True:
    raise SiReError("We cannot be sure that we know each word id correctly! It may just be phonemes strung together!")
  for word in utt.words:
    if word.id in ["of", "for", "in", "on", "that", "with", "by", "at", "from", "as", "if", "against", "about", "before", "because", "under", "after", "over", "into", "while", "without", "through", "new", "between", "among", "until", "per", "up", "down"]:
      word.pos = "in"
    elif word.id == "to":
      word.pos = "to"
    elif word.id in ["the", "a", "an", "no", "some", "this", "that", "each", "another", "those", "every", "all", "any", "these", "both", "neither", "many"]:
      word.pos = "det"
    elif word.id in ["will", "may", "would", "can", "could", "should", "must", "ought", "might"]:
      word.pos = "md"
    elif word.id in ["and", "but", "or", "plus", "yet", "nor"]:
      word.pos = "cc"
    elif word.id in ["who", "what", "where", "how", "when"]:
      word.pos = "wp"
    elif word.id in ["her", "his", "their", "its", "our", "mine"]:
      word.pos = "pps"
    elif word.id in ["is", "am", "are", "was", "were", "has", "have", "had", "be"]:
      word.pos = "aux"
    elif word.id in [".", ",", ":", ";", "\"", "'", "(", "?", ")", "!"]:
      word.pos = "punc"
    elif word.id in utt.phoneme_features.get_sil_phonemes():
      word.pos = "punc"
    else:
      word.pos = "content"
def simple_festival_accent_predict(utt):
  for word in utt.words:
    if is_festival_content(word.pos):
      if len(word.syllables) == 1:
        word.syllables[0].accent = 1
      else:
        for syll in word.syllables:
          if int(syll.stress) == 1:
            syll.accent = 1
          elif int(syll.stress) == 0 or int(syll.stress) == 2:
            syll.accent = 0
          else:
            raise SiReError("Syllable has invalid stress value ({0})!".format(syll.stress))
    else:
      for syll in word.syllables:
        syll.accent = 0
def dep_distance_in_arcs(n1, n2):
  #We include the node itself because, if it is a parent of the other node, we want to stop there.
  n1_parents = [n1]
  while n1.parent != None:
    n1_parents += [n1.parent]
    n1 = n1.parent
  n2_parents = [n2]
  while n2.parent != None:
    n2_parents += [n2.parent]
    n2 = n2.parent
  #The lowest common node is the first node in one list that also appears in the other.
  #This is quadratic but we're never going to have enough levels for it to matter.
  for i1, p1 in enumerate(n1_parents):
    for i2, p2 in enumerate(n2_parents):
      if p1 == p2:
        #We can just add them together because when i1/i2 == 0 it is the node itself.
        #So if p1 is the parent of p2, i1 == 0 and i2 is the number of levels between them.
        return i1 + i2
  print n1_parents
  print n2_parents
  raise SiReError("The two nodes are not in the same tree! Cannot find a distance!")
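#Distance sketch: in a dependency tree root -> verb -> {subj, obj} the arc
#distances come out as
#  dep_distance_in_arcs(subj, verb)  ->  1  (parent and child)
#  dep_distance_in_arcs(subj, obj)   ->  2  (siblings, meeting at verb)
#  dep_distance_in_arcs(subj, subj)  ->  0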
def make_sylls(utt):
  sylls = []
  syll = {"id": "", "stress": 0, "phonemes": []}
  for i, p in enumerate(utt):
    #. marks midword syll boundaries
    #sp marks word boundaries and possible silence segments
    #sil marks silence segments between words
    if p["id"] in [".", "sil", "sp"]:
      if len(syll["phonemes"]) > 0:
        syll["start"] = syll["phonemes"][0]["start"]
        syll["end"] = syll["phonemes"][-1]["end"]
        sylls.append(syll)
        syll = {"id": "", "stress": 0, "phonemes": []}
      #Sil and sp are also markers of word boundaries and
      #may be their own entity so should be kept
      if p["id"] in ["sil", "sp"]:
        sylls.append({"id": p["id"], "stress": 0, "phonemes": [p], "start": p["start"], "end": p["end"]})
    else:
      syll["phonemes"].append(p)
      syll["id"] += p["id"]
      if p["stress"] > 0:
        if syll["stress"] != 0:
          raise SiReError("Syllable ({0}) already stressed! In utt ({1})".format(syll, utt.id))
        syll["stress"] = p["stress"]
      if i == len(utt) - 1:
        syll["start"] = syll["phonemes"][0]["start"]
        syll["end"] = syll["phonemes"][-1]["end"]
        #If we're at the end of the utt the last syll is done
        sylls.append(syll)
  return sylls
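#Boundary sketch (hypothetical phoneme dicts with "id", "stress", "start" and
#"end" keys): the phoneme ids [h, @, ., l, oU, sp] yield the syllables "h@"
#and "loU" plus a kept "sp" entry; the midword "." boundary itself is dropped.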
def try_split_words(utt):
  l = len(utt.words)
  #We should not try this if we have not gotten the word.id from txt.
  if utt.txtloaded != True:
    raise SiReError("Cannot split words if word ids not loaded from txt.")
  #Note split_word rebinds utt.words to a new list, so iterating
  #the original list here is safe.
  for word in utt.words:
    #End of word 's
    if word.id[-2:] == "'s":
      split_word(word, -2)
    #Contracted are (e.g. we're)
    elif word.id[-3:] == "'re":
      split_word(word, -3)
    #Contracted not (e.g. don't)
    elif word.id[-3:] == "n't":
      split_word(word, -2)
    #Contracted will (e.g. it'll)
    elif word.id[-3:] == "'ll":
      split_word(word, -3)
    #Contracted have (e.g. I've)
    elif word.id[-3:] == "'ve":
      split_word(word, -3)
    #Contracted I am
    elif word.id == "i'm":
      split_word(word, -2)
    #Contracted would or had (e.g. she'd)
    elif word.id[-2:] == "'d":
      split_word(word, -2)
    #Contracted going to
    elif word.id == "gonna":
      split_word(word, -2)
    #Contracted can not
    elif word.id == "cannot":
      split_word(word, -3)
    #Contracted want to
    elif word.id == "wanna":
      split_word(word, -2)
  if l == len(utt.words):
    print "Warning: Nothing to split in utt."
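#Split sketch: "there's" becomes "there" + "'s", "don't" becomes "don" + "'t"
#and "gonna" becomes "gon" + "na". The actual splitting is done by split_word
#below, which also divides the syllables and phonemes between the new words.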
def load_txt(utt, txtpath, emphasis):
  txt = open(txtpath, "r").read()
  for x in ["!", ".", "?", ",", "--"]:
    txt = txt.replace(x, "")
  #We lower case because other methods use word name
  #and we don't care about case there.
  #If not using emphasis, lowercase like normal
  if not emphasis:
    txt = txt.lower()
  txt = txt.split()
  #If using emphasis, lower case all but words with two or more capitalised letters
  if emphasis:
    temp_txt = []
    upper_reg = re.compile(r'[A-Z][A-Z]+')
    for i in txt:
      if re.search(upper_reg, i) != None:
        temp_txt.append(i)
      else:
        i = i.lower()
        temp_txt.append(i)
    txt = temp_txt
  if len(txt) != utt.num_words_no_pau():
    for w in utt.words:
      print w.id
    print txt
    raise SiReError("Text length ({0}) and number of words ({1}) in utt ({2}) do not match!".format(len(txt), utt.num_words_no_pau(), utt.id))
  #Now replace the phoneme based ids with txt based.
  i = 0
  for w in utt.words:
    if w.id not in utt.phoneme_features.get_sil_phonemes():
      w.id = txt[i]
      i += 1
  utt.txtloaded = True
def split_word(word, split_pos):
  utt = word.parent_utt
  split_more_than_one_phoneme = False
  #We usually only want to change one phoneme and this list checks for those.
  if word.syllables[-1].num_phonemes() != 1:
    s = word.syllables[-1]
    #This should also update the word itself with the new syll info.
    #End of word 's
    if word.id[-2:] == "'s":
      split_syll(s, ["s", "z"])
    #Contracted are (e.g. we're)
    elif word.id[-3:] == "'re":
      #'r' is a bit... meh. In e.g. "you're" pronounced "jU@r" we kinda want to add a phony
      #before "r".
      #But this is not supported atm.
      split_syll(s, ["I@", "U@", "E@", "@", "r"], ["I@", "U@", "E@", "O"])
    #Contracted not (e.g. don't)
    elif word.id[-3:] == "n't":
      split_syll(s, ["n", "G", "t"])
    #Contracted will (e.g. it'll)
    elif word.id[-3:] == "'ll":
      split_syll(s, ["lw", "l"])
    #Contracted have (e.g. I've)
    elif word.id[-3:] == "'ve":
      split_syll(s, ["f", "v"])
    #Contracted I am
    elif word.id[-3:] == "i'm":
      split_syll(s, ["m"])
    #Contracted would or had (e.g. she'd)
    elif word.id[-2:] == "'d":
      split_syll(s, ["d", "G"], ["u"])
    #Contracted going to
    elif word.id == "gonna":
      if s.id in ["nu", "n@"]:
        split_more_than_one_phoneme = True
    #Contracted can not
    elif word.id == "cannot":
      #If the syll is "nQG" or "nQt" we're good and can split
      if s.id in ["nQG", "nQt"]:
        split_more_than_one_phoneme = True
    #Contracted want to
    elif word.id == "wanna":
      #If the syll is "n@" we're good and can split
      if s.id == "n@":
        split_more_than_one_phoneme = True
  #If there is only one syllable in the word we have to add a phony syll.
  elif len(word.syllables) < 2:
    s = word.syllables[-1]
    #Contracted would or had (e.g. I'd)
    if word.id[-2:] == "'d":
      split_syll(s, ["aI"], ["aI"])
  if word.syllables[-1].num_phonemes() > 1 and split_more_than_one_phoneme != True:
    raise SiReError("Cannot split a word {0} with final syllable {1} with more than one phoneme ({2}) as this has not been explicitly allowed!".format(word.id, word.syllables[-1].id, word.syllables[-1].num_phonemes()))
  w1 = utterance.Word()
  w1.id = word.id[:split_pos]
  w2 = utterance.Word()
  w2.id = word.id[split_pos:]
  print "Warning: Splitting word ({0}) into two ({1} and {2}). Is this correct?".format(word.id, w1.id, w2.id)
  #Start time
  w1.start = word.start_time()
  w2.start = word.syllables[-1].start_time()
  #End time
  w1.end = word.syllables[-2].end_time()
  w2.end = word.end_time()
  #Parent utt
  w1.parent_utt = utt
  w2.parent_utt = utt
  #Pos in utt
  #Slice out the original word
  w_p_u = word.pos_in_utt()
  utt.words = utt.words[:w_p_u] + [w1, w2] + utt.words[w_p_u + 1:]
  #Fix syllables and phonemes
  w1.syllables = word.syllables[:-1]
  w2.syllables = [word.syllables[-1]]
  w1.phonemes = []
  w2.phonemes = []
  for s in w1.syllables:
    s.parent_word = w1
    for p in s.phonemes:
      w1.phonemes.append(p)
      p.parent_word = w1
  w2.syllables[0].parent_word = w2
  for p in w2.syllables[0].phonemes:
    w2.phonemes.append(p)
    p.parent_word = w2
  # #Fix phonemes
  # w1.phonemes = word.phonemes[:-len(word.syllables[-1].phonemes)]
  # w2.phonemes = word.phonemes[-len(word.syllables[-1].phonemes):]
  # for p in w1.phonemes:
  #   p.parent_word = w1
  # for p in w2.phonemes:
  #   p.parent_word = w2
  #Delete the original word. If all has gone well this should be fine.
  del word
def split_syll(syll, acceptable_phoneme_set, word_spanning_phonemes=[]):
  #Makes life a bit easier
  utt = syll.parent_utt
  phoneme_features = utt.phoneme_features
  #A special case for phonemes which may have ended up spanning across what would normally
  #be two words or if all phonemes related to the "2nd" word have been deleted.
  #E.g. I@ in we're (w I@) or u in who'd (h u).
  #Or a deleted stop in I'd (aI).
  #In this case we add a new "phony" syllable with no duration. So it affects contexts
  #but does not take any frames.
  if syll.phonemes[-1].id in word_spanning_phonemes:
    phony = utterance.Syllable()
    phony.id = syll.phonemes[-1].id
    phony.stress = syll.stress
    phony.parent_utt = syll.parent_utt
    phony.parent_word = syll.parent_word
    #Slice in phony
    phony.parent_utt.syllables.insert(syll.pos_in_utt() + 1, phony)
    phony.parent_word.syllables.insert(syll.pos_in_word() + 1, phony)
    #We need to add a phony phoneme for e.g. start end time information
    phony_phone = utterance.Phoneme()
    phony_phone.start = syll.end_time()
    phony_phone.end = syll.end_time()
    phony.phonemes = [phony_phone]
    phony.vowel_id = syll.vowel_id
    return
  #You must know which phonemes are acceptable to replace.
  #Just a safety that things don't go horribly wrong. Disable this if you feel lucky.
  if syll.phonemes[-1].id not in acceptable_phoneme_set:
    raise SiReError("Cannot split syllable {0} unless its last phoneme {1} exists in the acceptable set ({2})".format(syll.id, syll.phonemes[-1].id, acceptable_phoneme_set))
  #ID
  s1 = utterance.Syllable()
  s1.id = syll.id[:-1]
  s2 = utterance.Syllable()
  s2.id = syll.id[-1]
  print "Warning: Splitting syll ({0}) into two ({1} and {2})".format(syll.id, s1.id, s2.id)
  #Start pos
  s1.start = syll.start_time()
  s2.start = syll.phonemes[-1].start
  #End pos
  s1.end = syll.phonemes[-2].end
  s2.end = syll.end_time()
  #Stress
  #If we have stress on phonemes we use that, else we use the syll stress
  if syll.phonemes[0].stress == None:
    s1.stress = syll.stress
    s2.stress = syll.stress
  else:
    s = 0
    for p in syll.phonemes[:-1]:
      if int(p.stress) > 0:
        s = 1
    s1.stress = str(s)
    if int(syll.phonemes[-1].stress) > 0:
      s2.stress = str(1)
    else:
      s2.stress = str(0)
  #Pos in utt
  #Slice in syll
  s_p_u = syll.pos_in_utt()
  utt.syllables = utt.syllables[:s_p_u] + [s1, s2] + utt.syllables[s_p_u + 1:]
  #Pos in word
  word = syll.parent_word
  s_p_w = syll.pos_in_word()
  #Slice in the new sylls. Note this assumes the split syll is word-final.
  word.syllables = word.syllables[:s_p_w] + [s1, s2]
  #Parents
  s1.parent_utt = utt
  s2.parent_utt = utt
  s1.parent_word = word
  s2.parent_word = word
  #Update child phonemes
  s1.phonemes = syll.phonemes[:-1]
  s2.phonemes = [syll.phonemes[-1]]
  for p in s1.phonemes:
    p.parent_syllable = s1
  s2.phonemes[0].parent_syllable = s2
  #Update vowel id
  if phoneme_features.is_vowel(s2.phonemes[0].id):
    s2.vowel_id = s2.phonemes[0].id
  else:
    s2.vowel_id = "novowel"
  v = "novowel"
  for p in s1.phonemes:
    if phoneme_features.is_vowel(p.id):
      v = p.id
      break
  s1.vowel_id = v
  #Delete the original syll.
  del syll