import os

import parsetrees
import utterance_utils
#Assumed import: SiReError is the project's exception class; the module it
#lives in is not shown in this snippet.
from error_messages import SiReError


def load_stanford_pcfg_parse(utt, parse, comma_is_pause=False):
    """Attach POS tags and phrase ancestry from a Stanford PCFG parse to each word in utt."""
    if utt.words is None:
        raise SiReError("No words in utterance! Please load an mlf or txt file first!")
    tree = parsetrees.stanfordPcfgTree()
    tree.make_tree(parse)
    if comma_is_pause:
        leafs = tree.get_leafs(include_punct=[","])
    else:
        leafs = tree.get_leafs()
    num_w = utt.num_words_no_pau(comma_is_pause)
    if len(leafs) != num_w:
        #First we try to see if this is due to differences in how words are
        #dealt with in parsing and annotation.
        #A prime example is transcribing "there's" instead of "there is":
        #parsing splits there's in two, whereas in e.g. combilex there's is one word.
        #If this is the case we split the word in two, with the 's becoming a
        #single-phoneme, single-syllable word. In other cases the contraction
        #straddles two words and we add a "phony" word which affects contexts
        #but adds no phonemes.
        utterance_utils.try_split_words(utt)
        #Update num_w
        num_w = utt.num_words_no_pau(comma_is_pause)
        if len(leafs) != num_w:
            for w in utt.words:
                print(w.id)
            raise SiReError("Number of leaves ({0}) not equal to number of words ({1})! In utt ({2})!".format(len(leafs), num_w, utt.id))
    #Match each word with its parse leaf.
    words = utt.get_words_no_pau(comma_is_pause)
    for i, word in enumerate(words):
        l = leafs[i].label.split("-")
        word.id = l[1]
        word.pos = l[0]
        #There should always be a parent.
        word.parent_phrase = leafs[i].parent
        #But there might not be more than one level above.
        if word.parent_phrase.parent is not None:
            word.grandparent_phrase = word.parent_phrase.parent
        else:
            word.grandparent_phrase = parsetrees.get_fake_stanford_pcfg_parse()
        #And the ancestry may well stop here.
        if word.grandparent_phrase.parent in [None, "xx"] or word.grandparent_phrase.parent.label == "xx":
            word.greatgrandparent_phrase = parsetrees.get_fake_stanford_pcfg_parse()
        else:
            word.greatgrandparent_phrase = word.grandparent_phrase.parent
    #Now add a fake parse for sil, pau and #.
    for word in utt.words:
        if word.id in utt.phoneme_features.get_sil_phonemes():
            word.parent_phrase = parsetrees.get_fake_stanford_pcfg_parse()
            word.grandparent_phrase = parsetrees.get_fake_stanford_pcfg_parse()
            word.greatgrandparent_phrase = parsetrees.get_fake_stanford_pcfg_parse()
            word.pos = "sil"
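
#A minimal usage sketch, kept as a comment so it does not run at import time.
#It assumes an utt object already populated from an mlf/txt file and a
#bracketed Stanford PCFG parse string; the file name and variables below are
#illustrative only, not part of the module.
#
#  parse = open("utts/utt_0001.parse").read()  #e.g. "(ROOT (S (NP ...) ...))"
#  load_stanford_pcfg_parse(utt, parse, comma_is_pause=True)
#  for w in utt.get_words_no_pau(True):
#      print(w.id, w.pos, w.parent_phrase.label)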


def proto_from_txt(lab, dictionary, general_sil_phoneme="sil", comma_is_pause=False,
                   stanfordparse=False, pcfgdict=None, pron_reduced=False,
                   lm_score_dir=None, reduction_level=1.0, phoneme_lm_prons=False):
    """Build an utterance prototype from a txt label, looking up each word in the dictionary."""
    #Create words.
    proto = {"utt": []}
    proto["id"] = lab[0].split("/")[-1]
    #First we check if we need to reduce some words, and which.
    if pron_reduced and phoneme_lm_prons:
        raise SiReError("Cannot produce reduced pronunciations in combination with phoneme LM based pronunciation choice.")
    elif pron_reduced:
        if os.path.isdir(lm_score_dir):
            #reduce_word_tuples is assumed to be defined elsewhere in this module.
            words = reduce_word_tuples(lab[1:], os.path.join(lm_score_dir, proto["id"] + ".scored"), reduction_level)
        else:
            raise SiReError("The directory with reduction scores does not exist!")
    elif phoneme_lm_prons:
        if os.path.isdir(lm_score_dir):
            #As we do not keep stress information for the phoneme LM to score,
            #we may have a few potential versions of each word.
            #find_potential_words is assumed to be defined elsewhere in this module.
            words = find_potential_words(lab[1:], os.path.join(lm_score_dir, proto["id"] + ".path"))
            raise SiReError("Not implemented yet! Phoneme_lm_scoring.")
        else:
            raise SiReError("The directory with reduction scores does not exist!")
    else:
        words = [(x, False) for x in lab[1:]]
    #Make words and look them up in the dictionary.
    #If no parse exists (i.e. no POS tags) we simply grab the first non-reduced
    #pronunciation we can find (if one exists), forgetting its POS tag in the process.
    #We start with silence.
    proto["utt"].append({"id": "sil", "syllables": dictionary.make_entry(general_sil_phoneme, general_sil_phoneme + " 0", False)["syllables"]})
    if not stanfordparse:
        for word in words:
            #If we need to keep some punctuation.
            if comma_is_pause:
                proto["utt"].append({"id": word[0], "syllables": dictionary.get_single_entry(word[0], reduced=word[1], punct_as_sil=([","], "sil"))["syllables"]})
            else:
                proto["utt"].append({"id": word[0], "syllables": dictionary.get_single_entry(word[0], reduced=word[1])["syllables"]})
    else:
        #Else a pcfg parse should exist and we can get the POS tags from that.
        tree = parsetrees.stanfordPcfgTree()
        tree.make_tree(pcfgdict[proto["id"]])
        #Do we need some punctuation?
        if comma_is_pause:
            leafs = tree.get_leafs(include_punct=[","])
        else:
            leafs = tree.get_leafs()
        #In this case we need to do some merging.
        if len(leafs) != len(words):
            #merge is assumed to be defined elsewhere in this module.
            leafs = merge(leafs, words, proto["id"])
        for i, leaf in enumerate(leafs):
            pos, word = leaf.label.lower().split("-")
            if word != words[i][0]:
                raise SiReError("Word ({0}) from parse does not match word ({1}) from txt! In {2}.".format(word, words[i][0], proto["id"]))
            else:
                word = words[i]
            if comma_is_pause:
                c_best = dictionary.get_single_entry(word[0], pos, word[1], punct_as_sil=([","], "sil"))
            else:
                c_best = dictionary.get_single_entry(word[0], pos, word[1])
            proto["utt"].append({"id": word[0], "syllables": c_best["syllables"]})
    #We end with silence.
    proto["utt"].append({"id": "sil", "syllables": dictionary.make_entry(general_sil_phoneme, general_sil_phoneme + " 0", False)["syllables"]})
    #Add phony times to the phonemes: each gets 100ms, i.e. 1000000 in HTK's
    #100ns label units.
    cur_dur = 0
    for word in proto["utt"]:
        for syll in word["syllables"]:
            for phon in syll["phonemes"]:
                phon["start"] = cur_dur
                cur_dur += 1000000
                phon["end"] = cur_dur
    return proto
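
#A minimal usage sketch, kept as a comment so it does not run at import time.
#It assumes lab is a list whose first element is the label file path (its
#basename becomes the utterance id) and whose remaining elements are the words;
#dictionary and pcfgdict are assumed to be a loaded pronunciation dictionary
#and a dict mapping utterance ids to parse strings. All names are illustrative.
#
#  lab = ["data/txt/utt_0001", "this", "is", "an", "example"]
#  proto = proto_from_txt(lab, dictionary, comma_is_pause=True,
#                         stanfordparse=True, pcfgdict=pcfgdict)
#  print(proto["id"], len(proto["utt"]))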