def _map_variant(self, phonvariant):
    """ Map phonemes of only one variant of a phonetized entry.

    @param phonvariant (str) One phonetization variant of an entry.
    @return (str) The mapped variants, joined with "|".

    """
    # For each phoneme, propose "phoneme|mapped" when the mapping
    # table knows an alternative, else keep the phoneme alone.
    alternatives = []
    for phone in self._map_split_variant(phonvariant):
        mapped = self._maptable.map_entry(phone)
        if mapped:
            alternatives.append(phone + "|" + mapped)
        else:
            alternatives.append(phone)

    # Let the DAG expand all the combinations of alternatives.
    self._dagphon.variants = 0
    decomposed = ToStrip(self._dagphon.decompose(" ".join(alternatives)))

    # Remove un-pronounced phonemes!!!
    # By convention, they are represented by an underscore in the
    # mapping table.
    cleaned = set()
    for variant in decomposed.split('|'):
        kept = [ph for ph in variant.split("-") if ph != "_"]
        cleaned.add("-".join(kept))

    return "|".join(cleaned)
def run_alignment(self, inputwav, outputalign, N=3):
    """ Execute the external program `julius` to align.

    The data related to the unit to time-align need to be previously
    fixed with:
        - set_phones(str)
        - set_tokens(str)

    @param inputwav (str - IN) the audio input file name, of type
    PCM-WAV 16000 Hz, 16 bits
    @param outputalign (str - OUT) the output file name
    @param N (int) N value of N-grams, used only if SLM
    (i.e. outext=walign). NOTE(review): N is not referenced in this
    method body — presumably consumed by the dependency generation;
    confirm.
    @return (str) A message of `julius`.
    @raise Exception if julius reported errors in its output file.

    """
    # The extension of the output decides grammar-based vs SLM alignment.
    outputalign = outputalign + "." + self._outext
    basename = os.path.splitext(inputwav)[0]
    if self._outext == "palign":
        self.gen_grammar_dependencies(basename)
    else:
        self.gen_slm_dependencies(basename)

    # First attempt.
    self.run_julius(inputwav, basename, outputalign)
    with codecs.open(outputalign, 'r', encoding) as f:
        lines = f.readlines()

    errorlines = ""
    message = ""

    # Collect the entries julius complained about as missing from the
    # acoustic model tiedlist (quoted in the error line).
    entries = []
    for line in lines:
        if line.find("Error: voca_load_htkdict")>-1 and line.find("not found")>-1:
            line = ToStrip( line )
            # Keep only the text between the double quotes.
            line = line[line.find('"')+1:]
            line = line[:line.find('"')]
            if len(line)>0:
                entries = line.split()

    # Try to repair the tiedlist; on success, re-run julius once and
    # re-read its output.
    if len(entries) > 0:
        added = self.add_tiedlist(entries)
        if len(added) > 0:
            message = "The acoustic model was modified. The following entries were successfully added into the tiedlist: "
            message = message + " ".join(added) + "\n"
            self.run_julius(inputwav, basename, outputalign)
            with codecs.open(outputalign, 'r', encoding) as f:
                lines = f.readlines()

    # Gather the remaining error lines (lines mentioning " line " are
    # skipped: they locate errors rather than report them).
    for line in lines:
        if (line.startswith("Error:") or line.startswith("ERROR:")) and not " line " in line:
            errorlines = errorlines + line
        if "search failed" in line:
            message = "Julius search has failed to find the transcription in the audio file of this unit."
            errorlines = "Search error. "+ errorlines

    if len(errorlines) > 0:
        raise Exception(message + errorlines)

    return message
def read(self, dirname):
    """ Read a list file (start-time end-time).

    @param dirname (str) Directory containing the list file named
    ListIO.DEFAULT_FILENAME.
    @return List of (start, end) couples of float values.
    @raise IOError

    """
    listfile = os.path.join( dirname, ListIO.DEFAULT_FILENAME )
    with codecs.open(listfile, 'r', encoding) as fp:
        content = fp.readlines()

    # Each line corresponds to a track,
    # with a couple 'start end' of float values.
    units = []
    for raw in content:
        fields = ToStrip(raw).split()
        if len(fields) >= 2:
            units.append( (float(fields[0]), float(fields[1])) )

    return units
def get_phon_entry(self, entry):
    """ Return the phonetization of an entry.

    Unknown entries are not automatically phonetized.
    This is a pure dictionary-based method.

    @param entry (str) The token to phonetize.
    @return A string with the phonetization of `entry` or
    the unknown symbol.

    """
    token = ToStrip(entry)

    # Specific strings... for the italian transcription...
    # For the participation at the CLIPS-Evalita 2011 campaign.
    if token.startswith(u"<") and token.endswith(u">"):
        token = token[1:-1]

    # No entry! Nothing to do.
    if not token:
        return ""

    # Specific strings used in the CID transcription...
    # CID is Corpus of Interactional Data, http://sldr.org/sldr000720
    if token.startswith((u"gpd_", u"gpf_")):
        return ""

    # Specific strings used in SPPAS IPU segmentation...
    if u"ipu_" in token:
        return ""

    # Find the entry in the dict as it is given
    pron = self._pdict.get_pron( token )

    # OK, the entry is properly phonetized.
    if pron != self._pdict.unkstamp:
        return self._map_phonentry( pron )

    return self._pdict.unkstamp
def read_walign(self, filename):
    """ Read an alignment file in the standard format of Julius CSR engine.

    @param filename (str - IN) The input file name.
    @return Two lists of tuples:
        - None
        - (start-time end-time word score)

    """
    # Defaults so that indexing never fails on a file with no
    # "wseq1:" / "cmscore1:" lines before the first segment.
    tokens = [""]
    scores = [0]
    _wordalign = []
    # wordidx == -1 means "outside a forced-alignment section".
    wordidx = -1
    with codecs.open(filename, "r", encoding) as fp:
        lines = fp.readlines()

    for line in lines:
        # Each line is either a new annotation or nothing interesting!
        line = ToStrip(line)
        if line.startswith("=== begin forced alignment ==="):
            wordidx = 0
        elif line.startswith("=== end forced alignment ==="):
            wordidx = -1
        elif line.startswith("wseq1:"):
            line = line[6:]
            # each token
            tokens = line.split()
            if len(tokens) == 0:
                tokens = [""]
        elif line.startswith("cmscore1:"):
            line = line[9:]
            # confidence score of the pronunciation of each token
            scores = [float(s) for s in line.split()]
            if len(scores) == 0:
                scores = [0]
        elif line.startswith("[") and wordidx > -1:
            # New phonemes
            line = line.replace("[", "")
            line = line.replace("]", "")
            line = ToStrip(line)
            tab = line.split(" ")
            # tab 0: first frame
            # tab 1: last frame
            # tab 2: score of the segmentation (log proba)
            # tab 3: word
            # Frame indices are divided by 100 to get time values
            # (presumably 10 ms Julius frames — confirm).
            loc_s = float(tab[0]) / 100.0
            loc_e = float(tab[1]) / 100.0
            _wordalign.append([loc_s, loc_e, tokens[wordidx], scores[wordidx]])
            wordidx = wordidx + 1

    # Adjust time values
    for wordidx in range(len(_wordalign)):
        # Fix the end of this annotation to the begin of the next one.
        # The last entry gets nextloc_s = 0.0, so it is left unchanged.
        loc_e = _wordalign[wordidx][1]
        if wordidx < (len(_wordalign) - 1):
            nextloc_s = _wordalign[wordidx + 1][0]
        else:
            nextloc_s = 0.0
        if loc_e < nextloc_s:
            loc_e = nextloc_s
        _wordalign[wordidx][1] = loc_e

    return (None, _wordalign)
def read_palign(self, filename):
    """ Read an alignment file in the standard format of Julius CSR engine.

    @param filename (str - IN) The input file name.
    @return Two lists of tuples:
        - (start-time end-time phoneme score)
        - (start-time end-time word score)

    """
    _phonalign = []
    _wordalign = []
    phonidx = -1    # phoneme index; -1 means "outside a forced-alignment section"
    loc_s = 0.0     # phoneme start time
    loc_e = 0.0     # phoneme end time
    phonlist = []
    wordseq = []
    # Defaults so that indexing never fails on a file with no
    # "cmscore1:" / "sentence1:" lines before the first segment.
    scores = [0]
    tokens = [""]
    with codecs.open(filename, "r", encoding) as fp:
        lines = fp.readlines()

    for line in lines:
        # Each line is either a new annotation or nothing interesting!
        line = ToStrip(line)
        if line.startswith("=== begin forced alignment ==="):
            phonidx = 0
        elif line.startswith("=== end forced alignment ==="):
            phonidx = -1
        elif line.startswith("phseq1:"):
            # Phoneme sequence, words separated by "|".
            line = line[7:]
            line = ToStrip(line)
            wordseq = line.split("|")
            # get indexes of each word
            # (wordlist[i] = index of the LAST phoneme of word i)
            # NOTE(review): wordlist stays undefined if no "phseq1:"
            # line occurs before a "[" segment line — confirm the
            # Julius output always provides it.
            wordlist = []
            _idx = -1
            for w in wordseq:
                _wrdphseq = w.strip().split()
                _idx += len(_wrdphseq)
                wordlist.append(_idx)
            # get the list of phonemes (without word segmentation)
            line = line.replace("|", "")
            line = ToStrip(line)
            phonlist = line.split()
        elif line.startswith("cmscore1:"):
            line = line[9:]
            # confidence score of the pronunciation of each token
            scores = [float(s) for s in line.split()]
            if len(scores) == 0:
                scores = [0]
        elif line.startswith("sentence1:"):
            line = line[10:]
            # each token
            tokens = line.split()
            if len(tokens) == 0:
                tokens = [""]
        elif line.startswith("[") and phonidx > -1:
            # New phonemes
            line = line.replace("[", "")
            line = line.replace("]", "")
            line = ToStrip(line)
            tab = line.split(" ")
            # tab 0: first frame
            # tab 1: last frame
            # tab 2: score of the segmentation (log proba)
            # tab 3: triphone used
            # Frame indices are divided by 100 to get time values
            # (presumably 10 ms Julius frames — confirm).
            loc_s = float(tab[0]) / 100.0
            loc_e = float(tab[1]) / 100.0
            if len(tab) > 3:
                # Put real phoneme instead of triphones
                _phonalign.append([loc_s, loc_e, phonlist[phonidx], tab[2]])
            else:
                _phonalign.append([loc_s, loc_e, "", tab[2]])
            phonidx = phonidx + 1

    # Adjust time values and create wordalign
    wordidx = 0       # word index
    wordloc_s = 0.0   # word start time
    loc_e = 0.0
    nextloc_s = 0.0
    for phonidx in range(len(_phonalign)):
        # Fix the end of this annotation to the begin of the next one.
        # The last entry gets nextloc_s = 0.0, so it is left unchanged.
        loc_e = _phonalign[phonidx][1]
        if phonidx < (len(_phonalign) - 1):
            nextloc_s = _phonalign[phonidx + 1][0]
        else:
            nextloc_s = 0.0
        if loc_e < nextloc_s:
            loc_e = nextloc_s
        _phonalign[phonidx][1] = loc_e
        # Override the segmentation score of the phone by
        # the score of the pronunciation of the word
        _phonalign[phonidx][3] = scores[wordidx]
        # add also the word?
        # (phonidx reached the last phoneme of the current word)
        if phonidx == wordlist[wordidx]:
            _wordalign.append([wordloc_s, loc_e, tokens[wordidx], scores[wordidx]])
            wordidx = wordidx + 1
            wordloc_s = loc_e

    # last word, or the only entry in case of empty interval...
    if len(wordseq) - 1 == wordidx:
        _wordalign.append([wordloc_s, loc_e, tokens[wordidx - 1], scores[wordidx - 1]])

    return (_phonalign, _wordalign)