def add_from_dict(self, dictfilename):
    """
    Add the list of phones from a pronunciation dictionary.

    @param dictfilename (str) is the name of an HTK-ASCII pronunciation dictionary

    """
    # Pronunciations are "|"-separated variants, each variant being
    # "-"-separated phones; collect every phone of every variant.
    pron_dict = DictPron(dictfilename).get_dict()
    for pronunciation in pron_dict.values():
        for variant in pronunciation.split("|"):
            for phone in variant.split("-"):
                self.add(phone)
def test_save(self):
    """Saving then reloading a dictionary must preserve all pronunciations."""
    copy_name = DICT_FRA + ".copy"
    original = DictPron(DICT_FRA)
    original.save_as_ascii(copy_name)
    reloaded = DictPron(copy_name, nodump=True)
    for word in original.get_keys():
        self.assertEqual(original.get_pron(word), reloaded.get_pron(word))
    os.remove(copy_name)
def gen_dependencies(self, grammarname, dictname):
    """
    Generate the dependencies (grammar, dictionary) for HVite.

    @param grammarname is the file name of the tokens
    @param dictname is the dictionary file name

    """
    dictpron = DictPron()
    with codecs.open(grammarname, 'w', encoding) as flab:
        for token, pron in zip(self._tokens.split(), self._phones.split()):
            # dictionary: one entry per pronunciation variant
            for variant in pron.split("|"):
                dictpron.add_pron(token, variant.replace("-", " "))
                if self._infersp is True:
                    # also register the variant followed by a short silence
                    sil_variant = variant + '-sil'
                    dictpron.add_pron(token, sil_variant.replace("-", " "))
            # lab file (one token per line)
            flab.write(token + "\n")
    dictpron.save_as_ascii(dictname)
def test_dict(self):
    """Check known/unknown word lookup in the French dictionary."""
    pron_dict = DictPron(DICT_FRA)
    # Unknown vs. known entries
    self.assertTrue(pron_dict.is_unk('azerty'))
    self.assertFalse(pron_dict.is_unk('il_y_a'))
    self.assertFalse(pron_dict.is_unk(u'être'))
    # Pronunciation lookup; unknown words map to "UNK"
    self.assertEqual(pron_dict.get_pron(u'sil'), "s.i.l")
    self.assertEqual(pron_dict.get_pron(u'azerty'), "UNK")
def test_phonetizeFR(self):
    """Phonetization of French tokens, with and without unknown-word guessing."""
    dictfile = os.path.join(SPPAS, "resources", "dict", "fra.dict")
    grph = DictPhon(DictPron(dictfile))
    # An out-of-dictionary compound is UNK unless phonunk guessing is enabled
    self.assertEqual(grph.phonetize('pas_encore', phonunk=False), 'UNK')
    self.assertEqual(
        grph.phonetize('pas_encore', phonunk=True),
        "p.a.a~.k.o.r|p.a.z.a~.k.o.r|p.a.a~.k.o.r.eu|p.a.z.a~.k.o.r.eu")
    # Slash-delimited tokens are kept verbatim, regardless of phonunk
    self.assertEqual(grph.phonetize(u'/lemot/', phonunk=True), u"lemot")
    self.assertEqual(grph.phonetize(u'/lemot/', phonunk=False), u"lemot")
def gen_slm_dependencies(self, basename, N=3):
    """
    Generate the dependencies (slm, dictionary) for julius.

    @param basename (str - IN) the base name of the slm file and of the dictionary file
    @param N (int) Language model N-gram length.

    """
    dictname = basename + ".dict"
    slmname = basename + ".arpa"

    # Build the pronunciation dictionary from the token/pron pairs.
    dictpron = DictPron()
    for token, pron in zip(self._tokens.split(), self._phones.split()):
        for variant in pron.split("|"):
            dictpron.add_pron(token, variant.replace("-", " "))
    # Sentence-boundary symbols must have an entry (mapped to silence).
    for symbol in (START_SENT_SYMBOL, END_SENT_SYMBOL):
        if dictpron.is_unk(symbol) is True:
            dictpron.add_pron(symbol, "sil")
    dictpron.save_as_ascii(dictname, False)

    # Write the SLM
    model = NgramsModel(N)
    model.append_sentences([self._tokens])
    probas = model.probabilities(method="logml")
    arpaio = ArpaIO()
    arpaio.set(probas)
    arpaio.save(slmname)