Example #1
0
    def add_from_dict(self, dictfilename):
        """
        Append every phoneme found in a pronunciation dictionary.

        @param dictfilename (str) is the name of an HTK-ASCII pronunciation dictionary

        """
        pronunciations = DictPron( dictfilename ).get_dict()
        # each entry is "variant1|variant2|..." with phonemes joined by "-"
        for pron in pronunciations.values():
            for variant in pron.split("|"):
                for phoneme in variant.split("-"):
                    self.add( phoneme )
Example #2
0
 def test_save(self):
     # a dictionary saved then reloaded must keep every pronunciation
     copyname = DICT_FRA+".copy"
     original = DictPron( DICT_FRA )
     original.save_as_ascii( copyname )
     reloaded = DictPron( copyname, nodump=True )
     for entry in original.get_keys():
         self.assertEqual( original.get_pron(entry), reloaded.get_pron(entry) )
     os.remove( copyname )
Example #3
0
    def gen_dependencies(self, grammarname, dictname):
        """
        Create the grammar and dictionary files required by HVite.

        @param grammarname is the file name of the tokens
        @param dictname is the dictionary file name

        """
        pdict = DictPron()

        with codecs.open(grammarname, 'w', encoding) as flab:
            for token, pron in zip(self._tokens.split(), self._phones.split()):
                # register every pronunciation variant of the token
                for variant in pron.split("|"):
                    pdict.add_pron( token, variant.replace("-"," ") )
                    # optionally append a variant ending with a silence
                    if self._infersp is True:
                        pdict.add_pron( token, (variant + '-sil').replace("-"," ") )
                # the lab file holds one token per line
                flab.write( token+"\n")

        pdict.save_as_ascii( dictname )
Example #4
0
 def test_dict(self):
     # known words are found, unknown words map to the UNK symbol
     pron = DictPron( DICT_FRA )
     self.assertTrue( pron.is_unk('azerty') )
     for known in ('il_y_a', u'ĂȘtre'):
         self.assertFalse( pron.is_unk(known) )
     self.assertEqual( pron.get_pron(u'sil'), "s.i.l" )
     self.assertEqual( pron.get_pron(u'azerty'), "UNK" )
Example #5
0
    def test_phonetizeFR(self):
        # phonetization of an out-of-dictionary compound, with and without
        # the unknown-word phonetizer enabled
        dictfile = os.path.join(SPPAS, "resources", "dict", "fra.dict")
        phonetizer = DictPhon(DictPron(dictfile))

        self.assertEqual(phonetizer.phonetize('pas_encore', phonunk=False),
                         'UNK')
        self.assertEqual(phonetizer.phonetize('pas_encore', phonunk=True),
                         "p.a.a~.k.o.r|p.a.z.a~.k.o.r|p.a.a~.k.o.r.eu|p.a.z.a~.k.o.r.eu")

        # a slash-delimited word is returned untouched in both modes
        for unk in (True, False):
            self.assertEqual(phonetizer.phonetize(u'/lemot/', phonunk=unk),
                             u"lemot")
Example #6
0
    def gen_slm_dependencies(self, basename, N=3):
        """
        Create the dictionary and the statistical language model for julius.

        @param basename (str - IN) the base name of the slm file and of the dictionary file
        @param N (int) Language model N-gram length.

        """
        dictname = basename + ".dict"
        slmname  = basename + ".arpa"

        # build the pronunciation dictionary from the token/phone pairs
        pdict = DictPron()
        for token, pron in zip(self._tokens.split(), self._phones.split()):
            for variant in pron.split("|"):
                pdict.add_pron( token, variant.replace("-"," ") )

        # sentence boundary symbols must always be present
        for symbol in (START_SENT_SYMBOL, END_SENT_SYMBOL):
            if pdict.is_unk(symbol) is True:
                pdict.add_pron( symbol, "sil" )

        pdict.save_as_ascii( dictname, False )

        # estimate and save the N-gram language model in ARPA format
        slm = NgramsModel(N)
        slm.append_sentences( [self._tokens] )
        arpaio = ArpaIO()
        arpaio.set( slm.probabilities( method="logml" ) )
        arpaio.save( slmname )