Example #1
0
 def testInit(self):
     """Check that invalid orders, counts and method names are rejected.

     Every invalid call is wrapped in its own ``assertRaises`` context.
     A ``with`` block is left as soon as one statement raises, so a second
     call written in the same block would never be executed nor checked.

     NOTE(review): the calls with 100 and "a" were previously unreachable;
     confirm that they really raise the exception types asserted here.

     """
     # Orders expected to be refused by the constructor.
     for order in (0, 100):
         with self.assertRaises(NgramOrderValueError):
             sppasNgramsModel(order)

     m = sppasNgramsModel(1)

     # Minimum counts expected to be refused.
     for min_count in (0, "a"):
         with self.assertRaises(NgramCountValueError):
             m.set_min_count(min_count)

     # Unknown name of a probability estimation method.
     with self.assertRaises(NgramMethodNameError):
         m.probabilities(method="toto")
Example #2
0
    def gen_slm_dependencies(self, basename, N=3):
        """Write the dictionary and the language model files for julius.

        The dictionary is saved in ``basename + ".dict"`` and the N-gram
        model in ``basename + ".arpa"``.

        :param basename: (str) common base name of both output files
        :param N: (int) order given to the N-gram model

        """
        dict_path = basename + ".dict"
        arpa_path = basename + ".arpa"

        phones = self._phones.split()
        tokens = self._tokens.split()

        # Tokens and phones are paired by position; the variants of a
        # pronunciation are separated by "|" and its phones by "-".
        pron_dict = sppasDictPron()
        for word, pronunciations in zip(tokens, phones):
            for one_pron in pronunciations.split("|"):
                pron_dict.add_pron(word, one_pron.replace("-", " "))

        # Sentence boundaries get the "sil" pronunciation if not yet known.
        for boundary in (START_SENT_SYMBOL, END_SENT_SYMBOL):
            if pron_dict.is_unk(boundary) is True:
                pron_dict.add_pron(boundary, "sil")

        pron_dict.save_as_ascii(dict_path, False)

        # The model is estimated from the tokens as a single sentence.
        ngrams = sppasNgramsModel(N)
        ngrams.append_sentences([self._tokens])
        log_probas = ngrams.probabilities(method="logml")
        writer = sppasArpaIO()
        writer.set(log_probas)
        writer.save(arpa_path)
Example #3
0
    def testARPA(self):
        """Check the validation of sppasArpaIO.set() and an ARPA round trip.

        Each invalid value is tested in its own ``assertRaises`` context:
        a ``with`` block stops at the first statement that raises, so the
        values listed after it in the same block would never be tested.

        NOTE(review): the values ``[]`` and ``[[], 0]`` were previously
        unreachable; confirm that set() really rejects both of them.

        """
        arpaio = sppasArpaIO()
        for invalid in ("toto", [], [[], 0]):
            with self.assertRaises(ModelsDataTypeError):
                arpaio.set(invalid)

        # Save the probabilities of a 3-gram model of the corpus file.
        fn1 = os.path.join(TEMP, "model1.arpa")
        fn2 = os.path.join(TEMP, "model2.arpa")
        model = sppasNgramsModel(3)
        model.count(self.corpusfile)
        probas = model.probabilities("logml")
        arpaio.set(probas)
        arpaio.save(fn1)

        # Load the saved file and write it again under another name.
        slm1 = sppasSLM()
        slm1.load_from_arpa(fn1)
        slm1.save_as_arpa(fn2)

        slm2 = sppasSLM()
        slm2.load_from_arpa(fn2)

        # Both loaded models are expected to be equal.
        m1 = slm1.model
        m2 = slm2.model
        sp = sppasCompare()
        self.assertTrue(sp.equals(m1, m2))
Example #4
0
    def testRawProbabilities(self):
        """Check the "raw" and "lograw" values of a bigram model.

        The model is counted from ``self.corpusfile``. Only the tokens
        listed in the tables below are verified.

        """
        def verify(ngrams, expectations):
            # Each entry is unpacked as (token, value, third item); the
            # third item is not checked.
            for token, value, _ in ngrams:
                for wanted, target in expectations:
                    if token == wanted:
                        self.assertEqual(value, target)

        unigram_counts = (
            ("a", 15),
            ('b', 10),
            ('c', 4),
            ('d', 3),
            (START_SENT_SYMBOL, 0),
            (END_SENT_SYMBOL, 3),
        )
        bigram_counts = (
            ("a b", 7),
            ("b a", 4),
            (START_SENT_SYMBOL + ' a', 3),
            ('b ' + END_SENT_SYMBOL, 3),
        )

        model = sppasNgramsModel(2)
        model.count(self.corpusfile)

        # Raw values: the counts themselves.
        probas = model.probabilities(method="raw")
        self.assertEqual(len(probas), 2)
        verify(probas[0], unigram_counts)
        verify(probas[1], bigram_counts)

        # Log raw values: base-10 logarithm of the counts, and -99 for
        # the only count equal to zero.
        unigram_logs = tuple(
            (token, math.log(count, 10) if count > 0 else -99)
            for token, count in unigram_counts
        )
        bigram_logs = tuple(
            (token, math.log(count, 10)) for token, count in bigram_counts
        )

        probas = model.probabilities(method="lograw")
        self.assertEqual(len(probas), 2)
        verify(probas[0], unigram_logs)
        verify(probas[1], bigram_logs)
Example #5
0
    def gen_slm_dependencies(self, basename, N=3):
        """Create the two files julius depends on: dictionary and slm.

        :param basename: (str) name, without extension, shared by the
            ".dict" and the ".arpa" output files
        :param N: (int) order given to the N-gram model

        """
        dict_name = basename + ".dict"
        slm_name = basename + ".arpa"

        all_phones = self._phones.split()
        all_tokens = self._tokens.split()

        # One entry per variant: variants are separated by "|" and the
        # phones of a variant are separated by "-".
        pronunciations = sppasDictPron()
        for entry, variants in zip(all_tokens, all_phones):
            for variant in variants.split("|"):
                phones_of_variant = variant.replace("-", " ")
                pronunciations.add_pron(entry, phones_of_variant)

        # Unknown sentence boundaries are added with the SIL_PHON value.
        for symbol in (START_SENT_SYMBOL, END_SENT_SYMBOL):
            if pronunciations.is_unk(symbol) is True:
                pronunciations.add_pron(symbol, SIL_PHON)

        pronunciations.save_as_ascii(dict_name, False)

        # Estimate the model from the tokens, given as one sentence,
        # then save it in the ".arpa" file.
        slm = sppasNgramsModel(N)
        slm.append_sentences([self._tokens])
        estimated = slm.probabilities(method="logml")
        arpa_writer = sppasArpaIO()
        arpa_writer.set(estimated)
        arpa_writer.save(slm_name)
Example #6
0
    def testRawProbabilities(self):
        """Compare "raw" and "lograw" values with the expected counts.

        A bigram model is counted from ``self.corpusfile``. For "raw" the
        expected value is the count; for "lograw" it is its base-10
        logarithm, or -99 when the count is zero.

        """
        def identity(count):
            return count

        def log_of(count):
            return math.log(count, 10) if count else -99

        # Expected counts, indexed like the result: unigrams, then bigrams.
        expected_counts = (
            (
                ("a", 15),
                ('b', 10),
                ('c', 4),
                ('d', 3),
                (START_SENT_SYMBOL, 0),
                (END_SENT_SYMBOL, 3),
            ),
            (
                ("a b", 7),
                ("b a", 4),
                (START_SENT_SYMBOL+' a', 3),
                ('b '+END_SENT_SYMBOL, 3),
            ),
        )

        model = sppasNgramsModel(2)
        model.count(self.corpusfile)

        for method, convert in (("raw", identity), ("lograw", log_of)):
            probas = model.probabilities(method=method)
            self.assertEqual(len(probas), 2)

            for order, table in enumerate(expected_counts):
                # Entries not listed in the table are ignored.
                for token, value, _ in probas[order]:
                    for wanted, count in table:
                        if token == wanted:
                            self.assertEqual(value, convert(count))
Example #7
0
 def testCount(self):
     """Check some unigram and bigram counts of the corpus file."""
     unigram_counts = (
         ('a', 15),
         ('b', 10),
         ('c', 4),
         ('d', 3),
         (START_SENT_SYMBOL, 0),
         (END_SENT_SYMBOL, 3),
     )
     bigram_counts = (
         ('a b', 7),
         ('b a', 4),
         ('d b', 1),
         ('d c', 2),
         (START_SENT_SYMBOL + ' a', 3),
         ('b ' + END_SENT_SYMBOL, 3),
     )

     model = sppasNgramsModel(2)
     model.count(self.corpusfile)
     # One counter per order is expected: unigrams then bigrams.
     self.assertEqual(len(model._ngramcounts), 2)

     counter = model._ngramcounts[0]
     for entry, occurrences in unigram_counts:
         self.assertEqual(counter.get_count(entry), occurrences)

     counter = model._ngramcounts[1]
     for entry, occurrences in bigram_counts:
         self.assertEqual(counter.get_count(entry), occurrences)
Example #8
0
 def testCount(self):
     """Verify the counters of a bigram model built from the corpus file."""
     # Expected counts, indexed like model._ngramcounts.
     expected_by_order = (
         (
             ('a', 15),
             ('b', 10),
             ('c', 4),
             ('d', 3),
             (START_SENT_SYMBOL, 0),
             (END_SENT_SYMBOL, 3),
         ),
         (
             ('a b', 7),
             ('b a', 4),
             ('d b', 1),
             ('d c', 2),
             (START_SENT_SYMBOL+' a', 3),
             ('b '+END_SENT_SYMBOL, 3),
         ),
     )

     model = sppasNgramsModel(2)
     model.count(self.corpusfile)
     self.assertEqual(len(model._ngramcounts), 2)

     for order, expected in enumerate(expected_by_order):
         ngramcounter = model._ngramcounts[order]
         for sequence, nb in expected:
             self.assertEqual(ngramcounter.get_count(sequence), nb)
Example #9
0
    def testARPA(self):
        """Check that a saved ARPA model is unchanged by a load/save cycle."""
        first_path, second_path = (
            os.path.join(TEMP, name) for name in ("model1.arpa", "model2.arpa")
        )

        # Write the "logml" probabilities of a 3-gram model in the first file.
        ngrams = sppasNgramsModel(3)
        ngrams.count(self.corpusfile)
        log_probas = ngrams.probabilities("logml")
        writer = sppasArpaIO()
        writer.set(log_probas)
        writer.save(first_path)

        # Load the first file and save its content in the second one.
        loaded = sppasSLM()
        loaded.load_from_arpa(first_path)
        loaded.save_as_arpa(second_path)

        reloaded = sppasSLM()
        reloaded.load_from_arpa(second_path)

        # The models read from both files are expected to be equal.
        first_model = loaded.model
        second_model = reloaded.model
        comparator = sppasCompare()
        self.assertTrue(comparator.equals(first_model, second_model))
Example #10
0
    def testMaximumLikelihoodProbabilities(self):
        """Check the "ml" and "logml" values of a trigram model.

        The model is counted from ``self.corpusfile``. Values are rounded
        to 6 digits before being compared, and only the tokens listed in
        the tables below are verified.

        """
        def log10_rounded(proba):
            return round(math.log(proba, 10), 6)

        def verify(ngrams, expectations):
            # Each entry is unpacked as (token, value, third item); the
            # third item is not checked.
            for token, value, _ in ngrams:
                for wanted, target in expectations:
                    if token == wanted:
                        self.assertEqual(round(value, 6), target)

        ml_bigrams = (
            ("a b", 0.466667),
            ("b a", 0.400000),
        )
        # NOTE(review): the two keys built from the sentence symbols are
        # concatenated without a separating space, unlike "a b a"; they
        # may never match an entry -- confirm.
        ml_trigrams = (
            ("a b a", 0.142857),
            (START_SENT_SYMBOL + "a a", 0.500000),
            ("a b" + END_SENT_SYMBOL, 0.428571),
        )

        model = sppasNgramsModel(3)
        model.count(self.corpusfile)

        # Maximum likelihood values.
        probas = model.probabilities(method="ml")
        self.assertEqual(len(probas), 3)
        verify(probas[0], (
            ("a", 0.428571),
            ("b", 0.285714),
            ("c", 0.114286),
            ("d", 0.085714),
            (START_SENT_SYMBOL, 0.),
            (END_SENT_SYMBOL, 0.085714),
        ))
        verify(probas[1], ml_bigrams)
        verify(probas[2], ml_trigrams)

        # Base-10 logarithm of the values; -99 for the start symbol.
        probas = model.probabilities(method="logml")
        self.assertEqual(len(probas), 3)
        verify(probas[0], (
            ("a", log10_rounded(0.42857143)),
            ("b", log10_rounded(0.28571429)),
            ("c", log10_rounded(0.11428571)),
            ("d", log10_rounded(0.08571429)),
            (START_SENT_SYMBOL, -99.000000),
            (END_SENT_SYMBOL, log10_rounded(0.08571429)),
        ))
        verify(probas[1], tuple(
            (token, log10_rounded(proba)) for token, proba in ml_bigrams
        ))
        verify(probas[2], tuple(
            (token, log10_rounded(proba)) for token, proba in ml_trigrams
        ))
Example #11
0
    def testMaximumLikelihoodProbabilities(self):
        """Verify "ml" then "logml" values of a 3-gram model of the corpus.

        Each returned value is rounded to 6 digits and compared with the
        expected one; tokens missing from the tables are not verified.

        """
        def rounded_log(proba):
            return round(math.log(proba, 10), 6)

        # NOTE(review): in both trigram tables, the keys built from the
        # sentence symbols have no space between the symbol and the
        # other tokens, unlike "a b a"; they may never match an entry
        # -- confirm.
        expected = {
            "ml": (
                (
                    ("a", 0.428571),
                    ("b", 0.285714),
                    ("c", 0.114286),
                    ("d", 0.085714),
                    (START_SENT_SYMBOL, 0.),
                    (END_SENT_SYMBOL, 0.085714),
                ),
                (
                    ("a b", 0.466667),
                    ("b a", 0.400000),
                ),
                (
                    ("a b a", 0.142857),
                    (START_SENT_SYMBOL+"a a", 0.500000),
                    ("a b"+END_SENT_SYMBOL, 0.428571),
                ),
            ),
            "logml": (
                (
                    ("a", rounded_log(0.42857143)),
                    ("b", rounded_log(0.28571429)),
                    ("c", rounded_log(0.11428571)),
                    ("d", rounded_log(0.08571429)),
                    (START_SENT_SYMBOL, -99.000000),
                    (END_SENT_SYMBOL, rounded_log(0.08571429)),
                ),
                (
                    ("a b", rounded_log(0.466667)),
                    ("b a", rounded_log(0.400000)),
                ),
                (
                    ("a b a", rounded_log(0.142857)),
                    (START_SENT_SYMBOL+"a a", rounded_log(0.500000)),
                    ("a b"+END_SENT_SYMBOL, rounded_log(0.428571)),
                ),
            ),
        }

        model = sppasNgramsModel(3)
        model.count(self.corpusfile)

        for method in ("ml", "logml"):
            probas = model.probabilities(method=method)
            self.assertEqual(len(probas), 3)

            # Tables are indexed like the result: 1-grams, 2-grams, 3-grams.
            for order, table in enumerate(expected[method]):
                for token, value, _ in probas[order]:
                    for wanted, target in table:
                        if token == wanted:
                            self.assertEqual(round(value, 6), target)
Example #12
0
                    action='store_true',
                    help="Disable the verbosity.")

# With no command-line argument, the '-h' option is appended before parsing.
# NOTE(review): presumably `parser` is an argparse parser, in which case
# '-h' prints the usage and exits -- confirm against its definition.
if len(sys.argv) <= 1:
    sys.argv.append('-h')

args = parser.parse_args()

# ----------------------------------------------------------------------------
# Main program
# ----------------------------------------------------------------------------

# ---------------------------------
# 1. Create a sppasNgramsModel
# The model is built from args.n; args.r, when set, is given to set_vocab().

model = sppasNgramsModel(args.n)
if args.r:
    model.set_vocab(args.r)

# ---------------------------------
# 2. Estimate counts of each n-gram
# args.i is unpacked: each of its items is a separate argument of count().

model.count(*(args.i))

# ---------------------------------
# 3. Estimate probabilities
# args.m is the first positional argument of probabilities().

probas = model.probabilities(args.m)

# ---------------------------------
# 4. Write in an ARPA file