Exemple #1
0
class MyTestCase(CorpusTest, unittest.TestCase):
    """Exercise ``NGram`` counts, vocabulary sizes and text export.

    Two kinds of fixtures are used: a five-sentence in-memory corpus (the
    "simple" models) and corpora read from ``../train.txt``, ``../test.txt``
    and ``../validation.txt`` (the "complex" models).  ``readCorpus`` is not
    defined in this class; presumably it is inherited from ``CorpusTest``.

    Counts are compared with plain ``assertEqual(expected, actual)``.  The
    third positional parameter of ``assertEqual`` is the failure message,
    not a numeric tolerance, so no extra argument is passed.
    """

    simpleUniGram: NGram
    simpleBiGram: NGram
    simpleTriGram: NGram
    complexUniGram: NGram
    complexBiGram: NGram
    complexTriGram: NGram
    simpleCorpus: list
    trainCorpus: list
    testCorpus: list
    validationCorpus: list

    def setUp(self) -> None:
        """Rebuild every corpus and model before each test method."""
        self.simpleCorpus = [
            ["<s>", "ali", "topu", "at", "mehmet", "ayşeye", "gitti", "</s>"],
            ["<s>", "ali", "top", "at", "ayşe", "eve", "gitti", "</s>"],
            ["<s>", "ayşe", "kitabı", "ver", "</s>"],
            ["<s>", "ali", "topu", "mehmete", "at", "</s>"],
            ["<s>", "ali", "topu", "at", "mehmet", "ayşeyle", "gitti", "</s>"],
        ]
        self.simpleUniGram = NGram(1, self.simpleCorpus)
        self.simpleBiGram = NGram(2, self.simpleCorpus)
        self.simpleTriGram = NGram(3, self.simpleCorpus)
        # The corpus paths are relative, so the tests must be started from
        # a directory whose parent contains the three corpus files.
        self.trainCorpus = self.readCorpus("../train.txt")
        self.complexUniGram = NGram(1, self.trainCorpus)
        self.complexBiGram = NGram(2, self.trainCorpus)
        self.complexTriGram = NGram(3, self.trainCorpus)
        self.testCorpus = self.readCorpus("../test.txt")
        self.validationCorpus = self.readCorpus("../validation.txt")

    def test_GetCountSimple(self):
        """Counts on the in-memory corpus, including unseen n-grams (0)."""
        self.assertEqual(5, self.simpleUniGram.getCount(["<s>"]))
        self.assertEqual(0, self.simpleUniGram.getCount(["mahmut"]))
        self.assertEqual(1, self.simpleUniGram.getCount(["kitabı"]))
        self.assertEqual(4, self.simpleBiGram.getCount(["<s>", "ali"]))
        self.assertEqual(0, self.simpleBiGram.getCount(["ayşe", "ali"]))
        self.assertEqual(0, self.simpleBiGram.getCount(["mahmut", "ali"]))
        self.assertEqual(2, self.simpleBiGram.getCount(["at", "mehmet"]))
        self.assertEqual(
            1, self.simpleTriGram.getCount(["<s>", "ali", "top"]))
        self.assertEqual(
            0, self.simpleTriGram.getCount(["ayşe", "kitabı", "at"]))
        self.assertEqual(
            0, self.simpleTriGram.getCount(["ayşe", "topu", "at"]))
        self.assertEqual(
            0, self.simpleTriGram.getCount(["mahmut", "evde", "kal"]))
        self.assertEqual(
            2, self.simpleTriGram.getCount(["ali", "topu", "at"]))

    def test_GetCountComplex(self):
        """Counts on the models built from ``../train.txt``."""
        self.assertEqual(20000, self.complexUniGram.getCount(["<s>"]))
        self.assertEqual(50, self.complexUniGram.getCount(["atatürk"]))
        self.assertEqual(
            11, self.complexBiGram.getCount(["<s>", "mustafa"]))
        self.assertEqual(
            3, self.complexBiGram.getCount(["mustafa", "kemal"]))
        self.assertEqual(
            1, self.complexTriGram.getCount(["<s>", "mustafa", "kemal"]))
        self.assertEqual(
            1, self.complexTriGram.getCount(["mustafa", "kemal", "atatürk"]))

    def test_VocabularySizeSimple(self):
        """The in-memory corpus has 15 distinct tokens."""
        self.assertEqual(15, self.simpleUniGram.vocabularySize())

    def test_VocabularySizeComplex(self):
        """Vocabulary sizes for the train, test and validation corpora."""
        self.assertEqual(57625, self.complexUniGram.vocabularySize())
        # Reuse the attribute for unigram models of the other two corpora.
        self.complexUniGram = NGram(1, self.testCorpus)
        self.assertEqual(55485, self.complexUniGram.vocabularySize())
        self.complexUniGram = NGram(1, self.validationCorpus)
        self.assertEqual(35663, self.complexUniGram.vocabularySize())

    def test_SaveAsText(self):
        """Smoke test: saving each simple model must not raise.

        Nothing is asserted about the written files; ``simple1.txt``,
        ``simple2.txt`` and ``simple3.txt`` are left behind afterwards.
        """
        self.simpleUniGram.saveAsText("simple1.txt")
        self.simpleBiGram.saveAsText("simple2.txt")
        self.simpleTriGram.saveAsText("simple3.txt")
Exemple #2
0
class NGramTest(CorpusTest, unittest.TestCase):
    """Exercise ``NGram`` counting, pruning, merging, loading and export.

    Two kinds of fixtures are used: a five-sentence in-memory corpus (the
    "simple" models) and corpora read from ``../train.txt``, ``../test.txt``
    and ``../validation.txt`` (the "complex" models).  ``readCorpus`` is not
    defined in this class; presumably it is inherited from ``CorpusTest``.

    Counts are compared with plain ``assertEqual(expected, actual)``.  The
    third positional parameter of ``assertEqual`` is the failure message,
    not a numeric tolerance, so no extra argument is passed.
    """

    simpleUniGram: NGram
    simpleBiGram: NGram
    simpleTriGram: NGram
    complexUniGram: NGram
    complexBiGram: NGram
    complexTriGram: NGram
    simpleCorpus: list
    trainCorpus: list
    testCorpus: list
    validationCorpus: list

    def setUp(self) -> None:
        """Rebuild every corpus and model before each test method."""
        self.simpleCorpus = [
            ["<s>", "ali", "topu", "at", "mehmet", "ayşeye", "gitti", "</s>"],
            ["<s>", "ali", "top", "at", "ayşe", "eve", "gitti", "</s>"],
            ["<s>", "ayşe", "kitabı", "ver", "</s>"],
            ["<s>", "ali", "topu", "mehmete", "at", "</s>"],
            ["<s>", "ali", "topu", "at", "mehmet", "ayşeyle", "gitti", "</s>"],
        ]
        self.simpleUniGram = NGram(1, self.simpleCorpus)
        self.simpleBiGram = NGram(2, self.simpleCorpus)
        self.simpleTriGram = NGram(3, self.simpleCorpus)
        # The corpus paths are relative, so the tests must be started from
        # a directory whose parent contains the three corpus files.
        self.trainCorpus = self.readCorpus("../train.txt")
        self.complexUniGram = NGram(1, self.trainCorpus)
        self.complexBiGram = NGram(2, self.trainCorpus)
        self.complexTriGram = NGram(3, self.trainCorpus)
        self.testCorpus = self.readCorpus("../test.txt")
        self.validationCorpus = self.readCorpus("../validation.txt")

    def test_GetCountSimple(self):
        """Counts on the simple models, including unseen n-grams (0).

        Also called by ``test_LoadMultiPart`` after it has replaced the
        simple models with ones initialised from files.
        """
        self.assertEqual(5, self.simpleUniGram.getCount(["<s>"]))
        self.assertEqual(0, self.simpleUniGram.getCount(["mahmut"]))
        self.assertEqual(1, self.simpleUniGram.getCount(["kitabı"]))
        self.assertEqual(4, self.simpleBiGram.getCount(["<s>", "ali"]))
        self.assertEqual(0, self.simpleBiGram.getCount(["ayşe", "ali"]))
        self.assertEqual(0, self.simpleBiGram.getCount(["mahmut", "ali"]))
        self.assertEqual(2, self.simpleBiGram.getCount(["at", "mehmet"]))
        self.assertEqual(
            1, self.simpleTriGram.getCount(["<s>", "ali", "top"]))
        self.assertEqual(
            0, self.simpleTriGram.getCount(["ayşe", "kitabı", "at"]))
        self.assertEqual(
            0, self.simpleTriGram.getCount(["ayşe", "topu", "at"]))
        self.assertEqual(
            0, self.simpleTriGram.getCount(["mahmut", "evde", "kal"]))
        self.assertEqual(
            2, self.simpleTriGram.getCount(["ali", "topu", "at"]))

    def test_GetCountComplex(self):
        """Counts on the models built from ``../train.txt``."""
        self.assertEqual(20000, self.complexUniGram.getCount(["<s>"]))
        self.assertEqual(50, self.complexUniGram.getCount(["atatürk"]))
        self.assertEqual(
            11, self.complexBiGram.getCount(["<s>", "mustafa"]))
        self.assertEqual(
            3, self.complexBiGram.getCount(["mustafa", "kemal"]))
        self.assertEqual(
            1, self.complexTriGram.getCount(["<s>", "mustafa", "kemal"]))
        self.assertEqual(
            1, self.complexTriGram.getCount(["mustafa", "kemal", "atatürk"]))

    def test_VocabularySizeSimple(self):
        """The simple unigram model reports 15 distinct tokens."""
        self.assertEqual(15, self.simpleUniGram.vocabularySize())

    def test_VocabularySizeComplex(self):
        """Vocabulary sizes for the train, test and validation corpora."""
        self.assertEqual(57625, self.complexUniGram.vocabularySize())
        # Reuse the attribute for unigram models of the other two corpora.
        self.complexUniGram = NGram(1, self.testCorpus)
        self.assertEqual(55485, self.complexUniGram.vocabularySize())
        self.complexUniGram = NGram(1, self.validationCorpus)
        self.assertEqual(35663, self.complexUniGram.vocabularySize())

    def test_Prune(self):
        """Prune the simple bigram model with increasing thresholds.

        The prunes are applied cumulatively to the same model.  After
        ``prune(0.0)`` every checked count is unchanged; after
        ``prune(0.6)`` the checked bigrams that had count 1 report 0.
        """
        self.simpleBiGram.prune(0.0)
        self.assertEqual(4, self.simpleBiGram.getCount(["<s>", "ali"]))
        self.assertEqual(1, self.simpleBiGram.getCount(["<s>", "ayşe"]))
        self.assertEqual(3, self.simpleBiGram.getCount(["ali", "topu"]))
        self.assertEqual(1, self.simpleBiGram.getCount(["ali", "top"]))
        self.assertEqual(2, self.simpleBiGram.getCount(["topu", "at"]))
        self.assertEqual(
            1, self.simpleBiGram.getCount(["topu", "mehmete"]))
        self.simpleBiGram.prune(0.6)
        self.assertEqual(4, self.simpleBiGram.getCount(["<s>", "ali"]))
        self.assertEqual(0, self.simpleBiGram.getCount(["<s>", "ayşe"]))
        self.assertEqual(3, self.simpleBiGram.getCount(["ali", "topu"]))
        self.assertEqual(0, self.simpleBiGram.getCount(["ali", "top"]))
        self.assertEqual(2, self.simpleBiGram.getCount(["topu", "at"]))
        self.assertEqual(
            0, self.simpleBiGram.getCount(["topu", "mehmete"]))
        self.simpleBiGram.prune(0.7)
        self.assertEqual(4, self.simpleBiGram.getCount(["<s>", "ali"]))
        self.assertEqual(3, self.simpleBiGram.getCount(["ali", "topu"]))
        self.assertEqual(2, self.simpleBiGram.getCount(["topu", "at"]))
        self.simpleBiGram.prune(0.8)
        self.assertEqual(4, self.simpleBiGram.getCount(["<s>", "ali"]))
        self.assertEqual(3, self.simpleBiGram.getCount(["ali", "topu"]))
        self.simpleBiGram.prune(0.9)
        self.assertEqual(4, self.simpleBiGram.getCount(["<s>", "ali"]))

    def test_SaveAsText(self):
        """Smoke test: saving each simple model must not raise.

        Nothing is asserted about the written files; ``simple1.txt``,
        ``simple2.txt`` and ``simple3.txt`` are left behind afterwards.
        """
        self.simpleUniGram.saveAsText("simple1.txt")
        self.simpleBiGram.saveAsText("simple2.txt")
        self.simpleTriGram.saveAsText("simple3.txt")

    def test_Merge(self):
        """Merge models built from file names and check vocabulary sizes.

        The ``simple*a.txt`` .. ``simple*d.txt`` inputs are expected to
        exist already; nothing in this class creates them.
        """
        self.simpleUniGram = NGram("simple1a.txt")
        self.simpleUniGram.merge(NGram("simple1b.txt"))
        self.assertEqual(18, self.simpleUniGram.vocabularySize())
        self.simpleBiGram = NGram("simple2a.txt")
        self.simpleBiGram.merge(NGram("simple2b.txt"))
        self.simpleBiGram.merge(NGram("simple2c.txt"))
        self.simpleBiGram.merge(NGram("simple2d.txt"))
        self.assertEqual(21, self.simpleBiGram.vocabularySize())
        self.simpleTriGram = NGram("simple3a.txt")
        self.simpleTriGram.merge(NGram("simple3b.txt"))
        self.simpleTriGram.merge(NGram("simple3c.txt"))
        self.assertEqual(20, self.simpleTriGram.vocabularySize())

    def test_LoadMultiPart(self):
        """Initialise the simple models from multi-part files and re-check.

        After loading, the simple count and vocabulary-size tests are
        re-run against the replaced models, so the part files must yield
        the same counts as the in-memory corpus.
        """
        self.simpleUniGram = NGram(1)
        self.simpleUniGram.initWithMultipleFile("simple1part1.txt",
                                                "simple1part2.txt")
        self.simpleBiGram = NGram(2)
        self.simpleBiGram.initWithMultipleFile("simple2part1.txt",
                                               "simple2part2.txt",
                                               "simple2part3.txt")
        self.simpleTriGram = NGram(3)
        self.simpleTriGram.initWithMultipleFile("simple3part1.txt",
                                                "simple3part2.txt",
                                                "simple3part3.txt",
                                                "simple3part4.txt")
        self.test_GetCountSimple()
        self.test_VocabularySizeSimple()