class HmmDisambiguation(NaiveDisambiguation):

    wordBiGramModel: NGram  # bigram model over root+POS tokens of consecutive words
    igBiGramModel: NGram    # bigram model over inflectional-group tokens

    def train(self, corpus: DisambiguationCorpus):
        """
        Train word and inflectional-group (ig) unigram/bigram models.

        For each consecutive word pair in every sentence of the corpus, the
        word together with its part of speech tag is added to the word
        unigram and bigram models, and the last inflectional group of the
        word paired with each inflectional group of the next word is added
        to the ig unigram and bigram models. At the end, the NGram
        probabilities of all four models are calculated with
        LaplaceSmoothing.

        PARAMETERS
        ----------
        corpus : DisambiguationCorpus
            DisambiguationCorpus to train.
        """
        words1 = [None]
        igs1 = [None]
        words2 = [None, None]
        igs2 = [None, None]
        self.wordUniGramModel = NGram(1)
        self.igUniGramModel = NGram(1)
        self.wordBiGramModel = NGram(2)
        self.igBiGramModel = NGram(2)
        for sentence in corpus.sentences:
            for j in range(sentence.wordCount() - 1):
                word = sentence.getWord(j)
                nextWord = sentence.getWord(j + 1)
                words2[0] = word.getParse().getWordWithPos()
                words1[0] = words2[0]
                words2[1] = nextWord.getParse().getWordWithPos()
                self.wordUniGramModel.addNGram(words1)
                self.wordBiGramModel.addNGram(words2)
                # The previous word's last inflectional group does not depend
                # on k, so compute it once outside the inner loop.
                igs2[0] = Word(
                    word.getParse().getLastInflectionalGroup().__str__())
                for k in range(nextWord.getParse().size()):
                    igs2[1] = Word(
                        nextWord.getParse().getInflectionalGroup(k).__str__())
                    self.igBiGramModel.addNGram(igs2)
                    igs1[0] = igs2[1]
                    self.igUniGramModel.addNGram(igs1)
        self.wordUniGramModel.calculateNGramProbabilitiesSimple(
            LaplaceSmoothing())
        self.igUniGramModel.calculateNGramProbabilitiesSimple(
            LaplaceSmoothing())
        self.wordBiGramModel.calculateNGramProbabilitiesSimple(
            LaplaceSmoothing())
        self.igBiGramModel.calculateNGramProbabilitiesSimple(
            LaplaceSmoothing())

    def disambiguate(self, fsmParses: list) -> list:
        """
        Viterbi-style disambiguation over a sentence's FsmParseLists.

        The first word's candidate parses get log probabilities from the word
        and ig unigram models; for each subsequent word, a dynamic-programming
        table is filled using the word and ig bigram models, tracking the best
        predecessor for each candidate. The most probable parse sequence is
        recovered by backtracking through the table.

        PARAMETERS
        ----------
        fsmParses : list
            FsmParseList to disambiguate.

        RETURNS
        -------
        list
            List of FsmParses with the highest joint probability, or None
            when the input is empty, any word has no candidate parse, or no
            path can be recovered.
        """
        if len(fsmParses) == 0:
            return None
        for i in range(len(fsmParses)):
            if fsmParses[i].size() == 0:
                return None
        correctFsmParses = []
        # probabilities[i][j]: best log probability of any parse path ending
        # in candidate j of word i. best[i][j]: index of the predecessor
        # candidate achieving it.
        probabilities = [[0.0 for _ in range(fsmParses[i].size())]
                         for i in range(len(fsmParses))]
        best = [[0 for _ in range(fsmParses[i].size())]
                for i in range(len(fsmParses))]
        for i in range(fsmParses[0].size()):
            currentParse = fsmParses[0].getFsmParse(i)
            if isinstance(currentParse, FsmParse):
                w1 = currentParse.getWordWithPos()
                probability = self.wordUniGramModel.getProbability(w1)
                for j in range(currentParse.size()):
                    ig1 = Word(currentParse.getInflectionalGroup(j).__str__())
                    probability *= self.igUniGramModel.getProbability(ig1)
                probabilities[0][i] = math.log(probability)
        for i in range(1, len(fsmParses)):
            for j in range(fsmParses[i].size()):
                bestProbability = -10000
                bestIndex = -1
                currentParse = fsmParses[i].getFsmParse(j)
                if isinstance(currentParse, FsmParse):
                    for k in range(fsmParses[i - 1].size()):
                        previousParse = fsmParses[i - 1].getFsmParse(k)
                        w1 = previousParse.getWordWithPos()
                        w2 = currentParse.getWordWithPos()
                        probability = probabilities[i - 1][k] + math.log(
                            self.wordBiGramModel.getProbability(w1, w2))
                        # currentParse is fsmParses[i].getFsmParse(j); reuse
                        # the bound reference instead of re-fetching it.
                        for t in range(currentParse.size()):
                            ig1 = Word(previousParse.lastInflectionalGroup().
                                       __str__())
                            ig2 = Word(
                                currentParse.getInflectionalGroup(t).__str__())
                            probability += math.log(
                                self.igBiGramModel.getProbability(ig1, ig2))
                        if probability > bestProbability:
                            bestIndex = k
                            bestProbability = probability
                probabilities[i][j] = bestProbability
                best[i][j] = bestIndex
        # Pick the best final candidate, then backtrack.
        bestProbability = -10000
        bestIndex = -1
        for i in range(fsmParses[len(fsmParses) - 1].size()):
            if probabilities[len(fsmParses) - 1][i] > bestProbability:
                bestProbability = probabilities[len(fsmParses) - 1][i]
                bestIndex = i
        if bestIndex == -1:
            return None
        correctFsmParses.append(
            fsmParses[len(fsmParses) - 1].getFsmParse(bestIndex))
        for i in range(len(fsmParses) - 2, -1, -1):
            bestIndex = best[i + 1][bestIndex]
            if bestIndex == -1:
                return None
            correctFsmParses.insert(0, fsmParses[i].getFsmParse(bestIndex))
        return correctFsmParses

    def saveModel(self):
        """
        Method to save unigrams (via the superclass) and bigrams.
        """
        super().saveModel()
        self.wordBiGramModel.saveAsText("words2.txt")
        self.igBiGramModel.saveAsText("igs2.txt")

    def loadModel(self):
        """
        Method to load unigrams (via the superclass) and bigrams.
        """
        super().loadModel()
        self.wordBiGramModel = NGram("words2.txt")
        self.igBiGramModel = NGram("igs2.txt")
class MyTestCase(CorpusTest, unittest.TestCase):
    """Tests NGram counts, vocabulary sizes, and text serialization."""

    simpleUniGram: NGram
    simpleBiGram: NGram
    simpleTriGram: NGram
    complexUniGram: NGram
    complexBiGram: NGram
    complexTriGram: NGram
    simpleCorpus: list
    trainCorpus: list
    testCorpus: list
    validationCorpus: list

    def setUp(self) -> None:
        self.simpleCorpus = [
            ["<s>", "ali", "topu", "at", "mehmet", "ayşeye", "gitti", "</s>"],
            ["<s>", "ali", "top", "at", "ayşe", "eve", "gitti", "</s>"],
            ["<s>", "ayşe", "kitabı", "ver", "</s>"],
            ["<s>", "ali", "topu", "mehmete", "at", "</s>"],
            ["<s>", "ali", "topu", "at", "mehmet", "ayşeyle", "gitti", "</s>"]
        ]
        self.simpleUniGram = NGram(1, self.simpleCorpus)
        self.simpleBiGram = NGram(2, self.simpleCorpus)
        self.simpleTriGram = NGram(3, self.simpleCorpus)
        self.trainCorpus = self.readCorpus("../train.txt")
        self.complexUniGram = NGram(1, self.trainCorpus)
        self.complexBiGram = NGram(2, self.trainCorpus)
        self.complexTriGram = NGram(3, self.trainCorpus)
        self.testCorpus = self.readCorpus("../test.txt")
        self.validationCorpus = self.readCorpus("../validation.txt")

    def test_GetCountSimple(self):
        # NOTE(review): the original passed a third positional 0.0 to most
        # assertEqual calls — in unittest that slot is msg, not a delta
        # (a Java assertEquals(expected, actual, delta) mis-port). Removed.
        self.assertEqual(5, self.simpleUniGram.getCount(["<s>"]))
        self.assertEqual(0, self.simpleUniGram.getCount(["mahmut"]))
        self.assertEqual(1, self.simpleUniGram.getCount(["kitabı"]))
        self.assertEqual(4, self.simpleBiGram.getCount(["<s>", "ali"]))
        self.assertEqual(0, self.simpleBiGram.getCount(["ayşe", "ali"]))
        self.assertEqual(0, self.simpleBiGram.getCount(["mahmut", "ali"]))
        self.assertEqual(2, self.simpleBiGram.getCount(["at", "mehmet"]))
        self.assertEqual(1, self.simpleTriGram.getCount(["<s>", "ali", "top"]))
        self.assertEqual(0, self.simpleTriGram.getCount(["ayşe", "kitabı", "at"]))
        self.assertEqual(0, self.simpleTriGram.getCount(["ayşe", "topu", "at"]))
        self.assertEqual(0, self.simpleTriGram.getCount(["mahmut", "evde", "kal"]))
        self.assertEqual(2, self.simpleTriGram.getCount(["ali", "topu", "at"]))

    def test_GetCountComplex(self):
        self.assertEqual(20000, self.complexUniGram.getCount(["<s>"]))
        self.assertEqual(50, self.complexUniGram.getCount(["atatürk"]))
        self.assertEqual(11, self.complexBiGram.getCount(["<s>", "mustafa"]))
        self.assertEqual(3, self.complexBiGram.getCount(["mustafa", "kemal"]))
        self.assertEqual(1, self.complexTriGram.getCount(["<s>", "mustafa", "kemal"]))
        self.assertEqual(1, self.complexTriGram.getCount(["mustafa", "kemal", "atatürk"]))

    def test_VocabularySizeSimple(self):
        self.assertEqual(15, self.simpleUniGram.vocabularySize())

    def test_VocabularySizeComplex(self):
        self.assertEqual(57625, self.complexUniGram.vocabularySize())
        self.complexUniGram = NGram(1, self.testCorpus)
        self.assertEqual(55485, self.complexUniGram.vocabularySize())
        self.complexUniGram = NGram(1, self.validationCorpus)
        self.assertEqual(35663, self.complexUniGram.vocabularySize())

    def test_SaveAsText(self):
        self.simpleUniGram.saveAsText("simple1.txt")
        self.simpleBiGram.saveAsText("simple2.txt")
        self.simpleTriGram.saveAsText("simple3.txt")
class RootFirstDisambiguation(NaiveDisambiguation):

    wordBiGramModel: NGram  # bigram model over root+POS tokens of consecutive words
    igBiGramModel: NGram    # bigram model over transition-list tokens

    def train(self, corpus: DisambiguationCorpus):
        """
        The train method initially creates new NGrams; wordUniGramModel,
        wordBiGramModel, igUniGramModel, and igBiGramModel. It gets the
        sentences from given corpus and gets each word as a
        DisambiguatedWord. Then, adds the word together with its part of
        speech tags to the wordUniGramModel. It also gets the transition list
        of that word and adds it to the igUniGramModel. If there exists a
        next word in the sentence, it adds the current and next
        DisambiguatedWord to the wordBiGramModel with their part of speech
        tags. It also adds them to the igBiGramModel with their transition
        lists. At the end, it calculates the NGram probabilities of all four
        models (word/ig, unigram/bigram) by using LaplaceSmoothing.

        PARAMETERS
        ----------
        corpus : DisambiguationCorpus
            DisambiguationCorpus to train.
        """
        # Reusable 1- and 2-slot lists; addNGram is fed the same list objects
        # with their contents overwritten on each iteration.
        words1 = [None]
        igs1 = [None]
        words2 = [None, None]
        igs2 = [None, None]
        self.wordUniGramModel = NGram(1)
        self.wordBiGramModel = NGram(2)
        self.igUniGramModel = NGram(1)
        self.igBiGramModel = NGram(2)
        for sentence in corpus.sentences:
            for j in range(sentence.wordCount()):
                word = sentence.getWord(j)
                if isinstance(word, DisambiguatedWord):
                    words1[0] = word.getParse().getWordWithPos()
                    self.wordUniGramModel.addNGram(words1)
                    igs1[0] = Word(word.getParse().getTransitionList())
                    self.igUniGramModel.addNGram(igs1)
                    # Bigrams are only added when a next word exists.
                    if j + 1 < sentence.wordCount():
                        words2[0] = words1[0]
                        words2[1] = sentence.getWord(
                            j + 1).getParse().getWordWithPos()
                        self.wordBiGramModel.addNGram(words2)
                        igs2[0] = igs1[0]
                        igs2[1] = Word(
                            sentence.getWord(j + 1).getParse().getTransitionList())
                        self.igBiGramModel.addNGram(igs2)
        self.wordUniGramModel.calculateNGramProbabilitiesSimple(
            LaplaceSmoothing())
        self.igUniGramModel.calculateNGramProbabilitiesSimple(
            LaplaceSmoothing())
        self.wordBiGramModel.calculateNGramProbabilitiesSimple(
            LaplaceSmoothing())
        self.igBiGramModel.calculateNGramProbabilitiesSimple(
            LaplaceSmoothing())

    def getWordProbability(self, word: Word, correctFsmParses: list,
                           index: int) -> float:
        """
        The getWordProbability method returns the probability of a word by
        using the word bigram model when a previous disambiguated parse is
        available (index > 0 and all earlier words already disambiguated),
        and the word unigram model otherwise.

        PARAMETERS
        ----------
        word : Word
            Word to find the probability.
        correctFsmParses : list
            FsmParse of given word which will be used for getting part of
            speech tags.
        index : int
            Index of FsmParse of which part of speech tag will be used to
            get the probability.

        RETURNS
        -------
        float
            The probability of the given word.
        """
        if index != 0 and len(correctFsmParses) == index:
            return self.wordBiGramModel.getProbability(
                correctFsmParses[index - 1].getWordWithPos(), word)
        else:
            return self.wordUniGramModel.getProbability(word)

    def getIgProbability(self, word: Word, correctFsmParses: list,
                         index: int) -> float:
        """
        The getIgProbability method returns the probability of a word by
        using the ig bigram model when a previous disambiguated parse is
        available, and the ig unigram model otherwise.

        PARAMETERS
        ----------
        word : Word
            Word to find the probability.
        correctFsmParses : list
            FsmParse of given word which will be used for getting transition
            list.
        index : int
            Index of FsmParse of which transition list will be used to get
            the probability.

        RETURNS
        -------
        float
            The probability of the given word.
        """
        if index != 0 and len(correctFsmParses) == index:
            return self.igBiGramModel.getProbability(
                Word(correctFsmParses[index - 1].getTransitionList()), word)
        else:
            return self.igUniGramModel.getProbability(word)

    def getBestRootWord(self, fsmParseList: FsmParseList) -> Word:
        """
        The getBestRootWord method takes a FsmParseList as an input and loops
        through the list. It gets each word with its part of speech tags as a
        new Word word and its transition list as a Word ig. Then, finds their
        corresponding unigram probabilities and scores each candidate by
        their product. At the end returns the word with the highest
        probability.

        PARAMETERS
        ----------
        fsmParseList : FsmParseList
            FsmParseList is used to get the part of speech tags and
            transition lists of words.

        RETURNS
        -------
        Word
            The word with the highest probability, or None if the parse list
            is empty.
        """
        bestProbability = -1
        bestWord = None
        for j in range(fsmParseList.size()):
            word = fsmParseList.getFsmParse(j).getWordWithPos()
            ig = Word(fsmParseList.getFsmParse(j).getTransitionList())
            wordProbability = self.wordUniGramModel.getProbability(word)
            igProbability = self.igUniGramModel.getProbability(ig)
            probability = wordProbability * igProbability
            if probability > bestProbability:
                bestWord = word
                bestProbability = probability
        return bestWord

    def getParseWithBestIgProbability(self, parseList: FsmParseList,
                                      correctFsmParses: list,
                                      index: int) -> FsmParse:
        """
        The getParseWithBestIgProbability gets each FsmParse's transition
        list as a Word ig. Then, finds the corresponding probability. At the
        end returns the parse with the highest ig probability.

        PARAMETERS
        ----------
        parseList : FsmParseList
            FsmParseList is used to get the FsmParse.
        correctFsmParses : list
            FsmParse is used to get the transition lists.
        index : int
            Index of FsmParse of which transition list will be used to get
            the probability.

        RETURNS
        -------
        FsmParse
            The parse with the highest probability, or None if the parse
            list is empty.
        """
        bestParse = None
        bestProbability = -1
        for j in range(parseList.size()):
            ig = Word(parseList.getFsmParse(j).getTransitionList())
            probability = self.getIgProbability(ig, correctFsmParses, index)
            if probability > bestProbability:
                bestParse = parseList.getFsmParse(j)
                bestProbability = probability
        return bestParse

    def disambiguate(self, fsmParses: list) -> list:
        """
        The disambiguate method gets an array of fsmParses. Then loops
        through that parses and finds the most probable root word and removes
        the other words which are identical to the most probable root word.
        At the end, gets the most probable parse among the fsmParses and adds
        it to the correctFsmParses list.

        PARAMETERS
        ----------
        fsmParses : list
            FsmParseList to disambiguate.

        RETURNS
        -------
        list
            correctFsmParses list which holds the most probable parses.
        """
        correctFsmParses = []
        for i in range(len(fsmParses)):
            bestWord = self.getBestRootWord(fsmParses[i])
            # NOTE: mutates the input parse list, keeping only parses whose
            # root and POS match the best word.
            fsmParses[i].reduceToParsesWithSameRootAndPos(bestWord)
            bestParse = self.getParseWithBestIgProbability(
                fsmParses[i], correctFsmParses, i)
            if bestParse is not None:
                correctFsmParses.append(bestParse)
        return correctFsmParses

    def saveModel(self):
        """
        Method to save unigrams (via the superclass) and bigrams.
        """
        super().saveModel()
        self.wordBiGramModel.saveAsText("words2.txt")
        self.igBiGramModel.saveAsText("igs2.txt")

    def loadModel(self):
        """
        Method to load unigrams (via the superclass) and bigrams.
        """
        super().loadModel()
        self.wordBiGramModel = NGram("words2.txt")
        self.igBiGramModel = NGram("igs2.txt")
class NGramTest(CorpusTest, unittest.TestCase):
    """Tests NGram counts, vocabulary size, pruning, merging, and file I/O."""

    simpleUniGram: NGram
    simpleBiGram: NGram
    simpleTriGram: NGram
    complexUniGram: NGram
    complexBiGram: NGram
    complexTriGram: NGram
    simpleCorpus: list
    trainCorpus: list
    testCorpus: list
    validationCorpus: list

    def setUp(self) -> None:
        self.simpleCorpus = [
            ["<s>", "ali", "topu", "at", "mehmet", "ayşeye", "gitti", "</s>"],
            ["<s>", "ali", "top", "at", "ayşe", "eve", "gitti", "</s>"],
            ["<s>", "ayşe", "kitabı", "ver", "</s>"],
            ["<s>", "ali", "topu", "mehmete", "at", "</s>"],
            ["<s>", "ali", "topu", "at", "mehmet", "ayşeyle", "gitti", "</s>"]
        ]
        self.simpleUniGram = NGram(1, self.simpleCorpus)
        self.simpleBiGram = NGram(2, self.simpleCorpus)
        self.simpleTriGram = NGram(3, self.simpleCorpus)
        self.trainCorpus = self.readCorpus("../train.txt")
        self.complexUniGram = NGram(1, self.trainCorpus)
        self.complexBiGram = NGram(2, self.trainCorpus)
        self.complexTriGram = NGram(3, self.trainCorpus)
        self.testCorpus = self.readCorpus("../test.txt")
        self.validationCorpus = self.readCorpus("../validation.txt")

    def test_GetCountSimple(self):
        # NOTE(review): the original passed a third positional 0.0 to most
        # assertEqual calls — in unittest that slot is msg, not a delta
        # (a Java assertEquals(expected, actual, delta) mis-port). Removed.
        self.assertEqual(5, self.simpleUniGram.getCount(["<s>"]))
        self.assertEqual(0, self.simpleUniGram.getCount(["mahmut"]))
        self.assertEqual(1, self.simpleUniGram.getCount(["kitabı"]))
        self.assertEqual(4, self.simpleBiGram.getCount(["<s>", "ali"]))
        self.assertEqual(0, self.simpleBiGram.getCount(["ayşe", "ali"]))
        self.assertEqual(0, self.simpleBiGram.getCount(["mahmut", "ali"]))
        self.assertEqual(2, self.simpleBiGram.getCount(["at", "mehmet"]))
        self.assertEqual(1, self.simpleTriGram.getCount(["<s>", "ali", "top"]))
        self.assertEqual(0, self.simpleTriGram.getCount(["ayşe", "kitabı", "at"]))
        self.assertEqual(0, self.simpleTriGram.getCount(["ayşe", "topu", "at"]))
        self.assertEqual(0, self.simpleTriGram.getCount(["mahmut", "evde", "kal"]))
        self.assertEqual(2, self.simpleTriGram.getCount(["ali", "topu", "at"]))

    def test_GetCountComplex(self):
        self.assertEqual(20000, self.complexUniGram.getCount(["<s>"]))
        self.assertEqual(50, self.complexUniGram.getCount(["atatürk"]))
        self.assertEqual(11, self.complexBiGram.getCount(["<s>", "mustafa"]))
        self.assertEqual(3, self.complexBiGram.getCount(["mustafa", "kemal"]))
        self.assertEqual(1, self.complexTriGram.getCount(["<s>", "mustafa", "kemal"]))
        self.assertEqual(1, self.complexTriGram.getCount(["mustafa", "kemal", "atatürk"]))

    def test_VocabularySizeSimple(self):
        self.assertEqual(15, self.simpleUniGram.vocabularySize())

    def test_VocabularySizeComplex(self):
        self.assertEqual(57625, self.complexUniGram.vocabularySize())
        self.complexUniGram = NGram(1, self.testCorpus)
        self.assertEqual(55485, self.complexUniGram.vocabularySize())
        self.complexUniGram = NGram(1, self.validationCorpus)
        self.assertEqual(35663, self.complexUniGram.vocabularySize())

    def test_Prune(self):
        # prune(0.0) keeps everything; higher thresholds progressively drop
        # lower-probability continuations (getCount then returns 0).
        self.simpleBiGram.prune(0.0)
        self.assertEqual(4, self.simpleBiGram.getCount(["<s>", "ali"]))
        self.assertEqual(1, self.simpleBiGram.getCount(["<s>", "ayşe"]))
        self.assertEqual(3, self.simpleBiGram.getCount(["ali", "topu"]))
        self.assertEqual(1, self.simpleBiGram.getCount(["ali", "top"]))
        self.assertEqual(2, self.simpleBiGram.getCount(["topu", "at"]))
        self.assertEqual(1, self.simpleBiGram.getCount(["topu", "mehmete"]))
        self.simpleBiGram.prune(0.6)
        self.assertEqual(4, self.simpleBiGram.getCount(["<s>", "ali"]))
        self.assertEqual(0, self.simpleBiGram.getCount(["<s>", "ayşe"]))
        self.assertEqual(3, self.simpleBiGram.getCount(["ali", "topu"]))
        self.assertEqual(0, self.simpleBiGram.getCount(["ali", "top"]))
        self.assertEqual(2, self.simpleBiGram.getCount(["topu", "at"]))
        self.assertEqual(0, self.simpleBiGram.getCount(["topu", "mehmete"]))
        self.simpleBiGram.prune(0.7)
        self.assertEqual(4, self.simpleBiGram.getCount(["<s>", "ali"]))
        self.assertEqual(3, self.simpleBiGram.getCount(["ali", "topu"]))
        self.assertEqual(2, self.simpleBiGram.getCount(["topu", "at"]))
        self.simpleBiGram.prune(0.8)
        self.assertEqual(4, self.simpleBiGram.getCount(["<s>", "ali"]))
        self.assertEqual(3, self.simpleBiGram.getCount(["ali", "topu"]))
        self.simpleBiGram.prune(0.9)
        self.assertEqual(4, self.simpleBiGram.getCount(["<s>", "ali"]))

    def test_SaveAsText(self):
        self.simpleUniGram.saveAsText("simple1.txt")
        self.simpleBiGram.saveAsText("simple2.txt")
        self.simpleTriGram.saveAsText("simple3.txt")

    def test_Merge(self):
        self.simpleUniGram = NGram("simple1a.txt")
        self.simpleUniGram.merge(NGram("simple1b.txt"))
        self.assertEqual(18, self.simpleUniGram.vocabularySize())
        self.simpleBiGram = NGram("simple2a.txt")
        self.simpleBiGram.merge(NGram("simple2b.txt"))
        self.simpleBiGram.merge(NGram("simple2c.txt"))
        self.simpleBiGram.merge(NGram("simple2d.txt"))
        self.assertEqual(21, self.simpleBiGram.vocabularySize())
        self.simpleTriGram = NGram("simple3a.txt")
        self.simpleTriGram.merge(NGram("simple3b.txt"))
        self.simpleTriGram.merge(NGram("simple3c.txt"))
        self.assertEqual(20, self.simpleTriGram.vocabularySize())

    def test_LoadMultiPart(self):
        self.simpleUniGram = NGram(1)
        self.simpleUniGram.initWithMultipleFile("simple1part1.txt",
                                                "simple1part2.txt")
        self.simpleBiGram = NGram(2)
        self.simpleBiGram.initWithMultipleFile("simple2part1.txt",
                                               "simple2part2.txt",
                                               "simple2part3.txt")
        self.simpleTriGram = NGram(3)
        self.simpleTriGram.initWithMultipleFile("simple3part1.txt",
                                                "simple3part2.txt",
                                                "simple3part3.txt",
                                                "simple3part4.txt")
        self.test_GetCountSimple()
        self.test_VocabularySizeSimple()