def generate(self) -> DisambiguationCorpus:
    """
    Builds a morphological disambiguation corpus out of the annotated corpus held by this generator.

    Each sentence of the annotated corpus produces exactly one sentence in the result (possibly an empty
    one). Every word that is an AnnotatedWord is copied over as a DisambiguatedWord constructed from the
    word's name and its parse; words of any other type are left out.

    RETURNS
    -------
    DisambiguationCorpus
        Created disambiguation corpus.
    """
    result = DisambiguationCorpus()
    sentenceTotal = self.__annotatedCorpus.sentenceCount()
    for sentenceIndex in range(sentenceTotal):
        source = self.__annotatedCorpus.getSentence(sentenceIndex)
        target = AnnotatedSentence()
        for wordIndex in range(source.wordCount()):
            candidate = source.getWord(wordIndex)
            # Only annotated words expose the name/parse pair used below.
            if not isinstance(candidate, AnnotatedWord):
                continue
            name = candidate.getName()
            parse = candidate.getParse()
            target.addWord(DisambiguatedWord(name, parse))
        # The sentence is added once all of its words have been visited.
        result.addSentence(target)
    return result
def generate(self) -> DisambiguationCorpus:
    """
    Builds a morphological disambiguation corpus out of the treeBank held by this generator.

    Parse trees for which layerAll(ViewLayerType.INFLECTIONAL_GROUP) is false are skipped entirely. For
    every remaining tree, generateAnnotatedSentence is called and each AnnotatedWord of the resulting
    sentence is copied over as a DisambiguatedWord constructed from the word's name and its parse; words
    of any other type are left out.

    RETURNS
    -------
    DisambiguationCorpus
        Created disambiguation corpus.
    """
    result = DisambiguationCorpus()
    treeTotal = self.__treeBank.size()
    for treeIndex in range(treeTotal):
        tree = self.__treeBank.get(treeIndex)
        # Trees that fail the inflectional-group layer check contribute no sentence.
        if not tree.layerAll(ViewLayerType.INFLECTIONAL_GROUP):
            continue
        source = tree.generateAnnotatedSentence()
        target = AnnotatedSentence()
        for wordIndex in range(source.wordCount()):
            candidate = source.getWord(wordIndex)
            # Only annotated words expose the name/parse pair used below.
            if not isinstance(candidate, AnnotatedWord):
                continue
            name = candidate.getName()
            parse = candidate.getParse()
            target.addWord(DisambiguatedWord(name, parse))
        result.addSentence(target)
    return result
def test_Disambiguation(self):
    """
    Trains an HmmDisambiguation model on the corpus read from "../penntreebank.txt", then disambiguates
    the robust morphological analyses of every sentence of that same corpus and compares the outcome
    with the parses stored in the corpus.

    Two counters are kept for words that are DisambiguatedWord instances: one for a case-insensitive
    match of the whole transition list against the stored parse, and one for a match of the root word
    alone. Both counters are divided by corpus.numberOfWords() and compared with the expected ratios
    to 3 decimal places.
    """
    analyzer = FsmMorphologicalAnalyzer()
    corpus = DisambiguationCorpus("../penntreebank.txt")
    disambiguator = HmmDisambiguation()
    disambiguator.train(corpus)
    parseHits = 0
    rootHits = 0
    for sentenceIndex in range(corpus.sentenceCount()):
        sentence = corpus.getSentence(sentenceIndex)
        analyses = analyzer.robustMorphologicalAnalysis(sentence)
        predictions = disambiguator.disambiguate(analyses)
        for wordIndex in range(sentence.wordCount()):
            word = sentence.getWord(wordIndex)
            if not isinstance(word, DisambiguatedWord):
                continue
            predicted = predictions[wordIndex]
            # The two checks are independent: a correct root is counted even when the
            # full parse differs.
            if predicted.transitionList().lower() == str(word.getParse()).lower():
                parseHits += 1
            if predicted.getWord() == word.getParse().getWord():
                rootHits += 1
    rootAccuracy = rootHits / corpus.numberOfWords()
    self.assertAlmostEqual(0.9233, rootAccuracy, 3)
    parseAccuracy = parseHits / corpus.numberOfWords()
    self.assertAlmostEqual(0.8630, parseAccuracy, 3)
def test_Corpus(self):
    """
    Loads the disambiguation corpus from "../penntreebank.txt" and checks that it reports the
    expected number of sentences and the expected number of words.
    """
    expectedSentences = 19109
    expectedWords = 170211
    loaded = DisambiguationCorpus("../penntreebank.txt")
    sentenceTotal = loaded.sentenceCount()
    self.assertEqual(expectedSentences, sentenceTotal)
    wordTotal = loaded.numberOfWords()
    self.assertEqual(expectedWords, wordTotal)