def test_Disambiguation(self):
    """
    Trains RootFirstDisambiguation on "../penntreebank.txt" and checks its root and parse hit ratios.

    Each corpus sentence is analysed with the robust morphological analysis, disambiguated, and for every
    DisambiguatedWord the chosen parse is compared with the parse stored in the corpus. The number of matching
    roots and matching parses, each divided by corpus.numberOfWords(), must lie within 0.002 of 0.9590 and
    0.8639 respectively.
    """
    fsm = FsmMorphologicalAnalyzer("../turkish_dictionary.txt", "../turkish_misspellings.txt",
                                   "../turkish_finite_state_machine.xml")
    corpus = DisambiguationCorpus("../penntreebank.txt")
    algorithm = RootFirstDisambiguation()
    algorithm.train(corpus)
    correctParse = 0
    correctRoot = 0
    for i in range(corpus.sentenceCount()):
        sentence = corpus.getSentence(i)
        fsmParses = algorithm.disambiguate(fsm.robustMorphologicalAnalysis(sentence))
        for j in range(sentence.wordCount()):
            word = sentence.getWord(j)
            if isinstance(word, DisambiguatedWord):
                if fsmParses[j].transitionList() == str(word.getParse()):
                    correctParse += 1
                if fsmParses[j].getWord() == word.getParse().getWord():
                    correctRoot += 1
    # assertEqual takes (first, second, msg): passing 0.002 there made it the failure message and the ratios were
    # compared for exact equality. The tolerance has to go through assertAlmostEqual's delta keyword.
    self.assertAlmostEqual(0.9590, correctRoot / corpus.numberOfWords(), delta=0.002)
    self.assertAlmostEqual(0.8639, correctParse / corpus.numberOfWords(), delta=0.002)
def test_RootWordStatistics(self):
    """
    Checks key lookup and best-root selection of RootWordStatistics loaded from
    "../penntreebank_statistics.txt".

    For "danışman", "görüşme" and "anlaşma" a root is expected with threshold 0.0 and None with threshold 0.7.
    """
    fsm = FsmMorphologicalAnalyzer("../turkish_dictionary.txt", "../turkish_misspellings.txt",
                                   "../turkish_finite_state_machine.xml")
    rootWordStatistics = RootWordStatistics("../penntreebank_statistics.txt")
    for key in ("it$iti$itici", "yas$yasa$yasama", "tutuk$tutukla"):
        self.assertTrue(rootWordStatistics.containsKey(key))
    self.assertEqual("çık", rootWordStatistics.bestRootWord(fsm.morphologicalAnalysis("çıkma"), 0.0))
    thresholdCases = (("danışman", "danışman"),
                      ("görüşme", "görüşme"),
                      ("anlaşma", "anlaş"))
    for surfaceForm, expectedRoot in thresholdCases:
        self.assertEqual(expectedRoot,
                         rootWordStatistics.bestRootWord(fsm.morphologicalAnalysis(surfaceForm), 0.0))
        self.assertIsNone(rootWordStatistics.bestRootWord(fsm.morphologicalAnalysis(surfaceForm), 0.7))
def constructIdiomLiterals(self, fsm: FsmMorphologicalAnalyzer, morphologicalParse1: MorphologicalParse,
                           metaParse1: MetamorphicParse, morphologicalParse2: MorphologicalParse,
                           metaParse2: MetamorphicParse, morphologicalParse3: MorphologicalParse = None,
                           metaParse3: MetamorphicParse = None) -> list:
    """
    Collects the literals of every two-word or three-word combination that can be built from the possible
    words of the given parses. The words of a combination are joined with a single space and looked up with
    getLiteralsWithName. The third word is only used when both morphologicalParse3 and metaParse3 are given.

    PARAMETERS
    ----------
    fsm : FsmMorphologicalAnalyzer
        analyzer whose getPossibleWords is called for each (morphological parse, metamorphic parse) pair
    morphologicalParse1 : MorphologicalParse
        morphological parse of the first word
    metaParse1 : MetamorphicParse
        metamorphic parse of the first word
    morphologicalParse2 : MorphologicalParse
        morphological parse of the second word
    metaParse2 : MetamorphicParse
        metamorphic parse of the second word
    morphologicalParse3 : MorphologicalParse
        optional morphological parse of the third word
    metaParse3 : MetamorphicParse
        optional metamorphic parse of the third word

    RETURNS
    -------
    list
        Literals found for all word combinations
    """
    literals = []
    firstWords = fsm.getPossibleWords(morphologicalParse1, metaParse1)
    secondWords = fsm.getPossibleWords(morphologicalParse2, metaParse2)
    if morphologicalParse3 is None or metaParse3 is None:
        # Two-word idiom: the third parse pair is incomplete, so it is ignored entirely.
        for first in firstWords:
            for second in secondWords:
                literals.extend(self.getLiteralsWithName(first + " " + second))
        return literals
    thirdWords = fsm.getPossibleWords(morphologicalParse3, metaParse3)
    for first in firstWords:
        for second in secondWords:
            for third in thirdWords:
                literals.extend(self.getLiteralsWithName(first + " " + second + " " + third))
    return literals
def test_SpellCheck(self):
    """
    Spell checks ten misspelled sentences with NGramSpellChecker and expects each result to equal the
    corresponding correctly spelled sentence.
    """
    original = [Sentence(text) for text in (
        "demokratik cumhuriyet en kıymetli varlığımızdır",
        "bu tablodaki değerler zedelenmeyecektir",
        "milliyet'in geleneksel yılın sporcusu anketi 43. yaşını doldurdu",
        "demokrasinin icadı bu ayrımı bulandırdı",
        "dışişleri müsteşarı Öymen'in 1997'nin ilk aylarında Bağdat'a gitmesi öngörülüyor",
        "büyüdü , palazlandı , devleti ele geçirdi",
        "her maskenin ciltte kalma süresi farklıdır",
        "yılın son ayında 10 gazeteci gözaltına alındı",
        "iki pilotun kullandığı uçakta bir hostes görev alıyor",
        "son derece kısıtlı kelimeler çerçevesinde kendilerini uzun cümlelerle ifade edebiliyorlar",
    )]
    modified = [Sentence(text) for text in (
        "demokratik cumhüriyet en kımetli varlıgımızdır",
        "bu tblodaki değerlğr zedelenmeyecüktir",
        "milliyet'in geeneksel yılın spoşcusu ankşti 43. yeşını doldürdu",
        "demokrasinin icşdı bu ayrmıı bulandürdı",
        "dışişleri mütseşarı Öymen'in 1997'nin ilk aylğrında Bağdat'a gitmesi öngşrülüyor",
        "büyüdü , palazandı , devltei ele geçridi",
        "her makenin cültte kalma sürdsi farlkıdır",
        "yılın sno ayında 10 gazteci gözlatına alündı",
        "iki piotun kulçandığı uçkata üir hotes görçv alyıor",
        "son deece kısütlı keilmeler çeçevesinde kendülerini uzuü cümllerle ifüde edbeiliyorlar",
    )]
    fsm = FsmMorphologicalAnalyzer("../turkish_dictionary.txt", "../turkish_misspellings.txt",
                                   "../turkish_finite_state_machine.xml")
    nGram = NGram("../ngram.txt")
    nGram.calculateNGramProbabilitiesSimple(NoSmoothing())
    nGramSpellChecker = NGramSpellChecker(fsm, nGram)
    # Both lists have the same length, so pairing them visits exactly the indices of `modified`.
    for expected, misspelled in zip(original, modified):
        self.assertEqual(expected.toString(), nGramSpellChecker.spellCheck(misspelled).toString())
def test_Deasciify(self):
    """
    Round-trips the words of "../corpus.txt" through SimpleAsciifier and NGramDeasciifier.

    For every word after the first of a sentence that has at least one morphological analysis and whose
    asciified form differs from the word itself, the two-word sentence "<previous word> <asciified word>" is
    deasciified and its second word must equal the original word.
    """
    fsm = FsmMorphologicalAnalyzer("../turkish_dictionary.txt", "../turkish_misspellings.txt",
                                   "../turkish_finite_state_machine.xml")
    nGram = NGram("../ngram.txt")
    nGram.calculateNGramProbabilitiesSimple(NoSmoothing())
    nGramDeasciifier = NGramDeasciifier(fsm, nGram, True)
    simpleAsciifier = SimpleAsciifier()
    corpus = Corpus("../corpus.txt")
    for sentenceIndex in range(corpus.sentenceCount()):
        sentence = corpus.getSentence(sentenceIndex)
        # Start at 1 because every checked word needs a preceding word as context.
        for wordIndex in range(1, sentence.wordCount()):
            current = sentence.getWord(wordIndex)
            if fsm.morphologicalAnalysis(current.getName()).size() <= 0:
                continue
            asciified = simpleAsciifier.asciifyWord(current)
            if asciified == current.getName():
                continue
            previousName = sentence.getWord(wordIndex - 1).getName()
            deasciified = nGramDeasciifier.deasciify(Sentence(previousName + " " + current.getName()))
            self.assertEqual(current.getName(), deasciified.getWord(1).getName())
def constructLiterals(self, word: str, parse: MorphologicalParse, metaParse: MetamorphicParse,
                      fsm: FsmMorphologicalAnalyzer):
    """
    Builds the list of literals for a word. When the parse is non-empty and is neither punctuation, cardinal
    nor real, the literals of every possible word produced by the analyzer are collected; in every other case
    the literals of the given word itself are returned.

    PARAMETERS
    ----------
    word : str
        literal String used when the parse cannot be expanded
    parse : MorphologicalParse
        morphological parse to get possible words
    metaParse : MetamorphicParse
        metamorphic parse to get possible words
    fsm : FsmMorphologicalAnalyzer
        finite state machine morphological analyzer whose getPossibleWords is used

    RETURNS
    -------
    list
        A list of literals
    """
    expandable = parse.size() > 0 and not (parse.isPunctuation() or parse.isCardinal() or parse.isReal())
    if not expandable:
        # Copy into a fresh list, as the caller always receives a list owned by this call.
        return list(self.getLiteralsWithName(word))
    literals = []
    for candidate in fsm.getPossibleWords(parse, metaParse):
        literals.extend(self.getLiteralsWithName(candidate))
    return literals
def setUp(self) -> None:
    """
    Analyses nine surface forms and stores the analysis results as self.parse1 ... self.parse9, in the order
    the surface forms are listed below.
    """
    fsm = FsmMorphologicalAnalyzer("../turkish_dictionary.txt", "../turkish_misspellings.txt",
                                   "../turkish_finite_state_machine.xml")
    surfaceForms = (
        "açılır",
        "koparılarak",
        "toplama",
        "değerlendirmede",
        "soruşturmasının",
        "karşılaştırmalı",
        "esaslarını",
        "güçleriyle",
        "bulmayacakları",
    )
    for number, surfaceForm in enumerate(surfaceForms, start=1):
        setattr(self, "parse" + str(number), fsm.morphologicalAnalysis(surfaceForm))
def test_Deasciify2(self):
    """
    Deasciifies three two-word sentences with an NGramDeasciifier constructed with False as its third
    argument and compares the string form of each result with the expected text.
    """
    fsm = FsmMorphologicalAnalyzer("../turkish_dictionary.txt", "../turkish_misspellings.txt",
                                   "../turkish_finite_state_machine.xml")
    nGram = NGram("../ngram.txt")
    nGram.calculateNGramProbabilitiesSimple(NoSmoothing())
    nGramDeasciifier = NGramDeasciifier(fsm, nGram, False)
    cases = (
        ("noter hakkında", "noter hakkinda"),
        ("sandık medrese", "sandik medrese"),
        ("kuran'ı karşılıklı", "kuran'ı karsilikli"),
    )
    for expected, asciiText in cases:
        self.assertEqual(expected, str(nGramDeasciifier.deasciify(Sentence(asciiText))))
def test_Disambiguation(self):
    """
    Trains HmmDisambiguation on "../penntreebank.txt" and checks its root and parse hit ratios.

    For every DisambiguatedWord of the corpus the chosen parse is compared (case-insensitively) with the stored
    parse, and the chosen root with the stored root. Both hit counts divided by corpus.numberOfWords() must
    match 0.9233 and 0.8630 to three decimal places.
    """
    fsm = FsmMorphologicalAnalyzer()
    corpus = DisambiguationCorpus("../penntreebank.txt")
    algorithm = HmmDisambiguation()
    algorithm.train(corpus)
    parseHits = 0
    rootHits = 0
    for sentenceIndex in range(corpus.sentenceCount()):
        sentence = corpus.getSentence(sentenceIndex)
        fsmParses = algorithm.disambiguate(fsm.robustMorphologicalAnalysis(sentence))
        for wordIndex in range(sentence.wordCount()):
            word = sentence.getWord(wordIndex)
            if not isinstance(word, DisambiguatedWord):
                continue
            chosen = fsmParses[wordIndex]
            if chosen.transitionList().lower() == str(word.getParse()).lower():
                parseHits += 1
            if chosen.getWord() == word.getParse().getWord():
                rootHits += 1
    self.assertAlmostEqual(0.9233, rootHits / corpus.numberOfWords(), places=3)
    self.assertAlmostEqual(0.8630, parseHits / corpus.numberOfWords(), places=3)
def setUp(self) -> None:
    """
    Creates the analyzer as self.fsm and stores the first parse (getFsmParse(0)) of ten surface forms as
    self.parse1 ... self.parse10, in the order the surface forms are listed below.
    """
    self.fsm = FsmMorphologicalAnalyzer()
    surfaceForms = (
        "açılır",
        "koparılarak",
        "toplama",
        "değerlendirmede",
        "soruşturmasının",
        "karşılaştırmalı",
        "esaslarını",
        "güçleriyle",
        "bulmayacakları",
        "mü",
    )
    for number, surfaceForm in enumerate(surfaceForms, start=1):
        firstParse = self.fsm.morphologicalAnalysis(surfaceForm).getFsmParse(0)
        setattr(self, "parse" + str(number), firstParse)
def test_Deasciify(self):
    """
    Round-trips dictionary words through SimpleAsciifier and SimpleDeasciifier.

    Only words that contain at least one of ç, ö, ğ, ü, ş, ı, do not end with "fulü" and are nominal,
    adjective, adverb or verb are checked, and only when the asciified form has exactly one deasciification
    candidate. For those words the deasciified text must equal the dictionary word.
    """
    fsm = FsmMorphologicalAnalyzer("../turkish_dictionary.txt", "../turkish_misspellings.txt",
                                   "../turkish_finite_state_machine.xml")
    simpleDeasciifier = SimpleDeasciifier(fsm)
    simpleAsciifier = SimpleAsciifier()
    nonAsciiLetters = ('ç', 'ö', 'ğ', 'ü', 'ş', 'ı')
    dictionary = fsm.getDictionary()
    for index in range(dictionary.size()):
        word = dictionary.getWordWithIndex(index)
        if not any(letter in nonAsciiLetters for letter in word.getName()):
            continue
        if word.getName().endswith("fulü"):
            continue
        if not (word.isNominal() or word.isAdjective() or word.isAdverb() or word.isVerb()):
            continue
        asciified = simpleAsciifier.asciifyWord(word)
        # An ambiguous asciified form has no single right answer, so it is not asserted.
        if len(simpleDeasciifier.candidateList(Word(asciified))) != 1:
            continue
        deasciified = simpleDeasciifier.deasciify(Sentence(asciified)).toString()
        self.assertEqual(word.getName(), deasciified)
def __init__(self, corpus, example, pdf_path=None):
    """
    Stores the arguments and runs the tool selected by `example`, leaving its output in self.result.

    `example` is compared against the Tool constants one after another; every matching branch runs. Several
    branches replace a None output with a fixed user-facing message. self.result stays None when no branch
    matches.

    PARAMETERS
    ----------
    corpus
        text handed to the selected tool function
    example
        value compared against the Tool constants to pick the tool
    pdf_path
        path handed to pdfconverter.PDFParser for Tool.CONVERT_PDF_TO_TXT
    """
    def or_message(value, message):
        # Substitute the user-facing message when the tool produced nothing.
        return message if value is None else value

    word_expected = "Cümle yerine kelime girmeniz gerekiyor"

    self.corpus = corpus
    self.example = example
    self.result = None
    self.pdf_path = pdf_path

    if self.example == Tool.KELIMEYI_OGELERINE_AYIR:
        self.result = or_message(zemberekTool.ogelere_ayir(corpus),
                                 "Cümle yerine kelime girmeniz gerekiyor veya girdiğiniz kelime yanlış")
    if self.example == Tool.CUMLEDE_GECEN_KOKLERI_BUL:
        self.result = zemberekTool.metinde_gecen_kokleri_bul(self.corpus)
    if self.example == Tool.CUMLEYI_PARCALARA_AYIR:
        self.result = zemberekTool.cumleyi_parcalara_ayir(self.corpus)
    if self.example == Tool.KELIME_ONERICI:
        self.result = or_message(zemberekTool.kelime_onerici(self.corpus), word_expected)
    if self.example == Tool.KELIME_HECELE:
        self.result = or_message(zemberekTool.kelime_hecele(self.corpus), word_expected)
    if self.example == Tool.NLTK_FILES_DOWNLOAD:
        self.result = nltk_download()
    if self.example == Tool.PERSONIFICATION_COPULA:
        self.result = or_message(personal(self.corpus, Person.FIRST, is_plural=True), word_expected)
    if self.example == Tool.INFERENTIAL_MOOD:
        self.result = or_message(inferential(self.corpus, Person.SECOND, is_plural=False), word_expected)
    if self.example == Tool.CONVERT_PDF_TO_TXT:
        self.result = or_message(pdfconverter.PDFParser(pdf_path).parse(),
                                 "PDF path yanlış olabilir veya PDF olmayabilir")
    if self.example == Tool.SENTENCE_CORRECTOR:
        fsm = FsmMorphologicalAnalyzer("./SpellChecker/turkish_dictionary.txt",
                                       "./SpellChecker/turkish_misspellings.txt",
                                       "./SpellChecker/turkish_finite_state_machine.xml")
        spell_checker = SimpleSpellChecker(fsm)
        self.result = spell_checker.spellCheck(Sentence(self.corpus))
def test_SpellCheck(self):
    """
    Checks SimpleSpellChecker against the pairs listed in "../misspellings.txt".

    Every non-blank line of the file is split on single spaces; the first item is the misspelled word and the
    second item its expected correction. The spell checked one-word sentence must equal the correction.
    """
    fsm = FsmMorphologicalAnalyzer("../turkish_dictionary.txt", "../turkish_misspellings.txt",
                                   "../turkish_finite_state_machine.xml")
    simpleSpellChecker = SimpleSpellChecker(fsm)
    # The context manager closes the handle even when an assertion fails; the previous bare open() leaked it
    # and bound it to the name of the builtin `input`.
    # NOTE(review): utf-8 is assumed for the fixture so decoding no longer depends on the locale - confirm.
    with open("../misspellings.txt", encoding="utf-8") as misspellingsFile:
        lines = misspellingsFile.readlines()
    for line in lines:
        stripped = line.strip()
        if not stripped:
            # A blank line carries no pair; previously it raised IndexError on items[1].
            continue
        items = stripped.split(" ")
        misspelled = items[0]
        corrected = items[1]
        self.assertEqual(corrected, simpleSpellChecker.spellCheck(Sentence(misspelled)).toString())
def test_SpellCheckSurfaceForm(self):
    """
    Spell checks three two-word sentences with an NGramSpellChecker constructed with False as its third
    argument and compares the string form of each result with the expected text.
    """
    fsm = FsmMorphologicalAnalyzer("../turkish_dictionary.txt", "../turkish_misspellings.txt",
                                   "../turkish_finite_state_machine.xml")
    nGram = NGram("../ngram.txt")
    nGram.calculateNGramProbabilitiesSimple(NoSmoothing())
    nGramSpellChecker = NGramSpellChecker(fsm, nGram, False)
    cases = (
        ("noter hakkında", "noter hakkınad"),
        ("arçelik'in çamaşır", "arçelik'in çamşaır"),
        ("ruhsat yanında", "ruhset yanında"),
    )
    for expected, misspelled in cases:
        self.assertEqual(expected, str(nGramSpellChecker.spellCheck(Sentence(misspelled))))
def setUp(self) -> None:
    """
    Creates a default-constructed analyzer and a default-constructed WordNet for each test, stored as
    self.fsm and self.wordNet.
    """
    analyzer = FsmMorphologicalAnalyzer()
    self.fsm = analyzer
    wordNet = WordNet()
    self.wordNet = wordNet
def setUp(self) -> None:
    """
    Creates a default-constructed analyzer for each test, stored as self.fsm.
    """
    analyzer = FsmMorphologicalAnalyzer()
    self.fsm = analyzer
class TransitionTest(unittest.TestCase):
    """
    Checks, surface form by surface form, whether the morphological analyzer produces at least one analysis
    (or, for a few deliberately wrong forms, none at all).
    """

    fsm: FsmMorphologicalAnalyzer

    def setUp(self) -> None:
        self.fsm = FsmMorphologicalAnalyzer()

    def _assertParsed(self, *surfaceForms):
        # Each surface form, in the given order, must have a non-empty analysis.
        for surfaceForm in surfaceForms:
            self.assertTrue(self.fsm.morphologicalAnalysis(surfaceForm).size() != 0)

    def _assertNotParsed(self, *surfaceForms):
        # Each surface form, in the given order, must have no analysis at all.
        for surfaceForm in surfaceForms:
            self.assertEqual(0, self.fsm.morphologicalAnalysis(surfaceForm).size())

    def test_NumberWithAccusative(self):
        self._assertParsed("2'yi")
        self._assertNotParsed("2'i")
        self._assertParsed("5'i", "9'u", "10'u", "30'u", "3'ü", "4'ü", "100'ü", "6'yı")
        self._assertNotParsed("6'ı")
        self._assertParsed("40'ı", "60'ı", "90'ı")

    def test_NumberWithDative(self):
        self._assertParsed("6'ya")
        self._assertNotParsed("6'a")
        self._assertParsed("9'a", "10'a", "30'a", "40'a", "60'a", "90'a", "2'ye")
        self._assertNotParsed("2'e")
        self._assertParsed("8'e", "5'e", "4'e", "1'e", "3'e", "7'ye")
        self._assertNotParsed("7'e")

    def test_PresentTense(self):
        self._assertParsed("büyülüyor", "bölümlüyor", "buğuluyor", "bulguluyor", "açıklıyor", "çalkalıyor")

    def test_A(self):
        self._assertParsed("alkole", "anormale", "sakala", "kabala", "faika", "halika", "kediye", "eve")

    def test_C(self):
        self._assertParsed("gripçi", "güllaççı", "gülütçü", "gülükçü")

    def test_SH(self):
        self._assertParsed("altışar", "yedişer", "üçer", "beşer", "dörder")

    def test_NumberWithD(self):
        self._assertParsed("1'di", "2'ydi", "3'tü", "4'tü", "5'ti", "6'ydı", "7'ydi", "8'di", "9'du", "30'du",
                           "40'tı", "60'tı", "70'ti", "50'ydi")

    def test_D(self):
        self._assertParsed("koştu", "kitaptı", "kaçtı", "evdi", "fraktı", "sattı", "aftı", "kesti", "ahtı")

    def test_Exceptions(self):
        self._assertParsed("yiyip", "sana", "bununla")
        self._assertNotParsed("buyla")
        self._assertParsed("onunla", "şununla")
        self._assertNotParsed("şuyla")
        self._assertParsed("bana")

    def test_VowelEChangesToIDuringYSuffixation(self):
        self._assertParsed("diyor", "yiyor")

    def test_LastIdropsDuringPassiveSuffixation(self):
        self._assertParsed("yoğruldu", "buyruldu")

    def test_ShowsSuRegularities(self):
        self._assertParsed("karasuyu", "suyu")

    def test_DuplicatesDuringSuffixation(self):
        self._assertParsed("tıbbı", "ceddi", "zıddı", "serhaddi", "fenni", "haddi", "hazzı", "şakkı", "şakı",
                           "halli", "hali")

    def test_LastIdropsDuringSuffixation(self):
        self._assertParsed("hizbi", "kaybı", "ahdi", "nesci", "zehri", "zikri", "metni", "metini", "katli",
                           "katili")

    def test_NounSoftenDuringSuffixation(self):
        self._assertParsed("adabı", "amibi", "armudu", "ağacı", "akacı", "arkeoloğu", "filoloğu", "ahengi",
                           "küngü", "kitaplığı", "küllüğü", "adedi", "adeti", "ağıdı", "ağıtı", "anotu",
                           "anodu", "Kuzguncuk'u", "Leylak'ı")

    def test_VerbSoftenDuringSuffixation(self):
        self._assertParsed("cezbediyor", "ediyor", "bahsediyor")
def setUp(self) -> None:
    """
    Analyses fourteen surface forms with a default-constructed analyzer and stores the analysis results as
    self.parse1 ... self.parse14, in the order the surface forms are listed below.
    """
    fsm = FsmMorphologicalAnalyzer()
    surfaceForms = (
        "açılır",
        "koparılarak",
        "toplama",
        "değerlendirmede",
        "soruşturmasının",
        "karşılaştırmalı",
        "esaslarını",
        "güçleriyle",
        "bulmayacakları",
        "kitabı",
        "kitapları",
        "o",
        "arabası",
        "sana",
    )
    for number, surfaceForm in enumerate(surfaceForms, start=1):
        setattr(self, "parse" + str(number), fsm.morphologicalAnalysis(surfaceForm))
def setUp(self) -> None:
    """
    Creates the analyzer for each test from the dictionary, misspellings and finite state machine files in
    the parent directory, stored as self.fsm.
    """
    dictionaryFile = "../turkish_dictionary.txt"
    misspellingsFile = "../turkish_misspellings.txt"
    finiteStateMachineFile = "../turkish_finite_state_machine.xml"
    self.fsm = FsmMorphologicalAnalyzer(dictionaryFile, misspellingsFile, finiteStateMachineFile)
def constructSynSets(self, word: str, parse: MorphologicalParse, metaParse: MetamorphicParse,
                     fsm: FsmMorphologicalAnalyzer) -> list:
    """
    Creates a list of SynSets for a word, using its morphological parse when one is available.

    With an empty parse the SynSets of the literal word are returned. Otherwise a marker SynSet is added for
    every special category the parse belongs to (proper noun, time, date, hashtag, email, ordinal, percent,
    fraction, range, real), and then:

    * if the parse is not punctuation, cardinal or real, the SynSets of every possible word produced by the
      analyzer are added, filtered by part of speech; if nothing has been collected at that point, all
      SynSets of the possible words are added unfiltered;
    * otherwise the SynSets of the literal word are added.

    A cardinal parse that still has no SynSet receives the "(tam sayı)" marker SynSet.

    PARAMETERS
    ----------
    word : str
        literal String to get SynSets with
    parse : MorphologicalParse
        morphological parse to get SynSets with proper literals
    metaParse : MetamorphicParse
        metamorphic parse to get possible words
    fsm : FsmMorphologicalAnalyzer
        finite state machine morphological analyzer to be used at getting possible words

    RETURNS
    -------
    list
        A list of SynSets
    """
    result = []
    if parse.size() > 0:
        # Marker SynSets: each is looked up by a fixed literal with sense 1.
        if parse.isProperNoun():
            result.append(self.getSynSetWithLiteral("(özel isim)", 1))
        if parse.isTime():
            result.append(self.getSynSetWithLiteral("(zaman)", 1))
        if parse.isDate():
            result.append(self.getSynSetWithLiteral("(tarih)", 1))
        if parse.isHashTag():
            result.append(self.getSynSetWithLiteral("(hashtag)", 1))
        if parse.isEmail():
            result.append(self.getSynSetWithLiteral("(email)", 1))
        if parse.isOrdinal():
            result.append(self.getSynSetWithLiteral("(sayı sıra sıfatı)", 1))
        if parse.isPercent():
            result.append(self.getSynSetWithLiteral("(yüzde)", 1))
        if parse.isFraction():
            result.append(self.getSynSetWithLiteral("(kesir sayı)", 1))
        if parse.isRange():
            result.append(self.getSynSetWithLiteral("(sayı aralığı)", 1))
        if parse.isReal():
            result.append(self.getSynSetWithLiteral("(reel sayı)", 1))
        if not parse.isPunctuation() and not parse.isCardinal() and not parse.isReal():
            possibleWords = fsm.getPossibleWords(parse, metaParse)
            for possibleWord in possibleWords:
                synSets = self.getSynSetsWithLiteral(possibleWord)
                if len(synSets) > 0:
                    for synSet in synSets:
                        # Filter by part of speech only when the SynSet has one and the parse carries one of
                        # the five tags checked here; everything else is accepted as is.
                        if synSet.getPos() is not None and (parse.getPos() == "NOUN" or
                                                            parse.getPos() == "ADVERB" or
                                                            parse.getPos() == "VERB" or
                                                            parse.getPos() == "ADJ" or
                                                            parse.getPos() == "CONJ"):
                            # A SynSet of one of these five kinds is kept only if the parse's tag or its
                            # root tag is the matching one.
                            if synSet.getPos() == Pos.NOUN:
                                if parse.getPos() == "NOUN" or parse.getRootPos() == "NOUN":
                                    result.append(synSet)
                            elif synSet.getPos() == Pos.ADVERB:
                                if parse.getPos() == "ADVERB" or parse.getRootPos() == "ADVERB":
                                    result.append(synSet)
                            elif synSet.getPos() == Pos.VERB:
                                if parse.getPos() == "VERB" or parse.getRootPos() == "VERB":
                                    result.append(synSet)
                            elif synSet.getPos() == Pos.ADJECTIVE:
                                if parse.getPos() == "ADJ" or parse.getRootPos() == "ADJ":
                                    result.append(synSet)
                            elif synSet.getPos() == Pos.CONJUNCTION:
                                if parse.getPos() == "CONJ" or parse.getRootPos() == "CONJ":
                                    result.append(synSet)
                            else:
                                # SynSet of any other part of speech: no filtering.
                                result.append(synSet)
                        else:
                            result.append(synSet)
            # Nothing collected so far (neither markers nor filtered SynSets): fall back to every SynSet of
            # the possible words, without the part-of-speech filter.
            if len(result) == 0:
                for possibleWord in possibleWords:
                    synSets = self.getSynSetsWithLiteral(possibleWord)
                    result.extend(synSets)
        else:
            # Punctuation, cardinal or real parse: look the literal word up directly.
            result.extend(self.getSynSetsWithLiteral(word))
        if parse.isCardinal() and len(result) == 0:
            result.append(self.getSynSetWithLiteral("(tam sayı)", 1))
    else:
        # Empty parse: look the literal word up directly.
        result.extend(self.getSynSetsWithLiteral(word))
    return result
class FsmParseTest(unittest.TestCase):
    """
    Checks the lemma, transition list, with-list and suffix list of the first parse of ten surface forms.
    """

    parse1: FsmParse
    parse2: FsmParse
    parse3: FsmParse
    parse4: FsmParse
    parse5: FsmParse
    parse6: FsmParse
    parse7: FsmParse
    parse8: FsmParse
    parse9: FsmParse
    parse10: FsmParse

    def _firstParse(self, surfaceForm):
        # First parse (index 0) among the analyses of the given surface form.
        return self.fsm.morphologicalAnalysis(surfaceForm).getFsmParse(0)

    def setUp(self) -> None:
        self.fsm = FsmMorphologicalAnalyzer()
        self.parse1 = self._firstParse("açılır")
        self.parse2 = self._firstParse("koparılarak")
        self.parse3 = self._firstParse("toplama")
        self.parse4 = self._firstParse("değerlendirmede")
        self.parse5 = self._firstParse("soruşturmasının")
        self.parse6 = self._firstParse("karşılaştırmalı")
        self.parse7 = self._firstParse("esaslarını")
        self.parse8 = self._firstParse("güçleriyle")
        self.parse9 = self._firstParse("bulmayacakları")
        self.parse10 = self._firstParse("mü")

    def test_GetLastLemmaWithTag(self):
        cases = (
            ("açıl", self.parse1, "VERB"),
            ("koparıl", self.parse2, "VERB"),
            ("değerlendir", self.parse4, "VERB"),
            ("soruştur", self.parse5, "VERB"),
            ("karşı", self.parse6, "ADJ"),
        )
        for expected, parse, tag in cases:
            self.assertEqual(expected, parse.getLastLemmaWithTag(tag))

    def test_GetLastLemma(self):
        cases = (
            ("açıl", self.parse1),
            ("koparılarak", self.parse2),
            ("değerlendirme", self.parse4),
            ("soruşturma", self.parse5),
            ("karşılaştır", self.parse6),
        )
        for expected, parse in cases:
            self.assertEqual(expected, parse.getLastLemma())

    def test_GetTransitionList(self):
        cases = (
            ("aç+VERB^DB+VERB+PASS+POS+AOR+A3SG", self.parse1),
            ("kop+VERB^DB+VERB+CAUS^DB+VERB+PASS+POS^DB+ADV+BYDOINGSO", self.parse2),
            ("topla+NOUN+A3SG+P1SG+DAT", self.parse3),
            ("değer+NOUN+A3SG+PNON+NOM^DB+VERB+ACQUIRE^DB+VERB+CAUS+POS^DB+NOUN+INF2+A3SG+PNON+LOC",
             self.parse4),
            ("sor+VERB+RECIP^DB+VERB+CAUS+POS^DB+NOUN+INF2+A3SG+P3SG+GEN", self.parse5),
            ("karşı+ADJ^DB+VERB+BECOME^DB+VERB+CAUS+POS+NECES+A3SG", self.parse6),
            ("esas+ADJ^DB+NOUN+ZERO+A3PL+P2SG+ACC", self.parse7),
            ("güç+ADJ^DB+NOUN+ZERO+A3PL+P3PL+INS", self.parse8),
            ("bul+VERB+NEG^DB+ADJ+FUTPART+P3PL", self.parse9),
            ("mi+QUES+PRES+A3SG", self.parse10),
        )
        for expected, parse in cases:
            self.assertEqual(expected, str(parse))

    def test_WithList(self):
        cases = (
            ("aç+Hl+Hr", self.parse1),
            ("kop+Ar+Hl+yArAk", self.parse2),
            ("topla+Hm+yA", self.parse3),
            ("değer+lAn+DHr+mA+DA", self.parse4),
            ("sor+Hs+DHr+mA+sH+nHn", self.parse5),
            ("karşı+lAs+DHr+mAlH", self.parse6),
            ("esas+lAr+Hn+yH", self.parse7),
            ("güç+lArH+ylA", self.parse8),
            ("bul+mA+yAcAk+lArH", self.parse9),
        )
        for expected, parse in cases:
            self.assertEqual(expected, parse.withList())

    def test_SuffixList(self):
        cases = (
            ("VerbalRoot(F5PR)(aç)+PassiveHl(açıl)+OtherTense2(açılır)", self.parse1),
            ("VerbalRoot(F1P1)(kop)+CausativeAr(kopar)+PassiveHl(koparıl)+Adverb1(koparılarak)", self.parse2),
            ("NominalRoot(topla)+Possessive(toplam)+Case1(toplama)", self.parse3),
            ("NominalRoot(değer)+VerbalRoot(F5PR)(değerlen)+CausativeDHr(değerlendir)"
             "+NominalRoot(değerlendirme)+Case1(değerlendirmede)", self.parse4),
            ("VerbalRoot(F5PR)(sor)+Reciprocal(soruş)+CausativeDHr(soruştur)+NominalRoot(soruşturma)"
             "+Possessive3(soruşturması)+Case1(soruşturmasının)", self.parse5),
            ("AdjectiveRoot(karşı)+VerbalRoot(F5PR)(karşılaş)+CausativeDHr(karşılaştır)"
             "+OtherTense(karşılaştırmalı)", self.parse6),
            ("AdjectiveRoot(esas)+Plural(esaslar)+Possessive(esasların)+AccusativeNoun(esaslarını)",
             self.parse7),
            ("AdjectiveRoot(güç)+Possesive3(güçleri)+Case1(güçleriyle)", self.parse8),
            ("VerbalRoot(F5PW)(bul)+Negativema(bulma)+AdjectiveParticiple(bulmayacak)"
             "+Adjective(bulmayacakları)", self.parse9),
        )
        for expected, parse in cases:
            self.assertEqual(expected, parse.suffixList())
class FsmMorphologicalAnalyzerTest(unittest.TestCase):
    """Checks that the analyzer returns at least one parse for a set of inputs.

    Covers literal number/date/time tokens, upper-cased proper nouns, and
    surface forms generated from dictionary words that carry particular
    suffixation flags.  Every failing assertion names the surface form that
    produced no analysis, so a failure inside a dictionary-wide loop can be
    traced to the offending word.
    """

    fsm: FsmMorphologicalAnalyzer

    # Maps the penultimate letter of a softened portmanteau name to the
    # letter that replaces the last two characters when rebuilding its root.
    _PORTMANTEAU_HARDENING = {"b": "p", "c": "ç", "d": "t", "ğ": "k"}

    def setUp(self) -> None:
        """Build a fresh analyzer from the dictionary, misspelling and FSM files."""
        self.fsm = FsmMorphologicalAnalyzer(
            "../turkish_dictionary.txt", "../turkish_misspellings.txt",
            "../turkish_finite_state_machine.xml")

    def _assertAnalyzable(self, surfaceForm) -> None:
        """Fail, naming surfaceForm, unless the analyzer returns a parse for it."""
        self.assertTrue(
            self.fsm.morphologicalAnalysis(surfaceForm).size() != 0,
            f"no morphological analysis for {surfaceForm!r}")

    def _txtWords(self):
        """Yield, in index order, every dictionary entry that is a TxtWord."""
        dictionary = self.fsm.getDictionary()
        for i in range(dictionary.size()):
            word = dictionary.getWordWithIndex(i)
            if isinstance(word, TxtWord):
                yield word

    def _assertTransitionAnalyzable(self, selects, startName, targetName,
                                    suffix, tag) -> None:
        """Apply one transition to each selected word and require a parse.

        :param selects: predicate taking a TxtWord; only words for which it
            returns a true value are checked.
        :param startName: name given to the start State.
        :param targetName: name given to the State handed to the Transition.
        :param suffix: first positional argument of the Transition.
        :param tag: third positional argument of the Transition.
        """
        for word in self._txtWords():
            if not selects(word):
                continue
            # Objects are rebuilt for every word, in the same order as the
            # hand-written tests this helper replaces.
            transitionState = State(targetName, False, False)
            startState = State(startName, True, False)
            transition = Transition(suffix, transitionState, tag)
            surfaceForm = transition.makeTransition(
                word, word.getName(), startState)
            self._assertAnalyzable(surfaceForm)

    def _assertNominalAccAnalyzable(self, hasFlag) -> None:
        """Check the "yH"/"ACC" transition on nominal words satisfying hasFlag."""
        self._assertTransitionAnalyzable(
            lambda word: word.isNominal() and hasFlag(word),
            "NominalRoot", "Possessive", "yH", "ACC")

    def _assertVerbalAnalyzable(self, hasFlag, suffix, tag) -> None:
        """Check a VerbalRoot -> VerbalStem transition on verbs satisfying hasFlag."""
        self._assertTransitionAnalyzable(
            lambda word: word.isVerb() and hasFlag(word),
            "VerbalRoot", "VerbalStem", suffix, tag)

    def test_morphologicalAnalysisDataTimeNumber(self):
        """Fractions, dates, percentages, clock times and plain numbers parse."""
        tokens = (
            "3/4", "3\\/4",
            "4/2/1973", "14/2/1993", "14/12/1933", "6/12/1903",
            "%34.5", "%3", "%56",
            "2:3", "12:3", "4:23", "11:56",
            "1:2:3", "3:12:3", "5:4:23", "7:11:56",
            "12:2:3", "10:12:3", "11:4:23", "22:11:56",
            "45", "34.23",
        )
        for token in tokens:
            self._assertAnalyzable(token)

    def test_morphologicalAnalysisProperNoun(self):
        """Every proper noun parses when written fully in upper case."""
        for word in self._txtWords():
            if word.isProperNoun():
                # "i" is replaced by "İ" before upper() so that it does not
                # become a plain "I".
                self._assertAnalyzable(
                    word.getName().replace("i", "İ").upper())

    def test_morphologicalAnalysisNounSoftenDuringSuffixation(self):
        self._assertNominalAccAnalyzable(
            lambda word: word.nounSoftenDuringSuffixation())

    def test_morphologicalAnalysisVowelAChangesToIDuringYSuffixation(self):
        self._assertVerbalAnalyzable(
            lambda word: word.vowelAChangesToIDuringYSuffixation(),
            "Hyor", "PROG1")

    def test_morphologicalAnalysisIsPortmanteau(self):
        """Portmanteau nominals parse after "lArH" is applied to their root."""
        for word in self._txtWords():
            if not (word.isNominal() and word.isPortmanteau()
                    and not word.isPlural()
                    and not word.isPortmanteauFacedVowelEllipsis()):
                continue
            transitionState = State("CompoundNounRoot", True, False)
            startState = State("CompoundNounRoot", True, False)
            transition = Transition("lArH", transitionState, "A3PL+P3PL")
            name = word.getName()
            exceptLast2 = name[:len(name) - 2]
            exceptLast = name[:len(name) - 1]
            if word.isPortmanteauFacedSoftening():
                # len(name) - 2 (not -2) keeps the original indexing for
                # very short names.
                penultimate = name[len(name) - 2]
                if penultimate in self._PORTMANTEAU_HARDENING:
                    rootForm = exceptLast2 + \
                        self._PORTMANTEAU_HARDENING[penultimate]
                else:
                    rootForm = exceptLast
            elif word.isPortmanteauEndingWithSI():
                rootForm = exceptLast2
            else:
                rootForm = exceptLast
            surfaceForm = transition.makeTransition(
                word, rootForm, startState)
            self._assertAnalyzable(surfaceForm)

    def test_morphologicalAnalysisNotObeysVowelHarmonyDuringAgglutination(
            self):
        self._assertNominalAccAnalyzable(
            lambda word: word.notObeysVowelHarmonyDuringAgglutination())

    def test_morphologicalAnalysisLastIdropsDuringSuffixation(self):
        self._assertNominalAccAnalyzable(
            lambda word: word.lastIdropsDuringSuffixation())

    def test_morphologicalAnalysisVerbSoftenDuringSuffixation(self):
        self._assertVerbalAnalyzable(
            lambda word: word.verbSoftenDuringSuffixation(),
            "Hyor", "PROG1")

    def test_morphologicalAnalysisDuplicatesDuringSuffixation(self):
        self._assertNominalAccAnalyzable(
            lambda word: word.duplicatesDuringSuffixation())

    def test_morphologicalAnalysisEndingKChangesIntoG(self):
        self._assertNominalAccAnalyzable(
            lambda word: word.endingKChangesIntoG())

    def test_morphologicalAnalysisLastIdropsDuringPassiveSuffixation(self):
        self._assertVerbalAnalyzable(
            lambda word: word.lastIdropsDuringPassiveSuffixation(),
            "Hl", "^DB+VERB+PASS")