def setUp(self) -> None:
    """Build the six-word fixture sentence shared by all tests."""
    self.sentence = Sentence()
    for token in ["ali", "topu", "at", "mehmet", "ayşeyle", "gitti"]:
        self.sentence.addWord(Word(token))
def test_Deasciify2(self):
    """Deasciify short two-word phrases with the no-smoothing n-gram model."""
    fsm = FsmMorphologicalAnalyzer("../turkish_dictionary.txt",
                                   "../turkish_misspellings.txt",
                                   "../turkish_finite_state_machine.xml")
    nGram = NGram("../ngram.txt")
    nGram.calculateNGramProbabilitiesSimple(NoSmoothing())
    nGramDeasciifier = NGramDeasciifier(fsm, nGram, False)
    cases = [("noter hakkinda", "noter hakkında"),
             ("sandik medrese", "sandık medrese"),
             ("kuran'ı karsilikli", "kuran'ı karşılıklı")]
    for asciified, expected in cases:
        self.assertEqual(expected,
                         nGramDeasciifier.deasciify(Sentence(asciified)).__str__())
def __init__(self, sentence: Sentence, index: int):
    """
    Discrete attribute for a given word. Returns the nearest predicate word
    to the given word

    PARAMETERS
    ----------
    sentence : Sentence
        Sentence where current word is in.
    index : int
        Position of the current word in the sentence
    """
    if isinstance(sentence, AnnotatedSentence):
        # NOTE(review): the return value of getPredicate is discarded, so this
        # constructor has no observable effect beyond the call itself —
        # presumably the predicate should be forwarded to the superclass
        # constructor; verify against the attribute base class.
        sentence.getPredicate(index)
def checkAnalysisAndSetRoot(self, sentence: Sentence, index: int) -> Word: """ Checks the morphological analysis of the given word in the given index. If there is no misspelling, it returns the longest root word of the possible analyses. @param sentence Sentence to be analyzed. @param index Index of the word @return If the word is misspelled, null; otherwise the longest root word of the possible analyses. """ if index < sentence.wordCount(): fsmParses = self.fsm.morphologicalAnalysis( sentence.getWord(index).getName()) if fsmParses.size() != 0: return fsmParses.getParseWithLongestRootWord().getWord() return None
def addSentence(self, s: Sentence): """ The addSentence method takes a Sentence as an input. It adds given input to sentences list and loops through the each word in sentence and puts these words into wordList CounterHashMap. PARAMETERS ---------- s : Sentence Sentence type input that will be added to sentences list and its words will be added to wordList CounterHashMap. """ self.sentences.append(s) for i in range(s.wordCount()): w = s.getWord(i) self.wordList.put(w)
def generateInstanceFromSentence(self, sentence: Sentence, wordIndex: int) -> Instance:
    """
    Generates a single word-sense classification instance for the word at
    wordIndex: the instance is labeled with the word's annotated sense and
    carries the ids of all candidate synsets as possible class labels.

    PARAMETERS
    ----------
    sentence : Sentence
        Input sentence (must be an AnnotatedSentence to produce an instance).
    wordIndex : int
        The index of the word in the sentence.

    RETURNS
    -------
    Instance
        Composite classification instance; implicitly None when the sentence
        is not annotated or the word is not an AnnotatedWord.
    """
    if isinstance(sentence, AnnotatedSentence):
        # Candidate senses for the word, computed from WordNet via the FSM.
        possibleSynSets = sentence.constructSynSets(self.__wordNet, self.__fsm, wordIndex)
        word = sentence.getWord(wordIndex)
        if isinstance(word, AnnotatedWord):
            # Gold label is the manually annotated sense of the word.
            classLabel = word.getSemantic()
            current = CompositeInstance(classLabel)
            possibleClassLabels = []
            for synSet in possibleSynSets:
                possibleClassLabels.append(synSet.getId())
            current.setPossibleClassLabels(possibleClassLabels)
            self.addAttributes(current, sentence, wordIndex)
            return current
def addAttributes(self, current: Instance, sentence: Sentence, wordIndex: int): """ addAttributes adds all attributes of the previous words, the current wordn, and next words of the given word to the given instance. If the previous or next words does not exists, the method calls addAttributesForEmptyWords method. If the word does not exists in the dictionary or the required annotation layer does not exists in the annotated word, the method throws InstanceNotGenerated. The window size determines the number of previous and next words. PARAMETERS ---------- current : Instance Current classification instance to which attributes will be added. sentence : Sentence Input sentence. wordIndex : int The index of the word in the sentence. """ for i in range(self.windowSize): if wordIndex - self.windowSize + i >= 0: self.addAttributesForWords(current, sentence, wordIndex - self.windowSize + i) else: self.addAttributesForEmptyWords(current, "<s>") self.addAttributesForWords(current, sentence, wordIndex) for i in range(self.windowSize): if wordIndex + i + 1 < sentence.wordCount(): self.addAttributesForWords(current, sentence, wordIndex + i + 1) else: self.addAttributesForEmptyWords(current, "</s>")
def generateInstanceFromSentence(self, sentence: Sentence, wordIndex: int) -> Instance:
    """
    Generates a single classification instance of the morphological
    disambiguation problem for the given word of the given sentence. Only
    previous words contribute attributes; out-of-range positions are padded
    with the "<s>" marker.

    PARAMETERS
    ----------
    sentence : Sentence
        Input sentence.
    wordIndex : int
        The index of the word in the sentence.

    RETURNS
    -------
    Instance
        Classification instance, or None when the word is not annotated.
    """
    word = sentence.getWord(wordIndex)
    if not isinstance(word, AnnotatedWord):
        return None
    # The class label is the word's morphological transition list.
    instance = Instance(word.getParse().getTransitionList())
    for position in range(wordIndex - self.windowSize, wordIndex):
        if position >= 0:
            self.addAttributesForPreviousWords(instance, sentence, position)
        else:
            self.addAttributesForEmptyWords(instance, "<s>")
    self.addAttributesForPreviousWords(instance, sentence, wordIndex)
    return instance
def generateInstanceFromSentence(self, sentence: Sentence, wordIndex: int) -> Instance:
    """
    Builds a named-entity-recognition classification instance for the word
    at wordIndex, labeled with the word's named-entity type.

    PARAMETERS
    ----------
    sentence : Sentence
        Input sentence.
    wordIndex : int
        The index of the word in the sentence.

    RETURNS
    -------
    Instance
        Classification instance, or None when the word is not annotated.
    """
    word = sentence.getWord(wordIndex)
    if not isinstance(word, AnnotatedWord):
        return None
    label = NamedEntityType.getNamedEntityString(word.getNamedEntityType())
    instance = Instance(label)
    self.addAttributes(instance, sentence, wordIndex)
    return instance
def spellCheck(self, sentence: Sentence) -> Sentence:
    """
    Spell-checks a sentence with a root-word n-gram model. Each word is
    analyzed morphologically; analyzable words are copied unchanged and their
    longest root becomes the bigram context for the next word. For an
    unanalyzable word, every spelling candidate is scored by the n-gram
    probability of its longest root (conditioned on the previous root when
    one exists) and the best-scoring candidate replaces the word.

    PARAMETERS
    ----------
    sentence : Sentence
        Sentence type input.

    RETURNS
    -------
    Sentence
        Sentence result.
    """
    previousRoot = None
    result = Sentence()
    for i in range(sentence.wordCount()):
        word = sentence.getWord(i)
        fsmParses = self.fsm.morphologicalAnalysis(word.getName())
        if fsmParses.size() == 0:
            # Word is misspelled: score each candidate by n-gram probability.
            candidates = self.candidateList(word)
            # Fall back to the original word when no candidate beats 0.0.
            bestCandidate = word.getName()
            bestRoot = word
            bestProbability = 0.0
            for candidate in candidates:
                # NOTE: fsmParses is reused for the candidate's analyses here.
                fsmParses = self.fsm.morphologicalAnalysis(candidate)
                root = fsmParses.getParseWithLongestRootWord().getWord()
                if previousRoot is not None:
                    # Bigram probability conditioned on the previous root.
                    probability = self.__nGram.getProbability(
                        previousRoot.getName(), root.getName())
                else:
                    # First scored word: unigram probability of the root.
                    probability = self.__nGram.getProbability(root.getName())
                if probability > bestProbability:
                    bestCandidate = candidate
                    bestRoot = root
                    bestProbability = probability
            previousRoot = bestRoot
            result.addWord(Word(bestCandidate))
        else:
            # Correctly spelled: keep the word; its longest root is the new context.
            result.addWord(word)
            previousRoot = fsmParses.getParseWithLongestRootWord().getWord()
    return result
def test_SpellCheckSurfaceForm(self):
    """Surface-form n-gram spell checking on three short phrases."""
    fsm = FsmMorphologicalAnalyzer("../turkish_dictionary.txt",
                                   "../turkish_misspellings.txt",
                                   "../turkish_finite_state_machine.xml")
    nGram = NGram("../ngram.txt")
    nGram.calculateNGramProbabilitiesSimple(NoSmoothing())
    nGramSpellChecker = NGramSpellChecker(fsm, nGram, False)
    for misspelled, expected in [("noter hakkınad", "noter hakkında"),
                                 ("arçelik'in çamşaır", "arçelik'in çamaşır"),
                                 ("ruhset yanında", "ruhsat yanında")]:
        self.assertEqual(expected,
                         nGramSpellChecker.spellCheck(Sentence(misspelled)).__str__())
def spellCheck(self, sentence: Sentence) -> Sentence:
    """
    Replaces every morphologically unanalyzable word of the sentence with a
    randomly chosen spelling candidate; analyzable words and words without
    candidates are copied unchanged.

    PARAMETERS
    ----------
    sentence : Sentence
        Sentence type input.

    RETURNS
    -------
    Sentence
        Sentence result.
    """
    corrected = Sentence()
    for position in range(sentence.wordCount()):
        word = sentence.getWord(position)
        replacement = word
        if self.fsm.morphologicalAnalysis(word.getName()).size() == 0:
            options = self.candidateList(word)
            if len(options) > 0:
                # Pick one candidate uniformly at random.
                replacement = Word(options[randrange(len(options))])
        corrected.addWord(replacement)
    return corrected
def deasciify(self, sentence: Sentence) -> Sentence:
    """
    Deasciifies a sentence word by word: every morphologically unanalyzable
    word is replaced by a randomly chosen deasciification candidate, while
    analyzable words and words without candidates pass through unchanged.

    PARAMETERS
    ----------
    sentence : Sentence
        Sentence type input.

    RETURNS
    -------
    Sentence
        result Sentence.
    """
    deasciified = Sentence()
    for position in range(sentence.wordCount()):
        word = sentence.getWord(position)
        replacement = word
        if self.fsm.morphologicalAnalysis(word.getName()).size() == 0:
            options = self.candidateList(word)
            if len(options) > 0:
                # Pick one candidate uniformly at random.
                replacement = Word(options[randrange(len(options))])
        deasciified.addWord(replacement)
    return deasciified
def __init__(self, corpus, example, pdf_path=None):
    """
    Runs the requested tool on the given corpus and stores the outcome in
    self.result. When a tool yields None, a Turkish explanation string is
    stored instead.

    PARAMETERS
    ----------
    corpus : str
        Input text (a word or a sentence depending on the tool).
    example : Tool
        Which tool to run.
    pdf_path : str
        Path of the PDF file, used only by CONVERT_PDF_TO_TXT.
    """
    self.corpus = corpus
    self.example = example
    self.result = None
    self.pdf_path = pdf_path
    # example holds a single Tool value, so the original if-chain is an
    # elif dispatch: exactly one branch can match.
    if self.example == Tool.KELIMEYI_OGELERINE_AYIR:
        self.result = zemberekTool.ogelere_ayir(corpus)
        if self.result is None:
            self.result = "Cümle yerine kelime girmeniz gerekiyor veya girdiğiniz kelime yanlış"
    elif self.example == Tool.CUMLEDE_GECEN_KOKLERI_BUL:
        self.result = zemberekTool.metinde_gecen_kokleri_bul(self.corpus)
    elif self.example == Tool.CUMLEYI_PARCALARA_AYIR:
        self.result = zemberekTool.cumleyi_parcalara_ayir(self.corpus)
    elif self.example == Tool.KELIME_ONERICI:
        self.result = zemberekTool.kelime_onerici(self.corpus)
        if self.result is None:
            self.result = "Cümle yerine kelime girmeniz gerekiyor"
    elif self.example == Tool.KELIME_HECELE:
        self.result = zemberekTool.kelime_hecele(self.corpus)
        if self.result is None:
            self.result = "Cümle yerine kelime girmeniz gerekiyor"
    elif self.example == Tool.NLTK_FILES_DOWNLOAD:
        self.result = nltk_download()
    elif self.example == Tool.PERSONIFICATION_COPULA:
        self.result = personal(self.corpus, Person.FIRST, is_plural=True)
        if self.result is None:
            self.result = "Cümle yerine kelime girmeniz gerekiyor"
    elif self.example == Tool.INFERENTIAL_MOOD:
        self.result = inferential(self.corpus, Person.SECOND, is_plural=False)
        if self.result is None:
            self.result = "Cümle yerine kelime girmeniz gerekiyor"
    elif self.example == Tool.CONVERT_PDF_TO_TXT:
        self.result = pdfconverter.PDFParser(pdf_path).parse()
        if self.result is None:
            self.result = "PDF path yanlış olabilir veya PDF olmayabilir"
    elif self.example == Tool.SENTENCE_CORRECTOR:
        # Spell-check the corpus with the FSM-backed simple spell checker.
        fsm = FsmMorphologicalAnalyzer("./SpellChecker/turkish_dictionary.txt",
                                       "./SpellChecker/turkish_misspellings.txt",
                                       "./SpellChecker/turkish_finite_state_machine.xml")
        spellChecker = SimpleSpellChecker(fsm)
        sentence = Sentence(self.corpus)
        self.result = spellChecker.spellCheck(sentence)
def test_SpellCheck(self):
    """
    Runs the simple spell checker over every (misspelled, corrected) pair in
    misspellings.txt and checks that each misspelling is corrected.
    """
    fsm = FsmMorphologicalAnalyzer("../turkish_dictionary.txt",
                                   "../turkish_misspellings.txt",
                                   "../turkish_finite_state_machine.xml")
    simpleSpellChecker = SimpleSpellChecker(fsm)
    # 'with' guarantees the handle is closed even when an assertion fails;
    # the original left the file open and shadowed the builtin 'input'.
    with open("../misspellings.txt") as misspellingsFile:
        lines = misspellingsFile.readlines()
    for line in lines:
        items = line.strip().split(" ")
        misspelled = items[0]
        corrected = items[1]
        self.assertEqual(
            corrected,
            simpleSpellChecker.spellCheck(Sentence(misspelled)).toString())
def test_Deasciify(self):
    """
    For every analyzable corpus word whose asciified form differs from the
    original, deasciifies the two-word context and expects the original
    word back.
    """
    fsm = FsmMorphologicalAnalyzer("../turkish_dictionary.txt",
                                   "../turkish_misspellings.txt",
                                   "../turkish_finite_state_machine.xml")
    nGram = NGram("../ngram.txt")
    nGram.calculateNGramProbabilitiesSimple(NoSmoothing())
    nGramDeasciifier = NGramDeasciifier(fsm, nGram, True)
    simpleAsciifier = SimpleAsciifier()
    corpus = Corpus("../corpus.txt")
    for sentenceIndex in range(corpus.sentenceCount()):
        sentence = corpus.getSentence(sentenceIndex)
        for wordIndex in range(1, sentence.wordCount()):
            current = sentence.getWord(wordIndex)
            if fsm.morphologicalAnalysis(current.getName()).size() == 0:
                continue
            if simpleAsciifier.asciifyWord(current) == current.getName():
                continue
            context = sentence.getWord(wordIndex - 1).getName() + " " + current.getName()
            deasciified = nGramDeasciifier.deasciify(Sentence(context))
            self.assertEqual(current.getName(), deasciified.getWord(1).getName())
def test_Deasciify(self):
    """
    For every dictionary entry containing a Turkish-specific lowercase letter
    (skipping words ending in "fulü"), asciifies it and — when the asciified
    form has exactly one deasciification candidate — expects the simple
    deasciifier to recover the original word.
    """
    fsm = FsmMorphologicalAnalyzer("../turkish_dictionary.txt",
                                   "../turkish_misspellings.txt",
                                   "../turkish_finite_state_machine.xml")
    simpleDeasciifier = SimpleDeasciifier(fsm)
    simpleAsciifier = SimpleAsciifier()
    for i in range(fsm.getDictionary().size()):
        word = fsm.getDictionary().getWordWithIndex(i)
        name = word.getName()
        hasTurkishLetter = any(ch in "çöğüşı" for ch in name)
        if hasTurkishLetter and not name.endswith("fulü") and \
                (word.isNominal() or word.isAdjective() or word.isAdverb() or word.isVerb()):
            asciified = simpleAsciifier.asciifyWord(word)
            if len(simpleDeasciifier.candidateList(Word(asciified))) == 1:
                deasciified = simpleDeasciifier.deasciify(Sentence(asciified)).toString()
                self.assertEqual(name, deasciified)
def generateInstanceFromSentence(self, sentence: Sentence, wordIndex: int) -> Instance:
    """
    Generates a single classification instance of the shallow parse problem
    for the given word of the given sentence.

    PARAMETERS
    ----------
    sentence : Sentence
        Input sentence.
    wordIndex : int
        The index of the word in the sentence.

    RETURNS
    -------
    Instance
        Classification instance labeled with the shallow parse tag, or None
        when the word is not an AnnotatedWord.
    """
    word = sentence.getWord(wordIndex)
    if not isinstance(word, AnnotatedWord):
        return None
    instance = Instance(word.getShallowParse())
    self.addAttributes(instance, sentence, wordIndex)
    return instance
def asciify(self, sentence: Sentence) -> Sentence:
    """
    Asciifies a whole sentence by asciifying each word in order and
    collecting the results into a new Sentence.

    PARAMETERS
    ----------
    sentence : Sentence
        Sentence type input.

    RETURNS
    -------
    Sentence
        Asciified copy of the input sentence.
    """
    asciified = Sentence()
    for index in range(sentence.wordCount()):
        original = sentence.getWord(index)
        asciified.addWord(Word(self.asciifyWord(original)))
    return asciified
def __init__(self, fileName=None):
    """
    Constructor which creates a list of sentences and a CounterHashMap of
    wordList. When a file name is given, reads the disambiguation corpus
    from it: each line is "word<TAB>parse"; <S>/</S> markers delimit
    sentences and DOC/TITLE markers are skipped.

    PARAMETERS
    ----------
    fileName : str
        Name of the corpus file to read (optional).
    """
    super().__init__()
    if fileName is not None:
        # 'with' guarantees the file is closed even if parsing raises;
        # the original leaked the handle on an exception.
        with open(fileName, "r", encoding="utf8") as inputFile:
            newSentence = Sentence()
            for line in inputFile:
                if "\t" not in line:
                    # Malformed line without a tab separator: skip it instead
                    # of raising ValueError as line.index("\t") would.
                    continue
                word, parse = line.split("\t", 1)
                if len(word) > 0 and len(parse) > 0:
                    newWord = DisambiguatedWord(word, MorphologicalParse(parse.strip()))
                    if word == "<S>":
                        newSentence = Sentence()
                    elif word == "</S>":
                        self.addSentence(newSentence)
                    elif word in ("<DOC>", "</DOC>", "<TITLE>", "</TITLE>"):
                        pass
                    else:
                        newSentence.addWord(newWord)
def deasciify(self, sentence: Sentence) -> Sentence:
    """
    Deasciifies a sentence with an n-gram model over a sliding three-word
    window. A word is replaced only when it has no morphological analysis;
    then every deasciification candidate is scored by the larger of its
    n-gram probabilities against the previous and next roots, and the best
    candidate above the threshold wins.

    PARAMETERS
    ----------
    sentence : Sentence
        Sentence type input.

    RETURNS
    -------
    Sentence
        Sentence result as output.
    """
    previousRoot = None
    result = Sentence()
    # Roots of the current (index 0) and next (index 1) words; None marks
    # an unanalyzable (candidate for deasciification) or missing word.
    root = self.checkAnalysisAndSetRoot(sentence, 0)
    nextRoot = self.checkAnalysisAndSetRoot(sentence, 1)
    for i in range(sentence.wordCount()):
        word = sentence.getWord(i)
        if root is None:
            # Word is unanalyzable: try each deasciification candidate.
            candidates = self.candidateList(word)
            bestCandidate = word.getName()
            bestRoot = word
            # Candidates must beat the configured probability threshold.
            bestProbability = self.__threshold
            for candidate in candidates:
                fsmParses = self.fsm.morphologicalAnalysis(candidate)
                if self.__rootNgram:
                    # Score on root words.
                    root = fsmParses.getParseWithLongestRootWord().getWord()
                else:
                    # Score on surface forms.
                    root = Word(candidate)
                if previousRoot is not None:
                    previousProbability = self.__nGram.getProbability(
                        previousRoot.getName(), root.getName())
                else:
                    previousProbability = 0.0
                if nextRoot is not None:
                    nextProbability = self.__nGram.getProbability(
                        root.getName(), nextRoot.getName())
                else:
                    nextProbability = 0.0
                # A candidate wins when either context direction scores best.
                if max(previousProbability, nextProbability) > bestProbability:
                    bestCandidate = candidate
                    bestRoot = root
                    bestProbability = max(previousProbability, nextProbability)
            root = bestRoot
            result.addWord(Word(bestCandidate))
        else:
            # Analyzable word: copy unchanged.
            result.addWord(word)
        # Slide the (previous, current, next) root window one word forward.
        previousRoot = root
        root = nextRoot
        nextRoot = self.checkAnalysisAndSetRoot(sentence, i + 2)
    return result
class SentenceTest(unittest.TestCase):
    """Unit tests for the basic Sentence container operations."""

    sentence: Sentence

    def setUp(self) -> None:
        """Create the shared six-word fixture sentence."""
        self.sentence = Sentence()
        for token in ["ali", "topu", "at", "mehmet", "ayşeyle", "gitti"]:
            self.sentence.addWord(Word(token))

    def test_GetWord(self):
        """Words are retrievable by position."""
        for index, token in [(0, "ali"), (2, "at"), (5, "gitti")]:
            self.assertEqual(Word(token), self.sentence.getWord(index))

    def test_GetIndex(self):
        """Positions are retrievable by word."""
        for index, token in [(0, "ali"), (2, "at"), (5, "gitti")]:
            self.assertEqual(index, self.sentence.getIndex(Word(token)))

    def test_WordCount(self):
        """The fixture holds six words."""
        self.assertEqual(6, self.sentence.wordCount())

    def test_CharCount(self):
        """Total letter count over all six words is 27."""
        self.assertEqual(27, self.sentence.charCount())
def split(self, line: str) -> list:
    """
    Splits a raw text line into sentences with a character-by-character state
    machine. Separators ({}, [], (), quotes, apostrophes) are tracked with
    counters so sentence boundaries are not declared inside balanced pairs;
    sentence enders (., !, ? ...) close a sentence only when the following
    character is uppercase or a digit; web addresses, e-mails, numbers,
    times, abbreviations and name shortcuts keep their internal punctuation.

    PARAMETERS
    ----------
    line : str
        String input to split.

    RETURNS
    -------
    list
        sentences list which holds split line.
    """
    emailMode = False
    webMode = False
    i = 0
    # Balance counters for bracket/quote pairs; quotaCount and
    # apostropheCount toggle between 0 and 1.
    specialQuotaCount = 0
    roundParenthesisCount = 0
    bracketCount = 0
    curlyBracketCount = 0
    quotaCount = 0
    apostropheCount = 0
    currentSentence = Sentence()
    currentWord = ""
    sentences = []
    while i < len(line):
        if line[i] in SentenceSplitter.SEPARATORS:
            if line[i] == '\'' and currentWord != "" and self.__isApostrophe(
                    line, i):
                # Word-internal apostrophe (e.g. Ali'nin) stays in the word.
                currentWord = currentWord + line[i]
            else:
                # Flush the pending word, then emit the separator as a word.
                if currentWord != "":
                    currentSentence.addWord(
                        Word(
                            self.__repeatControl(currentWord,
                                                 webMode or emailMode)))
                currentSentence.addWord(Word("" + line[i]))
                currentWord = ""
                if line[i] == '{':
                    curlyBracketCount = curlyBracketCount + 1
                elif line[i] == '}':
                    curlyBracketCount = curlyBracketCount - 1
                elif line[i] == '\uFF02':
                    # Fullwidth quotation mark opens a special quote span.
                    specialQuotaCount = specialQuotaCount + 1
                elif line[i] == '\u05F4':
                    # Hebrew gershayim closes the special quote span.
                    specialQuotaCount = specialQuotaCount - 1
                elif line[i] == '(':
                    roundParenthesisCount = roundParenthesisCount + 1
                elif line[i] == ')':
                    roundParenthesisCount = roundParenthesisCount - 1
                elif line[i] == '[':
                    bracketCount = bracketCount + 1
                elif line[i] == ']':
                    bracketCount = bracketCount - 1
                elif line[i] == '"':
                    quotaCount = 1 - quotaCount
                elif line[i] == '\'':
                    apostropheCount = 1 - apostropheCount
                # A closing double quote with everything balanced and an
                # uppercase/digit following ends the sentence.
                if line[i] == '"' and bracketCount == 0 and specialQuotaCount == 0 and curlyBracketCount == 0 and \
                        roundParenthesisCount == 0 and quotaCount == 0 and \
                        self.__isNextCharUpperCaseOrDigit(line, i + 1):
                    sentences.append(currentSentence)
                    currentSentence = Sentence()
        else:
            if line[i] in SentenceSplitter.SENTENCE_ENDERS:
                if line[i] == '.' and currentWord == "www":
                    # "www." starts a web address.
                    webMode = True
                if line[i] == '.' and currentWord != "" and (
                        webMode or emailMode
                        or line[i - 1] in TurkishLanguage.DIGITS):
                    # Dot inside a URL/e-mail/number stays in the word.
                    currentWord = currentWord + line[i]
                else:
                    if line[i] == '.' and (self.__listContains(currentWord) or
                                           self.__isNameShortcut(currentWord)):
                        # Abbreviation or name shortcut (e.g. "bkz."): keep
                        # the dot and do not end the sentence.
                        currentWord = currentWord + line[i]
                        currentSentence.addWord(Word(currentWord))
                        currentWord = ""
                    else:
                        # Genuine sentence ender: flush the word, then absorb
                        # a run of consecutive enders ("..", "?!") into one.
                        if currentWord != "":
                            currentSentence.addWord(
                                Word(
                                    self.__repeatControl(
                                        currentWord, webMode or emailMode)))
                        currentWord = "" + line[i]
                        i = i + 1
                        while i < len(line) and line[
                                i] in SentenceSplitter.SENTENCE_ENDERS:
                            i = i + 1
                        i = i - 1
                        currentSentence.addWord(Word(currentWord))
                        if roundParenthesisCount == 0 and bracketCount == 0 and curlyBracketCount == 0 and \
                                quotaCount == 0:
                            # A trailing apostrophe (or space + apostrophe)
                            # belongs to this sentence before it closes.
                            if i + 1 < len(line) and line[i + 1] == '\'' and apostropheCount == 1 and \
                                    self.__isNextCharUpperCaseOrDigit(line, i + 2):
                                currentSentence.addWord(Word("'"))
                                i = i + 1
                                sentences.append(currentSentence)
                                currentSentence = Sentence()
                            else:
                                if i + 2 < len(line) and line[i + 1] == ' ' and line[i + 2] == '\'' and \
                                        apostropheCount == 1 and self.__isNextCharUpperCaseOrDigit(line, i + 3):
                                    currentSentence.addWord(Word("'"))
                                    i += 2
                                    sentences.append(currentSentence)
                                    currentSentence = Sentence()
                                else:
                                    if self.__isNextCharUpperCaseOrDigit(
                                            line, i + 1):
                                        sentences.append(currentSentence)
                                        currentSentence = Sentence()
                        currentWord = ""
            else:
                if line[i] == ' ':
                    # Whitespace ends URL/e-mail mode and flushes the word.
                    emailMode = False
                    webMode = False
                    if currentWord != "":
                        currentSentence.addWord(
                            Word(
                                self.__repeatControl(currentWord,
                                                     webMode or emailMode)))
                        currentWord = ""
                else:
                    if line[i] == '-' and not webMode and roundParenthesisCount == 0 and \
                            self.__isNextCharUpperCase(line, i + 1) and \
                            not self.__isPreviousWordUpperCase(line, i - 1):
                        # Dash followed by an uppercase word: treated as a
                        # dialogue dash that starts a new sentence.
                        if currentWord != "" and currentWord not in TurkishLanguage.DIGITS:
                            currentSentence.addWord(
                                Word(
                                    self.__repeatControl(
                                        currentWord, webMode or emailMode)))
                        if currentSentence.wordCount() > 0:
                            sentences.append(currentSentence)
                        currentSentence = Sentence()
                        # Reset all balance counters for the new sentence.
                        roundParenthesisCount = 0
                        bracketCount = 0
                        curlyBracketCount = 0
                        quotaCount = 0
                        specialQuotaCount = 0
                        if currentWord != "" and re.match(
                                "\\d+", currentWord):
                            # Keep a numeric prefix together with the dash.
                            currentSentence.addWord(Word(currentWord + " -"))
                        else:
                            currentSentence.addWord(Word("-"))
                        currentWord = ""
                    else:
                        if line[i] in SentenceSplitter.PUNCTUATION_CHARACTERS or \
                                line[i] in TurkishLanguage.ARITHMETIC_CHARACTERS:
                            if line[i] == ':' and (currentWord == "http"
                                                   or currentWord == "https"):
                                # "http:"/"https:" starts a web address.
                                webMode = True
                            if webMode:
                                # Inside a URL all punctuation is literal.
                                currentWord = currentWord + line[i]
                            else:
                                if line[i] == ',' and self.__numberExistsBeforeAndAfter(
                                        line, i):
                                    # Decimal comma, e.g. 1,2.
                                    currentWord = currentWord + line[i]
                                else:
                                    if line[i] == ':' and self.__isTime(
                                            line, i):
                                        # Time separator, e.g. 12:14:24.
                                        currentWord = currentWord + line[i]
                                    else:
                                        if line[i] == '-' and self.__numberExistsBeforeAndAfter(
                                                line, i):
                                            # Numeric range, e.g. 12-1.
                                            currentWord = currentWord + line[
                                                i]
                                        else:
                                            # Ordinary punctuation: flush the
                                            # word and emit it separately.
                                            if currentWord != "":
                                                currentSentence.addWord(
                                                    Word(
                                                        self.
                                                        __repeatControl(
                                                            currentWord,
                                                            webMode
                                                            or emailMode)))
                                            currentSentence.addWord(
                                                Word("" + line[i]))
                                            currentWord = ""
                        else:
                            if line[i] == '@':
                                # '@' marks an e-mail address.
                                currentWord = currentWord + line[i]
                                emailMode = True
                            else:
                                currentWord = currentWord + line[i]
        i = i + 1
    # Flush any trailing word and unterminated sentence.
    if currentWord != "":
        currentSentence.addWord(
            Word(self.__repeatControl(currentWord, webMode or emailMode)))
    if currentSentence.wordCount() > 0:
        sentences.append(currentSentence)
    return sentences
def test_SpellCheck(self):
    """Each corrupted sentence must be corrected back to its original."""
    cases = [
        ("demokratik cumhuriyet en kıymetli varlığımızdır",
         "demokratik cumhüriyet rn kımetli varlıgımızdır"),
        ("bu tablodaki değerler zedelenmeyecektir",
         "bu tblodaki değerlğr zedelenmeyecüktir"),
        ("milliyet'in geleneksel yılın sporcusu anketi 43. yaşını doldurdu",
         "milliyet'in geeneksel yılin spoşcusu ankşti 43. yeşını doldürdu"),
        ("demokrasinin icadı bu ayrımı bulandırdı",
         "demokrasinin icşdı buf ayrmıı bulandürdı"),
        ("dışişleri müsteşarı Öymen'in 1997'nin ilk aylarında Bağdat'a gitmesi öngörülüyor",
         "dışişleri mütseşarı Öymen'in 1997'nin iljk aylğrında Bağdat'a gitmesi öngörülüyor"),
        ("büyüdü , palazlandı , devleti ele geçirdi",
         "büyüdü , palazandı , devltei eöe geçridi"),
        ("her maskenin ciltte kalma süresi farklıdır",
         "her makenin cültte aklma sürdsi farlkıdır"),
        ("yılın son ayında 10 gazeteci gözaltına alındı",
         "yılın sno ayında 10 gazteci gözlatına alündı"),
        ("iki pilotun kullandığı uçakta bir hostes görev alıyor",
         "iki piotun kulçandığı uçkata üir hotes görçv alyıor"),
        ("son derece kısıtlı kelimeler çerçevesinde kendilerini uzun cümlelerle ifade edebiliyorlar",
         "son deece kısütlı keilmeler çeçevesinde kendülerini uzuü cümllerle ifüde edbeiliyorlar"),
        ("kedi köpek", "krdi köpek"),
        ("minibüs durağı", "minibü durağı"),
        ("noter belgesi", "ntoer belgesi"),
        ("", ""),
    ]
    fsm = FsmMorphologicalAnalyzer("../turkish_dictionary.txt",
                                   "../turkish_misspellings.txt",
                                   "../turkish_finite_state_machine.xml")
    nGram = NGram("../ngram.txt")
    nGram.calculateNGramProbabilitiesSimple(NoSmoothing())
    nGramSpellChecker = NGramSpellChecker(fsm, nGram)
    for originalText, modifiedText in cases:
        self.assertEqual(
            Sentence(originalText).toString(),
            nGramSpellChecker.spellCheck(Sentence(modifiedText)).toString())
def test_SentenceAsciify(self):
    """Sentence-level asciification of Turkish-character-heavy strings."""
    cases = [
        ("çöğüş ııı ÇÖĞÜŞİ", "cogus iii COGUSI"),
        ("üçkağıtçılık akışkanlıştırıcılık", "uckagitcilik akiskanlistiricilik"),
        ("çıtçıtçılık düşkırıklığı yüzgörümlüğü",
         "citcitcilik duskirikligi yuzgorumlugu"),
    ]
    for turkishText, asciiText in cases:
        self.assertEqual(
            Sentence(asciiText).toString(),
            self.simpleAsciifier.asciify(Sentence(turkishText)).toString())