Python Sentence.Sentence Exemples, Corpus.Sentence.Sentence.Sentence Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : Corpus.py Projet : taylankabbani/Corpus-Py

    def __init__(self, fileName=None, splitterOrChecker=None):
        """
        Constructor of Corpus class. Creates empty sentence and paragraph lists and an empty word counter; when a
        file name is given, reads that file line by line and fills the corpus from it.

        PARAMETERS
        ----------
        fileName : str
            Name of the UTF-8 text file to read. When None, the corpus is left empty and no file is touched.
        splitterOrChecker : SentenceSplitter or LanguageChecker
            Optional helper that decides how each line is processed:
            - SentenceSplitter : every stripped line is split into sentences which are collected in one Paragraph
              that is added with addParagraph.
            - LanguageChecker : every stripped line becomes one Sentence built with the checker and is added with
              addSentence.
            - None : every stripped line becomes one Sentence and is added with addSentence.
            An object of any other type results in nothing being added.
        """
        self.sentences = []
        self.paragraphs = []
        self.wordList = CounterHashMap()
        if fileName is not None:
            self.fileName = fileName
            # The context manager guarantees the handle is released even if reading fails.
            with open(fileName, "r", encoding='utf8') as file:
                lines = file.readlines()
            if splitterOrChecker is not None:
                if isinstance(splitterOrChecker, SentenceSplitter):
                    for line in lines:
                        sentences = splitterOrChecker.split(line.strip())
                        paragraph = Paragraph()
                        for sentence in sentences:
                            paragraph.addSentence(sentence)
                        self.addParagraph(paragraph)
                elif isinstance(splitterOrChecker, LanguageChecker):
                    for line in lines:
                        sentence = Sentence(line.strip(), splitterOrChecker)
                        self.addSentence(sentence)
            else:
                for line in lines:
                    self.addSentence(Sentence(line.strip()))

Exemple #2

0

Afficher le fichier

Fichier : NGramDeasciifierTest.py Projet : StarlangSoftware/TurkishDeasciifier-Cy

 def test_Deasciify2(self):
     """
     Checks that the n-gram deasciifier (built with its third argument set to False) restores the expected Turkish
     characters for a handful of two-word inputs.
     """
     analyzer = FsmMorphologicalAnalyzer("../turkish_dictionary.txt", "../turkish_misspellings.txt",
                                         "../turkish_finite_state_machine.xml")
     model = NGram("../ngram.txt")
     model.calculateNGramProbabilitiesSimple(NoSmoothing())
     deasciifier = NGramDeasciifier(analyzer, model, False)
     # (expected output, input text) pairs, checked in order.
     cases = (
         ("noter hakkında", "noter hakkinda"),
         ("sandık medrese", "sandik medrese"),
         ("kuran'ı karşılıklı", "kuran'ı karsilikli"),
     )
     for expected, text in cases:
         self.assertEqual(expected, str(deasciifier.deasciify(Sentence(text))))

Exemple #3

0

Afficher le fichier

Fichier : SimpleDeasciifier.py Projet : StarlangSoftware/TurkishDeasciifier-Py

    def deasciify(self, sentence: Sentence) -> Sentence:
        """
        Builds a new Sentence from the given one, word by word. A word for which the morphological analyzer returns
        at least one parse is copied unchanged. A word without any parse is looked up with candidateList; if
        candidates exist, one of them is picked at random and used instead, otherwise the word is kept as it is.

        PARAMETERS
        ----------
        sentence : Sentence
            Sentence whose words are to be deasciified.

        RETURNS
        -------
        Sentence
            New Sentence holding the kept or replaced words in their original order.
        """
        deasciified = Sentence()
        for index in range(sentence.wordCount()):
            current = sentence.getWord(index)
            # Keep the word unless it is unanalyzable and a candidate is available.
            replacement = current
            if self.fsm.morphologicalAnalysis(current.getName()).size() == 0:
                options = self.candidateList(current)
                if len(options) > 0:
                    replacement = Word(options[randrange(len(options))])
            deasciified.addWord(replacement)
        return deasciified

Exemple #4

0

Afficher le fichier

    def spellCheck(self, sentence: Sentence) -> Sentence:
        """
        Produces a corrected copy of the given Sentence. Each word is first passed to the morphological analyzer;
        words with at least one parse are copied as they are. For a word without a parse, candidateList is consulted:
        when it returns candidates, a randomly chosen one replaces the word, otherwise the word is copied unchanged.

        PARAMETERS
        ----------
        sentence : Sentence
            Sentence to be spell checked.

        RETURNS
        -------
        Sentence
            New Sentence with the kept or replaced words in their original order.
        """
        checked = Sentence()
        for position in range(sentence.wordCount()):
            current = sentence.getWord(position)
            parses = self.fsm.morphologicalAnalysis(current.getName())
            if parses.size() != 0:
                # The analyzer recognises the word: nothing to correct.
                checked.addWord(current)
                continue
            alternatives = self.candidateList(current)
            if len(alternatives) > 0:
                pick = randrange(len(alternatives))
                checked.addWord(Word(alternatives[pick]))
            else:
                checked.addWord(current)
        return checked

Exemple #5

0

Afficher le fichier

 def test_SpellCheckSurfaceForm(self):
     """
     Checks that the n-gram spell checker (built with its third argument set to False) corrects a few misspelled
     two-word inputs to the expected text.
     """
     analyzer = FsmMorphologicalAnalyzer("../turkish_dictionary.txt",
                                         "../turkish_misspellings.txt",
                                         "../turkish_finite_state_machine.xml")
     model = NGram("../ngram.txt")
     model.calculateNGramProbabilitiesSimple(NoSmoothing())
     checker = NGramSpellChecker(analyzer, model, False)
     # (expected output, misspelled input) pairs, checked in order.
     cases = (
         ("noter hakkında", "noter hakkınad"),
         ("arçelik'in çamaşır", "arçelik'in çamşaır"),
         ("ruhsat yanında", "ruhset yanında"),
     )
     for expected, misspelled in cases:
         self.assertEqual(expected, str(checker.spellCheck(Sentence(misspelled))))

Exemple #6

0

Afficher le fichier

Fichier : SentenceTest.py Projet : StarlangSoftware/Corpus-Cy

 def setUp(self) -> None:
     """
     Prepares the fixture: a Sentence holding six words, added one by one in a fixed order.
     """
     self.sentence = Sentence()
     for text in ("ali", "topu", "at", "mehmet", "ayşeyle", "gitti"):
         self.sentence.addWord(Word(text))

Exemple #7

0

Afficher le fichier

    def spellCheck(self, sentence: Sentence) -> Sentence:
        """
        Produces a corrected copy of the given Sentence using n-gram probabilities of root words.

        Every word is passed to the morphological analyzer. A word with at least one parse is copied unchanged and
        the root of its longest-root parse is remembered as the previous root.

        For a word without any parse, candidateList supplies alternatives. Each alternative is analyzed, the root of
        its longest-root parse is taken and scored: with the bigram probability (previous root, root) when a previous
        root exists, otherwise with the probability of the root alone. The alternative with the highest score
        strictly greater than 0.0 replaces the word and its root becomes the previous root; when no alternative
        scores above 0.0 the word itself is kept and is used as the previous root.

        PARAMETERS
        ----------
        sentence : Sentence
            Sentence to be spell checked.

        RETURNS
        -------
        Sentence
            New Sentence with the kept or replaced words in their original order.
        """
        corrected = Sentence()
        previousRoot = None
        for index in range(sentence.wordCount()):
            current = sentence.getWord(index)
            analyses = self.fsm.morphologicalAnalysis(current.getName())
            if analyses.size() != 0:
                # Recognised word: keep it and remember its root for the next bigram lookup.
                corrected.addWord(current)
                previousRoot = analyses.getParseWithLongestRootWord().getWord()
                continue
            options = self.candidateList(current)
            chosenText = current.getName()
            chosenRoot = current
            highest = 0.0
            for option in options:
                optionRoot = self.fsm.morphologicalAnalysis(option).getParseWithLongestRootWord().getWord()
                if previousRoot is None:
                    score = self.__nGram.getProbability(optionRoot.getName())
                else:
                    score = self.__nGram.getProbability(previousRoot.getName(), optionRoot.getName())
                if score > highest:
                    chosenText, chosenRoot, highest = option, optionRoot, score
            previousRoot = chosenRoot
            corrected.addWord(Word(chosenText))
        return corrected

Exemple #8

0

Afficher le fichier

    def __init__(self, corpus, example, pdf_path=None):
        """
        Stores the arguments and immediately runs the tool selected by ``example``, leaving its outcome in
        ``self.result``. ``self.result`` stays None when ``example`` matches none of the Tool members handled here.
        For several tools a None outcome is replaced by a fixed Turkish message shown to the user.

        PARAMETERS
        ----------
        corpus
            Text handed to the selected tool.
        example
            Tool member selecting which operation to run.
        pdf_path
            Path passed to the PDF parser; only used for Tool.CONVERT_PDF_TO_TXT.
        """
        self.corpus = corpus
        self.example = example
        self.result = None
        self.pdf_path = pdf_path

        selected = self.example
        # Message shared by the tools that expect a single word rather than a sentence.
        word_required = "Cümle yerine kelime girmeniz gerekiyor"

        if selected == Tool.KELIMEYI_OGELERINE_AYIR:
            parts = zemberekTool.ogelere_ayir(corpus)
            self.result = ("Cümle yerine kelime girmeniz gerekiyor veya girdiğiniz kelime yanlış"
                           if parts is None else parts)

        if selected == Tool.CUMLEDE_GECEN_KOKLERI_BUL:
            self.result = zemberekTool.metinde_gecen_kokleri_bul(corpus)

        if selected == Tool.CUMLEYI_PARCALARA_AYIR:
            self.result = zemberekTool.cumleyi_parcalara_ayir(corpus)

        if selected == Tool.KELIME_ONERICI:
            suggestion = zemberekTool.kelime_onerici(corpus)
            self.result = word_required if suggestion is None else suggestion

        if selected == Tool.KELIME_HECELE:
            syllables = zemberekTool.kelime_hecele(corpus)
            self.result = word_required if syllables is None else syllables

        if selected == Tool.NLTK_FILES_DOWNLOAD:
            self.result = nltk_download()

        if selected == Tool.PERSONIFICATION_COPULA:
            copula = personal(corpus, Person.FIRST, is_plural=True)
            self.result = word_required if copula is None else copula

        if selected == Tool.INFERENTIAL_MOOD:
            mood = inferential(corpus, Person.SECOND, is_plural=False)
            self.result = word_required if mood is None else mood

        if selected == Tool.CONVERT_PDF_TO_TXT:
            text = pdfconverter.PDFParser(pdf_path).parse()
            self.result = "PDF path yanlış olabilir veya PDF olmayabilir" if text is None else text

        if selected == Tool.SENTENCE_CORRECTOR:
            analyzer = FsmMorphologicalAnalyzer("./SpellChecker/turkish_dictionary.txt",
                                                "./SpellChecker/turkish_misspellings.txt",
                                                "./SpellChecker/turkish_finite_state_machine.xml")
            self.result = SimpleSpellChecker(analyzer).spellCheck(Sentence(corpus))

Exemple #9

0

Afficher le fichier

Fichier : DisambiguationCorpus.py Projet : salihercan/TurkishMorphologicalDisambiguation-Py

 def __init__(self, fileName=None):
     """
     Constructor which initialises the base corpus and, when a file name is given, fills it with the disambiguated
     sentences read from that file.

     Each line is expected to hold a surface form and its parse separated by the first tab character. The markers
     <S> and </S> open and close a sentence; <DOC>, </DOC>, <TITLE> and </TITLE> lines are ignored. Lines with an
     empty word or empty parse part - including lines that contain no tab at all, such as blank lines - are skipped.

     PARAMETERS
     ----------
     fileName : str
         Name of the UTF-8 file to read. When None, the corpus is left empty.
     """
     super().__init__()
     if fileName is not None:
         # The context manager closes the file even if a line fails to parse.
         with open(fileName, "r", encoding="utf8") as inputFile:
             newSentence = Sentence()
             for line in inputFile:
                 # partition never raises: without a tab, parse is "" and the line is skipped below.
                 word, _, parse = line.partition("\t")
                 if len(word) > 0 and len(parse) > 0:
                     newWord = DisambiguatedWord(word, MorphologicalParse(parse.strip()))
                     if word == "<S>":
                         newSentence = Sentence()
                     elif word == "</S>":
                         self.addSentence(newSentence)
                     elif word == "<DOC>" or word == "</DOC>" or word == "<TITLE>" or word == "</TITLE>":
                         pass
                     else:
                         newSentence.addWord(newWord)

Exemple #10

0

Afficher le fichier

Fichier : SimpleSpellCheckerTest.py Projet : maliozer/TurkishSpellChecker-Py

 def test_SpellCheck(self):
     """
     Reads pairs of "misspelled corrected" words (one space-separated pair per line) from the fixture file and
     checks that the simple spell checker turns each misspelled word into the corrected one.
     """
     fsm = FsmMorphologicalAnalyzer("../turkish_dictionary.txt",
                                    "../turkish_misspellings.txt",
                                    "../turkish_finite_state_machine.xml")
     simpleSpellChecker = SimpleSpellChecker(fsm)
     # Read through a context manager so the handle is closed, and avoid shadowing the builtin input().
     # NOTE(review): no explicit encoding is given, so the platform default applies - confirm the fixture's
     # encoding and pass it explicitly if it is UTF-8.
     with open("../misspellings.txt") as inputFile:
         lines = inputFile.readlines()
     for line in lines:
         items = line.strip().split(" ")
         misspelled = items[0]
         corrected = items[1]
         self.assertEqual(
             corrected,
             simpleSpellChecker.spellCheck(Sentence(misspelled)).toString())

Exemple #11

0

Afficher le fichier

Fichier : NGramDeasciifierTest.py Projet : StarlangSoftware/TurkishDeasciifier-Cy

 def test_Deasciify(self):
     """
     Walks over every word of the corpus except the first word of each sentence. For each word that the analyzer
     can parse and whose asciified form differs from the word itself, the pair "previous word + asciified-free
     original word" is deasciified and the second word of the outcome must equal the original word.
     """
     fsm = FsmMorphologicalAnalyzer("../turkish_dictionary.txt", "../turkish_misspellings.txt",
                                    "../turkish_finite_state_machine.xml")
     nGram = NGram("../ngram.txt")
     nGram.calculateNGramProbabilitiesSimple(NoSmoothing())
     nGramDeasciifier = NGramDeasciifier(fsm, nGram, True)
     simpleAsciifier = SimpleAsciifier()
     corpus = Corpus("../corpus.txt")
     for sentenceIndex in range(corpus.sentenceCount()):
         current = corpus.getSentence(sentenceIndex)
         for wordIndex in range(1, current.wordCount()):
             target = current.getWord(wordIndex)
             original = target.getName()
             # Only words known to the analyzer are used as ground truth.
             if fsm.morphologicalAnalysis(original).size() <= 0:
                 continue
             # Words without Turkish-specific characters are not informative.
             if simpleAsciifier.asciifyWord(target) == original:
                 continue
             pair = Sentence(current.getWord(wordIndex - 1).getName() + " " + original)
             restored = nGramDeasciifier.deasciify(pair)
             self.assertEqual(original, restored.getWord(1).getName())

Exemple #12

0

Afficher le fichier

 def test_Deasciify(self):
     """
     Goes through every dictionary word that contains at least one of the characters ç, ö, ğ, ü, ş, ı, does not end
     with "fulü" and is a nominal, adjective, adverb or verb. When the asciified form of such a word has exactly one
     deasciification candidate, deasciifying it must give back the original word.
     """
     fsm = FsmMorphologicalAnalyzer("../turkish_dictionary.txt",
                                    "../turkish_misspellings.txt",
                                    "../turkish_finite_state_machine.xml")
     simpleDeasciifier = SimpleDeasciifier(fsm)
     simpleAsciifier = SimpleAsciifier()
     dictionary = fsm.getDictionary()
     for index in range(dictionary.size()):
         entry = dictionary.getWordWithIndex(index)
         name = entry.getName()
         if not any(letter in "çöğüşı" for letter in name):
             continue
         if name.endswith("fulü"):
             continue
         if not (entry.isNominal() or entry.isAdjective() or entry.isAdverb() or entry.isVerb()):
             continue
         asciified = simpleAsciifier.asciifyWord(entry)
         # Ambiguous or candidate-free words cannot be checked deterministically.
         if len(simpleDeasciifier.candidateList(Word(asciified))) != 1:
             continue
         restored = simpleDeasciifier.deasciify(Sentence(asciified)).toString()
         self.assertEqual(name, restored)

Exemple #13

0

Afficher le fichier

Fichier : SimpleAsciifier.py Projet : StarlangSoftware/TurkishDeasciifier-Py

    def asciify(self, sentence: Sentence) -> Sentence:
        """
        Asciifies a whole Sentence: every word is passed to asciifyWord, the returned text is wrapped in a new Word
        and the new words are collected, in order, in a new Sentence.

        PARAMETERS
        ----------
        sentence : Sentence
            Sentence to be asciified.

        RETURNS
        -------
        Sentence
            New Sentence made of the asciified words.
        """
        asciified = Sentence()
        for index in range(sentence.wordCount()):
            asciified.addWord(Word(self.asciifyWord(sentence.getWord(index))))
        return asciified

Exemple #14

0

Afficher le fichier

    def split(self, line: str) -> list:
        """
        Splits a line of text into Sentence objects by scanning it character by character. Characters are
        accumulated in currentWord; finished words and punctuation marks are added to currentSentence; finished
        sentences are appended to the returned list. Words are passed through __repeatControl before being added,
        except for words kept together with their trailing dot and for punctuation tokens.

        If the char at index i is in SEPARATORS:

        - an apostrophe following a non-empty currentWord, for which __isApostrophe holds, is appended to currentWord;
        - otherwise currentWord (if any) is added, the separator is added as a word of its own and a counter is
          updated:
            { / } : increment / decrement curlyBracketCount
            \\uFF02 / \\u05F4 : increment / decrement specialQuotaCount
            ( / ) : increment / decrement roundParenthesisCount
            [ / ] : increment / decrement bracketCount
            " : quotaCount becomes 1 - quotaCount
            ' : apostropheCount becomes 1 - apostropheCount
          If the separator is " and bracketCount, specialQuotaCount, curlyBracketCount, roundParenthesisCount and
          quotaCount are all 0 and __isNextCharUpperCaseOrDigit holds for the next position, currentSentence is
          closed.

        If the char at index i is in SENTENCE_ENDERS:

        - a dot after the word "www" switches webMode on. Ex: www.google.com
        - a dot after a non-empty currentWord is appended to it in web or e-mail mode, or when the previous char is a
          digit. Ex: 1.
        - a dot after a word accepted by __listContains or __isNameShortcut is appended and the word is added
          directly. Ex: bkz.
        - otherwise currentWord is added, the ender is added as a word (a run of consecutive enders is consumed but
          only the first one is kept), and - provided roundParenthesisCount, bracketCount, curlyBracketCount and
          quotaCount are 0 - the sentence is closed when either a closing apostrophe follows (directly or after one
          space, with apostropheCount equal to 1) and is followed by an uppercase letter or digit, or the next char
          itself is an uppercase letter or digit.

        If the char at index i is a space, web and e-mail modes are switched off and currentWord is added.

        If the char at index i is - outside web mode and round parentheses, the next char is uppercase and
        __isPreviousWordUpperCase does not hold, the current sentence is closed (if it has words), the bracket and
        quote counters are reset and a new sentence is started with "-" (or with "<currentWord> -" when currentWord
        starts with a digit).

        If the char at index i is in PUNCTUATION_CHARACTERS or ARITHMETIC_CHARACTERS:

        : after "http" or "https" : switches webMode on.
        any such char in web mode : appended to currentWord.
        , with a number before and after : appended to currentWord. Ex: 1,2
        : where __isTime holds : appended to currentWord. Ex: 12:14:24
        - with a number before and after : appended to currentWord. Ex: 12-1
        otherwise : currentWord is added and the char is added as a word of its own.

        @ is appended to currentWord and switches emailMode on; any other char is appended to currentWord.

        After the scan, a remaining currentWord is added and a non-empty currentSentence is appended.

        PARAMETERS
        ----------
        line : str
            String input to split.

        RETURNS
        -------
        list
            list of Sentence objects found in the line, in order.
        """
        emailMode = False
        webMode = False
        i = 0
        specialQuotaCount = 0
        roundParenthesisCount = 0
        bracketCount = 0
        curlyBracketCount = 0
        quotaCount = 0
        apostropheCount = 0
        currentSentence = Sentence()
        currentWord = ""
        sentences = []
        # A while loop is used because several branches move i forward themselves.
        while i < len(line):
            if line[i] in SentenceSplitter.SEPARATORS:
                if line[i] == '\'' and currentWord != "" and self.__isApostrophe(
                        line, i):
                    currentWord = currentWord + line[i]
                else:
                    if currentWord != "":
                        currentSentence.addWord(
                            Word(
                                self.__repeatControl(currentWord, webMode
                                                     or emailMode)))
                    currentSentence.addWord(Word("" + line[i]))
                    currentWord = ""
                    if line[i] == '{':
                        curlyBracketCount = curlyBracketCount + 1
                    elif line[i] == '}':
                        curlyBracketCount = curlyBracketCount - 1
                    elif line[i] == '\uFF02':
                        specialQuotaCount = specialQuotaCount + 1
                    elif line[i] == '\u05F4':
                        specialQuotaCount = specialQuotaCount - 1
                    elif line[i] == '(':
                        roundParenthesisCount = roundParenthesisCount + 1
                    elif line[i] == ')':
                        roundParenthesisCount = roundParenthesisCount - 1
                    elif line[i] == '[':
                        bracketCount = bracketCount + 1
                    elif line[i] == ']':
                        bracketCount = bracketCount - 1
                    elif line[i] == '"':
                        quotaCount = 1 - quotaCount
                    elif line[i] == '\'':
                        apostropheCount = 1 - apostropheCount
                    # A closing double quote with every other grouping balanced ends the sentence.
                    if line[i] == '"' and bracketCount == 0 and specialQuotaCount == 0 and curlyBracketCount == 0 and \
                            roundParenthesisCount == 0 and quotaCount == 0 and self.__isNextCharUpperCaseOrDigit(line,
                                                                                                                 i + 1):
                        sentences.append(currentSentence)
                        currentSentence = Sentence()
            else:
                if line[i] in SentenceSplitter.SENTENCE_ENDERS:
                    if line[i] == '.' and currentWord == "www":
                        webMode = True
                    # currentWord != "" implies i > 0, so line[i - 1] is the char just before the dot.
                    if line[i] == '.' and currentWord != "" and (
                            webMode or emailMode
                            or line[i - 1] in TurkishLanguage.DIGITS):
                        currentWord = currentWord + line[i]
                    else:
                        if line[i] == '.' and (
                                self.__listContains(currentWord)
                                or self.__isNameShortcut(currentWord)):
                            currentWord = currentWord + line[i]
                            currentSentence.addWord(Word(currentWord))
                            currentWord = ""
                        else:
                            if currentWord != "":
                                currentSentence.addWord(
                                    Word(
                                        self.__repeatControl(
                                            currentWord, webMode
                                            or emailMode)))
                            currentWord = "" + line[i]
                            # Skip the rest of a run of sentence enders; only the first one is kept as the token.
                            i = i + 1
                            while i < len(line) and line[
                                    i] in SentenceSplitter.SENTENCE_ENDERS:
                                i = i + 1
                            # Step back onto the last ender so the common increment at the loop end stays correct.
                            i = i - 1
                            currentSentence.addWord(Word(currentWord))
                            if roundParenthesisCount == 0 and bracketCount == 0 and curlyBracketCount == 0 and \
                                    quotaCount == 0:
                                # Closing apostrophe right after the ender belongs to the sentence being closed.
                                if i + 1 < len(line) and line[i + 1] == '\'' and apostropheCount == 1 and \
                                        self.__isNextCharUpperCaseOrDigit(line, i + 2):
                                    currentSentence.addWord(Word("'"))
                                    i = i + 1
                                    sentences.append(currentSentence)
                                    currentSentence = Sentence()
                                else:
                                    # Same case, with one space between the ender and the closing apostrophe.
                                    if i + 2 < len(line) and line[i + 1] == ' ' and line[i + 2] == '\'' and \
                                            apostropheCount == 1 and self.__isNextCharUpperCaseOrDigit(line, i + 3):
                                        currentSentence.addWord(Word("'"))
                                        i += 2
                                        sentences.append(currentSentence)
                                        currentSentence = Sentence()
                                    else:
                                        if self.__isNextCharUpperCaseOrDigit(
                                                line, i + 1):
                                            sentences.append(currentSentence)
                                            currentSentence = Sentence()
                            currentWord = ""
                else:
                    if line[i] == ' ':
                        # NOTE(review): both modes are reset before currentWord is flushed, so the second argument
                        # passed to __repeatControl below is always False here - confirm this is intended.
                        emailMode = False
                        webMode = False
                        if currentWord != "":
                            currentSentence.addWord(
                                Word(
                                    self.__repeatControl(
                                        currentWord, webMode or emailMode)))
                            currentWord = ""
                    else:
                        # A dash that starts a new sentence.
                        if line[i] == '-' and not webMode and roundParenthesisCount == 0 and \
                                self.__isNextCharUpperCase(line, i + 1) and \
                                not self.__isPreviousWordUpperCase(line, i - 1):
                            if currentWord != "" and currentWord not in TurkishLanguage.DIGITS:
                                currentSentence.addWord(
                                    Word(
                                        self.__repeatControl(
                                            currentWord, webMode
                                            or emailMode)))
                            if currentSentence.wordCount() > 0:
                                sentences.append(currentSentence)
                            currentSentence = Sentence()
                            # apostropheCount is the only counter that is not reset here.
                            roundParenthesisCount = 0
                            bracketCount = 0
                            curlyBracketCount = 0
                            quotaCount = 0
                            specialQuotaCount = 0
                            # NOTE(review): re.match only anchors at the start, so any word that begins with a
                            # digit is accepted here, not only all-digit words - confirm this is intended.
                            if currentWord != "" and re.match(
                                    "\\d+", currentWord):
                                currentSentence.addWord(
                                    Word(currentWord + " -"))
                            else:
                                currentSentence.addWord(Word("-"))
                            currentWord = ""
                        else:
                            if line[i] in SentenceSplitter.PUNCTUATION_CHARACTERS or \
                                    line[i] in TurkishLanguage.ARITHMETIC_CHARACTERS:
                                if line[i] == ':' and (currentWord == "http" or
                                                       currentWord == "https"):
                                    webMode = True
                                if webMode:
                                    currentWord = currentWord + line[i]
                                else:
                                    if line[i] == ',' and self.__numberExistsBeforeAndAfter(
                                            line, i):
                                        currentWord = currentWord + line[i]
                                    else:
                                        if line[i] == ':' and self.__isTime(
                                                line, i):
                                            currentWord = currentWord + line[i]
                                        else:
                                            if line[i] == '-' and self.__numberExistsBeforeAndAfter(
                                                    line, i):
                                                currentWord = currentWord + line[
                                                    i]
                                            else:
                                                if currentWord != "":
                                                    currentSentence.addWord(
                                                        Word(
                                                            self.
                                                            __repeatControl(
                                                                currentWord,
                                                                webMode
                                                                or emailMode)))
                                                currentSentence.addWord(
                                                    Word("" + line[i]))
                                                currentWord = ""
                            else:
                                if line[i] == '@':
                                    currentWord = currentWord + line[i]
                                    emailMode = True
                                else:
                                    currentWord = currentWord + line[i]
            i = i + 1
        # Flush whatever is left once the end of the line is reached.
        if currentWord != "":
            currentSentence.addWord(
                Word(self.__repeatControl(currentWord, webMode or emailMode)))
        if currentSentence.wordCount() > 0:
            sentences.append(currentSentence)
        return sentences

Exemple #15

0

Afficher le fichier

Fichier : NGramDeasciifier.py Projet : StarlangSoftware/TurkishDeasciifier-Py

    def deasciify(self, sentence: Sentence) -> Sentence:
        """
        Produces a deasciified copy of the given Sentence using n-gram probabilities.

        Three roots are tracked while walking over the words: the previous root, the root of the current word and
        the root of the next word; the latter two are obtained from checkAnalysisAndSetRoot. A word whose root is not
        None is copied unchanged. For a word whose root is None, every item of its candidateList is scored: its root
        is either the root of its longest-root parse (when the root n-gram flag is set) or the candidate itself, and
        its score is the larger of the bigram probabilities (previous root, root) and (root, next root), a missing
        neighbour counting as 0.0. The candidate with the highest score strictly above the threshold replaces the
        word; if there is none, the word itself is kept.

        PARAMETERS
        ----------
        sentence : Sentence
            Sentence to be deasciified.

        RETURNS
        -------
        Sentence
            New Sentence with the kept or replaced words in their original order.
        """
        output = Sentence()
        previousRoot = None
        root = self.checkAnalysisAndSetRoot(sentence, 0)
        nextRoot = self.checkAnalysisAndSetRoot(sentence, 1)
        for index in range(sentence.wordCount()):
            current = sentence.getWord(index)
            if root is not None:
                output.addWord(current)
            else:
                options = self.candidateList(current)
                chosenText = current.getName()
                chosenRoot = current
                highest = self.__threshold
                for option in options:
                    analyses = self.fsm.morphologicalAnalysis(option)
                    optionRoot = (analyses.getParseWithLongestRootWord().getWord()
                                  if self.__rootNgram else Word(option))
                    backward = (self.__nGram.getProbability(previousRoot.getName(), optionRoot.getName())
                                if previousRoot is not None else 0.0)
                    forward = (self.__nGram.getProbability(optionRoot.getName(), nextRoot.getName())
                               if nextRoot is not None else 0.0)
                    score = max(backward, forward)
                    if score > highest:
                        chosenText, chosenRoot, highest = option, optionRoot, score
                root = chosenRoot
                output.addWord(Word(chosenText))
            # Slide the three-root window one word to the right.
            previousRoot = root
            root = nextRoot
            nextRoot = self.checkAnalysisAndSetRoot(sentence, index + 2)
        return output

Exemple #16

0

Afficher le fichier

Fichier : NGramSpellCheckerTest.py Projet : fsahinie/TurkishSpellChecker-Py

 def test_SpellCheck(self):
     """
     Runs NGramSpellChecker.spellCheck on each misspelled sentence and asserts that the corrected sentence's
     string form equals that of the expected sentence at the same position.
     """
     expectedTexts = (
         "demokratik cumhuriyet en kıymetli varlığımızdır",
         "bu tablodaki değerler zedelenmeyecektir",
         "milliyet'in geleneksel yılın sporcusu anketi 43. yaşını doldurdu",
         "demokrasinin icadı bu ayrımı bulandırdı",
         "dışişleri müsteşarı Öymen'in 1997'nin ilk aylarında Bağdat'a gitmesi öngörülüyor",
         "büyüdü , palazlandı , devleti ele geçirdi",
         "her maskenin ciltte kalma süresi farklıdır",
         "yılın son ayında 10 gazeteci gözaltına alındı",
         "iki pilotun kullandığı uçakta bir hostes görev alıyor",
         "son derece kısıtlı kelimeler çerçevesinde kendilerini uzun cümlelerle ifade edebiliyorlar",
         "kedi köpek",
         "minibüs durağı",
         "noter belgesi",
         "",
     )
     misspelledTexts = (
         "demokratik cumhüriyet rn kımetli varlıgımızdır",
         "bu tblodaki değerlğr zedelenmeyecüktir",
         "milliyet'in geeneksel yılin spoşcusu ankşti 43. yeşını doldürdu",
         "demokrasinin icşdı buf ayrmıı bulandürdı",
         "dışişleri mütseşarı Öymen'in 1997'nin iljk aylğrında Bağdat'a gitmesi öngörülüyor",
         "büyüdü , palazandı , devltei eöe geçridi",
         "her makenin cültte aklma sürdsi farlkıdır",
         "yılın sno ayında 10 gazteci gözlatına alündı",
         "iki piotun kulçandığı uçkata üir hotes görçv alyıor",
         "son deece kısütlı keilmeler çeçevesinde kendülerini uzuü cümllerle ifüde edbeiliyorlar",
         "krdi köpek",
         "minibü durağı",
         "ntoer belgesi",
         "",
     )
     # Both tuples are index-aligned: entry k of misspelledTexts should be corrected to entry k of expectedTexts.
     original = [Sentence(text) for text in expectedTexts]
     modified = [Sentence(text) for text in misspelledTexts]
     fsm = FsmMorphologicalAnalyzer(
         "../turkish_dictionary.txt",
         "../turkish_misspellings.txt",
         "../turkish_finite_state_machine.xml",
     )
     nGram = NGram("../ngram.txt")
     nGram.calculateNGramProbabilitiesSimple(NoSmoothing())
     nGramSpellChecker = NGramSpellChecker(fsm, nGram)
     for expected, misspelled in zip(original, modified):
         expectedText = expected.toString()
         correctedText = nGramSpellChecker.spellCheck(misspelled).toString()
         self.assertEqual(expectedText, correctedText)

Exemple #17

0

Afficher le fichier

Fichier : SimpleAsciifierTest.py Projet : StarlangSoftware/TurkishDeasciifier-Cy

 def test_SentenceAsciify(self):
     """
     Checks that self.simpleAsciifier.asciify, applied to whole sentences containing Turkish-specific letters,
     yields sentences whose string form equals that of the expected ASCII-only sentences.
     """
     self.assertEqual(Sentence("cogus iii COGUSI").toString(), self.simpleAsciifier.asciify(Sentence("çöğüş ııı ÇÖĞÜŞİ")).toString())
     self.assertEqual(Sentence("uckagitcilik akiskanlistiricilik").toString(), self.simpleAsciifier.asciify(Sentence("üçkağıtçılık akışkanlıştırıcılık")).toString())
     self.assertEqual(Sentence("citcitcilik duskirikligi yuzgorumlugu").toString(), self.simpleAsciifier.asciify(Sentence("çıtçıtçılık düşkırıklığı yüzgörümlüğü")).toString())