def setUp(self) -> None:
    """Build the six-word fixture sentence shared by all tests."""
    self.sentence = Sentence()
    for token in ["ali", "topu", "at", "mehmet", "ayşeyle", "gitti"]:
        self.sentence.addWord(Word(token))
def test_Deasciify2(self):
    """Deasciify short two-word phrases with the no-smoothing n-gram model."""
    fsm = FsmMorphologicalAnalyzer("../turkish_dictionary.txt",
                                   "../turkish_misspellings.txt",
                                   "../turkish_finite_state_machine.xml")
    nGram = NGram("../ngram.txt")
    nGram.calculateNGramProbabilitiesSimple(NoSmoothing())
    nGramDeasciifier = NGramDeasciifier(fsm, nGram, False)
    cases = [("noter hakkinda", "noter hakkında"),
             ("sandik medrese", "sandık medrese"),
             ("kuran'ı karsilikli", "kuran'ı karşılıklı")]
    for asciified, expected in cases:
        self.assertEqual(expected,
                         nGramDeasciifier.deasciify(Sentence(asciified)).__str__())
def __init__(self, sentence: Sentence, index: int):
    """
    Discrete attribute for a given word. Returns the nearest predicate word
    to the given word

    PARAMETERS
    ----------
    sentence : Sentence
        Sentence where current word is in.
    index : int
        Position of the current word in the sentence
    """
    if isinstance(sentence, AnnotatedSentence):
        # NOTE(review): the return value of getPredicate is discarded, so this
        # constructor has no observable effect beyond the call itself —
        # presumably the predicate should be forwarded to the superclass
        # constructor; verify against the attribute base class.
        sentence.getPredicate(index)
def checkAnalysisAndSetRoot(self, sentence: Sentence, index: int) -> Word: """ Checks the morphological analysis of the given word in the given index. If there is no misspelling, it returns the longest root word of the possible analyses. @param sentence Sentence to be analyzed. @param index Index of the word @return If the word is misspelled, null; otherwise the longest root word of the possible analyses. """ if index < sentence.wordCount(): fsmParses = self.fsm.morphologicalAnalysis( sentence.getWord(index).getName()) if fsmParses.size() != 0: return fsmParses.getParseWithLongestRootWord().getWord() return None
def addSentence(self, s: Sentence): """ The addSentence method takes a Sentence as an input. It adds given input to sentences list and loops through the each word in sentence and puts these words into wordList CounterHashMap. PARAMETERS ---------- s : Sentence Sentence type input that will be added to sentences list and its words will be added to wordList CounterHashMap. """ self.sentences.append(s) for i in range(s.wordCount()): w = s.getWord(i) self.wordList.put(w)
def generateInstanceFromSentence(self, sentence: Sentence, wordIndex: int) -> Instance:
    """
    Generates a single word-sense classification instance for the word at
    wordIndex: the instance is labeled with the word's annotated sense and
    carries the ids of all candidate synsets as possible class labels.

    PARAMETERS
    ----------
    sentence : Sentence
        Input sentence (must be an AnnotatedSentence to produce an instance).
    wordIndex : int
        The index of the word in the sentence.

    RETURNS
    -------
    Instance
        Composite classification instance; implicitly None when the sentence
        is not annotated or the word is not an AnnotatedWord.
    """
    if isinstance(sentence, AnnotatedSentence):
        # Candidate senses for the word, computed from WordNet via the FSM.
        possibleSynSets = sentence.constructSynSets(self.__wordNet, self.__fsm, wordIndex)
        word = sentence.getWord(wordIndex)
        if isinstance(word, AnnotatedWord):
            # Gold label is the manually annotated sense of the word.
            classLabel = word.getSemantic()
            current = CompositeInstance(classLabel)
            possibleClassLabels = []
            for synSet in possibleSynSets:
                possibleClassLabels.append(synSet.getId())
            current.setPossibleClassLabels(possibleClassLabels)
            self.addAttributes(current, sentence, wordIndex)
            return current
def addAttributes(self, current: Instance, sentence: Sentence, wordIndex: int): """ addAttributes adds all attributes of the previous words, the current wordn, and next words of the given word to the given instance. If the previous or next words does not exists, the method calls addAttributesForEmptyWords method. If the word does not exists in the dictionary or the required annotation layer does not exists in the annotated word, the method throws InstanceNotGenerated. The window size determines the number of previous and next words. PARAMETERS ---------- current : Instance Current classification instance to which attributes will be added. sentence : Sentence Input sentence. wordIndex : int The index of the word in the sentence. """ for i in range(self.windowSize): if wordIndex - self.windowSize + i >= 0: self.addAttributesForWords(current, sentence, wordIndex - self.windowSize + i) else: self.addAttributesForEmptyWords(current, "<s>") self.addAttributesForWords(current, sentence, wordIndex) for i in range(self.windowSize): if wordIndex + i + 1 < sentence.wordCount(): self.addAttributesForWords(current, sentence, wordIndex + i + 1) else: self.addAttributesForEmptyWords(current, "</s>")
def generateInstanceFromSentence(self, sentence: Sentence, wordIndex: int) -> Instance:
    """
    Generates a single classification instance of the morphological
    disambiguation problem for the given word of the given sentence. Only
    previous words contribute attributes; out-of-range positions are padded
    with the "<s>" marker.

    PARAMETERS
    ----------
    sentence : Sentence
        Input sentence.
    wordIndex : int
        The index of the word in the sentence.

    RETURNS
    -------
    Instance
        Classification instance, or None when the word is not annotated.
    """
    word = sentence.getWord(wordIndex)
    if not isinstance(word, AnnotatedWord):
        return None
    # The class label is the word's morphological transition list.
    instance = Instance(word.getParse().getTransitionList())
    for position in range(wordIndex - self.windowSize, wordIndex):
        if position >= 0:
            self.addAttributesForPreviousWords(instance, sentence, position)
        else:
            self.addAttributesForEmptyWords(instance, "<s>")
    self.addAttributesForPreviousWords(instance, sentence, wordIndex)
    return instance
def generateInstanceFromSentence(self, sentence: Sentence, wordIndex: int) -> Instance:
    """
    Builds a named-entity-recognition classification instance for the word
    at wordIndex, labeled with the word's named-entity type.

    PARAMETERS
    ----------
    sentence : Sentence
        Input sentence.
    wordIndex : int
        The index of the word in the sentence.

    RETURNS
    -------
    Instance
        Classification instance, or None when the word is not annotated.
    """
    word = sentence.getWord(wordIndex)
    if not isinstance(word, AnnotatedWord):
        return None
    label = NamedEntityType.getNamedEntityString(word.getNamedEntityType())
    instance = Instance(label)
    self.addAttributes(instance, sentence, wordIndex)
    return instance
def spellCheck(self, sentence: Sentence) -> Sentence:
    """
    Spell-checks a sentence with a root-word n-gram model. Each word is
    analyzed morphologically; analyzable words are copied unchanged and their
    longest root becomes the bigram context for the next word. For an
    unanalyzable word, every spelling candidate is scored by the n-gram
    probability of its longest root (conditioned on the previous root when
    one exists) and the best-scoring candidate replaces the word.

    PARAMETERS
    ----------
    sentence : Sentence
        Sentence type input.

    RETURNS
    -------
    Sentence
        Sentence result.
    """
    previousRoot = None
    result = Sentence()
    for i in range(sentence.wordCount()):
        word = sentence.getWord(i)
        fsmParses = self.fsm.morphologicalAnalysis(word.getName())
        if fsmParses.size() == 0:
            # Word is misspelled: score each candidate by n-gram probability.
            candidates = self.candidateList(word)
            # Fall back to the original word when no candidate beats 0.0.
            bestCandidate = word.getName()
            bestRoot = word
            bestProbability = 0.0
            for candidate in candidates:
                # NOTE: fsmParses is reused for the candidate's analyses here.
                fsmParses = self.fsm.morphologicalAnalysis(candidate)
                root = fsmParses.getParseWithLongestRootWord().getWord()
                if previousRoot is not None:
                    # Bigram probability conditioned on the previous root.
                    probability = self.__nGram.getProbability(
                        previousRoot.getName(), root.getName())
                else:
                    # First scored word: unigram probability of the root.
                    probability = self.__nGram.getProbability(root.getName())
                if probability > bestProbability:
                    bestCandidate = candidate
                    bestRoot = root
                    bestProbability = probability
            previousRoot = bestRoot
            result.addWord(Word(bestCandidate))
        else:
            # Correctly spelled: keep the word; its longest root is the new context.
            result.addWord(word)
            previousRoot = fsmParses.getParseWithLongestRootWord().getWord()
    return result
def test_SpellCheckSurfaceForm(self):
    """Surface-form n-gram spell checking on three short phrases."""
    fsm = FsmMorphologicalAnalyzer("../turkish_dictionary.txt",
                                   "../turkish_misspellings.txt",
                                   "../turkish_finite_state_machine.xml")
    nGram = NGram("../ngram.txt")
    nGram.calculateNGramProbabilitiesSimple(NoSmoothing())
    nGramSpellChecker = NGramSpellChecker(fsm, nGram, False)
    for misspelled, expected in [("noter hakkınad", "noter hakkında"),
                                 ("arçelik'in çamşaır", "arçelik'in çamaşır"),
                                 ("ruhset yanında", "ruhsat yanında")]:
        self.assertEqual(expected,
                         nGramSpellChecker.spellCheck(Sentence(misspelled)).__str__())
def spellCheck(self, sentence: Sentence) -> Sentence:
    """
    Replaces every morphologically unanalyzable word of the sentence with a
    randomly chosen spelling candidate; analyzable words and words without
    candidates are copied unchanged.

    PARAMETERS
    ----------
    sentence : Sentence
        Sentence type input.

    RETURNS
    -------
    Sentence
        Sentence result.
    """
    corrected = Sentence()
    for position in range(sentence.wordCount()):
        word = sentence.getWord(position)
        replacement = word
        if self.fsm.morphologicalAnalysis(word.getName()).size() == 0:
            options = self.candidateList(word)
            if len(options) > 0:
                # Pick one candidate uniformly at random.
                replacement = Word(options[randrange(len(options))])
        corrected.addWord(replacement)
    return corrected
def deasciify(self, sentence: Sentence) -> Sentence:
    """
    Deasciifies a sentence word by word: every morphologically unanalyzable
    word is replaced by a randomly chosen deasciification candidate, while
    analyzable words and words without candidates pass through unchanged.

    PARAMETERS
    ----------
    sentence : Sentence
        Sentence type input.

    RETURNS
    -------
    Sentence
        result Sentence.
    """
    deasciified = Sentence()
    for position in range(sentence.wordCount()):
        word = sentence.getWord(position)
        replacement = word
        if self.fsm.morphologicalAnalysis(word.getName()).size() == 0:
            options = self.candidateList(word)
            if len(options) > 0:
                # Pick one candidate uniformly at random.
                replacement = Word(options[randrange(len(options))])
        deasciified.addWord(replacement)
    return deasciified
def __init__(self, corpus, example, pdf_path=None):
    """
    Runs the requested tool on the given corpus and stores the outcome in
    self.result. When a tool yields None, a Turkish explanation string is
    stored instead.

    PARAMETERS
    ----------
    corpus : str
        Input text (a word or a sentence depending on the tool).
    example : Tool
        Which tool to run.
    pdf_path : str
        Path of the PDF file, used only by CONVERT_PDF_TO_TXT.
    """
    self.corpus = corpus
    self.example = example
    self.result = None
    self.pdf_path = pdf_path
    # example holds a single Tool value, so the original if-chain is an
    # elif dispatch: exactly one branch can match.
    if self.example == Tool.KELIMEYI_OGELERINE_AYIR:
        self.result = zemberekTool.ogelere_ayir(corpus)
        if self.result is None:
            self.result = "Cümle yerine kelime girmeniz gerekiyor veya girdiğiniz kelime yanlış"
    elif self.example == Tool.CUMLEDE_GECEN_KOKLERI_BUL:
        self.result = zemberekTool.metinde_gecen_kokleri_bul(self.corpus)
    elif self.example == Tool.CUMLEYI_PARCALARA_AYIR:
        self.result = zemberekTool.cumleyi_parcalara_ayir(self.corpus)
    elif self.example == Tool.KELIME_ONERICI:
        self.result = zemberekTool.kelime_onerici(self.corpus)
        if self.result is None:
            self.result = "Cümle yerine kelime girmeniz gerekiyor"
    elif self.example == Tool.KELIME_HECELE:
        self.result = zemberekTool.kelime_hecele(self.corpus)
        if self.result is None:
            self.result = "Cümle yerine kelime girmeniz gerekiyor"
    elif self.example == Tool.NLTK_FILES_DOWNLOAD:
        self.result = nltk_download()
    elif self.example == Tool.PERSONIFICATION_COPULA:
        self.result = personal(self.corpus, Person.FIRST, is_plural=True)
        if self.result is None:
            self.result = "Cümle yerine kelime girmeniz gerekiyor"
    elif self.example == Tool.INFERENTIAL_MOOD:
        self.result = inferential(self.corpus, Person.SECOND, is_plural=False)
        if self.result is None:
            self.result = "Cümle yerine kelime girmeniz gerekiyor"
    elif self.example == Tool.CONVERT_PDF_TO_TXT:
        self.result = pdfconverter.PDFParser(pdf_path).parse()
        if self.result is None:
            self.result = "PDF path yanlış olabilir veya PDF olmayabilir"
    elif self.example == Tool.SENTENCE_CORRECTOR:
        # Spell-check the corpus with the FSM-backed simple spell checker.
        fsm = FsmMorphologicalAnalyzer("./SpellChecker/turkish_dictionary.txt",
                                       "./SpellChecker/turkish_misspellings.txt",
                                       "./SpellChecker/turkish_finite_state_machine.xml")
        spellChecker = SimpleSpellChecker(fsm)
        sentence = Sentence(self.corpus)
        self.result = spellChecker.spellCheck(sentence)
def test_SpellCheck(self):
    """
    Runs the simple spell checker over every (misspelled, corrected) pair in
    misspellings.txt and checks that each misspelling is corrected.
    """
    fsm = FsmMorphologicalAnalyzer("../turkish_dictionary.txt",
                                   "../turkish_misspellings.txt",
                                   "../turkish_finite_state_machine.xml")
    simpleSpellChecker = SimpleSpellChecker(fsm)
    # 'with' guarantees the handle is closed even when an assertion fails;
    # the original left the file open and shadowed the builtin 'input'.
    with open("../misspellings.txt") as misspellingsFile:
        lines = misspellingsFile.readlines()
    for line in lines:
        items = line.strip().split(" ")
        misspelled = items[0]
        corrected = items[1]
        self.assertEqual(
            corrected,
            simpleSpellChecker.spellCheck(Sentence(misspelled)).toString())
def test_Deasciify(self):
    """
    For every analyzable corpus word whose asciified form differs from the
    original, deasciifies the two-word context and expects the original
    word back.
    """
    fsm = FsmMorphologicalAnalyzer("../turkish_dictionary.txt",
                                   "../turkish_misspellings.txt",
                                   "../turkish_finite_state_machine.xml")
    nGram = NGram("../ngram.txt")
    nGram.calculateNGramProbabilitiesSimple(NoSmoothing())
    nGramDeasciifier = NGramDeasciifier(fsm, nGram, True)
    simpleAsciifier = SimpleAsciifier()
    corpus = Corpus("../corpus.txt")
    for sentenceIndex in range(corpus.sentenceCount()):
        sentence = corpus.getSentence(sentenceIndex)
        for wordIndex in range(1, sentence.wordCount()):
            current = sentence.getWord(wordIndex)
            if fsm.morphologicalAnalysis(current.getName()).size() == 0:
                continue
            if simpleAsciifier.asciifyWord(current) == current.getName():
                continue
            context = sentence.getWord(wordIndex - 1).getName() + " " + current.getName()
            deasciified = nGramDeasciifier.deasciify(Sentence(context))
            self.assertEqual(current.getName(), deasciified.getWord(1).getName())
def test_Deasciify(self):
    """
    For every dictionary entry containing a Turkish-specific lowercase letter
    (skipping words ending in "fulü"), asciifies it and — when the asciified
    form has exactly one deasciification candidate — expects the simple
    deasciifier to recover the original word.
    """
    fsm = FsmMorphologicalAnalyzer("../turkish_dictionary.txt",
                                   "../turkish_misspellings.txt",
                                   "../turkish_finite_state_machine.xml")
    simpleDeasciifier = SimpleDeasciifier(fsm)
    simpleAsciifier = SimpleAsciifier()
    for i in range(fsm.getDictionary().size()):
        word = fsm.getDictionary().getWordWithIndex(i)
        name = word.getName()
        hasTurkishLetter = any(ch in "çöğüşı" for ch in name)
        if hasTurkishLetter and not name.endswith("fulü") and \
                (word.isNominal() or word.isAdjective() or word.isAdverb() or word.isVerb()):
            asciified = simpleAsciifier.asciifyWord(word)
            if len(simpleDeasciifier.candidateList(Word(asciified))) == 1:
                deasciified = simpleDeasciifier.deasciify(Sentence(asciified)).toString()
                self.assertEqual(name, deasciified)
def generateInstanceFromSentence(self, sentence: Sentence, wordIndex: int) -> Instance:
    """
    Generates a single classification instance of the shallow parse problem
    for the given word of the given sentence.

    PARAMETERS
    ----------
    sentence : Sentence
        Input sentence.
    wordIndex : int
        The index of the word in the sentence.

    RETURNS
    -------
    Instance
        Classification instance labeled with the shallow parse tag, or None
        when the word is not an AnnotatedWord.
    """
    word = sentence.getWord(wordIndex)
    if not isinstance(word, AnnotatedWord):
        return None
    instance = Instance(word.getShallowParse())
    self.addAttributes(instance, sentence, wordIndex)
    return instance
def asciify(self, sentence: Sentence) -> Sentence:
    """
    Asciifies a whole sentence by asciifying each word in order and
    collecting the results into a new Sentence.

    PARAMETERS
    ----------
    sentence : Sentence
        Sentence type input.

    RETURNS
    -------
    Sentence
        Asciified copy of the input sentence.
    """
    asciified = Sentence()
    for index in range(sentence.wordCount()):
        original = sentence.getWord(index)
        asciified.addWord(Word(self.asciifyWord(original)))
    return asciified
def __init__(self, fileName=None):
    """
    Constructor which creates a list of sentences and a CounterHashMap of
    wordList. When a file name is given, reads the disambiguation corpus
    from it: each line is "word<TAB>parse"; <S>/</S> markers delimit
    sentences and DOC/TITLE markers are skipped.

    PARAMETERS
    ----------
    fileName : str
        Name of the corpus file to read (optional).
    """
    super().__init__()
    if fileName is not None:
        # 'with' guarantees the file is closed even if parsing raises;
        # the original leaked the handle on an exception.
        with open(fileName, "r", encoding="utf8") as inputFile:
            newSentence = Sentence()
            for line in inputFile:
                if "\t" not in line:
                    # Malformed line without a tab separator: skip it instead
                    # of raising ValueError as line.index("\t") would.
                    continue
                word, parse = line.split("\t", 1)
                if len(word) > 0 and len(parse) > 0:
                    newWord = DisambiguatedWord(word, MorphologicalParse(parse.strip()))
                    if word == "<S>":
                        newSentence = Sentence()
                    elif word == "</S>":
                        self.addSentence(newSentence)
                    elif word in ("<DOC>", "</DOC>", "<TITLE>", "</TITLE>"):
                        pass
                    else:
                        newSentence.addWord(newWord)
def deasciify(self, sentence: Sentence) -> Sentence:
    """
    Deasciifies a sentence with an n-gram model over a sliding three-word
    window. A word is replaced only when it has no morphological analysis;
    then every deasciification candidate is scored by the larger of its
    n-gram probabilities against the previous and next roots, and the best
    candidate above the threshold wins.

    PARAMETERS
    ----------
    sentence : Sentence
        Sentence type input.

    RETURNS
    -------
    Sentence
        Sentence result as output.
    """
    previousRoot = None
    result = Sentence()
    # Roots of the current (index 0) and next (index 1) words; None marks
    # an unanalyzable (candidate for deasciification) or missing word.
    root = self.checkAnalysisAndSetRoot(sentence, 0)
    nextRoot = self.checkAnalysisAndSetRoot(sentence, 1)
    for i in range(sentence.wordCount()):
        word = sentence.getWord(i)
        if root is None:
            # Word is unanalyzable: try each deasciification candidate.
            candidates = self.candidateList(word)
            bestCandidate = word.getName()
            bestRoot = word
            # Candidates must beat the configured probability threshold.
            bestProbability = self.__threshold
            for candidate in candidates:
                fsmParses = self.fsm.morphologicalAnalysis(candidate)
                if self.__rootNgram:
                    # Score on root words.
                    root = fsmParses.getParseWithLongestRootWord().getWord()
                else:
                    # Score on surface forms.
                    root = Word(candidate)
                if previousRoot is not None:
                    previousProbability = self.__nGram.getProbability(
                        previousRoot.getName(), root.getName())
                else:
                    previousProbability = 0.0
                if nextRoot is not None:
                    nextProbability = self.__nGram.getProbability(
                        root.getName(), nextRoot.getName())
                else:
                    nextProbability = 0.0
                # A candidate wins when either context direction scores best.
                if max(previousProbability, nextProbability) > bestProbability:
                    bestCandidate = candidate
                    bestRoot = root
                    bestProbability = max(previousProbability, nextProbability)
            root = bestRoot
            result.addWord(Word(bestCandidate))
        else:
            # Analyzable word: copy unchanged.
            result.addWord(word)
        # Slide the (previous, current, next) root window one word forward.
        previousRoot = root
        root = nextRoot
        nextRoot = self.checkAnalysisAndSetRoot(sentence, i + 2)
    return result
class SentenceTest(unittest.TestCase):
    """Unit tests for the basic Sentence container operations."""

    sentence: Sentence

    def setUp(self) -> None:
        """Create the shared six-word fixture sentence."""
        self.sentence = Sentence()
        for token in ["ali", "topu", "at", "mehmet", "ayşeyle", "gitti"]:
            self.sentence.addWord(Word(token))

    def test_GetWord(self):
        """Words are retrievable by position."""
        for index, token in [(0, "ali"), (2, "at"), (5, "gitti")]:
            self.assertEqual(Word(token), self.sentence.getWord(index))

    def test_GetIndex(self):
        """Positions are retrievable by word."""
        for index, token in [(0, "ali"), (2, "at"), (5, "gitti")]:
            self.assertEqual(index, self.sentence.getIndex(Word(token)))

    def test_WordCount(self):
        """The fixture holds six words."""
        self.assertEqual(6, self.sentence.wordCount())

    def test_CharCount(self):
        """Total letter count over all six words is 27."""
        self.assertEqual(27, self.sentence.charCount())
def split(self, line: str) -> list:
    """
    Splits a raw text line into sentences with a character-by-character state
    machine. Separators ({}, [], (), quotes, apostrophes) are tracked with
    counters so sentence boundaries are not declared inside balanced pairs;
    sentence enders (., !, ? ...) close a sentence only when the following
    character is uppercase or a digit; web addresses, e-mails, numbers,
    times, abbreviations and name shortcuts keep their internal punctuation.

    PARAMETERS
    ----------
    line : str
        String input to split.

    RETURNS
    -------
    list
        sentences list which holds split line.
    """
    emailMode = False
    webMode = False
    i = 0
    # Balance counters for bracket/quote pairs; quotaCount and
    # apostropheCount toggle between 0 and 1.
    specialQuotaCount = 0
    roundParenthesisCount = 0
    bracketCount = 0
    curlyBracketCount = 0
    quotaCount = 0
    apostropheCount = 0
    currentSentence = Sentence()
    currentWord = ""
    sentences = []
    while i < len(line):
        if line[i] in SentenceSplitter.SEPARATORS:
            if line[i] == '\'' and currentWord != "" and self.__isApostrophe(
                    line, i):
                # Word-internal apostrophe (e.g. Ali'nin) stays in the word.
                currentWord = currentWord + line[i]
            else:
                # Flush the pending word, then emit the separator as a word.
                if currentWord != "":
                    currentSentence.addWord(
                        Word(
                            self.__repeatControl(currentWord,
                                                 webMode or emailMode)))
                currentSentence.addWord(Word("" + line[i]))
                currentWord = ""
                if line[i] == '{':
                    curlyBracketCount = curlyBracketCount + 1
                elif line[i] == '}':
                    curlyBracketCount = curlyBracketCount - 1
                elif line[i] == '\uFF02':
                    # Fullwidth quotation mark opens a special quote span.
                    specialQuotaCount = specialQuotaCount + 1
                elif line[i] == '\u05F4':
                    # Hebrew gershayim closes the special quote span.
                    specialQuotaCount = specialQuotaCount - 1
                elif line[i] == '(':
                    roundParenthesisCount = roundParenthesisCount + 1
                elif line[i] == ')':
                    roundParenthesisCount = roundParenthesisCount - 1
                elif line[i] == '[':
                    bracketCount = bracketCount + 1
                elif line[i] == ']':
                    bracketCount = bracketCount - 1
                elif line[i] == '"':
                    quotaCount = 1 - quotaCount
                elif line[i] == '\'':
                    apostropheCount = 1 - apostropheCount
                # A closing double quote with everything balanced and an
                # uppercase/digit following ends the sentence.
                if line[i] == '"' and bracketCount == 0 and specialQuotaCount == 0 and curlyBracketCount == 0 and \
                        roundParenthesisCount == 0 and quotaCount == 0 and \
                        self.__isNextCharUpperCaseOrDigit(line, i + 1):
                    sentences.append(currentSentence)
                    currentSentence = Sentence()
        else:
            if line[i] in SentenceSplitter.SENTENCE_ENDERS:
                if line[i] == '.' and currentWord == "www":
                    # "www." starts a web address.
                    webMode = True
                if line[i] == '.' and currentWord != "" and (
                        webMode or emailMode
                        or line[i - 1] in TurkishLanguage.DIGITS):
                    # Dot inside a URL/e-mail/number stays in the word.
                    currentWord = currentWord + line[i]
                else:
                    if line[i] == '.' and (self.__listContains(currentWord) or
                                           self.__isNameShortcut(currentWord)):
                        # Abbreviation or name shortcut (e.g. "bkz."): keep
                        # the dot and do not end the sentence.
                        currentWord = currentWord + line[i]
                        currentSentence.addWord(Word(currentWord))
                        currentWord = ""
                    else:
                        # Genuine sentence ender: flush the word, then absorb
                        # a run of consecutive enders ("..", "?!") into one.
                        if currentWord != "":
                            currentSentence.addWord(
                                Word(
                                    self.__repeatControl(
                                        currentWord, webMode or emailMode)))
                        currentWord = "" + line[i]
                        i = i + 1
                        while i < len(line) and line[
                                i] in SentenceSplitter.SENTENCE_ENDERS:
                            i = i + 1
                        i = i - 1
                        currentSentence.addWord(Word(currentWord))
                        if roundParenthesisCount == 0 and bracketCount == 0 and curlyBracketCount == 0 and \
                                quotaCount == 0:
                            # A trailing apostrophe (or space + apostrophe)
                            # belongs to this sentence before it closes.
                            if i + 1 < len(line) and line[i + 1] == '\'' and apostropheCount == 1 and \
                                    self.__isNextCharUpperCaseOrDigit(line, i + 2):
                                currentSentence.addWord(Word("'"))
                                i = i + 1
                                sentences.append(currentSentence)
                                currentSentence = Sentence()
                            else:
                                if i + 2 < len(line) and line[i + 1] == ' ' and line[i + 2] == '\'' and \
                                        apostropheCount == 1 and self.__isNextCharUpperCaseOrDigit(line, i + 3):
                                    currentSentence.addWord(Word("'"))
                                    i += 2
                                    sentences.append(currentSentence)
                                    currentSentence = Sentence()
                                else:
                                    if self.__isNextCharUpperCaseOrDigit(
                                            line, i + 1):
                                        sentences.append(currentSentence)
                                        currentSentence = Sentence()
                        currentWord = ""
            else:
                if line[i] == ' ':
                    # Whitespace ends URL/e-mail mode and flushes the word.
                    emailMode = False
                    webMode = False
                    if currentWord != "":
                        currentSentence.addWord(
                            Word(
                                self.__repeatControl(currentWord,
                                                     webMode or emailMode)))
                        currentWord = ""
                else:
                    if line[i] == '-' and not webMode and roundParenthesisCount == 0 and \
                            self.__isNextCharUpperCase(line, i + 1) and \
                            not self.__isPreviousWordUpperCase(line, i - 1):
                        # Dash followed by an uppercase word: treated as a
                        # dialogue dash that starts a new sentence.
                        if currentWord != "" and currentWord not in TurkishLanguage.DIGITS:
                            currentSentence.addWord(
                                Word(
                                    self.__repeatControl(
                                        currentWord, webMode or emailMode)))
                        if currentSentence.wordCount() > 0:
                            sentences.append(currentSentence)
                        currentSentence = Sentence()
                        # Reset all balance counters for the new sentence.
                        roundParenthesisCount = 0
                        bracketCount = 0
                        curlyBracketCount = 0
                        quotaCount = 0
                        specialQuotaCount = 0
                        if currentWord != "" and re.match(
                                "\\d+", currentWord):
                            # Keep a numeric prefix together with the dash.
                            currentSentence.addWord(Word(currentWord + " -"))
                        else:
                            currentSentence.addWord(Word("-"))
                        currentWord = ""
                    else:
                        if line[i] in SentenceSplitter.PUNCTUATION_CHARACTERS or \
                                line[i] in TurkishLanguage.ARITHMETIC_CHARACTERS:
                            if line[i] == ':' and (currentWord == "http"
                                                   or currentWord == "https"):
                                # "http:"/"https:" starts a web address.
                                webMode = True
                            if webMode:
                                # Inside a URL all punctuation is literal.
                                currentWord = currentWord + line[i]
                            else:
                                if line[i] == ',' and self.__numberExistsBeforeAndAfter(
                                        line, i):
                                    # Decimal comma, e.g. 1,2.
                                    currentWord = currentWord + line[i]
                                else:
                                    if line[i] == ':' and self.__isTime(
                                            line, i):
                                        # Time separator, e.g. 12:14:24.
                                        currentWord = currentWord + line[i]
                                    else:
                                        if line[i] == '-' and self.__numberExistsBeforeAndAfter(
                                                line, i):
                                            # Numeric range, e.g. 12-1.
                                            currentWord = currentWord + line[
                                                i]
                                        else:
                                            # Ordinary punctuation: flush the
                                            # word and emit it separately.
                                            if currentWord != "":
                                                currentSentence.addWord(
                                                    Word(
                                                        self.
                                                        __repeatControl(
                                                            currentWord,
                                                            webMode
                                                            or emailMode)))
                                            currentSentence.addWord(
                                                Word("" + line[i]))
                                            currentWord = ""
                        else:
                            if line[i] == '@':
                                # '@' marks an e-mail address.
                                currentWord = currentWord + line[i]
                                emailMode = True
                            else:
                                currentWord = currentWord + line[i]
        i = i + 1
    # Flush any trailing word and unterminated sentence.
    if currentWord != "":
        currentSentence.addWord(
            Word(self.__repeatControl(currentWord, webMode or emailMode)))
    if currentSentence.wordCount() > 0:
        sentences.append(currentSentence)
    return sentences
def test_SpellCheck(self):
    """Each corrupted sentence must be corrected back to its original."""
    cases = [
        ("demokratik cumhuriyet en kıymetli varlığımızdır",
         "demokratik cumhüriyet rn kımetli varlıgımızdır"),
        ("bu tablodaki değerler zedelenmeyecektir",
         "bu tblodaki değerlğr zedelenmeyecüktir"),
        ("milliyet'in geleneksel yılın sporcusu anketi 43. yaşını doldurdu",
         "milliyet'in geeneksel yılin spoşcusu ankşti 43. yeşını doldürdu"),
        ("demokrasinin icadı bu ayrımı bulandırdı",
         "demokrasinin icşdı buf ayrmıı bulandürdı"),
        ("dışişleri müsteşarı Öymen'in 1997'nin ilk aylarında Bağdat'a gitmesi öngörülüyor",
         "dışişleri mütseşarı Öymen'in 1997'nin iljk aylğrında Bağdat'a gitmesi öngörülüyor"),
        ("büyüdü , palazlandı , devleti ele geçirdi",
         "büyüdü , palazandı , devltei eöe geçridi"),
        ("her maskenin ciltte kalma süresi farklıdır",
         "her makenin cültte aklma sürdsi farlkıdır"),
        ("yılın son ayında 10 gazeteci gözaltına alındı",
         "yılın sno ayında 10 gazteci gözlatına alündı"),
        ("iki pilotun kullandığı uçakta bir hostes görev alıyor",
         "iki piotun kulçandığı uçkata üir hotes görçv alyıor"),
        ("son derece kısıtlı kelimeler çerçevesinde kendilerini uzun cümlelerle ifade edebiliyorlar",
         "son deece kısütlı keilmeler çeçevesinde kendülerini uzuü cümllerle ifüde edbeiliyorlar"),
        ("kedi köpek", "krdi köpek"),
        ("minibüs durağı", "minibü durağı"),
        ("noter belgesi", "ntoer belgesi"),
        ("", ""),
    ]
    fsm = FsmMorphologicalAnalyzer("../turkish_dictionary.txt",
                                   "../turkish_misspellings.txt",
                                   "../turkish_finite_state_machine.xml")
    nGram = NGram("../ngram.txt")
    nGram.calculateNGramProbabilitiesSimple(NoSmoothing())
    nGramSpellChecker = NGramSpellChecker(fsm, nGram)
    for originalText, modifiedText in cases:
        self.assertEqual(
            Sentence(originalText).toString(),
            nGramSpellChecker.spellCheck(Sentence(modifiedText)).toString())
def test_SentenceAsciify(self):
    """Sentence-level asciification of Turkish-character-heavy strings."""
    cases = [
        ("çöğüş ııı ÇÖĞÜŞİ", "cogus iii COGUSI"),
        ("üçkağıtçılık akışkanlıştırıcılık", "uckagitcilik akiskanlistiricilik"),
        ("çıtçıtçılık düşkırıklığı yüzgörümlüğü",
         "citcitcilik duskirikligi yuzgorumlugu"),
    ]
    for turkishText, asciiText in cases:
        self.assertEqual(
            Sentence(asciiText).toString(),
            self.simpleAsciifier.asciify(Sentence(turkishText)).toString())