def deasciify(self, sentence: Sentence) -> Sentence:
        """
        The deasciify method takes a Sentence as an input and loops i times where i ranges from 0 to number of
        words in the given Sentence. First it gets ith word from given Sentence and calls candidateList with
        ith word and assigns the returned list to the newly created candidates list. And if the size of
        candidates list is greater than 0, it generates a random number and gets the item of candidates list
        at the index of random number and assigns it as a newWord. If the size of candidates list is 0, it then
        directly assigns ith word as the newWord. At the end, it adds newWord to the result Sentence.

        PARAMETERS
        ----------
        sentence : Sentence
            Sentence type input.

        RETURNS
        -------
        Sentence
            result Sentence.
        """
        result = Sentence()
        for i in range(sentence.wordCount()):
            word = sentence.getWord(i)
            fsmParseList = self.fsm.morphologicalAnalysis(word.getName())
            if fsmParseList.size() == 0:
                candidates = self.candidateList(word)
                if len(candidates) > 0:
                    randomCandidate = randrange(len(candidates))
                    newWord = Word(candidates[randomCandidate])
                else:
                    newWord = word
            else:
                newWord = word
            result.addWord(newWord)
        return result
Beispiel #2
0
    def spellCheck(self, sentence: Sentence) -> Sentence:
        """
        The spellCheck method takes a Sentence as an input and loops i times where i ranges from 0 to size of words in
        given sentence. Then, it calls morphologicalAnalysis method with each word and assigns it to the FsmParseList,
        if the size of FsmParseList is equal to the 0, it adds current word to the candidateList and assigns it to the
        candidates list. If the size of candidates greater than 0, it generates a random number and selects an item from
        candidates list with this random number and assign it as newWord. If the size of candidates is not greater than
        0, it directly assigns the current word as newWord. At the end, it adds the newWord to the result Sentence.

        PARAMETERS
        ----------
        sentence : Sentence
            Sentence type input.

        RETURNS
        -------
        Sentence
            Sentence result.
        """
        result = Sentence()
        for i in range(sentence.wordCount()):
            word = sentence.getWord(i)
            fsmParseList = self.fsm.morphologicalAnalysis(word.getName())
            if fsmParseList.size() == 0:
                candidates = self.candidateList(word)
                if len(candidates) > 0:
                    randomCandidate = randrange(len(candidates))
                    newWord = Word(candidates[randomCandidate])
                else:
                    newWord = word
            else:
                newWord = word
            result.addWord(newWord)
        return result
Beispiel #3
0
    def spellCheck(self, sentence: Sentence) -> Sentence:
        """
        The spellCheck method takes a Sentence as an input and loops i times where i ranges from 0 to size of words in
        given sentence. Then, it calls morphologicalAnalysis method with each word and assigns it to the FsmParseList,
        if the size of FsmParseList is equal to the 0, it adds current word to the candidateList and assigns it to the
        candidates list.

        Later on, it loops through candidates list and calls morphologicalAnalysis method with each word and assigns it
        to the FsmParseList. Then, it gets the root from FsmParseList. For the first time, it defines a previousRoot by
        calling getProbability method with root, and for the following times it calls getProbability method with
        previousRoot and root. Then, it finds out the best probability and the corresponding candidate as best candidate
        and adds it to the result Sentence.

        If the size of FsmParseList is not equal to 0, it directly adds the current word to the result Sentence and
        finds the previousRoot directly from the FsmParseList.

        PARAMETERS
        ----------
        sentence : Sentence
            Sentence type input.

        RETURNS
        -------
        Sentence
            Sentence result.
        """
        previousRoot = None
        result = Sentence()
        for i in range(sentence.wordCount()):
            word = sentence.getWord(i)
            fsmParses = self.fsm.morphologicalAnalysis(word.getName())
            if fsmParses.size() == 0:
                candidates = self.candidateList(word)
                bestCandidate = word.getName()
                bestRoot = word
                bestProbability = 0.0
                for candidate in candidates:
                    fsmParses = self.fsm.morphologicalAnalysis(candidate)
                    root = fsmParses.getParseWithLongestRootWord().getWord()
                    if previousRoot is not None:
                        probability = self.__nGram.getProbability(
                            previousRoot.getName(), root.getName())
                    else:
                        probability = self.__nGram.getProbability(
                            root.getName())
                    if probability > bestProbability:
                        bestCandidate = candidate
                        bestRoot = root
                        bestProbability = probability
                previousRoot = bestRoot
                result.addWord(Word(bestCandidate))
            else:
                result.addWord(word)
                previousRoot = fsmParses.getParseWithLongestRootWord().getWord(
                )
        return result
    def asciify(self, sentence: Sentence) -> Sentence:
        """
        Another asciify method which takes a Sentence as an input. It loops i times where i ranges form 0 to
        number of words in the given sentence. First it gets each word and calls asciify with current word and creates
        Word with returned String. At the and, adds each newly created ascified words to the result Sentence.

        PARAMETERS
        ----------
        sentence : Sentence
            Sentence type input.

        RETURNS
        -------
        Sentence
            Sentence output which is asciified.
        """
        result = Sentence()
        for i in range(sentence.wordCount()):
            word = sentence.getWord(i)
            newWord = Word(self.asciifyWord(word))
            result.addWord(newWord)
        return result
 def __init__(self, fileName=None):
     """
     Constructor which creates a list of sentences and a CounterHashMap of wordList.
     """
     super().__init__()
     if fileName is not None:
         inputFile = open(fileName, "r", encoding="utf8")
         lines = inputFile.readlines()
         newSentence = Sentence()
         for line in lines:
             word = line[:line.index("\t")]
             parse = line[line.index("\t") + 1:]
             if len(word) > 0 and len(parse) > 0:
                 newWord = DisambiguatedWord(word, MorphologicalParse(parse.strip()))
                 if word == "<S>":
                     newSentence = Sentence()
                 elif word == "</S>":
                     self.addSentence(newSentence)
                 elif word == "<DOC>" or word == "</DOC>" or word == "<TITLE>" or word == "</TITLE>":
                     pass
                 else:
                     newSentence.addWord(newWord)
         inputFile.close()
Beispiel #6
0
    def split(self, line: str) -> list:
        """
        The split method takes a String line as an input. Firstly it creates a new sentence as currentSentence a new
        list as sentences. Then loops till the end of the line and checks some conditions;
        If the char at ith index is a separator;

        ' : assigns currentWord as currentWord'
        { : increment the curlyBracketCount
        } : decrement the curlyBracketCount
        " : increment the specialQuotaCount
        " : decrement the specialQuotaCount
        ( : increment roundParenthesisCount
        ) : decrement roundParenthesisCount
        [ : increment bracketCount
        ] : decrement bracketCount
        " : assign quotaCount as 1- quotaCount
        ' : assign apostropheCount as 1- apostropheCount

        If the currentWord is not empty, it adds the currentWord after repeatControl to currentSentence.

        If the char at index i is " and  bracketCount, specialQuotaCount, curlyBracketCount, roundParenthesisCount, and
        quotaCount equal to 0 and also the next char is uppercase or digit, it adds currentSentence to sentences.

        If the char at ith index is a sentence ender;

        . and currentWord is www : assigns webMode as true. Ex: www.google.com
        . and currentWord is a digit or in web or e-mail modes : assigns currentWord as currentWord+char(i) Ex: 1.
        . and currentWord is a shortcut or an abbreviation : assigns currentWord as currentWord+char(i) and adds
        currentWord to currentSentence. Ex : bkz.
        ' and next char is uppercase or digit: add word to currentSentence as ' and add currentSentence to sentences.

        If the char at index i is ' ', i.e space, add word to currentSentence and assign "" to currentSentence.
        If the char at index i is -,  add word to currentSentence and add sentences when the wordCount of
        currentSentence greater than 0.

        If the char at ith index is a punctuation;
        : and if currentWord is "https" : assign webMode as true.
        , and there exists a number before and after : assign currentWord as currentWord+char(i) Ex: 1,2
        : and if line is a time : assign currentWord as currentWord+char(i) Ex: 12:14:24
        - and there exists a number before and after : assign currentWord as currentWord+char(i) Ex: 12-1
        {@literal @} : assign emailMode as true.

        PARAMETERS
        ----------
        line : str
            String input to split.

        RETURNS
        -------
        list
            sentences list which holds split line.
        """
        emailMode = False
        webMode = False
        i = 0
        specialQuotaCount = 0
        roundParenthesisCount = 0
        bracketCount = 0
        curlyBracketCount = 0
        quotaCount = 0
        apostropheCount = 0
        currentSentence = Sentence()
        currentWord = ""
        sentences = []
        while i < len(line):
            if line[i] in SentenceSplitter.SEPARATORS:
                if line[i] == '\'' and currentWord != "" and self.__isApostrophe(
                        line, i):
                    currentWord = currentWord + line[i]
                else:
                    if currentWord != "":
                        currentSentence.addWord(
                            Word(
                                self.__repeatControl(currentWord, webMode
                                                     or emailMode)))
                    currentSentence.addWord(Word("" + line[i]))
                    currentWord = ""
                    if line[i] == '{':
                        curlyBracketCount = curlyBracketCount + 1
                    elif line[i] == '}':
                        curlyBracketCount = curlyBracketCount - 1
                    elif line[i] == '\uFF02':
                        specialQuotaCount = specialQuotaCount + 1
                    elif line[i] == '\u05F4':
                        specialQuotaCount = specialQuotaCount - 1
                    elif line[i] == '(':
                        roundParenthesisCount = roundParenthesisCount + 1
                    elif line[i] == ')':
                        roundParenthesisCount = roundParenthesisCount - 1
                    elif line[i] == '[':
                        bracketCount = bracketCount + 1
                    elif line[i] == ']':
                        bracketCount = bracketCount - 1
                    elif line[i] == '"':
                        quotaCount = 1 - quotaCount
                    elif line[i] == '\'':
                        apostropheCount = 1 - apostropheCount
                    if line[i] == '"' and bracketCount == 0 and specialQuotaCount == 0 and curlyBracketCount == 0 and \
                            roundParenthesisCount == 0 and quotaCount == 0 and self.__isNextCharUpperCaseOrDigit(line,
                                                                                                                 i + 1):
                        sentences.append(currentSentence)
                        currentSentence = Sentence()
            else:
                if line[i] in SentenceSplitter.SENTENCE_ENDERS:
                    if line[i] == '.' and currentWord == "www":
                        webMode = True
                    if line[i] == '.' and currentWord != "" and (
                            webMode or emailMode
                            or line[i - 1] in TurkishLanguage.DIGITS):
                        currentWord = currentWord + line[i]
                    else:
                        if line[i] == '.' and (
                                self.__listContains(currentWord)
                                or self.__isNameShortcut(currentWord)):
                            currentWord = currentWord + line[i]
                            currentSentence.addWord(Word(currentWord))
                            currentWord = ""
                        else:
                            if currentWord != "":
                                currentSentence.addWord(
                                    Word(
                                        self.__repeatControl(
                                            currentWord, webMode
                                            or emailMode)))
                            currentWord = "" + line[i]
                            i = i + 1
                            while i < len(line) and line[
                                    i] in SentenceSplitter.SENTENCE_ENDERS:
                                i = i + 1
                            i = i - 1
                            currentSentence.addWord(Word(currentWord))
                            if roundParenthesisCount == 0 and bracketCount == 0 and curlyBracketCount == 0 and \
                                    quotaCount == 0:
                                if i + 1 < len(line) and line[i + 1] == '\'' and apostropheCount == 1 and \
                                        self.__isNextCharUpperCaseOrDigit(line, i + 2):
                                    currentSentence.addWord(Word("'"))
                                    i = i + 1
                                    sentences.append(currentSentence)
                                    currentSentence = Sentence()
                                else:
                                    if i + 2 < len(line) and line[i + 1] == ' ' and line[i + 2] == '\'' and \
                                            apostropheCount == 1 and self.__isNextCharUpperCaseOrDigit(line, i + 3):
                                        currentSentence.addWord(Word("'"))
                                        i += 2
                                        sentences.append(currentSentence)
                                        currentSentence = Sentence()
                                    else:
                                        if self.__isNextCharUpperCaseOrDigit(
                                                line, i + 1):
                                            sentences.append(currentSentence)
                                            currentSentence = Sentence()
                            currentWord = ""
                else:
                    if line[i] == ' ':
                        emailMode = False
                        webMode = False
                        if currentWord != "":
                            currentSentence.addWord(
                                Word(
                                    self.__repeatControl(
                                        currentWord, webMode or emailMode)))
                            currentWord = ""
                    else:
                        if line[i] == '-' and not webMode and roundParenthesisCount == 0 and \
                                self.__isNextCharUpperCase(line, i + 1) and \
                                not self.__isPreviousWordUpperCase(line, i - 1):
                            if currentWord != "" and currentWord not in TurkishLanguage.DIGITS:
                                currentSentence.addWord(
                                    Word(
                                        self.__repeatControl(
                                            currentWord, webMode
                                            or emailMode)))
                            if currentSentence.wordCount() > 0:
                                sentences.append(currentSentence)
                            currentSentence = Sentence()
                            roundParenthesisCount = 0
                            bracketCount = 0
                            curlyBracketCount = 0
                            quotaCount = 0
                            specialQuotaCount = 0
                            if currentWord != "" and re.match(
                                    "\\d+", currentWord):
                                currentSentence.addWord(
                                    Word(currentWord + " -"))
                            else:
                                currentSentence.addWord(Word("-"))
                            currentWord = ""
                        else:
                            if line[i] in SentenceSplitter.PUNCTUATION_CHARACTERS or \
                                    line[i] in TurkishLanguage.ARITHMETIC_CHARACTERS:
                                if line[i] == ':' and (currentWord == "http" or
                                                       currentWord == "https"):
                                    webMode = True
                                if webMode:
                                    currentWord = currentWord + line[i]
                                else:
                                    if line[i] == ',' and self.__numberExistsBeforeAndAfter(
                                            line, i):
                                        currentWord = currentWord + line[i]
                                    else:
                                        if line[i] == ':' and self.__isTime(
                                                line, i):
                                            currentWord = currentWord + line[i]
                                        else:
                                            if line[i] == '-' and self.__numberExistsBeforeAndAfter(
                                                    line, i):
                                                currentWord = currentWord + line[
                                                    i]
                                            else:
                                                if currentWord != "":
                                                    currentSentence.addWord(
                                                        Word(
                                                            self.
                                                            __repeatControl(
                                                                currentWord,
                                                                webMode
                                                                or emailMode)))
                                                currentSentence.addWord(
                                                    Word("" + line[i]))
                                                currentWord = ""
                            else:
                                if line[i] == '@':
                                    currentWord = currentWord + line[i]
                                    emailMode = True
                                else:
                                    currentWord = currentWord + line[i]
            i = i + 1
        if currentWord != "":
            currentSentence.addWord(
                Word(self.__repeatControl(currentWord, webMode or emailMode)))
        if currentSentence.wordCount() > 0:
            sentences.append(currentSentence)
        return sentences
    def deasciify(self, sentence: Sentence) -> Sentence:
        """
        The deasciify method takes a Sentence as an input. First it creates a String list as candidates,
        and a Sentence result. Then, loops i times where i ranges from 0 to words size of given sentence. It gets the
        current word and generates a candidateList with this current word then, it loops through the candidateList.
        First it calls morphologicalAnalysis method with current candidate and gets the first item as root word. If it
        is the first root, it gets its N-gram probability, if there are also other roots, it gets probability of these
        roots and finds out the best candidate, best root and the best probability. At the nd, it adds the bestCandidate
        to the bestCandidate list.

        PARAMETERS
        ----------
        sentence : Sentence
            Sentence type input.

        RETURNS
        -------
        Sentence
            Sentence result as output.
        """
        previousRoot = None
        result = Sentence()
        root = self.checkAnalysisAndSetRoot(sentence, 0)
        nextRoot = self.checkAnalysisAndSetRoot(sentence, 1)
        for i in range(sentence.wordCount()):
            word = sentence.getWord(i)
            if root is None:
                candidates = self.candidateList(word)
                bestCandidate = word.getName()
                bestRoot = word
                bestProbability = self.__threshold
                for candidate in candidates:
                    fsmParses = self.fsm.morphologicalAnalysis(candidate)
                    if self.__rootNgram:
                        root = fsmParses.getParseWithLongestRootWord().getWord(
                        )
                    else:
                        root = Word(candidate)
                    if previousRoot is not None:
                        previousProbability = self.__nGram.getProbability(
                            previousRoot.getName(), root.getName())
                    else:
                        previousProbability = 0.0
                    if nextRoot is not None:
                        nextProbability = self.__nGram.getProbability(
                            root.getName(), nextRoot.getName())
                    else:
                        nextProbability = 0.0
                    if max(previousProbability,
                           nextProbability) > bestProbability:
                        bestCandidate = candidate
                        bestRoot = root
                        bestProbability = max(previousProbability,
                                              nextProbability)
                root = bestRoot
                result.addWord(Word(bestCandidate))
            else:
                result.addWord(word)
            previousRoot = root
            root = nextRoot
            nextRoot = self.checkAnalysisAndSetRoot(sentence, i + 2)
        return result
class SentenceTest(unittest.TestCase):

    sentence: Sentence

    def setUp(self) -> None:
        self.sentence = Sentence()
        self.sentence.addWord(Word("ali"))
        self.sentence.addWord(Word("topu"))
        self.sentence.addWord(Word("at"))
        self.sentence.addWord(Word("mehmet"))
        self.sentence.addWord(Word("ayşeyle"))
        self.sentence.addWord(Word("gitti"))

    def test_GetWord(self):
        self.assertEqual(Word("ali"), self.sentence.getWord(0))
        self.assertEqual(Word("at"), self.sentence.getWord(2))
        self.assertEqual(Word("gitti"), self.sentence.getWord(5))

    def test_GetIndex(self):
        self.assertEqual(0, self.sentence.getIndex(Word("ali")))
        self.assertEqual(2, self.sentence.getIndex(Word("at")))
        self.assertEqual(5, self.sentence.getIndex(Word("gitti")))

    def test_WordCount(self):
        self.assertEqual(6, self.sentence.wordCount())

    def test_CharCount(self):
        self.assertEqual(27, self.sentence.charCount())