Example #1
0
class SpellCorrectorSentenceCorrectionBigramsTest(SpellCorrectorTestBase):
    """Sentence-correction tests run with both unigram and bigram mocks."""

    def setUp(self):
        """Create fresh mock providers and the corrector under test."""
        self.words_provider = MockKnownWordsProvider()
        self.bigrams_provider = MockBigramsProvider()
        self.sut = SpellCorrector(self.words_provider, self.bigrams_provider)

    def test_should_choose_word_existing_in_bigrams(self):
        """The expected word has the lower unigram count but forms a known bigram."""
        known_words = {"panie": 50, "pranie": 19}
        known_bigrams = {
            "suszyć pranie": 15,
            "suszyć ubranie": 10
        }
        self.words_provider.initialize(known_words)
        self.bigrams_provider.initialize(known_bigrams)

        result = self.sut.sentence_correction("suszyć ptanie",
                                              print_words=False)

        self.assert_equal_utf("suszyć pranie", result)

    def test_should_choose_most_frequent_bigram(self):
        """The expected sentence is the bigram with the higher count (3 vs 1)."""
        known_words = {
            "komputer": 99,
            "komputerów": 30,
            "programowanie": 20
        }
        known_bigrams = {
            "programowanie komputer": 1,
            "programowanie komputerów": 3
        }
        self.words_provider.initialize(known_words)
        self.bigrams_provider.initialize(known_bigrams)

        result = self.sut.sentence_correction("pogramowanie komputeruf",
                                              print_words=False)

        self.assert_equal_utf("programowanie komputerów", result)
Example #2
0
class SpellCorrectorCorrectionTest(SpellCorrectorTestBase):
    """Single-word correction tests run against a mock unigram provider."""

    def setUp(self):
        """Create a fresh mock word provider and the corrector under test."""
        self.words_provider = MockKnownWordsProvider()
        self.sut = SpellCorrector(self.words_provider)

    def _correct_with(self, known_words, misspelled):
        """Load ``known_words`` into the mock and return the correction of ``misspelled``."""
        self.words_provider.initialize(known_words)
        return self.sut.correction(misspelled)

    def test_should_correct_diacritics_at_first(self):
        """The diacritic variant is expected even though "za" has a higher count."""
        result = self._correct_with({"że": 21, "za": 30}, "ze")
        self.assert_equal_utf("że", result)

    def test_should_choose_most_frequent_word(self):
        """Among the three known words, the one with the highest count is expected."""
        result = self._correct_with({"tata": 130, "taca": 44, "tara": 29},
                                    "taya")
        self.assert_equal_utf("tata", result)

    def test_should_choose_edit2_if_no_edit1_available(self):
        """With only these two known words, "choinka" is the expected correction."""
        result = self._correct_with({"lampka": 101, "choinka": 50}, "honika")
        self.assert_equal_utf("choinka", result)

    def test_should_choose_edit1_if_exist(self):
        """Once "konika" is known it is expected, despite its much lower count."""
        known_words = {
            "lampka": 101,
            "choinka": 50,
            'konika': 5
        }
        result = self._correct_with(known_words, "honika")
        self.assert_equal_utf("konika", result)
Example #3
0
class SpellCorrectorEditsTest(unittest.TestCase):
    """Tests for the candidate-generation helpers of ``SpellCorrector``."""

    def setUp(self):
        """Create a fresh mock word provider and the corrector under test."""
        self.words_provider = MockKnownWordsProvider()
        self.sut = SpellCorrector(self.words_provider)

    def assert_contains_list(self,
                             container,
                             member_list,
                             print_container=True):
        """Assert that every item of ``member_list`` is in ``container``.

        Members that are not ``unicode`` are decoded from UTF-8 before the
        membership check.  When ``print_container`` is true the failure
        message lists the UTF-8 encoded elements of ``container``;
        otherwise it only uses the placeholder word "container".
        """
        if print_container:
            encoded_elems = [elem.encode('utf-8') for elem in container]
            rendered = '[' + ", ".join(encoded_elems) + ']'
        else:
            rendered = "container"

        for member in member_list:
            if not isinstance(member, unicode):
                member = unicode(member, 'utf-8')
            failure_msg = member.encode('utf-8') + " not in " + rendered
            self.assertIn(member, container, failure_msg)

    def test_edit1(self):
        """``_edits1("hello")`` contains sample deletes, transposes, replaces and inserts."""
        candidates = self.sut._edits1("hello")

        deletes = ('ello', 'hllo', 'helo', 'hell')
        transposes = ('ehllo', 'hlelo', 'hello', 'helol')
        replaces = ('jello', 'hallo', 'helko', 'healo', 'hęllo')
        inserts = ('heello', 'hhello', 'helllo', 'hellou', 'helloż')

        for expected_group in (deletes, transposes, replaces, inserts):
            self.assert_contains_list(candidates, expected_group)

    def test_edit2(self):
        """``_edits2`` contains the sample edits for two different words."""
        candidates = self.sut._edits2("hello")
        self.assert_contains_list(candidates,
                                  ('hekko', 'tallo', 'hhhllo', 'belko'),
                                  print_container=True)

        # For the second word the container is left out of the failure message.
        candidates = self.sut._edits2("komputeruf")
        self.assert_contains_list(candidates,
                                  ('komputerów', 'omputeruf'),
                                  print_container=False)

    def test_diacritics_words(self):
        """``_add_diacritics`` yields one- and two-diacritic variants but not the input."""
        word = 'czesc'
        variants = self.sut._add_diacritics(word)

        edit1_corrections = (
            'ćzesc',
            'cżesc',  # 'cźesc',
            'częsc',
            'cześc',
            'czesć')
        edit2_corrections = ('ćzesć', 'cześć', 'częśc')

        for expected_group in (edit1_corrections, edit2_corrections):
            self.assert_contains_list(variants, expected_group)
        self.assertFalse(word in variants)
Example #4
0
 def setUp(self):
     """Create a mock word provider and build the corrector under test on it."""
     provider = MockKnownWordsProvider()
     self.words_provider = provider
     self.sut = SpellCorrector(provider)
Example #5
0
    def processInput(self, uType, uInput, jsondb):
        """Return up to 15 songs that best match the user's query.

        The query is normalized into words, each word is expanded into its
        most frequent matching words via ``SpellCorrector``, and every
        expanded word is looked up in the database selected by ``uType``.
        A song scores one point for each query-word position that produced
        it, however many expansions of that word hit it.

        Args:
            uType: Kind of query: "lyric", "artist" or "album".  Any other
                value selects no database, so every lookup fails and the
                result is empty.
            uInput: Raw query text entered by the user.
            jsondb: Database object providing ``getOriginalWordsDB``,
                ``getOriginalNamesDB`` and ``getOriginalAlbumsDB``; it is
                also passed to ``SpellCorrector``.

        Returns:
            A list of at most 15 songs ordered by descending score; songs
            with equal scores keep the order in which they were first seen.
        """
        print(uInput)
        wn = WordNormalizer()
        uInputDict = wn.normalizeLyrics([uInput])

        spn = SpellCorrector(jsondb)
        if uType == "lyric":
            originalDB = jsondb.getOriginalWordsDB()
        elif uType == "artist":
            originalDB = jsondb.getOriginalNamesDB()
        elif uType == "album":
            originalDB = jsondb.getOriginalAlbumsDB()
        else:
            # Unknown type: use an empty mapping so lookups fail explicitly
            # instead of relying on an UnboundLocalError being swallowed.
            originalDB = {}

        # One list of candidate words per normalized query word.
        mostFrequentWordList = []
        for uInputWord in uInputDict:
            matchingWords = spn.checkMatches(uType, uInputWord)
            data = spn.getMostFrequentWords(matchingWords)
            mostFrequentWords = data[0]

            # NOTE(review): data[1] == 1 presumably flags a weak/edited
            # match that warrants a stemmed retry - confirm against
            # SpellCorrector.getMostFrequentWords.
            if uType != "artist" and data[1] == 1:
                mostFrequentWords = mostFrequentWords + spn.stemInputAndCheckMatch(
                    uType, uInputWord)

            mostFrequentWordList.append(mostFrequentWords)
        print(mostFrequentWordList)
        print(len(mostFrequentWordList))

        # For every query word, collect the database entries of its candidates.
        probableSongs = []
        for candidateWords in mostFrequentWordList:
            entries = []
            for word in candidateWords:
                try:
                    entries.append(originalDB[word])
                except Exception:
                    # Best-effort lookup: a missing word is reported and
                    # skipped, but Ctrl-C / SystemExit are no longer eaten.
                    print("some error")
            probableSongs.append(entries)

        mostRelevantSongs = []
        mostRelevantSongFrquency = []
        # Last query-word position that already scored for each song, so a
        # song is counted at most once per position.
        editCounter = []
        for position, entries in enumerate(probableSongs):
            for songarr in entries:
                for song in songarr:
                    try:
                        ind = mostRelevantSongs.index(song)
                    except ValueError:
                        mostRelevantSongs.append(song)
                        mostRelevantSongFrquency.append(1)
                        editCounter.append(position)
                    else:
                        if position > editCounter[ind]:
                            mostRelevantSongFrquency[ind] += 1
                            editCounter[ind] = position

        # A stable descending sort keeps first-seen order among equal scores,
        # matching a repeated "take the first maximum" selection.
        ranked = sorted(
            zip(mostRelevantSongFrquency, mostRelevantSongs),
            key=lambda pair: pair[0],
            reverse=True)
        return [song for _, song in ranked[:15]]
        "type of unigrams provider - RAM/BigFile/MultipleFiles\nRAM is used by default"
    )
    args = parser.parse_args()

    unigrams_path = UNIGRAMS_FILEPATH
    if args.type == "BigFile":
        words_provider = KnownWordsProviderUsingBigFile()
    elif args.type == "MultipleFiles":
        unigrams_path = UNIGRAMS_FILES_DIR
        words_provider = KnownWordsProviderUsingMultipleFiles()
    else:
        words_provider = KnownWordsProviderUsingRAM()
    words_provider.initialize(unigrams_path)

    bigrams_provider = None
    if args.bigrams:
        bigrams_provider = BigramsProvider()
        bigrams_provider.initialize(BIGRAMS_FILEPATH)

    corrector = SpellCorrector(words_provider, bigrams_provider)

    if args.word:
        corrector.sentence_correction(args.word)
        print("")
        exit(0)

    while True:
        text_to_correct = raw_input("> ")
        corrector.sentence_correction(text_to_correct)
        print("")
Example #7
0
elif number == 3:
    corp = twitter_samples
elif number == 4:
    corp = gutenberg
elif number == 5:
    corp = reuters
"""

# Ask for the n-gram order and the number of candidates per misspelled word.
# Both answers are converted to int, but the SpellCorrector below is built
# with its default arguments.
num2 = input("What n would you like to use for n grams")
number2 = int(num2)

num3 = input("How many words should be considered for each incorrect word?")
number3 = int(num3)

# The file to check comes from the first command-line argument when one is
# given; otherwise the user is prompted for it.
file = input("What is the fileName?") if len(sys.argv) == 1 else sys.argv[1]

# Alternative construction kept for reference:
# speller = SpellCorrector(brown, number2, number3)
speller = SpellCorrector()
with open(file) as f:
    for line in f.readlines():
        returnList = speller.check(line)
        # Each entry is unpacked as (line number, misspelled word, corrections).
        for (lineNum, misspelledWord, corrections) in returnList:
            suggestion_part = " [ " + ",".join(corrections) + " ]"
            print("  ".join([str(lineNum), misspelledWord, suggestion_part]))

exit(0)
Example #8
0
from SpellCorrector import SpellCorrector
from ClipboardManager import ClipboardManager

if __name__ == '__main__':
    # Interactive loop: read a phrase, correct it, print the result and put
    # it on the clipboard.  Ends on "exit", Ctrl-C, or end of input.
    spell_corrector = SpellCorrector()
    while True:
        try:
            s = input('Correction: ')
        except (KeyboardInterrupt, EOFError):
            # EOFError is raised when stdin is closed or exhausted (piped
            # input, Ctrl-D); treat it like Ctrl-C rather than crashing.
            print('End Script')
            break

        if s.lower() == 'exit':
            print('End Script')
            break

        corrected_str = spell_corrector.correct(s)
        # Read the attribute once so the printed text and the clipboard
        # contents are guaranteed to be the same value.
        corrected_html = corrected_str.get_attribute('innerHTML')
        print('Corrected: ' + corrected_html)
        ClipboardManager.set_clipboard(
            ClipboardManager.format_string(corrected_html))
        print('Corrected result has been pasted in clipboard.')