class SpellCorrectorSentenceCorrectionBigramsTest(SpellCorrectorTestBase):
    """Sentence-correction tests where bigram counts are supplied.

    Each test feeds unigram counts and bigram counts into mock providers,
    corrects a misspelled sentence with ``print_words=False`` and compares
    the result with the expected sentence via ``assert_equal_utf``.
    """

    def setUp(self):
        # Build both mocks first, then hand the very same objects to the
        # corrector so the tests can seed them afterwards.
        known_words = MockKnownWordsProvider()
        known_bigrams = MockBigramsProvider()
        self.words_provider = known_words
        self.bigrams_provider = known_bigrams
        self.sut = SpellCorrector(known_words, known_bigrams)

    def test_should_choose_word_existing_in_bigrams(self):
        # "panie" has the higher unigram count, but only "pranie" appears
        # in a bigram after "suszyć"; the expected output uses "pranie".
        unigram_counts = {"panie": 50, "pranie": 19}
        bigram_counts = {
            "suszyć pranie": 15,
            "suszyć ubranie": 10,
        }
        self.words_provider.initialize(unigram_counts)
        self.bigrams_provider.initialize(bigram_counts)

        actual = self.sut.sentence_correction(
            "suszyć ptanie", print_words=False)

        self.assert_equal_utf("suszyć pranie", actual)

    def test_should_choose_most_frequent_bigram(self):
        # "komputer" is the most frequent unigram, yet the bigram with
        # "komputerów" has the higher count (3 vs 1) and is expected to win.
        unigram_counts = {
            "komputer": 99,
            "komputerów": 30,
            "programowanie": 20,
        }
        bigram_counts = {
            "programowanie komputer": 1,
            "programowanie komputerów": 3,
        }
        self.words_provider.initialize(unigram_counts)
        self.bigrams_provider.initialize(bigram_counts)

        actual = self.sut.sentence_correction(
            "pogramowanie komputeruf", print_words=False)

        self.assert_equal_utf("programowanie komputerów", actual)
class SpellCorrectorCorrectionTest(SpellCorrectorTestBase):
    """Single-word correction tests driven by a mock known-words provider.

    Every test seeds the provider with word counts, corrects one misspelled
    word and compares the outcome with ``assert_equal_utf``.
    """

    def setUp(self):
        # The corrector is built with the words provider only.
        known_words = MockKnownWordsProvider()
        self.words_provider = known_words
        self.sut = SpellCorrector(known_words)

    def test_should_correct_diacritics_at_first(self):
        # "za" has the higher count (30 vs 21), but the expected result is
        # the diacritic variant "że".
        word_counts = {"że": 21, "za": 30}
        self.words_provider.initialize(word_counts)

        actual = self.sut.correction("ze")

        self.assert_equal_utf("że", actual)

    def test_should_choose_most_frequent_word(self):
        # All three known words are offered; the one with the largest
        # count ("tata", 130) is the expected correction of "taya".
        word_counts = {"tata": 130, "taca": 44, "tara": 29}
        self.words_provider.initialize(word_counts)

        actual = self.sut.correction("taya")

        self.assert_equal_utf("tata", actual)

    def test_should_choose_edit2_if_no_edit1_available(self):
        # With only "lampka" and "choinka" known, "honika" is expected to
        # be corrected to "choinka".
        word_counts = {"lampka": 101, "choinka": 50}
        self.words_provider.initialize(word_counts)

        actual = self.sut.correction("honika")

        self.assert_equal_utf("choinka", actual)

    def test_should_choose_edit1_if_exist(self):
        # Same vocabulary plus the low-count "konika" (5); the expected
        # result switches to "konika" despite the larger counts of the
        # other words.
        word_counts = {
            "lampka": 101,
            "choinka": 50,
            'konika': 5,
        }
        self.words_provider.initialize(word_counts)

        actual = self.sut.correction("honika")

        self.assert_equal_utf("konika", actual)
class SpellCorrectorEditsTest(unittest.TestCase):
    """Tests for the candidate generators of ``SpellCorrector``.

    Covers ``_edits1``, ``_edits2`` and ``_add_diacritics`` by checking that
    a handful of hand-picked candidates are present in each result.
    """

    def setUp(self):
        known_words = MockKnownWordsProvider()
        self.words_provider = known_words
        self.sut = SpellCorrector(known_words)

    def assert_contains_list(self, container, member_list,
                             print_container=True):
        """Assert that every entry of ``member_list`` is in ``container``.

        Entries that are not already ``unicode`` are decoded from UTF-8
        before the membership check. The failure message names the missing
        entry and, when ``print_container`` is true, lists the UTF-8 encoded
        elements of ``container``; otherwise it just says "container".
        """
        if print_container:
            encoded_elements = [elem.encode('utf-8') for elem in container]
            container_members_str = '[' + ", ".join(encoded_elements) + ']'
        else:
            container_members_str = "container"

        for candidate in member_list:
            if isinstance(candidate, unicode):
                expected = candidate
            else:
                expected = unicode(candidate, 'utf-8')
            failure_message = (
                expected.encode('utf-8') + " not in " + container_members_str)
            self.assertIn(expected, container, failure_message)

    def test_edit1(self):
        candidates = self.sut._edits1("hello")

        deletes = ('ello', 'hllo', 'helo', 'hell')
        transposes = ('ehllo', 'hlelo', 'hello', 'helol')
        replaces = ('jello', 'hallo', 'helko', 'healo', 'hęllo')
        inserts = ('heello', 'hhello', 'helllo', 'hellou', 'helloż')

        # Checked in the same order as the groups are declared above.
        for expected_group in (deletes, transposes, replaces, inserts):
            self.assert_contains_list(candidates, expected_group)

    def test_edit2(self):
        ascii_candidates = self.sut._edits2("hello")
        self.assert_contains_list(
            ascii_candidates,
            ('hekko', 'tallo', 'hhhllo', 'belko'),
            print_container=True)

        # Second round: the container is not dumped into the message here.
        polish_candidates = self.sut._edits2("komputeruf")
        self.assert_contains_list(
            polish_candidates,
            ('komputerów', 'omputeruf'),
            print_container=False)

    def test_diacritics_words(self):
        plain_word = 'czesc'
        variants = self.sut._add_diacritics(plain_word)

        single_substitutions = (
            'ćzesc',
            'cżesc',
            # 'cźesc',  (entry was disabled in the original test)
            'częsc',
            'cześc',
            'czesć')
        double_substitutions = ('ćzesć', 'cześć', 'częśc')

        self.assert_contains_list(variants, single_substitutions)
        self.assert_contains_list(variants, double_substitutions)
        # The unmodified input must not be among the variants.
        unchanged_present = plain_word in variants
        self.assertFalse(unchanged_present)
def setUp(self):
    """Create a mock known-words provider and a SpellCorrector using it."""
    known_words = MockKnownWordsProvider()
    self.words_provider = known_words
    self.sut = SpellCorrector(known_words)
def processInput(self, uType, uInput, jsondb, maxResults=15):
    """Return the songs that best match a free-text query.

    The query is normalized into words, each word is expanded into its
    most frequent matching database words, those words are looked up in
    the database selected by ``uType``, and every song found is tallied
    at most once per query word. Songs are returned ordered by how many
    query words hit them; ties keep first-seen order.

    Args:
        uType: "lyric", "artist" or "album"; selects which database of
            ``jsondb`` is searched. Any other value searches an empty
            database, so every lookup reports "some error".
        uInput: Raw query text.
        jsondb: Object providing ``getOriginalWordsDB``,
            ``getOriginalNamesDB`` and ``getOriginalAlbumsDB``; it is also
            passed to ``SpellCorrector``.
        maxResults: Maximum number of songs returned (default 15, the
            previously hard-coded limit). ``None`` disables the limit.

    Returns:
        A list of at most ``maxResults`` songs, best match first.
    """
    print(uInput)
    normalizer = WordNormalizer()
    uInputDict = normalizer.normalizeLyrics([uInput])
    spn = SpellCorrector(jsondb)

    if uType == "lyric":
        originalDB = jsondb.getOriginalWordsDB()
    elif uType == "artist":
        originalDB = jsondb.getOriginalNamesDB()
    elif uType == "album":
        originalDB = jsondb.getOriginalAlbumsDB()
    else:
        # Previously the name stayed unbound and the resulting NameError
        # was hidden by a bare ``except``. An empty mapping keeps the same
        # observable outcome (each lookup fails and is reported).
        originalDB = {}

    # Step 1: expand every query word into candidate database words.
    mostFrequentWordList = []
    for uInputWord in uInputDict:
        matchingWords = spn.checkMatches(uType, uInputWord)
        data = spn.getMostFrequentWords(matchingWords)
        mostFrequentWords = data[0]
        # NOTE(review): data[1] == 1 presumably flags an inexact match that
        # warrants a stemmed retry -- confirm against getMostFrequentWords.
        if uType != "artist" and data[1] == 1:
            mostFrequentWords = (
                mostFrequentWords
                + spn.stemInputAndCheckMatch(uType, uInputWord))
        mostFrequentWordList.append(mostFrequentWords)
    print(mostFrequentWordList)
    print(len(mostFrequentWordList))

    # Step 2: per query word, collect the song collections stored under
    # each candidate word.
    probableSongs = []
    for candidateWords in mostFrequentWordList:
        songsForWord = []
        for word in candidateWords:
            try:
                songsForWord.append(originalDB[word])
            except (LookupError, TypeError):
                # Missing (or unusable) key: best effort, keep going.
                print("some error")
        probableSongs.append(songsForWord)

    # Step 3: tally songs. ``editCounter`` remembers the index of the last
    # query word that counted a song, so several candidate words belonging
    # to the same query word cannot count the same song twice.
    mostRelevantSongs = []
    mostRelevantSongFrquency = []
    editCounter = []
    for wordIndex, songGroups in enumerate(probableSongs):
        for songarr in songGroups:
            for song in songarr:
                try:
                    ind = mostRelevantSongs.index(song)
                except ValueError:
                    mostRelevantSongs.append(song)
                    mostRelevantSongFrquency.append(1)
                    editCounter.append(wordIndex)
                else:
                    if wordIndex > editCounter[ind]:
                        mostRelevantSongFrquency[ind] += 1
                        editCounter[ind] = wordIndex

    # Step 4: highest tally first. ``sorted`` is stable even with
    # ``reverse=True``, so equal tallies stay in first-seen order, which is
    # what the former repeated max()/index() scan produced.
    ranking = sorted(
        range(len(mostRelevantSongs)),
        key=mostRelevantSongFrquency.__getitem__,
        reverse=True)
    limit = None if maxResults is None else max(maxResults, 0)
    return [mostRelevantSongs[ind] for ind in ranking[:limit]]
"type of unigrams provider - RAM/BigFile/MultipleFiles\nRAM is used by default" ) args = parser.parse_args() unigrams_path = UNIGRAMS_FILEPATH if args.type == "BigFile": words_provider = KnownWordsProviderUsingBigFile() elif args.type == "MultipleFiles": unigrams_path = UNIGRAMS_FILES_DIR words_provider = KnownWordsProviderUsingMultipleFiles() else: words_provider = KnownWordsProviderUsingRAM() words_provider.initialize(unigrams_path) bigrams_provider = None if args.bigrams: bigrams_provider = BigramsProvider() bigrams_provider.initialize(BIGRAMS_FILEPATH) corrector = SpellCorrector(words_provider, bigrams_provider) if args.word: corrector.sentence_correction(args.word) print("") exit(0) while True: text_to_correct = raw_input("> ") corrector.sentence_correction(text_to_correct) print("")
elif number == 3: corp = twitter_samples elif number == 4: corp = gutenberg elif number == 5: corp = reuters """ num2 = input("What n would you like to use for n grams") number2 = int(num2) num3 = input("How many words should be considered for each incorrect word?") number3 = int(num3) if len(sys.argv) == 1: file = input("What is the fileName?") else: file = sys.argv[1] # speller = SpellCorrector(brown, number2, number3) speller = SpellCorrector() with open(file) as f: for line in f.readlines(): returnList = speller.check(line) for (lineNum, misspelledWord, corrections) in returnList: print( str(lineNum) + " " + misspelledWord + " " + " [ " + ",".join(corrections) + " ]") exit(0)
from SpellCorrector import SpellCorrector
from ClipboardManager import ClipboardManager


def main():
    """Run an interactive correction prompt.

    Reads lines until the user types "exit" (case-insensitive), presses
    Ctrl-C, or standard input reaches end-of-file. Each line is corrected,
    the corrected text is printed and then placed on the clipboard.
    """
    spell_corrector = SpellCorrector()
    while True:
        try:
            s = input('Correction: ')
        except (KeyboardInterrupt, EOFError):
            # EOFError covers Ctrl-D and a closed/piped stdin, which used
            # to end the script with a traceback.
            print('End Script')
            break
        if s.lower() == 'exit':
            print('End Script')
            break

        corrected_str = spell_corrector.correct(s)
        # Read the attribute once and reuse it for both outputs.
        corrected_html = corrected_str.get_attribute('innerHTML')
        print('Corrected: ' + corrected_html)
        ClipboardManager.set_clipboard(
            ClipboardManager.format_string(corrected_html))
        print('Corrected result has been pasted in clipboard.')


if __name__ == '__main__':
    main()