class VoikkoCountVectorizer(CountVectorizer):
    """Converts a collection of text documents to a matrix of lemmatized token counts.

    This is similar to scikit-learn CountVectorizer but uses Voikko for
    tokenization and lemmatization. Additionally stop words can be specified
    using word classes that are considered irrelevant for particular task.
    """

    FINNISH_STOPWORD_CLASSES = [
        "huudahdussana", "seikkasana", "lukusana", "asemosana",
        "sidesana", "suhdesana", "kieltosana"
    ]
    """List of closed word classes for Finnish analyzer. Use these if you want
    to concentrate the analysis on nouns, verbs and adjectives only."""

    def __init__(self, langtag="fi", binary=False, stop_word_classes=()):
        """Create a vectorizer for the language identified by ``langtag``.

        Parameters
        ----------
        langtag : str
            Language tag passed to ``Voikko`` (default ``"fi"``).
        binary : bool
            Forwarded to ``CountVectorizer``.
        stop_word_classes : iterable of str
            Voikko word-class names (``CLASS`` analysis values) whose words
            are dropped from the output.  The default was changed from the
            *mutable* ``[]`` to the immutable ``()`` to avoid the shared
            mutable default argument pitfall; behavior is unchanged since
            the value is only used to build a set.
        """
        self.voikko = Voikko(langtag)
        self.stop_word_classes = set(stop_word_classes)
        super().__init__(binary=binary)

    def terminate(self):
        """Release the resources held by the underlying Voikko instance."""
        self.voikko.terminate()

    def build_analyzer(self):
        """Return a callable mapping one document to a list of lemmas.

        Words whose recognized analyses all agree on one base form map to
        that base form; ambiguous or unanalyzable words fall back to their
        lower-cased surface form.  When stop word classes are configured,
        words recognized *only* as stop word classes are dropped.
        """
        check_stop_words = len(self.stop_word_classes) > 0

        def analyse_word(word):
            baseform = None
            is_stop_word = False
            for analysis in self.voikko.analyze(word):
                if check_stop_words and "CLASS" in analysis and analysis[
                        "CLASS"] in self.stop_word_classes:
                    is_stop_word = True
                elif "BASEFORM" in analysis:
                    new_baseform = analysis["BASEFORM"]
                    if baseform is not None and baseform != new_baseform:
                        # Conflicting lemmas: keep the surface form instead.
                        return word.lower()
                    baseform = new_baseform
                else:
                    # Analysis without a base form: give up on lemmatizing.
                    return word.lower()
            if baseform is None:
                if is_stop_word:
                    # Recognized only as a stop word class; drop the word.
                    return None
                return word.lower()
            return baseform

        def analyse_text(text):
            baseforms = [
                analyse_word(token.tokenText)
                for token in self.voikko.tokens(text)
                if token.tokenType == Token.WORD
            ]
            if check_stop_words:
                return [
                    baseform for baseform in baseforms
                    if baseform is not None
                ]
            return baseforms

        return analyse_text
class VoikkoCountVectorizer(CountVectorizer):
    """CountVectorizer that lemmatizes tokens with Voikko.

    Tokenization and lemmatization are delegated to Voikko; optionally,
    words belonging to the given Voikko word classes are treated as stop
    words and dropped.
    """

    # Closed word classes of Finnish; pass these as stop_word_classes to
    # restrict the analysis to nouns, verbs and adjectives.
    FINNISH_STOPWORD_CLASSES = ["huudahdussana", "seikkasana", "lukusana",
                                "asemosana", "sidesana", "suhdesana",
                                "kieltosana"]

    def __init__(self, langtag="fi", binary=False, stop_word_classes=()):
        # BUG FIX: default was the mutable ``[]`` (shared across calls);
        # the immutable ``()`` is equivalent here because the value is
        # immediately converted to a set.
        self.voikko = Voikko(langtag)
        self.stop_word_classes = set(stop_word_classes)
        super().__init__(binary=binary)

    def terminate(self):
        """Free the native Voikko handle."""
        self.voikko.terminate()

    def build_analyzer(self):
        """Return a callable turning a document into a list of lemmas."""
        check_stop_words = len(self.stop_word_classes) > 0

        def analyse_word(word):
            baseform = None
            is_stop_word = False
            for analysis in self.voikko.analyze(word):
                if check_stop_words and "CLASS" in analysis \
                        and analysis["CLASS"] in self.stop_word_classes:
                    is_stop_word = True
                elif "BASEFORM" in analysis:
                    new_baseform = analysis["BASEFORM"]
                    if baseform is not None and baseform != new_baseform:
                        # Ambiguous lemma: fall back to the surface form.
                        return word.lower()
                    baseform = new_baseform
                else:
                    return word.lower()
            if baseform is None:
                if is_stop_word:
                    return None
                return word.lower()
            return baseform

        def analyse_text(text):
            baseforms = [analyse_word(token.tokenText)
                         for token in self.voikko.tokens(text)
                         if token.tokenType == Token.WORD]
            if check_stop_words:
                return [baseform for baseform in baseforms
                        if baseform is not None]
            return baseforms

        return analyse_text
class LibvoikkoTest(unittest.TestCase):
    """Integration tests for the libvoikko Python bindings.

    Requires Finnish dictionaries (including the fi-x-standard and
    fi-x-medicine variants) installed in the test environment.

    NOTE: the deprecated unittest aliases failUnless/failIf/assertEquals
    were replaced with assertTrue/assertFalse/assertEqual; the aliases
    were removed in Python 3.12.
    """

    def setUp(self):
        self.voikko = Voikko(u"fi")

    def tearDown(self):
        self.voikko.terminate()

    def testInitAndTerminate(self):
        # do nothing, just check that setUp and tearDown complete successfully
        pass

    def testTerminateCanBeCalledMultipleTimes(self):
        self.voikko.terminate()
        self.voikko.terminate()

    def testAnotherObjectCanBeCreatedUsedAndDeletedInParallel(self):
        medicalVoikko = Voikko(u"fi-x-medicine")
        self.assertTrue(medicalVoikko.spell(u"amifostiini"))
        self.assertFalse(self.voikko.spell(u"amifostiini"))
        del medicalVoikko
        self.assertFalse(self.voikko.spell(u"amifostiini"))

    def testDictionaryComparisonWorks(self):
        d1 = Dictionary(u"fi", u"", u"a", u"b")
        d2 = Dictionary(u"fi", u"", u"a", u"c")
        d3 = Dictionary(u"fi", u"", u"c", u"b")
        d4 = Dictionary(u"fi", u"", u"a", u"b")
        d5 = Dictionary(u"sv", u"", u"a", u"b")
        self.assertNotEqual(u"kissa", d1)
        self.assertNotEqual(d1, u"kissa")
        self.assertNotEqual(d1, d2)
        self.assertNotEqual(d1, d3)
        self.assertNotEqual(d4, d5)
        self.assertEqual(d1, d4)
        self.assertTrue(d1 < d2)
        self.assertTrue(d2 < d3)
        self.assertTrue(d4 < d5)

    def testDictionaryHashCodeWorks(self):
        d1 = Dictionary(u"fi", u"", u"a", u"b")
        d2 = Dictionary(u"fi", u"", u"a", u"c")
        d3 = Dictionary(u"fi", u"", u"c", u"b")
        d4 = Dictionary(u"fi", u"", u"a", u"b")
        d5 = Dictionary(u"sv", u"", u"a", u"b")
        self.assertNotEqual(hash(d1), hash(d2))
        self.assertNotEqual(hash(d1), hash(d3))
        self.assertNotEqual(hash(d4), hash(d5))
        self.assertEqual(hash(d1), hash(d4))

    def testListDictsWithoutPath(self):
        dicts = Voikko.listDicts()
        self.assertTrue(len(dicts) > 0)
        standard = dicts[0]
        self.assertEqual(
            u"standard", standard.variant,
            u"Standard dictionary must be the default in test environment.")

    def testListSupportedSpellingLanguagesWithoutPath(self):
        langs = Voikko.listSupportedSpellingLanguages()
        self.assertTrue(
            u"fi" in langs,
            u"Finnish dictionary must be present in the test environment")

    def testListDictsWithPathAndAttributes(self):
        info = MorphologyInfo()
        info.variant = u"test-variant-name"
        info.description = u"Some test description sakldjasd"
        info.morphology = u"null"
        dataDir = TestDataDir()
        dataDir.createMorphology(info.variant, info)
        dicts = Voikko.listDicts(dataDir.getDirectory())
        dataDir.tearDown()
        dictsWithCorrectVariant = list(
            filter(lambda aDict: aDict.variant == info.variant, dicts))
        self.assertEqual(1, len(dictsWithCorrectVariant))
        theDict = dictsWithCorrectVariant[0]
        self.assertEqual(info.description, theDict.description)
        self.assertEqual(u"fi", theDict.language)
        self.assertEqual(u"", theDict.script)

    def testInitWithCorrectDictWorks(self):
        self.voikko.terminate()
        self.voikko = Voikko(u"fi-x-standard")
        self.assertFalse(self.voikko.spell(u"amifostiini"))
        self.voikko.terminate()
        self.voikko = Voikko(u"fi-x-medicine")
        self.assertTrue(self.voikko.spell(u"amifostiini"))

    def testInitWithNonExistentDictThrowsException(self):
        def tryInit():
            self.voikko = Voikko(u"fi-x-non-existent-variant")
        self.voikko.terminate()
        self.assertRaises(VoikkoException, tryInit)

    def testInitWithPathWorks(self):
        # TODO: better test
        self.voikko.terminate()
        self.voikko = Voikko(u"fi", path=u"/path/to/nowhere")
        self.assertTrue(self.voikko.spell(u"kissa"))

    def testSpellAfterTerminateThrowsException(self):
        def trySpell():
            self.voikko.spell(u"kissa")
        self.voikko.terminate()
        self.assertRaises(VoikkoException, trySpell)

    def testSpell(self):
        self.assertTrue(self.voikko.spell(u"määrä"))
        self.assertFalse(self.voikko.spell(u"määä"))

    def testSuggest(self):
        suggs = self.voikko.suggest(u"koirra")
        self.assertTrue(u"koira" in suggs)

    def testSuggestReturnsArgumentIfWordIsCorrect(self):
        suggs = self.voikko.suggest(u"koira")
        self.assertEqual(1, len(suggs))
        self.assertEqual(u"koira", suggs[0])

    def testGrammarErrorsAndExplanation(self):
        errors = self.voikko.grammarErrors(
            u"Minä olen joten kuten kaunis.", "fi")
        self.assertEqual(1, len(errors))
        error = errors[0]
        self.assertEqual(10, error.startPos)
        self.assertEqual(11, error.errorLen)
        self.assertEqual([u"jotenkuten"], error.suggestions)
        self.assertEqual(u"Virheellinen kirjoitusasu", error.shortDescription)

    def testNoGrammarErrorsInEmptyParagraph(self):
        errors = self.voikko.grammarErrors(u"Olen täi.\n\nOlen täi.", "fi")
        self.assertEqual(0, len(errors))

    def testGrammarErrorOffsetsInMultipleParagraphs(self):
        errors = self.voikko.grammarErrors(
            u"Olen täi.\n\nOlen joten kuten.", "fi")
        self.assertEqual(1, len(errors))
        error = errors[0]
        self.assertEqual(16, error.startPos)
        self.assertEqual(11, error.errorLen)

    def testAnalyze(self):
        analysisList = self.voikko.analyze(u"kansaneläkehakemus")
        self.assertEqual(1, len(analysisList))
        analysis = analysisList[0]
        self.assertEqual(u"=pppppp=ppppp=ppppppp", analysis["STRUCTURE"])

    def testTokens(self):
        tokenList = self.voikko.tokens(u"kissa ja koira")
        self.assertEqual(5, len(tokenList))
        tokenJa = tokenList[2]
        self.assertEqual(Token.WORD, tokenJa.tokenType)
        self.assertEqual(u"ja", tokenJa.tokenText)

    def testSentences(self):
        sentences = self.voikko.sentences(
            u"Kissa ei ole koira. Koira ei ole kissa.")
        self.assertEqual(2, len(sentences))
        self.assertEqual(u"Kissa ei ole koira. ", sentences[0].sentenceText)
        self.assertEqual(Sentence.PROBABLE, sentences[0].nextStartType)
        self.assertEqual(u"Koira ei ole kissa.", sentences[1].sentenceText)
        self.assertEqual(Sentence.NONE, sentences[1].nextStartType)

    def testAttributeValuesForEnumeratedAttribute(self):
        values = self.voikko.attributeValues(u"NUMBER")
        self.assertEqual(2, len(values))
        self.assertTrue("singular" in values)
        self.assertTrue("plural" in values)

    def testAttributeValuesForNonEnumeratedAttribute(self):
        values = self.voikko.attributeValues(u"BASEFORM")
        self.assertEqual(None, values)

    def testAttributeValuesForUnknownAttribute(self):
        values = self.voikko.attributeValues(u"XYZ")
        self.assertEqual(None, values)

    def testHyphenationPattern(self):
        pattern = self.voikko.getHyphenationPattern(u"kissa")
        self.assertEqual(" - ", pattern)
        pattern = self.voikko.getHyphenationPattern(u"määrä")
        self.assertEqual(" - ", pattern)
        pattern = self.voikko.getHyphenationPattern(u"kuorma-auto")
        self.assertEqual(" - = - ", pattern)
        pattern = self.voikko.getHyphenationPattern(u"vaa'an")
        self.assertEqual(" = ", pattern)
        pattern = self.voikko.getHyphenationPattern(u"auton-")
        self.assertEqual(" - ", pattern)
        pattern = self.voikko.getHyphenationPattern(u"aztoa-")
        self.assertEqual(" - - ", pattern)
        pattern = self.voikko.getHyphenationPattern(u"aztoa-alus")
        self.assertEqual(" - -= - ", pattern)
        pattern = self.voikko.getHyphenationPattern(u"-auton")
        self.assertEqual(" - ", pattern)
        pattern = self.voikko.getHyphenationPattern(u"-aztoa")
        self.assertEqual(" - -", pattern)

    def testHyphenate(self):
        self.assertEqual(u"kis-sa", self.voikko.hyphenate(u"kissa"))
        self.assertEqual(u"mää-rä", self.voikko.hyphenate(u"määrä"))
        self.assertEqual(u"kuor-ma-au-to",
                         self.voikko.hyphenate(u"kuorma-auto"))
        self.assertEqual(u"vaa-an", self.voikko.hyphenate(u"vaa'an"))

    def testHyphenateWithCustomSeparator(self):
        # The separator below is the soft hyphen character (U+00AD).
        self.assertEqual(u"kis­sa", self.voikko.hyphenate(u"kissa", u"­", True))
        self.assertEqual(u"kuor­ma-au­to",
                         self.voikko.hyphenate(u"kuorma-auto", u"­", True))
        self.assertEqual(u"vaa­an", self.voikko.hyphenate(u"vaa'an", u"­", True))
        self.assertEqual(u"vaa'an",
                         self.voikko.hyphenate(u"vaa'an", u"­", False))

    def testSetIgnoreDot(self):
        self.voikko.setIgnoreDot(False)
        self.assertFalse(self.voikko.spell(u"kissa."))
        self.voikko.setIgnoreDot(True)
        self.assertTrue(self.voikko.spell(u"kissa."))

    def testSetBooleanOption(self):
        self.voikko.setBooleanOption(0, False)  # This is "ignore dot"
        self.assertFalse(self.voikko.spell(u"kissa."))
        self.voikko.setBooleanOption(0, True)
        self.assertTrue(self.voikko.spell(u"kissa."))

    def testSetIgnoreNumbers(self):
        self.voikko.setIgnoreNumbers(False)
        self.assertFalse(self.voikko.spell(u"kissa2"))
        self.voikko.setIgnoreNumbers(True)
        self.assertTrue(self.voikko.spell(u"kissa2"))

    def testSetIgnoreUppercase(self):
        self.voikko.setIgnoreUppercase(False)
        self.assertFalse(self.voikko.spell(u"KAAAA"))
        self.voikko.setIgnoreUppercase(True)
        self.assertTrue(self.voikko.spell(u"KAAAA"))

    def testAcceptFirstUppercase(self):
        self.voikko.setAcceptFirstUppercase(False)
        self.assertFalse(self.voikko.spell("Kissa"))
        self.voikko.setAcceptFirstUppercase(True)
        self.assertTrue(self.voikko.spell("Kissa"))

    def testUpperCaseScandinavianLetters(self):
        self.assertTrue(self.voikko.spell(u"Äiti"))
        self.assertFalse(self.voikko.spell(u"Ääiti"))
        self.assertTrue(self.voikko.spell(u"š"))
        self.assertTrue(self.voikko.spell(u"Š"))

    def testAcceptAllUppercase(self):
        self.voikko.setIgnoreUppercase(False)
        self.voikko.setAcceptAllUppercase(False)
        self.assertFalse(self.voikko.spell("KISSA"))
        self.voikko.setAcceptAllUppercase(True)
        self.assertTrue(self.voikko.spell("KISSA"))
        self.assertFalse(self.voikko.spell("KAAAA"))

    def testIgnoreNonwords(self):
        self.voikko.setIgnoreNonwords(False)
        self.assertFalse(self.voikko.spell("*****@*****.**"))
        self.voikko.setIgnoreNonwords(True)
        self.assertTrue(self.voikko.spell("*****@*****.**"))
        self.assertFalse(self.voikko.spell("ashdaksd"))

    def testAcceptExtraHyphens(self):
        self.voikko.setAcceptExtraHyphens(False)
        self.assertFalse(self.voikko.spell("kerros-talo"))
        self.voikko.setAcceptExtraHyphens(True)
        self.assertTrue(self.voikko.spell("kerros-talo"))

    def testAcceptMissingHyphens(self):
        self.voikko.setAcceptMissingHyphens(False)
        self.assertFalse(self.voikko.spell("sosiaali"))
        self.voikko.setAcceptMissingHyphens(True)
        self.assertTrue(self.voikko.spell("sosiaali"))

    def testSetAcceptTitlesInGc(self):
        self.voikko.setAcceptTitlesInGc(False)
        self.assertEqual(
            1, len(self.voikko.grammarErrors(u"Kissa on eläin", "fi")))
        self.voikko.setAcceptTitlesInGc(True)
        self.assertEqual(
            0, len(self.voikko.grammarErrors(u"Kissa on eläin", "fi")))

    def testSetAcceptUnfinishedParagraphsInGc(self):
        self.voikko.setAcceptUnfinishedParagraphsInGc(False)
        self.assertEqual(1, len(self.voikko.grammarErrors(u"Kissa on ", "fi")))
        self.voikko.setAcceptUnfinishedParagraphsInGc(True)
        self.assertEqual(0, len(self.voikko.grammarErrors(u"Kissa on ", "fi")))

    def testSetAcceptBulletedListsInGc(self):
        self.voikko.setAcceptBulletedListsInGc(False)
        self.assertNotEqual(0, len(self.voikko.grammarErrors(u"kissa", "fi")))
        self.voikko.setAcceptBulletedListsInGc(True)
        self.assertEqual(0, len(self.voikko.grammarErrors(u"kissa", "fi")))

    def testSetNoUglyHyphenation(self):
        self.voikko.setNoUglyHyphenation(False)
        self.assertEqual(u"i-va", self.voikko.hyphenate(u"iva"))
        self.voikko.setNoUglyHyphenation(True)
        self.assertEqual(u"iva", self.voikko.hyphenate(u"iva"))

    def testSetHyphenateUnknownWordsWorks(self):
        self.voikko.setHyphenateUnknownWords(False)
        self.assertEqual(u"kirjutepo", self.voikko.hyphenate(u"kirjutepo"))
        self.voikko.setHyphenateUnknownWords(True)
        self.assertEqual(u"kir-ju-te-po", self.voikko.hyphenate(u"kirjutepo"))

    def testSetMinHyphenatedWordLength(self):
        self.voikko.setMinHyphenatedWordLength(6)
        self.assertEqual(u"koira", self.voikko.hyphenate(u"koira"))
        self.voikko.setMinHyphenatedWordLength(2)
        self.assertEqual(u"koi-ra", self.voikko.hyphenate(u"koira"))

    def testIncreaseSpellerCacheSize(self):
        # TODO: this only tests that nothing breaks, not that cache is
        # actually increased
        self.voikko.setSpellerCacheSize(3)
        self.assertTrue(self.voikko.spell(u"kissa"))

    def testDisableSpellerCache(self):
        # TODO: this only tests that nothing breaks, not that cache is
        # actually disabled
        self.voikko.setSpellerCacheSize(-1)
        self.assertTrue(self.voikko.spell(u"kissa"))

    def testSetSuggestionStrategy(self):
        self.voikko.setSuggestionStrategy(SuggestionStrategy.OCR)
        self.assertFalse(u"koira" in self.voikko.suggest(u"koari"))
        self.assertTrue(u"koira" in self.voikko.suggest(u"koir_"))
        self.voikko.setSuggestionStrategy(SuggestionStrategy.TYPO)
        self.assertTrue(u"koira" in self.voikko.suggest(u"koari"))

    def testMaxAnalysisCountIsNotPassed(self):
        complexWord = u"lumenerolumenerolumenerolumenerolumenero"
        self.assertTrue(
            len(self.voikko.analyze(complexWord)) <= MAX_ANALYSIS_COUNT)

    def testMorPruningWorks(self):
        # TODO: this test will not fail, it just takes very long time
        # if pruning does not work.
        complexWord = u""
        for i in range(0, 20):
            complexWord = complexWord + u"lumenero"
        self.assertTrue(len(complexWord) < MAX_WORD_CHARS)
        self.voikko.analyze(complexWord)

    def testOverLongWordsAreRejectedInSpellCheck(self):
        # Limit is 255 characters. This behavior is deprecated and may change.
        longWord = u""
        for i in range(0, 25):
            longWord = longWord + u"kuraattori"
        self.assertTrue(len(longWord) < MAX_WORD_CHARS)
        self.assertTrue(self.voikko.spell(longWord))
        longWord = longWord + u"kuraattori"
        self.assertTrue(len(longWord) > MAX_WORD_CHARS)
        self.assertFalse(self.voikko.spell(longWord))

    def testOverLongWordsAreRejectedInAnalysis(self):
        # Limit is 255 characters. This behavior is deprecated and may change.
        longWord = u""
        for i in range(0, 25):
            longWord = longWord + u"kuraattori"
        self.assertTrue(len(longWord) < MAX_WORD_CHARS)
        self.assertEqual(1, len(self.voikko.analyze(longWord)))
        longWord = longWord + u"kuraattori"
        self.assertTrue(len(longWord) > MAX_WORD_CHARS)
        self.assertEqual(0, len(self.voikko.analyze(longWord)))

    def testTokenizationWorksForHugeParagraphs(self):
        hugeParagraph = \
            "Kissa on 29 vuotta vanha... Onhan se silloin vanha. " * 10000
        self.assertEqual(10000 * 20, len(self.voikko.tokens(hugeParagraph)))

    def testTokenizationWorksWithSomeMultibyteCharacters(self):
        text = u"Kissä on 29 vuotta vanha... Onhan se silloin vanha. \n" * 9
        self.assertEqual(180, len(self.voikko.tokens(text)))

    def testEmbeddedNullsAreNotAccepted(self):
        self.assertFalse(self.voikko.spell(u"kissa\0asdasd"))
        self.assertEqual(0, len(self.voikko.suggest(u"kisssa\0koira")))
        self.assertEqual(u"kissa\0koira",
                         self.voikko.hyphenate(u"kissa\0koira"))
        self.assertEqual(
            0, len(self.voikko.grammarErrors(u"kissa\0koira", "fi")))
        self.assertEqual(0, len(self.voikko.analyze(u"kissa\0koira")))

    def testNullCharMeansSingleSentence(self):
        sentences = self.voikko.sentences(u"kissa\0koira. Koira ja kissa.")
        self.assertEqual(1, len(sentences))
        self.assertEqual(Sentence.NONE, sentences[0].nextStartType)
        self.assertEqual(u"kissa\0koira. Koira ja kissa.",
                         sentences[0].sentenceText)

    def testNullCharIsUnknownToken(self):
        tokens = self.voikko.tokens(u"kissa\0koira")
        self.assertEqual(3, len(tokens))
        self.assertEqual(Token.WORD, tokens[0].tokenType)
        self.assertEqual(u"kissa", tokens[0].tokenText)
        self.assertEqual(Token.UNKNOWN, tokens[1].tokenType)
        self.assertEqual(u"\0", tokens[1].tokenText)
        self.assertEqual(Token.WORD, tokens[2].tokenType)
        self.assertEqual(u"koira", tokens[2].tokenText)
        tokens = self.voikko.tokens(u"kissa\0\0koira")
        self.assertEqual(4, len(tokens))
        self.assertEqual(Token.WORD, tokens[0].tokenType)
        self.assertEqual(u"kissa", tokens[0].tokenText)
        self.assertEqual(Token.UNKNOWN, tokens[1].tokenType)
        self.assertEqual(u"\0", tokens[1].tokenText)
        self.assertEqual(Token.UNKNOWN, tokens[2].tokenType)
        self.assertEqual(u"\0", tokens[2].tokenText)
        self.assertEqual(Token.WORD, tokens[3].tokenType)
        self.assertEqual(u"koira", tokens[3].tokenText)
        tokens = self.voikko.tokens(u"kissa\0")
        self.assertEqual(2, len(tokens))
        self.assertEqual(Token.WORD, tokens[0].tokenType)
        self.assertEqual(u"kissa", tokens[0].tokenText)
        self.assertEqual(Token.UNKNOWN, tokens[1].tokenType)
        self.assertEqual(u"\0", tokens[1].tokenText)
        tokens = self.voikko.tokens(u"\0kissa")
        self.assertEqual(2, len(tokens))
        self.assertEqual(Token.UNKNOWN, tokens[0].tokenType)
        self.assertEqual(u"\0", tokens[0].tokenText)
        self.assertEqual(Token.WORD, tokens[1].tokenType)
        self.assertEqual(u"kissa", tokens[1].tokenText)
        tokens = self.voikko.tokens(u"\0")
        self.assertEqual(1, len(tokens))
        self.assertEqual(Token.UNKNOWN, tokens[0].tokenType)
        self.assertEqual(u"\0", tokens[0].tokenText)
        self.assertEqual(0, len(self.voikko.tokens(u"")))

    def testAllCapsAndDot(self):
        self.voikko.setIgnoreDot(True)
        self.assertFalse(self.voikko.spell(u"ABC-DEF."))

    def testGetVersion(self):
        version = Voikko.getVersion()
        # We can't test for correct version but let's assume it starts
        # with a number
        self.assertTrue(re.compile(u"[0-9].*").match(version) is not None)
class VoikkoAttributeVectorizer:
    """Converts a collection of text documents to a matrix of counts of words
    having specific value for enumerated morphological analysis attributes.

    Examples
    --------
    >>> from voikko_sklearn import VoikkoAttributeVectorizer
    >>> corpus = [
    ...     'Koiran karvat olivat takussa.',
    ...     'Kissamme goli vanha.'
    ... ]
    >>> vectorizer = VoikkoAttributeVectorizer(['NUMBER', 'PERSON'], langtag='fi')
    >>> print(vectorizer.get_feature_names())
    ['unknown', 'NUMBER_plural', 'NUMBER_singular', 'PERSON_1', 'PERSON_2', 'PERSON_3', 'PERSON_4']
    >>> X = vectorizer.transform(corpus)
    >>> print(X.toarray())
    [[0. 0.5 0.5 0. 0. 0.25 0. ] [0.33333333 0. 0.66666667 0. 0. 0. 0. ]]
    """

    def __init__(self, attributes, langtag="fi"):
        """Build a vectorizer for the given enumerated analysis attributes.

        Parameters
        ----------
        attributes : list of str
            Names of enumerated Voikko analysis attributes (e.g. 'NUMBER').
        langtag : str
            Language tag for the Voikko analyzer (default "fi").

        Raises
        ------
        ValueError
            If an attribute does not exist or is not enumerated.
        """
        # BUG FIX: the original assigned the *builtin* ``input`` function to
        # ``self.input`` (apparently a stray leftover from a scikit-learn
        # template); the attribute was never read, so it has been removed.
        self.attributes = attributes
        self.voikko = Voikko(langtag)
        self.__init_feature_names()

    def __init_feature_names(self):
        # Feature 0 counts words that Voikko cannot analyze at all.
        self.feature_names = ['unknown']
        self.feature_name_to_index = {'unknown': 0}
        for attribute in self.attributes:
            values = self.voikko.attributeValues(attribute)
            if values is None:
                raise ValueError("Attribute '" + attribute +
                                 "' does not exist or is not categorial.")
            values.sort()  # deterministic feature ordering
            for value in values:
                name = attribute + '_' + value
                self.feature_name_to_index[name] = len(self.feature_names)
                self.feature_names.append(name)

    def terminate(self):
        """Release the resources held by the underlying Voikko instance."""
        self.voikko.terminate()

    def build_tokenizer(self):
        """Return a callable splitting a document into word tokens."""
        return lambda text: [token.tokenText
                             for token in self.voikko.tokens(text)
                             if token.tokenType == Token.WORD]

    def get_feature_names(self):
        """Return the feature names in column order of the output matrix."""
        return self.feature_names

    def __transform_document(self, document, target_vector):
        # Accumulate normalized attribute-value counts for one document
        # into target_vector (a numpy row; modified in place).
        words = self.build_tokenizer()(document)
        wordcount = len(words)
        if wordcount == 0:
            return
        for word in words:
            analysis_list = self.voikko.analyze(word)
            count = len(analysis_list)
            if count == 0:
                target_vector[0] += 1  # unrecognized word -> 'unknown'
            else:
                # Distribute the weight evenly over ambiguous analyses.
                for analysis in analysis_list:
                    for attribute in self.attributes:
                        if attribute in analysis:
                            value = analysis[attribute]
                            target_vector[self.feature_name_to_index[
                                attribute + "_" + value]] += 1.0 / count
        target_vector /= wordcount  # normalize by document length

    def transform(self, document_list):
        """Transform documents into a sparse document-feature matrix."""
        document_count = len(document_list)
        vector_length = len(self.feature_names)
        data = numpy.zeros((document_count, vector_length),
                           dtype=numpy.float64)
        for i in range(document_count):
            self.__transform_document(document_list[i], data[i])
        return csr_matrix(data)

    def fit(self, document_list):
        """No-op: this vectorizer needs no fitting. Returns self."""
        return self

    def fit_transform(self, document_list):
        """Equivalent to ``transform``; no fitting is required."""
        return self.transform(document_list)
#!/usr/bin/env python import sys from libvoikko import Voikko print('Analysoidaan annetut sanat:\n') v = Voikko("fi") # Pass the 1st argument as it is the app name itself. for a in sys.argv[1:]: print('Sanan {} analyysi:'.format(a)) print(v.analyze(a)) print('Annetut sanat analysoitu.')
URL = "http://pompier.fi/espa/lounas/"

# NOTE(review): ``get_html`` and ``pattern`` are defined elsewhere in this
# file; ``pattern`` presumably matches the heading of today's menu — verify.
text = get_html(URL)
# FIX: name the parser explicitly. A bare BeautifulSoup(text) emits a
# "no parser was explicitly specified" warning and may pick a different
# parser (and produce a different tree) depending on what is installed.
soup = BeautifulSoup(text, "html.parser")
todays_lunch = soup.find(text=pattern)
print(todays_lunch.parent.parent.text)

from libvoikko import Voikko, Token

v = Voikko(u"fi-x-morphoid")
# Normalize the menu text: hyphens and line breaks become spaces so that
# splitting on " " yields word-like tokens.
ttt = (todays_lunch.parent.parent.text
       .replace("-", " ")
       .replace("\r", " ")
       .replace("\n", " "))

all_words = []
for word in ttt.split(" "):
    word = word.strip("\n\r,.")
    foo = v.analyze(word)
    print("-- " + word + "--")
    # Use the base form (lemma) when Voikko recognizes the word,
    # otherwise fall back to the surface form.
    if foo and "BASEFORM" in foo[0]:
        base = foo[0]["BASEFORM"]
    else:
        base = word
    all_words.append(base)
    print(": " + base)

print(all_words)

# Alert when any of the interesting dishes appears on today's menu.
for w in ["härkä", "lohi", "entrecote"]:
    if w in all_words:
        print("POMPIERIIN: {} !".format(w))
#!/usr/bin/env python
# Minimal smoke test: print the Voikko morphological analysis of one word.
from libvoikko import Voikko

voikko = Voikko("fi")
result = voikko.analyze('astetta')
print(result)
class LibvoikkoTest(unittest.TestCase): def setUp(self): self.voikko = Voikko(u"fi") def tearDown(self): self.voikko.terminate() def testInitAndTerminate(self): pass # do nothing, just check that setUp and tearDown complete succesfully def testTerminateCanBeCalledMultipleTimes(self): self.voikko.terminate() self.voikko.terminate() def testAnotherObjectCanBeCreatedUsedAndDeletedInParallel(self): medicalVoikko = Voikko(u"fi-x-medicine") self.failUnless(medicalVoikko.spell(u"amifostiini")) self.failIf(self.voikko.spell(u"amifostiini")) del medicalVoikko self.failIf(self.voikko.spell(u"amifostiini")) def testDictionaryComparisonWorks(self): d1 = Dictionary(u"fi", u"", u"a", u"b") d2 = Dictionary(u"fi", u"", u"a", u"c") d3 = Dictionary(u"fi", u"", u"c", u"b") d4 = Dictionary(u"fi", u"", u"a", u"b") d5 = Dictionary(u"sv", u"", u"a", u"b") self.assertNotEqual(u"kissa", d1) self.assertNotEqual(d1, u"kissa") self.assertNotEqual(d1, d2) self.assertNotEqual(d1, d3) self.assertNotEqual(d4, d5) self.assertEqual(d1, d4) self.failUnless(d1 < d2) self.failUnless(d2 < d3) self.failUnless(d4 < d5) def testDictionaryHashCodeWorks(self): d1 = Dictionary(u"fi", u"", u"a", u"b") d2 = Dictionary(u"fi", u"", u"a", u"c") d3 = Dictionary(u"fi", u"", u"c", u"b") d4 = Dictionary(u"fi", u"", u"a", u"b") d5 = Dictionary(u"sv", u"", u"a", u"b") self.assertNotEqual(hash(d1), hash(d2)) self.assertNotEqual(hash(d1), hash(d3)) self.assertNotEqual(hash(d4), hash(d5)) self.assertEqual(hash(d1), hash(d4)) def testListDictsWithoutPath(self): dicts = Voikko.listDicts() self.failUnless(len(dicts) > 0) standard = dicts[0] self.assertEqual(u"standard", standard.variant, u"Standard dictionary must be the default in test environment.") def testListSupportedSpellingLanguagesWithoutPath(self): langs = Voikko.listSupportedSpellingLanguages() self.failUnless(u"fi" in langs, u"Finnish dictionary must be present in the test environment") def testListDictsWithPathAndAttributes(self): info = MorphologyInfo() 
info.variant = u"test-variant-name" info.description = u"Some test description sakldjasd" info.morphology = u"null" dataDir = TestDataDir() dataDir.createMorphology(info.variant, info) dicts = Voikko.listDicts(dataDir.getDirectory()) dataDir.tearDown() dictsWithCorrectVariant = list(filter(lambda aDict: aDict.variant == info.variant, dicts)) self.assertEqual(1, len(dictsWithCorrectVariant)) theDict = dictsWithCorrectVariant[0] self.assertEqual(info.description, theDict.description) self.assertEqual(u"fi", theDict.language) self.assertEqual(u"", theDict.script) def testInitWithCorrectDictWorks(self): self.voikko.terminate() self.voikko = Voikko(u"fi-x-standard") self.failIf(self.voikko.spell(u"amifostiini")) self.voikko.terminate() self.voikko = Voikko(u"fi-x-medicine") self.failUnless(self.voikko.spell(u"amifostiini")) def testInitWithNonExistentDictThrowsException(self): def tryInit(): self.voikko = Voikko(u"fi-x-non-existent-variant") self.voikko.terminate() self.assertRaises(VoikkoException, tryInit) def testInitWithPathWorks(self): # TODO: better test self.voikko.terminate() self.voikko = Voikko(u"fi", path=u"/path/to/nowhere") self.failUnless(self.voikko.spell(u"kissa")) def testSpellAfterTerminateThrowsException(self): def trySpell(): self.voikko.spell(u"kissa") self.voikko.terminate() self.assertRaises(VoikkoException, trySpell) def testSpell(self): self.failUnless(self.voikko.spell(u"määrä")) self.failIf(self.voikko.spell(u"määä")) def testSuggest(self): suggs = self.voikko.suggest(u"koirra") self.failUnless(u"koira" in suggs) def testSuggestReturnsArgumentIfWordIsCorrect(self): suggs = self.voikko.suggest(u"koira") self.assertEqual(1, len(suggs)) self.assertEqual(u"koira", suggs[0]) def testGrammarErrorsAndExplanation(self): errors = self.voikko.grammarErrors(u"Minä olen joten kuten kaunis.", "fi") self.assertEqual(1, len(errors)) error = errors[0] self.assertEqual(10, error.startPos) self.assertEqual(11, error.errorLen) self.assertEqual([u"jotenkuten"], 
error.suggestions) self.assertEqual(u"Virheellinen kirjoitusasu", error.shortDescription) def testNoGrammarErrorsInEmptyParagraph(self): errors = self.voikko.grammarErrors(u"Olen täi.\n\nOlen täi.", "fi") self.assertEqual(0, len(errors)) def testGrammarErrorOffsetsInMultipleParagraphs(self): errors = self.voikko.grammarErrors(u"Olen täi.\n\nOlen joten kuten.", "fi") self.assertEqual(1, len(errors)) error = errors[0] self.assertEqual(16, error.startPos) self.assertEqual(11, error.errorLen) def testAnalyze(self): analysisList = self.voikko.analyze(u"kansaneläkehakemus") self.assertEqual(1, len(analysisList)) analysis = analysisList[0] self.assertEqual(u"=pppppp=ppppp=ppppppp", analysis["STRUCTURE"]) def testTokens(self): tokenList = self.voikko.tokens(u"kissa ja koira") self.assertEqual(5, len(tokenList)) tokenJa = tokenList[2] self.assertEqual(Token.WORD, tokenJa.tokenType) self.assertEqual(u"ja", tokenJa.tokenText) def testSentences(self): sentences = self.voikko.sentences(u"Kissa ei ole koira. Koira ei ole kissa.") self.assertEqual(2, len(sentences)) self.assertEqual(u"Kissa ei ole koira. 
", sentences[0].sentenceText) self.assertEqual(Sentence.PROBABLE, sentences[0].nextStartType) self.assertEqual(u"Koira ei ole kissa.", sentences[1].sentenceText) self.assertEqual(Sentence.NONE, sentences[1].nextStartType) def testHyphenationPattern(self): pattern = self.voikko.getHyphenationPattern(u"kissa") self.assertEqual(" - ", pattern) pattern = self.voikko.getHyphenationPattern(u"määrä") self.assertEqual(" - ", pattern) pattern = self.voikko.getHyphenationPattern(u"kuorma-auto") self.assertEqual(" - = - ", pattern) pattern = self.voikko.getHyphenationPattern(u"vaa'an") self.assertEqual(" = ", pattern) pattern = self.voikko.getHyphenationPattern(u"auton-") self.assertEqual(" - ", pattern) pattern = self.voikko.getHyphenationPattern(u"aztoa-") self.assertEqual(" - - ", pattern) pattern = self.voikko.getHyphenationPattern(u"aztoa-alus") self.assertEqual(" - -= - ", pattern) pattern = self.voikko.getHyphenationPattern(u"-auton") self.assertEqual(" - ", pattern) pattern = self.voikko.getHyphenationPattern(u"-aztoa") self.assertEqual(" - -", pattern) def testHyphenate(self): self.assertEqual(u"kis-sa", self.voikko.hyphenate(u"kissa")) self.assertEqual(u"mää-rä", self.voikko.hyphenate(u"määrä")) self.assertEqual(u"kuor-ma-au-to", self.voikko.hyphenate(u"kuorma-auto")) self.assertEqual(u"vaa-an", self.voikko.hyphenate(u"vaa'an")) def testHyphenateWithCustomSeparator(self): self.assertEqual(u"kis­sa", self.voikko.hyphenate(u"kissa", u"­", True)) self.assertEqual(u"kuor­ma-au­to", self.voikko.hyphenate(u"kuorma-auto", u"­", True)) self.assertEqual(u"vaa­an", self.voikko.hyphenate(u"vaa'an", u"­", True)) self.assertEqual(u"vaa'an", self.voikko.hyphenate(u"vaa'an", u"­", False)) def testSetIgnoreDot(self): self.voikko.setIgnoreDot(False) self.failIf(self.voikko.spell(u"kissa.")) self.voikko.setIgnoreDot(True) self.failUnless(self.voikko.spell(u"kissa.")) def testSetBooleanOption(self): self.voikko.setBooleanOption(0, False) # This is "ignore dot" 
self.failIf(self.voikko.spell(u"kissa.")) self.voikko.setBooleanOption(0, True) self.failUnless(self.voikko.spell(u"kissa.")) def testSetIgnoreNumbers(self): self.voikko.setIgnoreNumbers(False) self.failIf(self.voikko.spell(u"kissa2")) self.voikko.setIgnoreNumbers(True) self.failUnless(self.voikko.spell(u"kissa2")) def testSetIgnoreUppercase(self): self.voikko.setIgnoreUppercase(False) self.failIf(self.voikko.spell(u"KAAAA")) self.voikko.setIgnoreUppercase(True) self.failUnless(self.voikko.spell(u"KAAAA")) def testAcceptFirstUppercase(self): self.voikko.setAcceptFirstUppercase(False) self.failIf(self.voikko.spell("Kissa")) self.voikko.setAcceptFirstUppercase(True) self.failUnless(self.voikko.spell("Kissa")) def testUpperCaseScandinavianLetters(self): self.failUnless(self.voikko.spell(u"Äiti")) self.failIf(self.voikko.spell(u"Ääiti")) self.failUnless(self.voikko.spell(u"š")) self.failUnless(self.voikko.spell(u"Š")) def testAcceptAllUppercase(self): self.voikko.setIgnoreUppercase(False) self.voikko.setAcceptAllUppercase(False) self.failIf(self.voikko.spell("KISSA")) self.voikko.setAcceptAllUppercase(True) self.failUnless(self.voikko.spell("KISSA")) self.failIf(self.voikko.spell("KAAAA")) def testIgnoreNonwords(self): self.voikko.setIgnoreNonwords(False) self.failIf(self.voikko.spell("*****@*****.**")) self.voikko.setIgnoreNonwords(True) self.failUnless(self.voikko.spell("*****@*****.**")) self.failIf(self.voikko.spell("ashdaksd")) def testAcceptExtraHyphens(self): self.voikko.setAcceptExtraHyphens(False) self.failIf(self.voikko.spell("kerros-talo")) self.voikko.setAcceptExtraHyphens(True) self.failUnless(self.voikko.spell("kerros-talo")) def testAcceptMissingHyphens(self): self.voikko.setAcceptMissingHyphens(False) self.failIf(self.voikko.spell("sosiaali")) self.voikko.setAcceptMissingHyphens(True) self.failUnless(self.voikko.spell("sosiaali")) def testSetAcceptTitlesInGc(self): self.voikko.setAcceptTitlesInGc(False) self.assertEqual(1, 
len(self.voikko.grammarErrors(u"Kissa on eläin", "fi"))) self.voikko.setAcceptTitlesInGc(True) self.assertEqual(0, len(self.voikko.grammarErrors(u"Kissa on eläin", "fi"))) def testSetAcceptUnfinishedParagraphsInGc(self): self.voikko.setAcceptUnfinishedParagraphsInGc(False) self.assertEqual(1, len(self.voikko.grammarErrors(u"Kissa on ", "fi"))) self.voikko.setAcceptUnfinishedParagraphsInGc(True) self.assertEqual(0, len(self.voikko.grammarErrors(u"Kissa on ", "fi"))) def testSetAcceptBulletedListsInGc(self): self.voikko.setAcceptBulletedListsInGc(False) self.assertNotEqual(0, len(self.voikko.grammarErrors(u"kissa", "fi"))) self.voikko.setAcceptBulletedListsInGc(True) self.assertEqual(0, len(self.voikko.grammarErrors(u"kissa", "fi"))) def testSetNoUglyHyphenation(self): self.voikko.setNoUglyHyphenation(False) self.assertEqual(u"i-va", self.voikko.hyphenate(u"iva")) self.voikko.setNoUglyHyphenation(True) self.assertEqual(u"iva", self.voikko.hyphenate(u"iva")) def testSetHyphenateUnknownWordsWorks(self): self.voikko.setHyphenateUnknownWords(False) self.assertEqual(u"kirjutepo", self.voikko.hyphenate(u"kirjutepo")) self.voikko.setHyphenateUnknownWords(True) self.assertEqual(u"kir-ju-te-po", self.voikko.hyphenate(u"kirjutepo")) def testSetMinHyphenatedWordLength(self): self.voikko.setMinHyphenatedWordLength(6) self.assertEqual(u"koira", self.voikko.hyphenate(u"koira")) self.voikko.setMinHyphenatedWordLength(2) self.assertEqual(u"koi-ra", self.voikko.hyphenate(u"koira")) def testIncreaseSpellerCacheSize(self): # TODO: this only tests that nothing breaks, not that cache is actually increased self.voikko.setSpellerCacheSize(3) self.failUnless(self.voikko.spell(u"kissa")) def testDisableSpellerCache(self): # TODO: this only tests that nothing breaks, not that cache is actually disabled self.voikko.setSpellerCacheSize(-1) self.failUnless(self.voikko.spell(u"kissa")) def testSetSuggestionStrategy(self): self.voikko.setSuggestionStrategy(SuggestionStrategy.OCR) 
self.failIf(u"koira" in self.voikko.suggest(u"koari")) self.failUnless(u"koira" in self.voikko.suggest(u"koir_")) self.voikko.setSuggestionStrategy(SuggestionStrategy.TYPO) self.failUnless(u"koira" in self.voikko.suggest(u"koari")) def testMaxAnalysisCountIsNotPassed(self): complexWord = u"lumenerolumenerolumenerolumenerolumenero" self.failUnless(len(self.voikko.analyze(complexWord)) <= MAX_ANALYSIS_COUNT) def testMorPruningWorks(self): # TODO: this test will not fail, it just takes very long time # if pruning does not work. complexWord = u"" for i in range(0, 20): complexWord = complexWord + u"lumenero" self.failUnless(len(complexWord) < MAX_WORD_CHARS) self.voikko.analyze(complexWord) def testOverLongWordsAreRejectedInSpellCheck(self): # Limit is 255 characters. This behavior is deprecated and may change. longWord = u"" for i in range(0, 25): longWord = longWord + u"kuraattori" self.failUnless(len(longWord) < MAX_WORD_CHARS) self.failUnless(self.voikko.spell(longWord)) longWord = longWord + u"kuraattori" self.failUnless(len(longWord) > MAX_WORD_CHARS) self.failIf(self.voikko.spell(longWord)) def testOverLongWordsAreRejectedInAnalysis(self): # Limit is 255 characters. This behavior is deprecated and may change. longWord = u"" for i in range(0, 25): longWord = longWord + u"kuraattori" self.failUnless(len(longWord) < MAX_WORD_CHARS) self.assertEqual(1, len(self.voikko.analyze(longWord))) longWord = longWord + u"kuraattori" self.failUnless(len(longWord) > MAX_WORD_CHARS) self.assertEqual(0, len(self.voikko.analyze(longWord))) def testTokenizationWorksForHugeParagraphs(self): hugeParagraph = "Kissa on 29 vuotta vanha... Onhan se silloin vanha. " * 10000 self.assertEqual(10000 * 20, len(self.voikko.tokens(hugeParagraph))) def testTokenizationWorksWithSomeMultibyteCharacters(self): text = u"Kissä on 29 vuotta vanha... Onhan se silloin vanha. 
\n" * 9 self.assertEqual(180, len(self.voikko.tokens(text))) def testEmbeddedNullsAreNotAccepted(self): self.failIf(self.voikko.spell(u"kissa\0asdasd")) self.assertEqual(0, len(self.voikko.suggest(u"kisssa\0koira"))) self.assertEqual(u"kissa\0koira", self.voikko.hyphenate(u"kissa\0koira")) self.assertEquals(0, len(self.voikko.grammarErrors(u"kissa\0koira", "fi"))) self.assertEquals(0, len(self.voikko.analyze(u"kissa\0koira"))) def testNullCharMeansSingleSentence(self): sentences = self.voikko.sentences(u"kissa\0koira. Koira ja kissa.") self.assertEqual(1, len(sentences)) self.assertEqual(Sentence.NONE, sentences[0].nextStartType) self.assertEqual(u"kissa\0koira. Koira ja kissa.", sentences[0].sentenceText) def testNullCharIsUnknownToken(self): tokens = self.voikko.tokens(u"kissa\0koira") self.assertEquals(3, len(tokens)) self.assertEquals(Token.WORD, tokens[0].tokenType) self.assertEquals(u"kissa", tokens[0].tokenText) self.assertEquals(Token.UNKNOWN, tokens[1].tokenType) self.assertEquals(u"\0", tokens[1].tokenText) self.assertEquals(Token.WORD, tokens[2].tokenType) self.assertEquals(u"koira", tokens[2].tokenText) tokens = self.voikko.tokens(u"kissa\0\0koira") self.assertEquals(4, len(tokens)) self.assertEquals(Token.WORD, tokens[0].tokenType) self.assertEquals(u"kissa", tokens[0].tokenText) self.assertEquals(Token.UNKNOWN, tokens[1].tokenType) self.assertEquals(u"\0", tokens[1].tokenText) self.assertEquals(Token.UNKNOWN, tokens[2].tokenType) self.assertEquals(u"\0", tokens[2].tokenText) self.assertEquals(Token.WORD, tokens[3].tokenType) self.assertEquals(u"koira", tokens[3].tokenText) tokens = self.voikko.tokens(u"kissa\0") self.assertEquals(2, len(tokens)) self.assertEquals(Token.WORD, tokens[0].tokenType) self.assertEquals(u"kissa", tokens[0].tokenText) self.assertEquals(Token.UNKNOWN, tokens[1].tokenType) self.assertEquals(u"\0", tokens[1].tokenText) tokens = self.voikko.tokens(u"\0kissa") self.assertEquals(2, len(tokens)) self.assertEquals(Token.UNKNOWN, 
tokens[0].tokenType) self.assertEquals(u"\0", tokens[0].tokenText) self.assertEquals(Token.WORD, tokens[1].tokenType) self.assertEquals(u"kissa", tokens[1].tokenText) tokens = self.voikko.tokens(u"\0") self.assertEquals(1, len(tokens)) self.assertEquals(Token.UNKNOWN, tokens[0].tokenType) self.assertEquals(u"\0", tokens[0].tokenText) self.assertEquals(0, len(self.voikko.tokens(u""))) def testAllCapsAndDot(self): self.voikko.setIgnoreDot(True) self.failIf(self.voikko.spell(u"ABC-DEF.")) def testGetVersion(self): version = Voikko.getVersion() # We can't test for correct version but let's assume it starts with a number self.failUnless(re.compile(u"[0-9].*").match(version) is not None)
# This is an example application: analyze one word with Voikko, then copy a
# CSV file to itself twice over (the same input is read into both frames).
import pandas as pd

from libvoikko import Voikko

v = Voikko("fi")
print(v.analyze("autossa"))

df1 = pd.read_csv("./data/input.csv")
df2 = pd.read_csv("./data/input.csv")
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat with the default index handling is the drop-in replacement.
df1 = pd.concat([df1, df2])
df1.to_csv("./data/output.csv")
# Scrape today's lunch menu from the restaurant page, lemmatize each word with
# Voikko and report whether any favorite dish appears on the menu.
# NOTE(review): `weekday`, `get_html`, `re` and `BeautifulSoup` must be defined/
# imported earlier in the file — they are used here without a local import.
pattern = re.compile('.*{}.*'.format(weekday))
URL = 'http://pompier.fi/espa/lounas/'
text = get_html(URL)
# Name the parser explicitly: without it bs4 warns (GuessedAtParserWarning) and
# picks whichever parser is installed, making results machine-dependent.
soup = BeautifulSoup(text, 'html.parser')
# columns = soup.find_all('strong')
# bs4 renamed the `text=` keyword to `string=` (the old name is deprecated).
todays_lunch = soup.find(string=pattern)
print(todays_lunch.parent.parent.text)

from libvoikko import Voikko, Token

v = Voikko(u"fi-x-morphoid")
# Flatten hyphens and line breaks to spaces before word-splitting.
ttt = todays_lunch.parent.parent.text.replace('-', ' ').replace('\r', ' ').replace('\n', ' ')
all_words = []
for word in ttt.split(" "):
    word = word.strip('\n\r,.')
    foo = v.analyze(word)
    print("-- " + word + "--")
    # Use the first analysis' baseform when available; fall back to the
    # surface form for words Voikko does not recognize.
    if foo and 'BASEFORM' in foo[0]:
        base = foo[0]['BASEFORM']
    else:
        base = word
    all_words.append(base)
    print(": " + base)
print(all_words)
for w in ['härkä', 'lohi', 'entrecote']:
    if w in all_words:
        print("POMPIERIIN: {} !".format(w))