class LibvoikkoTest(unittest.TestCase):
    """Integration tests for the libvoikko Python bindings.

    Covers spell checking, suggestions, hyphenation, grammar checking,
    tokenization, sentence splitting and morphological analysis.  Each
    test gets a fresh Finnish ("fi") Voikko instance; tearDown releases
    the underlying native resources.

    NOTE: the deprecated unittest aliases (failUnless/failIf/assertEquals)
    used by the original code were removed in Python 3.12; they have been
    replaced with assertTrue/assertFalse/assertEqual throughout.
    """

    def setUp(self):
        self.voikko = Voikko(u"fi")

    def tearDown(self):
        self.voikko.terminate()

    def testInitAndTerminate(self):
        # do nothing, just check that setUp and tearDown complete successfully
        pass

    def testTerminateCanBeCalledMultipleTimes(self):
        self.voikko.terminate()
        self.voikko.terminate()

    def testAnotherObjectCanBeCreatedUsedAndDeletedInParallel(self):
        medicalVoikko = Voikko(u"fi-x-medicine")
        self.assertTrue(medicalVoikko.spell(u"amifostiini"))
        self.assertFalse(self.voikko.spell(u"amifostiini"))
        del medicalVoikko
        # deleting the parallel instance must not affect the original one
        self.assertFalse(self.voikko.spell(u"amifostiini"))

    def testDictionaryComparisonWorks(self):
        d1 = Dictionary(u"fi", u"", u"a", u"b")
        d2 = Dictionary(u"fi", u"", u"a", u"c")
        d3 = Dictionary(u"fi", u"", u"c", u"b")
        d4 = Dictionary(u"fi", u"", u"a", u"b")
        d5 = Dictionary(u"sv", u"", u"a", u"b")
        self.assertNotEqual(u"kissa", d1)
        self.assertNotEqual(d1, u"kissa")
        self.assertNotEqual(d1, d2)
        self.assertNotEqual(d1, d3)
        self.assertNotEqual(d4, d5)
        self.assertEqual(d1, d4)
        self.assertTrue(d1 < d2)
        self.assertTrue(d2 < d3)
        self.assertTrue(d4 < d5)

    def testDictionaryHashCodeWorks(self):
        d1 = Dictionary(u"fi", u"", u"a", u"b")
        d2 = Dictionary(u"fi", u"", u"a", u"c")
        d3 = Dictionary(u"fi", u"", u"c", u"b")
        d4 = Dictionary(u"fi", u"", u"a", u"b")
        d5 = Dictionary(u"sv", u"", u"a", u"b")
        self.assertNotEqual(hash(d1), hash(d2))
        self.assertNotEqual(hash(d1), hash(d3))
        self.assertNotEqual(hash(d4), hash(d5))
        self.assertEqual(hash(d1), hash(d4))

    def testListDictsWithoutPath(self):
        dicts = Voikko.listDicts()
        self.assertTrue(len(dicts) > 0)
        standard = dicts[0]
        self.assertEqual(
            u"standard", standard.variant,
            u"Standard dictionary must be the default in test environment.")

    def testListSupportedSpellingLanguagesWithoutPath(self):
        langs = Voikko.listSupportedSpellingLanguages()
        self.assertTrue(
            u"fi" in langs,
            u"Finnish dictionary must be present in the test environment")

    def testListDictsWithPathAndAttributes(self):
        info = MorphologyInfo()
        info.variant = u"test-variant-name"
        info.description = u"Some test description sakldjasd"
        info.morphology = u"null"
        dataDir = TestDataDir()
        dataDir.createMorphology(info.variant, info)
        dicts = Voikko.listDicts(dataDir.getDirectory())
        dataDir.tearDown()
        dictsWithCorrectVariant = list(
            filter(lambda aDict: aDict.variant == info.variant, dicts))
        self.assertEqual(1, len(dictsWithCorrectVariant))
        theDict = dictsWithCorrectVariant[0]
        self.assertEqual(info.description, theDict.description)
        self.assertEqual(u"fi", theDict.language)
        self.assertEqual(u"", theDict.script)

    def testInitWithCorrectDictWorks(self):
        self.voikko.terminate()
        self.voikko = Voikko(u"fi-x-standard")
        self.assertFalse(self.voikko.spell(u"amifostiini"))
        self.voikko.terminate()
        self.voikko = Voikko(u"fi-x-medicine")
        self.assertTrue(self.voikko.spell(u"amifostiini"))

    def testInitWithNonExistentDictThrowsException(self):
        def tryInit():
            self.voikko = Voikko(u"fi-x-non-existent-variant")
        self.voikko.terminate()
        self.assertRaises(VoikkoException, tryInit)

    def testInitWithPathWorks(self):
        # TODO: better test
        self.voikko.terminate()
        self.voikko = Voikko(u"fi", path=u"/path/to/nowhere")
        self.assertTrue(self.voikko.spell(u"kissa"))

    def testSpellAfterTerminateThrowsException(self):
        def trySpell():
            self.voikko.spell(u"kissa")
        self.voikko.terminate()
        self.assertRaises(VoikkoException, trySpell)

    def testSpell(self):
        self.assertTrue(self.voikko.spell(u"määrä"))
        self.assertFalse(self.voikko.spell(u"määä"))

    def testSuggest(self):
        suggs = self.voikko.suggest(u"koirra")
        self.assertTrue(u"koira" in suggs)

    def testSuggestReturnsArgumentIfWordIsCorrect(self):
        suggs = self.voikko.suggest(u"koira")
        self.assertEqual(1, len(suggs))
        self.assertEqual(u"koira", suggs[0])

    def testGrammarErrorsAndExplanation(self):
        errors = self.voikko.grammarErrors(u"Minä olen joten kuten kaunis.", "fi")
        self.assertEqual(1, len(errors))
        error = errors[0]
        self.assertEqual(10, error.startPos)
        self.assertEqual(11, error.errorLen)
        self.assertEqual([u"jotenkuten"], error.suggestions)
        self.assertEqual(u"Virheellinen kirjoitusasu", error.shortDescription)

    def testNoGrammarErrorsInEmptyParagraph(self):
        errors = self.voikko.grammarErrors(u"Olen täi.\n\nOlen täi.", "fi")
        self.assertEqual(0, len(errors))

    def testGrammarErrorOffsetsInMultipleParagraphs(self):
        errors = self.voikko.grammarErrors(u"Olen täi.\n\nOlen joten kuten.", "fi")
        self.assertEqual(1, len(errors))
        error = errors[0]
        # startPos is relative to the containing paragraph, not the whole text
        self.assertEqual(16, error.startPos)
        self.assertEqual(11, error.errorLen)

    def testAnalyze(self):
        analysisList = self.voikko.analyze(u"kansaneläkehakemus")
        self.assertEqual(1, len(analysisList))
        analysis = analysisList[0]
        self.assertEqual(u"=pppppp=ppppp=ppppppp", analysis["STRUCTURE"])

    def testTokens(self):
        tokenList = self.voikko.tokens(u"kissa ja koira")
        self.assertEqual(5, len(tokenList))
        tokenJa = tokenList[2]
        self.assertEqual(Token.WORD, tokenJa.tokenType)
        self.assertEqual(u"ja", tokenJa.tokenText)

    def testSentences(self):
        sentences = self.voikko.sentences(
            u"Kissa ei ole koira. Koira ei ole kissa.")
        self.assertEqual(2, len(sentences))
        self.assertEqual(u"Kissa ei ole koira. ", sentences[0].sentenceText)
        self.assertEqual(Sentence.PROBABLE, sentences[0].nextStartType)
        self.assertEqual(u"Koira ei ole kissa.", sentences[1].sentenceText)
        self.assertEqual(Sentence.NONE, sentences[1].nextStartType)

    def testAttributeValuesForEnumeratedAttribute(self):
        values = self.voikko.attributeValues(u"NUMBER")
        self.assertEqual(2, len(values))
        self.assertTrue("singular" in values)
        self.assertTrue("plural" in values)

    def testAttributeValuesForNonEnumeratedAttribute(self):
        values = self.voikko.attributeValues(u"BASEFORM")
        self.assertEqual(None, values)

    def testAttributeValuesForUnknownAttribute(self):
        values = self.voikko.attributeValues(u"XYZ")
        self.assertEqual(None, values)

    def testHyphenationPattern(self):
        pattern = self.voikko.getHyphenationPattern(u"kissa")
        self.assertEqual(" - ", pattern)
        pattern = self.voikko.getHyphenationPattern(u"määrä")
        self.assertEqual(" - ", pattern)
        pattern = self.voikko.getHyphenationPattern(u"kuorma-auto")
        self.assertEqual(" - = - ", pattern)
        pattern = self.voikko.getHyphenationPattern(u"vaa'an")
        self.assertEqual(" = ", pattern)
        pattern = self.voikko.getHyphenationPattern(u"auton-")
        self.assertEqual(" - ", pattern)
        pattern = self.voikko.getHyphenationPattern(u"aztoa-")
        self.assertEqual(" - - ", pattern)
        pattern = self.voikko.getHyphenationPattern(u"aztoa-alus")
        self.assertEqual(" - -= - ", pattern)
        pattern = self.voikko.getHyphenationPattern(u"-auton")
        self.assertEqual(" - ", pattern)
        pattern = self.voikko.getHyphenationPattern(u"-aztoa")
        self.assertEqual(" - -", pattern)

    def testHyphenate(self):
        self.assertEqual(u"kis-sa", self.voikko.hyphenate(u"kissa"))
        self.assertEqual(u"mää-rä", self.voikko.hyphenate(u"määrä"))
        self.assertEqual(u"kuor-ma-au-to", self.voikko.hyphenate(u"kuorma-auto"))
        self.assertEqual(u"vaa-an", self.voikko.hyphenate(u"vaa'an"))

    def testHyphenateWithCustomSeparator(self):
        # The separator below is a soft hyphen (U+00AD)
        self.assertEqual(u"kis­sa", self.voikko.hyphenate(u"kissa", u"­", True))
        self.assertEqual(u"kuor­ma-au­to",
                         self.voikko.hyphenate(u"kuorma-auto", u"­", True))
        self.assertEqual(u"vaa­an", self.voikko.hyphenate(u"vaa'an", u"­", True))
        self.assertEqual(u"vaa'an", self.voikko.hyphenate(u"vaa'an", u"­", False))

    def testSetIgnoreDot(self):
        self.voikko.setIgnoreDot(False)
        self.assertFalse(self.voikko.spell(u"kissa."))
        self.voikko.setIgnoreDot(True)
        self.assertTrue(self.voikko.spell(u"kissa."))

    def testSetBooleanOption(self):
        self.voikko.setBooleanOption(0, False)  # This is "ignore dot"
        self.assertFalse(self.voikko.spell(u"kissa."))
        self.voikko.setBooleanOption(0, True)
        self.assertTrue(self.voikko.spell(u"kissa."))

    def testSetIgnoreNumbers(self):
        self.voikko.setIgnoreNumbers(False)
        self.assertFalse(self.voikko.spell(u"kissa2"))
        self.voikko.setIgnoreNumbers(True)
        self.assertTrue(self.voikko.spell(u"kissa2"))

    def testSetIgnoreUppercase(self):
        self.voikko.setIgnoreUppercase(False)
        self.assertFalse(self.voikko.spell(u"KAAAA"))
        self.voikko.setIgnoreUppercase(True)
        self.assertTrue(self.voikko.spell(u"KAAAA"))

    def testAcceptFirstUppercase(self):
        self.voikko.setAcceptFirstUppercase(False)
        self.assertFalse(self.voikko.spell("Kissa"))
        self.voikko.setAcceptFirstUppercase(True)
        self.assertTrue(self.voikko.spell("Kissa"))

    def testUpperCaseScandinavianLetters(self):
        self.assertTrue(self.voikko.spell(u"Äiti"))
        self.assertFalse(self.voikko.spell(u"Ääiti"))
        self.assertTrue(self.voikko.spell(u"š"))
        self.assertTrue(self.voikko.spell(u"Š"))

    def testAcceptAllUppercase(self):
        self.voikko.setIgnoreUppercase(False)
        self.voikko.setAcceptAllUppercase(False)
        self.assertFalse(self.voikko.spell("KISSA"))
        self.voikko.setAcceptAllUppercase(True)
        self.assertTrue(self.voikko.spell("KISSA"))
        self.assertFalse(self.voikko.spell("KAAAA"))

    def testIgnoreNonwords(self):
        self.voikko.setIgnoreNonwords(False)
        self.assertFalse(self.voikko.spell("*****@*****.**"))
        self.voikko.setIgnoreNonwords(True)
        self.assertTrue(self.voikko.spell("*****@*****.**"))
        self.assertFalse(self.voikko.spell("ashdaksd"))

    def testAcceptExtraHyphens(self):
        self.voikko.setAcceptExtraHyphens(False)
        self.assertFalse(self.voikko.spell("kerros-talo"))
        self.voikko.setAcceptExtraHyphens(True)
        self.assertTrue(self.voikko.spell("kerros-talo"))

    def testAcceptMissingHyphens(self):
        self.voikko.setAcceptMissingHyphens(False)
        self.assertFalse(self.voikko.spell("sosiaali"))
        self.voikko.setAcceptMissingHyphens(True)
        self.assertTrue(self.voikko.spell("sosiaali"))

    def testSetAcceptTitlesInGc(self):
        self.voikko.setAcceptTitlesInGc(False)
        self.assertEqual(
            1, len(self.voikko.grammarErrors(u"Kissa on eläin", "fi")))
        self.voikko.setAcceptTitlesInGc(True)
        self.assertEqual(
            0, len(self.voikko.grammarErrors(u"Kissa on eläin", "fi")))

    def testSetAcceptUnfinishedParagraphsInGc(self):
        self.voikko.setAcceptUnfinishedParagraphsInGc(False)
        self.assertEqual(1, len(self.voikko.grammarErrors(u"Kissa on ", "fi")))
        self.voikko.setAcceptUnfinishedParagraphsInGc(True)
        self.assertEqual(0, len(self.voikko.grammarErrors(u"Kissa on ", "fi")))

    def testSetAcceptBulletedListsInGc(self):
        self.voikko.setAcceptBulletedListsInGc(False)
        self.assertNotEqual(0, len(self.voikko.grammarErrors(u"kissa", "fi")))
        self.voikko.setAcceptBulletedListsInGc(True)
        self.assertEqual(0, len(self.voikko.grammarErrors(u"kissa", "fi")))

    def testSetNoUglyHyphenation(self):
        self.voikko.setNoUglyHyphenation(False)
        self.assertEqual(u"i-va", self.voikko.hyphenate(u"iva"))
        self.voikko.setNoUglyHyphenation(True)
        self.assertEqual(u"iva", self.voikko.hyphenate(u"iva"))

    def testSetHyphenateUnknownWordsWorks(self):
        self.voikko.setHyphenateUnknownWords(False)
        self.assertEqual(u"kirjutepo", self.voikko.hyphenate(u"kirjutepo"))
        self.voikko.setHyphenateUnknownWords(True)
        self.assertEqual(u"kir-ju-te-po", self.voikko.hyphenate(u"kirjutepo"))

    def testSetMinHyphenatedWordLength(self):
        self.voikko.setMinHyphenatedWordLength(6)
        self.assertEqual(u"koira", self.voikko.hyphenate(u"koira"))
        self.voikko.setMinHyphenatedWordLength(2)
        self.assertEqual(u"koi-ra", self.voikko.hyphenate(u"koira"))

    def testIncreaseSpellerCacheSize(self):
        # TODO: this only tests that nothing breaks, not that cache is actually increased
        self.voikko.setSpellerCacheSize(3)
        self.assertTrue(self.voikko.spell(u"kissa"))

    def testDisableSpellerCache(self):
        # TODO: this only tests that nothing breaks, not that cache is actually disabled
        self.voikko.setSpellerCacheSize(-1)
        self.assertTrue(self.voikko.spell(u"kissa"))

    def testSetSuggestionStrategy(self):
        self.voikko.setSuggestionStrategy(SuggestionStrategy.OCR)
        self.assertFalse(u"koira" in self.voikko.suggest(u"koari"))
        self.assertTrue(u"koira" in self.voikko.suggest(u"koir_"))
        self.voikko.setSuggestionStrategy(SuggestionStrategy.TYPO)
        self.assertTrue(u"koira" in self.voikko.suggest(u"koari"))

    def testMaxAnalysisCountIsNotPassed(self):
        complexWord = u"lumenerolumenerolumenerolumenerolumenero"
        self.assertTrue(
            len(self.voikko.analyze(complexWord)) <= MAX_ANALYSIS_COUNT)

    def testMorPruningWorks(self):
        # TODO: this test will not fail, it just takes very long time
        # if pruning does not work.
        complexWord = u""
        for i in range(0, 20):
            complexWord = complexWord + u"lumenero"
        self.assertTrue(len(complexWord) < MAX_WORD_CHARS)
        self.voikko.analyze(complexWord)

    def testOverLongWordsAreRejectedInSpellCheck(self):
        # Limit is 255 characters. This behavior is deprecated and may change.
        longWord = u""
        for i in range(0, 25):
            longWord = longWord + u"kuraattori"
        self.assertTrue(len(longWord) < MAX_WORD_CHARS)
        self.assertTrue(self.voikko.spell(longWord))
        longWord = longWord + u"kuraattori"
        self.assertTrue(len(longWord) > MAX_WORD_CHARS)
        self.assertFalse(self.voikko.spell(longWord))

    def testOverLongWordsAreRejectedInAnalysis(self):
        # Limit is 255 characters. This behavior is deprecated and may change.
        longWord = u""
        for i in range(0, 25):
            longWord = longWord + u"kuraattori"
        self.assertTrue(len(longWord) < MAX_WORD_CHARS)
        self.assertEqual(1, len(self.voikko.analyze(longWord)))
        longWord = longWord + u"kuraattori"
        self.assertTrue(len(longWord) > MAX_WORD_CHARS)
        self.assertEqual(0, len(self.voikko.analyze(longWord)))

    def testTokenizationWorksForHugeParagraphs(self):
        hugeParagraph = "Kissa on 29 vuotta vanha... Onhan se silloin vanha. " * 10000
        self.assertEqual(10000 * 20, len(self.voikko.tokens(hugeParagraph)))

    def testTokenizationWorksWithSomeMultibyteCharacters(self):
        text = u"Kissä on 29 vuotta vanha... Onhan se silloin vanha. \n" * 9
        self.assertEqual(180, len(self.voikko.tokens(text)))

    def testEmbeddedNullsAreNotAccepted(self):
        self.assertFalse(self.voikko.spell(u"kissa\0asdasd"))
        self.assertEqual(0, len(self.voikko.suggest(u"kisssa\0koira")))
        self.assertEqual(u"kissa\0koira", self.voikko.hyphenate(u"kissa\0koira"))
        self.assertEqual(
            0, len(self.voikko.grammarErrors(u"kissa\0koira", "fi")))
        self.assertEqual(0, len(self.voikko.analyze(u"kissa\0koira")))

    def testNullCharMeansSingleSentence(self):
        sentences = self.voikko.sentences(u"kissa\0koira. Koira ja kissa.")
        self.assertEqual(1, len(sentences))
        self.assertEqual(Sentence.NONE, sentences[0].nextStartType)
        self.assertEqual(u"kissa\0koira. Koira ja kissa.",
                         sentences[0].sentenceText)

    def testNullCharIsUnknownToken(self):
        tokens = self.voikko.tokens(u"kissa\0koira")
        self.assertEqual(3, len(tokens))
        self.assertEqual(Token.WORD, tokens[0].tokenType)
        self.assertEqual(u"kissa", tokens[0].tokenText)
        self.assertEqual(Token.UNKNOWN, tokens[1].tokenType)
        self.assertEqual(u"\0", tokens[1].tokenText)
        self.assertEqual(Token.WORD, tokens[2].tokenType)
        self.assertEqual(u"koira", tokens[2].tokenText)
        tokens = self.voikko.tokens(u"kissa\0\0koira")
        self.assertEqual(4, len(tokens))
        self.assertEqual(Token.WORD, tokens[0].tokenType)
        self.assertEqual(u"kissa", tokens[0].tokenText)
        self.assertEqual(Token.UNKNOWN, tokens[1].tokenType)
        self.assertEqual(u"\0", tokens[1].tokenText)
        self.assertEqual(Token.UNKNOWN, tokens[2].tokenType)
        self.assertEqual(u"\0", tokens[2].tokenText)
        self.assertEqual(Token.WORD, tokens[3].tokenType)
        self.assertEqual(u"koira", tokens[3].tokenText)
        tokens = self.voikko.tokens(u"kissa\0")
        self.assertEqual(2, len(tokens))
        self.assertEqual(Token.WORD, tokens[0].tokenType)
        self.assertEqual(u"kissa", tokens[0].tokenText)
        self.assertEqual(Token.UNKNOWN, tokens[1].tokenType)
        self.assertEqual(u"\0", tokens[1].tokenText)
        tokens = self.voikko.tokens(u"\0kissa")
        self.assertEqual(2, len(tokens))
        self.assertEqual(Token.UNKNOWN, tokens[0].tokenType)
        self.assertEqual(u"\0", tokens[0].tokenText)
        self.assertEqual(Token.WORD, tokens[1].tokenType)
        self.assertEqual(u"kissa", tokens[1].tokenText)
        tokens = self.voikko.tokens(u"\0")
        self.assertEqual(1, len(tokens))
        self.assertEqual(Token.UNKNOWN, tokens[0].tokenType)
        self.assertEqual(u"\0", tokens[0].tokenText)
        self.assertEqual(0, len(self.voikko.tokens(u"")))

    def testAllCapsAndDot(self):
        self.voikko.setIgnoreDot(True)
        self.assertFalse(self.voikko.spell(u"ABC-DEF."))

    def testGetVersion(self):
        version = Voikko.getVersion()
        # We can't test for correct version but let's assume it starts with a number
        self.assertTrue(re.compile(u"[0-9].*").match(version) is not None)
class VoikkoAttributeVectorizer:
    """Converts a collection of text documents to a matrix of counts of words
    having specific value for enumerated morphological analysis attributes.

    Examples
    --------
    >>> from voikko_sklearn import VoikkoAttributeVectorizer
    >>> corpus = [
    ...     'Koiran karvat olivat takussa.',
    ...     'Kissamme goli vanha.'
    ... ]
    >>> vectorizer = VoikkoAttributeVectorizer(['NUMBER', 'PERSON'], langtag='fi')
    >>> print(vectorizer.get_feature_names())
    ['unknown', 'NUMBER_plural', 'NUMBER_singular', 'PERSON_1', 'PERSON_2', 'PERSON_3', 'PERSON_4']
    >>> X = vectorizer.transform(corpus)
    >>> print(X.toarray())
    [[0.         0.5        0.5        0.         0.         0.25       0.        ]
     [0.33333333 0.         0.66666667 0.         0.         0.         0.        ]]
    """

    def __init__(self, attributes, langtag="fi"):
        """Create a vectorizer for the given enumerated attributes.

        Parameters
        ----------
        attributes : list of str
            Names of enumerated morphological attributes (e.g. 'NUMBER').
            Each attribute must be categorial; otherwise ValueError is raised.
        langtag : str
            BCP 47 language tag passed to Voikko (default "fi").
        """
        # BUG FIX: the original code did ``self.input = input`` which bound the
        # *builtin* input() function — leftover boilerplate from sklearn
        # vectorizers where ``input`` is a constructor parameter. The attribute
        # was meaningless here and has been removed.
        self.attributes = attributes
        self.voikko = Voikko(langtag)
        self.__init_feature_names()

    def __init_feature_names(self):
        """Build feature name list and the name -> column index mapping.

        Column 0 ('unknown') counts words with no morphological analysis;
        remaining columns are '<ATTRIBUTE>_<value>' in sorted value order.
        """
        self.feature_names = ['unknown']
        self.feature_name_to_index = {'unknown': 0}
        for attribute in self.attributes:
            values = self.voikko.attributeValues(attribute)
            if values is None:
                # attributeValues returns None for unknown or free-form
                # (non-enumerated) attributes such as BASEFORM
                raise ValueError("Attribute '" + attribute + "' does not exist or is not categorial.")
            values.sort()
            for value in values:
                name = attribute + '_' + value
                self.feature_name_to_index[name] = len(self.feature_names)
                self.feature_names.append(name)

    def terminate(self):
        """Release the native Voikko resources held by this vectorizer."""
        self.voikko.terminate()

    def build_tokenizer(self):
        """Return a callable that splits text into word tokens only."""
        return lambda text: [token.tokenText
                             for token in self.voikko.tokens(text)
                             if token.tokenType == Token.WORD]

    def get_feature_names(self):
        """Return the list of feature (column) names."""
        return self.feature_names

    def __transform_document(self, document, target_vector):
        """Accumulate attribute-value counts for one document in place.

        Each word contributes a total weight of 1 per matching attribute,
        split evenly (1/count) across its morphological analyses; words with
        no analysis add 1 to the 'unknown' column. The vector is finally
        normalized by the number of word tokens. Empty documents are left
        as all zeros.
        """
        words = self.build_tokenizer()(document)
        wordcount = len(words)
        if wordcount == 0:
            return
        for word in words:
            analysis_list = self.voikko.analyze(word)
            count = len(analysis_list)
            if count == 0:
                target_vector[0] += 1
            else:
                for analysis in analysis_list:
                    for attribute in self.attributes:
                        if attribute in analysis:
                            value = analysis[attribute]
                            target_vector[self.feature_name_to_index[attribute + "_" + value]] += 1.0 / count
        target_vector /= wordcount

    def transform(self, document_list):
        """Transform documents to a sparse (n_documents, n_features) matrix."""
        document_count = len(document_list)
        vector_length = len(self.feature_names)
        data = numpy.zeros((document_count, vector_length), dtype=numpy.float64)
        for i in range(document_count):
            self.__transform_document(document_list[i], data[i])
        return csr_matrix(data)

    def fit(self, document_list, y=None):
        """No-op (the feature set is fixed at construction); returns self.

        The optional ``y`` parameter is accepted for scikit-learn pipeline
        compatibility and ignored.
        """
        return self

    def fit_transform(self, document_list, y=None):
        """Equivalent to ``transform``; ``y`` is accepted and ignored."""
        return self.transform(document_list)