Code example #1: unit tests for the libvoikko Python bindings (class LibvoikkoTest)
import re
import unittest

from libvoikko import (Dictionary, Sentence, SuggestionStrategy, Token,
                       Voikko, VoikkoException)
# MorphologyInfo and TestDataDir are assumed to come from the test suite's own
# helper module; adjust this import to match the local test setup.
from TestUtils import MorphologyInfo, TestDataDir

# Limits referenced by the tests below. 255 matches the word-length limit
# mentioned in the over-long-word tests; the analysis-count limit is an
# assumed value.
MAX_WORD_CHARS = 255
MAX_ANALYSIS_COUNT = 31


class LibvoikkoTest(unittest.TestCase):
    def setUp(self):
        self.voikko = Voikko(u"fi")

    def tearDown(self):
        self.voikko.terminate()

    def testInitAndTerminate(self):
        pass  # do nothing, just check that setUp and tearDown complete successfully

    def testTerminateCanBeCalledMultipleTimes(self):
        self.voikko.terminate()
        self.voikko.terminate()

    def testAnotherObjectCanBeCreatedUsedAndDeletedInParallel(self):
        medicalVoikko = Voikko(u"fi-x-medicine")
        self.failUnless(medicalVoikko.spell(u"amifostiini"))
        self.failIf(self.voikko.spell(u"amifostiini"))
        del medicalVoikko
        self.failIf(self.voikko.spell(u"amifostiini"))

    def testDictionaryComparisonWorks(self):
        d1 = Dictionary(u"fi", u"", u"a", u"b")
        d2 = Dictionary(u"fi", u"", u"a", u"c")
        d3 = Dictionary(u"fi", u"", u"c", u"b")
        d4 = Dictionary(u"fi", u"", u"a", u"b")
        d5 = Dictionary(u"sv", u"", u"a", u"b")
        self.assertNotEqual(u"kissa", d1)
        self.assertNotEqual(d1, u"kissa")
        self.assertNotEqual(d1, d2)
        self.assertNotEqual(d1, d3)
        self.assertNotEqual(d4, d5)
        self.assertEqual(d1, d4)
        self.failUnless(d1 < d2)
        self.failUnless(d2 < d3)
        self.failUnless(d4 < d5)

    def testDictionaryHashCodeWorks(self):
        d1 = Dictionary(u"fi", u"", u"a", u"b")
        d2 = Dictionary(u"fi", u"", u"a", u"c")
        d3 = Dictionary(u"fi", u"", u"c", u"b")
        d4 = Dictionary(u"fi", u"", u"a", u"b")
        d5 = Dictionary(u"sv", u"", u"a", u"b")
        self.assertNotEqual(hash(d1), hash(d2))
        self.assertNotEqual(hash(d1), hash(d3))
        self.assertNotEqual(hash(d4), hash(d5))
        self.assertEqual(hash(d1), hash(d4))

    def testListDictsWithoutPath(self):
        dicts = Voikko.listDicts()
        self.failUnless(len(dicts) > 0)
        standard = dicts[0]
        self.assertEqual(
            u"standard", standard.variant,
            u"Standard dictionary must be the default in test environment.")

    def testListSupportedSpellingLanguagesWithoutPath(self):
        langs = Voikko.listSupportedSpellingLanguages()
        self.failUnless(
            u"fi" in langs,
            u"Finnish dictionary must be present in the test environment")

    def testListDictsWithPathAndAttributes(self):
        info = MorphologyInfo()
        info.variant = u"test-variant-name"
        info.description = u"Some test description sakldjasd"
        info.morphology = u"null"
        dataDir = TestDataDir()
        dataDir.createMorphology(info.variant, info)
        dicts = Voikko.listDicts(dataDir.getDirectory())
        dataDir.tearDown()
        dictsWithCorrectVariant = list(
            filter(lambda aDict: aDict.variant == info.variant, dicts))
        self.assertEqual(1, len(dictsWithCorrectVariant))
        theDict = dictsWithCorrectVariant[0]
        self.assertEqual(info.description, theDict.description)
        self.assertEqual(u"fi", theDict.language)
        self.assertEqual(u"", theDict.script)

    def testInitWithCorrectDictWorks(self):
        self.voikko.terminate()
        self.voikko = Voikko(u"fi-x-standard")
        self.failIf(self.voikko.spell(u"amifostiini"))
        self.voikko.terminate()
        self.voikko = Voikko(u"fi-x-medicine")
        self.failUnless(self.voikko.spell(u"amifostiini"))

    def testInitWithNonExistentDictThrowsException(self):
        def tryInit():
            self.voikko = Voikko(u"fi-x-non-existent-variant")

        self.voikko.terminate()
        self.assertRaises(VoikkoException, tryInit)

    def testInitWithPathWorks(self):
        # TODO: better test
        self.voikko.terminate()
        self.voikko = Voikko(u"fi", path=u"/path/to/nowhere")
        self.failUnless(self.voikko.spell(u"kissa"))

    def testSpellAfterTerminateThrowsException(self):
        def trySpell():
            self.voikko.spell(u"kissa")

        self.voikko.terminate()
        self.assertRaises(VoikkoException, trySpell)

    def testSpell(self):
        self.failUnless(self.voikko.spell(u"määrä"))
        self.failIf(self.voikko.spell(u"määä"))

    def testSuggest(self):
        suggs = self.voikko.suggest(u"koirra")
        self.failUnless(u"koira" in suggs)

    def testSuggestReturnsArgumentIfWordIsCorrect(self):
        suggs = self.voikko.suggest(u"koira")
        self.assertEqual(1, len(suggs))
        self.assertEqual(u"koira", suggs[0])

    def testGrammarErrorsAndExplanation(self):
        errors = self.voikko.grammarErrors(u"Minä olen joten kuten kaunis.",
                                           "fi")
        self.assertEqual(1, len(errors))
        error = errors[0]
        self.assertEqual(10, error.startPos)
        self.assertEqual(11, error.errorLen)
        self.assertEqual([u"jotenkuten"], error.suggestions)
        self.assertEqual(u"Virheellinen kirjoitusasu", error.shortDescription)

    def testNoGrammarErrorsInEmptyParagraph(self):
        errors = self.voikko.grammarErrors(u"Olen täi.\n\nOlen täi.", "fi")
        self.assertEqual(0, len(errors))

    def testGrammarErrorOffsetsInMultipleParagraphs(self):
        errors = self.voikko.grammarErrors(u"Olen täi.\n\nOlen joten kuten.",
                                           "fi")
        self.assertEqual(1, len(errors))
        error = errors[0]
        self.assertEqual(16, error.startPos)
        self.assertEqual(11, error.errorLen)

    def testAnalyze(self):
        analysisList = self.voikko.analyze(u"kansaneläkehakemus")
        self.assertEqual(1, len(analysisList))
        analysis = analysisList[0]
        self.assertEqual(u"=pppppp=ppppp=ppppppp", analysis["STRUCTURE"])

    def testTokens(self):
        tokenList = self.voikko.tokens(u"kissa ja koira")
        self.assertEqual(5, len(tokenList))
        tokenJa = tokenList[2]
        self.assertEqual(Token.WORD, tokenJa.tokenType)
        self.assertEqual(u"ja", tokenJa.tokenText)

    def testSentences(self):
        sentences = self.voikko.sentences(
            u"Kissa ei ole koira. Koira ei ole kissa.")
        self.assertEqual(2, len(sentences))
        self.assertEqual(u"Kissa ei ole koira. ", sentences[0].sentenceText)
        self.assertEqual(Sentence.PROBABLE, sentences[0].nextStartType)
        self.assertEqual(u"Koira ei ole kissa.", sentences[1].sentenceText)
        self.assertEqual(Sentence.NONE, sentences[1].nextStartType)

    def testAttributeValuesForEnumeratedAttribute(self):
        values = self.voikko.attributeValues(u"NUMBER")
        self.assertEqual(2, len(values))
        self.assertTrue("singular" in values)
        self.assertTrue("plural" in values)

    def testAttributeValuesForNonEnumeratedAttribute(self):
        values = self.voikko.attributeValues(u"BASEFORM")
        self.assertEqual(None, values)

    def testAttributeValuesForUnknownAttribute(self):
        values = self.voikko.attributeValues(u"XYZ")
        self.assertEqual(None, values)

    def testHyphenationPattern(self):
        pattern = self.voikko.getHyphenationPattern(u"kissa")
        self.assertEqual("   - ", pattern)
        pattern = self.voikko.getHyphenationPattern(u"määrä")
        self.assertEqual("   - ", pattern)
        pattern = self.voikko.getHyphenationPattern(u"kuorma-auto")
        self.assertEqual("    - =  - ", pattern)
        pattern = self.voikko.getHyphenationPattern(u"vaa'an")
        self.assertEqual("   =  ", pattern)
        pattern = self.voikko.getHyphenationPattern(u"auton-")
        self.assertEqual("  -   ", pattern)
        pattern = self.voikko.getHyphenationPattern(u"aztoa-")
        self.assertEqual("  - - ", pattern)
        pattern = self.voikko.getHyphenationPattern(u"aztoa-alus")
        self.assertEqual("  - -= -  ", pattern)
        pattern = self.voikko.getHyphenationPattern(u"-auton")
        self.assertEqual("   -  ", pattern)
        pattern = self.voikko.getHyphenationPattern(u"-aztoa")
        self.assertEqual("   - -", pattern)

    def testHyphenate(self):
        self.assertEqual(u"kis-sa", self.voikko.hyphenate(u"kissa"))
        self.assertEqual(u"mää-rä", self.voikko.hyphenate(u"määrä"))
        self.assertEqual(u"kuor-ma-au-to",
                         self.voikko.hyphenate(u"kuorma-auto"))
        self.assertEqual(u"vaa-an", self.voikko.hyphenate(u"vaa'an"))

    def testHyphenateWithCustomSeparator(self):
        self.assertEqual(u"kis&shy;sa",
                         self.voikko.hyphenate(u"kissa", u"&shy;", True))
        self.assertEqual(u"kuor&shy;ma-au&shy;to",
                         self.voikko.hyphenate(u"kuorma-auto", u"&shy;", True))
        self.assertEqual(u"vaa&shy;an",
                         self.voikko.hyphenate(u"vaa'an", u"&shy;", True))
        self.assertEqual(u"vaa'an",
                         self.voikko.hyphenate(u"vaa'an", u"&shy;", False))

    def testSetIgnoreDot(self):
        self.voikko.setIgnoreDot(False)
        self.failIf(self.voikko.spell(u"kissa."))
        self.voikko.setIgnoreDot(True)
        self.failUnless(self.voikko.spell(u"kissa."))

    def testSetBooleanOption(self):
        self.voikko.setBooleanOption(0, False)  # This is "ignore dot"
        self.failIf(self.voikko.spell(u"kissa."))
        self.voikko.setBooleanOption(0, True)
        self.failUnless(self.voikko.spell(u"kissa."))

    def testSetIgnoreNumbers(self):
        self.voikko.setIgnoreNumbers(False)
        self.failIf(self.voikko.spell(u"kissa2"))
        self.voikko.setIgnoreNumbers(True)
        self.failUnless(self.voikko.spell(u"kissa2"))

    def testSetIgnoreUppercase(self):
        self.voikko.setIgnoreUppercase(False)
        self.failIf(self.voikko.spell(u"KAAAA"))
        self.voikko.setIgnoreUppercase(True)
        self.failUnless(self.voikko.spell(u"KAAAA"))

    def testAcceptFirstUppercase(self):
        self.voikko.setAcceptFirstUppercase(False)
        self.failIf(self.voikko.spell("Kissa"))
        self.voikko.setAcceptFirstUppercase(True)
        self.failUnless(self.voikko.spell("Kissa"))

    def testUpperCaseScandinavianLetters(self):
        self.failUnless(self.voikko.spell(u"Äiti"))
        self.failIf(self.voikko.spell(u"Ääiti"))
        self.failUnless(self.voikko.spell(u"š"))
        self.failUnless(self.voikko.spell(u"Š"))

    def testAcceptAllUppercase(self):
        self.voikko.setIgnoreUppercase(False)
        self.voikko.setAcceptAllUppercase(False)
        self.failIf(self.voikko.spell("KISSA"))
        self.voikko.setAcceptAllUppercase(True)
        self.failUnless(self.voikko.spell("KISSA"))
        self.failIf(self.voikko.spell("KAAAA"))

    def testIgnoreNonwords(self):
        self.voikko.setIgnoreNonwords(False)
        self.failIf(self.voikko.spell("*****@*****.**"))
        self.voikko.setIgnoreNonwords(True)
        self.failUnless(self.voikko.spell("*****@*****.**"))
        self.failIf(self.voikko.spell("ashdaksd"))

    def testAcceptExtraHyphens(self):
        self.voikko.setAcceptExtraHyphens(False)
        self.failIf(self.voikko.spell("kerros-talo"))
        self.voikko.setAcceptExtraHyphens(True)
        self.failUnless(self.voikko.spell("kerros-talo"))

    def testAcceptMissingHyphens(self):
        self.voikko.setAcceptMissingHyphens(False)
        self.failIf(self.voikko.spell("sosiaali"))
        self.voikko.setAcceptMissingHyphens(True)
        self.failUnless(self.voikko.spell("sosiaali"))

    def testSetAcceptTitlesInGc(self):
        self.voikko.setAcceptTitlesInGc(False)
        self.assertEqual(
            1, len(self.voikko.grammarErrors(u"Kissa on eläin", "fi")))
        self.voikko.setAcceptTitlesInGc(True)
        self.assertEqual(
            0, len(self.voikko.grammarErrors(u"Kissa on eläin", "fi")))

    def testSetAcceptUnfinishedParagraphsInGc(self):
        self.voikko.setAcceptUnfinishedParagraphsInGc(False)
        self.assertEqual(1, len(self.voikko.grammarErrors(u"Kissa on ", "fi")))
        self.voikko.setAcceptUnfinishedParagraphsInGc(True)
        self.assertEqual(0, len(self.voikko.grammarErrors(u"Kissa on ", "fi")))

    def testSetAcceptBulletedListsInGc(self):
        self.voikko.setAcceptBulletedListsInGc(False)
        self.assertNotEqual(0, len(self.voikko.grammarErrors(u"kissa", "fi")))
        self.voikko.setAcceptBulletedListsInGc(True)
        self.assertEqual(0, len(self.voikko.grammarErrors(u"kissa", "fi")))

    def testSetNoUglyHyphenation(self):
        self.voikko.setNoUglyHyphenation(False)
        self.assertEqual(u"i-va", self.voikko.hyphenate(u"iva"))
        self.voikko.setNoUglyHyphenation(True)
        self.assertEqual(u"iva", self.voikko.hyphenate(u"iva"))

    def testSetHyphenateUnknownWordsWorks(self):
        self.voikko.setHyphenateUnknownWords(False)
        self.assertEqual(u"kirjutepo", self.voikko.hyphenate(u"kirjutepo"))
        self.voikko.setHyphenateUnknownWords(True)
        self.assertEqual(u"kir-ju-te-po", self.voikko.hyphenate(u"kirjutepo"))

    def testSetMinHyphenatedWordLength(self):
        self.voikko.setMinHyphenatedWordLength(6)
        self.assertEqual(u"koira", self.voikko.hyphenate(u"koira"))
        self.voikko.setMinHyphenatedWordLength(2)
        self.assertEqual(u"koi-ra", self.voikko.hyphenate(u"koira"))

    def testIncreaseSpellerCacheSize(self):
        # TODO: this only tests that nothing breaks, not that cache is actually increased
        self.voikko.setSpellerCacheSize(3)
        self.failUnless(self.voikko.spell(u"kissa"))

    def testDisableSpellerCache(self):
        # TODO: this only tests that nothing breaks, not that cache is actually disabled
        self.voikko.setSpellerCacheSize(-1)
        self.failUnless(self.voikko.spell(u"kissa"))

    def testSetSuggestionStrategy(self):
        self.voikko.setSuggestionStrategy(SuggestionStrategy.OCR)
        self.failIf(u"koira" in self.voikko.suggest(u"koari"))
        self.failUnless(u"koira" in self.voikko.suggest(u"koir_"))
        self.voikko.setSuggestionStrategy(SuggestionStrategy.TYPO)
        self.failUnless(u"koira" in self.voikko.suggest(u"koari"))

    def testMaxAnalysisCountIsNotPassed(self):
        complexWord = u"lumenerolumenerolumenerolumenerolumenero"
        self.failUnless(
            len(self.voikko.analyze(complexWord)) <= MAX_ANALYSIS_COUNT)

    def testMorPruningWorks(self):
        # TODO: this test will not fail, it just takes very long time
        # if pruning does not work.
        complexWord = u""
        for i in range(0, 20):
            complexWord = complexWord + u"lumenero"
        self.failUnless(len(complexWord) < MAX_WORD_CHARS)
        self.voikko.analyze(complexWord)

    def testOverLongWordsAreRejectedInSpellCheck(self):
        # Limit is 255 characters. This behavior is deprecated and may change.
        longWord = u""
        for i in range(0, 25):
            longWord = longWord + u"kuraattori"
        self.failUnless(len(longWord) < MAX_WORD_CHARS)
        self.failUnless(self.voikko.spell(longWord))

        longWord = longWord + u"kuraattori"
        self.failUnless(len(longWord) > MAX_WORD_CHARS)
        self.failIf(self.voikko.spell(longWord))

    def testOverLongWordsAreRejectedInAnalysis(self):
        # Limit is 255 characters. This behavior is deprecated and may change.
        longWord = u""
        for i in range(0, 25):
            longWord = longWord + u"kuraattori"
        self.failUnless(len(longWord) < MAX_WORD_CHARS)
        self.assertEqual(1, len(self.voikko.analyze(longWord)))

        longWord = longWord + u"kuraattori"
        self.failUnless(len(longWord) > MAX_WORD_CHARS)
        self.assertEqual(0, len(self.voikko.analyze(longWord)))

    def testTokenizationWorksForHugeParagraphs(self):
        hugeParagraph = "Kissa on 29 vuotta vanha... Onhan se silloin vanha. " * 10000
        self.assertEqual(10000 * 20, len(self.voikko.tokens(hugeParagraph)))

    def testTokenizationWorksWithSomeMultibyteCharacters(self):
        text = u"Kissä on 29 vuotta vanha... Onhan se silloin vanha. \n" * 9
        self.assertEqual(180, len(self.voikko.tokens(text)))

    def testEmbeddedNullsAreNotAccepted(self):
        self.failIf(self.voikko.spell(u"kissa\0asdasd"))
        self.assertEqual(0, len(self.voikko.suggest(u"kisssa\0koira")))
        self.assertEqual(u"kissa\0koira",
                         self.voikko.hyphenate(u"kissa\0koira"))
        self.assertEqual(
            0, len(self.voikko.grammarErrors(u"kissa\0koira", "fi")))
        self.assertEqual(0, len(self.voikko.analyze(u"kissa\0koira")))

    def testNullCharMeansSingleSentence(self):
        sentences = self.voikko.sentences(u"kissa\0koira. Koira ja kissa.")
        self.assertEqual(1, len(sentences))
        self.assertEqual(Sentence.NONE, sentences[0].nextStartType)
        self.assertEqual(u"kissa\0koira. Koira ja kissa.",
                         sentences[0].sentenceText)

    def testNullCharIsUnknownToken(self):
        tokens = self.voikko.tokens(u"kissa\0koira")
        self.assertEqual(3, len(tokens))
        self.assertEqual(Token.WORD, tokens[0].tokenType)
        self.assertEqual(u"kissa", tokens[0].tokenText)
        self.assertEqual(Token.UNKNOWN, tokens[1].tokenType)
        self.assertEqual(u"\0", tokens[1].tokenText)
        self.assertEqual(Token.WORD, tokens[2].tokenType)
        self.assertEqual(u"koira", tokens[2].tokenText)

        tokens = self.voikko.tokens(u"kissa\0\0koira")
        self.assertEqual(4, len(tokens))
        self.assertEqual(Token.WORD, tokens[0].tokenType)
        self.assertEqual(u"kissa", tokens[0].tokenText)
        self.assertEqual(Token.UNKNOWN, tokens[1].tokenType)
        self.assertEqual(u"\0", tokens[1].tokenText)
        self.assertEqual(Token.UNKNOWN, tokens[2].tokenType)
        self.assertEqual(u"\0", tokens[2].tokenText)
        self.assertEqual(Token.WORD, tokens[3].tokenType)
        self.assertEqual(u"koira", tokens[3].tokenText)

        tokens = self.voikko.tokens(u"kissa\0")
        self.assertEqual(2, len(tokens))
        self.assertEqual(Token.WORD, tokens[0].tokenType)
        self.assertEqual(u"kissa", tokens[0].tokenText)
        self.assertEqual(Token.UNKNOWN, tokens[1].tokenType)
        self.assertEqual(u"\0", tokens[1].tokenText)

        tokens = self.voikko.tokens(u"\0kissa")
        self.assertEqual(2, len(tokens))
        self.assertEqual(Token.UNKNOWN, tokens[0].tokenType)
        self.assertEqual(u"\0", tokens[0].tokenText)
        self.assertEqual(Token.WORD, tokens[1].tokenType)
        self.assertEqual(u"kissa", tokens[1].tokenText)

        tokens = self.voikko.tokens(u"\0")
        self.assertEqual(1, len(tokens))
        self.assertEqual(Token.UNKNOWN, tokens[0].tokenType)
        self.assertEqual(u"\0", tokens[0].tokenText)

        self.assertEqual(0, len(self.voikko.tokens(u"")))

    def testAllCapsAndDot(self):
        self.voikko.setIgnoreDot(True)
        self.failIf(self.voikko.spell(u"ABC-DEF."))

    def testGetVersion(self):
        version = Voikko.getVersion()
        # We can't test for correct version but let's assume it starts with a number
        self.failUnless(re.compile(u"[0-9].*").match(version) is not None)
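
A minimal runner sketch for the tests above, assuming the listing is saved as a standalone module with the imports shown at the top; the upstream project may drive these tests through its own test runner instead.

if __name__ == "__main__":
    unittest.main()
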
Code example #2: a scikit-learn-style morphological feature vectorizer (VoikkoAttributeVectorizer) built on libvoikko
import numpy
from scipy.sparse import csr_matrix

from libvoikko import Token, Voikko


class VoikkoAttributeVectorizer:
	"""Converts a collection of text documents to a matrix of counts of words
	having specific value for enumerated morphological analysis attributes.
	
	Examples
	--------
	>>> from voikko_sklearn import VoikkoAttributeVectorizer
	>>> corpus = [
	...     'Koiran karvat olivat takussa.',
	...     'Kissamme goli vanha.'
	... ]
	>>> vectorizer = VoikkoAttributeVectorizer(['NUMBER', 'PERSON'], langtag='fi')
	>>> print(vectorizer.get_feature_names())
	['unknown', 'NUMBER_plural', 'NUMBER_singular', 'PERSON_1', 'PERSON_2', 'PERSON_3', 'PERSON_4']
	>>> X = vectorizer.transform(corpus)
	>>> print(X.toarray())
	[[0.         0.5        0.5        0.         0.         0.25       0.        ]
	 [0.33333333 0.         0.66666667 0.         0.         0.         0.        ]]
	"""
	
	def __init__(self, attributes, langtag="fi"):
		self.attributes = attributes
		self.voikko = Voikko(langtag)
		self.__init_feature_names()

	def __init_feature_names(self):
		self.feature_names = ['unknown']
		self.feature_name_to_index = {'unknown' : 0}
		for attribute in self.attributes:
			values = self.voikko.attributeValues(attribute)
			if values is None:
				raise ValueError("Attribute '" + attribute + "' does not exist or is not categorial.")
			values.sort()
			for value in values:
				name = attribute + '_' + value
				self.feature_name_to_index[name] = len(self.feature_names)
				self.feature_names.append(name)

	def terminate(self):
		self.voikko.terminate()

	def build_tokenizer(self):
		return lambda text: [token.tokenText for token in self.voikko.tokens(text) if token.tokenType == Token.WORD]

	def get_feature_names(self):
		return self.feature_names

	def __transform_document(self, document, target_vector):
		# Count how often each attribute value occurs among the analyses of the
		# document's words: an ambiguous word contributes 1/count per analysis,
		# an unrecognized word increments the 'unknown' slot, and the vector is
		# finally normalized by the number of words in the document.
		words = self.build_tokenizer()(document)
		wordcount = len(words)
		if wordcount == 0:
			return
		for word in words:
			analysis_list = self.voikko.analyze(word)
			count = len(analysis_list)
			if count == 0:
				target_vector[0] += 1
			else:
				for analysis in analysis_list:
					for attribute in self.attributes:
						if attribute in analysis:
							value = analysis[attribute]
							target_vector[self.feature_name_to_index[attribute + "_" + value]] += 1.0 / count
		target_vector /= wordcount

	def transform(self, document_list):
		document_count = len(document_list)
		vector_length = len(self.feature_names)
		data = numpy.zeros((document_count, vector_length), dtype=numpy.float64)
		for i in range(document_count):
			self.__transform_document(document_list[i], data[i])
		return csr_matrix(data)

	def fit(self, document_list):
		return self

	def fit_transform(self, document_list):
		return self.transform(document_list)
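
A hedged usage sketch for VoikkoAttributeVectorizer with a scikit-learn classifier. It assumes scikit-learn and a Finnish Voikko dictionary are installed; the corpus, the labels, and the choice of MultinomialNB are illustrative only, not part of the vectorizer's API.

if __name__ == "__main__":
	from sklearn.naive_bayes import MultinomialNB  # assumed available (scikit-learn)

	corpus = [
		'Koiran karvat olivat takussa.',
		'Kissamme goli vanha.'
	]
	labels = [0, 1]  # illustrative class labels, one per document

	vectorizer = VoikkoAttributeVectorizer(['NUMBER', 'PERSON'], langtag='fi')
	X = vectorizer.fit(corpus).transform(corpus)  # sparse matrix of normalized attribute-value counts
	classifier = MultinomialNB().fit(X, labels)
	print(classifier.predict(vectorizer.transform(corpus)))
	vectorizer.terminate()  # release the underlying Voikko instance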