Ejemplo n.º 1
0
    def test_caumanns(self):
        """Test abydos.stemmer.Caumanns.

        Each (word, expected stem) pair below is passed through
        ``self.stmr.stem`` in order; the ``caumanns`` wrapper function is
        checked last with a single word.
        """
        stem_cases = (
            # base case
            ('', ''),
            # tests from Caumanns' description of the algorithm
            ('singt', 'sing'),
            ('singen', 'sing'),
            ('beliebt', 'belieb'),
            ('beliebtester', 'belieb'),
            ('stören', 'stor'),
            ('stöhnen', 'stoh'),
            ('Kuß', 'kuss'),
            ('Küsse', 'kuss'),
            ('Verlierer', 'verlier'),
            ('Verlies', 'verlie'),
            ('Maus', 'mau'),
            ('Mauer', 'mau'),
            ('Störsender', 'stor'),
            # additional tests to achieve full coverage
            ('Müllerinnen', 'mullerin'),
            ('Matrix', 'matrix'),
            ('Matrizen', 'matrix'),
        )
        for word, expected in stem_cases:
            self.assertEqual(self.stmr.stem(word), expected)

        # Test wrapper
        self.assertEqual(caumanns('singt'), 'sing')
Ejemplo n.º 2
0
    def test_caumanns(self):
        """Test abydos.stemmer.Caumanns.

        Runs the stemmer instance held in ``self.stmr`` over a table of
        words and compares each result with its expected stem, then makes
        one call to the ``caumanns`` wrapper function.
        """
        stem = self.stmr.stem
        check = self.assertEqual

        # base case
        check(stem(''), '')

        # tests from Caumanns' description of the algorithm
        described = (
            ('singt', 'sing'),
            ('singen', 'sing'),
            ('beliebt', 'belieb'),
            ('beliebtester', 'belieb'),
            ('stören', 'stor'),
            ('stöhnen', 'stoh'),
            ('Kuß', 'kuss'),
            ('Küsse', 'kuss'),
            ('Verlierer', 'verlier'),
            ('Verlies', 'verlie'),
            ('Maus', 'mau'),
            ('Mauer', 'mau'),
            ('Störsender', 'stor'),
        )
        for source, target in described:
            check(stem(source), target)

        # additional tests to achieve full coverage
        coverage = (
            ('Müllerinnen', 'mullerin'),
            ('Matrix', 'matrix'),
            ('Matrizen', 'matrix'),
        )
        for source, target in coverage:
            check(stem(source), target)

        # Test wrapper
        check(caumanns('singt'), 'sing')
Ejemplo n.º 3
0
    def test_caumanns_lucene(self):
        """Test abydos.stemmer.caumanns (Lucene tests).

        Every (word, expected stem) pair in the table is passed to the
        ``caumanns`` function in the order listed.

        Based on tests from
        https://svn.apache.org/repos/asf/lucene.net/trunk/test/contrib/Analyzers/De/data.txt
        This is presumably Apache-licensed.
        """
        lucene_cases = (
            # German special characters are replaced:
            ('häufig', 'haufig'),
            ('üor', 'uor'),
            ('björk', 'bjork'),
            # here the stemmer works okay, it maps related words to the
            # same stem:
            ('abschließen', 'abschliess'),
            ('abschließender', 'abschliess'),
            ('abschließendes', 'abschliess'),
            ('abschließenden', 'abschliess'),
            ('Tisch', 'tisch'),
            ('Tische', 'tisch'),
            ('Tischen', 'tisch'),
            ('geheimtür', 'geheimtur'),
            ('Haus', 'hau'),
            ('Hauses', 'hau'),
            ('Häuser', 'hau'),
            ('Häusern', 'hau'),
            # here's a case where overstemming occurs, i.e. a word is
            # mapped to the same stem as unrelated words:
            ('hauen', 'hau'),
            # here's a case where understemming occurs, i.e. two related
            # words are not mapped to the same stem. This is the case with
            # basically all irregular forms:
            ('Drama', 'drama'),
            ('Dramen', 'dram'),
            # replace "ß" with 'ss':
            ('Ausmaß', 'ausmass'),
            # fake words to test if suffixes are cut off:
            ('xxxxxe', 'xxxxx'),
            ('xxxxxs', 'xxxxx'),
            ('xxxxxn', 'xxxxx'),
            ('xxxxxt', 'xxxxx'),
            ('xxxxxem', 'xxxxx'),
            ('xxxxxer', 'xxxxx'),
            ('xxxxxnd', 'xxxxx'),
            # the suffixes are also removed when combined:
            ('xxxxxetende', 'xxxxx'),
            # words that are shorter than four characters are not changed:
            ('xxe', 'xxe'),
            # -em and -er are not removed from words shorter than five
            # characters:
            ('xxem', 'xxem'),
            ('xxer', 'xxer'),
            # -nd is not removed from words shorter than six characters:
            ('xxxnd', 'xxxnd'),
        )
        for word, expected in lucene_cases:
            self.assertEqual(caumanns(word), expected)
Ejemplo n.º 4
0
    def test_caumanns_lucene(self):
        """Test abydos.stemmer.caumanns (Lucene tests).

        The cases are organised into groups by what they illustrate; the
        groups, and the pairs inside each group, are checked against the
        ``caumanns`` function in the order written.

        Based on tests from
        https://svn.apache.org/repos/asf/lucene.net/trunk/test/contrib/Analyzers/De/data.txt
        This is presumably Apache-licensed.
        """
        # German special characters are replaced:
        special_chars = (
            ('häufig', 'haufig'),
            ('üor', 'uor'),
            ('björk', 'bjork'),
        )

        # here the stemmer works okay, it maps related words to the same stem:
        related_words = (
            ('abschließen', 'abschliess'),
            ('abschließender', 'abschliess'),
            ('abschließendes', 'abschliess'),
            ('abschließenden', 'abschliess'),
            ('Tisch', 'tisch'),
            ('Tische', 'tisch'),
            ('Tischen', 'tisch'),
            ('geheimtür', 'geheimtur'),
            ('Haus', 'hau'),
            ('Hauses', 'hau'),
            ('Häuser', 'hau'),
            ('Häusern', 'hau'),
        )

        # here's a case where overstemming occurs, i.e. a word is
        # mapped to the same stem as unrelated words:
        overstemmed = (('hauen', 'hau'),)

        # here's a case where understemming occurs, i.e. two related words
        # are not mapped to the same stem. This is the case with basically
        # all irregular forms:
        understemmed = (
            ('Drama', 'drama'),
            ('Dramen', 'dram'),
        )

        # replace "ß" with 'ss':
        eszett = (('Ausmaß', 'ausmass'),)

        # fake words to test if suffixes are cut off; the last entry shows
        # that the suffixes are also removed when combined:
        suffixes = (
            ('xxxxxe', 'xxxxx'),
            ('xxxxxs', 'xxxxx'),
            ('xxxxxn', 'xxxxx'),
            ('xxxxxt', 'xxxxx'),
            ('xxxxxem', 'xxxxx'),
            ('xxxxxer', 'xxxxx'),
            ('xxxxxnd', 'xxxxx'),
            ('xxxxxetende', 'xxxxx'),
        )

        # words that are shorter than four characters are not changed;
        # -em and -er are not removed from words shorter than five
        # characters; -nd is not removed from words shorter than six
        # characters:
        too_short = (
            ('xxe', 'xxe'),
            ('xxem', 'xxem'),
            ('xxer', 'xxer'),
            ('xxxnd', 'xxxnd'),
        )

        groups = (
            special_chars,
            related_words,
            overstemmed,
            understemmed,
            eszett,
            suffixes,
            too_short,
        )
        for group in groups:
            for term, stemmed in group:
                self.assertEqual(caumanns(term), stemmed)