Beispiel #1
0
class TestSimpleFeatureGenerator(unittest.TestCase):
    """Tests for SimpleFeatureGenerator and SentenceMarkerFeatureGenerator.

    Both generators are run against the same small dataset: one document
    with one part that holds two sentences of three tokens each.
    """

    def setUp(self):
        """Build the shared dataset and instantiate both generators."""
        part = Part('Word1 word2 word3. Word4 word5 word6.')
        # The second Token argument matches each word's character offset
        # inside the part text above.
        part.sentences = [
            [Token('Word1', 0), Token('word2', 6), Token('word3', 12)],
            [Token('Word4', 19), Token('word5', 25), Token('word6', 31)],
        ]

        self.dataset = Dataset()
        self.dataset.documents['doc_1'] = Document()
        self.dataset.documents['doc_1'].parts['part_1'] = part

        self.simple_generator = SimpleFeatureGenerator()
        self.sentence_generator = SentenceMarkerFeatureGenerator()

    def test_simple_generate(self):
        """Every token ends up with exactly one 'word[0]' feature: its text."""
        self.simple_generator.generate(self.dataset)
        features = [token.features for token in self.dataset.tokens()]
        expected = [
            {'word[0]': 'Word1'},
            {'word[0]': 'word2'},
            {'word[0]': 'word3'},
            {'word[0]': 'Word4'},
            {'word[0]': 'word5'},
            {'word[0]': 'word6'},
        ]
        # Compare the whole lists so that a missing (or extra) token fails
        # the test instead of being silently skipped by a short loop.
        self.assertEqual(features, expected)

    def test_sentence_generate(self):
        """First tokens get 'BOS[0]', last tokens 'EOS[0]', others nothing."""
        self.sentence_generator.generate(self.dataset)
        features = [token.features for token in self.dataset.tokens()]
        expected = [
            {'BOS[0]': 1},
            {},
            {'EOS[0]': 1},
            {'BOS[0]': 1},
            {},
            {'EOS[0]': 1},
        ]
        # Whole-list comparison also verifies the number of tokens produced.
        self.assertEqual(features, expected)
Beispiel #2
0
class TestPorterStemFeatureGenerator(unittest.TestCase):
    """Tests for PorterStemFeatureGenerator on a two-sentence dataset."""

    def setUp(self):
        """Build a dataset with inflected word forms and the generator."""
        part = Part('Make making made. Try tried tries.')
        # The second Token argument matches each word's character offset
        # inside the part text above.
        part.sentences = [
            [Token('Make', 0), Token('making', 5), Token('made', 12)],
            [Token('Try', 18), Token('tried', 22), Token('tries', 28)],
        ]

        self.dataset = Dataset()
        self.dataset.documents['doc_1'] = Document()
        self.dataset.documents['doc_1'].parts['part_1'] = part

        self.generator = PorterStemFeatureGenerator()

    def test_generate(self):
        """Every token ends up with exactly one 'stem[0]' feature."""
        self.generator.generate(self.dataset)
        features = [token.features for token in self.dataset.tokens()]
        expected = [
            {'stem[0]': 'make'},
            {'stem[0]': 'make'},
            {'stem[0]': 'made'},
            {'stem[0]': 'tri'},
            {'stem[0]': 'tri'},
            {'stem[0]': 'tri'},
        ]
        # Compare the whole lists so that a missing (or extra) token fails
        # the test instead of being silently skipped by a short loop.
        self.assertEqual(features, expected)
Beispiel #3
0
class TestWindowFeatureGenerator(unittest.TestCase):
    """Tests for WindowFeatureGenerator.

    Each test runs the generator on two three-token sentences whose tokens
    all start with the features 'a' and 'b', then checks the resulting
    feature dictionaries token by token. Both sentences are expected to
    yield the same dictionaries at the same token positions.
    """

    def setUp(self):
        """Create the dataset and seed every token with 'a' and 'b'."""
        first_sentence = [Token('Make', 0), Token('making', 5), Token('made', 12)]
        second_sentence = [Token('Try', 18), Token('tried', 22), Token('tries', 28)]

        part = Part('Make making made. Try tried tries.')
        part.sentences = [first_sentence, second_sentence]

        document = Document()
        self.dataset = Dataset()
        self.dataset.documents['doc_1'] = document
        self.dataset.documents['doc_1'].parts['part_1'] = part

        for token in self.dataset.tokens():
            for key in ('a', 'b'):
                token.features[key] = key

    def test_default_window(self):
        """Without arguments, offsets -2..2 are produced where a token exists."""
        WindowFeatureGenerator().generate(self.dataset)
        sentences = self.dataset.documents['doc_1'].parts['part_1'].sentences

        # Expected features indexed by the token's position in its sentence.
        expected_by_position = (
            {'a[0]': 'a', 'a[1]': 'a', 'a[2]': 'a',
             'b[0]': 'b', 'b[1]': 'b', 'b[2]': 'b'},
            {'a[-1]': 'a', 'a[0]': 'a', 'a[1]': 'a',
             'b[-1]': 'b', 'b[0]': 'b', 'b[1]': 'b'},
            {'a[-2]': 'a', 'a[-1]': 'a', 'a[0]': 'a',
             'b[-2]': 'b', 'b[-1]': 'b', 'b[0]': 'b'},
        )
        for sentence_index in (0, 1):
            for position, expected in enumerate(expected_by_position):
                self.assertEqual(
                    sentences[sentence_index][position].features, expected)

    def test_custom_window(self):
        """With template=(-2, 1), only offsets -2 and 1 join the [0] entries."""
        WindowFeatureGenerator(template=(-2, 1)).generate(self.dataset)
        sentences = self.dataset.documents['doc_1'].parts['part_1'].sentences

        # Expected features indexed by the token's position in its sentence.
        expected_by_position = (
            {'a[0]': 'a', 'a[1]': 'a', 'b[0]': 'b', 'b[1]': 'b'},
            {'a[0]': 'a', 'a[1]': 'a', 'b[0]': 'b', 'b[1]': 'b'},
            {'a[-2]': 'a', 'a[0]': 'a', 'b[-2]': 'b', 'b[0]': 'b'},
        )
        for sentence_index in (0, 1):
            for position, expected in enumerate(expected_by_position):
                self.assertEqual(
                    sentences[sentence_index][position].features, expected)

    def test_include_list(self):
        """With include_list=['a[0]'], only 'a' gets neighbouring offsets."""
        WindowFeatureGenerator(include_list=['a[0]']).generate(self.dataset)
        sentences = self.dataset.documents['doc_1'].parts['part_1'].sentences

        # Expected features indexed by the token's position in its sentence.
        expected_by_position = (
            {'a[0]': 'a', 'a[1]': 'a', 'a[2]': 'a', 'b[0]': 'b'},
            {'a[-1]': 'a', 'a[0]': 'a', 'a[1]': 'a', 'b[0]': 'b'},
            {'a[-2]': 'a', 'a[-1]': 'a', 'a[0]': 'a', 'b[0]': 'b'},
        )
        for sentence_index in (0, 1):
            for position, expected in enumerate(expected_by_position):
                self.assertEqual(
                    sentences[sentence_index][position].features, expected)