class TestSimpleFeatureGenerator(unittest.TestCase):
    """Tests for SimpleFeatureGenerator and SentenceMarkerFeatureGenerator.

    Both generators are run against the same small dataset: one document
    with one part that holds two sentences of three tokens each.
    """

    def setUp(self):
        """Build the dataset and the two generators under test."""
        part = Part('Word1 word2 word3. Word4 word5 word6.')
        # The number given to each Token coincides with the word's character
        # position in the part text above.
        part.sentences = [
            [Token('Word1', 0), Token('word2', 6), Token('word3', 12)],
            [Token('Word4', 19), Token('word5', 25), Token('word6', 31)],
        ]

        self.dataset = Dataset()
        self.dataset.documents['doc_1'] = Document()
        self.dataset.documents['doc_1'].parts['part_1'] = part

        self.simple_generator = SimpleFeatureGenerator()
        self.sentence_generator = SentenceMarkerFeatureGenerator()

    def test_simple_generate(self):
        """Each token gets a single 'word[0]' feature holding its own word."""
        self.simple_generator.generate(self.dataset)
        features = [token.features for token in self.dataset.tokens()]

        expected = [
            {'word[0]': 'Word1'},
            {'word[0]': 'word2'},
            {'word[0]': 'word3'},
            {'word[0]': 'Word4'},
            {'word[0]': 'word5'},
            {'word[0]': 'word6'},
        ]

        # Compare the whole sequence at once so that missing or extra tokens
        # fail the test, instead of passing vacuously (too few tokens) or
        # erroring with StopIteration (too many tokens).
        self.assertEqual(features, expected)

    def test_sentence_generate(self):
        """First tokens are marked 'BOS[0]', last tokens 'EOS[0]', others get nothing."""
        self.sentence_generator.generate(self.dataset)
        features = [token.features for token in self.dataset.tokens()]

        expected = [
            {'BOS[0]': 1},
            {},
            {'EOS[0]': 1},
            {'BOS[0]': 1},
            {},
            {'EOS[0]': 1},
        ]

        # Whole-sequence comparison also verifies the number of tokens.
        self.assertEqual(features, expected)
class TestPorterStemFeatureGenerator(unittest.TestCase):
    """Tests for PorterStemFeatureGenerator.

    The generator is run against one document with one part that holds two
    sentences of three tokens each.
    """

    def setUp(self):
        """Build the dataset and the generator under test."""
        part = Part('Make making made. Try tried tries.')
        # The number given to each Token coincides with the word's character
        # position in the part text above.
        part.sentences = [
            [Token('Make', 0), Token('making', 5), Token('made', 12)],
            [Token('Try', 18), Token('tried', 22), Token('tries', 28)],
        ]

        self.dataset = Dataset()
        self.dataset.documents['doc_1'] = Document()
        self.dataset.documents['doc_1'].parts['part_1'] = part

        self.generator = PorterStemFeatureGenerator()

    def test_generate(self):
        """Each token gets a single 'stem[0]' feature with the expected stem."""
        self.generator.generate(self.dataset)
        features = [token.features for token in self.dataset.tokens()]

        expected = [
            {'stem[0]': 'make'},
            {'stem[0]': 'make'},
            {'stem[0]': 'made'},
            {'stem[0]': 'tri'},
            {'stem[0]': 'tri'},
            {'stem[0]': 'tri'},
        ]

        # Compare the whole sequence at once so that missing or extra tokens
        # fail the test, instead of passing vacuously (too few tokens) or
        # erroring with StopIteration (too many tokens).
        self.assertEqual(features, expected)
class TestWindowFeatureGenerator(unittest.TestCase):
    """Tests for WindowFeatureGenerator.

    Every token starts with two features, 'a' and 'b'. After generation the
    tests expect keys of the form 'name[offset]'. Both three-token sentences
    are expected to produce the same features at the same token position, so
    each test lists its expectations once per position.
    """

    def setUp(self):
        """Build a two-sentence dataset and seed every token with 'a' and 'b'."""
        part = Part('Make making made. Try tried tries.')
        first_sentence = [Token('Make', 0), Token('making', 5), Token('made', 12)]
        second_sentence = [Token('Try', 18), Token('tried', 22), Token('tries', 28)]
        part.sentences = [first_sentence, second_sentence]

        self.dataset = Dataset()
        self.dataset.documents['doc_1'] = Document()
        self.dataset.documents['doc_1'].parts['part_1'] = part

        for token in self.dataset.tokens():
            for name in ('a', 'b'):
                token.features[name] = name

    def _assert_features_by_position(self, expected_by_position):
        """Check both sentences, token by token, against the given expectations.

        expected_by_position[i] is the feature dict expected for the token at
        index i of sentence 0 and of sentence 1.
        """
        sentences = self.dataset.documents['doc_1'].parts['part_1'].sentences
        for sentence_index in (0, 1):
            for position, expected in enumerate(expected_by_position):
                self.assertEqual(
                    sentences[sentence_index][position].features, expected)

    def test_default_window(self):
        """Default generator: offsets -2..2, limited to tokens inside the sentence."""
        WindowFeatureGenerator().generate(self.dataset)

        at_start = {
            'a[0]': 'a', 'a[1]': 'a', 'a[2]': 'a',
            'b[0]': 'b', 'b[1]': 'b', 'b[2]': 'b',
        }
        at_middle = {
            'a[-1]': 'a', 'a[0]': 'a', 'a[1]': 'a',
            'b[-1]': 'b', 'b[0]': 'b', 'b[1]': 'b',
        }
        at_end = {
            'a[-2]': 'a', 'a[-1]': 'a', 'a[0]': 'a',
            'b[-2]': 'b', 'b[-1]': 'b', 'b[0]': 'b',
        }
        self._assert_features_by_position([at_start, at_middle, at_end])

    def test_custom_window(self):
        """template=(-2, 1): only offsets -2, 0 and 1 are expected."""
        WindowFeatureGenerator(template=(-2, 1)).generate(self.dataset)

        at_start = {'a[0]': 'a', 'a[1]': 'a', 'b[0]': 'b', 'b[1]': 'b'}
        at_middle = {'a[0]': 'a', 'a[1]': 'a', 'b[0]': 'b', 'b[1]': 'b'}
        at_end = {'a[-2]': 'a', 'a[0]': 'a', 'b[-2]': 'b', 'b[0]': 'b'}
        self._assert_features_by_position([at_start, at_middle, at_end])

    def test_include_list(self):
        """include_list=['a[0]']: 'a' is windowed, 'b' is expected only at offset 0."""
        WindowFeatureGenerator(include_list=['a[0]']).generate(self.dataset)

        at_start = {'a[0]': 'a', 'a[1]': 'a', 'a[2]': 'a', 'b[0]': 'b'}
        at_middle = {'a[-1]': 'a', 'a[0]': 'a', 'a[1]': 'a', 'b[0]': 'b'}
        at_end = {'a[-2]': 'a', 'a[-1]': 'a', 'a[0]': 'a', 'b[0]': 'b'}
        self._assert_features_by_position([at_start, at_middle, at_end])