コード例 #1
0
 def test_get_ngrams(self):
     """N-grams over CBETA content treat a bracketed variant reading
     (here containing a newline) as a single token, and drop
     punctuation and newlines from the generated n-grams.
     """
     content = '阿闍世[(禾*尤)\n/上/日]首佛足。敬強阿闍世耶。又'
     text = tacl.Text('test', 'base', content, self._tokenizer)
     expected_ngrams = [
         (3, {
             '阿闍世': 2,
             '闍世[(禾*尤)/上/日]': 1,
             '世[(禾*尤)/上/日]首': 1,
             '[(禾*尤)/上/日]首佛': 1,
             '首佛足': 1,
             '佛足敬': 1,
             '足敬強': 1,
             '敬強阿': 1,
             '強阿闍': 1,
             '闍世耶': 1,
             '世耶又': 1
         }),
         (4, {
             '阿闍世[(禾*尤)/上/日]': 1,
             '闍世[(禾*尤)/上/日]首': 1,
             '世[(禾*尤)/上/日]首佛': 1,
             '[(禾*尤)/上/日]首佛足': 1,
             '首佛足敬': 1,
             '佛足敬強': 1,
             '足敬強阿': 1,
             '敬強阿闍': 1,
             '強阿闍世': 1,
             '阿闍世耶': 1,
             '闍世耶又': 1
         }),
     ]
     # Materialise the generator and check its length first: zip()
     # alone would silently pass if get_ngrams yielded fewer (or no)
     # results than expected.
     actual_ngrams = list(text.get_ngrams(3, 4))
     self.assertEqual(len(actual_ngrams), len(expected_ngrams))
     for actual, expected in zip(actual_ngrams, expected_ngrams):
         self.assertEqual(actual[0], expected[0])
         self.assertEqual(actual[1], collections.Counter(expected[1]))
コード例 #2
0
 def test_get_filename(self):
     """The filename is derived from the work and siglum names."""
     content = '阿闍世[(禾*尤)\n/上/日]首佛足。敬強阿闍世耶。又'
     text = tacl.Text('test', 'base', content, self._tokenizer)
     self.assertEqual(text.get_filename(), 'test/base.txt')
コード例 #3
0
 def test_get_tokens(self):
     """get_tokens delegates to the tokenizer and returns its result
     unchanged.
     """
     content = '阿闍世[(禾*尤)/上/日]首佛足。敬\n強耶。又'
     self._tokenizer.tokenize = MagicMock(return_value=sentinel.tokens)
     text = tacl.Text('test', 'base', content, self._tokenizer)
     tokens = text.get_tokens()
     self._tokenizer.tokenize.assert_called_once_with(content)
     self.assertEqual(tokens, sentinel.tokens)
コード例 #4
0
ファイル: text_test.py プロジェクト: sebastian-nehrdich/tacl
 def test_get_tokens_cbeta(self):
     """CBETA tokenization yields one token per CJK character, keeps a
     bracketed variant reading (including its embedded newline) as a
     single token, and drops punctuation and bare newlines.
     """
     content = '阿闍世[(禾*尤)\n/上/日]首佛足。敬\n強耶。又'
     text = tacl.Text(content, self._tokenizer)
     expected = ['阿', '闍', '世', '[(禾*尤)\n/上/日]', '首', '佛', '足',
                 '敬', '強', '耶', '又']
     self.assertEqual(text.get_tokens(), expected)
コード例 #5
0
ファイル: text_test.py プロジェクト: ajenhl/tacl
 def test_excise(self):
     """Every occurrence of each excised n-gram is replaced by the
     single replacement string.
     """
     text = tacl.Text('abcd efgh. ije', self._tokenizer)
     result = text.excise(['b', 'de', 'je', 'hij'], 'F')
     self.assertEqual(result, 'aFcFfgFe')
コード例 #6
0
 def test_get_text(self):
     """A text fetched from the corpus by work and siglum matches the
     expected content (via checksum) and filename.
     """
     corpus = tacl.Corpus(self._data_dir, self._tokenizer)
     fetched = corpus.get_text('T1', 'base')
     reference = tacl.Text('T1', 'base', 'then we went\n',
                           self._tokenizer)
     self.assertEqual(fetched.get_checksum(), reference.get_checksum())
     self.assertEqual(fetched.get_filename(), reference.get_filename())
コード例 #7
0
ファイル: text_test.py プロジェクト: sebastian-nehrdich/tacl
 def test_get_tokens_pagel(self):
     """Pagel tokenization splits on whitespace/newlines and discards
     the | and || shad marks.
     """
     content = "bka' stsal pa  | rigs kyi\nbu dag de'i || rigs kyi"
     tokenizer = tacl.Tokenizer(tacl.constants.TOKENIZER_PATTERN_PAGEL,
                                tacl.constants.TOKENIZER_JOINER_PAGEL)
     text = tacl.Text(content, tokenizer)
     expected = ["bka'", 'stsal', 'pa', 'rigs', 'kyi', 'bu', 'dag',
                 "de'i", 'rigs', 'kyi']
     self.assertEqual(text.get_tokens(), expected)
コード例 #8
0
 def test_get_tokens_pagel(self):
     """Pagel tokenization splits on whitespace/newlines and discards
     the | and || shad marks.
     """
     content = "bka' stsal pa | rigs kyi\nbu dag de'i || rigs kyi"
     tokenizer = tacl.Tokenizer(tacl.constants.TOKENIZER_PATTERN_PAGEL,
                                tacl.constants.TOKENIZER_JOINER_PAGEL)
     text = tacl.Text('test', 'base', content, tokenizer)
     expected = ["bka'", "stsal", "pa", "rigs", "kyi", "bu", "dag",
                 "de'i", "rigs", "kyi"]
     self.assertEqual(text.get_tokens(), expected)
コード例 #9
0
 def test_ngrams_pagel(self):
     """_ngrams joins Pagel tokens with the Pagel joiner (a space) when
     forming each n-gram.
     """
     content = ''
     tokenizer = tacl.Tokenizer(tacl.constants.TOKENIZER_PATTERN_PAGEL,
                                tacl.constants.TOKENIZER_JOINER_PAGEL)
     text = tacl.Text('test', 'base', content, tokenizer)
     tokens = ["dpa'", "sems", "dpa'", "chen", "po", "rnam", "par", "mi"]
     expected_ngrams = [
         "dpa' sems dpa'", "sems dpa' chen", "dpa' chen po", "chen po rnam",
         "po rnam par", "rnam par mi"
     ]
     actual_ngrams = text._ngrams(tokens, 3)
     # (actual, expected) order, consistent with the other tests here,
     # so failure messages read correctly.
     self.assertEqual(actual_ngrams, expected_ngrams)
コード例 #10
0
 def test_ngrams_cbeta(self):
     """_ngrams joins CBETA tokens with the (empty) CBETA joiner and
     strips the newline inside a bracketed variant token.
     """
     content = ''
     text = tacl.Text('test', 'base', content, self._tokenizer)
     tokens = [
         '阿', '闍', '世', '[(禾*尤)\n/上/日]', '首', '佛', '足', '敬', '強', '耶', '又'
     ]
     expected_ngrams = [
         '阿闍世', '闍世[(禾*尤)/上/日]', '世[(禾*尤)/上/日]首', '[(禾*尤)/上/日]首佛', '首佛足',
         '佛足敬', '足敬強', '敬強耶', '強耶又'
     ]
     actual_ngrams = text._ngrams(tokens, 3)
     # (actual, expected) order, consistent with the other tests here,
     # so failure messages read correctly.
     self.assertEqual(actual_ngrams, expected_ngrams)
コード例 #11
0
 def test_get_ngrams(self):
     """get_ngrams tokenizes once and yields one (size, Counter) pair
     per requested n-gram size.
     """
     # _ngrams is a static method, so a mock created with autospec
     # would be non-callable; patch without autospec.
     ngrams_mock = self._create_patch('tacl.Text._ngrams', False)
     ngrams_mock.return_value = ['a', 'b', 'c']
     tokens_mock = self._create_patch('tacl.Text.get_tokens')
     tokens_mock.return_value = sentinel.tokens
     text = tacl.Text('test', 'base', 'test content', self._tokenizer)
     counter = collections.Counter(['a', 'b', 'c'])
     actual = list(text.get_ngrams(2, 3))
     tokens_mock.assert_called_once_with(text)
     self.assertEqual(
         ngrams_mock.mock_calls,
         [call(sentinel.tokens, 2), call(sentinel.tokens, 3)])
     self.assertEqual(actual, [(2, counter), (3, counter)])
コード例 #12
0
 def test_get_texts(self):
     """Every text in the corpus directory is returned, and each one
     matches its expected filename and checksum.
     """
     corpus = tacl.Corpus(self._data_dir, self._tokenizer)
     expected_texts = [
         tacl.Text('T1', 'a', 'the we went\n', self._tokenizer),
         tacl.Text('T1', 'base', 'then we went\n', self._tokenizer),
         tacl.Text('T2', 'a', 'thews he sent\n', self._tokenizer),
         tacl.Text('T2', 'base', 'these he sent\n', self._tokenizer),
         tacl.Text('T3', 'base', 'that\n', self._tokenizer),
         tacl.Text('T4', 'base', 'hense\n', self._tokenizer),
         tacl.Text('T5', 'base', 'well\n', self._tokenizer)
     ]
     # Sort by filename so the pairing with expected_texts is stable.
     actual_texts = sorted(corpus.get_texts(),
                           key=lambda text: text.get_filename())
     for actual_text, expected_text in zip(actual_texts, expected_texts):
         actual_name = actual_text.get_filename()
         expected_name = expected_text.get_filename()
         self.assertEqual(actual_name, expected_name)
         message = 'Checksum of {} does not match expected checksum from supplied {}'.format(
             actual_name, expected_name)
         self.assertEqual(actual_text.get_checksum(),
                          expected_text.get_checksum(), message)
コード例 #13
0
 def test_get_names(self):
     """get_names returns the (work, siglum) pair the text was
     constructed with.
     """
     text = tacl.Text('T1', 'base', 'test content', self._tokenizer)
     self.assertEqual(text.get_names(), ('T1', 'base'))
コード例 #14
0
ファイル: text_test.py プロジェクト: ajenhl/tacl
 def test_content(self):
     """The content attribute exposes the raw text unchanged."""
     raw = 'test content'
     self.assertEqual(tacl.Text(raw, self._tokenizer).content, raw)
コード例 #15
0
 def test_get_checksum(self):
     """The checksum is stable for a given content string (including
     its embedded newline).
     """
     content = '阿闍世[(禾*尤)\n/上/日]首佛足。敬強阿闍世耶。又'
     text = tacl.Text('test', 'base', content, self._tokenizer)
     self.assertEqual(text.get_checksum(),
                      'b8f33a481780c4128c1b852488cede88')
コード例 #16
0
 def test_get_filename(self):
     """The filename is '<work>/<siglum>.txt'."""
     text = tacl.Text('test', 'base', 'test content', self._tokenizer)
     self.assertEqual(text.get_filename(), 'test/base.txt')
コード例 #17
0
 def test_get_checksum(self):
     """The checksum is stable for a given content string."""
     content = '阿闍世[(禾*尤)/上/日]首佛足。敬\n強耶。又'
     text = tacl.Text('test', 'base', content, self._tokenizer)
     self.assertEqual(text.get_checksum(),
                      'a94e3a20bc95a93710487611e65484d1')