def test_get_ngrams(self):
    """Check get_ngrams yields a (size, Counter) pair per requested size.

    NOTE(review): another method later in this class is also named
    test_get_ngrams; in a single class the later definition would shadow
    this one — confirm they live in different classes.
    """
    content = '阿闍世[(禾*尤)\n/上/日]首佛足。敬強阿闍世耶。又'
    text = tacl.Text('test', 'base', content, self._tokenizer)
    trigrams = {
        '阿闍世': 2, '闍世[(禾*尤)/上/日]': 1, '世[(禾*尤)/上/日]首': 1,
        '[(禾*尤)/上/日]首佛': 1, '首佛足': 1, '佛足敬': 1, '足敬強': 1,
        '敬強阿': 1, '強阿闍': 1, '闍世耶': 1, '世耶又': 1,
    }
    quadgrams = {
        '阿闍世[(禾*尤)/上/日]': 1, '闍世[(禾*尤)/上/日]首': 1,
        '世[(禾*尤)/上/日]首佛': 1, '[(禾*尤)/上/日]首佛足': 1,
        '首佛足敬': 1, '佛足敬強': 1, '足敬強阿': 1, '敬強阿闍': 1,
        '強阿闍世': 1, '阿闍世耶': 1, '闍世耶又': 1,
    }
    expected_ngrams = [(3, trigrams), (4, quadgrams)]
    for (actual_size, actual_counts), (size, counts) in zip(
            text.get_ngrams(3, 4), expected_ngrams):
        self.assertEqual(actual_size, size)
        self.assertEqual(actual_counts, collections.Counter(counts))
def test_get_filename(self):
    """Check get_filename builds the expected relative path.

    NOTE(review): a later method in this file is also named
    test_get_filename; within one class the later definition shadows this
    one — confirm they belong to different classes.
    """
    content = '阿闍世[(禾*尤)\n/上/日]首佛足。敬強阿闍世耶。又'
    text = tacl.Text('test', 'base', content, self._tokenizer)
    self.assertEqual(text.get_filename(), 'test/base.txt')
def test_get_tokens(self):
    """Check get_tokens delegates to the tokenizer's tokenize method."""
    content = '阿闍世[(禾*尤)/上/日]首佛足。敬\n強耶。又'
    self._tokenizer.tokenize = MagicMock(return_value=sentinel.tokens)
    text = tacl.Text('test', 'base', content, self._tokenizer)
    tokens = text.get_tokens()
    # tokenize must be called exactly once, with the raw content.
    self._tokenizer.tokenize.assert_called_once_with(content)
    self.assertEqual(tokens, sentinel.tokens)
def test_get_tokens_cbeta(self):
    """Check tokenization of CBETA text keeps a bracketed marker intact.

    The bracketed character description (including its embedded newline)
    must come through as a single token.
    """
    content = '阿闍世[(禾*尤)\n/上/日]首佛足。敬\n強耶。又'
    text = tacl.Text(content, self._tokenizer)
    expected_tokens = ['阿', '闍', '世', '[(禾*尤)\n/上/日]', '首', '佛',
                       '足', '敬', '強', '耶', '又']
    self.assertEqual(text.get_tokens(), expected_tokens)
def test_excise(self):
    """Check excise replaces each listed n-gram with the replacement string."""
    text = tacl.Text('abcd efgh. ije', self._tokenizer)
    actual_content = text.excise(['b', 'de', 'je', 'hij'], 'F')
    self.assertEqual(actual_content, 'aFcFfgFe')
def test_get_text(self):
    """Check Corpus.get_text returns the expected witness text."""
    corpus = tacl.Corpus(self._data_dir, self._tokenizer)
    actual_text = corpus.get_text('T1', 'base')
    expected_text = tacl.Text('T1', 'base', 'then we went\n',
                              self._tokenizer)
    # Compare via checksum and filename, as Text equality is not relied on.
    self.assertEqual(actual_text.get_checksum(),
                     expected_text.get_checksum())
    self.assertEqual(actual_text.get_filename(),
                     expected_text.get_filename())
def test_get_tokens_pagel(self):
    """Check tokenization using the Pagel tokenizer settings.

    NOTE(review): the next method has this same name; in one class the
    later definition shadows this one — confirm they belong to different
    classes.
    """
    content = "bka' stsal pa | rigs kyi\nbu dag de'i || rigs kyi"
    tokenizer = tacl.Tokenizer(tacl.constants.TOKENIZER_PATTERN_PAGEL,
                               tacl.constants.TOKENIZER_JOINER_PAGEL)
    text = tacl.Text(content, tokenizer)
    expected_content = ["bka'", 'stsal', 'pa', 'rigs', 'kyi', 'bu', 'dag',
                        "de'i", 'rigs', 'kyi']
    self.assertEqual(text.get_tokens(), expected_content)
def test_get_tokens_pagel(self):
    """Check tokenization using the Pagel tokenizer settings."""
    content = "bka' stsal pa | rigs kyi\nbu dag de'i || rigs kyi"
    tokenizer = tacl.Tokenizer(tacl.constants.TOKENIZER_PATTERN_PAGEL,
                               tacl.constants.TOKENIZER_JOINER_PAGEL)
    text = tacl.Text('test', 'base', content, tokenizer)
    expected_tokens = ["bka'", "stsal", "pa", "rigs", "kyi", "bu", "dag",
                       "de'i", "rigs", "kyi"]
    self.assertEqual(text.get_tokens(), expected_tokens)
def test_ngrams_pagel(self):
    """Check _ngrams joins Pagel tokens with the Pagel joiner."""
    tokenizer = tacl.Tokenizer(tacl.constants.TOKENIZER_PATTERN_PAGEL,
                               tacl.constants.TOKENIZER_JOINER_PAGEL)
    # The Text's own content is irrelevant here; _ngrams is exercised
    # directly on a supplied token list.
    text = tacl.Text('test', 'base', '', tokenizer)
    tokens = ["dpa'", "sems", "dpa'", "chen", "po", "rnam", "par", "mi"]
    expected_ngrams = ["dpa' sems dpa'", "sems dpa' chen", "dpa' chen po",
                       "chen po rnam", "po rnam par", "rnam par mi"]
    self.assertEqual(expected_ngrams, text._ngrams(tokens, 3))
def test_ngrams_cbeta(self):
    """Check _ngrams concatenates CBETA tokens into size-3 n-grams.

    Note that the expected n-grams do not carry the newline that appears
    inside the bracketed token.
    """
    # The Text's own content is irrelevant here; _ngrams is exercised
    # directly on a supplied token list.
    text = tacl.Text('test', 'base', '', self._tokenizer)
    tokens = ['阿', '闍', '世', '[(禾*尤)\n/上/日]', '首', '佛', '足',
              '敬', '強', '耶', '又']
    expected_ngrams = ['阿闍世', '闍世[(禾*尤)/上/日]', '世[(禾*尤)/上/日]首',
                       '[(禾*尤)/上/日]首佛', '首佛足', '佛足敬', '足敬強',
                       '敬強耶', '強耶又']
    self.assertEqual(expected_ngrams, text._ngrams(tokens, 3))
def test_get_ngrams(self):
    """Check get_ngrams calls _ngrams once per size and counts the results.

    NOTE(review): an earlier method in this file is also named
    test_get_ngrams; within one class this definition shadows it —
    confirm they belong to different classes.
    """
    # Being a static method, a mock of tacl.Text._ngrams using autospec
    # would be non-callable, so patch without autospec.
    ngrams = self._create_patch('tacl.Text._ngrams', False)
    sample_ngrams = ['a', 'b', 'c']
    ngrams.return_value = sample_ngrams
    get_tokens = self._create_patch('tacl.Text.get_tokens')
    get_tokens.return_value = sentinel.tokens
    counts = collections.Counter(sample_ngrams)
    text = tacl.Text('test', 'base', 'test content', self._tokenizer)
    actual_ngrams = list(text.get_ngrams(2, 3))
    get_tokens.assert_called_once_with(text)
    self.assertEqual(ngrams.mock_calls,
                     [call(sentinel.tokens, 2), call(sentinel.tokens, 3)])
    self.assertEqual(actual_ngrams, [(2, counts), (3, counts)])
def test_get_texts(self):
    """Check get_texts yields every witness in the corpus directory."""
    corpus = tacl.Corpus(self._data_dir, self._tokenizer)
    expected_texts = [
        tacl.Text('T1', 'a', 'the we went\n', self._tokenizer),
        tacl.Text('T1', 'base', 'then we went\n', self._tokenizer),
        tacl.Text('T2', 'a', 'thews he sent\n', self._tokenizer),
        tacl.Text('T2', 'base', 'these he sent\n', self._tokenizer),
        tacl.Text('T3', 'base', 'that\n', self._tokenizer),
        tacl.Text('T4', 'base', 'hense\n', self._tokenizer),
        tacl.Text('T5', 'base', 'well\n', self._tokenizer),
    ]
    # Sort by filename so the pairing with expected_texts is stable.
    actual_texts = sorted(corpus.get_texts(),
                          key=lambda text: text.get_filename())
    for actual_text, expected_text in zip(actual_texts, expected_texts):
        self.assertEqual(actual_text.get_filename(),
                         expected_text.get_filename())
        message = 'Checksum of {} does not match expected checksum from supplied {}'.format(
            actual_text.get_filename(), expected_text.get_filename())
        self.assertEqual(actual_text.get_checksum(),
                         expected_text.get_checksum(), message)
def test_get_names(self):
    """Check get_names returns the name pair the Text was created with."""
    text = tacl.Text('T1', 'base', 'test content', self._tokenizer)
    self.assertEqual(text.get_names(), ('T1', 'base'))
def test_content(self):
    """Check the content attribute exposes the constructor's text."""
    text = tacl.Text('test content', self._tokenizer)
    self.assertEqual(text.content, 'test content')
def test_get_checksum(self):
    """Check get_checksum against a known digest.

    NOTE(review): a later method in this file is also named
    test_get_checksum; within one class the later definition shadows this
    one — confirm they belong to different classes.
    """
    content = '阿闍世[(禾*尤)\n/上/日]首佛足。敬強阿闍世耶。又'
    text = tacl.Text('test', 'base', content, self._tokenizer)
    self.assertEqual(text.get_checksum(),
                     'b8f33a481780c4128c1b852488cede88')
def test_get_filename(self):
    """Check get_filename joins the name parts into a relative .txt path."""
    text = tacl.Text('test', 'base', 'test content', self._tokenizer)
    self.assertEqual(text.get_filename(), 'test/base.txt')
def test_get_checksum(self):
    """Check get_checksum against a known digest."""
    content = '阿闍世[(禾*尤)/上/日]首佛足。敬\n強耶。又'
    text = tacl.Text('test', 'base', content, self._tokenizer)
    self.assertEqual(text.get_checksum(),
                     'a94e3a20bc95a93710487611e65484d1')