def test_reduce_diff_results_composed(self):
    """Reduction keeps only fully composed n-grams.

    Diffing a text "abcdefg" against "abcABCDEFdGHIefg", an n-gram
    survives only when it is built from two (n-1)-grams that are
    themselves present in the results.
    """
    data_store = tacl.DataStore(':memory:')
    cbeta_tokenizer = tacl.Tokenizer(*tacl.constants.TOKENIZERS['cbeta'])
    rows_in = (
        # 2-grams from witness "a".
        ['cA', '2', 'a', 'base', '1', 'A'],
        ['AB', '2', 'a', 'base', '1', 'A'],
        ['B[C]', '2', 'a', 'base', '1', 'A'],
        ['[C]D', '2', 'a', 'base', '1', 'A'],
        ['DE', '2', 'a', 'base', '1', 'A'],
        ['EF', '2', 'a', 'base', '1', 'A'],
        ['Fd', '2', 'a', 'base', '1', 'A'],
        ['dG', '2', 'a', 'base', '1', 'A'],
        ['GH', '2', 'a', 'base', '1', 'A'],
        ['HI', '2', 'a', 'base', '1', 'A'],
        ['Ie', '2', 'a', 'base', '1', 'A'],
        # 2-grams from witness "b".
        ['cd', '2', 'b', 'base', '1', 'B'],
        ['de', '2', 'b', 'base', '1', 'B'],
        # 3-grams from witness "a".
        ['bcA', '3', 'a', 'base', '1', 'A'],
        ['cAB', '3', 'a', 'base', '1', 'A'],
        ['AB[C]', '3', 'a', 'base', '1', 'A'],
        ['B[C]D', '3', 'a', 'base', '1', 'A'],
        ['[C]DE', '3', 'a', 'base', '1', 'A'],
        ['DEF', '3', 'a', 'base', '1', 'A'],
        ['EFd', '3', 'a', 'base', '1', 'A'],
        ['FdG', '3', 'a', 'base', '1', 'A'],
        ['dGH', '3', 'a', 'base', '1', 'A'],
        ['GHI', '3', 'a', 'base', '1', 'A'],
        ['HIe', '3', 'a', 'base', '1', 'A'],
        ['Ief', '3', 'a', 'base', '1', 'A'],
        # 3-grams from witness "b".
        ['bcd', '3', 'b', 'base', '1', 'B'],
        ['cde', '3', 'b', 'base', '1', 'B'],
        ['def', '3', 'b', 'base', '1', 'B'],
        # 4-grams from witness "a".
        ['abcA', '4', 'a', 'base', '1', 'A'],
        ['bcAB', '4', 'a', 'base', '1', 'A'],
        ['cAB[C]', '4', 'a', 'base', '1', 'A'],
        ['AB[C]D', '4', 'a', 'base', '1', 'A'],
        ['B[C]DE', '4', 'a', 'base', '1', 'A'],
        ['[C]DEF', '4', 'a', 'base', '1', 'A'],
        ['DEFd', '4', 'a', 'base', '1', 'A'],
        ['EFdG', '4', 'a', 'base', '1', 'A'],
        ['FdGH', '4', 'a', 'base', '1', 'A'],
        ['dGHI', '4', 'a', 'base', '1', 'A'],
        ['GHIe', '4', 'a', 'base', '1', 'A'],
        ['HIef', '4', 'a', 'base', '1', 'A'],
        ['Iefg', '4', 'a', 'base', '1', 'A'],
        # 4-grams from witness "b".
        ['abcd', '4', 'b', 'base', '1', 'B'],
        ['bcde', '4', 'b', 'base', '1', 'B'],
        ['cdef', '4', 'b', 'base', '1', 'B'],
        ['defg', '4', 'b', 'base', '1', 'B'],
        # 5-grams from witness "b".
        ['abcde', '5', 'b', 'base', '1', 'B'],
        ['bcdef', '5', 'b', 'base', '1', 'B'],
        ['cdefg', '5', 'b', 'base', '1', 'B'],
    )
    expected = [
        tacl.constants.QUERY_FIELDNAMES,
        # All 2-grams are kept: there is nothing below them to
        # compose from.
        ('cA', '2', 'a', 'base', '1', 'A'),
        ('AB', '2', 'a', 'base', '1', 'A'),
        ('B[C]', '2', 'a', 'base', '1', 'A'),
        ('[C]D', '2', 'a', 'base', '1', 'A'),
        ('DE', '2', 'a', 'base', '1', 'A'),
        ('EF', '2', 'a', 'base', '1', 'A'),
        ('Fd', '2', 'a', 'base', '1', 'A'),
        ('dG', '2', 'a', 'base', '1', 'A'),
        ('GH', '2', 'a', 'base', '1', 'A'),
        ('HI', '2', 'a', 'base', '1', 'A'),
        ('Ie', '2', 'a', 'base', '1', 'A'),
        ('cd', '2', 'b', 'base', '1', 'B'),
        ('de', '2', 'b', 'base', '1', 'B'),
        # Fully composed 3-grams.
        ('cAB', '3', 'a', 'base', '1', 'A'),
        ('AB[C]', '3', 'a', 'base', '1', 'A'),
        ('B[C]D', '3', 'a', 'base', '1', 'A'),
        ('[C]DE', '3', 'a', 'base', '1', 'A'),
        ('DEF', '3', 'a', 'base', '1', 'A'),
        ('EFd', '3', 'a', 'base', '1', 'A'),
        ('FdG', '3', 'a', 'base', '1', 'A'),
        ('dGH', '3', 'a', 'base', '1', 'A'),
        ('GHI', '3', 'a', 'base', '1', 'A'),
        ('HIe', '3', 'a', 'base', '1', 'A'),
        ('cde', '3', 'b', 'base', '1', 'B'),
        # Fully composed 4-grams.
        ('cAB[C]', '4', 'a', 'base', '1', 'A'),
        ('AB[C]D', '4', 'a', 'base', '1', 'A'),
        ('B[C]DE', '4', 'a', 'base', '1', 'A'),
        ('[C]DEF', '4', 'a', 'base', '1', 'A'),
        ('DEFd', '4', 'a', 'base', '1', 'A'),
        ('EFdG', '4', 'a', 'base', '1', 'A'),
        ('FdGH', '4', 'a', 'base', '1', 'A'),
        ('dGHI', '4', 'a', 'base', '1', 'A'),
        ('GHIe', '4', 'a', 'base', '1', 'A'),
    ]
    actual = self._reduce_diff(data_store, rows_in, cbeta_tokenizer)
    self.assertEqual(set(actual), set(expected))
def setUp(self):
    """Create the fixture data path and a CBETA tokenizer."""
    self._data_dir = os.path.join(os.path.dirname(__file__), 'int_all_data')
    self._tokenizer = tacl.Tokenizer(*tacl.constants.TOKENIZERS['cbeta'])
def setUp(self):
    """Create a CBETA tokenizer and enable full assertion diffs."""
    # Build the tokenizer from the registered definition, for
    # consistency with the other fixtures, which use
    # TOKENIZERS['cbeta'] rather than spelling out the pattern and
    # joiner constants individually.
    self._tokenizer = tacl.Tokenizer(*tacl.constants.TOKENIZERS['cbeta'])
    # Show the complete diff when an assertEqual on long data fails.
    self.maxDiff = None
def get_tokenizer(args):
    """Return a Tokenizer built from the tokenizer named in `args`."""
    tokenizer_args = constants.TOKENIZERS[args.tokenizer]
    return tacl.Tokenizer(*tokenizer_args)
def setUp(self):
    """Create the stripped-data fixture path and a CBETA tokenizer."""
    here = os.path.dirname(__file__)
    self._data_dir = os.path.join(here, 'data', 'stripped')
    self._tokenizer = tacl.Tokenizer(
        tacl.constants.TOKENIZER_PATTERN_CBETA,
        tacl.constants.TOKENIZER_JOINER_CBETA)
def setUp(self):
    """Create the paternity-test fixture paths and a CBETA tokenizer."""
    data_dir = os.path.join(os.path.dirname(__file__), 'paternity_data')
    self._data_dir = data_dir
    self._tokenizer = tacl.Tokenizer(*tacl.constants.TOKENIZERS['cbeta'])
    self._corpus = os.path.join(data_dir, 'corpus')
    self._catalogue = os.path.join(data_dir, 'catalogue.txt')
def test_pattern(self):
    """The pattern given to a Tokenizer is exposed unchanged."""
    pattern = r'[\w]+'
    tokenizer = tacl.Tokenizer(pattern, ' ')
    self.assertEqual(tokenizer.pattern, pattern)
def test_joiner(self):
    """The joiner given to a Tokenizer is exposed unchanged."""
    joiner = ' '
    tokenizer = tacl.Tokenizer(r'[\w]+', joiner)
    self.assertEqual(tokenizer.joiner, joiner)