def test_has_ngrams(self):
    """Exercise _has_ngrams when the lookup finds a row and when it does not."""
    store = tacl.DataStore(':memory:')
    store._conn = MagicMock(spec_set=sqlite3.Connection)
    cursor = store._conn.execute.return_value
    store._conn.execute.return_value = cursor
    # Both paths issue the same query followed by a single fetchone().
    expected_calls = [
        call.execute(tacl.constants.SELECT_HAS_NGRAMS_SQL,
                     [sentinel.text_id, sentinel.size]),
        call.execute().fetchone(),
    ]

    # Path one: there are n-grams.
    cursor.fetchone.return_value = True
    found = store._has_ngrams(sentinel.text_id, sentinel.size)
    self.assertEqual(store._conn.mock_calls, expected_calls)
    self.assertEqual(found, True)

    # Path two: there are no n-grams.
    store._conn.reset_mock()
    cursor.reset_mock()
    cursor.fetchone.return_value = None
    found = store._has_ngrams(sentinel.text_id, sentinel.size)
    self.assertEqual(store._conn.mock_calls, expected_calls)
    self.assertEqual(found, False)
def test_add_ngrams_with_catalogue(self):
    """Add 1-grams restricted by a catalogue and compare the stored rows."""
    store = tacl.DataStore(':memory:')
    store.add_ngrams(self._corpus, 1, 1,
                     tacl.Catalogue({'T1': 'A', 'T5': 'B'}))
    # Plain tuples are needed so the rows can be compared as a set.
    store._conn.row_factory = None
    query = (
        'SELECT Text.work, Text.siglum, Text.checksum, Text.label, '
        'TextNGram.ngram, TextNGram.size, TextNGram.count '
        'FROM Text, TextNGram WHERE Text.id = TextNGram.text')
    actual_rows = store._conn.execute(query).fetchall()
    # Expected n-gram counts, keyed by (work, siglum, checksum).
    expected_counts = {
        ('T1', 'base', '705c89d665a5300516fe7314f84ebce0'):
            {'t': 2, 'h': 1, 'e': 3, 'n': 2, 'w': 2},
        ('T1', 'a', 'e898b184b8d4d3ab5fea9d79fd645135'):
            {'t': 2, 'h': 1, 'e': 3, 'w': 2, 'n': 1},
        ('T5', 'base', '1b42a11f5f647e53d20da8c8f57a9f02'):
            {'w': 1, 'e': 1, 'l': 2},
    }
    expected_rows = set()
    for (work, siglum, checksum), counts in expected_counts.items():
        for ngram, count in counts.items():
            # Every expected row has an empty label and size 1.
            expected_rows.add(
                (work, siglum, checksum, '', ngram, 1, count))
    self.assertEqual(set(actual_rows), expected_rows)
def _compare_results(self, expected_dir_name, minimum, maximum, catalogue,
                     seen_pairs, db_name='test.db'):
    """Run a PairedIntersector and compare its output with expected files.

    When db_name is None no data store is created and None is passed
    to the intersector. When seen_pairs is non-empty, the pairs are
    written to the tracker file before the intersector runs.
    """
    corpus_dir = os.path.join(self._data_dir, 'corpus')
    expected_dir = os.path.join(self._data_dir, 'expected',
                                expected_dir_name)
    corpus = tacl.Corpus(corpus_dir, self._tokenizer)
    with tempfile.TemporaryDirectory() as temp_dir:
        data_store = None
        if db_name is not None:
            db_path = os.path.join(temp_dir, db_name)
            data_store = tacl.DataStore(db_path, False)
            data_store.add_ngrams(corpus, minimum, maximum)
        actual_dir = os.path.join(temp_dir, 'actual')
        tracker_path = os.path.join(actual_dir, 'tracker.csv')
        if seen_pairs:
            # The output directory must exist before the tracker file
            # can be created inside it.
            os.makedirs(actual_dir, exist_ok=True)
            tracker_lines = [
                '{},{}\n'.format(a, b) for a, b in seen_pairs]
            with open(tracker_path, 'w') as fh:
                fh.writelines(tracker_lines)
        intersector = PairedIntersector(
            data_store, corpus, self._tokenizer, catalogue, actual_dir,
            tracker_path, 1, 1)
        intersector.intersect_all()
        self._compare_results_dirs(actual_dir, expected_dir)
def test_intersection_supplied_one_label(self):
    """Expect MalformedQueryError for a single results file and label."""
    store = tacl.DataStore(':memory:')
    results_fh = MagicMock(name='fh')
    with self.assertRaises(MalformedQueryError):
        store.intersection_supplied(['a.csv'], ['A'], results_fh)
def test_reduce_diff_size(self):
    """Reduce a diff whose witnesses have different smallest n-gram sizes."""
    # Consider a diff where the smallest gram for a witness is
    # larger than the smallest gram across all witnesses:
    # abdef vs abcbdef
    store = tacl.DataStore(':memory:')
    tokenizer = tacl.Tokenizer(*tacl.constants.TOKENIZERS['cbeta'])
    rows_a = [
        ['abd', '3', 'a', 'base', '1', 'A'],
        ['abde', '4', 'a', 'base', '1', 'A'],
        ['abdef', '5', 'a', 'base', '1', 'A'],
    ]
    rows_b = [
        ['bc', '2', 'b', 'base', '1', 'B'],
        ['cb', '2', 'b', 'base', '1', 'B'],
        ['abc', '3', 'b', 'base', '1', 'B'],
        ['bcb', '3', 'b', 'base', '1', 'B'],
        ['cbd', '3', 'b', 'base', '1', 'B'],
        ['abcb', '4', 'b', 'base', '1', 'B'],
        ['bcbd', '4', 'b', 'base', '1', 'B'],
        ['cbde', '4', 'b', 'base', '1', 'B'],
        ['abcbd', '5', 'b', 'base', '1', 'B'],
        ['bcbde', '5', 'b', 'base', '1', 'B'],
        ['cbdef', '5', 'b', 'base', '1', 'B'],
        ['abcbde', '6', 'b', 'base', '1', 'B'],
        ['bcbdef', '6', 'b', 'base', '1', 'B'],
        ['abcbdef', '7', 'b', 'base', '1', 'B'],
    ]
    input_data = tuple(rows_a + rows_b)
    expected_rows = {
        tacl.constants.QUERY_FIELDNAMES,
        ('abd', '3', 'a', 'base', '1', 'A'),
        ('bc', '2', 'b', 'base', '1', 'B'),
        ('cb', '2', 'b', 'base', '1', 'B'),
        ('bcb', '3', 'b', 'base', '1', 'B'),
    }
    actual_rows = self._reduce_diff(store, input_data, tokenizer)
    self.assertEqual(set(actual_rows), expected_rows)
def test_add_ngrams_with_catalogue(self):
    """Check which DataStore helpers add_ngrams calls when given a catalogue."""
    add_indices = self._create_patch('tacl.DataStore._add_indices')
    add_text_ngrams = self._create_patch('tacl.DataStore._add_text_ngrams')
    analyse = self._create_patch('tacl.DataStore._analyse')
    initialise = self._create_patch('tacl.DataStore._initialise_database')

    # Two witnesses of the same work, T1.
    witness1 = MagicMock(spec_set=tacl.WitnessText)
    witness1.get_names = MagicMock(name='get_names',
                                   return_value=['T1', 'wit1'])
    witness2 = MagicMock(spec_set=tacl.WitnessText)
    witness2.get_names = MagicMock(name='get_names',
                                   return_value=['T1', 'wit2'])
    corpus = MagicMock(spec_set=tacl.Corpus)
    corpus.get_witnesses = MagicMock(
        name='get_witnesses', return_value=iter([witness1, witness2]))

    store = tacl.DataStore(':memory:')
    catalogue = tacl.Catalogue({'T1': 'A'})
    store.add_ngrams(corpus, 2, 3, catalogue)

    initialise.assert_called_once_with(store)
    corpus.get_witnesses.assert_called_once_with(
        'T1', text_class=tacl.WitnessText)
    add_text_ngrams.assert_has_calls([
        call(store, witness1, 2, 3),
        call(store, witness2, 2, 3),
    ])
    add_indices.assert_called_once_with(store)
    analyse.assert_called_once_with(store)
def test_set_labels(self):
    """Check the SQL issued by _set_labels and the per-label totals it returns."""
    catalogue = collections.OrderedDict([
        (sentinel.text1, sentinel.label1),
        (sentinel.text2, sentinel.label2),
        (sentinel.text3, sentinel.label1),
    ])
    # One call clearing labels, then an update and a token-count
    # lookup for every catalogue entry.
    expected_calls = [call.execute(tacl.constants.UPDATE_LABELS_SQL, [''])]
    for text, label in catalogue.items():
        expected_calls.append(
            call.execute(tacl.constants.UPDATE_LABEL_SQL, [label, text]))
        expected_calls.append(
            call.execute(tacl.constants.SELECT_TEXT_TOKEN_COUNT_SQL,
                         [text]))

    store = tacl.DataStore(':memory:')
    store._conn = MagicMock(spec_set=sqlite3.Connection)
    cursor = store._conn.execute.return_value
    store._conn.execute.return_value = cursor
    # Every text reports ten tokens, so label1 (two texts) totals 20.
    cursor.fetchone.return_value = {'token_count': 10}

    actual_labels = store._set_labels(catalogue)

    recorded_calls = store._conn.mock_calls
    for expected_call in expected_calls:
        self.assertIn(expected_call, recorded_calls)
    self.assertEqual(actual_labels,
                     {sentinel.label1: 20, sentinel.label2: 10})
def test_intersection(self):
    """Check the query built and executed by intersection for two labels."""
    labels = [sentinel.label1, sentinel.label2]
    set_labels = self._create_patch('tacl.DataStore._set_labels')
    set_labels.return_value = {}
    sort_labels = self._create_patch('tacl.DataStore._sort_labels', False)
    sort_labels.return_value = labels
    get_placeholders = self._create_patch(
        'tacl.DataStore._get_placeholders', False)
    get_placeholders.return_value = sentinel.placeholders
    log_query_plan = self._create_patch(
        'tacl.DataStore._log_query_plan', False)
    input_fh = MagicMock(name='fh')
    csv = self._create_patch('tacl.DataStore._csv', False)
    csv.return_value = input_fh
    catalogue = MagicMock(name='catalogue')
    store = tacl.DataStore(':memory:')
    store._conn = MagicMock(spec_set=sqlite3.Connection)
    cursor = store._conn.execute.return_value

    output_fh = store.intersection(catalogue, input_fh)

    set_labels.assert_called_once_with(store, catalogue)
    get_placeholders.assert_called_once_with(labels)
    log_query_plan.assert_called_once()
    # The expected query is spelled out in full, with one nested
    # sub-select per label.
    expected_sql = 'SELECT TextNGram.ngram, TextNGram.size, TextNGram.count, Text.name AS "text name", Text.siglum, Text.label FROM Text, TextNGram WHERE Text.label IN (sentinel.placeholders) AND Text.id = TextNGram.text AND TextNGram.ngram IN (SELECT TextNGram.ngram FROM Text, TextNGram WHERE Text.label = ? AND Text.id = TextNGram.text AND TextNGram.ngram IN (SELECT TextNGram.ngram FROM Text, TextNGram WHERE Text.label = ? AND Text.id = TextNGram.text))'  # noqa: E501
    expected_calls = [call.execute(expected_sql, labels * 2)]
    self.assertEqual(store._conn.mock_calls, expected_calls)
    csv.assert_called_once_with(
        cursor, tacl.constants.QUERY_FIELDNAMES, input_fh)
    self.assertEqual(input_fh, output_fh)
def test_diff_asymmetric(self):
    """Check the query executed by diff_asymmetric and the CSV hand-off."""
    set_labels = self._create_patch('tacl.DataStore._set_labels')
    set_labels.return_value = {sentinel.label: 1, sentinel.prime_label: 1}
    get_placeholders = self._create_patch(
        'tacl.DataStore._get_placeholders', False)
    get_placeholders.return_value = sentinel.placeholders
    log_query_plan = self._create_patch(
        'tacl.DataStore._log_query_plan', False)
    input_fh = MagicMock(name='fh')
    csv = self._create_patch('tacl.DataStore._csv', False)
    csv.return_value = input_fh
    catalogue = MagicMock(name='catalogue')
    store = tacl.DataStore(':memory:')
    store._conn = MagicMock(spec_set=sqlite3.Connection)
    cursor = store._conn.execute.return_value

    output_fh = store.diff_asymmetric(
        catalogue, sentinel.prime_label, input_fh)

    set_labels.assert_called_once_with(store, catalogue)
    # Placeholders are requested only for the non-prime label.
    get_placeholders.assert_called_once_with([sentinel.label])
    log_query_plan.assert_called_once()
    expected_sql = tacl.constants.SELECT_DIFF_ASYMMETRIC_SQL.format(
        sentinel.placeholders)
    expected_params = [
        sentinel.prime_label, sentinel.prime_label, sentinel.label]
    self.assertEqual(store._conn.mock_calls,
                     [call.execute(expected_sql, expected_params)])
    csv.assert_called_once_with(
        cursor, tacl.constants.QUERY_FIELDNAMES, input_fh)
    self.assertEqual(input_fh, output_fh)
def test_validate_true(self):
    """validate returns True when the stored checksum matches the witness."""
    witness = MagicMock(spec_set=tacl.WitnessText)
    witness.get_checksum.return_value = sentinel.checksum
    witness.get_names.return_value = (sentinel.name, sentinel.siglum)
    corpus = MagicMock(spec_set=tacl.Corpus)
    corpus.get_witnesses.return_value = (witness, )
    catalogue = collections.OrderedDict([
        (sentinel.text1, sentinel.label1),
        (sentinel.text2, sentinel.label2),
        (sentinel.text3, sentinel.label1),
    ])
    store = tacl.DataStore(':memory:')
    store._conn = MagicMock(spec_set=sqlite3.Connection)
    cursor = store._conn.execute.return_value
    cursor.fetchone.return_value = {'checksum': sentinel.checksum}

    actual_result = store.validate(corpus, catalogue)

    corpus.get_witnesses.assert_has_calls([
        call(sentinel.text1),
        call(sentinel.text2),
        call(sentinel.text3),
    ])
    # The same witness mock is returned for each of the three works,
    # so the lookup/fetch pair is expected three times over.
    lookup_and_fetch = [
        call.execute(tacl.constants.SELECT_TEXT_SQL,
                     [sentinel.name, sentinel.siglum]),
        call.execute().fetchone(),
    ]
    self.assertEqual(store._conn.mock_calls, lookup_and_fetch * 3)
    self.assertEqual(actual_result, True)
def test_diff_asymmetric(self):
    """Check the query executed by diff_asymmetric and the _diff hand-off."""
    set_labels = self._create_patch('tacl.DataStore._set_labels')
    set_labels.return_value = {sentinel.label: 1, sentinel.prime_label: 1}
    get_placeholders = self._create_patch(
        'tacl.DataStore._get_placeholders', False)
    get_placeholders.return_value = sentinel.placeholders
    log_query_plan = self._create_patch(
        'tacl.DataStore._log_query_plan', False)
    input_fh = MagicMock(name='fh')
    catalogue = MagicMock(name='catalogue')
    store = tacl.DataStore(':memory:')
    store._conn = MagicMock(spec_set=sqlite3.Connection)
    tokenizer = MagicMock(name='tokenizer')
    _diff = self._create_patch('tacl.DataStore._diff', False)
    _diff.return_value = input_fh

    output_fh = store.diff_asymmetric(
        catalogue, sentinel.prime_label, tokenizer, input_fh)

    set_labels.assert_called_once_with(store, catalogue)
    # Placeholders are requested only for the non-prime label.
    get_placeholders.assert_called_once_with([sentinel.label])
    self.assertTrue(log_query_plan.called)
    expected_sql = tacl.constants.SELECT_DIFF_ASYMMETRIC_SQL.format(
        sentinel.placeholders)
    expected_params = [
        sentinel.prime_label, sentinel.prime_label, sentinel.label]
    self.assertEqual(store._conn.mock_calls,
                     [call.execute(expected_sql, expected_params)])
    self.assertTrue(_diff.called)
    self.assertEqual(input_fh, output_fh)
def test_check_diff_result(self):
    """Run _check_diff_result against each combination of (n-1)-gram matches."""
    # Test the various possibilities that
    # DataStore._reduce_diff_results must handle.
    store = tacl.DataStore(':memory:')
    tokenizer = tacl.Tokenizer(*tacl.constants.TOKENIZERS['cbeta'])
    tokenize = tokenizer.tokenize
    join = tokenizer.joiner.join
    row = pd.Series(['ABC', 3, 'a', 'base', 1, 'A'],
                    index=tacl.constants.QUERY_FIELDNAMES)
    # Each case pairs the known matches with the count expected on
    # the returned row; cases run in order against the same row.
    cases = [
        # N-gram is not composed of any existing (n-1)-gram.
        ({'CD': 1}, 1),
        # N-gram is composed entirely of existing (n-1)-grams.
        ({'AB': 1, 'BC': 1, 'CD': 1}, 1),
        # N-gram is composed partly by existing (n-1)-grams.
        ({'AB': 1, 'CD': 1}, 0),
        ({'BC': 1, 'CD': 1}, 0),
        # N-gram is composed of one or more n-grams with count 0.
        ({'AB': 0, 'BC': 1, 'CD': 1}, 0),
        ({'AB': 1, 'BC': 0, 'CD': 1}, 0),
        ({'AB': 0, 'BC': 0, 'CD': 1}, 0),
    ]
    for matches, expected_count in cases:
        actual_row = store._check_diff_result(row, matches, tokenize, join)
        self.assertEqual(actual_row['count'], expected_count)
def test_delete_text_ngrams(self):
    """Check that _delete_text_ngrams issues both DELETE statements.

    The two statements must be executed on the connection in order:
    first the text's n-grams, then its has-n-grams records.
    """
    store = tacl.DataStore(':memory:')
    store._conn = MagicMock(spec_set=sqlite3.Connection)
    store._delete_text_ngrams(sentinel.text_id)
    # assert_has_calls takes a single list of calls. The previous
    # `has_calls(...)` was not an assertion method and checked nothing.
    store._conn.execute.assert_has_calls([
        call(tacl.constants.DELETE_TEXT_NGRAMS_SQL, [sentinel.text_id]),
        call(tacl.constants.DELETE_TEXT_HAS_NGRAMS_SQL,
             [sentinel.text_id]),
    ])
def test_diff_one_label(self):
    """Expect MalformedQueryError when the catalogue yields only one label."""
    catalogue = {'T1': 'A', 'T2': 'A'}
    store = tacl.DataStore(':memory:')
    results_fh = MagicMock(name='fh')
    set_labels = self._create_patch('tacl.DataStore._set_labels')
    set_labels.return_value = {'A': 2}
    with self.assertRaises(MalformedQueryError):
        store.diff(catalogue, results_fh)
def test_diff_supplied_one_label(self):
    """Expect MalformedQueryError for a single results file and label."""
    store = tacl.DataStore(':memory:')
    tokenizer = tacl.Tokenizer(*tacl.constants.TOKENIZERS['cbeta'])
    results_fh = MagicMock(name='fh')
    with self.assertRaises(MalformedQueryError):
        store.diff_supplied(['a.csv'], ['A'], tokenizer, results_fh)
def test_diff_one_label(self):
    """Expect MalformedQueryError when the catalogue yields only one label."""
    catalogue = {'T1': 'A', 'T2': 'A'}
    store = tacl.DataStore(':memory:')
    results_fh = MagicMock(name='fh')
    tokenizer = tacl.Tokenizer(*tacl.constants.TOKENIZERS['cbeta'])
    set_labels = self._create_patch('tacl.DataStore._set_labels')
    set_labels.return_value = {'A': 2}
    with self.assertRaises(MalformedQueryError):
        store.diff(catalogue, tokenizer, results_fh)
def test_analyse(self):
    """Check the ANALYSE statement issued with and without a table name."""
    analyse_sql = tacl.constants.ANALYSE_SQL
    store = tacl.DataStore(':memory:')
    store._conn = MagicMock(spec_set=sqlite3.Connection)

    # Without an argument the statement is formatted with ''.
    store._analyse()
    store._conn.execute.assert_called_once_with(analyse_sql.format(''))

    # With an argument the statement is formatted with that table.
    store._conn.reset_mock()
    store._analyse(sentinel.table)
    store._conn.execute.assert_called_once_with(
        analyse_sql.format(sentinel.table))
def test_add_temporary_ngrams_not_duplicate(self):
    """Tests that duplicate n-grams are added only once to the
    temporary table."""
    store = tacl.DataStore(':memory:')
    store._add_temporary_ngrams(['A', 'A'])
    rows = store._conn.execute('SELECT * FROM InputNGram').fetchall()
    stored_ngrams = [row['ngram'] for row in rows]
    self.assertEqual(stored_ngrams, ['A'])
def setUp(self):
    """Build the tokenizer, corpus, catalogue and populated store fixtures."""
    constants = tacl.constants
    self._tokenizer = tacl.Tokenizer(constants.TOKENIZER_PATTERN_CBETA,
                                     constants.TOKENIZER_JOINER_CBETA)
    self._data_dir = os.path.join(os.path.dirname(__file__), 'data')
    stripped_dir = os.path.join(self._data_dir, 'stripped')
    self._corpus = tacl.Corpus(stripped_dir, self._tokenizer)
    self._catalogue = tacl.Catalogue()
    self._catalogue.load(os.path.join(self._data_dir, 'catalogue.txt'))
    # In-memory store holding 1- to 3-grams for the whole corpus.
    self._store = tacl.DataStore(':memory:')
    self._store.add_ngrams(self._corpus, 1, 3)
def test_sort_labels(self):
    """Check that _sort_labels orders labels by descending associated value."""
    store = tacl.DataStore(':memory:')
    counts_by_label = {
        sentinel.label1: 2,
        sentinel.label2: 3,
        sentinel.label3: 1,
    }
    sorted_labels = store._sort_labels(counts_by_label)
    self.assertEqual(
        sorted_labels,
        [sentinel.label2, sentinel.label1, sentinel.label3])
def test_add_temporary_ngrams_empty(self):
    """Tests that n-grams that are empty strings are not added to
    the temporary table."""
    store = tacl.DataStore(':memory:')
    store._add_temporary_ngrams(['', 'A'])
    rows = store._conn.execute('SELECT * FROM InputNGram').fetchall()
    stored_ngrams = [row['ngram'] for row in rows]
    self.assertEqual(stored_ngrams, ['A'])
def test_add_temporary_ngrams(self):
    """Check the drop, create and insert calls made for temporary n-grams."""
    constants = tacl.constants
    store = tacl.DataStore(':memory:')
    store._conn = MagicMock(spec_set=sqlite3.Connection)
    store._add_temporary_ngrams(['A', 'B'])
    expected_calls = [
        call.execute(constants.DROP_TEMPORARY_NGRAMS_TABLE_SQL),
        call.execute(constants.CREATE_TEMPORARY_NGRAMS_TABLE_SQL),
        # Each n-gram is expected as a one-element parameter tuple.
        call.executemany(constants.INSERT_TEMPORARY_NGRAM_SQL,
                         [('A', ), ('B', )]),
    ]
    self.assertEqual(store._conn.mock_calls, expected_calls)
def test_diff_asymmetric_invalid_label(self):
    """Tests that the right error is raised when the supplied label
    is not present in the catalogue."""
    catalogue = {'T1': 'A', 'T2': 'B'}
    missing_label = 'C'
    results_fh = MagicMock(name='fh')
    store = tacl.DataStore(':memory:')
    set_labels = self._create_patch('tacl.DataStore._set_labels')
    set_labels.return_value = {'A': 1, 'B': 1}
    with self.assertRaises(MalformedQueryError):
        store.diff_asymmetric(catalogue, missing_label, results_fh)
def test_intersection_one_label(self):
    """Expect MalformedQueryError when only one label is available."""
    set_labels = self._create_patch('tacl.DataStore._set_labels')
    set_labels.return_value = {}
    sort_labels = self._create_patch('tacl.DataStore._sort_labels', False)
    # The patched sorter reports a single label.
    sort_labels.return_value = [sentinel.label1]
    results_fh = MagicMock(name='fh')
    catalogue = MagicMock(name='catalogue')
    store = tacl.DataStore(':memory:')
    with self.assertRaises(MalformedQueryError):
        store.intersection(catalogue, results_fh)
def test_delete_text_ngrams(self):
    """Check that _delete_text_ngrams runs both deletes and then commits."""
    constants = tacl.constants
    store = tacl.DataStore(':memory:')
    store._conn = MagicMock(spec_set=sqlite3.Connection)
    store._delete_text_ngrams(sentinel.text_id)
    params = [sentinel.text_id]
    self.assertEqual(store._conn.mock_calls, [
        call.execute(constants.DELETE_TEXT_NGRAMS_SQL, params),
        call.execute(constants.DELETE_TEXT_HAS_NGRAMS_SQL, params),
        call.commit(),
    ])
def test_add_text_size_ngrams(self):
    """Check the summary insert and the bulk n-gram insert for one size."""
    size = 1
    ngrams = collections.OrderedDict([('a', 2), ('b', 1)])
    # One parameter row per n-gram, in the ordering of the input.
    expected_rows = [
        [sentinel.text_id, 'a', size, 2],
        [sentinel.text_id, 'b', size, 1],
    ]
    store = tacl.DataStore(':memory:')
    store._conn = MagicMock(spec_set=sqlite3.Connection)

    store._add_text_size_ngrams(sentinel.text_id, size, ngrams)

    store._conn.execute.assert_called_once_with(
        tacl.constants.INSERT_TEXT_HAS_NGRAM_SQL,
        [sentinel.text_id, size, len(ngrams)])
    store._conn.executemany.assert_called_once_with(
        tacl.constants.INSERT_NGRAM_SQL, expected_rows)
def test_update_text_record(self):
    """Check that _update_text_record stores the checksum and token count."""
    tokens = [sentinel.token]
    witness = MagicMock(spec_set=tacl.WitnessText)
    witness.get_checksum.return_value = sentinel.checksum
    witness.get_tokens.return_value = tokens
    store = tacl.DataStore(':memory:')
    store._conn = MagicMock(spec_set=sqlite3.Connection)

    store._update_text_record(witness, sentinel.text_id)

    # The witness is asked for its checksum first, then its tokens.
    self.assertEqual(witness.mock_calls,
                     [call.get_checksum(), call.get_tokens()])
    store._conn.execute.assert_called_once_with(
        tacl.constants.UPDATE_TEXT_SQL,
        [sentinel.checksum, len(tokens), sentinel.text_id])
def test_add_temporary_ngrams_twice(self):
    """Test that multiple calls to the method succeed."""
    store = tacl.DataStore(':memory:')
    # After each call the table must hold exactly that call's n-grams.
    for input_ngrams in (['禁律', '律藏也'], ['每', '以示']):
        store._add_temporary_ngrams(input_ngrams)
        cursor = store._conn.execute('SELECT * FROM InputNGram')
        stored_ngrams = {row['ngram'] for row in cursor.fetchall()}
        self.assertEqual(stored_ngrams, set(input_ngrams))
def _compare_results(self, max_works, expected_dir_name):
    """Run a PaternityTest and compare its output with the expected files."""
    expected_dir = os.path.join(self._data_dir, 'expected',
                                expected_dir_name)
    corpus = tacl.Corpus(self._corpus, self._tokenizer)
    catalogue = tacl.Catalogue()
    catalogue.load(self._catalogue)
    with tempfile.TemporaryDirectory() as temp_dir:
        db_path = os.path.join(temp_dir, 'test.db')
        output_dir = os.path.join(temp_dir, 'output')
        data_store = tacl.DataStore(db_path, False)
        data_store.add_ngrams(corpus, 1, 1)
        paternity_test = paternity.PaternityTest(
            data_store, catalogue, self._tokenizer, 'P', 'C', 'U',
            max_works, output_dir)
        paternity_test.process()
        # Compare while the temporary directory still exists.
        self._compare_results_dirs(output_dir, expected_dir)
def _compare_results(self, corpus_dir, catalogue_name):
    """Compare all of the actual results files with the expected
    versions."""
    data_dir = self._data_dir
    expected_dir = os.path.join(data_dir, 'expected')
    corpus = tacl.Corpus(os.path.join(data_dir, corpus_dir),
                         self._tokenizer)
    catalogue = tacl.Catalogue()
    catalogue.load(os.path.join(data_dir, catalogue_name))
    with tempfile.TemporaryDirectory() as temp_dir:
        db_path = os.path.join(temp_dir, 'test.db')
        output_dir = os.path.join(temp_dir, 'output')
        data_store = tacl.DataStore(db_path, False)
        data_store.add_ngrams(corpus, 1, 1)
        reporter = lifetime.LifetimeReporter(
            data_store, catalogue, self._tokenizer, output_dir)
        reporter.process()
        # Compare while the temporary directory still exists.
        self._compare_results_dirs(output_dir, expected_dir)