def report(args, parser):
    if args.results == '-':
        results = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8',
                                   newline='')
    else:
        results = open(args.results, 'r', encoding='utf-8', newline='')
    tokenizer = get_tokenizer(args)
    report = tacl.Report(results, tokenizer)
    if args.extend:
        corpus = tacl.Corpus(args.extend, tokenizer)
        report.extend(corpus)
    if args.reduce:
        report.reduce()
    if args.reciprocal:
        report.reciprocal_remove()
    if args.zero_fill:
        if not args.catalogue:
            parser.error('The zero-fill option requires that the -c option '
                         'also be supplied.')
        corpus = tacl.Corpus(args.zero_fill, tokenizer)
        catalogue = get_catalogue(args.catalogue)
        report.zero_fill(corpus, catalogue)
    if args.min_texts or args.max_texts:
        report.prune_by_text_count(args.min_texts, args.max_texts)
    if args.min_size or args.max_size:
        report.prune_by_ngram_size(args.min_size, args.max_size)
    if args.min_count or args.max_count:
        report.prune_by_ngram_count(args.min_count, args.max_count)
    if args.remove:
        report.remove_label(args.remove)
    if args.sort:
        report.sort()
    report.csv(sys.stdout)
def results(args, parser):
    if args.results == '-':
        results_fh = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8',
                                      newline='')
    else:
        results_fh = open(args.results, 'r', encoding='utf-8', newline='')
    tokenizer = utils.get_tokenizer(args)
    results = tacl.Results(results_fh, tokenizer)
    if args.extend:
        corpus = tacl.Corpus(args.extend, tokenizer)
        results.extend(corpus)
    if args.bifurcated_extend:
        if not args.bifurcated_extend_size:
            parser.error('The bifurcated extend option requires that the '
                         '--max-be-count option also be supplied')
        corpus = tacl.Corpus(args.bifurcated_extend, tokenizer)
        results.bifurcated_extend(corpus, args.bifurcated_extend_size)
    if args.reduce:
        results.reduce()
    if args.reciprocal:
        results.reciprocal_remove()
    if args.excise:
        results.excise(args.excise)
    if args.zero_fill:
        corpus = tacl.Corpus(args.zero_fill, tokenizer)
        results.zero_fill(corpus)
    if args.ngrams:
        with open(args.ngrams, encoding='utf-8') as fh:
            ngrams = fh.read().split()
        results.prune_by_ngram(ngrams)
    if args.min_works or args.max_works:
        results.prune_by_work_count(args.min_works, args.max_works)
    if args.min_size or args.max_size:
        results.prune_by_ngram_size(args.min_size, args.max_size)
    if args.min_count or args.max_count:
        results.prune_by_ngram_count(args.min_count, args.max_count)
    if args.min_count_work or args.max_count_work:
        results.prune_by_ngram_count_per_work(args.min_count_work,
                                              args.max_count_work)
    if args.remove:
        results.remove_label(args.remove)
    if args.sort:
        results.sort()
    # Run format-changing operations last.
    if args.add_label_count:
        results.add_label_count()
    if args.add_label_work_count:
        results.add_label_work_count()
    if args.group_by_ngram:
        catalogue = tacl.Catalogue()
        catalogue.load(args.group_by_ngram)
        results.group_by_ngram(catalogue.ordered_labels)
    if args.group_by_witness:
        results.group_by_witness()
    if args.collapse_witnesses:
        results.collapse_witnesses()
    results.csv(sys.stdout)
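# A minimal sketch (not part of the CLI) of driving the same Results
# pipeline programmatically. It relies on this module's existing sys and
# tacl imports; the paths 'results.csv' and 'corpus/stripped' are
# hypothetical, and only classes and methods that appear in results()
# above are used.
def example_results_pipeline():
    tokenizer = tacl.Tokenizer(tacl.constants.TOKENIZER_PATTERN_CBETA,
                               tacl.constants.TOKENIZER_JOINER_CBETA)
    results = tacl.Results('results.csv', tokenizer)
    # Extend the matches against the witnesses, then remove redundant
    # smaller n-grams, mirroring the --extend and --reduce options.
    results.extend(tacl.Corpus('corpus/stripped', tokenizer))
    results.reduce()
    results.csv(sys.stdout)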
def excise(args, parser):
    logger = colorlog.getLogger('tacl')
    tokenizer = utils.get_tokenizer(args)
    corpus = tacl.Corpus(args.corpus, tokenizer)
    with open(args.ngrams, encoding='utf-8') as fh:
        ngrams = [line.strip() for line in fh]
    # It is no issue if the output directory already exists; it is a
    # reasonable use case to create an excised corpus from multiple
    # excise operations.
    try:
        os.mkdir(args.output)
    except FileExistsError:
        pass
    for work in args.works:
        # It is worth warning about writing in existing work
        # directories, since that might be unintended. Do not prevent
        # it, however, since it is a reasonable use case.
        try:
            os.mkdir(os.path.join(args.output, work))
        except FileExistsError:
            logger.warning(constants.EXCISE_OVERWRITE_WORK_WARNING, work)
        for witness in corpus.get_witnesses(work):
            path = os.path.join(args.output, witness.get_filename())
            content = witness.excise(ngrams, args.replacement)
            with open(path, 'w', encoding='utf-8') as fh:
                fh.write(content)
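# A hedged sketch of the per-witness excision step used in excise()
# above, for a single work. Only calls that appear in excise() are used;
# the work name, n-gram list, and replacement marker are illustrative
# assumptions.
def example_excise_witness(corpus):
    for witness in corpus.get_witnesses('T1'):
        # Replace each occurrence of the listed n-grams with the marker.
        content = witness.excise(['ABC'], '#')
        print(witness.get_filename())
        print(content)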
def test_zero_fill(self):
    tokenizer = tacl.Tokenizer(tacl.constants.TOKENIZER_PATTERN_CBETA,
                               tacl.constants.TOKENIZER_JOINER_CBETA)
    input_data = (
        ['AB', '2', 'T1', 'base', '7', 'A'],
        ['AB', '2', 'T2', 'a', '3', 'B'],
        ['ABC', '3', 'T5', 'base', '1', 'A'],
    )
    base_dir = os.path.dirname(__file__)
    stripped_dir = os.path.join(base_dir, 'integration_tests', 'data',
                                'stripped')
    corpus = tacl.Corpus(stripped_dir, tokenizer)
    fh = self._create_csv(input_data)
    report = tacl.Report(fh, tokenizer)
    catalogue = {'T1': 'A', 'T2': 'B', 'T3': 'C', 'T5': 'A'}
    report.zero_fill(corpus, catalogue)
    actual_rows = self._get_rows_from_csv(
        report.csv(io.StringIO(newline='')))
    expected_rows = [
        ('AB', '2', 'T1', 'base', '7', 'A'),
        ('AB', '2', 'T1', 'a', '0', 'A'),
        ('AB', '2', 'T2', 'a', '3', 'B'),
        ('AB', '2', 'T2', 'base', '0', 'B'),
        ('ABC', '3', 'T5', 'base', '1', 'A'),
    ]
    self.assertEqual(set(actual_rows), set(expected_rows))
def test_generate_statistics(self):
    tokenizer = tacl.Tokenizer(tacl.constants.TOKENIZER_PATTERN_CBETA,
                               tacl.constants.TOKENIZER_JOINER_CBETA)
    corpus = tacl.Corpus(self._stripped_dir, tokenizer)
    input_results = (
        # Test two n-grams that overlap.
        ['he', '2', 'a', 'base', '1', 'A'],
        ['th', '2', 'a', 'base', '1', 'A'],
        # Test a single n-gram that overlaps itself, in an alternate
        # witness.
        ['heh', '3', 'a', 'v1', '2', 'A'],
        ['AB', '2', 'b', 'base', '1', 'B'],
        ['ABD', '3', 'b', 'base', '1', 'B'],
        ['ABCD', '4', 'b', 'base', '2', 'B'],
    )
    results_fh = self._create_csv(input_results)
    report = tacl.StatisticsReport(corpus, tokenizer, results_fh)
    report.generate_statistics()
    actual_results = self._get_rows_from_csv(
        report.csv(io.StringIO(newline='')))
    expected_results = [
        tacl.constants.STATISTICS_FIELDNAMES,
        ('a', 'base', '3', '3', '100.0', 'A'),
        ('a', 'v1', '5', '6', str(5 / 6 * 100), 'A'),
        ('b', 'base', '13', '14', str(13 / 14 * 100), 'B'),
    ]
    self.assertEqual(set(actual_results), set(expected_results))
def _compare_results(self, expected_dir_name, minimum, maximum, catalogue,
                     seen_pairs, db_name='test.db'):
    expected_dir = os.path.join(self._data_dir, 'expected',
                                expected_dir_name)
    corpus = tacl.Corpus(os.path.join(self._data_dir, 'corpus'),
                         self._tokenizer)
    with tempfile.TemporaryDirectory() as temp_dir:
        if db_name is None:
            data_store = None
        else:
            data_store = tacl.DataStore(os.path.join(temp_dir, db_name),
                                        False)
            data_store.add_ngrams(corpus, minimum, maximum)
        actual_dir = os.path.join(temp_dir, 'actual')
        tracker_path = os.path.join(actual_dir, 'tracker.csv')
        if seen_pairs:
            os.makedirs(actual_dir, exist_ok=True)
            with open(tracker_path, 'w') as fh:
                fh.writelines(
                    ['{},{}\n'.format(a, b) for a, b in seen_pairs])
        pi = PairedIntersector(data_store, corpus, self._tokenizer,
                               catalogue, actual_dir, tracker_path, 1, 1)
        pi.intersect_all()
        self._compare_results_dirs(actual_dir, expected_dir)
def test_get_witness(self):
    corpus = tacl.Corpus(self._data_dir, self._tokenizer)
    actual_text = corpus.get_witness('T1', 'base')
    expected_text = tacl.WitnessText('T1', 'base', 'then we went\n',
                                     self._tokenizer)
    self.assertEqual(actual_text.get_checksum(),
                     expected_text.get_checksum())
    self.assertEqual(actual_text.get_filename(),
                     expected_text.get_filename())
def setUp(self):
    self._tokenizer = tacl.Tokenizer(
        tacl.constants.TOKENIZER_PATTERN_CBETA,
        tacl.constants.TOKENIZER_JOINER_CBETA)
    self._data_dir = os.path.join(os.path.dirname(__file__), 'data')
    self._corpus = tacl.Corpus(os.path.join(self._data_dir, 'stripped'),
                               self._tokenizer)
    self._catalogue = tacl.Catalogue()
    self._catalogue.load(os.path.join(self._data_dir, 'catalogue.txt'))
    self._store = tacl.DataStore(':memory:')
    self._store.add_ngrams(self._corpus, 1, 3)
def align_results(args, parser):
    if args.results == '-':
        results = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8',
                                   newline='')
    else:
        results = open(args.results, 'r', encoding='utf-8', newline='')
    tokenizer = get_tokenizer(args)
    corpus = tacl.Corpus(args.corpus, tokenizer)
    s = tacl.Sequencer(corpus, tokenizer, results, args.output)
    s.generate_sequences(args.minimum)
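# A hedged sketch of using tacl.Sequencer directly, mirroring
# align_results() above. The constructor and generate_sequences()
# arguments follow the calls in that function; the paths, output
# directory name, and minimum value are illustrative assumptions.
def example_sequencer():
    tokenizer = tacl.Tokenizer(tacl.constants.TOKENIZER_PATTERN_CBETA,
                               tacl.constants.TOKENIZER_JOINER_CBETA)
    corpus = tacl.Corpus('corpus/stripped', tokenizer)
    with open('results.csv', 'r', encoding='utf-8', newline='') as results:
        s = tacl.Sequencer(corpus, tokenizer, results, 'alignments')
        s.generate_sequences(20)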
def test_bifurcated_extend_malformed_results(self):
    fieldnames = [
        tacl.constants.NGRAM_FIELDNAME, tacl.constants.SIZE_FIELDNAME,
        tacl.constants.WORK_FIELDNAME, tacl.constants.SIGLUM_FIELDNAME,
        tacl.constants.LABEL_FIELDNAME
    ]
    data_dir = os.path.join(os.path.dirname(__file__), 'data')
    corpus = tacl.Corpus(os.path.join(data_dir, 'stripped'),
                         self._tokenizer)
    self._test_required_columns(fieldnames, 'bifurcated_extend', corpus, 2)
def test_extend_no_extension(self):
    # Extend should return the original results when there are no
    # extensions to make.
    input_data = os.path.join(self._data_dir, 'extend-no-extensions.csv')
    corpus = tacl.Corpus(os.path.join(self._stripped_dir, 'cbeta'),
                         self._tokenizer)
    results = tacl.Results(input_data, self._tokenizer)
    results.extend(corpus)
    actual_rows = self._get_rows_from_results(results)
    expected_rows = self._get_rows_from_file(input_data)
    self.assertEqual(actual_rows, expected_rows)
def test_normalise(self):
    data_dir = os.path.join(os.path.dirname(__file__), 'normaliser_data')
    corpus_dir = os.path.join(data_dir, 'corpora')
    corpus = tacl.Corpus(os.path.join(corpus_dir, 'unnormalised'),
                         self._tokenizer)
    expected_dir = os.path.join(corpus_dir, 'normalised')
    mapping = tacl.VariantMapping(
        os.path.join(data_dir, 'mappings', 'map2.csv'), self._tokenizer)
    with tempfile.TemporaryDirectory() as output_dir:
        actual_dir = os.path.join(output_dir, 'corpus')
        corpus.normalise(mapping, actual_dir)
        self._compare_dirs(actual_dir, expected_dir)
def test_get_witness(self):
    path = '/test'
    work = 'foo'
    siglum = 'base'
    content = 'test content'
    filename = os.path.join(work, siglum + '.txt')
    m = mock_open(read_data=content)
    with patch('builtins.open', m, create=True):
        corpus = tacl.Corpus(path, self._tokenizer)
        actual_text = corpus.get_witness(work, siglum)
        m.assert_called_once_with(os.path.join(path, filename),
                                  encoding='utf-8')
        assert isinstance(actual_text, tacl.WitnessText)
def test_zero_fill_no_duplicate_index_values(self):
    # Zero fill should not leave the matches with duplicate values
    # in the index, potentially raising a "cannot reindex from a
    # duplicate axis" ValueError when followed by another operation.
    data_dir = os.path.join(os.path.dirname(__file__), 'data')
    corpus = tacl.Corpus(os.path.join(data_dir, 'stripped'),
                         self._tokenizer)
    input_file = os.path.join(self._data_dir, 'non-zero-fill-results.csv')
    results = tacl.Results(input_file, self._tokenizer)
    results.zero_fill(corpus)
    self.assertFalse(
        results._matches.index.has_duplicates,
        'Results._matches DataFrame is left with duplicate index values.')
def test_extend_no_duplicate_index_values(self):
    # Extend should not leave the matches with duplicate values in
    # the index, potentially raising a "cannot reindex from a
    # duplicate axis" ValueError when followed by another operation.
    input_data = os.path.join(self._data_dir,
                              'cbeta-non-extend-results.csv')
    corpus = tacl.Corpus(os.path.join(self._stripped_dir, 'cbeta'),
                         self._tokenizer)
    results = tacl.Results(input_data, self._tokenizer)
    results.extend(corpus)
    self.assertFalse(
        results._matches.index.has_duplicates,
        'Results._matches DataFrame is left with duplicate index values.')
def _compare_results(self, max_works, expected_dir_name):
    expected_dir = os.path.join(self._data_dir, 'expected',
                                expected_dir_name)
    corpus = tacl.Corpus(self._corpus, self._tokenizer)
    catalogue = tacl.Catalogue()
    catalogue.load(self._catalogue)
    with tempfile.TemporaryDirectory() as temp_dir:
        data_store = tacl.DataStore(os.path.join(temp_dir, 'test.db'),
                                    False)
        data_store.add_ngrams(corpus, 1, 1)
        output_dir = os.path.join(temp_dir, 'output')
        test = paternity.PaternityTest(data_store, catalogue,
                                       self._tokenizer, 'P', 'C', 'U',
                                       max_works, output_dir)
        test.process()
        self._compare_results_dirs(output_dir, expected_dir)
def test_cli(self):
    """Tests that the jitc CLI script runs correctly."""
    expected_dir = os.path.join(self._data_dir, 'expected')
    corpus_path = os.path.join(self._data_dir, 'corpus')
    corpus = tacl.Corpus(corpus_path, self._tokenizer)
    catalogue_path = os.path.join(self._data_dir, 'catalogue.txt')
    label = 'in'
    with tempfile.TemporaryDirectory() as temp_dir:
        actual_dir = os.path.join(temp_dir, 'actual')
        data_store_path = os.path.join(temp_dir, 'test.db')
        self._generate_database(data_store_path, corpus, self._tokenizer)
        command = 'jitc {} {} {} {} {}'.format(
            data_store_path, corpus_path, catalogue_path, label,
            actual_dir)
        # Fail the test immediately if the script exits with an error.
        subprocess.run(shlex.split(command), check=True)
        self._compare_results_dirs(actual_dir, expected_dir)
def _compare_results(self, corpus_dir, catalogue_name):
    """Compare all of the actual results files with the expected
    versions."""
    expected_dir = os.path.join(self._data_dir, 'expected')
    corpus = tacl.Corpus(os.path.join(self._data_dir, corpus_dir),
                         self._tokenizer)
    catalogue = tacl.Catalogue()
    catalogue.load(os.path.join(self._data_dir, catalogue_name))
    with tempfile.TemporaryDirectory() as temp_dir:
        data_store = tacl.DataStore(os.path.join(temp_dir, 'test.db'),
                                    False)
        data_store.add_ngrams(corpus, 1, 1)
        output_dir = os.path.join(temp_dir, 'output')
        reporter = lifetime.LifetimeReporter(data_store, catalogue,
                                             self._tokenizer, output_dir)
        reporter.process()
        self._compare_results_dirs(output_dir, expected_dir)
def test_get_texts(self):
    corpus = tacl.Corpus(self._data_dir, self._tokenizer)
    expected_texts = [
        tacl.Text('T1', 'a', 'the we went\n', self._tokenizer),
        tacl.Text('T1', 'base', 'then we went\n', self._tokenizer),
        tacl.Text('T2', 'a', 'thews he sent\n', self._tokenizer),
        tacl.Text('T2', 'base', 'these he sent\n', self._tokenizer),
        tacl.Text('T3', 'base', 'that\n', self._tokenizer),
        tacl.Text('T4', 'base', 'hense\n', self._tokenizer),
        tacl.Text('T5', 'base', 'well\n', self._tokenizer),
    ]
    actual_texts = list(corpus.get_texts())
    actual_texts.sort(key=lambda x: x.get_filename())
    for actual_text, expected_text in zip(actual_texts, expected_texts):
        self.assertEqual(actual_text.get_filename(),
                         expected_text.get_filename())
        message = ('Checksum of {} does not match expected checksum '
                   'from supplied {}'.format(actual_text.get_filename(),
                                             expected_text.get_filename()))
        self.assertEqual(actual_text.get_checksum(),
                         expected_text.get_checksum(), message)
def test_get_texts(self):
    path = '/test'
    name1 = 'T1'
    name2 = 'T2'
    siglum1 = 'base'
    siglum2 = 'a'
    glob = self._create_patch('glob.glob')
    glob.return_value = [
        os.path.join(path, name1, siglum1 + '.txt'),
        os.path.join(path, name1, siglum2 + '.txt'),
        os.path.join(path, name2, siglum1 + '.txt')]
    isfile = self._create_patch('os.path.isfile')
    isfile.return_value = True
    get_text = self._create_patch('tacl.Corpus.get_text')
    get_text.return_value = MagicMock(spec_set=tacl.Text)
    corpus = tacl.Corpus(path, self._tokenizer)
    for text in corpus.get_texts():
        assert isinstance(text, tacl.Text)
    glob.assert_called_once_with(os.path.join(path, '*/*.txt'))
    self.assertEqual(get_text.mock_calls,
                     [call(corpus, name1, siglum1),
                      call(corpus, name1, siglum2),
                      call(corpus, name2, siglum1)])
def _setup(self, actual_dir):
    corpus_dir = os.path.join(actual_dir, 'corpus')
    shutil.copytree(self._corpus_dir, corpus_dir)
    corpus = tacl.Corpus(corpus_dir, self._tokenizer)
    splitter = tacl.Splitter(corpus)
    return splitter, corpus_dir
def test_get_works(self):
    corpus = tacl.Corpus(self._data_dir, self._tokenizer)
    expected_works = ['T1', 'T2', 'T3', 'T4', 'T5']
    actual_works = sorted(corpus.get_works())
    self.assertEqual(actual_works, expected_works)
def test_get_sigla(self):
    corpus = tacl.Corpus(self._data_dir, self._tokenizer)
    actual_sigla = corpus.get_sigla('T1')
    expected_sigla = ['base', 'a']
    self.assertEqual(set(actual_sigla), set(expected_sigla))
def get_corpus(args):
    """Returns a `tacl.Corpus`."""
    tokenizer = get_tokenizer(args)
    return tacl.Corpus(args.corpus, tokenizer)
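# For illustration, a hedged sketch of calling get_corpus() outside
# argparse. types.SimpleNamespace stands in for the parsed arguments;
# both attribute values, including the 'cbeta' tokenizer name, are
# assumptions about what the CLI parser would supply.
def example_get_corpus():
    from types import SimpleNamespace
    args = SimpleNamespace(corpus='corpus/stripped', tokenizer='cbeta')
    return get_corpus(args)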