def test_prune_by_ngram_count(self):
    """prune_by_ngram_count must filter results on the total count of
    each n-gram summed across all witnesses: here AB totals 7 and BA
    totals 4 (1 + 3)."""
    rows = (['AB', '2', 'a', 'base', '7', 'A'],
            ['BA', '2', 'a', 'wit', '1', 'A'],
            ['BA', '2', 'b', 'base', '3', 'B'])
    fh = self._create_csv(rows)
    # Minimum bound only: both n-grams total at least 3, so nothing
    # is removed.
    report = tacl.Report(fh, self._tokenizer)
    report.prune_by_ngram_count(minimum=3)
    expected = [('AB', '2', 'a', 'base', '7', 'A'),
                ('BA', '2', 'a', 'wit', '1', 'A'),
                ('BA', '2', 'b', 'base', '3', 'B')]
    actual = self._get_rows_from_csv(
        report.csv(io.StringIO(newline='')))
    self.assertEqual(actual, expected)
    # Maximum bound only: AB (total 7) is pruned.
    fh.seek(0)
    report = tacl.Report(fh, self._tokenizer)
    report.prune_by_ngram_count(maximum=4)
    expected = [('BA', '2', 'a', 'wit', '1', 'A'),
                ('BA', '2', 'b', 'base', '3', 'B')]
    actual = self._get_rows_from_csv(
        report.csv(io.StringIO(newline='')))
    self.assertEqual(actual, expected)
    # Both bounds together: only BA (total 4) falls within [4, 5].
    fh.seek(0)
    report = tacl.Report(fh, self._tokenizer)
    report.prune_by_ngram_count(minimum=4, maximum=5)
    expected = [('BA', '2', 'a', 'wit', '1', 'A'),
                ('BA', '2', 'b', 'base', '3', 'B')]
    actual = self._get_rows_from_csv(
        report.csv(io.StringIO(newline='')))
    self.assertEqual(actual, expected)
def test_zero_fill(self):
    """zero_fill must add a zero-count row for each witness of a
    catalogued text that lacks a result for an n-gram attested in
    another witness of the same text (here T1's "a" witness and T2's
    "base" witness)."""
    tokenizer = tacl.Tokenizer(tacl.constants.TOKENIZER_PATTERN_CBETA,
                               tacl.constants.TOKENIZER_JOINER_CBETA)
    rows = (
        ['AB', '2', 'T1', 'base', '7', 'A'],
        ['AB', '2', 'T2', 'a', '3', 'B'],
        ['ABC', '3', 'T5', 'base', '1', 'A'],
    )
    # The corpus supplies the full set of witnesses for each text.
    stripped_dir = os.path.join(os.path.dirname(__file__),
                                'integration_tests', 'data', 'stripped')
    corpus = tacl.Corpus(stripped_dir, tokenizer)
    report = tacl.Report(self._create_csv(rows), tokenizer)
    catalogue = {'T1': 'A', 'T2': 'B', 'T3': 'C', 'T5': 'A'}
    report.zero_fill(corpus, catalogue)
    actual = self._get_rows_from_csv(
        report.csv(io.StringIO(newline='')))
    expected = [
        ('AB', '2', 'T1', 'base', '7', 'A'),
        ('AB', '2', 'T1', 'a', '0', 'A'),
        ('AB', '2', 'T2', 'a', '3', 'B'),
        ('AB', '2', 'T2', 'base', '0', 'B'),
        ('ABC', '3', 'T5', 'base', '1', 'A'),
    ]
    self.assertEqual(set(actual), set(expected))
def test_sort(self):
    """sort() must order results with the largest n-grams first; the
    tie-breaking within a size (n-gram, then count descending, then
    text and siglum) is pinned by the expected rows below."""
    rows = (['AB', '2', 'a', 'base', '4', 'A'],
            ['AB', '2', 'a', 'wit', '3', 'A'],
            ['ABC', '3', 'a', 'base', '2', 'A'],
            ['ABD', '3', 'a', 'base', '1', 'B'],
            ['ABCD', '4', 'a', 'base', '2', 'B'],
            ['AB', '2', 'b', 'base', '2', 'AB'],
            ['AB', '2', 'b', 'a', '2', 'AB'],
            ['ABC', '3', 'b', 'base', '2', 'AB'],
            ['ABC', '3', 'c', 'base', '3', 'A'])
    report = tacl.Report(self._create_csv(rows), self._tokenizer)
    report.sort()
    expected = [('ABCD', '4', 'a', 'base', '2', 'B'),
                ('ABC', '3', 'c', 'base', '3', 'A'),
                ('ABC', '3', 'a', 'base', '2', 'A'),
                ('ABC', '3', 'b', 'base', '2', 'AB'),
                ('ABD', '3', 'a', 'base', '1', 'B'),
                ('AB', '2', 'a', 'base', '4', 'A'),
                ('AB', '2', 'a', 'wit', '3', 'A'),
                ('AB', '2', 'b', 'a', '2', 'AB'),
                ('AB', '2', 'b', 'base', '2', 'AB')]
    actual = self._get_rows_from_csv(
        report.csv(io.StringIO(newline='')))
    # Order matters here, so compare lists rather than sets.
    self.assertEqual(actual, expected)
def report(args, parser):
    """Run the report sub-command: load results, apply each requested
    transformation in a fixed order, and write the modified results as
    CSV to stdout.

    :param args: parsed command-line arguments
    :param parser: argument parser, used to signal usage errors
    """
    if args.results == '-':
        results = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8',
                                   newline='')
    else:
        results = open(args.results, 'r', encoding='utf-8', newline='')
    try:
        tokenizer = get_tokenizer(args)
        report = tacl.Report(results, tokenizer)
        if args.extend:
            corpus = tacl.Corpus(args.extend, tokenizer)
            report.extend(corpus)
        if args.reduce:
            report.reduce()
        if args.reciprocal:
            report.reciprocal_remove()
        if args.zero_fill:
            # zero_fill needs the catalogue to know each text's label.
            if not args.catalogue:
                parser.error('The zero-fill option requires that the -c option also be supplied.')
            corpus = tacl.Corpus(args.zero_fill, tokenizer)
            catalogue = get_catalogue(args.catalogue)
            report.zero_fill(corpus, catalogue)
        if args.min_texts or args.max_texts:
            report.prune_by_text_count(args.min_texts, args.max_texts)
        if args.min_size or args.max_size:
            report.prune_by_ngram_size(args.min_size, args.max_size)
        if args.min_count or args.max_count:
            report.prune_by_ngram_count(args.min_count, args.max_count)
        if args.remove:
            report.remove_label(args.remove)
        if args.sort:
            report.sort()
        report.csv(sys.stdout)
    finally:
        # Close the results file we opened; leave stdin's wrapper
        # alone so the caller's sys.stdin is not closed underneath it.
        if args.results != '-':
            results.close()
def test_reduce_nan(self):
    """The n-gram "nan" must survive a reduce as the literal string
    "nan", not be interpreted as the float NaN."""
    tokenizer = tacl.Tokenizer(tacl.constants.TOKENIZER_PATTERN_PAGEL,
                               tacl.constants.TOKENIZER_JOINER_PAGEL)
    rows = (['nan', '1', 'text1', 'base', '2', 'A'],
            ['nan dus', '2', 'text1', 'base', '1', 'A'],
            ['pa', '1', 'text2', 'base', '1', 'B'])
    report = tacl.Report(self._create_csv(rows), tokenizer)
    report.reduce()
    expected = (('nan', '1', 'text1', 'base', '1', 'A'),
                ('nan dus', '2', 'text1', 'base', '1', 'A'),
                ('pa', '1', 'text2', 'base', '1', 'B'))
    actual = self._get_rows_from_csv(
        report.csv(io.StringIO(newline='')))
    self.assertEqual(set(actual), set(expected))
def _perform_reduce(self, input_data, tokenizer):
    """Run reduce() over input_data and return the resulting rows.

    :param input_data: rows of raw results to reduce
    :param tokenizer: tokenizer used to build the report
    :return: reduced result rows
    """
    report = tacl.Report(self._create_csv(input_data), tokenizer)
    report.reduce()
    output = report.csv(io.StringIO(newline=''))
    return self._get_rows_from_csv(output)
def test_reciprocal_remove(self):
    """reciprocal_remove must keep only those n-grams that have a
    non-zero count under every label; in each scenario below KLM is
    dropped because one of its occurrences has count 0."""
    # Scenario 1: two labels, one text per label.  Only ABCDEF and
    # GHIJ occur with a non-zero count under both A and B.
    input_data = (['AB', '2', 'a', 'base', '5', 'A'],
                  ['ABCDEF', '6', 'a', 'base', '7', 'A'],
                  ['DEF', '3', 'a', 'base', '2', 'A'],
                  ['GHIJ', '4', 'a', 'base', '3', 'A'],
                  ['KLM', '3', 'b', 'base', '0', 'A'],
                  ['ABCDEF', '6', 'b', 'base', '3', 'B'],
                  ['GHIJ', '4', 'b', 'base', '2', 'B'],
                  ['KLM', '3', 'b', 'base', '17', 'B'])
    fh = self._create_csv(input_data)
    report = tacl.Report(fh, self._tokenizer)
    report.reciprocal_remove()
    expected_rows = [('ABCDEF', '6', 'a', 'base', '7', 'A'),
                     ('GHIJ', '4', 'a', 'base', '3', 'A'),
                     ('ABCDEF', '6', 'b', 'base', '3', 'B'),
                     ('GHIJ', '4', 'b', 'base', '2', 'B')]
    actual_rows = self._get_rows_from_csv(
        report.csv(io.StringIO(newline='')))
    # Row order is not significant, hence the set comparison.
    self.assertEqual(set(actual_rows), set(expected_rows))
    # Scenario 2: more than two labels, and more than one text per
    # label.
    input_data = (['AB', '2', 'a', 'base', '5', 'A'],
                  ['ABCDEF', '6', 'a', 'base', '7', 'A'],
                  ['DEF', '3', 'a', 'base', '2', 'A'],
                  ['AB', '2', 'b', 'base', '6', 'A'],
                  ['GHIJ', '4', 'b', 'base', '3', 'A'],
                  ['KLM', '3', 'b', 'base', '0', 'A'],
                  ['ABCDEF', '6', 'c', 'base', '3', 'B'],
                  ['KLM', '3', 'c', 'base', '17', 'B'],
                  ['GHIJ', '4', 'd', 'base', '2', 'B'],
                  ['KLM', '3', 'e', 'base', '3', 'C'],
                  ['GHIJ', '4', 'f', 'base', '11', 'C'],
                  ['ABCDEF', '6', 'g', 'base', '8', 'C'])
    fh = self._create_csv(input_data)
    report = tacl.Report(fh, self._tokenizer)
    report.reciprocal_remove()
    expected_rows = [('ABCDEF', '6', 'a', 'base', '7', 'A'),
                     ('GHIJ', '4', 'b', 'base', '3', 'A'),
                     ('ABCDEF', '6', 'c', 'base', '3', 'B'),
                     ('GHIJ', '4', 'd', 'base', '2', 'B'),
                     ('GHIJ', '4', 'f', 'base', '11', 'C'),
                     ('ABCDEF', '6', 'g', 'base', '8', 'C')]
    actual_rows = self._get_rows_from_csv(
        report.csv(io.StringIO(newline='')))
    self.assertEqual(set(actual_rows), set(expected_rows))
    # Now with variants.
    # Scenario 3: as scenario 2, but some results come from non-base
    # witnesses (wit1, wit2); those rows must be retained unchanged.
    input_data = (['AB', '2', 'a', 'base', '5', 'A'],
                  ['ABCDEF', '6', 'a', 'wit1', '7', 'A'],
                  ['DEF', '3', 'a', 'base', '2', 'A'],
                  ['AB', '2', 'b', 'base', '6', 'A'],
                  ['GHIJ', '4', 'b', 'base', '3', 'A'],
                  ['KLM', '3', 'b', 'base', '0', 'A'],
                  ['ABCDEF', '6', 'c', 'base', '3', 'B'],
                  ['KLM', '3', 'c', 'base', '17', 'B'],
                  ['GHIJ', '4', 'd', 'base', '2', 'B'],
                  ['KLM', '3', 'e', 'base', '3', 'C'],
                  ['GHIJ', '4', 'f', 'wit2', '11', 'C'],
                  ['ABCDEF', '6', 'g', 'base', '8', 'C'])
    fh = self._create_csv(input_data)
    report = tacl.Report(fh, self._tokenizer)
    report.reciprocal_remove()
    expected_rows = [('ABCDEF', '6', 'a', 'wit1', '7', 'A'),
                     ('GHIJ', '4', 'b', 'base', '3', 'A'),
                     ('ABCDEF', '6', 'c', 'base', '3', 'B'),
                     ('GHIJ', '4', 'd', 'base', '2', 'B'),
                     ('GHIJ', '4', 'f', 'wit2', '11', 'C'),
                     ('ABCDEF', '6', 'g', 'base', '8', 'C')]
    actual_rows = self._get_rows_from_csv(
        report.csv(io.StringIO(newline='')))
    self.assertEqual(set(actual_rows), set(expected_rows))