Beispiel #1
0
 def test_prune_by_ngram_count(self):
     """Prune results by minimum, maximum, and combined n-gram counts."""
     rows = (
         ['AB', '2', 'a', 'base', '7', 'A'],
         ['BA', '2', 'a', 'wit', '1', 'A'],
         ['BA', '2', 'b', 'base', '3', 'B'],
     )
     fh = self._create_csv(rows)
     # A minimum of 3 keeps every n-gram in this data.
     report = tacl.Report(fh, self._tokenizer)
     report.prune_by_ngram_count(minimum=3)
     expected = [
         ('AB', '2', 'a', 'base', '7', 'A'),
         ('BA', '2', 'a', 'wit', '1', 'A'),
         ('BA', '2', 'b', 'base', '3', 'B'),
     ]
     actual = self._get_rows_from_csv(report.csv(io.StringIO(newline='')))
     self.assertEqual(actual, expected)
     # A maximum of 4 drops the 'AB' n-gram.
     fh.seek(0)
     report = tacl.Report(fh, self._tokenizer)
     report.prune_by_ngram_count(maximum=4)
     expected = [
         ('BA', '2', 'a', 'wit', '1', 'A'),
         ('BA', '2', 'b', 'base', '3', 'B'),
     ]
     actual = self._get_rows_from_csv(report.csv(io.StringIO(newline='')))
     self.assertEqual(actual, expected)
     # Minimum and maximum may be combined.
     fh.seek(0)
     report = tacl.Report(fh, self._tokenizer)
     report.prune_by_ngram_count(minimum=4, maximum=5)
     expected = [
         ('BA', '2', 'a', 'wit', '1', 'A'),
         ('BA', '2', 'b', 'base', '3', 'B'),
     ]
     actual = self._get_rows_from_csv(report.csv(io.StringIO(newline='')))
     self.assertEqual(actual, expected)
Beispiel #2
0
 def test_zero_fill(self):
     """zero_fill adds zero-count rows for witnesses lacking an n-gram."""
     tokenizer = tacl.Tokenizer(tacl.constants.TOKENIZER_PATTERN_CBETA,
                                tacl.constants.TOKENIZER_JOINER_CBETA)
     rows = (
         ['AB', '2', 'T1', 'base', '7', 'A'],
         ['AB', '2', 'T2', 'a', '3', 'B'],
         ['ABC', '3', 'T5', 'base', '1', 'A'],
     )
     stripped_dir = os.path.join(os.path.dirname(__file__),
                                 'integration_tests', 'data', 'stripped')
     corpus = tacl.Corpus(stripped_dir, tokenizer)
     report = tacl.Report(self._create_csv(rows), tokenizer)
     catalogue = {'T1': 'A', 'T2': 'B', 'T3': 'C', 'T5': 'A'}
     report.zero_fill(corpus, catalogue)
     actual = self._get_rows_from_csv(report.csv(io.StringIO(newline='')))
     expected = [
         ('AB', '2', 'T1', 'base', '7', 'A'),
         ('AB', '2', 'T1', 'a', '0', 'A'),
         ('AB', '2', 'T2', 'a', '3', 'B'),
         ('AB', '2', 'T2', 'base', '0', 'B'),
         ('ABC', '3', 'T5', 'base', '1', 'A'),
     ]
     self.assertEqual(set(actual), set(expected))
Beispiel #3
0
 def test_sort(self):
     """sort() orders rows by size, n-gram, count, text, and siglum."""
     rows = (
         ['AB', '2', 'a', 'base', '4', 'A'],
         ['AB', '2', 'a', 'wit', '3', 'A'],
         ['ABC', '3', 'a', 'base', '2', 'A'],
         ['ABD', '3', 'a', 'base', '1', 'B'],
         ['ABCD', '4', 'a', 'base', '2', 'B'],
         ['AB', '2', 'b', 'base', '2', 'AB'],
         ['AB', '2', 'b', 'a', '2', 'AB'],
         ['ABC', '3', 'b', 'base', '2', 'AB'],
         ['ABC', '3', 'c', 'base', '3', 'A'],
     )
     report = tacl.Report(self._create_csv(rows), self._tokenizer)
     report.sort()
     expected = [
         ('ABCD', '4', 'a', 'base', '2', 'B'),
         ('ABC', '3', 'c', 'base', '3', 'A'),
         ('ABC', '3', 'a', 'base', '2', 'A'),
         ('ABC', '3', 'b', 'base', '2', 'AB'),
         ('ABD', '3', 'a', 'base', '1', 'B'),
         ('AB', '2', 'a', 'base', '4', 'A'),
         ('AB', '2', 'a', 'wit', '3', 'A'),
         ('AB', '2', 'b', 'a', '2', 'AB'),
         ('AB', '2', 'b', 'base', '2', 'AB'),
     ]
     actual = self._get_rows_from_csv(report.csv(io.StringIO(newline='')))
     self.assertEqual(actual, expected)
Beispiel #4
0
def report(args, parser):
    """Run the report sub-command.

    Reads results from the path in ``args.results`` (or stdin when that
    is '-'), applies the transformations requested on the command line,
    and writes the transformed results as CSV to stdout.

    :param args: parsed command-line arguments
    :param parser: argument parser, used to signal usage errors

    """
    if args.results == '-':
        results = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8',
                                   newline='')
    else:
        results = open(args.results, 'r', encoding='utf-8', newline='')
    # Ensure the results handle we opened is closed even if an option
    # raises (including parser.error's SystemExit).
    try:
        tokenizer = get_tokenizer(args)
        report = tacl.Report(results, tokenizer)
        if args.extend:
            corpus = tacl.Corpus(args.extend, tokenizer)
            report.extend(corpus)
        if args.reduce:
            report.reduce()
        if args.reciprocal:
            report.reciprocal_remove()
        if args.zero_fill:
            if not args.catalogue:
                parser.error('The zero-fill option requires that the -c '
                             'option also be supplied.')
            corpus = tacl.Corpus(args.zero_fill, tokenizer)
            catalogue = get_catalogue(args.catalogue)
            report.zero_fill(corpus, catalogue)
        if args.min_texts or args.max_texts:
            report.prune_by_text_count(args.min_texts, args.max_texts)
        if args.min_size or args.max_size:
            report.prune_by_ngram_size(args.min_size, args.max_size)
        if args.min_count or args.max_count:
            report.prune_by_ngram_count(args.min_count, args.max_count)
        if args.remove:
            report.remove_label(args.remove)
        if args.sort:
            report.sort()
        report.csv(sys.stdout)
    finally:
        # Only close the file we opened ourselves; closing the stdin
        # wrapper would close sys.stdin for the rest of the process.
        if args.results != '-':
            results.close()
Beispiel #5
0
 def test_reduce_nan(self):
     """The n-gram "nan" must be treated as text, not as the float NaN."""
     tokenizer = tacl.Tokenizer(tacl.constants.TOKENIZER_PATTERN_PAGEL,
                                tacl.constants.TOKENIZER_JOINER_PAGEL)
     rows = (
         ['nan', '1', 'text1', 'base', '2', 'A'],
         ['nan dus', '2', 'text1', 'base', '1', 'A'],
         ['pa', '1', 'text2', 'base', '1', 'B'],
     )
     report = tacl.Report(self._create_csv(rows), tokenizer)
     report.reduce()
     actual = self._get_rows_from_csv(report.csv(io.StringIO(newline='')))
     expected = (
         ('nan', '1', 'text1', 'base', '1', 'A'),
         ('nan dus', '2', 'text1', 'base', '1', 'A'),
         ('pa', '1', 'text2', 'base', '1', 'B'),
     )
     self.assertEqual(set(actual), set(expected))
Beispiel #6
0
 def _perform_reduce(self, input_data, tokenizer):
     """Run reduce() over input_data and return the resulting CSV rows."""
     report = tacl.Report(self._create_csv(input_data), tokenizer)
     report.reduce()
     output = report.csv(io.StringIO(newline=''))
     return self._get_rows_from_csv(output)
Beispiel #7
0
 def test_reciprocal_remove(self):
     """reciprocal_remove keeps only n-grams present under every label."""
     # Two labels, one text per label.
     rows = (
         ['AB', '2', 'a', 'base', '5', 'A'],
         ['ABCDEF', '6', 'a', 'base', '7', 'A'],
         ['DEF', '3', 'a', 'base', '2', 'A'],
         ['GHIJ', '4', 'a', 'base', '3', 'A'],
         ['KLM', '3', 'b', 'base', '0', 'A'],
         ['ABCDEF', '6', 'b', 'base', '3', 'B'],
         ['GHIJ', '4', 'b', 'base', '2', 'B'],
         ['KLM', '3', 'b', 'base', '17', 'B'],
     )
     report = tacl.Report(self._create_csv(rows), self._tokenizer)
     report.reciprocal_remove()
     expected = [
         ('ABCDEF', '6', 'a', 'base', '7', 'A'),
         ('GHIJ', '4', 'a', 'base', '3', 'A'),
         ('ABCDEF', '6', 'b', 'base', '3', 'B'),
         ('GHIJ', '4', 'b', 'base', '2', 'B'),
     ]
     actual = self._get_rows_from_csv(report.csv(io.StringIO(newline='')))
     self.assertEqual(set(actual), set(expected))
     # More than two labels, and more than one text per label.
     rows = (
         ['AB', '2', 'a', 'base', '5', 'A'],
         ['ABCDEF', '6', 'a', 'base', '7', 'A'],
         ['DEF', '3', 'a', 'base', '2', 'A'],
         ['AB', '2', 'b', 'base', '6', 'A'],
         ['GHIJ', '4', 'b', 'base', '3', 'A'],
         ['KLM', '3', 'b', 'base', '0', 'A'],
         ['ABCDEF', '6', 'c', 'base', '3', 'B'],
         ['KLM', '3', 'c', 'base', '17', 'B'],
         ['GHIJ', '4', 'd', 'base', '2', 'B'],
         ['KLM', '3', 'e', 'base', '3', 'C'],
         ['GHIJ', '4', 'f', 'base', '11', 'C'],
         ['ABCDEF', '6', 'g', 'base', '8', 'C'],
     )
     report = tacl.Report(self._create_csv(rows), self._tokenizer)
     report.reciprocal_remove()
     expected = [
         ('ABCDEF', '6', 'a', 'base', '7', 'A'),
         ('GHIJ', '4', 'b', 'base', '3', 'A'),
         ('ABCDEF', '6', 'c', 'base', '3', 'B'),
         ('GHIJ', '4', 'd', 'base', '2', 'B'),
         ('GHIJ', '4', 'f', 'base', '11', 'C'),
         ('ABCDEF', '6', 'g', 'base', '8', 'C'),
     ]
     actual = self._get_rows_from_csv(report.csv(io.StringIO(newline='')))
     self.assertEqual(set(actual), set(expected))
     # Now with variants.
     rows = (
         ['AB', '2', 'a', 'base', '5', 'A'],
         ['ABCDEF', '6', 'a', 'wit1', '7', 'A'],
         ['DEF', '3', 'a', 'base', '2', 'A'],
         ['AB', '2', 'b', 'base', '6', 'A'],
         ['GHIJ', '4', 'b', 'base', '3', 'A'],
         ['KLM', '3', 'b', 'base', '0', 'A'],
         ['ABCDEF', '6', 'c', 'base', '3', 'B'],
         ['KLM', '3', 'c', 'base', '17', 'B'],
         ['GHIJ', '4', 'd', 'base', '2', 'B'],
         ['KLM', '3', 'e', 'base', '3', 'C'],
         ['GHIJ', '4', 'f', 'wit2', '11', 'C'],
         ['ABCDEF', '6', 'g', 'base', '8', 'C'],
     )
     report = tacl.Report(self._create_csv(rows), self._tokenizer)
     report.reciprocal_remove()
     expected = [
         ('ABCDEF', '6', 'a', 'wit1', '7', 'A'),
         ('GHIJ', '4', 'b', 'base', '3', 'A'),
         ('ABCDEF', '6', 'c', 'base', '3', 'B'),
         ('GHIJ', '4', 'd', 'base', '2', 'B'),
         ('GHIJ', '4', 'f', 'wit2', '11', 'C'),
         ('ABCDEF', '6', 'g', 'base', '8', 'C'),
     ]
     actual = self._get_rows_from_csv(report.csv(io.StringIO(newline='')))
     self.assertEqual(set(actual), set(expected))