Example #1
import os

import tacl
# `utils` below refers to the helper module used by tacl's command-line
# scripts (it provides get_tokenizer, get_corpus and get_ngrams); its exact
# import path depends on the tacl version.


def highlight_text(args, parser):
    """Outputs the result of highlighting a text."""
    tokenizer = utils.get_tokenizer(args)
    corpus = utils.get_corpus(args)
    output_dir = os.path.abspath(args.output)
    if os.path.exists(output_dir):
        parser.exit(status=3,
                    message='Output directory already exists, '
                    'aborting.\n')
    os.makedirs(output_dir, exist_ok=True)
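    # With n-gram files supplied, produce an n-gram highlight report;
    # otherwise highlight the occurrences recorded in a results file.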
    if args.ngrams:
        if args.label is None or len(args.label) != len(args.ngrams):
            parser.error('There must be as many labels as there are files '
                         'of n-grams')
        report = tacl.NgramHighlightReport(corpus, tokenizer)
        ngrams = []
        for ngram_file in args.ngrams:
            ngrams.append(utils.get_ngrams(ngram_file))
        minus_ngrams = []
        if args.minus_ngrams:
            minus_ngrams = utils.get_ngrams(args.minus_ngrams)
        report.generate(args.output, args.base_name, ngrams, args.label,
                        minus_ngrams)
    else:
        report = tacl.ResultsHighlightReport(corpus, tokenizer)
        report.generate(args.output, args.base_name, args.results)
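The handler above expects an argparse Namespace plus the parser itself (used for exit and error reporting). A minimal sketch of how it might be wired up follows; the positional and option names are assumptions for illustration only, not tacl's actual command-line interface.

import argparse

# Hypothetical parser; the real tacl CLI defines its own subcommand and options.
parser = argparse.ArgumentParser(description='Highlight n-grams in a text.')
parser.add_argument('corpus', help='path to the corpus directory')
parser.add_argument('base_name', help='name of the work to highlight')
parser.add_argument('output', help='directory to write the report into')
parser.add_argument('--tokenizer', help='tokenizer to use (e.g. cbeta or pagel)')
parser.add_argument('--ngrams', action='append',
                    help='file of n-grams to highlight (repeatable)')
parser.add_argument('--label', action='append',
                    help='label for the corresponding --ngrams file (repeatable)')
parser.add_argument('--minus-ngrams', dest='minus_ngrams',
                    help='file of n-grams to subtract from the highlighting')
parser.add_argument('--results', help='results file to highlight from')
args = parser.parse_args()
highlight_text(args, parser)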
Example #2
 def test_highlight(self):
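     """Tests that _highlight adds the matching work and siglum
     (t2/base.txt) to the data-texts attribute of every token covered
     by a result n-gram."""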
     input_text = ('<span data-count="0" data-texts=" ">火</span>'
                   '<span data-count="0" data-texts=" ">無</span>'
                   '<span data-count="0" data-texts=" ">[火*因]</span>。'
                   '<span data-count="0" data-texts=" ">是</span>'
                   '<span data-count="0" data-texts=" ">故</span>'
                   '<span data-count="0" data-texts=" ">顯</span>'
                   '<span data-count="0" data-texts=" ">物</span>')
     input_results = pd.DataFrame([{
         tacl.constants.NGRAM_FIELDNAME: '無[火*因]是',
         tacl.constants.SIZE_FIELDNAME: '3',
         tacl.constants.WORK_FIELDNAME: 't2',
         tacl.constants.SIGLUM_FIELDNAME: 'base',
         tacl.constants.COUNT_FIELDNAME: '2',
         tacl.constants.LABEL_FIELDNAME: 'B'
     }])
     report = tacl.ResultsHighlightReport(None, self._tokenizer)
     actual_text = report._highlight(input_text, input_results)
     expected_text = (
         '<span data-count="0" data-texts=" ">火</span>'
         '<span data-count="0" data-texts=" t2/base.txt ">無</span>'
         '<span data-count="0" data-texts=" t2/base.txt ">[火*因]</span>。'
         '<span data-count="0" data-texts=" t2/base.txt ">是</span>'
         '<span data-count="0" data-texts=" ">故</span>'
         '<span data-count="0" data-texts=" ">顯</span>'
         '<span data-count="0" data-texts=" ">物</span>')
     self.assertEqual(actual_text, expected_text)
Example #3
 def test_get_regexp_pattern(self):
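     """Tests that _get_regexp_pattern turns an n-gram into a regexp
     matching each of its tokens inside span markup."""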
     input_ngram = 'ab[cd]e'
     report = tacl.ResultsHighlightReport(None, self._tokenizer)
     actual_pattern = report._get_regexp_pattern(input_ngram)
     expected_pattern = (r'(<span[^>]*>a</span>\W*<span[^>]*>b</span>\W*'
                         r'<span[^>]*>\[cd\]</span>\W*<span[^>]*>e</span>)')
     self.assertEqual(actual_pattern, expected_pattern)
Example #4
 def test_prepare_text_pagel(self):
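     """Tests that _prepare_text wraps each Pagel-tokenized token in a
     count-bearing span while leaving separators and line breaks
     untouched."""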
     input_text = "'dzin dang | snang\n \nba'i"
     expected_text = (
         '''<span data-count="0" data-texts=" ">'dzin</span>'''
         ' <span data-count="0" data-texts=" ">dang</span> |'
         ' <span data-count="0" data-texts=" ">snang</span>\n \n'
         '''<span data-count="0" data-texts=" ">ba'i</span>''')
     tokenizer = tacl.Tokenizer(tacl.constants.TOKENIZER_PATTERN_PAGEL,
                                tacl.constants.TOKENIZER_JOINER_PAGEL)
     report = tacl.ResultsHighlightReport(None, tokenizer)
     actual_text = report._prepare_text(input_text)
     self.assertEqual(actual_text, expected_text)
Example #5
 def test_prepare_text_cbeta(self):
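     """Tests that _prepare_text wraps each CBETA token in a span,
     dropping the stray '<' and keeping the blank line and
     indentation."""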
     input_text = '無[火*因]是<物即\n\n    同如'
     expected_text = ('<span data-count="0" data-texts=" ">無</span>'
                      '<span data-count="0" data-texts=" ">[火*因]</span>'
                      '<span data-count="0" data-texts=" ">是</span>'
                      '<span data-count="0" data-texts=" ">物</span>'
                      '<span data-count="0" data-texts=" ">即</span>\n\n'
                      '    <span data-count="0" data-texts=" ">同</span>'
                      '<span data-count="0" data-texts=" ">如</span>')
     report = tacl.ResultsHighlightReport(None, self._tokenizer)
     actual_text = report._prepare_text(input_text)
     self.assertEqual(actual_text, expected_text)
Example #6
 def test_format_content(self):
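     """Tests that _format_content marks each newline with a <br/> tag
     and converts indentation and multi-space runs to &#160; entities,
     leaving single inter-token spaces alone."""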
     input_text = ('<span data-count="0" data-texts=" ">火</span>'
                   '<span data-count="0" data-texts=" ">無</span>'
                   ' <span data-count="0" data-texts=" ">[火*因]</span>。'
                   '<span data-count="0" data-texts=" ">是</span>'
                   '<span data-count="0" data-texts=" ">故</span>\n\n'
                   '    <span data-count="0" data-texts=" ">顯</span>'
                   '     <span data-count="0" data-texts=" ">物</span>')
     report = tacl.ResultsHighlightReport(None, self._tokenizer)
     actual_output = report._format_content(input_text)
     expected_output = (
         '<span data-count="0" data-texts=" ">火</span>'
         '<span data-count="0" data-texts=" ">無</span>'
         ' <span data-count="0" data-texts=" ">[火*因]</span>。'
         '<span data-count="0" data-texts=" ">是</span>'
         '<span data-count="0" data-texts=" ">故</span><br/>\n<br/>'
         '\n&#160;&#160;&#160;&#160;<span data-count="0" data-texts=" ">'
         '顯</span>&#160;&#160;&#160;&#160;&#160;'
         '<span data-count="0" data-texts=" ">物</span>')
     self.assertEqual(actual_output, expected_output)