def highlight_text(args, parser):
    """Output the result of highlighting a text.

    Builds either an n-gram based or a results based highlight report
    and generates it into a new output directory.

    :param args: parsed command-line arguments; this function reads
        ``output``, ``base_name``, ``ngrams``, ``label``,
        ``minus_ngrams`` and ``results`` from it, and passes the whole
        object to ``utils.get_tokenizer`` and ``utils.get_corpus``
    :param parser: parser used to report problems; ``parser.exit`` is
        called with status 3 if the output directory already exists,
        and ``parser.error`` if the number of labels does not match
        the number of n-gram files

    """
    tokenizer = utils.get_tokenizer(args)
    corpus = utils.get_corpus(args)
    output_dir = os.path.abspath(args.output)
    if os.path.exists(output_dir):
        parser.exit(status=3, message='Output directory already exists, '
                    'aborting.\n')
    # Validate the label/n-gram pairing *before* creating the output
    # directory. Otherwise a usage error leaves an empty directory
    # behind, and the corrected re-run is then refused because the
    # directory "already exists".
    if args.ngrams and (args.label is None or
                        len(args.label) != len(args.ngrams)):
        parser.error('There must be as many labels as there are files '
                     'of n-grams')
    os.makedirs(output_dir, exist_ok=True)
    if args.ngrams:
        report = tacl.NgramHighlightReport(corpus, tokenizer)
        # One collection of n-grams per supplied file, in the same
        # order as the labels.
        ngrams = [utils.get_ngrams(ngram_file) for ngram_file in args.ngrams]
        minus_ngrams = []
        if args.minus_ngrams:
            minus_ngrams = utils.get_ngrams(args.minus_ngrams)
        report.generate(args.output, args.base_name, ngrams, args.label,
                        minus_ngrams)
    else:
        report = tacl.ResultsHighlightReport(corpus, tokenizer)
        report.generate(args.output, args.base_name, args.results)
def test_highlight(self):
    """_highlight records the witness on every span of a matched n-gram.

    The single result row is for the n-gram 無[火*因]是 in t2/base; the
    three spans making up that n-gram are expected to gain
    "t2/base.txt" in their data-texts attribute, while all other
    spans are left untouched.

    """
    prepared_text = ''.join((
        '<span data-count="0" data-texts=" ">火</span>',
        '<span data-count="0" data-texts=" ">無</span>',
        '<span data-count="0" data-texts=" ">[火*因]</span>。',
        '<span data-count="0" data-texts=" ">是</span>',
        '<span data-count="0" data-texts=" ">故</span>',
        '<span data-count="0" data-texts=" ">顯</span>',
        '<span data-count="0" data-texts=" ">物</span>',
    ))
    result_row = {
        tacl.constants.NGRAM_FIELDNAME: '無[火*因]是',
        tacl.constants.SIZE_FIELDNAME: '3',
        tacl.constants.WORK_FIELDNAME: 't2',
        tacl.constants.SIGLUM_FIELDNAME: 'base',
        tacl.constants.COUNT_FIELDNAME: '2',
        tacl.constants.LABEL_FIELDNAME: 'B',
    }
    results = pd.DataFrame([result_row])
    highlighter = tacl.ResultsHighlightReport(None, self._tokenizer)
    highlighted = highlighter._highlight(prepared_text, results)
    expected = ''.join((
        '<span data-count="0" data-texts=" ">火</span>',
        '<span data-count="0" data-texts=" t2/base.txt ">無</span>',
        '<span data-count="0" data-texts=" t2/base.txt ">[火*因]</span>。',
        '<span data-count="0" data-texts=" t2/base.txt ">是</span>',
        '<span data-count="0" data-texts=" ">故</span>',
        '<span data-count="0" data-texts=" ">顯</span>',
        '<span data-count="0" data-texts=" ">物</span>',
    ))
    self.assertEqual(highlighted, expected)
def test_get_regexp_pattern(self):
    """_get_regexp_pattern turns an n-gram into a span-matching regexp.

    For the n-gram 'ab[cd]e' the expected pattern is one capturing
    group holding a span sub-pattern per token (a, b, [cd], e), with
    the square brackets escaped and the sub-patterns separated by
    ``\\W*``.

    """
    ngram = 'ab[cd]e'
    highlighter = tacl.ResultsHighlightReport(None, self._tokenizer)
    pattern = highlighter._get_regexp_pattern(ngram)
    token_patterns = (
        r'<span[^>]*>a</span>',
        r'<span[^>]*>b</span>',
        r'<span[^>]*>\[cd\]</span>',
        r'<span[^>]*>e</span>',
    )
    expected = '(' + r'\W*'.join(token_patterns) + ')'
    self.assertEqual(pattern, expected)
def test_prepare_text_pagel(self):
    """_prepare_text wraps each Pagel token in a span element.

    Uses a tokenizer built from the Pagel pattern and joiner
    constants. Each of the four tokens in the input is expected to be
    wrapped in its own span, with the text between tokens (spaces,
    the "|" and the newlines) carried through unchanged.

    """
    raw_text = "'dzin dang | snang\n \nba'i"
    expected = ''.join((
        '''<span data-count="0" data-texts=" ">'dzin</span>''',
        ' <span data-count="0" data-texts=" ">dang</span> |',
        ' <span data-count="0" data-texts=" ">snang</span>\n \n',
        '''<span data-count="0" data-texts=" ">ba'i</span>''',
    ))
    pagel_tokenizer = tacl.Tokenizer(
        tacl.constants.TOKENIZER_PATTERN_PAGEL,
        tacl.constants.TOKENIZER_JOINER_PAGEL)
    highlighter = tacl.ResultsHighlightReport(None, pagel_tokenizer)
    prepared = highlighter._prepare_text(raw_text)
    self.assertEqual(prepared, expected)
def test_prepare_text_cbeta(self):
    """_prepare_text wraps each token in a span, using self._tokenizer.

    Every token of the input, including the bracketed [火*因], is
    expected to be wrapped in its own span. The "<" present in the
    input does not appear in the expected output, and the newlines
    and the space before 同 are carried through.

    """
    raw_text = '無[火*因]是<物即\n\n 同如'
    expected = ''.join((
        '<span data-count="0" data-texts=" ">無</span>',
        '<span data-count="0" data-texts=" ">[火*因]</span>',
        '<span data-count="0" data-texts=" ">是</span>',
        '<span data-count="0" data-texts=" ">物</span>',
        '<span data-count="0" data-texts=" ">即</span>\n\n',
        ' <span data-count="0" data-texts=" ">同</span>',
        '<span data-count="0" data-texts=" ">如</span>',
    ))
    highlighter = tacl.ResultsHighlightReport(None, self._tokenizer)
    prepared = highlighter._prepare_text(raw_text)
    self.assertEqual(prepared, expected)
def test_format_content(self):
    """_format_content prepares highlighted markup for HTML display.

    In the expected output each newline of the input is preceded by a
    ``<br/>`` element, and the whitespace before the last two spans
    takes the form given in the expected string; the single space
    before the [火*因] span is unchanged.

    """
    highlighter = tacl.ResultsHighlightReport(None, self._tokenizer)
    content = ''.join((
        '<span data-count="0" data-texts=" ">火</span>',
        '<span data-count="0" data-texts=" ">無</span>',
        ' <span data-count="0" data-texts=" ">[火*因]</span>。',
        '<span data-count="0" data-texts=" ">是</span>',
        '<span data-count="0" data-texts=" ">故</span>\n\n',
        ' <span data-count="0" data-texts=" ">顯</span>',
        ' <span data-count="0" data-texts=" ">物</span>',
    ))
    formatted = highlighter._format_content(content)
    expected = ''.join((
        '<span data-count="0" data-texts=" ">火</span>',
        '<span data-count="0" data-texts=" ">無</span>',
        ' <span data-count="0" data-texts=" ">[火*因]</span>。',
        '<span data-count="0" data-texts=" ">是</span>',
        '<span data-count="0" data-texts=" ">故</span><br/>\n<br/>',
        '\n    <span data-count="0" data-texts=" ">',
        '顯</span>     ',
        '<span data-count="0" data-texts=" ">物</span>',
    ))
    self.assertEqual(formatted, expected)