def test_process_with_aria(self):
  """Demonstrates advanced usage considering accessibility."""
  expected_chunks = [
      budou.Chunk(u'今日は', u'NOUN', u'NN', True),
      budou.Chunk(u'晴れ。', u'NOUN', u'ROOT', False)
  ]
  expected_html_code = (
      u'<span aria-describedby="parent" class="text-chunk">今日は</span>'
      u'<span aria-describedby="parent" class="text-chunk">晴れ。</span>')
  result = self.parser.parse(
      DEFAULT_SENTENCE,
      {'aria-describedby': 'parent', 'class': 'text-chunk'},
      use_cache=False)
  self.assertIn(
      'chunks', result, 'Processed result should include chunks.')
  self.assertIn(
      'html_code', result,
      'Processed result should include organized html code.')
  self.assertEqual(
      expected_chunks, result['chunks'],
      'Processed result should include expected chunks.')
  self.assertEqual(
      expected_html_code, result['html_code'],
      'Processed result should include expected html code.')
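# Usage sketch (not part of the test suite): the same call pattern from
# application code. `budou.authenticate` and the credentials path are
# assumptions drawn from the library's documented setup, not from this file.
#
#   parser = budou.authenticate('/path/to/service_account.json')
#   result = parser.parse(u'今日は晴れ。',
#                         {'aria-describedby': 'parent',
#                          'class': 'text-chunk'},
#                         use_cache=False)
#   print(result['html_code'])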
def setUp(self):
  queue = budou.ChunkQueue()
  chunks = [
      budou.Chunk('ab', dependency=None),
      budou.Chunk('cde', dependency=True),
      budou.Chunk('fgh', dependency=False)
  ]
  for chunk in chunks:
    queue.add(chunk)
  self.queue = queue
def test_get_source_chunks(self):
  expected = [
      budou.Chunk(u'今日', u'NOUN', u'NN', True),
      budou.Chunk(u'は', u'PRT', u'PRT', False),
      budou.Chunk(u'晴れ', u'NOUN', u'ROOT', False),
      budou.Chunk(u'。', u'PUNCT', u'P', False),
  ]
  result = self.parser._get_source_chunks(DEFAULT_SENTENCE)
  self.assertEqual(
      expected, result,
      'Input sentence should be processed into source chunks.')
def test_get_chunks_per_space(self):
  source = 'a b'
  expected = [
      budou.Chunk('a', None, None, True),
      budou.Chunk(' ', budou.SPACE_POS, budou.SPACE_POS, True),
      budou.Chunk('b', None, None, True)
  ]
  result = self.parser._get_chunks_per_space(source)
  self.assertEqual(
      result, expected,
      'Input text should be parsed into chunks separated by spaces.')
def test_spanize(self):
  chunks = [
      budou.Chunk(u'a', None, None, None),
      budou.Chunk(u'b', None, None, None),
      budou.Chunk(u'c', None, None, None),
  ]
  classname = 'foo'
  expected = (u'<span class="foo">a</span>'
              '<span class="foo">b</span>'
              '<span class="foo">c</span>')
  result = self.parser._spanize(chunks, classname)
  self.assertEqual(result, expected,
                   'The chunks should be compiled into HTML code.')
def test_concatenate_punctuations(self):
  chunks = [
      budou.Chunk(u'a', None, None, None),
      budou.Chunk(u'b', u'PUNCT', None, None),
      budou.Chunk(u'c', None, None, None),
  ]
  expected_backward_concat = [
      budou.Chunk(u'ab', None, None, None),
      budou.Chunk(u'c', None, None, None),
  ]
  result = self.parser._concatenate_punctuations(chunks)
  self.assertEqual(result, expected_backward_concat,
                   'Punctuation marks should be concatenated backward.')
def test_html_serialize(self):
  chunks = budou.ChunkList([
      budou.Chunk('Hello'),
      budou.Chunk.space(),
      budou.Chunk(u'今天'),
      budou.Chunk(u'天气'),
      budou.Chunk(u'很好')
  ])
  attributes = {'class': 'foo'}
  expected = ('<span>'
              'Hello '
              u'<span class="foo">今天</span>'
              u'<span class="foo">天气</span>'
              u'<span class="foo">很好</span>'
              '</span>')
  result = self.parser._html_serialize(chunks, attributes, None)
  self.assertEqual(result, expected,
                   'The chunks should be compiled into HTML code.')

  chunks = budou.ChunkList([
      budou.Chunk('Hey<'),
      budou.Chunk('<script>alert(1)</script>'),
      budou.Chunk('>guys')
  ])
  attributes = {'class': 'foo'}
  expected = ('<span>'
              'Hey&lt;'
              '&lt;script&gt;alert(1)&lt;/script&gt;'
              '&gt;guys'
              '</span>')
  result = self.parser._html_serialize(chunks, attributes, None)
  self.assertEqual(result, expected,
                   'HTML tags included in a chunk should be encoded.')

  chunks = budou.ChunkList([
      budou.Chunk(u'去年'),
      budou.Chunk(u'インフルエンザに'),
      budou.Chunk(u'かかった。')
  ])
  attributes = {'class': 'foo'}
  expected = ('<span>'
              u'<span class="foo">去年</span>'
              u'インフルエンザに'
              u'<span class="foo">かかった。</span>'
              '</span>')
  result = self.parser._html_serialize(chunks, attributes, 6)
  self.assertEqual(
      result, expected,
      'Chunks that exceed the max length should not be enclosed by a span.')
def test_swap(self):
  old_chunks = self.queue.chunks[0:2]
  new_chunk = budou.Chunk('ijk')
  self.queue.swap(old_chunks, new_chunk)
  expected = ['ijk', 'fgh']
  self.assertEqual(
      expected, [chunk.word for chunk in self.queue.chunks],
      'Old chunks should be replaced with the new chunk.')
def test_migrate_html(self):
  source = u'こ<a>ちらを</a>クリック'
  # `html` here is lxml.html, imported at module level.
  dom = html.fragment_fromstring(source, create_parent='body')
  chunks = [
      budou.Chunk(u'こちら', u'PRON', u'NSUBJ', True),
      budou.Chunk(u'を', u'PRT', u'PRT', False),
      budou.Chunk(u'クリック', u'NOUN', u'ROOT', False),
  ]
  expected = [
      budou.Chunk(u'こ<a>ちらを</a>', budou.HTML_POS, budou.HTML_POS, True),
      budou.Chunk(u'クリック', u'NOUN', u'ROOT', False),
  ]
  result = self.parser._migrate_html(chunks, dom)
  self.assertEqual(
      expected, result,
      'The HTML source code should be migrated into the chunk list.')
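# Takeaway from the expectation above: chunks that straddle an element
# boundary (こちら / を across the <a> tag) are collapsed into a single
# unbreakable HTML_POS chunk, so serialization never splits the markup.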
def test_html_serialize(self):
  chunks = budou.ChunkList([
      budou.Chunk('a'),
      budou.Chunk('b'),
      budou.Chunk.space(),
      budou.Chunk('c')
  ])
  attributes = {'class': 'foo'}
  expected = ('<span>'
              '<span class="foo">a</span>'
              '<span class="foo">b</span> '
              '<span class="foo">c</span>'
              '</span>')
  result = self.parser._html_serialize(chunks, attributes)
  self.assertEqual(result, expected,
                   'The chunks should be compiled into HTML code.')
def test_spanize(self):
  queue = budou.ChunkQueue()
  chunks = [
      budou.Chunk('a'),
      budou.Chunk('b'),
      budou.Chunk.space(),
      budou.Chunk('c'),
  ]
  for chunk in chunks:
    queue.add(chunk)
  attributes = {'class': 'foo'}
  expected = ('<span class="foo">a</span>'
              '<span class="foo">b</span> '
              '<span class="foo">c</span>')
  result = self.parser._spanize(queue, attributes)
  self.assertEqual(result, expected,
                   'The chunks should be compiled into HTML code.')
def test_html_serialize(self):
  chunks = budou.ChunkList([
      budou.Chunk('Hello'),
      budou.Chunk.space(),
      budou.Chunk(u'今天'),
      budou.Chunk(u'天气'),
      budou.Chunk(u'很好')
  ])
  attributes = {'class': 'foo'}
  expected = ('<span>'
              'Hello '
              u'<span class="foo">今天</span>'
              u'<span class="foo">天气</span>'
              u'<span class="foo">很好</span>'
              '</span>')
  result = self.parser._html_serialize(chunks, attributes)
  self.assertEqual(result, expected,
                   'The chunks should be compiled into HTML code.')

  chunks = budou.ChunkList([
      budou.Chunk('Hey<'),
      budou.Chunk('<script>alert(1)</script>'),
      budou.Chunk('>guys')
  ])
  attributes = {'class': 'foo'}
  expected = ('<span>'
              'Hey&lt;'
              '&lt;script&gt;alert(1)&lt;/script&gt;'
              '&gt;guys'
              '</span>')
  result = self.parser._html_serialize(chunks, attributes)
  self.assertEqual(result, expected,
                   'HTML tags included in a chunk should be encoded.')
def test_concatenate_inner(self):
  chunks = budou.ChunkList()
  chunks.append(budou.Chunk('ab', dependency=None))
  chunks.append(budou.Chunk('cde', dependency=True))
  chunks.append(budou.Chunk('fgh', dependency=False))
  chunks = self.parser._concatenate_inner(chunks, True)
  self.assertEqual(
      ['ab', 'cdefgh'], [chunk.word for chunk in chunks],
      'Chunks should be concatenated if they depend on the following word.')
  self.assertEqual(
      [None, False], [chunk.dependency for chunk in chunks],
      'Dependency should persist even after chunks are concatenated.')
  chunks = self.parser._concatenate_inner(chunks, False)
  self.assertEqual(
      ['abcdefgh'], [chunk.word for chunk in chunks],
      'Chunks should be concatenated if they depend on the previous word.')
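# Reading of the dependency flag as exercised above: True means the chunk
# attaches to the *following* chunk ('cde' merges into 'cdefgh'), False
# means it attaches to the *preceding* one ('cdefgh' merges into
# 'abcdefgh'), and None leaves the chunk standalone.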
def test_process(self):
  expected_chunks = [
      budou.Chunk(u'今日は', u'NOUN', u'NN', True),
      budou.Chunk(u'晴れ。', u'NOUN', u'ROOT', False)
  ]
  expected_html_code = (u'<span class="ww">今日は</span>'
                        u'<span class="ww">晴れ。</span>')
  result = self.parser.parse(DEFAULT_SENTENCE)
  self.assertIn('chunks', result,
                'Processed result should include chunks.')
  self.assertIn('html_code', result,
                'Processed result should include organized html code.')
  self.assertEqual(expected_chunks, result['chunks'],
                   'Processed result should include expected chunks.')
  self.assertEqual(
      expected_html_code, result['html_code'],
      'Processed result should include expected html code.')
def test_parse_ja(self):
  """Demonstrates standard usage in Japanese."""
  expected_chunks = [
      budou.Chunk(u'今日は', u'NOUN', u'NN', True),
      budou.Chunk(u'晴れ。', u'NOUN', u'ROOT', False)
  ]
  expected_html_code = (u'<span class="ww">今日は</span>'
                        u'<span class="ww">晴れ。</span>')
  result = self.parser.parse(
      DEFAULT_SENTENCE_JA, language='ja', use_cache=False)
  self.assertEqual(
      expected_chunks, result['chunks'],
      'Processed result should include expected chunks in Japanese.')
  self.assertEqual(
      expected_html_code, result['html_code'],
      'Processed result should include expected html code in Japanese.')
def test_parse_ko(self):
  """Demonstrates standard usage in Korean."""
  expected_chunks = [
      budou.Chunk(u'오늘은', None, None, True),
      budou.Chunk(' ', budou.SPACE_POS, budou.SPACE_POS, True),
      budou.Chunk(u'맑음.', None, None, True)
  ]
  expected_html_code = (u'<span class="ww">오늘은</span> '
                        u'<span class="ww">맑음.</span>')
  result = self.parser.parse(
      DEFAULT_SENTENCE_KO, language='ko', use_cache=False)
  self.assertEqual(
      expected_chunks, result['chunks'],
      'Processed result should include expected chunks in Korean.')
  self.assertEqual(
      expected_html_code, result['html_code'],
      'Processed result should include expected html code in Korean.')
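# As the SPACE_POS chunk above suggests, Korean input appears to be
# segmented at the spaces already present in the text rather than by
# dependency parsing, which is why every chunk arrives with pos and label
# left as None.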
def test_maybe_add_dependency(self):
  chunk = budou.Chunk('foo', label=None)
  chunk.maybe_add_dependency(True)
  self.assertEqual(
      None, chunk.dependency,
      'Dependency should not be added if the chunk does not belong to '
      'dependent labels.')
  chunk = budou.Chunk('foo', label=budou.Chunk.DEPENDENT_LABEL[0])
  chunk.maybe_add_dependency(True)
  self.assertEqual(
      True, chunk.dependency,
      'Dependency should be added if the chunk belongs to dependent labels.')
  chunk = budou.Chunk('foo', label=budou.Chunk.DEPENDENT_LABEL[0])
  chunk.dependency = False
  chunk.maybe_add_dependency(True)
  self.assertEqual(
      False, chunk.dependency,
      'Dependency should not be added if the chunk has dependency already.')
def test_add_dependency_if_punct(self):
  test_characters = [
      u'。', u'、', u'「', u'」', u'(', u')', u'[', u']', u'（', u'）'
  ]
  expected_dependency = [
      False, False, True, False, True, False, True, False, True, False
  ]
  for i, character in enumerate(test_characters):
    # _add_dependency_if_punct is called in __init__ implicitly.
    chunk = budou.Chunk(character, pos='PUNCT')
    self.assertEqual(
        expected_dependency[i], chunk.dependency,
        'Punctuation marks should be assigned proper dependencies.')
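# The pattern in the fixtures above: opening brackets (「, (, [, full-width
# （) get dependency=True, i.e. they attach forward to the text they open,
# while closing brackets and sentence-ending marks (。, 、, 」, ), ],
# full-width ）) get False and attach backward. A standalone spot check
# under that reading:
#
#   assert budou.Chunk(u'「', pos='PUNCT').dependency is True
#   assert budou.Chunk(u'。', pos='PUNCT').dependency is False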
def test_concatenate_by_label(self):
  chunks = [
      budou.Chunk(u'a', None, budou.TARGET_LABEL[0], True),
      budou.Chunk(u'b', None, budou.TARGET_LABEL[1], False),
      budou.Chunk(u'c', None, budou.TARGET_LABEL[2], True),
  ]
  expected_forward_concat = [
      budou.Chunk(u'ab', None, budou.TARGET_LABEL[1], False),
      budou.Chunk(u'c', None, budou.TARGET_LABEL[2], True),
  ]
  result = self.parser._concatenate_by_label(chunks, True)
  self.assertEqual(
      result, expected_forward_concat,
      'Forward directional chunks should be concatenated to following '
      'chunks.')
  expected_backward_concat = [
      budou.Chunk(u'ab', None, budou.TARGET_LABEL[0], True),
      budou.Chunk(u'c', None, budou.TARGET_LABEL[2], True),
  ]
  result = self.parser._concatenate_by_label(chunks, False)
  self.assertEqual(
      result, expected_backward_concat,
      'Backward directional chunks should be concatenated to preceding '
      'chunks.')
def test_group_chunks_by_entities(self):
  # chunks: foo bar baz
  # entity: ___ bar ___
  chunks = budou.ChunkList(
      [budou.Chunk('foo'), budou.Chunk('bar'), budou.Chunk('baz')])
  entities = [{'beginOffset': 3, 'content': 'bar'}]
  expected = ['foo', 'bar', 'baz']
  result = self.parser._group_chunks_by_entities(chunks, entities)
  self.assertEqual(expected, [chunk.word for chunk in result])

  # chunks: foo bar baz
  # entity: foo ba_ ___
  chunks = budou.ChunkList(
      [budou.Chunk('foo'), budou.Chunk('bar'), budou.Chunk('baz')])
  entities = [{'beginOffset': 0, 'content': 'fooba'}]
  expected = ['foobar', 'baz']
  result = self.parser._group_chunks_by_entities(chunks, entities)
  self.assertEqual(expected, [chunk.word for chunk in result])
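# Note on the fixtures: each entity dict carries only the two keys this
# method consults, 'beginOffset' (character offset into the sentence) and
# 'content', presumably mirroring what budou extracts from Cloud Natural
# Language API entity results. An entity that crosses a chunk boundary
# ('fooba' spanning 'foo' and part of 'bar') forces those chunks to merge;
# one aligned to a single chunk leaves the list unchanged.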
def setUp(self):
  chunks = budou.ChunkList()
  chunks.append(budou.Chunk('ab', dependency=None))
  chunks.append(budou.Chunk('cde', dependency=True))
  chunks.append(budou.Chunk('fgh', dependency=False))
  self.chunks = chunks
def test_get_source_chunks(self):
  queue = self.parser._get_source_chunks(DEFAULT_SENTENCE_JA)
  expected = [
      budou.Chunk(u'六本木', label='NN', pos='NOUN', dependency=None),
      budou.Chunk(u'ヒルズ', label='ADVPHMOD', pos='NOUN', dependency=None),
      budou.Chunk(u'で', label='PRT', pos='PRT', dependency=False),
      budou.Chunk(u'、', label='P', pos='PUNCT', dependency=False),
      budou.Chunk(u'「', label='P', pos='PUNCT', dependency=True),
      budou.Chunk(u'ご飯', label='DOBJ', pos='NOUN', dependency=None),
      budou.Chunk(u'」', label='P', pos='PUNCT', dependency=False),
      budou.Chunk(u'を', label='PRT', pos='PRT', dependency=False),
      budou.Chunk(u'食べ', label='ROOT', pos='VERB', dependency=None),
      budou.Chunk(u'ます', label='AUX', pos='VERB', dependency=False),
      budou.Chunk(u'。', label='P', pos='PUNCT', dependency=False)
  ]
  self.assertEqual(
      [chunk.word for chunk in expected],
      [chunk.word for chunk in queue.chunks],
      'Words should match between the input text and the retrieved chunks.')
  self.assertEqual(
      [chunk.dependency for chunk in expected],
      [chunk.dependency for chunk in queue.chunks],
      'Dependencies should match between the input text and the retrieved '
      'chunks.')
def test_get_source_chunks(self):
  tokens = [{
      'dependencyEdge': {'headTokenIndex': 1, 'label': 'NN'},
      'partOfSpeech': {'tag': 'NOUN'},
      'text': {'beginOffset': 0, 'content': u'六本木'}
  }, {
      'dependencyEdge': {'headTokenIndex': 8, 'label': 'ADVPHMOD'},
      'partOfSpeech': {'tag': 'NOUN'},
      'text': {'beginOffset': 3, 'content': u'ヒルズ'}
  }, {
      'dependencyEdge': {'headTokenIndex': 1, 'label': 'PRT'},
      'partOfSpeech': {'tag': 'PRT'},
      'text': {'beginOffset': 6, 'content': u'で'}
  }, {
      'dependencyEdge': {'headTokenIndex': 8, 'label': 'P'},
      'partOfSpeech': {'tag': 'PUNCT'},
      'text': {'beginOffset': 7, 'content': u'、'}
  }, {
      'dependencyEdge': {'headTokenIndex': 5, 'label': 'P'},
      'partOfSpeech': {'tag': 'PUNCT'},
      'text': {'beginOffset': 8, 'content': u'「'}
  }, {
      'dependencyEdge': {'headTokenIndex': 8, 'label': 'DOBJ'},
      'partOfSpeech': {'tag': 'NOUN'},
      'text': {'beginOffset': 9, 'content': u'ご飯'}
  }, {
      'dependencyEdge': {'headTokenIndex': 5, 'label': 'P'},
      'partOfSpeech': {'tag': 'PUNCT'},
      'text': {'beginOffset': 11, 'content': u'」'}
  }, {
      'dependencyEdge': {'headTokenIndex': 5, 'label': 'PRT'},
      'partOfSpeech': {'tag': 'PRT'},
      'text': {'beginOffset': 12, 'content': u'を'}
  }, {
      'dependencyEdge': {'headTokenIndex': 8, 'label': 'ROOT'},
      'partOfSpeech': {'tag': 'VERB'},
      'text': {'beginOffset': 13, 'content': u'食べ'}
  }, {
      'dependencyEdge': {'headTokenIndex': 8, 'label': 'AUX'},
      'partOfSpeech': {'tag': 'VERB'},
      'text': {'beginOffset': 15, 'content': u'ます'}
  }, {
      'dependencyEdge': {'headTokenIndex': 8, 'label': 'P'},
      'partOfSpeech': {'tag': 'PUNCT'},
      'text': {'beginOffset': 17, 'content': u'。'}
  }]
  self.parser._get_annotations = MagicMock(return_value=(tokens, None))
  chunks, _, _ = self.parser._get_source_chunks(u'六本木ヒルズで、「ご飯」を食べます。')
  expected = [
      budou.Chunk(u'六本木', label='NN', pos='NOUN', dependency=None),
      budou.Chunk(u'ヒルズ', label='ADVPHMOD', pos='NOUN', dependency=None),
      budou.Chunk(u'で', label='PRT', pos='PRT', dependency=False),
      budou.Chunk(u'、', label='P', pos='PUNCT', dependency=False),
      budou.Chunk(u'「', label='P', pos='PUNCT', dependency=True),
      budou.Chunk(u'ご飯', label='DOBJ', pos='NOUN', dependency=None),
      budou.Chunk(u'」', label='P', pos='PUNCT', dependency=False),
      budou.Chunk(u'を', label='PRT', pos='PRT', dependency=False),
      budou.Chunk(u'食べ', label='ROOT', pos='VERB', dependency=None),
      budou.Chunk(u'ます', label='AUX', pos='VERB', dependency=False),
      budou.Chunk(u'。', label='P', pos='PUNCT', dependency=False)
  ]
  self.assertEqual(
      [chunk.word for chunk in expected],
      [chunk.word for chunk in chunks],
      'Words should match between the input text and the retrieved chunks.')
  self.assertEqual(
      [chunk.dependency for chunk in expected],
      [chunk.dependency for chunk in chunks],
      'Dependencies should match between the input text and the retrieved '
      'chunks.')
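# Fixture-builder sketch (hypothetical helper, not part of budou or this
# suite): each token above follows the Cloud Natural Language API's
# analyzeSyntax response shape, so fixtures like it can be generated
# compactly instead of written out by hand.
def _token(content, begin_offset, head_index, label, tag):
  """Builds one mocked syntax token in the API's dict shape."""
  return {
      'dependencyEdge': {'headTokenIndex': head_index, 'label': label},
      'partOfSpeech': {'tag': tag},
      'text': {'beginOffset': begin_offset, 'content': content},
  }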
def reset_queue(self):
  chunks = [budou.Chunk('foo'), budou.Chunk('bar'), budou.Chunk('baz')]
  queue = budou.ChunkQueue()
  for chunk in chunks:
    queue.add(chunk)
  return queue
def test_get_source_chunks(self):
  budou.api.get_annotations = MagicMock(
      return_value=self.cases['ja-case1']['tokens'])
  queue = self.parser._get_source_chunks(
      self.cases['ja-case1']['sentence'])
  expected = [
      budou.Chunk(u'六本木', label='NN', pos='NOUN', dependency=None),
      budou.Chunk(u'ヒルズ', label='ADVPHMOD', pos='NOUN', dependency=None),
      budou.Chunk(u'で', label='PRT', pos='PRT', dependency=False),
      budou.Chunk(u'、', label='P', pos='PUNCT', dependency=False),
      budou.Chunk(u'「', label='P', pos='PUNCT', dependency=True),
      budou.Chunk(u'ご飯', label='DOBJ', pos='NOUN', dependency=None),
      budou.Chunk(u'」', label='P', pos='PUNCT', dependency=False),
      budou.Chunk(u'を', label='PRT', pos='PRT', dependency=False),
      budou.Chunk(u'食べ', label='ROOT', pos='VERB', dependency=None),
      budou.Chunk(u'ます', label='AUX', pos='VERB', dependency=False),
      budou.Chunk(u'。', label='P', pos='PUNCT', dependency=False)
  ]
  self.assertEqual(
      [chunk.word for chunk in expected],
      [chunk.word for chunk in queue.chunks],
      'Words should match between the input text and the retrieved chunks.')
  self.assertEqual(
      [chunk.dependency for chunk in expected],
      [chunk.dependency for chunk in queue.chunks],
      'Dependencies should match between the input text and the retrieved '
      'chunks.')