def test_gz_trie_match(self):
    """Check the phrases and offsets a gazetteer-built ``Matcher`` reports.

    Two inputs are run through ``Matcher.matching``.  Both contain a line
    break between tokens, and the expected results include surface forms
    that span that line break.
    """
    taxonomy_matcher = Matcher(gazetteer=self.gz_file)

    def matched_spans(text):
        # Reduce every match to the three fields this test compares.
        return [
            (match.surface_form, match.start_pos, match.end_pos)
            for match in taxonomy_matcher.matching(text)
        ]

    # NOTE(review): the expected offsets presumably index into the input
    # text, so they depend on the exact whitespace of each literal -- confirm.
    self.assertEqual(
        matched_spans('ab Foo bar Foo foo chao foo\nBar foo bar foo foo'),
        [
            ('Foo bar Foo', 3, 13),
            ('chao', 19, 22),
            ('foo\nBar foo', 24, 37),
            ('bar foo foo', 39, 49),
        ],
    )
    self.assertEqual(
        matched_spans('ab Foo Foo foo foo\nBar bar foo foo'),
        [
            ('foo\nBar', 15, 21),
            ('bar foo foo', 27, 37),
        ],
    )
def test_partial_trie_match_braket(self):
    """Expect ``'dutch'`` as the only match when ``(Current)`` follows it.

    The matcher is built from the path of ``self.gz_file`` and stored on the
    test instance as ``self.matcher``.
    """
    self.matcher = Matcher(gazetteer=self.gz_file.name)
    # The backslash is a literal character of the input; it is spelled as an
    # escaped backslash so the literal holds no unrecognized escape sequence.
    # NOTE(review): the expected offsets presumably index into ``text``, so
    # they depend on the exact whitespace of this literal -- confirm.
    text = (
        "November 1954 Place of Birth : Rotterdam Holland Passport : "
        "\\ dutch (Current) Domiciled in NZ : 47 years"
    )
    found = [
        (match.surface_form, match.start_pos, match.end_pos)
        for match in self.matcher.matching(text)
    ]
    self.assertEqual(found, [('dutch', 62, 66)])
def test_partial_trie_match_doc_end(self):
    """Expect ``'foo bar'`` to match in a text that holds nothing else.

    The matcher is built from the path of ``self.gz_file`` and stored on the
    test instance as ``self.matcher``.
    """
    self.matcher = Matcher(gazetteer=self.gz_file.name)
    # NOTE(review): the expected offsets presumably index into ``text``, so
    # they depend on the exact whitespace of this literal -- confirm.
    text = ''' foo bar '''
    found = [
        (match.surface_form, match.start_pos, match.end_pos)
        for match in self.matcher.matching(text)
    ]
    self.assertEqual(found, [('foo bar', 2, 8)])
class ParticalMatcherTestCases(TestCase):
    """Matcher tests that run against a gazetteer written to a temp file."""

    def setUp(self):
        """Write the gazetteer fixture to a temporary file (``self.gz_file``)."""
        # NOTE(review): the separators between gazetteer entries inside this
        # literal should be confirmed against the format Matcher expects.
        gz_content = '''dutch dutch (flemish) foo bar'''
        # delete=False keeps the file on disk once the ``with`` block closes
        # it, so the tests can pass its path to Matcher; tearDown removes it.
        self.gz_file = tempfile.NamedTemporaryFile(mode='w', delete=False)
        with self.gz_file as f:
            f.write(gz_content)

    def test_partial_trie_match_braket(self):
        """Expect ``'dutch'`` as the only match when ``(Current)`` follows it."""
        self.matcher = Matcher(gazetteer=self.gz_file.name)
        # The backslash is a literal character of the input; it is spelled as
        # an escaped backslash so the literal holds no unrecognized escape.
        # NOTE(review): the expected offsets presumably index into ``text``,
        # so they depend on the exact whitespace of this literal -- confirm.
        text = (
            "November 1954 Place of Birth : Rotterdam Holland Passport : "
            "\\ dutch (Current) Domiciled in NZ : 47 years"
        )
        found = [
            (match.surface_form, match.start_pos, match.end_pos)
            for match in self.matcher.matching(text)
        ]
        self.assertEqual(found, [('dutch', 62, 66)])

    def test_partial_trie_match_doc_end(self):
        """Expect ``'foo bar'`` to match in a text that holds nothing else."""
        self.matcher = Matcher(gazetteer=self.gz_file.name)
        # NOTE(review): the expected offsets presumably index into ``text``,
        # so they depend on the exact whitespace of this literal -- confirm.
        text = ''' foo bar '''
        found = [
            (match.surface_form, match.start_pos, match.end_pos)
            for match in self.matcher.matching(text)
        ]
        self.assertEqual(found, [('foo bar', 2, 8)])

    def tearDown(self):
        """Delete the temporary gazetteer file created by ``setUp``."""
        path = pathlib.Path(self.gz_file.name)
        path.unlink()
def test_build_from_gz(self):
    """Compare the token trie built from the gazetteer with a literal one.

    In the expected structure each entry ends in an ``'xxENDxx'`` key whose
    value is a ``(phrase, None)`` pair.
    """
    expected_trie = {
        'abc': {'def': {'fed': {'xxENDxx': ('abc def fed', None)}}},
        'foo': {
            'bar': {
                'xxENDxx': ('foo bar', None),
                'foo': {'xxENDxx': ('foo bar foo', None)},
            },
        },
        'old': {'foo': {'xxENDxx': ('old foo', None)}},
        'new': {'foo': {'xxENDxx': ('new foo', None)}},
        'bar': {'foo': {'foo': {'xxENDxx': ('bar foo foo', None)}}},
        'chao': {'xxENDxx': ('chao', None)},
    }
    taxonomy_matcher = Matcher(gazetteer=self.gz_file)
    built_trie = taxonomy_matcher.trie_matcher.token_trie
    self.assertEqual(built_trie, expected_trie)
def test_likelihood_from_nt(self):
    """Check surface forms and skill likelihoods of normtable-based matches.

    A ``Matcher`` built from ``self.nt_file`` must report five matches for
    the sample text, in the listed order, each with the listed
    ``skill_likelihood`` value.
    """
    taxonomy_matcher = Matcher(normtable=self.nt_file)
    text = (
        'try this: UI programming or Ui Programming, and math tools like '
        'Mathématiques or mathematiques, or mathématiques.'
    )
    # Materialised once because the matches are counted and then read twice.
    matched_phrases = list(taxonomy_matcher.matching(text))
    self.assertEqual(len(matched_phrases), 5)
    self.assertEqual(
        [phrase.surface_form for phrase in matched_phrases],
        [
            'UI programming',
            'Ui Programming',
            'Mathématiques',
            'mathematiques',
            'mathématiques',
        ],
        'extracted phrases',
    )
    self.assertEqual(
        [phrase.skill_likelihood for phrase in matched_phrases],
        [0.5, 0.5, 0.4, 0.4, 0.4],
        'skill_likelihood',
    )
def test_nt_trie_match(self):
    """Check phrases, offsets and descriptions of normtable-based matches.

    Each expected tuple is ``(surface_form, start_pos, end_pos,
    code_description)``.  Against this text the expected ``end_pos`` values
    are the index of the last matched character.
    """
    taxonomy_matcher = Matcher(normtable=self.nt_file)
    # 'Orale' is kept verbatim (presumably a deliberate misspelling): the
    # expected match in that region starts at 'Apache Mahout'.
    # The text ends with a single trailing space.
    text = (
        'A build script is required to do the UI programming or so called '
        'user interface programming, the whole process can be managed by '
        'applying Oracle agile product lifecycle management with the help of '
        'Orale Apache Mahout expert (using Oracle agile PLM and MAHOUT). '
    )
    found = [
        (
            match.surface_form,
            match.start_pos,
            match.end_pos,
            match.code_description,
        )
        for match in taxonomy_matcher.matching(text)
    ]
    self.assertEqual(
        found,
        [
            ('build script', 2, 13, 'Build Script'),
            ('UI programming', 37, 50, 'User Interface Programming'),
            ('user interface programming', 65, 90, 'User Interface Programming'),
            (
                'Oracle agile product lifecycle management',
                138,
                178,
                'Oracle Agile Product Lifecycle Management',
            ),
            ('Apache Mahout', 203, 215, 'Mahout'),
            (
                'Oracle agile PLM',
                231,
                246,
                'Oracle Agile Product Lifecycle Management',
            ),
            ('MAHOUT', 252, 257, 'Mahout'),
        ],
    )
def test_empty_surface_form(self):
    """Building a Matcher from the fixture below must log an ERROR.

    The first captured log line has to contain ``"Empty surface form"``.
    """
    with self.assertLogs(level='ERROR') as captured:
        Matcher("tests/resource/skills-normalization-EN-empty.json")
    first_line = captured.output[0]
    self.assertIn("Empty surface form", first_line)
def test_build_from_nt(self):
    """Compare the token trie built from the normtable with a literal one.

    In the expected structure each entry ends in an ``'xxENDxx'`` key whose
    value is a ``(phrase, code)`` pair.
    """
    expected_trie = {
        'linked': {
            'server': {'xxENDxx': ('linked server', 'KSA8JE6A22KUR2OLU7RG')},
        },
        'build': {
            'script': {'xxENDxx': ('build script', 'KSFVUGQPCSO6RS0X07G8')},
        },
        'user': {
            'interface': {
                'programming': {
                    'xxENDxx': (
                        'user interface programming',
                        'KSHI3HJOWVSR6PGHQ7CA',
                    ),
                },
            },
        },
        'ui': {
            'programming': {
                'xxENDxx': ('ui programming', 'KSHI3HJOWVSR6PGHQ7CA'),
            },
        },
        'oracle': {
            'agile': {
                'product': {
                    'lifecycle': {
                        'management': {
                            'xxENDxx': (
                                'oracle agile product lifecycle management',
                                'KS0W11G2V8ETTQCKOV7S',
                            ),
                        },
                    },
                },
                'plm': {
                    'xxENDxx': ('oracle agile plm', 'KS0W11G2V8ETTQCKOV7S'),
                },
            },
            'apache': {
                'mahout': {
                    'xxENDxx': ('oracle apache mahout', 'KSRT0BE62KLQPF7WZC3O'),
                },
            },
        },
        'apache': {
            'mahout': {
                'xxENDxx': ('apache mahout', 'KSRT0BE62KLQPF7WZC3O'),
            },
        },
        # The stored phrase keeps its capital letter / accent here while the
        # trie key does not.
        'mahout': {'xxENDxx': ('Mahout', 'KSRT0BE62KLQPF7WZC3O')},
        'mathematiques': {
            'xxENDxx': ('mathématiques', 'KS126706DPFD3354M7YK'),
        },
        'c++': {'xxENDxx': ('c++', 'KS126706DPFD3354M7Y0')},
    }
    taxonomy_matcher = Matcher(normtable=self.nt_file)
    built_trie = taxonomy_matcher.trie_matcher.token_trie
    self.assertEqual(built_trie, expected_trie)