class TestExtractionsUsingSpacy(unittest.TestCase):
    """Check custom-spaCy name rules against JSON-lines ground-truth files.

    Every test builds a single-rule ``field_rules`` dictionary, runs it over
    the ``text`` of each ground-truth record and compares the extracted
    ``value`` entries with the record's ``extracted`` entry.
    """

    def setUp(self):
        """Create the extractor and load all ground-truth files.

        ``self.ground_truth`` maps a data-set key to the list of JSON objects
        read from the matching ``ground_truth/*.jl`` file (one object per
        line).
        """
        self.c = Core()
        self.ground_truth = dict()
        ground_truth_dir = os.path.join(os.path.dirname(__file__),
                                        "ground_truth")
        ground_truth_files = {
            "name": "name_my_name_1.jl",
            "name_i_am_2": "name_i_am_2.jl",
            "name_name_3": "name_name_3.jl",
            "name_it_is_4": "name_it_is_4.jl",
            "name_this_is_5": "name_this_is_5.jl",
            "name_im_6": "name_im_6.jl",
            "name_its_7": "name_its_7.jl",
            "name_teleph_number_split_8": "name_teleph_number_split_8.jl",
            "name_teleph_number_9": "name_teleph_number_9.jl",
        }
        for extractor, file_name in ground_truth_files.items():
            with open(os.path.join(ground_truth_dir, file_name), 'r') as f:
                # Skip blank lines (e.g. the one produced by a trailing
                # newline) so json.loads is never handed an empty string.
                self.ground_truth[extractor] = [
                    json.loads(line) for line in f if line.strip()
                ]

    def generic_token(self, type="word", token=None, shape=None,
                      capitalization=None, part_of_speech=None, length=None,
                      prefix="", suffix="", is_followed_by_space="",
                      is_required="true", is_in_output="true",
                      is_out_of_vocabulary="", is_in_vocabulary="",
                      contain_digit=""):
        """Return a pattern-token dictionary for a custom spaCy rule.

        List-valued arguments left as ``None`` become a fresh empty list, so
        no list object is shared between the dictionaries of separate calls.
        Note that ``shape`` is stored under the key ``"shapes"``.
        """
        return {
            "type": type,
            "token": [] if token is None else token,
            "shapes": [] if shape is None else shape,
            "capitalization": [] if capitalization is None else capitalization,
            "part_of_speech": [] if part_of_speech is None else part_of_speech,
            "length": [] if length is None else length,
            "prefix": prefix,
            "suffix": suffix,
            "is_followed_by_space": is_followed_by_space,
            "is_required": is_required,
            "is_in_output": is_in_output,
            "is_out_of_vocabulary": is_out_of_vocabulary,
            "is_in_vocabulary": is_in_vocabulary,
            "contain_digit": contain_digit
        }

    def word_token(self, token=None, capitalization=None, part_of_speech=None,
                   length=None, prefix="", suffix="", is_followed_by_space="",
                   is_required="true", is_in_output="false",
                   is_out_of_vocabulary="", is_in_vocabulary="",
                   contain_digit=""):
        """Return a ``"word"`` pattern token (not in the output by default)."""
        return self.generic_token(
            type="word", token=token, capitalization=capitalization,
            part_of_speech=part_of_speech, length=length, prefix=prefix,
            suffix=suffix, is_followed_by_space=is_followed_by_space,
            is_required=is_required, is_in_output=is_in_output,
            is_out_of_vocabulary=is_out_of_vocabulary,
            is_in_vocabulary=is_in_vocabulary, contain_digit=contain_digit)

    def punctuation_token(self, token=None, capitalization=None,
                          part_of_speech=None, length=None, prefix="",
                          suffix="", is_followed_by_space="",
                          is_required="true", is_in_output="false",
                          is_out_of_vocabulary="", is_in_vocabulary="",
                          contain_digit=""):
        """Return a ``"punctuation"`` pattern token (not in the output by default)."""
        return self.generic_token(
            type="punctuation", token=token, capitalization=capitalization,
            part_of_speech=part_of_speech, length=length, prefix=prefix,
            suffix=suffix, is_followed_by_space=is_followed_by_space,
            is_required=is_required, is_in_output=is_in_output,
            is_out_of_vocabulary=is_out_of_vocabulary,
            is_in_vocabulary=is_in_vocabulary, contain_digit=contain_digit)

    def shape_token(self, shape=None, capitalization=None, part_of_speech=None,
                    length=None, prefix="", suffix="", is_followed_by_space="",
                    is_required="true", is_in_output="false",
                    is_out_of_vocabulary="", is_in_vocabulary="",
                    contain_digit=""):
        """Return a ``"shape"`` pattern token (not in the output by default)."""
        return self.generic_token(
            type="shape", shape=shape, capitalization=capitalization,
            part_of_speech=part_of_speech, length=length, prefix=prefix,
            suffix=suffix, is_followed_by_space=is_followed_by_space,
            is_required=is_required, is_in_output=is_in_output,
            is_out_of_vocabulary=is_out_of_vocabulary,
            is_in_vocabulary=is_in_vocabulary, contain_digit=contain_digit)

    def _single_rule(self, identifier, pattern, is_active="true"):
        """Wrap one token pattern in the ``{"rules": [...]}`` envelope."""
        return {
            "rules": [
                {
                    "identifier": identifier,
                    "description": "a description",
                    "is_active": is_active,
                    "polarity": [],
                    "pattern": pattern
                }
            ]
        }

    def _assert_rule_matches_ground_truth(self, ground_truth_key, field_rules):
        """Run ``field_rules`` over one data set and compare with expectations."""
        for t in self.ground_truth[ground_truth_key]:
            crf_tokens = self.c.extract_tokens_from_crf(
                self.c.extract_crftokens(t['text'], lowercase=False))
            # Fresh dictionaries per record, exactly as each test used to do.
            extraction_config = {'field_name': 'my_name_is'}
            d = {'simple_tokens_original_case': crf_tokens}
            extracted_names = self.c.extract_using_custom_spacy(
                d, extraction_config, field_rules=field_rules)
            extracted_names = [name['value'] for name in extracted_names]
            correct_names = t['extracted']
            self.assertEqual(extracted_names, correct_names)

    # 1. my name / names is
    def test_rule_my_name(self):
        # NOTE(review): this is the only rule with "is_active" set to
        # "false"; kept as found -- confirm that it is intentional.
        field_rules = self._single_rule(
            "name_rule_01",
            [
                self.word_token(token=["my"]),
                self.word_token(token=["name", "names"]),
                self.word_token(token=["is"], is_required="false"),
                self.word_token(capitalization=["title", "upper"],
                                is_in_output="true")
            ],
            is_active="false")
        self._assert_rule_matches_ground_truth('name', field_rules)

    # 2. i am
    def test_rule_i_am(self):
        field_rules = self._single_rule(
            "name_rule_02",
            [
                self.word_token(token=["i"]),
                self.word_token(token=["am"]),
                self.word_token(capitalization=["title", "upper"],
                                is_in_output="true")
            ])
        self._assert_rule_matches_ground_truth('name_i_am_2', field_rules)

    # 3. name : Name:
    def test_rule_name_(self):
        field_rules = self._single_rule(
            "name_rule_03",
            [
                self.word_token(token=["name"]),
                self.punctuation_token(token=[":"]),
                self.word_token(token=[], is_in_output="true"),
            ])
        self._assert_rule_matches_ground_truth('name_name_3', field_rules)

    # 4. it is
    def test_rule_it_is(self):
        field_rules = self._single_rule(
            "name_rule_04",
            [
                self.word_token(token=["it"]),
                self.word_token(token=["is"]),
                self.word_token(part_of_speech=["proper noun"],
                                is_in_output="true")
            ])
        self._assert_rule_matches_ground_truth('name_it_is_4', field_rules)

    # 5. this is , This is
    def test_rule_this_is(self):
        field_rules = self._single_rule(
            "name_rule_05",
            [
                self.word_token(token=["this"]),
                self.word_token(token=["is"]),
                self.word_token(part_of_speech=["proper noun"],
                                capitalization=["title", "mixed", "upper"],
                                is_in_output="true")
            ])
        self._assert_rule_matches_ground_truth('name_this_is_5', field_rules)

    # 6. I'm
    def test_rule_Im(self):
        field_rules = self._single_rule(
            "name_rule_06",
            [
                self.word_token(token=["i"]),
                self.punctuation_token(token=["'"]),
                self.word_token(token=["m"]),
                self.word_token(part_of_speech=["proper noun"],
                                capitalization=["title", "mixed", "upper"],
                                is_in_output="true")
            ])
        self._assert_rule_matches_ground_truth('name_im_6', field_rules)

    # 7. it's
    def test_rule_its(self):
        field_rules = self._single_rule(
            "name_rule_07",
            [
                self.word_token(token=["it"]),
                self.punctuation_token(token=["'"]),
                self.word_token(token=["s"]),
                self.word_token(part_of_speech=["proper noun"],
                                capitalization=["title", "mixed", "upper"],
                                is_in_output="true")
            ])
        self._assert_rule_matches_ground_truth('name_its_7', field_rules)

    # 8. Ashley (702)
    def test_rule_teleph_number_split(self):
        field_rules = self._single_rule(
            "name_rule_08",
            [
                self.word_token(capitalization=["title"], is_in_output="true"),
                self.punctuation_token(token=["(", "["]),
                self.shape_token(shape=["ddd"])
            ])
        self._assert_rule_matches_ground_truth('name_teleph_number_split_8',
                                               field_rules)

    # 9. Jessica 7135975313
    def test_rule_teleph_number(self):
        field_rules = self._single_rule(
            "name_rule_09",
            [
                self.word_token(capitalization=["title", "upper", "mixed"],
                                is_in_output="true"),
                self.shape_token(shape=["dddddddddd"])
            ])
        self._assert_rule_matches_ground_truth('name_teleph_number_9',
                                               field_rules)
class TestCustomSpacyNameExtraction(unittest.TestCase):
    """Run nine name-extraction rules over inline sample texts.

    ``setUp`` fills ``self.data`` (text and rules per case) and
    ``self.expected_data`` (expected result count and accepted values per
    case); both are keyed by the strings ``'1'`` to ``'9'``.
    """

    def setUp(self):
        """Create the extractor and build the per-case inputs and expectations."""
        # NOTE(review): word_token / punctuation_token / shape_token are used
        # here as module-level names; they are not defined in this class --
        # confirm they are in scope wherever this module is loaded.
        self.c = Core()
        self.data = dict()

        rule_01 = {
            "identifier": "name_rule_01",
            "description": "my name/names is",
            "is_active": "true",
            "output_format": "{1}",
            "pattern": [
                word_token(token=["my"]),
                word_token(token=["name", "names"]),
                word_token(token=["is"], is_required="false"),
                word_token(part_of_speech=["proper noun"],
                           capitalization=["title", "upper"],
                           is_in_output="true")
            ]
        }
        rule_02 = {
            "identifier": "name_rule_02",
            "description": "i am",
            "is_active": "true",
            "output_format": "{1}",
            "pattern": [
                word_token(token=["i"]),
                word_token(token=["am"]),
                word_token(capitalization=["title", "upper"],
                           is_in_output="true")
            ]
        }
        rule_03 = {
            "identifier": "name_rule_03",
            "description": "name : Sara",
            "is_active": "true",
            "output_format": "{1}",
            "pattern": [
                word_token(token=["name"]),
                punctuation_token(token=[":"]),
                word_token(token=[], is_in_output="true"),
            ]
        }
        rule_04 = {
            "identifier": "name_rule_04",
            "description": "it is Jessicala",
            "is_active": "true",
            "output_format": "{1}",
            "pattern": [
                word_token(token=["it"]),
                word_token(token=["is"]),
                word_token(part_of_speech=["proper noun"],
                           capitalization=["title", "upper"],
                           is_in_output="true")
            ]
        }
        rule_05 = {
            "identifier": "name_rule_05",
            "description": "this is",
            "is_active": "true",
            "output_format": "{1}",
            "pattern": [
                word_token(token=["this"]),
                word_token(token=["is"]),
                word_token(part_of_speech=["proper noun"],
                           capitalization=["title", "upper"],
                           is_in_output="true")
            ]
        }
        rule_06 = {
            "identifier": "name_rule_06",
            "description": "i'm",
            "is_active": "true",
            "output_format": "{1}",
            "pattern": [
                word_token(token=["i"]),
                punctuation_token(token=["'"]),
                word_token(token=["m"]),
                word_token(part_of_speech=["proper noun"],
                           capitalization=["title", "upper"],
                           is_in_output="true")
            ]
        }
        rule_07 = {
            "identifier": "name_rule_07",
            "description": "it's",
            "is_active": "true",
            "output_format": "{1}",
            "pattern": [
                word_token(token=["it"]),
                punctuation_token(token=["'"]),
                word_token(token=["s"]),
                word_token(part_of_speech=["proper noun"],
                           capitalization=["title", "upper"],
                           is_in_output="true")
            ]
        }
        rule_08 = {
            "identifier": "name_rule_08",
            "description": "name followed by telephone number[123]",
            "is_active": "true",
            "output_format": "{1}",
            "pattern": [
                word_token(capitalization=["title"], is_in_output="true"),
                punctuation_token(token=["(", "["]),
                shape_token(shape=["ddd"])
            ]
        }
        rule_09 = {
            "identifier": "name_rule_09",
            "description": "name followed by telephone number 7135975313",
            "is_active": "true",
            "output_format": "{1}",
            "pattern": [
                word_token(capitalization=["title", "upper"],
                           is_in_output="true"),
                shape_token(shape=["dddddddddd"])
            ]
        }

        text_01 = u"Hi Gentlemen, My name is Ashley . my name Monica I am the one and, My names is Alanda"
        text_02 = u"I'm Ashley I'm bored i am Alison, I am Gimly"
        text_03 = u"Name : Sara . I am the one and, Name: JILL , Name:Jessie"
        text_04 = (u"Hello guy's, it's Jessica here from the #@%%% Spa. I cant say the name on here, and it is Jessica, "
                   u"and it is cold")
        text_05 = u"this is Legolas I'm bored This is Danaerys This is AshleyC"
        text_06 = text_02
        text_07 = text_04
        text_08 = u"Ashley (702)628-9035 XOXO . Aslll (702) 628-9035 XOXO Alppp 7026289035"
        text_09 = text_08

        # One row per case: (key, input text, rule, accepted result values).
        cases = [
            ('1', text_01, rule_01, ['Ashley', 'Alanda', 'Monica']),
            ('2', text_02, rule_02, ['Alison', 'Gimly']),
            ('3', text_03, rule_03, ['Sara', 'JILL', 'Jessie']),
            ('4', text_04, rule_04, ['Jessica']),
            ('5', text_05, rule_05, ['Legolas', 'Danaerys']),
            ('6', text_06, rule_06, ['Ashley']),
            ('7', text_07, rule_07, ['Jessica']),
            ('8', text_08, rule_08, ['Ashley', 'Aslll']),
            ('9', text_09, rule_09, ['Alppp']),
        ]

        self.expected_data = dict()
        for key, text, rule, names in cases:
            self.data[key] = {'text': text, 'rules': {"rules": [rule]}}
            # The expected count equals the number of accepted values in
            # every case, so it is derived instead of being repeated by hand.
            self.expected_data[key] = {'length': len(names), 'results': names}

    def test_rules(self):
        """Each case yields the expected number of results, all of them accepted."""
        # Sorted keys give a reproducible order on every Python version.
        for key in sorted(self.data):
            expected = self.expected_data[key]
            d = dict()
            d['text'] = self.data[key]['text']
            d['simple_tokens_original_case'] = self.c.extract_tokens_from_crf(
                self.c.extract_crftokens(d['text'], lowercase=False))
            config = dict()
            config['field_name'] = 'name'
            results = self.c.extract_using_custom_spacy(
                d, config, field_rules=self.data[key]['rules'])
            # assertEqual / assertIn report the offending values on failure,
            # which assertTrue(<bool>) cannot do.
            self.assertEqual(
                len(results), expected['length'],
                "case %s: unexpected number of results: %r" % (key, results))
            for r in results:
                self.assertIn(
                    r['value'], expected['results'],
                    "case %s: unexpected extraction" % key)