class TestExtractionsUsingSpacy(unittest.TestCase): def setUp(self): e_config = { 'data_extraction': [{ 'input_path': 'text.`parent`', 'fields': { "posting_date": { "extractors": { "extract_using_spacy": { "config": {} } } }, "age": { "extractors": { "extract_using_spacy": { "config": {} } } }, "social_media": { "extractors": { "extract_using_spacy": { "config": {} } } }, "address": { "extractors": { "extract_using_spacy": { "config": {} } } } } }] } self.c = Core(extraction_config=e_config, load_spacy=True) self.ground_truth = dict() ground_truth_files = { "age": os.path.join(os.path.dirname(__file__), "ground_truth/age.jl"), "date": os.path.join(os.path.dirname(__file__), "ground_truth/date.jl"), "social_media": os.path.join(os.path.dirname(__file__), "ground_truth/social_media.jl"), "address": os.path.join(os.path.dirname(__file__), "ground_truth/address.jl") } for extractor, file_name in ground_truth_files.items(): with open(file_name, 'r') as f: test_data = f.read().split('\n') self.ground_truth[extractor] = list() for test_case in test_data: self.ground_truth[extractor].append(json.loads(test_case)) def test_spacy_extractions(self): # Date extractor for t in self.ground_truth['date']: crf_tokens = self.c.extract_tokens_from_crf( self.c.extract_crftokens(t['text'])) extraction_config = {'field_name': 'posting_date'} d = {'simple_tokens': crf_tokens} extracted_dates = self.c.extract_using_spacy(d, extraction_config) extracted_dates = [date['value'] for date in extracted_dates] correct_dates = t['extracted'] self.assertEquals(extracted_dates, correct_dates) # Age extractor for t in self.ground_truth['age']: crf_tokens = self.c.extract_tokens_from_crf( self.c.extract_crftokens(t['text'])) extraction_config = {'field_name': 'age'} d = {'simple_tokens': crf_tokens} extracted_ages = self.c.extract_using_spacy(d, extraction_config) extracted_ages = [match['value'] for match in extracted_ages] if len(extracted_ages) == 0 and len(t['correct']) == 0: self.assertFalse(extracted_ages) self.assertEquals(sorted(extracted_ages), sorted(t['correct'])) # Social media extractor for t in self.ground_truth['social_media']: for social_media in t['correct']: t['correct'][social_media] = [ h.lower() for h in t['correct'][social_media] ] crf_tokens = self.c.extract_tokens_from_crf( self.c.extract_crftokens(t['text'])) extraction_config = {'field_name': 'social_media'} d = {'simple_tokens': crf_tokens} extracted_social_media_handles = self.c.extract_using_spacy( d, extraction_config) extracted_handles = dict() for match in extracted_social_media_handles: social_network = match['metadata']['social_network'] if social_network not in extracted_handles: extracted_handles[social_network] = [match['value']] else: extracted_handles[social_network].append(match['value']) if len(extracted_social_media_handles) == 0 and len( t['correct']) == 0: self.assertFalse(extracted_social_media_handles) self.assertEquals(extracted_handles, t['correct']) # Address extractor for t in self.ground_truth['address']: crf_tokens = self.c.extract_tokens_from_crf( self.c.extract_crftokens(t['text'])) extraction_config = {'field_name': 'address'} d = {'simple_tokens': crf_tokens} extracted_addresses = self.c.extract_using_spacy( d, extraction_config) extracted_addresses = [ address['value'] for address in extracted_addresses ] correct_addresses = t['extracted'] self.assertEquals(extracted_addresses, correct_addresses) # Extract using config # Date extractor for t in self.ground_truth['date']: r = self.c.process(t) if 'data_extraction' in r: extracted_dates = [ x['value'] for x in r['data_extraction']['posting_date'] ['extract_using_spacy']['results'] ] else: extracted_dates = [] correct_dates = t['extracted'] self.assertEquals(extracted_dates, correct_dates) # Age extractor for t in self.ground_truth['age']: r = self.c.process(t) if 'data_extraction' in r: extracted_ages = [ x['value'] for x in r['data_extraction']['age'] ['extract_using_spacy']['results'] ] else: extracted_ages = [] self.assertEquals(sorted(extracted_ages), sorted(t['correct'])) # Social media extractor for t in self.ground_truth['social_media']: for social_media in t['correct']: t['correct'][social_media] = [ h.lower() for h in t['correct'][social_media] ] extracted_social_media_handles = self.c.process(t) if 'data_extraction' in extracted_social_media_handles: extracted_social_media_handles = [ x for x in extracted_social_media_handles['data_extraction'] ['social_media']['extract_using_spacy']['results'] ] else: extracted_social_media_handles = [] extracted_handles = dict() for match in extracted_social_media_handles: social_network = match['metadata']['social_network'] if social_network not in extracted_handles: extracted_handles[social_network] = [match['value']] else: extracted_handles[social_network].append(match['value']) if len(extracted_social_media_handles) == 0 and len( t['correct']) == 0: self.assertFalse(extracted_social_media_handles) self.assertEquals(extracted_handles, t['correct']) # Address extractor for t in self.ground_truth['address']: r = self.c.process(t) if 'data_extraction' in r: extracted_addresses = [ x['value'] for x in r['data_extraction']['address'] ['extract_using_spacy']['results'] ] else: extracted_addresses = [] correct_addresses = t['extracted'] self.assertEquals(extracted_addresses, correct_addresses)
class TestExtractionsUsingSpacy(unittest.TestCase): def setUp(self): e_config = { 'data_extraction': [{ 'input_path': 'text.`parent`', 'fields': { "posting_date": { "extractors": { "extract_using_spacy": { "config": {} } } }, "age": { "extractors": { "extract_using_spacy": { "config": {} } } }, "social_media": { "extractors": { "extract_using_spacy": { "config": {} } } }, "address": { "extractors": { "extract_using_spacy": { "config": {} } } }, "email": { "extractors": { "extract_using_spacy": { "config": {} } } } } }] } self.c = Core(extraction_config=e_config, load_spacy=True) self.ground_truth = dict() ground_truth_files = { "age": os.path.join(os.path.dirname(__file__), "ground_truth/age.jl"), "date": os.path.join(os.path.dirname(__file__), "ground_truth/date.jl"), "social_media": os.path.join(os.path.dirname(__file__), "ground_truth/social_media.jl"), "address": os.path.join(os.path.dirname(__file__), "ground_truth/address.jl"), "email": os.path.join(os.path.dirname(__file__), "ground_truth/email.jl") } for extractor, file_name in ground_truth_files.items(): with open(file_name, 'r') as f: test_data = f.read().split('\n') self.ground_truth[extractor] = list() for test_case in test_data: self.ground_truth[extractor].append(json.loads(test_case)) @staticmethod def create_list_from_kg(extractions): results = list() for e in extractions: ps = e['provenance'] if not isinstance(ps, list): ps = [ps] for p in ps: results.append(p['extracted_value']) return results @staticmethod def create_list_from_social_media(extractions): results = dict() for e in extractions: ps = e['provenance'] if not isinstance(ps, list): ps = [ps] for p in ps: x = p['qualifiers']['social_network'] results[x] = [p['extracted_value']] return results def test_spacy_extractions(self): # Date extractor for t in self.ground_truth['date']: crf_tokens = self.c.extract_tokens_from_crf( self.c.extract_crftokens(t['text'])) extraction_config = {'field_name': 'posting_date'} d = {'simple_tokens': crf_tokens} extracted_dates = self.c.extract_using_spacy(d, extraction_config) extracted_dates = [date['value'] for date in extracted_dates] correct_dates = t['extracted'] self.assertEquals(extracted_dates, correct_dates) # Age extractor for t in self.ground_truth['age']: crf_tokens = self.c.extract_tokens_from_crf( self.c.extract_crftokens(t['text'])) extraction_config = {'field_name': 'age'} d = {'simple_tokens': crf_tokens} extracted_ages = self.c.extract_using_spacy(d, extraction_config) extracted_ages = [match['value'] for match in extracted_ages] if len(extracted_ages) == 0 and len(t['correct']) == 0: self.assertFalse(extracted_ages) self.assertEquals(sorted(extracted_ages), sorted(t['correct'])) # Social media extractor for t in self.ground_truth['social_media']: for social_media in t['correct']: t['correct'][social_media] = [ h.lower() for h in t['correct'][social_media] ] crf_tokens = self.c.extract_tokens_from_crf( self.c.extract_crftokens(t['text'])) extraction_config = {'field_name': 'social_media'} d = {'simple_tokens': crf_tokens} extracted_social_media_handles = self.c.extract_using_spacy( d, extraction_config) extracted_handles = dict() for match in extracted_social_media_handles: social_network = match['metadata']['social_network'] if social_network not in extracted_handles: extracted_handles[social_network] = [match['value']] else: extracted_handles[social_network].append(match['value']) if len(extracted_social_media_handles) == 0 and len( t['correct']) == 0: self.assertFalse(extracted_social_media_handles) self.assertEquals(extracted_handles, t['correct']) # Address extractor for t in self.ground_truth['address']: crf_tokens = self.c.extract_tokens_from_crf( self.c.extract_crftokens(t['text'])) extraction_config = {'field_name': 'address'} d = {'simple_tokens': crf_tokens} extracted_addresses = self.c.extract_using_spacy( d, extraction_config) extracted_addresses = [ address['value'] for address in extracted_addresses ] correct_addresses = t['extracted'] self.assertEquals(extracted_addresses, correct_addresses) # Extract using config # Date extractor for t in self.ground_truth['date']: r = self.c.process(t) if 'knowledge_graph' in r: extracted_dates = self.create_list_from_kg( r["knowledge_graph"]['posting_date']) else: extracted_dates = [] correct_dates = t['extracted'] self.assertEquals(extracted_dates, correct_dates) # Age extractor for t in self.ground_truth['age']: r = self.c.process(t) if 'knowledge_graph' in r: extracted_ages = self.create_list_from_kg( r["knowledge_graph"]['age']) else: extracted_ages = [] self.assertEquals(sorted(extracted_ages), sorted(t['correct'])) # Email extractor for t in self.ground_truth['email']: r = self.c.process(t) if 'knowledge_graph' in r: extracted_ages = self.create_list_from_kg( r["knowledge_graph"]['email']) else: extracted_ages = [] self.assertEquals(sorted(extracted_ages), sorted(t['correct'])) # Social media extractor for t in self.ground_truth['social_media']: for social_media in t['correct']: t['correct'][social_media] = [ h.lower() for h in t['correct'][social_media] ] extracted_social_media_handles = self.c.process(t) if 'knowledge_graph' in extracted_social_media_handles: extracted_social_media_handles = self.create_list_from_social_media( extracted_social_media_handles["knowledge_graph"] ['social_media']) else: extracted_social_media_handles = {} if len(extracted_social_media_handles) == 0 and len( t['correct']) == 0: self.assertFalse(extracted_social_media_handles) self.assertEquals(extracted_social_media_handles, t['correct']) # Address extractor for t in self.ground_truth['address']: r = self.c.process(t) if 'knowledge_graph' in r: extracted_addresses = self.create_list_from_kg( r["knowledge_graph"]['address']) else: extracted_addresses = [] correct_addresses = t['extracted'] self.assertEquals(extracted_addresses.sort(), correct_addresses.sort()) def test_spacy_date(self): doc = { "url": "http://date.test.com", "doc_id": "12344", "content_extraction": { "useful_text": { "text": u"Alert: Tue, 2006-02-07" } } } e_config = { "document_id": "doc_id", 'data_extraction': [{ "fields": { "event_date": { "extractors": { "extract_using_spacy": { "config": { "post_filter": "parse_date" } } } } }, "input_path": ["content_extraction.useful_text.text.`parent`"] }] } core = Core(extraction_config=e_config) r = core.process(doc) kg = r['knowledge_graph'] self.assertTrue('event_date' in kg) self.assertEqual(kg['event_date'][0]['value'], '2006-02-07T00:00:00')