def test_glossary_extractor(self) -> None:
    t = Tokenizer()
    g = ['New York', 'Shanghai', 'Los Angeles', 'Beijing']
    ge = GlossaryExtractor(g, 'test_glossary', t, 2, False)
    text = 'i live in los angeles. my hometown is Beijing'
    tokens = t.tokenize(text)
    test_result = [i.value for i in ge.extract(tokens)]
    expected = ["Beijing", "Los Angeles"]
    self.assertEqual(test_result, expected)
def test_glossary_extractor(self) -> None:
    t = Tokenizer()
    text = 'i live in los angeles. my hometown is Beijing. I love New York City.'
    tokens = t.tokenize(text)
    ge = GlossaryExtractor(self.glossary_1, 'test_glossary', t, 3, False)
    results = [i.value for i in ge.extract(tokens)]
    expected = ['Beijing', 'los angeles', 'New York']
    self.assertEqual(results, expected)
def test_case_sensitive(self) -> None:
    t = Tokenizer()
    text = 'i live in los angeles. my hometown is Beijing. I love New York City.'
    tokens = t.tokenize(text)
    g = ['Beijing', 'Los Angeles', 'New York', 'Shanghai']
    ge = GlossaryExtractor(g, 'test_glossary', t, 2, True)
    results = [i.value for i in ge.extract(tokens)]
    expected = ['Beijing', 'New York']
    self.assertEqual(results, expected)
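# For reference, a minimal standalone sketch of the call signature the three
# tests above rely on: GlossaryExtractor(glossary, extractor_name, tokenizer,
# ngrams, case_sensitive). The import paths are assumed from the etk package
# layout and may differ between versions; the function name is illustrative.
from etk.tokenizer import Tokenizer
from etk.extractors.glossary_extractor import GlossaryExtractor

def glossary_extraction_sketch():
    tokenizer = Tokenizer()
    extractor = GlossaryExtractor(
        ['Beijing', 'Los Angeles', 'New York', 'Shanghai'],  # glossary entries
        'sketch_glossary',     # extractor_name, recorded on each Extraction
        tokenizer,             # the same tokenizer is used to tokenize the input
        ngrams=2,              # longest phrase to match, in tokens
        case_sensitive=False)  # so 'los angeles' matches 'Los Angeles'
    tokens = tokenizer.tokenize('i live in los angeles.')
    # extract() returns Extraction objects; the matched string is in .value
    return [extraction.value for extraction in extractor.extract(tokens)]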
def __init__(self, etk):
    ETKModule.__init__(self, etk)
    self.name_extractor = GlossaryExtractor(
        self.etk.load_glossary("./names.txt"),
        "name_extractor",
        self.etk.default_tokenizer,
        case_sensitive=False,
        ngrams=1)
    self.student_extractor = GlossaryExtractor(
        self.etk.load_glossary("./student.txt"),
        "student_extractor",
        self.etk.default_tokenizer,
        case_sensitive=False,
        ngrams=1)
def __init__(self, etk):
    ETKModule.__init__(self, etk)
    self.metadata_extractor = HTMLMetadataExtractor()
    self.content_extractor = HTMLContentExtractor()
    self.date_extractor = DateExtractor(self.etk, 'demo_date_parser')
    self.country_extractor = GlossaryExtractor(
        self.etk.load_glossary("${GLOSSARY_PATH}/countries.txt"),
        "country_extractor",
        self.etk.default_tokenizer,
        case_sensitive=False,
        ngrams=3)
    self.cities_extractor = GlossaryExtractor(
        self.etk.load_glossary("${GLOSSARY_PATH}/cities.txt"),
        "cities_extractor",
        self.etk.default_tokenizer,
        case_sensitive=False,
        ngrams=3)
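# A hedged sketch of the process_document step for a module like the one
# above. ETKModule subclasses implement process_document; the '$.text' path,
# the 'countries'/'cities' field names, and storing results with doc.store
# (mirroring segment.store in the provenance test further down) are all
# illustrative assumptions rather than the original module's code.
def process_document(self, doc):
    for text_segment in doc.select_segments("$.text"):
        doc.store(doc.extract(self.country_extractor, text_segment), "countries")
        doc.store(doc.extract(self.cities_extractor, text_segment), "cities")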
def setUp(self):
    self.text = (
        'Napoléon Bonaparte was a French statesman and military leader who rose to prominence during the '
        'French Revolution and led several successful campaigns during the French Revolutionary Wars. '
        'As Napoleon, he was Emperor of the French from 1804 until 1814, and again briefly in 1815 during '
        'the Hundred Days. Napoleon dominated European and global affairs for more than a decade while '
        'leading France against a series of coalitions in the Napoleonic Wars. He won most of these wars '
        'and the vast majority of his battles, building a large empire that ruled over continental Europe '
        'before its final collapse in 1815. He is considered one of the greatest commanders in history, '
        'and his wars and campaigns are studied at military schools worldwide. Napoleon\'s political and '
        'cultural legacy has endured as one of the most celebrated and controversial leaders in human history.')
    extractor = SpacyNerExtractor(extractor_name='spacy_ner_extractor')
    self.results = extractor.extract(self.text)
    glossary_1 = ['Beijing', 'Los Angeles', 'New York', 'Shanghai']
    t = Tokenizer()
    text = 'i live in los angeles. my hometown is Beijing. I love New York City.'
    tokens = t.tokenize(text)
    ge = GlossaryExtractor(glossary_1, 'test_glossary', t, 3, False)
    self.results2 = ge.extract(tokens)
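# A short sketch of how the extractions gathered in setUp might be checked in
# a test body; .value matches the accessor used throughout these tests. The
# glossary expectation mirrors the earlier test with the same configuration,
# while the spaCy check is deliberately loose because NER output varies by
# model version. The test name is illustrative.
def test_extraction_results_sketch(self):
    ner_values = [extraction.value for extraction in self.results]
    self.assertTrue(len(ner_values) > 0)
    glossary_values = [extraction.value for extraction in self.results2]
    self.assertEqual(['Beijing', 'los angeles', 'New York'], glossary_values)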
def __init__(self, etk):
    ETKModule.__init__(self, etk)
    self.my_table_extractor = TableExtractor()
    self.etk.parser = jex.parse
    file_name = '${GLOSSARY_PATH}/cities_ppl_25000.json'
    with open(file_name, 'r') as f:
        self.city_dataset = json.load(f)
    self.city_list = list(self.city_dataset.keys())
    self.my_glossary_extractor = GlossaryExtractor(
        glossary=self.city_list,
        extractor_name='tutorial_glossary',
        tokenizer=etk.default_tokenizer,
        ngrams=3,
        case_sensitive=False)
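# An illustrative helper showing how the glossary extractor built above could
# run over a piece of extracted table text. The method name and its use of
# cell text are assumptions; the tokenize/extract pattern follows the tests
# earlier in this section.
def extract_city_mentions(self, cell_text):
    tokens = self.etk.default_tokenizer.tokenize(cell_text)
    return [extraction.value for extraction in
            self.my_glossary_extractor.extract(tokens)]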
def test_etk_crf_glossary_extraction(self):
    etk = ETK(use_spacy_tokenizer=False)
    s = time.time()
    city_extractor = GlossaryExtractor(
        ['los angeles', 'new york', 'angeles'],
        'city_extractor',
        etk.default_tokenizer,
        case_sensitive=False,
        ngrams=3)
    doc_json = {
        'text': 'i live in los angeles. my hometown is Beijing. I love New York City.'
    }
    doc = Document(etk, cdr_document=doc_json, mime_type='json', url='', doc_id='1')
    t_segments = doc.select_segments("$.text")
    for t_segment in t_segments:
        extracted_cities = doc.extract(city_extractor, t_segment)
        for extracted_city in extracted_cities:
            self.assertIn(extracted_city.value, ['los angeles', 'New York', 'angeles'])
def test_Provenance(self) -> None:
    with open('etk/unit_tests/ground_truth/test_config.json') as f:
        kg_schema = KGSchema(json.load(f))
    self.etk = ETK(kg_schema=kg_schema)
    g = ['runqi', 'sylvia', 'dongyu', 'mayank', 'pedro', 'amandeep', 'yixiang']
    self.name_extractor = GlossaryExtractor(g, "name_extractor",
                                            self.etk.default_tokenizer,
                                            case_sensitive=False, ngrams=1)
    doc = self.etk.create_document(sample_input)
    descriptions = doc.select_segments("projects[*].description")
    projects = doc.select_segments("projects[*]")
    for d, p in zip(descriptions, projects):
        names = doc.extract(self.name_extractor, d)
        p.store(names, "members")
    expected_provenances = [
        {"@id": 0, "@type": "extraction_provenance_record", "method": "name_extractor",
         "confidence": 1.0,
         "origin_record": {"path": "projects.[0].description", "start_char": 33, "end_char": 38}},
        {"@id": 1, "@type": "extraction_provenance_record", "method": "name_extractor",
         "confidence": 1.0,
         "origin_record": {"path": "projects.[0].description", "start_char": 40, "end_char": 46}},
        {"@id": 2, "@type": "extraction_provenance_record", "method": "name_extractor",
         "confidence": 1.0,
         "origin_record": {"path": "projects.[0].description", "start_char": 48, "end_char": 54}},
        {"@id": 3, "@type": "extraction_provenance_record", "method": "name_extractor",
         "confidence": 1.0,
         "origin_record": {"path": "projects.[0].description", "start_char": 56, "end_char": 64}},
        {"@id": 4, "@type": "storage_provenance_record", "doc_id": None, "field": None,
         "destination": "projects.[0].members",
         "parent_provenances": {"Runqi": 0, "Dongyu": 1, "Sylvia": 2, "Amandeep": 3}},
        {"@id": 5, "@type": "extraction_provenance_record", "method": "name_extractor",
         "confidence": 1.0,
         "origin_record": {"path": "projects.[1].description", "start_char": 39, "end_char": 44}},
        {"@id": 6, "@type": "extraction_provenance_record", "method": "name_extractor",
         "confidence": 1.0,
         "origin_record": {"path": "projects.[1].description", "start_char": 46, "end_char": 52}},
        {"@id": 7, "@type": "extraction_provenance_record", "method": "name_extractor",
         "confidence": 1.0,
         "origin_record": {"path": "projects.[1].description", "start_char": 54, "end_char": 61}},
        {"@id": 8, "@type": "storage_provenance_record", "doc_id": None, "field": None,
         "destination": "projects.[1].members",
         "parent_provenances": {"Pedro": 5, "Mayank": 6, "Yixiang": 7}}
    ]
    expected_projects = [
        {"name": "etk",
         "description": "version 2 of etk, implemented by Runqi, Dongyu, Sylvia, Amandeep and others.",
         "members": ["Runqi", "Dongyu", "Sylvia", "Amandeep"]},
        {"name": "rltk",
         "description": "record linkage toolkit, implemented by Pedro, Mayank, Yixiang and several students.",
         "members": ["Pedro", "Mayank", "Yixiang"]}
    ]
    self.assertEqual(expected_projects, doc.value["projects"])
    self.assertEqual(expected_provenances, doc.value["provenances"])
def __init__(self, etk):
    ETKModule.__init__(self, etk)
    self.date_extractor = DateExtractor(self.etk, 'acled_date_parser')
    self.country_extractor = GlossaryExtractor(
        self.etk.load_glossary("${GLOSSARY_PATH}/countries.json.gz", read_json=True),
        "country_extractor",
        self.etk.default_tokenizer,
        case_sensitive=False,
        ngrams=3)
    self.states_extractor = GlossaryExtractor(
        self.etk.load_glossary("${GLOSSARY_PATH}/states_usa_canada.json.gz", read_json=True),
        "states_extractor",
        self.etk.default_tokenizer,
        case_sensitive=False,
        ngrams=3)
    self.cities_extractor = GlossaryExtractor(
        self.etk.load_glossary("${GLOSSARY_PATH}/cities.json.gz", read_json=True),
        "cities_extractor",
        self.etk.default_tokenizer,
        case_sensitive=False,
        ngrams=3)
    self.csv_processor = CsvProcessor(etk=etk, heading_row=1)
    # numeric interaction codes mapped to human-readable actor pairings
    self.interaction_decoding_dict = {
        "10": "Sole Military Action",
        "11": "Military Versus Military",
        "12": "Military Versus Rebels",
        "13": "Military Versus Political Militia",
        "14": "Military Versus Communal Militia",
        "15": "Military Versus Rioters",
        "16": "Military Versus Protesters",
        "17": "Military Versus Civilians",
        "18": "Military Versus Other",
        "20": "Sole Rebel Action",
        "22": "Rebels Versus Rebels",
        "23": "Rebels Versus Political Militia",
        "24": "Rebels Versus Communal Militia",
        "25": "Rebels Versus Rioters",
        "26": "Rebels Versus Protesters",
        "27": "Rebels Versus Civilians",
        "28": "Rebels Versus Other",
        "30": "Sole Political Militia Action",
        "33": "Political Militia Versus Political Militia",
        "34": "Political Militia Versus Communal Militia",
        "35": "Political Militia Versus Rioters",
        "36": "Political Militia Versus Protesters",
        "37": "Political Militia Versus Civilians",
        "38": "Political Militia Versus Other",
        "40": "Sole Communal Militia Action",
        "44": "Communal Militia Versus Communal Militia",
        "45": "Communal Militia Versus Rioters",
        "46": "Communal Militia Versus Protesters",
        "47": "Communal Militia Versus Civilians",
        "48": "Communal Militia Versus Other",
        "50": "Sole Rioter Action",
        "55": "Rioters Versus Rioters",
        "56": "Rioters Versus Protesters",
        "57": "Rioters Versus Civilians",
        "58": "Rioters Versus Other",
        "60": "Sole Protester Action",
        "66": "Protesters Versus Protesters",
        "68": "Protesters Versus Other",
        "78": "Other Actor Versus Civilians",
        "80": "Sole Other Action"
    }
    self.interaction_decoder = DecodingValueExtractor(
        self.interaction_decoding_dict,
        'default_decoding',
        case_sensitive=True)
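# A hedged sketch of using the decoder built above. It assumes
# DecodingValueExtractor.extract takes the raw code string and returns
# Extraction objects like the other extractors here; the helper name and the
# fall-back-to-raw-code behavior are illustrative.
def decode_interaction(self, code):
    extractions = self.interaction_decoder.extract(code)
    # e.g. '10' -> 'Sole Military Action' per the decoding dict above
    return extractions[0].value if extractions else code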
"description": "record linkage toolkit, implemented by Pedro, Mayank, Yixiang and several students." } ], "members": [ { "name": "Dongyu Li", "description": "03/05/2018: I went to USC on Aug 20th, 2016 and will graduate on 2018, May 11. My birthday is 29-04-1994." } ] } etk = ETK() doc = etk.create_document(sample_input) # example for glossary extractor: name_extractor = GlossaryExtractor(etk.load_glossary("./names.txt"), "name_extractor", etk.default_tokenizer, case_sensitive=False, ngrams=1) descriptions = doc.select_segments("projects[*].description") projects = doc.select_segments("projects[*]") for d, p in zip(descriptions, projects): print ("Iam d path: " + d.full_path) names = doc.invoke_extractor(name_extractor, d) p.store_extractions(names, "members") # example for date extractor: date_extractor = DateExtractor('test_date_parser') member_descriptions = doc.select_segments("members[*].description") members = doc.select_segments("members[*]") for m_d, m in zip(member_descriptions, members):