def test_EmailExtractor(self) -> None:
    kg_schema = KGSchema(
        json.load(open('etk/unit_tests/ground_truth/test_config.json')))
    etk = ETK(kg_schema=kg_schema, use_spacy_tokenizer=True)

    text = "[email protected] [email protected] " \
           "[email protected] [email protected] E-mail:[email protected] [email protected]"

    email_extractor = EmailExtractor(nlp=etk.default_nlp,
                                     tokenizer=etk.default_tokenizer,
                                     extractor_name="email_extractor")

    extractions = email_extractor.extract(text)
    extracted = []
    for i in extractions:
        extracted_value = {
            "value": i.value,
            "start_char": i.provenance["start_char"],
            "end_char": i.provenance["end_char"],
            "value_from_text":
                text[i.provenance["start_char"]:i.provenance["end_char"]]
        }
        extracted.append(extracted_value)
        self.assertEqual(extracted_value["value"],
                         extracted_value["value_from_text"])

    expected = [{
        'value': '*****@*****.**',
        'start_char': 97,
        'end_char': 122,
        'value_from_text': '*****@*****.**'
    }, {
        'value': '*****@*****.**',
        'start_char': 0,
        'end_char': 16,
        'value_from_text': '*****@*****.**'
    }, {
        'value': '*****@*****.**',
        'start_char': 77,
        'end_char': 96,
        'value_from_text': '*****@*****.**'
    }, {
        'value': '*****@*****.**',
        'start_char': 17,
        'end_char': 40,
        'value_from_text': '*****@*****.**'
    }, {
        'value': '*****@*****.**',
        'start_char': 51,
        'end_char': 68,
        'value_from_text': '*****@*****.**'
    }]

    self.assertEqual(sorted(expected, key=lambda x: x["start_char"]),
                     sorted(extracted, key=lambda x: x["start_char"]))
def test_segment(self) -> None:
    kg_schema = KGSchema(
        json.load(open('etk/unit_tests/ground_truth/test_config.json')))
    etk = ETK(kg_schema=kg_schema)
    doc = etk.create_document(sample_input)

    descriptions = doc.select_segments("projects[*].description")
    description_value = [i.value for i in descriptions]

    expected = [
        'version 2 of etk, implemented by Runqi, Dongyu, Sylvia, Amandeep and others.',
        'record linkage toolkit, implemented by Pedro, Mayank, Yixiang and several students.'
    ]
    self.assertEqual(description_value, expected)
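# A sketch (an assumption, not the actual fixture) of the structure sample_input
# has; it is defined elsewhere in this test module. The JSONPath
# "projects[*].description" selects the two description strings from a document
# built over a dict of this shape.
sample_input_sketch = {
    "projects": [{
        "name": "etk",
        "description":
        "version 2 of etk, implemented by Runqi, Dongyu, Sylvia, Amandeep and others."
    }, {
        "name": "rltk",
        "description":
        "record linkage toolkit, implemented by Pedro, Mayank, Yixiang and several students."
    }]
}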
        # Record the country of this actor
        doc.kg.add_value("country", json_path="$.Side")

        # Add a title to the actor document
        doc.kg.add_value("title", json_path="$.Side")

        # Return an empty list because we didn't create new documents
        return []


# The main is for testing, and is not used in the DIG pipeline
if __name__ == "__main__":
    # Tell ETK the schema of the fields in the KG; the DIG master_config can be
    # used as the schema.
    kg_schema = KGSchema(json.load(open('master_config.json')))

    # Instantiate ETK with the two processing modules and the schema.
    etk = ETK(modules=[UCDPModule, UCDPActorModule], kg_schema=kg_schema)

    # Create a CSV processor to create documents for the relevant rows in the
    # Excel sheet
    cp = CsvProcessor(etk=etk, heading_row=1)

    with open("ucdp.jl", "w") as f:
        # Iterate over all the rows in the spreadsheet
        for doc in cp.tabular_extractor(filename="ucdp_sample.xls",
                                        dataset='ucdp'):
            # Each row produces a document, which we send to ETK.
            # Note that each invocation of process_ems will also process any
            # new documents created while processing each doc.
            etk.process_and_frame(doc)
            f.write(json.dumps(doc.cdr_document) + "\n")
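# Each line written to ucdp.jl above is one framed CDR document serialized with
# json.dumps, so the output file is in JSON Lines format (one JSON object per
# line).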
import unittest, json

from etk.csv_processor import CsvProcessor
from etk.etk import ETK
from etk.knowledge_graph_schema import KGSchema

csv_str = """text,with,Polish,non-Latin,lettes
1,2,3,4,5,6
a,b,c,d,e,f
gęś,zółty,wąż,idzie,wąską,dróżką,
,b,c,s,w,f
"""

kg_schema = KGSchema(
    json.load(open('etk/unit_tests/ground_truth/test_config.json')))
etk = ETK(kg_schema=kg_schema)


class TestCsvProcessor(unittest.TestCase):
    def test_csv_str_with_all_args(self) -> None:
        csv_processor = CsvProcessor(etk=etk,
                                     heading_row=1,
                                     content_start_row=2,
                                     heading_columns=(1, 3),
                                     content_end_row=3,
                                     ends_with_blank_row=True,
                                     remove_leading_empty_rows=True,
                                     required_columns=['text'])

        test_docs = [
        if extractions:
            path = '$."' + \
                extractions[0].value + '"[?(@.country == "Italy")]'
            jsonpath_expr = jex.parse(path)
            city_match = jsonpath_expr.find(self.city_dataset)
            if city_match:
                # add corresponding values of city_dataset into the knowledge
                # graph of the doc
                for field in city_match[0].value:
                    doc.kg.add_value(
                        field, value=city_match[0].value[field])
        new_docs.append(doc)
        return new_docs

    def document_selector(self, doc) -> bool:
        return doc.cdr_document.get("dataset") == "italy_team"


if __name__ == "__main__":
    # url = 'https://en.wikipedia.org/wiki/List_of_football_clubs_in_Italy'
    cdr = json.load(
        open('./resources/italy_teams.json', mode='r', encoding='utf-8'))
    kg_schema = KGSchema(json.load(open('./resources/master_config.json')))

    etk = ETK(modules=ItalyTeamsModule, kg_schema=kg_schema)
    etk.parser = jex.parse

    cdr_doc = Document(etk, cdr_document=cdr, mime_type='json', url=cdr['url'])
    results = etk.process_ems(cdr_doc)[1:]

    print('Total docs:', len(results))
    print("Sample result:\n")
    print(json.dumps(results[0].value, indent=2))
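# A minimal sketch of the city lookup performed above, under two assumptions:
# jex is the jsonpath parser this module already uses (its import is not shown
# in this excerpt), and city_dataset maps a city name to a list of per-country
# records. The record contents here are illustrative only.
city_dataset_sketch = {
    "Milan": [
        {"country": "Italy", "region": "Lombardy"},
        {"country": "USA", "region": "Ohio"},
    ]
}
match = jex.parse('$."Milan"[?(@.country == "Italy")]').find(city_dataset_sketch)
if match:
    # Only the Italian record survives the filter; its fields are what the
    # module copies into the document's knowledge graph.
    print(match[0].value)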
def test_Provenance(self) -> None:
    kg_schema = KGSchema(
        json.load(open('etk/unit_tests/ground_truth/test_config.json')))
    self.etk = ETK(kg_schema=kg_schema)

    g = [
        'runqi', 'sylvia', 'dongyu', 'mayank', 'pedro', 'amandeep', 'yixiang'
    ]
    self.name_extractor = GlossaryExtractor(g,
                                            "name_extractor",
                                            self.etk.default_tokenizer,
                                            case_sensitive=False,
                                            ngrams=1)

    doc = self.etk.create_document(sample_input)
    descriptions = doc.select_segments("projects[*].description")
    projects = doc.select_segments("projects[*]")

    for d, p in zip(descriptions, projects):
        names = doc.extract(self.name_extractor, d)
        p.store(names, "members")

    expected_provenances = [{
        "@id": 0,
        "@type": "extraction_provenance_record",
        "method": "name_extractor",
        "confidence": 1.0,
        "origin_record": {
            "path": "projects.[0].description",
            "start_char": 33,
            "end_char": 38
        }
    }, {
        "@id": 1,
        "@type": "extraction_provenance_record",
        "method": "name_extractor",
        "confidence": 1.0,
        "origin_record": {
            "path": "projects.[0].description",
            "start_char": 40,
            "end_char": 46
        }
    }, {
        "@id": 2,
        "@type": "extraction_provenance_record",
        "method": "name_extractor",
        "confidence": 1.0,
        "origin_record": {
            "path": "projects.[0].description",
            "start_char": 48,
            "end_char": 54
        }
    }, {
        "@id": 3,
        "@type": "extraction_provenance_record",
        "method": "name_extractor",
        "confidence": 1.0,
        "origin_record": {
            "path": "projects.[0].description",
            "start_char": 56,
            "end_char": 64
        }
    }, {
        "@id": 4,
        "@type": "storage_provenance_record",
        "doc_id": None,
        "field": None,
        "destination": "projects.[0].members",
        "parent_provenances": {
            "Runqi": 0,
            "Dongyu": 1,
            "Sylvia": 2,
            "Amandeep": 3
        }
    }, {
        "@id": 5,
        "@type": "extraction_provenance_record",
        "method": "name_extractor",
        "confidence": 1.0,
        "origin_record": {
            "path": "projects.[1].description",
            "start_char": 39,
            "end_char": 44
        }
    }, {
        "@id": 6,
        "@type": "extraction_provenance_record",
        "method": "name_extractor",
        "confidence": 1.0,
        "origin_record": {
            "path": "projects.[1].description",
            "start_char": 46,
            "end_char": 52
        }
    }, {
        "@id": 7,
        "@type": "extraction_provenance_record",
        "method": "name_extractor",
        "confidence": 1.0,
        "origin_record": {
            "path": "projects.[1].description",
            "start_char": 54,
            "end_char": 61
        }
    }, {
        "@id": 8,
        "@type": "storage_provenance_record",
        "doc_id": None,
        "field": None,
        "destination": "projects.[1].members",
        "parent_provenances": {
            "Pedro": 5,
            "Mayank": 6,
            "Yixiang": 7
        }
    }]

    expected_projects = [{
        "name": "etk",
        "description":
        "version 2 of etk, implemented by Runqi, Dongyu, Sylvia, Amandeep and others.",
        "members": ["Runqi", "Dongyu", "Sylvia", "Amandeep"]
    }, {
        "name": "rltk",
        "description":
        "record linkage toolkit, implemented by Pedro, Mayank, Yixiang and several students.",
        "members": ["Pedro", "Mayank", "Yixiang"]
    }]

    self.assertEqual(expected_projects, doc.value["projects"])
    self.assertEqual(expected_provenances, doc.value["provenances"])
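# Note on the expected records above: each name matched by the glossary
# extractor yields an extraction_provenance_record with character offsets into
# the project description it came from, and each p.store(names, "members") call
# yields a storage_provenance_record whose parent_provenances maps a stored
# value to the @id of the extraction that produced it (e.g. "Runqi" -> 0 for
# projects[0]).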
        # Country refers to the affiliation; it is mapped to the country of the
        # actor, losing the distinction.
        doc.kg.add_value("country", json_path="$.ActorCountryCode")
        doc.kg.add_value("country",
                         value=doc.extract(
                             self.country_decoder,
                             doc.select_segments("$.ActorCountryCode")[0]))

        # Note: not mapping the Actor Geo codes, because Pedro doesn't
        # understand what they mean.
        return list()


if __name__ == "__main__":
    # Tell ETK the schema of the fields in the KG; the DIG master_config can be
    # used as the schema.
    kg_schema = KGSchema(json.load(open('../events_ucdp/master_config.json')))

    # Instantiate ETK with the two processing modules and the schema.
    etk = ETK(modules=[GdeltModule, GdeltActorModule], kg_schema=kg_schema)

    # Create a CSV processor to create documents for the relevant rows in the
    # TSV file
    cp = CsvProcessor(etk=etk,
                      heading_columns=(1, len(GdeltModule.header_fields)),
                      column_name_prefix="COL")

    with open("gdelt.jl", "w") as f:
        # Iterate over all the rows in the spreadsheet
        for d in cp.tabular_extractor(filename="20170912.export_sample.tsv",
                                      dataset='gdelt'):
            for result in etk.process_ems(d):
                # print(d.cdr_document)
        return doc.cdr_document.get(
            "dataset") == "lake_chad_basin_displaced_victim"

    def process_document(self, doc: Document) -> List[Document]:
        doc.kg.add_value("size", json_path="total")
        doc.kg.add_value("type", json_path="type")
        return list()


if __name__ == "__main__":
    dir_path = sys.argv[1]
    master_config_path = sys.argv[2]
    file_name = 'lake_chad_basin_displaced.csv'
    input_path = os.path.join(dir_path, file_name)
    output_path = os.path.join(dir_path, file_name + '.jl')

    kg_schema = KGSchema(json.load(open(master_config_path)))
    etk = ETK(modules=[
        LakeChadBasinDisplacedModule, LakeChadBasinDisplacedVictimModule,
        LCBPlaceModule
    ],
              kg_schema=kg_schema)
    cp = CsvProcessor(etk=etk, heading_row=1, content_start_row=3)

    with open(output_path, "w") as f:
        print(input_path, output_path)
        for doc in cp.tabular_extractor(filename=input_path,
                                        dataset='lake_chad_basin_displaced'):
            etk.process_and_frame(doc)
            f.write(json.dumps(doc.cdr_document) + "\n")
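# A note on the CsvProcessor arguments above: heading_row=1 takes the column
# names from the first row of the CSV, and content_start_row=3 starts reading
# data at the third row, so the second row is skipped. The layout below is
# illustrative only, not the actual file:
#
#   row 1: total,type,...    <- column names (heading_row=1)
#   row 2: ...               <- skipped
#   row 3: 1200,IDP,...      <- first data row (content_start_row=3)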