def process_ems(self, doc: Document) -> List[Document]:
    """
    Run every applicable ETK module on ``doc``, then recursively process
    any new documents those modules produced.

    Args:
        doc (Document): the document to process.

    Returns:
        A list containing ``doc`` (after KG insertion and house cleaning)
        followed by the results of recursively processing every document
        created by the ETK modules.
    """
    new_docs = list()
    for a_em in self.em_lst:
        if a_em.document_selector(doc):
            self.log(" processing with " + str(type(a_em)) + ". Process",
                     "info", doc.doc_id, doc.url)
            fresh_docs = a_em.process_document(doc)
            # Allow ETKModules to return nothing in lieu of an empty list
            # (people forget to return empty list)
            if fresh_docs:
                new_docs.extend(fresh_docs)

    # Do house cleaning.
    doc.insert_kg_into_cdr()
    if not self.generate_json_ld:
        if "knowledge_graph" in doc.cdr_document:
            doc.cdr_document["knowledge_graph"].pop("@context", None)
    Utility.make_json_serializable(doc.cdr_document)
    if self.output_kg_only:
        doc = doc.kg.value
    elif not doc.doc_id:
        doc.doc_id = Utility.create_doc_id_from_json(doc.cdr_document)

    results = [doc]
    # Depth-first: each child document goes through the full pipeline too.
    for new_doc in new_docs:
        results.extend(self.process_ems(new_doc))
    return results
def build_knowledge_graph(self, json_ontology: dict) -> List:
    """
    Build a knowledge graph from a JSON-like ontology representation, e.g.:

        kg_object_ontology = {
            "uri": doc.doc_id,
            "country": doc.select_segments("$.Location"),
            "type": [
                "Event",
                doc.extract(self.incomp_decoder, doc.select_segments("$.Incomp")[0]),
                doc.extract(self.int_decoder, doc.select_segments("$.Int")[0])
            ]
        }

    Scalar values are added directly to this document's KG (identifier
    fields 'doc_id'/'uri' are skipped). Dict values become nested child
    documents whose ids are linked into the parent's KG.

    Args:
        json_ontology: a json ontology representation of a knowledge graph

    Returns:
        A list of the nested documents created (recursively).
    """
    created_docs = list()
    if not json_ontology:
        return created_docs

    for field, raw_value in list(json_ontology.items()):
        entries = raw_value if isinstance(raw_value, list) else [raw_value]
        for entry in entries:
            if not isinstance(entry, dict):
                # Plain value: add straight to the KG, skipping id fields.
                if self.kg and field not in ('doc_id', 'uri'):
                    self.kg.add_value(field, value=entry)
                continue

            # Dict value: create a nested document, give it a doc_id, and
            # record that id under `field` in the parent's KG.
            if 'uri' in entry:
                child_id = entry['uri']
            elif 'doc_id' in entry:
                child_id = entry['doc_id']
            else:
                child_id = None

            child = Document(self.etk, cdr_document=dict(),
                             mime_type='json', url='')
            created_docs.extend(child.build_knowledge_graph(entry))
            if not child_id:
                # No id supplied: derive one from the child's KG contents.
                child_id = Utility.create_doc_id_from_json(child.kg._kg)
            if self.kg:
                self.kg.add_value(field, value=child_id)
            child.cdr_document["doc_id"] = child_id
            created_docs.append(child)
    return created_docs
def process_document(self, doc: Document) -> List[Document]:
    """
    Turn one Lake Chad Basin displacement record into KG values on `doc`
    plus two nested documents (a place and a victim group), which are
    returned for further ETK processing.
    """
    cdr = doc.cdr_document
    doc.doc_id = Utility.create_doc_id_from_json(cdr)
    event_id = doc.doc_id

    cdr["title"] = "{Total} Displaced from {ReportedLocation} in {Country}".format(
        Total=cdr["Total"],
        ReportedLocation=cdr["ReportedLocation"],
        Country=cdr["Country"])
    cdr["dataset"] = "lake_chad_basin_displaced"

    child_docs = []

    # Nested place document, linked from the event via its uri.
    place_id = '{}_place'.format(event_id)
    place_doc = etk.create_document({
        "uri": place_id,
        "doc_id": place_id,
        "country": cdr.get("Country", ''),
        "dataset": "lcb_place"
    })
    child_docs.append(place_doc)
    doc.kg.add_value("place", value=place_id)

    # Event date: the 'Period' field supplies both start and end.
    period_dates = self.date_extractor.extract(cdr.get('Period', ''))
    doc.kg.add_value("event_date", value=period_dates)
    doc.kg.add_value("event_date_end", value=period_dates)

    doc.kg.add_value("location", json_path="ReportedLocation")
    doc.kg.add_value(
        "causeex_class",
        value=
        "http://ontology.causeex.com/ontology/odps/EventHierarchy#ForcedMove"
    )
    doc.kg.add_value("type", value=["event", "Displacement Event"])
    doc.kg.add_value("title", json_path="title")

    # Nested victim document describing the displaced group.
    victim_id = '{}_victim'.format(event_id)
    victim_doc = etk.create_document({
        "dataset": "lake_chad_basin_displaced_victim",
        "total": cdr["Total"],
        "type": ["Group", "Displaced People"],
        "uri": victim_id
    })
    victim_doc.doc_id = victim_id
    doc.kg.add_value("victim", value=victim_doc.doc_id)
    child_docs.append(victim_doc)
    return child_docs
def process_document(self, doc: Document) -> List[Document]:
    """
    Build the knowledge graph for one UCDP conflict record using the
    JSON-ontology representation, and create one actor document per side.

    Returns the nested documents produced by build_knowledge_graph plus
    the two actor documents.
    """
    # pyexcel produces dicts with date objects, which are not JSON
    # serializable; fix that first.
    Utility.make_json_serializable(doc.cdr_document)

    # Id the event document by its full raw contents.
    doc.doc_id = Utility.create_doc_id_from_json(doc.cdr_document)
    event_id = doc.doc_id
    cdr = doc.cdr_document

    # One small CDR document per conflict side; the "ucdp-actor" dataset
    # tag lets a dedicated ETKModule match them later.
    side_a_doc = etk.create_document({"Side": cdr["SideA"],
                                      "dataset": "ucdp-actor"})
    side_a_doc.doc_id = event_id + "_actor1"
    side_b_doc = etk.create_document({"Side": cdr["SideB"],
                                      "dataset": "ucdp-actor"})
    side_b_doc.doc_id = event_id + "_actor2"

    # Assemble the old-ontology spec piece by piece, keeping the same
    # extract/select order as the original literal.
    place_spec = {
        "uri": event_id + "_place",
        "doc_id": event_id + "_place",
        "country": doc.select_segments("$.Location"),
        "type": ["Place"]
    }
    event_types = [
        "Event",
        doc.extract(self.incomp_decoder, doc.select_segments("$.Incomp")[0]),
        doc.extract(self.int_decoder, doc.select_segments("$.Int")[0])
    ]
    title = "{}/{} armed conflict in {}".format(
        cdr["SideA"], cdr["SideB"], cdr["YEAR"])
    causeex_classes = [
        doc.extract(self.int_causeex_decoder,
                    doc.select_segments("$.Int")[0]),
        self.event_prefix + "ArmedConflict"
    ]
    start_segments = doc.select_segments("$.StartDate")
    end_segments = doc.select_segments("$.EpEndDate")
    fatalities_spec = {
        "uri": event_id + "_fatalities",
        "title": doc.extract(self.int_fatalities_decoder,
                             doc.select_segments("$.Int")[0]),
        "type": ["Group", "Dead People"],
        "min_size": doc.extract(self.int_fatalities_size_lower_decoder,
                                doc.select_segments("$.Int")[0]),
        "max_size": doc.extract(self.int_fatalities_size_upper_decoder,
                                doc.select_segments("$.Int")[0])
    }
    ontology_spec = {
        "uri": event_id,
        "place": place_spec,
        "type": event_types,
        "title": title,
        "causeex_class": causeex_classes,
        "event_date": start_segments,
        "event_date_end": end_segments,
        "fatalities": fatalities_spec,
        "actor": [side_a_doc.doc_id, side_b_doc.doc_id]
    }

    new_docs = list(doc.build_knowledge_graph(ontology_spec))
    # The actor documents are new; they have no extraction module, so they
    # pass through to the output unchanged.
    new_docs.append(side_a_doc)
    new_docs.append(side_b_doc)
    return new_docs
def process_document(self, doc: Document) -> List[Document]:
    """
    Populate the KG of one UCDP conflict record directly (no ontology
    dict), and create actor documents for both sides plus a fatalities
    document. Returns the three new documents.
    """
    cdr = doc.cdr_document

    # pyexcel produces dicts with date objects, which are not JSON
    # serializable; fix that first.
    Utility.make_json_serializable(cdr)

    # Id the event document by its full raw contents.
    doc.doc_id = Utility.create_doc_id_from_json(cdr)

    kg = doc.kg

    # Location maps to country.
    kg.add_value("country", json_path="$.Location")

    # Incomp and Int both map to type via decoding dictionaries.
    kg.add_value("type", value=doc.extract(
        self.incomp_decoder, doc.select_segments("$.Incomp")[0]))
    kg.add_value("type", value=doc.extract(
        self.int_decoder, doc.select_segments("$.Int")[0]))

    # Every document in this dataset is an event.
    kg.add_value("type", value="Event")

    # Human-readable event title.
    kg.add_value("title", value="{}/{} armed conflict in {}".format(
        cdr["SideA"], cdr["SideB"], cdr["YEAR"]))

    # CauseEx ontology classes for this event.
    kg.add_value("causeex_class", value=doc.extract(
        self.int_causeex_decoder, doc.select_segments("$.Int")[0]))
    kg.add_value("causeex_class", value=self.event_prefix + "ArmedConflict")

    # Start/end dates.
    kg.add_value("event_date", json_path="$.StartDate")
    kg.add_value("event_date_end", json_path="$.EpEndDate")

    def make_actor(side_value, suffix):
        # Create a CDR document holding one side of the conflict; the
        # "ucdp-actor" dataset tag lets a dedicated ETKModule match it.
        # The actor id is derived from the event's doc_id, and is recorded
        # in the event's "actor" field.
        actor = etk.create_document({"Side": side_value,
                                     "dataset": "ucdp-actor"})
        actor.doc_id = doc.doc_id + suffix
        kg.add_value("actor", value=actor.doc_id)
        return actor

    actor_a = make_actor(cdr["SideA"], "_actor1")
    actor_b = make_actor(cdr["SideB"], "_actor2")

    # Fatalities information is handled inline rather than in a separate
    # ETK module.
    casualties = etk.create_document({"Int": cdr["Int"]})
    casualties.doc_id = doc.doc_id + "_fatalities"
    kg.add_value("fatalities", value=casualties.doc_id)
    casualties.kg.add_value(
        "title",
        casualties.extract(self.int_fatalities_decoder,
                           casualties.select_segments("$.Int")[0]))
    casualties.kg.add_value("type", value=["Group", "Dead People"])
    casualties.kg.add_value(
        "size_lower_bound",
        value=casualties.extract(self.int_fatalities_size_lower_decoder,
                                 casualties.select_segments("$.Int")[0]))
    casualties.kg.add_value(
        "size_upper_bound",
        value=casualties.extract(self.int_fatalities_size_upper_decoder,
                                 casualties.select_segments("$.Int")[0]))

    # The fatalities document has no extraction module, so it passes to
    # the output unchanged.
    return [actor_a, actor_b, casualties]
def process_document(self, doc: Document) -> List[Document]:
    """
    Convert one GTD (Global Terrorism Database) incident into KG values on
    `doc` plus nested documents: a place, up to 3 victims, up to 3 actors,
    up to 4 weapons, fatality/injury counts, and property damage.

    Returns:
        The list of nested documents created, for further ETK processing.

    Refactor note: the original hand-unrolled victim/actor/weapon/count
    blocks are collapsed into data-driven loops; emitted ids, KG fields and
    document contents are unchanged.
    """
    nested_docs = list()
    json_doc = doc.cdr_document
    filename = json_doc.get('file_name')
    doc.doc_id = Utility.create_doc_id_from_json(json_doc)
    doc.cdr_document['uri'] = doc.doc_id

    def add_child(child_object, suffix, kg_field):
        # Assign the child a doc_id derived from the parent's, link that id
        # in the parent's KG under `kg_field`, and queue it for processing.
        child_doc_id = '{}_{}'.format(doc.doc_id, suffix)
        child_object['uri'] = child_doc_id
        child_doc = etk.create_document(child_object)
        child_doc.doc_id = child_doc_id
        doc.kg.add_value(kg_field, value=child_doc.doc_id)
        nested_docs.append(child_doc)

    doc.kg.add_value("type", value="Event")
    doc.kg.add_value("type", value="Act of Terrorism")
    doc.kg.add_value("provenance_filename", value=filename)

    for attack_type_code in attack_type_fields_code:
        ac = json_doc.get(attack_type_code, '')
        if ac != "":
            doc.kg.add_value("causeex_class", value=doc.extract(
                self.causeex_decoder,
                doc.select_segments("$.{}".format(attack_type_code))[0]))

    # Add event_date to the KG; fall back to the approximate date when the
    # exact iyear/imonth/iday triple does not parse.
    extracted_dates = self.date_extractor.extract('{}-{}-{}'.format(
        json_doc.get('iyear'), json_doc.get('imonth'), json_doc.get('iday')))
    if extracted_dates:
        doc.kg.add_value("event_date", value=extracted_dates)
        doc.kg.add_value("event_date_end", value=extracted_dates)
    else:
        # NOTE(review): 'approxdate' may be absent (None) — the extractor is
        # assumed to tolerate that, as in the original code.
        approximate_date_txt = json_doc.get("approxdate")
        extracted_approx_dates = self.date_extractor.extract(
            approximate_date_txt)
        if extracted_approx_dates:
            doc.kg.add_value("event_date", value=extracted_approx_dates)
            doc.kg.add_value("event_date_end",
                             value=extracted_approx_dates)

    # summary, aka description, only available for incidents after 1997.
    doc.kg.add_value("description", json_path="$.summary")

    # Inclusion criteria: why this incident is regarded as terrorism.
    # TODO: ADD this to master_config
    criteria = (inclusion_criteria_1, inclusion_criteria_2,
                inclusion_criteria_3)
    for idx, criterion in enumerate(criteria, start=1):
        if json_doc.get('crit{}'.format(idx), 0) == 1:
            doc.kg.add_value("inclusion_criteria", value=criterion)

    # Related events (comma-separated ids).
    # TODO: ADD this to master_config
    related_event_ids_txt = json_doc.get('related', '')
    if related_event_ids_txt.strip() != "":
        # split() on a non-empty string always yields at least one element,
        # so no extra emptiness check is needed.
        doc.kg.add_value("related_events",
                         value=related_event_ids_txt.split(','))

    # Attack information; on second thoughts, this qualifies as event type.
    for attack_type_field in attack_type_fields:
        doc.kg.add_value("type", value=json_doc.get(attack_type_field, ''))

    # TODO check the following 2
    if json_doc.get("suicide", 0) == 1:
        doc.kg.add_value("type", value='Suicide')
    if json_doc.get("success", 0) == 1:
        doc.kg.add_value("type", value='Success')

    # Nested place document.
    place_object = {pf: json_doc.get(pf) for pf in place_fields}
    place_object["dataset"] = "gtd_place"
    place_object['filename'] = filename
    add_child(place_object, 'place', 'place')

    # Victim documents — up to 3, one per targtypeN_txt present.
    for i in (1, 2, 3):
        if json_doc.get('targtype{}_txt'.format(i), '').strip():
            victim_object = {
                'dataset': 'gtd_victim',
                'filename': filename,
                'victim_type': [json_doc.get('targtype{}_txt'.format(i))],
            }
            if json_doc.get('targsubtype{}_txt'.format(i), ''):
                victim_object['victim_type'].append(
                    json_doc.get('targsubtype{}_txt'.format(i)))
            victim_object['victim_corp'] = json_doc.get(
                'corp{}'.format(i), '')
            victim_object['victim_target'] = json_doc.get(
                'target{}'.format(i), '')
            victim_object['victim_nationality'] = json_doc.get(
                'natlty{}_txt'.format(i), '')
            add_child(victim_object, 'victim{}'.format(i), 'victim')

    # Actor/perpetrator documents — up to 3. The first group's source
    # fields carry no numeric suffix ('gname'/'gsubname').
    for i, suffix in ((1, ''), (2, '2'), (3, '3')):
        if json_doc.get('gname{}'.format(suffix), '').strip():
            actor_object = {
                'dataset': 'gtd_actor',
                'filename': filename,
                'actor_group': [json_doc.get('gname{}'.format(suffix))],
            }
            if json_doc.get('gsubname{}'.format(suffix), ''):
                actor_object['actor_group'].append(
                    json_doc.get('gsubname{}'.format(suffix)))
            add_child(actor_object, 'actor{}'.format(i), 'actor')

    # Weapon documents — up to 4.
    for i in (1, 2, 3, 4):
        if json_doc.get('weaptype{}_txt'.format(i), '').strip():
            weapon_object = {
                'dataset': 'gtd_weapon',
                'filename': filename,
                'weapon_title': json_doc.get('weapdetail', ''),
                'weapon_type': [json_doc.get('weaptype{}_txt'.format(i))],
            }
            if json_doc.get('weapsubtype{}_txt'.format(i), ''):
                weapon_object['weapon_type'].append(
                    json_doc.get('weapsubtype{}_txt'.format(i)))
            if json_doc.get('weaptype{}'.format(i), '') != '':
                weapon_object['weapon_code'] = json_doc.get(
                    'weaptype{}'.format(i))
            add_child(weapon_object, 'weapons{}'.format(i), 'weapons')

    # Fatality / injury count documents. A missing field defaults to 0 and
    # still produces a document (only the literal "" suppresses it), as in
    # the original. NOTE(review): 'fatalitites' is misspelled in the emitted
    # doc ids; preserved deliberately so existing downstream ids still match.
    count_specs = (
        ("nkill", "gtd_fatality", "total_fatalitites", "fatalities", None),
        ("nkillus", "gtd_fatality", "us_fatalitites", "fatalities",
         "United States"),
        ("nwound", "gtd_injury", "total_injuries", "injuries", None),
        ("nwoundus", "gtd_injury", "us_injuries", "injuries",
         "United States"),
    )
    for field, dataset, suffix, kg_field, nationality in count_specs:
        size = json_doc.get(field, 0)
        if size != "":
            count_object = {"dataset": dataset, 'filename': filename,
                            "size": size}
            if nationality is not None:
                count_object["nationality"] = nationality
            add_child(count_object, suffix, kg_field)

    # Damage document — this dataset only records property damage.
    if json_doc.get("property", 0) == 1:
        damage_object = {
            "dataset": "gtd_damage",
            'filename': filename,
            "damage_title": json_doc.get("propextent_txt"),
            "damage_value": json_doc.get("propvalue"),
            "damage_description": json_doc.get("propcomment"),
        }
        add_child(damage_object, 'damage', 'damage')

    return nested_docs
def process_document(self, doc):
    """
    Extract nutrition 'factoid' documents from the spreadsheet at
    doc.cdr_document['file_path'] (sheet 'USERDATABASE', cells F3:AG971).

    Each valid cell (row code matching NN_NNN) becomes one factoid document
    with a numeric value, a unit parsed from the column header, and
    provenance metadata; all such documents are returned.

    Fixes over the original: the bare ``except`` around float() is narrowed
    to (TypeError, ValueError), and the 'Energy (kcal) kJ' branch no longer
    crashes when the cell has no bracketed value.
    """
    # extraction
    variables = {
        'value': '$col,$row',
        'food_name_in_english': '$B,$row',
        'food_name_in_french': '$C,$row',
        'scientific_name': '$D,$row',
        'code': '$A,$row',
        'source': '$E,$row',
        'nutrition': '$col,$2',
        'row': '$row',
        'col': '$col'
    }
    raw_extractions = self.ee.extract(doc.cdr_document['file_path'],
                                      'USERDATABASE', ['F,3', 'AG,971'],
                                      variables)

    # post processing
    re_code = re.compile(r'^[0-9]{2}_[0-9]{3}$')
    re_value_in_bracket = re.compile(r'^.*\((.*)\)')
    re_value_in_square_bracket = re.compile(r'^.*\[(.*)\]')

    extracted_docs = []
    for e in raw_extractions:
        code = e['code'].strip()
        if not re_code.match(code):
            # Not a food-code row (headers, footers, blanks); skip it.
            continue

        # Unit is the "(...)" part of the column header, if any.
        in_bracket_unit = re_value_in_bracket.search(e['nutrition'])
        unit = in_bracket_unit.group(1) if in_bracket_unit else ''

        # parse value
        value = e['value']
        if e['nutrition'] == 'Energy (kcal) kJ':
            # This column holds "kJ (kcal)"; keep the kcal part in brackets.
            in_bracket_value = re_value_in_bracket.search(e['value'])
            # Guard: fall back to the raw cell instead of crashing when no
            # bracketed value is present (original raised AttributeError).
            if in_bracket_value:
                value = in_bracket_value.group(1)
        elif isinstance(value, str):
            value = value.strip()
            in_square_bracket_value = re_value_in_square_bracket.search(
                e['value'])
            if in_square_bracket_value:
                value = in_square_bracket_value.group(1)
            # if it's a range, get the lower bound
            dash_pos = value.find('-')
            if dash_pos != -1:
                value = value[:dash_pos]
        try:
            value = float(value)
        except (TypeError, ValueError):
            # Non-numeric residue (e.g. 'tr', blanks) defaults to 0.0.
            value = 0.0

        extracted_doc = {
            'tld': '',
            'website': '',
            'type': 'factoid',
            'factoid': {
                'value': value,
                'unit': unit,
                'food_name_in_english': e['food_name_in_english'],
                'food_name_in_french': e['food_name_in_french'],
                'scientific_name': e['scientific_name'],
                'source': e['source'],
                'code': code,
                'nutrition': e['nutrition'],
                'metadata': {
                    'file_name': os.path.basename(
                        doc.cdr_document['file_path']),
                    'sheet_name': 'USERDATABASE',
                    'row': str(e['row']),
                    'col': str(e['col'])
                },
                'identifier_key': 'code',
                'identifier_value': code
            }
        }
        extracted_doc['doc_id'] = Utility.create_doc_id_from_json(
            extracted_doc)
        extracted_doc = etk.create_document(extracted_doc)

        # build kg
        extracted_doc.kg.add_value('metadata__unit',
                                   json_path='$.factoid.unit')
        extracted_doc.kg.add_value(
            'metadata__property_type',
            value=[
                'http://ontology.causeex.com/ontology/odps/TimeSeriesAndMeasurements#Nutrition'
            ])
        extracted_doc.kg.add_value(
            'metadata__reported_value',
            value=[
                'http://ontology.causeex.com/ontology/odps/TimeSeriesAndMeasurements#ReportedValue'
            ])
        extracted_doc.kg.add_value('provenance_col',
                                   json_path='$.factoid.metadata.col')
        extracted_doc.kg.add_value('provenance_row',
                                   json_path='$.factoid.metadata.row')
        extracted_doc.kg.add_value(
            'provenance_filename',
            json_path='$.factoid.metadata.file_name')
        extracted_doc.kg.add_value(
            'provenance_sheet',
            json_path='$.factoid.metadata.sheet_name')
        extracted_doc.kg.add_value('value', json_path='$.factoid.value')
        extracted_doc.kg.add_value('type', json_path='$.factoid.type')
        extracted_doc.kg.add_value('identifier_key',
                                   json_path='$.factoid.identifier_key')
        extracted_doc.kg.add_value('identifier_value',
                                   json_path='$.factoid.identifier_value')
        extracted_docs.append(extracted_doc)
    return extracted_docs