def process_document(self, doc: Document) -> List[Document]:
    # Record new documents we create that need to be processed later.
    new_docs = list()

    cameo_code = self.attribute_value(doc, "EventCode")
    print("Processing cameo code {}".format(cameo_code))
    doc.doc_id = str(doc.cdr_document[self.attribute("GLOBALEVENTID")])

    # Type
    doc.kg.add_value("type", "Event")
    if self.mapping.has_cameo_code(cameo_code):
        # Type fields
        for t in self.mapping.event_type("event1", cameo_code):
            doc.kg.add_value("type", value=t)
            doc.kg.add_value("causeex_class", value=self.expand_prefix(t))

        # Event_date
        for s in doc.select_segments("$." + self.attribute("SQLDATE")):
            doc.kg.add_value("event_date",
                             value=doc.extract(self.date_extractor, s,
                                               prefer_language_date_order=None,
                                               additional_formats=["%Y%m%d"],
                                               detect_relative_dates=False,
                                               use_default_formats=False))

        # CAMEO code
        cameo_code_label = "CAMEO Code: " + str(
            doc.select_segments("$." + self.attribute("EventCode"))[0].value)
        doc.kg.add_value("code", value=cameo_code_label)
        # simpler without provenance:
        # doc.kg.add_value("code", "CAMEO Code: " + doc.cdr_document[self.attribute("EventCode")])

        # Identifier
        doc.kg.add_value("identifier", json_path="$." + self.attribute("GLOBALEVENTID"))

        # Geographical information
        doc.kg.add_value("country_code", json_path="$." + self.attribute("ActionGeo_CountryCode"))
        doc.kg.add_value("location", json_path="$." + self.attribute("ActionGeo_FullName"))

        # Actors
        actor1, actor2 = self.add_actors(doc, "event1", cameo_code)
        new_docs.append(actor1)
        new_docs.append(actor2)

        # has topic events
        for event in ["event2", "event3"]:
            new_docs.extend(
                self.add_topic_events(doc, event, actor1.doc_id, actor2.doc_id))

    return new_docs
def process_document(self, doc: Document) -> List[Document]:
    doc.kg.add_value("type", value="Weapon")
    doc.kg.add_value("title", json_path="$.weapon_title")
    doc.kg.add_value("type", json_path="$.weapon_type[*]")
    doc.kg.add_value("provenance_filename", json_path="$.filename")
    doc.kg.add_value("causeex_class",
                     value=doc.extract(self.weapon_decoder,
                                       doc.select_segments("$.weapon_code")[0]))
    return list()
def process_document(self, doc: Document):
    descriptions = doc.select_segments("projects[*].description")
    projects = doc.select_segments("projects[*]")

    for d, p in zip(descriptions, projects):
        spacy_names = doc.extract(self.sample_rule_extractor, d)
        p.store(spacy_names, "spacy_names")
        for a_name in spacy_names:
            doc.kg.add_value("spacy_name", value=a_name.value)
def process_document(self, doc: Document):
    descriptions = doc.select_segments("date_description")
    date_text = doc.select_segments("date_description.text")

    ignore_before = datetime.datetime(1890, 1, 1)
    ignore_after = datetime.datetime(2500, 10, 10)
    relative_base = datetime.datetime(2018, 1, 1)

    for d, p in zip(date_text, descriptions):
        extracted_date = doc.extract(
            self.date_extractor,
            d,
            extract_first_date_only=False,  # set True to keep only the first valid date
            additional_formats=['%Y@%m@%d', '%a %Y, %b %d'],
            use_default_formats=True,
            # ignore_dates_before: datetime.datetime = None,
            ignore_dates_before=ignore_before,
            # ignore_dates_after: datetime.datetime = None,
            ignore_dates_after=ignore_after,
            detect_relative_dates=False,
            relative_base=relative_base,
            # preferred_date_order: str = "MDY",  # used for interpreting ambiguous dates that are missing parts
            preferred_date_order="DMY",
            prefer_language_date_order=True,
            # timezone: str = None,  # default is local timezone.
            # timezone='GMT',
            # to_timezone: str = None,  # when not specified, no timezone conversion is done.
            # to_timezone='UTC',
            # return_as_timezone_aware: bool = True
            return_as_timezone_aware=False,
            # prefer_day_of_month: str = "first",  # can be "current", "first", "last".
            prefer_day_of_month='first',
            # prefer_dates_from: str = "current",  # can be "current", "future", "past".
            prefer_dates_from='future',
            # date_value_resolution: DateResolution = DateResolution.DAY
        )
        p.store(extracted_date, "extracted_date")

    doc.kg.add_doc_value("date", "date_description.extracted_date[*]")
def process_ems(self, doc: Document) -> List[Document]:
    """
    Process a document with all extraction modules whose document_selector accepts it.

    Args:
        doc (Document): process this document

    Returns:
        a list of Documents: the input document plus any new documents created by the
        extraction modules, each recursively processed.
    """
    new_docs = list()
    for a_em in self.em_lst:
        if a_em.document_selector(doc):
            self.log(" processing with " + str(type(a_em)) + ". Process", "info", doc.doc_id, doc.url)
            fresh_docs = a_em.process_document(doc)
            # Allow ETKModules to return nothing in lieu of an empty list (people forget to return empty list)
            if fresh_docs:
                new_docs.extend(fresh_docs)
        # try:
        #     if a_em.document_selector(doc):
        #         self.log(" processing with " + str(type(a_em)) + ". Process", "info", doc.doc_id, doc.url)
        #         new_docs.extend(a_em.process_document(doc))
        # except Exception as e:
        #     if self.error_policy == ErrorPolicy.THROW_EXTRACTION:
        #         self.log(str(e) + " processing with " + str(type(a_em)) + ". Continue", "error", doc.doc_id,
        #                  doc.url)
        #         continue
        #     if self.error_policy == ErrorPolicy.THROW_DOCUMENT:
        #         self.log(str(e) + " processing with " + str(type(a_em)) + ". Throw doc", "error", doc.doc_id,
        #                  doc.url)
        #         return list()
        #     if self.error_policy == ErrorPolicy.RAISE:
        #         self.log(str(e) + " processing with " + str(type(a_em)), "error", doc.doc_id, doc.url)
        #         raise e

    # Do house cleaning.
    doc.insert_kg_into_cdr()
    if not self.generate_json_ld:
        if "knowledge_graph" in doc.cdr_document:
            doc.cdr_document["knowledge_graph"].pop("@context", None)
    Utility.make_json_serializable(doc.cdr_document)
    if self.output_kg_only:
        doc = doc.kg.value
    elif not doc.doc_id:
        doc.doc_id = Utility.create_doc_id_from_json(doc.cdr_document)

    results = [doc]
    for new_doc in new_docs:
        results.extend(self.process_ems(new_doc))
    return results
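# A minimal sketch of driving process_ems end to end, modeled on the
# italy_teams __main__ block later in this section. The module class name
# (MyEtkModule) and the file paths are hypothetical placeholders, not part
# of the ETK API.
def run_pipeline_sketch():
    kg_schema = KGSchema(json.load(open('./resources/master_config.json')))
    etk = ETK(modules=MyEtkModule, kg_schema=kg_schema)
    cdr = json.load(open('./resources/input.json', mode='r', encoding='utf-8'))
    doc = etk.create_document(cdr)
    results = etk.process_ems(doc)
    print('Total docs:', len(results))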
def process_document(self, doc: Document):
    """
    Add your code for processing the document
    """
    text_segments = doc.select_segments("lexisnexis.doc_description")
    for text_segment in text_segments:
        split_sentences = doc.extract(self.sentence_extractor, text_segment)
        doc.store(split_sentences, 'split_sentences')
    # for t, u in zip(text_to_be_split, units_of_text):
    #     split_sentences = doc.extract(self.sentence_extractor, t)
    #     u.store(split_sentences, "split_sentences")
    return list()
def process_document(self, doc: Document) -> List[Document]:
    nested_docs = list()

    doc.doc_id = Utility.create_doc_id_from_json(doc.cdr_document)
    doc.cdr_document["title"] = "{Total} Displaced from {ReportedLocation} in {Country}".format(
        Total=doc.cdr_document["Total"],
        ReportedLocation=doc.cdr_document["ReportedLocation"],
        Country=doc.cdr_document["Country"])
    doc.cdr_document["dataset"] = "lake_chad_basin_displaced"

    place = {
        "uri": '{}_place'.format(doc.doc_id),
        "doc_id": '{}_place'.format(doc.doc_id),
        "country": doc.cdr_document.get("Country", ''),
        "dataset": "lcb_place"
    }
    place_doc = etk.create_document(place)
    nested_docs.append(place_doc)
    doc.kg.add_value("place", value='{}_place'.format(doc.doc_id))

    # Add event_date to the KG
    extracted_dates = self.date_extractor.extract(doc.cdr_document.get('Period', ''))
    doc.kg.add_value("event_date", value=extracted_dates)
    doc.kg.add_value("event_date_end", value=extracted_dates)

    doc.kg.add_value("location", json_path="ReportedLocation")
    doc.kg.add_value(
        "causeex_class",
        value="http://ontology.causeex.com/ontology/odps/EventHierarchy#ForcedMove")
    doc.kg.add_value("type", value=["event", "Displacement Event"])
    doc.kg.add_value("title", json_path="title")

    victim = {
        "dataset": "lake_chad_basin_displaced_victim",
        "total": doc.cdr_document["Total"],
        "type": ["Group", "Displaced People"],
        "uri": '{}_victim'.format(doc.doc_id)
    }
    victim_doc = etk.create_document(victim)
    victim_doc.doc_id = '{}_victim'.format(doc.doc_id)
    doc.kg.add_value("victim", value=victim_doc.doc_id)
    nested_docs.append(victim_doc)

    return nested_docs
def process_document(self, doc: Document):
    if self.document_selector(doc):
        event_date = doc.select_segments(jsonpath='$.event_date')
        for segment in event_date:
            extractions = doc.extract(extractor=self.date_extractor, extractable=segment)
            # doc.store(extractions=extractions, attribute=self.date_extractor.name)
            # doc.kg.add_doc_value("event_date", "$.{}[*]".format(self.date_extractor.name))
            for extraction in extractions:
                doc.kg.add_value("event_date", value=extraction.value)
        # for segment in doc.select_segments(jsonpath='$.notes'):
        #     doc.kg.add_value("description", segment.value)
        doc.kg.add_value("description", json_path='$.notes')
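# For reference, the self.document_selector guard above relies on a selector
# like the one below, following the pattern from the italy_teams module later
# in this section; ETK's process_ems calls it to decide whether a module runs
# on a given document. The dataset name here is a placeholder.
def document_selector(self, doc) -> bool:
    return doc.cdr_document.get("dataset") == "my_dataset"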
def process_document(self, doc: Document):
    descriptions = doc.select_segments("projects[*].description")
    projects = doc.select_segments("projects[*]")

    for d, p in zip(descriptions, projects):
        names = doc.extract(self.name_extractor, d)
        p.store(names, "members")

        students = []
        for name_extraction in names:
            students += doc.extract(self.student_extractor, name_extraction)
        p.store(students, "students")

    doc.kg.add_value("developer", json_path="projects[*].members[*]")
    doc.kg.add_value("student_developer", json_path="projects[*].students[*]")
    doc.kg.add_value("id", json_path='$.doc_id')
def test_etk_crf_glossary_extraction(self):
    etk = ETK(use_spacy_tokenizer=False)
    s = time.time()
    city_extractor = GlossaryExtractor(
        ['los angeles', 'new york', 'angeles'],
        'city_extractor',
        etk.default_tokenizer,
        case_sensitive=False,
        ngrams=3)
    doc_json = {
        'text': 'i live in los angeles. my hometown is Beijing. I love New York City.'
    }
    doc = Document(etk, cdr_document=doc_json, mime_type='json', url='', doc_id='1')

    t_segments = doc.select_segments("$.text")
    for t_segment in t_segments:
        extracted_cities = doc.extract(city_extractor, t_segment)
        for extracted_city in extracted_cities:
            self.assertTrue(extracted_city.value in ['los angeles', 'New York', 'angeles'])
def create_document(self, doc: Dict, mime_type: str = None, url: str = "http://ex.com/123") -> Document:
    """
    Factory method to wrap input JSON docs in an ETK Document object.

    Args:
        doc (object): a JSON object containing a document in CDR format.
        mime_type (str): if doc is a string, the mime_type tells what it is
        url (str): if the doc came from the web, specifies the URL for it

    Returns:
        wrapped Document
    """
    return Document(self, cdr_document=doc, mime_type=mime_type, url=url)
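# A short usage sketch for create_document: wrap a plain dict so it can be
# returned from process_document and processed by other modules. The "Side"
# and "dataset" attributes mirror the UCDP actor dicts below; the values and
# the helper name are hypothetical.
def create_actor_document_sketch(etk: ETK) -> Document:
    actor_dict = {"Side": "Government of X", "dataset": "ucdp-actor"}
    actor_doc = etk.create_document(actor_dict)
    actor_doc.doc_id = "example_event" + "_actor1"
    return actor_doc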
def process_document(self, doc: Document) -> List[Document]:
    nested_docs = list()

    # pyexcel produces dicts with date objects, which are not JSON serializable, fix that.
    Utility.make_json_serializable(doc.cdr_document)

    # Add an ID based on the full contents of the raw document
    doc.doc_id = Utility.create_doc_id_from_json(doc.cdr_document)

    # Create a CDR document for an actor, we only put the SideA attribute in it,
    # and we give it a new dataset identifier so we can match it in an ETKModule
    actor1_dict = {
        "Side": doc.cdr_document["SideA"],
        "dataset": "ucdp-actor"
    }
    actor1_doc = etk.create_document(actor1_dict)
    # Create a doc_id for the actor document, from the doc_id of the event document
    actor1_doc.doc_id = doc.doc_id + "_actor1"

    # Now do the exact same thing for SideB
    actor2_dict = {
        "Side": doc.cdr_document["SideB"],
        "dataset": "ucdp-actor"
    }
    actor2_doc = etk.create_document(actor2_dict)
    actor2_doc.doc_id = doc.doc_id + "_actor2"

    kg_object_old_ontology = {
        "uri": doc.doc_id,
        "place": {
            "uri": doc.doc_id + "_place",
            "doc_id": doc.doc_id + "_place",
            "country": doc.select_segments("$.Location"),
            "type": ["Place"]
        },
        "type": [
            "Event",
            doc.extract(self.incomp_decoder, doc.select_segments("$.Incomp")[0]),
            doc.extract(self.int_decoder, doc.select_segments("$.Int")[0])
        ],
        "title": "{}/{} armed conflict in {}".format(
            doc.cdr_document["SideA"],
            doc.cdr_document["SideB"],
            doc.cdr_document["YEAR"]
        ),
        "causeex_class": [
            doc.extract(self.int_causeex_decoder, doc.select_segments("$.Int")[0]),
            self.event_prefix + "ArmedConflict"
        ],
        "event_date": doc.select_segments("$.StartDate"),
        "event_date_end": doc.select_segments("$.EpEndDate"),
        "fatalities": {
            "uri": doc.doc_id + "_fatalities",
            "title": doc.extract(self.int_fatalities_decoder, doc.select_segments("$.Int")[0]),
            "type": ["Group", "Dead People"],
            "min_size": doc.extract(self.int_fatalities_size_lower_decoder, doc.select_segments("$.Int")[0]),
            "max_size": doc.extract(self.int_fatalities_size_upper_decoder, doc.select_segments("$.Int")[0])
        },
        "actor": [actor1_doc.doc_id, actor2_doc.doc_id]
    }

    ds = doc.build_knowledge_graph(kg_object_old_ontology)
    nested_docs.extend(ds)

    # Return the list of new documents that we created to be processed by ETK.
    # The actor documents do not have an extraction module, so they will be
    # passed to the output unchanged.
    nested_docs.append(actor1_doc)
    nested_docs.append(actor2_doc)
    return nested_docs
    if extractions:
        path = '$."' + extractions[0].value + '"[?(@.country == "Italy")]'
        jsonpath_expr = jex.parse(path)
        city_match = jsonpath_expr.find(self.city_dataset)
        if city_match:
            # add corresponding values of city_dataset into knowledge graph of the doc
            for field in city_match[0].value:
                doc.kg.add_value(field, value=city_match[0].value[field])
    new_docs.append(doc)
    return new_docs

def document_selector(self, doc) -> bool:
    return doc.cdr_document.get("dataset") == "italy_team"


if __name__ == "__main__":
    # url = 'https://en.wikipedia.org/wiki/List_of_football_clubs_in_Italy'
    cdr = json.load(
        open('./resources/italy_teams.json', mode='r', encoding='utf-8'))
    kg_schema = KGSchema(json.load(open('./resources/master_config.json')))

    etk = ETK(modules=ItalyTeamsModule, kg_schema=kg_schema)
    etk.parser = jex.parse

    cdr_doc = Document(etk, cdr_document=cdr, mime_type='json', url=cdr['url'])
    results = etk.process_ems(cdr_doc)[1:]

    print('Total docs:', len(results))
    print("Sample result:\n")
    print(json.dumps(results[0].value, indent=2))
def process_document(self, doc: Document) -> List[Document]:
    # pyexcel produces dicts with date objects, which are not JSON serializable, fix that.
    Utility.make_json_serializable(doc.cdr_document)

    # Add an ID based on the full contents of the raw document
    doc.doc_id = Utility.create_doc_id_from_json(doc.cdr_document)

    # Map location to country
    doc.kg.add_value("country", json_path="$.Location")

    # Map Incomp to type, after using a decoding dict
    doc.kg.add_value("type", value=doc.extract(self.incomp_decoder,
                                               doc.select_segments("$.Incomp")[0]))

    # Map Int to type, also after using a decoding dict
    doc.kg.add_value("type", value=doc.extract(self.int_decoder,
                                               doc.select_segments("$.Int")[0]))

    # Add "Event" to type, as all these documents are events
    doc.kg.add_value("type", value="Event")

    # Add a title to our event
    doc.kg.add_value("title", value="{}/{} armed conflict in {}".format(
        doc.cdr_document["SideA"],
        doc.cdr_document["SideB"],
        doc.cdr_document["YEAR"]
    ))

    # Add the specific CauseEx ontology classes that we want to use for this event
    doc.kg.add_value("causeex_class", value=doc.extract(self.int_causeex_decoder,
                                                        doc.select_segments("$.Int")[0]))
    doc.kg.add_value("causeex_class", value=self.event_prefix + "ArmedConflict")

    # Map dates to event_date
    doc.kg.add_value("event_date", json_path="$.StartDate")
    doc.kg.add_value("event_date_end", json_path="$.EpEndDate")

    # Create a CDR document for an actor, we only put the SideA attribute in it,
    # and we give it a new dataset identifier so we can match it in an ETKModule
    actor1_dict = {
        "Side": doc.cdr_document["SideA"],
        "dataset": "ucdp-actor"
    }
    actor1_doc = etk.create_document(actor1_dict)

    # Create a doc_id for the actor document, from the doc_id of the event document
    actor1_doc.doc_id = doc.doc_id + "_actor1"

    # Record the identifier of the actor object in the "actor" field of the event.
    doc.kg.add_value("actor", value=actor1_doc.doc_id)

    # Now do the exact same thing for SideB
    actor2_dict = {
        "Side": doc.cdr_document["SideB"],
        "dataset": "ucdp-actor"
    }
    actor2_doc = etk.create_document(actor2_dict)
    actor2_doc.doc_id = doc.doc_id + "_actor2"
    doc.kg.add_value("actor", value=actor2_doc.doc_id)

    # Create a fatalities object to record information about the fatalities in the conflict.
    # Instead of creating an ETK module for it, it is possible to do it inline.
    fatalities_doc = etk.create_document({"Int": doc.cdr_document["Int"]})
    fatalities_doc.doc_id = doc.doc_id + "_fatalities"
    doc.kg.add_value("fatalities", value=fatalities_doc.doc_id)
    fatalities_doc.kg.add_value(
        "title",
        fatalities_doc.extract(self.int_fatalities_decoder,
                               fatalities_doc.select_segments("$.Int")[0]))
    fatalities_doc.kg.add_value("type", value=["Group", "Dead People"])
    fatalities_doc.kg.add_value(
        "size_lower_bound",
        value=fatalities_doc.extract(self.int_fatalities_size_lower_decoder,
                                     fatalities_doc.select_segments("$.Int")[0]))
    fatalities_doc.kg.add_value(
        "size_upper_bound",
        value=fatalities_doc.extract(self.int_fatalities_size_upper_decoder,
                                     fatalities_doc.select_segments("$.Int")[0]))

    # Return the list of new documents that we created to be processed by ETK.
    # Note that fatalities_doc is in the list as it is a newly created document. It does not have an
    # extraction module, so it will be passed to the output unchanged.
    return [
        actor1_doc,
        actor2_doc,
        fatalities_doc
    ]
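# The decoders used above (self.incomp_decoder, self.int_decoder, etc.) map
# coded column values to readable labels. A minimal sketch of how one might be
# constructed, assuming etk's DecodingValueExtractor; the import path, the
# constructor signature, and the decoding table below are assumptions for
# illustration, not taken from the original module.
from etk.extractors.decoding_value_extractor import DecodingValueExtractor

int_decoding_dict = {
    "1": "Minor Conflict",  # hypothetical decoding of the UCDP "Int" code
    "2": "War"
}
int_decoder = DecodingValueExtractor(int_decoding_dict, 'Int Decoder')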
def process_document(self, doc: Document) -> List[Document]:
    nested_docs = list()

    json_doc = doc.cdr_document
    filename = json_doc.get('file_name')
    doc.doc_id = Utility.create_doc_id_from_json(json_doc)
    doc.cdr_document['uri'] = doc.doc_id

    doc.kg.add_value("type", value="Event")
    doc.kg.add_value("type", value="Act of Terrorism")
    doc.kg.add_value("provenance_filename", value=filename)

    for attack_type_code in attack_type_fields_code:
        ac = json_doc.get(attack_type_code, '')
        if ac != "":
            doc.kg.add_value("causeex_class",
                             value=doc.extract(
                                 self.causeex_decoder,
                                 doc.select_segments("$.{}".format(attack_type_code))[0]))

    # Add event_date to the KG
    extracted_dates = self.date_extractor.extract('{}-{}-{}'.format(
        json_doc.get('iyear'), json_doc.get('imonth'), json_doc.get('iday')))
    if len(extracted_dates) > 0:
        doc.kg.add_value("event_date", value=extracted_dates)
        doc.kg.add_value("event_date_end", value=extracted_dates)
    else:
        # no proper date mentioned in the event, try the approximate date
        approximate_date_txt = json_doc.get("approxdate")
        extracted_approx_dates = self.date_extractor.extract(approximate_date_txt)
        if len(extracted_approx_dates) > 0:
            doc.kg.add_value("event_date", value=extracted_approx_dates)
            doc.kg.add_value("event_date_end", value=extracted_approx_dates)

    # summary, aka description, only available for incidents after 1997
    doc.kg.add_value("description", json_path="$.summary")

    # add inclusion criteria: why is this incident regarded as a terrorist incident
    # TODO: ADD this to master_config
    crit1 = json_doc.get('crit1', 0)
    if crit1 == 1:
        doc.kg.add_value("inclusion_criteria", value=inclusion_criteria_1)
    crit2 = json_doc.get('crit2', 0)
    if crit2 == 1:
        doc.kg.add_value("inclusion_criteria", value=inclusion_criteria_2)
    crit3 = json_doc.get('crit3', 0)
    if crit3 == 1:
        doc.kg.add_value("inclusion_criteria", value=inclusion_criteria_3)

    # add related events to KG
    # TODO: ADD this to master_config
    related_event_ids_txt = json_doc.get('related', '')
    if related_event_ids_txt.strip() != "":
        related_event_ids = related_event_ids_txt.split(',')
        if len(related_event_ids) > 0:
            doc.kg.add_value("related_events", value=related_event_ids)

    # add attack information; on second thought, this qualifies as event type
    for attack_type_field in attack_type_fields:
        doc.kg.add_value("type", value=json_doc.get(attack_type_field, ''))

    # TODO check the following 2
    if json_doc.get("suicide", 0) == 1:
        doc.kg.add_value("type", value='Suicide')
    if json_doc.get("success", 0) == 1:
        doc.kg.add_value("type", value='Success')

    # create nested objects for places
    place_object = dict()
    for place_field in place_fields:
        place_object[place_field] = json_doc.get(place_field)
    place_object["dataset"] = "gtd_place"
    place_doc_id = '{}_place'.format(doc.doc_id)
    place_object['uri'] = place_doc_id
    place_object['filename'] = filename

    place_doc = etk.create_document(place_object)
    place_doc.doc_id = place_doc_id
    doc.kg.add_value("place", value=place_doc.doc_id)
    nested_docs.append(place_doc)

    # create victim objects, there can be up to 3
    if json_doc.get('targtype1_txt', '').strip():
        victim1_object = dict()
        victim1_object['dataset'] = 'gtd_victim'
        victim1_object['filename'] = filename
        victim1_object['victim_type'] = list()
        victim1_object['victim_type'].append(json_doc.get('targtype1_txt'))
        if json_doc.get('targsubtype1_txt', ''):
            victim1_object['victim_type'].append(json_doc.get('targsubtype1_txt'))
        victim1_object['victim_corp'] = json_doc.get('corp1', '')
        victim1_object['victim_target'] = json_doc.get('target1', '')
        victim1_object['victim_nationality'] = json_doc.get('natlty1_txt', '')
        victim1_doc_id = '{}_victim1'.format(doc.doc_id)
        victim1_object['uri'] = victim1_doc_id

        victim1_doc = etk.create_document(victim1_object)
        victim1_doc.doc_id = victim1_doc_id
        doc.kg.add_value('victim', value=victim1_doc.doc_id)
        nested_docs.append(victim1_doc)

    if json_doc.get('targtype2_txt', '').strip():
        victim2_object = dict()
        victim2_object['dataset'] = 'gtd_victim'
        victim2_object['filename'] = filename
        victim2_object['victim_type'] = list()
        victim2_object['victim_type'].append(json_doc.get('targtype2_txt'))
        if json_doc.get('targsubtype2_txt', ''):
            victim2_object['victim_type'].append(json_doc.get('targsubtype2_txt'))
        victim2_object['victim_corp'] = json_doc.get('corp2', '')
        victim2_object['victim_target'] = json_doc.get('target2', '')
        victim2_object['victim_nationality'] = json_doc.get('natlty2_txt', '')
        victim2_doc_id = '{}_victim2'.format(doc.doc_id)
        victim2_object['uri'] = victim2_doc_id

        victim2_doc = etk.create_document(victim2_object)
        victim2_doc.doc_id = victim2_doc_id
        doc.kg.add_value('victim', value=victim2_doc.doc_id)
        nested_docs.append(victim2_doc)

    if json_doc.get('targtype3_txt', '').strip():
        victim3_object = dict()
        victim3_object['dataset'] = 'gtd_victim'
        victim3_object['filename'] = filename
        victim3_object['victim_type'] = list()
        victim3_object['victim_type'].append(json_doc.get('targtype3_txt'))
        if json_doc.get('targsubtype3_txt', ''):
            victim3_object['victim_type'].append(json_doc.get('targsubtype3_txt'))
        victim3_object['victim_corp'] = json_doc.get('corp3', '')
        victim3_object['victim_target'] = json_doc.get('target3', '')
        victim3_object['victim_nationality'] = json_doc.get('natlty3_txt', '')
        victim3_doc_id = '{}_victim3'.format(doc.doc_id)
        victim3_object['uri'] = victim3_doc_id

        victim3_doc = etk.create_document(victim3_object)
        victim3_doc.doc_id = victim3_doc_id
        doc.kg.add_value('victim', value=victim3_doc.doc_id)
        nested_docs.append(victim3_doc)

    # create actor/perpetrator objects
    if json_doc.get('gname', '').strip():
        actor1_object = dict()
        actor1_object['dataset'] = 'gtd_actor'
        actor1_object['filename'] = filename
        actor1_object['actor_group'] = list()
        actor1_object['actor_group'].append(json_doc.get('gname'))
        if json_doc.get('gsubname', ''):
            actor1_object['actor_group'].append(json_doc.get('gsubname'))
        actor1_doc_id = '{}_actor1'.format(doc.doc_id)
        actor1_object['uri'] = actor1_doc_id

        actor1_doc = etk.create_document(actor1_object)
        actor1_doc.doc_id = actor1_doc_id
        doc.kg.add_value('actor', value=actor1_doc.doc_id)
        nested_docs.append(actor1_doc)

    if json_doc.get('gname2', '').strip():
        actor2_object = dict()
        actor2_object['dataset'] = 'gtd_actor'
        actor2_object['filename'] = filename
        actor2_object['actor_group'] = list()
        actor2_object['actor_group'].append(json_doc.get('gname2'))
        if json_doc.get('gsubname2', ''):
            actor2_object['actor_group'].append(json_doc.get('gsubname2'))
        actor2_doc_id = '{}_actor2'.format(doc.doc_id)
        actor2_object['uri'] = actor2_doc_id

        actor2_doc = etk.create_document(actor2_object)
        actor2_doc.doc_id = actor2_doc_id
        doc.kg.add_value('actor', value=actor2_doc.doc_id)
        nested_docs.append(actor2_doc)

    if json_doc.get('gname3', '').strip():
        actor3_object = dict()
        actor3_object['dataset'] = 'gtd_actor'
        actor3_object['filename'] = filename
        actor3_object['actor_group'] = list()
        actor3_object['actor_group'].append(json_doc.get('gname3'))
        if json_doc.get('gsubname3', ''):
            actor3_object['actor_group'].append(json_doc.get('gsubname3'))
        actor3_doc_id = '{}_actor3'.format(doc.doc_id)
        actor3_object['uri'] = actor3_doc_id

        actor3_doc = etk.create_document(actor3_object)
        actor3_doc.doc_id = actor3_doc_id
        doc.kg.add_value('actor', value=actor3_doc.doc_id)
        nested_docs.append(actor3_doc)

    # create weapon objects, up to 4
    if json_doc.get('weaptype1_txt', '').strip():
        weapon1_object = dict()
        weapon1_object['dataset'] = 'gtd_weapon'
        weapon1_object['filename'] = filename
        weapon1_object['weapon_title'] = json_doc.get('weapdetail', '')
        weapon1_object['weapon_type'] = list()
        weapon1_object['weapon_type'].append(json_doc.get('weaptype1_txt'))
        if json_doc.get('weapsubtype1_txt', ''):
            weapon1_object['weapon_type'].append(json_doc.get('weapsubtype1_txt'))
        if json_doc.get('weaptype1', '') != '':
            weapon1_object['weapon_code'] = json_doc.get('weaptype1')
        weapon1_doc_id = '{}_weapons1'.format(doc.doc_id)
        weapon1_object['uri'] = weapon1_doc_id

        weapon1_doc = etk.create_document(weapon1_object)
        weapon1_doc.doc_id = weapon1_doc_id
        doc.kg.add_value('weapons', weapon1_doc.doc_id)
        nested_docs.append(weapon1_doc)

    if json_doc.get('weaptype2_txt', '').strip():
        weapon2_object = dict()
        weapon2_object['dataset'] = 'gtd_weapon'
        weapon2_object['filename'] = filename
        weapon2_object['weapon_title'] = json_doc.get('weapdetail', '')
        weapon2_object['weapon_type'] = list()
        weapon2_object['weapon_type'].append(json_doc.get('weaptype2_txt'))
        if json_doc.get('weapsubtype2_txt', ''):
            weapon2_object['weapon_type'].append(json_doc.get('weapsubtype2_txt'))
        if json_doc.get('weaptype2', '') != '':
            weapon2_object['weapon_code'] = json_doc.get('weaptype2')
        weapon2_doc_id = '{}_weapons2'.format(doc.doc_id)
        weapon2_object['uri'] = weapon2_doc_id

        weapon2_doc = etk.create_document(weapon2_object)
        weapon2_doc.doc_id = weapon2_doc_id
        doc.kg.add_value('weapons', weapon2_doc.doc_id)
        nested_docs.append(weapon2_doc)

    if json_doc.get('weaptype3_txt', '').strip():
        weapon3_object = dict()
        weapon3_object['dataset'] = 'gtd_weapon'
        weapon3_object['filename'] = filename
        weapon3_object['weapon_title'] = json_doc.get('weapdetail', '')
        weapon3_object['weapon_type'] = list()
        weapon3_object['weapon_type'].append(json_doc.get('weaptype3_txt'))
        if json_doc.get('weapsubtype3_txt', ''):
            weapon3_object['weapon_type'].append(json_doc.get('weapsubtype3_txt'))
        if json_doc.get('weaptype3', '') != '':
            weapon3_object['weapon_code'] = json_doc.get('weaptype3')
        weapon3_doc_id = '{}_weapons3'.format(doc.doc_id)
        weapon3_object['uri'] = weapon3_doc_id

        weapon3_doc = etk.create_document(weapon3_object)
        weapon3_doc.doc_id = weapon3_doc_id
        doc.kg.add_value('weapons', weapon3_doc.doc_id)
        nested_docs.append(weapon3_doc)

    if json_doc.get('weaptype4_txt', '').strip():
        weapon4_object = dict()
        weapon4_object['dataset'] = 'gtd_weapon'
        weapon4_object['filename'] = filename
        weapon4_object['weapon_title'] = json_doc.get('weapdetail', '')
        weapon4_object['weapon_type'] = list()
        weapon4_object['weapon_type'].append(json_doc.get('weaptype4_txt'))
        if json_doc.get('weapsubtype4_txt', ''):
            weapon4_object['weapon_type'].append(json_doc.get('weapsubtype4_txt'))
        if json_doc.get('weaptype4', '') != '':
            weapon4_object['weapon_code'] = json_doc.get('weaptype4')
        weapon4_doc_id = '{}_weapons4'.format(doc.doc_id)
        weapon4_object['uri'] = weapon4_doc_id

        weapon4_doc = etk.create_document(weapon4_object)
        weapon4_doc.doc_id = weapon4_doc_id
        doc.kg.add_value('weapons', weapon4_doc.doc_id)
        nested_docs.append(weapon4_doc)

    # create total fatalities docs
    nkill = json_doc.get("nkill", 0)
    if nkill != "":
        total_fatalities_object = dict()
        total_fatalities_object["dataset"] = "gtd_fatality"
        total_fatalities_object['filename'] = filename
        total_fatalities_doc_id = '{}_total_fatalities'.format(doc.doc_id)
        total_fatalities_object['uri'] = total_fatalities_doc_id
        total_fatalities_object["size"] = nkill

        total_fatalities_doc = etk.create_document(total_fatalities_object)
        total_fatalities_doc.doc_id = total_fatalities_doc_id
        doc.kg.add_value("fatalities", value=total_fatalities_doc_id)
        nested_docs.append(total_fatalities_doc)

    # create US fatalities docs
    nkillus = json_doc.get("nkillus", 0)
    if nkillus != "":
        us_fatalities_object = dict()
        us_fatalities_object["dataset"] = "gtd_fatality"
        us_fatalities_object['filename'] = filename
        us_fatalities_doc_id = '{}_us_fatalities'.format(doc.doc_id)
        us_fatalities_object['uri'] = us_fatalities_doc_id
        us_fatalities_object["size"] = nkillus
        us_fatalities_object["nationality"] = "United States"

        us_fatalities_doc = etk.create_document(us_fatalities_object)
        us_fatalities_doc.doc_id = us_fatalities_doc_id
        doc.kg.add_value("fatalities", value=us_fatalities_doc_id)
        nested_docs.append(us_fatalities_doc)

    # create total injuries docs
    nwound = json_doc.get("nwound", 0)
    if nwound != "":
        total_injuries_object = dict()
        total_injuries_object["dataset"] = "gtd_injury"
        total_injuries_object['filename'] = filename
        total_injuries_doc_id = '{}_total_injuries'.format(doc.doc_id)
        total_injuries_object['uri'] = total_injuries_doc_id
        total_injuries_object["size"] = nwound

        total_injuries_doc = etk.create_document(total_injuries_object)
        total_injuries_doc.doc_id = total_injuries_doc_id
        doc.kg.add_value("injuries", value=total_injuries_doc_id)
        nested_docs.append(total_injuries_doc)

    # create US injuries docs
    nwoundus = json_doc.get("nwoundus", 0)
    if nwoundus != "":
        us_injuries_object = dict()
        us_injuries_object["dataset"] = "gtd_injury"
        us_injuries_object['filename'] = filename
        us_injuries_doc_id = '{}_us_injuries'.format(doc.doc_id)
        us_injuries_object['uri'] = us_injuries_doc_id
        us_injuries_object["size"] = nwoundus

        us_injuries_doc = etk.create_document(us_injuries_object)
        us_injuries_doc.doc_id = us_injuries_doc_id
        doc.kg.add_value("injuries", value=us_injuries_doc_id)
        nested_docs.append(us_injuries_doc)

    # create damage docs
    # in this dataset we only have property damage
    if json_doc.get("property", 0) == 1:
        damage_object = dict()
        damage_object["dataset"] = "gtd_damage"
        damage_object['filename'] = filename
        damage_object["damage_title"] = json_doc.get("propextent_txt")
        damage_object["damage_value"] = json_doc.get("propvalue")
        damage_object["damage_description"] = json_doc.get("propcomment")
        damage_object_doc_id = '{}_damage'.format(doc.doc_id)
        damage_object['uri'] = damage_object_doc_id

        damage_doc = etk.create_document(damage_object)
        damage_doc.doc_id = damage_object_doc_id
        doc.kg.add_value("damage", value=damage_object_doc_id)
        nested_docs.append(damage_doc)

    return nested_docs
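# The numbered victim/actor/weapon blocks above differ only in their field
# suffixes (1..4). A hedged sketch of how the weapon blocks could be collapsed
# into a loop; the helper name and structure are illustrative, not part of the
# original module.
from typing import Dict, List


def build_weapon_objects_sketch(json_doc: Dict, doc_id: str, filename: str) -> List[Dict]:
    weapons = []
    for i in range(1, 5):
        if not json_doc.get('weaptype{}_txt'.format(i), '').strip():
            continue
        weapon = {
            'dataset': 'gtd_weapon',
            'filename': filename,
            'weapon_title': json_doc.get('weapdetail', ''),
            'weapon_type': [json_doc.get('weaptype{}_txt'.format(i))],
            'uri': '{}_weapons{}'.format(doc_id, i),
        }
        if json_doc.get('weapsubtype{}_txt'.format(i), ''):
            weapon['weapon_type'].append(json_doc.get('weapsubtype{}_txt'.format(i)))
        if json_doc.get('weaptype{}'.format(i), '') != '':
            weapon['weapon_code'] = json_doc.get('weaptype{}'.format(i))
        weapons.append(weapon)
    return weapons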
def process_document(self, doc: Document) -> List[Document]:
    doc.kg.add_value("title", json_path="$.ActorName")

    # Type
    doc.kg.add_value("type", value="Actor")
    doc.kg.add_value("type", json_path="$.ActorType1Code")
    doc.kg.add_value("type", value=doc.extract(
        self.actor_type_decoder, doc.select_segments("$.ActorType1Code")[0]))
    doc.kg.add_value("type", json_path="$.ActorType2Code")
    doc.kg.add_value("type", value=doc.extract(
        self.actor_type_decoder, doc.select_segments("$.ActorType2Code")[0]))
    doc.kg.add_value("type", json_path="$.ActorType3Code")
    doc.kg.add_value("type", value=doc.extract(
        self.actor_type_decoder, doc.select_segments("$.ActorType3Code")[0]))

    # Ethnic group
    doc.kg.add_value("ethnic_group", json_path="$.ActorEthnicCode")
    doc.kg.add_value("ethnic_group", value=doc.extract(
        self.ethnic_group_decoder, doc.select_segments("$.ActorEthnicCode")[0]))

    # Religion
    doc.kg.add_value("religion", json_path="$.ActorReligion1Code")
    doc.kg.add_value("religion", value=doc.extract(
        self.religion_decoder, doc.select_segments("$.ActorReligion1Code")[0]))
    doc.kg.add_value("religion", json_path="$.ActorReligion2Code")
    doc.kg.add_value("religion", value=doc.extract(
        self.religion_decoder, doc.select_segments("$.ActorReligion2Code")[0]))

    # Known group: putting as label
    doc.kg.add_value("label", json_path="$.ActorKnownGroupCode")
    doc.kg.add_value("label", value=doc.extract(
        self.known_group_decoder, doc.select_segments("$.ActorKnownGroupCode")[0]))

    # Country: refers to the affiliation, being mapped to country of actor, losing the distinction.
    doc.kg.add_value("country", json_path="$.ActorCountryCode")
    doc.kg.add_value("country", value=doc.extract(
        self.country_decoder, doc.select_segments("$.ActorCountryCode")[0]))

    # Note: not mapping the Actor Geo codes, because Pedro doesn't understand what they mean.
    return list()
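# Each doc.extract(...) call above indexes doc.select_segments(...)[0], which
# raises IndexError when the attribute is missing from the CDR document. A
# hedged defensive variant (the helper name is illustrative):
def add_decoded_value_sketch(doc: Document, field: str, decoder, json_path: str) -> None:
    segments = doc.select_segments(json_path)
    if segments:  # only decode when the json path matched something
        doc.kg.add_value(field, value=doc.extract(decoder, segments[0]))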