Example #1
    def process_ems(self, doc: Document) -> List[Document]:
        """
        Run every applicable ETK module on the document and, recursively, on any
        new documents those modules create.

        Args:
            doc (Document): the document to process

        Returns: a list containing the processed document followed by all nested
            documents created by the ETK modules

        """
        new_docs = list()

        for a_em in self.em_lst:
            if a_em.document_selector(doc):
                self.log(" processing with " + str(type(a_em)) + ". Process",
                         "info", doc.doc_id, doc.url)
                fresh_docs = a_em.process_document(doc)
                # Allow ETK modules to return None instead of an empty list (authors often forget to return one)
                if fresh_docs:
                    new_docs.extend(fresh_docs)
            # try:
            #     if a_em.document_selector(doc):
            #         self.log(" processing with " + str(type(a_em)) + ". Process", "info", doc.doc_id, doc.url)
            #         new_docs.extend(a_em.process_document(doc))
            # except Exception as e:
            #     if self.error_policy == ErrorPolicy.THROW_EXTRACTION:
            #         self.log(str(e) + " processing with " + str(type(a_em)) + ". Continue", "error", doc.doc_id,
            #                  doc.url)
            #         continue
            #     if self.error_policy == ErrorPolicy.THROW_DOCUMENT:
            #         self.log(str(e) + " processing with " + str(type(a_em)) + ". Throw doc", "error", doc.doc_id,
            #                  doc.url)
            #         return list()
            #     if self.error_policy == ErrorPolicy.RAISE:
            #         self.log(str(e) + " processing with " + str(type(a_em)), "error", doc.doc_id, doc.url)
            #         raise e

        # Do house cleaning.
        doc.insert_kg_into_cdr()
        if not self.generate_json_ld:
            if "knowledge_graph" in doc.cdr_document:
                doc.cdr_document["knowledge_graph"].pop("@context", None)
        Utility.make_json_serializable(doc.cdr_document)

        if self.output_kg_only:
            doc = doc.kg.value
        elif not doc.doc_id:
            doc.doc_id = Utility.create_doc_id_from_json(doc.cdr_document)

        results = [doc]
        for new_doc in new_docs:
            results.extend(self.process_ems(new_doc))

        return results
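
A minimal driver sketch for the method above, assuming an already-configured etk instance (the one whose em_lst is used here) and a hypothetical input.jl file with one JSON document per line; only create_document and process_ems from these examples are relied on, and output_kg_only is assumed to be off so each result is a Document:

import json

with open('input.jl') as f_in, open('output.jl', 'w') as f_out:
    for line in f_in:
        doc = etk.create_document(json.loads(line))
        # process_ems returns the processed root document followed by every
        # nested document the registered ETK modules created (recursively).
        for result in etk.process_ems(doc):
            f_out.write(json.dumps(result.cdr_document) + '\n')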
Example #2
    def build_knowledge_graph(self, json_ontology: dict) -> List:
        """
        The idea is to build a JSON knowledge graph from a JSON-like ontology representation, e.g.:
         kg_object_ontology = {
            "uri": doc.doc_id,
            "country": doc.select_segments("$.Location"),
            "type": [
                "Event",
                doc.extract(self.incomp_decoder, doc.select_segments("$.Incomp")[0]),
                doc.extract(self.int_decoder, doc.select_segments("$.Int")[0])
            ]
        }
        Currently only the JSON ontology representation is supported; other formats may be added later.
        Args:
            json_ontology: a json ontology representation of a knowledge graph

        Returns: a list of the nested documents created

        """
        nested_docs = list()
        if json_ontology:
            for key in list(json_ontology):
                j_values = json_ontology[key]
                if not isinstance(j_values, list):
                    j_values = [j_values]
                for j_value in j_values:
                    if not isinstance(j_value, dict):
                        if self.kg:
                            if key not in ['doc_id', 'uri']:
                                self.kg.add_value(key, value=j_value)
                    else:
                        """Now we have to create a nested document, assign it a doc_id and 
                           add the doc_id to parent document's knowledge graph"""
                        child_doc_id = None
                        if 'uri' in j_value:
                            child_doc_id = j_value['uri']
                        elif 'doc_id' in j_value:
                            child_doc_id = j_value['doc_id']

                        child_doc = Document(self.etk,
                                             cdr_document=dict(),
                                             mime_type='json',
                                             url='')
                        nested_docs.extend(
                            child_doc.build_knowledge_graph(j_value))

                        if not child_doc_id:
                            child_doc_id = Utility.create_doc_id_from_json(
                                child_doc.kg._kg)

                        if self.kg:
                            self.kg.add_value(key, value=child_doc_id)
                        child_doc.cdr_document["doc_id"] = child_doc_id

                        nested_docs.append(child_doc)

        return nested_docs
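
As a usage sketch, an ETK module's process_document can hand this method a JSON-ontology dict and return the nested documents it creates; the field names below are illustrative, mirroring the docstring's example:

    def process_document(self, doc: Document) -> List[Document]:
        kg_object = {
            "uri": doc.doc_id,
            "country": doc.select_segments("$.Location"),
            "type": ["Event"],
            "place": {
                "uri": doc.doc_id + "_place",
                "type": ["Place"],
                "country": doc.select_segments("$.Location")
            }
        }
        # The nested "place" dict becomes its own Document; the parent KG keeps
        # only its doc_id under the "place" field.
        return doc.build_knowledge_graph(kg_object)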
Example #3
    def process_document(self, doc: Document) -> List[Document]:
        nested_docs = list()
        doc.doc_id = Utility.create_doc_id_from_json(doc.cdr_document)

        doc.cdr_document["title"] = "{Total} Displaced from {ReportedLocation} in {Country}".format(
            Total=doc.cdr_document["Total"],
            ReportedLocation=doc.cdr_document["ReportedLocation"],
            Country=doc.cdr_document["Country"])
        doc.cdr_document["dataset"] = "lake_chad_basin_displaced"

        place = {
            "uri": '{}_place'.format(doc.doc_id),
            "doc_id": '{}_place'.format(doc.doc_id),
            "country": doc.cdr_document.get("Country", ''),
            "dataset": "lcb_place"
        }
        place_doc = etk.create_document(place)
        nested_docs.append(place_doc)
        doc.kg.add_value("place", value='{}_place'.format(doc.doc_id))

        # Add event_date to the KG
        extracted_dates = self.date_extractor.extract(
            doc.cdr_document.get('Period', ''))
        doc.kg.add_value("event_date", value=extracted_dates)
        doc.kg.add_value("event_date_end", value=extracted_dates)

        doc.kg.add_value("location", json_path="ReportedLocation")
        doc.kg.add_value(
            "causeex_class",
            value="http://ontology.causeex.com/ontology/odps/EventHierarchy#ForcedMove")
        doc.kg.add_value("type", value=["event", "Displacement Event"])
        doc.kg.add_value("title", json_path="title")

        victim = {
            "dataset": "lake_chad_basin_displaced_victim",
            "total": doc.cdr_document["Total"],
            "type": ["Group", "Displaced People"],
            "uri": '{}_victim'.format(doc.doc_id)
        }
        victim_doc = etk.create_document(victim)
        victim_doc.doc_id = '{}_victim'.format(doc.doc_id)

        doc.kg.add_value("victim", value=victim_doc.doc_id)
        nested_docs.append(victim_doc)

        return nested_docs
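
The knowledge-graph calls above mix the two forms of add_value: one takes a JSONPath into the CDR document, the other takes an explicit value. As a small sketch (ignoring provenance details), these two lines add the same value to the same field:

        doc.kg.add_value("location", json_path="$.ReportedLocation")
        doc.kg.add_value("location", value=doc.cdr_document.get("ReportedLocation"))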
Example #4
    def process_document(self, doc: Document) -> List[Document]:
        nested_docs = list()
        # pyexcel produces dicts with date objects, which are not JSON serializable; fix that.
        Utility.make_json_serializable(doc.cdr_document)

        # Add an ID based on the full contents of the raw document
        doc.doc_id = Utility.create_doc_id_from_json(doc.cdr_document)

        # Create a CDR document for an actor; we only put the SideA attribute in it
        # and give it a new dataset identifier so we can match it in an ETKModule
        actor1_dict = {
            "Side": doc.cdr_document["SideA"],
            "dataset": "ucdp-actor"
        }
        actor1_doc = etk.create_document(actor1_dict)

        # Create a doc_id for the actor document, from the doc_id of the event document
        actor1_doc.doc_id = doc.doc_id + "_actor1"

        # Now do the exact same thing for SideB
        actor2_dict = {
            "Side": doc.cdr_document["SideB"],
            "dataset": "ucdp-actor"
        }
        actor2_doc = etk.create_document(actor2_dict)
        actor2_doc.doc_id = doc.doc_id + "_actor2"

        kg_object_old_ontology = {
            "uri": doc.doc_id,
            "place":{
                "uri": doc.doc_id + "_place",
                "doc_id": doc.doc_id + "_place",
                "country": doc.select_segments("$.Location"),
                "type": ["Place"]
            },
            "type": [
                "Event",
                doc.extract(self.incomp_decoder, doc.select_segments("$.Incomp")[0]),
                doc.extract(self.int_decoder, doc.select_segments("$.Int")[0])
            ],
            "title": "{}/{} armed conflict in {}".format(
                doc.cdr_document["SideA"],
                doc.cdr_document["SideB"],
                doc.cdr_document["YEAR"]
            ),
            "causeex_class": [
                doc.extract(self.int_causeex_decoder, doc.select_segments("$.Int")[0]),
                self.event_prefix + "ArmedConflict"
            ],
            "event_date": doc.select_segments("$.StartDate"),
            "event_date_end": doc.select_segments("$.EpEndDate"),
            "fatalities": {
                "uri": doc.doc_id + "_fatalities",
                "title": doc.extract(self.int_fatalities_decoder, doc.select_segments("$.Int")[0]),
                "type": ["Group", "Dead People"],
                "min_size": doc.extract(self.int_fatalities_size_lower_decoder,
                                                doc.select_segments("$.Int")[0]),
                "max_size": doc.extract(self.int_fatalities_size_upper_decoder, doc.select_segments("$.Int")[0])
            },
            "actor": [actor1_doc.doc_id, actor2_doc.doc_id]
        }
        ds = doc.build_knowledge_graph(kg_object_old_ontology)
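        # ds now holds the Documents that build_knowledge_graph created for the
        # nested "place" and "fatalities" dicts; the parent KG stores only their
        # doc_ids under those fields (see Example #2).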

        nested_docs.extend(ds)

        # Return the list of new documents that we created to be processed by ETK.
        # The nested documents returned by build_knowledge_graph (e.g. the place and
        # fatalities objects) are included as well; any of them without a matching
        # extraction module will be passed to the output unchanged.
        nested_docs.append(actor1_doc)
        nested_docs.append(actor2_doc)
        return nested_docs
Example #5
    def process_document(self, doc: Document) -> List[Document]:
        # pyexcel produces dicts with date objects, which are not JSON serializable; fix that.
        Utility.make_json_serializable(doc.cdr_document)

        # Add an ID based on the full contents of the raw document
        doc.doc_id = Utility.create_doc_id_from_json(doc.cdr_document)

        # Map location to country
        doc.kg.add_value("country", json_path="$.Location")

        # map incomp to type, after using a decoding dict
        doc.kg.add_value("type", value=doc.extract(self.incomp_decoder, doc.select_segments("$.Incomp")[0]))

        # map Int to type, also after using a decoding dict
        doc.kg.add_value("type", value=doc.extract(self.int_decoder, doc.select_segments("$.Int")[0]))

        # Add "Event" to type, as all these documents are events
        doc.kg.add_value("type", value="Event")

        # Add a title to our event
        doc.kg.add_value("title", value="{}/{} armed conflict in {}".format(
            doc.cdr_document["SideA"],
            doc.cdr_document["SideB"],
            doc.cdr_document["YEAR"]
        ))

        # Add the specific CauseEx ontology classes that we want to use for this event
        doc.kg.add_value("causeex_class", value=doc.extract(self.int_causeex_decoder, doc.select_segments("$.Int")[0]))
        doc.kg.add_value("causeex_class", value=self.event_prefix+"ArmedConflict")

        # Map dates to event_date
        doc.kg.add_value("event_date", json_path="$.StartDate")
        doc.kg.add_value("event_date_end", json_path="$.EpEndDate")

        # Create a CDR document for an actor; we only put the SideA attribute in it
        # and give it a new dataset identifier so we can match it in an ETKModule
        actor1_dict = {
            "Side": doc.cdr_document["SideA"],
            "dataset": "ucdp-actor"
        }
        actor1_doc = etk.create_document(actor1_dict)

        # Create a doc_id for the actor document, from the doc_id of the event document
        actor1_doc.doc_id = doc.doc_id + "_actor1"

        # Record the identifier of the actor object in the "actor" field of the event.
        doc.kg.add_value("actor", value=actor1_doc.doc_id)

        # Now do the exact same thing for SideB
        actor2_dict = {
            "Side": doc.cdr_document["SideB"],
            "dataset": "ucdp-actor"
        }
        actor2_doc = etk.create_document(actor2_dict)
        actor2_doc.doc_id = doc.doc_id + "_actor2"
        doc.kg.add_value("actor", value=actor2_doc.doc_id)

        # Create a fatalities object to record information about the fatalities in the conflict
        # Instead of creating an ETK module for it, it is possible to do it inline.
        fatalities_doc = etk.create_document({"Int": doc.cdr_document["Int"]})
        fatalities_doc.doc_id = doc.doc_id + "_fatalities"
        doc.kg.add_value("fatalities", value=fatalities_doc.doc_id)
        fatalities_doc.kg.add_value(
            "title",
            fatalities_doc.extract(self.int_fatalities_decoder, fatalities_doc.select_segments("$.Int")[0]))
        fatalities_doc.kg.add_value("type", value=["Group", "Dead People"])
        fatalities_doc.kg.add_value(
            "size_lower_bound", value=fatalities_doc.extract(self.int_fatalities_size_lower_decoder,
                                                             fatalities_doc.select_segments("$.Int")[0]))
        fatalities_doc.kg.add_value(
            "size_upper_bound", value=fatalities_doc.extract(self.int_fatalities_size_upper_decoder,
                                                             fatalities_doc.select_segments("$.Int")[0]))

        # Return the list of new documents that we created to be processed by ETK.
        # Note that fatalities_doc is in the list as it is a newly created document. It does not have an
        # extraction module, so it will be passed to the output unchanged.
        return [
            actor1_doc,
            actor2_doc,
            fatalities_doc
        ]
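
The "ucdp-actor" dataset identifier given to the actor documents above is what a companion ETK module would match on. A hypothetical selector for such a module might look like this (the class name and the "title" mapping are assumptions for illustration):

    class UcdpActorModule(ETKModule):
        def document_selector(self, doc: Document) -> bool:
            # Only claim the documents carrying the dataset tag set above
            return doc.cdr_document.get("dataset") == "ucdp-actor"

        def process_document(self, doc: Document) -> List[Document]:
            # Map the actor name stored under "Side" into the knowledge graph
            doc.kg.add_value("title", json_path="$.Side")
            return []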
Example #6
    def process_document(self, doc: Document) -> List[Document]:
        nested_docs = list()

        json_doc = doc.cdr_document
        filename = json_doc.get('file_name')
        doc.doc_id = Utility.create_doc_id_from_json(json_doc)
        doc.cdr_document['uri'] = doc.doc_id
        doc.kg.add_value("type", value="Event")
        doc.kg.add_value("type", value="Act of Terrorism")
        doc.kg.add_value("provenance_filename", value=filename)
        for attack_type_code in attack_type_fields_code:
            ac = json_doc.get(attack_type_code, '')
            if ac != "":
                doc.kg.add_value("causeex_class",
                                 value=doc.extract(
                                     self.causeex_decoder,
                                     doc.select_segments(
                                         "$.{}".format(attack_type_code))[0]))

        # Add event_date to the KG
        extracted_dates = self.date_extractor.extract('{}-{}-{}'.format(
            json_doc.get('iyear'), json_doc.get('imonth'),
            json_doc.get('iday')))
        if len(extracted_dates) > 0:
            doc.kg.add_value("event_date", value=extracted_dates)
            doc.kg.add_value("event_date_end", value=extracted_dates)
        else:
            # no proper date mentioned in the event, try the approximate date
            approximate_date_txt = json_doc.get("approxdate")
            extracted_approx_dates = self.date_extractor.extract(
                approximate_date_txt)
            if len(extracted_approx_dates) > 0:
                doc.kg.add_value("event_date", value=extracted_approx_dates)
                doc.kg.add_value("event_date_end",
                                 value=extracted_approx_dates)

        # summary (aka description) is only available for incidents after 1997
        doc.kg.add_value("description", json_path="$.summary")

        # add inclusion criteria: why this incident is regarded as a terrorist incident
        # TODO: ADD this to master_config
        crit1 = json_doc.get('crit1', 0)
        if crit1 == 1:
            doc.kg.add_value("inclusion_criteria", value=inclusion_criteria_1)

        crit2 = json_doc.get('crit2', 0)
        if crit2 == 1:
            doc.kg.add_value("inclusion_criteria", value=inclusion_criteria_2)

        crit3 = json_doc.get('crit3', 0)
        if crit3 == 1:
            doc.kg.add_value("inclusion_criteria", value=inclusion_criteria_3)

        # add related events to KG
        # TODO: ADD this to master_config
        related_event_ids_txt = json_doc.get('related', '')
        if related_event_ids_txt.strip() != "":
            related_event_ids = related_event_ids_txt.split(',')
            if len(related_event_ids) > 0:
                doc.kg.add_value("related_events", value=related_event_ids)

        # add attack information; on second thought, this qualifies as event type
        for attack_type_field in attack_type_fields:
            doc.kg.add_value("type", value=json_doc.get(attack_type_field, ''))

        # TODO check the following 2
        if json_doc.get("suicide", 0) == 1:
            doc.kg.add_value("type", value='Suicide')

        if json_doc.get("success", 0) == 1:
            doc.kg.add_value("type", value='Success')

        # create nested objects for places
        place_object = dict()
        for place_field in place_fields:
            place_object[place_field] = json_doc.get(place_field)
        place_object["dataset"] = "gtd_place"

        place_doc_id = '{}_place'.format(doc.doc_id)
        place_object['uri'] = place_doc_id
        place_object['filename'] = filename
        place_doc = etk.create_document(place_object)
        place_doc.doc_id = place_doc_id

        doc.kg.add_value("place", value=place_doc.doc_id)
        nested_docs.append(place_doc)

        # create victim objects; there can be up to 3
        if json_doc.get('targtype1_txt', '').strip():
            victim1_object = dict()
            victim1_object['dataset'] = 'gtd_victim'
            victim1_object['filename'] = filename
            victim1_object['victim_type'] = list()
            victim1_object['victim_type'].append(json_doc.get('targtype1_txt'))
            if json_doc.get('targsubtype1_txt', ''):
                victim1_object['victim_type'].append(
                    json_doc.get('targsubtype1_txt'))
            victim1_object['victim_corp'] = json_doc.get('corp1', '')
            victim1_object['victim_target'] = json_doc.get('target1', '')
            victim1_object['victim_nationality'] = json_doc.get(
                'natlty1_txt', '')
            victim1_doc_id = '{}_victim1'.format(doc.doc_id)
            victim1_object['uri'] = victim1_doc_id
            victim1_doc = etk.create_document(victim1_object)
            victim1_doc.doc_id = victim1_doc_id
            doc.kg.add_value('victim', value=victim1_doc.doc_id)
            nested_docs.append(victim1_doc)

        if json_doc.get('targtype2_txt', '').strip():
            victim2_object = dict()
            victim2_object['dataset'] = 'gtd_victim'
            victim2_object['filename'] = filename
            victim2_object['victim_type'] = list()
            victim2_object['victim_type'].append(json_doc.get('targtype2_txt'))
            if json_doc.get('targsubtype2_txt', ''):
                victim2_object['victim_type'].append(
                    json_doc.get('targsubtype2_txt'))
            victim2_object['victim_corp'] = json_doc.get('corp2', '')
            victim2_object['victim_target'] = json_doc.get('target2', '')
            victim2_object['victim_nationality'] = json_doc.get(
                'natlty2_txt', '')
            victim2_doc_id = '{}_victim2'.format(doc.doc_id)
            victim2_object['uri'] = victim2_doc_id
            victim2_doc = etk.create_document(victim2_object)
            victim2_doc.doc_id = victim2_doc_id
            doc.kg.add_value('victim', value=victim2_doc.doc_id)
            nested_docs.append(victim2_doc)

        if json_doc.get('targtype3_txt', '').strip():
            victim3_object = dict()
            victim3_object['dataset'] = 'gtd_victim'
            victim3_object['filename'] = filename
            victim3_object['victim_type'] = list()
            victim3_object['victim_type'].append(json_doc.get('targtype3_txt'))
            if json_doc.get('targsubtype3_txt', ''):
                victim3_object['victim_type'].append(
                    json_doc.get('targsubtype3_txt'))
            victim3_object['victim_corp'] = json_doc.get('corp3', '')
            victim3_object['victim_target'] = json_doc.get('target3', '')
            victim3_object['victim_nationality'] = json_doc.get(
                'natlty3_txt', '')
            victim3_doc_id = '{}_victim3'.format(doc.doc_id)
            victim3_object['uri'] = victim3_doc_id
            victim3_doc = etk.create_document(victim3_object)
            victim3_doc.doc_id = victim3_doc_id
            doc.kg.add_value('victim', value=victim3_doc.doc_id)
            nested_docs.append(victim3_doc)

        # create actor/perpetrators objects
        if json_doc.get('gname', '').strip():
            actor1_object = dict()
            actor1_object['dataset'] = 'gtd_actor'
            actor1_object['filename'] = filename
            actor1_object['actor_group'] = list()
            actor1_object['actor_group'].append(json_doc.get('gname'))
            if json_doc.get('gsubname', ''):
                actor1_object['actor_group'].append(json_doc.get('gsubname'))

            actor1_doc_id = '{}_actor1'.format(doc.doc_id)
            actor1_object['uri'] = actor1_doc_id
            actor1_doc = etk.create_document(actor1_object)
            actor1_doc.doc_id = actor1_doc_id
            doc.kg.add_value('actor', value=actor1_doc.doc_id)
            nested_docs.append(actor1_doc)

        if json_doc.get('gname2', '').strip():
            actor2_object = dict()
            actor2_object['dataset'] = 'gtd_actor'
            actor2_object['filename'] = filename
            actor2_object['actor_group'] = list()
            actor2_object['actor_group'].append(json_doc.get('gname2'))
            if json_doc.get('gsubname2', ''):
                actor2_object['actor_group'].append(json_doc.get('gsubname2'))
            actor2_doc_id = '{}_actor2'.format(doc.doc_id)
            actor2_object['uri'] = actor2_doc_id
            actor2_doc = etk.create_document(actor2_object)
            actor2_doc.doc_id = actor2_doc_id
            doc.kg.add_value('actor', value=actor2_doc.doc_id)
            nested_docs.append(actor2_doc)

        if json_doc.get('gname3', '').strip():
            actor3_object = dict()
            actor3_object['dataset'] = 'gtd_actor'
            actor3_object['filename'] = filename
            actor3_object['actor_group'] = list()
            actor3_object['actor_group'].append(json_doc.get('gname3'))
            if json_doc.get('gsubname3', ''):
                actor3_object['actor_group'].append(json_doc.get('gsubname3'))
            actor3_doc_id = '{}_actor3'.format(doc.doc_id)
            actor3_object['uri'] = actor3_doc_id
            actor3_doc = etk.create_document(actor3_object)
            actor3_doc.doc_id = actor3_doc_id
            doc.kg.add_value('actor', value=actor3_doc.doc_id)
            nested_docs.append(actor3_doc)

        # create weapon objects, up to 4
        if json_doc.get('weaptype1_txt', '').strip():
            weapon1_object = dict()
            weapon1_object['dataset'] = 'gtd_weapon'
            weapon1_object['filename'] = filename
            weapon1_object['weapon_title'] = json_doc.get('weapdetail', '')
            weapon1_object['weapon_type'] = list()
            weapon1_object['weapon_type'].append(json_doc.get('weaptype1_txt'))
            if json_doc.get('weapsubtype1_txt', ''):
                weapon1_object['weapon_type'].append(
                    json_doc.get('weapsubtype1_txt'))
            if json_doc.get('weaptype1', '') != '':
                weapon1_object['weapon_code'] = json_doc.get('weaptype1')
            weapon1_doc_id = '{}_weapons1'.format(doc.doc_id)
            weapon1_object['uri'] = weapon1_doc_id
            weapon1_doc = etk.create_document(weapon1_object)
            weapon1_doc.doc_id = weapon1_doc_id
            doc.kg.add_value('weapons', weapon1_doc.doc_id)
            nested_docs.append(weapon1_doc)

        if json_doc.get('weaptype2_txt', '').strip():
            weapon2_object = dict()
            weapon2_object['dataset'] = 'gtd_weapon'
            weapon2_object['filename'] = filename
            weapon2_object['weapon_title'] = json_doc.get('weapdetail', '')
            weapon2_object['weapon_type'] = list()
            weapon2_object['weapon_type'].append(json_doc.get('weaptype2_txt'))
            if json_doc.get('weapsubtype2_txt', ''):
                weapon2_object['weapon_type'].append(
                    json_doc.get('weapsubtype2_txt'))
            if json_doc.get('weaptype2', '') != '':
                weapon2_object['weapon_code'] = json_doc.get('weaptype2')
            weapon2_doc_id = '{}_weapons2'.format(doc.doc_id)
            weapon2_object['uri'] = weapon2_doc_id
            weapon2_doc = etk.create_document(weapon2_object)
            weapon2_doc.doc_id = weapon2_doc_id
            doc.kg.add_value('weapons', weapon2_doc.doc_id)
            nested_docs.append(weapon2_doc)

        if json_doc.get('weaptype3_txt', '').strip():
            weapon3_object = dict()
            weapon3_object['dataset'] = 'gtd_weapon'
            weapon3_object['filename'] = filename
            weapon3_object['weapon_title'] = json_doc.get('weapdetail', '')
            weapon3_object['weapon_type'] = list()
            weapon3_object['weapon_type'].append(json_doc.get('weaptype3_txt'))
            if json_doc.get('weapsubtype3_txt', ''):
                weapon3_object['weapon_type'].append(
                    json_doc.get('weapsubtype3_txt'))
            if json_doc.get('weaptype3', '') != '':
                weapon3_object['weapon_code'] = json_doc.get('weaptype3')
            weapon3_doc_id = '{}_weapons3'.format(doc.doc_id)
            weapon3_object['uri'] = weapon3_doc_id
            weapon3_doc = etk.create_document(weapon3_object)
            weapon3_doc.doc_id = weapon3_doc_id
            doc.kg.add_value('weapons', weapon3_doc.doc_id)
            nested_docs.append(weapon3_doc)

        if json_doc.get('weaptype4_txt', '').strip():
            weapon4_object = dict()
            weapon4_object['dataset'] = 'gtd_weapon'
            weapon4_object['filename'] = filename
            weapon4_object['weapon_title'] = json_doc.get('weapdetail', '')
            weapon4_object['weapon_type'] = list()
            weapon4_object['weapon_type'].append(json_doc.get('weaptype4_txt'))
            if json_doc.get('weapsubtype4_txt', ''):
                weapon4_object['weapon_type'].append(
                    json_doc.get('weapsubtype4_txt'))
            if json_doc.get('weaptype4', '') != '':
                weapon4_object['weapon_code'] = json_doc.get('weaptype4')
            weapon4_doc_id = '{}_weapons4'.format(doc.doc_id)
            weapon4_object['uri'] = weapon4_doc_id
            weapon4_doc = etk.create_document(weapon4_object)
            weapon4_doc.doc_id = weapon4_doc_id
            doc.kg.add_value('weapons', weapon4_doc.doc_id)
            nested_docs.append(weapon4_doc)

        # create total fatalities docs
        nkill = json_doc.get("nkill", 0)
        if nkill != "":
            total_fatalities_object = dict()
            total_fatalities_object["dataset"] = "gtd_fatality"
            total_fatalities_object['filename'] = filename
            total_fatalities_doc_id = '{}_total_fatalities'.format(doc.doc_id)
            total_fatalities_object['uri'] = total_fatalities_doc_id
            total_fatalities_object["size"] = nkill
            total_fatalities_doc = etk.create_document(total_fatalities_object)
            total_fatalities_doc.doc_id = total_fatalities_doc_id
            doc.kg.add_value("fatalities", value=total_fatalities_doc_id)
            nested_docs.append(total_fatalities_doc)

        # create US fatalities docs
        nkillus = json_doc.get("nkillus", 0)
        if nkillus != "":
            us_fatalities_object = dict()
            us_fatalities_object["dataset"] = "gtd_fatality"
            us_fatalities_object['filename'] = filename
            us_fatalities_doc_id = '{}_us_fatalities'.format(doc.doc_id)
            us_fatalities_object['uri'] = us_fatalities_doc_id
            us_fatalities_object["size"] = nkillus
            us_fatalities_object["nationality"] = "United States"
            us_fatalities_doc = etk.create_document(us_fatalities_object)
            us_fatalities_doc.doc_id = us_fatalities_doc_id
            doc.kg.add_value("fatalities", value=us_fatalities_doc_id)
            nested_docs.append(us_fatalities_doc)

        # create total injuries docs
        nwound = json_doc.get("nwound", 0)
        if nwound != "":
            total_injuries_object = dict()
            total_injuries_object["dataset"] = "gtd_injury"
            total_injuries_object['filename'] = filename
            total_injuries_doc_id = '{}_total_injuries'.format(doc.doc_id)
            total_injuries_object['uri'] = total_injuries_doc_id
            total_injuries_object["size"] = nwound
            total_injuries_doc = etk.create_document(total_injuries_object)
            total_injuries_doc.doc_id = total_injuries_doc_id
            doc.kg.add_value("injuries", value=total_injuries_doc_id)
            nested_docs.append(total_injuries_doc)

        # create US injuries docs
        nwoundus = json_doc.get("nwoundus", 0)
        if nwoundus != "":
            us_injuries_object = dict()
            us_injuries_object["dataset"] = "gtd_injury"
            us_injuries_object['filename'] = filename
            us_injuries_doc_id = '{}_us_injuries'.format(doc.doc_id)
            us_injuries_object['uri'] = us_injuries_doc_id
            us_injuries_object["size"] = nwoundus
            us_injuries_doc = etk.create_document(us_injuries_object)
            us_injuries_doc.doc_id = us_injuries_doc_id
            doc.kg.add_value("injuries", value=us_injuries_doc_id)
            nested_docs.append(us_injuries_doc)

        # create damage docs
        # in this dataset we only have property damage
        if json_doc.get("property", 0) == 1:
            damage_object = dict()
            damage_object["dataset"] = "gtd_damage"
            damage_object['filename'] = filename
            damage_object["damage_title"] = json_doc.get("propextent_txt")
            damage_object["damage_value"] = json_doc.get("propvalue")
            damage_object["damage_description"] = json_doc.get("propcomment")
            damage_object_doc_id = '{}_damage'.format(doc.doc_id)
            damage_object['uri'] = damage_object_doc_id
            damage_doc = etk.create_document(damage_object)
            damage_doc.doc_id = damage_object_doc_id
            doc.kg.add_value("damage", value=damage_object_doc_id)
            nested_docs.append(damage_doc)

        return nested_docs
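
The three victim blocks above (and likewise the actor and weapon blocks) differ only in the numeric suffix of the GTD field names, so a possible refactoring, sketched here and intended to behave exactly like the explicit victim blocks, is a single loop over the suffixes:

        for i in (1, 2, 3):
            if json_doc.get('targtype{}_txt'.format(i), '').strip():
                victim_object = {
                    'dataset': 'gtd_victim',
                    'filename': filename,
                    'victim_type': [json_doc.get('targtype{}_txt'.format(i))],
                    'victim_corp': json_doc.get('corp{}'.format(i), ''),
                    'victim_target': json_doc.get('target{}'.format(i), ''),
                    'victim_nationality': json_doc.get('natlty{}_txt'.format(i), ''),
                    'uri': '{}_victim{}'.format(doc.doc_id, i)
                }
                if json_doc.get('targsubtype{}_txt'.format(i), ''):
                    victim_object['victim_type'].append(
                        json_doc.get('targsubtype{}_txt'.format(i)))
                victim_doc = etk.create_document(victim_object)
                victim_doc.doc_id = victim_object['uri']
                doc.kg.add_value('victim', value=victim_doc.doc_id)
                nested_docs.append(victim_doc)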
Example #7
    def process_document(self, doc):
        # extraction
        variables = {
            'value': '$col,$row',
            'food_name_in_english': '$B,$row',
            'food_name_in_french': '$C,$row',
            'scientific_name': '$D,$row',
            'code': '$A,$row',
            'source': '$E,$row',
            'nutrition': '$col,$2',
            'row': '$row',
            'col': '$col'
        }

        raw_extractions = self.ee.extract(doc.cdr_document['file_path'],
                                          'USERDATABASE', ['F,3', 'AG,971'],
                                          variables)

        # post processing
        re_code = re.compile(r'^[0-9]{2}_[0-9]{3}$')
        re_value_in_bracket = re.compile(r'^.*\((.*)\)')
        re_value_in_square_bracket = re.compile(r'^.*\[(.*)\]')
        extracted_docs = []
        for e in raw_extractions:
            code = e['code'].strip()
            if not re_code.match(code):
                continue

            in_bracket_unit = re_value_in_bracket.search(e['nutrition'])
            unit = '' if not in_bracket_unit else in_bracket_unit.groups(1)[0]

            # parse value
            value = e['value']
            if e['nutrition'] == 'Energy (kcal) kJ':
                in_bracket_value = re_value_in_bracket.search(e['value'])
                value = in_bracket_value.groups(1)[0]
            elif isinstance(value, str):
                value = value.strip()
                in_square_bracket_value = re_value_in_square_bracket.search(
                    e['value'])
                if in_square_bracket_value:
                    value = in_square_bracket_value.groups(1)[0]

                # if it's a range, get the lower bound
                dash_pos = value.find('-')
                if dash_pos != -1:
                    value = value[:dash_pos]
            try:
                value = float(value)
            except (TypeError, ValueError):
                value = 0.0

            extracted_doc = {
                'tld': '',
                'website': '',
                'type': 'factoid',
                'factoid': {
                    'value': value,
                    'unit': unit,
                    'food_name_in_english': e['food_name_in_english'],
                    'food_name_in_french': e['food_name_in_french'],
                    'scientific_name': e['scientific_name'],
                    'source': e['source'],
                    'code': code,
                    'nutrition': e['nutrition'],
                    'metadata': {
                        'file_name': os.path.basename(doc.cdr_document['file_path']),
                        'sheet_name': 'USERDATABASE',
                        'row': str(e['row']),
                        'col': str(e['col'])
                    },
                    'identifier_key': 'code',
                    'identifier_value': code
                }
            }
            extracted_doc['doc_id'] = Utility.create_doc_id_from_json(
                extracted_doc)
            extracted_doc = etk.create_document(extracted_doc)

            # build kg
            extracted_doc.kg.add_value('metadata__unit',
                                       json_path='$.factoid.unit')
            extracted_doc.kg.add_value(
                'metadata__property_type',
                value=[
                    'http://ontology.causeex.com/ontology/odps/TimeSeriesAndMeasurements#Nutrition'
                ])
            extracted_doc.kg.add_value(
                'metadata__reported_value',
                value=[
                    'http://ontology.causeex.com/ontology/odps/TimeSeriesAndMeasurements#ReportedValue'
                ])
            extracted_doc.kg.add_value('provenance_col',
                                       json_path='$.factoid.metadata.col')
            extracted_doc.kg.add_value('provenance_row',
                                       json_path='$.factoid.metadata.row')
            extracted_doc.kg.add_value(
                'provenance_filename',
                json_path='$.factoid.metadata.file_name')
            extracted_doc.kg.add_value(
                'provenance_sheet', json_path='$.factoid.metadata.sheet_name')
            extracted_doc.kg.add_value('value', json_path='$.factoid.value')
            extracted_doc.kg.add_value('type', json_path='$.factoid.type')
            extracted_doc.kg.add_value('identifier_key',
                                       json_path='$.factoid.identifier_key')
            extracted_doc.kg.add_value('identifier_value',
                                       json_path='$.factoid.identifier_value')

            extracted_docs.append(extracted_doc)

        return extracted_docs
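
To make the bracket and range handling above concrete, here is a small self-contained illustration with made-up cell values (group(1) is equivalent to the groups(1)[0] calls used in the code):

import re

re_value_in_bracket = re.compile(r'^.*\((.*)\)')
re_value_in_square_bracket = re.compile(r'^.*\[(.*)\]')

# group(1) captures up to the last closing bracket: the unit for the
# 'Energy (kcal) kJ' column header is 'kcal'
assert re_value_in_bracket.search('Energy (kcal) kJ').group(1) == 'kcal'

# a square-bracketed cell such as '12.5 [10.1]' yields the bracketed value
assert re_value_in_square_bracket.search('12.5 [10.1]').group(1) == '10.1'

# a range such as '1.2-3.4' keeps only the lower bound
value = '1.2-3.4'
assert value[:value.find('-')] == '1.2'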