Esempio n. 1
0
    def process_document(self, doc: Document) -> List[Document]:
        """Populate the knowledge graph for one event record and create child docs.

        Returns the newly created documents (two actor documents plus any
        topic-event documents) so the caller can run them through their own
        extraction modules later.
        """
        # Record new documents we create that need to be processed later.
        new_docs = list()

        cameo_code = self.attribute_value(doc, "EventCode")
        print("Processing cameo code {}".format(cameo_code))

        # The record's global event id becomes the document id.
        doc.doc_id = str(doc.cdr_document[self.attribute("GLOBALEVENTID")])

        # Type: every record is an Event; known CAMEO codes add more types.
        doc.kg.add_value("type", "Event")
        if self.mapping.has_cameo_code(cameo_code):
            # Type fields: one KG "type" plus its expanded ontology class each.
            for t in self.mapping.event_type("event1", cameo_code):
                doc.kg.add_value("type", value=t)
                doc.kg.add_value("causeex_class", value=self.expand_prefix(t))

        # Event_date: source dates are %Y%m%d only, so default formats and
        # relative-date detection are switched off.
        for s in doc.select_segments("$." + self.attribute("SQLDATE")):
            doc.kg.add_value("event_date",
                             value=doc.extract(self.date_extractor,
                                               s,
                                               prefer_language_date_order=None,
                                               additional_formats=["%Y%m%d"],
                                               detect_relative_dates=False,
                                               use_default_formats=False))

        # CAMEO code label, built from the selected segment so provenance is kept.
        cameo_code_label = "CAMEO Code: " + str(
            doc.select_segments("$." + self.attribute("EventCode"))[0].value)
        doc.kg.add_value("code", value=cameo_code_label)
        # simpler without provenance:
        # doc.kg.add_value("code", "CAMEO Code: " + doc.cdr_document[self.attribute("EventCode")])

        # Identifier
        doc.kg.add_value("identifier",
                         json_path="$." + self.attribute("GLOBALEVENTID"))

        # Geographical information
        doc.kg.add_value("country_code",
                         json_path="$." +
                         self.attribute("ActionGeo_CountryCode"))
        doc.kg.add_value("location",
                         json_path="$." + self.attribute("ActionGeo_FullName"))

        # Actors: both actor documents are queued for later processing.
        actor1, actor2 = self.add_actors(doc, "event1", cameo_code)
        new_docs.append(actor1)
        new_docs.append(actor2)

        # has topic events
        for event in ["event2", "event3"]:
            new_docs.extend(
                self.add_topic_events(doc, event, actor1.doc_id,
                                      actor2.doc_id))

        return new_docs
Esempio n. 2
0
 def process_document(self, doc: Document) -> List[Document]:
     """Fill the KG for a weapon document; creates no child documents."""
     kg = doc.kg
     kg.add_value("type", value="Weapon")
     kg.add_value("title", json_path="$.weapon_title")
     kg.add_value("type", json_path="$.weapon_type[*]")
     kg.add_value("provenance_filename", json_path="$.filename")
     code_segment = doc.select_segments("$.weapon_code")[0]
     decoded_class = doc.extract(self.weapon_decoder, code_segment)
     kg.add_value("causeex_class", value=decoded_class)
     return []
Esempio n. 3
0
    def process_document(self, doc: Document):
        """Extract names from each project description and record them in the KG."""
        project_segments = doc.select_segments("projects[*]")
        description_segments = doc.select_segments("projects[*].description")

        for description, project in zip(description_segments, project_segments):
            extracted_names = doc.extract(self.sample_rule_extractor, description)
            # Keep the raw extractions on the project segment for provenance.
            project.store(extracted_names, "spacy_names")
            for extraction in extracted_names:
                doc.kg.add_value("spacy_name", value=extraction.value)
Esempio n. 4
0
    def process_document(self, doc: Document):
        """Extract dates from each date_description.text segment.

        Stores the extractions on the matching date_description segment and
        then maps them all into the KG "date" field. Demonstrates most of the
        date extractor's keyword options; the commented-out lines document the
        remaining ones with their defaults.
        """

        descriptions = doc.select_segments("date_description")
        date_text = doc.select_segments("date_description.text")

        # Dates outside [1890-01-01, 2500-10-10] are discarded by the extractor.
        ignore_before = datetime.datetime(1890, 1, 1)
        ignore_after = datetime.datetime(2500, 10, 10)
        relative_base = datetime.datetime(2018, 1, 1)

        for d, p in zip(date_text, descriptions):
            extracted_date = doc.extract(
                self.date_extractor,
                d,
                extract_first_date_only=False,   # False: return every date found

                additional_formats=['%Y@%m@%d', '%a %Y, %b %d'],

                use_default_formats=True,

                # ignore_dates_before: datetime.datetime = None,
                ignore_dates_before=ignore_before,

                # ignore_dates_after: datetime.datetime = None,
                ignore_dates_after=ignore_after,

                detect_relative_dates=False,

                relative_base=relative_base,

                # preferred_date_order: str = "MDY",  # used for interpreting ambiguous dates that are missing parts
                preferred_date_order="DMY",

                prefer_language_date_order=True,

                # timezone: str = None,  # default is local timezone.
                # timezone='GMT',

                # to_timezone: str = None,  # when not specified, no timezone conversion is done.
                # to_timezone='UTC',

                # return_as_timezone_aware: bool = True
                return_as_timezone_aware=False,

                # prefer_day_of_month: str = "first",  # can be "current", "first", "last".
                prefer_day_of_month='first',

                # prefer_dates_from: str = "current"  # can be "current", "future", "past".
                prefer_dates_from='future',

                # date_value_resolution: DateResolution = DateResolution.DAY
            )

            p.store(extracted_date, "extracted_date")

        # Map every stored extraction into the KG in one pass.
        doc.kg.add_doc_value("date", "date_description.extracted_date[*]")
Esempio n. 5
0
    def process_ems(self, doc: Document) -> List[Document]:
        """
        Run every applicable extraction module on a document, then recursively
        process any new documents those modules created.

        Args:
            doc (Document): process on this document

        Returns: a list of Documents — *doc* itself first (or only its
        knowledge-graph value when ``output_kg_only`` is set), followed by the
        results of recursively processing every document the modules created.

        """
        new_docs = list()

        for a_em in self.em_lst:
            if a_em.document_selector(doc):
                self.log(" processing with " + str(type(a_em)) + ". Process",
                         "info", doc.doc_id, doc.url)
                fresh_docs = a_em.process_document(doc)
                # Allow ETKModules to return nothing in lieu of an empty list (people forget to return empty list)
                if fresh_docs:
                    new_docs.extend(fresh_docs)
            # try:
            #     if a_em.document_selector(doc):
            #         self.log(" processing with " + str(type(a_em)) + ". Process", "info", doc.doc_id, doc.url)
            #         new_docs.extend(a_em.process_document(doc))
            # except Exception as e:
            #     if self.error_policy == ErrorPolicy.THROW_EXTRACTION:
            #         self.log(str(e) + " processing with " + str(type(a_em)) + ". Continue", "error", doc.doc_id,
            #                  doc.url)
            #         continue
            #     if self.error_policy == ErrorPolicy.THROW_DOCUMENT:
            #         self.log(str(e) + " processing with " + str(type(a_em)) + ". Throw doc", "error", doc.doc_id,
            #                  doc.url)
            #         return list()
            #     if self.error_policy == ErrorPolicy.RAISE:
            #         self.log(str(e) + " processing with " + str(type(a_em)), "error", doc.doc_id, doc.url)
            #         raise e

        # Do house cleaning: copy the KG into the CDR document and strip the
        # JSON-LD @context unless it was requested.
        doc.insert_kg_into_cdr()
        if not self.generate_json_ld:
            if "knowledge_graph" in doc.cdr_document:
                doc.cdr_document["knowledge_graph"].pop("@context", None)
        Utility.make_json_serializable(doc.cdr_document)

        if self.output_kg_only:
            # NOTE: doc is rebound to the raw KG value here, so the result
            # list may mix Documents and KG dicts when this flag is set.
            doc = doc.kg.value
        elif not doc.doc_id:
            doc.doc_id = Utility.create_doc_id_from_json(doc.cdr_document)

        # Recurse on every document created by the modules above.
        results = [doc]
        for new_doc in new_docs:
            results.extend(self.process_ems(new_doc))

        return results
    def process_document(self, doc: Document):
        """Split each LexisNexis description into sentences and store them on the doc."""
        for segment in doc.select_segments("lexisnexis.doc_description"):
            sentences = doc.extract(self.sentence_extractor, segment)
            doc.store(sentences, 'split_sentences')
        # No new documents are created by this module.
        return []
Esempio n. 7
0
    def process_document(self, doc: Document) -> List[Document]:
        """Build KG entries for one displacement record.

        Creates a place document and a victim document as children and
        returns them so ETK can process them later.
        """
        child_docs = []
        cdr = doc.cdr_document
        doc.doc_id = Utility.create_doc_id_from_json(cdr)

        cdr["title"] = f'{cdr["Total"]} Displaced from {cdr["ReportedLocation"]} in {cdr["Country"]}'
        cdr["dataset"] = "lake_chad_basin_displaced"

        place_id = f'{doc.doc_id}_place'
        place_doc = etk.create_document({
            "uri": place_id,
            "doc_id": place_id,
            "country": cdr.get("Country", ''),
            "dataset": "lcb_place"
        })
        child_docs.append(place_doc)
        doc.kg.add_value("place", value=place_id)

        # Add event_date to the KG; start and end use the same extraction.
        event_dates = self.date_extractor.extract(cdr.get('Period', ''))
        doc.kg.add_value("event_date", value=event_dates)
        doc.kg.add_value("event_date_end", value=event_dates)

        doc.kg.add_value("location", json_path="ReportedLocation")
        doc.kg.add_value(
            "causeex_class",
            value=
            "http://ontology.causeex.com/ontology/odps/EventHierarchy#ForcedMove"
        )
        doc.kg.add_value("type", value=["event", "Displacement Event"])
        doc.kg.add_value("title", json_path="title")

        victim_id = f'{doc.doc_id}_victim'
        victim_doc = etk.create_document({
            "dataset": "lake_chad_basin_displaced_victim",
            "total": cdr["Total"],
            "type": ["Group", "Displaced People"],
            "uri": victim_id
        })
        victim_doc.doc_id = victim_id

        doc.kg.add_value("victim", value=victim_doc.doc_id)
        child_docs.append(victim_doc)

        return child_docs
Esempio n. 8
0
    def process_document(self, doc: Document):
        """Extract event dates and copy notes into the KG description field."""
        if not self.document_selector(doc):
            return
        for segment in doc.select_segments(jsonpath='$.event_date'):
            extractions = doc.extract(extractor=self.date_extractor,
                                      extractable=segment)
            for extraction in extractions:
                doc.kg.add_value("event_date", value=extraction.value)
        doc.kg.add_value("description", json_path='$.notes')
Esempio n. 9
0
    def process_document(self, doc: Document):
        """Extract project members and students, then map them into the KG."""
        description_segments = doc.select_segments("projects[*].description")
        project_segments = doc.select_segments("projects[*]")

        for description, project in zip(description_segments, project_segments):
            member_names = doc.extract(self.name_extractor, description)
            project.store(member_names, "members")

            # Run the student extractor over each extracted member name.
            student_names = []
            for member in member_names:
                student_names.extend(
                    doc.extract(self.student_extractor, member))
            project.store(student_names, "students")

        doc.kg.add_value("developer", json_path="projects[*].members[*]")
        doc.kg.add_value("student_developer",
                         json_path="projects[*].students[*]")
        doc.kg.add_value("id", json_path='$.doc_id')
Esempio n. 10
0
 def test_etk_crf_glossary_extraction(self):
     """Glossary extraction should only yield cities from the glossary."""
     etk = ETK(use_spacy_tokenizer=False)
     start = time.time()
     extractor = GlossaryExtractor(['los angeles', 'new york', 'angeles'],
                                   'city_extractor',
                                   etk.default_tokenizer,
                                   case_sensitive=False,
                                   ngrams=3)
     sample_json = {
         'text':
         'i live in los angeles. my hometown is Beijing. I love New York City.'
     }
     doc = Document(etk,
                    cdr_document=sample_json,
                    mime_type='json',
                    url='',
                    doc_id='1')
     for segment in doc.select_segments("$.text"):
         for city in doc.extract(extractor, segment):
             self.assertTrue(
                 city.value in ['los angeles', 'New York', 'angeles'])
Esempio n. 11
0
    def create_document(self,
                        doc: Dict,
                        mime_type: str = None,
                        url: str = "http://ex.com/123") -> Document:
        """
        Factory method to wrap input JSON docs in an ETK Document object.

        Args:
            doc (object): a JSON object containing a document in CDR format.
            mime_type (str): if doc is a string, the mime_type tells what it is
            url (str): if the doc came from the web, specifies the URL for it

        Returns: wrapped Document

        """
        # Forward mime_type and url: previously they were accepted (and
        # documented) but silently dropped, so Document always got defaults.
        return Document(self, doc, mime_type=mime_type, url=url)
Esempio n. 12
0
    def process_document(self, doc: Document) -> List[Document]:
        """Build the KG for one armed-conflict record via a declarative object.

        Creates two actor documents (SideA/SideB) and hands a nested dict to
        ``doc.build_knowledge_graph``; returns the documents ETK still needs
        to process.
        """
        nested_docs = list()
        # pyexcel produces dicts with date objects, which are not JSON serializable, fix that.
        Utility.make_json_serializable(doc.cdr_document)

        # Add an ID based on the full contents of the raw document
        doc.doc_id = Utility.create_doc_id_from_json(doc.cdr_document)

        # Create a CDR document for an actor, we only put the SideA attribute in it,
        # and we give it a new dataset identifier so we can match it in an ETKModule
        actor1_dict = {
            "Side": doc.cdr_document["SideA"],
            "dataset": "ucdp-actor"
        }
        actor1_doc = etk.create_document(actor1_dict)

        # Create a doc_id for the actor document, from the doc_id of the event document
        actor1_doc.doc_id = doc.doc_id + "_actor1"

        # Now do the exact same thing for SideB
        actor2_dict = {
            "Side": doc.cdr_document["SideB"],
            "dataset": "ucdp-actor"
        }
        actor2_doc = etk.create_document(actor2_dict)
        actor2_doc.doc_id = doc.doc_id + "_actor2"

        # Whole KG described as one nested object; values may be segments or
        # extractions, which build_knowledge_graph resolves.
        kg_object_old_ontology = {
            "uri": doc.doc_id,
            "place":{
                "uri": doc.doc_id + "_place",
                "doc_id": doc.doc_id + "_place",
                "country": doc.select_segments("$.Location"),
                "type": ["Place"]
            },
            "type": [
                "Event",
                doc.extract(self.incomp_decoder, doc.select_segments("$.Incomp")[0]),
                doc.extract(self.int_decoder, doc.select_segments("$.Int")[0])
            ],
            "title": "{}/{} armed conflict in {}".format(
                doc.cdr_document["SideA"],
                doc.cdr_document["SideB"],
                doc.cdr_document["YEAR"]
            ),
            "causeex_class": [
                doc.extract(self.int_causeex_decoder, doc.select_segments("$.Int")[0]),
                self.event_prefix + "ArmedConflict"
            ],
            "event_date": doc.select_segments("$.StartDate"),
            "event_date_end": doc.select_segments("$.EpEndDate"),
            "fatalities": {
                "uri": doc.doc_id + "_fatalities",
                "title": doc.extract(self.int_fatalities_decoder, doc.select_segments("$.Int")[0]),
                "type": ["Group", "Dead People"],
                "min_size": doc.extract(self.int_fatalities_size_lower_decoder,
                                                doc.select_segments("$.Int")[0]),
                "max_size": doc.extract(self.int_fatalities_size_upper_decoder, doc.select_segments("$.Int")[0])
            },
            "actor": [actor1_doc.doc_id, actor2_doc.doc_id]
        }
        # build_knowledge_graph returns documents created for nested objects.
        ds = doc.build_knowledge_graph(kg_object_old_ontology)

        nested_docs.extend(ds)

        # Return the list of new documents that we created to be processed by ETK.
        # Documents without a matching extraction module are passed to the
        # output unchanged.
        nested_docs.append(actor1_doc)
        nested_docs.append(actor2_doc)
        return nested_docs
Esempio n. 13
0
                    if extractions:
                        path = '$."' + \
                               extractions[0].value + '"[?(@.country == "Italy")]'
                        jsonpath_expr = jex.parse(path)
                        city_match = jsonpath_expr.find(self.city_dataset)
                        if city_match:
                            # add corresponding values of city_dataset into knowledge graph of the doc
                            for field in city_match[0].value:
                                doc.kg.add_value(
                                    field, value=city_match[0].value[field])
                    new_docs.append(doc)
        return new_docs

    def document_selector(self, doc) -> bool:
        """Return True when the document belongs to the Italian-teams dataset."""
        dataset = doc.cdr_document.get("dataset")
        return dataset == "italy_team"


if __name__ == "__main__":
    # url = 'https://en.wikipedia.org/wiki/List_of_football_clubs_in_Italy'

    # Load the scraped Italian-teams CDR document and the master KG schema.
    cdr = json.load(
        open('./resources/italy_teams.json', mode='r', encoding='utf-8'))
    kg_schema = KGSchema(json.load(open('./resources/master_config.json')))
    # NOTE(review): modules is given the class itself (not an instance) —
    # presumably ETK instantiates it; confirm against the ETK constructor.
    etk = ETK(modules=ItalyTeamsModule, kg_schema=kg_schema)
    etk.parser = jex.parse
    cdr_doc = Document(etk, cdr_document=cdr, mime_type='json', url=cdr['url'])
    # process_ems returns the root document first; skip it and keep only
    # the documents the module created.
    results = etk.process_ems(cdr_doc)[1:]
    print('Total docs:', len(results))
    print("Sample result:\n")
    print(json.dumps(results[0].value, indent=2))
Esempio n. 14
0
    def process_document(self, doc: Document) -> List[Document]:
        """Build the KG for one armed-conflict record, field by field.

        Creates and returns three child documents — two actors (SideA/SideB)
        and a fatalities group — for further ETK processing.
        """
        # pyexcel produces dicts with date objects, which are not JSON serializable, fix that.
        Utility.make_json_serializable(doc.cdr_document)

        # Add an ID based on the full contents of the raw document
        doc.doc_id = Utility.create_doc_id_from_json(doc.cdr_document)

        # Map location to country
        doc.kg.add_value("country", json_path="$.Location")

        # map incomp to type, after using a decoding dict
        doc.kg.add_value("type", value=doc.extract(self.incomp_decoder, doc.select_segments("$.Incomp")[0]))

        # map Int to type, also after using a decoding dict
        doc.kg.add_value("type", value=doc.extract(self.int_decoder, doc.select_segments("$.Int")[0]))

        # Add "Event" to type, as all these documents are events
        doc.kg.add_value("type", value="Event")

        # Add a title to our event
        doc.kg.add_value("title", value="{}/{} armed conflict in {}".format(
            doc.cdr_document["SideA"],
            doc.cdr_document["SideB"],
            doc.cdr_document["YEAR"]
        ))

        # Add the specific CauseEx ontology classes that we want to use for this event
        doc.kg.add_value("causeex_class", value=doc.extract(self.int_causeex_decoder, doc.select_segments("$.Int")[0]))
        doc.kg.add_value("causeex_class", value=self.event_prefix+"ArmedConflict")

        # Map dates to event_date
        doc.kg.add_value("event_date", json_path="$.StartDate")
        doc.kg.add_value("event_date_end", json_path="$.EpEndDate")

        # Create a CDR document for an actor, we only put the SideA attribute in it,
        # and we give it a new dataset identifier so we can match it in an ETKModule
        actor1_dict = {
            "Side": doc.cdr_document["SideA"],
            "dataset": "ucdp-actor"
        }
        actor1_doc = etk.create_document(actor1_dict)

        # Create a doc_id for the actor document, from the doc_id of the event document
        actor1_doc.doc_id = doc.doc_id + "_actor1"

        # Record the identifier of the actor object in the "actor" field of the event.
        doc.kg.add_value("actor", value=actor1_doc.doc_id)

        # Now do the exact same thing for SideB
        actor2_dict = {
            "Side": doc.cdr_document["SideB"],
            "dataset": "ucdp-actor"
        }
        actor2_doc = etk.create_document(actor2_dict)
        actor2_doc.doc_id = doc.doc_id + "_actor2"
        doc.kg.add_value("actor", value=actor2_doc.doc_id)

        # Create a fatalities object to record information about the fatalities in the conflict
        # Instead of creating an ETK module for it, it is possible to do it inline:
        # its KG is filled here by decoding the "Int" attribute three ways.
        fatalities_doc = etk.create_document({"Int": doc.cdr_document["Int"]})
        fatalities_doc.doc_id = doc.doc_id + "_fatalities"
        doc.kg.add_value("fatalities", value=fatalities_doc.doc_id)
        fatalities_doc.kg.add_value(
            "title",
            fatalities_doc.extract(self.int_fatalities_decoder, fatalities_doc.select_segments("$.Int")[0]))
        fatalities_doc.kg.add_value("type", value=["Group", "Dead People"])
        fatalities_doc.kg.add_value(
            "size_lower_bound", value=fatalities_doc.extract(self.int_fatalities_size_lower_decoder,
                                                             fatalities_doc.select_segments("$.Int")[0]))
        fatalities_doc.kg.add_value(
            "size_upper_bound", value=fatalities_doc.extract(self.int_fatalities_size_upper_decoder,
                                                             fatalities_doc.select_segments("$.Int")[0]))

        # Return the list of new documents that we created to be processed by ETK.
        # Note that fatalities_doc is in the list as it is a newly created document. It does not have an
        # extraction module, so it will be passed to the output unchanged.
        return [
            actor1_doc,
            actor2_doc,
            fatalities_doc
        ]
Esempio n. 15
0
    def process_document(self, doc: Document) -> List[Document]:
        nested_docs = list()

        json_doc = doc.cdr_document
        filename = json_doc.get('file_name')
        doc.doc_id = Utility.create_doc_id_from_json(json_doc)
        doc.cdr_document['uri'] = doc.doc_id
        doc.kg.add_value("type", value="Event")
        doc.kg.add_value("type", value="Act of Terrorism")
        doc.kg.add_value("provenance_filename", value=filename)
        for attack_type_code in attack_type_fields_code:
            ac = json_doc.get(attack_type_code, '')
            if ac != "":
                doc.kg.add_value("causeex_class",
                                 value=doc.extract(
                                     self.causeex_decoder,
                                     doc.select_segments(
                                         "$.{}".format(attack_type_code))[0]))

        # Add event_date to the KG
        extracted_dates = self.date_extractor.extract('{}-{}-{}'.format(
            json_doc.get('iyear'), json_doc.get('imonth'),
            json_doc.get('iday')))
        if len(extracted_dates) > 0:
            doc.kg.add_value("event_date", value=extracted_dates)
            doc.kg.add_value("event_date_end", value=extracted_dates)
        else:
            # no proper date mentioned in the event, try the approximate date
            approximate_date_txt = json_doc.get("approxdate")
            extracted_approx_dates = self.date_extractor.extract(
                approximate_date_txt)
            if len(extracted_approx_dates) > 0:
                doc.kg.add_value("event_date", value=extracted_approx_dates)
                doc.kg.add_value("event_date_end",
                                 value=extracted_approx_dates)

        # summary, aka description only available for incident after 1997
        doc.kg.add_value("description", json_path="$.summary")

        # add inclusion criteria: why is this incident regarded as a terrorist incident
        # TODO: ADD this to master_config
        crit1 = json_doc.get('crit1', 0)
        if crit1 == 1:
            doc.kg.add_value("inclusion_criteria", value=inclusion_criteria_1)

        crit2 = json_doc.get('crit2', 0)
        if crit2 == 1:
            doc.kg.add_value("inclusion_criteria", value=inclusion_criteria_2)

        crit3 = json_doc.get('crit3', 0)
        if crit3 == 1:
            doc.kg.add_value("inclusion_criteria", value=inclusion_criteria_3)

        # add related events to KG
        # TODO: ADD this to master_config
        related_event_ids_txt = json_doc.get('related', '')
        if related_event_ids_txt.strip() != "":
            related_event_ids = related_event_ids_txt.split(',')
            if len(related_event_ids) > 0:
                doc.kg.add_value("related_events", value=related_event_ids)

        # add attack information, on second thoughts, this qualifies as event type
        for attack_type_field in attack_type_fields:
            doc.kg.add_value("type", value=json_doc.get(attack_type_field, ''))

        # TODO check the following 2
        if json_doc.get("suicide", 0) == 1:
            doc.kg.add_value("type", value='Suicide')

        if json_doc.get("success", 0) == 1:
            doc.kg.add_value("type", value='Success')

        # create nested objects for places
        place_object = dict()
        for place_field in place_fields:
            place_object[place_field] = json_doc.get(place_field)
        place_object["dataset"] = "gtd_place"

        place_doc_id = '{}_place'.format(doc.doc_id)
        place_object['uri'] = place_doc_id
        place_object['filename'] = filename
        place_doc = etk.create_document(place_object)
        place_doc.doc_id = place_doc_id

        doc.kg.add_value("place", value=place_doc.doc_id)
        nested_docs.append(place_doc)

        # create victim objects, there can be upto 3
        if json_doc.get('targtype1_txt', '').strip():
            victim1_object = dict()
            victim1_object['dataset'] = 'gtd_victim'
            victim1_object['filename'] = filename
            victim1_object['victim_type'] = list()
            victim1_object['victim_type'].append(json_doc.get('targtype1_txt'))
            if json_doc.get('targsubtype1_txt', ''):
                victim1_object['victim_type'].append(
                    json_doc.get('targsubtype1_txt'))
            victim1_object['victim_corp'] = json_doc.get('corp1', '')
            victim1_object['victim_target'] = json_doc.get('target1', '')
            victim1_object['victim_nationality'] = json_doc.get(
                'natlty1_txt', '')
            victim1_doc_id = '{}_victim1'.format(doc.doc_id)
            victim1_object['uri'] = victim1_doc_id
            victim1_doc = etk.create_document(victim1_object)
            victim1_doc.doc_id = victim1_doc_id
            doc.kg.add_value('victim', value=victim1_doc.doc_id)
            nested_docs.append(victim1_doc)

        if json_doc.get('targtype2_txt', '').strip():
            victim2_object = dict()
            victim2_object['dataset'] = 'gtd_victim'
            victim2_object['filename'] = filename
            victim2_object['victim_type'] = list()
            victim2_object['victim_type'].append(json_doc.get('targtype2_txt'))
            if json_doc.get('targsubtype2_txt', ''):
                victim2_object['victim_type'].append(
                    json_doc.get('targsubtype2_txt'))
            victim2_object['victim_corp'] = json_doc.get('corp2', '')
            victim2_object['victim_target'] = json_doc.get('target2', '')
            victim2_object['victim_nationality'] = json_doc.get(
                'natlty2_txt', '')
            victim2_doc_id = '{}_victim2'.format(doc.doc_id)
            victim2_object['uri'] = victim2_doc_id
            victim2_doc = etk.create_document(victim2_object)
            victim2_doc.doc_id = victim2_doc_id
            doc.kg.add_value('victim', value=victim2_doc.doc_id)
            nested_docs.append(victim2_doc)

        if json_doc.get('targtype3_txt', '').strip():
            victim3_object = dict()
            victim3_object['dataset'] = 'gtd_victim'
            victim3_object['filename'] = filename
            victim3_object['victim_type'] = list()
            victim3_object['victim_type'].append(json_doc.get('targtype3_txt'))
            if json_doc.get('targsubtype3_txt', ''):
                victim3_object['victim_type'].append(
                    json_doc.get('targsubtype3_txt'))
            victim3_object['victim_corp'] = json_doc.get('corp3', '')
            victim3_object['victim_target'] = json_doc.get('target3', '')
            victim3_object['victim_nationality'] = json_doc.get(
                'natlty3_txt', '')
            victim3_doc_id = '{}_victim3'.format(doc.doc_id)
            victim3_object['uri'] = victim3_doc_id
            victim3_doc = etk.create_document(victim3_object)
            victim3_doc.doc_id = victim3_doc_id
            doc.kg.add_value('victim', value=victim3_doc.doc_id)
            nested_docs.append(victim3_doc)

        # create actor/perpetrators objects
        if json_doc.get('gname', '').strip():
            actor1_object = dict()
            actor1_object['dataset'] = 'gtd_actor'
            actor1_object['filename'] = filename
            actor1_object['actor_group'] = list()
            actor1_object['actor_group'].append(json_doc.get('gname'))
            if json_doc.get('gsubname', ''):
                actor1_object['actor_group'].append(json_doc.get('gsubname'))

            actor1_doc_id = '{}_actor1'.format(doc.doc_id)
            actor1_object['uri'] = actor1_doc_id
            actor1_doc = etk.create_document(actor1_object)
            actor1_doc.doc_id = actor1_doc_id
            doc.kg.add_value('actor', value=actor1_doc.doc_id)
            nested_docs.append(actor1_doc)

        if json_doc.get('gname2', '').strip():
            actor2_object = dict()
            actor2_object['dataset'] = 'gtd_actor'
            actor2_object['filename'] = filename
            actor2_object['actor_group'] = list()
            actor2_object['actor_group'].append(json_doc.get('gname2'))
            if json_doc.get('gsubname2', ''):
                actor2_object['actor_group'].append(json_doc.get('gsubname2'))
            actor2_doc_id = '{}_actor2'.format(doc.doc_id)
            actor2_object['uri'] = actor2_doc_id
            actor2_doc = etk.create_document(actor2_object)
            actor2_doc.doc_id = actor2_doc_id
            doc.kg.add_value('actor', value=actor2_doc.doc_id)
            nested_docs.append(actor2_doc)

        if json_doc.get('gname3', '').strip():
            actor3_object = dict()
            actor3_object['dataset'] = 'gtd_actor'
            actor3_object['filename'] = filename
            actor3_object['actor_group'] = list()
            actor3_object['actor_group'].append(json_doc.get('gname3'))
            if json_doc.get('gsubname3', ''):
                actor3_object['actor_group'].append(json_doc.get('gsubname3'))
            actor3_doc_id = '{}_actor3'.format(doc.doc_id)
            actor3_object['uri'] = actor3_doc_id
            actor3_doc = etk.create_document(actor3_object)
            actor3_doc.doc_id = actor3_doc_id
            doc.kg.add_value('actor', value=actor3_doc.doc_id)
            nested_docs.append(actor3_doc)

        # create weapon objects, upto 4
        if json_doc.get('weaptype1_txt', '').strip():
            weapon1_object = dict()
            weapon1_object['dataset'] = 'gtd_weapon'
            weapon1_object['filename'] = filename
            weapon1_object['weapon_title'] = json_doc.get('weapdetail', '')
            weapon1_object['weapon_type'] = list()
            weapon1_object['weapon_type'].append(json_doc.get('weaptype1_txt'))
            if json_doc.get('weapsubtype1_txt', ''):
                weapon1_object['weapon_type'].append(
                    json_doc.get('weapsubtype1_txt'))
            if json_doc.get('weaptype1', '') != '':
                weapon1_object['weapon_code'] = json_doc.get('weaptype1')
            weapon1_doc_id = '{}_weapons1'.format(doc.doc_id)
            weapon1_object['uri'] = weapon1_doc_id
            weapon1_doc = etk.create_document(weapon1_object)
            weapon1_doc.doc_id = weapon1_doc_id
            doc.kg.add_value('weapons', weapon1_doc.doc_id)
            nested_docs.append(weapon1_doc)

        if json_doc.get('weaptype2_txt', '').strip():
            weapon2_object = dict()
            weapon2_object['dataset'] = 'gtd_weapon'
            weapon2_object['filename'] = filename
            weapon2_object['weapon_title'] = json_doc.get('weapdetail', '')
            weapon2_object['weapon_type'] = list()
            weapon2_object['weapon_type'].append(json_doc.get('weaptype2_txt'))
            if json_doc.get('weapsubtype2_txt', ''):
                weapon2_object['weapon_type'].append(
                    json_doc.get('weapsubtype2_txt'))
            if json_doc.get('weaptype2', '') != '':
                weapon2_object['weapon_code'] = json_doc.get('weaptype2')
            weapon2_doc_id = '{}_weapons2'.format(doc.doc_id)
            weapon2_object['uri'] = weapon2_doc_id
            weapon2_doc = etk.create_document(weapon2_object)
            weapon2_doc.doc_id = weapon2_doc_id
            doc.kg.add_value('weapons', weapon2_doc.doc_id)
            nested_docs.append(weapon2_doc)

        if json_doc.get('weaptype3_txt', '').strip():
            weapon3_object = dict()
            weapon3_object['dataset'] = 'gtd_weapon'
            weapon3_object['filename'] = filename
            weapon3_object['weapon_title'] = json_doc.get('weapdetail', '')
            weapon3_object['weapon_type'] = list()
            weapon3_object['weapon_type'].append(json_doc.get('weaptype3_txt'))
            if json_doc.get('weapsubtype3_txt', ''):
                weapon3_object['weapon_type'].append(
                    json_doc.get('weapsubtype3_txt'))
            if json_doc.get('weaptype3', '') != '':
                weapon3_object['weapon_code'] = json_doc.get('weaptype3')
            weapon3_doc_id = '{}_weapons3'.format(doc.doc_id)
            weapon3_object['uri'] = weapon3_doc_id
            weapon3_doc = etk.create_document(weapon3_object)
            weapon3_doc.doc_id = weapon3_doc_id
            doc.kg.add_value('weapons', weapon3_doc.doc_id)
            nested_docs.append(weapon3_doc)

        if json_doc.get('weaptype4_txt', '').strip():
            weapon4_object = dict()
            weapon4_object['dataset'] = 'gtd_weapon'
            weapon4_object['filename'] = filename
            weapon4_object['weapon_title'] = json_doc.get('weapdetail', '')
            weapon4_object['weapon_type'] = list()
            weapon4_object['weapon_type'].append(json_doc.get('weaptype4_txt'))
            if json_doc.get('weapsubtype4_txt', ''):
                weapon4_object['weapon_type'].append(
                    json_doc.get('weapsubtype4_txt'))
            if json_doc.get('weaptype4', '') != '':
                weapon4_object['weapon_code'] = json_doc.get('weaptype4')
            weapon4_doc_id = '{}_weapons4'.format(doc.doc_id)
            weapon4_object['uri'] = weapon4_doc_id
            weapon4_doc = etk.create_document(weapon4_object)
            weapon4_doc.doc_id = weapon4_doc_id
            doc.kg.add_value('weapons', weapon4_doc.doc_id)
            nested_docs.append(weapon4_doc)

        # create total fatalities docs
        nkill = json_doc.get("nkill", 0)
        if nkill != "":
            total_fatalities_object = dict()
            total_fatalities_object["dataset"] = "gtd_fatality"
            total_fatalities_object['filename'] = filename
            total_fatalities_doc_id = '{}_total_fatalitites'.format(doc.doc_id)
            total_fatalities_object['uri'] = total_fatalities_doc_id
            total_fatalities_object["size"] = nkill
            total_fatalities_doc = etk.create_document(total_fatalities_object)
            total_fatalities_doc.doc_id = total_fatalities_doc_id
            doc.kg.add_value("fatalities", value=total_fatalities_doc_id)
            nested_docs.append(total_fatalities_doc)

        # create US fatalities docs
        nkillus = json_doc.get("nkillus", 0)
        if nkillus != "":
            us_fatalities_object = dict()
            us_fatalities_object["dataset"] = "gtd_fatality"
            us_fatalities_object['filename'] = filename
            us_fatalities_doc_id = '{}_us_fatalitites'.format(doc.doc_id)
            us_fatalities_object['uri'] = us_fatalities_doc_id
            us_fatalities_object["size"] = nkillus
            us_fatalities_object["nationality"] = "United States"
            us_fatalities_doc = etk.create_document(us_fatalities_object)
            us_fatalities_doc.doc_id = us_fatalities_doc_id
            doc.kg.add_value("fatalities", value=us_fatalities_doc_id)
            nested_docs.append(us_fatalities_doc)

        # create total injuries docs
        nwound = json_doc.get("nwound", 0)
        if nwound != "":
            total_injuries_object = dict()
            total_injuries_object["dataset"] = "gtd_injury"
            total_injuries_object['filename'] = filename
            total_injuries_doc_id = '{}_total_injuries'.format(doc.doc_id)
            total_injuries_object['uri'] = total_injuries_doc_id
            total_injuries_object["size"] = nwound
            total_injuries_doc = etk.create_document(total_injuries_object)
            total_injuries_doc.doc_id = total_injuries_doc_id
            doc.kg.add_value("injuries", value=total_injuries_doc_id)
            nested_docs.append(total_injuries_doc)

        # create US injuries docs
        nwoundus = json_doc.get("nwoundus", 0)
        if nwoundus != "":
            us_injuries_object = dict()
            us_injuries_object["dataset"] = "gtd_injury"
            us_injuries_object['filename'] = filename
            us_injuries_doc_id = '{}_us_injuries'.format(doc.doc_id)
            us_injuries_object['uri'] = us_injuries_doc_id
            us_injuries_object["size"] = nwoundus
            us_injuries_doc = etk.create_document(us_injuries_object)
            us_injuries_doc.doc_id = us_injuries_doc_id
            doc.kg.add_value("injuries", value=us_injuries_doc_id)
            nested_docs.append(us_injuries_doc)

        # create damage docs
        # in this dataset we only have property damage
        if json_doc.get("property", 0) == 1:
            damage_object = dict()
            damage_object["dataset"] = "gtd_damage"
            damage_object['filename'] = filename
            damage_object["damage_title"] = json_doc.get("propextent_txt")
            damage_object["damage_value"] = json_doc.get("propvalue")
            damage_object["damage_description"] = json_doc.get("propcomment")
            damage_object_doc_id = '{}_damage'.format(doc.doc_id)
            damage_object['uri'] = damage_object_doc_id
            damage_doc = etk.create_document(damage_object)
            damage_doc.doc_id = damage_object_doc_id
            doc.kg.add_value("damage", value=damage_object_doc_id)
            nested_docs.append(damage_doc)

        return nested_docs
# Esempio n. 16 (Example no. 16)
# 0
    def process_document(self, doc: Document) -> List[Document]:
        """Map a GDELT actor record onto the document's knowledge graph.

        Adds title, type, ethnic group, religion, known-group label and
        country fields.  For each coded field both the raw code and its
        decoded human-readable form (via the corresponding decoder
        extractor) are added to the KG.

        Args:
            doc: the ETK document wrapping one GDELT actor record.

        Returns:
            An empty list: this mapper creates no nested documents.
        """
        doc.kg.add_value("title", json_path="$.ActorName")

        # Type: the literal "Actor" plus, for each of the three type-code
        # slots, the raw code and its decoded form.
        # BUGFIX: the original decoded ActorType2Code twice and never
        # decoded ActorType3Code (copy-paste error).
        doc.kg.add_value("type", value="Actor")
        for type_path in ("$.ActorType1Code",
                          "$.ActorType2Code",
                          "$.ActorType3Code"):
            doc.kg.add_value("type", json_path=type_path)
            doc.kg.add_value("type",
                             value=doc.extract(
                                 self.actor_type_decoder,
                                 doc.select_segments(type_path)[0]))

        # Ethnic group: raw code and decoded name.
        doc.kg.add_value("ethnic_group", json_path="$.ActorEthnicCode")
        doc.kg.add_value("ethnic_group",
                         value=doc.extract(
                             self.ethnic_group_decoder,
                             doc.select_segments("$.ActorEthnicCode")[0]))

        # Religion: two code slots, each raw and decoded.
        for religion_path in ("$.ActorReligion1Code",
                              "$.ActorReligion2Code"):
            doc.kg.add_value("religion", json_path=religion_path)
            doc.kg.add_value("religion",
                             value=doc.extract(
                                 self.religion_decoder,
                                 doc.select_segments(religion_path)[0]))

        # Known group: stored in the generic "label" field.
        doc.kg.add_value("label", json_path="$.ActorKnownGroupCode")
        doc.kg.add_value("label",
                         value=doc.extract(
                             self.known_group_decoder,
                             doc.select_segments("$.ActorKnownGroupCode")[0]))

        # Country: the source field refers to the affiliation; it is mapped
        # to the actor's country, losing that distinction.
        doc.kg.add_value("country", json_path="$.ActorCountryCode")
        doc.kg.add_value("country",
                         value=doc.extract(
                             self.country_decoder,
                             doc.select_segments("$.ActorCountryCode")[0]))

        # Note: the Actor Geo codes are deliberately not mapped.
        return list()