Code example #1
 def __init__(self, etk):
     ETKModule.__init__(self, etk)
     self.date_extractor = DateExtractor(self.etk, 'gtd_date_parser')
     self.causeex_decoder = DecodingValueExtractor(
         event_to_clauseex_class_mapping,
         'CauseEx Type',
         default_action="delete")
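
 # Hedged behavior sketch (assumed; the mapping entry shown is hypothetical):
 # DecodingValueExtractor looks each raw code up in the mapping and, with
 # default_action="delete", drops any code that has no entry, e.g.:
 #   mapping = {"1": "...EventHierarchy#Assassination"}
 #   mapping.get("9")  # -> None, so the extraction is dropped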
Code example #2
 def __init__(self, etk):
     ETKModule.__init__(self, etk)
     self.date_extractor = DateExtractor(self.etk, 'ifp_date_parser')
     with open('new_ifps.jl') as ifp_file:
         ifp_list = ifp_file.readlines()
     self.new_ifps = dict()
     for ifp in ifp_list:
         j = json.loads(ifp)
         self.new_ifps[j['ifp']['id']] = j['ifp']['name']
     self.parsed_ifps = dict()
     # self.ifps_entity_map = dict()
     self.threshold = 0.86
     self.nlp = spacy.load('en_core_web_lg')
     self.preprocess_ifps()
     self.ranking_criteria = 'SENTENCE'
Code example #3
    def test_date_extractor(self) -> None:
        date_extractor = DateExtractor('test_date_parser')
        text = '03/05/2018: I went to USC on Aug 20th, 2016 and will graduate on 2018, May 11. My birthday is 29-04-1994.'

        test_result = [
            x.value for x in date_extractor.extract(text, False, 30)
        ]

        expected = [
            '2018-03-05T00:00:00', '2016-08-20T00:00:00',
            '2018-05-11T00:00:00', '1994-04-29T00:00:00'
        ]

        self.assertEqual(json.dumps(test_result), json.dumps(expected))
Code example #4
 def __init__(self, etk):
     ETKModule.__init__(self, etk)
     self.mapping = GdeltMapping(json.load(open("ODP-Mappings-V3.1.json")))
     # As our input files have no header, create a translation table to go from names to indices.
     for i, field in enumerate(self.header_fields):
         self.header_translation_table[field] = "COL" + str(i)
     # Extractors
     self.date_extractor = DateExtractor(self.etk, "Date Extractor")
Code example #5
class LakeChadBasinDisplacedModule(ETKModule):
    def __init__(self, etk):
        ETKModule.__init__(self, etk)
        self.date_extractor = DateExtractor(self.etk, 'lcb_date_parser')

    def process_document(self, doc: Document) -> List[Document]:
        nested_docs = list()
        doc.doc_id = Utility.create_doc_id_from_json(doc.cdr_document)

        doc.cdr_document[
            "title"] = "{Total} Displaced from {ReportedLocation} in {Country}".format(
                Total=doc.cdr_document["Total"],
                ReportedLocation=doc.cdr_document["ReportedLocation"],
                Country=doc.cdr_document["Country"])
        doc.cdr_document["dataset"] = "lake_chad_basin_displaced"

        place = {
            "uri": '{}_place'.format(doc.doc_id),
            "doc_id": '{}_place'.format(doc.doc_id),
            "country": doc.cdr_document.get("Country", ''),
            "dataset": "lcb_place"
        }
        place_doc = self.etk.create_document(place)
        nested_docs.append(place_doc)
        doc.kg.add_value("place", value='{}_place'.format(doc.doc_id))

        # Add event_date to the KG
        extracted_dates = self.date_extractor.extract(
            doc.cdr_document.get('Period', ''))
        doc.kg.add_value("event_date", value=extracted_dates)
        doc.kg.add_value("event_date_end", value=extracted_dates)

        doc.kg.add_value("location", json_path="ReportedLocation")
        doc.kg.add_value(
            "causeex_class",
            value=
            "http://ontology.causeex.com/ontology/odps/EventHierarchy#ForcedMove"
        )
        doc.kg.add_value("type", value=["event", "Displacement Event"])
        doc.kg.add_value("title", json_path="title")

        victim = {
            "dataset": "lake_chad_basin_displaced_victim",
            "total": doc.cdr_document["Total"],
            "type": ["Group", "Displaced People"],
            "uri": '{}_victim'.format(doc.doc_id)
        }
        victim_doc = self.etk.create_document(victim)
        victim_doc.doc_id = '{}_victim'.format(doc.doc_id)

        doc.kg.add_value("victim", value=victim_doc.doc_id)
        nested_docs.append(victim_doc)

        return nested_docs

    def document_selector(self, doc) -> bool:
        return doc.cdr_document.get("dataset") == "lake_chad_basin_displaced"
Code example #6
 def __init__(self, etk):
     ETKModule.__init__(self, etk)
     self.metadata_extractor = HTMLMetadataExtractor()
     self.content_extractor = HTMLContentExtractor()
     self.date_extractor = DateExtractor(self.etk, 'demo_date_parser')
     self.country_extractor = GlossaryExtractor(
         self.etk.load_glossary("${GLOSSARY_PATH}/countries.txt"),
         "country_extractor",
         self.etk.default_tokenizer,
         case_sensitive=False,
         ngrams=3)
     self.cities_extractor = GlossaryExtractor(
         self.etk.load_glossary("${GLOSSARY_PATH}/cities.txt"),
         "cities_extractor",
         self.etk.default_tokenizer,
         case_sensitive=False,
         ngrams=3)
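
 # Hedged usage sketch (not from the original; the segment variable is an
 # assumption), following the invoke_extractor pattern in code example #11:
 #   countries = doc.invoke_extractor(self.country_extractor, segment)
 #   doc.kg.add_value("country", value=countries)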
Code example #7
    def test_ground_truth(self) -> None:
        with open('etk/unit_tests/ground_truth/date_ground_truth.txt',
                  'r') as f:
            texts = f.readlines()
        for text in texts:
            text = text.strip()
            if text and text[0] != '#':
                temp = text.split('|')
                if len(temp) == 3:
                    input_text, expected, date_format = temp
                    ignore_before = datetime.datetime(1890, 1, 1)
                    ignore_after = datetime.datetime(2500, 10, 10)
                    relative_base = datetime.datetime(2018, 1, 1)

                    e = de.extract(
                        input_text,
                        extract_first_date_only=False,
                        additional_formats=[date_format],
                        use_default_formats=False,
                        ignore_dates_before=ignore_before,
                        ignore_dates_after=ignore_after,
                        detect_relative_dates=not date_format,
                        relative_base=relative_base,
                        preferred_date_order="DMY",
                        prefer_language_date_order=True,
                        return_as_timezone_aware=False,
                        prefer_day_of_month='first',
                        prefer_dates_from='current',
                        date_value_resolution=DateResolution.SECOND
                        if date_format and len(date_format) > 1
                        and date_format[1] in ['H', 'I'] else DateResolution.DAY)
                    expected = expected.replace(
                        '@today',
                        DateExtractor.convert_to_iso_format(
                            datetime.datetime.now()))
                    if expected and expected[0] != '@':
                        self.assertEqual(e[0].value if e else '', expected)
Code example #8
class IFPRankingModule(ETKModule):
    def __init__(self, etk):
        ETKModule.__init__(self, etk)
        self.date_extractor = DateExtractor(self.etk, 'ifp_date_parser')
        with open('new_ifps.jl') as ifp_file:
            ifp_list = ifp_file.readlines()
        self.new_ifps = dict()
        for ifp in ifp_list:
            j = json.loads(ifp)
            self.new_ifps[j['ifp']['id']] = j['ifp']['name']
        self.parsed_ifps = dict()
        # self.ifps_entity_map = dict()
        self.threshold = 0.86
        self.nlp = spacy.load('en_core_web_lg')
        self.preprocess_ifps()
        self.ranking_criteria = 'SENTENCE'

    def preprocess_ifps(self):
        for id, ifp_name in self.new_ifps.items():

            # remove date information from query term
            extracted_date = self.date_extractor.extract(text=ifp_name)
            start, end = float('inf'), -1
            for i in extracted_date:
                start = min(start, i.provenance['start_char'])
                end = max(end, i.provenance['end_char'])
            # delete date from query term
            if len(extracted_date) != 0:
                parsed_ifp_name = ifp_name[:start] + ifp_name[end + 1:]
                self.parsed_ifps[id] = parsed_ifp_name
            else:
                self.parsed_ifps[id] = ifp_name

            # TODO use this code in future if news articles have to be matched after filtering out using entities in the IFP
            # self.ifps_entity_map[ifp] = list()
            # extract entities from query term
            # doc = self.nlp(ifp)
            # for ent in doc.ents:
            #     self.ifps_entity_map[ifp].append(re.escape(ent.text.strip()))
            # # remove empty entities
            # self.ifps_entity_map[ifp] = list(filter(bool, self.ifps_entity_map[ifp]))

    def process_document(self, doc: Document):
        result_docs = list()
        for id, parsed_ifp_name in self.parsed_ifps.items():
            dr_processor = DocRetrieveProcessor(
                etk=self.etk,
                ifp_id=id,
                ifp_title=parsed_ifp_name,
                orig_ifp_title=self.new_ifps[id],
                nlp=self.nlp)

            processed_doc = None
            if self.ranking_criteria == 'SENTENCE':
                processed_doc = dr_processor.process_by_sentence(
                    doc=doc, threshold=self.threshold)
            elif self.ranking_criteria == 'TITLE':
                processed_doc = dr_processor.process_by_title(
                    doc=doc, threshold=self.threshold)

            if processed_doc:
                for key, value in processed_doc.cdr_document.items():
                    processed_doc.kg.add_value(key, value)
                result_docs.append(processed_doc)
        return result_docs

    def document_selector(self, doc) -> bool:
        """
        Boolean function for selecting document
        Args:
            doc: Document

        Returns:

        """
        # match all the IFPs to this news article; record the article as relevant for every IFP whose similarity is above the threshold

        return DefaultDocumentSelector().select_document(doc)
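
# Hedged driver sketch (not from the original; the schema file name and document
# source are assumptions): the module is registered with an ETK instance and run
# over each news document, e.g.:
#   kg_schema = KGSchema(json.load(open('master_config.json')))
#   etk = ETK(kg_schema=kg_schema, modules=IFPRankingModule)
#   for ranked in etk.process_ems(etk.create_document(cdr_doc)):
#       print(ranked.cdr_document)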
Code example #9
import unittest, datetime, pytz, json
from typing import Optional
from dateutil.relativedelta import relativedelta
from etk.extractors.date_extractor import DateExtractor, DateResolution
from etk.etk import ETK
from etk.knowledge_graph import KGSchema

kg_schema = KGSchema(json.load(open('etk/unit_tests/ground_truth/test_config.json')))
de = DateExtractor(ETK(kg_schema=kg_schema), 'unit_test_date')


class TestDateExtractor(unittest.TestCase):
    # auxiliary method
    @staticmethod
    def convert_to_iso_format(date: datetime.datetime, resolution: DateResolution = DateResolution.DAY) -> Optional[str]:
        """

        Args:
            date: datetime.datetime - datetime object to convert
            resolution: resolution of the iso format date to return

        Returns: string of iso format date

        """
        # TODO: currently the resolution is specified by the caller; it could instead
        # be inferred from what was extracted, e.g. if only a year exists, use DateResolution.YEAR
        try:
            if date:
                date_str = date.isoformat()
                length = len(date_str)
                if resolution == DateResolution.YEAR and length >= 4:
                    return date_str[:4]
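                # Assumed continuation (the excerpt is truncated here): the other
                # resolutions plausibly follow the same ISO-prefix slicing pattern.
                if resolution == DateResolution.MONTH and length >= 7:
                    return date_str[:7]
                if resolution == DateResolution.DAY and length >= 10:
                    return date_str[:10]
                return date_str
        except Exception:
            return None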
Code example #10
class GTDModule(ETKModule):
    def __init__(self, etk):
        ETKModule.__init__(self, etk)
        self.date_extractor = DateExtractor(self.etk, 'gtd_date_parser')
        self.causeex_decoder = DecodingValueExtractor(
            event_to_clauseex_class_mapping,
            'CauseEx Type',
            default_action="delete")

    def process_document(self, doc: Document) -> List[Document]:
        nested_docs = list()

        json_doc = doc.cdr_document
        filename = json_doc.get('file_name')
        doc.doc_id = Utility.create_doc_id_from_json(json_doc)
        doc.cdr_document['uri'] = doc.doc_id
        doc.kg.add_value("type", value="Event")
        doc.kg.add_value("type", value="Act of Terrorism")
        doc.kg.add_value("provenance_filename", value=filename)
        for attack_type_code in attack_type_fields_code:
            ac = json_doc.get(attack_type_code, '')
            if ac != "":
                doc.kg.add_value("causeex_class",
                                 value=doc.extract(
                                     self.causeex_decoder,
                                     doc.select_segments(
                                         "$.{}".format(attack_type_code))[0]))

        # Add event_date to the KG
        extracted_dates = self.date_extractor.extract('{}-{}-{}'.format(
            json_doc.get('iyear'), json_doc.get('imonth'),
            json_doc.get('iday')))
        if len(extracted_dates) > 0:
            doc.kg.add_value("event_date", value=extracted_dates)
            doc.kg.add_value("event_date_end", value=extracted_dates)
        else:
            # no proper date mentioned in the event, try the approximate date
            approximate_date_txt = json_doc.get("approxdate", '')
            extracted_approx_dates = self.date_extractor.extract(
                approximate_date_txt)
            if len(extracted_approx_dates) > 0:
                doc.kg.add_value("event_date", value=extracted_approx_dates)
                doc.kg.add_value("event_date_end",
                                 value=extracted_approx_dates)

        # summary (aka description) is only available for incidents after 1997
        doc.kg.add_value("description", json_path="$.summary")

        # add inclusion criteria: why is this incident regarded as a terrorist incident
        # TODO: ADD this to master_config
        crit1 = json_doc.get('crit1', 0)
        if crit1 == 1:
            doc.kg.add_value("inclusion_criteria", value=inclusion_criteria_1)

        crit2 = json_doc.get('crit2', 0)
        if crit2 == 1:
            doc.kg.add_value("inclusion_criteria", value=inclusion_criteria_2)

        crit3 = json_doc.get('crit3', 0)
        if crit3 == 1:
            doc.kg.add_value("inclusion_criteria", value=inclusion_criteria_3)

        # add related events to KG
        # TODO: ADD this to master_config
        related_event_ids_txt = json_doc.get('related', '')
        if related_event_ids_txt.strip() != "":
            related_event_ids = related_event_ids_txt.split(',')
            if len(related_event_ids) > 0:
                doc.kg.add_value("related_events", value=related_event_ids)

        # add attack information; on second thought, this qualifies as event type
        for attack_type_field in attack_type_fields:
            doc.kg.add_value("type", value=json_doc.get(attack_type_field, ''))

        # TODO check the following 2
        if json_doc.get("suicide", 0) == 1:
            doc.kg.add_value("type", value='Suicide')

        if json_doc.get("success", 0) == 1:
            doc.kg.add_value("type", value='Success')

        # create nested objects for places
        place_object = dict()
        for place_field in place_fields:
            place_object[place_field] = json_doc.get(place_field)
        place_object["dataset"] = "gtd_place"

        place_doc_id = '{}_place'.format(doc.doc_id)
        place_object['uri'] = place_doc_id
        place_object['filename'] = filename
        place_doc = self.etk.create_document(place_object)
        place_doc.doc_id = place_doc_id

        doc.kg.add_value("place", value=place_doc.doc_id)
        nested_docs.append(place_doc)

        # create victim objects, there can be up to 3
        if json_doc.get('targtype1_txt', '').strip():
            victim1_object = dict()
            victim1_object['dataset'] = 'gtd_victim'
            victim1_object['filename'] = filename
            victim1_object['victim_type'] = list()
            victim1_object['victim_type'].append(json_doc.get('targtype1_txt'))
            if json_doc.get('targsubtype1_txt', ''):
                victim1_object['victim_type'].append(
                    json_doc.get('targsubtype1_txt'))
            victim1_object['victim_corp'] = json_doc.get('corp1', '')
            victim1_object['victim_target'] = json_doc.get('target1', '')
            victim1_object['victim_nationality'] = json_doc.get(
                'natlty1_txt', '')
            victim1_doc_id = '{}_victim1'.format(doc.doc_id)
            victim1_object['uri'] = victim1_doc_id
            victim1_doc = self.etk.create_document(victim1_object)
            victim1_doc.doc_id = victim1_doc_id
            doc.kg.add_value('victim', value=victim1_doc.doc_id)
            nested_docs.append(victim1_doc)

        if json_doc.get('targtype2_txt', '').strip():
            victim2_object = dict()
            victim2_object['dataset'] = 'gtd_victim'
            victim2_object['filename'] = filename
            victim2_object['victim_type'] = list()
            victim2_object['victim_type'].append(json_doc.get('targtype2_txt'))
            if json_doc.get('targsubtype2_txt', ''):
                victim2_object['victim_type'].append(
                    json_doc.get('targsubtype2_txt'))
            victim2_object['victim_corp'] = json_doc.get('corp2', '')
            victim2_object['victim_target'] = json_doc.get('target2', '')
            victim2_object['victim_nationality'] = json_doc.get(
                'natlty2_txt', '')
            victim2_doc_id = '{}_victim2'.format(doc.doc_id)
            victim2_object['uri'] = victim2_doc_id
            victim2_doc = self.etk.create_document(victim2_object)
            victim2_doc.doc_id = victim2_doc_id
            doc.kg.add_value('victim', value=victim2_doc.doc_id)
            nested_docs.append(victim2_doc)

        if json_doc.get('targtype3_txt', '').strip():
            victim3_object = dict()
            victim3_object['dataset'] = 'gtd_victim'
            victim3_object['filename'] = filename
            victim3_object['victim_type'] = list()
            victim3_object['victim_type'].append(json_doc.get('targtype3_txt'))
            if json_doc.get('targsubtype3_txt', ''):
                victim3_object['victim_type'].append(
                    json_doc.get('targsubtype3_txt'))
            victim3_object['victim_corp'] = json_doc.get('corp3', '')
            victim3_object['victim_target'] = json_doc.get('target3', '')
            victim3_object['victim_nationality'] = json_doc.get(
                'natlty3_txt', '')
            victim3_doc_id = '{}_victim3'.format(doc.doc_id)
            victim3_object['uri'] = victim3_doc_id
            victim3_doc = self.etk.create_document(victim3_object)
            victim3_doc.doc_id = victim3_doc_id
            doc.kg.add_value('victim', value=victim3_doc.doc_id)
            nested_docs.append(victim3_doc)

        # create actor/perpetrators objects
        if json_doc.get('gname', '').strip():
            actor1_object = dict()
            actor1_object['dataset'] = 'gtd_actor'
            actor1_object['filename'] = filename
            actor1_object['actor_group'] = list()
            actor1_object['actor_group'].append(json_doc.get('gname'))
            if json_doc.get('gsubname', ''):
                actor1_object['actor_group'].append(json_doc.get('gsubname'))

            actor1_doc_id = '{}_actor1'.format(doc.doc_id)
            actor1_object['uri'] = actor1_doc_id
            actor1_doc = self.etk.create_document(actor1_object)
            actor1_doc.doc_id = actor1_doc_id
            doc.kg.add_value('actor', value=actor1_doc.doc_id)
            nested_docs.append(actor1_doc)

        if json_doc.get('gname2', '').strip():
            actor2_object = dict()
            actor2_object['dataset'] = 'gtd_actor'
            actor2_object['filename'] = filename
            actor2_object['actor_group'] = list()
            actor2_object['actor_group'].append(json_doc.get('gname2'))
            if json_doc.get('gsubname2', ''):
                actor2_object['actor_group'].append(json_doc.get('gsubname2'))
            actor2_doc_id = '{}_actor2'.format(doc.doc_id)
            actor2_object['uri'] = actor2_doc_id
            actor2_doc = self.etk.create_document(actor2_object)
            actor2_doc.doc_id = actor2_doc_id
            doc.kg.add_value('actor', value=actor2_doc.doc_id)
            nested_docs.append(actor2_doc)

        if json_doc.get('gname3', '').strip():
            actor3_object = dict()
            actor3_object['dataset'] = 'gtd_actor'
            actor3_object['filename'] = filename
            actor3_object['actor_group'] = list()
            actor3_object['actor_group'].append(json_doc.get('gname3'))
            if json_doc.get('gsubname3', ''):
                actor3_object['actor_group'].append(json_doc.get('gsubname3'))
            actor3_doc_id = '{}_actor3'.format(doc.doc_id)
            actor3_object['uri'] = actor3_doc_id
            actor3_doc = self.etk.create_document(actor3_object)
            actor3_doc.doc_id = actor3_doc_id
            doc.kg.add_value('actor', value=actor3_doc.doc_id)
            nested_docs.append(actor3_doc)

        # create weapon objects, up to 4
        if json_doc.get('weaptype1_txt', '').strip():
            weapon1_object = dict()
            weapon1_object['dataset'] = 'gtd_weapon'
            weapon1_object['filename'] = filename
            weapon1_object['weapon_title'] = json_doc.get('weapdetail', '')
            weapon1_object['weapon_type'] = list()
            weapon1_object['weapon_type'].append(json_doc.get('weaptype1_txt'))
            if json_doc.get('weapsubtype1_txt', ''):
                weapon1_object['weapon_type'].append(
                    json_doc.get('weapsubtype1_txt'))
            if json_doc.get('weaptype1', '') != '':
                weapon1_object['weapon_code'] = json_doc.get('weaptype1')
            weapon1_doc_id = '{}_weapons1'.format(doc.doc_id)
            weapon1_object['uri'] = weapon1_doc_id
            weapon1_doc = self.etk.create_document(weapon1_object)
            weapon1_doc.doc_id = weapon1_doc_id
            doc.kg.add_value('weapons', weapon1_doc.doc_id)
            nested_docs.append(weapon1_doc)

        if json_doc.get('weaptype2_txt', '').strip():
            weapon2_object = dict()
            weapon2_object['dataset'] = 'gtd_weapon'
            weapon2_object['filename'] = filename
            weapon2_object['weapon_title'] = json_doc.get('weapdetail', '')
            weapon2_object['weapon_type'] = list()
            weapon2_object['weapon_type'].append(json_doc.get('weaptype2_txt'))
            if json_doc.get('weapsubtype2_txt', ''):
                weapon2_object['weapon_type'].append(
                    json_doc.get('weapsubtype2_txt'))
            if json_doc.get('weaptype2', '') != '':
                weapon2_object['weapon_code'] = json_doc.get('weaptype2')
            weapon2_doc_id = '{}_weapons2'.format(doc.doc_id)
            weapon2_object['uri'] = weapon2_doc_id
            weapon2_doc = self.etk.create_document(weapon2_object)
            weapon2_doc.doc_id = weapon2_doc_id
            doc.kg.add_value('weapons', weapon2_doc.doc_id)
            nested_docs.append(weapon2_doc)

        if json_doc.get('weaptype3_txt', '').strip():
            weapon3_object = dict()
            weapon3_object['dataset'] = 'gtd_weapon'
            weapon3_object['filename'] = filename
            weapon3_object['weapon_title'] = json_doc.get('weapdetail', '')
            weapon3_object['weapon_type'] = list()
            weapon3_object['weapon_type'].append(json_doc.get('weaptype3_txt'))
            if json_doc.get('weapsubtype3_txt', ''):
                weapon3_object['weapon_type'].append(
                    json_doc.get('weapsubtype3_txt'))
            if json_doc.get('weaptype3', '') != '':
                weapon3_object['weapon_code'] = json_doc.get('weaptype3')
            weapon3_doc_id = '{}_weapons3'.format(doc.doc_id)
            weapon3_object['uri'] = weapon3_doc_id
            weapon3_doc = self.etk.create_document(weapon3_object)
            weapon3_doc.doc_id = weapon3_doc_id
            doc.kg.add_value('weapons', weapon3_doc.doc_id)
            nested_docs.append(weapon3_doc)

        if json_doc.get('weaptype4_txt', '').strip():
            weapon4_object = dict()
            weapon4_object['dataset'] = 'gtd_weapon'
            weapon4_object['filename'] = filename
            weapon4_object['weapon_title'] = json_doc.get('weapdetail', '')
            weapon4_object['weapon_type'] = list()
            weapon4_object['weapon_type'].append(json_doc.get('weaptype4_txt'))
            if json_doc.get('weapsubtype4_txt', ''):
                weapon4_object['weapon_type'].append(
                    json_doc.get('weapsubtype4_txt'))
            if json_doc.get('weaptype4', '') != '':
                weapon4_object['weapon_code'] = json_doc.get('weaptype4')
            weapon4_doc_id = '{}_weapons4'.format(doc.doc_id)
            weapon4_object['uri'] = weapon4_doc_id
            weapon4_doc = self.etk.create_document(weapon4_object)
            weapon4_doc.doc_id = weapon4_doc_id
            doc.kg.add_value('weapons', weapon4_doc.doc_id)
            nested_docs.append(weapon4_doc)

        # create total fatalities docs
        nkill = json_doc.get("nkill", 0)
        if nkill != "":
            total_fatalities_object = dict()
            total_fatalities_object["dataset"] = "gtd_fatality"
            total_fatalities_object['filename'] = filename
            total_fatalities_doc_id = '{}_total_fatalities'.format(doc.doc_id)
            total_fatalities_object['uri'] = total_fatalities_doc_id
            total_fatalities_object["size"] = nkill
            total_fatalities_doc = self.etk.create_document(total_fatalities_object)
            total_fatalities_doc.doc_id = total_fatalities_doc_id
            doc.kg.add_value("fatalities", value=total_fatalities_doc_id)
            nested_docs.append(total_fatalities_doc)

        # create US fatalities docs
        nkillus = json_doc.get("nkillus", 0)
        if nkillus != "":
            us_fatalities_object = dict()
            us_fatalities_object["dataset"] = "gtd_fatality"
            us_fatalities_object['filename'] = filename
            us_fatalities_doc_id = '{}_us_fatalities'.format(doc.doc_id)
            us_fatalities_object['uri'] = us_fatalities_doc_id
            us_fatalities_object["size"] = nkillus
            us_fatalities_object["nationality"] = "United States"
            us_fatalities_doc = self.etk.create_document(us_fatalities_object)
            us_fatalities_doc.doc_id = us_fatalities_doc_id
            doc.kg.add_value("fatalities", value=us_fatalities_doc_id)
            nested_docs.append(us_fatalities_doc)

        # create total injuries docs
        nwound = json_doc.get("nwound", 0)
        if nwound != "":
            total_injuries_object = dict()
            total_injuries_object["dataset"] = "gtd_injury"
            total_injuries_object['filename'] = filename
            total_injuries_doc_id = '{}_total_injuries'.format(doc.doc_id)
            total_injuries_object['uri'] = total_injuries_doc_id
            total_injuries_object["size"] = nwound
            total_injuries_doc = self.etk.create_document(total_injuries_object)
            total_injuries_doc.doc_id = total_injuries_doc_id
            doc.kg.add_value("injuries", value=total_injuries_doc_id)
            nested_docs.append(total_injuries_doc)

        # create US injuries docs
        nwoundus = json_doc.get("nwoundus", 0)
        if nwoundus != "":
            us_injuries_object = dict()
            us_injuries_object["dataset"] = "gtd_injury"
            us_injuries_object['filename'] = filename
            us_injuries_doc_id = '{}_us_injuries'.format(doc.doc_id)
            us_injuries_object['uri'] = us_injuries_doc_id
            us_injuries_object["size"] = nwoundus
            us_injuries_doc = self.etk.create_document(us_injuries_object)
            us_injuries_doc.doc_id = us_injuries_doc_id
            doc.kg.add_value("injuries", value=us_injuries_doc_id)
            nested_docs.append(us_injuries_doc)

        # create damage docs
        # in this dataset we only have property damage
        if json_doc.get("property", 0) == 1:
            damage_object = dict()
            damage_object["dataset"] = "gtd_damage"
            damage_object['filename'] = filename
            damage_object["damage_title"] = json_doc.get("propextent_txt")
            damage_object["damage_value"] = json_doc.get("propvalue")
            damage_object["damage_description"] = json_doc.get("propcomment")
            damage_object_doc_id = '{}_damage'.format(doc.doc_id)
            damage_object['uri'] = damage_object_doc_id
            damage_doc = self.etk.create_document(damage_object)
            damage_doc.doc_id = damage_object_doc_id
            doc.kg.add_value("damage", value=damage_object_doc_id)
            nested_docs.append(damage_doc)

        return nested_docs

    def document_selector(self, doc) -> bool:
        return doc.cdr_document.get("dataset") == "gtd"
Code example #11
            "description": "03/05/2018: I went to USC on Aug 20th, 2016 and will graduate on 2018, May 11. My birthday is 29-04-1994."
        }
    ]
}

etk = ETK()
doc = etk.create_document(sample_input)

# example for glossary extractor:
name_extractor = GlossaryExtractor(etk.load_glossary("./names.txt"), "name_extractor", etk.default_tokenizer, case_sensitive=False, ngrams=1)

descriptions = doc.select_segments("projects[*].description")
projects = doc.select_segments("projects[*]")

for d, p in zip(descriptions, projects):
    print ("Iam d path: " + d.full_path)
    names = doc.invoke_extractor(name_extractor, d)
    p.store_extractions(names, "members")

# example for date extractor:
date_extractor = DateExtractor('test_date_parser')
member_descriptions = doc.select_segments("members[*].description")
members = doc.select_segments("members[*]")

for m_d, m in zip(member_descriptions, members):
    dates = doc.invoke_extractor(date_extractor, m_d, ignore_future_dates=False, ignore_past_years=40)
    m.store_extractions(dates, "related_dates")


print(json.dumps(sample_input, indent=2))
Code example #12
from typing import List, Dict
from dateutil import parser
from datetime import timedelta
import datetime
from etk.extractors.date_extractor import DateExtractor, DateResolution
import sys

de = DateExtractor(extractor_name='date_extractor')
"""
Detect simple granularity types in a sequence of dates.
"""


class GranularityDetector:
    granularity = {
        "second": timedelta(seconds=1),
        "minute": timedelta(minutes=1),
        "hour": timedelta(hours=1),
        "day": timedelta(hours=24),
        "week": timedelta(days=7),
        "month": timedelta(days=30),
        "quarter": timedelta(days=120),
        "year": timedelta(days=365)
    }

    @staticmethod
    def get_parsed_date(my_date):
        if isinstance(my_date, datetime.datetime):
            return my_date
        elif isinstance(my_date, datetime.date):
            return datetime.datetime(my_date.year, my_date.month, my_date.day)
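        # Assumed continuation (the excerpt is truncated here): anything else,
        # e.g. a raw string, plausibly falls back to dateutil's parser (imported above).
        else:
            return parser.parse(str(my_date))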
Code example #13
 def __init__(self, etk):
     ETKModule.__init__(self, etk)
     self.date_extractor = DateExtractor(self.etk, 'acled_date_parser')
     self.country_extractor = GlossaryExtractor(self.etk.load_glossary(
         "${GLOSSARY_PATH}/countries.json.gz", read_json=True),
                                                "country_extractor",
                                                self.etk.default_tokenizer,
                                                case_sensitive=False,
                                                ngrams=3)
     self.states_extractor = GlossaryExtractor(self.etk.load_glossary(
         "${GLOSSARY_PATH}/states_usa_canada.json.gz", read_json=True),
                                               "states_extractor",
                                               self.etk.default_tokenizer,
                                               case_sensitive=False,
                                               ngrams=3)
     self.cities_extractor = GlossaryExtractor(self.etk.load_glossary(
         "${GLOSSARY_PATH}/cities.json.gz", read_json=True),
                                               "cities_extractor",
                                               self.etk.default_tokenizer,
                                               case_sensitive=False,
                                               ngrams=3)
     self.csv_processor = CsvProcessor(etk=etk, heading_row=1)
     self.interaction_decoding_dict = {
         "10": "Sole Military Action",
         "11": "Military Versus Military",
         "12": "Military Versus Rebels",
         "13": "Military Versus Political Militia",
         "14": "Military Versus Communal Militia",
         "15": "Military Versus Rioters",
         "16": "Military Versus Protesters",
         "17": "Military Versus Civilians",
         "18": "Military Versus Other",
         "20": "Sole Rebel Action",
         "22": "Rebels Versus Rebels",
         "23": "Rebels Versus Political Militia",
         "24": "Rebels Versus Communal Militia",
         "25": "Rebels Versus Rioters",
         "26": "Rebels Versus Protesters",
         "27": "Rebels Versus Civilians",
         "28": "Rebels Versus Other",
         "30": "Sole Political Militia Action",
         "33": "Political Militia Versus Political Militia",
         "34": "Political Militia Versus Communal Militia",
         "35": "Political Militia Versus Rioters",
         "36": "Political Militia Versus Protesters",
         "37": "Political Militia Versus Civilians",
         "38": "Political Militia Versus Other",
         "40": "Sole Communal Militia Action",
         "44": "Communal Militia Versus Communal Militia",
         "45": "Communal Militia Versus Rioters",
         "46": "Communal Militia Versus Protesters",
         "47": "Communal Militia Versus Civilians",
         "48": "Communal Militia Versus Other",
         "50": "Sole Rioter Action",
         "55": "Rioters Versus Rioters",
         "56": "Rioters Versus Protesters",
         "57": "Rioters Versus Civilians",
         "58": "Rioters Versus Other",
         "60": "Sole Protester Action",
         "66": "Protesters Versus Protesters",
         "68": "Protesters Versus Other",
         "78": "Other Actor Versus Civilians",
         "80": "Sole Other Action"
     }
     self.interaction_decoder = DecodingValueExtractor(
         self.interaction_decoding_dict,
         'default_decoding',
         case_sensitive=True)
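
 # Hedged usage sketch (not from the original; the "$.interaction" path is an
 # assumption): the decoder would be applied the way code example #10 applies
 # its causeex_decoder, e.g.:
 #   decoded = doc.extract(self.interaction_decoder,
 #                         doc.select_segments("$.interaction")[0])
 #   doc.kg.add_value("type", value=decoded)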
Code example #14
File: date_extractor.py Project: xkgoodbest/etk
import warnings
import sys
import argparse

from etk.extractors.date_extractor import DateExtractor

date_extractor = DateExtractor()


def add_arguments(parser):
    """
    Parse arguments
    Args:
        parser (argparse.ArgumentParser)
    """
    parser.description = 'Examples:\n' \
                         'python -m etk date_extractor /tmp/date.txt\n' \
                         'cat /tmp/date.txt | python -m etk date_extractor'
    parser.add_argument('input_file',
                        nargs='?',
                        type=argparse.FileType('r'),
                        default=sys.stdin)


def run(args):
    """
    Args:
        args (argparse.Namespace)
    """
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
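        # Assumed continuation (the excerpt is truncated here): extract dates
        # from each input line and print the ISO values, as in the other examples.
        for line in args.input_file:
            for extraction in date_extractor.extract(line.strip()):
                print(extraction.value)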
Code example #15
File: ranking_pipeline.py Project: xkgoodbest/etk
def main():
    filename = sys.argv[1]
    query_title = sys.argv[2]
    ranking_criteria = sys.argv[3]
    top_k = int(sys.argv[4])

    if ranking_criteria not in ('TITLE', 'SENTENCE'):
        print('Invalid ranking criteria: expected TITLE or SENTENCE.')
        return

    master_config = {
        "fields": {
            "developer": {
                "type": "string"
            },
            "student_developer": {
                "type": "string"
            },
            "spacy_name": {
                "type": "string"
            },
            "date": {
                "type": "date"
            }
        }
    }
    kg_schema = KGSchema(master_config)
    etk = ETK(kg_schema, ["./extraction_modules/"])
    nlp = spacy.load('en_core_web_lg')

    date_extractor = DateExtractor(etk=etk)

    queries = dict()
    queries_ent_map = dict()

    with open(query_title) as f:
        for line in f:
            orig_ifp_title = line.strip()
            # remove date information from query term
            res = date_extractor.extract(text=line)
            start, end = float('inf'), -1
            for i in res:
                start = min(start, i.provenance['start_char'])
                end = max(end, i.provenance['end_char'])
            # delete date from query term
            if len(res) != 0:
                line = line[:start] + line[end+1:]

            queries[orig_ifp_title] = line
            queries_ent_map[line] = list()
            # extract entities from query term
            doc = nlp(line)
            for ent in doc.ents:
                queries_ent_map[line].append(re.escape(ent.text.strip()))
            # remove empty entities
            queries_ent_map[line] = list(filter(bool, queries_ent_map[line]))

    # the list of selected docs for given query term
    query_docs_mapping = dict()

    docs = list()
    with open(filename) as f:
        for line in f:
            json_obj = json.loads(line)
            docs.append(etk.create_document(json_obj))

    ds = DefaultDocumentSelector()

    for orig_query, proc_query in queries.items():
        content_regex = queries_ent_map[proc_query]
        query_docs_mapping[proc_query] = list()
        for doc in docs:
            if len(content_regex) == 0 \
                    or ds.select_document(document=doc,
                              json_paths=['$.lexisnexis.doc_description'],
                              json_paths_regex=content_regex):
                query_docs_mapping[proc_query].append(doc)

    # TODO: pass ifp_id in
    for orig_query, proc_query in queries.items():
        # print(len(query_docs_mapping[proc_query]))
        dr_processor = DocRetrieveProcessor(etk=etk, ifp_id="1233", ifp_title=proc_query, orig_ifp_title=orig_query)
        heap = list()
        for doc in query_docs_mapping[proc_query]:
            processed_doc = dict()

            if ranking_criteria == 'SENTENCE':
                processed_doc = dr_processor.process_by_sentence(doc=doc, threshold=0).cdr_document
            elif ranking_criteria == 'TITLE':
                processed_doc = dr_processor.process_by_title(doc=doc, threshold=0).cdr_document

            if len(heap) < top_k:
                heappush(heap, (processed_doc['similarity'], processed_doc['date'], processed_doc))
            elif processed_doc['similarity'] > heap[0][0]:
                # keep the heap at top_k items: heapreplace (from heapq, like
                # heappush) pops the current minimum before pushing the new item
                heapreplace(heap, (processed_doc['similarity'], processed_doc['date'], processed_doc))

        heap.sort(reverse=True)

        output_filename = './resources/output/' + orig_query + "_result.jl"

        with open(output_filename, 'a+b') as f:
            for item in heap:
                print(item[0])
                jl_str = json.dumps(item[2]) + '\n'
                f.write(jl_str.encode())
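
# Hypothetical invocation (file names are placeholders, not from the original):
#   python ranking_pipeline.py news_docs.jl ifp_titles.txt SENTENCE 20
# i.e. a .jl corpus, a file of IFP titles, the ranking criteria (TITLE or
# SENTENCE), and the number of top-ranked documents to keep.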
Code example #16
File: utils.py Project: JiFeRe/datamart
                      for row in vectors for cell in row if len(cell))
    for f in cat_vector:
        values = list(
            set(cell[f] for row in vectors for cell in row if len(cell)))
        for r, row in enumerate(vectors):
            for c, cell in enumerate(row):
                if len(cell) == 0: continue
                for v in values:
                    vectors[r][c][f'{f}-{v}'] = 1 if v == cell[f] else 0
                del vectors[r][c][f]
    return vectors


# --- parsing -----------------------------------------------------------------

_find_dates_extractor = DateExtractor()


def find_dates(text):
    try:
        return parse_date(text, fuzzy_with_tokens=True)[0]
    except Exception:
        pass
    try:
        res = _find_dates_extractor.extract(text,
                                            prefer_language_date_order=False)
        if len(res): return res[0].value
    except Exception:
        log('info', f'ETK DateExtractor raised an error on value {text}.')
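
# Usage sketch (hypothetical input, not from the original): find_dates returns a
# datetime from the dateutil branch, or an ISO string from the ETK fallback.
#   find_dates('The accord was signed on 5 March 2018.')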

Code example #17
File: em_acled.py Project: xkgoodbest/etk
 def __init__(self, etk):
     ETKModule.__init__(self, etk)
     self.date_extractor = DateExtractor(self.etk, 'acled_date_parser')