def test_date_extractor(self) -> None:
    date_extractor = DateExtractor('test_date_parser')
    text = '03/05/2018: I went to USC on Aug 20th, 2016 and will graduate on 2018, May 11. My birthday is 29-04-1994.'
    test_result = [x.value for x in date_extractor.extract(text, False, 30)]
    expected = [
        '2018-03-05T00:00:00',
        '2016-08-20T00:00:00',
        '2018-05-11T00:00:00',
        '1994-04-29T00:00:00'
    ]
    self.assertEqual(json.dumps(test_result), json.dumps(expected))
def __init__(self, etk):
    ETKModule.__init__(self, etk)
    self.mapping = GdeltMapping(json.load(open("ODP-Mappings-V3.1.json")))

    # As our input files have no header, create a translation table to go
    # from names to indices.
    for i in range(0, len(self.header_fields)):
        self.header_translation_table[self.header_fields[i]] = "COL" + str(i)

    # Extractors
    self.date_extractor = DateExtractor(self.etk, "Date Extractor")
class LakeChadBasinDisplacedModule(ETKModule):
    def __init__(self, etk):
        ETKModule.__init__(self, etk)
        self.date_extractor = DateExtractor(self.etk, 'lcb_date_parser')

    def process_document(self, doc: Document) -> List[Document]:
        nested_docs = list()
        doc.doc_id = Utility.create_doc_id_from_json(doc.cdr_document)
        doc.cdr_document["title"] = \
            "{Total} Displaced from {ReportedLocation} in {Country}".format(
                Total=doc.cdr_document["Total"],
                ReportedLocation=doc.cdr_document["ReportedLocation"],
                Country=doc.cdr_document["Country"])
        doc.cdr_document["dataset"] = "lake_chad_basin_displaced"

        place = {
            "uri": '{}_place'.format(doc.doc_id),
            "doc_id": '{}_place'.format(doc.doc_id),
            "country": doc.cdr_document.get("Country", ''),
            "dataset": "lcb_place"
        }
        place_doc = etk.create_document(place)
        nested_docs.append(place_doc)
        doc.kg.add_value("place", value='{}_place'.format(doc.doc_id))

        # Add event_date to the KG
        extracted_dates = self.date_extractor.extract(
            doc.cdr_document.get('Period', ''))
        doc.kg.add_value("event_date", value=extracted_dates)
        doc.kg.add_value("event_date_end", value=extracted_dates)

        doc.kg.add_value("location", json_path="ReportedLocation")
        doc.kg.add_value(
            "causeex_class",
            value="http://ontology.causeex.com/ontology/odps/EventHierarchy#ForcedMove")
        doc.kg.add_value("type", value=["event", "Displacement Event"])
        doc.kg.add_value("title", json_path="title")

        victim = {
            "dataset": "lake_chad_basin_displaced_victim",
            "total": doc.cdr_document["Total"],
            "type": ["Group", "Displaced People"],
            "uri": '{}_victim'.format(doc.doc_id)
        }
        victim_doc = etk.create_document(victim)
        victim_doc.doc_id = '{}_victim'.format(doc.doc_id)
        doc.kg.add_value("victim", value=victim_doc.doc_id)
        nested_docs.append(victim_doc)

        return nested_docs

    def document_selector(self, doc) -> bool:
        return doc.cdr_document.get("dataset") == "lake_chad_basin_displaced"
def __init__(self, etk):
    ETKModule.__init__(self, etk)
    self.metadata_extractor = HTMLMetadataExtractor()
    self.content_extractor = HTMLContentExtractor()
    self.date_extractor = DateExtractor(self.etk, 'demo_date_parser')
    self.country_extractor = GlossaryExtractor(
        self.etk.load_glossary("${GLOSSARY_PATH}/countries.txt"),
        "country_extractor",
        self.etk.default_tokenizer,
        case_sensitive=False,
        ngrams=3)
    self.cities_extractor = GlossaryExtractor(
        self.etk.load_glossary("${GLOSSARY_PATH}/cities.txt"),
        "cities_extractor",
        self.etk.default_tokenizer,
        case_sensitive=False,
        ngrams=3)
def test_ground_truth(self) -> None:
    with open('etk/unit_tests/ground_truth/date_ground_truth.txt', 'r') as f:
        texts = f.readlines()
    for text in texts:
        text = text.strip()
        if text and text[0] != '#':
            temp = text.split('|')
            if len(temp) == 3:
                input_text, expected, format = temp
                ignore_before = datetime.datetime(1890, 1, 1)
                ignore_after = datetime.datetime(2500, 10, 10)
                relative_base = datetime.datetime(2018, 1, 1)
                e = de.extract(
                    input_text,
                    extract_first_date_only=False,
                    additional_formats=[format],
                    use_default_formats=False,
                    ignore_dates_before=ignore_before,
                    ignore_dates_after=ignore_after,
                    detect_relative_dates=not format,
                    relative_base=relative_base,
                    preferred_date_order="DMY",
                    prefer_language_date_order=True,
                    return_as_timezone_aware=False,
                    prefer_day_of_month='first',
                    prefer_dates_from='current',
                    date_value_resolution=DateResolution.SECOND
                    if format and len(format) > 1 and format[1] in ['H', 'I']
                    else DateResolution.DAY)
                expected = expected.replace(
                    '@today',
                    DateExtractor.convert_to_iso_format(datetime.datetime.now()))
                if expected and expected[0] != '@':
                    self.assertEqual(e[0].value if e else '', expected)
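# A minimal standalone sketch of the extract() options exercised above, using
# the same schema config as the unit tests; the sample sentence and extractor
# name are illustrative only.
import datetime
import json

from etk.etk import ETK
from etk.extractors.date_extractor import DateExtractor
from etk.knowledge_graph import KGSchema

kg_schema = KGSchema(json.load(open('etk/unit_tests/ground_truth/test_config.json')))
de = DateExtractor(ETK(kg_schema=kg_schema), 'example_date_parser')
extractions = de.extract(
    'Report filed 03/05/2018; follow-up next Tuesday.',
    detect_relative_dates=True,                   # resolve phrases like "next Tuesday"
    relative_base=datetime.datetime(2018, 1, 1),  # anchor for relative dates
    preferred_date_order='MDY',                   # read 03/05/2018 as March 5
    return_as_timezone_aware=False)
print([e.value for e in extractions])             # ISO-format date strings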
class IFPRankingModule(ETKModule):
    def __init__(self, etk):
        ETKModule.__init__(self, etk)
        self.date_extractor = DateExtractor(self.etk, 'ifp_date_parser')
        ifp_list = open('new_ifps.jl').readlines()
        self.new_ifps = dict()
        for ifp in ifp_list:
            j = json.loads(ifp)
            self.new_ifps[j['ifp']['id']] = j['ifp']['name']
        self.parsed_ifps = dict()
        # self.ifps_entity_map = dict()
        self.threshold = 0.86
        self.nlp = spacy.load('en_core_web_lg')
        self.preprocess_ifps()
        self.ranking_criteria = 'SENTENCE'

    def preprocess_ifps(self):
        for id, ifp_name in self.new_ifps.items():
            # remove date information from the query term
            extracted_date = self.date_extractor.extract(text=ifp_name)
            start, end = float('inf'), -1
            for i in extracted_date:
                start = min(start, i.provenance['start_char'])
                end = max(end, i.provenance['end_char'])
            # delete the date from the query term
            if len(extracted_date) != 0:
                parsed_ifp_name = ifp_name[:start] + ifp_name[end + 1:]
                self.parsed_ifps[id] = parsed_ifp_name
            else:
                self.parsed_ifps[id] = ifp_name
            # TODO: use this code in the future if news articles have to be
            # matched after filtering on entities extracted from the IFP
            # self.ifps_entity_map[ifp] = list()
            # # extract entities from the query term
            # doc = self.nlp(ifp)
            # for ent in doc.ents:
            #     self.ifps_entity_map[ifp].append(re.escape(ent.text.strip()))
            # # remove empty entities
            # self.ifps_entity_map[ifp] = list(filter(bool, self.ifps_entity_map[ifp]))

    def process_document(self, doc: Document):
        result_docs = list()
        for id, parsed_ifp_name in self.parsed_ifps.items():
            dr_processor = DocRetrieveProcessor(
                etk=self.etk,
                ifp_id=id,
                ifp_title=parsed_ifp_name,
                orig_ifp_title=self.new_ifps[id],
                nlp=self.nlp)
            processed_doc = None
            if self.ranking_criteria == 'SENTENCE':
                processed_doc = dr_processor.process_by_sentence(
                    doc=doc, threshold=self.threshold)
            elif self.ranking_criteria == 'TITLE':
                processed_doc = dr_processor.process_by_title(
                    doc=doc, threshold=self.threshold)
            if processed_doc:
                for key in processed_doc.cdr_document.keys():
                    processed_doc.kg.add_value(key, processed_doc.cdr_document[key])
                result_docs.append(processed_doc)
        return result_docs

    def document_selector(self, doc) -> bool:
        """
        Boolean function for selecting a document.

        Args:
            doc: Document

        Returns: True if this module should process the document.
        """
        # Match all the IFPs against this news article; the article is recorded
        # as relevant for every IFP whose similarity is above the threshold.
        return DefaultDocumentSelector().select_document(doc)
import unittest, datetime, pytz, json

from dateutil.relativedelta import relativedelta

from etk.extractors.date_extractor import DateExtractor, DateResolution
from etk.etk import ETK
from etk.knowledge_graph import KGSchema

kg_schema = KGSchema(json.load(open('etk/unit_tests/ground_truth/test_config.json')))
de = DateExtractor(ETK(kg_schema=kg_schema), 'unit_test_date')


class TestDateExtractor(unittest.TestCase):

    # auxiliary method
    @staticmethod
    def convert_to_iso_format(date: datetime.datetime,
                              resolution: DateResolution = DateResolution.DAY) -> str or None:
        """
        Args:
            date: datetime.datetime - datetime object to convert
            resolution: resolution of the iso format date to return

        Returns: string of iso format date
        """
        # TODO: currently the resolution is specified by the user; should it be
        # decided by what we have extracted? E.g.: if only a year exists, use
        # DateResolution.YEAR as the resolution.
        try:
            if date:
                date_str = date.isoformat()
                length = len(date_str)
                if resolution == DateResolution.YEAR and length >= 4:
                    return date_str[:4]
                # the remaining resolutions slice progressively longer ISO prefixes
                elif resolution == DateResolution.MONTH and length >= 7:
                    return date_str[:7]
                elif resolution == DateResolution.DAY and length >= 10:
                    return date_str[:10]
                elif resolution == DateResolution.HOUR and length >= 13:
                    return date_str[:13]
                elif resolution == DateResolution.MINUTE and length >= 16:
                    return date_str[:16]
                elif resolution == DateResolution.SECOND and length >= 19:
                    return date_str[:19]
                return date_str
            return None
        except Exception:
            return None
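# Usage sketch for the helper above: the same datetime rendered at the default
# DAY resolution and at YEAR resolution.
d = datetime.datetime(2018, 5, 11)
print(TestDateExtractor.convert_to_iso_format(d))                       # '2018-05-11'
print(TestDateExtractor.convert_to_iso_format(d, DateResolution.YEAR))  # '2018'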
class GTDModule(ETKModule):
    def __init__(self, etk):
        ETKModule.__init__(self, etk)
        self.date_extractor = DateExtractor(self.etk, 'gtd_date_parser')
        self.causeex_decoder = DecodingValueExtractor(
            event_to_clauseex_class_mapping,
            'CauseEx Type',
            default_action="delete")

    def process_document(self, doc: Document) -> List[Document]:
        nested_docs = list()
        json_doc = doc.cdr_document
        filename = json_doc.get('file_name')
        doc.doc_id = Utility.create_doc_id_from_json(json_doc)
        doc.cdr_document['uri'] = doc.doc_id
        doc.kg.add_value("type", value="Event")
        doc.kg.add_value("type", value="Act of Terrorism")
        doc.kg.add_value("provenance_filename", value=filename)

        for attack_type_code in attack_type_fields_code:
            ac = json_doc.get(attack_type_code, '')
            if ac != "":
                doc.kg.add_value(
                    "causeex_class",
                    value=doc.extract(
                        self.causeex_decoder,
                        doc.select_segments("$.{}".format(attack_type_code))[0]))

        # Add event_date to the KG
        extracted_dates = self.date_extractor.extract('{}-{}-{}'.format(
            json_doc.get('iyear'), json_doc.get('imonth'), json_doc.get('iday')))
        if len(extracted_dates) > 0:
            doc.kg.add_value("event_date", value=extracted_dates)
            doc.kg.add_value("event_date_end", value=extracted_dates)
        else:
            # no proper date mentioned in the event, try the approximate date
            approximate_date_txt = json_doc.get("approxdate")
            extracted_approx_dates = self.date_extractor.extract(approximate_date_txt)
            if len(extracted_approx_dates) > 0:
                doc.kg.add_value("event_date", value=extracted_approx_dates)
                doc.kg.add_value("event_date_end", value=extracted_approx_dates)

        # summary, aka description, is only available for incidents after 1997
        doc.kg.add_value("description", json_path="$.summary")

        # add inclusion criteria: why this incident is regarded as a terrorist incident
        # TODO: ADD this to master_config
        crit1 = json_doc.get('crit1', 0)
        if crit1 == 1:
            doc.kg.add_value("inclusion_criteria", value=inclusion_criteria_1)
        crit2 = json_doc.get('crit2', 0)
        if crit2 == 1:
            doc.kg.add_value("inclusion_criteria", value=inclusion_criteria_2)
        crit3 = json_doc.get('crit3', 0)
        if crit3 == 1:
            doc.kg.add_value("inclusion_criteria", value=inclusion_criteria_3)

        # add related events to the KG
        # TODO: ADD this to master_config
        related_event_ids_txt = json_doc.get('related', '')
        if related_event_ids_txt.strip() != "":
            related_event_ids = related_event_ids_txt.split(',')
            if len(related_event_ids) > 0:
                doc.kg.add_value("related_events", value=related_event_ids)

        # add attack information; on second thought, this qualifies as event type
        for attack_type_field in attack_type_fields:
            doc.kg.add_value("type", value=json_doc.get(attack_type_field, ''))

        # TODO: check the following 2
        if json_doc.get("suicide", 0) == 1:
            doc.kg.add_value("type", value='Suicide')
        if json_doc.get("success", 0) == 1:
            doc.kg.add_value("type", value='Success')

        # create nested objects for places
        place_object = dict()
        for place_field in place_fields:
            place_object[place_field] = json_doc.get(place_field)
        place_object["dataset"] = "gtd_place"
        place_doc_id = '{}_place'.format(doc.doc_id)
        place_object['uri'] = place_doc_id
        place_object['filename'] = filename
        place_doc = etk.create_document(place_object)
        place_doc.doc_id = place_doc_id
        doc.kg.add_value("place", value=place_doc.doc_id)
        nested_docs.append(place_doc)

        # create victim objects; there can be up to 3
        if json_doc.get('targtype1_txt', '').strip():
            victim1_object = dict()
            victim1_object['dataset'] = 'gtd_victim'
            victim1_object['filename'] = filename
            victim1_object['victim_type'] = list()
            victim1_object['victim_type'].append(json_doc.get('targtype1_txt'))
            if json_doc.get('targsubtype1_txt', ''):
                victim1_object['victim_type'].append(json_doc.get('targsubtype1_txt'))
            victim1_object['victim_corp'] = json_doc.get('corp1', '')
            victim1_object['victim_target'] = json_doc.get('target1', '')
            victim1_object['victim_nationality'] = json_doc.get('natlty1_txt', '')
            victim1_doc_id = '{}_victim1'.format(doc.doc_id)
            victim1_object['uri'] = victim1_doc_id
            victim1_doc = etk.create_document(victim1_object)
            victim1_doc.doc_id = victim1_doc_id
            doc.kg.add_value('victim', value=victim1_doc.doc_id)
            nested_docs.append(victim1_doc)

        if json_doc.get('targtype2_txt', '').strip():
            victim2_object = dict()
            victim2_object['dataset'] = 'gtd_victim'
            victim2_object['filename'] = filename
            victim2_object['victim_type'] = list()
            victim2_object['victim_type'].append(json_doc.get('targtype2_txt'))
            if json_doc.get('targsubtype2_txt', ''):
                victim2_object['victim_type'].append(json_doc.get('targsubtype2_txt'))
            victim2_object['victim_corp'] = json_doc.get('corp2', '')
            victim2_object['victim_target'] = json_doc.get('target2', '')
            victim2_object['victim_nationality'] = json_doc.get('natlty2_txt', '')
            victim2_doc_id = '{}_victim2'.format(doc.doc_id)
            victim2_object['uri'] = victim2_doc_id
            victim2_doc = etk.create_document(victim2_object)
            victim2_doc.doc_id = victim2_doc_id
            doc.kg.add_value('victim', value=victim2_doc.doc_id)
            nested_docs.append(victim2_doc)

        if json_doc.get('targtype3_txt', '').strip():
            victim3_object = dict()
            victim3_object['dataset'] = 'gtd_victim'
            victim3_object['filename'] = filename
            victim3_object['victim_type'] = list()
            victim3_object['victim_type'].append(json_doc.get('targtype3_txt'))
            if json_doc.get('targsubtype3_txt', ''):
                victim3_object['victim_type'].append(json_doc.get('targsubtype3_txt'))
            victim3_object['victim_corp'] = json_doc.get('corp3', '')
            victim3_object['victim_target'] = json_doc.get('target3', '')
            victim3_object['victim_nationality'] = json_doc.get('natlty3_txt', '')
            victim3_doc_id = '{}_victim3'.format(doc.doc_id)
            victim3_object['uri'] = victim3_doc_id
            victim3_doc = etk.create_document(victim3_object)
            victim3_doc.doc_id = victim3_doc_id
            doc.kg.add_value('victim', value=victim3_doc.doc_id)
            nested_docs.append(victim3_doc)

        # create actor/perpetrator objects
        if json_doc.get('gname', '').strip():
            actor1_object = dict()
            actor1_object['dataset'] = 'gtd_actor'
            actor1_object['filename'] = filename
            actor1_object['actor_group'] = list()
            actor1_object['actor_group'].append(json_doc.get('gname'))
            if json_doc.get('gsubname', ''):
                actor1_object['actor_group'].append(json_doc.get('gsubname'))
            actor1_doc_id = '{}_actor1'.format(doc.doc_id)
            actor1_object['uri'] = actor1_doc_id
            actor1_doc = etk.create_document(actor1_object)
            actor1_doc.doc_id = actor1_doc_id
            doc.kg.add_value('actor', value=actor1_doc.doc_id)
            nested_docs.append(actor1_doc)

        if json_doc.get('gname2', '').strip():
            actor2_object = dict()
            actor2_object['dataset'] = 'gtd_actor'
            actor2_object['filename'] = filename
            actor2_object['actor_group'] = list()
            actor2_object['actor_group'].append(json_doc.get('gname2'))
            if json_doc.get('gsubname2', ''):
                actor2_object['actor_group'].append(json_doc.get('gsubname2'))
            actor2_doc_id = '{}_actor2'.format(doc.doc_id)
            actor2_object['uri'] = actor2_doc_id
            actor2_doc = etk.create_document(actor2_object)
            actor2_doc.doc_id = actor2_doc_id
            doc.kg.add_value('actor', value=actor2_doc.doc_id)
            nested_docs.append(actor2_doc)

        if json_doc.get('gname3', '').strip():
            actor3_object = dict()
            actor3_object['dataset'] = 'gtd_actor'
            actor3_object['filename'] = filename
            actor3_object['actor_group'] = list()
            actor3_object['actor_group'].append(json_doc.get('gname3'))
            if json_doc.get('gsubname3', ''):
                actor3_object['actor_group'].append(json_doc.get('gsubname3'))
            actor3_doc_id = '{}_actor3'.format(doc.doc_id)
            actor3_object['uri'] = actor3_doc_id
            actor3_doc = etk.create_document(actor3_object)
            actor3_doc.doc_id = actor3_doc_id
            doc.kg.add_value('actor', value=actor3_doc.doc_id)
            nested_docs.append(actor3_doc)

        # create weapon objects, up to 4
        if json_doc.get('weaptype1_txt', '').strip():
            weapon1_object = dict()
            weapon1_object['dataset'] = 'gtd_weapon'
            weapon1_object['filename'] = filename
            weapon1_object['weapon_title'] = json_doc.get('weapdetail', '')
            weapon1_object['weapon_type'] = list()
            weapon1_object['weapon_type'].append(json_doc.get('weaptype1_txt'))
            if json_doc.get('weapsubtype1_txt', ''):
                weapon1_object['weapon_type'].append(json_doc.get('weapsubtype1_txt'))
            if json_doc.get('weaptype1', '') != '':
                weapon1_object['weapon_code'] = json_doc.get('weaptype1')
            weapon1_doc_id = '{}_weapons1'.format(doc.doc_id)
            weapon1_object['uri'] = weapon1_doc_id
            weapon1_doc = etk.create_document(weapon1_object)
            weapon1_doc.doc_id = weapon1_doc_id
            doc.kg.add_value('weapons', weapon1_doc.doc_id)
            nested_docs.append(weapon1_doc)

        if json_doc.get('weaptype2_txt', '').strip():
            weapon2_object = dict()
            weapon2_object['dataset'] = 'gtd_weapon'
            weapon2_object['filename'] = filename
            weapon2_object['weapon_title'] = json_doc.get('weapdetail', '')
            weapon2_object['weapon_type'] = list()
            weapon2_object['weapon_type'].append(json_doc.get('weaptype2_txt'))
            if json_doc.get('weapsubtype2_txt', ''):
                weapon2_object['weapon_type'].append(json_doc.get('weapsubtype2_txt'))
            if json_doc.get('weaptype2', '') != '':
                weapon2_object['weapon_code'] = json_doc.get('weaptype2')
            weapon2_doc_id = '{}_weapons2'.format(doc.doc_id)
            weapon2_object['uri'] = weapon2_doc_id
            weapon2_doc = etk.create_document(weapon2_object)
            weapon2_doc.doc_id = weapon2_doc_id
            doc.kg.add_value('weapons', weapon2_doc.doc_id)
            nested_docs.append(weapon2_doc)

        if json_doc.get('weaptype3_txt', '').strip():
            weapon3_object = dict()
            weapon3_object['dataset'] = 'gtd_weapon'
            weapon3_object['filename'] = filename
            weapon3_object['weapon_title'] = json_doc.get('weapdetail', '')
            weapon3_object['weapon_type'] = list()
            weapon3_object['weapon_type'].append(json_doc.get('weaptype3_txt'))
            if json_doc.get('weapsubtype3_txt', ''):
                weapon3_object['weapon_type'].append(json_doc.get('weapsubtype3_txt'))
            if json_doc.get('weaptype3', '') != '':
                weapon3_object['weapon_code'] = json_doc.get('weaptype3')
            weapon3_doc_id = '{}_weapons3'.format(doc.doc_id)
            weapon3_object['uri'] = weapon3_doc_id
            weapon3_doc = etk.create_document(weapon3_object)
            weapon3_doc.doc_id = weapon3_doc_id
            doc.kg.add_value('weapons', weapon3_doc.doc_id)
            nested_docs.append(weapon3_doc)

        if json_doc.get('weaptype4_txt', '').strip():
            weapon4_object = dict()
            weapon4_object['dataset'] = 'gtd_weapon'
            weapon4_object['filename'] = filename
            weapon4_object['weapon_title'] = json_doc.get('weapdetail', '')
            weapon4_object['weapon_type'] = list()
            weapon4_object['weapon_type'].append(json_doc.get('weaptype4_txt'))
            if json_doc.get('weapsubtype4_txt', ''):
                weapon4_object['weapon_type'].append(json_doc.get('weapsubtype4_txt'))
            if json_doc.get('weaptype4', '') != '':
                weapon4_object['weapon_code'] = json_doc.get('weaptype4')
            weapon4_doc_id = '{}_weapons4'.format(doc.doc_id)
            weapon4_object['uri'] = weapon4_doc_id
            weapon4_doc = etk.create_document(weapon4_object)
            weapon4_doc.doc_id = weapon4_doc_id
            doc.kg.add_value('weapons', weapon4_doc.doc_id)
            nested_docs.append(weapon4_doc)

        # create total fatalities docs
        nkill = json_doc.get("nkill", 0)
        if nkill != "":
            total_fatalities_object = dict()
            total_fatalities_object["dataset"] = "gtd_fatality"
            total_fatalities_object['filename'] = filename
            total_fatalities_doc_id = '{}_total_fatalitites'.format(doc.doc_id)
            total_fatalities_object['uri'] = total_fatalities_doc_id
            total_fatalities_object["size"] = nkill
            total_fatalities_doc = etk.create_document(total_fatalities_object)
            total_fatalities_doc.doc_id = total_fatalities_doc_id
            doc.kg.add_value("fatalities", value=total_fatalities_doc_id)
            nested_docs.append(total_fatalities_doc)

        # create US fatalities docs
        nkillus = json_doc.get("nkillus", 0)
        if nkillus != "":
            us_fatalities_object = dict()
            us_fatalities_object["dataset"] = "gtd_fatality"
            us_fatalities_object['filename'] = filename
            us_fatalities_doc_id = '{}_us_fatalitites'.format(doc.doc_id)
            us_fatalities_object['uri'] = us_fatalities_doc_id
            us_fatalities_object["size"] = nkillus
            us_fatalities_object["nationality"] = "United States"
            us_fatalities_doc = etk.create_document(us_fatalities_object)
            us_fatalities_doc.doc_id = us_fatalities_doc_id
            doc.kg.add_value("fatalities", value=us_fatalities_doc_id)
            nested_docs.append(us_fatalities_doc)

        # create total injuries docs
        nwound = json_doc.get("nwound", 0)
        if nwound != "":
            total_injuries_object = dict()
            total_injuries_object["dataset"] = "gtd_injury"
            total_injuries_object['filename'] = filename
            total_injuries_doc_id = '{}_total_injuries'.format(doc.doc_id)
            total_injuries_object['uri'] = total_injuries_doc_id
            total_injuries_object["size"] = nwound
            total_injuries_doc = etk.create_document(total_injuries_object)
            total_injuries_doc.doc_id = total_injuries_doc_id
            doc.kg.add_value("injuries", value=total_injuries_doc_id)
            nested_docs.append(total_injuries_doc)

        # create US injuries docs
        nwoundus = json_doc.get("nwoundus", 0)
        if nwoundus != "":
            us_injuries_object = dict()
            us_injuries_object["dataset"] = "gtd_injury"
            us_injuries_object['filename'] = filename
            us_injuries_doc_id = '{}_us_injuries'.format(doc.doc_id)
            us_injuries_object['uri'] = us_injuries_doc_id
            us_injuries_object["size"] = nwoundus
            us_injuries_doc = etk.create_document(us_injuries_object)
            us_injuries_doc.doc_id = us_injuries_doc_id
            doc.kg.add_value("injuries", value=us_injuries_doc_id)
            nested_docs.append(us_injuries_doc)

        # create damage docs
        # in this dataset we only have property damage
        if json_doc.get("property", 0) == 1:
            damage_object = dict()
            damage_object["dataset"] = "gtd_damage"
            damage_object['filename'] = filename
            damage_object["damage_title"] = json_doc.get("propextent_txt")
            damage_object["damage_value"] = json_doc.get("propvalue")
            damage_object["damage_description"] = json_doc.get("propcomment")
            damage_object_doc_id = '{}_damage'.format(doc.doc_id)
            damage_object['uri'] = damage_object_doc_id
            damage_doc = etk.create_document(damage_object)
            damage_doc.doc_id = damage_object_doc_id
            doc.kg.add_value("damage", value=damage_object_doc_id)
            nested_docs.append(damage_doc)

        return nested_docs

    def document_selector(self, doc) -> bool:
        return doc.cdr_document.get("dataset") == "gtd"
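# A minimal, hypothetical driver sketch for the module above: the schema file
# name and the toy record are assumptions, and process_ems is assumed to be the
# usual ETK entry point that applies document_selector() and process_document().
import json
from etk.etk import ETK
from etk.knowledge_graph import KGSchema

kg_schema = KGSchema(json.load(open('master_config.json')))  # hypothetical path
etk = ETK(kg_schema=kg_schema, modules=GTDModule)
record = {'dataset': 'gtd', 'iyear': '2001', 'imonth': '9', 'iday': '11'}  # toy record
doc = etk.create_document(record)
docs = etk.process_ems(doc)  # returns the event doc plus nested place/victim/... docs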
"description": "03/05/2018: I went to USC on Aug 20th, 2016 and will graduate on 2018, May 11. My birthday is 29-04-1994." } ] } etk = ETK() doc = etk.create_document(sample_input) # example for glossary extractor: name_extractor = GlossaryExtractor(etk.load_glossary("./names.txt"), "name_extractor", etk.default_tokenizer, case_sensitive=False, ngrams=1) descriptions = doc.select_segments("projects[*].description") projects = doc.select_segments("projects[*]") for d, p in zip(descriptions, projects): print ("Iam d path: " + d.full_path) names = doc.invoke_extractor(name_extractor, d) p.store_extractions(names, "members") # example for date extractor: date_extractor = DateExtractor('test_date_parser') member_descriptions = doc.select_segments("members[*].description") members = doc.select_segments("members[*]") for m_d, m in zip(member_descriptions, members): dates = doc.invoke_extractor(date_extractor, m_d, ignore_future_dates=False, ignore_past_years=40) m.store_extractions(dates, "related_dates") print(json.dumps(sample_input, indent=2))
from typing import List, Dict
from dateutil import parser
from datetime import timedelta
import datetime
import sys

from etk.extractors.date_extractor import DateExtractor, DateResolution

de = DateExtractor(extractor_name='date_extractor')


class GranularityDetector:
    """
    Detect simple granularity types in a sequence of dates.
    """
    granularity = {
        "second": timedelta(seconds=1),
        "minute": timedelta(minutes=1),
        "hour": timedelta(hours=1),
        "day": timedelta(hours=24),
        "week": timedelta(days=7),
        "month": timedelta(days=30),
        "quarter": timedelta(days=120),
        "year": timedelta(days=365)
    }

    @staticmethod
    def get_parsed_date(my_date):
        if isinstance(my_date, datetime.datetime):
            return my_date
        elif isinstance(my_date, datetime.date):
            return datetime.datetime(my_date.year, my_date.month, my_date.day)
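# A minimal sketch of the detector above: normalize two dates with
# get_parsed_date and look up the granularity bucket closest to their spacing.
# (The full detection routine is not shown here; this only exercises the
# parsing helper and the granularity table.)
d1 = GranularityDetector.get_parsed_date(datetime.date(2018, 1, 1))
d2 = GranularityDetector.get_parsed_date(datetime.datetime(2018, 1, 8))
delta = d2 - d1
closest = min(GranularityDetector.granularity,
              key=lambda k: abs(GranularityDetector.granularity[k] - delta))
print(closest)  # 'week'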
def __init__(self, etk):
    ETKModule.__init__(self, etk)
    self.date_extractor = DateExtractor(self.etk, 'acled_date_parser')
    self.country_extractor = GlossaryExtractor(
        self.etk.load_glossary("${GLOSSARY_PATH}/countries.json.gz", read_json=True),
        "country_extractor",
        self.etk.default_tokenizer,
        case_sensitive=False,
        ngrams=3)
    self.states_extractor = GlossaryExtractor(
        self.etk.load_glossary("${GLOSSARY_PATH}/states_usa_canada.json.gz", read_json=True),
        "states_extractor",
        self.etk.default_tokenizer,
        case_sensitive=False,
        ngrams=3)
    self.cities_extractor = GlossaryExtractor(
        self.etk.load_glossary("${GLOSSARY_PATH}/cities.json.gz", read_json=True),
        "cities_extractor",
        self.etk.default_tokenizer,
        case_sensitive=False,
        ngrams=3)
    self.csv_processor = CsvProcessor(etk=etk, heading_row=1)
    # decode ACLED numeric "interaction" codes into readable actor-pair labels
    self.interaction_decoding_dict = {
        "10": "Sole Military Action",
        "11": "Military Versus Military",
        "12": "Military Versus Rebels",
        "13": "Military Versus Political Militia",
        "14": "Military Versus Communal Militia",
        "15": "Military Versus Rioters",
        "16": "Military Versus Protesters",
        "17": "Military Versus Civilians",
        "18": "Military Versus Other",
        "20": "Sole Rebel Action",
        "22": "Rebels Versus Rebels",
        "23": "Rebels Versus Political Militia",
        "24": "Rebels Versus Communal Militia",
        "25": "Rebels Versus Rioters",
        "26": "Rebels Versus Protesters",
        "27": "Rebels Versus Civilians",
        "28": "Rebels Versus Other",
        "30": "Sole Political Militia Action",
        "33": "Political Militia Versus Political Militia",
        "34": "Political Militia Versus Communal Militia",
        "35": "Political Militia Versus Rioters",
        "36": "Political Militia Versus Protesters",
        "37": "Political Militia Versus Civilians",
        "38": "Political Militia Versus Other",
        "40": "Sole Communal Militia Action",
        "44": "Communal Militia Versus Communal Militia",
        "45": "Communal Militia Versus Rioters",
        "46": "Communal Militia Versus Protesters",
        "47": "Communal Militia Versus Civilians",
        "48": "Communal Militia Versus Other",
        "50": "Sole Rioter Action",
        "55": "Rioters Versus Rioters",
        "56": "Rioters Versus Protesters",
        "57": "Rioters Versus Civilians",
        "58": "Rioters Versus Other",
        "60": "Sole Protester Action",
        "66": "Protesters Versus Protesters",
        "68": "Protesters Versus Other",
        "78": "Other Actor Versus Civilians",
        "80": "Sole Other Action"
    }
    self.interaction_decoder = DecodingValueExtractor(
        self.interaction_decoding_dict,
        'default_decoding',
        case_sensitive=True)
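# A minimal sketch (hypothetical JSON path) of how the interaction decoder
# above is typically applied inside process_document, mirroring the pattern the
# GTD module uses for its causeex_decoder:
#
#     segment = doc.select_segments("$.interaction")[0]
#     decoded = doc.extract(self.interaction_decoder, segment)
#     doc.kg.add_value("type", value=decoded)  # e.g. "12" -> "Military Versus Rebels"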
import warnings
import sys
import argparse

from etk.extractors.date_extractor import DateExtractor

date_extractor = DateExtractor()


def add_arguments(parser):
    """
    Parse arguments
    Args:
        parser (argparse.ArgumentParser)
    """
    parser.description = 'Examples:\n' \
                         'python -m etk date_extractor /tmp/date.txt\n' \
                         'cat /tmp/date.txt | python -m etk date_extractor'
    parser.add_argument('input_file', nargs='?', type=argparse.FileType('r'),
                        default=sys.stdin)


def run(args):
    """
    Args:
        args (argparse.Namespace)
    """
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
def main():
    filename = sys.argv[1]
    query_title = sys.argv[2]
    ranking_criteria = sys.argv[3]
    top_k = int(sys.argv[4])  # argv values are strings; cast for the heap-size comparison below
    if ranking_criteria not in ('TITLE', 'SENTENCE'):
        print('Wrong mode! Please check the input argument!')
        return

    master_config = {
        "fields": {
            "developer": {
                "type": "string"
            },
            "student_developer": {
                "type": "string"
            },
            "spacy_name": {
                "type": "string"
            },
            "date": {
                "type": "date"
            }
        }
    }
    kg_schema = KGSchema(master_config)
    etk = ETK(kg_schema, ["./extraction_modules/"])
    nlp = spacy.load('en_core_web_lg')
    date_extractor = DateExtractor(etk=etk)

    queries = dict()
    queries_ent_map = dict()
    with open(query_title) as f:
        for line in f:
            orig_ifp_title = line
            # remove date information from the query term
            res = date_extractor.extract(text=line)
            start, end = float('inf'), -1
            for i in res:
                start = min(start, i.provenance['start_char'])
                end = max(end, i.provenance['end_char'])
            # delete the date from the query term
            if len(res) != 0:
                line = line[:start] + line[end + 1:]
            queries[orig_ifp_title] = line
            queries_ent_map[line] = list()
            # extract entities from the query term
            doc = nlp(line)
            for ent in doc.ents:
                queries_ent_map[line].append(re.escape(ent.text.strip()))
            # remove empty entities
            queries_ent_map[line] = list(filter(bool, queries_ent_map[line]))

    # the list of selected docs for a given query term
    query_docs_mapping = dict()
    docs = list()
    with open(filename) as f:
        for line in f:
            json_obj = json.loads(line)
            docs.append(etk.create_document(json_obj))

    ds = DefaultDocumentSelector()
    for orig_query, proc_query in queries.items():
        content_regex = queries_ent_map[proc_query]
        query_docs_mapping[proc_query] = list()
        for doc in docs:
            if len(content_regex) == 0 \
                    or ds.select_document(document=doc,
                                          json_paths=['$.lexisnexis.doc_description'],
                                          json_paths_regex=content_regex):
                query_docs_mapping[proc_query].append(doc)

    # TODO: pass ifp_id in
    for orig_query, proc_query in queries.items():
        # print(len(query_docs_mapping[proc_query]))
        dr_processor = DocRetrieveProcessor(etk=etk,
                                            ifp_id="1233",
                                            ifp_title=proc_query,
                                            orig_ifp_title=orig_query)
        heap = list()
        for doc in query_docs_mapping[proc_query]:
            processed_doc = dict()
            if ranking_criteria == 'SENTENCE':
                processed_doc = dr_processor.process_by_sentence(doc=doc, threshold=0).cdr_document
            elif ranking_criteria == 'TITLE':
                processed_doc = dr_processor.process_by_title(doc=doc, threshold=0).cdr_document
            if len(heap) < top_k:
                heappush(heap, (processed_doc['similarity'], processed_doc['date'], processed_doc))
            elif processed_doc['similarity'] > heap[0][0]:
                # heapreplace (from heapq) evicts the current minimum so the
                # heap holds only the top_k documents
                heapreplace(heap, (processed_doc['similarity'], processed_doc['date'], processed_doc))
        heap.sort(reverse=True)

        # strip the trailing newline read from the query file before using the
        # title in a file name
        output_filename = './resources/output/' + orig_ifp_title.strip() + "_result.jl"
        with open(output_filename, 'a+b') as f:
            for item in heap:
                print(item[0])
                jl_str = json.dumps(item[2]) + '\n'
                f.write(jl_str.encode())
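# Example invocation (hypothetical file names), matching the positional
# arguments main() reads from sys.argv above:
#
#     python rank_docs.py news_docs.jl ifp_titles.txt SENTENCE 10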
                                      for row in vectors for cell in row if len(cell))
    for f in cat_vector:
        values = list(set(cell[f] for row in vectors for cell in row if len(cell)))
        for r, row in enumerate(vectors):
            for c, cell in enumerate(row):
                if len(cell) == 0:
                    continue
                # one-hot encode the categorical feature f, then drop the raw value
                for v in values:
                    vectors[r][c][f'{f}-{v}'] = 1 if v == cell[f] else 0
                del vectors[r][c][f]
    return vectors


# --- parsing -----------------------------------------------------------------
_find_dates_extractor = DateExtractor()


def find_dates(text):
    # try dateutil's fuzzy parser first
    try:
        return parse_date(text, fuzzy_with_tokens=True)[0]
    except:
        pass
    # fall back to ETK's DateExtractor
    try:
        res = _find_dates_extractor.extract(text, prefer_language_date_order=False)
        if len(res):
            return res[0].value
    except:
        log('info', f'ETK DateExtractor raised an error on value {text}.')
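# A minimal sketch of the fallback chain above, assuming parse_date is
# dateutil's parser and log is this module's logging helper:
print(find_dates('Signed in Geneva on 12 March 2019'))  # datetime from the fuzzy parse
print(find_dates('no date here'))                       # None: both stages find nothing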