def run(self):

    # read events
    archive_events = []
    active_events = []
    date = datetime.datetime(int(self.archivedate[:4]), int(self.archivedate[4:6]), int(self.archivedate[6:8]))
    print('Reading events')
    with open(self.in_events().path, 'r', encoding='utf-8') as file_in:
        eventdicts = json.loads(file_in.read())
    for i, ed in enumerate(eventdicts):
        eventobj = event.Event()
        eventobj.import_eventdict(ed)
        if eventobj.datetime == date:
            archive_events.append(eventobj)
        else:
            active_events.append(eventobj)

    # write archive
    print('Writing archive')
    out_archive_events = [event.return_dict(txt=False) for event in archive_events]
    with open(self.out_archived().path, 'w', encoding='utf-8') as file_out:
        json.dump(out_archive_events, file_out)

    # write active events
    print('Writing active events')
    out_active_events = [event.return_dict(txt=False) for event in active_events]
    with open(self.out_active_events().path, 'w', encoding='utf-8') as file_out:
        json.dump(out_active_events, file_out)
def run(self):

    # read in events
    print('Reading in events')
    with open(self.in_deduplicated_events().path, 'r', encoding='utf-8') as file_in:
        eventdicts = json.loads(file_in.read())
    event_objs = []
    for ed in eventdicts:
        eventobj = event.Event()
        eventobj.import_eventdict(ed)
        event_objs.append(eventobj)

    # initialize event enhancer
    print('Enhancing events')
    enhancer = event_enhancer.EventEnhancer()
    enhancer.set_events(event_objs)
    enhancer.enhance()
    enhanced_events = enhancer.return_events()

    # write enhanced events
    out_enhanced_events = [event.return_dict() for event in enhanced_events]
    with open(self.out_enhanced_events().path, 'w', encoding='utf-8') as file_out:
        json.dump(out_enhanced_events, file_out)
def run(self):

    # read in events
    with open(self.in_enhanced_events().path, 'r', encoding='utf-8') as file_in:
        eventdicts = json.loads(file_in.read())
    event_objs = []
    for ed in eventdicts:
        eventobj = event.Event()
        eventobj.import_eventdict(ed)
        event_objs.append(eventobj)

    # initialize event merger
    print('Merging; number of events at start:', len(event_objs))
    overlap_threshold = float(self.overlap_threshold)
    merger = event_merger.EventMerger()
    merger.add_events(event_objs)
    merger.find_merges(overlap_threshold)
    events_merged = merger.return_events()

    # run a second merge pass over the already merged events
    print('Merging again; current number of events:', len(events_merged))
    merger2 = event_merger.EventMerger()
    merger2.add_events(events_merged)
    merger2.find_merges(overlap_threshold)
    events_merged_final = merger2.return_events()
    print('Done. number of events after merge:', len(events_merged_final))

    # write merged events
    out_merged_events = [event.return_dict(txt=False) for event in events_merged_final]
    with open(self.out_merged_events().path, 'w', encoding='utf-8') as file_out:
        json.dump(out_merged_events, file_out)
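# The merge criterion used by event_merger.EventMerger is not shown in this listing; a
# purely illustrative reading of 'overlap_threshold' is a pairwise overlap score between
# the entity/term sets of two events (e.g. Jaccard overlap), with events merged when the
# score exceeds the threshold. The function below is a self-contained sketch of that
# assumption, not the repository's actual implementation.
def sketch_overlap(entities_a, entities_b):
    # Jaccard overlap between two collections of entity strings
    a, b = set(entities_a), set(entities_b)
    if not a or not b:
        return 0.0
    return len(a & b) / len(a | b)

# example (made-up entities): sketch_overlap(['ajax', 'arena'], ['ajax', 'arena', 'feyenoord'])
# -> 0.666..., which would trigger a merge at overlap_threshold = 0.5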
def run(self):

    # read in events
    print('Reading in events')
    with open(self.in_merged_events().path, 'r', encoding='utf-8') as file_in:
        eventdicts = json.loads(file_in.read())
    event_objs = []
    for ed in eventdicts:
        eventobj = event.Event()
        eventobj.import_eventdict(ed, txt=False)
        event_objs.append(eventobj)

    # read in citylist
    print('Reading in citylist')
    with open(self.citylist, 'r', encoding='utf-8') as file_in:
        citylist = [line.strip() for line in file_in.read().strip().split('\n')]

    # initialize event filter
    print('Filtering; number of events at start:', len(event_objs))
    filter = event_filter.EventFilter()
    filter.add_events(event_objs)
    filter.apply_filter(citylist)
    events_filtered = filter.return_events()
    print('Done. number of events after filter:', len(events_filtered))

    # write filtered events
    out_filtered_events = [event.return_dict(txt=False) for event in events_filtered]
    with open(self.out_filtered_events().path, 'w', encoding='utf-8') as file_out:
        json.dump(out_filtered_events, file_out)
def run(self):

    # if directory does not exist, create directory
    if not os.path.isdir(self.out_eventdir().path):
        self.setup_output_dir(self.out_eventdir().path)

    # collect tweet files
    end_date_year = self.end_date[:4]
    end_date_month = self.end_date[4:6]
    end_date_day = self.end_date[6:]
    last_date = datetime.date(int(end_date_year), int(end_date_month), int(end_date_day))
    first_date = last_date - datetime.timedelta(days=self.window_size - 1)
    last_tweetfile = self.in_tweetdir().path + '/' + end_date_year + end_date_month + '/' + end_date_year + end_date_month + end_date_day + '-23.out.dateref.cityref.entity.json'
    days_tweetfiles = helpers.return_tweetfiles_window(last_tweetfile, self.window_size - 1)
    tweetfiles = []
    for day in days_tweetfiles:
        tweetfiles.extend([filename for filename in glob.glob(self.in_tweetdir().path + '/' + day + '*')])

    # read in citylist
    print('Reading in citylist')
    with open(self.citylist, 'r', encoding='utf-8') as file_in:
        citylist = [line.strip() for line in file_in.read().strip().split('\n')]

    # initialize event extractor
    er = event_ranker.EventRanker()

    # read in tweets
    print('Reading in tweets')
    for tweetfile in tweetfiles:
        date = helpers.return_date_entitytweetfile(tweetfile)
        with open(tweetfile, 'r', encoding='utf-8') as file_in:
            tweetdicts = json.loads(file_in.read())
        # format as tweet objects
        for td in tweetdicts:
            if not (td['refdates'] == {} and td['entities'] == {}):
                tweetobj = tweet.Tweet()
                tweetobj.import_tweetdict(td)
                er.add_tweet(tweetobj)
            er.tweet_count += 1

    # extract events
    print('Performing event extraction')
    er.extract_events(self.minimum_event_mentions, self.cut_off)
    filter = event_filter.EventFilter()
    filter.add_events(er.events)
    filter.apply_filter(citylist)
    events_filtered = filter.return_events()
    print('Done. Extracted', len(events_filtered), 'events')

    # write to file
    outevents = [event.return_dict() for event in events_filtered]
    with open(self.out_events().path, 'w', encoding='utf-8') as file_out:
        json.dump(outevents, file_out)
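# A minimal, self-contained sketch of the windowing logic assumed above: given the path
# of the last hourly tweet file and a number of days to look back, produce the
# 'YYYYMM/YYYYMMDD' prefixes used to glob the preceding days' files. The filename layout
# and the exact behaviour of helpers.return_tweetfiles_window are assumptions inferred
# from how the run() method above uses them, not the repository's actual helper.
import datetime
import re

def sketch_return_tweetfiles_window(last_tweetfile, days_back):
    # pull the YYYYMMDD part out of a name like '.../201612/20161231-23.out.dateref.cityref.entity.json'
    match = re.search(r'/(\d{6})/(\d{8})-\d{2}\.', last_tweetfile)
    last_date = datetime.datetime.strptime(match.group(2), '%Y%m%d').date()
    prefixes = []
    for offset in range(days_back, 0, -1):
        day = last_date - datetime.timedelta(days=offset)
        prefixes.append(day.strftime('%Y%m') + '/' + day.strftime('%Y%m%d'))
    return prefixes

# example (made-up path): the three days preceding 2016-12-31
# sketch_return_tweetfiles_window('tweets/201612/20161231-23.out.dateref.cityref.entity.json', 3)
# -> ['201612/20161228', '201612/20161229', '201612/20161230']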
def run(self):

    # read in events
    with open(self.in_events().path, 'r', encoding='utf-8') as file_in:
        eventdicts = json.loads(file_in.read())
    event_objs = []
    for ed in eventdicts:
        eventobj = event.Event()
        eventobj.import_eventdict(ed)
        event_objs.append(eventobj)

    # initialize event deduplicator
    similarity_threshold = float(self.similarity_threshold)
    print('Deduplicating; number of events at start:', len(event_objs))
    deduplicator = event_deduplicator.EventDeduplicator()
    deduplicator.set_events(event_objs)
    deduplicator.deduplicate_events(similarity_threshold)
    deduplicated_events = deduplicator.return_events()
    print('Done. number of events after deduplication:', len(deduplicated_events))

    # write deduplicated
    out_deduplicated_events = [event.return_dict() for event in deduplicated_events]
    with open(self.out_deduplicated_events().path, 'w', encoding='utf-8') as file_out:
        json.dump(out_deduplicated_events, file_out)
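# event_deduplicator.EventDeduplicator's similarity measure is not shown in this listing.
# As a purely illustrative stand-in, this sketch keeps events in order and drops any event
# whose token set is too similar (Jaccard) to one already kept; the token representation
# and threshold semantics are assumptions, not the repository's implementation.
def sketch_deduplicate(token_sets, similarity_threshold):
    kept = []
    for tokens in token_sets:
        current = set(tokens)
        if all(len(current & set(k)) / len(current | set(k)) < similarity_threshold for k in kept):
            kept.append(tokens)
    return kept

# example (made-up tokens):
# sketch_deduplicate([['ajax', 'psv'], ['ajax', 'psv', 'knvb'], ['concert', 'arena']], 0.6)
# -> [['ajax', 'psv'], ['concert', 'arena']]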
def run(self):

    # read in new events
    with open(self.in_events().path, 'r', encoding='utf-8') as file_in:
        new_eventdicts = json.loads(file_in.read())
    new_event_objs = []
    for ed in new_eventdicts:
        eventobj = event.Event()
        eventobj.import_eventdict(ed, txt=False)
        new_event_objs.append(eventobj)
    earliest_date = min([event.datetime for event in new_event_objs])

    # read in current events
    with open(self.current_events, 'r', encoding='utf-8') as file_in:
        current_eventdicts = json.loads(file_in.read())
    current_event_objs = []
    current_event_objs_candidates = []
    for ed in current_eventdicts:
        eventobj = event.Event()
        eventobj.import_eventdict(ed, txt=False)
        if eventobj.datetime >= earliest_date:
            current_event_objs_candidates.append(eventobj)
        else:
            current_event_objs.append(eventobj)

    # initialize event merger
    merger = event_merger.EventMerger()
    merger.add_events(current_event_objs_candidates)

    # merge before integration
    print('Merging new events before integration; number of events at start:', len(new_event_objs))
    overlap_threshold = float(self.overlap_threshold)
    premerger = event_merger.EventMerger()
    premerger.add_events(new_event_objs)
    premerger.find_merges(overlap_threshold)
    new_events_merged = premerger.return_events()
    print('Done. New events after merge:', len(new_events_merged))

    # integrate each event into the current ones
    print('Starting integrating new events; number of current events:', len(current_event_objs))
    for new_event in new_events_merged:
        merger.find_merge(new_event, overlap_threshold)

    # write merged
    integrated_events = merger.return_events() + current_event_objs
    print('Done. Number of events after integration:', len(integrated_events))
    out_integrated_events = [event.return_dict(txt=False) for event in integrated_events]
    with open(self.out_integrated_events().path, 'w', encoding='utf-8') as file_out:
        json.dump(out_integrated_events, file_out)
def run(self):

    # initiate directory
    self.setup_output_dir(self.out_archivedir().path)

    # read events
    datebound = datetime.datetime.now() - datetime.timedelta(days=100)
    date_events = defaultdict(list)
    active_events = []
    print('Reading events')
    with open(self.in_events().path, 'r', encoding='utf-8') as file_in:
        eventdicts = json.loads(file_in.read())
    for i, ed in enumerate(eventdicts):
        eventobj = event.Event()
        eventobj.import_eventdict(ed)
        if eventobj.datetime < datebound:
            date_events[''.join(str(eventobj.datetime).split()[0].split('-'))].append(eventobj)
        else:
            active_events.append(eventobj)

    # write archives
    print('Writing archives')
    for date in sorted(list(date_events.keys())):
        print(date)
        events = date_events[date]
        out_events = [event.return_dict(txt=False) for event in events]
        outfile = self.out_archivedir().path + '/events_' + date + '.json'
        with open(outfile, 'w', encoding='utf-8') as file_out:
            json.dump(out_events, file_out)

    # write active events
    print('Writing active events')
    out_active_events = [event.return_dict(txt=False) for event in active_events]
    with open(self.out_active_events().path, 'w', encoding='utf-8') as file_out:
        json.dump(out_active_events, file_out)
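# What the archive key built in the run() above evaluates to, on a made-up example
# datetime (illustration only): the date part of the datetime's string form with the
# dashes removed, i.e. the 'YYYYMMDD' string used in the archive filename.
import datetime

dt = datetime.datetime(2017, 3, 14, 9, 30)
key = ''.join(str(dt).split()[0].split('-'))
# str(dt) == '2017-03-14 09:30:00', so key == '20170314'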
def run(self):

    # collect all event files with extension '.enhanced'
    enhanced_events = glob.glob(self.in_eventdir().path + '/*.enhanced')

    # initialize
    merger = event_merger.EventMerger()
    overlap_threshold = float(self.overlap_threshold)

    # for each event file
    for eventfile in enhanced_events:
        print('Reading', eventfile)
        with open(eventfile, 'r', encoding='utf-8') as file_in:
            current_eventdicts = json.loads(file_in.read())
        new_event_objs = []
        for ed in current_eventdicts:
            eventobj = event.Event()
            eventobj.import_eventdict(ed)
            new_event_objs.append(eventobj)

        # merge before integration
        print('Merging new events before integration; number of events at start:', len(new_event_objs))
        premerger = event_merger.EventMerger()
        premerger.add_events(new_event_objs)
        premerger.find_merges(overlap_threshold)
        new_events_merged = premerger.return_events()
        print('Done. New events after merge:', len(new_events_merged))
        if len(merger.events) == 0:
            merger.add_events(new_events_merged)
        else:
            # integrate each event into the current ones
            print('Starting integrating new events; number of current events:', len(merger.events))
            for new_event in new_events_merged:
                merger.find_merge(new_event, overlap_threshold)

    # write merged
    integrated_events = merger.return_events()
    print('Done. Number of events after integration:', len(integrated_events))
    out_integrated_events = [event.return_dict() for event in integrated_events]
    with open(self.out_integrated_events().path, 'w', encoding='utf-8') as file_out:
        json.dump(out_integrated_events, file_out)
def run(self):

    # read prediction data
    with open(self.in_predictiondir().path + '/events_meta.txt', 'r', encoding='utf-8') as file_in:
        meta = file_in.read().strip().split('\n')
    with open(self.in_predictiondir().path + '/events_text.predictions.txt', 'r', encoding='utf-8') as file_in:
        predictions = file_in.read().strip().split('\n')
    with open(self.in_predictiondir().path + '/events_text.full_predictions.txt', 'r', encoding='utf-8') as file_in:
        lines = file_in.read().strip().split('\n')
    label_order = lines[0].split('\t')
    full_predictions = [line.split('\t') for line in lines[1:]]
    print('Meta', len(meta))
    print('Predictions', len(predictions))
    print('Full predictions', len(full_predictions))

    # read in events
    print('Reading in events')
    with open(self.in_events().path, 'r', encoding='utf-8') as file_in:
        eventdicts = json.loads(file_in.read())
    event_objs = []
    for ed in eventdicts:
        eventobj = event.Event()
        eventobj.import_eventdict(ed, txt=self.text)
        event_objs.append(eventobj)

    # index events by id
    id_event = {}
    for eo in event_objs:
        id_event[eo.mongo_id] = eo

    # attach the predicted event type and per-label scores to each event
    for i, mid in enumerate(meta):
        prediction = predictions[i]
        prediction_score = dict(zip(label_order, full_predictions[i]))
        eo = id_event[mid]
        eo.eventtype = prediction
        eo.eventtype_scores = prediction_score

    # write output
    out_updated_events = [event.return_dict(txt=self.text) for event in event_objs]
    with open(self.out_updated_events().path, 'w', encoding='utf-8') as file_out:
        json.dump(out_updated_events, file_out)
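# A small self-contained illustration of the alignment assumed above: the i-th line of
# events_meta.txt holds the event id whose predicted label is the i-th line of
# events_text.predictions.txt, and whose per-label scores are the i-th row of
# events_text.full_predictions.txt (header row = label order). The file contents below
# are made-up toy data, not taken from the repository.
meta_lines = ['event-001', 'event-002']
prediction_lines = ['football', 'concert']
full_prediction_lines = [
    'football\tconcert\tother',   # header: label order
    '0.81\t0.10\t0.09',
    '0.05\t0.88\t0.07',
]

label_order = full_prediction_lines[0].split('\t')
full_predictions = [line.split('\t') for line in full_prediction_lines[1:]]

for i, event_id in enumerate(meta_lines):
    scores = dict(zip(label_order, full_predictions[i]))
    print(event_id, prediction_lines[i], scores)
# event-001 football {'football': '0.81', 'concert': '0.10', 'other': '0.09'}
# event-002 concert {'football': '0.05', 'concert': '0.88', 'other': '0.07'}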
def run(self):

    # create directory
    self.setup_output_dir(self.out_eventdir().path)

    # set dates
    end_date_year = self.end_date[:4]
    end_date_month = self.end_date[4:6]
    end_date_day = self.end_date[6:]
    last_date = datetime.date(int(end_date_year), int(end_date_month), int(end_date_day))
    start_date_year = self.start_date[:4]
    start_date_month = self.start_date[4:6]
    start_date_day = self.start_date[6:]
    first_date = datetime.date(int(start_date_year), int(start_date_month), int(start_date_day))

    # read in citylist
    print('Reading in citylist')
    with open(self.citylist, 'r', encoding='utf-8') as file_in:
        citylist = [line.strip() for line in file_in.read().strip().split('\n')]

    # perform event extraction on first tweet window
    print('Reading in first window of tweets')
    cursor_date = first_date
    date_tweetobjects = defaultdict(list)
    date_tweetcounts = defaultdict(int)
    window_dates = []
    while cursor_date < first_date + datetime.timedelta(days=self.window_size):
        print(cursor_date)
        tweetfiles = [filename for filename in glob.glob(self.in_tweetdir().path + '/' + helpers.return_timeobj_date(cursor_date) + '*')]
        for tweetfile in tweetfiles:
            # read in tweets
            with open(tweetfile, 'r', encoding='utf-8') as file_in:
                tweetdicts = json.loads(file_in.read())
            date_tweetcounts[cursor_date] += len(tweetdicts)
            for td in tweetdicts:
                if not (td['refdates'] == {} and td['entities'] == {}):
                    tweetobj = tweet.Tweet()
                    tweetobj.import_tweetdict(td)
                    date_tweetobjects[cursor_date].append(tweetobj)
        window_dates.append(cursor_date)
        cursor_date += datetime.timedelta(days=1)

    # start event extraction
    print('Loading tweets into event extractor')
    er = event_ranker.EventRanker()
    for date in window_dates:
        for tweetobject in date_tweetobjects[date]:
            er.add_tweet(tweetobject)
        er.tweet_count += date_tweetcounts[date]
    print('Performing event extraction')
    er.extract_events(self.minimum_event_mentions, self.cut_off)
    filter = event_filter.EventFilter()
    filter.add_events(er.events)
    filter.apply_filter(citylist)
    events_filtered = filter.return_events()
    print('Done. Extracted', len(events_filtered), 'events')

    # write to file
    outevents = [event.return_dict() for event in events_filtered]
    with open(self.out_eventdir().path + '/' + str(cursor_date - datetime.timedelta(days=1)).replace('-', '') + '.events', 'w', encoding='utf-8') as file_out:
        json.dump(outevents, file_out)

    # slide window forward and perform event extraction until last date
    print('Starting slider')
    window_tail = first_date
    window_head = cursor_date - datetime.timedelta(days=1)
    while window_head <= last_date:
        print('Discarding and collecting tweets')
        end_slider = window_head + datetime.timedelta(days=self.slider)
        while window_head < end_slider:
            # remove tweets of tail
            print('Deleting records for old tail', window_tail)
            del date_tweetobjects[window_tail]
            del date_tweetcounts[window_tail]
            window_tail = window_tail + datetime.timedelta(days=1)
            window_head = window_head + datetime.timedelta(days=1)
            print('Collecting tweets for new head', window_head)
            tweetfiles = [filename for filename in glob.glob(self.in_tweetdir().path + '/' + helpers.return_timeobj_date(window_head) + '*')]
            for tweetfile in tweetfiles:
                # read in tweets
                with open(tweetfile, 'r', encoding='utf-8') as file_in:
                    tweetdicts = json.loads(file_in.read())
                date_tweetcounts[window_head] += len(tweetdicts)
                for td in tweetdicts:
                    if not (td['refdates'] == {} and td['entities'] == {}):
                        tweetobj = tweet.Tweet()
                        tweetobj.import_tweetdict(td)
                        date_tweetobjects[window_head].append(tweetobj)

        # add tweets to event ranker
        print('Loading tweets into event extractor for', window_tail, '-', window_head)
        er = event_ranker.EventRanker()
        window_dates = helpers.return_daterange(window_tail, self.window_size)
        for date in window_dates:
            for tweetobject in date_tweetobjects[date]:
                er.add_tweet(tweetobject)
            er.tweet_count += date_tweetcounts[date]
        print('Performing event extraction')
        er.extract_events(self.minimum_event_mentions, self.cut_off)
        filter = event_filter.EventFilter()
        filter.add_events(er.events)
        filter.apply_filter(citylist)
        events_filtered = filter.return_events()
        print('Done. Extracted', len(events_filtered), 'events')

        # write to file
        outevents = [event.return_dict() for event in events_filtered]
        with open(self.out_eventdir().path + '/' + str(window_head).replace('-', '') + '.events', 'w', encoding='utf-8') as file_out:
            json.dump(outevents, file_out)
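# A stripped-down sketch of the sliding-window schedule implemented above, using only
# dates: a window of `window_size` days is advanced by `slider` days per step until the
# head passes the end date. Parameter values in the example are illustrative assumptions.
import datetime

def sketch_window_schedule(start_date, end_date, window_size, slider):
    windows = []
    tail = start_date
    head = start_date + datetime.timedelta(days=window_size - 1)
    while head <= end_date:
        windows.append((tail, head))
        tail += datetime.timedelta(days=slider)
        head += datetime.timedelta(days=slider)
    return windows

# example: daily extraction over one week with a 3-day window
# sketch_window_schedule(datetime.date(2017, 1, 1), datetime.date(2017, 1, 7), 3, 1)
# -> [(Jan 1, Jan 3), (Jan 2, Jan 4), (Jan 3, Jan 5), (Jan 4, Jan 6), (Jan 5, Jan 7)]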