# standard-library imports needed by the run() methods excerpted below
import datetime
import glob
import gzip
import io
import json
import os
import re
from collections import defaultdict

# project modules referenced below (tweetfilter, tweet, helpers, event, event_ranker,
# event_filter, cityref_extractor, dutch_timex_extractor, entity_extractor, commonness)
# are assumed to be importable from the surrounding package.


def run(self):
    # read in gzipped tweet file
    good_format = re.compile(r'}$')  # matches lines ending in '}' (not used below)
    tweets = []
    for line in io.TextIOWrapper(io.BufferedReader(gzip.open(self.in_tweets().path)), encoding='utf-8', errors='ignore'):
        if self.format_json:
            try:
                tweets.append(json.loads(line.strip()))
            except ValueError:
                print('Error loading json, skipping to next line')
        else:
            tweets.append(line.strip())
    print(self.in_tweets().path, 'contains', len(tweets), 'before filtering')

    # filter out retweets and non-Dutch tweets
    tf = tweetfilter.Tweetfilter(tweets)
    tf.discard_retweets()
    print('after retweet filter', len(tf.tweets))
    tf.discard_nondutch()
    filtered_tweets = tf.return_tweets()
    print('after filtering:', len(filtered_tweets))

    # convert filtered tweets to output dicts
    outtweets = []
    for filtered_tweet in filtered_tweets:
        tweetobj = tweet.Tweet()
        tweetobj.import_twiqsdict(filtered_tweet)
        outtweets.append(tweetobj.return_dict())

    # write to file
    with open(self.out_filtered().path, 'w', encoding='utf-8') as outfile:
        json.dump(outtweets, outfile)

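# --- Illustration (not part of the task above) ---
# A minimal, hypothetical sketch of how a gzipped, line-delimited tweet file of the kind
# read by the run() method above could be produced for testing. The function name and the
# field names in the usage example are illustrative only; they are not the Twiqs schema.
import gzip
import json

def write_test_tweetfile_sketch(path, tweets):
    # one JSON object per line, gzip-compressed, utf-8 encoded
    with gzip.open(path, 'wt', encoding='utf-8') as outfile:
        for tw in tweets:
            outfile.write(json.dumps(tw) + '\n')

# example usage with made-up records:
# write_test_tweetfile_sketch('20141201-23.out.gz', [{'text': 'voorbeeld tweet', 'user': 'test'}])
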
def run(self):
    # if directory does not exist, create directory
    if not os.path.isdir(self.out_eventdir().path):
        self.setup_output_dir(self.out_eventdir().path)

    # collect tweet files for the window ending at the end date
    end_date_year = self.end_date[:4]
    end_date_month = self.end_date[4:6]
    end_date_day = self.end_date[6:]
    last_date = datetime.date(int(end_date_year), int(end_date_month), int(end_date_day))
    first_date = last_date - datetime.timedelta(days=self.window_size - 1)
    last_tweetfile = self.in_tweetdir().path + '/' + end_date_year + end_date_month + '/' + end_date_year + end_date_month + end_date_day + '-23.out.dateref.cityref.entity.json'
    days_tweetfiles = helpers.return_tweetfiles_window(last_tweetfile, self.window_size - 1)
    tweetfiles = []
    for day in days_tweetfiles:
        tweetfiles.extend([filename for filename in glob.glob(self.in_tweetdir().path + '/' + day + '*')])

    print('Reading in citylist')
    # read in citylist
    with open(self.citylist, 'r', encoding='utf-8') as file_in:
        citylist = [line.strip() for line in file_in.read().strip().split('\n')]

    # extract events
    er = event_ranker.EventRanker()

    # read in tweets
    print('Reading in tweets')
    for tweetfile in tweetfiles:
        date = helpers.return_date_entitytweetfile(tweetfile)
        with open(tweetfile, 'r', encoding='utf-8') as file_in:
            tweetdicts = json.loads(file_in.read())
        # format as tweet objects
        for td in tweetdicts:
            if not (td['refdates'] == {} and td['entities'] == {}):
                tweetobj = tweet.Tweet()
                tweetobj.import_tweetdict(td)
                er.add_tweet(tweetobj)
            # count every tweet in the file, whether or not it was added to the ranker
            er.tweet_count += 1

    # extract events
    print('Performing event extraction')
    er.extract_events(self.minimum_event_mentions, self.cut_off)
    ef = event_filter.EventFilter()
    ef.add_events(er.events)
    ef.apply_filter(citylist)
    events_filtered = ef.return_events()
    print('Done. Extracted', len(events_filtered), 'events')

    # write to file
    outevents = [ev.return_dict() for ev in events_filtered]
    with open(self.out_events().path, 'w', encoding='utf-8') as file_out:
        json.dump(outevents, file_out)

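# --- Illustration (not part of the task above) ---
# helpers.return_tweetfiles_window is used as a black box above. A hypothetical sketch of
# what it might do, assuming it returns 'yyyymm/yyyymmdd' path prefixes for the given
# file's day and the preceding `days_back` days; the actual helper may behave differently.
import datetime
import os

def return_tweetfiles_window_sketch(last_tweetfile, days_back):
    filename = os.path.basename(last_tweetfile)  # e.g. '20141231-23.out.dateref.cityref.entity.json'
    daystr = filename.split('-')[0]              # e.g. '20141231'
    last = datetime.date(int(daystr[:4]), int(daystr[4:6]), int(daystr[6:8]))
    prefixes = []
    for delta in range(days_back, -1, -1):
        day = last - datetime.timedelta(days=delta)
        prefixes.append(day.strftime('%Y%m') + '/' + day.strftime('%Y%m%d'))
    return prefixes
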
def run(self):
    # read in citylist
    with open(self.citylist, 'r', encoding='utf-8') as file_in:
        citylist = [line.strip() for line in file_in.read().strip().split('\n')]

    # read in tweets
    with open(self.in_dateref().path, 'r', encoding='utf-8') as file_in:
        tweetdicts = json.loads(file_in.read())

    # format as tweet objects
    tweets = []
    for td in tweetdicts:
        tweetobj = tweet.Tweet()
        tweetobj.import_tweetdict(td)
        tweets.append(tweetobj)

    # extract location
    for tweetobj in tweets:
        # remove already extracted time from the tweet, forming it into chunks
        datestrings = [sr[0] for sr in tweetobj.string_refdates]
        tweet_chunks = helpers.remove_pattern_from_string(tweetobj.text, datestrings)
        # extract city from chunks
        ce = cityref_extractor.CityrefExtractor(citylist)
        for chunk in tweet_chunks:
            ce.find_cityrefs(chunk)
        tweetobj.set_cityrefs(ce.return_cityrefs())

    # write to file
    outtweets = [tweetobj.return_dict() for tweetobj in tweets]
    with open(self.out_cityref().path, 'w', encoding='utf-8') as file_out:
        json.dump(outtweets, file_out)

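# --- Illustration (not part of the task above) ---
# helpers.remove_pattern_from_string is used as a black box above. A hypothetical sketch,
# assuming it cuts the given substrings out of the text and returns the remaining pieces
# as chunks; the actual helper may differ, for instance in normalisation or tokenisation.
import re

def remove_pattern_from_string_sketch(text, patterns):
    if not patterns:
        return [text]
    # split the text on any of the already extracted substrings
    splitter = re.compile('|'.join(re.escape(p) for p in patterns))
    return [chunk.strip() for chunk in splitter.split(text) if chunk.strip()]

# remove_pattern_from_string_sketch('morgen concert in paradiso', ['morgen'])
# -> ['concert in paradiso']
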
def run(self):
    # read in tweets
    with open(self.in_tokenized().path, 'r', encoding='utf-8') as file_in:
        tweetdicts = json.loads(file_in.read())

    # format as tweet objects
    tweets = []
    for td in tweetdicts:
        tweetobj = tweet.Tweet()
        tweetobj.import_tweetdict(td)
        tweets.append(tweetobj)

    # extract daterefs
    for tweetobj in tweets:
        dte = dutch_timex_extractor.Dutch_timex_extractor(tweetobj.text, tweetobj.datetime)
        dte.extract_refdates(self.skip_datematch, self.skip_monthmatch, self.skip_timeunitmatch, self.skip_daymatch)
        dte.filter_future_refdates()
        tweetobj.set_refdates(dte.refdates)

    # write to file
    outtweets = [tweetobj.return_dict() for tweetobj in tweets]
    with open(self.out_dateref().path, 'w', encoding='utf-8') as file_out:
        json.dump(outtweets, file_out)

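# --- Illustration (not part of the task above) ---
# dte.filter_future_refdates() is called as a black box above. A hypothetical sketch of the
# idea, assuming the extractor keeps only reference dates that lie on or after the tweet's
# own posting time, i.e. forward-looking mentions; the actual method may apply different
# bounds or semantics.
def filter_future_refdates_sketch(refdates, tweet_datetime):
    # refdates: list of datetime.datetime objects extracted from the tweet text
    return [rd for rd in refdates if rd >= tweet_datetime]
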
def run(self):
    # set commonness object
    cs = commonness.Commonness()
    cs.set_classencoder(self.commonness_txt, self.commonness_cls, self.commonness_corpus)
    cs.set_dmodel(self.ngrams_score)

    # read in tweets
    with open(self.in_cityref().path, 'r', encoding='utf-8') as file_in:
        tweetdicts = json.loads(file_in.read())

    # format as tweet objects
    tweets = []
    for td in tweetdicts:
        tweetobj = tweet.Tweet()
        tweetobj.import_tweetdict(td)
        tweets.append(tweetobj)

    # extract entities
    for tweetobj in tweets:
        # remove already extracted time and locations from the tweet, forming it into chunks
        datestrings = [sr[0] for sr in tweetobj.string_refdates]
        cities = tweetobj.cityrefs
        tweet_chunks = helpers.remove_pattern_from_string(tweetobj.text, datestrings + cities)
        # find entities in every chunk
        ee = entity_extractor.EntityExtractor()
        ee.set_commonness(cs)
        for chunk in tweet_chunks:
            tokens = chunk.split()
            ee.extract_entities(tokens)
        ee.filter_entities_threshold()
        tweetobj.set_entities(ee.entities)

    # write to file
    outtweets = [tweetobj.return_dict() for tweetobj in tweets]
    with open(self.out_entity().path, 'w', encoding='utf-8') as file_out:
        json.dump(outtweets, file_out)

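# --- Illustration (not part of the task above) ---
# ee.filter_entities_threshold() is used as a black box above. A hypothetical sketch of the
# idea, assuming candidate entities carry a commonness score and candidates below a fixed
# threshold are discarded; the actual class may score and prune differently.
def filter_entities_threshold_sketch(scored_entities, threshold=0.05):
    # scored_entities: list of (entity_string, commonness_score) tuples
    return [(entity, score) for entity, score in scored_entities if score >= threshold]
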
def run(self):
    first_event_date_dt = datetime.datetime(int(self.first_event_date[:4]), int(self.first_event_date[4:6]), int(self.first_event_date[6:]))
    last_event_date_dt = datetime.datetime(int(self.last_event_date[:4]), int(self.last_event_date[4:6]), int(self.last_event_date[6:]))

    # read in burstiness
    print('Reading in bursty entities')
    with open(self.in_entity_burstiness().path, 'r', encoding='utf-8') as file_in:
        bursty_entities = file_in.read().strip().split('\n')
    set_bursty_entities = set(bursty_entities)

    # read in events
    term_events = defaultdict(list)
    print('Reading in events')
    extended_events = []
    with open(self.in_events().path, 'r', encoding='utf-8') as file_in:
        eventdicts = json.loads(file_in.read())
    for i, ed in enumerate(eventdicts):
        eventobj = event.Event()
        eventobj.import_eventdict(ed)
        if eventobj.datetime >= first_event_date_dt - datetime.timedelta(days=100) and eventobj.datetime <= last_event_date_dt + datetime.timedelta(days=100):
            extended_events.append(eventobj)
            if len(list(set_bursty_entities & set(list(ed['entities'])))) > 0:
                for term in list(set_bursty_entities & set(list(ed['entities']))):
                    term_events[term].append(eventobj)

    # for each entity, mark the dates around each of its events
    print('Saving event tweets dates by entity')
    date_entity_events = defaultdict(lambda: defaultdict(list))
    for entity in bursty_entities:
        # for each event
        if len(term_events[entity]) == 0:
            continue
        for i, ev in enumerate(term_events[entity]):
            if ev.datetime >= first_event_date_dt and ev.datetime <= last_event_date_dt:
                # days to look back: half the gap to the previous event, or 100 when the gap is under 3 or over 199 days
                if i == 0:
                    minus = 100
                else:
                    minus = 100 if ((ev.datetime - term_events[entity][i - 1].datetime).days > 199 or (ev.datetime - term_events[entity][i - 1].datetime).days < 3) else ((ev.datetime - term_events[entity][i - 1].datetime).days) / 2
                # days to look ahead: half the gap to the next event, or 100 when the gap is under 3 or over 199 days
                if i == len(term_events[entity]) - 1:
                    plus = 100
                else:
                    plus = 100 if ((term_events[entity][i + 1].datetime - ev.datetime).days > 199 or (term_events[entity][i + 1].datetime - ev.datetime).days < 3) else ((term_events[entity][i + 1].datetime - ev.datetime).days) / 2
                first = ev.datetime - datetime.timedelta(days=minus)
                last = ev.datetime + datetime.timedelta(days=plus)
                cursor = first
                while cursor <= last:
                    date_str = ''.join(str(cursor).split()[0].split('-'))
                    date_entity_events[date_str][entity].append(ev)
                    cursor += datetime.timedelta(days=1)

    # read in tweets
    print('Collecting additional tweets')
    dates = list(date_entity_events.keys())
    months = list(set([d[:6] for d in dates]))
    tweetsubdirs = sorted([subdir for subdir in glob.glob(self.in_tweetdir().path + '/*')])
    entity_tweets = defaultdict(list)
    first = True
    for tweetsubdir in tweetsubdirs:
        subdirstr = tweetsubdir.split('/')[-1]
        if subdirstr in months:
            # go through all tweet files
            tweetfiles = [tweetfile for tweetfile in glob.glob(tweetsubdir + '/*.entity.json')]
            for tweetfile in tweetfiles:
                print(tweetfile)
                datestr = tweetfile.split('/')[-1].split('.')[0].split('-')[0]
                if datestr in dates:
                    if first:
                        candidate_entities = list(date_entity_events[datestr].keys())
                        set_candidate_entities = set(candidate_entities)
                        cursordate = datestr
                        first = False
                    elif datestr != cursordate:
                        # attach the tweets collected for cursordate to the events linked to that date
                        for entity in candidate_entities:
                            for ev in date_entity_events[cursordate][entity]:
                                ev.add_tweets(entity_tweets[entity])
                        cursordate = datestr
                        candidate_entities = list(date_entity_events[datestr].keys())
                        set_candidate_entities = set(candidate_entities)
                        entity_tweets = defaultdict(list)
                    # read in tweets
                    with open(tweetfile, 'r', encoding='utf-8') as file_in:
                        tweetdicts = json.loads(file_in.read())
                    for td in tweetdicts:
                        if len(list(set_candidate_entities & set(list(td['entities'].keys())))) > 0:
                            tweetobj = tweet.Tweet()
                            tweetobj.import_tweetdict(td)
                            for term in list(set_candidate_entities & set(list(td['entities'].keys()))):
                                entity_tweets[term].append(tweetobj)

    # write to file
    print('Writing new events')
    out_extended_events = [ev.return_dict() for ev in extended_events]
    with open(self.out_more_tweets().path, 'w', encoding='utf-8') as file_out:
        json.dump(out_extended_events, file_out)

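# --- Note on the date-string conversion above (illustration, not part of the task) ---
# ''.join(str(cursor).split()[0].split('-')) turns a datetime into a 'yyyymmdd' string via
# its text representation; strftime expresses the same conversion directly.
import datetime

def yyyymmdd_sketch(dt):
    # equivalent to ''.join(str(dt).split()[0].split('-')) for date and datetime objects
    return dt.strftime('%Y%m%d')

# yyyymmdd_sketch(datetime.datetime(2015, 3, 7)) == '20150307'
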
def run(self):
    # create directory
    self.setup_output_dir(self.out_eventdir().path)

    # set dates
    end_date_year = self.end_date[:4]
    end_date_month = self.end_date[4:6]
    end_date_day = self.end_date[6:]
    last_date = datetime.date(int(end_date_year), int(end_date_month), int(end_date_day))
    start_date_year = self.start_date[:4]
    start_date_month = self.start_date[4:6]
    start_date_day = self.start_date[6:]
    first_date = datetime.date(int(start_date_year), int(start_date_month), int(start_date_day))

    print('Reading in citylist')
    # read in citylist
    with open(self.citylist, 'r', encoding='utf-8') as file_in:
        citylist = [line.strip() for line in file_in.read().strip().split('\n')]

    # perform event extraction on first tweet window
    print('Reading in first window of tweets')
    cursor_date = first_date
    date_tweetobjects = defaultdict(list)
    date_tweetcounts = defaultdict(int)
    window_dates = []
    while cursor_date < first_date + datetime.timedelta(days=self.window_size):
        print(cursor_date)
        tweetfiles = [filename for filename in glob.glob(self.in_tweetdir().path + '/' + helpers.return_timeobj_date(cursor_date) + '*')]
        for tweetfile in tweetfiles:
            # read in tweets
            with open(tweetfile, 'r', encoding='utf-8') as file_in:
                tweetdicts = json.loads(file_in.read())
            date_tweetcounts[cursor_date] += len(tweetdicts)
            for td in tweetdicts:
                if not (td['refdates'] == {} and td['entities'] == {}):
                    tweetobj = tweet.Tweet()
                    tweetobj.import_tweetdict(td)
                    date_tweetobjects[cursor_date].append(tweetobj)
        window_dates.append(cursor_date)
        cursor_date += datetime.timedelta(days=1)

    # start event extraction
    print('Loading tweets into event extractor')
    er = event_ranker.EventRanker()
    for date in window_dates:
        for tweetobject in date_tweetobjects[date]:
            er.add_tweet(tweetobject)
        er.tweet_count += date_tweetcounts[date]
    print('Performing event extraction')
    er.extract_events(self.minimum_event_mentions, self.cut_off)
    ef = event_filter.EventFilter()
    ef.add_events(er.events)
    ef.apply_filter(citylist)
    events_filtered = ef.return_events()
    print('Done. Extracted', len(events_filtered), 'events')

    # write to file
    outevents = [ev.return_dict() for ev in events_filtered]
    with open(self.out_eventdir().path + '/' + str(cursor_date - datetime.timedelta(days=1)).replace('-', '') + '.events', 'w', encoding='utf-8') as file_out:
        json.dump(outevents, file_out)

    # slide window forward and perform event extraction until last date
    print('Starting slider')
    window_tail = first_date
    window_head = cursor_date - datetime.timedelta(days=1)
    while window_head <= last_date:
        print('Discarding and collecting tweets')
        end_slider = window_head + datetime.timedelta(days=self.slider)
        while window_head < end_slider:
            # remove tweets of tail
            print('Deleting records for old tail', window_tail)
            del date_tweetobjects[window_tail]
            del date_tweetcounts[window_tail]
            window_tail = window_tail + datetime.timedelta(days=1)
            window_head = window_head + datetime.timedelta(days=1)
            print('Collecting tweets for new head', window_head)
            tweetfiles = [filename for filename in glob.glob(self.in_tweetdir().path + '/' + helpers.return_timeobj_date(window_head) + '*')]
            for tweetfile in tweetfiles:
                # read in tweets
                with open(tweetfile, 'r', encoding='utf-8') as file_in:
                    tweetdicts = json.loads(file_in.read())
                date_tweetcounts[window_head] += len(tweetdicts)
                for td in tweetdicts:
                    if not (td['refdates'] == {} and td['entities'] == {}):
                        tweetobj = tweet.Tweet()
                        tweetobj.import_tweetdict(td)
                        date_tweetobjects[window_head].append(tweetobj)

        # add tweets to event ranker
        print('Loading tweets into event extractor for', window_tail, '-', window_head)
        er = event_ranker.EventRanker()
        window_dates = helpers.return_daterange(window_tail, self.window_size)
        for date in window_dates:
            for tweetobject in date_tweetobjects[date]:
                er.add_tweet(tweetobject)
            er.tweet_count += date_tweetcounts[date]
        print('Performing event extraction')
        er.extract_events(self.minimum_event_mentions, self.cut_off)
        ef = event_filter.EventFilter()
        ef.add_events(er.events)
        ef.apply_filter(citylist)
        events_filtered = ef.return_events()
        print('Done. Extracted', len(events_filtered), 'events')

        # write to file
        outevents = [ev.return_dict() for ev in events_filtered]
        with open(self.out_eventdir().path + '/' + str(window_head).replace('-', '') + '.events', 'w', encoding='utf-8') as file_out:
            json.dump(outevents, file_out)

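# --- Illustration (not part of the tasks above) ---
# helpers.return_timeobj_date and helpers.return_daterange are used as black boxes above.
# Hypothetical sketches, assuming the former maps a date to the 'yyyymm/yyyymmdd' prefix
# under which its tweet files are stored and the latter returns `length` consecutive dates
# starting at `start`; the actual helpers may differ.
import datetime

def return_timeobj_date_sketch(date_obj):
    return date_obj.strftime('%Y%m') + '/' + date_obj.strftime('%Y%m%d')

def return_daterange_sketch(start, length):
    return [start + datetime.timedelta(days=i) for i in range(length)]
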