Example 1
 def run(self):
     # read in gzipped tweet file
     good_format = re.compile(r'}$')
     tweets = []
     for line in io.TextIOWrapper(io.BufferedReader(gzip.open(self.in_tweets().path)), encoding='utf-8', errors='ignore'):
         if self.format_json:
             try:
                 tweets.append(json.loads(line.strip()))
             except json.JSONDecodeError:
                 print('Error loading json, skipping to next line')
         else:
             tweets.append(line.strip())
     print(self.in_tweets().path,'contains',len(tweets),'tweets before filtering')
     tf = tweetfilter.Tweetfilter(tweets)
     tf.discard_retweets()
     print('after retweet filter',len(tf.tweets))
     tf.discard_nondutch()
     filtered_tweets = tf.return_tweets()
     print('after filtering:',len(filtered_tweets))
     # write filtered tweets
     outtweets = []
     for filtered_tweet in filtered_tweets:
         tweetobj = tweet.Tweet()
         tweetobj.import_twiqsdict(filtered_tweet)
         outtweets.append(tweetobj.return_dict())
     # write to file
     with open(self.out_filtered().path,'w',encoding='utf-8') as outfile:
         json.dump(outtweets,outfile)
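
The io.TextIOWrapper(io.BufferedReader(gzip.open(...))) chain above can be written more compactly by opening the gzip file in text mode. A minimal sketch of the same read loop, using a hypothetical path variable in place of self.in_tweets().path:

    import gzip
    import json

    path = 'tweets.json.gz'  # hypothetical input file, stands in for self.in_tweets().path

    tweets = []
    with gzip.open(path, 'rt', encoding='utf-8', errors='ignore') as infile:
        for line in infile:
            try:
                tweets.append(json.loads(line.strip()))
            except json.JSONDecodeError:
                print('Error loading json, skipping to next line')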
Example 2
    def run(self):

        # if directory does not exist, create directory
        if not os.path.isdir(self.out_eventdir().path):
            self.setup_output_dir(self.out_eventdir().path)

        # collect tweet files
        end_date_year = self.end_date[:4]
        end_date_month = self.end_date[4:6]
        end_date_day = self.end_date[6:]
        last_date = datetime.date(int(end_date_year), int(end_date_month),
                                  int(end_date_day))
        first_date = last_date - datetime.timedelta(days=self.window_size - 1)
        last_tweetfile = (self.in_tweetdir().path + '/' + end_date_year +
                          end_date_month + '/' + end_date_year + end_date_month +
                          end_date_day + '-23.out.dateref.cityref.entity.json')
        days_tweetfiles = helpers.return_tweetfiles_window(
            last_tweetfile, self.window_size - 1)
        tweetfiles = []
        for day in days_tweetfiles:
            tweetfiles.extend([
                filename for filename in glob.glob(self.in_tweetdir().path +
                                                   '/' + day + '*')
            ])

        print('Reading in citylist')
        # read in citylist
        with open(self.citylist, 'r', encoding='utf-8') as file_in:
            citylist = [
                line.strip() for line in file_in.read().strip().split('\n')
            ]

        # extract events
        er = event_ranker.EventRanker()
        # read in tweets
        print('Reading in tweets')
        for tweetfile in tweetfiles:
            date = helpers.return_date_entitytweetfile(tweetfile)
            with open(tweetfile, 'r', encoding='utf-8') as file_in:
                tweetdicts = json.loads(file_in.read())
            # format as tweet objects
            for td in tweetdicts:
                if not (td['refdates'] == {} and td['entities'] == {}):
                    tweetobj = tweet.Tweet()
                    tweetobj.import_tweetdict(td)
                    er.add_tweet(tweetobj)
                er.tweet_count += 1

        # extract events
        print('Performing event extraction')
        er.extract_events(self.minimum_event_mentions, self.cut_off)
        filter = event_filter.EventFilter()
        filter.add_events(er.events)
        filter.apply_filter(citylist)
        events_filtered = filter.return_events()
        print('Done. Extracted', len(events_filtered), 'events')

        # write to file
        outevents = [event.return_dict() for event in events_filtered]
        with open(self.out_events().path, 'w', encoding='utf-8') as file_out:
            json.dump(outevents, file_out)
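
helpers.return_tweetfiles_window is used above to turn the last tweet file into the set of daily file prefixes covered by the window. The exact behaviour of that helper is not shown here; as a rough illustration only, a window of 'YYYYMM/YYYYMMDD' prefixes could be generated like this (date_prefixes is a hypothetical stand-in, not the project's helper):

    import datetime

    def date_prefixes(end_date, window_size):
        # Build 'YYYYMM/YYYYMMDD' path prefixes for each day of the window ending on end_date.
        last = datetime.date(int(end_date[:4]), int(end_date[4:6]), int(end_date[6:]))
        days = [last - datetime.timedelta(days=i) for i in range(window_size)]
        return [d.strftime('%Y%m/%Y%m%d') for d in sorted(days)]

    print(date_prefixes('20161231', 3))
    # ['201612/20161229', '201612/20161230', '201612/20161231']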
Example 3
    def run(self):

        # read in citylist
        with open(self.citylist, 'r', encoding='utf-8') as file_in:
            citylist = [
                line.strip() for line in file_in.read().strip().split('\n')
            ]

        # read in tweets
        with open(self.in_dateref().path, 'r', encoding='utf-8') as file_in:
            tweetdicts = json.loads(file_in.read())

        # format as tweet objects
        tweets = []
        for td in tweetdicts:
            tweetobj = tweet.Tweet()
            tweetobj.import_tweetdict(td)
            tweets.append(tweetobj)

        # extract location
        for tweetobj in tweets:
            # remove already extracted time from the tweet, forming it into chunks
            datestrings = [sr[0] for sr in tweetobj.string_refdates]
            tweet_chunks = helpers.remove_pattern_from_string(
                tweetobj.text, datestrings)
            # extract city from chunks
            ce = cityref_extractor.CityrefExtractor(citylist)
            for chunk in tweet_chunks:
                ce.find_cityrefs(chunk)
            tweetobj.set_cityrefs(ce.return_cityrefs())

        # write to file
        outtweets = [tweet.return_dict() for tweet in tweets]
        with open(self.out_cityref().path, 'w', encoding='utf-8') as file_out:
            json.dump(outtweets, file_out)
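
helpers.remove_pattern_from_string is used here to cut the already-extracted date strings out of the tweet text, so that only the remaining chunks are scanned for city references. The helper itself is not shown; a rough, hypothetical illustration of the idea (the real implementation may differ in tokenisation and edge cases):

    import re

    def remove_patterns(text, patterns):
        # Split the text on each extracted substring and keep the non-empty remainders.
        if not patterns:
            return [text]
        splitter = re.compile('|'.join(re.escape(p) for p in patterns))
        return [chunk.strip() for chunk in splitter.split(text) if chunk.strip()]

    print(remove_patterns('concert in Amsterdam on 3 december', ['3 december', 'Amsterdam']))
    # ['concert in', 'on']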
Example 4
    def run(self):

        # read in tweets
        with open(self.in_tokenized().path, 'r', encoding='utf-8') as file_in:
            tweetdicts = json.loads(file_in.read())

        # format as tweet objects
        tweets = []
        for td in tweetdicts:
            tweetobj = tweet.Tweet()
            tweetobj.import_tweetdict(td)
            tweets.append(tweetobj)

        # extract daterefs
        for tweetobj in tweets:
            dte = dutch_timex_extractor.Dutch_timex_extractor(
                tweetobj.text, tweetobj.datetime)
            dte.extract_refdates(self.skip_datematch, self.skip_monthmatch,
                                 self.skip_timeunitmatch, self.skip_daymatch)
            dte.filter_future_refdates()
            tweetobj.set_refdates(dte.refdates)

        # write to file
        outtweets = [tweet.return_dict() for tweet in tweets]
        with open(self.out_dateref().path, 'w', encoding='utf-8') as file_out:
            json.dump(outtweets, file_out)
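
Each of these tasks reads its intermediate file with json.loads(file_in.read()) and writes the result back with json.dump. json.load(file_in) parses the file object directly and is equivalent; a minimal sketch with hypothetical file names:

    import json

    # Hypothetical intermediate files; the tasks above use self.in_*()/self.out_*() targets instead.
    with open('tweets.dateref.json', 'r', encoding='utf-8') as file_in:
        tweetdicts = json.load(file_in)

    with open('tweets.dateref.out.json', 'w', encoding='utf-8') as file_out:
        json.dump(tweetdicts, file_out)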
Example 5
    def run(self):

        # set commonness object
        cs = commonness.Commonness()
        cs.set_classencoder(self.commonness_txt, self.commonness_cls,
                            self.commonness_corpus)
        cs.set_dmodel(self.ngrams_score)

        # read in tweets
        with open(self.in_cityref().path, 'r', encoding='utf-8') as file_in:
            tweetdicts = json.loads(file_in.read())

        # format as tweet objects
        tweets = []
        for td in tweetdicts:
            tweetobj = tweet.Tweet()
            tweetobj.import_tweetdict(td)
            tweets.append(tweetobj)

        # extract entities
        for tweetobj in tweets:
            # remove already extracted time and locations from the tweet, forming it into chunks
            datestrings = [sr[0] for sr in tweetobj.string_refdates]
            cities = tweetobj.cityrefs
            tweet_chunks = helpers.remove_pattern_from_string(
                tweetobj.text, datestrings + cities)
            # find entities in every chunk
            ee = entity_extractor.EntityExtractor()
            ee.set_commonness(cs)
            for chunk in tweet_chunks:
                tokens = chunk.split()
                ee.extract_entities(tokens)
                ee.filter_entities_threshold()
            tweetobj.set_entities(ee.entities)

        # write to file
        outtweets = [tweet.return_dict() for tweet in tweets]
        with open(self.out_entity().path, 'w', encoding='utf-8') as file_out:
            json.dump(outtweets, file_out)
Example 6
    def run(self):

        first_event_date_dt = datetime.datetime(
            int(self.first_event_date[:4]), int(self.first_event_date[4:6]),
            int(self.first_event_date[6:]))
        last_event_date_dt = datetime.datetime(
            int(self.last_event_date[:4]), int(self.last_event_date[4:6]),
            int(self.last_event_date[6:]))
        
        # read in burstiness
        print('Reading in bursty entities')
        with open(self.in_entity_burstiness().path,'r',encoding='utf-8') as file_in:
            bursty_entities = file_in.read().strip().split('\n')
        set_bursty_entities = set(bursty_entities)

        # read in events
        term_events = defaultdict(list)
        print('Reading in events')
        extended_events = []
        with open(self.in_events().path, 'r', encoding = 'utf-8') as file_in:
            eventdicts = json.loads(file_in.read())
            for i,ed in enumerate(eventdicts):
                eventobj = event.Event()
                eventobj.import_eventdict(ed)
                if (eventobj.datetime >= first_event_date_dt - datetime.timedelta(days=100)
                        and eventobj.datetime <= last_event_date_dt + datetime.timedelta(days=100)):
                    extended_events.append(eventobj)
                    for term in set_bursty_entities & set(ed['entities']):
                        term_events[term].append(eventobj)

        # for each entity
        print('Saving event tweets dates by entity')
        date_entity_events = defaultdict(lambda : defaultdict(list))
        for entity in bursty_entities:
            # for each event
            if len(term_events[entity]) == 0:
                continue
            else:
                for i,ev in enumerate(term_events[entity]):
                    if ev.datetime >= first_event_date_dt and ev.datetime <= last_event_date_dt:
                        # span half the gap to the neighbouring events, capped at 100 days
                        if i == 0:
                            minus = 100
                        else:
                            gap_before = (ev.datetime - term_events[entity][i - 1].datetime).days
                            minus = 100 if (gap_before > 199 or gap_before < 3) else gap_before / 2
                        if i == len(term_events[entity]) - 1:
                            plus = 100
                        else:
                            gap_after = (term_events[entity][i + 1].datetime - ev.datetime).days
                            plus = 100 if (gap_after > 199 or gap_after < 3) else gap_after / 2
                        first = ev.datetime - datetime.timedelta(days=minus)
                        last = ev.datetime + datetime.timedelta(days=plus)
                        cursor = first
                        while cursor <= last:
                            date_str = ''.join(str(cursor).split()[0].split('-'))
                            date_entity_events[date_str][entity].append(ev)
                            cursor += datetime.timedelta(days=1)

        # read in tweets
        print('Collecting additional tweets')
        dates = list(date_entity_events.keys())
        months = list(set([d[:6] for d in dates]))
        tweetsubdirs = sorted([ subdir for subdir in glob.glob(self.in_tweetdir().path + '/*') ])
        entity_tweets = defaultdict(list)
        first = True
        for tweetsubdir in tweetsubdirs:
            subdirstr = tweetsubdir.split('/')[-1]
            if subdirstr in months:
                # go through all tweet files
                tweetfiles = [ tweetfile for tweetfile in glob.glob(tweetsubdir + '/*.entity.json') ]
                for tweetfile in tweetfiles:
                    print(tweetfile)
                    datestr = tweetfile.split('/')[-1].split('.')[0].split('-')[0]
                    if datestr in dates:
                        if first:
                            candidate_entities = list(date_entity_events[datestr].keys())
                            set_candidate_entities = set(candidate_entities)
                            cursordate = datestr
                            first = False
                        elif datestr != cursordate:
                            # add tweets
                            for entity in candidate_entities:
                                for ev in date_entity_events[datestr][entity]:
                                    ev.add_tweets(entity_tweets[entity])
                            cursordate = datestr
                            candidate_entities = list(date_entity_events[datestr].keys())
                            set_candidate_entities = set(candidate_entities)
                            entity_tweets = defaultdict(list)
                        # read in tweets
                        with open(tweetfile, 'r', encoding = 'utf-8') as file_in:
                            tweetdicts = json.loads(file_in.read())
                        for td in tweetdicts:
                            overlapping_entities = set_candidate_entities & set(td['entities'].keys())
                            if overlapping_entities:
                                tweetobj = tweet.Tweet()
                                tweetobj.import_tweetdict(td)
                                for term in overlapping_entities:
                                    entity_tweets[term].append(tweetobj)

        # write to file
        print('Writing new events')
        out_extended_events = [ev.return_dict() for ev in extended_events]
        with open(self.out_more_tweets().path,'w',encoding='utf-8') as file_out:
            json.dump(out_extended_events,file_out)
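
date_entity_events above is a nested defaultdict: the first level is keyed by date string, the second by entity, and missing keys are created on demand. A small self-contained sketch of how that structure behaves:

    from collections import defaultdict

    date_entity_events = defaultdict(lambda: defaultdict(list))

    # Appending creates the inner dict and list automatically.
    date_entity_events['20161231']['amsterdam'].append('event-1')

    print(list(date_entity_events.keys()))              # ['20161231']
    print(date_entity_events['20161231']['amsterdam'])  # ['event-1']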
Example 7
    def run(self):

        # create directory
        self.setup_output_dir(self.out_eventdir().path)

        # set dates
        end_date_year = self.end_date[:4]
        end_date_month = self.end_date[4:6]
        end_date_day = self.end_date[6:]
        last_date = datetime.date(int(end_date_year), int(end_date_month),
                                  int(end_date_day))
        start_date_year = self.start_date[:4]
        start_date_month = self.start_date[4:6]
        start_date_day = self.start_date[6:]
        first_date = datetime.date(int(start_date_year), int(start_date_month),
                                   int(start_date_day))

        print('Reading in citylist')
        # read in citylist
        with open(self.citylist, 'r', encoding='utf-8') as file_in:
            citylist = [
                line.strip() for line in file_in.read().strip().split('\n')
            ]

        # perform event extraction on first tweet window
        print('Reading in first window of tweets')
        cursor_date = first_date
        date_tweetobjects = defaultdict(list)
        date_tweetcounts = defaultdict(int)
        window_dates = []
        while cursor_date < first_date + datetime.timedelta(
                days=self.window_size):
            print(cursor_date)
            tweetfiles = [
                filename for filename in
                glob.glob(self.in_tweetdir().path + '/' +
                          helpers.return_timeobj_date(cursor_date) + '*')
            ]
            for tweetfile in tweetfiles:
                # read in tweets
                with open(tweetfile, 'r', encoding='utf-8') as file_in:
                    tweetdicts = json.loads(file_in.read())
                date_tweetcounts[cursor_date] += len(tweetdicts)
                for td in tweetdicts:
                    if not (td['refdates'] == {} and td['entities'] == {}):
                        tweetobj = tweet.Tweet()
                        tweetobj.import_tweetdict(td)
                        date_tweetobjects[cursor_date].append(tweetobj)
            window_dates.append(cursor_date)
            cursor_date += datetime.timedelta(days=1)
        # start event extraction
        print('Loading tweets into event extractor')
        er = event_ranker.EventRanker()
        for date in window_dates:
            for tweetobject in date_tweetobjects[date]:
                er.add_tweet(tweetobject)
            er.tweet_count += date_tweetcounts[date]
        print('Performing event extraction')
        er.extract_events(self.minimum_event_mentions, self.cut_off)
        filter = event_filter.EventFilter()
        filter.add_events(er.events)
        filter.apply_filter(citylist)
        events_filtered = filter.return_events()
        print('Done. Extracted', len(events_filtered), 'events')
        # write to file
        outevents = [event.return_dict() for event in events_filtered]
        with open(self.out_eventdir().path + '/' +
                  str(cursor_date - datetime.timedelta(days=1)).replace(
                      '-', '') + '.events',
                  'w',
                  encoding='utf-8') as file_out:
            json.dump(outevents, file_out)

        # slide window forward and perform event extraction until last date
        print('Starting slider')
        window_tail = first_date
        window_head = cursor_date - datetime.timedelta(days=1)
        while window_head <= last_date:
            print('Discarding and collecting tweets')
            end_slider = window_head + datetime.timedelta(days=self.slider)
            while window_head < end_slider:
                # remove tweets of tail
                print('Deleting records for old tail', window_tail)
                del date_tweetobjects[window_tail]
                del date_tweetcounts[window_tail]
                window_tail = window_tail + datetime.timedelta(days=1)
                window_head = window_head + datetime.timedelta(days=1)
                print('Collecting tweets for new head', window_head)
                tweetfiles = [
                    filename for filename in
                    glob.glob(self.in_tweetdir().path + '/' +
                              helpers.return_timeobj_date(window_head) + '*')
                ]
                for tweetfile in tweetfiles:
                    # read in tweets
                    with open(tweetfile, 'r', encoding='utf-8') as file_in:
                        tweetdicts = json.loads(file_in.read())
                    date_tweetcounts[window_head] += len(tweetdicts)
                    for td in tweetdicts:
                        if not (td['refdates'] == {} and td['entities'] == {}):
                            tweetobj = tweet.Tweet()
                            tweetobj.import_tweetdict(td)
                            date_tweetobjects[window_head].append(tweetobj)
            # add tweets to event ranker
            print('Loading tweets into event extractor for', window_tail, '-',
                  window_head)
            er = event_ranker.EventRanker()
            window_dates = helpers.return_daterange(window_tail,
                                                    self.window_size)
            for date in window_dates:
                for tweetobject in date_tweetobjects[date]:
                    er.add_tweet(tweetobject)
                er.tweet_count += date_tweetcounts[date]
            print('Performing event extraction')
            er.extract_events(self.minimum_event_mentions, self.cut_off)
            filter = event_filter.EventFilter()
            filter.add_events(er.events)
            filter.apply_filter(citylist)
            events_filtered = filter.return_events()
            print('Done. Extracted', len(events_filtered), 'events')
            # write to file
            outevents = [event.return_dict() for event in events_filtered]
            with open(self.out_eventdir().path + '/' +
                      str(window_head).replace('-', '') + '.events',
                      'w',
                      encoding='utf-8') as file_out:
                json.dump(outevents, file_out)
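
The output file names above are built by stripping the dashes from str(window_head); datetime's strftime('%Y%m%d') produces the same compact date string. A small sketch with a hypothetical event directory standing in for self.out_eventdir().path:

    import datetime

    window_head = datetime.date(2016, 12, 31)

    # Both expressions yield the 'YYYYMMDD' form used in the file names above.
    assert str(window_head).replace('-', '') == window_head.strftime('%Y%m%d')

    eventdir = '/path/to/eventdir'  # hypothetical
    outfile = eventdir + '/' + window_head.strftime('%Y%m%d') + '.events'
    print(outfile)  # /path/to/eventdir/20161231.events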