Example #1
    def run(self):

        # read in new events
        with open(self.in_events().path, 'r', encoding='utf-8') as file_in:
            new_eventdicts = json.loads(file_in.read())
        new_event_objs = []
        for ed in new_eventdicts:
            eventobj = event.Event()
            eventobj.import_eventdict(ed, txt=False)
            new_event_objs.append(eventobj)
        earliest_date = min([event.datetime for event in new_event_objs])

        # read in current events
        with open(self.current_events, 'r', encoding='utf-8') as file_in:
            current_eventdicts = json.loads(file_in.read())
        current_event_objs = []
        current_event_objs_candidates = []
        for ed in current_eventdicts:
            eventobj = event.Event()
            eventobj.import_eventdict(ed, txt=False)
            if eventobj.datetime >= earliest_date:
                current_event_objs_candidates.append(eventobj)
            else:
                current_event_objs.append(eventobj)

        # initialize event merger
        merger = event_merger.EventMerger()
        merger.add_events(current_event_objs_candidates)

        # merge before integration
        print(
            'Merging new events before integration; number of events at start:',
            len(new_event_objs))
        overlap_threshold = float(self.overlap_threshold)
        premerger = event_merger.EventMerger()
        premerger.add_events(new_event_objs)
        premerger.find_merges(overlap_threshold)
        new_events_merged = premerger.return_events()
        print('Done. New events after merge:', len(new_events_merged))

        # integrate each event into the current ones
        print('Starting integrating new events; number of current events:',
              len(current_event_objs))
        for new_event in new_events_merged:
            merger.find_merge(new_event, overlap_threshold)

        # write merged
        integrated_events = merger.return_events() + current_event_objs
        print('Done. Number of events after integration:',
              len(integrated_events))
        out_integrated_events = [
            event.return_dict(txt=False) for event in integrated_events
        ]
        with open(self.out_integrated_events().path, 'w',
                  encoding='utf-8') as file_out:
            json.dump(out_integrated_events, file_out)
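The event_merger internals are not shown in these tasks, only calls to add_events, find_merges, and find_merge with an overlap threshold. As a rough, hypothetical sketch of that kind of threshold-based merging, the snippet below folds events together when the Jaccard overlap of their entity sets reaches the threshold; the dict fields and the merge_by_overlap helper are illustrative stand-ins, not the actual event.Event or event_merger API.

    # Hypothetical sketch of threshold-based merging, not the event_merger API.
    def jaccard(a, b):
        union = a | b
        return len(a & b) / len(union) if union else 0.0

    def merge_by_overlap(events, threshold):
        merged = []
        for ev in events:  # each event is assumed to carry a set of entities
            for existing in merged:
                if jaccard(ev['entities'], existing['entities']) >= threshold:
                    existing['entities'] |= ev['entities']
                    existing['tweets'] += ev['tweets']
                    break
            else:
                merged.append(ev)
        return merged

    events = [
        {'entities': {'storm', 'utrecht'}, 'tweets': ['t1']},
        {'entities': {'storm', 'utrecht', 'wind'}, 'tweets': ['t2']},
        {'entities': {'concert'}, 'tweets': ['t3']},
    ]
    print(len(merge_by_overlap(events, 0.5)))  # -> 2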
Example #2
    def run(self):

        # read in events
        print('Reading in events')
        with open(self.in_deduplicated_events().path, 'r',
                  encoding='utf-8') as file_in:
            eventdicts = json.loads(file_in.read())
        event_objs = []
        for ed in eventdicts:
            eventobj = event.Event()
            eventobj.import_eventdict(ed)
            event_objs.append(eventobj)

        # initialize event enhancer
        print('Enhancing events')
        enhancer = event_enhancer.EventEnhancer()
        enhancer.set_events(event_objs)
        enhancer.enhance()
        enhanced_events = enhancer.return_events()

        # write enhanced
        out_enhanced_events = [
            event.return_dict() for event in enhanced_events
        ]
        with open(self.out_enhanced_events().path, 'w',
                  encoding='utf-8') as file_out:
            json.dump(out_enhanced_events, file_out)
Example #3
    def run(self):

        # read in events
        with open(self.in_enhanced_events().path, 'r',
                  encoding='utf-8') as file_in:
            eventdicts = json.loads(file_in.read())
        event_objs = []
        for ed in eventdicts:
            eventobj = event.Event()
            eventobj.import_eventdict(ed)
            event_objs.append(eventobj)

        # initialize event merger
        print('Merging; number of events at start:', len(event_objs))
        overlap_threshold = float(self.overlap_threshold)
        merger = event_merger.EventMerger()
        merger.add_events(event_objs)
        merger.find_merges(overlap_threshold)
        events_merged = merger.return_events()
        print('Merging again; current number of events:', len(events_merged))
        merger2 = event_merger.EventMerger()
        merger2.add_events(events_merged)
        merger2.find_merges(overlap_threshold)
        events_merged_final = merger2.return_events()
        print('Done. number of events after merge:', len(events_merged_final))

        # write merged
        out_merged_events = [
            event.return_dict(txt=False) for event in events_merged_final
        ]
        with open(self.out_merged_events().path, 'w',
                  encoding='utf-8') as file_out:
            json.dump(out_merged_events, file_out)
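Example #3 runs two fixed merge passes with the same threshold. A natural generalization, sketched below under the same assumptions as the snippet after Example #1, is to repeat passes until the event count stops shrinking; merge_pass stands in for whatever single-pass merge routine is used.

    def merge_until_stable(events, threshold, merge_pass, max_passes=10):
        # repeat single-pass merges until no further reduction (or a pass limit is hit)
        for _ in range(max_passes):
            merged = merge_pass(events, threshold)
            if len(merged) == len(events):
                break
            events = merged
        return events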
Example #4
    def run(self):

        # read in events
        print('Reading in events')
        with open(self.in_merged_events().path, 'r',
                  encoding='utf-8') as file_in:
            eventdicts = json.loads(file_in.read())
        event_objs = []
        for ed in eventdicts:
            eventobj = event.Event()
            eventobj.import_eventdict(ed, txt=False)
            event_objs.append(eventobj)

        # read in citylist
        print('Reading in citylist')
        with open(self.citylist, 'r', encoding='utf-8') as file_in:
            citylist = [
                line.strip() for line in file_in.read().strip().split('\n')
            ]

        # initialize event filter
        print('Filtering; number of events at start:', len(event_objs))
        eventfilter = event_filter.EventFilter()
        eventfilter.add_events(event_objs)
        eventfilter.apply_filter(citylist)
        events_filtered = eventfilter.return_events()
        print('Done. number of events after filter:', len(events_filtered))

        # write filter
        out_filtered_events = [
            event.return_dict(txt=False) for event in events_filtered
        ]
        with open(self.out_filtered_events().path, 'w',
                  encoding='utf-8') as file_out:
            json.dump(out_filtered_events, file_out)
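The event_filter criteria are not visible in Example #4 beyond apply_filter(citylist). As a minimal stand-in, assuming each event carries a location string (the location field is hypothetical), a city-list filter could look like this:

    def filter_by_city(events, citylist):
        # keep events whose lower-cased location appears in the city list
        cities = {city.lower() for city in citylist}
        return [ev for ev in events if ev.get('location', '').lower() in cities]

    events = [{'location': 'Amsterdam'}, {'location': 'Antwerpen'}]
    print(filter_by_city(events, ['amsterdam', 'rotterdam']))  # -> [{'location': 'Amsterdam'}]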
Example #5
    def run(self):

        # read events
        archive_events = []
        active_events = []
        date = datetime.datetime(int(self.archivedate[:4]),
                                 int(self.archivedate[4:6]),
                                 int(self.archivedate[6:8]))
        print('Reading events')
        with open(self.in_events().path, 'r', encoding='utf-8') as file_in:
            eventdicts = json.loads(file_in.read())
            for i, ed in enumerate(eventdicts):
                eventobj = event.Event()
                eventobj.import_eventdict(ed)
                if eventobj.datetime == date:
                    archive_events.append(eventobj)
                else:
                    active_events.append(eventobj)

        # write archive
        print('Writing archive')
        out_archive_events = [
            event.return_dict(txt=False) for event in archive_events
        ]
        with open(self.out_archived().path, 'w', encoding='utf-8') as file_out:
            json.dump(out_archive_events, file_out)

        # write active events
        print('Writing active events')
        out_active_events = [
            event.return_dict(txt=False) for event in active_events
        ]
        with open(self.out_active_events().path, 'w',
                  encoding='utf-8') as file_out:
            json.dump(out_active_events, file_out)
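The manual slicing of self.archivedate into year, month, and day in Example #5 is equivalent to parsing the YYYYMMDD string with datetime.strptime:

    import datetime

    archivedate = '20190501'  # stands in for self.archivedate
    date = datetime.datetime.strptime(archivedate, '%Y%m%d')
    assert date == datetime.datetime(2019, 5, 1)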
Example #6
    def run(self):

        # read in events
        with open(self.in_events().path, 'r', encoding='utf-8') as file_in:
            eventdicts = json.loads(file_in.read())
        event_objs = []
        for ed in eventdicts:
            eventobj = event.Event()
            eventobj.import_eventdict(ed)
            event_objs.append(eventobj)

        # initialize event deduplicator
        similarity_threshold = float(self.similarity_threshold)
        print('Deduplicating; number of events at start:', len(event_objs))
        deduplicator = event_deduplicator.EventDeduplicator()
        deduplicator.set_events(event_objs)
        deduplicator.deduplicate_events(similarity_threshold)
        deduplicated_events = deduplicator.return_events()
        print('Done. number of events after deduplication:',
              len(deduplicated_events))

        # write deduplicated
        out_deduplicated_events = [
            event.return_dict() for event in deduplicated_events
        ]
        with open(self.out_deduplicated_events().path, 'w',
                  encoding='utf-8') as file_out:
            json.dump(out_deduplicated_events, file_out)
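The deduplicator's similarity measure is hidden behind deduplicate_events(similarity_threshold). As one hedged illustration of threshold-based deduplication, the sketch below compares events on a text field with difflib; the text field and the deduplicate helper are assumptions for the example, not the event_deduplicator API.

    from difflib import SequenceMatcher

    def deduplicate(events, threshold):
        # keep an event only if it is not too similar to an already kept event
        kept = []
        for ev in events:
            if all(SequenceMatcher(None, ev['text'], other['text']).ratio() < threshold
                   for other in kept):
                kept.append(ev)
        return kept

    events = [
        {'text': 'storm hits utrecht'},
        {'text': 'storm hits utrecht!'},
        {'text': 'concert tonight'},
    ]
    print(len(deduplicate(events, 0.8)))  # -> 2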
Example #7
    def run(self):

        # collect all event files with extension '.enhanced'
        enhanced_events = glob.glob(self.in_eventdir().path + '/*.enhanced')

        # initialize
        merger = event_merger.EventMerger()
        overlap_threshold = float(self.overlap_threshold)

        # for each event file
        for eventfile in enhanced_events:
            print('Reading', eventfile)
            with open(eventfile, 'r', encoding='utf-8') as file_in:
                current_eventdicts = json.loads(file_in.read())
            new_event_objs = []
            for ed in current_eventdicts:
                eventobj = event.Event()
                eventobj.import_eventdict(ed)
                new_event_objs.append(eventobj)
            # merge before integration
            print(
                'Merging new events before integration; number of events at start:',
                len(new_event_objs))
            premerger = event_merger.EventMerger()
            premerger.add_events(new_event_objs)
            premerger.find_merges(overlap_threshold)
            new_events_merged = premerger.return_events()
            print('Done. New events after merge:', len(new_events_merged))
            if len(merger.events) == 0:
                merger.add_events(new_events_merged)
            else:
                # integrate each event into the current ones
                print(
                    'Starting integrating new events; number of current events:',
                    len(merger.events))
                for new_event in new_events_merged:
                    merger.find_merge(new_event, overlap_threshold)

        # write merged
        integrated_events = merger.return_events()
        print('Done. Number of events after integration:',
              len(integrated_events))
        out_integrated_events = [
            event.return_dict() for event in integrated_events
        ]
        with open(self.out_integrated_events().path, 'w',
                  encoding='utf-8') as file_out:
            json.dump(out_integrated_events, file_out)
Example #8
    def run(self):

        # read prediction data
        with open(self.in_predictiondir().path + '/events_meta.txt', 'r',
                  encoding='utf-8') as file_in:
            meta = file_in.read().strip().split('\n')

        with open(self.in_predictiondir().path + '/events_text.predictions.txt',
                  'r', encoding='utf-8') as file_in:
            predictions = file_in.read().strip().split('\n')

        with open(self.in_predictiondir().path + '/events_text.full_predictions.txt',
                  'r', encoding='utf-8') as file_in:
            lines = file_in.read().strip().split('\n')
        label_order = lines[0].split('\t')
        full_predictions = [line.split('\t') for line in lines[1:]]

        print('Meta', len(meta))
        print('Predictions', len(predictions))
        print('Full predictions', len(full_predictions))

        # read in events
        print('Reading in events')
        with open(self.in_events().path, 'r', encoding='utf-8') as file_in:
            eventdicts = json.loads(file_in.read())
        event_objs = []
        for ed in eventdicts:
            eventobj = event.Event()
            eventobj.import_eventdict(ed, txt=self.text)
            event_objs.append(eventobj)

        # index events
        id_event = {}
        for eo in event_objs:
            id_event[eo.mongo_id] = eo

        # for each prediction
        for i, mid in enumerate(meta):
            prediction = predictions[i]
            prediction_score = dict(zip(label_order, full_predictions[i]))
            eo = id_event[mid]
            eo.eventtype = prediction
            eo.eventtype_scores = prediction_score

        # write output
        out_updated_events = [
            event.return_dict(txt=self.text) for event in event_objs
        ]
        with open(self.out_updated_events().path, 'w',
                  encoding='utf-8') as file_out:
            json.dump(out_updated_events, file_out)
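The eventtype_scores dict in Example #8 comes from zipping the header row of the full-predictions file with the prediction row for one event; the values stay strings unless they are cast. A small self-contained illustration:

    label_order = ['sport', 'politics', 'weather']   # header of the full-predictions file
    row = ['0.1', '0.2', '0.7']                      # one tab-separated prediction row
    scores = dict(zip(label_order, row))
    print(scores['weather'])         # '0.7' (still a string)
    print(float(scores['weather']))  # 0.7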
Example #9
    def run(self):

        # initiate directory
        self.setup_output_dir(self.out_archivedir().path)

        # read events
        datebound = datetime.datetime.now() - datetime.timedelta(days=100)
        date_events = defaultdict(list)
        active_events = []
        print('Reading events')
        with open(self.in_events().path, 'r', encoding='utf-8') as file_in:
            eventdicts = json.loads(file_in.read())
            for i, ed in enumerate(eventdicts):
                eventobj = event.Event()
                eventobj.import_eventdict(ed)
                if eventobj.datetime < datebound:
                    datekey = ''.join(
                        str(eventobj.datetime).split()[0].split('-'))
                    date_events[datekey].append(eventobj)
                else:
                    active_events.append(eventobj)

        # write archives
        print('Writing archives')
        for date in sorted(list(date_events.keys())):
            print(date)
            events = date_events[date]
            out_events = [event.return_dict(txt=False) for event in events]
            outfile = self.out_archivedir().path + '/events_' + date + '.json'
            with open(outfile, 'w', encoding='utf-8') as file_out:
                json.dump(out_events, file_out)

        # write active events
        print('Writing active events')
        out_active_events = [
            event.return_dict(txt=False) for event in active_events
        ]
        with open(self.out_active_events().path, 'w',
                  encoding='utf-8') as file_out:
            json.dump(out_active_events, file_out)
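The archive keys in Example #9 are built by splitting the string form of a datetime into a YYYYMMDD string; strftime produces the same key more directly:

    import datetime

    dt = datetime.datetime(2019, 5, 1, 14, 30)
    key_via_split = ''.join(str(dt).split()[0].split('-'))
    key_via_strftime = dt.strftime('%Y%m%d')
    assert key_via_split == key_via_strftime == '20190501'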
Example #10
    def run(self):

        first_event_date_dt = datetime.datetime(int(self.first_event_date[:4]),
                                                int(self.first_event_date[4:6]),
                                                int(self.first_event_date[6:]))
        last_event_date_dt = datetime.datetime(int(self.last_event_date[:4]),
                                               int(self.last_event_date[4:6]),
                                               int(self.last_event_date[6:]))

        # read in burstiness
        print('Reading in bursty entities')
        with open(self.in_entity_burstiness().path, 'r',
                  encoding='utf-8') as file_in:
            bursty_entities = file_in.read().strip().split('\n')
        set_bursty_entities = set(bursty_entities)

        # read in events
        term_events = defaultdict(list)
        print('Reading in events')
        extended_events = []
        with open(self.in_events().path, 'r', encoding='utf-8') as file_in:
            eventdicts = json.loads(file_in.read())
            for i, ed in enumerate(eventdicts):
                eventobj = event.Event()
                eventobj.import_eventdict(ed)
                if (first_event_date_dt - datetime.timedelta(days=100) <= eventobj.datetime
                        <= last_event_date_dt + datetime.timedelta(days=100)):
                    extended_events.append(eventobj)
                    overlapping_entities = set_bursty_entities & set(ed['entities'])
                    if overlapping_entities:
                        for term in overlapping_entities:
                            term_events[term].append(eventobj)

        # for each entity
        print('Saving event tweets dates by entity')
        date_entity_events = defaultdict(lambda: defaultdict(list))
        for entity in bursty_entities:
            # for each event
            if len(term_events[entity]) == 0:
                continue
            else:
                for i, ev in enumerate(term_events[entity]):
                    if first_event_date_dt <= ev.datetime <= last_event_date_dt:
                        # days to stretch the window back towards the previous event
                        if i == 0:
                            minus = 100
                        else:
                            days_before = (ev.datetime - term_events[entity][i - 1].datetime).days
                            minus = 100 if (days_before > 199 or days_before < 3) else days_before / 2
                        # days to stretch the window forward towards the next event
                        if i == len(term_events[entity]) - 1:
                            plus = 100
                        else:
                            days_after = (term_events[entity][i + 1].datetime - ev.datetime).days
                            plus = 100 if (days_after > 199 or days_after < 3) else days_after / 2
                        first = ev.datetime - datetime.timedelta(days=minus)
                        last = ev.datetime + datetime.timedelta(days=plus)
                        cursor = first
                        while cursor <= last:
                            date_str = ''.join(str(cursor).split()[0].split('-'))
                            date_entity_events[date_str][entity].append(ev)
                            cursor += datetime.timedelta(days=1)

        # read in tweets
        print('Collecting additional tweets')
        dates = list(date_entity_events.keys())
        months = list(set([d[:6] for d in dates]))
        tweetsubdirs = sorted(glob.glob(self.in_tweetdir().path + '/*'))
        entity_tweets = defaultdict(list)
        first = True
        for tweetsubdir in tweetsubdirs:
            subdirstr = tweetsubdir.split('/')[-1]
            if subdirstr in months:
                # go through all tweet files
                tweetfiles = glob.glob(tweetsubdir + '/*.entity.json')
                for tweetfile in tweetfiles:
                    print(tweetfile)
                    datestr = tweetfile.split('/')[-1].split('.')[0].split('-')[0]
                    if datestr in dates:
                        if first:
                            candidate_entities = list(date_entity_events[datestr].keys())
                            set_candidate_entities = set(candidate_entities)
                            cursordate = datestr
                            first = False
                        elif datestr != cursordate:
                            # add tweets
                            for entity in candidate_entities:
                                for ev in date_entity_events[datestr][entity]:
                                    ev.add_tweets(entity_tweets[entity])
                            cursordate = datestr
                            candidate_entities = list(date_entity_events[datestr].keys())
                            set_candidate_entities = set(candidate_entities)
                            entity_tweets = defaultdict(list)
                        # read in tweets
                        with open(tweetfile, 'r', encoding='utf-8') as file_in:
                            tweetdicts = json.loads(file_in.read())
                        for td in tweetdicts:
                            overlapping_terms = set_candidate_entities & set(td['entities'].keys())
                            if overlapping_terms:
                                tweetobj = tweet.Tweet()
                                tweetobj.import_tweetdict(td)
                                for term in overlapping_terms:
                                    entity_tweets[term].append(tweetobj)

        # write to file
        print('Writing new events')
        out_extended_events = [ev.return_dict() for ev in extended_events]
        with open(self.out_more_tweets().path, 'w',
                  encoding='utf-8') as file_out:
            json.dump(out_extended_events, file_out)
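The while-cursor loop in Example #10 enumerates every calendar day in a window around an event and registers the event under each date key. A compact, self-contained version of that enumeration (window bounds chosen for illustration):

    import datetime

    def dates_in_window(center, days_before, days_after):
        # yield a YYYYMMDD string for each day from center-days_before to center+days_after
        cursor = center - datetime.timedelta(days=days_before)
        last = center + datetime.timedelta(days=days_after)
        while cursor <= last:
            yield cursor.strftime('%Y%m%d')
            cursor += datetime.timedelta(days=1)

    print(list(dates_in_window(datetime.datetime(2019, 5, 1), 1, 1)))
    # -> ['20190430', '20190501', '20190502']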