class ArchiveEventsDaily(WorkflowComponent):
    """Workflow component that archives the integrated events of a single day."""

    events = Parameter()
    archivedir = Parameter()
    archivedate = Parameter()

    def accepts(self):
        # Single option: a conjunction (inner tuple) of the integrated events
        # file and the archive directory.
        return [(
            InputFormat(self, format_id='events', extension='.events.integrated', inputparameter='events'),
            InputFormat(self, format_id='archivedir', extension='.archive', inputparameter='archivedir'),
        )]

    def setup(self, workflow, input_feeds):
        archiver = workflow.new_task(
            'archive_events_daily', ArchiveEventsDailyTask,
            autopass=False, archivedate=self.archivedate)
        archiver.in_events = input_feeds['events']
        archiver.in_archivedir = input_feeds['archivedir']
        return archiver
class TimblClassifier(WorkflowComponent):
    """A Timbl classifier that takes training data, test data, and outputs the test data with classification"""

    trainfile = Parameter()
    testfile = Parameter()

    def accepts(self):
        # Note: a tuple inside a list — the outer list holds alternative
        # options, the inner tuple is a conjunction of required inputs.
        return [(
            InputFormat(self, format_id='train', extension='train', inputparameter='trainfile'),
            InputFormat(self, format_id='test', extension='test', inputparameter='testfile'),
        )]

    def setup(self, workflow, input_feeds):
        trainer = workflow.new_task('timbl_train', Timbl_train, autopass=True)
        trainer.in_train = input_feeds['train']

        tester = workflow.new_task('timbl_test', Timbl_test, autopass=True)
        tester.in_test = input_feeds['test']
        # The tester consumes the instance base and weights produced by training.
        tester.in_ibase = trainer.out_ibase
        tester.in_wgt = trainer.out_wgt
        return tester
class Ucto_folia2folia_dir(Task):
    """Tokenize a whole directory of FoLiA documents by running Ucto on each file."""

    extension = Parameter(default="folia.xml")
    language = Parameter()

    in_foliadir = InputSlot()  # input slot: directory of FoLiA files

    def out_tokfoliadir(self):
        return self.outputfrominput(inputformat='foliadir', stripextension='.foliadir', addextension='.tok.foliadir')

    def run(self):
        # Set up the output directory; it is created here and torn down
        # automatically on failure.
        self.setup_output_dir(self.out_tokfoliadir().path)

        # Gather the input files from the input directory.
        sources = glob.glob(self.in_foliadir().path + '/*.' + self.extension)

        # Inception aka dynamic dependencies: yield tasks that could not have
        # been predicted statically — one Ucto component per input file.
        yield [
            Ucto(inputfile=source, inputslot='folia',
                 outputdir=self.out_tokfoliadir().path,
                 language=self.language)
            for source in sources
        ]
class TesseractOCR_document(Task):
    """OCR for a whole document (input is a directory of tiff image files (pages), output is a directory of hOCR files"""

    tiff_extension = Parameter(default='tif')
    language = Parameter()

    in_tiffdir = InputSlot()  # input slot: directory of page images

    def out_hocrdir(self):
        return self.outputfrominput(inputformat='tiffdir', stripextension='.tiffdir', addextension='.hocrdir')

    def run(self):
        # Set up the output directory; it is created here and torn down
        # automatically on failure.
        self.setup_output_dir(self.out_hocrdir().path)

        # Gather the page images from the input directory.
        pages = glob.glob(self.in_tiffdir().path + '/*.' + self.tiff_extension)

        # Inception aka dynamic dependencies: yield tasks that could not have
        # been predicted statically — one OCR_singlepage component per page.
        yield [
            OCR_singlepage(inputfile=page,
                           outputdir=self.out_hocrdir().path,
                           language=self.language,
                           tiff_extension=self.tiff_extension)
            for page in pages
        ]
class ExtractCityref(StandardWorkflowComponent):
    """Extract city references, building on the date-reference extraction step."""

    citylist = Parameter()
    config = Parameter()
    strip_punctuation = BoolParameter()
    to_lowercase = BoolParameter()
    skip_date = BoolParameter()
    skip_month = BoolParameter()
    skip_timeunit = BoolParameter()
    skip_day = BoolParameter()
    format_json = BoolParameter()

    def accepts(self):
        # Disjunction: either a ready-made .dateref.json file, or the output
        # of the ExtractDateref component.
        return (
            InputFormat(self, format_id='dateref', extension='.dateref.json'),
            InputComponent(self, ExtractDateref,
                           config=self.config,
                           strip_punctuation=self.strip_punctuation,
                           to_lowercase=self.to_lowercase,
                           skip_datematch=self.skip_date,
                           skip_monthmatch=self.skip_month,
                           skip_timeunitmatch=self.skip_timeunit,
                           skip_daymatch=self.skip_day),
        )

    def autosetup(self):
        return ExtractCityrefTask
class Timbl_base(Task):
    """Base task holding the parameters shared by the Timbl tasks; wraps the `timbl` executable."""

    executable = 'timbl'

    # Timbl configuration parameters with their defaults.
    algorithm = Parameter(default="IB1")
    metric = Parameter(default="O")
    weighting = Parameter(default="gr")
    distance = Parameter(default="Z")
    format = Parameter(default="Columns")
    k = IntParameter(default=1)
class IntegrateEvents(StandardWorkflowComponent):
    """Integrate a new batch of events with the current set of events."""

    current_events = Parameter()
    overlap_threshold = Parameter(default=0.2)

    def accepts(self):
        # Disjunction: any of these event-file extensions is accepted.
        return (
            InputFormat(self, format_id='events', extension='.enhanced'),
            InputFormat(self, format_id='events', extension='.integrated'),
            InputFormat(self, format_id='events', extension='.types'),
            InputFormat(self, format_id='events', extension='.filtered'),
        )

    def autosetup(self):
        return IntegrateEventsTask
class ExtractEntitiesTask(Task):
    """Extract entities from tweets that already carry date and city references."""

    in_cityref = InputSlot()

    commonness_txt = Parameter()
    commonness_cls = Parameter()
    commonness_corpus = Parameter()
    ngrams_score = Parameter()

    def out_entity(self):
        return self.outputfrominput(inputformat='cityref', stripextension='.json', addextension='.entity.json')

    def run(self):
        # set up the commonness scorer
        cs = commonness.Commonness()
        cs.set_classencoder(self.commonness_txt, self.commonness_cls, self.commonness_corpus)
        cs.set_dmodel(self.ngrams_score)

        # read in tweets
        with open(self.in_cityref().path, 'r', encoding='utf-8') as file_in:
            tweetdicts = json.loads(file_in.read())

        # deserialize into tweet objects
        tweetobjs = []
        for td in tweetdicts:
            tweetobj = tweet.Tweet()
            tweetobj.import_tweetdict(td)
            tweetobjs.append(tweetobj)

        # extract entities per tweet
        for tweetobj in tweetobjs:
            # remove the already extracted times and locations from the tweet,
            # splitting it into chunks of remaining text
            datestrings = [sr[0] for sr in tweetobj.string_refdates]
            cities = tweetobj.cityrefs
            chunks = helpers.remove_pattern_from_string(tweetobj.text, datestrings + cities)

            # find entities in every chunk
            ee = entity_extractor.EntityExtractor()
            ee.set_commonness(cs)
            for chunk in chunks:
                ee.extract_entities(chunk.split())
            ee.filter_entities_threshold()
            tweetobj.set_entities(ee.entities)

        # serialize and write to file
        # (renamed loop variable so it does not shadow the `tweet` module)
        outtweets = [obj.return_dict() for obj in tweetobjs]
        with open(self.out_entity().path, 'w', encoding='utf-8') as file_out:
            json.dump(outtweets, file_out)
class OCR_singlepage(StandardWorkflowComponent):
    """OCR a single page image using Tesseract."""

    language = Parameter()
    tiff_extension = Parameter(default='tif')

    def autosetup(self):
        return Tesseract

    def accepts(self):
        # One-element tuple of accepted input formats.
        return (InputFormat(self, format_id='tiff', extension=self.tiff_extension, directory=True),)
class ExtractEvents(StandardWorkflowComponent):
    """Extract events from a directory of tweets."""

    citylist = Parameter()
    end_date = Parameter()
    window_size = IntParameter(default=30)
    minimum_event_mentions = IntParameter(default=5)
    cut_off = IntParameter(default=2500)

    def accepts(self):
        return InputFormat(self, format_id='tweetdir', extension='.tweets')

    def autosetup(self):
        return ExtractEventsTask
class MergeEvents(StandardWorkflowComponent):
    """Merge overlapping events, starting from enhanced events."""

    overlap_threshold = Parameter(default=0.2)
    similarity_threshold = Parameter(default=0.7)

    def accepts(self):
        # Disjunction: a ready-made .enhanced file, or the output of EnhanceEvents.
        return (
            InputFormat(self, format_id='enhanced_events', extension='.enhanced'),
            InputComponent(self, EnhanceEvents, similarity_threshold=self.similarity_threshold),
        )

    def autosetup(self):
        return MergeEventsTask
class LowercaseVoweleaterDirTask2(Task):
    """Run the Voweleater component (starting from its Lowercaser) on every text file in a directory."""

    in_txtdir = InputSlot()

    extension = Parameter(default='txt')

    def out_txtdir(self):
        return self.outputfrominput(inputformat='txtdir', stripextension='.txtdir', addextension='.lcnv.txtdir')

    def run(self):
        # Set up the output directory; it is created here and torn down
        # automatically on failure.
        self.setup_output_dir(self.out_txtdir().path)

        # Gather the input files from the input directory.
        sources = glob.glob(self.in_txtdir().path + '/*.' + self.extension)

        # Inception aka dynamic dependencies: yield tasks that could not have
        # been predicted statically — one Voweleater component per input file.
        yield [
            Voweleater(inputfile=source,
                       outputdir=self.out_txtdir().path,
                       startcomponent='Lowercaser')
            for source in sources
        ]
class FilterEvents(StandardWorkflowComponent):
    """Filter events, starting from merged events."""

    citylist = Parameter()
    overlap_threshold = Parameter(default=0.2)
    similarity_threshold = Parameter(default=0.7)

    def accepts(self):
        # Disjunction: a ready-made .merged file, or the output of MergeEvents.
        return (
            InputFormat(self, format_id='merged_events', extension='.merged'),
            InputComponent(self, MergeEvents,
                           overlap_threshold=self.overlap_threshold,
                           similarity_threshold=self.similarity_threshold),
        )

    def autosetup(self):
        return FilterEventsTask
class UpdateEventTypes(WorkflowComponent):
    """Update the type labels of integrated events based on a directory of predictions."""

    events = Parameter()
    predictiondir = Parameter()
    text = BoolParameter()

    def accepts(self):
        # Single option: a conjunction (inner tuple) of the prediction
        # directory and the integrated events file.
        return [(
            InputFormat(self, format_id='predictiondir', extension='.instances', inputparameter='predictiondir'),
            InputFormat(self, format_id='events', extension='.events.integrated', inputparameter='events'),
        )]

    def setup(self, workflow, input_feeds):
        updater = workflow.new_task('update_event_types', UpdateEventTypesTask,
                                    autopass=True, text=self.text)
        updater.in_events = input_feeds['events']
        updater.in_predictiondir = input_feeds['predictiondir']
        return updater
class GetEntityTimeseriesMonth(WorkflowComponent):
    """Generate entity timeseries for one month from a tweet directory and an events file."""

    tweetdir = Parameter()
    events = Parameter()
    month = Parameter()

    def accepts(self):
        # Single option: a conjunction (inner tuple) of the tweet directory
        # and the events file.
        return [(
            InputFormat(self, format_id='tweetdir', extension='.tweets', inputparameter='tweetdir'),
            InputFormat(self, format_id='events', extension='.events', inputparameter='events'),
        )]

    def setup(self, workflow, input_feeds):
        generator = workflow.new_task('get_entity_timeseries', GetEntityTimeseriesTask,
                                      autopass=True, month=self.month)
        generator.in_tweetdir = input_feeds['tweetdir']
        generator.in_events = input_feeds['events']
        return generator
class IntegrateEventDir(StandardWorkflowComponent):
    """Integrate all event files in a directory into one set of events."""

    overlap_threshold = Parameter(default=0.2)

    def accepts(self):
        return InputFormat(self, format_id='eventdir', extension='.events')

    def autosetup(self):
        return IntegrateEventDirTask
class DeduplicateEvents(StandardWorkflowComponent):
    """Remove duplicate events from an events file."""

    similarity_threshold = Parameter(default=0.7)

    def accepts(self):
        return InputFormat(self, format_id='events', extension='.events')

    def autosetup(self):
        return DeduplicateEventsTask
class Symlink(Task):
    """Create a symlink to the input file.

    If ``filename`` is set, the link is created at that exact path;
    otherwise the link path is derived from the input path by stripping
    ``stripextension`` and appending ``addextension``.
    """

    filename = Parameter()
    stripextension = Parameter()
    addextension = Parameter()

    in_file = InputSlot()  # input slot

    def out_file(self):
        if self.filename:
            return TargetInfo(self, self.filename)
        else:
            return self.outputfrominput(inputformat='file', stripextension=self.stripextension, addextension=self.addextension)

    def run(self):
        # BUGFIX: in_file/out_file are the callables and .path is an attribute
        # (consistent with every other task in this file, e.g.
        # self.in_foliadir().path); the original inverted this as
        # self.in_file.path(), which raises AttributeError at runtime.
        os.symlink(self.in_file().path, self.out_file().path)
class FoliaValidator(StandardWorkflowComponent):
    """Validate FoLiA documents, accepting either a single file or a directory."""

    folia_extension = Parameter(default='folia.xml')

    def accepts(self):
        # Disjunction: one FoLiA file, or a whole directory of them.
        return (
            InputFormat(self, format_id='folia', extension=self.folia_extension),
            InputFormat(self, format_id='foliadir', extension='foliadir'),
        )

    def autosetup(self):
        # One task per accepted input format.
        return FoliaValidatorTask, FoliaValidatorDirTask
class Ucto(StandardWorkflowComponent):
    """A workflow component for Ucto"""

    # A parameter for the workflow, will be passed on to the tasks
    skip = Parameter(default="")
    language = Parameter()
    tok_input_sentenceperline = BoolParameter(default=False)
    tok_output_sentenceperline = BoolParameter(default=False)

    def autosetup(self):
        # One task per accepted input format.
        return (Ucto_txt2folia, Ucto_folia2folia, Ucto_tok2folia)

    def accepts(self):
        """Returns a tuple of all the initial inputs and other workflows this component accepts as input (a disjunction, only one will be selected)"""
        return (
            InputFormat(self, format_id='folia', extension='folia.xml'),
            InputFormat(self, format_id='txt', extension='txt'),
            InputFormat(self, format_id='tok', extension='tok'),
            InputComponent(self, ConvertToFoLiA),
        )
class CollectEventTweets(WorkflowComponent):
    """Collect the tweets belonging to events, guided by entity burstiness scores."""

    tweetdir = Parameter()
    events = Parameter()
    entity_burstiness = Parameter()
    first_event_date = Parameter()
    last_event_date = Parameter()

    def accepts(self):
        # Single option: a conjunction (inner tuple) of tweet directory,
        # events file and burstiness scores.
        return [(
            InputFormat(self, format_id='tweetdir', extension='.tweets', inputparameter='tweetdir'),
            InputFormat(self, format_id='events', extension='.events', inputparameter='events'),
            InputFormat(self, format_id='entity_burstiness', extension='.burstiness.txt', inputparameter='entity_burstiness'),
        )]

    def setup(self, workflow, input_feeds):
        collector = workflow.new_task(
            'collect_event_tweets', CollectEventTweetsTask, autopass=False,
            first_event_date=self.first_event_date,
            last_event_date=self.last_event_date)
        collector.in_tweetdir = input_feeds['tweetdir']
        collector.in_events = input_feeds['events']
        collector.in_entity_burstiness = input_feeds['entity_burstiness']
        return collector
class IntegrateEventDirTask(Task):
    """Integrate all enhanced event files in a directory into one merged event set.

    Task to speed up event integration for sliding window event extraction.
    Make sure that all events in the directory are deduplicated and enhanced
    before running this task; only files with extension '.enhanced' will be
    integrated.
    """

    in_eventdir = InputSlot()

    overlap_threshold = Parameter()

    def out_integrated_events(self):
        # BUGFIX: addextension was 'events.integrated' (missing the leading
        # dot), yielding e.g. 'Xevents.integrated' instead of
        # 'X.events.integrated'; downstream components (ArchiveEventsDaily,
        # UpdateEventTypes, CollectEventTweetsDaily, AssessBurstiness) and
        # ArchiveEventsTask's stripextension all expect '.events.integrated'.
        return self.outputfrominput(inputformat='eventdir', stripextension='.events', addextension='.events.integrated')

    def run(self):
        # collect all event files with extension '.enhanced'
        enhanced_files = glob.glob(self.in_eventdir().path + '/*.enhanced')

        # initialize the accumulating merger
        merger = event_merger.EventMerger()
        overlap_threshold = float(self.overlap_threshold)

        for eventfile in enhanced_files:
            print('Reading', eventfile)
            with open(eventfile, 'r', encoding='utf-8') as file_in:
                current_eventdicts = json.loads(file_in.read())
            new_event_objs = []
            for ed in current_eventdicts:
                eventobj = event.Event()
                eventobj.import_eventdict(ed)
                new_event_objs.append(eventobj)

            # merge the new events among themselves before integration
            print('Merging new events before integration; number of events at start:', len(new_event_objs))
            premerger = event_merger.EventMerger()
            premerger.add_events(new_event_objs)
            premerger.find_merges(overlap_threshold)
            new_events_merged = premerger.return_events()
            print('Done. New events after merge:', len(new_events_merged))

            if len(merger.events) == 0:
                # first file: nothing to integrate into yet
                merger.add_events(new_events_merged)
            else:
                # integrate each new event into the accumulated set
                print('Starting integrating new events; number of current events:', len(merger.events))
                for new_event in new_events_merged:
                    merger.find_merge(new_event, overlap_threshold)

        # write the integrated events
        # (loop variable renamed so it does not shadow the `event` module)
        integrated_events = merger.return_events()
        print('Done. Number of events after integration:', len(integrated_events))
        out_integrated_events = [ev.return_dict() for ev in integrated_events]
        with open(self.out_integrated_events().path, 'w', encoding='utf-8') as file_out:
            json.dump(out_integrated_events, file_out)
class CollectEventTweetsDaily(WorkflowComponent):
    """Collect the tweets belonging to events for a single day, using both current and new burstiness scores."""

    tweetdir = Parameter()
    events = Parameter()
    entity_burstiness = Parameter()
    entity_burstiness_new = Parameter()
    date = Parameter()

    def accepts(self):
        # Single option: a conjunction (inner tuple) of all four inputs.
        return [(
            InputFormat(self, format_id='tweetdir', extension='.tweets', inputparameter='tweetdir'),
            InputFormat(self, format_id='events', extension='.events.integrated', inputparameter='events'),
            InputFormat(self, format_id='entity_burstiness', extension='.burstiness.txt', inputparameter='entity_burstiness'),
            InputFormat(self, format_id='entity_burstiness_new', extension='.burstiness.txt', inputparameter='entity_burstiness_new'),
        )]

    def setup(self, workflow, input_feeds):
        collector = workflow.new_task(
            'collect_event_tweets_daily', CollectEventTweetsDailyTask,
            autopass=False, date=self.date)
        collector.in_tweetdir = input_feeds['tweetdir']
        collector.in_events = input_feeds['events']
        collector.in_entity_burstiness = input_feeds['entity_burstiness']
        collector.in_entity_burstiness_new = input_feeds['entity_burstiness_new']
        return collector
class AssessBurstiness(WorkflowComponent):
    """Assess the burstiness of entities from their count timeseries, relative to the integrated events."""

    entity_counts = Parameter()
    dates = Parameter()
    vocabulary = Parameter()
    events = Parameter()
    burstiness_threshold = IntParameter()

    def accepts(self):
        # Single option: a conjunction (inner tuple) of counts, dates,
        # vocabulary and integrated events.
        return [(
            InputFormat(self, format_id='entity_counts', extension='.counts.npz', inputparameter='entity_counts'),
            InputFormat(self, format_id='dates', extension='.counts_dates', inputparameter='dates'),
            InputFormat(self, format_id='vocabulary', extension='.counts_vocabulary', inputparameter='vocabulary'),
            InputFormat(self, format_id='events', extension='.events.integrated', inputparameter='events'),
        )]

    def setup(self, workflow, input_feeds):
        assessor = workflow.new_task(
            'assess_burstiness', AssessBurstinessTask, autopass=False,
            burstiness_threshold=self.burstiness_threshold)
        assessor.in_entity_counts = input_feeds['entity_counts']
        assessor.in_dates = input_feeds['dates']
        assessor.in_vocabulary = input_feeds['vocabulary']
        assessor.in_events = input_feeds['events']
        return assessor
class ArchiveEventsTask(Task):
    """Split the integrated events into an archive (events dated archivedate) and the still-active rest."""

    in_events = InputSlot()
    in_archivedir = InputSlot()

    archivedate = Parameter()  # date string in YYYYMMDD form, parsed in run()

    def out_archived(self):
        return self.outputfrominput(inputformat='archivedir', stripextension='.archive', addextension='.archive/events_' + self.archivedate + '.json')

    def out_active_events(self):
        return self.outputfrominput(inputformat='events', stripextension='.events.integrated', addextension='.active.events.integrated')

    def run(self):
        # parse the archive date (YYYYMMDD)
        archive_events = []
        active_events = []
        date = datetime.datetime(int(self.archivedate[:4]), int(self.archivedate[4:6]), int(self.archivedate[6:8]))

        # read the events and split them by date
        print('Reading events')
        with open(self.in_events().path, 'r', encoding='utf-8') as file_in:
            eventdicts = json.loads(file_in.read())
        for ed in eventdicts:
            eventobj = event.Event()
            eventobj.import_eventdict(ed)
            if eventobj.datetime == date:
                archive_events.append(eventobj)
            else:
                active_events.append(eventobj)

        # write the archived events (without tweet text)
        # (loop variables renamed so they do not shadow the `event` module)
        print('Writing archive')
        with open(self.out_archived().path, 'w', encoding='utf-8') as file_out:
            json.dump([ev.return_dict(txt=False) for ev in archive_events], file_out)

        # write the remaining active events
        print('Writing active events')
        with open(self.out_active_events().path, 'w', encoding='utf-8') as file_out:
            json.dump([ev.return_dict(txt=False) for ev in active_events], file_out)
class ExtractEntities(StandardWorkflowComponent):
    """Extract entities from tweets, building on the city-reference extraction step."""

    commonness_txt = Parameter()
    commonness_cls = Parameter()
    commonness_corpus = Parameter()
    ngrams_score = Parameter()
    config = Parameter()
    strip_punctuation = BoolParameter()
    to_lowercase = BoolParameter()
    skip_date = BoolParameter()
    skip_month = BoolParameter()
    skip_timeunit = BoolParameter()
    skip_day = BoolParameter()
    citylist = Parameter()
    format_json = BoolParameter()

    def accepts(self):
        # Disjunction: either a ready-made .json file, or the output of
        # the ExtractCityref component.
        return (
            InputFormat(self, format_id='cityref', extension='.json'),
            InputComponent(self, ExtractCityref,
                           config=self.config,
                           strip_punctuation=self.strip_punctuation,
                           to_lowercase=self.to_lowercase,
                           citylist=self.citylist,
                           skip_date=self.skip_date,
                           skip_month=self.skip_month,
                           skip_timeunit=self.skip_timeunit,
                           skip_day=self.skip_day),
        )

    def autosetup(self):
        return ExtractEntitiesTask
class CombineEntityTimeseries(WorkflowComponent):
    """Combine per-period entity timeseries files from a directory into one."""

    entity_counts_dir = Parameter()

    def accepts(self):
        # NOTE(review): unlike the sibling components, the parentheses here
        # contain no comma, so this is a list holding a single InputFormat
        # rather than a one-tuple — confirm this is intended.
        return [(
            InputFormat(self, format_id='entity_counts_dir', extension='.timeseries', inputparameter='entity_counts_dir')
        )]

    def setup(self, workflow, input_feeds):
        combiner = workflow.new_task('combine_entity_timeseries', CombineEntityTimeseriesTask, autopass=True)
        combiner.in_entity_counts_dir = input_feeds['entity_counts_dir']
        return combiner
class EnhanceEvents(StandardWorkflowComponent):
    """Enhance events, starting from deduplicated events."""

    similarity_threshold = Parameter(default=0.7)

    def accepts(self):
        # Disjunction: a ready-made .deduplicated file, or the output of
        # DeduplicateEvents.
        return (
            InputFormat(self, format_id='events', extension='.deduplicated'),
            InputComponent(self, DeduplicateEvents, similarity_threshold=self.similarity_threshold),
        )

    def autosetup(self):
        return EnhanceEventsTask
class OCR_folia(StandardWorkflowComponent):
    """OCR with FoLiA output"""

    language = Parameter()

    def setup(self, workflow, input_feeds):
        # Convert the hOCR output of the OCR step to FoLiA documents...
        converter = workflow.new_task('foliahocr', FoliaHOCR)
        converter.in_hocrdir = input_feeds['hocrdir']
        # ...then concatenate them into a single document.
        concatenator = workflow.new_task('foliacat', Foliacat)
        concatenator.in_foliadir = converter.out_foliadir
        return concatenator

    def accepts(self):
        """Returns a tuple of all the initial inputs and other workflows this component accepts as input (a disjunction, only one will be selected)"""
        return InputComponent(self, OCR_document)
class OCR_document(StandardWorkflowComponent):
    """OCR a whole document from a directory of page images."""

    language = Parameter()

    def autosetup(self):
        return TesseractOCR_document

    def accepts(self):
        """Returns a tuple of all the initial inputs and other workflows this component accepts as input (a disjunction, only one will be selected)"""
        return (
            InputFormat(self, format_id='tiffdir', extension='tiffdir', directory=True),
            InputComponent(self, ExtractPages),
        )