class Tokenize(StandardWorkflowComponent):

    config = Parameter()
    strip_punctuation = BoolParameter()
    lowercase = BoolParameter()
    format_json = BoolParameter()

    def accepts(self):
        return (InputFormat(self, format_id='filtered', extension='.filtered.json'),
                InputComponent(self, FilterTweets, format_json=self.format_json))

    def autosetup(self):
        return Tokenize_instances
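# Illustrative usage sketch (assumes LuigiNLP's standard luiginlp.run() entry
# point; the input filename and ucto config are hypothetical). run() resolves
# accepts()/autosetup() into a concrete workflow for the given input:
#
#   import luiginlp
#   luiginlp.run(Tokenize(inputfile='tweets.filtered.json', config='tokconfig-nld',
#                         strip_punctuation=True, lowercase=True))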
class ExtractCityref(StandardWorkflowComponent):

    citylist = Parameter()
    config = Parameter()
    strip_punctuation = BoolParameter()
    to_lowercase = BoolParameter()
    skip_date = BoolParameter()
    skip_month = BoolParameter()
    skip_timeunit = BoolParameter()
    skip_day = BoolParameter()
    format_json = BoolParameter()

    def accepts(self):
        return (InputFormat(self, format_id='dateref', extension='.dateref.json'),
                InputComponent(self, ExtractDateref, config=self.config,
                               strip_punctuation=self.strip_punctuation, to_lowercase=self.to_lowercase,
                               skip_datematch=self.skip_date, skip_monthmatch=self.skip_month,
                               skip_timeunitmatch=self.skip_timeunit, skip_daymatch=self.skip_day))

    def autosetup(self):
        return ExtractCityrefTask
class ExtractEntities(StandardWorkflowComponent):

    commonness_txt = Parameter()
    commonness_cls = Parameter()
    commonness_corpus = Parameter()
    ngrams_score = Parameter()
    config = Parameter()
    strip_punctuation = BoolParameter()
    to_lowercase = BoolParameter()
    skip_date = BoolParameter()
    skip_month = BoolParameter()
    skip_timeunit = BoolParameter()
    skip_day = BoolParameter()
    citylist = Parameter()
    format_json = BoolParameter()

    def accepts(self):
        return (InputFormat(self, format_id='cityref', extension='.json'),
                InputComponent(self, ExtractCityref, config=self.config,
                               strip_punctuation=self.strip_punctuation, to_lowercase=self.to_lowercase,
                               citylist=self.citylist, skip_date=self.skip_date, skip_month=self.skip_month,
                               skip_timeunit=self.skip_timeunit, skip_day=self.skip_day))

    def autosetup(self):
        return ExtractEntitiesTask
class FilterTweets(StandardWorkflowComponent):

    format_json = BoolParameter()

    def accepts(self):
        return InputFormat(self, format_id='tweets', extension='.gz')

    def autosetup(self):
        return FilterTweetsTask
class Folia2txt(Task):
    executable = 'folia2txt' #external executable (None if n/a)

    sentenceperline = BoolParameter(default=False)
    paragraphperline = BoolParameter(default=False)
    retaintokenisation = BoolParameter(default=False)

    in_folia = InputSlot() #will be linked to an out_* slot of another module in the workflow specification

    def out_txt(self): #output slot for plain text (the original name out_html was a misnomer)
        return self.outputfrominput(inputformat='folia', stripextension='.folia.xml', addextension='.txt')

    def run(self):
        self.ex(self.in_folia().path,
                o=self.out_txt().path,
                s=self.sentenceperline,
                p=self.paragraphperline,
                t=self.retaintokenisation)
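# self.ex() passes positional arguments through as-is and maps keyword
# arguments to option flags, dropping False/None values. The run() above thus
# amounts to roughly the following shell command (illustrative filenames,
# exact argument order depends on the self.ex implementation):
#
#   folia2txt -o document.txt -s -p -t document.folia.xml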
class UpdateEventTypesTask(Task):

    in_events = InputSlot()
    in_predictiondir = InputSlot()

    text = BoolParameter()

    def out_updated_events(self):
        return self.outputfrominput(inputformat='events', stripextension='.events.integrated', addextension='.types.events.integrated')

    def run(self):
        # read prediction data
        with open(self.in_predictiondir().path + '/events_meta.txt', 'r', encoding='utf-8') as file_in:
            meta = file_in.read().strip().split('\n')
        with open(self.in_predictiondir().path + '/events_text.predictions.txt', 'r', encoding='utf-8') as file_in:
            predictions = file_in.read().strip().split('\n')
        with open(self.in_predictiondir().path + '/events_text.full_predictions.txt', 'r', encoding='utf-8') as file_in:
            lines = file_in.read().strip().split('\n')
        label_order = lines[0].split('\t')
        full_predictions = [line.split('\t') for line in lines[1:]]
        print('Meta', len(meta))
        print('Predictions', len(predictions))
        print('Full predictions', len(full_predictions))

        # read in events
        print('Reading in events')
        with open(self.in_events().path, 'r', encoding='utf-8') as file_in:
            eventdicts = json.loads(file_in.read())
        event_objs = []
        for ed in eventdicts:
            eventobj = event.Event()
            eventobj.import_eventdict(ed, txt=self.text)
            event_objs.append(eventobj)

        # index events by id
        id_event = {}
        for eo in event_objs:
            id_event[eo.mongo_id] = eo

        # attach the predicted event type and per-label scores to each event
        for i, mid in enumerate(meta):
            prediction = predictions[i]
            prediction_score = dict(zip(label_order, full_predictions[i]))
            eo = id_event[mid]
            eo.eventtype = prediction
            eo.eventtype_scores = prediction_score

        # write output (loop variable renamed so it no longer shadows the event module)
        out_updated_events = [eo.return_dict(txt=self.text) for eo in event_objs]
        with open(self.out_updated_events().path, 'w', encoding='utf-8') as file_out:
            json.dump(out_updated_events, file_out)
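# The parsing above implies the following layout for the prediction directory
# (file names come from the code; the contents shown are hypothetical):
#
#   events_meta.txt                   one event (mongo) id per line
#   events_text.predictions.txt       one predicted label per line, aligned with events_meta.txt
#   events_text.full_predictions.txt  a tab-separated header with the label order,
#                                     then one tab-separated score row per event, e.g.:
#
#       sport	concert	other
#       0.8	0.1	0.1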
class Ucto(StandardWorkflowComponent):
    """A workflow component for Ucto"""

    skip = Parameter(default="") #A parameter for the workflow, will be passed on to the tasks
    language = Parameter()
    tok_input_sentenceperline = BoolParameter(default=False)
    tok_output_sentenceperline = BoolParameter(default=False)

    def autosetup(self):
        return (Ucto_txt2folia, Ucto_folia2folia, Ucto_tok2folia)

    def accepts(self):
        """Returns a tuple of all the initial inputs and other workflows this component accepts as input (a disjunction, only one will be selected)"""
        return (InputFormat(self, format_id='folia', extension='folia.xml'),
                InputFormat(self, format_id='txt', extension='txt'),
                InputFormat(self, format_id='tok', extension='tok'),
                InputComponent(self, ConvertToFoLiA))
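# Illustrative invocation sketch (assumes luiginlp.run; the filename is
# hypothetical). Depending on the input extension (.folia.xml, .txt or .tok),
# autosetup() selects the matching Ucto_* task automatically:
#
#   luiginlp.run(Ucto(inputfile='document.txt', language='nld'))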
class Tokenize_instances(Task):
    """Tokenizes a file with one document per line"""

    in_filtered = InputSlot()

    config = Parameter()
    strip_punctuation = BoolParameter()
    lowercase = BoolParameter()

    def out_tokenized(self):
        return self.outputfrominput(inputformat='filtered', stripextension='.filtered.json', addextension='.tok.json')

    def run(self):
        print('Running Tokenizer...')

        # read in filtered tweets
        with open(self.in_filtered().path, 'r', encoding='utf-8') as file_in:
            tweets = json.load(file_in)

        # tokenize the text field of every tweet
        toktweets = []
        tokenizer = ucto.Tokenizer(self.config)
        for tweet in tweets:
            text = tweet['text']
            tokenizer.process(text)
            tokens = []
            for token in tokenizer:
                if not (self.strip_punctuation and token.tokentype == 'PUNCTUATION'):
                    tokens.append(token.text)
            tokenized = ' '.join(tokens)
            if self.lowercase:
                tokenized = tokenized.lower()
            tweet['text'] = tokenized
            toktweets.append(tweet)

        # write to file
        with open(self.out_tokenized().path, 'w', encoding='utf-8') as file_out:
            json.dump(toktweets, file_out)
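# The underlying python-ucto pattern used above, standalone (a sketch; the
# config path is an assumption, ucto ships e.g. a tokconfig-nld config for Dutch):
#
#   import ucto
#   tokenizer = ucto.Tokenizer('tokconfig-nld')
#   tokenizer.process('Dit is een zin.')
#   tokens = [token.text for token in tokenizer
#             if token.tokentype != 'PUNCTUATION']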
class ExtractDaterefTask(Task):

    in_tokenized = InputSlot()

    skip_datematch = BoolParameter()
    skip_monthmatch = BoolParameter()
    skip_timeunitmatch = BoolParameter()
    skip_daymatch = BoolParameter()

    def out_dateref(self):
        return self.outputfrominput(inputformat='tokenized', stripextension='.tok.json', addextension='.dateref.json')

    def run(self):
        # read in tweets
        with open(self.in_tokenized().path, 'r', encoding='utf-8') as file_in:
            tweetdicts = json.loads(file_in.read())

        # format as tweet objects
        tweets = []
        for td in tweetdicts:
            tweetobj = tweet.Tweet()
            tweetobj.import_tweetdict(td)
            tweets.append(tweetobj)

        # extract daterefs
        for tweetobj in tweets:
            dte = dutch_timex_extractor.Dutch_timex_extractor(tweetobj.text, tweetobj.datetime)
            dte.extract_refdates(self.skip_datematch, self.skip_monthmatch, self.skip_timeunitmatch, self.skip_daymatch)
            dte.filter_future_refdates()
            tweetobj.set_refdates(dte.refdates)

        # write to file
        outtweets = [tweetobj.return_dict() for tweetobj in tweets]
        with open(self.out_dateref().path, 'w', encoding='utf-8') as file_out:
            json.dump(outtweets, file_out)
class Ucto_txt2tok(Task):
    executable = 'ucto' #external executable (None if n/a)

    #Parameters for this module (all mandatory!)
    language = Parameter()
    tok_input_sentenceperline = BoolParameter(default=False)
    tok_output_sentenceperline = BoolParameter(default=False)

    in_txt = InputSlot() #will be linked to an out_* slot of another module in the workflow specification

    def out_tok(self):
        return self.outputfrominput(inputformat='txt', stripextension='.txt', addextension='.tok')

    def run(self):
        self.ex(self.in_txt().path, self.out_tok().path,
                L=self.language,
                m=self.tok_input_sentenceperline,
                n=self.tok_output_sentenceperline)
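# With self.ex() mapping keyword arguments to flags, this task boils down to
# roughly the following command (illustrative filenames; -m and -n are only
# passed when the corresponding BoolParameters are set):
#
#   ucto -L nld -m -n input.txt input.tok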
class Ucto_dir(StandardWorkflowComponent):
    """A workflow component for Ucto that operates on entire directories"""

    skip = Parameter(default="") #A parameter for the workflow, will be passed on to the tasks
    language = Parameter()
    tok_input_sentenceperline = BoolParameter(default=False)
    tok_output_sentenceperline = BoolParameter(default=False)

    def autosetup(self):
        return (Ucto_txt2folia_dir, Ucto_folia2folia_dir)

    def accepts(self):
        """Returns a tuple of all the initial inputs and other workflows this component accepts as input (a disjunction, only one will be selected)"""
        return (InputFormat(self, format_id='txtdir', extension='txtdir', directory=True),
                InputFormat(self, format_id='foliadir', extension='foliadir', directory=True))
class CollatePDF(Task):
    """Collate multiple PDF files together"""
    executable = 'pdftk'

    naturalsort = BoolParameter(default=True) #do a natural sort of all pdfs in the input directory

    in_pdfdir = InputSlot()

    def out_pdf(self):
        return self.outputfrominput(inputformat='pdfdir', stripextension='.pdfdir', addextension='.pdf')

    def run(self):
        pdf_files = glob.glob(self.in_pdfdir().path + '/*.pdf') #collect all pdf files in collection
        if self.naturalsort:
            pdf_files = natsort.natsorted(pdf_files)
        args = pdf_files + ['output', self.out_pdf().path]
        self.ex(*args)
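# pdftk concatenates when given multiple input files followed by
# 'output <file>', so the constructed argument list is equivalent to
# (illustrative filenames):
#
#   pdftk 01.pdf 02.pdf 03.pdf output collated.pdf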
class UpdateEventTypes(WorkflowComponent):

    events = Parameter()
    predictiondir = Parameter()

    text = BoolParameter()

    def accepts(self):
        return [(
            InputFormat(self, format_id='predictiondir', extension='.instances', inputparameter='predictiondir'),
            InputFormat(self, format_id='events', extension='.events.integrated', inputparameter='events')
        )]

    def setup(self, workflow, input_feeds):
        event_type_updater = workflow.new_task('update_event_types', UpdateEventTypesTask, autopass=True, text=self.text)
        event_type_updater.in_events = input_feeds['events']
        event_type_updater.in_predictiondir = input_feeds['predictiondir']
        return event_type_updater
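# Unlike the StandardWorkflowComponents above, this component takes two inputs,
# so it wires the task explicitly in setup() rather than via autosetup().
# Illustrative invocation sketch (assumes luiginlp.run; paths hypothetical):
#
#   luiginlp.run(UpdateEventTypes(events='amsterdam.events.integrated',
#                                 predictiondir='eventtypes.instances',
#                                 text=True))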
class Timbl_leaveoneout(Timbl_base):

    in_train = InputSlot()

    leaveoneout = BoolParameter(default=False)

    def out_log(self):
        return self.outputfrominput(inputformat='train', stripextension='.train', addextension='.timbl.leaveoneout.log')

    def run(self):
        self.ex(f=self.in_train().path,
                t="leave_one_out",
                a=self.algorithm,
                k=self.k,
                m=self.metric,
                w=self.weighting,
                d=self.distance,
                __stdout_to=self.out_log().path)
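# The resulting TiMBL call is roughly the following (illustrative option
# values; -t leave_one_out makes TiMBL test each training instance against
# all remaining ones, and __stdout_to redirects stdout to the log):
#
#   timbl -f data.train -t leave_one_out -a 0 -k 1 -m M -w gr -d Z > data.timbl.leaveoneout.log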
class FilterTweetsTask(Task):

    in_tweets = InputSlot() #input slot for a gzipped tweet file

    format_json = BoolParameter()

    def out_filtered(self):
        return self.outputfrominput(inputformat='tweets', stripextension='.gz', addextension='.filtered.json')

    def run(self):
        # read in gzipped tweet file
        tweets = []
        for line in io.TextIOWrapper(io.BufferedReader(gzip.open(self.in_tweets().path)), encoding='utf-8', errors='ignore'):
            if self.format_json:
                try:
                    tweets.append(json.loads(line.strip()))
                except json.JSONDecodeError:
                    print('Error loading json, skipping to next line')
            else:
                tweets.append(line.strip())
        print(self.in_tweets().path, 'contains', len(tweets), 'tweets before filtering')

        # filter out retweets and non-Dutch tweets
        tf = tweetfilter.Tweetfilter(tweets)
        tf.discard_retweets()
        print('after retweet filter', len(tf.tweets))
        tf.discard_nondutch()
        filtered_tweets = tf.return_tweets()
        print('after filtering:', len(filtered_tweets))

        # convert filtered tweets to standard tweet dicts
        outtweets = []
        for filtered_tweet in filtered_tweets:
            tweetobj = tweet.Tweet()
            tweetobj.import_twiqsdict(filtered_tweet)
            outtweets.append(tweetobj.return_dict())

        # write to file
        with open(self.out_filtered().path, 'w', encoding='utf-8') as outfile:
            json.dump(outtweets, outfile)
class ExtractDateref(StandardWorkflowComponent):

    skip_datematch = BoolParameter()
    skip_monthmatch = BoolParameter()
    skip_timeunitmatch = BoolParameter()
    skip_daymatch = BoolParameter()
    config = Parameter(default=False)
    strip_punctuation = BoolParameter()
    to_lowercase = BoolParameter()
    format_json = BoolParameter()

    def accepts(self):
        return (InputFormat(self, format_id='tokenized', extension='tok.json'),
                InputComponent(self, Tokenize, config=self.config,
                               strip_punctuation=self.strip_punctuation, lowercase=self.to_lowercase))

    def autosetup(self):
        return ExtractDaterefTask
class Frog_txt2folia(Task):
    """A task for Frog: Takes plaintext input and produces FoLiA output"""
    executable = 'frog' #external executable (None if n/a)

    #Parameters for this module (all mandatory!)
    tok_input_sentenceperline = BoolParameter(default=False)
    skip = Parameter(default="")

    in_txt = InputSlot() #input slot placeholder (will be linked to an out_* slot of another module in the workflow specification)

    def out_folia(self):
        """The output slot, for FoLiA"""
        return self.outputfrominput(inputformat='txt', stripextension='.txt', addextension='.frogged.folia.xml') #the format_id corresponds to the input slot (txt -> in_txt)

    def run(self):
        #Executes a shell command: python keyword arguments are passed as option flags (- for one letter, -- for more),
        #values are made shell-safe, and None or False values are not propagated at all.
        self.ex(
            t=self.in_txt().path, #the path of the input file (accessed through the input slot)
            X=self.out_folia().path, #the path of the output file (accessed through the output slot)
            id=os.path.basename(self.in_txt().path).split('.')[0], #first component of input filename (up to first period) becomes the FoLiA ID
            skip=self.skip if self.skip else None,
            n=self.tok_input_sentenceperline)
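# Putting the flag mapping together, the task executes roughly the following
# command (illustrative filenames; --skip and -n are only included when set):
#
#   frog -t document.txt -X document.frogged.folia.xml --id=document --skip=mp -n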