class Timbl_test(Timbl_base):
    in_ibase = InputSlot() #input slot
    in_wgt = InputSlot()
    in_test = InputSlot()

    def out_timbl(self):
        return self.outputfrominput(inputformat='test', stripextension='.test', addextension='.timbl.out')

    def out_log(self):
        return self.outputfrominput(inputformat='test', stripextension='.test', addextension='.timbl.test.log')

    def run(self):
        self.ex(
            i=self.in_ibase().path,
            t=self.in_test().path,
            w=self.in_wgt().path + ':' + self.weighting,
            o=self.out_timbl().path,
            a=self.algorithm,
            k=self.k,
            m=self.metric,
            d=self.distance,
            __stdout_to=self.out_log().path)
class UpdateEventTypesTask(Task):

    in_events = InputSlot()
    in_predictiondir = InputSlot()

    text = BoolParameter()

    def out_updated_events(self):
        return self.outputfrominput(inputformat='events', stripextension='.events.integrated', addextension='.types.events.integrated')

    def run(self):

        # read prediction data
        with open(self.in_predictiondir().path + '/events_meta.txt', 'r', encoding='utf-8') as file_in:
            meta = file_in.read().strip().split('\n')
        with open(self.in_predictiondir().path + '/events_text.predictions.txt', 'r', encoding='utf-8') as file_in:
            predictions = file_in.read().strip().split('\n')
        with open(self.in_predictiondir().path + '/events_text.full_predictions.txt', 'r', encoding='utf-8') as file_in:
            lines = file_in.read().strip().split('\n')
            label_order = lines[0].split('\t')
            full_predictions = [line.split('\t') for line in lines[1:]]
        print('Meta', len(meta))
        print('Predictions', len(predictions))
        print('Full predictions', len(full_predictions))

        # read in events
        print('Reading in events')
        with open(self.in_events().path, 'r', encoding='utf-8') as file_in:
            eventdicts = json.loads(file_in.read())
        event_objs = []
        for ed in eventdicts:
            eventobj = event.Event()
            eventobj.import_eventdict(ed, txt=self.text)
            event_objs.append(eventobj)

        # index events by mongo id
        id_event = {}
        for eo in event_objs:
            id_event[eo.mongo_id] = eo

        # attach each prediction to its event
        for i, mid in enumerate(meta):
            prediction = predictions[i]
            prediction_score = dict(zip(label_order, full_predictions[i]))
            eo = id_event[mid]
            eo.eventtype = prediction
            eo.eventtype_scores = prediction_score

        # write output
        out_updated_events = [eo.return_dict(txt=self.text) for eo in event_objs]
        with open(self.out_updated_events().path, 'w', encoding='utf-8') as file_out:
            json.dump(out_updated_events, file_out)
class ArchiveEventsTask(Task):

    in_events = InputSlot()
    in_archivedir = InputSlot()

    archivedate = Parameter()

    def out_archived(self):
        return self.outputfrominput(inputformat='archivedir', stripextension='.archive', addextension='.archive/events_' + self.archivedate + '.json')

    def out_active_events(self):
        return self.outputfrominput(inputformat='events', stripextension='.events.integrated', addextension='.active.events.integrated')

    def run(self):

        # read events
        archive_events = []
        active_events = []
        date = datetime.datetime(int(self.archivedate[:4]), int(self.archivedate[4:6]), int(self.archivedate[6:8]))
        print('Reading events')
        with open(self.in_events().path, 'r', encoding='utf-8') as file_in:
            eventdicts = json.loads(file_in.read())
        for i, ed in enumerate(eventdicts):
            eventobj = event.Event()
            eventobj.import_eventdict(ed)
            if eventobj.datetime == date:
                archive_events.append(eventobj)
            else:
                active_events.append(eventobj)

        # write archive
        print('Writing archive')
        out_archive_events = [eo.return_dict(txt=False) for eo in archive_events]
        with open(self.out_archived().path, 'w', encoding='utf-8') as file_out:
            json.dump(out_archive_events, file_out)

        # write active events
        print('Writing active events')
        out_active_events = [eo.return_dict(txt=False) for eo in active_events]
        with open(self.out_active_events().path, 'w', encoding='utf-8') as file_out:
            json.dump(out_active_events, file_out)
class LowercaseVoweleaterDirTask2(Task):

    in_txtdir = InputSlot()

    extension = Parameter(default='txt')

    def out_txtdir(self):
        return self.outputfrominput(inputformat='txtdir', stripextension='.txtdir', addextension='.lcnv.txtdir')

    def run(self):
        #Set up the output directory, will create it and tear it down on failure automatically
        self.setup_output_dir(self.out_txtdir().path)

        #gather input files
        inputfiles = [ filename for filename in glob.glob(self.in_txtdir().path + '/*.' + self.extension) ]

        #inception aka dynamic dependencies: we yield a list of tasks to perform which could not have been predicted statically
        #in this case we run the Voweleater component (starting from its Lowercaser component) for each input file in the directory
        yield [ Voweleater(inputfile=inputfile, outputdir=self.out_txtdir().path, startcomponent='Lowercaser') for inputfile in inputfiles ]
class Timbl_train(Timbl_base):
    in_train = InputSlot() #input slot

    def out_ibase(self):
        return self.outputfrominput(inputformat='train', stripextension='.train', addextension='.ibase')

    def out_wgt(self):
        return self.outputfrominput(inputformat='train', stripextension='.train', addextension='.wgt')

    def out_log(self):
        return self.outputfrominput(inputformat='train', stripextension='.train', addextension='.timbl.train.log')

    def run(self):
        self.ex(
            f=self.in_train().path,
            I=self.out_ibase().path,
            W=self.out_wgt().path,
            a=self.algorithm,
            k=self.k,
            m=self.metric,
            w=self.weighting,
            d=self.distance,
            __stdout_to=self.out_log().path)
class Ucto_folia2folia_dir(Task):
    extension = Parameter(default="folia.xml")
    language = Parameter()

    in_foliadir = InputSlot() #input slot

    def out_tokfoliadir(self):
        return self.outputfrominput(inputformat='foliadir', stripextension='.foliadir', addextension='.tok.foliadir')

    def run(self):
        #Set up the output directory, will create it and tear it down on failure automatically
        self.setup_output_dir(self.out_tokfoliadir().path)

        #gather input files
        inputfiles = [ filename for filename in glob.glob(self.in_foliadir().path + '/*.' + self.extension) ]

        #inception aka dynamic dependencies: we yield a list of tasks to perform which could not have been predicted statically
        #in this case we run the Ucto component for each input file in the directory
        yield [ Ucto(inputfile=inputfile, inputslot='folia', outputdir=self.out_tokfoliadir().path, language=self.language) for inputfile in inputfiles ]
class TesseractOCR_document(Task):
    """OCR for a whole document (input is a directory of TIFF image files (pages), output is a directory of hOCR files)"""

    tiff_extension = Parameter(default='tif')
    language = Parameter()

    in_tiffdir = InputSlot() #input slot

    def out_hocrdir(self):
        return self.outputfrominput(inputformat='tiffdir', stripextension='.tiffdir', addextension='.hocrdir')

    def run(self):
        #Set up the output directory, will create it and tear it down on failure automatically
        self.setup_output_dir(self.out_hocrdir().path)

        #gather input files
        inputfiles = [ filename for filename in glob.glob(self.in_tiffdir().path + '/*.' + self.tiff_extension) ]

        #inception aka dynamic dependencies: we yield a list of tasks to perform which could not have been predicted statically
        #in this case we run the OCR_singlepage component for each input file in the directory
        yield [ OCR_singlepage(inputfile=inputfile, outputdir=self.out_hocrdir().path, language=self.language, tiff_extension=self.tiff_extension) for inputfile in inputfiles ]
class ScaleTestTask(Task):

    in_txtdir = InputSlot()

    n = IntParameter()

    def out_txtdir(self):
        return self.outputfrominput(inputformat='txtdir', stripextension='.txtdir', addextension='.out.txtdir')

    def run(self):
        self.setup_output_dir(self.out_txtdir().path)

        #gather input files
        log.info("Collecting input files...")
        inputfiles = [ os.path.join(self.in_txtdir().path, str(i) + '.txt') for i in range(1, self.n + 1) ]
        log.info("Collected " + str(len(inputfiles)) + " input files")

        #inception aka dynamic dependencies: we yield a list of tasks to perform which could not have been predicted statically
        for inputfiles_chunk in chunk(inputfiles, 1000):
            yield ParallelBatch(component='Voweleater', inputfiles=','.join(inputfiles_chunk), passparameters=PassParameters(outputdir=self.out_txtdir().path))
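# The chunk() helper used in ScaleTestTask.run() is not defined in this section;
# assuming it simply splits the file list into fixed-size batches, a minimal
# hypothetical stand-in (not the framework's own implementation) could be:
def chunk(items, size):
    """Yield successive slices of at most `size` items from a list."""
    for start in range(0, len(items), size):
        yield items[start:start + size]

# Example: list(chunk(['1.txt', '2.txt', '3.txt'], 2)) == [['1.txt', '2.txt'], ['3.txt']]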
class IndexAllTweetsTask(Task):

    in_tweetdir = InputSlot()

    def out_indexed_tweets(self):
        return self.outputfrominput(inputformat='tweetdir', stripextension='.tweets', addextension='.tweets_indexed.json')

    def run(self):

        # read in tweets
        indexed_tweets = {}
        tweetsubdirs = sorted([subdir for subdir in glob.glob(self.in_tweetdir().path + '/*')])
        for tweetsubdir in tweetsubdirs:
            print(tweetsubdir)
            # go through all tweet files
            tweetfiles = [ tweetfile for tweetfile in glob.glob(tweetsubdir + '/*.entity.json') ]
            for tweetfile in tweetfiles:
                tweetfilestr = '/'.join(tweetfile.split('/')[-2:])
                # read in tweets
                with open(tweetfile, 'r', encoding='utf-8') as file_in:
                    tweetdicts = json.loads(file_in.read())
                for i, td in enumerate(tweetdicts):
                    indexed_tweets[td['id']] = [tweetfilestr, i]

        # write to file
        with open(self.out_indexed_tweets().path, 'w', encoding='utf-8') as file_out:
            json.dump(indexed_tweets, file_out)
class Folia2html(Task):
    executable = 'folia2html' #external executable (None if n/a)

    in_folia = InputSlot() #will be linked to an out_* slot of another module in the workflow specification

    def out_html(self):
        return self.outputfrominput(inputformat='folia', stripextension='.folia.xml', addextension='.html')

    def run(self):
        self.ex(self.in_folia().path, o=self.out_html().path)
class Rst2folia(Task):
    executable = 'rst2folia' #external executable (None if n/a)

    in_rst = InputSlot() #will be linked to an out_* slot of another module in the workflow specification

    def out_folia(self):
        return self.outputfrominput(inputformat='rst', stripextension='.rst', addextension='.folia.xml')

    def run(self):
        self.ex(self.in_rst().path, self.out_folia().path,
            docid=os.path.basename(self.in_rst().path).split('.')[0]) #first component of the input filename (up to the first period) will be the FoLiA document ID
class ArchiveEventsTask(Task):

    in_events = InputSlot()

    def out_archivedir(self):
        return self.outputfrominput(inputformat='events', stripextension='.events', addextension='.archive')

    def out_active_events(self):
        return self.outputfrominput(inputformat='events', stripextension='.events', addextension='.active.events')

    def run(self):

        # initiate directory
        self.setup_output_dir(self.out_archivedir().path)

        # read events
        datebound = datetime.datetime.now() - datetime.timedelta(days=100)
        date_events = defaultdict(list)
        active_events = []
        print('Reading events')
        with open(self.in_events().path, 'r', encoding='utf-8') as file_in:
            eventdicts = json.loads(file_in.read())
        for i, ed in enumerate(eventdicts):
            eventobj = event.Event()
            eventobj.import_eventdict(ed)
            if eventobj.datetime < datebound:
                date_events[''.join(str(eventobj.datetime).split()[0].split('-'))].append(eventobj)
            else:
                active_events.append(eventobj)

        # write archives
        print('Writing archives')
        for date in sorted(list(date_events.keys())):
            print(date)
            events = date_events[date]
            out_events = [eo.return_dict(txt=False) for eo in events]
            outfile = self.out_archivedir().path + '/events_' + date + '.json'
            with open(outfile, 'w', encoding='utf-8') as file_out:
                json.dump(out_events, file_out)

        # write active events
        print('Writing active events')
        out_active_events = [eo.return_dict(txt=False) for eo in active_events]
        with open(self.out_active_events().path, 'w', encoding='utf-8') as file_out:
            json.dump(out_active_events, file_out)
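# Illustration of the archive-key expression used in ArchiveEventsTask.run():
# it collapses the date part of a datetime into a 'YYYYMMDD' string, which then
# names the per-day archive file. Small standalone example (values made up):
#
#   import datetime
#   dt = datetime.datetime(2016, 3, 7, 14, 30)
#   key = ''.join(str(dt).split()[0].split('-'))
#   assert key == '20160307'   # -> archive file events_20160307.json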
class ExtractEntitiesTask(Task):

    in_cityref = InputSlot()

    commonness_txt = Parameter()
    commonness_cls = Parameter()
    commonness_corpus = Parameter()
    ngrams_score = Parameter()

    def out_entity(self):
        return self.outputfrominput(inputformat='cityref', stripextension='.json', addextension='.entity.json')

    def run(self):

        # set commonness object
        cs = commonness.Commonness()
        cs.set_classencoder(self.commonness_txt, self.commonness_cls, self.commonness_corpus)
        cs.set_dmodel(self.ngrams_score)

        # read in tweets
        with open(self.in_cityref().path, 'r', encoding='utf-8') as file_in:
            tweetdicts = json.loads(file_in.read())

        # format as tweet objects
        tweets = []
        for td in tweetdicts:
            tweetobj = tweet.Tweet()
            tweetobj.import_tweetdict(td)
            tweets.append(tweetobj)

        # extract entities
        for tweetobj in tweets:
            # remove already extracted time and location references from the tweet, splitting it into chunks
            datestrings = [sr[0] for sr in tweetobj.string_refdates]
            cities = tweetobj.cityrefs
            tweet_chunks = helpers.remove_pattern_from_string(tweetobj.text, datestrings + cities)
            # find entities in every chunk
            ee = entity_extractor.EntityExtractor()
            ee.set_commonness(cs)
            for chunk in tweet_chunks:
                tokens = chunk.split()
                ee.extract_entities(tokens)
            ee.filter_entities_threshold()
            tweetobj.set_entities(ee.entities)

        # write to file
        outtweets = [tw.return_dict() for tw in tweets]
        with open(self.out_entity().path, 'w', encoding='utf-8') as file_out:
            json.dump(outtweets, file_out)
class Alpino2folia(Task):
    executable = 'alpino2folia'

    in_alpinodocdir = InputSlot()

    def out_folia(self):
        return self.outputfrominput(inputformat='alpinodocdir', stripextension='.alpinodocdir', addextension='.folia.xml')

    def run(self):
        #collect all alpino files in the collection, sorted numerically by the first component of their filename
        alpinofiles = [ alpinofile for alpinofile in sorted(glob.glob(self.in_alpinodocdir().path + '/*.xml'), key=lambda x: int(os.path.basename(x).split('.')[0])) ]
        args = alpinofiles + [self.out_folia().path] #last argument is the FoLiA output file
        self.ex(*args)
class IntegrateEventDirTask(Task):
    ### task to speed up event integration for sliding-window event extraction
    ### make sure that all events in the directory are deduplicated and enhanced before running this task
    ### only files with the extension '.enhanced' will be integrated

    in_eventdir = InputSlot()

    overlap_threshold = Parameter()

    def out_integrated_events(self):
        return self.outputfrominput(inputformat='eventdir', stripextension='.events', addextension='.events.integrated')

    def run(self):

        # collect all event files with extension '.enhanced'
        enhanced_events = glob.glob(self.in_eventdir().path + '/*.enhanced')

        # initialize
        merger = event_merger.EventMerger()
        overlap_threshold = float(self.overlap_threshold)

        # for each event file
        for eventfile in enhanced_events:
            print('Reading', eventfile)
            with open(eventfile, 'r', encoding='utf-8') as file_in:
                current_eventdicts = json.loads(file_in.read())
            new_event_objs = []
            for ed in current_eventdicts:
                eventobj = event.Event()
                eventobj.import_eventdict(ed)
                new_event_objs.append(eventobj)

            # merge before integration
            print('Merging new events before integration; number of events at start:', len(new_event_objs))
            premerger = event_merger.EventMerger()
            premerger.add_events(new_event_objs)
            premerger.find_merges(overlap_threshold)
            new_events_merged = premerger.return_events()
            print('Done. New events after merge:', len(new_events_merged))

            if len(merger.events) == 0:
                merger.add_events(new_events_merged)
            else:
                # integrate each new event into the current ones
                print('Starting to integrate new events; number of current events:', len(merger.events))
                for new_event in new_events_merged:
                    merger.find_merge(new_event, overlap_threshold)

        # write merged events
        integrated_events = merger.return_events()
        print('Done. Number of events after integration:', len(integrated_events))
        out_integrated_events = [eo.return_dict() for eo in integrated_events]
        with open(self.out_integrated_events().path, 'w', encoding='utf-8') as file_out:
            json.dump(out_integrated_events, file_out)
class Foliacat(Task):
    executable = 'foliacat'

    extension = Parameter(default='folia.xml')

    in_foliadir = InputSlot()

    def out_folia(self):
        return self.outputfrominput(inputformat='foliadir', stripextension='.foliadir', addextension='.folia.xml')

    def run(self):
        foliafiles = [ filename for filename in natsort.natsorted(glob.glob(self.in_foliadir().path + '/*.' + self.extension)) ]
        self.ex(*foliafiles,
            o=self.out_folia().path,
            i=self.out_folia().path.split('.')[0]) #first component of the filename acts as the document ID
class VoweleaterTask(Task):
    """Example of a task that invokes an external tool and uses stdin and stdout. This one simply removes vowels from a text."""
    executable = 'sed'

    in_txt = InputSlot()

    encoding = Parameter(default='utf-8')

    def out_txt(self):
        return self.outputfrominput(inputformat='txt', stripextension='.txt', addextension='.novowels.txt')

    def run(self):
        self.ex(e='s/[aeiouAEIOU]//g', __stdin_from=self.in_txt().path, __stdout_to=self.out_txt().path)
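# For reference, a standalone sketch of the command VoweleaterTask.run() asks
# self.ex() to execute (sed -e 's/[aeiouAEIOU]//g' < in.txt > out.txt), written
# as a plain subprocess call outside the workflow framework. Illustrative only;
# the framework handles stdin/stdout redirection itself via __stdin_from/__stdout_to.
import subprocess

def remove_vowels(inputfile, outputfile):
    """Strip vowels from inputfile into outputfile using sed."""
    with open(inputfile, 'rb') as stdin, open(outputfile, 'wb') as stdout:
        subprocess.run(['sed', '-e', 's/[aeiouAEIOU]//g'], stdin=stdin, stdout=stdout, check=True)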
class PrepareInstancesIdsTask(Task):

    in_events = InputSlot()

    def out_instances(self):
        return self.outputfrominput(inputformat='events', stripextension='.events.integrated', addextension='.events.instances')

    def run(self):

        # initiate directory with instances
        self.setup_output_dir(self.out_instances().path)

        # read in events
        print('Reading in events')
        with open(self.in_events().path, 'r', encoding='utf-8') as file_in:
            eventdicts = json.loads(file_in.read())

        # extract information
        print('Extracting text')
        ids = []
        txt = []
        counter = list(range(0, len(eventdicts), 1000))
        for i, ed in enumerate(eventdicts):
            if i in counter:
                print('Event', i, 'of', len(eventdicts))
            tweetstxt = []
            for tweettext in [' '.join([tweet['user'], tweet['text']]) for tweet in ed['tweets']] + [' '.join([tweet['user'], tweet['text']]) for tweet in ed['tweets_added']]:
                if re.search('http', tweettext):
                    tokens = tweettext.split()
                    for j, token in enumerate(tokens):
                        if token[:4] == 'http':
                            tokens[j] = 'THISISATWITTERLINK'
                    tweetstxt.append(' '.join(tokens).replace('\n', ' ').replace('\r', ' '))
                else:
                    tweetstxt.append(tweettext.replace('\n', ' ').replace('\r', ' '))
            if ' '.join(tweetstxt).strip() == '':
                continue
            else:
                ids.append(ed['mongo_id'])
                txt.append(' '.join(tweetstxt))

        # write data
        print('Done. Writing to files')
        with open(self.out_instances().path + '/events_meta.txt', 'w', encoding='utf-8') as out:
            out.write('\n'.join(ids))
        with open(self.out_instances().path + '/events_text.txt', 'w', encoding='utf-8') as out:
            out.write('\n'.join(txt))
class LowercaseTask(Task):
    """A simple task, implemented in Python"""

    in_txt = InputSlot()

    encoding = Parameter(default='utf-8')

    def out_txt(self):
        return self.outputfrominput(inputformat='txt', stripextension='.txt', addextension='.lowercase.txt')

    def run(self):
        with open(self.in_txt().path, 'r', encoding=self.encoding) as f_in:
            with open(self.out_txt().path, 'w', encoding=self.encoding) as f_out:
                f_out.write(f_in.read().lower())
class Frog_folia2folia(Task):
    executable = 'frog' #external executable (None if n/a)

    #Parameters for this module (all mandatory!)
    skip = Parameter(default="")

    in_folia = InputSlot() #will be linked to an out_* slot of another module in the workflow specification

    def out_folia(self):
        return self.outputfrominput(inputformat='folia', stripextension='.folia.xml', addextension='.frogged.folia.xml')

    def run(self):
        self.ex(
            x=self.in_folia().path,
            X=self.out_folia().path,
            skip=self.skip if self.skip else None)
class FoliaHOCR(Task):
    """Converts a directory of hOCR files to a directory of FoLiA files"""
    executable = "FoLiA-hocr"

    threads = Parameter(default=1)

    in_hocrdir = InputSlot()

    def out_foliadir(self):
        """Directory of FoLiA documents, one per hOCR file"""
        return self.outputfrominput(inputformat='hocrdir', stripextension='.hocrdir', addextension='.foliadir')

    def run(self):
        self.setup_output_dir(self.out_foliadir().path)
        self.ex(self.in_hocrdir().path, t=self.threads, O=self.out_foliadir().path)
class CollatePDF(Task):
    """Collate multiple PDF files together"""
    executable = 'pdftk'

    naturalsort = BoolParameter(default=True) #do a natural sort of all PDFs in the input directory

    in_pdfdir = InputSlot()

    def out_pdf(self):
        return self.outputfrominput(inputformat='pdfdir', stripextension='.pdfdir', addextension='.pdf')

    def run(self):
        pdf_files = [ pdffile for pdffile in glob.glob(self.in_pdfdir().path + '/*.pdf') ] #collect all PDF files in the collection
        if self.naturalsort:
            pdf_files = natsort.natsorted(pdf_files)
        args = pdf_files + ['output', self.out_pdf().path]
        self.ex(*args)
class Pdf2images(Task):
    """Extract images from a PDF document to a set of TIFF images"""
    executable = 'pdfimages' #external executable (None if n/a)

    in_pdf = InputSlot() #will be linked to an out_* slot of another module in the workflow specification

    def out_tiffdir(self):
        return self.outputfrominput(inputformat='pdf', stripextension='.pdf', addextension='.tiffdir')

    def run(self):
        #we use a DirectoryHandler that creates a temporary directory to hold all output, renames it to the final directory when everything succeeds, and cleans up otherwise
        with DirectoryHandler(self.out_tiffdir().path) as dirhandler:
            self.ex(self.in_pdf().path,
                dirhandler.directory + '/' + os.path.basename(self.in_pdf().path).split('.')[0], #output to the temporary directory, using the input's basename as file prefix
                tiff=True,
                p=True,
                __singlehyphen=True, #use single hyphens even for multi-letter options
            )
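# Roughly the command line that Pdf2images.run() builds via self.ex(), shown as
# a plain subprocess call for clarity. Illustrative sketch only: the file names
# are made up, and the temporary-directory handling of DirectoryHandler is
# replaced by a simple makedirs here.
import os
import subprocess

def pdf_to_tiffs(pdf, outdir):
    """Run pdfimages with TIFF output and page numbers in the output names."""
    os.makedirs(outdir, exist_ok=True)
    prefix = os.path.join(outdir, os.path.basename(pdf).split('.')[0])
    # e.g. pdfimages -tiff -p scan.pdf scan.tiffdir/scan
    subprocess.run(['pdfimages', '-tiff', '-p', pdf, prefix], check=True)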
class FilterEventsTask(Task):

    in_merged_events = InputSlot()

    citylist = Parameter()

    def out_filtered_events(self):
        return self.outputfrominput(inputformat='merged_events', stripextension='.merged', addextension='.filtered')

    def run(self):

        # read in events
        print('Reading in events')
        with open(self.in_merged_events().path, 'r', encoding='utf-8') as file_in:
            eventdicts = json.loads(file_in.read())
        event_objs = []
        for ed in eventdicts:
            eventobj = event.Event()
            eventobj.import_eventdict(ed, txt=False)
            event_objs.append(eventobj)

        # read in citylist
        print('Reading in citylist')
        with open(self.citylist, 'r', encoding='utf-8') as file_in:
            citylist = [ line.strip() for line in file_in.read().strip().split('\n') ]

        # initialize event filter
        print('Filtering; number of events at start:', len(event_objs))
        eventfilter = event_filter.EventFilter()
        eventfilter.add_events(event_objs)
        eventfilter.apply_filter(citylist)
        events_filtered = eventfilter.return_events()
        print('Done. Number of events after filtering:', len(events_filtered))

        # write filtered events
        out_filtered_events = [eo.return_dict(txt=False) for eo in events_filtered]
        with open(self.out_filtered_events().path, 'w', encoding='utf-8') as file_out:
            json.dump(out_filtered_events, file_out)
class ExtractCityrefTask(Task):

    in_dateref = InputSlot()

    citylist = Parameter()

    def out_cityref(self):
        return self.outputfrominput(inputformat='dateref', stripextension='.json', addextension='.cityref.json')

    def run(self):

        # read in citylist
        with open(self.citylist, 'r', encoding='utf-8') as file_in:
            citylist = [ line.strip() for line in file_in.read().strip().split('\n') ]

        # read in tweets
        with open(self.in_dateref().path, 'r', encoding='utf-8') as file_in:
            tweetdicts = json.loads(file_in.read())

        # format as tweet objects
        tweets = []
        for td in tweetdicts:
            tweetobj = tweet.Tweet()
            tweetobj.import_tweetdict(td)
            tweets.append(tweetobj)

        # extract locations
        for tweetobj in tweets:
            # remove already extracted time references from the tweet, splitting it into chunks
            datestrings = [sr[0] for sr in tweetobj.string_refdates]
            tweet_chunks = helpers.remove_pattern_from_string(tweetobj.text, datestrings)
            # extract city references from the chunks
            ce = cityref_extractor.CityrefExtractor(citylist)
            for chunk in tweet_chunks:
                ce.find_cityrefs(chunk)
            tweetobj.set_cityrefs(ce.return_cityrefs())

        # write to file
        outtweets = [tw.return_dict() for tw in tweets]
        with open(self.out_cityref().path, 'w', encoding='utf-8') as file_out:
            json.dump(outtweets, file_out)
class FoliaValidatorDirTask(Task):
    executable = "foliavalidator"

    in_foliadir = InputSlot()

    folia_extension = Parameter(default='folia.xml')

    def out_validationsummary(self):
        return self.outputfrominput(inputformat='foliadir', stripextension='.foliadir', addextension='.folia-validation-summary.txt')

    def run(self):
        #gather input files
        if self.outputdir and not os.path.exists(self.outputdir):
            os.makedirs(self.outputdir)

        log.info("Collecting input files...")
        inputfiles = recursive_glob(self.in_foliadir().path, '*.' + self.folia_extension)
        log.info("Collected " + str(len(inputfiles)) + " input files")

        log.info("Scheduling validators")
        if self.outputdir:
            passparameters = PassParameters(folia_extension=self.folia_extension, replaceinputdir=self.in_foliadir().path, outputdir=self.outputdir)
        else:
            passparameters = PassParameters(folia_extension=self.folia_extension)
        yield [ FoliaValidator(inputfile=inputfile, passparameters=passparameters) for inputfile in inputfiles ]

        #Gather all output files
        log.info("Collecting output files...")
        if self.outputdir:
            outputfiles = recursive_glob(self.outputdir, '*.folia-validation-report.txt')
        else:
            outputfiles = recursive_glob(self.in_foliadir().path, '*.folia-validation-report.txt')

        log.info("Writing summary")
        with open(self.out_validationsummary().path, 'w', encoding='utf-8') as f_summary:
            for outputfilename in outputfiles:
                with open(outputfilename, 'r', encoding='utf-8') as f:
                    success = False
                    for line in f:
                        if line.startswith('Validated successfully'):
                            success = True
                            break
                if success:
                    f_summary.write(outputfilename + ": OK\n")
                else:
                    f_summary.write(outputfilename + ": ERROR\n")
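# The recursive_glob() helper used above is not defined in this section; it is
# assumed to walk a directory tree and return all files whose names match a
# glob-style pattern. A minimal hypothetical stand-in could look like this:
import fnmatch
import os

def recursive_glob(rootdir, pattern):
    """Return all files under rootdir (recursively) matching the glob pattern."""
    matches = []
    for dirpath, _dirnames, filenames in os.walk(rootdir):
        for filename in fnmatch.filter(filenames, pattern):
            matches.append(os.path.join(dirpath, filename))
    return matches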
class CombineEntityTimeseriesTask(Task):

    in_entity_counts_dir = InputSlot()

    def out_combined_counts(self):
        return self.outputfrominput(inputformat='entity_counts_dir', stripextension='.timeseries', addextension='.timeseries/combined.counts.npz')

    def out_combined_vocabulary(self):
        return self.outputfrominput(inputformat='entity_counts_dir', stripextension='.timeseries', addextension='.timeseries/combined.counts_vocabulary')

    def out_combined_dateseries(self):
        return self.outputfrominput(inputformat='entity_counts_dir', stripextension='.timeseries', addextension='.timeseries/combined.counts_dates')

    def run(self):

        # read entity counts
        print('Reading countfiles')
        countfiles = sorted([countfile for countfile in glob.glob(self.in_entity_counts_dir().path + '/20*' + 'counts.npz')])
        vocabularies = sorted([vocabulary for vocabulary in glob.glob(self.in_entity_counts_dir().path + '/20*' + 'counts_vocabulary')])
        datefiles = sorted([datesequence for datesequence in glob.glob(self.in_entity_counts_dir().path + '/20*' + 'counts_dates')])
        print(len(countfiles), 'Countfiles and', len(vocabularies), 'Vocabulary files and', len(datefiles), 'datefiles')
        dates = []
        counts = []
        for j, countfile in enumerate(countfiles):
            print(countfile)
            with open(datefiles[j], 'r', encoding='utf-8') as file_in:
                dates.extend(file_in.read().strip().split('\n'))
            loader = numpy.load(countfile)
            counts.append(sparse.csr_matrix((loader['data'], loader['indices'], loader['indptr']), shape=loader['shape']))
            with open(vocabularies[j], 'r', encoding='utf-8') as file_in:
                vocabulary = file_in.read().strip().split('\n')
        print('Done. Vocabulary size:', len(vocabulary), 'Num dates:', len(dates), 'Shape first counts:', counts[0].shape)

        # combine counts
        print('Combining counts')
        counts_combined = sparse.hstack(counts).tocsr()
        print('COMBINED SHAPE', counts_combined.shape)

        # write to files
        print('Writing to files')
        with open(self.out_combined_vocabulary().path, 'w', encoding='utf-8') as out:
            out.write('\n'.join(vocabulary))
        with open(self.out_combined_dateseries().path, 'w', encoding='utf-8') as out:
            out.write('\n'.join(dates))
        numpy.savez(self.out_combined_counts().path, data=counts_combined.data, indices=counts_combined.indices, indptr=counts_combined.indptr, shape=counts_combined.shape)
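# The .npz layout read and written above stores a scipy CSR matrix as its three
# internal arrays (data, indices, indptr) plus its shape. A minimal round-trip
# sketch of that standard scipy/numpy pattern (toy matrix, not project data):
import numpy
from scipy import sparse

counts = sparse.csr_matrix(numpy.array([[0, 2], [1, 0]]))
numpy.savez('counts.npz', data=counts.data, indices=counts.indices,
            indptr=counts.indptr, shape=counts.shape)

loader = numpy.load('counts.npz')
restored = sparse.csr_matrix((loader['data'], loader['indices'], loader['indptr']),
                             shape=loader['shape'])
assert (restored != counts).nnz == 0  # identical contents after the round trip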
class FoliaValidatorTask(Task):
    executable = "foliavalidator"

    folia_extension = Parameter(default='folia.xml')

    in_folia = InputSlot()

    def out_validator(self):
        return self.outputfrominput(inputformat='folia', stripextension=self.folia_extension, addextension='.folia-validation-report.txt')

    def run(self):
        #If an explicit outputdir is given, ensure the directory for the output file exists (including any intermediate directories)
        if self.outputdir:
            self.setup_output_dir(os.path.dirname(self.out_validator().path))

        #Run the validator; if it fails (as it does when the document is invalid), we ignore the failure since that is a valid result for us
        self.ex(self.in_folia().path,
            __stderr_to=self.out_validator().path,
            __ignorefailure=True)
class Folia2txt(Task):
    executable = 'folia2txt' #external executable (None if n/a)

    sentenceperline = BoolParameter(default=False)
    paragraphperline = BoolParameter(default=False)
    retaintokenisation = BoolParameter(default=False)

    in_folia = InputSlot() #will be linked to an out_* slot of another module in the workflow specification

    def out_txt(self):
        return self.outputfrominput(inputformat='folia', stripextension='.folia.xml', addextension='.txt')

    def run(self):
        self.ex(self.in_folia().path,
            o=self.out_txt().path,
            s=self.sentenceperline,
            p=self.paragraphperline,
            t=self.retaintokenisation)
class Symlink(Task):
    """Create a symlink"""
    filename = Parameter()
    stripextension = Parameter()
    addextension = Parameter()

    in_file = InputSlot() #input slot

    def out_file(self):
        if self.filename:
            return TargetInfo(self, self.filename)
        else:
            return self.outputfrominput(inputformat='file', stripextension=self.stripextension, addextension=self.addextension)

    def run(self):
        os.symlink(self.in_file().path, self.out_file().path)