Ejemplo n.º 1
0
class ArchiveEventsDaily(WorkflowComponent):
    """Workflow component that archives the integrated events for a given date."""

    # input parameters
    events = Parameter()
    archivedir = Parameter()

    # date (string) whose events should be archived
    archivedate = Parameter()

    def accepts(self):
        """One input option: the integrated events file together with the archive directory."""
        events_input = InputFormat(self,
                                   format_id='events',
                                   extension='.events.integrated',
                                   inputparameter='events')
        archive_input = InputFormat(self,
                                    format_id='archivedir',
                                    extension='.archive',
                                    inputparameter='archivedir')
        return [(events_input, archive_input)]

    def setup(self, workflow, input_feeds):
        """Wire the daily archiver task into the workflow and return it."""
        archiver = workflow.new_task('archive_events_daily',
                                     ArchiveEventsDailyTask,
                                     autopass=False,
                                     archivedate=self.archivedate)
        archiver.in_events = input_feeds['events']
        archiver.in_archivedir = input_feeds['archivedir']

        return archiver
Ejemplo n.º 2
0
class TimblClassifier(WorkflowComponent):
    """A Timbl classifier that takes training data, test data, and outputs the test data with classification"""

    trainfile = Parameter()
    testfile = Parameter()

    def accepts(self):
        # Outer list = alternatives (disjunction); inner tuple = inputs
        # required together (conjunction).
        train_input = InputFormat(self,
                                  format_id='train',
                                  extension='train',
                                  inputparameter='trainfile')
        test_input = InputFormat(self,
                                 format_id='test',
                                 extension='test',
                                 inputparameter='testfile')
        return [(train_input, test_input)]

    def setup(self, workflow, input_feeds):
        """Chain a Timbl training task into a Timbl testing task; return the tester."""
        trainer = workflow.new_task('timbl_train', Timbl_train, autopass=True)
        trainer.in_train = input_feeds['train']

        tester = workflow.new_task('timbl_test', Timbl_test, autopass=True)
        tester.in_test = input_feeds['test']
        tester.in_ibase = trainer.out_ibase
        tester.in_wgt = trainer.out_wgt

        return tester
Ejemplo n.º 3
0
class Ucto_folia2folia_dir(Task):
    """Tokenize a directory of FoLiA documents with Ucto, one Ucto task per input file."""

    extension = Parameter(default="folia.xml")  # extension of the input files to pick up
    language = Parameter()  # language passed on to Ucto

    in_foliadir = InputSlot()  #input slot

    def out_tokfoliadir(self):
        """Output: a '.tok.foliadir' directory holding the tokenized documents."""
        return self.outputfrominput(inputformat='foliadir',
                                    stripextension='.foliadir',
                                    addextension='.tok.foliadir')

    def run(self):
        #Set up the output directory, will create it and tear it down on failure automatically
        self.setup_output_dir(self.out_tokfoliadir().path)

        #gather input files (glob already returns a list; the identity comprehension was redundant)
        inputfiles = glob.glob(self.in_foliadir().path + '/*.' +
                               self.extension)

        #inception aka dynamic dependencies: we yield a list of tasks to perform which could not have been predicted statically
        #in this case we run the Ucto component for each input file in the directory
        #(the original comment mentioned FeaturizerTask_single, which was stale)
        yield [
            Ucto(inputfile=inputfile,
                 inputslot='folia',
                 outputdir=self.out_tokfoliadir().path,
                 language=self.language) for inputfile in inputfiles
        ]
Ejemplo n.º 4
0
class TesseractOCR_document(Task):
    """OCR for a whole document (input is a directory of tiff image files (pages), output is a directory of hOCR files"""
    tiff_extension = Parameter(default='tif')  # extension of the page images to pick up
    language = Parameter()  # language passed on to OCR_singlepage

    in_tiffdir = InputSlot()  #input slot

    def out_hocrdir(self):
        """Output: a '.hocrdir' directory holding one hOCR file per page."""
        return self.outputfrominput(inputformat='tiffdir',
                                    stripextension='.tiffdir',
                                    addextension='.hocrdir')

    def run(self):
        #Set up the output directory, will create it and tear it down on failure automatically
        self.setup_output_dir(self.out_hocrdir().path)

        #gather input files (glob already returns a list; the identity comprehension was redundant)
        inputfiles = glob.glob(self.in_tiffdir().path + '/*.' +
                               self.tiff_extension)

        #inception aka dynamic dependencies: we yield a list of tasks to perform which could not have been predicted statically
        #in this case we run the OCR_singlepage component for each input file in the directory
        yield [
            OCR_singlepage(inputfile=inputfile,
                           outputdir=self.out_hocrdir().path,
                           language=self.language,
                           tiff_extension=self.tiff_extension)
            for inputfile in inputfiles
        ]
Ejemplo n.º 5
0
class ExtractCityref(StandardWorkflowComponent):
    """Extract city references, chaining from date-reference extraction when needed."""

    citylist = Parameter()

    config = Parameter()
    strip_punctuation = BoolParameter()
    to_lowercase = BoolParameter()
    skip_date = BoolParameter()
    skip_month = BoolParameter()
    skip_timeunit = BoolParameter()
    skip_day = BoolParameter()
    format_json = BoolParameter()

    def accepts(self):
        # Disjunction: either a ready-made '.dateref.json' file, or the output
        # of the ExtractDateref component (the skip_* flags are forwarded
        # under that component's parameter names: skip_datematch, etc.).
        return (InputFormat(self,
                            format_id='dateref',
                            extension='.dateref.json'),
                InputComponent(self,
                               ExtractDateref,
                               config=self.config,
                               strip_punctuation=self.strip_punctuation,
                               to_lowercase=self.to_lowercase,
                               skip_datematch=self.skip_date,
                               skip_monthmatch=self.skip_month,
                               skip_timeunitmatch=self.skip_timeunit,
                               skip_daymatch=self.skip_day))

    def autosetup(self):
        # the task that performs the actual extraction
        return ExtractCityrefTask
Ejemplo n.º 6
0
class Timbl_base(Task):
    """Base task declaring the common Timbl parameters; names the 'timbl' executable."""
    executable = 'timbl'

    # Timbl command-line options — see the Timbl documentation for the
    # meaning of each value (algorithm, metric, weighting, distance,
    # input format, number of nearest neighbours).
    algorithm = Parameter(default="IB1")
    metric = Parameter(default="O")
    weighting = Parameter(default="gr")
    distance = Parameter(default="Z")
    format = Parameter(default="Columns")
    k = IntParameter(default=1)
Ejemplo n.º 7
0
class IntegrateEvents(StandardWorkflowComponent):
    """Integrate events from a file carrying any of several event-file extensions."""

    current_events = Parameter()
    # NOTE(review): a float default on a plain Parameter; downstream code
    # presumably converts — confirm.
    overlap_threshold = Parameter(default=0.2)

    def accepts(self):
        # one alternative per supported extension, all mapping to the same format id
        return (InputFormat(self, format_id='events', extension='.enhanced'),
                InputFormat(self, format_id='events', extension='.integrated'),
                InputFormat(self, format_id='events', extension='.types'),
                InputFormat(self, format_id='events', extension='.filtered'))

    def autosetup(self):
        return IntegrateEventsTask
Ejemplo n.º 8
0
class ExtractEntitiesTask(Task):
    """Extract entities from city-referenced tweets, scoring candidates by commonness."""

    in_cityref = InputSlot()

    # commonness model files and the ngram score model
    commonness_txt = Parameter()
    commonness_cls = Parameter()
    commonness_corpus = Parameter()
    ngrams_score = Parameter()

    def out_entity(self):
        """Output file: same basename with '.entity.json' in place of '.json'."""
        return self.outputfrominput(inputformat='cityref',
                                    stripextension='.json',
                                    addextension='.entity.json')

    def run(self):

        # prepare the commonness scorer
        scorer = commonness.Commonness()
        scorer.set_classencoder(self.commonness_txt, self.commonness_cls,
                                self.commonness_corpus)
        scorer.set_dmodel(self.ngrams_score)

        # read in tweets
        with open(self.in_cityref().path, 'r', encoding='utf-8') as file_in:
            tweetdicts = json.loads(file_in.read())

        # turn the raw dictionaries into tweet objects
        tweets = []
        for tweetdict in tweetdicts:
            tweetobj = tweet.Tweet()
            tweetobj.import_tweetdict(tweetdict)
            tweets.append(tweetobj)

        # extract entities per tweet
        for tweetobj in tweets:
            # strip out the date and city references found earlier,
            # splitting the text into chunks
            datestrings = [refdate[0] for refdate in tweetobj.string_refdates]
            cities = tweetobj.cityrefs
            tweet_chunks = helpers.remove_pattern_from_string(
                tweetobj.text, datestrings + cities)
            # run the entity extractor over every chunk; entities accumulate
            # in the extractor across chunks of the same tweet
            extractor = entity_extractor.EntityExtractor()
            extractor.set_commonness(scorer)
            for chunk in tweet_chunks:
                extractor.extract_entities(chunk.split())
                extractor.filter_entities_threshold()
            tweetobj.set_entities(extractor.entities)

        # write to file
        outtweets = [tw.return_dict() for tw in tweets]
        with open(self.out_entity().path, 'w', encoding='utf-8') as file_out:
            json.dump(outtweets, file_out)
Ejemplo n.º 9
0
class OCR_singlepage(StandardWorkflowComponent):
    """OCR a single page image via the Tesseract task."""

    language = Parameter()
    tiff_extension = Parameter(default='tif')

    def autosetup(self):
        return Tesseract

    def accepts(self):
        # single-element tuple: exactly one accepted input format
        return (InputFormat(self,
                            format_id='tiff',
                            extension=self.tiff_extension,
                            directory=True),)
Ejemplo n.º 10
0
class ExtractEvents(StandardWorkflowComponent):
    """Extract events from a directory of tweets (work done by ExtractEventsTask)."""

    citylist = Parameter()
    end_date = Parameter()
    # sliding-window and filtering parameters, forwarded to the task by autosetup
    window_size = IntParameter(default=30)
    minimum_event_mentions = IntParameter(default=5)
    cut_off = IntParameter(default=2500)

    def accepts(self):
        # a tweet directory with extension '.tweets'
        return InputFormat(self, format_id='tweetdir', extension='.tweets')

    def autosetup(self):
        return ExtractEventsTask
Ejemplo n.º 11
0
class MergeEvents(StandardWorkflowComponent):
    """Merge enhanced events (work done by MergeEventsTask)."""

    # NOTE(review): float defaults on plain Parameters; downstream code
    # presumably converts — confirm.
    overlap_threshold = Parameter(default = 0.2)
    similarity_threshold = Parameter(default = 0.7)

    def accepts(self):
        # Disjunction: either a ready-made '.enhanced' events file, or the
        # output of the EnhanceEvents component.
        return (
            InputFormat(self, format_id='enhanced_events', extension='.enhanced'),
            InputComponent(self, EnhanceEvents, similarity_threshold=self.similarity_threshold)
        )

    def autosetup(self):
        return MergeEventsTask
Ejemplo n.º 12
0
class LowercaseVoweleaterDirTask2(Task):
    """Run the Voweleater component (starting at its Lowercaser step) on every text file in a directory."""
    in_txtdir = InputSlot()
    extension = Parameter(default='txt')  # extension of the input files to pick up

    def out_txtdir(self):
        """Output: a '.lcnv.txtdir' directory with the processed files."""
        return self.outputfrominput(inputformat='txtdir',
                                    stripextension='.txtdir',
                                    addextension='.lcnv.txtdir')

    def run(self):
        #Set up the output directory, will create it and tear it down on failure automatically
        self.setup_output_dir(self.out_txtdir().path)

        #gather input files (glob already returns a list; the identity comprehension was redundant)
        inputfiles = glob.glob(self.in_txtdir().path + '/*.' + self.extension)

        #inception aka dynamic dependencies: we yield a list of tasks to perform which could not have been predicted statically
        #in this case we run the Voweleater component (from its Lowercaser step) for each input file in the directory
        #(the original comment mentioned OCR_singlepage, which was stale)
        yield [
            Voweleater(inputfile=inputfile,
                       outputdir=self.out_txtdir().path,
                       startcomponent='Lowercaser') for inputfile in inputfiles
        ]
Ejemplo n.º 13
0
class FilterEvents(StandardWorkflowComponent):
    """Filter merged events (work done by FilterEventsTask)."""

    citylist = Parameter()
    # NOTE(review): float defaults on plain Parameters; downstream code
    # presumably converts — confirm.
    overlap_threshold = Parameter(default=0.2)
    similarity_threshold = Parameter(default=0.7)

    def accepts(self):
        # Disjunction: either a ready-made '.merged' events file, or the
        # output of the MergeEvents component (thresholds forwarded).
        return (InputFormat(self,
                            format_id='merged_events',
                            extension='.merged'),
                InputComponent(self,
                               MergeEvents,
                               overlap_threshold=self.overlap_threshold,
                               similarity_threshold=self.similarity_threshold))

    def autosetup(self):
        return FilterEventsTask
Ejemplo n.º 14
0
class UpdateEventTypes(WorkflowComponent):
    """Update the types of integrated events from a directory of predictions."""

    events = Parameter()
    predictiondir = Parameter()

    text = BoolParameter()

    def accepts(self):
        # one option: the prediction directory together with the integrated events file
        prediction_input = InputFormat(self,
                                       format_id='predictiondir',
                                       extension='.instances',
                                       inputparameter='predictiondir')
        events_input = InputFormat(self,
                                   format_id='events',
                                   extension='.events.integrated',
                                   inputparameter='events')
        return [(prediction_input, events_input)]

    def setup(self, workflow, input_feeds):
        """Wire the type-updater task into the workflow and return it."""
        updater = workflow.new_task('update_event_types',
                                    UpdateEventTypesTask,
                                    autopass=True,
                                    text=self.text)
        updater.in_events = input_feeds['events']
        updater.in_predictiondir = input_feeds['predictiondir']

        return updater
Ejemplo n.º 15
0
class GetEntityTimeseriesMonth(WorkflowComponent):
    """Generate entity timeseries for a given month from a tweet directory and an events file."""

    tweetdir = Parameter()
    events = Parameter()

    month = Parameter()

    def accepts(self):
        # one option: the tweet directory together with the events file
        tweetdir_input = InputFormat(self,
                                     format_id='tweetdir',
                                     extension='.tweets',
                                     inputparameter='tweetdir')
        events_input = InputFormat(self,
                                   format_id='events',
                                   extension='.events',
                                   inputparameter='events')
        return [(tweetdir_input, events_input)]

    def setup(self, workflow, input_feeds):
        """Wire the timeseries task into the workflow and return it."""
        generator = workflow.new_task('get_entity_timeseries',
                                      GetEntityTimeseriesTask,
                                      autopass=True,
                                      month=self.month)
        generator.in_tweetdir = input_feeds['tweetdir']
        generator.in_events = input_feeds['events']

        return generator
Ejemplo n.º 16
0
class IntegrateEventDir(StandardWorkflowComponent):
    """Integrate all event files in a '.events' directory (work done by IntegrateEventDirTask)."""

    # overlap fraction above which events are merged
    # (NOTE(review): float default on a plain Parameter; the task converts via float())
    overlap_threshold = Parameter(default = 0.2)

    def accepts(self):
        return InputFormat(self, format_id='eventdir', extension='.events')

    def autosetup(self):
        return IntegrateEventDirTask
Ejemplo n.º 17
0
class DeduplicateEvents(StandardWorkflowComponent):
    """Remove duplicate events from an '.events' file (work done by DeduplicateEventsTask)."""

    # similarity above which two events count as duplicates
    # (NOTE(review): float default on a plain Parameter; downstream
    # presumably converts — confirm)
    similarity_threshold = Parameter(default=0.7)

    def accepts(self):
        return InputFormat(self, format_id='events', extension='.events')

    def autosetup(self):
        return DeduplicateEventsTask
Ejemplo n.º 18
0
class Symlink(Task):
    """Create a symlink"""

    filename = Parameter()  # explicit output filename; overrides extension-derived naming
    stripextension = Parameter()
    addextension = Parameter()

    in_file = InputSlot()  #input slot

    def out_file(self):
        """Output target: the explicit filename if given, otherwise derived from the input name."""
        if self.filename:
            return TargetInfo(self, self.filename)
        else:
            return self.outputfrominput(inputformat='file',
                                        stripextension=self.stripextension,
                                        addextension=self.addextension)

    def run(self):
        # BUG FIX: slots and outputs are callables returning objects with a
        # .path attribute (self.in_file().path), as used everywhere else in
        # this codebase; the original self.in_file.path() /
        # self.out_file.path() would raise AttributeError at runtime.
        os.symlink(self.in_file().path, self.out_file().path)
Ejemplo n.º 19
0
class FoliaValidator(StandardWorkflowComponent):
    """Validate FoLiA documents, accepting either a single file or a directory of them."""
    folia_extension = Parameter(default='folia.xml')

    def accepts(self):
        # Disjunction: a single FoLiA file, or a 'foliadir' directory
        return (
            InputFormat(self, format_id='folia', extension=self.folia_extension),
            InputFormat(self, format_id='foliadir', extension='foliadir'))

    def autosetup(self):
        # tasks for single-file and directory input respectively
        # (presumably matched against the selected input format — confirm)
        return FoliaValidatorTask, FoliaValidatorDirTask
Ejemplo n.º 20
0
class Ucto(StandardWorkflowComponent):
    """A workflow component for Ucto"""

    skip = Parameter(
        default=""
    )  #A parameter for the workflow, will be passed on to the tasks

    language = Parameter()
    tok_input_sentenceperline = BoolParameter(default=False)
    tok_output_sentenceperline = BoolParameter(default=False)

    def autosetup(self):
        # candidate tasks; presumably one is selected to match the accepted
        # input format (txt, folia, tok) — confirm
        return (Ucto_txt2folia, Ucto_folia2folia, Ucto_tok2folia)

    def accepts(self):
        """Returns a tuple of all the initial inputs and other workflows this component accepts as input (a disjunction, only one will be selected)"""
        return (InputFormat(self, format_id='folia', extension='folia.xml'),
                InputFormat(self, format_id='txt', extension='txt'),
                InputFormat(self, format_id='tok', extension='tok'),
                InputComponent(self, ConvertToFoLiA))
Ejemplo n.º 21
0
class CollectEventTweets(WorkflowComponent):
    """Collect the tweets belonging to events within a given date range."""

    tweetdir = Parameter()
    events = Parameter()
    entity_burstiness = Parameter()

    first_event_date = Parameter()
    last_event_date = Parameter()

    def accepts(self):
        # one option: all three inputs are required together (conjunction)
        tweetdir_input = InputFormat(self,
                                     format_id='tweetdir',
                                     extension='.tweets',
                                     inputparameter='tweetdir')
        events_input = InputFormat(self,
                                   format_id='events',
                                   extension='.events',
                                   inputparameter='events')
        burstiness_input = InputFormat(self,
                                       format_id='entity_burstiness',
                                       extension='.burstiness.txt',
                                       inputparameter='entity_burstiness')
        return [(tweetdir_input, events_input, burstiness_input)]

    def setup(self, workflow, input_feeds):
        """Wire the tweet-collector task into the workflow and return it."""
        collector = workflow.new_task('collect_event_tweets',
                                      CollectEventTweetsTask,
                                      autopass=False,
                                      first_event_date=self.first_event_date,
                                      last_event_date=self.last_event_date)
        collector.in_tweetdir = input_feeds['tweetdir']
        collector.in_events = input_feeds['events']
        collector.in_entity_burstiness = input_feeds['entity_burstiness']

        return collector
Ejemplo n.º 22
0
class IntegrateEventDirTask(Task):
    """Integrate all '.enhanced' event files in a directory into one merged collection.

    Task to speed up event integration for sliding window event extraction.
    Make sure that all events in the directory are deduplicated and enhanced
    before running this task; only files with extension '.enhanced' will be
    integrated.
    """

    in_eventdir = InputSlot()

    overlap_threshold = Parameter()  # overlap fraction above which events merge; converted via float() below

    def out_integrated_events(self):
        # BUG FIX: added the missing leading dot to addextension. Without it
        # the output was named '<base>events.integrated' instead of
        # '<base>.events.integrated', the extension the other components in
        # this codebase consume.
        return self.outputfrominput(inputformat='eventdir', stripextension='.events', addextension='.events.integrated')

    def run(self):

        # collect all event files with extension '.enhanced'
        enhanced_events = glob.glob(self.in_eventdir().path + '/*.enhanced')

        # initialize
        merger = event_merger.EventMerger()
        overlap_threshold = float(self.overlap_threshold)

        # for each event file
        for eventfile in enhanced_events:
            print('Reading', eventfile)
            with open(eventfile, 'r', encoding='utf-8') as file_in:
                current_eventdicts = json.loads(file_in.read())
            new_event_objs = []
            for ed in current_eventdicts:
                eventobj = event.Event()
                eventobj.import_eventdict(ed)
                new_event_objs.append(eventobj)
            # merge within the new batch before integrating it
            print('Merging new events before integration; number of events at start:', len(new_event_objs))
            premerger = event_merger.EventMerger()
            premerger.add_events(new_event_objs)
            premerger.find_merges(overlap_threshold)
            new_events_merged = premerger.return_events()
            print('Done. New events after merge:', len(new_events_merged))
            if len(merger.events) == 0:
                # first batch: nothing to integrate against yet
                merger.add_events(new_events_merged)
            else:
                # integrate each event into the current ones
                print('Starting integrating new events; number of current events:', len(merger.events))
                for new_event in new_events_merged:
                    merger.find_merge(new_event, overlap_threshold)

        # write merged events
        integrated_events = merger.return_events()
        print('Done. Number of events after integration:', len(integrated_events))
        out_integrated_events = [ev.return_dict() for ev in integrated_events]
        with open(self.out_integrated_events().path, 'w', encoding='utf-8') as file_out:
            json.dump(out_integrated_events, file_out)
Ejemplo n.º 23
0
class CollectEventTweetsDaily(WorkflowComponent):
    """Collect the tweets belonging to events for a single day."""

    tweetdir = Parameter()
    events = Parameter()
    entity_burstiness = Parameter()
    entity_burstiness_new = Parameter()

    date = Parameter()

    def accepts(self):
        # one option: all four inputs are required together (conjunction)
        tweetdir_input = InputFormat(self,
                                     format_id='tweetdir',
                                     extension='.tweets',
                                     inputparameter='tweetdir')
        events_input = InputFormat(self,
                                   format_id='events',
                                   extension='.events.integrated',
                                   inputparameter='events')
        burstiness_input = InputFormat(self,
                                       format_id='entity_burstiness',
                                       extension='.burstiness.txt',
                                       inputparameter='entity_burstiness')
        burstiness_new_input = InputFormat(self,
                                           format_id='entity_burstiness_new',
                                           extension='.burstiness.txt',
                                           inputparameter='entity_burstiness_new')
        return [(tweetdir_input, events_input, burstiness_input,
                 burstiness_new_input)]

    def setup(self, workflow, input_feeds):
        """Wire the daily tweet-collector task into the workflow and return it."""
        collector = workflow.new_task('collect_event_tweets_daily',
                                      CollectEventTweetsDailyTask,
                                      autopass=False,
                                      date=self.date)
        collector.in_tweetdir = input_feeds['tweetdir']
        collector.in_events = input_feeds['events']
        collector.in_entity_burstiness = input_feeds['entity_burstiness']
        collector.in_entity_burstiness_new = input_feeds['entity_burstiness_new']

        return collector
Ejemplo n.º 24
0
class AssessBurstiness(WorkflowComponent):
    """Assess entity burstiness from count matrices, their dates/vocabulary, and the current events."""

    entity_counts = Parameter()
    dates = Parameter()
    vocabulary = Parameter()
    events = Parameter()

    burstiness_threshold = IntParameter()

    def accepts(self):
        # one option: all four inputs are required together (conjunction)
        counts_input = InputFormat(self,
                                   format_id='entity_counts',
                                   extension='.counts.npz',
                                   inputparameter='entity_counts')
        dates_input = InputFormat(self,
                                  format_id='dates',
                                  extension='.counts_dates',
                                  inputparameter='dates')
        vocabulary_input = InputFormat(self,
                                       format_id='vocabulary',
                                       extension='.counts_vocabulary',
                                       inputparameter='vocabulary')
        events_input = InputFormat(self,
                                   format_id='events',
                                   extension='.events.integrated',
                                   inputparameter='events')
        return [(counts_input, dates_input, vocabulary_input, events_input)]

    def setup(self, workflow, input_feeds):
        """Wire the burstiness-assessment task into the workflow and return it."""
        assessor = workflow.new_task(
            'assess_burstiness',
            AssessBurstinessTask,
            autopass=False,
            burstiness_threshold=self.burstiness_threshold)
        assessor.in_entity_counts = input_feeds['entity_counts']
        assessor.in_dates = input_feeds['dates']
        assessor.in_vocabulary = input_feeds['vocabulary']
        assessor.in_events = input_feeds['events']

        return assessor
Ejemplo n.º 25
0
class ArchiveEventsTask(Task):
    """Split integrated events into a per-date archive file and the remaining active events."""

    in_events = InputSlot()
    in_archivedir = InputSlot()

    archivedate = Parameter()  # date string, YYYYMMDD

    def out_archived(self):
        """The per-date archive file inside the archive directory."""
        return self.outputfrominput(inputformat='archivedir',
                                    stripextension='.archive',
                                    addextension='.archive/events_' +
                                    self.archivedate + '.json')

    def out_active_events(self):
        """The remaining (still active) events."""
        return self.outputfrominput(inputformat='events',
                                    stripextension='.events.integrated',
                                    addextension='.active.events.integrated')

    def run(self):

        # parse the YYYYMMDD archive date
        target_date = datetime.datetime(int(self.archivedate[:4]),
                                        int(self.archivedate[4:6]),
                                        int(self.archivedate[6:8]))

        # read events, partitioning by whether they fall on the archive date
        archive_events = []
        active_events = []
        print('Reading events')
        with open(self.in_events().path, 'r', encoding='utf-8') as file_in:
            eventdicts = json.loads(file_in.read())
            for eventdict in eventdicts:
                eventobj = event.Event()
                eventobj.import_eventdict(eventdict)
                if eventobj.datetime == target_date:
                    archive_events.append(eventobj)
                else:
                    active_events.append(eventobj)

        # write archive
        print('Writing archive')
        archived_dicts = [ev.return_dict(txt=False) for ev in archive_events]
        with open(self.out_archived().path, 'w', encoding='utf-8') as file_out:
            json.dump(archived_dicts, file_out)

        # write active events
        print('Writing active events')
        active_dicts = [ev.return_dict(txt=False) for ev in active_events]
        with open(self.out_active_events().path, 'w',
                  encoding='utf-8') as file_out:
            json.dump(active_dicts, file_out)
Ejemplo n.º 26
0
class ExtractEntities(StandardWorkflowComponent):
    """Extract entities from city-referenced tweets (work done by ExtractEntitiesTask)."""

    # commonness model files and ngram score model, used by ExtractEntitiesTask
    commonness_txt = Parameter()
    commonness_cls = Parameter()
    commonness_corpus = Parameter()
    ngrams_score = Parameter()

    # parameters matching those of the upstream ExtractCityref component
    config = Parameter()
    strip_punctuation = BoolParameter()
    to_lowercase = BoolParameter()
    skip_date = BoolParameter()
    skip_month = BoolParameter()
    skip_timeunit = BoolParameter()
    skip_day = BoolParameter()
    citylist = Parameter()
    format_json = BoolParameter()

    def accepts(self):
        # Disjunction: a ready-made '.json' city-reference file, or the
        # output of the ExtractCityref component (parameters forwarded).
        return (InputFormat(self, format_id='cityref', extension='.json'),
                InputComponent(self,
                               ExtractCityref,
                               config=self.config,
                               strip_punctuation=self.strip_punctuation,
                               to_lowercase=self.to_lowercase,
                               citylist=self.citylist,
                               skip_date=self.skip_date,
                               skip_month=self.skip_month,
                               skip_timeunit=self.skip_timeunit,
                               skip_day=self.skip_day))

    def autosetup(self):
        return ExtractEntitiesTask
Ejemplo n.º 27
0
class CombineEntityTimeseries(WorkflowComponent):
    """Combine the entity timeseries files found in a '.timeseries' directory."""

    entity_counts_dir = Parameter()

    def accepts(self):
        # single input option (the original's parentheses were grouping, not a tuple)
        return [InputFormat(self,
                            format_id='entity_counts_dir',
                            extension='.timeseries',
                            inputparameter='entity_counts_dir')]

    def setup(self, workflow, input_feeds):
        """Wire the timeseries-combiner task into the workflow and return it."""
        combiner = workflow.new_task('combine_entity_timeseries',
                                     CombineEntityTimeseriesTask,
                                     autopass=True)
        combiner.in_entity_counts_dir = input_feeds['entity_counts_dir']

        return combiner
Ejemplo n.º 28
0
class EnhanceEvents(StandardWorkflowComponent):
    """Enhance deduplicated events (work done by EnhanceEventsTask)."""

    # forwarded to DeduplicateEvents when that input route is taken
    similarity_threshold = Parameter(default=0.7)

    def accepts(self):
        # Disjunction: a ready-made '.deduplicated' events file, or the
        # output of the DeduplicateEvents component.
        return (InputFormat(self,
                            format_id='events',
                            extension='.deduplicated'),
                InputComponent(self,
                               DeduplicateEvents,
                               similarity_threshold=self.similarity_threshold))

    def autosetup(self):
        return EnhanceEventsTask
Ejemplo n.º 29
0
class OCR_folia(StandardWorkflowComponent):
    """OCR with FoLiA output"""
    language = Parameter()

    def setup(self, workflow, input_feeds):
        """Convert hOCR pages to FoLiA, then concatenate them; return the final task."""
        hocr2folia = workflow.new_task('foliahocr', FoliaHOCR)
        hocr2folia.in_hocrdir = input_feeds['hocrdir']
        concatenator = workflow.new_task('foliacat', Foliacat)
        concatenator.in_foliadir = hocr2folia.out_foliadir
        return concatenator

    def accepts(self):
        """Returns a tuple of all the initial inputs and other workflows this component accepts as input (a disjunction, only one will be selected)"""
        return InputComponent(self, OCR_document)
Ejemplo n.º 30
0
class OCR_document(StandardWorkflowComponent):
    """OCR a whole document: a tiff page directory in, hOCR out (via TesseractOCR_document)."""

    language = Parameter()  # presumably forwarded to the task by autosetup — confirm

    def autosetup(self):
        return TesseractOCR_document

    def accepts(self):
        """Returns a tuple of all the initial inputs and other workflows this component accepts as input (a disjunction, only one will be selected)"""
        return (InputFormat(self,
                            format_id='tiffdir',
                            extension='tiffdir',
                            directory=True),
                InputComponent(self, ExtractPages))