Example #1
    def add_document(self, doc):
        """Add a document to the batch.

        Adds a document tuple to the batch and checks if it exists.

        Args:
            doc (tuple): A standard document tuple.

        Raises:
            NidabaInputException: The document tuple does not refer to a file.
        """
        if self.lock:
            raise NidabaInputException('Executed batch may not be modified')

        if not self.storage.is_file(*doc):
            raise NidabaInputException('Input document is not a file.')

        with self.redis.pipeline() as pipe:
            while (1):
                try:
                    pipe.watch(self.id)
                    self._restore_and_create_scratchpad(pipe)
                    self.docs.append(doc)
                    self.scratchpad['scratchpad']['docs'] = self.docs
                    pipe.set(self.id, json.dumps(self.scratchpad))
                    pipe.execute()
                    break
                except self.redis.WatchError:
                    continue
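
The WATCH/retry loop above is the optimistic-locking idiom used throughout these examples: watch the batch key, rebuild the scratchpad, write it back, and retry if another writer touched the key in the meantime. A minimal, self-contained sketch of the same pattern with plain redis-py (hypothetical key and document values, and a Redis server assumed on localhost):

    import json
    import redis

    def append_doc(client, batch_id, doc):
        """Append a document to a JSON batch record under optimistic locking."""
        with client.pipeline() as pipe:
            while True:
                try:
                    # Fail the transaction if another client modifies the key.
                    pipe.watch(batch_id)
                    record = json.loads(pipe.get(batch_id) or '{"docs": []}')
                    record['docs'].append(list(doc))
                    pipe.multi()
                    pipe.set(batch_id, json.dumps(record))
                    pipe.execute()
                    break
                except redis.WatchError:
                    # The key changed between WATCH and EXEC; retry.
                    continue

    # append_doc(redis.Redis(), 'batch-1234', ('default', 'scan_0001.tif'))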
Example #2
    def rm_document(self, doc):
        """Removes a document from the (unexecuted) batch.

        Removes a document tuple from the batch.

        Args:
            doc (tuple): A standard document tuple.

        Raises:
            NidabaInputException: The batch has already been executed or the
                                  document is not part of the batch.
        """
        if self.lock:
            raise NidabaInputException('Executed batch may not be modified')

        with self.redis.pipeline() as pipe:
            while (1):
                try:
                    pipe.watch(self.id)
                    self._restore_and_create_scratchpad(pipe)
                    self.docs.remove(list(doc))
                    self.scratchpad['scratchpad']['docs'] = self.docs
                    pipe.set(self.id, json.dumps(self.scratchpad))
                    pipe.execute()
                    break
                except WatchError:
                    continue
                except ValueError:
                    raise NidabaInputException(
                        'Document not part of the batch')
Example #3
def task_arg_validator(arg_values, **kwargs):
    """
    Validates keyword arguments against the list of valid argument values
    contained in the task definition.

    Raises:
        NidabaInputException: If validation fails.
    """
    kwc = kwargs.copy()

    def _val_single_arg(arg, type):
        if type == 'float':
            if not isinstance(val, float) and not isinstance(val, int):
                raise NidabaInputException('{} is not a float'.format(val))
        elif type == 'int':
            if not isinstance(val, int):
                raise NidabaInputException('{} is not an int'.format(val))
        elif type == 'str':
            if not isinstance(val, basestring):
                raise NidabaInputException('{} is not a string'.format(val))
        # XXX: Add file/files checker for local case
        elif type == 'file':
            pass
        elif type == 'files':
            pass
        else:
            raise NidabaInputException('Argument type {} unknown'.format(type))

    for k, v in arg_values.iteritems():
        try:
            val = kwc.pop(k)
        except:
            raise NidabaInputException('Missing argument: {}'.format(k))
        if isinstance(v, tuple):
            if not isinstance(val, type(v[0])):
                raise NidabaInputException(
                    '{} of different type than range fields'.format(val))
            if val < v[0] or val > v[1]:
                raise NidabaInputException(
                    '{} outside of allowed range {}-{}'.format(val, *v))
        elif isinstance(v, list):
            if isinstance(val, Iterable) and not isinstance(val, basestring):
                va = set(val)
            else:
                va = set([val])
            if not set(v).issuperset(va):
                raise NidabaInputException(
                    '{} not in list of valid values'.format(val))
        else:
            _val_single_arg(val, v)

    if kwc:
        raise NidabaInputException('Superfluous arguments present')
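
A hedged usage sketch of the validator follows; the field names, range, and value list are invented for illustration, but together they exercise the tuple-range, list, and plain-type branches above (assuming task_arg_validator has been imported from its defining module):

    spec = {
        'threshold': (0.0, 1.0),        # numeric range given as a tuple
        'method': ['otsu', 'sauvola'],  # enumerated list of valid values
        'iterations': 'int',            # plain type name
    }

    # Passes: all arguments present, in range, in the list, and of the right type.
    task_arg_validator(spec, threshold=0.5, method='otsu', iterations=3)

    # Each of the following would raise NidabaInputException:
    # task_arg_validator(spec, threshold=1.5, method='otsu', iterations=3)  # out of range
    # task_arg_validator(spec, threshold=0.5, method='foo', iterations=3)   # not in list
    # task_arg_validator(spec, threshold=0.5, method='otsu')                # missing argument
    # task_arg_validator(spec, threshold=0.5, method='otsu', iterations=3,
    #                    extra=1)                                           # superfluous argument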
Example #4
    def get_state(self):
        """
        Retrieves the current state of a batch.

        Returns:
            (unicode): A string containing one of the following states:

                NONE: The batch ID is not registered in the backend.
                FAILURE: Batch execution has failed.
                PENDING: The batch is currently running.
                SUCCESS: The batch has completed successfully.
        """
        if not self.id:
            raise NidabaInputException('Object not attached to batch.')
        r = requests.get('{}/batch/{}'.format(self.host, self.id))
        r.raise_for_status()
        batch = r.json()
        if 'scratchpad' in batch:
            return u'NONE'
        elif 'chains' in batch:
            self.lock = True
            batch = batch['chains']
            st = u'SUCCESS'
            for subtask in batch.itervalues():
                if subtask['state'] == 'PENDING' or subtask[
                        'state'] == 'RUNNING':
                    st = u'PENDING'
                if subtask['state'] == 'FAILURE':
                    return u'FAILURE'
            return st
        else:
            return u'NONE'
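
Because get_state() only returns a snapshot, callers usually poll it until the batch leaves PENDING. A small polling sketch, assuming batch is an object exposing the get_state() method shown above:

    import time

    def wait_for_batch(batch, interval=5):
        """Poll until the batch reaches a terminal state and return that state."""
        while True:
            state = batch.get_state()
            if state in (u'SUCCESS', u'FAILURE', u'NONE'):
                return state
            time.sleep(interval)

    # final_state = wait_for_batch(batch)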
Example #5
    def run(self):
        """Executes the current batch definition.

        Expands the current batch definition to a series of celery chains and
        executes them asynchronously. Additionally a batch record is written to
        the celery result backend.

        Returns:
            (unicode): Batch identifier.
        """
        if self.lock:
            raise NidabaInputException('Executed batch may not be modified')
        # reorder task definitions
        keys = [
            'img', 'binarize', 'segmentation', 'ocr', 'stats',
            'postprocessing', 'output'
        ]
        tasks = OrderedDict((key, self.tasks[key]) for key in keys)
        self.add_step()
        for group, btasks in tasks.iteritems():
            self.add_tick()
            for task in btasks:
                super(SimpleBatch,
                      self).add_task('{}.{}'.format(group, task[0]), **task[1])
                if self.order[group] == 'sequence':
                    self.add_tick()
        self.lock = True
        return super(SimpleBatch, self).run()
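
Taken together, a local batch is typically built and executed along the following lines. This is only a sketch: the import path, the storage tuple, and the task names ('otsu', 'tesseract') are assumptions that depend on the installed nidaba configuration and plugins:

    from nidaba.nidaba import SimpleBatch  # assumed module path

    batch = SimpleBatch()
    # ('default', 'scan_0001.tif') is a hypothetical storage tuple that must
    # already exist in the configured storage medium.
    batch.add_document(('default', 'scan_0001.tif'))
    batch.add_task('binarize', 'otsu')                      # assumed task name
    batch.add_task('ocr', 'tesseract', languages=['eng'])   # assumed task/kwargs
    batch_id = batch.run()
    print(batch_id)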
Example #6
 def _val_single_arg(arg, type):
     if type == 'float':
         if not isinstance(val, float) and not isinstance(val, int):
             raise NidabaInputException('{} is not a float'.format(val))
     elif type == 'int':
         if not isinstance(val, int):
             raise NidabaInputException('{} is not an int'.format(val))
     elif type == 'str':
         if not isinstance(val, basestring):
             raise NidabaInputException('{} is not a string'.format(val))
     # XXX: Add file/files checker for local case
     elif type == 'file':
         pass
     elif type == 'files':
         pass
     else:
         raise NidabaInputException('Argument type {} unknown'.format(type))
Example #7
    def add_task(self, group, method, **kwargs):
        """Add a task.

        Adds a ``task``, a single executable task gathering one or more input
        documents and returning a single output document, to the current tick.
        Multiple jobs are run in parallel.

        Args:
            group (unicode): A task group identifier
            method (unicode): A task identifier
            **kwargs: Arguments to the task

        Raises:
            NidabaInputException: Trying to modify an executed batch.
            NidabaNoSuchAlgorithmException: Invalid method given.
        """
        if self.lock:
            raise NidabaInputException('Executed batch may not be modified')
        # validate that the task exists
        if group not in self.tasks:
            raise NidabaNoSuchAlgorithmException(
                'Unknown task group {}'.format(group))
        if u'nidaba.{}.{}'.format(group, method) not in self.celery.app.tasks:
            raise NidabaNoSuchAlgorithmException('Unknown task {} {}'.format(
                group, method))
        task = self.celery.app.tasks[u'nidaba.{}.{}'.format(group, method)]
        # validate arguments first against getcallargs
        try:
            getcallargs(task.run, ('', ''), **kwargs)
        except TypeError as e:
            raise NidabaInputException(str(e))
        # validate against arg_values field of the task
        task_arg_validator(task.get_valid_args(), **kwargs)
        with self.redis.pipeline() as pipe:
            while (1):
                try:
                    pipe.watch(self.id)
                    self._restore_and_create_scratchpad(pipe)
                    self.tasks[group].append((method, kwargs))
                    self.scratchpad['scratchpad']['simple_tasks'] = self.tasks
                    pipe.set(self.id, json.dumps(self.scratchpad))
                    pipe.execute()
                    break
                except WatchError:
                    continue
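
The getcallargs() call above is a generic way to check keyword arguments against a callable's signature without invoking it: inspect.getcallargs raises TypeError for missing or unexpected parameters, which add_task() converts into a NidabaInputException. A self-contained illustration with a made-up task function:

    from inspect import getcallargs

    def sample_task(doc, method=u'', threshold=0.5):
        pass

    # Accepted: the positional document tuple and keyword map onto the signature.
    getcallargs(sample_task, ('', ''), threshold=0.3)

    # Rejected: an unexpected keyword argument raises TypeError.
    try:
        getcallargs(sample_task, ('', ''), bogus=1)
    except TypeError as e:
        print(e)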
Example #8
 def get_documents(self):
     """
     Returns the list of input documents for this batch.
     """
     if not self.id:
         raise NidabaInputException('Object not attached to batch.')
     r = requests.get('{}/batch/{}/pages'.format(self.host, self.id))
     r.raise_for_status()
     return r.json()
Example #9
    def run(self):
        """
        Executes the current batch definition.

        Expands the current batch definition to a series of celery chains and
        executes them asynchronously. Additionally a batch record is written to
        the celery result backend.

        Returns:
            (unicode): Batch identifier.
        """
        if not self.id:
            raise NidabaInputException('Object not attached to batch.')
        if self.lock:
            raise NidabaInputException('Executed batch may not be reexecuted')
        r = requests.post('{}/batch/{}'.format(self.host, self.id))
        r.raise_for_status()
        return self.id
Example #10
 def next(self):
     row = self.reader.next()
     if len(row) != 5:
         raise NidabaInputException('Incorrect number of columns')
     coords = [int(s) for s in row[:-1]]
     coordinates = [
         coords[0], coords[1], coords[0] + coords[2], coords[1] + coords[3]
     ]
     return coordinates + [row[-1]]
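
The reader turns an (x, y, width, height) row into an absolute (x0, y0, x1, y1) bounding box followed by the text field. A worked example with a made-up row:

    row = ['10', '20', '100', '30', 'word']  # x, y, w, h, text
    coords = [int(s) for s in row[:-1]]      # [10, 20, 100, 30]
    bbox = [coords[0], coords[1],
            coords[0] + coords[2],           # x1 = x + w = 110
            coords[1] + coords[3]]           # y1 = y + h = 50
    print(bbox + [row[-1]])                  # [10, 20, 110, 50, 'word']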
Example #11
 def get_tasks(self):
     """
     Returns the task tree either from the scratchpad or from the pipeline
     when already in execution.
     """
     if not self.id:
         raise NidabaInputException('Object not attached to batch.')
     r = requests.get('{}/batch/{}/tasks'.format(self.host, self.id))
     r.raise_for_status()
     return r.json()
Example #12
    def add_document(self, path, callback, auxiliary=False):
        """
        Add a document to the batch.

        Uploads a document to the API server and adds it to the batch.

        .. note::
            Note that this function accepts a standard file system path and NOT
            a storage tuple, as a client using the web API is not expected to
            keep a separate, local storage medium.

        Args:
            path (unicode): Path to the document
            callback (function): A function that is called with a
                                 ``requests_toolbelt.multipart.encoder.MultipartEncoderMonitor``
                                 instance.
            auxiliary (bool): Switch to disable setting the file as an input
                              document. May be used to upload ground truths,
                              metadata, and other ancillary files.

        Raises:
            NidabaInputException: The document does not refer to a file or the
                                  batch is locked because the run() method has
                                  been called.
        """
        if not self.id:
            raise NidabaInputException('Object not attached to batch.')
        if self.lock:
            raise NidabaInputException('Executed batch may not be modified')
        if auxiliary:
            params = {'auxiliary': True}
        else:
            params = {}
        m = encoder.MultipartEncoderMonitor.from_fields(
            fields={'scans': (os.path.basename(path), open(path, 'rb'))},
            callback=callback)
        r = requests.post('{}/batch/{}/pages'.format(self.host, self.id),
                          data=m,
                          headers={'Content-Type': m.content_type},
                          params=params)
        r.raise_for_status()
        return r.json()[0]['url']
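
The callback receives the requests_toolbelt MultipartEncoderMonitor, whose bytes_read and len attributes can drive a progress display. A sketch of such a callback, assuming batch is a web-API batch object exposing the add_document() method above:

    def progress(monitor):
        # bytes_read and len are attributes of the MultipartEncoderMonitor.
        print('{}/{} bytes uploaded'.format(monitor.bytes_read, monitor.len))

    # url = batch.add_document('scan_0001.tif', callback=progress)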
Example #13
    def add_task(self, group, method, *args, **kwargs):
        """
        Add a particular task configuration to a task group.

        Args:
            group (unicode): Group the task belongs to
            method (unicode): Name of the task
            kwargs: Arguments to the task
        """
        if not self.id:
            raise NidabaInputException('Object not attached to batch.')
        if self.lock:
            raise NidabaInputException('Executed batch may not be modified')
        # validate that the task exists
        if group not in self.allowed_tasks or method not in self.allowed_tasks[
                group]:
            raise NidabaInputException('Unknown task {}'.format(method))
        r = requests.post('{}/batch/{}/tasks/{}/{}'.format(
            self.host, self.id, group, method),
                          json=kwargs)
        r.raise_for_status()
Example #14
    def __init__(self, id=None):
        # stuff depending on a valid configuration
        from nidaba import storage
        from nidaba import config
        self.storage = storage

        # slowly importing stuff
        from nidaba import tasks
        from nidaba import plugins
        from nidaba import celery
        self.task_reg = tasks
        self.celery = celery

        self.id = id
        if self.id is None:
            self.id = uuid.uuid4().get_hex()
            self.storage.prepare_filestore(self.id)
        if not self.storage.is_valid_job(self.id):
            raise NidabaInputException('Storage not prepared for task')

        self.docs = []
        self.scratchpad = {}
        self.redis = config.Redis

        self.tasks = OrderedDict([('img', []), ('binarize', []),
                                  ('segmentation', []), ('ocr', []),
                                  ('stats', []), ('postprocessing', []),
                                  ('output', []), ('archive', [])])

        # defines if tasks in a group are run in parallel or in sequence and their merge mode
        self.order = {
            'img': ('sequence', False),
            'binarize': ('parallel', False),
            'segmentation': ('parallel', False),
            'ocr': ('parallel', False),
            'stats': ('parallel', False),
            'postprocessing': ('sequence', 'doc'),
            'output': ('sequence', False),
            'archive': ('parallel', True)
        }

        self.lock = False
        with self.redis.pipeline() as pipe:
            while (1):
                try:
                    pipe.watch(self.id)
                    self._restore_and_create_scratchpad(pipe)
                    if 'scratchpad' not in self.scratchpad:
                        self.lock = True
                    pipe.execute()
                    break
                except WatchError:
                    continue
Example #15
    def rm_task(self, group, method, **kwargs):
        """Removes a task from the (unexecuted) batch.

        Removes a task from the batch.

        Args:
            group (unicode): A task group identifier
            method (unicode): A task identifier
            **kwargs: Arguments to the task

        Raises:
            NidabaInputException: Trying to modify an executed batch.
            NidabaNoSuchAlgorithmException: Invalid method given.
        """
        if self.lock:
            raise NidabaInputException('Executed batch may not be modified')
        # validate that the task exists
        if group not in self.tasks:
            raise NidabaNoSuchAlgorithmException(
                'Unknown task group {}'.format(group))
        if u'nidaba.{}.{}'.format(group, method) not in self.celery.app.tasks:
            raise NidabaNoSuchAlgorithmException('Unknown task {} {}'.format(
                group, method))
        task = self.celery.app.tasks[u'nidaba.{}.{}'.format(group, method)]
        with self.redis.pipeline() as pipe:
            while (1):
                try:
                    pipe.watch(self.id)
                    self._restore_and_create_scratchpad(pipe)
                    self.tasks[group].remove([method, kwargs])
                    self.scratchpad['scratchpad']['simple_tasks'] = self.tasks
                    pipe.set(self.id, json.dumps(self.scratchpad))
                    pipe.execute()
                    break
                except WatchError:
                    continue
                except ValueError:
                    raise NidabaInputException('Task not part of the batch')
Example #16
    def get_extended_state(self):
        """
        Returns the extended batch state.

        Raises:
            NidabaInputException: If the batch hasn't been executed yet.
        """
        if not self.id:
            raise NidabaInputException('Object not attached to batch.')
        r = requests.get('{}/batch/{}'.format(self.host, self.id))
        r.raise_for_status()
        if 'chains' in r.json():
            self.lock = True
            return r.json()['chains']
Example #17
 def create_batch(self):
     """
     Creates a batch on the server. Also synchronizes the list of available
     tasks and their parameters.
     """
     if self.id is not None:
         raise NidabaInputException(
             'SimpleBatch object already initialized')
     r = requests.post('{}/batch'.format(self.host))
     r.raise_for_status()
     self.id = r.json()['id']
     self.lock = False
     self.get_available_tasks()
     return self.id
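
An end-to-end sketch of the web-API client: create a batch, upload a page, configure tasks, and execute. The class name, module path, host URL, and task names below are assumptions; only the method calls mirror the examples on this page:

    from nidaba.nidaba import NetworkSimpleBatch  # assumed class/module name

    batch = NetworkSimpleBatch('http://127.0.0.1:8080/api/v1')  # assumed host URL
    batch.create_batch()
    batch.add_document('scan_0001.tif', callback=lambda monitor: None)
    batch.add_task('binarize', 'otsu')      # assumed task name
    batch.add_task('ocr', 'tesseract')      # assumed task name
    batch.run()
    print(batch.get_state())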
Example #18
    def add_task(self, group, method, **kwargs):
        """
        Add a particular task configuration to a task group.

        Args:
            group (unicode): Group the task belongs to
            method (unicode): Name of the task
            kwargs: Arguments to the task
        """
        if self.lock:
            raise NidabaInputException('Executed batch may not be modified')
        # validate that the task exists
        if group not in self.tasks:
            raise NidabaNoSuchAlgorithmException('Unknown task group')
        if u'nidaba.{}.{}'.format(group, method) not in self.celery.app.tasks:
            raise NidabaNoSuchAlgorithmException('Unknown task')
        task = self.celery.app.tasks[u'nidaba.{}.{}'.format(group, method)]
        # validate arguments first against getcallargs
        try:
            getcallargs(task.run, ('', ''), **kwargs)
        except TypeError as e:
            raise NidabaInputException(str(e))
        # validate against arg_values field of the task
        task_arg_validator(task.get_valid_args(), **kwargs)
        with self.redis.pipeline() as pipe:
            while (1):
                try:
                    pipe.watch(self.id)
                    self._restore_and_create_scratchpad(pipe)
                    self.tasks[group].append((method, kwargs))
                    self.scratchpad['scratchpad']['tasks'] = self.tasks
                    pipe.set(self.id, json.dumps(self.scratchpad))
                    pipe.execute()
                    break
                except WatchError:
                    continue
Example #19
    def rm_document(self, path):
        """
        Removes a document from the batch.

        .. note::
            Note that this function accepts a standard file system path and NOT
            a storage tuple, as a client using the web API is not expected to
            keep a separate, local storage medium.

        Args:
            path (unicode): Path to the document

        Raises:
            NidabaInputException: The document does not refer to a file or the
                                  batch is locked because the run() method has
                                  been called.
        """
        if not self.id:
            raise NidabaInputException('Object not attached to batch.')
        if self.lock:
            raise NidabaInputException('Executed batch may not be modified')
        r = requests.delete('{}/batch/{}/pages'.format(self.host, self.id),
                            json={'scans': [path]})
        r.raise_for_status()
Example #20
 def is_running(self):
     """
     Returns True if the batch's run() method has been successfully called, otherwise False.
     """
     if not self.id:
         raise NidabaInputException('Object not attached to batch.')
     r = requests.get('{}/batch/{}'.format(self.host, self.id))
     r.raise_for_status()
     self.lock = True
     if r.json():
         self.lock = True
         return True
     else:
         self.lock = False
         return False
Example #21
    def add_document(self, doc):
        """Add a document to the batch.

        Adds a document tuple to the batch and checks if it exists.

        Args:
            doc (tuple): A standard document tuple.

        Raises:
            NidabaInputException: The document tuple does not refer to a file
                                  or the batch is locked because the run()
                                  method has been called.
        """
        if self.lock:
            raise NidabaInputException('Executed batch may not be modified')
        super(SimpleBatch, self).add_document(doc)
Example #22
    def __init__(self, id=None):
        # stuff depending on a valid configuration
        from nidaba import storage
        self.storage = storage

        # slowly importing stuff (tasks and plugins also needed so the task
        # registry is complete)
        from nidaba import tasks
        from nidaba import plugins
        from nidaba import celery
        self.celery = celery

        self.tasks = OrderedDict([('img', []), ('binarize', []),
                                  ('segmentation', []), ('ocr', []),
                                  ('stats', []), ('postprocessing', []),
                                  ('output', [])])

        # defines if tasks in a group are run in parallel or in sequence
        self.order = {
            'img': 'sequence',
            'binarize': 'parallel',
            'segmentation': 'parallel',
            'ocr': 'parallel',
            'stats': 'parallel',
            'postprocessing': 'sequence',
            'output': 'sequence'
        }
        if id is None:
            id = unicode(uuid.uuid4())
            self.storage.prepare_filestore(id)
        if not self.storage.is_valid_job(id):
            raise NidabaInputException('Storage not prepared for task')
        super(SimpleBatch, self).__init__(id)
        self.lock = False
        with self.redis.pipeline() as pipe:
            while (1):
                try:
                    pipe.watch(self.id)
                    self._restore_and_create_scratchpad(pipe)
                    if 'scratchpad' not in self.scratchpad:
                        self.lock = True
                    pipe.execute()
                    break
                except WatchError:
                    continue
Example #23
    def get_results(self):
        """
        Retrieves the storage tuples of a successful batch.

        Returns:
            list: A list of (output document, root document) pairs, or None if
                  the batch hasn't been executed yet.
        """
        if not self.id:
            raise NidabaInputException('Object not attached to batch.')
        r = requests.get('{}/batch/{}'.format(self.host, self.id))
        r.raise_for_status()
        if 'chains' in r.json():
            self.lock = True
            batch = r.json()['chains']
            outfiles = []
            for subtask in batch.itervalues():
                if len(subtask['children']) == 0 and not subtask[
                        'housekeeping'] and subtask['result'] is not None:
                    outfiles.append(
                        (subtask['result'], subtask['root_document']))
            return outfiles
        else:
            return None
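
The return value pairs each final output document with the root document it was derived from, so a caller can group results by input page. A short sketch, assuming batch exposes the get_results() method above:

    results = batch.get_results()
    if results is None:
        print('batch has not been executed yet')
    else:
        for output_doc, root_doc in results:
            print('{} -> {}'.format(root_doc, output_doc))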
Example #24
    def run(self):
        """Executes the current batch definition.

        Expands the current batch definition to a series of celery chains and
        executes them asynchronously. Additionally a batch record is written to
        the celery result backend.

        Returns:
            (unicode): Batch identifier.
        """
        if self.lock:
            raise NidabaInputException('Executed batch may not be modified')

        # reorder task definitions
        keys = [
            'img', 'binarize', 'segmentation', 'ocr', 'stats',
            'postprocessing', 'output', 'archive'
        ]
        tasks = OrderedDict((key, self.tasks[key]) for key in keys)
        first = []
        prev = None
        result_data = {}
        self.lock = True

        # build chain
        root_docs = self.docs
        prev = []
        for group, step in tasks.iteritems():
            # skip groups without tasks
            if not step:
                continue
            sequential = True if self.order[group][0] == 'sequence' else False
            mmode = self.order[group][1]

            def _repeat(lst, n):
                return list(
                    itertools.chain.from_iterable(
                        itertools.repeat(x, n) for x in lst))

            if sequential:
                step = [step]
            # multiply number of tasks in this step by number of tasks in
            # previous step if not merging
            if not mmode:
                step = _repeat(step, len(root_docs))
                root_docs = root_docs * (len(step) / len(root_docs))
            # by number of root docs if doc merging
            elif mmode == 'doc':
                step = _repeat(step, len(self.docs))
                root_docs = self.docs
            else:
                root_docs = [root_docs] * len(step)
            if not sequential:
                step = [[x] for x in step]
            nprev = []
            r = []
            for rd_idx, (rdoc, c) in enumerate(zip(root_docs, step)):
                if sequential:
                    r.append([])
                for idx, (fun, kwargs) in enumerate(c):
                    # if idx > 0 (sequential == true) parent is previous task in sequence
                    if idx > 0:
                        parents = [task_id]
                    # if merge mode is 'doc' base parents are tasks n * (len(prev)/len(docs)) to n+1 ...
                    elif mmode == 'doc':
                        parents = prev[rd_idx::len(root_docs)]
                    # if merging everything all tasks in previous step are parents
                    elif mmode:
                        parents = prev
                    # if not merging a single task in previous step is the parent
                    elif mmode is False:
                        parents = [prev[rd_idx % len(prev)]] if prev else prev
                    task_id = uuid.uuid4().get_hex()
                    # last task in a sequence is entered into new prev array
                    if idx + 1 == len(c):
                        nprev.append(task_id)
                    result_data[task_id] = {
                        'children': [],
                        'parents': parents,
                        'root_documents': [rdoc],
                        'state': 'PENDING',
                        'result': None,
                        'task': (group, fun, kwargs)
                    }
                    for parent in parents:
                        result_data[parent]['children'].append(task_id)
                    task = self.celery.app.tasks[u'nidaba.{}.{}'.format(
                        group, fun)]
                    if sequential:
                        r[-1].append(
                            task.s(batch_id=self.id, task_id=task_id,
                                   **kwargs))
                    else:
                        r.append(
                            task.s(batch_id=self.id, task_id=task_id,
                                   **kwargs))
            prev = nprev
            t = self.celery.app.tasks[u'nidaba.util.barrier'].s(
                merging=mmode,
                sequential=sequential,
                replace=r,
                root_docs=self.docs)
            first.append(t)
        with self.redis.pipeline() as pipe:
            while (1):
                try:
                    pipe.watch(self.id)
                    self._restore_and_create_scratchpad(pipe)
                    # also deletes the scratchpad
                    pipe.set(self.id, json.dumps(result_data))
                    pipe.execute()
                    break
                except self.redis.WatchError:
                    continue
        chain(first).apply_async(args=[self.docs])
        return self.id
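
The _repeat() helper drives the fan-out arithmetic: when a group does not merge, every task in the step is duplicated once per root document and the document list is stretched to match, so that task i is paired with document i. A standalone illustration with made-up documents and task names:

    import itertools

    def _repeat(lst, n):
        return list(itertools.chain.from_iterable(
            itertools.repeat(x, n) for x in lst))

    docs = ['doc_a', 'doc_b']
    step = [('binarize.otsu', {}), ('binarize.sauvola', {})]

    step = _repeat(step, len(docs))          # [otsu, otsu, sauvola, sauvola]
    docs = docs * (len(step) // len(docs))   # [doc_a, doc_b, doc_a, doc_b]

    for doc, (name, kwargs) in zip(docs, step):
        print('{} -> {}'.format(doc, name))
    # doc_a -> binarize.otsu
    # doc_b -> binarize.otsu
    # doc_a -> binarize.sauvola
    # doc_b -> binarize.sauvola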