Exemple #1
0
    def run(self):
        """Locks the batch and hands its task definitions to the parent class.

        Every task group is looked up in a fixed processing order and its
        tasks are registered with the parent class under the name
        ``<group>.<task>``. A new tick is started for every group and, for
        groups ordered as a 'sequence', after every single task. Execution
        itself is delegated to the parent class' ``run``.

        Returns:
            (unicode): Batch identifier, as returned by the parent class'
                       ``run``.

        Raises:
            NidabaInputException: If the batch is already locked.
        """
        if self.lock:
            raise NidabaInputException('Executed batch may not be modified')

        # fixed processing order of the task groups
        group_order = ('img', 'binarize', 'segmentation', 'ocr', 'stats',
                       'postprocessing', 'output')
        # resolve every group before touching any state so that a missing
        # group fails before the first step is added
        ordered = [(name, self.tasks[name]) for name in group_order]

        self.add_step()
        for name, definitions in ordered:
            self.add_tick()
            for definition in definitions:
                # definition is indexed as (function name, keyword arguments)
                qualified = '{}.{}'.format(name, definition[0])
                super(SimpleBatch, self).add_task(qualified, **definition[1])
                if self.order[name] == 'sequence':
                    # sequential groups get a tick after every task
                    self.add_tick()

        self.lock = True
        return super(SimpleBatch, self).run()
Exemple #2
0
    def run(self):
        """Expands the stored task groups and executes the batch.

        The groups are walked in pipeline order. Each of their tasks is added
        to the parent class as ``<group>.<task>`` with the stored keyword
        arguments; ticks separate the groups and, where a group is ordered as
        a 'sequence', the individual tasks as well. Afterwards the batch is
        locked and the parent class' ``run`` is invoked.

        Returns:
            (unicode): Batch identifier, as returned by the parent class'
                       ``run``.

        Raises:
            NidabaInputException: If the batch is already locked.
        """
        if self.lock:
            raise NidabaInputException('Executed batch may not be modified')

        # collect the groups in pipeline order; all lookups happen up front
        pipeline = []
        for stage in ['img', 'binarize', 'segmentation', 'ocr', 'stats',
                      'postprocessing', 'output']:
            pipeline.append((stage, self.tasks[stage]))

        self.add_step()
        for stage, entries in pipeline:
            self.add_tick()
            for entry in entries:
                task_name = '{}.{}'.format(stage, entry[0])
                task_kwargs = entry[1]
                super(SimpleBatch, self).add_task(task_name, **task_kwargs)
                if self.order[stage] != 'sequence':
                    continue
                # tasks of a sequence each run in their own tick
                self.add_tick()
        self.lock = True
        return super(SimpleBatch, self).run()
Exemple #3
0
    def run(self):
        """Executes the current batch definition.

        Expands the current batch definition to a chain of celery barrier
        tasks and executes it asynchronously. Additionally a batch record
        describing every task (parents, children, root documents, state,
        result and task definition) is written to redis under the batch
        identifier.

        Returns:
            (unicode): Batch identifier.

        Raises:
            NidabaInputException: If the batch is already locked.
        """
        if self.lock:
            raise NidabaInputException('Executed batch may not be modified')

        def _repeat(lst, n):
            # repeat each element n times in place: [a, b] -> [a, a, b, b]
            return list(itertools.chain.from_iterable(itertools.repeat(x, n) for x in lst))

        # resync batch before execution
        with self.redis.pipeline() as pipe:
            while True:
                try:
                    pipe.watch(self.id)
                    self._restore_and_create_scratchpad(pipe)

                    # reorder task definitions
                    keys = ['img', 'binarize', 'segmentation', 'ocr', 'stats', 'postprocessing', 'output', 'archive']
                    tasks = OrderedDict((key, self.tasks[key]) for key in keys)
                    # barrier signatures, one per non-empty group, in execution order
                    first = []
                    # task_id -> record; serialized as the batch record below
                    result_data = {}
                    self.lock = True

                    # build chain
                    root_docs = sorted(self.docs, key=lambda x: x[1])
                    # ids of the tasks that ended the previous step
                    prev = []
                    for group, step in tasks.items():
                        # skip groups without tasks
                        if not step:
                            continue
                        sequential = self.order[group][0] == 'sequence'
                        mmode = self.order[group][1]

                        if sequential:
                            # the whole group is treated as one sequence
                            step = [step]
                        # multiply number of tasks in this step by number of tasks in
                        # previous step if not merging
                        if not mmode:
                            step = _repeat(step, len(root_docs))
                            # floor division: the repeat count has to stay an integer
                            root_docs = root_docs * (len(step) // len(root_docs))
                        # by number of root docs if doc merging
                        elif mmode == 'doc':
                            step = _repeat(step, len(self.docs))
                            root_docs = self.docs
                        else:
                            root_docs = [root_docs] * len(step)
                        if not sequential:
                            # wrap single tasks so every entry of step is a sequence
                            step = [[x] for x in step]
                        nprev = []
                        r = []
                        for rd_idx, (rdoc, c) in enumerate(zip(root_docs, step)):
                            if sequential:
                                r.append([])
                            for idx, (fun, kwargs) in enumerate(c):
                                # if idx > 0 (sequential == true) parent is previous task in sequence
                                if idx > 0:
                                    parents = [task_id]
                                # if merge mode is 'doc' base parents are tasks n * (len(prev)/len(docs)) to n+1 ...
                                elif mmode == 'doc':
                                    parents = prev[rd_idx::len(root_docs)]
                                # if merging everything all tasks in previous step are parents
                                elif mmode:
                                    parents = prev
                                # if not merging a single task in previous step is the parent;
                                # covers every falsy merge mode, mirroring the `not mmode`
                                # test above, so `parents` is always bound
                                else:
                                    parents = [prev[rd_idx % len(prev)]] if prev else prev
                                task_id = uuid.uuid4().hex
                                # last task in a sequence is entered into new prev array
                                if idx + 1 == len(c):
                                    nprev.append(task_id)
                                result_data[task_id] = {'children': [],
                                                        'parents': parents,
                                                        'root_documents': rdoc if mmode else [rdoc],
                                                        'state': 'PENDING',
                                                        'result': None,
                                                        'task': (group, fun, kwargs)}
                                # back-link the new task into its parents' records
                                for parent in parents:
                                    result_data[parent]['children'].append(task_id)
                                task = self.celery.app.tasks[u'nidaba.{}.{}'.format(group, fun)]
                                if sequential:
                                    r[-1].append(task.s(batch_id=self.id, task_id=task_id, **kwargs))
                                else:
                                    r.append(task.s(batch_id=self.id, task_id=task_id, **kwargs))
                        prev = nprev
                        t = self.celery.app.tasks[u'nidaba.util.barrier'].s(merging=mmode, sequential=sequential, replace=r, root_docs=self.docs)
                        first.append(t)

                    pipe.set(self.id, json.dumps(result_data))
                    # Commit before dispatching: in redis-py commands buffered
                    # after MULTI are only sent by execute(), and execute() is
                    # what raises WatchError when the watched key changed.
                    # NOTE(review): presumably _restore_and_create_scratchpad
                    # switches the pipeline to buffered mode -- confirm. If the
                    # pipeline is still in immediate mode this call only
                    # performs the optimistic-lock check.
                    pipe.execute()
                    chain(first).apply_async(args=[self.docs])
                    break
                except WatchError:
                    # batch key changed concurrently; resync and rebuild
                    continue
        return self.id
Exemple #4
0
    def run(self):
        """Executes the current batch definition.

        Expands the current batch definition to a chain of celery barrier
        tasks, writes a batch record describing every task (parents,
        children, root documents, state, result and task definition) to redis
        under the batch identifier and then executes the chain
        asynchronously.

        Returns:
            (unicode): Batch identifier.

        Raises:
            NidabaInputException: If the batch is already locked.
        """
        if self.lock:
            raise NidabaInputException('Executed batch may not be modified')

        def _repeat(lst, n):
            # repeat each element n times in place: [a, b] -> [a, a, b, b]
            return list(
                itertools.chain.from_iterable(
                    itertools.repeat(x, n) for x in lst))

        # reorder task definitions
        keys = [
            'img', 'binarize', 'segmentation', 'ocr', 'stats',
            'postprocessing', 'output', 'archive'
        ]
        tasks = OrderedDict((key, self.tasks[key]) for key in keys)
        # barrier signatures, one per non-empty group, in execution order
        first = []
        # task_id -> record; serialized as the batch record below
        result_data = {}
        self.lock = True

        # build chain
        root_docs = self.docs
        # ids of the tasks that ended the previous step
        prev = []
        for group, step in tasks.items():
            # skip groups without tasks
            if not step:
                continue
            sequential = self.order[group][0] == 'sequence'
            mmode = self.order[group][1]

            if sequential:
                # the whole group is treated as one sequence
                step = [step]
            # multiply number of tasks in this step by number of tasks in
            # previous step if not merging
            if not mmode:
                step = _repeat(step, len(root_docs))
                # floor division: the repeat count has to stay an integer
                root_docs = root_docs * (len(step) // len(root_docs))
            # by number of root docs if doc merging
            elif mmode == 'doc':
                step = _repeat(step, len(self.docs))
                root_docs = self.docs
            else:
                root_docs = [root_docs] * len(step)
            if not sequential:
                # wrap single tasks so every entry of step is a sequence
                step = [[x] for x in step]
            nprev = []
            r = []
            for rd_idx, (rdoc, c) in enumerate(zip(root_docs, step)):
                if sequential:
                    r.append([])
                for idx, (fun, kwargs) in enumerate(c):
                    # if idx > 0 (sequential == true) parent is previous task in sequence
                    if idx > 0:
                        parents = [task_id]
                    # if merge mode is 'doc' base parents are tasks n * (len(prev)/len(docs)) to n+1 ...
                    elif mmode == 'doc':
                        parents = prev[rd_idx::len(root_docs)]
                    # if merging everything all tasks in previous step are parents
                    elif mmode:
                        parents = prev
                    # if not merging a single task in previous step is the
                    # parent; covers every falsy merge mode, mirroring the
                    # `not mmode` test above, so `parents` is always bound
                    else:
                        parents = [prev[rd_idx % len(prev)]] if prev else prev
                    task_id = uuid.uuid4().hex
                    # last task in a sequence is entered into new prev array
                    if idx + 1 == len(c):
                        nprev.append(task_id)
                    result_data[task_id] = {
                        'children': [],
                        'parents': parents,
                        'root_documents': [rdoc],
                        'state': 'PENDING',
                        'result': None,
                        'task': (group, fun, kwargs)
                    }
                    # back-link the new task into its parents' records
                    for parent in parents:
                        result_data[parent]['children'].append(task_id)
                    task = self.celery.app.tasks[u'nidaba.{}.{}'.format(
                        group, fun)]
                    if sequential:
                        r[-1].append(
                            task.s(batch_id=self.id, task_id=task_id,
                                   **kwargs))
                    else:
                        r.append(
                            task.s(batch_id=self.id, task_id=task_id,
                                   **kwargs))
            prev = nprev
            t = self.celery.app.tasks[u'nidaba.util.barrier'].s(
                merging=mmode,
                sequential=sequential,
                replace=r,
                root_docs=self.docs)
            first.append(t)
        # write the batch record under optimistic locking on the batch key;
        # retried until no concurrent modification is detected
        with self.redis.pipeline() as pipe:
            while True:
                try:
                    pipe.watch(self.id)
                    self._restore_and_create_scratchpad(pipe)
                    # also deletes the scratchpad
                    pipe.set(self.id, json.dumps(result_data))
                    pipe.execute()
                    break
                # NOTE(review): assumes self.redis exposes WatchError as an
                # attribute -- confirm against the object assigned to
                # self.redis.
                except self.redis.WatchError:
                    continue
        # dispatch only after the batch record has been committed
        chain(first).apply_async(args=[self.docs])
        return self.id