Example #1
 def check_db_exists(self, epoch):
     logger.info('Check existence of %s' % self._db_name(epoch))
     with Task() as task:
         existence = ops.Const(False)
         ops.DBExists(
             [],
             [existence],
             db_name=self._db_name(epoch),
             db_type=self._db_type,
             absolute_path=True)
         task.add_output(existence)
     return task
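The existence flag registered through `task.add_output` becomes fetchable once
the task has run. A minimal usage sketch (the manager instance `cm` and the
driver code are hypothetical, not from the source):

# Hypothetical driver: check for an epoch-5 checkpoint before resuming.
exists_task = cm.check_db_exists(epoch=5)
with LocalSession() as session:
    session.run(exists_task)
    if exists_task.outputs()[0].fetch():
        session.run(cm.load(epoch=5))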
Example #2
 def save(self, epoch):
     """
     Build a Task that is run once after `init_group` and after each
     epoch is run. This will execute a Save ops to serialize and persist
     blobs present in the global workspaace.
     """
     logger.info('Save to %s' % self._db_name(epoch))
     with Task() as task:
         ops.Save(
             self.blob_list(), [], db=self._db_name(epoch),
             db_type=self._db_type, absolute_path=True)
     return task
Example #3
 def load(self, epoch):
     """
     Build a Task that will be run by JobRunner when the job is to be
     resumed from a given epoch. This task will run a Load op that will
     load and deserialize all relevant blobs from a persistent storage.
     """
     with Task() as task:
         ops.Load([],
                  self.blob_list(),
                  db=self._dbname(epoch),
                  db_type=self._db_type,
                  absolute_path=True)
     return task
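Examples #2 and #3 are symmetric, which suggests the following round trip
(again with a hypothetical manager instance `cm`; a sketch, not source code):

# Hypothetical round trip: persist blobs after epoch 0, then restore them.
with LocalSession() as session:
    session.run(cm.save(epoch=0))    # Save writes cm.blob_list() to the db
    session.run(cm.load(epoch=0))    # Load restores the same blobs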
Example #4
def build_job():
    with Node('reader'):
        with Job() as job:
            with job.init_group:
                init_net = core.Net('init_net')
                data_arr = Struct(('val', np.array(range(10))))
                data = ConstRecord(init_net, data_arr)
                ds = Dataset(data)
                full_reader = ds.reader(init_net)
                total = init_net.Const([100])
                Task(step=init_net)

            def inc_total(rec):
                net = core.Net('inc_total')
                net.Add([total, rec.val()], [total])
                return [net]

            epoch_reader = ReaderWithLimit(full_reader, num_iter=3)
            pipe(epoch_reader, processor=inc_total)
            job.add_stop_signal(epoch_reader.data_finished())

        total_fetcher = Task(step=core.Net('empty'), outputs=[total])
    return job, total_fetcher
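A sketch of how the returned pair might be driven; the explicit group runs
below stand in for what a JobRunner would do, so treat this as illustrative
rather than the canonical driver:

# Hypothetical driver for build_job().
job, total_fetcher = build_job()
with LocalSession() as session:
    session.run(job.init_group)
    session.run(job.epoch_group)   # a real driver loops until stop_signals fire
    session.run(job.exit_group)
    session.run(total_fetcher)
    print(total_fetcher.outputs()[0].fetch())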
Example #5
 def init(self, nodes=None, retrieve_from_epoch=None):
     """
     Build a Task that will be run once after the job's `init_group` is run.
     This task will determine which blobs need to be checkpointed.
     If retrieve_from_epoch is not None, then the checkpoint metadata is
     retrieved from a previously saved checkpoint.
     """
     assert nodes is None or len(nodes) == 1, (
         'CheckpointManager only supports single node.')
     net = core.Net('get_blob_list')
     if retrieve_from_epoch is None:
         net.GetAllBlobNames(
             [],
             self._blob_names,
             include_shared=False)
     else:
         net.Load(
             [], self._blob_names,
             db=self._dbname(retrieve_from_epoch),
             db_type=self._db_type,
             absolute_path=True)
     task = Task(step=net, outputs=[self._blob_names])
     self._names_output = task.outputs()[0]
     return task
Example #6
 def init(self, nodes=None, retrieve_from_epoch=None):
     """
     Build a Task that will be run once after the job's `init_group` is run.
     This task will determine which blobs need to be snapshotted.
     If retrieve_from_epoch is not None, then the snapshot metadata is
     retrieved from a previously saved snapshot.
     """
     assert nodes is None or len(nodes) == 1, (
         'SnapshotManager only supports single node.')
     net = core.Net('get_blob_list')
     if retrieve_from_epoch is None:
         net.GetAllBlobNames(
             [],
             self._blob_names,
             include_shared=False)
     else:
         net.Load(
             [], self._blob_names,
             db=self._dbname(retrieve_from_epoch),
             db_type=self._db_type,
             absolute_path=True)
     task = Task(step=net, outputs=[self._blob_names])
     self._names_output = task.outputs()[0]
     return task
Example #7
 def load(self, epoch, path_prefix=None):
     """
     Build a Task that will be run by JobRunner when the job is to be
     resumed from a given epoch. This task will run a Load op that will
     load and deserialize all relevant blobs from a persistent storage.
     """
     full_db_name = self._db_name(epoch, path_prefix)
     logger.info("Loading checkpoints from = %s" % full_db_name)
     with Task() as task:
         ops.Load([],
                  self.blob_list(),
                  db=full_db_name,
                  db_type=self._db_type,
                  absolute_path=True)
     return task
Example #8
def pipe_and_output(
        input, output=None, num_threads=1, processor=None, name=None,
        capacity=None, group=None, final_outputs=None):
    """
    Similar to `pipe`, with the additional ability for the pipe Task to
    return output values to the `Session` once done.

    Returns:
        Tuple (out_queue, *task_outputs)
            out_queue:    same as return value of `pipe`.
            task_outputs: TaskOutput object, fetchable from the client after
                          session.run() returns.
    """
    result, step = _pipe_step(
        input, output, num_threads, processor, name, capacity, group,
        final_outputs)
    assert step is not None
    task = Task(step=step, group=group, outputs=final_outputs)
    output = None
    if final_outputs is not None:
        output = task.outputs()
        if type(final_outputs) not in (list, tuple):
            output = output[0]
    return result, output
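What distinguishes this from `pipe` is the `final_outputs` plumbing. A short
consumption sketch (the reader `src`, processor `proc`, and blob `counter` are
hypothetical placeholders):

# Hypothetical usage: pipe `src` through `proc`, then fetch `counter`.
with TaskGroup() as tg:
    out_queue, outputs = pipe_and_output(
        src, processor=proc, final_outputs=[counter])
with LocalSession() as session:
    session.run(tg)
    print(outputs[0].fetch())   # final_outputs was a list, so `outputs` is too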
Example #9
def pipe(input,
         output=None,
         num_threads=1,
         processor=None,
         name=None,
         capacity=None,
         group=None):
    """
    Given a Reader, Queue or DataStream in `input`, and optionally, a Writer,
    Queue or DataStream in `output`, creates a Task that, when run, will
    pipe the input into the output, using multiple parallel threads.
    Additionally, if a processor is given, it will be called between reading
    and writing steps, allowing it to transform the record.

    Args:
        input:       either a Reader, Queue or DataStream that will be read
                     until a stop is signaled either by the reader or the
                     writer.
        output:      either a Writer, a Queue or a DataStream that will be
                     written to as long as neither reader nor writer signals
                     a stop condition. If output is not provided or is None,
                     a Queue is created with given `capacity` and written to.
        num_threads: number of concurrent threads used for processing and
                     piping. If set to 0, no Task is created, and a
                     reader is returned instead -- the reader returned will
                     read from the reader passed in and process it.
        processor:   (optional) function that takes an input record and
                     optionally returns a record; this will be called
                     between read and write steps. If the processor does
                     not return a record, a writer will not be instantiated.
                     Processor can also be a core.Net with input and output
                     records properly set. In that case, a NetProcessor is
                     instantiated, cloning the net for each of the threads.
        name:        (optional) name of the task to be created.
        capacity:    when output is not passed, a queue of given `capacity`
                     is created and written to.
        group:       (optional) explicitly add the created Task to this
                     TaskGroup, instead of using the currently active one.

    Returns:
        Output Queue, DataStream, Reader, or None, depending on the parameters
        passed.
    """
    result, step = _pipe_step(input, output, num_threads, processor, name,
                              capacity, group)
    if step is not None:
        Task(step=step, group=group)
    return result
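Two of the cases the docstring describes, in a hedged sketch (`reader` and
`my_processor` are placeholders):

# Pipe `reader` into a freshly created queue of capacity 100, using 4 threads.
q = pipe(reader, num_threads=4, capacity=100)
# With num_threads=0 no Task is created; a processed reader is returned instead.
processed_reader = pipe(reader, num_threads=0, processor=my_processor)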
Example #10
def build_pipeline(node_id):
    with Node('trainer_%d' % node_id):
        with Job.current().init_group, Task():
            data_arr = Struct(('val', np.array(list(range(10)))))
            data = ConstRecord(ops, data_arr)
            ds = Dataset(data, name='dataset:%d' % node_id)
            full_reader = ds.reader(ops)
            total = ops.Const([100])

        def inc_total(rec):
            ops.Add([total, rec.val()], [total])

        epoch_reader = ReaderWithLimit(full_reader, num_iter=3)
        pipe(epoch_reader, processor=inc_total)
        Job.current().add_stop_signal(epoch_reader.data_finished())
    return [total]
Example #11
def example_loop():
    with Task():
        total = ops.Const(0)
        total_large = ops.Const(0)
        total_small = ops.Const(0)
        total_tiny = ops.Const(0)
        with ops.loop(10) as loop:
            outer = ops.Mul([loop.iter(), ops.Const(10)])
            with ops.loop(loop.iter()) as inner:
                val = ops.Add([outer, inner.iter()])
                with ops.If(ops.GE([val, ops.Const(80)])) as c:
                    ops.Add([total_large, val], [total_large])
                with c.Elif(ops.GE([val, ops.Const(50)])) as c:
                    ops.Add([total_small, val], [total_small])
                with c.Else():
                    ops.Add([total_tiny, val], [total_tiny])
                ops.Add([total, val], [total])
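A plain-Python sanity check of what the nested loops compute: `val` takes the
value `10 * i + j` for every `0 <= j < i < 10`, so the four totals can be
verified offline (my arithmetic, not part of the source):

vals = [10 * i + j for i in range(10) for j in range(i)]
assert sum(vals) == 2970                              # total
assert sum(v for v in vals if v >= 80) == 1514        # total_large
assert sum(v for v in vals if 50 <= v < 80) == 1146   # total_small
assert sum(v for v in vals if v < 50) == 310          # total_tiny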
Example #12
 def test_multi_instance_python_op(self):
     """
     When task instances are created at runtime, C++ concurrently creates
     multiple instances of operators in C++, and concurrently destroys them
     once the task is finished. This means that the destructor of PythonOp
     will be called concurrently, so the GIL must be acquired. This
     test exercises this condition.
     """
     with Task(num_instances=64) as task:
         with ops.loop(4):
             ops.Python((python_op_builder, [], {}))([], [])
     with LocalSession() as session:
         PythonOpStats.num_instances = 0
         PythonOpStats.num_calls = 0
         session.run(task)
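         # 64 task instances x 4 loop iterations = 256 Python op calls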
         self.assertEqual(PythonOpStats.num_instances, 64)
         self.assertEqual(PythonOpStats.num_calls, 256)
Example #13
    def init(
        self,
        nodes=None,
        retrieve_from_epoch=None,
        path_prefix=None,
        path_type=None
    ):
        """
        Build a Task that will be run once after the job's `init_group` is run.
        This task will determine which blobs need to be checkpointed.
        If retrieve_from_epoch is not None, then the checkpoint metadata is
        retrieved from a previously saved checkpoint.
        """
        assert nodes is None or len(nodes) == 1, (
            'CheckpointManager only supports single node.')

        self._path_prefix = path_prefix
        self._path_type = path_type
        if self._metadata_handler:
            self._metadata_handler.init(
                db_prefix=self._db_prefix,
                db_type=self._db_type,
                node_names=[str(self._node_name)],
                path_prefix=self._path_prefix,
                path_type=self._path_type)

        with Task(outputs=[self._blob_names]) as task:
            if retrieve_from_epoch is None:
                ops.GetAllBlobNames(
                    [],
                    self._blob_names,
                    include_shared=False)
            else:
                full_db_name = db_name(retrieve_from_epoch,
                                       self._node_name,
                                       self._db_prefix,
                                       path_prefix)
                db_type = path_type or self._db_type
                logger.info("Initializing checkpoints from = %s"
                            % full_db_name)
                ops.Load(
                    [], self._blob_names,
                    db=full_db_name,
                    db_type=db_type,
                    absolute_path=True)
        self._names_output = task.outputs()[0]
        return task
Example #14
def example_task():
    with Task():
        with ops.task_init():
            one = ops.Const(1)
        two = ops.Add([one, one])
        with ops.task_init():
            three = ops.Const(3)
        accum = ops.Add([two, three])
        # here, accum should be 5
        with ops.task_exit():
            # here, accum should be 6, since this executes after lines below
            seven_1 = ops.Add([accum, one])
        six = ops.Add([accum, one])
        ops.Add([accum, one], [accum])
        seven_2 = ops.Add([accum, one])
        o6 = final_output(six)
        o7_1 = final_output(seven_1)
        o7_2 = final_output(seven_2)
        return o6, o7_1, o7_2
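As the inline comments indicate, `ops.task_init()` bodies are hoisted to run
once before the task's main step, and `ops.task_exit()` bodies run once after
it. That ordering is why `seven_1`, although written earlier in the program
text, observes `accum` after the in-place `ops.Add([accum, one], [accum])` has
executed; Example #17 below asserts exactly these values.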
Example #15
    def _timed_task(self, cp_op_name, add_op):
        """
        Build a Task that will measure the time span of a checkpoint operation;
        once the operation is done, the elapsed time can be read from
        _current_checkpoint_duration.

        Args:
            cp_op_name: A string name of the checkpoint operation.
            add_op: A functor to add the checkpoint operation.

        Returns:
            A task with timer.
        """
        with Task(name=cp_op_name) as task:
            with ops.task_init():
                timer = ops.TimerBegin([], counter_name=self._node_name)
            add_op()
            with ops.task_exit():
                time_span_blob = ops.TimerGetAndEnd(timer)
            self._current_checkpoint_duration = final_output(time_span_blob)
        return task
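A hedged sketch of a caller, combining this helper with the Save op from
Example #2 (the method name `timed_save` and the reuse of `blob_list` and
`_db_name` are assumptions for illustration):

    def timed_save(self, epoch):
        # Wrap the checkpoint Save op in a timer-instrumented Task.
        def add_save_op():
            ops.Save(
                self.blob_list(), [], db=self._db_name(epoch),
                db_type=self._db_type, absolute_path=True)
        return self._timed_task('checkpoint_save', add_save_op)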
Example #16
    def test_multi_instance(self):
        NUM_INSTANCES = 10
        NUM_ITERS = 15
        with TaskGroup() as tg:
            with Task(num_instances=NUM_INSTANCES):
                with ops.task_init():
                    counter1 = ops.CreateCounter([], ['global_counter'])
                    counter2 = ops.CreateCounter([], ['global_counter2'])
                    counter3 = ops.CreateCounter([], ['global_counter3'])
                # both task_counter and local_counter should be thread local
                with ops.task_instance_init():
                    task_counter = ops.CreateCounter([], ['task_counter'])
                local_counter = ops.CreateCounter([], ['local_counter'])
                with ops.loop(NUM_ITERS):
                    ops.CountUp(counter1)
                    ops.CountUp(task_counter)
                    ops.CountUp(local_counter)
                # gather sum of squares of local counters to make sure that
                # each local counter counted exactly up to NUM_ITERS, and
                # that there was no false sharing of counter instances.
                with ops.task_instance_exit():
                    count2 = ops.RetrieveCount(task_counter)
                    with ops.loop(ops.Mul([count2, count2])):
                        ops.CountUp(counter2)
                # This should have the same effect as the above
                count3 = ops.RetrieveCount(local_counter)
                with ops.loop(ops.Mul([count3, count3])):
                    ops.CountUp(counter3)
                # The code below will only run once
                with ops.task_exit():
                    total1 = final_output(ops.RetrieveCount(counter1))
                    total2 = final_output(ops.RetrieveCount(counter2))
                    total3 = final_output(ops.RetrieveCount(counter3))

        with LocalSession() as session:
            session.run(tg)
            self.assertEqual(total1.fetch(), NUM_INSTANCES * NUM_ITERS)
            self.assertEqual(total2.fetch(), NUM_INSTANCES * (NUM_ITERS**2))
            self.assertEqual(total3.fetch(), NUM_INSTANCES * (NUM_ITERS**2))
Example #17
 def test_setup(self):
     with Task() as task:
         with ops.task_init():
             one = ops.Const(1)
         two = ops.Add([one, one])
         with ops.task_init():
             three = ops.Const(3)
         accum = ops.Add([two, three])
         # here, accum should be 5
         with ops.task_exit():
             # here, accum should be 6, since this executes after lines below
             seven_1 = ops.Add([accum, one])
         six = ops.Add([accum, one])
         ops.Add([accum, one], [accum])
         seven_2 = ops.Add([accum, one])
         o6 = final_output(six)
         o7_1 = final_output(seven_1)
         o7_2 = final_output(seven_2)
     with LocalSession() as session:
         session.run(task)
         self.assertEqual(o6.fetch(), 6)
         self.assertEqual(o7_1.fetch(), 7)
         self.assertEqual(o7_2.fetch(), 7)
Example #18
    def load_blobs_from_checkpoint(self, blob_names, epoch):
        """
        Builds a Task that loads only the necessary blobs from a checkpoint of
        the given epoch. The necessary blobs are given in the blob_names
        argument.

        Args:
            blob_names: A list of strings. Each string is the name of a
                blob.
            epoch: The checkpoint epoch to load from.

        Returns:
            A Task which loads the specified blobs from the checkpoint of the
            given epoch.
        """
        with Task() as task:
            ops.Load([],
                     blob_names,
                     db=self._dbname(epoch),
                     db_type=self._db_type,
                     absolute_path=True,
                     allow_incomplete=True)
        return task
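Note the `allow_incomplete=True`, which the full `load()` in Example #7 does
not pass: it lets the Load op succeed even when some of the requested blob
names are absent from the checkpoint db, which is the desired behavior when
loading only a subset of blobs.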
Example #19
 def add_stop_signal(self, output):
     if isinstance(output, core.BlobReference):
         t = Task(outputs=[output], group=self.epoch_group)
         output = t.outputs()[0]
     assert isinstance(output, TaskOutput)
     self.stop_signals.append(output)
Example #20
def _pipe_step(input,
               output=None,
               num_threads=1,
               processor=None,
               name=None,
               capacity=None,
               group=None,
               final_outputs=None):
    """
    """
    if isinstance(input, Reader):
        reader = input
    elif hasattr(input, 'reader'):
        reader = input.reader()
    else:
        raise ValueError('input must be a reader, queue or stream.')

    if processor is not None:
        reader = ProcessingReader(reader, processor)

    if num_threads == 0:
        assert output is None
        return reader, None

    if name is None and processor is not None:
        name = processor_name(processor)
    if name is None and output is not None:
        name = 'pipe_into:%s' % processor_name(output)
    if name is None:
        name = 'pipe_from:%s' % processor_name(input)

    with Task(name=name, group=group, outputs=final_outputs) as task:
        global_exit_net = core.Net('exit')
        global_init_net = core.Net('init')
        reader.setup_ex(global_init_net, global_exit_net)

        out_queue = None
        writer = None

        steps = []
        for thread_id in range(num_threads):
            with NetBuilder(name='t:%d' % thread_id) as nb:
                init_net = core.Net('init')
                exit_net = core.Net('exit')
                read_nets, status, rec = reader.read_record_ex(
                    init_net, exit_net)

                if rec is not None:
                    if writer is None:
                        # hack so that the out queue gets the right name prefix
                        # (otherwise they would be prefixed with the thread id)
                        with NetBuilder(_fullname=task.name):
                            out_queue, writer = _init_output(
                                output, capacity, global_init_net,
                                global_exit_net)
                    write_nets, _ = writer.write_record_ex(
                        rec, init_net, exit_net, status)
                else:
                    write_nets = []
                ops.net(init_net)
                ops.net(
                    core.execution_step('body',
                                        list(read_nets) + list(write_nets),
                                        should_stop_blob=status))
                ops.net(exit_net)
            steps.append(core.to_execution_step(nb))
        ops.net(global_init_net)
        ops.net(core.execution_step('body', steps, concurrent_substeps=True))
        ops.net(global_exit_net)
    return out_queue, task
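Structurally, the Task body is: `global_init_net`, then one execution step
whose substeps are the `num_threads` per-thread steps run with
`concurrent_substeps=True`, then `global_exit_net`. Each per-thread step loops
its read and write nets until its `status` blob becomes true, so every thread
winds down independently once the reader or writer signals a stop.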
Example #21
def _static_threads_task(name, group, final_outputs, reader, num_threads,
                         output, capacity):
    node_name = str(Node.current())
    profiler_name = "{0}/{1}/{2}/{3}/{4}".format(
        node_name,
        "pipe",
        name,
        # `input` is not a parameter of this function; use the reader instead.
        processor_name(reader) if reader else "NoInput",
        processor_name(output) if output else "NoOutput")

    with Task(name=name, group=group, outputs=final_outputs) as task:
        global_exit_net = core.Net('exit')
        global_init_net = core.Net('init')
        reader.setup_ex(global_init_net, global_exit_net)

        out_queue = None
        writer = None

        steps = []
        for thread_id in range(num_threads):
            with NetBuilder(name='t:%d' % thread_id) as nb:
                init_net = core.Net('init')
                exit_net = core.Net('exit')
                read_nets, status, rec = reader.read_record_ex(
                    init_net, exit_net)
                init_net.ConstantFill(
                    [], [status],
                    shape=[],
                    value=False,
                    dtype=core.DataType.BOOL
                )

                if rec is not None:
                    if writer is None:
                        # hack so that the out queue gets the right name prefix
                        # (otherwise they would be prefixed with the thread id)
                        with NetBuilder(_fullname=task.name):
                            out_queue, writer = _init_output(
                                output, capacity, global_init_net,
                                global_exit_net)
                    write_nets, _ = writer.write_record_ex(
                        rec, init_net, exit_net, status)
                else:
                    write_nets = []

                timer_start_net = core.Net('timer_start')
                timer = timer_start_net.TimerBegin([], counter_name=profiler_name)
                timer_end_net = core.Net('timer_end')
                timer_end_net.TimerEnd(timer, [])

                ops.net(init_net)
                ops.net(core.execution_step(
                    'body',
                    [timer_start_net] + list(read_nets) + list(write_nets) +
                    [timer_end_net],
                    should_stop_blob=status))
                ops.net(timer_end_net)
                ops.net(exit_net)
            steps.append(core.to_execution_step(nb))
        ops.net(global_init_net)
        ops.net(core.execution_step('body', steps, concurrent_substeps=True))
        ops.net(global_exit_net)
    return out_queue, task
Example #22
def _pipe_step(input,
               output=None,
               num_threads=1,
               processor=None,
               name=None,
               capacity=None,
               group=None,
               final_outputs=None):
    """
    """
    if isinstance(input, Reader):
        reader = input
    elif hasattr(input, 'reader'):
        reader = input.reader()
    else:
        raise ValueError('input must be a reader, queue or stream.')

    if processor is not None:
        reader = ProcessingReader(reader, processor)

    if num_threads == 0:
        assert output is None
        return reader, None

    if name is None and processor is not None:
        name = processor_name(processor)
    if name is None and output is not None:
        name = 'pipe_into:%s' % processor_name(output)
    if name is None:
        name = 'pipe_from:%s' % processor_name(input)

    node_name = str(Node.current())
    profiler_name = "{0}/{1}/{2}/{3}/{4}".format(
        node_name, "pipe", name,
        processor_name(input) if input else "NoInput",
        processor_name(output) if output else "NoOutput")

    with Task(name=name, group=group, outputs=final_outputs) as task:
        global_exit_net = core.Net('exit')
        global_init_net = core.Net('init')
        reader.setup_ex(global_init_net, global_exit_net)

        out_queue = None
        writer = None

        steps = []
        for thread_id in range(num_threads):
            with NetBuilder(name='t:%d' % thread_id) as nb:
                init_net = core.Net('init')
                exit_net = core.Net('exit')
                read_nets, status, rec = reader.read_record_ex(
                    init_net, exit_net)
                init_net.ConstantFill([], [status],
                                      shape=[],
                                      value=False,
                                      dtype=core.DataType.BOOL)

                if rec is not None:
                    if writer is None:
                        # hack so that the out queue gets the right name prefix
                        # (otherwise they would be prefixed with the thread id)
                        with NetBuilder(_fullname=task.name):
                            out_queue, writer = _init_output(
                                output, capacity, global_init_net,
                                global_exit_net)
                    write_nets, _ = writer.write_record_ex(
                        rec, init_net, exit_net, status)
                else:
                    write_nets = []

                timer_start_net = core.Net('timer_start')
                timer = timer_start_net.TimerBegin([],
                                                   counter_name=profiler_name)
                timer_end_net = core.Net('timer_end')
                timer_end_net.TimerEnd(timer, [])

                ops.net(init_net)
                ops.net(
                    core.execution_step('body',
                                        [timer_start_net] + list(read_nets) +
                                        list(write_nets) + [timer_end_net],
                                        should_stop_blob=status))
                ops.net(timer_end_net)
                ops.net(exit_net)
            steps.append(core.to_execution_step(nb))
        ops.net(global_init_net)
        ops.net(core.execution_step('body', steps, concurrent_substeps=True))
        ops.net(global_exit_net)
    return out_queue, task
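This later revision of `_pipe_step` differs from Example #20 in two ways: it
explicitly initializes each thread's `status` blob to False via ConstantFill,
and it brackets every loop iteration with TimerBegin/TimerEnd ops under a
per-node `profiler_name`, so per-pipe timing counters can be collected.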