コード例 #1
0
    def load_blobs_locally(self, nodes, blob_names, epoch, session):
        """Load checkpointed blobs for each node into the current node.

        Per-node checkpoint managers are created on the first call; on later
        calls ``nodes`` must equal the nodes they were created for.

        Args:
            nodes: Nodes whose checkpoints are read. Used to create the
                per-node managers, or checked against the existing ones.
            blob_names: A list of strings. Each string is the name of a
                blob.
            epoch: An integer. The checkpoint epoch to load from.
            session: A Session object used to run the existence-check and
                load tasks.

        Returns:
            True once the load task has been run for every node manager;
            False as soon as one manager reports that its DB for ``epoch``
            does not exist (remaining managers are not visited).
        """
        if self._node_managers is None:
            self._node_managers = []
            for node in nodes:
                with Node(node):
                    node_manager = CheckpointManager(
                        db_prefix=self._db_prefix,
                        node_name=str(node),
                        db_type=self._db_type)
                    self._node_managers.append((node, node_manager))
        else:
            known_nodes = [known for known, _ in self._node_managers]
            assert known_nodes == nodes
        assert self._node_managers is not None, 'must initialize node managers'
        for _, node_manager in self._node_managers:
            # Check for the DB first so a missing checkpoint is reported
            # instead of attempting the load.
            exists_task = node_manager.check_db_exists(epoch)
            session.run(exists_task)
            db_found = exists_task.outputs()[0].fetch()
            if not db_found:
                missing_db = db_name(
                    epoch, node_manager._node_name, node_manager._db_prefix)
                logger.info('DB %s does not exist!' % missing_db)
                return False
            session.run(
                node_manager.load_blobs_from_checkpoint(blob_names, epoch))
        logger.info('Successfully loaded from checkpoints.')
        return True
コード例 #2
0
def build_job(node_id):
    """Build a one-reader job plus a task that exposes its running total.

    Inside node ``'reader<node_id>'`` the job's init group creates a constant
    dataset with a ``val`` field holding 0..9 and a ``total`` blob created
    from ``[100]``. A reader limited to 3 iterations is piped through a
    processor that adds each record's ``val`` into ``total``; the reader's
    ``data_finished()`` blob is registered as the job's stop signal.

    Args:
        node_id: Identifier appended (via ``str``) to the node, net and
            dataset names.

    Returns:
        A ``(job, total_fetcher)`` tuple, where ``total_fetcher`` is a Task
        over an empty net whose outputs are ``[total]``.
    """
    suffix = str(node_id)
    with Job() as job:
        with Node('reader' + suffix):
            with job.init_group:
                init_net = core.Net('init_net' + suffix)
                record = ConstRecord(
                    init_net, Struct(('val', np.array(range(10)))))
                dataset = Dataset(record, name='dataset' + suffix)
                full_reader = dataset.reader(init_net)
                total = init_net.Const([100])
                Task(step=init_net)

            def inc_total(rec):
                # Accumulate the record's value into the shared total blob.
                add_net = core.Net('inc_total' + suffix)
                add_net.Add([total, rec.val()], [total])
                return [add_net]

            limited_reader = ReaderWithLimit(full_reader, num_iter=3)
            pipe(limited_reader, processor=inc_total)
            job.add_stop_signal(limited_reader.data_finished())

    total_fetcher = Task(step=core.Net('empty'), outputs=[total])
    return job, total_fetcher
コード例 #3
0
 def _task_group(self, func, *args, **kw):
     """Call ``func`` once per node manager and return the enclosing group.

     Each call is ``func(manager, *args, **kw)`` and is made inside that
     manager's ``Node`` context, all within a single
     ``TaskGroup(WorkspaceType.GLOBAL)``.

     Raises:
         AssertionError: if the node managers have not been created yet.
     """
     managers = self._node_managers
     assert managers is not None, 'init must be called first.'
     with TaskGroup(WorkspaceType.GLOBAL) as group:
         for target_node, node_manager in managers:
             with Node(target_node):
                 func(node_manager, *args, **kw)
         return group
コード例 #4
0
ファイル: core_test.py プロジェクト: thomascong121/NCRF
    def test_create_plan_from_proto_correctly(self):
        """A plan rebuilt from its proto matches the plan it came from."""
        from caffe2.python.net_builder import ops
        with Node('trainer'), Task(name='my_task', num_instances=2) as task:
            with ops.task_init():
                globl = ops.Const(0)
            with ops.task_instance_init():
                local = ops.Const(0)
            with ops.loop(100):
                ops.Copy(globl, local)
            with ops.task_instance_exit():
                ops.Add([globl, local], [globl])
            with ops.task_exit():
                ops.Mul([globl, globl], [globl])

        original_plan = core.Plan(task.get_step())
        rebuilt_plan = core.Plan.create_from_proto(original_plan.Proto())
        both_plans = (original_plan, rebuilt_plan)

        # Structural checks, applied to the original first and then the copy.
        for candidate in both_plans:
            self.assertEqual(len(candidate.Steps()), 1)
        for candidate in both_plans:
            self.assertEqual(len(candidate.Proto().network), 9)
        for candidate in both_plans:
            self.assertEqual(len(candidate.Proto().execution_step), 1)
        self.assertEqual(
            original_plan.Steps()[0].Name(), rebuilt_plan.Steps()[0].Name())

        original_nets = original_plan.Nets()
        rebuilt_nets = rebuilt_plan.Nets()
        self.assertEqual(len(original_nets), len(rebuilt_nets))
        for original_net, rebuilt_net in zip(original_nets, rebuilt_nets):
            # Nets created for the rebuilt plan get a postfix appended to
            # their name, so only the leading part is compared.
            expected_prefix = original_net.Name()
            self.assertEqual(
                expected_prefix, rebuilt_net.Name()[:len(expected_prefix)])
コード例 #5
0
 def init(self, nodes, retrieve_from_epoch=None, path_prefix=None,
          path_type=None):
     """Create one checkpoint manager per node and build their init tasks.

     If managers already exist, only checks that ``nodes`` matches the nodes
     they were created for and returns ``None``.

     Args:
         nodes: Nodes to create checkpoint managers for.
         retrieve_from_epoch: Forwarded to ``CheckpointManager.init``.
         path_prefix: Stored on the instance and forwarded to the metadata
             handler (if any) and to ``CheckpointManager.init``.
         path_type: Stored and forwarded like ``path_prefix``.

     Returns:
         The task group produced by ``self._task_group`` on first
         initialization, otherwise ``None``.
     """
     if self._node_managers is not None:
         known_nodes = [known for known, _ in self._node_managers]
         assert known_nodes == nodes
         return
     self._node_managers = []
     self._path_prefix, self._path_type = path_prefix, path_type
     self._node_names = list(map(str, nodes))
     handler = self._metadata_handler
     if handler:
         handler.init(
             db_prefix=self._db_prefix,
             db_type=self._db_type,
             node_names=self._node_names,
             path_prefix=self._path_prefix,
             path_type=self._path_type)
     for node in nodes:
         with Node(node):
             node_manager = CheckpointManager(
                 db_prefix=self._db_prefix,
                 node_name=str(node),
                 db_type=self._db_type)
             self._node_managers.append((node, node_manager))
     # `node` here is whatever the loop above bound last.
     # NOTE(review): this raises NameError when `nodes` is empty, and every
     # manager receives the same single-node list; confirm this is intended.
     init_kwargs = dict(
         nodes=[node],
         retrieve_from_epoch=retrieve_from_epoch,
         path_prefix=path_prefix,
         path_type=path_type)
     return self._task_group(CheckpointManager.init, **init_kwargs)
コード例 #6
0
def _static_threads_task(name, group, final_outputs, reader, num_threads,
                         output, capacity):
    """Build a Task that reads from `reader` in `num_threads` concurrent steps.

    One NetBuilder is created per thread, each with its own init/exit nets
    and a 'body' execution step made of the read nets (plus write nets when
    the reader yields a record), bracketed by timer nets. The per-thread
    steps are then run as concurrent substeps between the global init and
    exit nets.

    Args:
        name: Task name; also embedded in the profiler counter name.
        group: Passed to `Task` as `group`.
        final_outputs: Passed to `Task` as `outputs`.
        reader: Object providing `setup_ex` and `read_record_ex`.
        num_threads: Number of per-thread steps to build.
        output: Forwarded to `_init_output`; also used in the profiler name.
        capacity: Forwarded to `_init_output`.

    Returns:
        `(out_queue, task)`. `out_queue` is the first value returned by
        `_init_output`, or None if no thread's read produced a record.
    """
    node_name = str(Node.current())
    # NOTE(review): `input` is not a parameter of this function, so unless
    # the module defines its own `input` this resolves to the builtin and
    # the "NoInput" branch is never taken -- confirm what was intended.
    profiler_name = "{0}/{1}/{2}/{3}/{4}".format(
        node_name, "pipe", name,
        processor_name(input) if input else "NoInput",
        processor_name(output) if output else "NoOutput")

    with Task(name=name, group=group, outputs=final_outputs) as task:
        global_exit_net = core.Net('exit')
        global_init_net = core.Net('init')
        reader.setup_ex(global_init_net, global_exit_net)

        # Created lazily by the first thread whose read yields a record,
        # then shared by all later threads.
        out_queue = None
        writer = None

        steps = []
        for thread_id in range(num_threads):
            with NetBuilder(name='t:%d' % thread_id) as nb:
                init_net = core.Net('init')
                exit_net = core.Net('exit')
                read_nets, status, rec = reader.read_record_ex(
                    init_net, exit_net)
                # Initialize the status blob to False in the thread's init
                # net; it is used below as the body's should_stop_blob.
                init_net.ConstantFill([], [status],
                                      shape=[],
                                      value=False,
                                      dtype=core.DataType.BOOL)

                if rec is not None:
                    if writer is None:
                        # hack so that the out queue gets the right name prefix
                        # (otherwise they would be prefixed with the thread id)
                        with NetBuilder(_fullname=task.name):
                            out_queue, writer = _init_output(
                                output, capacity, global_init_net,
                                global_exit_net)
                    write_nets, _ = writer.write_record_ex(
                        rec, init_net, exit_net, status)
                else:
                    write_nets = []

                timer_start_net = core.Net('timer_start')
                timer = timer_start_net.TimerBegin([],
                                                   counter_name=profiler_name)
                timer_end_net = core.Net('timer_end')
                timer_end_net.TimerEnd(timer, [])

                ops.net(init_net)
                ops.net(
                    core.execution_step('body',
                                        [timer_start_net] + list(read_nets) +
                                        list(write_nets) + [timer_end_net],
                                        should_stop_blob=status))
                # timer_end_net is added once more after the body step.
                # NOTE(review): presumably to close a timer left open when
                # the body stops part-way through -- confirm.
                ops.net(timer_end_net)
                ops.net(exit_net)
            steps.append(core.to_execution_step(nb))
        ops.net(global_init_net)
        ops.net(core.execution_step('body', steps, concurrent_substeps=True))
        ops.net(global_exit_net)
    return out_queue, task
コード例 #7
0
ファイル: pipeline.py プロジェクト: wolfviking0/caffe2_SSD
def _runtime_threads_task(name, group, final_outputs, reader, num_threads,
                          output, capacity):
    """Build a Task with `num_threads` instances that reads from `reader`.

    Unlike building one step per thread, a single set of read/write nets is
    created and the Task is constructed with `num_instances=num_threads`.
    The global init/exit nets are registered under `ops.task_init()` /
    `ops.task_exit()`, and the per-instance init/exit nets under
    `ops.task_instance_init()` / `ops.task_instance_exit()`.

    Args:
        name: Task name; also embedded in the profiler counter name.
        group: Passed to `Task` as `group`.
        final_outputs: Passed to `Task` as `outputs`.
        reader: Object providing `setup_ex` and `read_record_ex`.
        num_threads: Passed to `Task` as `num_instances`.
        output: Forwarded to `_init_output`; also used in the profiler name.
        capacity: Forwarded to `_init_output`.

    Returns:
        `(out_queue, task)`. `out_queue` is the first value returned by
        `_init_output`, or None when the read produced no record.
    """
    node_name = str(Node.current())
    # NOTE(review): `input` is not a parameter of this function, so unless
    # the module defines its own `input` this resolves to the builtin and
    # the "NoInput" branch is never taken -- confirm what was intended.
    profiler_name = "{0}/{1}/{2}/{3}/{4}".format(
        node_name,
        "pipe",
        name,
        processor_name(input) if input else "NoInput",
        processor_name(output) if output else "NoOutput")

    with Task(name=name, group=group, outputs=final_outputs,
              num_instances=num_threads) as task:
        global_exit_net = core.Net('pipe:exit')
        global_init_net = core.Net('pipe:init')
        reader.setup_ex(global_init_net, global_exit_net)

        init_net = core.Net('pipe:instance:init')
        exit_net = core.Net('pipe:instance:exit')
        read_nets, status, rec = reader.read_record_ex(init_net, exit_net)
        # Initialize the status blob to False in the instance init net; it
        # is used below as the body's should_stop_blob.
        init_net.ConstantFill(
            [], [status],
            shape=[],
            value=False,
            dtype=core.DataType.BOOL
        )

        if rec is not None:
            out_queue, writer = _init_output(
                output, capacity, global_init_net, global_exit_net)
            write_nets, _ = writer.write_record_ex(
                rec, init_net, exit_net, status)
        else:
            out_queue = None
            write_nets = []

        with ops.task_init():
            ops.net(global_init_net)
        with ops.task_instance_init():
            ops.net(init_net)

        timer_start_net = core.Net('timer_start')
        timer = timer_start_net.TimerBegin([], counter_name=profiler_name)
        timer_end_net = core.Net('timer_end')
        timer_end_net.TimerEnd(timer, [])

        ops.net(core.execution_step(
            'body',
            [timer_start_net] + list(read_nets) + list(write_nets) +
            [timer_end_net],
            should_stop_blob=status))
        # timer_end_net is added once more after the body step.
        # NOTE(review): presumably to close a timer left open when the body
        # stops part-way through -- confirm.
        ops.net(timer_end_net)

        with ops.task_instance_exit():
            ops.net(exit_net)
        with ops.task_exit():
            ops.net(global_exit_net)

    return out_queue, task
コード例 #8
0
 def build(self, epoch, checkpoint_manager):
     """Build a task group that copies every node's checkpoint DB.

     For each ``(node, manager)`` pair of ``checkpoint_manager``, a Task is
     created under that node containing a Python op that invokes
     ``local_copy_op`` with the DB path for ``epoch`` as source and
     ``<self.dest_dir>/<node>`` as destination.

     Args:
         epoch: Checkpoint epoch whose DBs are copied.
         checkpoint_manager: Object exposing ``_node_managers``.

     Returns:
         The ``TaskGroup`` holding one copy task per node.
     """
     with TaskGroup(WorkspaceType.GLOBAL) as upload_task_group:
         for node, manager in checkpoint_manager._node_managers:
             node_label = str(node)
             with Node(node_label), Task():
                 copy_args = [
                     db_name(epoch, manager._node_name, manager._db_prefix),
                     os.path.join(self.dest_dir, node_label),
                 ]
                 copy_op = ops.Python((local_copy_op, copy_args, {}))
                 copy_op([], [])
     return upload_task_group
コード例 #9
0
ファイル: checkpoint.py プロジェクト: Chomolungma/caffe2
 def _task_group(self, func, *args, **kw):
     """Call ``func`` for each non-reader node manager inside one task group.

     Each call is ``func(manager, *args, **kw)``, made inside that manager's
     ``Node`` context within a ``TaskGroup(WorkspaceType.GLOBAL)``. Nodes
     whose string form contains ``"reader"`` are skipped.

     Raises:
         AssertionError: if the node managers have not been created yet.
     """
     managers = self._node_managers
     assert managers is not None, 'init must be called first.'
     with TaskGroup(WorkspaceType.GLOBAL) as group:
         for target_node, node_manager in managers:
             # TODO(aartibasant, T21070353): Enable the checkpoints for
             # readers.
             # The checkpointing for readers is broken because of D5582328.
             # Reader checkpoints stay disabled until it is fixed.
             if "reader" not in str(target_node):
                 with Node(target_node):
                     func(node_manager, *args, **kw)
         return group
コード例 #10
0
ファイル: checkpoint.py プロジェクト: Chomolungma/caffe2
 def init(self, nodes, retrieve_from_epoch=None):
     """Create one node manager per node and build their init tasks.

     If managers already exist, only checks that ``nodes`` matches the nodes
     they were created for and returns ``None``.

     Args:
         nodes: Nodes to create managers for; each is joined onto
             ``self._db_prefix`` to form that manager's ``db`` argument.
         retrieve_from_epoch: Forwarded to the manager class's ``init``.

     Returns:
         The task group produced by ``self._task_group`` on first
         initialization, otherwise ``None``.
     """
     if self._node_managers is not None:
         known_nodes = [known for known, _ in self._node_managers]
         assert known_nodes == nodes
         return
     self._node_managers = []
     for node in nodes:
         with Node(node):
             node_manager = self._node_manager_class(
                 db=os.path.join(self._db_prefix, node),
                 db_type=self._db_type)
             self._node_managers.append((node, node_manager))
     # `node` here is whatever the loop above bound last.
     # NOTE(review): this raises NameError when `nodes` is empty -- confirm.
     init_kwargs = dict(nodes=[node], retrieve_from_epoch=retrieve_from_epoch)
     return self._task_group(self._node_manager_class.init, **init_kwargs)
コード例 #11
0
 def init(self, nodes, retrieve_from_epoch=None):
     """Create one CheckpointManager per node and build their init tasks.

     If managers already exist, only checks that ``nodes`` matches the nodes
     they were created for and returns ``None``.

     Args:
         nodes: Nodes to create checkpoint managers for.
         retrieve_from_epoch: Forwarded to ``CheckpointManager.init``.

     Returns:
         The task group produced by ``self._task_group`` on first
         initialization, otherwise ``None``.
     """
     if self._node_managers is not None:
         known_nodes = [known for known, _ in self._node_managers]
         assert known_nodes == nodes
         return
     self._node_managers = []
     for node in nodes:
         with Node(node):
             node_manager = CheckpointManager(
                 db_prefix=self._db_prefix,
                 node_name=str(node),
                 db_type=self._db_type)
             self._node_managers.append((node, node_manager))
     # `node` here is whatever the loop above bound last.
     # NOTE(review): this raises NameError when `nodes` is empty -- confirm.
     init_kwargs = dict(nodes=[node], retrieve_from_epoch=retrieve_from_epoch)
     return self._task_group(CheckpointManager.init, **init_kwargs)
コード例 #12
0
def build_pipeline(node_id):
    """Add a small accumulate-from-reader pipeline to the current Job.

    Inside node ``'trainer_<node_id>'`` the current job's init group creates
    a constant dataset with a ``val`` field holding 0..9 and a ``total``
    blob created from ``[100]``. A reader limited to 3 iterations is piped
    through a processor that adds each record's ``val`` into ``total``, and
    the reader's ``data_finished()`` blob is registered as a stop signal.

    Args:
        node_id: Integer used in the node and dataset names.

    Returns:
        A one-element list containing the ``total`` blob.
    """
    with Node('trainer_%d' % node_id):
        current_job = Job.current()
        with current_job.init_group, Task():
            record = ConstRecord(
                ops, Struct(('val', np.array(list(range(10))))))
            dataset = Dataset(record, name='dataset:%d' % node_id)
            full_reader = dataset.reader(ops)
            total = ops.Const([100])

        def inc_total(rec):
            # Accumulate the record's value into the shared total blob.
            ops.Add([total, rec.val()], [total])

        limited_reader = ReaderWithLimit(full_reader, num_iter=3)
        pipe(limited_reader, processor=inc_total)
        current_job.add_stop_signal(limited_reader.data_finished())
    return [total]
コード例 #13
0
ファイル: checkpoint_test.py プロジェクト: xwuShirley/pytorch
    def test_download_group_simple(self):
        """
        Check that the download task group runs after the epoch group:
        the blob copied by the download net must equal the trained output.
        """
        model = model_helper.ModelHelper(name="test_model")
        download_net = core.Net("download_net")

        blob_names = ("input1", "input2", "output", "download_result")
        for blob_name in blob_names:
            model.param_init_net.ConstantFill(
                [], [blob_name], shape=[8], value=1.0, run_once=0)
        model.net.Add(["input1", "input2"], ["output"])
        download_net.Copy(["output"], ["download_result"])

        # Every blob starts at 1.0, so after training "output" is 2.0 and
        # the download net should copy that same value.
        with Job() as job, Node("trainer:0"):
            with job.init_group:
                Task(step=model.param_init_net)
            with job.epoch_group:
                with Task(), ops.loop(1):
                    ops.net(model.net)
            with job.download_group:
                Task(step=download_net)

            epoch_limiter(job, 1)

        ws = workspace.C.Workspace()
        session = LocalSession(ws)
        JobRunner(job).train(session)

        expected_result = np.full(8, 2.0).astype(np.float32)
        for checked_blob in ("output", "download_result"):
            self.assertTrue(
                np.array_equal(expected_result, ws.fetch_blob(checked_blob)))
0
ファイル: checkpoint.py プロジェクト: rgomathi/caffe2
    def load_blobs_locally(self, nodes, blob_names, epoch, session):
        """Load checkpointed blobs for each node into the current node.

        Per-node managers are created on the first call; on later calls
        ``nodes`` must equal the nodes they were created for.

        Args:
            nodes: Nodes whose checkpoints are read. Each is joined onto
                ``self._db_prefix`` to form that manager's ``db`` argument.
            blob_names: A list of strings. Each string is the name of a
                blob.
            epoch: An integer. The checkpoint epoch to load from.
            session: A Session object to execute the Load ops.
        """
        if self._node_managers is None:
            self._node_managers = []
            for node in nodes:
                with Node(node):
                    node_manager = self._node_manager_class(
                        db=os.path.join(self._db_prefix, node),
                        db_type=self._db_type)
                    self._node_managers.append((node, node_manager))
        else:
            known_nodes = [known for known, _ in self._node_managers]
            assert known_nodes == nodes
        assert self._node_managers is not None, 'must initialize node managers'
        for _, node_manager in self._node_managers:
            # One task group per manager, run as soon as it is built.
            with TaskGroup(WorkspaceType.GLOBAL) as load_group:
                node_manager.load_blobs_from_checkpoint(blob_names, epoch)
            session.run(load_group)
コード例 #15
0
ファイル: pipeline.py プロジェクト: Yangqing/caffe2
def _static_threads_task(name, group, final_outputs, reader, num_threads,
                         output, capacity):
    """Build a Task that reads from `reader` in `num_threads` concurrent steps.

    One NetBuilder is created per thread, each with its own init/exit nets
    and a 'body' execution step made of the read nets (plus write nets when
    the reader yields a record), bracketed by timer nets. The per-thread
    steps are then run as concurrent substeps between the global init and
    exit nets.

    Args:
        name: Task name; also embedded in the profiler counter name.
        group: Passed to `Task` as `group`.
        final_outputs: Passed to `Task` as `outputs`.
        reader: Object providing `setup_ex` and `read_record_ex`.
        num_threads: Number of per-thread steps to build.
        output: Forwarded to `_init_output`; also used in the profiler name.
        capacity: Forwarded to `_init_output`.

    Returns:
        `(out_queue, task)`. `out_queue` is the first value returned by
        `_init_output`, or None if no thread's read produced a record.
    """
    node_name = str(Node.current())
    # NOTE(review): `input` is not a parameter of this function, so unless
    # the module defines its own `input` this resolves to the builtin and
    # the "NoInput" branch is never taken -- confirm what was intended.
    profiler_name = "{0}/{1}/{2}/{3}/{4}".format(
        node_name,
        "pipe",
        name,
        processor_name(input) if input else "NoInput",
        processor_name(output) if output else "NoOutput")

    with Task(name=name, group=group, outputs=final_outputs) as task:
        global_exit_net = core.Net('exit')
        global_init_net = core.Net('init')
        reader.setup_ex(global_init_net, global_exit_net)

        # Created lazily by the first thread whose read yields a record,
        # then shared by all later threads.
        out_queue = None
        writer = None

        steps = []
        for thread_id in range(num_threads):
            with NetBuilder(name='t:%d' % thread_id) as nb:
                init_net = core.Net('init')
                exit_net = core.Net('exit')
                read_nets, status, rec = reader.read_record_ex(
                    init_net, exit_net)
                # Initialize the status blob to False in the thread's init
                # net; it is used below as the body's should_stop_blob.
                init_net.ConstantFill(
                    [], [status],
                    shape=[],
                    value=False,
                    dtype=core.DataType.BOOL
                )

                if rec is not None:
                    if writer is None:
                        # hack so that the out queue gets the right name prefix
                        # (otherwise they would be prefixed with the thread id)
                        with NetBuilder(_fullname=task.name):
                            out_queue, writer = _init_output(
                                output, capacity, global_init_net,
                                global_exit_net)
                    write_nets, _ = writer.write_record_ex(
                        rec, init_net, exit_net, status)
                else:
                    write_nets = []

                timer_start_net = core.Net('timer_start')
                timer = timer_start_net.TimerBegin([], counter_name=profiler_name)
                timer_end_net = core.Net('timer_end')
                timer_end_net.TimerEnd(timer, [])

                ops.net(init_net)
                ops.net(core.execution_step(
                    'body',
                    [timer_start_net] + list(read_nets) + list(write_nets) +
                    [timer_end_net],
                    should_stop_blob=status))
                # timer_end_net is added once more after the body step.
                # NOTE(review): presumably to close a timer left open when
                # the body stops part-way through -- confirm.
                ops.net(timer_end_net)
                ops.net(exit_net)
            steps.append(core.to_execution_step(nb))
        ops.net(global_init_net)
        ops.net(core.execution_step('body', steps, concurrent_substeps=True))
        ops.net(global_exit_net)
    return out_queue, task
コード例 #16
0
def _pipe_step(input,
               output=None,
               num_threads=1,
               processor=None,
               name=None,
               capacity=None,
               group=None,
               final_outputs=None):
    """Build a Task that reads from `input` and optionally writes to `output`.

    The reader is taken from `input` (directly if it is a `Reader`, else via
    `input.reader()`), wrapped in a `ProcessingReader` when a `processor` is
    given, and then read by `num_threads` per-thread steps that run as
    concurrent substeps between the global init and exit nets.

    Args:
        input: A `Reader`, or an object with a `reader()` method.
        output: Forwarded to `_init_output`; must be None when
            `num_threads` is 0.
        num_threads: Number of per-thread steps to build. With 0, no Task
            is created.
        processor: Optional processor wrapped around the reader.
        name: Task name. When None it is derived, in order of preference,
            from the processor, the output ('pipe_into:...') or the input
            ('pipe_from:...').
        capacity: Forwarded to `_init_output`.
        group: Passed to `Task` as `group`.
        final_outputs: Passed to `Task` as `outputs`.

    Returns:
        `(reader, None)` when `num_threads` is 0; otherwise
        `(out_queue, task)`, where `out_queue` is the first value returned
        by `_init_output`, or None if no thread's read produced a record.

    Raises:
        ValueError: if `input` is neither a `Reader` nor has a `reader`
            attribute.
    """
    if isinstance(input, Reader):
        reader = input
    elif hasattr(input, 'reader'):
        reader = input.reader()
    else:
        raise ValueError('in must be a reader, queue or streaam.')

    if processor is not None:
        reader = ProcessingReader(reader, processor)

    # Zero threads: hand back the (possibly wrapped) reader without
    # building any Task.
    if num_threads == 0:
        assert output is None
        return reader, None

    if name is None and processor is not None:
        name = processor_name(processor)
    if name is None and output is not None:
        name = 'pipe_into:%s' % processor_name(output)
    if name is None:
        name = 'pipe_from:%s' % processor_name(input)

    node_name = str(Node.current())
    profiler_name = "{0}/{1}/{2}/{3}/{4}".format(
        node_name, "pipe", name,
        processor_name(input) if input else "NoInput",
        processor_name(output) if output else "NoOutput")

    with Task(name=name, group=group, outputs=final_outputs) as task:
        global_exit_net = core.Net('exit')
        global_init_net = core.Net('init')
        reader.setup_ex(global_init_net, global_exit_net)

        # Created lazily by the first thread whose read yields a record,
        # then shared by all later threads.
        out_queue = None
        writer = None

        steps = []
        for thread_id in range(num_threads):
            with NetBuilder(name='t:%d' % thread_id) as nb:
                init_net = core.Net('init')
                exit_net = core.Net('exit')
                read_nets, status, rec = reader.read_record_ex(
                    init_net, exit_net)
                # Initialize the status blob to False in the thread's init
                # net; it is used below as the body's should_stop_blob.
                init_net.ConstantFill([], [status],
                                      shape=[],
                                      value=False,
                                      dtype=core.DataType.BOOL)

                if rec is not None:
                    if writer is None:
                        # hack so that the out queue gets the right name prefix
                        # (otherwise they would be prefixed with the thread id)
                        with NetBuilder(_fullname=task.name):
                            out_queue, writer = _init_output(
                                output, capacity, global_init_net,
                                global_exit_net)
                    write_nets, _ = writer.write_record_ex(
                        rec, init_net, exit_net, status)
                else:
                    write_nets = []

                timer_start_net = core.Net('timer_start')
                timer = timer_start_net.TimerBegin([],
                                                   counter_name=profiler_name)
                timer_end_net = core.Net('timer_end')
                timer_end_net.TimerEnd(timer, [])

                ops.net(init_net)
                ops.net(
                    core.execution_step('body',
                                        [timer_start_net] + list(read_nets) +
                                        list(write_nets) + [timer_end_net],
                                        should_stop_blob=status))
                # timer_end_net is added once more after the body step.
                # NOTE(review): presumably to close a timer left open when
                # the body stops part-way through -- confirm.
                ops.net(timer_end_net)
                ops.net(exit_net)
            steps.append(core.to_execution_step(nb))
        ops.net(global_init_net)
        ops.net(core.execution_step('body', steps, concurrent_substeps=True))
        ops.net(global_exit_net)
    return out_queue, task