def load_blobs_locally(self, nodes, blob_names, epoch, session):
    """Loads the necessary blobs from the checkpoints to the current node.

    Args:
        blob_names: A list of strings. Each string is the name of a blob.
        epoch: An integer. The checkpoint epoch to load from.
        session: A Session object to execute the Load ops.
    """
    if self._node_managers is not None:
        assert [node for node, _ in self._node_managers] == nodes
    else:
        self._node_managers = []
        for node in nodes:
            with Node(node):
                manager = CheckpointManager(
                    db_prefix=self._db_prefix,
                    node_name=str(node),
                    db_type=self._db_type)
                self._node_managers.append((node, manager))
    assert self._node_managers is not None, 'must initialize node managers'
    for _, manager in self._node_managers:
        existence_task = manager.check_db_exists(epoch)
        session.run(existence_task)
        existence = existence_task.outputs()[0].fetch()
        if not existence:
            logger.info('DB %s does not exist!' %
                        db_name(epoch, manager._node_name, manager._db_prefix))
            return False
        load_task = manager.load_blobs_from_checkpoint(blob_names, epoch)
        session.run(load_task)
    logger.info('Successfully loaded from checkpoints.')
    return True

def build_job(node_id):
    all_outputs = []
    with Job() as job:
        with Node('reader' + str(node_id)):
            with job.init_group:
                init_net = core.Net('init_net' + str(node_id))
                data_arr = Struct(('val', np.array(range(10))))
                data = ConstRecord(init_net, data_arr)
                ds = Dataset(data, name='dataset' + str(node_id))
                full_reader = ds.reader(init_net)
                total = init_net.Const([100])
                Task(step=init_net)

            def inc_total(rec):
                net = core.Net('inc_total' + str(node_id))
                net.Add([total, rec.val()], [total])
                return [net]

            epoch_reader = ReaderWithLimit(full_reader, num_iter=3)
            pipe(epoch_reader, processor=inc_total)
            job.add_stop_signal(epoch_reader.data_finished())
        all_outputs.append(total)

    total_fetcher = Task(step=core.Net('empty'), outputs=all_outputs)
    return job, total_fetcher

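# Hedged usage sketch (an assumption, not part of the original tests): the
# job returned by build_job could be driven with the same LocalSession /
# JobRunner pattern used in test_download_group_simple below, and the
# accumulated total fetched from the returned fetcher task the way
# load_blobs_locally fetches its existence check. The helper name and the
# import paths are hypothetical.
def _example_run_build_job(node_id=0):
    from caffe2.python import workspace                 # assumed import paths
    from caffe2.python.checkpoint import JobRunner
    from caffe2.python.session import LocalSession

    job, total_fetcher = build_job(node_id)
    session = LocalSession(workspace.C.Workspace())
    JobRunner(job).train(session)       # runs the job's init/epoch/exit groups
    session.run(total_fetcher)          # materializes the accumulated total
    return total_fetcher.outputs()[0].fetch()
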
def _task_group(self, func, *args, **kw):
    assert self._node_managers is not None, 'init must be called first.'
    with TaskGroup(WorkspaceType.GLOBAL) as task_group:
        for node, manager in self._node_managers:
            with Node(node):
                func(manager, *args, **kw)
        return task_group

def test_create_plan_from_proto_correctly(self):
    from caffe2.python.net_builder import ops
    with Node('trainer'), Task(name='my_task', num_instances=2) as task:
        with ops.task_init():
            globl = ops.Const(0)
        with ops.task_instance_init():
            local = ops.Const(0)
        with ops.loop(100):
            ops.Copy(globl, local)
        with ops.task_instance_exit():
            ops.Add([globl, local], [globl])
        with ops.task_exit():
            ops.Mul([globl, globl], [globl])

    plan = core.Plan(task.get_step())
    test_plan = core.Plan.create_from_proto(plan.Proto())

    self.assertEqual(len(plan.Steps()), 1)
    self.assertEqual(len(test_plan.Steps()), 1)
    self.assertEqual(len(plan.Proto().network), 9)
    self.assertEqual(len(test_plan.Proto().network), 9)
    self.assertEqual(len(plan.Proto().execution_step), 1)
    self.assertEqual(len(test_plan.Proto().execution_step), 1)
    self.assertEqual(plan.Steps()[0].Name(), test_plan.Steps()[0].Name())
    self.assertEqual(len(plan.Nets()), len(test_plan.Nets()))
    for idx in range(0, len(plan.Nets())):
        # The nets created for test_plan end up with a postfix appended to
        # their names, so compare only the original-name prefix.
        net_1 = plan.Nets()[idx]
        net_2 = test_plan.Nets()[idx]
        trim_size = len(net_1.Name())
        self.assertEqual(net_1.Name(), net_2.Name()[:trim_size])

def init(self, nodes, retrieve_from_epoch=None, path_prefix=None,
         path_type=None):
    if self._node_managers is not None:
        assert [node for node, _ in self._node_managers] == nodes
        return
    self._node_managers = []
    self._path_prefix = path_prefix
    self._path_type = path_type
    self._node_names = [str(node) for node in nodes]
    if self._metadata_handler:
        self._metadata_handler.init(
            db_prefix=self._db_prefix,
            db_type=self._db_type,
            node_names=self._node_names,
            path_prefix=self._path_prefix,
            path_type=self._path_type)
    for node in nodes:
        with Node(node):
            manager = CheckpointManager(
                db_prefix=self._db_prefix,
                node_name=str(node),
                db_type=self._db_type)
            self._node_managers.append((node, manager))
    return self._task_group(
        CheckpointManager.init,
        nodes=[node],
        retrieve_from_epoch=retrieve_from_epoch,
        path_prefix=path_prefix,
        path_type=path_type)

def _static_threads_task(name, group, final_outputs, reader, num_threads,
                         output, capacity):
    node_name = str(Node.current())
    profiler_name = "{0}/{1}/{2}/{3}/{4}".format(
        node_name,
        "pipe",
        name,
        processor_name(input) if input else "NoInput",
        processor_name(output) if output else "NoOutput")

    with Task(name=name, group=group, outputs=final_outputs) as task:
        global_exit_net = core.Net('exit')
        global_init_net = core.Net('init')
        reader.setup_ex(global_init_net, global_exit_net)

        out_queue = None
        writer = None

        steps = []
        for thread_id in range(num_threads):
            with NetBuilder(name='t:%d' % thread_id) as nb:
                init_net = core.Net('init')
                exit_net = core.Net('exit')
                read_nets, status, rec = reader.read_record_ex(
                    init_net, exit_net)
                init_net.ConstantFill(
                    [], [status],
                    shape=[],
                    value=False,
                    dtype=core.DataType.BOOL)

                if rec is not None:
                    if writer is None:
                        # hack so that the out queue gets the right name prefix
                        # (otherwise they would be prefixed with the thread id)
                        with NetBuilder(_fullname=task.name):
                            out_queue, writer = _init_output(
                                output, capacity, global_init_net,
                                global_exit_net)
                    write_nets, _ = writer.write_record_ex(
                        rec, init_net, exit_net, status)
                else:
                    write_nets = []

                timer_start_net = core.Net('timer_start')
                timer = timer_start_net.TimerBegin(
                    [], counter_name=profiler_name)
                timer_end_net = core.Net('timer_end')
                timer_end_net.TimerEnd(timer, [])

                ops.net(init_net)
                ops.net(core.execution_step(
                    'body',
                    [timer_start_net] + list(read_nets) +
                    list(write_nets) + [timer_end_net],
                    should_stop_blob=status))
                ops.net(timer_end_net)
                ops.net(exit_net)
            steps.append(core.to_execution_step(nb))
        ops.net(global_init_net)
        ops.net(core.execution_step('body', steps, concurrent_substeps=True))
        ops.net(global_exit_net)
    return out_queue, task

def _runtime_threads_task(name, group, final_outputs, reader, num_threads,
                          output, capacity):
    node_name = str(Node.current())
    profiler_name = "{0}/{1}/{2}/{3}/{4}".format(
        node_name,
        "pipe",
        name,
        processor_name(input) if input else "NoInput",
        processor_name(output) if output else "NoOutput")

    with Task(name=name, group=group, outputs=final_outputs,
              num_instances=num_threads) as task:
        global_exit_net = core.Net('pipe:exit')
        global_init_net = core.Net('pipe:init')
        reader.setup_ex(global_init_net, global_exit_net)

        init_net = core.Net('pipe:instance:init')
        exit_net = core.Net('pipe:instance:exit')
        read_nets, status, rec = reader.read_record_ex(init_net, exit_net)
        init_net.ConstantFill(
            [], [status],
            shape=[],
            value=False,
            dtype=core.DataType.BOOL)

        if rec is not None:
            out_queue, writer = _init_output(
                output, capacity, global_init_net, global_exit_net)
            write_nets, _ = writer.write_record_ex(
                rec, init_net, exit_net, status)
        else:
            out_queue = None
            write_nets = []

        with ops.task_init():
            ops.net(global_init_net)
        with ops.task_instance_init():
            ops.net(init_net)

        timer_start_net = core.Net('timer_start')
        timer = timer_start_net.TimerBegin([], counter_name=profiler_name)
        timer_end_net = core.Net('timer_end')
        timer_end_net.TimerEnd(timer, [])

        ops.net(core.execution_step(
            'body',
            [timer_start_net] + list(read_nets) + list(write_nets) +
            [timer_end_net],
            should_stop_blob=status))
        ops.net(timer_end_net)

        with ops.task_instance_exit():
            ops.net(exit_net)
        with ops.task_exit():
            ops.net(global_exit_net)

    return out_queue, task

def build(self, epoch, checkpoint_manager):
    with TaskGroup(WorkspaceType.GLOBAL) as upload_task_group:
        for node, manager in checkpoint_manager._node_managers:
            with Node(str(node)), Task():
                src_path = db_name(
                    epoch, manager._node_name, manager._db_prefix)
                dest_path = os.path.join(self.dest_dir, str(node))
                ops.Python((local_copy_op, [src_path, dest_path], {}))([], [])
    return upload_task_group

def _task_group(self, func, *args, **kw):
    assert self._node_managers is not None, 'init must be called first.'
    with TaskGroup(WorkspaceType.GLOBAL) as task_group:
        for node, manager in self._node_managers:
            # TODO(aartibasant, T21070353): Enable the checkpoints for
            # readers.
            # The checkpointing for readers is broken because of D5582328.
            # Disabling the reader checkpoints until it is fixed.
            if "reader" in str(node):
                continue
            with Node(node):
                func(manager, *args, **kw)
        return task_group

def init(self, nodes, retrieve_from_epoch=None):
    if self._node_managers is not None:
        assert [node for node, _ in self._node_managers] == nodes
        return
    self._node_managers = []
    for node in nodes:
        with Node(node):
            manager = self._node_manager_class(
                db=os.path.join(self._db_prefix, node),
                db_type=self._db_type)
            self._node_managers.append((node, manager))
    return self._task_group(
        self._node_manager_class.init,
        nodes=[node],
        retrieve_from_epoch=retrieve_from_epoch)

def init(self, nodes, retrieve_from_epoch=None):
    if self._node_managers is not None:
        assert [node for node, _ in self._node_managers] == nodes
        return
    self._node_managers = []
    for node in nodes:
        with Node(node):
            manager = CheckpointManager(
                db_prefix=self._db_prefix,
                node_name=str(node),
                db_type=self._db_type)
            self._node_managers.append((node, manager))
    return self._task_group(
        CheckpointManager.init,
        nodes=[node],
        retrieve_from_epoch=retrieve_from_epoch)

def build_pipeline(node_id):
    with Node('trainer_%d' % node_id):
        with Job.current().init_group, Task():
            data_arr = Struct(('val', np.array(list(range(10)))))
            data = ConstRecord(ops, data_arr)
            ds = Dataset(data, name='dataset:%d' % node_id)
            full_reader = ds.reader(ops)
            total = ops.Const([100])

        def inc_total(rec):
            ops.Add([total, rec.val()], [total])

        epoch_reader = ReaderWithLimit(full_reader, num_iter=3)
        pipe(epoch_reader, processor=inc_total)
        Job.current().add_stop_signal(epoch_reader.data_finished())

    return [total]

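# Hedged wiring sketch (an assumption, not part of the original tests):
# build_pipeline reaches for Job.current(), so it presumably has to be called
# inside an active Job context, mirroring how build_job above builds its job.
# The helper name and import paths are hypothetical; the fetcher-outside-job
# pattern follows build_job.
def _example_run_build_pipeline(node_id=0):
    from caffe2.python import core, workspace           # assumed import paths
    from caffe2.python.checkpoint import Job, JobRunner
    from caffe2.python.session import LocalSession
    from caffe2.python.task import Task

    with Job() as job:
        totals = build_pipeline(node_id)
    total_fetcher = Task(step=core.Net('empty'), outputs=totals)

    session = LocalSession(workspace.C.Workspace())
    JobRunner(job).train(session)
    session.run(total_fetcher)
    return total_fetcher.outputs()[0].fetch()
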
def test_download_group_simple(self):
    """
    A simple test that ensures the download task group is executed
    between epoch_group and exit_group.
    """
    model = model_helper.ModelHelper(name="test_model")
    download_net = core.Net("download_net")

    for name in ["input1", "input2", "output", "download_result"]:
        model.param_init_net.ConstantFill(
            [], [name], shape=[8, ], value=1.0, run_once=0)
    model.net.Add(["input1", "input2"], ["output"])
    download_net.Copy(["output"], ["download_result"])

    # All blob values are initialized as 1.0. After download_net has run,
    # we expect the download result to match the training result.
    with Job() as job:
        with Node("trainer:0"):
            with job.init_group:
                Task(step=model.param_init_net)
            with job.epoch_group:
                with Task():
                    with ops.loop(1):
                        ops.net(model.net)
            with job.download_group:
                Task(step=download_net)

            epoch_limiter(job, 1)

    ws = workspace.C.Workspace()
    session = LocalSession(ws)
    job_runner = JobRunner(job)
    job_runner.train(session)

    expected_result = np.full(8, 2.0).astype(np.float32)
    self.assertTrue(
        np.array_equal(expected_result, ws.fetch_blob("output")))
    self.assertTrue(
        np.array_equal(expected_result, ws.fetch_blob("download_result")))

def load_blobs_locally(self, nodes, blob_names, epoch, session):
    """Loads the necessary blobs from the checkpoints to the current node.

    Args:
        blob_names: A list of strings. Each string is the name of a blob.
        epoch: An integer. The checkpoint epoch to load from.
        session: A Session object to execute the Load ops.
    """
    if self._node_managers is not None:
        assert [node for node, _ in self._node_managers] == nodes
    else:
        self._node_managers = []
        for node in nodes:
            with Node(node):
                manager = self._node_manager_class(
                    db=os.path.join(self._db_prefix, node),
                    db_type=self._db_type)
                self._node_managers.append((node, manager))
    assert self._node_managers is not None, 'must initialize node managers'
    for _, manager in self._node_managers:
        with TaskGroup(WorkspaceType.GLOBAL) as task_group:
            manager.load_blobs_from_checkpoint(blob_names, epoch)
        session.run(task_group)

def _pipe_step(input, output=None, num_threads=1, processor=None, name=None,
               capacity=None, group=None, final_outputs=None):
    """Builds a Task that pipes records from `input` into `output`,
    optionally transforming them with `processor`, using `num_threads`
    worker threads."""
    if isinstance(input, Reader):
        reader = input
    elif hasattr(input, 'reader'):
        reader = input.reader()
    else:
        raise ValueError('Input must be a reader, queue or stream.')

    if processor is not None:
        reader = ProcessingReader(reader, processor)

    if num_threads == 0:
        assert output is None
        return reader, None

    if name is None and processor is not None:
        name = processor_name(processor)
    if name is None and output is not None:
        name = 'pipe_into:%s' % processor_name(output)
    if name is None:
        name = 'pipe_from:%s' % processor_name(input)

    node_name = str(Node.current())
    profiler_name = "{0}/{1}/{2}/{3}/{4}".format(
        node_name,
        "pipe",
        name,
        processor_name(input) if input else "NoInput",
        processor_name(output) if output else "NoOutput")

    with Task(name=name, group=group, outputs=final_outputs) as task:
        global_exit_net = core.Net('exit')
        global_init_net = core.Net('init')
        reader.setup_ex(global_init_net, global_exit_net)

        out_queue = None
        writer = None

        steps = []
        for thread_id in range(num_threads):
            with NetBuilder(name='t:%d' % thread_id) as nb:
                init_net = core.Net('init')
                exit_net = core.Net('exit')
                read_nets, status, rec = reader.read_record_ex(
                    init_net, exit_net)
                init_net.ConstantFill(
                    [], [status],
                    shape=[],
                    value=False,
                    dtype=core.DataType.BOOL)

                if rec is not None:
                    if writer is None:
                        # hack so that the out queue gets the right name prefix
                        # (otherwise they would be prefixed with the thread id)
                        with NetBuilder(_fullname=task.name):
                            out_queue, writer = _init_output(
                                output, capacity, global_init_net,
                                global_exit_net)
                    write_nets, _ = writer.write_record_ex(
                        rec, init_net, exit_net, status)
                else:
                    write_nets = []

                timer_start_net = core.Net('timer_start')
                timer = timer_start_net.TimerBegin(
                    [], counter_name=profiler_name)
                timer_end_net = core.Net('timer_end')
                timer_end_net.TimerEnd(timer, [])

                ops.net(init_net)
                ops.net(core.execution_step(
                    'body',
                    [timer_start_net] + list(read_nets) +
                    list(write_nets) + [timer_end_net],
                    should_stop_blob=status))
                ops.net(timer_end_net)
                ops.net(exit_net)
            steps.append(core.to_execution_step(nb))
        ops.net(global_init_net)
        ops.net(core.execution_step('body', steps, concurrent_substeps=True))
        ops.net(global_exit_net)
    return out_queue, task