def forward(self, x):
    """Feed each input tensor to its pipeline stage and collect the outputs.

    Args:
        x: Indexable triple ``(tensors, offset, length)``. ``tensors`` holds
            the inputs for consecutive stages, ``offset`` is the zero-based
            index of the stage that receives ``tensors[0]`` and ``length``
            is the number of consecutive stages to run.

    Returns:
        list: One output per executed stage, in stage order. The list is
        empty when no stage is selected (for example ``length == 0``).
    """
    tensors, offset, length = x[0], x[1], x[2]
    index = length + offset

    # Stage k (1-based) runs when ``offset < k <= offset + length``. Stages
    # are visited in ascending order so the futures keep the input order.
    # The same guard is applied to every stage, including the first one, so
    # an empty request no longer indexes into ``tensors``.
    out_futures = []
    for stage, attr_name in enumerate(
            ("p1_rref", "p2_rref", "p3_rref", "p4_rref"), start=1):
        if index >= stage and offset < stage:
            input_rref = RRef(tensors[stage - 1 - offset])
            # Looked up lazily: only the stages that run need their RRef.
            stage_rref = getattr(self, attr_name)
            out_futures.append(stage_rref.rpc_async().forward(input_rref))

    # Block until every launched stage has produced its output.
    out_tensors = torch.futures.wait_all(out_futures)

    # Each output is passed through ``torch.cat`` on its own; the original
    # author added this as a workaround for the outputs being leaf tensors.
    return [torch.cat([out]) for out in out_tensors]
def forward(self, input: TensorOrTensors) -> RRef[TensorOrTensors]:  # type: ignore
    """Run one mini-batch through the pipeline and wrap the result in an RRef.

    The input is validated with ``microbatch.check``, scattered into
    ``self.chunks`` micro-batches, processed by ``self.pipeline`` and
    gathered back into a single mini-batch. The signature of the wrapped
    module is not altered, but inputs and outputs must be a
    :class:`~torch.Tensor` or a sequence of tensors.

    Args:
        input (torch.Tensor or Sequence[torch.Tensor]): input mini-batch

    Returns:
        :class:`~torch.distributed.rpc.RRef` to the output of the mini-batch

    Raises:
        TypeError: input is not a tensor or tensors.
    """
    microbatch.check(input)

    if self.devices:
        # Split, run every micro-batch through the partitions, then merge.
        micro_batches = microbatch.scatter(input, self.chunks)
        self.pipeline.run(micro_batches)
        return RRef(microbatch.gather(micro_batches))

    # No devices means an empty sequential module, which is legal: the
    # input is passed through unchanged.
    return RRef(input)
def rloss(loss_func: Callable, input_rref: rpc.RRef, target_rref: rpc.RRef) -> rpc.RRef:
    """Fetch the values behind two RRefs and apply ``loss_func`` to them.

    Args:
        loss_func: Callable invoked as ``loss_func(input, target)``.
        input_rref: RRef whose value is passed as the first argument.
        target_rref: RRef whose value is passed as the second argument.

    Returns:
        Whatever ``loss_func`` returns.
        NOTE(review): the annotation says ``rpc.RRef`` but no RRef is
        created here -- confirm against callers.
    """
    if not BOUNCE_TENSORS:
        return loss_func(input_rref.to_here(), target_rref.to_here())

    # With BOUNCE_TENSORS set, ``.cpu()`` is invoked on the owner through
    # ``remote()`` and the result of that call is what gets fetched.
    fetched_input = input_rref.remote().cpu().to_here()
    fetched_target = target_rref.remote().cpu().to_here()
    return loss_func(fetched_input, fetched_target)
def test_rref_str(self):
    """Check the string form of a local RRef and of a remote-created RRef."""
    id_class = "GloballyUniqueId"

    # RRef created locally around a plain value.
    local_rref = RRef(self.rank)
    expected_owner = "OwnerRRef({}({}, 0))".format(id_class, self.rank)
    self.assertEqual(expected_owner, str(local_rref))

    # RRef returned by rpc.remote for a call on the next worker.
    dst_worker = "worker{}".format((self.rank + 1) % self.world_size)
    remote_rref = rpc.remote(
        dst_worker, torch.add, args=(torch.ones(2, 2), 1)
    )
    expected_user = (
        "UserRRef(RRefId = {0}({1}, 1), ForkId = {0}({1}, 2))".format(
            id_class, self.rank
        )
    )
    self.assertEqual(str(remote_rref), expected_user)
def __init__(self, world_size, batch=True):
    """Create the agent, its policy, and one remote Observer per worker.

    Args:
        world_size: Total number of RPC workers; an Observer is created on
            each of ranks ``1 .. world_size - 1``.
        batch: Passed to ``Policy`` and to every ``Observer``; also selects
            the layout of ``saved_log_probs`` (see below).
    """
    self.ob_rrefs = []
    self.agent_rref = RRef(self)
    self.rewards = {}
    self.policy = Policy(batch).cuda()
    self.optimizer = optim.Adam(self.policy.parameters(), lr=1e-2)
    self.running_reward = 0

    # Launch the observers and give each one an empty reward list keyed by
    # its worker id.
    for rank in range(1, world_size):
        info = rpc.get_worker_info(OBSERVER_NAME.format(rank))
        self.ob_rrefs.append(remote(info, Observer, args=(batch, )))
        self.rewards[info.id] = []

    num_observers = len(self.ob_rrefs)
    self.states = torch.zeros(num_observers, 1, 4)
    self.batch = batch

    if self.batch:
        # Batched: a list of tensors, one per step, each holding the probs
        # from all observers.
        self.saved_log_probs = []
    else:
        # Unbatched: observer index -> list of probs for that observer.
        self.saved_log_probs = {k: [] for k in range(num_observers)}

    self.future_actions = torch.futures.Future()
    self.lock = threading.Lock()
    self.pending_states = num_observers
def run(rank, world_size):
    """Entry point of one RPC process.

    Every rank joins the RPC group as ``rank{rank}``. Rank 0 additionally
    builds the ``Coordinator``, runs the training loop for 1000 iterations
    and writes the trained policy and the collected stats under ``plots/``.
    All ranks then call ``rpc.shutdown()``.

    Args:
        rank: Rank of this process inside the RPC group.
        world_size: Total number of processes in the RPC group.

    Raises:
        KeyError: if one of the variables named in ``key_list`` is missing
            from the environment.
    """
    print(f'rank = {rank} world_size = {world_size}')
    env_dict = {key: os.environ[key] for key in key_list}
    print(env_dict)

    # Initialisation is identical on every rank; only rank 0 drives training.
    rpc.init_rpc(f"rank{rank}",
                 rank=rank,
                 world_size=world_size,
                 backend=rpc.BackendType.TENSORPIPE)

    if rank == 0:
        coordinator = Coordinator(world_size=world_size, lr=1e-3)
        coord_rref = RRef(coordinator)
        coordinator.run_training_loop(1000, coord_rref)

        # Context managers make sure both output files are flushed and
        # closed even if serialisation fails.
        with open(f'plots/policy_nworkers{world_size-1}.pt', 'wb') as policy_file:
            torch.save(coordinator.policy, policy_file)
        with open(f'plots/stats_nworkers{world_size-1}.json', 'w') as stats_file:
            json.dump(coordinator.stats, stats_file)

    rpc.shutdown()
def __init__(self,
             rank: int,
             num_callees: int = 1,
             num_callers: int = 1,
             threads_process: int = 1,
             caller_class: object = None,
             caller_args=None,
             future_keys: list = None):
    """Set up the callee's RPC identity, storage and worker threads.

    Args:
        rank: Rank stored on the instance.
        num_callees: Forwarded to ``_spawn_callers``; must be positive.
        num_callers: Forwarded to ``_spawn_callers``; must be positive.
        threads_process: Number of daemon threads started on
            ``self._process_batch``.
        caller_class: Required; must be a subclass of ``RpcCaller``.
        caller_args: Optional iterable of extra positional arguments
            forwarded to ``_spawn_callers``. ``None`` means no extras.
        future_keys: Required list; one ``Future`` is created per key.

    Raises:
        AssertionError: if any of the argument checks below fails.
    """
    # ASSERTIONS
    assert num_callees > 0
    assert num_callers > 0

    # caller_class must be given
    assert caller_class is not None

    # caller_class must be the correct subclass
    # use import here to omit circular import
    # pylint: disable=import-outside-toplevel
    from ..agents.rpc_caller import RpcCaller
    assert issubclass(caller_class, RpcCaller)

    assert isinstance(future_keys, list)

    # ATTRIBUTES

    # RPC
    worker_info = rpc.get_worker_info()
    self.rank = rank

    # pylint: disable=invalid-name
    self.id = worker_info.id
    self.name = worker_info.name
    self.rref = RRef(self)

    self.shutdown = False
    self._shutdown_done = False

    # COUNTERS
    self._t_start = time.time()
    self._loop_iteration = 0

    # STORAGE
    self._caller_rrefs = []
    self._pending_rpcs = deque()
    self._future_answers = {k: Future() for k in future_keys}
    self._current_futures = deque(maxlen=len(future_keys))

    # THREADS
    self.lock_batching = mp.Lock()
    self._processing_threads = [
        Thread(target=self._process_batch,
               daemon=True,
               name='processing_thread_%d' % i)
        for i in range(threads_process)
    ]
    for thread in self._processing_threads:
        thread.start()

    # spawn actors
    # The default ``caller_args=None`` cannot be star-unpacked, so it is
    # treated as "no extra arguments".
    extra_args = () if caller_args is None else caller_args
    self._spawn_callers(caller_class, num_callees, num_callers, *extra_args)
def forward(self, *inputs) -> RRef:
    """
    Processes a single input mini-batch through the pipe and returns an
    :class:`~torch.distributed.rpc.RRef` pointing to the output.
    :class:`Pipe` is a fairly transparent module wrapper. It doesn't
    modify the input and output signature of the underlying module. But
    there's type restriction. Input and output have to be a
    :class:`~torch.Tensor` or a sequence of tensors. This restriction is
    applied at partition boundaries too.

    The sequence of inputs are fed into the first stage of the pipeline as
    ``*inputs``. As a result the positional args for this function should
    match the positional args for the first stage of the pipeline. The same
    condition applies for output of one stage of the pipeline which is the
    input for the next stage.

    The input tensor is split into multiple micro-batches based on the
    ``chunks`` parameter used to initialize :class:`Pipe`. The batch size
    is assumed to be the first dimension of the tensor and if the batch
    size is less than ``chunks``, the number of micro-batches is equal to
    the batch size.

    Args:
        inputs (torch.Tensor or sequence of :class:`~torch.Tensor`): input mini-batch

    Returns:
        :class:`~torch.distributed.rpc.RRef` to the output of the mini-batch

    Raises:
        TypeError: input is not a tensor or sequence of tensors.
    """
    microbatch.check(*inputs)

    if not self.devices:
        # Empty sequential module is not illegal: the inputs pass through
        # unchanged. ``RRef`` wraps exactly one value (its second
        # positional parameter is ``type_hint``), so several inputs are
        # wrapped together as a tuple instead of being star-unpacked.
        passthrough = inputs[0] if len(inputs) == 1 else inputs
        return RRef(passthrough)

    # Divide a mini-batch into micro-batches.
    batches = microbatch.scatter(*inputs, chunks=self.chunks)

    # Run pipeline parallelism.
    self.pipeline.run(batches)

    # Merge the micro-batches into one mini-batch.
    output = microbatch.gather(batches)
    return RRef(output)
def forward(self, xs):
    """Run ``xs`` through stage 1 and stage 2 and return stage 2's result.

    The whole input is sent as one piece (no micro-batching). Stage 1 is
    invoked through ``remote()`` and its result is handed to stage 2, which
    is invoked through ``rpc_async()``; the call blocks on that future.
    """
    input_rref = RRef(xs)
    stage1_out = self.p1_rref.remote().forward(input_rref)
    return self.p2_rref.rpc_async().forward(stage1_out).wait()
def _parameter_rrefs(module):
    r"""
    Wrap every parameter yielded by ``module.parameters()`` in its own RRef
    and return the RRefs as a list, in iteration order.
    """
    return [RRef(parameter) for parameter in module.parameters()]
def forward(self, xs):
    """Pipeline ``xs`` through both stages in micro-batches.

    ``xs`` is split along dim 0 into pieces of ``self.split_size``. Each
    piece is submitted to stage 1 and then stage 2 without waiting, and the
    stage 2 results are concatenated once all futures have completed.
    """

    def _submit(micro_batch):
        # Launch one micro-batch and return the future of its final output.
        x_rref = RRef(micro_batch)
        stage1_out = self.p1_rref.remote().forward(x_rref)
        return self.p2_rref.rpc_async().forward(stage1_out)

    pending = [_submit(chunk) for chunk in xs.split(self.split_size, dim=0)]

    # Wait for every micro-batch, then stitch the outputs back together.
    return torch.cat(torch.futures.wait_all(pending))
def forward(self, x):
    """Chain ``x`` through the four stages and return the final output.

    The input is not split. Stages 1-3 are invoked through ``remote()`` so
    each result is handed to the next stage as an RRef; stage 4 is invoked
    through ``rpc_async()`` and its future is awaited here.
    """
    stage_input = RRef(x)
    for stage_rref in (self.p1_rref, self.p2_rref, self.p3_rref):
        stage_input = stage_rref.remote().forward(stage_input)

    final_future = self.p4_rref.rpc_async().forward(stage_input)

    # Only one future exists, so ``torch.cat`` receives a one-element list.
    return torch.cat(torch.futures.wait_all([final_future]))
def test_rref_context_debug_info(self):
    """Check ``num_owner_rrefs`` in the RPC debug info at each step."""
    if not dist.is_initialized():
        dist.init_process_group(
            backend="gloo",
            init_method=self.init_method,
            rank=self.rank,
            world_size=self.world_size,
        )

    from torch.distributed.rpc import _get_debug_info

    def check_owner_rrefs(expected):
        # Fetch fresh debug info and compare the owner-RRef counter.
        info = _get_debug_info()
        self.assertIn("num_owner_rrefs", info)
        self.assertEqual(expected, info["num_owner_rrefs"])

    dst_worker = "worker{}".format((self.rank + 1) % self.world_size)

    local_rref = RRef(self.rank)
    # RRef on local value is not added to context until shared across RPC
    check_owner_rrefs("0")

    rpc.rpc_sync(dst_worker, set_global_rref, args=(local_rref,))
    check_owner_rrefs("1")

    rpc.rpc_sync(dst_worker, clear_global_rref)

    # The two remote RRefs stay referenced by these locals until the end
    # of the test.
    first_remote = rpc.remote(
        dst_worker, torch.add, args=(torch.ones(2, 2), 1)
    )
    second_remote = rpc.remote(
        dst_worker, torch.add, args=(torch.ones(2, 2), 1)
    )
    first_remote.to_here()
    second_remote.to_here()

    # Use a barrier to make sure that OwnerRRefs are created on this worker
    # before checking debug info
    dist.barrier()
    check_owner_rrefs("2")

    # Use another barrier to make sure that UserRRefs are only deleted after
    # checking debug info
    dist.barrier()
def test_pass_local_rrefs(self):
    """Send a local RRef to the next worker via each of the three RPC APIs."""
    dst_worker = "worker{}".format((self.rank + 1) % self.world_size)
    local_rref = RRef(40)
    call_args = (local_rref, 50)

    # Blocking call.
    sync_result = rpc.rpc_sync(dst_worker, add_rref_to_value, args=call_args)
    self.assertEqual(sync_result, 90)

    # Future-returning call.
    async_result = rpc.rpc_async(
        dst_worker, add_rref_to_value, args=call_args
    ).wait()
    self.assertEqual(async_result, 90)

    # RRef-returning call.
    remote_result = rpc.remote(
        dst_worker, add_rref_to_value, args=call_args
    ).to_here()
    self.assertEqual(remote_result, 90)
def __init__(self, world_size):
    """Create the agent state and one remote Observer per worker.

    Args:
        world_size: Total number of RPC workers; an Observer is created on
            each of ranks ``1 .. world_size - 1``.
    """
    self.ob_rrefs = []
    self.agent_rref = RRef(self)
    self.rewards = {}
    self.saved_log_probs = {}
    self.policy = Policy()
    self.optimizer = optim.Adam(self.policy.parameters(), lr=1e-2)
    self.eps = np.finfo(np.float32).eps.item()
    self.running_reward = 0
    self.reward_threshold = DummyEnv().reward_threshold

    # Rewards and log-probs are tracked per observer, keyed by worker id.
    for rank in range(1, world_size):
        info = rpc.get_worker_info(worker_name(rank))
        self.ob_rrefs.append(remote(info, Observer))
        self.rewards[info.id] = []
        self.saved_log_probs[info.id] = []
def __init__(self, world_size):
    """Create the agent state and one remote Observer per worker.

    Args:
        world_size: Total number of RPC workers; an Observer is created on
            each of ranks ``1 .. world_size - 1``.
    """
    self.ob_rrefs = []
    self.agent_rref = RRef(self)
    self.rewards = {}
    self.saved_log_probs = {}
    self.policy = Policy()
    self.optimizer = optim.Adam(self.policy.parameters(), lr=1e-2)
    self.eps = numpy.finfo(numpy.float32).eps.item()
    self.running_reward = 0
    # The threshold is read from the spec of the ``ENV`` gym environment.
    self.reward_threshold = gym.make(ENV).spec.reward_threshold

    # Rewards and log-probs are tracked per observer, keyed by worker id.
    for rank in range(1, world_size):
        info = rpc.get_worker_info(OBSERVER_NAME.format(rank))
        self.ob_rrefs.append(remote(info, Observer))
        self.rewards[info.id] = []
        self.saved_log_probs[info.id] = []
def forward(self, xs):
    """Pipeline ``xs`` through both ResNet parts in micro-batches.

    ``xs`` is split along dim 0 into pieces of ``self.split_size``. Each
    piece goes through ``ResNetPart1.forward`` on ``self.p1_rref`` and then
    ``ResNetPart2.forward`` on ``self.p2_rref``; the results are
    concatenated after all of them have been waited on.
    """
    pending = []
    for micro_batch in xs.split(self.split_size, dim=0):
        x_rref = RRef(micro_batch)
        stage1_out = _remote_on_rref(ResNetPart1.forward, self.p1_rref, x_rref)
        pending.append(
            _async_on_rref(ResNetPart2.forward, self.p2_rref, stage1_out)
        )

    # Wait for every RPC in submission order, then join the outputs.
    return torch.cat([future.wait() for future in pending])
def __init__(self, rank: int, infer_rref: RRef, env_spawner: EnvSpawner):
    """Initialise the base class and spawn this instance's environments.

    Args:
        rank: Forwarded to the parent constructor.
        infer_rref: RRef whose referenced type must be exactly
            ``agents.Learner``; forwarded to the parent constructor.
        env_spawner: Provides ``num_envs`` and ``spawn()``.

    Raises:
        AssertionError: if ``infer_rref`` does not refer to a Learner.
    """
    # ASSERTIONS
    # infer_rref must be a Learner
    referenced_type = infer_rref._get_type()
    assert referenced_type is agents.Learner

    super().__init__(rank, infer_rref)

    self._num_envs = env_spawner.num_envs
    self._envs = env_spawner.spawn()

    # Initial state of every spawned environment.
    self._current_states = [environment.initial() for environment in self._envs]

    # One metrics dict per environment, each starting with a (1, 1)
    # latency tensor holding 0.
    # pylint: disable=not-callable
    self._metrics = []
    for _ in range(self._num_envs):
        self._metrics.append({'latency': tensor(0.).view(1, 1)})

    self._futures = []
def __init__(self, config, world_size):
    """Initialise dataset and agent, then create the remote workers.

    Args:
        config: Object exposing ``config_NeuralPlayer``. The full object
            (not the sub-config) is what gets passed to each remote
            ``CentralAgentWorker``.
        world_size: Number of RPC participants; a worker is created on each
            of ranks ``1 .. world_size - 1``.
    """
    self.e = 0
    self.config = config.config_NeuralPlayer
    self.preprocessor = None

    self._init_dataset(self.config.config_Datasets)
    self._init_agent(self.config.config_Agent)
    self.agent_rref = RRef(self.agent)

    self.world_size = world_size  # nb of remote agents
    self.worker_rrefs = []
    self.data_gatherer = ScoreDataGatherer()

    for rank in range(1, self.world_size):
        info = rpc.get_worker_info(f"worker{rank}")
        worker_rref = remote(info,
                             CentralAgentWorker,
                             args=(config, rank),
                             timeout=600)
        self.worker_rrefs.append(worker_rref)
def __init__(
    self,
    module_rref: rpc.RRef,
    device: str,
    num_inputs: int,
    num_outputs: Optional[int],
    rank: int,
    chunks: int,
    checkpoint_stop: int,
) -> None:
    """Store the partition's settings and create its worker queues.

    Args:
        module_rref: RRef whose local value becomes ``self.module``.
        device: Device string, converted with ``torch.device``.
        num_inputs: Stored as ``self.num_inputs``.
        num_outputs: Stored as ``self.num_outputs``.
        rank: Stored as ``self.rank``.
        chunks: Stored as ``self.chunks``.
        checkpoint_stop: Stored as ``self.checkpoint_stop``.
    """
    self.module = module_rref.local_value()
    self.chunks = chunks
    self.device = torch.device(device)
    self.checkpoint_stop = checkpoint_stop
    self.rank = rank
    self.num_inputs = num_inputs
    self.num_outputs = num_outputs

    # ``create_workers`` is called with a single device, so each of the two
    # returned sequences must contain exactly one queue.
    in_queues, out_queues = create_workers([self.device])
    (self.in_queue, ) = in_queues
    (self.out_queue, ) = out_queues
def run_pipeline(self, pipeline_record_rref: rpc.RRef) -> Optional[Tensor]:
    """Process a mini-batch on this partition.

    Args:
        pipeline_record_rref: RRef whose local value is the pipeline record
            handed to ``self.run``.

    Returns:
        ``None`` when the record has consumers. Otherwise (this is the last
        partition) the single tensor obtained by gathering all chunks, i.e.
        the model output for the whole mini-batch.
    """
    record = pipeline_record_rref.local_value()
    self.run(record)

    if record.consumers:
        # Not the last partition: nothing to return from here.
        return None

    gathered = microbatch.gather(record.batches)
    assert len(gathered) == 1
    output = gathered[0]

    stream = current_stream(output.device)
    if is_cuda(stream):
        # TODO. Investigate why this is needed and remove it if possible.
        as_cuda(stream).synchronize()
    return output
def evaluate(self):
    """Evaluate the model on ``self.test_loader``.

    Returns:
        tuple: ``(loss_meter, accuracy_meter)`` -- an ``Average`` updated
        with each batch's cross-entropy loss (weighted by batch size) and
        an ``Accuracy`` updated with each batch's outputs and targets.
    """
    self.model.eval()

    loss_meter = Average()
    accuracy_meter = Accuracy()

    with torch.no_grad():
        for data, target in self.test_loader:
            with dist_autograd.context() as context_id:
                # The model consumes and produces RRefs; fetch the output
                # locally before computing the loss.
                output = self.model(RRef(data)).to_here()
                batch_loss = F.cross_entropy(output, target)

                loss_meter.update(batch_loss.item(), data.size(0))
                accuracy_meter.update(output, target)

    return loss_meter, accuracy_meter
def train(self):
    """Run one pass over ``self.train_loader`` with distributed autograd.

    Returns:
        tuple: ``(loss_meter, accuracy_meter)`` -- an ``Average`` updated
        with each batch's cross-entropy loss (weighted by batch size) and
        an ``Accuracy`` updated with each batch's outputs and targets.
    """
    # NOTE(review): the model's train/eval mode is not changed here --
    # confirm the caller switches it back after an evaluation pass.
    loss_meter = Average()
    accuracy_meter = Accuracy()

    for data, target in self.train_loader:
        # Each batch gets its own distributed autograd context; backward
        # and the optimizer step are both keyed by its id.
        with dist_autograd.context() as context_id:
            output = self.model(RRef(data)).to_here()
            batch_loss = F.cross_entropy(output, target)

            dist_autograd.backward(context_id, [batch_loss])
            self.optimizer.step(context_id)

            loss_meter.update(batch_loss.item(), data.size(0))
            accuracy_meter.update(output, target)

    return loss_meter, accuracy_meter
def __init__(self, world_size, log_interval, save_dir):
    """Create the learner's agent, logger and one remote Actor per worker.

    Args:
        world_size: Total number of RPC workers; an Actor is created on
            each of ranks ``1 .. world_size - 1``.
        log_interval: Stored as ``self.log_interval``.
        save_dir: Passed to both ``MetricLogger`` and ``MarioAgent``.
    """
    # The environment is created here only to read the size of its action
    # space.
    env = create_env("SuperMarioBros-1-1-v0")
    num_actions = env.action_space.n

    self.logger = MetricLogger(save_dir)
    self.agent = MarioAgent(state_dim=(4, 84, 84),
                            action_dim=num_actions,
                            save_dir=save_dir)
    self.learner_rref = RRef(self)

    self.actor_rrefs = []
    for rank in range(1, world_size):
        info = rpc.get_worker_info(ACTOR_NAME.format(rank))
        self.actor_rrefs.append(remote(info, Actor, args=(rank, )))

    self.update_lock = threading.Lock()
    self.episode_lock = threading.Lock()
    self.episode = 0
    self.log_interval = log_interval
def __init__(self, rank: int, callee_rref: rpc.RRef):
    """Store the callee reference and this worker's RPC identity.

    Args:
        rank: Stored as ``self.rank``.
        callee_rref: RRef whose referenced type must be a subclass of
            ``RpcCallee``.

    Raises:
        AssertionError: if ``callee_rref`` does not refer to an RpcCallee.
    """
    # ASSERTIONS
    # The import is local to avoid a circular import.
    from ..agents.rpc_callee import RpcCallee
    referenced_type = callee_rref._get_type()
    assert issubclass(referenced_type, RpcCallee)

    # ATTRIBUTES

    # RPC
    worker_info = rpc.get_worker_info()
    self.callee_rref = callee_rref
    self.rank = rank

    # pylint: disable=invalid-name
    self.id = worker_info.id
    self.name = worker_info.name

    # COUNTER
    self._loop_iteration = 0

    self.shutdown = False
def __init__(self, world_size, args):
    """Build the actor-critic, replay buffers, GAC learner and observers.

    Args:
        world_size: Total number of RPC workers; one replay buffer and one
            remote Observer are created per rank ``1 .. world_size - 1``.
        args: Settings object; the attributes read here are ``env_name``,
            ``difficulty``, ``device``, ``hidden_sizes``, ``buffer_size``,
            ``gamma``, ``alpha_start``, ``alpha_min`` and ``alpha_max``.
    """
    use_l2m = args.env_name == 'L2M2019Env'
    if use_l2m:
        env = L2M2019Env(visualize=False, difficulty=args.difficulty)
        obs_dim = 99
    else:
        env = gym.make(args.env_name)
        obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    self.device = torch.device(args.device)
    self.args = args
    self.world_size = world_size

    network = MLPActorCritic(obs_dim, act_dim, hidden_sizes=args.hidden_sizes)
    self.actor_critic = network.to(self.device)

    # One replay buffer per non-zero rank.
    self.replay_buffer = [
        ReplayBuffer(obs_dim, act_dim, args.buffer_size)
        for _ in range(1, world_size)
    ]

    self.gac = GAC(self.actor_critic,
                   self.replay_buffer,
                   device=self.device,
                   gamma=args.gamma,
                   alpha_start=args.alpha_start,
                   alpha_min=args.alpha_min,
                   alpha_max=args.alpha_max)

    self.test_len = 0.0
    self.test_ret = 0.0

    self.ob_rrefs = []
    for rank in range(1, world_size):
        info = rpc.get_worker_info(OBSERVER_NAME.format(rank))
        self.ob_rrefs.append(remote(info, Observer, args=(args, )))

    self.agent_rref = RRef(self)
def run(rank, world_size):
    """Entry point of one RPC process.

    Every rank joins the RPC group as ``rank{rank}``. Rank 0 additionally
    builds the ``Coordinator``, runs the training loop for ``n_iter``
    updates and writes the trained policy and the collected stats under
    ``plots/``. All ranks then call ``rpc.shutdown()``.

    Args:
        rank: Rank of this process inside the RPC group.
        world_size: Total number of processes in the RPC group.

    Raises:
        KeyError: if one of the variables named in ``key_list`` is missing
            from the environment.
    """
    print(f'rank = {rank} world_size = {world_size}')
    env_dict = {key: os.environ[key] for key in key_list}
    print(env_dict)

    # Initialisation is identical on every rank; only rank 0 drives training.
    rpc.init_rpc(f"rank{rank}",
                 rank=rank,
                 world_size=world_size,
                 backend=rpc.BackendType.TENSORPIPE)

    if rank == 0:
        batch_size_multiple = 1
        n_iter = 10000  # number of updates

        coordinator = Coordinator(world_size, batch_size_multiple, lr=1e-3)
        coord_rref = RRef(coordinator)
        coordinator.run_training_loop(n_iter, coord_rref)

        policy_path = f'plots/{coordinator.env_name}_policy_nworkers{world_size-1}_batchsizemultiple{batch_size_multiple}.pt'
        stats_path = f'plots/{coordinator.env_name}_stats_nworkers{world_size-1}_batchsizemultiple{batch_size_multiple}.json'

        # Context managers make sure both output files are flushed and
        # closed even if serialisation fails.
        with open(policy_path, 'wb') as policy_file:
            torch.save(coordinator.policy, policy_file)
        with open(stats_path, 'w') as stats_file:
            json.dump(coordinator.stats, stats_file)

    rpc.shutdown()
def _parameter_rrefs(module):
    """Return a list with one RRef per parameter of ``module``.

    The RRefs follow the order in which ``module.parameters()`` yields the
    parameters.
    """
    return [RRef(parameter) for parameter in module.parameters()]
def _run_trainer(emb_rref, rank):
    r"""
    Train a ``HybridModel`` for 100 epochs of 10 random batches each.

    Per the original author's notes, the forward pass performs an embedding
    lookup on the parameter server and runs nn.Linear locally; during the
    backward pass DDP aggregates the gradients of the dense part while
    distributed autograd propagates gradient updates to the parameter
    server.

    Args:
        emb_rref: RRef handed to ``HybridModel`` and to
            ``_retrieve_embedding_parameters`` on the ``"ps"`` worker.
        rank: Passed to ``HybridModel``; also the CUDA device index on
            which the targets are placed.
    """
    # Setup the model.
    model = HybridModel(emb_rref, rank)

    # DistributedOptimizer needs every parameter as an RRef: first the
    # embedding parameters, fetched from the parameter server ...
    model_parameter_rrefs = rpc.rpc_sync(
        "ps", _retrieve_embedding_parameters, args=(emb_rref, )
    )
    # ... then the local ones (model.parameters() only includes local
    # parameters).
    model_parameter_rrefs.extend(RRef(param) for param in model.parameters())

    # Setup distributed optimizer
    opt = DistributedOptimizer(
        optim.SGD,
        model_parameter_rrefs,
        lr=0.05,
    )

    criterion = torch.nn.CrossEntropyLoss()

    def get_next_batch(rank):
        # Yields 10 random (indices, offsets, target) triples.
        for _ in range(10):
            num_indices = random.randint(20, 50)
            indices = torch.LongTensor(num_indices).random_(0, NUM_EMBEDDINGS)

            # Generate offsets: random strides of 1..10 starting at 0.
            offsets = []
            position = 0
            while position < num_indices:
                offsets.append(position)
                position += random.randint(1, 10)
            # One target per generated offset.
            batch_size = len(offsets)

            offsets_tensor = torch.LongTensor(offsets)
            target = torch.LongTensor(batch_size).random_(8).cuda(rank)
            yield indices, offsets_tensor, target

    # Train for 100 epochs
    for epoch in range(100):
        for indices, offsets, target in get_next_batch(rank):
            # create distributed autograd context
            with dist_autograd.context() as context_id:
                output = model(indices, offsets)
                loss = criterion(output, target)

                # Run distributed backward pass
                dist_autograd.backward(context_id, [loss])

                # Run distributed optimizer
                opt.step(context_id)

                # Not necessary to zero grads as each iteration creates a
                # different distributed autograd context which hosts
                # different grads
        print("Training done for epoch {}".format(epoch))
def _retrieve_embedding_parameters(emb_rref):
    """Return one RRef per parameter of the module behind ``emb_rref``.

    ``local_value()`` is used to read the module, so this has to be called
    on the worker that owns ``emb_rref``.
    """
    embedding_module = emb_rref.local_value()
    return [RRef(weight) for weight in embedding_module.parameters()]