Example #1
    def forward(self, x):

        out_futures = []    # List to store futures received from the workers
        tensors = x[0]
        offset = x[1]
        length = x[2]
        index = length + offset

        # Execute each stage depending on which stage each tensor is supposed to be fed into
        if offset < 1:
            rref_1 = RRef(tensors[0])
            p1_out_rref = self.p1_rref.rpc_async().forward(rref_1)
            out_futures.append(p1_out_rref)     # Preserve the order of stages in the output list
        if index >= 2 and offset < 2:
            rref_2 = RRef(tensors[1 - offset])
            p2_out_rref = self.p2_rref.rpc_async().forward(rref_2) 
            out_futures.append(p2_out_rref)
        if index >= 3 and offset < 3:
            rref_3 = RRef(tensors[2 - offset])
            p3_out_rref = self.p3_rref.rpc_async().forward(rref_3) 
            out_futures.append(p3_out_rref)
        if index >= 4 and offset < 4:
            rref_4 = RRef(tensors[3 - offset])
            p4_out_rref = self.p4_rref.rpc_async().forward(rref_4) 
            out_futures.append(p4_out_rref)

        out_tensors = torch.futures.wait_all(out_futures)   # Collects all the output tensors

        for i in range(len(out_tensors)):
            out_tensors[i] = torch.cat([out_tensors[i]])    # Avoids issues caused by each output tensor being a leaf tensor

        return out_tensors  # Return the list of output tensors rather than a single concatenated tensor
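The forward above combines three RPC building blocks: wrapping a local tensor in an RRef, invoking a remote module via the rref.rpc_async().forward(...) proxy (which returns a Future), and collecting results with torch.futures.wait_all. A minimal single-process sketch of that pattern follows; the toy Stage module, worker name, and port are illustrative assumptions, and in the example above the stage RRefs would be owned by other workers.
import os

import torch
import torch.distributed.rpc as rpc
from torch.distributed.rpc import RRef


class Stage(torch.nn.Module):
    """Toy stand-in for the p1..p4 stage modules above."""
    def __init__(self):
        super().__init__()
        self.lin = torch.nn.Linear(4, 4)

    def forward(self, x_rref):
        # Fetch the input tensor to this worker before computing.
        return self.lin(x_rref.to_here())


if __name__ == "__main__":
    os.environ.setdefault("MASTER_ADDR", "localhost")
    os.environ.setdefault("MASTER_PORT", "29500")
    rpc.init_rpc("worker0", rank=0, world_size=1)

    # In the example above this RRef would come from rpc.remote() on another worker.
    stage_rref = RRef(Stage())

    futures = []
    for chunk in torch.randn(8, 4).split(4, dim=0):
        # rpc_async() returns a proxy; calling forward on it yields a Future.
        futures.append(stage_rref.rpc_async().forward(RRef(chunk)))

    outputs = torch.futures.wait_all(futures)   # list of output tensors
    print(torch.cat(outputs).shape)

    rpc.shutdown()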
Example #2
    def forward(
            self,
            input: TensorOrTensors) -> RRef[TensorOrTensors]:  # type: ignore
        """:class:`Pipe` is a fairly transparent module wrapper. It doesn't
        modify the input and output signature of the underlying module. But
        there's a type restriction. Input and output have to be a
        :class:`~torch.Tensor` or a sequence of tensors. This restriction is
        applied at partition boundaries too.

        Args:
            input (torch.Tensor or Sequence[torch.Tensor]): input mini-batch

        Returns:
            :class:`~torch.distributed.rpc.RRef` to the output of the mini-batch

        Raises:
            TypeError: input is not a tensor or a sequence of tensors.

        """
        microbatch.check(input)

        if not self.devices:
            # Empty sequential module is not illegal.
            return RRef(input)

        # Divide a mini-batch into micro-batches.
        batches = microbatch.scatter(input, self.chunks)

        # Run pipeline parallelism.
        self.pipeline.run(batches)

        # Merge the micro-batches into one mini-batch.
        output = microbatch.gather(batches)
        return RRef(output)
Example #3
def rloss(loss_func: Callable, input_rref: rpc.RRef,
          target_rref: rpc.RRef) -> rpc.RRef:
    if BOUNCE_TENSORS:
        return loss_func(input_rref.remote().cpu().to_here(),
                         target_rref.remote().cpu().to_here())
    else:
        return loss_func(input_rref.to_here(), target_rref.to_here())
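The BOUNCE_TENSORS branch uses RRef proxy chaining: input_rref.remote().cpu() runs .cpu() on the RRef's owner and returns a new RRef to the result, which .to_here() then copies to the caller. A short sketch of the same chaining on a plain tensor (assumes RPC is initialized and a peer named "worker1" exists):
t_rref = rpc.remote("worker1", torch.ones, args=(2, 2))
cpu_rref = t_rref.remote().cpu()   # .cpu() executes on worker1; returns an RRef to the copy
local_copy = cpu_rref.to_here()    # fetch the CPU tensor to the calling worker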
Example #4
    def test_rref_str(self):
        rref1 = RRef(self.rank)
        id_class = "GloballyUniqueId"
        self.assertEqual(
            "OwnerRRef({}({}, 0))".format(id_class, self.rank),
            rref1.__str__()
        )

        dst_rank = (self.rank + 1) % self.world_size
        rref2 = rpc.remote("worker{}".format(dst_rank), torch.add, args=(torch.ones(2, 2), 1))
        self.assertEqual(
            rref2.__str__(),
            "UserRRef(RRefId = {0}({1}, 1), ForkId = {0}({1}, 2))".format(id_class, self.rank)
        )
Example #5
    def __init__(self, world_size, batch=True):
        self.ob_rrefs = []
        self.agent_rref = RRef(self)
        self.rewards = {}
        self.policy = Policy(batch).cuda()
        self.optimizer = optim.Adam(self.policy.parameters(), lr=1e-2)
        self.running_reward = 0

        for ob_rank in range(1, world_size):
            ob_info = rpc.get_worker_info(OBSERVER_NAME.format(ob_rank))
            self.ob_rrefs.append(remote(ob_info, Observer, args=(batch, )))
            self.rewards[ob_info.id] = []

        self.states = torch.zeros(len(self.ob_rrefs), 1, 4)
        self.batch = batch
        # With batching, saved_log_probs contains a list of tensors, where each
        # tensor contains probs from all observers in one step.
        # Without batching, saved_log_probs is a dictionary where the key is the
        # observer id and the value is a list of probs for that observer.
        self.saved_log_probs = [] if self.batch else {
            k: []
            for k in range(len(self.ob_rrefs))
        }
        self.future_actions = torch.futures.Future()
        self.lock = threading.Lock()
        self.pending_states = len(self.ob_rrefs)
Example #6
def run(rank, world_size):
    print(f'rank = {rank} world_size = {world_size}')
    env_dict = {key: os.environ[key] for key in key_list}

    print(env_dict)

    if rank == 0:
        rpc.init_rpc(f"rank{rank}",
                     rank=rank,
                     world_size=world_size,
                     backend=rpc.BackendType.TENSORPIPE)

        coordinator = Coordinator(world_size=world_size, lr=1e-3)
        coord_rref = RRef(coordinator)
        coordinator.run_training_loop(1000, coord_rref)

        torch.save(coordinator.policy,
                   open(f'plots/policy_nworkers{world_size-1}.pt', 'wb'))
        json.dump(coordinator.stats,
                  open(f'plots/stats_nworkers{world_size-1}.json', 'w'))

    else:
        rpc.init_rpc(f"rank{rank}",
                     rank=rank,
                     world_size=world_size,
                     backend=rpc.BackendType.TENSORPIPE)

    rpc.shutdown()
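Entry points like run above are typically launched with one process per rank via torch.multiprocessing.spawn; a hedged sketch (world size, master address, and port are assumptions):
import os

import torch.multiprocessing as mp

if __name__ == "__main__":
    world_size = 4
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = "29500"
    # Starts world_size processes; each one calls run(rank, world_size).
    mp.spawn(run, args=(world_size,), nprocs=world_size, join=True)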
Example #7
    def __init__(self,
                 rank: int,
                 num_callees: int = 1,
                 num_callers: int = 1,
                 threads_process: int = 1,
                 caller_class: object = None,
                 caller_args=None,
                 future_keys: list = None):

        # ASSERTIONS
        assert num_callees > 0
        assert num_callers > 0

        # caller_class must be given
        assert caller_class is not None

        # caller_class must be a subclass of RpcCaller
        # import here to avoid a circular import
        # pylint: disable=import-outside-toplevel
        from ..agents.rpc_caller import RpcCaller
        assert issubclass(caller_class, RpcCaller)
        assert isinstance(future_keys, list)

        # ATTRIBUTES

        # RPC
        self.rank = rank
        # pylint: disable=invalid-name
        self.id = rpc.get_worker_info().id
        self.name = rpc.get_worker_info().name
        self.rref = RRef(self)

        self.shutdown = False
        self._shutdown_done = False

        # COUNTERS
        self._t_start = time.time()
        self._loop_iteration = 0

        # STORAGE
        self._caller_rrefs = []
        self._pending_rpcs = deque()
        self._future_answers = {k: Future() for k in future_keys}
        self._current_futures = deque(maxlen=len(future_keys))

        # THREADS
        self.lock_batching = mp.Lock()
        self._processing_threads = [
            Thread(target=self._process_batch,
                   daemon=True,
                   name='processing_thread_%d' % i)
            for i in range(threads_process)
        ]

        for thread in self._processing_threads:
            thread.start()

        # spawn actors
        self._spawn_callers(caller_class, num_callees, num_callers,
                            *caller_args)
Example #8
    def forward(self, *inputs) -> RRef:
        """
        Processes a single input mini-batch through the pipe and returns an
        :class:`~torch.distributed.rpc.RRef` pointing to the output.
        :class:`Pipe` is a fairly transparent module wrapper. It doesn't
        modify the input and output signature of the underlying module. But
        there's a type restriction. Input and output have to be a
        :class:`~torch.Tensor` or a sequence of tensors. This restriction is
        applied at partition boundaries too.

        The sequence of inputs is fed into the first stage of the pipeline as
        ``*inputs``. As a result, the positional args for this function should
        match the positional args for the first stage of the pipeline. The same
        condition applies to the output of one stage of the pipeline, which is
        the input for the next stage.

        The input tensor is split into multiple micro-batches based on the
        ``chunks`` parameter used to initialize :class:`Pipe`. The batch size
        is assumed to be the first dimension of the tensor and if the batch
        size is less than ``chunks``, the number of micro-batches is equal to
        the batch size.

        Args:
            inputs (torch.Tensor or sequence of :class:`~torch.Tensor`): input mini-batch

        Returns:
            :class:`~torch.distributed.rpc.RRef` to the output of the mini-batch

        Raises:
            TypeError: input is not a tensor or sequence of tensors.

        """
        microbatch.check(*inputs)

        if not self.devices:
            # Empty sequential module is not illegal.
            return RRef(*inputs)

        # Divide a mini-batch into micro-batches.
        batches = microbatch.scatter(*inputs, chunks=self.chunks)

        # Run pipeline parallelism.
        self.pipeline.run(batches)

        # Merge the micro-batches into one mini-batch.
        output = microbatch.gather(batches)
        return RRef(output)
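A typical call site for this forward looks roughly like the following sketch, assuming the torch.distributed.pipeline.sync packaging of Pipe, a single-process RPC setup, and two CUDA devices (layer sizes are made up):
import os

import torch
import torch.nn as nn
from torch.distributed import rpc
from torch.distributed.pipeline.sync import Pipe

os.environ.setdefault("MASTER_ADDR", "localhost")
os.environ.setdefault("MASTER_PORT", "29500")
# Pipe requires the RPC framework to be initialized, even for one process.
rpc.init_rpc("worker", rank=0, world_size=1)

fc1 = nn.Linear(16, 8).cuda(0)
fc2 = nn.Linear(8, 4).cuda(1)
model = Pipe(nn.Sequential(fc1, fc2), chunks=4)

out_rref = model(torch.randn(32, 16).cuda(0))   # forward() returns an RRef
output = out_rref.to_here()                     # fetch the gathered mini-batch output

rpc.shutdown()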
Example #9
    def forward(self, xs):
        # Send the whole batch through both pipeline stages at once (no micro-batching);
        # the intermediate output stays on p1's worker as an RRef.
        x_rref = RRef(xs)
        y_rref = self.p1_rref.remote().forward(x_rref)
        z_fut = self.p2_rref.rpc_async().forward(y_rref)

        return z_fut.wait()
Example #10
def _parameter_rrefs(module):
    r"""
    Create one RRef for each parameter in the given local module, and return a
    list of RRefs.
    """
    param_rrefs = []
    for param in module.parameters():
        param_rrefs.append(RRef(param))
    return param_rrefs
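Such a list is usually handed to DistributedOptimizer, which takes RRefs rather than bare parameters. A minimal hedged sketch (the local module is an assumption, and rpc.init_rpc must already have been called):
import torch.nn as nn
from torch import optim
from torch.distributed.optim import DistributedOptimizer

local_net = nn.Linear(8, 8)   # stand-in for a real model
opt = DistributedOptimizer(
    optim.SGD,
    _parameter_rrefs(local_net),   # list of parameter RRefs, local and/or remote
    lr=0.05,
)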
Example #11
    def forward(self, xs):
        # Split the input batch xs into micro-batches, and collect async RPC
        # futures into a list
        out_futures = []
        for x in iter(xs.split(self.split_size, dim=0)):
            x_rref = RRef(x)
            y_rref = self.p1_rref.remote().forward(x_rref)
            z_fut = self.p2_rref.rpc_async().forward(y_rref)
            out_futures.append(z_fut)

        # collect and cat all output tensors into one tensor.
        return torch.cat(torch.futures.wait_all(out_futures))
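The stage RRefs p1_rref and p2_rref in such a driver are usually created up front with rpc.remote, so each shard is constructed on and owned by its own worker. A hedged sketch of that setup (the ResNetShard1/ResNetShard2 classes and worker names are assumptions):
import torch.nn as nn
import torch.distributed.rpc as rpc


class DistResNet(nn.Module):
    def __init__(self, split_size, workers):
        super().__init__()
        self.split_size = split_size
        # Each shard lives on its own worker and is reached through an RRef.
        self.p1_rref = rpc.remote(workers[0], ResNetShard1, args=("cuda:0",))
        self.p2_rref = rpc.remote(workers[1], ResNetShard2, args=("cuda:1",))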
Example #12
    def forward(self, x):
        # Chain the four pipeline stages with .remote() so intermediate outputs
        # stay on their owning workers; only the last stage returns a Future
        out_futures = []
        input_rref = RRef(x)
        p1_out_rref = self.p1_rref.remote().forward(input_rref)
        p2_out_rref = self.p2_rref.remote().forward(p1_out_rref)
        p3_out_rref = self.p3_rref.remote().forward(p2_out_rref)
        out_fut = self.p4_rref.rpc_async().forward(p3_out_rref)
        out_futures.append(out_fut)

        # Wait for the final stage's future and return the output tensor.
        return torch.cat(torch.futures.wait_all(out_futures))
Example #13
    def test_rref_context_debug_info(self):
        if not dist.is_initialized():
            dist.init_process_group(
                backend="gloo",
                init_method=self.init_method,
                rank=self.rank,
                world_size=self.world_size,
            )

        from torch.distributed.rpc import _get_debug_info
        rref1 = RRef(self.rank)
        info = _get_debug_info()
        self.assertIn("num_owner_rrefs", info)
        # RRef on local value is not added to context until shared across RPC
        self.assertEqual("0", info["num_owner_rrefs"])

        dst_rank = (self.rank + 1) % self.world_size
        rpc.rpc_sync(
            "worker{}".format(dst_rank),
            set_global_rref,
            args=(rref1,)
        )
        info = _get_debug_info()
        self.assertIn("num_owner_rrefs", info)
        self.assertEqual("1", info["num_owner_rrefs"])
        rpc.rpc_sync("worker{}".format(dst_rank), clear_global_rref)


        rref2 = rpc.remote(
            "worker{}".format(dst_rank),
            torch.add,
            args=(torch.ones(2, 2), 1)
        )
        rref3 = rpc.remote(
            "worker{}".format(dst_rank),
            torch.add,
            args=(torch.ones(2, 2), 1)
        )
        rref2.to_here()
        rref3.to_here()

        # Use a barrier to make sure that OwnerRRefs are created on this worker
        # before checking debug info
        dist.barrier()
        info = _get_debug_info()
        self.assertIn("num_owner_rrefs", info)
        self.assertEqual("2", info["num_owner_rrefs"])

        # Use another barrier to make sure that UserRRefs are only deleted after
        # checking debug info
        dist.barrier()
Example #14
    def test_pass_local_rrefs(self):
        n = self.rank + 1
        dst_rank = n % self.world_size
        dst_worker = "worker{}".format(dst_rank)

        rref = RRef(40)
        self.assertEqual(
            rpc.rpc_sync(dst_worker, add_rref_to_value, args=(rref, 50)), 90)
        self.assertEqual(
            rpc.rpc_async(dst_worker, add_rref_to_value,
                          args=(rref, 50)).wait(), 90)
        self.assertEqual(
            rpc.remote(dst_worker, add_rref_to_value,
                       args=(rref, 50)).to_here(), 90)
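The helper add_rref_to_value is not shown in this snippet; given how the test uses it, it presumably looks like this sketch:
def add_rref_to_value(rref, value):
    # Runs on the destination worker: fetch the RRef's value and add to it.
    return rref.to_here() + value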
Example #15
 def __init__(self, world_size):
     self.ob_rrefs = []
     self.agent_rref = RRef(self)
     self.rewards = {}
     self.saved_log_probs = {}
     self.policy = Policy()
     self.optimizer = optim.Adam(self.policy.parameters(), lr=1e-2)
     self.eps = np.finfo(np.float32).eps.item()
     self.running_reward = 0
     self.reward_threshold = DummyEnv().reward_threshold
     for ob_rank in range(1, world_size):
         ob_info = rpc.get_worker_info(worker_name(ob_rank))
         self.ob_rrefs.append(remote(ob_info, Observer))
         self.rewards[ob_info.id] = []
         self.saved_log_probs[ob_info.id] = []
Example #16
 def __init__(self, world_size):
     self.ob_rrefs = []
     self.agent_rref = RRef(self)
     self.rewards = {}
     self.saved_log_probs = {}
     self.policy = Policy()
     self.optimizer = optim.Adam(self.policy.parameters(), lr=1e-2)
     self.eps = numpy.finfo(numpy.float32).eps.item()
     self.running_reward = 0
     self.reward_threshold = gym.make(ENV).spec.reward_threshold
     for ob_rank in range(1, world_size):
         ob_info = rpc.get_worker_info(OBSERVER_NAME.format(ob_rank))
         self.ob_rrefs.append(remote(ob_info, Observer))
         self.rewards[ob_info.id] = []
         self.saved_log_probs[ob_info.id] = []
Example #17
    def forward(self, xs):
        # Split the input batch xs into micro-batches, and collect async RPC
        # futures into a list
        out_futures = []
        for x in iter(xs.split(self.split_size, dim=0)):
            x_rref = RRef(x)
            y_rref = _remote_on_rref(ResNetPart1.forward, self.p1_rref, x_rref)
            z_fut = _async_on_rref(ResNetPart2.forward, self.p2_rref, y_rref)
            out_futures.append(z_fut)

        # wait for all RPC to finish
        outs = [fut.wait() for fut in out_futures]
        # cat all tensors into one tensor.
        out = torch.cat(outs)
        return out
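_remote_on_rref and _async_on_rref are not defined in this snippet. In the RPC tutorials such helpers are thin wrappers around rpc.remote / rpc.rpc_async that invoke a method on the RRef's owner; a hedged sketch:
import torch.distributed.rpc as rpc


def _call_method(method, rref, *args, **kwargs):
    # Runs on the owner of rref: call the method on the locally held value.
    return method(rref.local_value(), *args, **kwargs)


def _remote_on_rref(method, rref, *args, **kwargs):
    # Returns an RRef to the result, kept on the owner's worker.
    return rpc.remote(rref.owner(), _call_method,
                      args=(method, rref) + args, kwargs=kwargs)


def _async_on_rref(method, rref, *args, **kwargs):
    # Returns a Future that resolves to the result on the caller.
    return rpc.rpc_async(rref.owner(), _call_method,
                         args=(method, rref) + args, kwargs=kwargs)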
Example #18
    def __init__(self, rank: int, infer_rref: RRef, env_spawner: EnvSpawner):
        # ASSERTIONS
        # infer_rref must be a Learner
        assert infer_rref._get_type() is agents.Learner

        super().__init__(rank, infer_rref)

        self._num_envs = env_spawner.num_envs
        self._envs = env_spawner.spawn()
        self._current_states = [env.initial() for env in self._envs]

        # pylint: disable=not-callable
        self._metrics = [{
            'latency': tensor(0.).view(1, 1)
        } for _ in range(self._num_envs)]

        self._futures = []
Example #19
 def __init__(self, config, world_size):
     self.e = 0
     self.config = config.config_NeuralPlayer
     self.preprocessor = None
     self._init_dataset(self.config.config_Datasets)
     self._init_agent(self.config.config_Agent)
     self.agent_rref = RRef(self.agent)
     self.world_size = world_size  # total number of RPC processes; remote workers are ranks 1..world_size-1
     self.worker_rrefs = []
     self.data_gatherer = ScoreDataGatherer()
     for worker_rank in range(1, self.world_size):
         worker_info = rpc.get_worker_info(f"worker{worker_rank}")
         self.worker_rrefs.append(
             remote(worker_info,
                    CentralAgentWorker,
                    args=(config, worker_rank),
                    timeout=600))
Example #20
 def __init__(
     self,
     module_rref: rpc.RRef,
     device: str,
     num_inputs: int,
     num_outputs: Optional[int],
     rank: int,
     chunks: int,
     checkpoint_stop: int,
 ) -> None:
     self.module = module_rref.local_value()
     self.chunks = chunks
     self.device = torch.device(device)
     self.checkpoint_stop = checkpoint_stop
     self.rank = rank
     self.num_inputs = num_inputs
     self.num_outputs = num_outputs
     (self.in_queue, ), (self.out_queue, ) = create_workers([self.device])
Example #21
    def run_pipeline(self, pipeline_record_rref: rpc.RRef) -> Optional[Tensor]:
        """Processes a min-batch on this partition.
           If this is the last partition (pipeline_record has no consumer), concatenates results of processing
           all chunks and returns the result as the output of the model on the whole mini-batch.
        """
        pipeline_record = pipeline_record_rref.local_value()
        self.run(pipeline_record)

        if not pipeline_record.consumers:
            result = microbatch.gather(pipeline_record.batches)
            assert len(result) == 1
            result = result[0]
            s0 = current_stream(result.device)
            if is_cuda(s0):
                # TODO. Investigate why this is needed and remove it if possible.
                as_cuda(s0).synchronize()
            return result

        return None
Example #22
    def evaluate(self):
        self.model.eval()

        test_loss = Average()
        test_acc = Accuracy()

        with torch.no_grad():
            for data, target in self.test_loader:
                with dist_autograd.context() as context_id:
                    data_ref = RRef(data)

                    output_ref = self.model(data_ref)
                    output = output_ref.to_here()
                    loss = F.cross_entropy(output, target)

                    test_loss.update(loss.item(), data.size(0))
                    test_acc.update(output, target)

        return test_loss, test_acc
Example #23
    def train(self):

        train_loss = Average()
        train_acc = Accuracy()

        for data, target in self.train_loader:
            with dist_autograd.context() as context_id:
                data_ref = RRef(data)

                output_ref = self.model(data_ref)
                output = output_ref.to_here()
                loss = F.cross_entropy(output, target)

                dist_autograd.backward(context_id, [loss])
                self.optimizer.step(context_id)

                train_loss.update(loss.item(), data.size(0))
                train_acc.update(output, target)

        return train_loss, train_acc
Example #24
    def __init__(self, world_size, log_interval, save_dir):
        env = create_env("SuperMarioBros-1-1-v0")

        self.logger = MetricLogger(save_dir)
        self.agent = MarioAgent(state_dim=(4, 84, 84),
                                action_dim=env.action_space.n,
                                save_dir=save_dir)
        self.learner_rref = RRef(self)
        self.actor_rrefs = []

        for actor_rank in range(1, world_size):
            actor_info = rpc.get_worker_info(ACTOR_NAME.format(actor_rank))
            self.actor_rrefs.append(
                remote(actor_info, Actor, args=(actor_rank, )))

        self.update_lock = threading.Lock()
        self.episode_lock = threading.Lock()
        self.episode = 0

        self.log_interval = log_interval
Example #25
    def __init__(self, rank: int, callee_rref: rpc.RRef):
        # ASSERTIONS
        # check that callee_rref refers to an RpcCallee subclass
        # import here to avoid a circular import
        from ..agents.rpc_callee import RpcCallee
        assert issubclass(callee_rref._get_type(), RpcCallee)

        # ATTRIBUTES

        # RPC
        self.callee_rref = callee_rref
        self.rank = rank
        # pylint: disable=invalid-name
        self.id = rpc.get_worker_info().id
        self.name = rpc.get_worker_info().name

        # COUNTER
        self._loop_iteration = 0

        self.shutdown = False
Example #26
    def __init__(self, world_size, args):
        if args.env_name == 'L2M2019Env':
            env = L2M2019Env(visualize=False, difficulty=args.difficulty)
            obs_dim = 99
        else:
            env = gym.make(args.env_name)
            obs_dim = env.observation_space.shape[0]

        act_dim = env.action_space.shape[0]

        self.device = torch.device(args.device)

        self.args = args
        self.world_size = world_size

        self.actor_critic = MLPActorCritic(obs_dim,
                                           act_dim,
                                           hidden_sizes=args.hidden_sizes).to(
                                               self.device)
        self.replay_buffer = [
            ReplayBuffer(obs_dim, act_dim, args.buffer_size)
            for _ in range(1, world_size)
        ]

        self.gac = GAC(self.actor_critic,
                       self.replay_buffer,
                       device=self.device,
                       gamma=args.gamma,
                       alpha_start=args.alpha_start,
                       alpha_min=args.alpha_min,
                       alpha_max=args.alpha_max)

        self.test_len = 0.0
        self.test_ret = 0.0

        self.ob_rrefs = []
        for ob_rank in range(1, world_size):
            ob_info = rpc.get_worker_info(OBSERVER_NAME.format(ob_rank))
            self.ob_rrefs.append(remote(ob_info, Observer, args=(args, )))

        self.agent_rref = RRef(self)
Example #27
def run(rank, world_size):
    print(f'rank = {rank} world_size = {world_size}')
    env_dict = {key: os.environ[key] for key in key_list}

    print(env_dict)

    if rank == 0:
        rpc.init_rpc(f"rank{rank}",
                     rank=rank,
                     world_size=world_size,
                     backend=rpc.BackendType.TENSORPIPE)

        batch_size_multiple = 1
        n_iter = 10000  #number of updates

        coordinator = Coordinator(world_size, batch_size_multiple, lr=1e-3)
        coord_rref = RRef(coordinator)
        coordinator.run_training_loop(n_iter, coord_rref)

        torch.save(
            coordinator.policy,
            open(
                f'plots/{coordinator.env_name}_policy_nworkers{world_size-1}_batchsizemultiple{batch_size_multiple}.pt',
                'wb'))
        json.dump(
            coordinator.stats,
            open(
                f'plots/{coordinator.env_name}_stats_nworkers{world_size-1}_batchsizemultiple{batch_size_multiple}.json',
                'w'))

    else:
        rpc.init_rpc(f"rank{rank}",
                     rank=rank,
                     world_size=world_size,
                     backend=rpc.BackendType.TENSORPIPE)

    rpc.shutdown()
Example #28
def _parameter_rrefs(module):
    param_rrefs = []
    for param in module.parameters():
        param_rrefs.append(RRef(param))
    return param_rrefs
Example #29
def _run_trainer(emb_rref, rank):
    r"""
    Each trainer runs a forward pass which involves an embedding lookup on the
    parameter server and running nn.Linear locally. During the backward pass,
    DDP is responsible for aggregating the gradients for the dense part
    (nn.Linear) and distributed autograd ensures gradient updates are
    propagated to the parameter server.
    """

    # Setup the model.
    model = HybridModel(emb_rref, rank)

    # Retrieve all model parameters as rrefs for DistributedOptimizer.

    # Retrieve parameters for embedding table.
    model_parameter_rrefs = rpc.rpc_sync("ps",
                                         _retrieve_embedding_parameters,
                                         args=(emb_rref, ))

    # model.parameters() only includes local parameters.
    for param in model.parameters():
        model_parameter_rrefs.append(RRef(param))

    # Setup distributed optimizer
    opt = DistributedOptimizer(
        optim.SGD,
        model_parameter_rrefs,
        lr=0.05,
    )

    criterion = torch.nn.CrossEntropyLoss()

    def get_next_batch(rank):
        for _ in range(10):
            num_indices = random.randint(20, 50)
            indices = torch.LongTensor(num_indices).random_(0, NUM_EMBEDDINGS)

            # Generate offsets.
            offsets = []
            start = 0
            batch_size = 0
            while start < num_indices:
                offsets.append(start)
                start += random.randint(1, 10)
                batch_size += 1

            offsets_tensor = torch.LongTensor(offsets)
            target = torch.LongTensor(batch_size).random_(8).cuda(rank)
            yield indices, offsets_tensor, target

    # Train for 100 epochs
    for epoch in range(100):
        # create distributed autograd context
        for indices, offsets, target in get_next_batch(rank):
            with dist_autograd.context() as context_id:
                output = model(indices, offsets)
                loss = criterion(output, target)

                # Run distributed backward pass
                dist_autograd.backward(context_id, [loss])

                # Run the distributed optimizer
                opt.step(context_id)

                # No need to zero gradients: each iteration creates a new
                # distributed autograd context, which holds its own gradients.
        print("Training done for epoch {}".format(epoch))
Example #30
def _retrieve_embedding_parameters(emb_rref):
    return [RRef(p) for p in emb_rref.local_value().parameters()]