def impala(device, dtype, use_lr_sch=False):
    c = TestIMPALA.c
    actor = smw(
        Actor(c.observe_dim, c.action_num).type(dtype).to(device), device, device
    )
    critic = smw(Critic(c.observe_dim).type(dtype).to(device), device, device)
    servers = model_server_helper(model_num=1)
    world = get_world()
    # process 0 and 1 will be workers, and 2 will be trainer
    impala_group = world.create_rpc_group("impala", ["0", "1", "2"])
    if use_lr_sch:
        lr_func = gen_learning_rate_func(
            [(0, 1e-3), (200000, 3e-4)], logger=default_logger
        )
        impala = IMPALA(
            actor,
            critic,
            t.optim.Adam,
            nn.MSELoss(reduction="sum"),
            impala_group,
            servers,
            lr_scheduler=LambdaLR,
            lr_scheduler_args=((lr_func,), (lr_func,)),
        )
    else:
        impala = IMPALA(
            actor,
            critic,
            t.optim.Adam,
            nn.MSELoss(reduction="sum"),
            impala_group,
            servers,
        )
    return impala
def ddpg_apex(device, dtype, discrete=False):
    c = TestDDPGApex.c
    if not discrete:
        actor = smw(
            Actor(c.observe_dim, c.action_dim, c.action_range).type(dtype).to(device),
            device,
            device,
        )
        actor_t = smw(
            Actor(c.observe_dim, c.action_dim, c.action_range).type(dtype).to(device),
            device,
            device,
        )
    else:
        actor = smw(
            ActorDiscrete(c.observe_dim, c.action_dim).type(dtype).to(device),
            device,
            device,
        )
        actor_t = smw(
            ActorDiscrete(c.observe_dim, c.action_dim).type(dtype).to(device),
            device,
            device,
        )
    critic = smw(
        Critic(c.observe_dim, c.action_dim).type(dtype).to(device), device, device
    )
    critic_t = smw(
        Critic(c.observe_dim, c.action_dim).type(dtype).to(device), device, device
    )
    servers = model_server_helper(model_num=2)
    world = get_world()
    # process 0 and 1 will be workers, and 2 will be trainer
    apex_group = world.create_rpc_group("worker", ["0", "1", "2"])
    ddpg_apex = DDPGApex(
        actor,
        actor_t,
        critic,
        critic_t,
        t.optim.Adam,
        nn.MSELoss(reduction="sum"),
        apex_group,
        servers,
        replay_device="cpu",
        replay_size=c.replay_size,
    )
    return ddpg_apex
def init_from_config(cls, config: Union[Dict[str, Any], Config]):
    world = get_world()
    f_config = deepcopy(config["frame_config"])
    apex_group = world.create_rpc_group(
        group_name=f_config["apex_group_name"],
        members=(
            world.get_members()
            if f_config["apex_members"] == "all"
            else f_config["apex_members"]
        ),
    )
    models = assert_and_get_valid_models(f_config["models"])
    model_args = f_config["model_args"]
    model_kwargs = f_config["model_kwargs"]
    models = [
        m(*arg, **kwarg) for m, arg, kwarg in zip(models, model_args, model_kwargs)
    ]
    # wrap models in DistributedDataParallel when running in learner mode
    max_learner_id = f_config["learner_process_number"]
    learner_group = world.create_collective_group(ranks=list(range(max_learner_id)))
    if world.rank < max_learner_id:
        models = [
            DistributedDataParallel(module=m, process_group=learner_group.group)
            for m in models
        ]
    optimizer = assert_and_get_valid_optimizer(f_config["optimizer"])
    criterion = assert_and_get_valid_criterion(f_config["criterion"])(
        *f_config["criterion_args"], **f_config["criterion_kwargs"]
    )
    criterion.reduction = "none"
    lr_scheduler = f_config["lr_scheduler"] and assert_and_get_valid_lr_scheduler(
        f_config["lr_scheduler"]
    )
    servers = model_server_helper(
        model_num=1,
        group_name=f_config["model_server_group_name"],
        members=f_config["model_server_members"],
    )
    del f_config["optimizer"]
    del f_config["criterion"]
    del f_config["lr_scheduler"]
    frame = cls(
        *models,
        optimizer,
        criterion,
        apex_group,
        servers,
        lr_scheduler=lr_scheduler,
        **f_config,
    )
    if world.rank >= max_learner_id:
        frame.update = lambda *_, **__: (None, None)
    return frame
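# The sketch below is a hypothetical "frame_config" for init_from_config above.
# Only the key names are taken from the lookups in that function; the values
# (model names, argument tuples, server group name, learner count) are
# illustrative assumptions, not a verified configuration.
example_frame_config = {
    "apex_group_name": "apex",
    "apex_members": "all",
    "models": ["QNet", "QNet"],           # assumed registered model names
    "model_args": ((4, 2), (4, 2)),       # assumed constructor args
    "model_kwargs": ({}, {}),
    "optimizer": "Adam",
    "criterion": "MSELoss",
    "criterion_args": (),
    "criterion_kwargs": {},
    "lr_scheduler": None,
    "learner_process_number": 1,          # ranks below this become learners
    "model_server_group_name": "apex_model_server",
    "model_server_members": "all",
}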
def ars_lr(device, dtype):
    c = TestARS.c
    actor = smw(
        ActorDiscrete(c.observe_dim, c.action_num).type(dtype).to(device),
        device,
        device,
    )
    lr_func = gen_learning_rate_func(
        [(0, 1e-3), (200000, 3e-4)], logger=default_logger
    )
    servers = model_server_helper(model_num=1)
    world = get_world()
    ars_group = world.create_rpc_group("ars", ["0", "1", "2"])
    ars = ARS(
        actor,
        t.optim.SGD,
        ars_group,
        servers,
        noise_size=1000000,
        lr_scheduler=LambdaLR,
        lr_scheduler_args=((lr_func,),),
    )
    return ars
def dqn_apex(device, dtype):
    c = TestDQNApex.c
    q_net = smw(
        QNet(c.observe_dim, c.action_num).type(dtype).to(device), device, device
    )
    q_net_t = smw(
        QNet(c.observe_dim, c.action_num).type(dtype).to(device), device, device
    )
    servers = model_server_helper(model_num=1)
    world = get_world()
    # process 0 and 1 will be workers, and 2 will be trainer
    apex_group = world.create_rpc_group("apex", ["0", "1", "2"])
    dqn_apex = DQNApex(
        q_net,
        q_net_t,
        t.optim.Adam,
        nn.MSELoss(reduction="sum"),
        apex_group,
        servers,
        replay_device="cpu",
        replay_size=c.replay_size,
    )
    return dqn_apex
def ars(device, dtype):
    c = TestARS.c
    actor = smw(
        ActorDiscrete(c.observe_dim, c.action_num).type(dtype).to(device),
        device,
        device,
    )
    servers = model_server_helper(model_num=1)
    world = get_world()
    ars_group = world.create_rpc_group("ars", ["0", "1", "2"])
    ars = ARS(
        actor,
        t.optim.SGD,
        ars_group,
        servers,
        noise_std_dev=0.1,
        learning_rate=0.1,
        noise_size=1000000,
        rollout_num=6,
        used_rollout_num=6,
        normalize_state=True,
    )
    return ars
def init_from_config(cls, config: Union[Dict[str, Any], Config]):
    world = get_world()
    f_config = copy.deepcopy(config["frame_config"])
    ars_group = world.create_rpc_group(
        group_name=f_config["ars_group_name"],
        members=(
            world.get_members()
            if f_config["ars_members"] == "all"
            else f_config["ars_members"]
        ),
    )
    models = assert_and_get_valid_models(f_config["models"])
    model_args = f_config["model_args"]
    model_kwargs = f_config["model_kwargs"]
    models = [
        m(*arg, **kwarg) for m, arg, kwarg in zip(models, model_args, model_kwargs)
    ]
    optimizer = assert_and_get_valid_optimizer(f_config["optimizer"])
    lr_scheduler = f_config["lr_scheduler"] and assert_and_get_valid_lr_scheduler(
        f_config["lr_scheduler"]
    )
    servers = model_server_helper(
        model_num=1,
        group_name=f_config["model_server_group_name"],
        members=f_config["model_server_members"],
    )
    del f_config["optimizer"]
    del f_config["lr_scheduler"]
    frame = cls(
        *models,
        optimizer,
        ars_group,
        servers,
        lr_scheduler=lr_scheduler,
        **f_config,
    )
    return frame
def main(rank):
    env = gym.make("Pendulum-v0")
    observe_dim = 3
    action_dim = 1
    action_range = 2
    max_episodes = 2000
    max_steps = 200
    noise_param = (0, 0.2)
    noise_mode = "normal"
    solved_reward = -150
    solved_repeat = 5

    # initialize the distributed world first
    world = World(world_size=4, rank=rank, name=str(rank), rpc_timeout=20)

    servers = model_server_helper(model_num=2)
    apex_group = world.create_rpc_group("apex", ["0", "1", "2", "3"])

    actor = Actor(observe_dim, action_dim, action_range)
    actor_t = Actor(observe_dim, action_dim, action_range)
    critic = Critic(observe_dim, action_dim)
    critic_t = Critic(observe_dim, action_dim)

    ddpg_apex = DDPGApex(
        actor,
        actor_t,
        critic,
        critic_t,
        t.optim.Adam,
        nn.MSELoss(reduction="sum"),
        apex_group,
        servers,
    )

    # synchronize all processes in the group, and make sure the
    # distributed buffer has been created on all processes in apex_group
    apex_group.barrier()

    # manually control syncing to improve performance
    ddpg_apex.set_sync(False)
    if rank in (0, 1):
        # processes 0 and 1 are workers (samplers)
        # begin training
        episode, step, reward_fulfilled = 0, 0, 0
        smoothed_total_reward = 0

        while episode < max_episodes:
            # sleep to wait for learners to keep up
            sleep(0.1)
            episode += 1
            total_reward = 0
            terminal = False
            step = 0

            state = t.tensor(env.reset(), dtype=t.float32).view(1, observe_dim)

            # manually pull the newest parameters
            ddpg_apex.manual_sync()
            while not terminal and step <= max_steps:
                step += 1
                with t.no_grad():
                    old_state = state
                    # agent model inference
                    action = ddpg_apex.act_with_noise(
                        {"state": old_state},
                        noise_param=noise_param,
                        mode=noise_mode,
                    )
                    state, reward, terminal, _ = env.step(action.numpy())
                    state = t.tensor(state, dtype=t.float32).view(1, observe_dim)
                    total_reward += reward[0]

                    ddpg_apex.store_transition(
                        {
                            "state": {"state": old_state},
                            "action": {"action": action},
                            "next_state": {"state": state},
                            "reward": reward[0],
                            "terminal": terminal or step == max_steps,
                        }
                    )

            smoothed_total_reward = smoothed_total_reward * 0.9 + total_reward * 0.1
            logger.info(
                "Process {} Episode {} total reward={:.2f}".format(
                    rank, episode, smoothed_total_reward
                )
            )

            if smoothed_total_reward > solved_reward:
                reward_fulfilled += 1
                if reward_fulfilled >= solved_repeat:
                    logger.info("Environment solved!")
                    # will cause torch RPC to complain
                    # since other processes may not have finished yet.
                    # just for demonstration.
                    exit(0)
            else:
                reward_fulfilled = 0

    elif rank in (2, 3):
        # wait for enough samples
        while ddpg_apex.replay_buffer.all_size() < 500:
            sleep(0.1)
        while True:
            ddpg_apex.update()
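# A minimal launcher sketch, assuming the example is started with
# torch.multiprocessing.spawn, which passes the process index to main() as
# "rank". The DDPG-Apex main above creates a World with world_size=4, so
# nprocs must match it; the ARS example below uses world_size=3 and would
# be spawned with nprocs=3 instead. The launcher itself is an assumption,
# not part of the original example.
if __name__ == "__main__":
    from torch.multiprocessing import spawn

    # spawn one OS process per rank; each process runs main(rank)
    spawn(main, nprocs=4)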
def main(rank):
    env = gym.make("CartPole-v0")
    observe_dim = 4
    action_num = 2
    max_episodes = 2000
    max_steps = 200
    solved_reward = 190
    solved_repeat = 5

    # initialize the distributed world first
    world = World(world_size=3, rank=rank, name=str(rank), rpc_timeout=20)

    actor = dmw(ActorDiscrete(observe_dim, action_num))
    servers = model_server_helper(model_num=1)
    ars_group = world.create_rpc_group("ars", ["0", "1", "2"])
    ars = ARS(
        actor,
        t.optim.SGD,
        ars_group,
        servers,
        noise_std_dev=0.1,
        learning_rate=0.1,
        noise_size=1000000,
        rollout_num=6,
        used_rollout_num=6,
        normalize_state=True,
    )

    # begin training
    episode, step, reward_fulfilled = 0, 0, 0
    smoothed_total_reward = 0

    while episode < max_episodes:
        episode += 1
        all_reward = 0
        for at in ars.get_actor_types():
            total_reward = 0
            terminal = False
            step = 0

            # batch size = 1
            state = t.tensor(env.reset(), dtype=t.float32).view(1, observe_dim)
            while not terminal and step <= max_steps:
                step += 1
                with t.no_grad():
                    # agent model inference
                    action = ars.act({"state": state}, at)
                    state, reward, terminal, __ = env.step(action)
                    state = t.tensor(state, dtype=t.float32).view(1, observe_dim)
                    total_reward += reward

            ars.store_reward(total_reward, at)
            all_reward += total_reward

        # update
        ars.update()

        # show reward
        smoothed_total_reward = (
            smoothed_total_reward * 0.9
            + all_reward / len(ars.get_actor_types()) * 0.1
        )
        logger.info(
            f"Process {rank} Episode {episode} total reward={smoothed_total_reward:.2f}"
        )

        if smoothed_total_reward > solved_reward:
            reward_fulfilled += 1
            if reward_fulfilled >= solved_repeat:
                logger.info("Environment solved!")
                # will cause torch RPC to complain
                # since other processes may not have finished yet.
                # just for demonstration.
                exit(0)
        else:
            reward_fulfilled = 0
def main(rank):
    env = gym.make("CartPole-v0")
    observe_dim = 4
    action_num = 2
    max_episodes = 2000
    max_steps = 200
    solved_reward = 190
    solved_repeat = 5

    # initialize the distributed world first
    world = World(world_size=4, rank=rank, name=str(rank), rpc_timeout=20)

    servers = model_server_helper(model_num=1)
    apex_group = world.create_rpc_group("apex", ["0", "1", "2", "3"])

    if rank in (2, 3):
        # learner_group.group is the wrapped torch.distributed.ProcessGroup
        learner_group = world.create_collective_group(ranks=[2, 3])

        # wrap the model with DistributedDataParallel
        # if the current process is learner process 2 or 3
        q_net = DistributedDataParallel(
            module=QNet(observe_dim, action_num), process_group=learner_group.group
        )
        q_net_t = DistributedDataParallel(
            module=QNet(observe_dim, action_num), process_group=learner_group.group
        )
    else:
        q_net = QNet(observe_dim, action_num)
        q_net_t = QNet(observe_dim, action_num)

    # we may use a smaller batch size to train if we are using
    # DistributedDataParallel
    dqn_apex = DQNApex(
        q_net,
        q_net_t,
        t.optim.Adam,
        nn.MSELoss(reduction="sum"),
        apex_group,
        servers,
        batch_size=50,
    )

    # synchronize all processes in the group, and make sure the
    # distributed buffer has been created on all processes in apex_group
    apex_group.barrier()

    # manually control syncing to improve performance
    dqn_apex.set_sync(False)
    if rank in (0, 1):
        # processes 0 and 1 are workers (samplers)
        # begin training
        episode, step, reward_fulfilled = 0, 0, 0
        smoothed_total_reward = 0

        while episode < max_episodes:
            # sleep to wait for learners to keep up
            sleep(0.1)
            episode += 1
            total_reward = 0
            terminal = False
            step = 0

            state = t.tensor(env.reset(), dtype=t.float32).view(1, observe_dim)

            # manually pull the newest parameters
            dqn_apex.manual_sync()
            while not terminal and step <= max_steps:
                step += 1
                with t.no_grad():
                    old_state = state
                    # agent model inference
                    action = dqn_apex.act_discrete_with_noise({"state": old_state})
                    state, reward, terminal, _ = env.step(action.item())
                    state = t.tensor(state, dtype=t.float32).view(1, observe_dim)
                    total_reward += reward

                    dqn_apex.store_transition(
                        {
                            "state": {"state": old_state},
                            "action": {"action": action},
                            "next_state": {"state": state},
                            "reward": reward,
                            "terminal": terminal or step == max_steps,
                        }
                    )

            smoothed_total_reward = smoothed_total_reward * 0.9 + total_reward * 0.1
            logger.info(
                f"Process {rank} Episode {episode} total reward={smoothed_total_reward:.2f}"
            )

            if smoothed_total_reward > solved_reward:
                reward_fulfilled += 1
                if reward_fulfilled >= solved_repeat:
                    logger.info("Environment solved!")
                    # will cause torch RPC to complain
                    # since other processes may not have finished yet.
                    # just for demonstration.
                    exit(0)
            else:
                reward_fulfilled = 0

    elif rank in (2, 3):
        # wait for enough samples
        while dqn_apex.replay_buffer.all_size() < 500:
            sleep(0.1)
        while True:
            dqn_apex.update()
def main(rank):
    env = gym.make("CartPole-v0")
    observe_dim = 4
    action_num = 2
    max_episodes = 2000
    max_steps = 200
    solved_reward = 190
    solved_repeat = 5

    # initialize the distributed world first
    world = World(world_size=4, rank=rank, name=str(rank), rpc_timeout=20)

    servers = model_server_helper(model_num=1)
    impala_group = world.create_rpc_group("impala", ["0", "1", "2", "3"])

    if rank in (2, 3):
        # learner_group.group is the wrapped torch.distributed.ProcessGroup
        learner_group = world.create_collective_group(ranks=[2, 3])

        # wrap the model with DistributedDataParallel
        # if the current process is learner process 2 or 3
        actor = DistributedDataParallel(
            module=Actor(observe_dim, action_num), process_group=learner_group.group
        )
        critic = DistributedDataParallel(
            module=Critic(observe_dim), process_group=learner_group.group
        )
    else:
        actor = Actor(observe_dim, action_num)
        critic = Critic(observe_dim)

    # we may use a smaller batch size to train if we are using
    # DistributedDataParallel
    # note: since the IMPALA framework stores a whole episode
    # as a single sample, a small batch size is sufficient
    impala = IMPALA(
        actor,
        critic,
        t.optim.Adam,
        nn.MSELoss(reduction="sum"),
        impala_group,
        servers,
        batch_size=2,
    )

    # synchronize all processes in the group, and make sure the
    # distributed buffer has been created on all processes in impala_group
    impala_group.barrier()

    # manually control syncing to improve performance
    impala.set_sync(False)
    if rank in (0, 1):
        # processes 0 and 1 are workers (samplers)
        # begin training
        episode, step, reward_fulfilled = 0, 0, 0
        smoothed_total_reward = 0

        while episode < max_episodes:
            # sleep to wait for learners to keep up
            sleep(0.1)
            episode += 1
            total_reward = 0
            terminal = False
            step = 0

            state = t.tensor(env.reset(), dtype=t.float32).view(1, observe_dim)

            # manually pull the newest parameters
            impala.manual_sync()
            tmp_observations = []
            while not terminal and step <= max_steps:
                step += 1
                with t.no_grad():
                    old_state = state
                    # agent model inference
                    action, action_log_prob, *_ = impala.act({"state": old_state})
                    state, reward, terminal, _ = env.step(action.item())
                    state = t.tensor(state, dtype=t.float32).view(1, observe_dim)
                    total_reward += reward

                    tmp_observations.append(
                        {
                            "state": {"state": old_state},
                            "action": {"action": action},
                            "next_state": {"state": state},
                            "reward": reward,
                            "action_log_prob": action_log_prob.item(),
                            "terminal": terminal or step == max_steps,
                        }
                    )

            impala.store_episode(tmp_observations)
            smoothed_total_reward = smoothed_total_reward * 0.9 + total_reward * 0.1
            logger.info(
                f"Process {rank} Episode {episode} total reward={smoothed_total_reward:.2f}"
            )

            if smoothed_total_reward > solved_reward:
                reward_fulfilled += 1
                if reward_fulfilled >= solved_repeat:
                    logger.info("Environment solved!")
                    # will cause torch RPC to complain
                    # since other processes may not have finished yet.
                    # just for demonstration.
                    exit(0)
            else:
                reward_fulfilled = 0

    elif rank in (2, 3):
        # wait for enough samples
        # note: since the IMPALA framework stores a whole episode
        # as a single sample, we wait for a smaller number
        while impala.replay_buffer.all_size() < 5:
            sleep(0.1)
        while True:
            impala.update()
def init_from_config(
    cls,
    config: Union[Dict[str, Any], Config],
    model_device: Union[str, t.device] = "cpu",
):
    world = get_world()
    f_config = deepcopy(config["frame_config"])
    impala_group = world.create_rpc_group(
        group_name=f_config["impala_group_name"],
        members=(
            world.get_members()
            if f_config["impala_members"] == "all"
            else f_config["impala_members"]
        ),
    )
    models = assert_and_get_valid_models(f_config["models"])
    model_args = f_config["model_args"]
    model_kwargs = f_config["model_kwargs"]
    models = [
        m(*arg, **kwarg).to(model_device)
        for m, arg, kwarg in zip(models, model_args, model_kwargs)
    ]
    # wrap models in DistributedDataParallel when running in learner mode
    max_learner_id = f_config["learner_process_number"]
    learner_group = world.create_collective_group(ranks=list(range(max_learner_id)))
    if world.rank < max_learner_id:
        models = [
            DistributedDataParallel(module=m, process_group=learner_group.group)
            for m in models
        ]
    optimizer = assert_and_get_valid_optimizer(f_config["optimizer"])
    criterion = assert_and_get_valid_criterion(f_config["criterion"])(
        *f_config["criterion_args"], **f_config["criterion_kwargs"]
    )
    lr_scheduler = f_config["lr_scheduler"] and assert_and_get_valid_lr_scheduler(
        f_config["lr_scheduler"]
    )
    servers = model_server_helper(
        model_num=1,
        group_name=f_config["model_server_group_name"],
        members=f_config["model_server_members"],
    )
    del f_config["optimizer"]
    del f_config["criterion"]
    del f_config["lr_scheduler"]
    frame = cls(
        *models,
        optimizer,
        criterion,
        impala_group,
        servers,
        lr_scheduler=lr_scheduler,
        **f_config,
    )
    if world.rank >= max_learner_id:
        frame.role = "sampler"
        frame.update = _disable_update
    else:
        frame.role = "learner"
    return frame
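# _disable_update is referenced above but not shown in this section. A plausible
# sketch, assumed here by analogy with the Apex variant earlier in this section
# (which replaces update with a lambda returning (None, None) on non-learner
# processes); the actual helper may differ.
def _disable_update(*_, **__):
    # sampler processes never perform gradient updates; return dummy losses
    return None, None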