def test_seed(self, envs, idx):
    for name, creators in zip(*envs):
        default_logger.info(f"Testing on env {name}")
        subproc_wrapper = openai_gym.ParallelWrapperSubProc(creators)
        seeds = subproc_wrapper.seed()
        subproc_wrapper.close()
        assert len(seeds) == ENV_NUM
def test_full_train(self, train_config, rainbow_train):
    c = train_config
    # begin training
    episode, step = Counter(), Counter()
    reward_fulfilled = Counter()
    smoother = Smooth()
    terminal = False
    env = c.env

    while episode < c.max_episodes:
        episode.count()
        # batch size = 1
        total_reward = 0
        state = t.tensor(env.reset(), dtype=t.float32)
        tmp_observations = []

        while not terminal and step <= c.max_steps:
            step.count()
            with t.no_grad():
                old_state = state
                # agent model inference
                action = rainbow_train.act_discrete_with_noise(
                    {"state": old_state.unsqueeze(0)}
                )
                state, reward, terminal, _ = env.step(action.item())
                state = t.tensor(state, dtype=t.float32).flatten()
                total_reward += float(reward)

                tmp_observations.append({
                    "state": {"state": old_state.unsqueeze(0)},
                    "action": {"action": action},
                    "next_state": {"state": state.unsqueeze(0)},
                    "reward": float(reward),
                    "terminal": terminal or step == c.max_steps,
                })

        rainbow_train.store_episode(tmp_observations)
        # update
        if episode.get() > 100:
            for _ in range(step.get()):
                rainbow_train.update()

        smoother.update(total_reward)
        step.reset()
        terminal = False

        logger.info("Episode {} total reward={:.2f}"
                    .format(episode, smoother.value))

        if smoother.value > c.solved_reward:
            reward_fulfilled.count()
            if reward_fulfilled >= c.solved_repeat:
                logger.info("Environment solved!")
                return
        else:
            reward_fulfilled.reset()

    pytest.fail("RAINBOW Training failed.")
def remove_trials_older_than(
    self,
    diff_day: int = 0,
    diff_hour: int = 1,
    diff_minute: int = 0,
    diff_second: int = 0,
):
    """
    By default, this function removes all trial directories started more
    than one hour before the current time.

    Args:
        diff_day: Difference in days.
        diff_hour: Difference in hours.
        diff_minute: Difference in minutes.
        diff_second: Difference in seconds.
    """
    trial_list = [f for f in os.listdir(self.env_root)]
    current_time = datetime.now()
    diff_threshold = timedelta(
        days=diff_day, hours=diff_hour, minutes=diff_minute, seconds=diff_second
    )
    for file in trial_list:
        try:
            time = datetime.strptime(file, self.time_format)
        except ValueError:
            # not a trial directory
            pass
        else:
            diff_time = current_time - time
            if diff_time > diff_threshold:
                rm_path = join(self.env_root, file)
                default_logger.info(f"Removing trial directory: {rm_path}")
                shutil.rmtree(rm_path)
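A minimal usage sketch of the cleanup helper above; ``manager`` is a hypothetical instance of the class that owns ``remove_trials_older_than`` and ``env_root``.

```python
# Hedged usage sketch: `manager` is a hypothetical instance of the owning class.

# Remove every trial directory under env_root started more than two hours
# ago (overriding the default one-hour threshold).
manager.remove_trials_older_than(diff_hour=2)

# Keep only trials from the last 30 minutes.
manager.remove_trials_older_than(diff_hour=0, diff_minute=30)
```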
def test_full_train(self, train_config, a2c_train, gae_lambda):
    c = train_config
    a2c_train.gae_lambda = gae_lambda
    # begin training
    episode, step = Counter(), Counter()
    reward_fulfilled = Counter()
    smoother = Smooth()
    terminal = False
    env = c.env
    env.seed(0)

    while episode < c.max_episodes:
        episode.count()
        # batch size = 1
        total_reward = 0
        state = t.tensor(env.reset(), dtype=t.float32)
        tmp_observations = []

        while not terminal and step <= c.max_steps:
            step.count()
            with t.no_grad():
                old_state = state
                # agent model inference
                action = a2c_train.act({"state": old_state.unsqueeze(0)})[0]
                state, reward, terminal, _ = env.step(action.item())
                state = t.tensor(state, dtype=t.float32).flatten()
                total_reward += float(reward)

                tmp_observations.append(
                    {
                        "state": {"state": old_state.unsqueeze(0)},
                        "action": {"action": action},
                        "next_state": {"state": state.unsqueeze(0)},
                        "reward": float(reward),
                        "terminal": terminal or step == c.max_steps,
                    }
                )

        # update
        a2c_train.store_episode(tmp_observations)
        a2c_train.update()

        smoother.update(total_reward)
        step.reset()
        terminal = False

        logger.info(f"Episode {episode} total reward={smoother.value:.2f}")

        if smoother.value > c.solved_reward:
            reward_fulfilled.count()
            if reward_fulfilled >= c.solved_repeat:
                logger.info("Environment solved!")
                return
        else:
            reward_fulfilled.reset()

    pytest.fail("A2C Training failed.")
def test_active(self, envs):
    for name, creators in zip(*envs):
        default_logger.info(f"Testing on env {name}")
        subproc_wrapper = openai_gym.ParallelWrapperSubProc(creators)
        subproc_wrapper.reset()
        active = subproc_wrapper.active()
        subproc_wrapper.close()
        assert len(active) == ENV_NUM
def test_render(self, envs, idx, render_num):
    for name, creators in zip(*envs):
        default_logger.info(f"Testing on env {name}")
        subproc_wrapper = openai_gym.ParallelWrapperSubProc(creators)
        subproc_wrapper.reset(idx)
        rendered = subproc_wrapper.render(idx)
        subproc_wrapper.close()

        assert len(rendered) == render_num
        assert isinstance(rendered[0], np.ndarray)
        assert rendered[0].ndim == 3 and rendered[0].shape[-1] == 3
def on_train_batch_end(
    self, trainer, pl_module, outputs, batch, _batch_idx, _dataloader_idx
) -> None:
    for log in batch[0].logs:
        if "total_reward" in log:
            self.max_total_reward = max(log["total_reward"], self.max_total_reward)
            default_logger.info(
                f"Current max total reward={self.max_total_reward:.2f}."
            )
            trainer.should_stop = self.max_total_reward >= 150
            return
    default_logger.error("Missing total reward in logs.")
def test_reset(self, envs, idx, reset_num):
    for name, creators in zip(*envs):
        default_logger.info(f"Testing on env {name}")
        dummy_wrapper = openai_gym.ParallelWrapperDummy(creators)
        obsrvs = dummy_wrapper.reset(idx)
        dummy_wrapper.close()

        assert len(obsrvs) == reset_num
        for obsrv in obsrvs:
            assert dummy_wrapper.observation_space.contains(
                obsrv
            ), "Required observation form: {}, Actual observation: {}".format(
                str(dummy_wrapper.observation_space), obsrv
            )
def perturb_adjust_hook(_model, _input, output):
    if perturb_switch.get():
        tmp_action["with_noise"] = output.clone()
    else:
        tmp_action["without_noise"] = output.clone()

    if "with_noise" in tmp_action and "without_noise" in tmp_action:
        # Compute distance between two actions generated by
        # noisy parameters and original parameters.
        with t.no_grad():
            dist = distance_func(
                tmp_action["with_noise"], tmp_action["without_noise"]
            )
            tmp_action.clear()
            param_noise_spec.adapt(dist)
            logger.info("Current output distance: {}".format(dist))
            logger.info("Current param noise stddev: {}".format(
                param_noise_spec.get_dev()))
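A hook with this ``(module, input, output)`` signature is typically attached with PyTorch's ``nn.Module.register_forward_hook``; the sketch below shows one hypothetical way it could be wired up. The ``actor`` module is assumed, and ``perturb_switch``, ``tmp_action``, ``param_noise_spec``, and ``distance_func`` are assumed to be defined as in the surrounding code.

```python
# Hedged sketch: register the adjustment hook on the module whose
# outputs are compared with and without parameter noise.
hook_handle = actor.register_forward_hook(perturb_adjust_hook)

# ... run one forward pass with perturbed parameters and one with the
# original parameters; the hook records both outputs, measures their
# distance, and adapts the noise standard deviation accordingly.

# Detach the hook once parameter-space noise adaptation is finished.
hook_handle.remove()
```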
def on_train_batch_end(
    self, trainer, pl_module, outputs, batch, _batch_idx, _dataloader_idx
) -> None:
    for log in batch[0].logs:
        if "total_reward" in log:
            self.max_total_reward = max(log["total_reward"], self.max_total_reward)
            default_logger.info(
                f"Process [{get_cur_rank()}] "
                f"Current max total reward={self.max_total_reward:.2f}."
            )
            self.queue.put((get_cur_rank(), self.max_total_reward))

            t_plugin = trainer.training_type_plugin
            trainer.should_stop = self.reduce_early_stopping_decision(
                trainer, t_plugin
            )
            if trainer.should_stop:
                default_logger.info(f"Process [{get_cur_rank()}] decides to exit.")
            return
    default_logger.error("Missing total reward in logs.")
def test_step(self, envs, idx, act_num):
    for name, creators in zip(*envs):
        default_logger.info(f"Testing on env {name}")
        dummy_wrapper = openai_gym.ParallelWrapperDummy(creators)
        action = [mock_action(dummy_wrapper.action_space) for _ in range(act_num)]
        dummy_wrapper.reset(idx)
        obsrvs, reward, terminal, info = dummy_wrapper.step(action, idx)
        dummy_wrapper.close()

        assert len(obsrvs) == act_num
        assert len(reward) == act_num
        assert len(terminal) == act_num
        assert len(info) == act_num and isinstance(info[0], dict)
        for obsrv in obsrvs:
            assert dummy_wrapper.observation_space.contains(
                obsrv
            ), "Required observation form: {}, Actual observation: {}".format(
                str(dummy_wrapper.observation_space), obsrv
            )
def generate():
    actor = Actor(observe_dim, action_num)
    critic = Critic(observe_dim)
    ppo = PPO(actor, critic, t.optim.Adam, nn.MSELoss(reduction="sum"))

    episode, step, reward_fulfilled = 0, 0, 0
    smoothed_total_reward = 0

    while episode < max_episodes:
        episode += 1

        # update
        episode_observations, episode_total_reward = run_episode(ppo, env)
        ppo.store_episode(episode_observations)
        ppo.update()

        # show reward
        smoothed_total_reward = (
            smoothed_total_reward * 0.9 + episode_total_reward * 0.1
        )
        logger.info(f"Episode {episode} total reward={smoothed_total_reward:.2f}")

        if smoothed_total_reward > solved_reward:
            reward_fulfilled += 1
            if reward_fulfilled >= solved_repeat:
                logger.info("Environment solved!")
                break
        else:
            reward_fulfilled = 0

    trajectories = []
    for i in range(expert_episodes):
        logger.info(f"Generating trajectory {i}")
        trajectories.append(
            [
                {"state": s["state"], "action": s["action"]}
                for s in run_episode(ppo, env)[0]
            ]
        )

    archive = Archive(
        path=os.path.join(
            ROOT, "generated", f"{generated_name}_" + get_time_string()
        )
    )
    archive.add_item("expert_trajectories", trajectories)
    archive.save()
    logger.info(f"Trajectories saved as {archive.path}")
def test_dqn_apex_cpu_spawn_full_train(self, tmpdir):
    # By default, pytorch lightning will use the ddp-spawn mode to replace
    # ddp if there are only cpus.
    os.environ["WORLD_SIZE"] = "3"
    config = generate_env_config("CartPole-v0", {})
    config = generate_training_config(
        root_dir=tmpdir.make_numbered_dir(), config=config
    )
    config = generate_algorithm_config("DQNApex", config)

    # use ddp_cpu
    config["gpus"] = None
    config["num_processes"] = 3
    # this testing process corresponds to this node
    config["num_nodes"] = 1
    config["early_stopping_patience"] = 100
    # Use classes instead of string names since the algorithm is distributed.
    config["frame_config"]["models"] = [QNet, QNet]
    config["frame_config"]["model_kwargs"] = [
        {"state_dim": 4, "action_num": 2},
        {"state_dim": 4, "action_num": 2},
    ]

    # For spawn mode we use a special callback, because we cannot access
    # max_total_reward from sub-processes.
    queue = SimpleQueue(ctx=mp.get_context("spawn"))
    # cb = [SpawnInspectCallback(queue), LoggerDebugCallback()]
    cb = [SpawnInspectCallback(queue)]
    t = Thread(target=launch, args=(config,), kwargs={"pl_callbacks": cb})
    t.start()

    default_logger.info("Start tracking")
    subproc_max_total_reward = [0, 0, 0]
    while True:
        try:
            result = queue.quick_get(timeout=60)
            default_logger.info(f"Result from process [{result[0]}]: {result[1]}")
            subproc_max_total_reward[result[0]] = result[1]
        except TimeoutError:
            # no more results
            default_logger.info("No more results.")
            break

    t.join()
    assert (
        sum(subproc_max_total_reward) / 3 >= 150
    ), f"Max total reward {sum(subproc_max_total_reward) / 3} below threshold 150."
def test_cpu_shared_tensor(self):
    x = [t.ones([10]) * i for i in range(5)]
    for xx in x:
        xx.share_memory_()
    logger.info("CPU tensors created.")

    pool = self.pool_impl(processes=2, is_copy_tensor=False, share_method="cpu")
    logger.info("Pool created.")

    assert all(
        out == expect_out
        for out, expect_out in zip(pool.map(func, x), [0, 20, 40, 60, 80])
    )
    pool.close()
    pool.join()
    logger.info("Pool joined.")
def generate_expert_episodes():
    actor = Actor(observe_dim, action_num)
    critic = Critic(observe_dim)
    ppo = PPO(actor, critic, t.optim.Adam, nn.MSELoss(reduction="sum"))

    logger.info("Training expert PPO")
    episode, step, reward_fulfilled = 0, 0, 0
    smoothed_total_reward = 0

    while episode < max_episodes:
        episode += 1

        # update
        episode_observations, episode_total_reward = run_episode(ppo, env)
        ppo.store_episode(episode_observations)
        ppo.update()

        # show reward
        smoothed_total_reward = (
            smoothed_total_reward * 0.9 + episode_total_reward * 0.1
        )
        logger.info(f"Episode {episode} total reward={smoothed_total_reward:.2f}")

        if smoothed_total_reward > solved_reward:
            reward_fulfilled += 1
            if reward_fulfilled >= solved_repeat:
                logger.info("Environment solved!")
                break
        else:
            reward_fulfilled = 0

    trajectories = []
    for i in range(expert_episodes):
        logger.info(f"Generating trajectory {i}")
        trajectories.append(
            [
                {"state": s["state"], "action": s["action"]}
                for s in run_episode(ppo, env)[0]
            ]
        )
    return trajectories
action = ppo.act({"mem": history.get()})[0] state, reward, terminal, _ = env.step(action.item()) state = convert(state) total_reward += reward old_history = history.get() new_history = history.append(state).get() tmp_observations.append({ "state": { "mem": old_history }, "action": { "action": action }, "next_state": { "mem": new_history }, "reward": reward, "terminal": terminal, }) # update ppo.store_episode(tmp_observations) ppo.update() # show reward smoothed_total_reward = smoothed_total_reward * 0.9 + total_reward * 0.1 logger.info( f"Episode {episode} total reward={smoothed_total_reward:.2f}")
"state": old_state }, "action": { "action": action }, "next_state": { "state": state }, "reward": reward, "terminal": terminal or step == max_steps }) # update, update more if episode is longer, else less if episode > 100: for _ in range(step): dqn.update() # show reward smoothed_total_reward = (smoothed_total_reward * 0.9 + total_reward * 0.1) logger.info("Episode {} total reward={:.2f}".format( episode, smoothed_total_reward)) if smoothed_total_reward > solved_reward: reward_fulfilled += 1 if reward_fulfilled >= solved_repeat: logger.info("Environment solved!") exit(0) else: reward_fulfilled = 0
"state": { "state": old_state }, "action": { "action": action }, "next_state": { "state": state }, "reward": reward, "terminal": terminal or step == max_steps, }) # update, update more if episode is longer, else less if episode > 100: for _ in range(step): dqn.update() # show reward smoothed_total_reward = smoothed_total_reward * 0.9 + total_reward * 0.1 logger.info( f"Episode {episode} total reward={smoothed_total_reward:.2f}") if smoothed_total_reward > solved_reward: reward_fulfilled += 1 if reward_fulfilled >= solved_repeat: logger.info("Environment solved!") exit(0) else: reward_fulfilled = 0
def test_full_train(self, train_config, sac_train):
    c = train_config
    sac_train.target_entropy = -c.action_dim
    # begin training
    episode, step = Counter(), Counter()
    reward_fulfilled = Counter()
    smoother = Smooth()
    terminal = False
    env = c.env

    while episode < c.max_episodes:
        episode.count()
        # batch size = 1
        total_reward = 0
        state = t.tensor(env.reset(), dtype=t.float32)

        while not terminal and step <= c.max_steps:
            step.count()
            with t.no_grad():
                old_state = state
                # agent model inference
                action = sac_train.act({"state": old_state.unsqueeze(0)})[0]
                state, reward, terminal, _ = env.step(action.cpu().numpy())
                state = t.tensor(state, dtype=t.float32).flatten()
                total_reward += float(reward)

                sac_train.store_transition({
                    "state": {"state": old_state.unsqueeze(0)},
                    "action": {"action": action},
                    "next_state": {"state": state.unsqueeze(0)},
                    "reward": float(reward),
                    "terminal": terminal or step == c.max_steps,
                })

        # update
        if episode > 100:
            for i in range(step.get()):
                sac_train.update()
                logger.info(
                    f"new entropy alpha: {sac_train.entropy_alpha.item()}"
                )

        smoother.update(total_reward)
        step.reset()
        terminal = False

        logger.info(f"Episode {episode} total reward={smoother.value:.2f}")

        if smoother.value > c.solved_reward:
            reward_fulfilled.count()
            if reward_fulfilled >= c.solved_repeat:
                logger.info("Environment solved!")
                return
        else:
            reward_fulfilled.reset()

    pytest.fail("SAC Training failed.")
def main(rank): env = gym.make("Pendulum-v0") observe_dim = 3 action_dim = 1 action_range = 2 max_episodes = 2000 max_steps = 200 noise_param = (0, 0.2) noise_mode = "normal" solved_reward = -150 solved_repeat = 5 # initlize distributed world first world = World(world_size=4, rank=rank, name=str(rank), rpc_timeout=20) servers = model_server_helper(model_num=2) apex_group = world.create_rpc_group("apex", ["0", "1", "2", "3"]) actor = Actor(observe_dim, action_dim, action_range) actor_t = Actor(observe_dim, action_dim, action_range) critic = Critic(observe_dim, action_dim) critic_t = Critic(observe_dim, action_dim) ddpg_apex = DDPGApex(actor, actor_t, critic, critic_t, t.optim.Adam, nn.MSELoss(reduction='sum'), apex_group, servers) # synchronize all processes in the group, make sure # distributed buffer has been created on all processes in apex_group apex_group.barrier() # manually control syncing to improve performance ddpg_apex.set_sync(False) if rank in (0, 1): # Process 0 and 1 are workers(samplers) # begin training episode, step, reward_fulfilled = 0, 0, 0 smoothed_total_reward = 0 while episode < max_episodes: # sleep to wait for learners keep up sleep(0.1) episode += 1 total_reward = 0 terminal = False step = 0 state = t.tensor(env.reset(), dtype=t.float32).view(1, observe_dim) # manually pull the newest parameters ddpg_apex.manual_sync() while not terminal and step <= max_steps: step += 1 with t.no_grad(): old_state = state # agent model inference action = ddpg_apex.act_with_noise({"state": old_state}, noise_param=noise_param, mode=noise_mode) state, reward, terminal, _ = env.step(action.numpy()) state = t.tensor(state, dtype=t.float32)\ .view(1, observe_dim) total_reward += reward[0] ddpg_apex.store_transition({ "state": { "state": old_state }, "action": { "action": action }, "next_state": { "state": state }, "reward": reward[0], "terminal": terminal or step == max_steps }) smoothed_total_reward = (smoothed_total_reward * 0.9 + total_reward * 0.1) logger.info("Process {} Episode {} total reward={:.2f}".format( rank, episode, smoothed_total_reward)) if smoothed_total_reward > solved_reward: reward_fulfilled += 1 if reward_fulfilled >= solved_repeat: logger.info("Environment solved!") # will cause torch RPC to complain # since other processes may have not finished yet. # just for demonstration. exit(0) else: reward_fulfilled = 0 elif rank in (2, 3): # wait for enough samples while ddpg_apex.replay_buffer.all_size() < 500: sleep(0.1) while True: ddpg_apex.update()
def main(rank): env = gym.make("CartPole-v0") observe_dim = 4 action_num = 2 max_episodes = 2000 max_steps = 200 solved_reward = 190 solved_repeat = 5 # initlize distributed world first world = World(world_size=3, rank=rank, name=str(rank), rpc_timeout=20) actor = dmw(ActorDiscrete(observe_dim, action_num)) servers = model_server_helper(model_num=1) ars_group = world.create_rpc_group("ars", ["0", "1", "2"]) ars = ARS( actor, t.optim.SGD, ars_group, servers, noise_std_dev=0.1, learning_rate=0.1, noise_size=1000000, rollout_num=6, used_rollout_num=6, normalize_state=True, ) # begin training episode, step, reward_fulfilled = 0, 0, 0 smoothed_total_reward = 0 while episode < max_episodes: episode += 1 all_reward = 0 for at in ars.get_actor_types(): total_reward = 0 terminal = False step = 0 # batch size = 1 state = t.tensor(env.reset(), dtype=t.float32).view(1, observe_dim) while not terminal and step <= max_steps: step += 1 with t.no_grad(): # agent model inference action = ars.act({"state": state}, at) state, reward, terminal, __ = env.step(action) state = t.tensor(state, dtype=t.float32).view(1, observe_dim) total_reward += reward ars.store_reward(total_reward, at) all_reward += total_reward # update ars.update() # show reward smoothed_total_reward = ( smoothed_total_reward * 0.9 + all_reward / len(ars.get_actor_types()) * 0.1 ) logger.info( f"Process {rank} Episode {episode} total reward={smoothed_total_reward:.2f}" ) if smoothed_total_reward > solved_reward: reward_fulfilled += 1 if reward_fulfilled >= solved_repeat: logger.info("Environment solved!") # will cause torch RPC to complain # since other processes may have not finished yet. # just for demonstration. exit(0) else: reward_fulfilled = 0
def main(rank): env = gym.make("CartPole-v0") observe_dim = 4 action_num = 2 max_episodes = 2000 max_steps = 200 solved_reward = 190 solved_repeat = 5 # initlize distributed world first world = World(world_size=4, rank=rank, name=str(rank), rpc_timeout=20) servers = model_server_helper(model_num=1) apex_group = world.create_rpc_group("apex", ["0", "1", "2", "3"]) if rank in (2, 3): # learner_group.group is the wrapped torch.distributed.ProcessGroup learner_group = world.create_collective_group(ranks=[2, 3]) # wrap the model with DistributedDataParallel # if current process is learner process 2 or 3 q_net = DistributedDataParallel(module=QNet(observe_dim, action_num), process_group=learner_group.group) q_net_t = DistributedDataParallel(module=QNet(observe_dim, action_num), process_group=learner_group.group) else: q_net = QNet(observe_dim, action_num) q_net_t = QNet(observe_dim, action_num) # we may use a smaller batch size to train if we are using # DistributedDataParallel dqn_apex = DQNApex( q_net, q_net_t, t.optim.Adam, nn.MSELoss(reduction="sum"), apex_group, servers, batch_size=50, ) # synchronize all processes in the group, make sure # distributed buffer has been created on all processes in apex_group apex_group.barrier() # manually control syncing to improve performance dqn_apex.set_sync(False) if rank in (0, 1): # Process 0 and 1 are workers(samplers) # begin training episode, step, reward_fulfilled = 0, 0, 0 smoothed_total_reward = 0 while episode < max_episodes: # sleep to wait for learners keep up sleep(0.1) episode += 1 total_reward = 0 terminal = False step = 0 state = t.tensor(env.reset(), dtype=t.float32).view(1, observe_dim) # manually pull the newest parameters dqn_apex.manual_sync() while not terminal and step <= max_steps: step += 1 with t.no_grad(): old_state = state # agent model inference action = dqn_apex.act_discrete_with_noise( {"state": old_state}) state, reward, terminal, _ = env.step(action.item()) state = t.tensor(state, dtype=t.float32).view(1, observe_dim) total_reward += reward dqn_apex.store_transition({ "state": { "state": old_state }, "action": { "action": action }, "next_state": { "state": state }, "reward": reward, "terminal": terminal or step == max_steps, }) smoothed_total_reward = smoothed_total_reward * 0.9 + total_reward * 0.1 logger.info( f"Process {rank} Episode {episode} total reward={smoothed_total_reward:.2f}" ) if smoothed_total_reward > solved_reward: reward_fulfilled += 1 if reward_fulfilled >= solved_repeat: logger.info("Environment solved!") # will cause torch RPC to complain # since other processes may have not finished yet. # just for demonstration. exit(0) else: reward_fulfilled = 0 elif rank in (2, 3): # wait for enough samples while dqn_apex.replay_buffer.all_size() < 500: sleep(0.1) while True: dqn_apex.update()
def test_full_train(self, train_config, maddpg_train):
    c = train_config
    # begin training
    episode, step = Counter(), Counter()

    # first for prey, second for pred
    smoother = Smooth()
    reward_fulfilled = Counter()
    terminal = False
    env = c.env

    while episode < c.max_episodes:
        episode.count()
        # batch size = 1
        total_reward = 0
        states = [t.tensor(st, dtype=t.float32) for st in env.reset()]

        while not terminal and step <= c.max_steps:
            step.count()
            with t.no_grad():
                old_states = states
                # agent model inference
                results = maddpg_train.act_discrete_with_noise(
                    [{"state": st.unsqueeze(0)} for st in states]
                )
                actions = [int(r[0]) for r in results]
                action_probs = [r[1] for r in results]

                states, rewards, terminals, _ = env.step(actions)
                states = [t.tensor(st, dtype=t.float32) for st in states]
                total_reward += float(sum(rewards)) / c.agent_num

                maddpg_train.store_transitions(
                    [
                        {
                            "state": {"state": ost.unsqueeze(0)},
                            "action": {"action": act},
                            "next_state": {"state": st.unsqueeze(0)},
                            "reward": float(rew),
                            "terminal": term or step == c.max_steps,
                        }
                        for ost, act, st, rew, term in zip(
                            old_states, action_probs, states, rewards, terminals
                        )
                    ]
                )

        # update
        if episode > 5:
            for i in range(step.get()):
                maddpg_train.update()

        # total reward is divided by steps here, since:
        # "Agents are rewarded based on minimum agent distance
        #  to each landmark, penalized for collisions"
        smoother.update(total_reward / step.get())
        logger.info(f"Episode {episode} total steps={step}")
        step.reset()
        terminal = False

        logger.info(f"Episode {episode} total reward={smoother.value:.2f}")

        if smoother.value > c.solved_reward and episode > 20:
            reward_fulfilled.count()
            if reward_fulfilled >= c.solved_repeat:
                logger.info("Environment solved!")
                return
        else:
            reward_fulfilled.reset()

    pytest.fail("MADDPG Training failed.")
def __init__(
    self,
    models: List[nn.Module],
    model_connection: Dict[Tuple[int, int], int],
    devices: List[Union[t.device, str]] = None,
    model_size_multiplier=2,
    max_mem_ratio=0.5,
    cpu_weight=0,
    connection_weight=2,
    size_match_weight=1e-2,
    complexity_match_weight=1,
    entropy_weight=1,
    iterations=500,
    update_rate=0.01,
    gpu_gpu_distance=1,
    cpu_gpu_distance=10,
    move_models=True,
):
    """
    Assign models to different devices, within the scope of a single process.
    The assigner assumes that all GPUs have the **same processing power**.

    Assignment is based on four aspects:

    1. Distance and model connections. Connection is usually indicated
       by the amount of data transmitted between two models.
    2. Compute complexity.
    3. Model size.
    4. Entropy.

    The four aspects are controlled by four weights:

    1. ``connection_weight``: the assigner will try to reduce the total
       ``distance * connection`` if this weight is larger.
    2. ``size_match_weight``: this weight controls the total memory space
       used on a single device. It only takes effect if the total assigned
       memory of models exceeds the allowed device memory size (internally
       it uses a relu activation); the larger, the tighter and more
       restricted the fit.
    3. ``complexity_match_weight``: this weight balances the model
       computation cost across devices. The assigner will try to even the
       ``computation cost / compute power`` ratio for each device if this
       weight is larger.
    4. ``entropy_weight``: this weight minimizes the uncertainty of the
       model placement probability, so ``model i`` will have a probability
       close to 1 of being located on some ``device j`` if this weight is
       larger.

    Assignment uses gradient descent to compute the probability matrix of
    each ``model i`` being located on each available ``device j``.

    See Also:
        :class:`.ModelSizeEstimator`

    Note:
        When the sum of your model sizes is very close to the capacity of
        your device memory, ``ModelAssigner`` does not respond very well to
        the ``size_match_weight``; therefore, please consider increasing
        ``model_size_multiplier`` or decreasing ``max_mem_ratio``.

    Args:
        models: Models to assign.
        model_connection: Connection weight between modules.
            **Must be positive.**
        devices: Available devices.
        model_size_multiplier: Size multiplier of models, used to reserve
            enough space for models.
        max_mem_ratio: Maximum percent of memory allowed.
        cpu_weight: Weight of cpu, relative to the computing power of one
            GPU. By default it is 0, so no computation will be performed on
            the CPU. **Must be positive.**
        connection_weight: Weight of connection between models.
        size_match_weight: Weight of size match.
        complexity_match_weight: Weight of complexity match.
        entropy_weight: Weight of entropy.
        iterations: Number of optimization iterations.
        update_rate: Learning rate of the Adam optimizer.
        gpu_gpu_distance: Estimated distance cost between gpu-gpu.
            **Must be positive.**
        cpu_gpu_distance: Estimated distance cost between cpu-gpu.
            **Must be positive.**
        move_models: Whether to automatically move the models after
            assignment.
""" if devices is None: devices = [ t.device(type="cuda", index=i) for i in GPUtil.getAvailable(order="load") ] else: devices = [t.device(d) for d in devices] available_devices = [ t.device(type="cuda", index=i) for i in GPUtil.getAvailable(order="load") ] used_devices = [] for dev in devices: if dev.type == "cuda" and dev not in available_devices: default_logger.info( f"Warning: device {dev} not available, removed.") else: used_devices.append(dev) devices = used_devices if not devices: devices = [t.device("cpu")] default_logger.info(f"Using these devices: {devices}") sizes = [ ModelSizeEstimator(model, model_size_multiplier).estimate_size() for model in models ] device_size_capacity = [] device_complexity_capacity = [] gpus = GPUtil.getGPUs() for dev in devices: if dev.type == "cpu": device_size_capacity.append( int(psutil.virtual_memory().available / 1024**2) * max_mem_ratio) device_complexity_capacity.append(cpu_weight) elif dev.type == "cuda": device_size_capacity.append(gpus[dev.index].memoryFree * max_mem_ratio) device_complexity_capacity.append(1 - gpus[dev.index].load) if np.sum(np.array(sizes)) > np.sum(device_size_capacity): raise RuntimeError( f"Estimated model will use {np.sum(np.array(sizes)):.2f} MB, " f"but only have {np.sum(device_size_capacity):.2f} MB allowed memory " "in total.") # assign model to devices # using heuristic and gradient decent device_num = len(devices) model_num = len(models) # Important, the placement probability matrix! this matrix # describes the probability of placement of: # model i on device j placement = t.randn([model_num, device_num], requires_grad=True) optimizer = t.optim.Adam([placement], lr=update_rate) model_size = t.tensor(sizes, dtype=t.float).view([1, model_num]) size_capacity = t.tensor(device_size_capacity, dtype=t.float).view([1, device_num]) model_complexity = model_size # complexity_capacity is basically the estimated computing power # of devices. complexity_capacity = t.tensor(device_complexity_capacity, dtype=t.float).view([1, device_num]) # model connection indicates the amount of data transmitted between # each pair of models, a weighted adjacency matrix. model_conn = t.zeros([model_num, model_num]) for direction, conn in model_connection.items(): model_conn[direction[0], direction[1]] = conn # device distance matrix device_distance = t.zeros([device_num, device_num]) for i in range(device_num): for j in range(i): if (devices[i].type == "cpu" and devices[j].type == "cuda" or devices[i].type == "cuda" and devices[j].type == "cpu"): device_distance[i, j] = device_distance[j, i] = cpu_gpu_distance elif (devices[i].type == "cuda" and devices[j].type == "cuda" and devices[i].index != devices[j].index): device_distance[i, j] = device_distance[j, i] = gpu_gpu_distance # optimize for _ in range(iterations): self.optimize_placement( optimizer, placement, model_size, size_capacity, model_complexity, complexity_capacity, model_conn, device_distance, connection_weight, size_match_weight, complexity_match_weight, entropy_weight, ) self._assignment = [ devices[d] for d in t.argmax(placement, dim=1).tolist() ] if move_models: for model, ass_device in zip(models, self._assignment): model.to(ass_device)
def test_close(self, envs):
    for name, creators in zip(*envs):
        default_logger.info(f"Testing on env {name}")
        subproc_wrapper = openai_gym.ParallelWrapperSubProc(creators)
        subproc_wrapper.close()
        state, reward, terminal, _ = env.step(action.item())
        state = convert(state)
        total_reward += reward

        tmp_observations.append({
            "state": {"mem": old_state, "hidden": old_hidden},
            "action": {"action": action},
            "next_state": {"mem": state, "hidden": hidden},
            "reward": reward,
            "terminal": terminal,
        })

    # update
    rppo.store_episode(tmp_observations)
    rppo.update()

    # show reward
    smoothed_total_reward = (smoothed_total_reward * 0.9 + total_reward * 0.1)
    logger.info("Episode {} total reward={:.2f}".format(
        episode, smoothed_total_reward))
def fnTrain():
    episode, step, reward_fulfilled = 0, 0, 0
    smoothed_total_reward = 0
    iNumOfTrainSamples = env.fnNumIterations()
    afRewardArray = []
    fMaxRewardSum = -np.inf

    while episode < iNumOfTrainSamples:
        episode += 1
        total_reward = 0
        terminal = False
        step = 0
        state = t.tensor(env.reset(), dtype=t.float32).view(
            1, env.observation_spec().shape[0])

        while not terminal and step <= max_steps:
            step += 1
            with t.no_grad():
                old_state = state
                # agent model inference
                action = dqn.act_discrete_with_noise({"some_state": old_state})
                state, reward, terminal, oInfo = env.step(action.item())
                state = t.tensor(state, dtype=t.float32).view(
                    1, env.observation_spec().shape[0])
                total_reward += reward

                dqn.store_transition({
                    "state": {"some_state": old_state},
                    "action": {"action": action},
                    "next_state": {"some_state": state},
                    "reward": np.float32(reward),
                    "terminal": terminal or step == max_steps
                })

        # update, update more if episode is longer, else less
        if episode > 100:
            for _ in range(step):
                dqn.update()

        # show reward
        smoothed_total_reward = (smoothed_total_reward * 0.9 + total_reward * 0.1)
        logger.info("Episode {} of {} ({:.2f}%), total reward={:.2f}".format(
            episode, iNumOfTrainSamples,
            100.00 * episode / iNumOfTrainSamples, smoothed_total_reward))

        if solved_repeat <= len(afRewardArray):
            afRewardArray.pop(0)
        afRewardArray.append(smoothed_total_reward)

        fRewardSum = np.sum(afRewardArray)
        if fMaxRewardSum < fRewardSum:
            fMaxRewardSum = fRewardSum
            dqn.save(g_sModel1)
            print("Reward sum={}".format(fMaxRewardSum))
def main(rank): env = gym.make("CartPole-v0") observe_dim = 4 action_num = 2 max_episodes = 2000 max_steps = 200 solved_reward = 190 solved_repeat = 5 # initlize distributed world first _world = World(world_size=3, rank=rank, name=str(rank), rpc_timeout=20) actor = Actor(observe_dim, action_num) critic = Critic(observe_dim) # in all test scenarios, all processes will be used as reducers servers = grad_server_helper( [lambda: Actor(observe_dim, action_num), lambda: Critic(observe_dim)], learning_rate=5e-3 ) a3c = A3C(actor, critic, nn.MSELoss(reduction='sum'), servers) # manually control syncing to improve performance a3c.set_sync(False) # begin training episode, step, reward_fulfilled = 0, 0, 0 smoothed_total_reward = 0 while episode < max_episodes: episode += 1 total_reward = 0 terminal = False step = 0 state = t.tensor(env.reset(), dtype=t.float32).view(1, observe_dim) # manually pull the newest parameters a3c.manual_sync() tmp_observations = [] while not terminal and step <= max_steps: step += 1 with t.no_grad(): old_state = state # agent model inference action = a3c.act({"state": old_state})[0] state, reward, terminal, _ = env.step(action.item()) state = t.tensor(state, dtype=t.float32).view(1, observe_dim) total_reward += reward tmp_observations.append({ "state": {"state": old_state}, "action": {"action": action}, "next_state": {"state": state}, "reward": reward, "terminal": terminal or step == max_steps }) # update a3c.store_episode(tmp_observations) a3c.update() # show reward smoothed_total_reward = (smoothed_total_reward * 0.9 + total_reward * 0.1) logger.info("Process {} Episode {} total reward={:.2f}" .format(rank, episode, smoothed_total_reward)) if smoothed_total_reward > solved_reward: reward_fulfilled += 1 if reward_fulfilled >= solved_repeat: logger.info("Environment solved!") # will cause torch RPC to complain # since other processes may have not finished yet. # just for demonstration. exit(0) else: reward_fulfilled = 0
def test_full_train(self, train_config, ddpg_per_train):
    c = train_config
    # begin training
    episode, step = Counter(), Counter()
    reward_fulfilled = Counter()
    smoother = Smooth()
    terminal = False
    env = c.env

    while episode < c.max_episodes:
        episode.count()
        # batch size = 1
        total_reward = 0
        state = t.tensor(env.reset(), dtype=t.float32)

        while not terminal and step <= c.max_steps:
            step.count()
            with t.no_grad():
                old_state = state

                # agent model inference
                if episode.get() % c.noise_interval == 0:
                    action = ddpg_per_train.act_with_noise(
                        {"state": old_state.unsqueeze(0)},
                        noise_param=c.noise_param,
                        mode=c.noise_mode,
                    )
                else:
                    action = ddpg_per_train.act(
                        {"state": old_state.unsqueeze(0)}
                    ).clamp(-c.action_range, c.action_range)

                state, reward, terminal, _ = env.step(action.cpu().numpy())
                state = t.tensor(state, dtype=t.float32).flatten()
                total_reward += float(reward)

                ddpg_per_train.store_transition({
                    "state": {"state": old_state.unsqueeze(0)},
                    "action": {"action": action},
                    "next_state": {"state": state.unsqueeze(0)},
                    "reward": float(reward),
                    "terminal": terminal or step == c.max_steps,
                })

        # update
        if episode > 100:
            for i in range(step.get()):
                ddpg_per_train.update()

        smoother.update(total_reward)
        step.reset()
        terminal = False

        if episode.get() % c.noise_interval != 0:
            # only log the result without noise
            logger.info(f"Episode {episode} total reward={smoother.value:.2f}")

        if smoother.value > c.solved_reward:
            reward_fulfilled.count()
            if reward_fulfilled >= c.solved_repeat:
                logger.info("Environment solved!")
                return
        else:
            reward_fulfilled.reset()

    pytest.fail("DDPGPer Training failed.")
def main(rank): env = gym.make("CartPole-v0") observe_dim = 4 action_num = 2 max_episodes = 2000 max_steps = 200 solved_reward = 190 solved_repeat = 5 # initlize distributed world first world = World(world_size=4, rank=rank, name=str(rank), rpc_timeout=20) servers = model_server_helper(model_num=1) impala_group = world.create_rpc_group("impala", ["0", "1", "2", "3"]) if rank in (2, 3): # learner_group.group is the wrapped torch.distributed.ProcessGroup learner_group = world.create_collective_group(ranks=[2, 3]) # wrap the model with DistributedDataParallel # if current process is learner process 2 or 3 actor = DistributedDataParallel(module=Actor(observe_dim, action_num), process_group=learner_group.group) critic = DistributedDataParallel(module=Critic(observe_dim), process_group=learner_group.group) else: actor = Actor(observe_dim, action_num) critic = Critic(observe_dim) # we may use a smaller batch size to train if we are using # DistributedDataParallel # note: since the impala framework is storing a whole # episode as a single sample, we should wait for a smaller number impala = IMPALA( actor, critic, t.optim.Adam, nn.MSELoss(reduction="sum"), impala_group, servers, batch_size=2, ) # synchronize all processes in the group, make sure # distributed buffer has been created on all processes in apex_group impala_group.barrier() # manually control syncing to improve performance impala.set_sync(False) if rank in (0, 1): # Process 0 and 1 are workers(samplers) # begin training episode, step, reward_fulfilled = 0, 0, 0 smoothed_total_reward = 0 while episode < max_episodes: # sleep to wait for learners keep up sleep(0.1) episode += 1 total_reward = 0 terminal = False step = 0 state = t.tensor(env.reset(), dtype=t.float32).view(1, observe_dim) # manually pull the newest parameters impala.manual_sync() tmp_observations = [] while not terminal and step <= max_steps: step += 1 with t.no_grad(): old_state = state # agent model inference action, action_log_prob, *_ = impala.act( {"state": old_state}) state, reward, terminal, _ = env.step(action.item()) state = t.tensor(state, dtype=t.float32).view(1, observe_dim) total_reward += reward tmp_observations.append({ "state": { "state": old_state }, "action": { "action": action }, "next_state": { "state": state }, "reward": reward, "action_log_prob": action_log_prob.item(), "terminal": terminal or step == max_steps, }) impala.store_episode(tmp_observations) smoothed_total_reward = smoothed_total_reward * 0.9 + total_reward * 0.1 logger.info( f"Process {rank} Episode {episode} total reward={smoothed_total_reward:.2f}" ) if smoothed_total_reward > solved_reward: reward_fulfilled += 1 if reward_fulfilled >= solved_repeat: logger.info("Environment solved!") # will cause torch RPC to complain # since other processes may have not finished yet. # just for demonstration. exit(0) else: reward_fulfilled = 0 elif rank in (2, 3): # wait for enough samples # note: since the impala framework is storing a whole # episode as a single sample, we should wait for a smaller number while impala.replay_buffer.all_size() < 5: sleep(0.1) while True: impala.update()