def test_full_train(self, train_config, dqn_train):
    c = train_config

    # begin training
    episode, step = Counter(), Counter()
    reward_fulfilled = Counter()
    smoother = Smooth()
    terminal = False

    env = c.env
    while episode < c.max_episodes:
        episode.count()

        # batch size = 1
        total_reward = 0
        state = t.tensor(env.reset(), dtype=t.float32)

        while not terminal and step <= c.max_steps:
            step.count()
            with t.no_grad():
                old_state = state
                # agent model inference
                action = dqn_train.act_discrete_with_noise(
                    {"state": old_state.unsqueeze(0)}
                )
                state, reward, terminal, _ = env.step(action.item())
                state = t.tensor(state, dtype=t.float32).flatten()
                total_reward += float(reward)

                dqn_train.store_transition(
                    {
                        "state": {"state": old_state.unsqueeze(0)},
                        "action": {"action": action},
                        "next_state": {"state": state.unsqueeze(0)},
                        "reward": float(reward),
                        "terminal": terminal or step == c.max_steps,
                    }
                )

        # update
        if episode.get() > 100:
            for _ in range(step.get()):
                dqn_train.update()

        smoother.update(total_reward)
        step.reset()
        terminal = False

        logger.info(f"Episode {episode} total reward={smoother.value:.2f}")

        if smoother.value > c.solved_reward:
            reward_fulfilled.count()
            if reward_fulfilled >= c.solved_repeat:
                logger.info("Environment solved!")
                return
        else:
            reward_fulfilled.reset()

    pytest.fail("DQN Training failed.")
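
# The `Counter` and `Smooth` helpers used throughout these tests are defined
# elsewhere in the test suite.  The minimal sketch below is an assumption of
# the interface these tests rely on (count/get/reset, comparison against
# integers, and a smoothed running value); it is not the actual implementation.
class _CounterSketch:
    def __init__(self, start=0):
        self._value = start

    def count(self, delta=1):
        # advance the counter by `delta`
        self._value += delta

    def get(self):
        return self._value

    def reset(self):
        self._value = 0

    # the tests compare counters directly against plain integers
    def __lt__(self, other):
        return self._value < other

    def __le__(self, other):
        return self._value <= other

    def __gt__(self, other):
        return self._value > other

    def __ge__(self, other):
        return self._value >= other

    def __eq__(self, other):
        return self._value == other

    def __str__(self):
        return str(self._value)


class _SmoothSketch:
    # exponential moving average of episode rewards; the real smoothing
    # constant may differ from the assumed 0.9
    def __init__(self, momentum=0.9):
        self._momentum = momentum
        self.value = 0.0

    def update(self, new_value):
        self.value = self._momentum * self.value + (1 - self._momentum) * new_value
        return self.value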
def test_full_train(self, train_config, a2c_train, gae_lambda):
    c = train_config
    a2c_train.gae_lambda = gae_lambda

    # begin training
    episode, step = Counter(), Counter()
    reward_fulfilled = Counter()
    smoother = Smooth()
    terminal = False

    env = c.env
    env.seed(0)
    while episode < c.max_episodes:
        episode.count()

        # batch size = 1
        total_reward = 0
        state = t.tensor(env.reset(), dtype=t.float32)

        tmp_observations = []
        while not terminal and step <= c.max_steps:
            step.count()
            with t.no_grad():
                old_state = state
                # agent model inference
                action = a2c_train.act({"state": old_state.unsqueeze(0)})[0]
                state, reward, terminal, _ = env.step(action.item())
                state = t.tensor(state, dtype=t.float32).flatten()
                total_reward += float(reward)

                tmp_observations.append(
                    {
                        "state": {"state": old_state.unsqueeze(0)},
                        "action": {"action": action},
                        "next_state": {"state": state.unsqueeze(0)},
                        "reward": float(reward),
                        "terminal": terminal or step == c.max_steps,
                    }
                )

        # update
        a2c_train.store_episode(tmp_observations)
        a2c_train.update()

        smoother.update(total_reward)
        step.reset()
        terminal = False

        logger.info(f"Episode {episode} total reward={smoother.value:.2f}")

        if smoother.value > c.solved_reward:
            reward_fulfilled.count()
            if reward_fulfilled >= c.solved_repeat:
                logger.info("Environment solved!")
                return
        else:
            reward_fulfilled.reset()

    pytest.fail("A2C Training failed.")
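
# `gae_lambda` in the test above controls generalized advantage estimation
# (GAE, Schulman et al., 2016): lambda = 0 reduces to the one-step TD error,
# lambda = 1 to the Monte-Carlo advantage.  The standalone sketch below shows
# the standard GAE recursion for a single episode; it is illustrative only
# and is not machin's internal implementation.
def gae_sketch(rewards, values, last_value, terminal, gamma=0.99, gae_lambda=0.95):
    # rewards:    list of r_t for each step of the episode
    # values:     list of V(s_t) for each step of the episode
    # last_value: V(s_T) bootstrap for the state after the final step
    # terminal:   True if the episode really ended (no bootstrap in that case)
    advantages = [0.0] * len(rewards)
    next_value = 0.0 if terminal else last_value
    next_advantage = 0.0
    for i in reversed(range(len(rewards))):
        # TD error: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
        delta = rewards[i] + gamma * next_value - values[i]
        # A_t = delta_t + gamma * lambda * A_{t+1}
        next_advantage = delta + gamma * gae_lambda * next_advantage
        advantages[i] = next_advantage
        next_value = values[i]
    return advantages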
def test_full_train(rank):
    c = TestARS.c
    ars = TestARS.ars("cpu", t.float32)

    # begin training
    episode, step = Counter(), Counter()
    reward_fulfilled = Counter()
    smoother = Smooth()
    terminal = False

    env = c.env

    # for cpu usage viewing
    default_logger.info(f"{rank}, pid {os.getpid()}")
    while episode < c.max_episodes:
        episode.count()

        all_reward = 0
        for at in ars.get_actor_types():
            total_reward = 0

            # batch size = 1
            state = t.tensor(env.reset(), dtype=t.float32)
            while not terminal and step <= c.max_steps:
                step.count()
                with t.no_grad():
                    # agent model inference
                    action = ars.act({"state": state.unsqueeze(0)}, at)
                    state, reward, terminal, __ = env.step(action)
                    state = t.tensor(state, dtype=t.float32)
                    total_reward += float(reward)

            step.reset()
            terminal = False
            ars.store_reward(total_reward, at)
            all_reward += total_reward

        # update
        ars.update()
        smoother.update(all_reward / len(ars.get_actor_types()))
        default_logger.info(
            f"Process {rank} Episode {episode} total reward={smoother.value:.2f}"
        )

        if smoother.value > c.solved_reward:
            reward_fulfilled.count()
            if reward_fulfilled >= c.solved_repeat:
                default_logger.info("Environment solved!")
                raise SafeExit
        else:
            reward_fulfilled.reset()

    raise RuntimeError("ARS Training failed.")
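
# The loop over `ars.get_actor_types()` above runs one rollout per perturbed
# policy (a positive and a negative perturbation per sampled direction) and
# reports its reward through `store_reward`.  The sketch below shows the basic
# update rule of Augmented Random Search (Mania et al., 2018) that such
# rewards feed into; it is a simplified illustration, not machin's
# implementation (no top-b direction selection, no state normalization).
import torch as t


def ars_update_sketch(params, deltas, pos_rewards, neg_rewards, lr=0.02):
    # params:      flat parameter tensor of the current policy
    # deltas:      list of perturbation tensors, one per sampled direction
    #              (rollouts are run with params +/- nu * delta)
    # pos_rewards: rollout reward of the positively perturbed policy
    # neg_rewards: rollout reward of the negatively perturbed policy
    rewards = t.tensor(pos_rewards + neg_rewards)
    reward_std = rewards.std().clamp_min(1e-6)
    step = t.zeros_like(params)
    for delta, r_pos, r_neg in zip(deltas, pos_rewards, neg_rewards):
        step += (r_pos - r_neg) * delta
    # scale the aggregated step by the learning rate and reward deviation
    return params + lr / (len(deltas) * reward_std) * step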
def test_full_train(self, train_config, ddpg_per_train):
    c = train_config

    # begin training
    episode, step = Counter(), Counter()
    reward_fulfilled = Counter()
    smoother = Smooth()
    terminal = False

    env = c.env
    while episode < c.max_episodes:
        episode.count()

        # batch size = 1
        total_reward = 0
        state = t.tensor(env.reset(), dtype=t.float32)

        while not terminal and step <= c.max_steps:
            step.count()
            with t.no_grad():
                old_state = state
                # agent model inference
                if episode.get() % c.noise_interval == 0:
                    action = ddpg_per_train.act_with_noise(
                        {"state": old_state.unsqueeze(0)},
                        noise_param=c.noise_param,
                        mode=c.noise_mode,
                    )
                else:
                    action = ddpg_per_train.act(
                        {"state": old_state.unsqueeze(0)}
                    ).clamp(-c.action_range, c.action_range)

                state, reward, terminal, _ = env.step(action.cpu().numpy())
                state = t.tensor(state, dtype=t.float32).flatten()
                total_reward += float(reward)

                ddpg_per_train.store_transition(
                    {
                        "state": {"state": old_state.unsqueeze(0)},
                        "action": {"action": action},
                        "next_state": {"state": state.unsqueeze(0)},
                        "reward": float(reward),
                        "terminal": terminal or step == c.max_steps,
                    }
                )

        # update
        if episode > 100:
            for _ in range(step.get()):
                ddpg_per_train.update()

        smoother.update(total_reward)
        step.reset()
        terminal = False

        if episode.get() % c.noise_interval != 0:
            # only log result without noise
            logger.info(f"Episode {episode} total reward={smoother.value:.2f}")

        if smoother.value > c.solved_reward:
            reward_fulfilled.count()
            if reward_fulfilled >= c.solved_repeat:
                logger.info("Environment solved!")
                return
        else:
            reward_fulfilled.reset()

    pytest.fail("DDPGPer Training failed.")
def test_full_train(rank):
    training_group = get_world().create_rpc_group("training", ["0", "1", "2"])
    c = TestIMPALA.c
    impala = TestIMPALA.impala("cpu", t.float32)

    # perform manual syncing to decrease the number of rpc calls
    impala.set_sync(False)

    # begin training
    episode, step = Counter(), Counter()
    reward_fulfilled = Counter()
    smoother = Smooth()
    terminal = False

    env = c.env
    env.seed(rank)

    # make sure all things are initialized.
    training_group.barrier()

    # for cpu usage viewing
    default_logger.info(f"{rank}, pid {os.getpid()}")
    while episode < c.max_episodes:
        episode.count()

        if rank in (0, 1):
            # batch size = 1
            total_reward = 0
            state = t.tensor(env.reset(), dtype=t.float32)

            impala.manual_sync()
            tmp_observations = []
            while not terminal and step <= c.max_steps:
                step.count()
                with t.no_grad():
                    old_state = state
                    action, action_log_prob, *_ = impala.act(
                        {"state": old_state.unsqueeze(0)}
                    )
                    state, reward, terminal, _ = env.step(action.item())
                    state = t.tensor(state, dtype=t.float32).flatten()
                    total_reward += float(reward)

                    tmp_observations.append(
                        {
                            "state": {"state": old_state.unsqueeze(0)},
                            "action": {"action": action},
                            "next_state": {"state": state.unsqueeze(0)},
                            "reward": float(reward),
                            "action_log_prob": action_log_prob.item(),
                            "terminal": terminal or step == c.max_steps,
                        }
                    )

            impala.store_episode(tmp_observations)
            smoother.update(total_reward)
            step.reset()
            terminal = False

            default_logger.info(
                f"Process {rank} Episode {episode} total reward={smoother.value:.2f}"
            )

            if smoother.value > c.solved_reward:
                reward_fulfilled.count()
                if reward_fulfilled >= c.solved_repeat:
                    default_logger.info("Environment solved!")
                    try:
                        training_group.pair("solved", True)
                    except KeyError:
                        # already solved in another process
                        pass
            else:
                reward_fulfilled.reset()
        else:
            # wait for some samples
            if episode.get() > 200:
                for _ in range(100):
                    impala.update()
                default_logger.info("Updated 100 times.")

        training_group.barrier()
        if training_group.is_paired("solved"):
            return True

    raise RuntimeError("IMPALA Training failed.")
def test_full_train(rank):
    c = TestDDPGApex.c
    ddpg_apex = TestDDPGApex.ddpg_apex("cpu", t.float32, discrete=True)

    # perform manual syncing to decrease the number of rpc calls
    ddpg_apex.set_sync(False)

    # begin training
    episode, step = Counter(), Counter()
    avg_step = Smooth()
    reward_fulfilled = Counter()
    smoother = Smooth()
    terminal = False

    env = c.env
    world = get_world()
    all_group = world.create_rpc_group("all", ["0", "1", "2"])
    all_group.pair(f"{rank}_running", True)
    default_logger.info(f"{rank}, pid {os.getpid()}")
    if rank == 0:
        all_group.pair("episode", episode)

    if rank in (0, 1):
        while episode < c.max_episodes:
            # wait for trainer to keep up
            sleep(0.2)
            episode.count()

            # batch size = 1
            total_reward = 0
            state = t.tensor(env.reset(), dtype=t.float32)

            ddpg_apex.manual_sync()
            while not terminal and step <= c.max_steps:
                step.count()
                with t.no_grad():
                    old_state = state
                    action, probs = ddpg_apex.act_discrete_with_noise(
                        {"state": old_state.unsqueeze(0)}
                    )
                    state, reward, terminal, _ = env.step(action.cpu().item())
                    state = t.tensor(state, dtype=t.float32).flatten()
                    total_reward += float(reward)

                    ddpg_apex.store_transition(
                        {
                            "state": {"state": old_state.unsqueeze(0)},
                            "action": {"action": probs},
                            "next_state": {"state": state.unsqueeze(0)},
                            "reward": float(reward),
                            "terminal": terminal or step == c.max_steps,
                        }
                    )

            smoother.update(total_reward)
            avg_step.update(step.get())
            step.reset()
            terminal = False

            default_logger.info(
                f"Process {rank} Episode {episode} total reward={smoother.value:.2f}"
            )

            if smoother.value > c.solved_reward:
                reward_fulfilled.count()
                if reward_fulfilled >= c.solved_repeat:
                    default_logger.info("Environment solved!")

                    all_group.unpair(f"{rank}_running")
                    while all_group.is_paired("0_running") or all_group.is_paired(
                        "1_running"
                    ):
                        # wait for all workers to join
                        sleep(1)
                    # wait for trainer
                    sleep(5)
                    return True
            else:
                reward_fulfilled.reset()
    else:
        # wait for some samples
        while ddpg_apex.replay_buffer.all_size() < 500:
            sleep(0.1)
        while all_group.is_paired("0_running") or all_group.is_paired("1_running"):
            ddpg_apex.update()
            default_logger.info("Updated")
        return True

    raise RuntimeError("DDPG-Apex Training failed.")
def test_full_train(self, train_config, maddpg_train):
    c = train_config

    # begin training
    episode, step = Counter(), Counter()
    # first for prey, second for pred
    smoother = Smooth()
    reward_fulfilled = Counter()
    terminal = False

    env = c.env
    env.seed(0)
    while episode < c.max_episodes:
        episode.count()

        # batch size = 1
        total_reward = 0
        states = [
            t.tensor(st, dtype=t.float32).view(1, c.observe_dim) for st in env.reset()
        ]

        tmp_observations_list = [[] for _ in range(c.agent_num)]
        while not terminal and step <= c.max_steps:
            step.count()
            with t.no_grad():
                old_states = states

                # agent model inference
                results = maddpg_train.act_discrete_with_noise(
                    [{"state": st.view(1, c.observe_dim)} for st in states]
                )
                actions = [int(r[0]) for r in results]
                action_probs = [r[1] for r in results]

                states, rewards, terminals, _ = env.step(actions)
                states = [
                    t.tensor(st, dtype=t.float32).view(1, c.observe_dim)
                    for st in states
                ]
                total_reward += float(sum(rewards)) / c.agent_num

                for tmp_observations, ost, act, st, rew, term in zip(
                    tmp_observations_list,
                    old_states,
                    action_probs,
                    states,
                    rewards,
                    terminals,
                ):
                    tmp_observations.append(
                        {
                            "state": {"state": ost},
                            "action": {"action": act},
                            "next_state": {"state": st},
                            "reward": float(rew),
                            "terminal": term or step == c.max_steps,
                        }
                    )

        maddpg_train.store_episodes(tmp_observations_list)

        # update
        if episode > 5:
            for _ in range(step.get()):
                maddpg_train.update()

        # total reward is divided by steps here, since:
        # "Agents are rewarded based on minimum agent distance
        #  to each landmark, penalized for collisions"
        smoother.update(total_reward / step.get())
        logger.info(f"Episode {episode} total steps={step}")
        step.reset()
        terminal = False

        logger.info(f"Episode {episode} total reward={smoother.value:.2f}")

        if smoother.value > c.solved_reward and episode > 20:
            reward_fulfilled.count()
            if reward_fulfilled >= c.solved_repeat:
                logger.info("Environment solved!")
                return
        else:
            reward_fulfilled.reset()

    pytest.fail("MADDPG Training failed.")
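
# MADDPG trains one critic per agent on the *joint* observations and actions
# of all agents (centralized critics, decentralized actors).  The helper
# below only illustrates how such a joint critic input could be assembled
# from the per-agent `states` and `action_probs` lists built in the test
# above; it is a hypothetical sketch, not part of machin's MADDPG API.
import torch as t


def joint_critic_input_sketch(states, actions):
    # states:  list of per-agent observation tensors, each shaped [1, obs_dim]
    # actions: list of per-agent action tensors, each shaped [1, act_dim]
    # returns one [1, sum(obs_dims) + sum(act_dims)] tensor for a critic
    return t.cat([t.cat(states, dim=1), t.cat(actions, dim=1)], dim=1)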
def test_full_train(rank, gae_lambda):
    c = TestA3C.c
    a3c = TestA3C.a3c("cpu", t.float32)
    a3c.set_sync(False)

    # begin training
    episode, step = Counter(), Counter()
    reward_fulfilled = Counter()
    smoother = Smooth()
    terminal = False

    env = c.env

    # for cpu usage viewing
    default_logger.info(f"{rank}, pid {os.getpid()}")
    while episode < c.max_episodes:
        episode.count()

        # batch size = 1
        total_reward = 0
        state = t.tensor(env.reset(), dtype=t.float32)

        a3c.manual_sync()
        tmp_observations = []
        while not terminal and step <= c.max_steps:
            step.count()
            with t.no_grad():
                old_state = state
                # agent model inference
                action = a3c.act({"state": old_state.unsqueeze(0)})[0]
                state, reward, terminal, _ = env.step(action.item())
                state = t.tensor(state, dtype=t.float32).flatten()
                total_reward += float(reward)

                tmp_observations.append(
                    {
                        "state": {"state": old_state.unsqueeze(0)},
                        "action": {"action": action},
                        "next_state": {"state": state.unsqueeze(0)},
                        "reward": float(reward),
                        "terminal": terminal or step == c.max_steps,
                    }
                )

        # update
        a3c.store_episode(tmp_observations)
        a3c.update()

        smoother.update(total_reward)
        step.reset()
        terminal = False

        default_logger.info(
            f"Process {rank} Episode {episode} total reward={smoother.value:.2f}"
        )

        if smoother.value > c.solved_reward:
            reward_fulfilled.count()
            if reward_fulfilled >= c.solved_repeat:
                default_logger.info("Environment solved!")
                return True
        else:
            reward_fulfilled.reset()

    raise RuntimeError("A3C Training failed.")
def test_full_train(rank):
    training_group = get_world().create_rpc_group("training", ["0", "1", "2"])
    c = TestARS.c
    ars = TestARS.ars("cpu", t.float32)

    # begin training
    episode, step = Counter(), Counter()
    reward_fulfilled = Counter()
    smoother = Smooth()
    terminal = False

    env = c.env
    env.seed(rank)

    # for cpu usage viewing
    default_logger.info(f"{rank}, pid {os.getpid()}")

    # make sure all things are initialized.
    training_group.barrier()
    while episode < c.max_episodes:
        episode.count()

        all_reward = 0
        for at in ars.get_actor_types():
            total_reward = 0

            # batch size = 1
            state = t.tensor(env.reset(), dtype=t.float32)
            while not terminal and step <= c.max_steps:
                step.count()
                with t.no_grad():
                    # agent model inference
                    action = ars.act({"state": state.unsqueeze(0)}, at)
                    state, reward, terminal, __ = env.step(action)
                    state = t.tensor(state, dtype=t.float32)
                    total_reward += float(reward)

            step.reset()
            terminal = False
            ars.store_reward(total_reward, at)
            all_reward += total_reward

        # update
        ars.update()
        smoother.update(all_reward / len(ars.get_actor_types()))
        default_logger.info(
            f"Process {rank} Episode {episode} total reward={smoother.value:.2f}"
        )

        if smoother.value > c.solved_reward:
            reward_fulfilled.count()
            if reward_fulfilled >= c.solved_repeat:
                default_logger.info("Environment solved!")
                try:
                    training_group.pair("solved", True)
                except KeyError:
                    # already solved in another process
                    pass
        else:
            reward_fulfilled.reset()

        training_group.barrier()
        if training_group.is_paired("solved"):
            return True

    raise RuntimeError("ARS Training failed.")
def test_full_train(rank):
    c = TestIMPALA.c
    impala = TestIMPALA.impala("cpu", t.float32)

    # perform manual syncing to decrease the number of rpc calls
    impala.set_sync(False)

    # begin training
    episode, step = Counter(), Counter()
    reward_fulfilled = Counter()
    smoother = Smooth()
    terminal = False

    env = c.env
    world = get_world()
    all_group = world.create_rpc_group("all", ["0", "1", "2"])
    all_group.pair(f"{rank}_running", True)
    default_logger.info(f"{rank}, pid {os.getpid()}")
    if rank == 0:
        all_group.pair("episode", episode)

    if rank in (0, 1):
        while episode < c.max_episodes:
            # wait for trainer to keep up
            sleep(0.2)
            episode.count()

            # batch size = 1
            total_reward = 0
            state = t.tensor(env.reset(), dtype=t.float32)

            impala.manual_sync()
            tmp_observations = []
            while not terminal and step <= c.max_steps:
                step.count()
                with t.no_grad():
                    old_state = state
                    action, action_log_prob, *_ = impala.act(
                        {"state": old_state.unsqueeze(0)}
                    )
                    state, reward, terminal, _ = env.step(action.item())
                    state = t.tensor(state, dtype=t.float32).flatten()
                    total_reward += float(reward)

                    tmp_observations.append(
                        {
                            "state": {"state": old_state.unsqueeze(0)},
                            "action": {"action": action},
                            "next_state": {"state": state.unsqueeze(0)},
                            "reward": float(reward),
                            "action_log_prob": action_log_prob.item(),
                            "terminal": terminal or step == c.max_steps,
                        }
                    )

            impala.store_episode(tmp_observations)
            smoother.update(total_reward)
            step.reset()
            terminal = False

            default_logger.info(
                f"Process {rank} Episode {episode} total reward={smoother.value:.2f}"
            )

            if smoother.value > c.solved_reward:
                reward_fulfilled.count()
                if reward_fulfilled >= c.solved_repeat:
                    default_logger.info("Environment solved!")

                    all_group.unpair(f"{rank}_running")
                    while all_group.is_paired("0_running") or all_group.is_paired(
                        "1_running"
                    ):
                        # wait for all workers to join
                        sleep(1)
                    # wait for trainer
                    sleep(5)
                    return True
            else:
                reward_fulfilled.reset()
    else:
        # wait for some samples
        # Note: the number of entries in buffer means "episodes"
        # rather than steps here!
        while impala.replay_buffer.all_size() < 5:
            sleep(0.1)
        while all_group.is_paired("0_running") or all_group.is_paired("1_running"):
            impala.update()
            default_logger.info("Updated")
        return True

    raise RuntimeError("IMPALA Training failed.")