import gym
import numpy as np
import matplotlib.pyplot as plt

# NormalizedEnv and the Action agent class are defined elsewhere in this repo.


def test_process(config, steps, target_actor):
    env = NormalizedEnv(gym.make('Pendulum-v0'))
    agent = Action(state_dim=env.observation_space.shape[0],
                   action_dim=env.action_space.shape[0])
    reward_list = []
    try:
        while True:
            # Every test_every_eposide steps, reload the latest target actor
            # weights and run evaluation episodes without exploration noise.
            if steps.value != 0 and steps.value % config.test_every_eposide == 0:
                agent.load_param(target_actor)
                print("test agent loaded params")
                et_reward = 0
                for index in range(config.num_eposide_test):
                    eposide = 0
                    state = env.reset()
                    # Min-max normalize the observation into [0, 1].
                    state = (state - env.observation_space.low) / (
                        env.observation_space.high - env.observation_space.low)
                    while True:
                        action = agent.chose_action(state, explort=False)
                        next_state, reward, done, _ = env.step(action)
                        env.render()
                        next_state = (next_state - env.observation_space.low) / (
                            env.observation_space.high - env.observation_space.low)
                        eposide += reward
                        state = next_state
                        if done:
                            break
                    et_reward += eposide
                avg_reward = et_reward / config.num_eposide_test
                print("\033[93m [ test ] episode average reward : {}\033[00m".format(avg_reward))
                reward_list.append(avg_reward)
                # Plot the running evaluation curve.
                x = np.arange(len(reward_list))
                y = np.array(reward_list)
                plt.plot(x, y)
                plt.savefig("./eposide_reward.png")
    except Exception as e:
        print(e)
        print("test process exit")
        env.close()
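# The NormalizedEnv wrapper used above is not defined in this file. A minimal
# sketch is given below, assuming it is a gym.ActionWrapper that rescales the
# policy's [-1, 1] output into the environment's true action range (the usual
# convention for DDPG on Pendulum-v0); the repo's actual wrapper may differ.
import gym


class NormalizedEnv(gym.ActionWrapper):
    """Rescale actions from [-1, 1] to [action_space.low, action_space.high]."""

    def action(self, action):
        low = self.action_space.low
        high = self.action_space.high
        # Affine map: -1 -> low, +1 -> high.
        return low + (action + 1.0) * 0.5 * (high - low)

    def reverse_action(self, action):
        low = self.action_space.low
        high = self.action_space.high
        # Inverse map: low -> -1, high -> +1.
        return 2.0 * (action - low) / (high - low) - 1.0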
import gym
import numpy as np
import torch
import matplotlib.pyplot as plt

# NormalizedEnv, Actor and to_tensor come from this repo's own modules.


def main():
    env = NormalizedEnv(gym.make('Pendulum-v0'))
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]

    # Load a trained actor checkpoint and run it greedily for 100 episodes.
    agent = Actor(state_dim, action_dim).to('cuda')
    agent.load_state_dict(torch.load('./Models/78.0_actor.pt'))

    eposide = 0
    done = False
    eposide_list = []
    while eposide < 100:
        eposide_reward = 0
        state = env.reset()
        # Min-max normalize the observation into [0, 1], matching training.
        state = (state - env.observation_space.low) / (
            env.observation_space.high - env.observation_space.low)
        state = to_tensor(state)
        while not done:
            action = agent(state).detach().cpu().numpy()
            state_, reward, done, _ = env.step(action)
            state_ = (state_ - env.observation_space.low) / (
                env.observation_space.high - env.observation_space.low)
            env.render()
            state = to_tensor(state_)
            eposide_reward += reward
        eposide_list.append(eposide_reward)
        eposide += 1
        done = False
        print("{} : {}".format(eposide, eposide_reward))

    # Save the per-episode reward curve.
    x = np.arange(100)
    y = np.array(eposide_list)
    plt.plot(x, y)
    plt.savefig("./test_eposide_reward.png")
    env.close()
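# to_tensor() is referenced above but not defined here. A plausible minimal
# version is sketched below, assuming it converts a NumPy observation into a
# float32 torch tensor on the same device as the actor ('cuda' in main()).
# The repo's own helper may add a batch dimension or pick the device differently.
import numpy as np
import torch


def to_tensor(array, device='cuda'):
    """Convert a NumPy array to a float32 tensor on the given device."""
    return torch.from_numpy(np.asarray(array, dtype=np.float32)).to(device)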
import numpy as np

# make_env, NormalizedEnv, Predators, Preyer, split_obs, merge_action and
# SummaryWriter (tensorboardX or torch.utils.tensorboard) are imported from
# elsewhere in this repo.


def main(args):
    env = make_env('simple_tag')
    env = NormalizedEnv(env)

    kwargs = dict()
    kwargs['config'] = args
    # Three predators with 16-dim observations, one prey with a 14-dim observation.
    predator_model = Predators(16, 2, num_agent=3, **kwargs)
    preyer_model = Preyer(14, 2, **kwargs)

    if args.tensorboard:
        writer = SummaryWriter(log_dir='runs/' + args.log_dir)

    episode = 0
    total_step = 0
    while episode < args.max_episodes:
        state = env.reset()
        episode += 1
        step = 0
        predator_accum_reward = []
        preyer_accum_reward = 0
        while True:
            # Predators act from the learned policy; the prey acts randomly.
            state_predator, state_prayer = split_obs(state)
            predator_model.prep_eval()
            action_predator = predator_model.choose_action(state_predator)
            action_prayer = preyer_model.random_action()
            # action_prayer = preyer_model.choose_action(state_prayer)
            action = merge_action(action_predator, action_prayer)

            next_state, reward, done, info = env.step(action)
            step += 1
            total_step += 1
            predator_accum_reward.append(np.mean(reward[:3]))
            preyer_accum_reward = reward[3]

            # Force termination once the episode length limit is exceeded.
            if step > args.episode_length:
                done = [True, True, True, True]

            if args.render and (episode % 10 == 1):
                env.render(mode='rgb_array')

            # Only the predators' transitions are stored; the prey is not trained here.
            predator_model.memory(state[:3], action[:3], reward[:3],
                                  next_state[:3], done[:3])
            # preyer_model.memory(state[3], action[3], reward[3], next_state[3], done[3])

            # Update the predator networks every steps_per_update environment
            # steps once the replay buffer holds at least one batch.
            if (len(predator_model.replay_buffer) >= args.batch_size
                    and total_step % args.steps_per_update == 0):
                predator_model.prep_train()
                predator_model.train()
                # preyer_model.train()

            if True in done:
                # End of episode: log rewards and losses, then reset both models.
                predator_c_loss, predator_a_loss = predator_model.getLoss()
                preyer_c_loss, preyer_a_loss = preyer_model.getLoss()
                print("[Episode %05d] reward_predator %3.1f reward_preyer %3.1f "
                      "predator_c_loss %3.1f predator_a_loss %3.1f "
                      "preyer_c_loss %3.1f preyer_a_loss %3.1f" %
                      (episode, np.mean(predator_accum_reward).item(), preyer_accum_reward,
                       predator_c_loss, predator_a_loss, preyer_c_loss, preyer_a_loss))

                if args.tensorboard:
                    # writer.add_scalar(tag='debug/memory_length', global_step=episode, scalar_value=len(predator_model.replay_buffer))
                    # writer.add_scalar(tag='debug/predator_epsilon', global_step=episode, scalar_value=predator_model.epsilon)
                    # writer.add_scalar(tag='debug/preyer_epsilon', global_step=episode, scalar_value=preyer_model.epsilon)
                    writer.add_scalar(tag='agent/reward_predator',
                                      global_step=episode,
                                      scalar_value=np.mean(predator_accum_reward).item())
                    # writer.add_scalar(tag='perf/reward_preyer', global_step=episode, scalar_value=preyer_accum_reward)
                    if predator_c_loss and predator_a_loss:
                        writer.add_scalars('agent/predator_loss',
                                           global_step=episode,
                                           tag_scalar_dict={'actor': -predator_a_loss,
                                                            'critic': predator_c_loss})
                    # writer.add_scalar(tag='loss/preyer_c_loss', global_step=episode, scalar_value=preyer_c_loss)
                    # writer.add_scalar(tag='loss/preyer_a_loss', global_step=episode, scalar_value=preyer_a_loss)

                predator_model.reset()
                preyer_model.reset()
                break

            state = next_state

    if args.tensorboard:
        writer.close()
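# split_obs() and merge_action() are assumed by main() but not shown here.
# Minimal sketches are given below under the usual simple_tag layout (agents
# 0-2 are the predators, agent 3 is the prey, observations arrive as a list of
# per-agent arrays); the repo's real helpers may stack or batch differently.


def split_obs(obs):
    """Split the per-agent observation list into predator and prey parts."""
    # obs is a list of 4 arrays: three 16-dim predator obs, one 14-dim prey obs.
    return obs[:3], obs[3]


def merge_action(action_predator, action_preyer):
    """Recombine per-group actions into the ordering env.step() expects."""
    return list(action_predator) + [action_preyer]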