    # if d > intruder.radius:
    cos_theta = np.sqrt(d**2 - intruder.radius**2) / d
    sin_beta1 = sin_alpha * cos_theta - cos_alpha * sin_theta  # beta1 = alpha - theta
    cos_beta1 = cos_alpha * cos_theta + sin_alpha * sin_theta
    sin_beta2 = sin_alpha * cos_theta + cos_alpha * sin_theta  # beta2 = alpha + theta
    cos_beta2 = cos_alpha * cos_theta - sin_alpha * sin_theta

    T1x = cx + R * cos_beta1
    T1y = cy + R * sin_beta1
    T2x = cx + R * cos_beta2
    T2y = cy + R * sin_beta2

    T1 = (T1x, T1y)
    T2 = (T2x, T2y)
    return T1, T2


if __name__ == "__main__":
    action_std = 0.5

    envs = Env.envs()            # pixels = 80*80
    envs_dest = envDest.envs()
    action_dim = envs.action_size
    agent_num = envs.num_agents

    Test = test()
    ppo = ActorCritic(action_dim, action_std).to(device)
    ppo.load_state_dict(torch.load('./PPO_continuous.pth'))

    moving, count = Test.run(ppo)
    np.savetxt('test_moving.csv', moving)
    plt_path(moving)
    print("conflict number: {}".format(count))
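# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original script): the code above builds
# the two tangent points of the intruder's circle by rotating a reference
# bearing alpha by +/- theta. The self-contained helper below reproduces the
# same construction with the angle measured at the circle centre, which makes
# the perpendicularity of the radius and the tangent line easy to check. The
# function names and the sample numbers are assumptions for illustration only.
import numpy as np


def _tangent_points_sketch(px, py, cx, cy, r):
    dx, dy = px - cx, py - cy            # circle centre -> ownship
    d = np.hypot(dx, dy)
    if d <= r:                           # no tangent line from inside the circle
        return None
    alpha = np.arctan2(dy, dx)           # bearing of the ownship from the centre
    phi = np.arccos(r / d)               # angle at the centre between C->P and C->T
    t1 = (cx + r * np.cos(alpha - phi), cy + r * np.sin(alpha - phi))
    t2 = (cx + r * np.cos(alpha + phi), cy + r * np.sin(alpha + phi))
    return t1, t2


def _check_tangent_sketch():
    # At a true tangent point T, the radius C->T is perpendicular to P->T,
    # so their dot product should be ~0 up to floating-point error.
    (t1x, t1y), _ = _tangent_points_sketch(px=10.0, py=0.0, cx=0.0, cy=0.0, r=3.0)
    radius = np.array([t1x, t1y])                # C->T (centre at the origin)
    tangent = np.array([t1x - 10.0, t1y - 0.0])  # P->T
    return float(np.dot(radius, tangent))        # expected: ~0.0
# ---------------------------------------------------------------------------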
def main():
    ################### Hyperparameters ##################
    solved_reward = -0.5         # stop training once avg reward exceeds this
    log_interval = 20            # print avg reward in the interval
    max_episodes = 2000          # max training episodes
    max_timesteps = 200          # max timesteps in one episode
    update_timestep = 500        # update policy every n timesteps
    action_std = 0.5             # constant std for action distribution (Multivariate Normal)
    K_epochs = 10                # update policy for K epochs
    eps_clip = 0.2               # clip parameter for PPO
    gamma = 0.99                 # discount factor
    # parameters for the Adam optimizer
    lr = 0.0003
    betas = (0.9, 0.999)
    ######################################################

    envs = Env.envs()            # pixels = 80*80
    envs_dest = envDest.envs()
    action_dim = envs.action_size
    agent_num = envs.num_agents

    memory = Memory()
    ppo = PPO(action_dim, action_std, lr, betas, gamma, K_epochs, eps_clip)
    print(lr, betas)

    # logging variables
    running_reward = 0
    avg_length = 0
    time_step = 0
    record_reward = []

    # training loop
    for i_episode in range(1, max_episodes + 1):
        envs.reset()
        envs_dest.reset()
        frame, _, reward, _ = envs.step([2] * agent_num)
        _, frame_dest, _, _ = envs_dest.step([2] * agent_num)
        state = preprocess_batch([frame, frame_dest])

        for t in range(max_timesteps):
            time_step += 1

            # running policy_old to sample the next action
            action, _ = ppo.select_action(state, memory)
            frame, frame_dest, reward, done = envs.step(action)
            _, frame_dest, _, _ = envs_dest.step(action)
            state = preprocess_batch([frame, frame_dest])

            # saving reward and is_terminals
            memory.rewards.append(reward)
            memory.is_terminals.append(done)

            # update the policy if it is time
            if time_step % update_timestep == 0:
                ppo.update(memory)
                memory.clear_memory()
                time_step = 0

            running_reward += reward
            if done.any():
                break

        avg_length += t
        # plt.imshow(frame_dest[0, :, :, :])

        if i_episode % 500 == 0:
            torch.save(ppo.policy.state_dict(), './PPO_continuous.pth')

        if i_episode % log_interval == 0:
            avg_length = int(avg_length / log_interval)
            running_reward = np.mean(running_reward) / log_interval
            record_reward.append(running_reward)
            print('Episode {} \t Avg reward: {}'.format(i_episode, running_reward))

            # stop training if avg_reward > solved_reward
            if running_reward > solved_reward:
                print("########## Solved! ##########")
                torch.save(ppo.policy.state_dict(), './PPO_continuous_solved.pth')
                break

            running_reward = 0
            avg_length = 0

    np.savetxt('data_no_action_penalty_2.csv', record_reward)

    Test = test()
    moving = Test.run(ppo)
    np.savetxt('test_moving.csv', moving)
    plt_path(moving)
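# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original script): main() appends one
# averaged reward per logging interval to record_reward and dumps it with
# np.savetxt('data_no_action_penalty_2.csv', ...). A minimal way to turn that
# file into a learning-curve plot is sketched below; the function name and the
# output filename are assumptions for illustration only.
def _plot_learning_curve(csv_path='data_no_action_penalty_2.csv',
                         log_interval=20,
                         out_path='learning_curve.png'):
    import numpy as np
    import matplotlib.pyplot as plt

    rewards = np.loadtxt(csv_path)                            # one value per logging interval
    episodes = np.arange(1, len(rewards) + 1) * log_interval  # episode index of each point
    plt.figure()
    plt.plot(episodes, rewards)
    plt.xlabel('episode')
    plt.ylabel('average reward per interval')
    plt.title('PPO training reward')
    plt.savefig(out_path)
    plt.close()
# ---------------------------------------------------------------------------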