Code Example #1
    # Tangent points exist only when d > intruder.radius (otherwise the sqrt is undefined).
    # if d > intruder.radius:
    cos_theta = np.sqrt(d**2 - intruder.radius**2) / d
    sin_theta = intruder.radius / d  # companion sine, so sin_theta**2 + cos_theta**2 == 1
    # Expand beta1 = alpha - theta and beta2 = alpha + theta with the angle sum/difference identities.
    sin_beta1 = sin_alpha * cos_theta - cos_alpha * sin_theta  # beta1 = alpha - theta
    cos_beta1 = cos_alpha * cos_theta + sin_alpha * sin_theta
    sin_beta2 = sin_alpha * cos_theta + cos_alpha * sin_theta  # beta2 = alpha + theta
    cos_beta2 = cos_alpha * cos_theta - sin_alpha * sin_theta
    # The two tangent points lie on the circle of radius R centred at (cx, cy).
    T1 = (cx + R * cos_beta1, cy + R * sin_beta1)
    T2 = (cx + R * cos_beta2, cy + R * sin_beta2)

    return T1, T2
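
The sin/cos pairs above are the standard angle sum and difference expansions of beta1 = alpha - theta and beta2 = alpha + theta, where theta is defined by cos_theta = sqrt(d**2 - r**2) / d and hence sin_theta = r / d. The snippet below is a quick self-check of those identities, not part of the original code; the values chosen for d, the intruder radius and alpha are purely illustrative.

import numpy as np

# Hypothetical stand-ins for the variables used in the fragment above.
d, radius = 12.0, 5.0        # distance to the intruder and its protected radius
alpha = np.deg2rad(30.0)     # bearing angle behind sin_alpha / cos_alpha

cos_theta = np.sqrt(d**2 - radius**2) / d
sin_theta = radius / d
assert np.isclose(sin_theta**2 + cos_theta**2, 1.0)

theta = np.arctan2(sin_theta, cos_theta)
sin_alpha, cos_alpha = np.sin(alpha), np.cos(alpha)
assert np.isclose(sin_alpha * cos_theta - cos_alpha * sin_theta, np.sin(alpha - theta))
assert np.isclose(cos_alpha * cos_theta + sin_alpha * sin_theta, np.cos(alpha - theta))
assert np.isclose(sin_alpha * cos_theta + cos_alpha * sin_theta, np.sin(alpha + theta))
assert np.isclose(cos_alpha * cos_theta - sin_alpha * sin_theta, np.cos(alpha + theta))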

if __name__ == "__main__":
    action_std = 0.5
    envs = Env.envs()  # pixels = 80*80
    envs_dest = envDest.envs()
    action_dim = envs.action_size
    agent_num = envs.num_agents
    Test = test()
    ppo = ActorCritic(action_dim, action_std).to(device)
    ppo.load_state_dict(torch.load('./PPO_continuous.pth', map_location=device))
    moving, count = Test.run(ppo)
    np.savetxt('test_moving.csv', moving)
    plt_path(moving)
    print("conflict number: {}".format(count))


def main():
    ################### Hyperparameters ##################
    solved_reward = -0.5
    log_interval = 20  # print avg reward in the interval
    max_episodes = 2000  # max training episodes
    max_timesteps = 200  # max timesteps in one episode
    update_timestep = 500  # update policy every n timesteps
    action_std = 0.5  # constant std for action distribution (Multivariate Normal)
    K_epochs = 10  # update policy for K epochs
    eps_clip = 0.2  # clip parameter for PPO
    gamma = 0.99  # discount factor

    lr = 0.0003  # learning rate for the Adam optimizer
    betas = (0.9, 0.999)  # Adam beta coefficients

    ######################################################

    envs = Env.envs()  # pixels = 80*80
    envs_dest = envDest.envs()

    action_dim = envs.action_size
    agent_num = envs.num_agents

    memory = Memory()
    ppo = PPO(action_dim, action_std, lr, betas, gamma, K_epochs, eps_clip)
    print(lr, betas)

    # logging variables
    running_reward = 0
    avg_length = 0
    time_step = 0

    record_reward = []

    # training loop
    for i_episode in range(1, max_episodes + 1):
        envs.reset()
        envs_dest.reset()
        # Take one initial step in both environments to obtain the first frames.
        frame, _, reward, _ = envs.step([2] * agent_num)
        _, frame_dest, _, _ = envs_dest.step([2] * agent_num)

        state = preprocess_batch([frame, frame_dest])
        for t in range(max_timesteps):
            time_step += 1
            # Running policy_old
            action, _ = ppo.select_action(state, memory)
            frame, _, reward, done = envs.step(action)
            _, frame_dest, _, _ = envs_dest.step(action)
            state = preprocess_batch([frame, frame_dest])

            # Saving reward and is_terminals:
            memory.rewards.append(reward)
            memory.is_terminals.append(done)

            # update if it's time
            if time_step % update_timestep == 0:
                ppo.update(memory)
                memory.clear_memory()
                time_step = 0
            running_reward += reward
            if done.any():
                break

        avg_length += t
        #plt.imshow(frame_dest[0,:,:,:])

        # save a checkpoint of the current policy every 500 episodes
        if i_episode % 500 == 0:
            torch.save(ppo.policy.state_dict(), './PPO_continuous.pth')

        if i_episode % log_interval == 0:
            avg_length = int(avg_length / log_interval)
            running_reward = (np.mean(running_reward) / log_interval)
            record_reward.append(running_reward)
            print('Episode {} \t Avg reward: {}'.format(
                i_episode, running_reward))

            # stop training if the average reward exceeds solved_reward
            if running_reward > solved_reward:
                print("########## Solved! ##########")
                torch.save(ppo.policy.state_dict(),
                           './PPO_continuous_solved.pth')
                break

            running_reward = 0
            avg_length = 0

    np.savetxt('data_no_action_penalty_2.csv', record_reward)
    Test = test()
    moving, count = Test.run(ppo)
    np.savetxt('test_moving.csv', moving)
    plt_path(moving)
    print("conflict number: {}".format(count))
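
preprocess_batch is likewise not shown in this excerpt. Below is a minimal sketch under the assumption that each environment returns a batch of 80x80 RGB frames with shape (num_agents, 80, 80, 3) and that the conflict view and the destination view are stacked as two grayscale channels of the state tensor; the shapes, the grayscale conversion and the `device` definition are all assumptions.

import numpy as np
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def preprocess_batch(frame_list):
    # frame_list = [frame, frame_dest]; each entry is assumed to have
    # shape (num_agents, 80, 80, 3) with RGB pixel values in [0, 255].
    processed = []
    for frames in frame_list:
        gray = np.asarray(frames, dtype=np.float32).mean(axis=-1) / 255.0  # (N, 80, 80)
        processed.append(gray)
    batch = np.stack(processed, axis=1)  # (N, 2, 80, 80): the two views become channels
    return torch.from_numpy(batch).float().to(device)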