Example #1
import os
import numpy as np
import torch
import torch.nn as nn
import gym
from gym.wrappers.time_limit import TimeLimit

# The PPO agent, its rollout Memory, and the custom `fish` environment module
# are assumed to be imported/defined elsewhere in this project.

def main(k):
    path = './direction_BS_woNorm/150/{}'.format(k)
    if not os.path.exists(path):
        os.makedirs(path)
    ############## Hyperparameters ##############
    env_name = "fishEvasion-v0" # environment name (used here only in checkpoint file names)
    render = False              # render the environment in training if true
    # solved_reward = 100         # stop training if avg_reward > solved_reward
    log_interval = 27           # print avg reward every log_interval episodes
    max_episodes = 10000        # max training episodes
    max_timesteps = 150         # max timesteps in one episode
    
    update_timestep = 4050      # update policy every n env steps (= 27 episodes x 150 steps)
    action_std = 0.5            # constant std for action distribution (Multivariate Normal)
    K_epochs = 80               # update policy for K epochs
    eps_clip = 0.2              # clip parameter for PPO
    gamma = 0.99                # discount factor
    
    lr = 0.0003                 # learning rate for the Adam optimizer
    betas = (0.9, 0.999)        # Adam beta coefficients
    
    random_seed = None
    #############################################
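    # Illustration (assumed, matching the action_std comment above): actions are
    # sampled from a multivariate normal centred on the policy mean with a fixed
    # diagonal covariance, e.g.
    #   cov = torch.diag_embed(torch.full_like(mean, action_std ** 2))
    #   dist = torch.distributions.MultivariateNormal(mean, cov)
    #   action = dist.sample()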
    
    # creating environment
    env = fish.FishEvasionEnv(dt = 0.1)

    # set the length of an episode
    env = TimeLimit(env, max_episode_steps=max_timesteps)
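    # Note: TimeLimit returns done=True once max_episode_steps steps have elapsed,
    # so no episode runs longer than max_timesteps.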

    # get observation and action dimensions from the environment
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]

    if random_seed:
        print("Random Seed: {}".format(random_seed))
        torch.manual_seed(random_seed)
        env.seed(random_seed)
        np.random.seed(random_seed)
    
    memory = Memory()
    ppo = PPO(state_dim, action_dim, action_std, lr, betas, gamma, K_epochs, eps_clip)
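    # Memory is assumed to buffer the states, actions, log-probabilities, rewards
    # and terminal flags collected under policy_old; ppo.update() runs K_epochs of
    # clipped-surrogate optimisation on that buffer before it is cleared below.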
    # ------------------------------------------------------------------
    # start training from an existing policy    
    # ppo.policy_old.load_state_dict(torch.load('./direction_policy/PPO_{}_{:06d}.pth'.format(env_name,4380),map_location=device))
    # ppo.policy.load_state_dict(torch.load('./direction_policy/PPO_{}_{:06d}.pth'.format(env_name,4380),map_location=device))
    # ------------------------------------------------------------------
    
    # logging variables
    running_reward = 0
    avg_length = 0
    time_step = 0

    # training loop
    for i_episode in range(1, max_episodes+1):
        # ------------------------------------------------------------------
        # set a specific distribution for beta 
        # beta0 = angle_normalize(i_episode*3,center = 0)
        # print(beta0)
        # ------------------------------------------------------------------
        state = env.reset()
        for t in range(max_timesteps):
            time_step +=1
            # Running policy_old:
            action = ppo.select_action(state, memory)
            state, reward, done, _ = env.step(action)

            # Storing reward and is_terminals:
            memory.rewards.append(reward)
            memory.is_terminals.append(done)
            
            # update if it is time
            # ------------------------------------------------------------------
            if time_step % update_timestep == 0:
                ppo.update(memory)
                memory.clear_memory()
                time_step = 0
            # ------------------------------------------------------------------
            running_reward += reward
            if render:
                env.render()
            # break if episode ends
            if done:
                break
        avg_length += t

        # ------------------------------------------------------------------
        # stop training if avg_reward > solved_reward
        # if running_reward > (log_interval*solved_reward):
        #     print("########## Solved! ##########")
        #     torch.save(ppo.policy.state_dict(), './PPO_continuous_forwardWoPos_solved_{}.pth'.format(env_name))
        #     break
        # ------------------------------------------------------------------
    
        # save every 50 episodes
        if i_episode % 50 == 0:
            torch.save(ppo.policy.state_dict(), path+'/PPO_{}_direction{:06d}.pth'.format(env_name,i_episode)) 

        # ------------------------------------------------------------------
        # logging
        if i_episode % log_interval == 0:
            avg_length = int(avg_length/log_interval)
            running_reward = running_reward / log_interval
            print('Episode {} \t Avg length: {} \t Avg reward: {}'.format(i_episode, avg_length, running_reward))
            running_reward = 0
            avg_length = 0
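
# Hypothetical entry point (the excerpt does not show how main() is invoked);
# the argument k only selects the output sub-directory under ./direction_BS_woNorm/150/:
#   if __name__ == '__main__':
#       main(0)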
def make_env(i):
    env = gym.make("Breakout-v0")
    env = TimeLimit(env, max_episode_steps=20)
    env.seed(i)
    return env
replay_buffer_size = 1000000
plot = True
optimize_every_n_steps = 200
training_iterations = 200
evaluation_iterations = 10
discount = 0.99
polyak_average = 0.995  # the closer to one, the slower the target network updates
prioritized_replay = False
seed = 123
##############################################################
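# Illustration (assumed usage; the target-update code is not part of this excerpt):
# a polyak/soft update blends target-network weights toward the online network
# after each optimisation step:
#   with torch.no_grad():
#       for p_target, p in zip(target_net.parameters(), q.parameters()):
#           p_target.mul_(polyak_average).add_((1.0 - polyak_average) * p)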
np.random.seed(seed)
torch.manual_seed(seed)

# Global variables:
environment_name = "Pendulum-v0"  # assumed placeholder (not defined in this excerpt); any continuous-action (Box) env works
environment = TimeLimit(gym.make(environment_name), max_episode_steps=200)
environment.seed(seed)
assert isinstance(environment.action_space, gym.spaces.Box)

observation_dimension = environment.observation_space.shape[0]
action_dimension = environment.action_space.shape[0]
action_lower_bound = environment.action_space.low[0]
action_higher_bound = environment.action_space.high[0]

policy = nn.Sequential(nn.Linear(observation_dimension, 15),
                       nn.ReLU(inplace=True), nn.Linear(15, 15),
                       nn.ReLU(inplace=True), nn.Linear(15, action_dimension),
                       nn.Tanh())
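
# The final Tanh bounds the policy output to [-1, 1]; it presumably has to be
# rescaled to [action_lower_bound, action_higher_bound] when actions are applied:
#   action = action_lower_bound + 0.5 * (tanh_out + 1.0) * (action_higher_bound - action_lower_bound)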

q = nn.Sequential(nn.Linear(observation_dimension + action_dimension, 15),
                  nn.ReLU(inplace=True), nn.Linear(15, 15),
                  nn.ReLU(inplace=True), nn.Linear(15, 15),