import os

import numpy as np
import torch
from gym.wrappers.time_limit import TimeLimit

# `fish` (the custom environment module), `Memory`, `PPO`, `device`, and
# `angle_normalize` are assumed to be imported/defined elsewhere in this file.


def main(k):
    path = './direction_BS_woNorm/150/{}'.format(k)
    if not os.path.exists(path):
        os.makedirs(path)

    ############## Hyperparameters ##############
    env_name = "fishEvasion-v0"   # environment name (used in checkpoint filenames)
    render = False                # render the environment during training if True
    # solved_reward = 100         # stop training if avg_reward > solved_reward
    log_interval = 27             # print avg reward once every log_interval episodes
    max_episodes = 10000          # max training episodes
    max_timesteps = 150           # max timesteps in one episode
    update_timestep = 4050        # update policy every n timesteps
    action_std = 0.5              # constant std for action distribution (Multivariate Normal)
    K_epochs = 80                 # update policy for K epochs
    eps_clip = 0.2                # clip parameter for PPO
    gamma = 0.99                  # discount factor
    lr = 0.0003                   # learning rate for the Adam optimizer
    betas = (0.9, 0.999)          # Adam momentum parameters
    random_seed = None
    #############################################

    # create the environment
    env = fish.FishEvasionEnv(dt=0.1)
    # set the length of an episode
    env = TimeLimit(env, max_episode_steps=max_timesteps)

    # get observation and action dimensions from the environment
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]

    if random_seed:
        print("Random Seed: {}".format(random_seed))
        torch.manual_seed(random_seed)
        env.seed(random_seed)
        np.random.seed(random_seed)

    memory = Memory()
    ppo = PPO(state_dim, action_dim, action_std, lr, betas, gamma, K_epochs, eps_clip)

    # ------------------------------------------------------------------
    # start training from an existing policy
    # ppo.policy_old.load_state_dict(torch.load('./direction_policy/PPO_{}_{:06d}.pth'.format(env_name, 4380), map_location=device))
    # ppo.policy.load_state_dict(torch.load('./direction_policy/PPO_{}_{:06d}.pth'.format(env_name, 4380), map_location=device))
    # ------------------------------------------------------------------

    # logging variables
    running_reward = 0
    avg_length = 0
    time_step = 0

    # training loop
    for i_episode in range(1, max_episodes + 1):
        # ------------------------------------------------------------------
        # set a specific distribution for beta
        # beta0 = angle_normalize(i_episode * 3, center=0)
        # print(beta0)
        # ------------------------------------------------------------------
        state = env.reset()
        for t in range(max_timesteps):
            time_step += 1
            # run the (old) policy
            action = ppo.select_action(state, memory)
            state, reward, done, _ = env.step(action)

            # store reward and terminal flag
            memory.rewards.append(reward)
            memory.is_terminals.append(done)

            # update the policy once enough timesteps have been collected
            if time_step % update_timestep == 0:
                ppo.update(memory)
                memory.clear_memory()
                time_step = 0

            running_reward += reward
            if render:
                env.render()
            # stop this episode if the environment signals done
            if done:
                break

        avg_length += t

        # ------------------------------------------------------------------
        # stop training if avg_reward > solved_reward
        # if running_reward > (log_interval * solved_reward):
        #     print("########## Solved! ##########")
        #     torch.save(ppo.policy.state_dict(),
        #                './PPO_continuous_forwardWoPos_solved_{}.pth'.format(env_name))
        #     break
        # ------------------------------------------------------------------

        # save the policy every 50 episodes
        if i_episode % 50 == 0:
            torch.save(ppo.policy.state_dict(),
                       path + '/PPO_{}_direction{:06d}.pth'.format(env_name, i_episode))

        # logging
        if i_episode % log_interval == 0:
            avg_length = int(avg_length / log_interval)
            running_reward = running_reward / log_interval
            print('Episode {} \t Avg length: {} \t Avg reward: {}'.format(
                i_episode, avg_length, running_reward))
            running_reward = 0
            avg_length = 0
def make_env(i):
    # build a time-limited Breakout environment seeded with worker index i
    env = gym.make("Breakout-v0")
    env = TimeLimit(env, max_episode_steps=20)
    env.seed(i)
    return env
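# Example usage (a sketch): build several independently seeded copies,
# e.g. for collecting rollouts in parallel workers:
#   envs = [make_env(i) for i in range(4)]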
replay_buffer_size = 1000000
plot = True
optimize_every_n_steps = 200
training_iterations = 200
evaluation_iterations = 10
discount = 0.99
polyak_average = 0.995  # the closer to one, the slower the target network updates
prioritized_replay = False
seed = 123
##############################################################
np.random.seed(seed)
torch.manual_seed(seed)

# Global variables:
environment = TimeLimit(gym.make(environment_name), max_episode_steps=200)
environment.seed(seed)
assert isinstance(environment.action_space, gym.spaces.Box)

observation_dimension = environment.observation_space.shape[0]
action_dimension = environment.action_space.shape[0]
action_lower_bound = environment.action_space.low[0]
action_higher_bound = environment.action_space.high[0]

policy = nn.Sequential(nn.Linear(observation_dimension, 15),
                       nn.ReLU(inplace=True),
                       nn.Linear(15, 15),
                       nn.ReLU(inplace=True),
                       nn.Linear(15, action_dimension),
                       nn.Tanh())

q = nn.Sequential(nn.Linear(observation_dimension + action_dimension, 15),
                  nn.ReLU(inplace=True),
                  nn.Linear(15, 15),
                  nn.ReLU(inplace=True),
                  nn.Linear(15, 15),
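# Note on polyak_average above: target networks are typically updated with
# a per-parameter exponential moving average. A sketch, assuming target
# copies q_target/policy_target are created later in this script:
#   with torch.no_grad():
#       for p_target, p in zip(q_target.parameters(), q.parameters()):
#           p_target.mul_(polyak_average).add_((1.0 - polyak_average) * p)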