def train(env_name, start_episodes, num_episodes, gamma, tau, noise_std,
          batch_size, eval_freq, seed):
    """
    Main training loop.

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        start_episodes: how many episodes the purely random policy is run for
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor
        tau: target network update rate
        noise_std: std of the Gaussian exploration noise
        batch_size: number of episodes per policy training batch
        eval_freq: number of training batches between evaluations
        seed: random seed for all modules with randomness
    """
    # set seeds
    set_global_seed(seed)

    # configure log
    configure_log_info(env_name, seed)

    # create env
    env = gym.make(env_name)
    env.seed(seed)  # set env seed
    obs_dim = env.observation_space.shape[0]
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    act_dim = env.action_space.shape[0]

    # create actor and target actor
    actor = Actor(obs_dim, act_dim, float(env.action_space.high[0])).to(device)
    target_actor = Actor(obs_dim, act_dim, float(env.action_space.high[0])).to(device)

    # create critic and target critic
    critic = Critic(obs_dim, act_dim).to(device)
    target_critic = Critic(obs_dim, act_dim).to(device)

    # create DDPG agent wrapping the networks above, then copy the online
    # weights into the target networks
    agent = DDPG(actor, critic, target_actor, target_critic, noise_std, gamma, tau)
    agent.align_target()

    # create replay buffer
    replay_buffer = ReplayBuffer()

    # run a few episodes of the untrained policy to initialize the scaler
    # and fill the replay buffer
    run_policy(env, agent, replay_buffer, mode="random", episodes=start_episodes)

    num_iteration = num_episodes // eval_freq
    current_episodes = 0
    current_steps = 0
    for iteration in range(num_iteration):
        # train models
        for _ in range(eval_freq):
            # sample transitions
            train_returns, total_steps = run_policy(env, agent, replay_buffer,
                                                    mode="train", episodes=batch_size)
            current_episodes += batch_size
            current_steps += total_steps
            logger.info('[train] average return: {0}, std return: {1}'.format(
                np.mean(train_returns), np.std(train_returns)))

            # train: one agent update per `batch_size` environment steps collected
            num_epoch = total_steps // batch_size
            for _ in range(num_epoch):
                observation, action, reward, next_obs, done = replay_buffer.sample()
                agent.update(observation, action, reward, next_obs, done)

        # test models
        num_test_episodes = 10
        returns, _ = run_policy(env, agent, replay_buffer, mode="test",
                                episodes=num_test_episodes)
        avg_return = np.mean(returns)
        std_return = np.std(returns)
        logger.record_tabular('iteration', iteration)
        logger.record_tabular('episodes', current_episodes)
        logger.record_tabular('steps', current_steps)
        logger.record_tabular('avg_return', avg_return)
        logger.record_tabular('std_return', std_return)
        logger.dump_tabular()
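# ---------------------------------------------------------------------------
# Sketch (assumption): the DDPG class used above is not shown in this section.
# The minimal PyTorch sketch below illustrates what agent.align_target() and
# agent.update(...) are assumed to do: a one-step TD critic loss, the
# deterministic policy-gradient actor loss, and the soft target update
# controlled by tau. The class name DDPGSketch and the lr argument are
# hypothetical; batches are assumed to be torch tensors already on `device`.
# ---------------------------------------------------------------------------
import torch
import torch.nn.functional as F


class DDPGSketch:
    def __init__(self, actor, critic, target_actor, target_critic, gamma, tau, lr=1e-3):
        self.actor, self.critic = actor, critic
        self.target_actor, self.target_critic = target_actor, target_critic
        self.gamma, self.tau = gamma, tau
        self.actor_opt = torch.optim.Adam(actor.parameters(), lr=lr)
        self.critic_opt = torch.optim.Adam(critic.parameters(), lr=lr)

    def align_target(self):
        # hard copy: target networks start identical to the online networks
        self.target_actor.load_state_dict(self.actor.state_dict())
        self.target_critic.load_state_dict(self.critic.state_dict())

    def update(self, obs, act, rew, next_obs, done):
        # critic: regress Q(s, a) toward r + gamma * Q'(s', pi'(s')) for non-terminal s'
        with torch.no_grad():
            target_q = rew + self.gamma * (1 - done) * self.target_critic(
                next_obs, self.target_actor(next_obs))
        critic_loss = F.mse_loss(self.critic(obs, act), target_q)
        self.critic_opt.zero_grad()
        critic_loss.backward()
        self.critic_opt.step()

        # actor: maximize Q(s, pi(s)) by minimizing its negation
        actor_loss = -self.critic(obs, self.actor(obs)).mean()
        self.actor_opt.zero_grad()
        actor_loss.backward()
        self.actor_opt.step()

        # soft target update: theta_target <- tau * theta + (1 - tau) * theta_target
        for net, target in ((self.actor, self.target_actor),
                            (self.critic, self.target_critic)):
            for p, tp in zip(net.parameters(), target.parameters()):
                tp.data.copy_(self.tau * p.data + (1 - self.tau) * tp.data)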
def train():
    # note: `env` and `max_episode` are expected to be defined at module level

    # save trained model under the preTrained directory
    directory = "./preTrained"
    filename = "ddpg"

    # set epsilon exploration rate and decay rate
    epsilon = 0.2
    eps_min = 1e-3
    eps_decay = 2e-3
    gaussian_exploration_noise = 0.2

    # set learning rate and batch size
    lr = 1e-3
    batch_size = 128

    # initialize replay memory
    replay_buffer = ReplayBuffer(max_size=int(5e4))

    # rewards for each episode (kept for plotting)
    rewards = np.zeros(max_episode)

    # initialize DDPG agent; the state dimension must match the raw observation
    # that is fed to select_action() below
    agent = DDPG(
        state_dim=env.observation_space.shape[0],
        action_dim=env.action_space.shape[0],
        action_bounds=env.action_space.high[0],
        lr=lr)

    for epoch in range(max_episode):
        # reset environment
        state = env.reset()
        done = False

        # decay epsilon, but never below eps_min
        epsilon = max(eps_min, epsilon - eps_decay)

        while not done:
            # epsilon-greedy: either the deterministic policy plus Gaussian noise,
            # or a uniformly random action
            if np.random.random_sample() > epsilon:
                action = agent.select_action(state)
                action = action + np.random.normal(0, gaussian_exploration_noise)
            else:
                action = np.array(
                    np.random.uniform(env.action_space.low[0],
                                      env.action_space.high[0])).reshape(1, )

            # perform one step on the pendulum
            next_state, reward, done, _ = env.step(action)
            env.render()  # rendering slows training but visualizes progress
            replay_buffer.add((state, action, reward, next_state, done))

            # go to next state
            state = next_state

            # accumulate rewards
            rewards[epoch] += reward

        # update the DDPG agent n_iter times on mini-batches sampled from the replay buffer
        agent.update(buffer=replay_buffer, n_iter=10, batch_size=batch_size)

        if rewards[epoch] > -1.0:
            print("task solved!\n")
            # save trained agent
            if not os.path.exists(directory):
                os.mkdir(directory)
            agent.save(directory, filename)

        # print rewards of the current episode
        if epoch % 10 == 0:
            print('train epoch:', epoch, 'rewards:', rewards[epoch])

    return rewards
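# ---------------------------------------------------------------------------
# Sketch (assumption): the ReplayBuffer class used by both training loops is
# not shown in this section. The minimal version below, named
# ReplayBufferSketch to mark it as hypothetical, supports the
# add((state, action, reward, next_state, done)) interface used above and a
# sample(batch_size) method with a default batch size (the first loop calls
# sample() with no arguments); max_size bounds memory by overwriting the
# oldest transitions once the buffer is full.
# ---------------------------------------------------------------------------
import random

import numpy as np


class ReplayBufferSketch:
    def __init__(self, max_size=int(5e4)):
        self.max_size = int(max_size)
        self.storage = []
        self.ptr = 0  # next slot to overwrite once the buffer is full

    def add(self, transition):
        # transition is the tuple (state, action, reward, next_state, done)
        if len(self.storage) < self.max_size:
            self.storage.append(transition)
        else:
            self.storage[self.ptr] = transition
            self.ptr = (self.ptr + 1) % self.max_size

    def sample(self, batch_size=128):
        # uniform random mini-batch, returned as stacked numpy arrays
        batch = random.sample(self.storage, min(batch_size, len(self.storage)))
        states, actions, rewards, next_states, dones = map(np.asarray, zip(*batch))
        return (states, actions, rewards.reshape(-1, 1),
                next_states, dones.reshape(-1, 1))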