import torch
import matplotlib.pyplot as plt
from tqdm import tqdm

# env is assumed to be created elsewhere (a continuous-action LunarLander environment)
ac_high = env.action_space.high
num_state = env.observation_space.shape[0]
num_action = env.action_space.shape[0]

# train
agent = torch.load("LunarLander_agent.pkl")
reward_history = []
noise = OUNoise(env.action_space, theta=0.15, max_sigma=0.2, min_sigma=0.2)

for e in tqdm(range(1, 101)):
    state = env.reset()
    done = False
    reward_sum = 0.0
    step = 0
    while not done:
        env.render()
        action = agent.take_action(state)
        action = noise.get_action(action.detach().numpy(), step)
        next_state, reward, done, _ = env.step(action)
        reward_sum += reward
        # reward shaping: penalize horizontal offset (x) and tilt (angle),
        # reward getting close to the ground (small |y|)
        reward = reward - abs(next_state[0]) + 1 / (abs(next_state[1]) + 0.1) - abs(next_state[4])
        agent.store_transition(state, action, reward, next_state, done)
        state = next_state[:]
        step += 1
    reward_history.append(reward_sum)

env.close()
plt.plot(reward_history)
plt.savefig("test_reward_history")
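# NOTE: the snippets in this section call an OUNoise helper that is never shown.
# Below is a minimal sketch of the usual Ornstein-Uhlenbeck exploration wrapper
# used in DDPG tutorials, matching the calls made above and below:
# OUNoise(action_space, theta=..., max_sigma=..., min_sigma=..., decay_period=...),
# reset(), and get_action(action, t). The parameter defaults are assumptions.
import numpy as np

class OUNoise:
    def __init__(self, action_space, mu=0.0, theta=0.15, max_sigma=0.3,
                 min_sigma=0.3, decay_period=100000):
        self.mu = mu
        self.theta = theta
        self.sigma = max_sigma
        self.max_sigma = max_sigma
        self.min_sigma = min_sigma
        self.decay_period = decay_period
        self.action_dim = action_space.shape[0]
        self.low = action_space.low
        self.high = action_space.high
        self.reset()

    def reset(self):
        # restart the OU process at its mean
        self.state = np.ones(self.action_dim) * self.mu

    def evolve_state(self):
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.action_dim)
        self.state = x + dx
        return self.state

    def get_action(self, action, t=0):
        ou_state = self.evolve_state()
        # anneal sigma linearly from max_sigma to min_sigma over decay_period steps
        self.sigma = self.max_sigma - (self.max_sigma - self.min_sigma) * min(1.0, t / self.decay_period)
        return np.clip(action + ou_state, self.low, self.high)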
def train(cfg):
    print('Start to train!\n')
    env = NormalizedActions(gym.make("Pendulum-v0"))
    # add exploration noise to the actions
    ou_noise = OUNoise(env.action_space)
    n_states = env.observation_space.shape[0]
    n_actions = env.action_space.shape[0]
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    agent = DDPG(n_states, n_actions, device=device, critic_lr=1e-3,
                 actor_lr=1e-4, gamma=0.99, soft_tau=1e-2,
                 memory_capacity=100000, batch_size=128)
    rewards = []
    moving_average_rewards = []
    ep_steps = []
    log_dir = os.path.split(os.path.abspath(__file__))[0] + "/logs/train/" + SEQUENCE
    writer = SummaryWriter(log_dir)
    for i_episode in range(1, cfg.train_eps + 1):
        state = env.reset()
        ou_noise.reset()
        ep_reward = 0
        for i_step in range(1, cfg.train_steps + 1):
            action = agent.select_action(state)
            action = ou_noise.get_action(action, i_step)  # the random process from the paper
            next_state, reward, done, _ = env.step(action)
            ep_reward += reward
            agent.memory.push(state, action, reward, next_state, done)
            agent.update()
            state = next_state
            if done:
                break
        print('Episode:', i_episode, ' Reward: %i' % int(ep_reward), 'n_steps:', i_step)
        ep_steps.append(i_step)
        rewards.append(ep_reward)
        if i_episode == 1:
            moving_average_rewards.append(ep_reward)
        else:
            moving_average_rewards.append(0.9 * moving_average_rewards[-1] + 0.1 * ep_reward)
        writer.add_scalars('rewards', {'raw': rewards[-1],
                                       'moving_average': moving_average_rewards[-1]}, i_episode)
        writer.add_scalar('steps_of_each_episode', ep_steps[-1], i_episode)
    writer.close()
    print('Complete training!')
    # save the model
    if not os.path.exists(SAVED_MODEL_PATH):  # create the folder if it does not exist
        os.mkdir(SAVED_MODEL_PATH)
    agent.save_model(SAVED_MODEL_PATH + 'checkpoint.pth')
    # save the rewards and related results
    if not os.path.exists(RESULT_PATH):  # create the folder if it does not exist
        os.mkdir(RESULT_PATH)
    np.save(RESULT_PATH + 'rewards_train.npy', rewards)
    np.save(RESULT_PATH + 'moving_average_rewards_train.npy', moving_average_rewards)
    np.save(RESULT_PATH + 'steps_train.npy', ep_steps)
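# NOTE: train() above wraps the environment in NormalizedActions, which is not
# defined in these snippets. A plausible sketch is the common gym.ActionWrapper
# that rescales the policy's [-1, 1] output to the environment's action bounds;
# the exact original implementation may differ.
import gym
import numpy as np

class NormalizedActions(gym.ActionWrapper):
    def action(self, action):
        low = self.action_space.low
        high = self.action_space.high
        # map [-1, 1] -> [low, high]
        action = low + (action + 1.0) * 0.5 * (high - low)
        return np.clip(action, low, high)

    def reverse_action(self, action):
        low = self.action_space.low
        high = self.action_space.high
        # map [low, high] -> [-1, 1]
        action = 2.0 * (action - low) / (high - low) - 1.0
        return np.clip(action, -1.0, 1.0)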
                 critic_lr=1e-3, actor_lr=1e-4, gamma=0.99, soft_tau=1e-2,
                 memory_capacity=100000, batch_size=128)
    rewards = []
    moving_average_rewards = []
    for i_episode in range(1, cfg.max_episodes + 1):
        state = env.reset()
        ou_noise.reset()
        ep_reward = 0
        for i_step in range(1, cfg.max_steps + 1):
            action = agent.select_action(state)
            action = ou_noise.get_action(action, i_step)  # the random process from the paper
            next_state, reward, done, _ = env.step(action)
            ep_reward += reward
            agent.memory.push(state, action, reward, next_state, done)
            agent.update()
            state = next_state
            if done:
                break
        print('Episode:', i_episode, ' Reward: %i' % int(ep_reward))
        rewards.append(ep_reward)
        # if i_episode == 1:
actor_lr = 1e-4
tau = 1e-3
agent = ddpg(state_dim, action_dim, gamma, tau, buffer_maxlen, batch_size,
             critic_lr, actor_lr)
noise = OUNoise(env.action_space)
step = 0
for episode in range(max_episodes):
    state = env.reset()
    total = 0
    done = False
    while True:
        action = agent.act(state)
        action = noise.get_action(action, step)
        next_state, reward, done, _ = env.step(action)
        total += reward
        # reward shaping: bonus for positive position and for speed
        if next_state[0] > 0.0:
            reward += 10
        reward += 50 * abs(next_state[1])
        transition = state, action, reward, next_state, done
        agent.update(transition)
        state = next_state  # advance to the next state
        step += 1
        if done:
            print(f'episode {episode} done. reward: {total}')
            agent.save()
            break
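# NOTE: the DDPGAgent below stores transitions in a BasicBuffer, which is not
# included in these snippets. This is a minimal sketch of a uniform replay
# buffer with the push/sample interface that the agent expects; the deque-based
# storage and array shapes are assumptions, not the original implementation.
import random
from collections import deque
import numpy as np

class BasicBuffer:
    def __init__(self, max_size):
        self.buffer = deque(maxlen=max_size)

    def push(self, state, action, reward, next_state, done):
        # store the reward as a length-1 array so batches come out as (batch, 1)
        self.buffer.append((state, action, np.array([reward]), next_state, done))

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        return (np.array(states), np.array(actions), np.array(rewards),
                np.array(next_states), np.array(dones, dtype=np.float32))

    def __len__(self):
        return len(self.buffer)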
import torch
import torch.optim as optim
import torch.nn.functional as F

class DDPGAgent:
    def __init__(self, env, gamma, tau, buffer_maxlen, critic_learning_rate,
                 actor_learning_rate, train, decay):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.env = env
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        # hyperparameters
        self.gamma = gamma
        self.tau = tau
        self.train = train  # True to train the agent, False to only simulate it
        self.decay = decay

        # initialize actor and critic networks
        self.critic = Critic(self.obs_dim, self.action_dim).to(self.device)
        self.critic_target = Critic(self.obs_dim, self.action_dim).to(self.device)

        self.actor = Actor(self.obs_dim, self.action_dim).to(self.device)
        self.actor_target = Actor(self.obs_dim, self.action_dim).to(self.device)

        # copy parameters into the target networks
        for target_param, param in zip(self.critic_target.parameters(),
                                       self.critic.parameters()):
            target_param.data.copy_(param.data)
        for target_param, param in zip(self.actor_target.parameters(),
                                       self.actor.parameters()):
            target_param.data.copy_(param.data)

        # optimizers
        self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                           lr=critic_learning_rate)
        self.actor_optimizer = optim.Adam(self.actor.parameters(),
                                          lr=actor_learning_rate)

        self.replay_buffer = BasicBuffer(buffer_maxlen)

        # use exploration noise only during training
        if self.train:
            self.noise = OUNoise(self.env.action_space, decay_period=self.decay)

    def get_action(self, obs, t=0):
        state = torch.FloatTensor(obs).unsqueeze(0).to(self.device)
        action = self.actor.forward(state)
        action = action.squeeze(0).cpu().detach().numpy()
        # add exploration noise only during training
        if self.train:
            action = self.noise.get_action(action, t=t)
        return action

    def update(self, batch_size):
        state_batch, action_batch, reward_batch, next_state_batch, masks = \
            self.replay_buffer.sample(batch_size)

        state_batch = torch.FloatTensor(state_batch).to(self.device)
        action_batch = torch.FloatTensor(action_batch).to(self.device)
        reward_batch = torch.FloatTensor(reward_batch).to(self.device)
        next_state_batch = torch.FloatTensor(next_state_batch).to(self.device)
        masks = torch.FloatTensor(masks).to(self.device)

        curr_Q = self.critic.forward(state_batch, action_batch)
        next_actions = self.actor_target.forward(next_state_batch)
        next_Q = self.critic_target.forward(next_state_batch, next_actions.detach())
        # do not bootstrap through terminal transitions (assumes masks are the done flags)
        expected_Q = reward_batch + self.gamma * next_Q * (1.0 - masks.view(-1, 1))

        # update critic
        q_loss = F.mse_loss(curr_Q, expected_Q.detach())
        self.critic_optimizer.zero_grad()
        q_loss.backward()
        self.critic_optimizer.step()

        # update actor
        policy_loss = -self.critic.forward(state_batch,
                                           self.actor.forward(state_batch)).mean()
        self.actor_optimizer.zero_grad()
        policy_loss.backward()
        self.actor_optimizer.step()

        # soft-update target networks
        for target_param, param in zip(self.actor_target.parameters(),
                                       self.actor.parameters()):
            target_param.data.copy_(param.data * self.tau +
                                    target_param.data * (1.0 - self.tau))
        for target_param, param in zip(self.critic_target.parameters(),
                                       self.critic.parameters()):
            target_param.data.copy_(param.data * self.tau +
                                    target_param.data * (1.0 - self.tau))
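# NOTE: DDPGAgent above relies on Actor and Critic network classes that are not
# shown in these snippets. These are plain two-hidden-layer MLP sketches with a
# tanh output for the actor; the hidden size of 256 and the exact layer layout
# are assumptions, not the original architecture.
import torch
import torch.nn as nn

class Actor(nn.Module):
    def __init__(self, obs_dim, action_dim, hidden_size=256):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(obs_dim, hidden_size), nn.ReLU(),
            nn.Linear(hidden_size, hidden_size), nn.ReLU(),
            nn.Linear(hidden_size, action_dim), nn.Tanh(),  # actions in [-1, 1]
        )

    def forward(self, obs):
        return self.net(obs)

class Critic(nn.Module):
    def __init__(self, obs_dim, action_dim, hidden_size=256):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(obs_dim + action_dim, hidden_size), nn.ReLU(),
            nn.Linear(hidden_size, hidden_size), nn.ReLU(),
            nn.Linear(hidden_size, 1),  # scalar Q-value
        )

    def forward(self, obs, action):
        return self.net(torch.cat([obs, action], dim=1))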