import os
from collections import namedtuple
from itertools import count

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler
from torch.utils.tensorboard import SummaryWriter

# Actor, Critic, CNN_for_atari and the hyper-parameter dict `hp` are expected to be
# supplied by the project's own modules (not shown here); illustrative sketches of
# the networks are given below.


# Alternative initializer for environments whose observations are already flat
# 1-D vectors, so no CNN feature extractor is needed.
def __init__(self, env):
    """Initialize a new agent."""
    self.env = env
    self.observation_size = env.observation_space.shape[0]
    self.action_size = env.action_space.n
    self.Transition = namedtuple(
        'Transition', ['state', 'action', 'a_log_prob', 'reward', 'next_state'])
    self.actor_net = Actor(self.observation_size, self.action_size)
    self.critic_net = Critic(self.observation_size)
    self.buffer = []
    self.hp = hp  # hyper-parameters dict
    self.writer = SummaryWriter('./logs')
    self.actor_optimizer = optim.Adam(self.actor_net.parameters(), 1e-3)
    self.critic_net_optimizer = optim.Adam(self.critic_net.parameters(), 3e-3)
    if not os.path.exists('./model'):
        os.makedirs('./model')
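# The Actor and Critic networks are not defined in this file. The classes below are
# a minimal illustrative sketch only, assuming simple MLP heads: the Actor must end
# in a softmax because select_action() builds Categorical(action_prob) from raw
# probabilities, and the Critic must output a single state value. The layer sizes
# are placeholder assumptions, not the project's actual architecture.
class Actor(nn.Module):
    def __init__(self, observation_size, action_size):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(observation_size, 128), nn.ReLU(),
            nn.Linear(128, action_size), nn.Softmax(dim=-1),
        )

    def forward(self, x):
        # Returns per-action probabilities (not logits), as expected by Categorical(probs).
        return self.net(x)


class Critic(nn.Module):
    def __init__(self, observation_size):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(observation_size, 128), nn.ReLU(),
            nn.Linear(128, 1),
        )

    def forward(self, x):
        # Returns a scalar state-value estimate per input state.
        return self.net(x)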
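# CNN_for_atari is likewise provided elsewhere in the project. The sketch below only
# illustrates the contract implied by _get_observation_size(): it takes an NCHW tensor
# of shape (1, 3, 210, 160) and returns a flat 1-D feature vector, so that y.shape[0]
# is the feature count. The convolution layout here is an assumption.
class CNN_for_atari(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(3, 16, kernel_size=8, stride=4), nn.ReLU(),
            nn.Conv2d(16, 32, kernel_size=4, stride=2), nn.ReLU(),
        )

    def forward(self, x):
        # x: (1, 3, 210, 160) -> flat 1-D feature vector.
        return self.conv(x).flatten()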
class PPO_agent:
    """PPO agent for discrete-action environments with image observations
    (e.g. Atari (210, 160, 3) frames). A CNN front-end flattens each frame
    into a 1-D feature vector that is fed to the actor and critic networks.
    """

    def _get_observation_size(self, input_tensor):
        # Convert (N, H, W, C) frames to (N, C, H, W) and run them through the CNN
        # feature extractor. A single CNN instance is cached so the features stay
        # consistent across calls (a freshly initialized CNN per call would give the
        # actor/critic a different representation at every step).
        if not hasattr(self, 'cnn'):
            self.cnn = CNN_for_atari()
        x = torch.transpose(input_tensor, 1, 3)
        x = torch.transpose(x, 2, 3)
        y = self.cnn(x)
        return y, y.shape[0]  # y is the flat feature tensor, y.shape[0] its length

    def __init__(self, env):
        """Initialize a new agent."""
        self.env = env
        self.observation_shape = env.observation_space.shape  # (210, 160, 3); the CNN flattens it into a 1-D vector
        # Dummy (1, 210, 160, 3) tensor, used once to infer the flattened feature size.
        self.observation_shape = torch.randn(self.observation_shape).unsqueeze(0)
        _, self.observation_size = self._get_observation_size(self.observation_shape)
        self.action_size = env.action_space.n
        self.Transition = namedtuple(
            'Transition', ['state', 'action', 'a_log_prob', 'reward', 'next_state'])
        self.actor_net = Actor(self.observation_size, self.action_size)
        self.critic_net = Critic(self.observation_size)
        self.buffer = []
        self.hp = hp  # hyper-parameters dict
        self.writer = SummaryWriter('./logs')
        self.actor_optimizer = optim.Adam(self.actor_net.parameters(), 1e-3)
        self.critic_net_optimizer = optim.Adam(self.critic_net.parameters(), 3e-3)
        if not os.path.exists('./model'):
            os.makedirs('./model')

    def select_action(self, state):
        # state: 1-D numpy feature vector produced by the CNN front-end.
        state = torch.from_numpy(state).float().unsqueeze(0)
        with torch.no_grad():
            action_prob = self.actor_net(state)
        c = Categorical(action_prob)
        action = c.sample()
        # Return the sampled action and its probability under the current policy
        # (stored in the 'a_log_prob' field, although it is a probability, not a log-prob).
        return action.item(), action_prob[:, action.item()].item()

    def get_value(self, state):
        state = torch.from_numpy(state).float()
        with torch.no_grad():
            value = self.critic_net(state)
        return value.item()

    def save_param(self, i_epoch):
        torch.save(self.actor_net.state_dict(),
                   './model/actor_net_{}.pt'.format(i_epoch))
        torch.save(self.critic_net.state_dict(),
                   './model/critic_net_{}.pt'.format(i_epoch))

    def load_param(self, i_epoch, actor_model, critic_model):
        actor_model.load_state_dict(
            torch.load('./model/actor_net_{}.pt'.format(i_epoch)))
        critic_model.load_state_dict(
            torch.load('./model/critic_net_{}.pt'.format(i_epoch)))

    def store_transition(self, transition):
        self.buffer.append(transition)
        self.hp['counter'] += 1

    def update(self, i_ep):
        state = torch.tensor([t.state for t in self.buffer], dtype=torch.float)
        action = torch.tensor([t.action for t in self.buffer],
                              dtype=torch.long).view(-1, 1)
        reward = [t.reward for t in self.buffer]
        # next_state is not needed: the target is the Monte-Carlo return computed below.
        # Old-policy action probabilities (the name is historical: these are
        # probabilities, not log-probabilities).
        old_action_log_prob = torch.tensor([t.a_log_prob for t in self.buffer],
                                           dtype=torch.float).view(-1, 1)

        # Discounted returns Gt, computed backwards over the stored episode.
        R = 0
        Gt = []
        for r in reward[::-1]:
            R = r + self.hp['gamma'] * R
            Gt.insert(0, R)
        Gt = torch.tensor(Gt, dtype=torch.float)

        for i in range(self.hp['ppo_update_time']):
            for index in BatchSampler(
                    SubsetRandomSampler(range(len(self.buffer))),
                    self.hp['batch_size'], False):
                Gt_index = Gt[index].view(-1, 1)  # e.g. torch.Size([32, 1])
                V = self.critic_net(state[index])
                delta = Gt_index - V
                advantage = delta.detach()

                # PPO core: clipped surrogate objective.
                action_prob = self.actor_net(state[index]).gather(
                    1, action[index])  # new-policy probability of the taken action
                ratio = action_prob / old_action_log_prob[index]  # r_t = pi_new / pi_old
                surr1 = ratio * advantage
                surr2 = torch.clamp(ratio, 1 - self.hp['clip_param'],
                                    1 + self.hp['clip_param']) * advantage

                # Update the actor network (maximize the surrogate -> minimize its negative).
                action_loss = -torch.min(surr1, surr2).mean()
                self.writer.add_scalar('loss/action_loss', action_loss,
                                       global_step=self.hp['training_step'])
                self.actor_optimizer.zero_grad()
                action_loss.backward()
                nn.utils.clip_grad_norm_(self.actor_net.parameters(),
                                         self.hp['max_grad_norm'])
                self.actor_optimizer.step()

                # Update the critic network with an MSE loss against the returns.
                value_loss = F.mse_loss(Gt_index, V)
                self.writer.add_scalar('loss/value_loss', value_loss,
                                       global_step=self.hp['training_step'])
                self.critic_net_optimizer.zero_grad()
                value_loss.backward()
                nn.utils.clip_grad_norm_(self.critic_net.parameters(),
                                         self.hp['max_grad_norm'])
                self.critic_net_optimizer.step()
                self.hp['training_step'] += 1

        del self.buffer[:]  # clear the experience buffer

    def train_agent(self):
        for i_epoch in range(self.hp['step']):
            print(i_epoch)
            state = self.env.reset()
            state = torch.Tensor(state).unsqueeze(0)
            state_after_CNN, _ = self._get_observation_size(state)
            state_after_CNN = state_after_CNN.detach().numpy()
            for t in count():
                a, action_prob = self.select_action(state_after_CNN)
                next_state, reward, done, info = self.env.step(a)
                next_state = torch.Tensor(next_state).unsqueeze(0)
                next_state_after_CNN, _ = self._get_observation_size(next_state)
                next_state_after_CNN = next_state_after_CNN.detach().numpy()
                trans = self.Transition(state_after_CNN, a, action_prob, reward,
                                        next_state_after_CNN)
                self.store_transition(trans)
                state_after_CNN = next_state_after_CNN
                if done or t >= 9999:
                    if len(self.buffer) >= self.hp['batch_size']:
                        self.update(i_epoch)
                    print('#################t:', t)
                    self.writer.add_scalar('livestep', t, global_step=i_epoch)
                    break
            if i_epoch % 100 == 0:
                self.save_param(i_epoch)
                print('save model!')

    def test_agent(self, render, i_epoch):
        self.load_param(i_epoch, actor_model=self.actor_net,
                        critic_model=self.critic_net)
        for i_epoch in range(self.hp['test_step']):
            print(i_epoch)
            state = self.env.reset()
            state = torch.Tensor(state).unsqueeze(0)
            state_after_CNN, _ = self._get_observation_size(state)
            state_after_CNN = state_after_CNN.detach().numpy()
            for t in count():
                a, action_prob = self.select_action(state_after_CNN)
                next_state, reward, done, info = self.env.step(a)
                next_state = torch.Tensor(next_state).unsqueeze(0)
                next_state_after_CNN, _ = self._get_observation_size(next_state)
                next_state_after_CNN = next_state_after_CNN.detach().numpy()
                if render:
                    self.env.render()
                state_after_CNN = next_state_after_CNN
                if done or t >= 9999:
                    print('#################t:', t)
                    break
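# Minimal usage sketch (illustrative only). It assumes the old gym API in which
# env.step() returns (obs, reward, done, info) and an Atari environment with
# (210, 160, 3) frames; 'Breakout-v0' and the hyper-parameter values below are
# placeholder assumptions, not values taken from the original project, and should
# be replaced by the project's real config if it provides `hp`.
if __name__ == '__main__':
    import gym

    hp = {
        'gamma': 0.99,            # discount factor
        'clip_param': 0.2,        # PPO clipping range epsilon
        'ppo_update_time': 10,    # optimization epochs per update
        'batch_size': 32,         # mini-batch size
        'max_grad_norm': 0.5,     # gradient clipping threshold
        'counter': 0,             # transitions stored so far
        'training_step': 0,       # optimizer steps taken so far
        'step': 1000,             # training episodes
        'test_step': 10,          # evaluation episodes
    }

    env = gym.make('Breakout-v0')
    agent = PPO_agent(env)
    agent.train_agent()
    # agent.test_agent(render=True, i_epoch=100)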