import math
import random

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

# DQN agent (PyTorch). The DQN network and ReplayBuffer classes are assumed
# to be defined elsewhere in the project.


class Agent(nn.Module):
    def __init__(self, input_shape, num_actions, device, PATH, gamma=0.95,
                 learning_rate=0.001, replay_size=10000, batch_size=128):
        super(Agent, self).__init__()
        self.device = device
        self.PATH = PATH
        self.gamma = gamma
        self.lr = learning_rate
        self.input_shape = input_shape
        self.num_actions = num_actions

        # Exponentially decaying epsilon-greedy exploration schedule
        epsilon_start = 1.0
        epsilon_final = 0.01
        epsilon_decay = 200
        self.epsilon_by_frame = lambda frame_idx: epsilon_final + (
            epsilon_start - epsilon_final) * math.exp(-1. * frame_idx / epsilon_decay)

        self.replay_size = replay_size
        self.batch_size = batch_size

        self.policy_net = DQN(input_shape, num_actions).to(device)
        self.target_net = DQN(input_shape, num_actions).to(device)
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=self.lr)
        self.replay_buffer = ReplayBuffer(replay_size)
        self.best_loss = 9999

    def declare_networks(self):
        # Re-create the policy and target networks from the stored configuration
        self.policy_net = DQN(self.input_shape, self.num_actions).to(self.device)
        self.target_net = DQN(self.input_shape, self.num_actions).to(self.device)

    def declare_memory(self):
        self.replay_buffer = ReplayBuffer(self.replay_size)

    def compute_loss(self):
        if len(self.replay_buffer) > self.batch_size:
            state, action, reward, next_state, done = self.replay_buffer.sample(
                self.batch_size)
            state = torch.Tensor(np.array(state)).to(self.device)
            action = torch.LongTensor(action).to(self.device)
            reward = torch.Tensor(np.array(reward)).to(self.device)
            next_state = torch.Tensor(np.array(next_state)).to(self.device)
            done = torch.Tensor(np.array(done)).to(self.device)

            # Q(s, a) for the actions actually taken
            q_values = self.policy_net(state)
            q_value = q_values.gather(1, action.unsqueeze(1)).squeeze(1)

            with torch.no_grad():
                # Double DQN target: the policy net selects the next action,
                # the target net evaluates it
                next_q_values = self.policy_net(next_state)
                next_q_state_values = self.target_net(next_state)
                next_q_value = next_q_state_values.gather(
                    1, torch.max(next_q_values, 1)[1].unsqueeze(1)).squeeze(1)
                expected_q_value = reward + self.gamma * next_q_value * (1 - done)

            # MSE between current and target Q values
            loss = (q_value - expected_q_value.detach()).pow(2).mean()

            self.optimizer.zero_grad()
            loss.backward()
            # Clip gradients element-wise to [-1, 1] for stability
            for param in self.policy_net.parameters():
                param.grad.data.clamp_(-1, 1)
            self.optimizer.step()

            if loss < self.best_loss:
                self.model_save()
                self.best_loss = loss.item()
            return loss.item()
        else:
            return 9999

    def append_buffer(self, state, action, reward, next_state, done):
        self.replay_buffer.push(state, action, reward, next_state, done)

    def get_action(self, state, episode):
        # Epsilon-greedy action selection; `state` is expected to be a batched
        # tensor already on the agent's device
        epsilon = self.epsilon_by_frame(episode)
        with torch.no_grad():
            if random.random() > epsilon:
                # state = torch.Tensor(np.array(state)).to(self.device)
                q_value = self.policy_net(state)
                action = q_value.max(1)[1].item()
            else:
                action = np.random.randint(0, self.num_actions)
        return action

    def update_target_model(self):
        self.target_net.load_state_dict(self.policy_net.state_dict())

    def model_save(self):
        torch.save(
            {
                'model_state_dict': self.policy_net.state_dict(),
                'optimizer_state_dict': self.optimizer.state_dict(),
            }, self.PATH)

    def model_load(self):
        if str(self.device).startswith("cuda"):
            checkpoint = torch.load(self.PATH)
        else:
            checkpoint = torch.load(self.PATH,
                                    map_location=torch.device('cpu'))
        self.policy_net.load_state_dict(checkpoint['model_state_dict'])
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
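
# ---------------------------------------------------------------------------
# Usage sketch for the DQN Agent above (illustrative only, not part of the
# original class). It assumes a classic Gym environment with the pre-0.26
# reset()/step() API, a flat observation vector, and discrete actions; the
# environment id, episode count, target-update interval and checkpoint path
# are all hypothetical choices.
# ---------------------------------------------------------------------------
def train_dqn_example(num_episodes=500, target_update=10):
    import gym

    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    env = gym.make("CartPole-v1")  # hypothetical environment
    agent = Agent(input_shape=env.observation_space.shape[0],
                  num_actions=env.action_space.n,
                  device=device,
                  PATH="dqn_checkpoint.pt")  # hypothetical checkpoint path

    for episode in range(num_episodes):
        state = env.reset()
        done = False
        episode_reward = 0.0
        while not done:
            # get_action expects a batched tensor on the agent's device
            state_t = torch.Tensor(np.array([state])).to(device)
            action = agent.get_action(state_t, episode)
            next_state, reward, done, _ = env.step(action)
            agent.append_buffer(state, action, reward, next_state, done)
            agent.compute_loss()  # one optimization step once the buffer holds enough samples
            state = next_state
            episode_reward += reward
        # Periodically sync the target network with the policy network
        if episode % target_update == 0:
            agent.update_target_model()
        print("episode {:3d}  reward {:6.1f}".format(episode, episode_reward))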
import numpy as np

# DDPG agent (Keras-based actor-critic). The Actor, Critic, OUNoise and
# ReplayBuffer classes are assumed to be defined elsewhere in the project.


class Agent:
    """Reinforcement Learning Agent that learns using DDPG."""

    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())

        # Ornstein-Uhlenbeck noise process for action exploration
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01    # for soft update of target parameters

        # Episode bookkeeping
        self.score = 0
        self.best_score = -np.inf
        self.count = 0
        self.total_reward = 0.0

    def reset_episode(self):
        self.count = 0
        self.total_reward = 0.0
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        """Save experience / reward and learn when enough samples are available."""
        self.count += 1
        self.total_reward += reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn if enough samples are present in memory
        if len(self.memory) > self.batch_size:
            self.score = reward
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over the last state
        self.last_state = next_state

    def act(self, state):
        """Return action(s) for the given state(s) as per the current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using the given batch of experience tuples."""
        self.score = self.total_reward / float(self.count) if self.count else 0.0

        # Convert experience tuples to separate arrays for each element
        # (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None
                          ]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models:
        # Q_targets_next = critic_target(next_state, actor_target(next_state))
        action_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, action_next])

        # Compute Q targets for current states and train the local critic model
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train the local actor model using the critic's action gradients
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])

        # Soft-update both target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

        if self.score > self.best_score:
            self.best_score = self.score

    def soft_update(self, local_model, target_model):
        """Soft-update model parameters: theta_target = tau * theta_local + (1 - tau) * theta_target."""
        local_weights = local_model.get_weights()
        target_weights = target_model.get_weights()
        assert len(local_weights) == len(target_weights)
        new_weights = [self.tau * lw + (1.0 - self.tau) * tw
                       for lw, tw in zip(local_weights, target_weights)]
        target_model.set_weights(new_weights)
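
# ---------------------------------------------------------------------------
# Usage sketch for the DDPG Agent above (illustrative only, not part of the
# original class). It assumes a `task` object exposing the interface the
# constructor reads (state_size, action_size, action_low, action_high) plus
# reset() and step(action) returning (next_state, reward, done). The episode
# count is an arbitrary example value.
# ---------------------------------------------------------------------------
def train_ddpg_example(task, num_episodes=1000):
    agent = Agent(task)
    for episode in range(1, num_episodes + 1):
        state = agent.reset_episode()
        done = False
        while not done:
            action = agent.act(state)                       # noisy action from the local actor
            next_state, reward, done = task.step(action)    # assumed task interface
            agent.step(action, reward, next_state, done)    # store experience and learn
            state = next_state
        print("episode {:4d}  avg reward {:7.3f}  best {:7.3f}".format(
            episode, agent.score, agent.best_score))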