def main():
    ddpg = DDPG(GAMMA, TAU, torch.cuda.is_available())
    memory = ReplayMemory(REPLAY_SIZE)
    env.init_state()
    if os.path.exists('models/ddpg_actor_'):
        ddpg.load_model()
    updates = 0
    for i_episode in range(NUM_EPISODES):
        while True:
            # Linearly anneal the exploration noise to zero over training
            # (true division; integer division would floor the decay term
            # to zero for fractional noise scales).
            ounoise = OUNoise(
                1, scale=NOISE_SCALE - NOISE_SCALE / NUM_EPISODES * i_episode)
            action = ddpg.select_action(env.state, ounoise)
            transition = env.step(action)
            memory.push(transition)
            if len(memory) > BATCH_SIZE:
                for _ in range(UPDATES_PER_STEP):
                    transitions = memory.sample(BATCH_SIZE)
                    random.shuffle(transitions)
                    batch = Transition(*zip(*transitions))
                    value_loss, policy_loss = ddpg.update_parameters(batch)
                    print("Episode: {}, Updates: {}, Value Loss: {}, "
                          "Policy Loss: {}".format(i_episode, updates,
                                                   value_loss, policy_loss))
                    updates += 1
            break
        if (i_episode + 1) % 100 == 0:
            ddpg.save_model()
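# Almost every snippet below relies on a `Transition` namedtuple and a
# `ReplayMemory` buffer that are not shown here. A minimal sketch in the
# style of the PyTorch DQN tutorial; the exact field names are an
# assumption and vary between snippets (several use `mask`, `done`, or
# `goal` in addition to, or instead of, `next_state`):
import random
from collections import namedtuple

Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward'))


class ReplayMemory(object):
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, *args):
        """Save a transition, overwriting the oldest one once full."""
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = Transition(*args)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)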
def get_accuracy(model, dqn):
    model.eval()
    dqn.eval()
    correct = 0.
    total = 0.
    path = []
    for images, labels in test_loader:
        images = Variable(images.view(-1, 28 * 28)).cuda()
        outputs = model(images, dqn)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted.cpu() == labels).sum()
        path.append(
            torch.stack([
                labels.cpu(),
                torch.tensor(Transition(*zip(*model.get_replays())).action)
            ]).transpose(0, 1))
    accuracy = 100 * correct.float() / total
    model.train()
    dqn.train()
    path = torch.cat(path, 0)
    return accuracy, path
def run_episode(environment: gym.Env, agent: DQNAgent, render: bool,
                max_length: int):
    """Run one episode in the given environment with the agent.

    Arguments:
        environment {`gym.Env`} -- Environment representing the Markov Decision Process
        agent {`DQNAgent`} -- Reinforcement learning agent that acts in the environment
        render {`bool`} -- Whether the frames of the episode should be rendered on the screen
        max_length {`int`} -- Maximum number of steps before the episode is terminated

    Returns:
        `float` -- Cumulative reward that the agent received during the episode
    """
    episode_reward = 0
    state = environment.reset()
    for _ in range(max_length):
        if render:
            environment.render()
        action = agent.act(state)
        next_state, reward, terminal, _ = environment.step(action)
        agent.observe(
            Transition(state, action, reward,
                       None if terminal else next_state))
        episode_reward += reward
        if terminal:
            break
        else:
            state = next_state
    return episode_reward
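# A minimal usage sketch for `run_episode`; the environment id and the
# DQNAgent constructor arguments are hypothetical, since the agent's
# signature is not shown in this snippet:
env = gym.make('CartPole-v1')
agent = DQNAgent(state_dim=env.observation_space.shape[0],
                 n_actions=env.action_space.n)
for episode in range(100):
    reward = run_episode(env, agent, render=False, max_length=500)
    print('episode {}: reward {}'.format(episode, reward))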
def update_replays(self, labels, loss, num_labels):
    # Reward is higher when the classification loss is lower.
    loss_reward = (1 - loss.detach()).clamp(min=0.)
    # A[i][j] == 1 iff samples i and j share the same label.
    y_onehot = torch.IntTensor(len(labels), num_labels)
    y_onehot.zero_()
    y_onehot.scatter_(1, labels.cpu().reshape([-1, 1]), 1)
    A = y_onehot.mm(y_onehot.transpose(0, 1))
    # Walk the stored replays from the last layer backwards, discounting.
    for i in reversed(sorted(self.replays.keys())):
        # B[i][j] == 1 iff samples i and j were routed through the same action.
        B = torch.IntTensor(len(labels), num_labels)
        B.zero_()
        actions = torch.tensor([replay.action for replay in self.replays[i]],
                               dtype=torch.long).reshape([-1, 1])
        B.scatter_(1, actions, 1)
        B = B.mm(B.transpose(0, 1))
        equal_reward = (A * B).float().mean(1)
        diff_reward = ((1 - A) * (1 - B)).float().mean(1)
        if i == len(self.replays) - 1:
            reward = loss_reward
            gamma = 1.0
        else:
            reward = 0
            gamma *= GAMMA
        for j, replay in enumerate(self.replays[i]):
            reward += gamma * (equal_reward[j] + diff_reward[j])
            self.replays[i][j] = Transition(replay.state.detach(),
                                            replay.action,
                                            replay.next_state.detach(),
                                            reward.cpu())
def update_parameters(self, batch_size):
    transitions = self.memory.sample(batch_size)
    batch = Transition(*zip(*transitions))

    state_batch = normalize(
        Variable(torch.stack(batch.state)).to(self.device), self.obs_rms,
        self.device)
    action_batch = Variable(torch.stack(batch.action)).to(self.device)
    reward_batch = normalize(
        Variable(torch.stack(batch.reward)).to(self.device).unsqueeze(1),
        self.ret_rms, self.device)
    mask_batch = Variable(torch.stack(batch.mask)).to(self.device).unsqueeze(1)
    next_state_batch = normalize(
        Variable(torch.stack(batch.next_state)).to(self.device),
        self.obs_rms, self.device)

    if self.normalize_returns:
        reward_batch = torch.clamp(reward_batch, -self.cliprew, self.cliprew)

    value_loss = self.update_critic(state_batch, action_batch, reward_batch,
                                    mask_batch, next_state_batch)
    policy_loss = self.update_actor(state_batch)
    self.soft_update()
    return value_loss, policy_loss
def optimize_model(self):
    if len(self.memory) < config.BATCH_SIZE:
        return
    transitions = self.memory.sample(config.BATCH_SIZE)
    batch = Transition(*zip(*transitions))

    # Each state is a 3-tuple of tensors; regroup by component before
    # concatenating along the batch dimension.
    state_batch = tuple([
        torch.cat(tuple([batch.state[i][j] for i in range(config.BATCH_SIZE)]))
        for j in range(3)
    ])
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)
    next_state_batch = tuple([
        torch.cat(
            tuple([batch.next_state[i][j] for i in range(config.BATCH_SIZE)]))
        for j in range(3)
    ])

    state_action_values = self.policy_net(state_batch).gather(1, action_batch)
    next_state_values = self.target_net(next_state_batch).max(1)[0].detach()
    expected_state_action_values = (next_state_values *
                                    config.GAMMA) + reward_batch

    loss = F.smooth_l1_loss(state_action_values,
                            expected_state_action_values.unsqueeze(1))
    self.optimizer.zero_grad()
    loss.backward()
    for param in self.policy_net.parameters():
        if param.grad is not None:
            param.grad.data.clamp_(-1, 1)
    self.optimizer.step()
def update_parameters(self, batch_size, number_of_iterations):
    policy_losses = []
    value_losses = []
    for _ in range(number_of_iterations):
        transitions = self.memory.sample(batch_size)
        batch = Transition(*zip(*transitions))
        state_batch = Variable(torch.stack(batch.state)).to(self.device)
        action_batch = Variable(torch.stack(batch.action)).to(self.device)
        reward_batch = Variable(torch.stack(batch.reward)).to(
            self.device).unsqueeze(1)
        mask_batch = Variable(torch.stack(batch.mask)).to(
            self.device).unsqueeze(1)
        next_state_batch = Variable(torch.stack(batch.next_state)).to(
            self.device)

        value_loss = self.update_critic(state_batch, action_batch,
                                        reward_batch, mask_batch,
                                        next_state_batch)
        value_losses.append(value_loss)
        policy_loss = self.update_actor(state_batch, action_batch)
        policy_losses.append(policy_loss)
        self.soft_update()
    return np.mean(value_losses), np.mean(policy_losses)
def optimize_dqn(policy_net, target_net, replay_memory, optimizer, batch_size,
                 gamma):
    if len(replay_memory) < batch_size:
        return
    transitions = replay_memory.sample(batch_size)
    batch = Transition(*zip(*transitions))
    state = torch.stack(batch.state)
    action = torch.stack(batch.action).reshape([-1, 1])
    next_state = torch.stack(batch.next_state)
    reward = torch.stack(batch.reward).cuda()

    q_values = policy_net(state).gather(1, action.cuda()).squeeze()
    expected_q_values = (target_net(next_state).max(1)[0].detach() *
                         gamma) + reward

    loss = F.smooth_l1_loss(q_values, expected_q_values)
    optimizer.zero_grad()
    loss.backward()
    _clamp_params(policy_net)
    optimizer.step()

    # Polyak (soft) update of the target network towards the policy network.
    for target_param, local_param in zip(target_net.parameters(),
                                         policy_net.parameters()):
        target_param.data.copy_(TAU * local_param.data +
                                (1.0 - TAU) * target_param.data)
    return loss
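# `_clamp_params` is called above but not defined in the snippet. A minimal
# sketch, assuming it clips gradients in place to [-1, 1] the way the other
# optimize functions here do with explicit loops (hypothetical helper):
def _clamp_params(net):
    for param in net.parameters():
        if param.grad is not None:
            param.grad.data.clamp_(-1, 1)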
def optimize_policy_net(self):
    if self.use_noisy_nets:
        self.policy_net.sample_noise()
    if self.use_priority_replay:
        transitions, indices, importance_sampling_weights = \
            self.replay_memory.sample(self.batch_size)
    else:
        transitions = self.replay_memory.sample(self.batch_size)
    batch = Transition(*zip(*transitions))

    non_final_mask = torch.tensor(tuple(
        map(lambda s: s is not None, batch.next_state)),
                                  device=self.device,
                                  dtype=torch.bool)
    non_final_next_states = torch.cat(
        [s for s in batch.next_state if s is not None])
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)
    if self.clamp_rewards:
        reward_batch = torch.clamp(reward_batch, -1, 1)

    # Compute Q(s_t, a)
    state_action_values = self.policy_net(state_batch).gather(1, action_batch)

    # Compute Q(s_{t+1}) * gamma + reward
    next_state_values = torch.zeros(self.batch_size, device=self.device)
    if self.use_ddqn:
        next_state_values[non_final_mask] = self.target_net(
            non_final_next_states).max(1)[0].detach()
    else:
        next_state_values[non_final_mask] = self.policy_net(
            non_final_next_states).max(1)[0].detach()

    # Compute the expected Q values
    expected_state_action_values = (next_state_values *
                                    self.gamma) + reward_batch

    # Compute loss
    loss = self.loss(state_action_values,
                     expected_state_action_values.unsqueeze(1))

    # Multiply by importance weights if using priority replay
    if self.use_priority_replay:
        loss = loss * torch.reshape(
            torch.tensor(importance_sampling_weights, device=self.device),
            (self.batch_size, 1))
        new_priorities = loss + 1e-5
        loss = loss.mean()

    # Optimize the model
    self.optimizer.zero_grad()
    loss.backward()
    if self.use_priority_replay:
        self.replay_memory.update_priorities(
            indices, new_priorities.data.cpu().numpy())
        if self.anneal_importance_sampling_beta:
            self.replay_memory.importance_sampling_beta = min(
                1, self.replay_memory.importance_sampling_beta +
                self.steps_between_batches *
                (1 - self.start_priority_replay_beta) /
                (self.training_step_count - self.warmup_step_count))
    if self.clamp_grads:
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)
    self.optimizer.step()
def optimize_model(): """Function for gradient updates In this function we sample from memory(ReplayMemory), use the policy_net to get the state_action_values and the target net and the next_states to compute the expected_state_action_values and use huber loss to update the weights. """ if len(memory) < BATCH_SIZE: return transitions = memory.sample(BATCH_SIZE) # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for # detailed explanation). This converts batch-array of Transitions # to Transition of batch-arrays. batch = Transition(*zip(*transitions)) # Compute a mask of non-final states and concatenate the batch elements # (a final state would've been the one after which simulation ended) non_final_mask = torch.tensor(tuple( map(lambda s: s is not None, batch.next_state)), device=device, dtype=torch.bool) non_final_next_states = torch.cat([ torch.Tensor(s) for s in batch.next_state if s is not None ]).to(device) state_batch = torch.cat([torch.Tensor(s) for s in batch.state]).to(device) action_batch = torch.cat([torch.LongTensor([[s]]) for s in batch.action]).to(device) reward_batch = torch.cat([torch.Tensor([s]) for s in batch.reward]).to(device) # Compute Q(s_t, a) - the model computes Q(s_t), then we select the # columns of actions taken. These are the actions which would've been taken # for each batch state according to policy_net state_action_values = policy_net(state_batch).gather(1, action_batch) # Compute V(s_{t+1}) for all next states. # Expected values of actions for non_final_next_states are computed based # on the "older" target_net; selecting their best reward with max(1)[0]. # This is merged based on the mask, such that we'll have either the expected # state value or 0 in case the state was final. next_state_values = torch.zeros(BATCH_SIZE, device=device) next_state_values[non_final_mask] = target_net(non_final_next_states).max( 1)[0].detach() # Compute the expected Q values expected_state_action_values = (next_state_values * GAMMA) + reward_batch # Compute Huber loss loss = F.smooth_l1_loss(state_action_values, expected_state_action_values.unsqueeze(1)) # Optimize the model optimizer.zero_grad() loss.backward() for param in policy_net.parameters(): param.grad.data.clamp_(-1, 1) optimizer.step() return loss.item()
def update_model(self, batch_size):
    if len(self.memory) < batch_size:
        return 0.
    transitions, indices, weights = self.memory.sample(batch_size)
    batch = Transition(*zip(*transitions))
    weights = tf.constant(weights, dtype=tf.float32)

    state_batch = tf.concat([batch.state], axis=0)
    action_batch = tf.concat([batch.action], axis=0)
    reward_batch = tf.concat([batch.reward], axis=0)

    self.policy_model.reset_noise()
    self.target_model.reset_noise()

    # @tf.function
    def _update(state_batch, action_batch, reward_batch):
        loss_fn = tf.keras.losses.Huber(
            reduction=tf.keras.losses.Reduction.NONE)
        non_final_mask = tf.constant(tuple(
            map(lambda s: s is not None, batch.next_state)),
                                     dtype=tf.bool)
        non_final_next_states = tf.concat([[
            s if s is not None else tf.zeros_like(state_batch[0])
            for s in batch.next_state
        ]], axis=0)

        # Double DQN target: pick actions with the policy model, evaluate
        # them with the target model.
        next_state_values = tf.zeros(shape=(batch_size, ), dtype=tf.float32)
        next_state_actions = tf.argmax(
            self.policy_model(non_final_next_states), axis=1)
        next_state_values_ = tf.reduce_sum(
            self.target_model(non_final_next_states) *
            tf.one_hot(next_state_actions, self.n_actions),
            axis=1)
        next_state_values = tf.where(non_final_mask, next_state_values_,
                                     next_state_values)
        expected_state_action_values = (next_state_values *
                                        self.gamma) + reward_batch

        with tf.GradientTape() as tape:
            state_action_values = tf.reduce_sum(
                self.policy_model(state_batch, training=True) *
                tf.one_hot(action_batch, self.n_actions),
                axis=1)
            loss_batch = loss_fn(state_action_values,
                                 expected_state_action_values)
            loss = tf.reduce_mean(loss_batch * weights)
        grads = tape.gradient(loss, self.policy_model.trainable_variables)
        grads = [tf.clip_by_value(g, -1., 1.) for g in grads]
        self.optimizer.apply_gradients(
            zip(grads, self.policy_model.trainable_variables))
        return loss, loss_batch

    loss, loss_batch = _update(state_batch, action_batch, reward_batch)
    loss_batch = np.array(loss_batch)
    # Feed the per-sample TD errors back into the prioritized replay buffer.
    for index, error in zip(indices, loss_batch):
        self.memory.update(index, error)
    return np.array(loss)
def train(self):
    agent = torch.load(self.nn)
    for _ in range(self.episode):
        if len(self.memory) > self.batch_size * 5:
            for __ in range(5):
                transitions = self.memory.sample(self.batch_size)
                batch = Transition(*zip(*transitions))
                agent.update_parameters(batch)
    torch.save(agent, self.nn)
def optimize_model(q_estimator, replay_memory, optimizer,
                   batch_size=BATCH_SIZE, discount_factor=DISCOUNT_FACTOR):
    if len(replay_memory) < BATCH_SIZE:
        return
    transitions = replay_memory.sample(BATCH_SIZE)
    batch = Transition(*zip(*transitions))

    # Build the minibatch for training
    next_states_batch = torch.FloatTensor(batch.next_state).squeeze()
    states_batch = torch.FloatTensor(batch.state).squeeze()
    action_batch = torch.LongTensor(
        np.array(batch.action).reshape(BATCH_SIZE, 1))
    reward_batch = torch.FloatTensor(
        np.array(batch.reward).reshape(BATCH_SIZE, 1))

    # Q-values of the actions actually taken
    q_state_values = q_estimator.forward(states_batch).gather(1, action_batch)

    # Target values: note this evaluates the *same* actions in the next
    # states (no max over actions and no separate target network), detached
    # so gradients do not flow through the target.
    q_next_values = q_estimator.forward(next_states_batch).gather(
        1, action_batch).detach()
    discounted_future = q_next_values * DISCOUNT_FACTOR

    # Compute the expected Q values
    expected_reward_batch = discounted_future + reward_batch

    # Compute Huber loss
    loss = torch.nn.functional.smooth_l1_loss(q_state_values,
                                              expected_reward_batch)

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    # for param in q_estimator.model.parameters():
    #     param.grad.data.clamp_(-1, 1)
    optimizer.step()
    # pm.polyak_update(from_network=q_estimator, to_network=target_estimator)
    return loss.item()
def reinforce(self, s_, a_, n_s_, r_, game_over_, env_steps_):
    # Two steps: first memorize the transition, then learn from the pool.
    self.memory.remember(s_, a_, n_s_, r_, game_over_)
    transitions = self.memory.sample(self.batch_size)
    batch = Transition(*zip(*transitions))

    # Compute a mask of non-final states and concatenate the batch elements
    # (a final state would've been the one after which simulation ended)
    non_final_mask = torch.cat(batch.game_over) == False
    non_final_next_states = torch.cat(batch.next_state)[non_final_mask]
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    state_values = self.learned_act(state_batch, with_grad=True)
    state_action_values = state_values.gather(1, action_batch).squeeze(1)

    # Double DQN target: select the argmax action with the online network,
    # evaluate it with the target network.
    next_state_values = torch.zeros(self.batch_size, device=device)
    if len(non_final_next_states) > 0:
        with torch.no_grad():
            argmax_online = self.learned_act(
                non_final_next_states).argmax(1).unsqueeze(1)
            next_state_values[non_final_mask] = self.learned_act(
                non_final_next_states,
                target=True).gather(1, argmax_online).squeeze(1)
    expected_state_action_values = next_state_values * GAMMA + reward_batch

    loss = F.smooth_l1_loss(state_action_values, expected_state_action_values)

    # Optimize the model
    self.optimizer.zero_grad()
    loss.backward()
    for param in self.model.parameters():
        # Clip the gradients to avoid exploding gradients -- the clipping
        # here is much tighter than the usual [-1, 1].
        param.grad.data.clamp_(-1e-5, 1e-5)
    self.optimizer.step()

    if env_steps_ % self.target_update_interval == 0:
        soft_update(self.target_model, self.model, self.tau)
    return float(loss)
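# `soft_update` is called above but not defined in the snippet. A minimal
# sketch, assuming the usual Polyak averaging with mixing factor `tau`,
# the same update other snippets here write out inline (hypothetical helper):
def soft_update(target_model, source_model, tau):
    for target_param, source_param in zip(target_model.parameters(),
                                          source_model.parameters()):
        target_param.data.copy_(tau * source_param.data +
                                (1.0 - tau) * target_param.data)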
def update_parameters(self, batch_size, mdp_type='mdp',
                      adversary_update=False, exploration_method='mdp'):
    transitions = self.memory.sample(batch_size)
    batch = Transition(*zip(*transitions))

    if mdp_type != 'mdp':
        robust_update_type = 'full'
    elif exploration_method != 'mdp':
        robust_update_type = 'adversary'
    else:
        robust_update_type = None

    state_batch = normalize(
        Variable(torch.stack(batch.state)).to(self.device), self.obs_rms,
        self.device)
    action_batch = Variable(torch.stack(batch.action)).to(self.device)
    reward_batch = normalize(
        Variable(torch.stack(batch.reward)).to(self.device).unsqueeze(1),
        self.ret_rms, self.device)
    mask_batch = Variable(torch.stack(batch.mask)).to(self.device).unsqueeze(1)
    next_state_batch = normalize(
        Variable(torch.stack(batch.next_state)).to(self.device),
        self.obs_rms, self.device)

    if self.normalize_returns:
        reward_batch = torch.clamp(reward_batch, -self.cliprew, self.cliprew)

    value_loss = 0
    policy_loss = 0
    adversary_loss = 0
    if robust_update_type is not None:
        _value_loss, _policy_loss, _adversary_loss = self.update_robust(
            state_batch, action_batch, reward_batch, mask_batch,
            next_state_batch, adversary_update, mdp_type, robust_update_type)
        value_loss += _value_loss
        policy_loss += _policy_loss
        adversary_loss += _adversary_loss
    if robust_update_type != 'full':
        _value_loss, _policy_loss, _adversary_loss = self.update_non_robust(
            state_batch, action_batch, reward_batch, mask_batch,
            next_state_batch)
        value_loss += _value_loss
        policy_loss += _policy_loss
        adversary_loss += _adversary_loss

    self.soft_update()
    return value_loss, policy_loss, adversary_loss
def unpack_batch(self, batch):
    batch = Transition(*zip(*batch))

    states = torch.cat(batch.state).to(self.device).view(
        self.batch_size, self.n_bits)
    actions = torch.cat(batch.action).to(self.device).view((-1, 1))
    rewards = torch.cat(batch.reward).to(self.device)
    next_states = torch.cat(batch.next_state).to(self.device).view(
        self.batch_size, self.n_bits)
    dones = torch.cat(batch.done).to(self.device)
    goals = torch.cat(batch.goal).to(self.device).view(
        self.batch_size, self.n_bits)
    return states, actions, rewards, dones, next_states, goals
def optimize_model(self):
    if len(self.memory) < self.BATCH_SIZE:
        return
    transitions = self.memory.sample(self.BATCH_SIZE)
    # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
    # detailed explanation). This converts batch-array of Transitions
    # to Transition of batch-arrays.
    batch = Transition(*zip(*transitions))

    # Compute a mask of non-final states and concatenate the batch elements
    # (a final state would've been the one after which simulation ended)
    non_final_mask = torch.tensor(tuple(
        map(lambda s: s is not None, batch.next_state)),
                                  device=self.device,
                                  dtype=torch.bool)
    non_final_next_states = torch.cat(
        [s for s in batch.next_state if s is not None])
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
    # columns of actions taken. These are the actions which would've been
    # taken for each batch state according to policy_net
    state_action_values = self.policy_net(state_batch).gather(1, action_batch)

    # Compute V(s_{t+1}) for all next states.
    # Expected values of actions for non_final_next_states are computed based
    # on the "older" target_net; selecting their best reward with max(1)[0].
    # This is merged based on the mask, such that we'll have either the
    # expected state value or 0 in case the state was final.
    next_state_values = torch.zeros(self.BATCH_SIZE, device=self.device)
    next_state_values[non_final_mask] = self.next_state_action(
        non_final_next_states)

    # Compute the expected Q values
    expected_state_action_values = (next_state_values *
                                    self.GAMMA) + reward_batch

    # Compute Huber loss
    loss = self.loss(state_action_values,
                     expected_state_action_values.unsqueeze(1),
                     **self.loss_params)

    # Optimize the model
    self.optimizer.zero_grad()
    loss.backward()
    for param in self.policy_net.parameters():
        param.grad.data.clamp_(-1, 1)
    self.optimizer.step()
def unpack(self, batch):
    batch = Transition(*zip(*batch))

    states = torch.cat(batch.state).view(self.batch_size,
                                         self.n_states).to(self.device)
    rewards = torch.cat(batch.reward).view(self.batch_size, 1).to(self.device)
    dones = torch.cat(batch.done).view(self.batch_size, 1).to(self.device)
    actions = torch.cat(batch.action).view(-1, self.n_actions).to(self.device)
    next_states = torch.cat(batch.next_state).view(
        self.batch_size, self.n_states).to(self.device)
    return states, rewards, dones, actions, next_states
def update(self):
    # To update the model, we sample stored experiences as training examples.
    if len(self.memory) < self.batch_size:
        return
    transitions = self.memory.sample(self.batch_size)
    one_batch = Transition(*zip(*transitions))

    state_batch = torch.cat(one_batch.state)
    action_batch = torch.cat(one_batch.action)
    reward_batch = torch.cat(one_batch.reward)

    # Compute Q(s_t, a) with the online model.
    state_action_values = self.online_net(state_batch).gather(1, action_batch)

    non_final_mask = torch.tensor(tuple(
        map(lambda s: s is not None, one_batch.next_state)),
                                  dtype=torch.bool).cuda()
    non_final_next_states = torch.cat(
        [s for s in one_batch.next_state if s is not None])

    with torch.no_grad():
        # Compute Q(s_{t+1}, a) for all next states. Since we do not want to
        # backprop through the expected action values, torch.no_grad() stops
        # the gradient from Q(s_{t+1}, a). Actions are chosen by the online
        # net and evaluated by the target net (Double DQN).
        next_state_values = torch.zeros(self.batch_size).cuda()
        _, actions = self.online_net(non_final_next_states).max(1,
                                                                keepdim=True)
        next_state_values[non_final_mask] = self.target_net(
            non_final_next_states).gather(1, actions).view(-1).detach()

    # Expected Q values: rewards + gamma * max(Q(s_{t+1}, a)); terminal
    # states keep a value of zero through the non-final mask.
    expected_state_action_values = (next_state_values *
                                    self.GAMMA) + reward_batch

    # Temporal-difference loss
    loss = F.smooth_l1_loss(state_action_values,
                            expected_state_action_values.unsqueeze(1))

    self.optimizer.zero_grad()
    loss.backward()
    for param in self.online_net.parameters():
        param.grad.data.clamp_(-1, 1)
    self.optimizer.step()
    return loss.item()
def optimize_model(self): """ Perform one step of optimization on the neural network """ if len(self.memory) < Config.BATCH_SIZE: return transitions = self.memory.sample(Config.BATCH_SIZE) # Transpose the batch (see http://stackoverflow.com/a/19343/3343043 for detailed explanation). batch = Transition(*zip(*transitions)) # Compute a mask of non-final states and concatenate the batch elements non_final_mask = torch.tensor(tuple(map(lambda s: s is not None, batch.next_state)), device=self.device, dtype=torch.uint8) non_final_next_states = torch.cat([s for s in batch.next_state if s is not None]) state_batch = torch.cat(batch.state) action_batch = torch.cat(batch.action) reward_batch = torch.cat(batch.reward) # Compute Q(s_t, a) - the model computes Q(s_t), then we select the columns of actions taken state_action_values = self.policy_net(state_batch).gather(1, action_batch) # Compute argmax Q(s', a; θ) next_state_actions = self.policy_net(non_final_next_states).max(1)[1].detach().unsqueeze(1) # Compute Q(s', argmax Q(s', a; θ), θ-) next_state_values = torch.zeros(Config.BATCH_SIZE, device=self.device) next_state_values[non_final_mask] = self.target_net(non_final_next_states).gather(1, next_state_actions).squeeze(1).detach() # Compute the expected Q values expected_state_action_values = (next_state_values * Config.GAMMA) + reward_batch # Compute Huber loss loss = F.smooth_l1_loss(state_action_values, expected_state_action_values.unsqueeze(1)) # Optimize the model self.optimizer.zero_grad() loss.backward() for param in self.policy_net.parameters(): param.grad.data.clamp_(-1, 1) self.optimizer.step()
def forward(self, x, dqn):
    # Epsilon-greedy only during training; at eval time always follow the DQN.
    if self.training:
        eps_threshold = self.get_eps_threshold()
    else:
        eps_threshold = -1.0
    self.replays = {}
    outputs = self.start_layer(x)
    n_layers = len(self.layers)
    for i, layers, batch_norm in zip(range(n_layers), self.layers,
                                     self.batch_norms):
        actions = _get_action(dqn, outputs).cpu()
        next_outputs = []
        for j in range(len(outputs)):
            state = outputs[j]
            # With probability eps_threshold, replace the DQN's choice with
            # a random layer index.
            if random.random() <= eps_threshold:
                actions[j] = torch.tensor(random.randrange(len(layers)))
            next_state = layers[actions[j]](state.reshape([1, -1]))
            next_outputs.append(next_state)
        next_outputs = torch.stack(next_outputs).squeeze(1)
        next_outputs = batch_norm(next_outputs)
        self.replays[i] = []
        for j in range(len(outputs)):
            self.replays[i].append(
                Transition(outputs[j], actions[j], next_outputs[j], 0.))
        # Advance to the transformed activations before the next layer (the
        # original never reassigned `outputs`, which looks like an
        # oversight -- this line is an assumed fix).
        outputs = next_outputs
    outputs = self.end_layer(outputs)
    self.steps_done += 1
    return outputs
def learn(self):
    if len(self.replay_memory) < self.replay_memory.capacity:
        return
    transitions = self.replay_memory.sample(BATCH_SIZE)
    batch = Transition(*zip(*transitions))
    S = self._Var(np.stack(batch.s), torch.FloatTensor)
    A = self._Var(np.stack(batch.a), torch.FloatTensor)
    R = self._Var(np.stack(batch.r), torch.FloatTensor)
    S_ = self._Var(np.stack(batch.s_), torch.FloatTensor)

    # Use both target networks to compute the TD-target
    Q_ = self.critic_target(S_, self.actor_target(S_)).detach()
    Q_target = R + GAMMA * Q_

    # Estimated Q-value
    Q_est = self.critic.forward(S, A)

    # Optimize critic
    C_loss = self.critic_loss_fn(Q_est, Q_target)
    self.critic_optim.zero_grad()
    C_loss.backward()
    self.critic_optim.step()

    # Optimize actor
    Q = self.critic.forward(S, self.actor.forward(S))
    A_loss = -Q.mean()
    self.actor_optim.zero_grad()
    A_loss.backward()
    self.actor_optim.step()

    # Soft update on target networks
    for c, c_t, a, a_t in zip_longest(self.critic.parameters(),
                                      self.critic_target.parameters(),
                                      self.actor.parameters(),
                                      self.actor_target.parameters()):
        if c is not None:
            c_t.data = TAU * c.data + (1 - TAU) * c_t.data
        if a is not None:
            a_t.data = TAU * a.data + (1 - TAU) * a_t.data
def optimize_model(self):
    if len(self.memory) < self.batch_size:
        # Do nothing while the replay memory is still too small.
        return
    transitions = self.memory.sample(self.batch_size)
    batch = Transition(*zip(*transitions))

    # non-final states
    non_final_mask = ByteTensor(
        tuple(map(lambda s: s is not None, batch.next_state)))
    # Turn off backprop for terminal states? (volatile is legacy pre-0.4
    # PyTorch; torch.no_grad() is the modern equivalent.)
    non_final_next_states = Variable(torch.cat(
        [s for s in batch.next_state if s is not None]),
                                     volatile=True)

    # batches
    state_batch = Variable(torch.cat(batch.state))
    action_batch = Variable(torch.cat(batch.action))
    reward_batch = Variable(torch.cat(batch.reward))

    # compute Q_t(s, a)
    Qsa_values = self.model(state_batch).gather(1, action_batch)

    # compute V_{t+1}(s)
    Vs_values = Variable(torch.zeros(self.batch_size).type(Tensor))
    Vs_values[non_final_mask] = self.model(non_final_next_states).max(1)[0]
    Vs_values.volatile = False
    expected_Qsa_values = reward_batch + Vs_values * self.gamma

    # Loss
    loss = F.smooth_l1_loss(Qsa_values, expected_Qsa_values)

    # optimize
    self.optim.zero_grad()
    loss.backward()
    for param in self.model.parameters():
        param.grad.data.clamp_(-1, 1)
    self.optim.step()
def optimize(self):
    transitions = self.memory.sample(self.batch_size)
    batch = Transition(*zip(*transitions))

    non_final_mask = torch.tensor(tuple(
        map(lambda s: s is not None, batch.next_state)),
                                  device=self.device,
                                  dtype=torch.bool)
    non_final_next_states = torch.cat([
        torch.tensor(s, device=self.device, dtype=torch.float)
        for s in batch.next_state if s is not None
    ])
    state_batch = torch.cat(
        [torch.tensor(batch.state, device=self.device, dtype=torch.float)])
    action_batch = torch.cat(
        [torch.tensor(batch.action, device=self.device, dtype=torch.long)])
    reward_batch = torch.cat(
        [torch.tensor(batch.reward, device=self.device, dtype=torch.int)])

    state_action_values = self.policy_net(state_batch).gather(
        1, action_batch.unsqueeze(1))
    next_state_values = torch.zeros(self.batch_size, device=self.device)
    next_state_values[non_final_mask] = self.target_net(
        non_final_next_states.unsqueeze(1)).max(1)[0].detach()
    expected_state_action_values = (next_state_values *
                                    self.gamma) + reward_batch

    self.loss = F.smooth_l1_loss(state_action_values,
                                 expected_state_action_values.unsqueeze(1))
    self.optimizer.zero_grad()
    self.loss.backward()
    for param in self.policy_net.parameters():
        param.grad.data.clamp_(-1, 1)
    self.optimizer.step()
def optimize_model():
    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE)
    batch = Transition(*zip(*transitions))

    # Compute a mask of non-final states and concatenate the batch elements
    non_final_mask = torch.tensor(tuple(
        map(lambda s: s is not None, batch.next_state)), dtype=torch.bool)
    non_final_next_states = torch.cat(
        [s for s in batch.next_state if s is not None])
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward).type(torch.FloatTensor)

    # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
    # columns of actions taken
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    # Compute V(s_{t+1}) for all next states.
    next_state_values = torch.zeros(BATCH_SIZE)
    next_state_values[non_final_mask] = target_net(
        non_final_next_states).max(1)[0].detach()

    # Compute the expected Q values
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch

    # Compute Huber loss
    loss = F.smooth_l1_loss(state_action_values,
                            expected_state_action_values.unsqueeze(1))

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    for param in policy_net.parameters():
        param.grad.data.clamp_(-1, 1)
    optimizer.step()
def update(self):
    if len(self.replay_memory.memory) > self.min_replay_size:
        transitions = self.replay_memory.sample(self.batch_size)
        states, actions, next_states, rewards, is_terminal_list = \
            Transition(*zip(*transitions))
        # Encode each of the actions as a one-hot vector
        actions = self.encode_action(actions)

        # Reshape the arrays to be input into the model
        states = np.asarray(states, dtype=np.float32)
        states = np.reshape(states, (self.batch_size, 8))
        next_states = np.asarray(next_states, dtype=np.float32)
        next_states = np.reshape(next_states, (self.batch_size, 8))
        is_terminal_list = np.array(is_terminal_list)
        rewards = np.array(rewards)

        # Per-sample max over the next-state Q-values from the target network
        target = rewards + (1 - is_terminal_list) * self.gamma * np.max(
            self.sess.run(self.target_network,
                          feed_dict={self.observation_input: next_states}),
            axis=1)
        self.sess.run(self.update_op,
                      feed_dict={
                          self.observation_input: states,
                          self.action_input: actions,
                          self.target_q_val: target
                      })
def train(self, num_episodes, num_epochs, max_timesteps, render=False):
    timestep = 0
    for i_episode in range(1, num_episodes + 1):
        state = self.env.reset()
        running_reward = 0
        for i_timestep in range(max_timesteps):
            timestep += 1

            # compute action
            state = torch.FloatTensor(state.reshape(1, -1)).to(self.device)
            prev_state = state
            with torch.no_grad():
                action, action_log_prob = self.policy_net.act(state)
            state, reward, done, _ = self.env.step(action.cpu().numpy())
            running_reward += reward
            transition = Transition(prev_state, action, reward,
                                    action_log_prob, done)
            self.memory.push(transition)

            # Update policy network
            if timestep % self.update_timestep == 0:
                self.ppo_update(num_epochs)
                print("Policy updated")
                self.memory.clear()
                timestep = 0
            if render:
                self.env.render()
            if done:
                break
        print('Episode {} Done, \t length: {} \t reward: {}'.format(
            i_episode, i_timestep, running_reward))
        self.reward_log.append(int(running_reward))
        self.time_log.append(i_timestep)
def step(action):
    """Perform the action and return the resulting transition.

    action = x.x seconds
    """
    global last_score, state
    # 350 ms ~ 950 ms
    press_time = (action[0] + 1) * 300 + 350
    x1, y1, x2, y2 = get_press_position()
    jump(press_time, x1, y1, x2, y2)
    time.sleep(3.5)
    pull_screenshot('autojump.png')
    last_state = state
    state = preprocess(Image.open('autojump.png')).unsqueeze(0)

    # Game over
    if restart('autojump.png'):
        reward = 0
        last_score = 0
        mask = 0
        init_state()
    else:
        score = get_score('autojump.png')
        reward = 2 if score - last_score >= 2 else 1
        last_score = score
        mask = 1
    print("Press Time: {} ms, Mask: {}, Reward: {}".format(
        press_time, mask, reward))
    return Transition(state=torch.Tensor(last_state),
                      action=torch.Tensor(action),
                      mask=torch.Tensor([mask]),
                      next_state=torch.Tensor(state),
                      reward=torch.Tensor([reward]))
    action = torch.Tensor(action)
    mask = torch.Tensor([not done])
    next_state = torch.Tensor([next_state])
    reward = torch.Tensor([reward])

    # if i_episode % 10 == 0:
    #     env.render()

    memory.push(state, action, mask, next_state, reward)  # line 10

    state = next_state

    if len(memory) > args.batch_size * 5:
        for _ in range(args.updates_per_step):
            transitions = memory.sample(args.batch_size)  # line 11
            batch = Transition(*zip(*transitions))
            agent.update_parameters(batch)

    if done:
        break

rewards.append(episode_reward)

'''
############### Synchronization ###############
'''
# Every 10 episodes, replace the weakest member of the evolutionary
# population with a copy of the current agent.
if i_episode % 10 == 0:
    weakest_in_pop_index = evo.population.index(
        min(evo.population, key=attrgetter('fitness')))
    evo.population[weakest_in_pop_index] = copy.deepcopy(agent)
def main():
    global subdata
    t_start = time.time()

    parser = argparse.ArgumentParser(description='PyTorch X-job')
    parser.add_argument('--env_name', default="OurEnv-v0",
                        help='name of the environment')
    parser.add_argument('--gamma', type=float, default=0.99, metavar='G',
                        help='discount factor for reward (default: 0.99)')
    parser.add_argument('--tau', type=float, default=0.001,
                        help='discount factor for model (default: 0.001)')
    parser.add_argument('--ou_noise', type=bool, default=True)
    parser.add_argument('--noise_scale', type=float, default=0.4, metavar='G',
                        help='initial noise scale (default: 0.4)')
    parser.add_argument('--final_noise_scale', type=float, default=0.3,
                        metavar='G',
                        help='final noise scale (default: 0.3)')
    parser.add_argument('--exploration_end', type=int, default=33,
                        metavar='N',
                        help='number of episodes with noise (default: 33)')
    parser.add_argument('--seed', type=int, default=4, metavar='N',
                        help='random seed (default: 4)')
    parser.add_argument('--batch_size', type=int, default=512, metavar='N',
                        help='batch size (default: 512)')
    parser.add_argument('--num_steps', type=int, default=300, metavar='N',
                        help='max episode length (default: 300)')
    parser.add_argument('--num_episodes', type=int, default=50, metavar='N',
                        help='number of episodes (default: 50)')
    parser.add_argument('--hidden_size', type=int, default=128, metavar='N',
                        help='hidden size (default: 128)')
    parser.add_argument('--replay_size', type=int, default=1000000,
                        metavar='N',
                        help='size of replay buffer (default: 1000000)')
    parser.add_argument('--save_agent', type=bool, default=True,
                        help='save model to file')
    parser.add_argument('--load_agent', type=bool, default=False,
                        help='load model from file')
    parser.add_argument('--train_model', type=bool, default=True,
                        help='Training or run')
    parser.add_argument('--load_exp', type=bool, default=False,
                        help='load saved experience')
    parser.add_argument('--state_plot', type=bool, default=True,
                        help='plot Q values for environment')
    parser.add_argument('--greedy_steps', type=int, default=5, metavar='N',
                        help='amount of times greedy goes (default: 5)')
    args = parser.parse_args()

    #env = gym.make(args.env_name)
    env = Env()
    #env.seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    # -- initialize agent, Q and Q' --
    agent = NAF(args.gamma, args.tau, args.hidden_size,
                env.observation_space.shape[0], env.action_space)

    # -- declare memory buffer and random process N --
    memory = ReplayMemory(args.replay_size)
    memory_g = ReplayMemory(args.replay_size)
    ounoise = OUNoise(env.action_space.shape[0]) if args.ou_noise else None

    # -- load existing model --
    if args.load_agent:
        agent.load_model(args.env_name, args.batch_size, '.pth')
        print("agent: naf_{}_{}_{}, is loaded".format(
            args.env_name, args.batch_size, '.pth'))

    # -- load experience buffer --
    if args.load_exp:
        with open('/home/aass/catkin_workspace/src/panda_demos/exp_replay.pk1',
                  'rb') as input:
            memory.memory = pickle.load(input)
            memory.position = len(memory)

    #sate_Q_plot(agent, 50)

    rewards = []
    total_numsteps = 0
    greedy_reward = []
    avg_greedy_reward = []
    upper_reward = []
    lower_reward = []
    steps_to_goal = []
    avg_steps_to_goal = []
    state_plot = []

    sim_reset_start()
    pub = rospy.Publisher('/ee_rl/act', DesiredErrorDynamicsMsg, queue_size=10)
    rospy.Subscriber("/ee_rl/state", StateMsg, callback)
    rate = rospy.Rate(9)
    rate.sleep()

    for i_episode in range(args.num_episodes + 1):
        # -- reset environment for every episode --
        sim_reset()
        state = torch.Tensor(subdata).unsqueeze(0)

        # -- initialize noise (random process N) --
        if args.ou_noise:
            # Linearly anneal the exploration noise down to final_noise_scale
            # over the first exploration_end episodes.
            ounoise.scale = (args.noise_scale - args.final_noise_scale) * max(
                0, args.exploration_end -
                i_episode) / args.exploration_end + args.final_noise_scale
            ounoise.reset()

        episode_reward = 0
        while True:
            # -- action selection, observation and store transition --
            action = agent.select_action(state, ounoise) \
                if args.train_model else agent.select_action(state)
            a = action.numpy()[0] * 50
            act_pub = [a[0], a[1]]
            pub.publish(act_pub)
            next_state = torch.Tensor(subdata).unsqueeze(0)
            reward, done, _ = env.calc_shaped_reward(next_state)
            total_numsteps += 1
            episode_reward += reward

            action = torch.Tensor(action)
            mask = torch.Tensor([not done])
            reward = torch.Tensor([reward])

            memory.push(state, action, mask, next_state, reward)

            # if done:
            #     for i in range(total_numsteps % args.num_steps):
            #         a = i + 1
            #         memory_g.memory.append(memory.memory[-a])
            #         memory_g.position += 1

            state = next_state

            # -- training --
            # if len(memory_g) > args.batch_size / 2 and \
            #         len(memory) > args.batch_size / 2 and args.train_model:
            #     for _ in range(10):
            #         transitions_b = memory.sample(args.batch_size / 2)
            #         transitions_g = memory_g.sample(args.batch_size / 2)
            #         for i in range(transitions_g):
            #             transitions_b.append(transitions_g[i])
            #         batch = Transition(*zip(*transitions_b))
            #         agent.update_parameters(batch)

            if len(memory) > args.batch_size and args.train_model:
                for _ in range(10):
                    transitions = memory.sample(args.batch_size)
                    batch = Transition(*zip(*transitions))
                    agent.update_parameters(batch)
            else:
                time.sleep(0.1)
            rate.sleep()

            if done or total_numsteps % args.num_steps == 0:
                break

        pub.publish([0, 0])
        rewards.append(episode_reward)

        # -- plot Q value --
        if i_episode % 10 == 0:
            sate_Q_plot(agent, i_episode)

        # -- save model --
        if args.save_agent:
            agent.save_model(args.env_name, args.batch_size, i_episode, '.pth')
            with open('exp_replay.pk1', 'wb') as output:
                pickle.dump(memory.memory, output, pickle.HIGHEST_PROTOCOL)
            #with open('exp_replay_g.pk1', 'wb') as output:
            #    pickle.dump(memory_g.memory, output, pickle.HIGHEST_PROTOCOL)

        if args.train_model:
            greedy_episode = max(args.num_episodes // 100, 5)
        else:
            greedy_episode = 10
        greedy_range = min(args.greedy_steps, greedy_episode)

        # -- evaluate episodes without noise --
        if i_episode % greedy_episode == 0 and not i_episode == 0:
            for _ in range(0, greedy_range + 1):
                # -- reset environment for every episode --
                sim_reset()
                state_visited = []
                action_taken = []
                print("Greedy episode ongoing")
                state = torch.Tensor(subdata).unsqueeze(0)
                episode_reward = 0
                steps = 0
                state_plot.append([])
                st = state.numpy()[0]
                sta = [st[0], st[1]]
                state_plot[_].append(sta)
                while True:
                    action = agent.select_action(state)
                    a = action.numpy()[0] * 50
                    act_pub = [a[0], a[1]]
                    pub.publish(act_pub)
                    next_state = torch.Tensor(subdata).unsqueeze(0)
                    reward, done, obs_hit = env.calc_shaped_reward(next_state)
                    episode_reward += reward
                    state_visited.append(state)
                    action_taken.append(action)
                    state = next_state
                    steps += 1
                    if done or steps == args.num_steps:
                        greedy_reward.append(episode_reward)
                        break
                    rate.sleep()
                if obs_hit:
                    steps = 300
                steps_to_goal.append(steps)
                # -- plot path --
                if i_episode % 10 == 0:
                    agent.plot_path(state_visited, action_taken, i_episode)

            upper_reward.append(np.max(greedy_reward[-greedy_range:]))
            lower_reward.append(np.min(greedy_reward[-greedy_range:]))
            avg_greedy_reward.append(np.mean(greedy_reward[-greedy_range:]))
            avg_steps_to_goal.append(np.mean(steps_to_goal[-greedy_range:]))
            print("Episode: {}, total numsteps: {}, avg_greedy_reward: {}, "
                  "average reward: {}".format(
                      i_episode, total_numsteps, avg_greedy_reward[-1],
                      np.mean(rewards[-greedy_episode:])))

    # -- save model --
    if args.save_agent:
        agent.save_model(args.env_name, args.batch_size, i_episode, '.pth')
        with open('exp_replay.pk1', 'wb') as output:
            pickle.dump(memory.memory, output, pickle.HIGHEST_PROTOCOL)
        #with open('exp_replay_g.pk1', 'wb') as output:
        #    pickle.dump(memory_g.memory, output, pickle.HIGHEST_PROTOCOL)

    print('Training ended after {} minutes'.format(
        (time.time() - t_start) / 60))
    print('Time per ep : {} s'.format(
        (time.time() - t_start) / args.num_episodes))
    print('Mean greedy reward: {}'.format(np.mean(greedy_reward)))
    print('Mean reward: {}'.format(np.mean(rewards)))
    print('Max reward: {}'.format(np.max(rewards)))
    print('Min reward: {}'.format(np.min(rewards)))

    # -- plot learning curve --
    pos_greedy = []
    for pos in range(0, len(lower_reward)):
        pos_greedy.append(pos * greedy_episode)

    plt.title('Greedy policy outcome')
    plt.fill_between(pos_greedy, lower_reward, upper_reward,
                     facecolor='red', alpha=0.3)
    plt.plot(pos_greedy, avg_greedy_reward, 'r')
    plt.xlabel('Number of episodes')
    plt.ylabel('Rewards')
    fname1 = 'plot1_obs_{}_{}_{}'.format(args.env_name, args.batch_size,
                                         '.png')
    plt.savefig(fname1)
    plt.close()

    plt.title('Steps to reach goal')
    plt.plot(steps_to_goal)
    plt.ylabel('Number of steps')
    plt.xlabel('Number of episodes')
    fname2 = 'plot2_obs_{}_{}_{}'.format(args.env_name, args.batch_size,
                                         '.png')
    plt.savefig(fname2)
    plt.close()