def episode_finished(self, episode_number):
    all_actions = torch.stack(self.actions, dim=0).to(self.train_device).squeeze(-1)
    all_rewards = torch.stack(self.rewards, dim=0).to(self.train_device).squeeze(-1)
    all_values = torch.stack(self.values, dim=0).to(self.train_device).squeeze(-1)
    self.values, self.actions, self.rewards = [], [], []

    discounted_rewards = discount_rewards(all_rewards, self.gamma)
    error = discounted_rewards - all_values
    error -= torch.mean(error)
    error /= torch.std(error.detach())

    self.optimizer_p.zero_grad()
    self.optimizer_v.zero_grad()
    p_loss = (error.detach() * all_actions).sum()
    c_loss = error.pow(2).mean()
    p_loss.backward()
    c_loss.backward()
    self.optimizer_p.step()
    self.optimizer_v.step()
    self.episode_number += 1
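# Every snippet in this section calls a discount_rewards() helper defined elsewhere in
# the respective projects. For reference only, a minimal sketch of the usual definition
# follows; this is an assumption about that helper, not the original code, and some
# projects pass extra arguments (e.g. an environment or a bootstrap value).
import torch

def discount_rewards(r, gamma):
    """Return discounted returns G_t = sum_k gamma^k * r_{t+k} for every timestep t."""
    discounted = torch.zeros_like(r)
    running_sum = 0.0
    # Walk the episode backwards, accumulating the discounted tail of the reward sequence.
    for t in reversed(range(len(r))):
        running_sum = r[t] + gamma * running_sum
        discounted[t] = running_sum
    return discounted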
def episode_finished(self, episode_number):
    action_probs = torch.stack(self.action_probs, dim=0) \
        .to(self.train_device).squeeze(-1)
    rewards = torch.stack(self.rewards, dim=0).to(self.train_device).squeeze(-1)
    self.states, self.action_probs, self.rewards = [], [], []

    # TODO: Compute discounted rewards (use the discount_rewards function)
    discounted_rewards = discount_rewards(rewards, self.gamma)
    # Task 1.3: normalize the discounted rewards
    discounted_rewards -= torch.mean(discounted_rewards)
    discounted_rewards /= torch.std(discounted_rewards)

    # TODO: Compute critic loss and advantages (T3)
    # TODO: Compute the optimization term (T1, T3)
    T = len(rewards)
    gammas = torch.tensor([self.gamma**t for t in range(T)]).to(self.train_device)
    # baseline = 20 (Task 1b)
    # optim = -gammas * (discounted_rewards - 20) * action_probs
    optim = -gammas * discounted_rewards * action_probs
    loss = optim.sum()
    loss.backward()

    # TODO: Compute the gradients of loss w.r.t. network parameters (T1)
    # TODO: Update network parameters using self.optimizer and zero gradients (T1)
    self.optimizer.step()
    self.optimizer.zero_grad()
def episode_finished(self, episode_number):
    action_probs = torch.stack(self.action_probs, dim=0) \
        .to(self.train_device).squeeze(-1)
    rewards = torch.stack(self.rewards, dim=0).to(self.train_device).squeeze(-1)
    state_values = torch.stack(self.state_values, dim=0).to(self.train_device).squeeze(-1)
    self.states, self.action_probs, self.rewards, self.state_values = [], [], [], []

    # Compute discounted rewards (use the discount_rewards function)
    discounted_rewards = discount_rewards(rewards, self.gamma)
    discounted_rewards = (discounted_rewards - torch.mean(discounted_rewards)) / torch.std(
        discounted_rewards)  # T1c

    # Compute critic loss and advantages (T3)
    loss = 0
    for log_prob, value, reward in zip(action_probs, state_values, discounted_rewards):
        advantage = reward - value.item()
        policy_loss = -advantage * log_prob
        value_loss = F.smooth_l1_loss(value, reward)  # using smooth L1 loss
        loss += (policy_loss + value_loss)

    # Compute the optimization term (T1, T3)
    # loss = policy_losses.sum() + value_losses.sum()

    # Compute the gradients of loss w.r.t. network parameters (T1)
    loss.backward()

    # Update network parameters using self.optimizer and zero gradients (T1)
    self.optimizer.step()
    self.optimizer.zero_grad()
def episode_finished(self, episode_number):
    # Task 2a: update sigma of the policy exponentially decreasingly.
    # self.policy.update_sigma_exponentially(episode_number + 1)
    action_probs = torch.stack(self.action_probs, dim=0) \
        .to(self.train_device).squeeze(-1)
    rewards = torch.stack(self.rewards, dim=0).to(self.train_device).squeeze(-1)
    self.states, self.action_probs, self.rewards = [], [], []

    # DONE: Compute discounted rewards (use the discount_rewards function)
    discounted_rewards = discount_rewards(rewards, self.gamma)
    # Task 1c: normalize the discounted rewards
    discounted_rewards -= torch.mean(discounted_rewards)
    discounted_rewards /= torch.std(discounted_rewards)

    # DONE: Compute the optimization term (T1)
    # Task 1a
    baseline = 0
    # Task 1b
    # baseline = 20
    weighted_probs = -action_probs * (discounted_rewards - baseline)

    # DONE: Compute the gradients of loss w.r.t. network parameters (T1)
    loss = torch.mean(weighted_probs)
    loss.backward()

    # DONE: Update network parameters using self.optimizer and zero gradients (T1)
    self.optimizer.step()
    self.optimizer.zero_grad()
def episode_finished(self):
    action_probs = torch.stack(self.action_probs, dim=0) \
        .to(self.train_device).squeeze(-1)
    rewards = torch.stack(self.rewards, dim=0).to(self.train_device).squeeze(-1)
    self.states, self.action_probs, self.rewards = [], [], []

    # TODO: Compute discounted rewards (use the discount_rewards function)
    discounted_rewards = discount_rewards(rewards, self.gamma)  # computing the discounted reward
    # Normalize the discounted rewards (task 1.c)
    discounted_rewards -= torch.mean(discounted_rewards)
    discounted_rewards /= torch.std(discounted_rewards)

    # TODO: Compute critic loss and advantages (T3)
    # TODO: Compute the optimization term (T1, T3)
    # weighted_probs = -action_probs * (discounted_rewards - 20)  # with baseline (task 1.b)
    weighted_probs = -action_probs * discounted_rewards  # without baseline

    # TODO: Compute the gradients of loss w.r.t. network parameters (T1)
    # Compute the loss
    loss = torch.mean(weighted_probs)
    loss.backward()

    # TODO: Update network parameters using self.optimizer and zero gradients (T1)
    self.optimizer.step()
    self.optimizer.zero_grad()
def episode_finished(self, episode_number):
    action_probs = torch.stack(self.action_probs, dim=0) \
        .to(self.train_device).squeeze(-1)
    rewards = torch.stack(self.rewards, dim=0).to(self.train_device).squeeze(-1)
    self.states, self.action_probs, self.rewards = [], [], []

    # b = 20  # T1b baseline
    # T2a
    # self.policy.sigma = self.policy.sigma_init * np.exp(-0.0005 * episode_number)

    # DONE: Compute discounted rewards (use the discount_rewards function)
    discounted_rewards = discount_rewards(rewards, self.gamma)
    discounted_rewards -= torch.mean(discounted_rewards)  # T1c
    discounted_rewards /= torch.std(discounted_rewards)  # T1c

    weighted_probs = -action_probs * discounted_rewards  # T1a, T1c, T2
    # weighted_probs = -action_probs * (discounted_rewards - b)  # T1b

    # DONE: Compute the optimization term (T1)
    loss = torch.mean(weighted_probs)

    # DONE: Compute the gradients of loss w.r.t. network parameters (T1)
    loss.backward()

    # DONE: Update network parameters using self.optimizer and zero gradients (T1)
    self.optimizer.step()
    self.optimizer.zero_grad()
def __init__(self, sess, state_dim, action_dim, learning_rate):
    self.sess = sess
    self.s_dim = state_dim
    self.a_dim = action_dim
    self.learning_rate = learning_rate

    # Actor network
    self.inputs, self.out = self.create_actor_network()
    network_params = tf.trainable_variables()

    # These returns will be provided by the discounted rewards
    self._current_val = tf.placeholder("float", [None, 1], name='current_val')
    self._returns = tf.placeholder("float", [None, 1], name='returns')
    self.actions = tf.placeholder("float", [None, self.a_dim], name='actions')
    self._discounted_returns = utils.discount_rewards(self._returns)

    optimizer = tf.train.AdamOptimizer(self.learning_rate)
    self.action_prob = tf.reduce_sum(self.actions * self.out, reduction_indices=1)
    self.loss = -tf.log(self.action_prob) * (self._discounted_returns - self._current_val)
    # self.optimize = optimizer.minimize(self.loss)
    grads_and_vars = optimizer.compute_gradients(self.loss, network_params)
    self.optimize = optimizer.apply_gradients(grads_and_vars)
def episode_finished(self, episode_number):
    action_probs = torch.stack(self.action_probs, dim=0) \
        .to(self.train_device).squeeze(-1)
    rewards = torch.stack(self.rewards, dim=0).to(self.train_device).squeeze(-1)
    self.states, self.action_probs, self.rewards = [], [], []

    # TODO: Update policy variance (T2) -- DONE
    c = 5e-4
    # self.variance = self.policy.sigma * np.exp(-c * episode_number)  # exponentially decaying variance

    # TODO: Compute discounted rewards (use the discount_rewards function) -- DONE
    rewards = discount_rewards(rewards, gamma=self.gamma)
    rewards = (rewards - torch.mean(rewards)) / torch.std(rewards)  # REINFORCE with normalized rewards

    # TODO: Compute critic loss and advantages (T3)
    # TODO: Compute the optimization term (T1, T3) -- DONE
    loss = torch.sum(-rewards * action_probs)  # REINFORCE
    # loss = torch.sum(-(rewards - self.baseline) * action_probs)  # REINFORCE with baseline

    # TODO: Compute the gradients of loss w.r.t. network parameters (T1) -- DONE
    loss.backward()

    # TODO: Update network parameters using self.optimizer and zero gradients (T1) -- DONE
    self.optimizer.step()
    self.optimizer.zero_grad()
def learn(self):
    """Run learning algorithm"""
    reporter = Reporter()
    config = self.config
    total_n_trajectories = np.zeros(len(self.envs))
    for iteration in range(config["n_iter"]):
        self.session.run([self.reset_accum_grads])
        for i, learner in enumerate(self.task_learners):
            # Collect trajectories until we get timesteps_per_batch total timesteps
            trajectories = learner.get_trajectories()
            total_n_trajectories[i] += len(trajectories)
            all_state = np.concatenate(
                [trajectory["state"] for trajectory in trajectories])
            # Compute discounted sums of rewards
            rets = [
                discount_rewards(trajectory["reward"], config["gamma"])
                for trajectory in trajectories
            ]
            max_len = max(len(ret) for ret in rets)
            padded_rets = [
                np.concatenate([ret, np.zeros(max_len - len(ret))]) for ret in rets
            ]
            # Compute time-dependent baseline
            baseline = np.mean(padded_rets, axis=0)
            # Compute advantage function
            advs = [ret - baseline[:len(ret)] for ret in rets]
            all_action = np.concatenate(
                [trajectory["action"] for trajectory in trajectories])
            all_adv = np.concatenate(advs)
            # Do policy gradient update step
            episode_rewards = np.array([
                trajectory["reward"].sum() for trajectory in trajectories
            ])  # episode total rewards
            episode_lengths = np.array([
                len(trajectory["reward"]) for trajectory in trajectories
            ])  # episode lengths
            self.session.run(
                [self.add_accum_grads[i]],
                feed_dict={
                    self.state: all_state,
                    self.action_taken: all_action,
                    self.advantage: all_adv
                })
            # summary = self.session.run([self.master.summary_op], feed_dict={
            #     self.reward: reward
            #     # self.master.episode_length: trajectory["steps"]
            # })
            # self.writer.add_summary(summary[0], iteration)
            # self.writer.flush()
            print("Task:", i)
            reporter.print_iteration_stats(iteration, episode_rewards,
                                           episode_lengths, total_n_trajectories[i])
        # Apply the accumulated gradients after the gradients of all tasks are summed
        self.session.run([self.apply_gradients])
def rewards_discounted(self):
    """Compute the discounted reward backwards through time."""
    reward_his = discount_rewards(self.rewards)
    # Standardize the rewards to be unit normal
    # (helps control the gradient estimator variance)
    reward_his -= np.mean(reward_his)
    tmp = np.std(reward_his)
    if tmp > 0:  # avoid division by zero
        reward_his /= tmp
    return reward_his
def run(self):
    # Assume global shared parameter vectors θ and θv and a global shared counter T = 0
    # Assume thread-specific parameter vectors θ' and θ'v
    sess = self.master.session
    t = 1  # thread step counter
    while self.master.T < self.master.config['T_max'] and not self.master.stop_requested:
        # Reset gradients: dθ = 0 and dθv = 0
        sess.run([self.actor_reset_ag, self.critic_reset_ag])
        # Synchronize thread-specific parameters θ' = θ and θ'v = θv
        sess.run([self.actor_sync_net, self.critic_sync_net])
        trajectory = self.get_trajectory(self.master.config['episode_max_length'])
        reward = sum(trajectory['reward'])
        trajectory['reward'][-1] = 0 if trajectory['done'] else self.get_critic_value(
            trajectory['state'][None, -1])[0]
        returns = discount_rewards(trajectory['reward'], self.master.config['gamma'])
        fetches = [
            self.actor_net.summary_loss,
            self.critic_net.summary_loss,
            self.actor_add_ag,
            self.critic_add_ag,
            self.master.global_step  # What does the master global step thing do?
        ]
        ac_net = self.actor_net
        cr_net = self.critic_net
        qw_new = self.master.session.run(
            [cr_net.value], feed_dict={cr_net.state: trajectory['state']})[0].flatten()
        # Transform actions back to the output shape of the actor network
        # (e.g. one-hot for a discrete action space)
        all_action = self.transform_actions(trajectory['action'])
        results = sess.run(
            fetches,
            feed_dict={
                ac_net.state: trajectory["state"],
                cr_net.state: trajectory["state"],
                ac_net.actions_taken: all_action,
                ac_net.critic_feedback: qw_new,
                ac_net.critic_rewards: returns,
                cr_net.target: returns.reshape(-1, 1)
            })
        summary = sess.run(
            [self.master.summary_op],
            feed_dict={
                self.master.actor_loss: results[0],
                self.master.critic_loss: results[1],
                self.master.reward: reward,
                self.master.episode_length: trajectory["steps"]
            })
        self.writer.add_summary(summary[0], t)
        self.writer.flush()
        sess.run([self.apply_actor_gradients, self.apply_critic_gradients])
        t += 1
        self.master.T += trajectory['steps']
def learn_REINFORCE(self):
    """Learn using updates like in the REINFORCE algorithm."""
    reporter = Reporter()
    config = self.master.config
    total_n_trajectories = 0
    iteration = 0
    while iteration < config["n_iter"] and not self.master.stop_requested:
        iteration += 1
        self.master.session.run([self.master.reset_accum_grads])
        # Collect trajectories until we get timesteps_per_batch total timesteps
        trajectories = self.task_learner.get_trajectories()
        total_n_trajectories += len(trajectories)
        all_state = np.concatenate(
            [trajectory["state"] for trajectory in trajectories])
        # Compute discounted sums of rewards
        rets = [
            discount_rewards(trajectory["reward"], config["gamma"])
            for trajectory in trajectories
        ]
        max_len = max(len(ret) for ret in rets)
        padded_rets = [
            np.concatenate([ret, np.zeros(max_len - len(ret))]) for ret in rets
        ]
        # Compute time-dependent baseline
        baseline = np.mean(padded_rets, axis=0)
        # Compute advantage function
        advs = [ret - baseline[:len(ret)] for ret in rets]
        all_action = np.concatenate(
            [trajectory["action"] for trajectory in trajectories])
        all_adv = np.concatenate(advs)
        # Do policy gradient update step
        episode_rewards = np.array([
            trajectory["reward"].sum() for trajectory in trajectories
        ])  # episode total rewards
        episode_lengths = np.array([
            len(trajectory["reward"]) for trajectory in trajectories
        ])  # episode lengths
        self.master.session.run(
            [self.add_accum_grad],
            feed_dict={
                self.master.state: all_state,
                self.master.action_taken: all_action,
                self.master.advantage: all_adv
            })
        print("Task:", self.thread_id)
        reporter.print_iteration_stats(iteration, episode_rewards,
                                       episode_lengths, total_n_trajectories)
        self.master.session.run([self.master.apply_gradients])
def learn(self):
    """Run learning algorithm"""
    reporter = Reporter()
    config = self.config
    total_n_trajectories = 0
    for iteration in range(config["n_iter"]):
        # Collect trajectories until we get timesteps_per_batch total timesteps
        trajectories = self.get_trajectories()
        total_n_trajectories += len(trajectories)
        all_state = np.concatenate(
            [trajectory["state"] for trajectory in trajectories])
        # Compute discounted sums of rewards
        rets = [
            discount_rewards(trajectory["reward"], config["gamma"])
            for trajectory in trajectories
        ]
        max_len = max(len(ret) for ret in rets)
        padded_rets = [
            np.concatenate([ret, np.zeros(max_len - len(ret))]) for ret in rets
        ]
        # Compute time-dependent baseline
        baseline = np.mean(padded_rets, axis=0)
        # Compute advantage function
        advs = [ret - baseline[:len(ret)] for ret in rets]
        all_action = np.concatenate(
            [trajectory["action"] for trajectory in trajectories])
        all_adv = np.concatenate(advs)
        # Do policy gradient update step
        episode_rewards = np.array([
            trajectory["reward"].sum() for trajectory in trajectories
        ])  # episode total rewards
        episode_lengths = np.array([
            len(trajectory["reward"]) for trajectory in trajectories
        ])  # episode lengths
        result = self.session.run(
            [self.summary_op, self.train],
            feed_dict={
                self.state: all_state,
                self.a_n: all_action,
                self.adv_n: all_adv,
                self.episode_lengths: np.mean(episode_lengths),
                self.rewards: np.mean(episode_rewards)
            })
        self.writer.add_summary(result[0], iteration)
        self.writer.flush()
        reporter.print_iteration_stats(iteration, episode_rewards,
                                       episode_lengths, total_n_trajectories)
def learn(self, env):
    reporter = Reporter()
    self.session.run([self.reset_accumulative_grads])
    iteration = 0  # amount of batches processed
    episode_nr = 0
    episode_lengths = np.zeros(self.config['batch_size'])
    episode_rewards = np.zeros(self.config['batch_size'])
    mean_rewards = []
    while True:  # Keep executing episodes
        trajectory = self.get_trajectory(env, self.config["episode_max_length"])
        episode_rewards[episode_nr % self.config['batch_size']] = sum(trajectory['reward'])
        episode_lengths[episode_nr % self.config['batch_size']] = len(trajectory['reward'])
        episode_nr += 1
        action_taken = (np.arange(self.nA) == trajectory['action'][:, None]).astype(
            np.float32)  # one-hot encoding
        discounted_episode_rewards = discount_rewards(
            trajectory['reward'], self.config['gamma'])
        # standardize
        discounted_episode_rewards -= np.mean(discounted_episode_rewards)
        std = np.std(discounted_episode_rewards)
        std = std if std > 0 else 1
        discounted_episode_rewards /= std
        feedback = np.reshape(
            np.repeat(discounted_episode_rewards, self.nA),
            (len(discounted_episode_rewards), self.nA))
        self.session.run(
            [self.accumulate_grads],
            feed_dict={
                self.state: trajectory["state"],
                self.action_taken: action_taken,
                self.feedback: feedback
            })
        if episode_nr % self.config['batch_size'] == 0:  # batch is done
            iteration += 1
            self.session.run([self.apply_gradients])
            self.session.run([self.reset_accumulative_grads])
            reporter.print_iteration_stats(iteration, episode_rewards,
                                           episode_lengths, episode_nr)
            mean_rewards.append(episode_rewards.mean())
            if episode_nr % self.config['draw_frequency'] == 0:
                reporter.draw_rewards(mean_rewards)
def episode_finished(self, episode_number):
    all_actions = torch.stack(self.actions, dim=0).to(self.train_device).squeeze(-1)
    all_rewards = torch.stack(self.rewards, dim=0).to(self.train_device).squeeze(-1)
    self.observations, self.actions, self.rewards = [], [], []

    discounted_rewards = discount_rewards(all_rewards, self.gamma)
    discounted_rewards -= torch.mean(discounted_rewards)
    discounted_rewards /= torch.std(discounted_rewards)

    weighted_probs = all_actions * discounted_rewards
    loss = torch.mean(weighted_probs)
    loss.backward()

    if (episode_number + 1) % self.batch_size == 0:
        self.update_policy()
def learn(self):
    reporter = Reporter()
    gradient1 = np.zeros_like(self.w1)
    gradient2 = np.zeros_like(self.w2)
    rmsprop1 = np.zeros_like(self.w1)
    rmsprop2 = np.zeros_like(self.w2)
    iteration = 0  # amount of batches processed
    episode_nr = 0
    episode_lengths = np.zeros(self.config['batch_size'])
    episode_rewards = np.zeros(self.config['batch_size'])
    mean_rewards = []
    while True:  # Keep executing episodes
        trajectory = self.get_trajectory(self.config["episode_max_length"])
        episode_rewards[episode_nr % self.config['batch_size']] = sum(trajectory['reward'])
        episode_lengths[episode_nr % self.config['batch_size']] = len(trajectory['reward'])
        episode_nr += 1
        action_taken = (np.arange(self.nA) == trajectory['action'][:, None]).astype(
            np.float32)  # one-hot encoding
        epdlogp = action_taken - trajectory['prob']
        # episode_states = np.vstack(encountered_states)
        discounted_episode_rewards = discount_rewards(trajectory['reward'], self.config['gamma'])
        # print(discounted_episode_rewards)
        # standardize
        discounted_episode_rewards -= np.mean(discounted_episode_rewards)
        discounted_episode_rewards /= np.std(discounted_episode_rewards)
        epdlogp *= np.reshape(np.repeat(discounted_episode_rewards, self.nA),
                              (len(discounted_episode_rewards), self.nA))
        change_w1, change_w2 = self.backward_step(trajectory['state'], trajectory['x1'], epdlogp)
        gradient1 += change_w1
        gradient2 += change_w2
        if episode_nr % self.config['batch_size'] == 0:  # batch is done
            iteration += 1
            rmsprop1 = self.config['decay_rate'] * rmsprop1 + (1 - self.config['decay_rate']) * gradient1**2
            rmsprop2 = self.config['decay_rate'] * rmsprop2 + (1 - self.config['decay_rate']) * gradient2**2
            self.w1 += self.config['learning_rate'] * gradient1 / (np.sqrt(rmsprop1) + 1e-5)
            self.w2 += self.config['learning_rate'] * gradient2 / (np.sqrt(rmsprop2) + 1e-5)
            gradient1 = np.zeros_like(self.w1)
            gradient2 = np.zeros_like(self.w2)
            reporter.print_iteration_stats(iteration, episode_rewards, episode_lengths, episode_nr)
            mean_rewards.append(episode_rewards.mean())
            if episode_nr % self.config['draw_frequency'] == 0:
                reporter.draw_rewards(mean_rewards)
def learn(self):
    """Run learning algorithm"""
    reporter = Reporter()
    config = self.config
    possible_actions = np.arange(self.nA)
    total_n_trajectories = 0
    for iteration in range(config["n_iter"]):
        # Collect trajectories until we get timesteps_per_batch total timesteps
        trajectories = self.get_trajectories()
        total_n_trajectories += len(trajectories)
        all_action = np.concatenate(
            [trajectory["action"] for trajectory in trajectories])
        all_action = (possible_actions == all_action[:, None]).astype(np.float32)
        all_state = np.concatenate(
            [trajectory["state"] for trajectory in trajectories])
        # Compute discounted sums of rewards
        returns = np.concatenate([
            discount_rewards(trajectory["reward"], config["gamma"])
            for trajectory in trajectories
        ])
        qw_new = self.get_critic_value(all_state)
        episode_rewards = np.array([
            trajectory["reward"].sum() for trajectory in trajectories
        ])  # episode total rewards
        episode_lengths = np.array([
            len(trajectory["reward"]) for trajectory in trajectories
        ])  # episode lengths
        results = self.sess.run(
            [self.summary_op, self.critic_train, self.actor_train],
            feed_dict={
                self.critic_state_in: all_state,
                self.critic_target: returns,
                self.actor_input: all_state,
                self.actions_taken: all_action,
                self.critic_feedback: qw_new,
                self.critic_rewards: returns,
                self.rewards: np.mean(episode_rewards),
                self.episode_lengths: np.mean(episode_lengths)
            })
        self.writer.add_summary(results[0], iteration)
        self.writer.flush()
        reporter.print_iteration_stats(iteration, episode_rewards,
                                       episode_lengths, total_n_trajectories)
def update_policy(self, episode_number):
    # Convert buffers to Torch tensors
    action_probs = torch.stack(self.action_probs, dim=0) \
        .to(self.train_device).squeeze(-1)
    rewards = torch.stack(self.rewards, dim=0).to(self.train_device).squeeze(-1)
    states = torch.stack(self.states, dim=0).to(self.train_device).squeeze(-1)
    next_states = torch.stack(self.next_states, dim=0).to(self.train_device).squeeze(-1)
    done = torch.Tensor(self.done).to(self.train_device)

    # Clear state transition buffers
    self.states, self.action_probs, self.rewards = [], [], []
    self.next_states, self.done = [], []

    # DONE: Compute state values
    state_values = torch.stack(
        [self.policy.forward(state)[1][0] for state in states])
    next_state_values = torch.stack(
        [self.policy.forward(state)[1][0] for state in next_states])

    # DONE: Compute critic loss (MSE)
    discounted_rewards = discount_rewards(rewards, self.gamma)
    # Normalize discounted rewards.
    discounted_rewards -= torch.mean(discounted_rewards)
    discounted_rewards /= torch.std(discounted_rewards)
    mse_loss = torch.nn.MSELoss()
    critic_loss = mse_loss(state_values, discounted_rewards)

    # DONE: Compute advantage estimates
    advantages = rewards + self.gamma * next_state_values - state_values

    # DONE: Calculate actor loss (very similar to PG)
    weighted_probs = -action_probs * advantages.detach()
    actor_loss = torch.mean(weighted_probs)

    # DONE: Compute the gradients of loss w.r.t. network parameters
    # Or copy from Ex5
    loss = actor_loss + critic_loss
    loss.backward()

    # DONE: Update network parameters using self.optimizer and zero gradients
    # Or copy from Ex5
    self.optimizer.step()
    self.optimizer.zero_grad()
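# Note on the update above: `done` is collected into a tensor but never used when forming
# the advantage. In a standard one-step actor-critic target the bootstrap term is usually
# masked out at terminal transitions, along the lines of the following hypothetical
# variant (an assumption, not the original code):
#     advantages = rewards + self.gamma * next_state_values * (1 - done) - state_values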
def episode_finished(self, episode_number):
    action_probs = torch.stack(self.action_probs, dim=0) \
        .to(self.train_device).squeeze(-1)
    rewards = torch.stack(self.rewards, dim=0).to(self.train_device).squeeze(-1)
    self.states, self.action_probs, self.rewards = [], [], []

    G = discount_rewards(rewards, self.gamma)
    if self.normalize:
        G = (G - G.mean()) / (G.std() + 1e-6)

    optimizer_terms = -(G - self.baseline) * action_probs
    loss = optimizer_terms.sum()
    loss.backward()

    # TODO: Update network parameters using self.optimizer and zero gradients (T1)
    self.optimizer.step()
    self.optimizer.zero_grad()
def episode_finished(self, episode_number):
    all_actions = torch.stack(self.actions, dim=0).to(self.train_device).squeeze(-1)
    all_rewards = torch.stack(self.rewards, dim=0).to(self.train_device).squeeze(-1)
    self.observations, self.actions, self.rewards = [], [], []

    discounted_rewards = discount_rewards(all_rewards, self.gamma)
    discounted_rewards -= torch.mean(discounted_rewards)
    discounted_rewards /= torch.std(discounted_rewards)

    weighted_probs = all_actions * discounted_rewards
    loss = torch.sum(weighted_probs)
    loss.backward()

    # Update policy
    self.optimizer.step()
    self.optimizer.zero_grad()
def __init__(self, sess, state_dim, learning_rate):
    self.sess = sess
    self.s_dim = state_dim
    self.learning_rate = learning_rate

    # Critic network
    self.inputs, self._out = self.create_actor_network()

    # These returns will be provided by the discounted rewards
    self.returns = tf.placeholder("float", [None, 1], name='returns')
    # tf reward processing
    self._discounted_returns = utils.discount_rewards(self.returns)

    optimizer = tf.train.AdamOptimizer(self.learning_rate)
    self._loss = tf.nn.l2_loss(self._out - self._discounted_returns)
    self.optimize = optimizer.minimize(self._loss)
def learn(self):
    """Run learning algorithm"""
    reporter = Reporter()
    config = self.config
    total_n_trajectories = 0
    for iteration in range(config["n_iter"]):
        # Collect trajectories until we get timesteps_per_batch total timesteps
        trajectories = self.get_trajectories(self.env)
        total_n_trajectories += len(trajectories)
        all_action = np.concatenate(
            [trajectory["action"] for trajectory in trajectories])
        all_ob = np.concatenate(
            [trajectory["ob"] for trajectory in trajectories])
        # Compute discounted sums of rewards
        returns = np.concatenate([
            discount_rewards(trajectory["reward"], config["gamma"])
            for trajectory in trajectories
        ])
        qw_new = self.get_critic_value(all_ob)
        print(qw_new)
        self.sess.run(
            [self.critic_train],
            feed_dict={
                self.critic_state_in: all_ob,
                self.critic_target: returns.reshape(-1, 1)
            })
        target = np.mean((returns - qw_new)**2)
        self.sess.run(
            [self.actor_train],
            feed_dict={
                self.input_state: all_ob,
                self.actions_taken: all_action,
                self.target: target
            })
        episode_rewards = np.array([
            trajectory["reward"].sum() for trajectory in trajectories
        ])  # episode total rewards
        episode_lengths = np.array([
            len(trajectory["reward"]) for trajectory in trajectories
        ])  # episode lengths
        reporter.print_iteration_stats(iteration, episode_rewards,
                                       episode_lengths, total_n_trajectories)
def update(self, acts, rews, obs, optimizer):
    rews_disc = U.discount_rewards(rews, self.gamma)
    acts = torch.Tensor(acts)
    rews_disc = torch.Tensor(rews_disc)
    obs = torch.Tensor(obs)

    logits = self.forward(obs)
    probs = F.softmax(logits, dim=1)
    # index logprobs by acts
    logprobs = dists.Categorical(probs=probs).log_prob(acts)
    loss = (-logprobs * rews_disc).mean()

    ent_loss = (-probs * torch.log(probs)).sum(dim=1).mean()
    loss -= self.ent_coeff * ent_loss

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
def episode_finished(self, done):
    action_probs = torch.stack(self.action_probs, dim=0) \
        .to(self.train_device).squeeze(-1)
    rewards = torch.stack(self.rewards, dim=0).to(self.train_device).squeeze(-1)
    values = torch.stack(self.values, dim=0).to(self.train_device).squeeze(-1)  # values from the network
    if done:
        self.states, self.action_probs, self.rewards, self.values = [], [], [], []
    # else:
    #     print(done)
    #     print("values : " + str(values))
    #     print("reward : " + str(rewards))

    # TODO: Compute discounted rewards (use the discount_rewards function)
    discounted_rewards = discount_rewards(rewards, self.gamma)
    # print(discounted_rewards)
    # discounted_rewards = rewards + (self.gamma * values)

    # TODO: Compute critic loss and advantages (T3)
    if done:
        advantage = discounted_rewards
    else:
        advantage = discounted_rewards - values
    advantage -= torch.mean(advantage)
    advantage /= torch.std(advantage.detach())
    critic_loss = advantage.pow(2).mean()

    # TODO: Compute the optimization term (T1, T3)
    weighted_probs = -action_probs * advantage.detach()
    # loss = weighted_probs.sum()
    actor_loss = weighted_probs.mean()
    ac_loss = actor_loss + critic_loss

    # TODO: Compute the gradients of loss w.r.t. network parameters (T1)
    # TODO: Update network parameters using self.optimizer and zero gradients (T1)
    ac_loss.backward(retain_graph=True)
    self.optimizer.step()
    self.optimizer.zero_grad()
    # sigma implementation
def learn_Karpathy(self):
    """Learn using updates like in the Karpathy algorithm."""
    reporter = Reporter()
    config = self.master.config
    self.master.session.run([self.master.reset_accum_grads])
    iteration = 0
    episode_nr = 0
    mean_rewards = []
    while not self.master.stop_requested:
        # Keep executing episodes until the master requests a stop (e.g. using SIGINT)
        iteration += 1
        trajectory = self.task_learner.get_trajectory()
        reward = sum(trajectory['reward'])
        action_taken = trajectory['action']
        discounted_episode_rewards = discount_rewards(
            trajectory['reward'], config['gamma'])
        # standardize
        discounted_episode_rewards -= np.mean(discounted_episode_rewards)
        std = np.std(discounted_episode_rewards)
        std = std if std > 0 else 1
        discounted_episode_rewards /= std
        feedback = discounted_episode_rewards
        results = self.master.session.run(
            [self.loss, self.add_accum_grad],
            feed_dict={
                self.master.state: trajectory["state"],
                self.master.action_taken: action_taken,
                self.master.advantage: feedback
            })
        results = self.master.session.run(
            [self.master.summary_op],
            feed_dict={
                self.master.loss: results[0],
                self.master.reward: reward,
                self.master.episode_length: trajectory["steps"]
            })
        self.writer.add_summary(results[0], iteration)
        self.writer.flush()
        self.master.session.run([self.master.apply_gradients])
        self.master.session.run([self.master.reset_accum_grads])
def _make_batch(self, epoch):
    current_policy, current_value, current_oracle = get_current_policy(
        self.env, self.PGNetwork, self.VNetwork, self.ZNetwork)

    # states = [
    #     task1: [[---episode_1---], ..., [---episode_n---]],
    #     task2: [[---episode_1---], ..., [---episode_n---]]
    # ]
    states, tasks, actions, rewards, next_states = self.rollout.rollout_batch(
        self.PGNetwork, current_policy, epoch)

    discounted_rewards, GAEs = [], []
    for task in range(self.env.num_task):
        discounted_rewards.append([])
        GAEs.append([])
        for ep_state, ep_next, ep_reward in zip(states[task], next_states[task], rewards[task]):
            discounted_rewards[task] += discount_rewards(
                self.env, ep_reward, ep_state, ep_next, task, current_value)
            GAEs[task] += GAE(self.env, ep_reward, ep_state, ep_next, task, current_value)

        states[task] = np.concatenate(states[task])
        tasks[task] = np.concatenate(tasks[task])
        actions[task] = np.concatenate(actions[task])
        rewards[task] = np.concatenate(rewards[task])
        next_states[task] = np.concatenate(next_states[task])

    state_dict, count_dict = statistic(
        self.env, states, actions, discounted_rewards, GAEs, next_states, current_value)

    task_states, task_actions, task_target_values, task_advantages, \
        sharing_states, sharing_actions, sharing_advantages = self._process_PV_batch(
            states, actions, discounted_rewards, GAEs, next_states,
            current_policy, current_value, current_oracle, count_dict)

    z_states, z_actions, z_rewards = self._process_Z_batch(state_dict, count_dict)

    return task_states, task_actions, task_target_values, task_advantages, \
        sharing_states, sharing_actions, sharing_advantages, \
        np.concatenate(rewards), \
        z_states, z_actions, z_rewards
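# _make_batch() above relies on project-specific discount_rewards() and GAE() helpers whose
# signatures (environment, rewards, states, next states, task index, value function) are not
# shown in this section. For orientation only, a generic per-episode GAE(gamma, lambda)
# computation over plain reward/value arrays is sketched below; terminal-state masking is
# omitted, and the function name and arguments are illustrative assumptions, not the
# project's GAE().
import numpy as np

def generalized_advantage_estimates(rewards, values, next_values, gamma=0.99, lam=0.95):
    """Compute GAE advantages for a single episode."""
    # One-step TD errors: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
    deltas = np.asarray(rewards) + gamma * np.asarray(next_values) - np.asarray(values)
    advantages = np.zeros_like(deltas, dtype=np.float64)
    running = 0.0
    # Accumulate exponentially weighted TD errors backwards through the episode.
    for t in reversed(range(len(deltas))):
        running = deltas[t] + gamma * lam * running
        advantages[t] = running
    return advantages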
def episode_finished(self, episode_number):
    action_probs = torch.stack(self.action_probs, dim=0) \
        .to(self.train_device).squeeze(-1)
    rewards = torch.stack(self.rewards, dim=0).to(self.train_device).squeeze(-1)
    values = torch.stack(self.values, dim=0).to(self.train_device).squeeze(-1)  # values from the network
    done = torch.stack(self.done, dim=0).to(self.train_device).squeeze(-1)
    self.states, self.action_probs, self.rewards, self.values, self.done = [], [], [], [], []

    # Unused and incomplete TD-target computation (self.value() is undefined); kept commented out.
    # for i in range(len(values)):
    #     td_target = rewards[i] + self.gamma * self.value()

    # TODO: Compute discounted rewards (use the discount_rewards function)
    discounted_rewards = discount_rewards(rewards, self.gamma)
    # discounted_rewards -= torch.mean(discounted_rewards)
    # discounted_rewards /= torch.std(discounted_rewards)

    advantage = discounted_rewards - values
    advantage -= torch.mean(advantage)
    advantage /= torch.std(advantage.detach())

    # TODO: Compute critic loss and advantages (T3)
    self.value_optimizer.zero_grad()
    self.policy_optimizer.zero_grad()

    # TODO: Compute the optimization term (T1, T3)
    # weighted_probs = -action_probs * (discounted_rewards - 20)  # with baseline
    weighted_probs = -action_probs * advantage.detach()  # without baseline
    policy_l = weighted_probs.sum()
    critic_l = advantage.pow(2).mean()

    # TODO: Compute the gradients of loss w.r.t. network parameters (T1)
    policy_l.backward()
    critic_l.backward()

    # TODO: Update network parameters using self.optimizer and zero gradients (T1)
    self.policy_optimizer.step()
    self.value_optimizer.step()
    # sigma implementation
def episode_finished(self, episode_number):
    action_probs = torch.stack(self.action_probs, dim=0).to(self.train_device).squeeze(-1)
    rewards = torch.stack(self.rewards, dim=0).to(self.train_device).squeeze(-1)
    self.states, self.action_probs, self.rewards = [], [], []

    # TODO: Compute discounted rewards (use the discount_rewards function)
    discounted_r = discount_rewards(rewards, self.gamma)
    # discounted_r -= torch.mean(discounted_r)  # for Task 1 c)
    # discounted_r /= torch.std(discounted_r)

    # TODO: Compute the optimization term (T1)
    # weighted_probs = action_probs * discounted_r  # REINFORCE without baseline, T1 a)+c) & T2, from exercise 1
    weighted_probs = action_probs * (discounted_r - self.baseline)  # REINFORCE with baseline, T1 b)
    # Multiply by (-1): the optimizer minimizes the loss, so minimizing the negated
    # objective maximizes the expected return.
    loss = torch.mean((-1) * weighted_probs)

    # TODO: Compute the gradients of loss w.r.t. network parameters (T1)
    loss.backward()  # like in policy gradient tutorial, 2.2 Automatic differentiation

    # TODO: Update network parameters using self.optimizer and zero gradients (T1)
    self.optimizer.step()  # like in policy gradient tutorial, 2.3 Using optimizers
    self.optimizer.zero_grad()
def episode_finished(self, episode_number):
    action_probs = torch.stack(self.action_probs, dim=0).to(self.train_device).squeeze(-1)
    rewards = torch.stack(self.rewards, dim=0).to(self.train_device).squeeze(-1)
    self.states, self.action_probs, self.rewards = [], [], []

    # c = 5e-4  # T2
    # self.variance = self.policy.sigma * np.exp(-c * episode_number)

    # TODO: Compute discounted rewards (use the discount_rewards function)
    G = discount_rewards(r=rewards, gamma=self.gamma)
    # normalized rewards
    G = (G - torch.mean(G)) / torch.std(G)

    # TODO: Compute the optimization term (T1)
    loss = torch.sum(-G * action_probs)  # basic REINFORCE
    # loss = torch.sum(-(G - self.baseline) * action_probs)  # REINFORCE with baseline

    # TODO: Compute the gradients of loss w.r.t. network parameters (T1)
    loss.backward()

    # TODO: Update network parameters using self.optimizer and zero gradients (T1)
    self.optimizer.step()
    self.optimizer.zero_grad()
def episode_finished(self, episode_number):
    # Save the network every 200th episode
    if episode_number % 200 == 0 and episode_number > 0:
        torch.save(self.policy.state_dict(), 'model-nn.pt')

    # Calculate discounted rewards
    all_rewards = torch.stack(self.rewards, dim=0).to(self.train_device).squeeze(1)
    discounted_rewards = discount_rewards(all_rewards, self.gamma)
    discounted_rewards -= torch.mean(discounted_rewards)
    discounted_rewards /= torch.std(discounted_rewards)

    # Stack prop_ups and fake_labels, measure losses
    all_actions = torch.stack(self.prop_ups).float().to(self.train_device).squeeze(1)
    all_labels = torch.tensor(self.fake_labels).float().to(self.train_device)
    losses = self.loss(all_actions, all_labels)
    losses *= discounted_rewards
    loss = torch.mean(losses)

    # Reset buffers
    self.reset()

    # Compute gradients
    loss.backward(torch.tensor(1.0 / self.batch_size).to(self.train_device))

    # Output loss and rewards every now and then
    reward_sum = sum(all_rewards)
    if episode_number % 10 == 1:
        print(f'Episode: {episode_number}, Loss: {loss}. Rewards: {reward_sum}')

    # Update the policy depending on the batch size
    if episode_number % self.batch_size == 0 and episode_number > 0:
        self.optimizer.step()
        self.optimizer.zero_grad()