class PG():
    """Policy-gradient (REINFORCE-style) agent using TensorFlow 2 / Keras.

    The policy is an MLP created by ``build_mlp``.  For discrete action
    spaces its outputs are treated as logits and actions are drawn with
    ``tf.random.categorical``; for continuous spaces the outputs are passed
    through a ``Normal_action_sample`` layer, whose ``log_std`` attribute is
    also used to evaluate action log-probabilities in ``loss_func``.
    """

    def __init__(self, env: "gym.Env", config, r_seed):
        """Store settings, seed the environment and build the model.

        Args:
            env: Gym environment the agent interacts with.
            config: object holding the hyper-parameters read by this class
                (``output_path``, ``learning_rate``, ``n_layers``, ...).
            r_seed: seed passed to ``env.seed``.
        """
        os.makedirs(config.output_path, exist_ok=True)

        self.config = config
        self.r_seed = r_seed
        self.env = env
        self.env.seed(self.r_seed)

        # Discrete vs. continuous action space decides the policy head.
        self.discrete = isinstance(env.action_space, gym.spaces.Discrete)
        self.observation_dim = self.env.observation_space.shape[0]
        self.action_dim = (
            self.env.action_space.n
            if self.discrete
            else self.env.action_space.shape[0]
        )
        # train_step() copies this value into the optimizer on every update.
        self.lr = self.config.learning_rate

        # NOTE(review): the source text was truncated around this point; the
        # model has to exist before run() samples from it, so it is built
        # here.  Confirm no caller relies on invoking build() itself first.
        self.build()
        self.set_summary()
        self.set_optimizer()

    def build(self):
        """Create the Keras models for the policy (and optional baseline)."""
        self.observation = keras.Input(
            dtype=tf.float32,
            shape=(self.observation_dim,),
        )
        self.action = build_mlp(
            self.observation,
            self.action_dim,
            self.config.n_layers,
            self.config.layer_size,
            self.config.activation,
        )
        # Deterministic network output: logits (discrete) or means
        # (continuous), used by loss_func().
        self.action_logit = keras.Model(
            inputs=self.observation, outputs=self.action)

        if self.discrete:
            sampled_action = tf.squeeze(
                tf.random.categorical(self.action, 1))
        else:
            self.normal_layer = Normal_action_sample()
            sampled_action = self.normal_layer(self.action)

        self.sample_action = keras.Model(
            inputs=self.observation,
            outputs=sampled_action,
            name='sample_action',
        )
        self.sample_action.summary()

        if self.config.use_baseline:
            self.baseline_network = BaselineNetwork(
                self.config, self.observation)

    def set_optimizer(self):
        """Create the Adam optimizer used by ``train_step``."""
        self.optimizer = keras.optimizers.Adam()

    def loss_func(self, observations, actions, advantages):
        """Return the per-sample policy-gradient loss ``-log pi(a|s) * A``.

        The log-probabilities are also stored in ``self.logprob``.
        """
        if self.discrete:
            # Cross entropy is the negative log-probability of the label.
            self.logprob = -tf.nn.sparse_softmax_cross_entropy_with_logits(
                actions, self.action_logit(observations))
        else:
            self.logprob = tfp.distributions.MultivariateNormalDiag(
                loc=self.action_logit(observations),
                scale_diag=tf.math.exp(self.normal_layer.log_std),
            ).log_prob(actions)
        advantages = tf.cast(advantages, tf.float32)
        return -tf.math.multiply(self.logprob, advantages)

    def train_step(self, observations, actions, advantages):
        """Apply one gradient update to the sampling model's variables."""
        with tf.GradientTape() as tape:
            self.loss = self.loss_func(observations, actions, advantages)
        variables = self.sample_action.trainable_variables
        # The loss is not reduced; GradientTape differentiates the sum of a
        # non-scalar target.
        gradients = tape.gradient(self.loss, variables)
        self.optimizer.learning_rate = self.lr
        self.optimizer.apply_gradients(zip(gradients, variables))

    def set_summary(self):
        """Create the summary writer and make it the default one."""
        self.file_writer = tf.summary.create_file_writer(
            self.config.output_path)
        self.file_writer.set_as_default()

    @staticmethod
    def _discounted_returns(rewards, gamma):
        """Return G_t = r_t + gamma * G_{t+1} for one episode."""
        rewards = np.asarray(rewards, dtype=np.float64)
        returns = np.zeros_like(rewards)
        running = 0.0
        # Backward recursion: avoids forming gamma**t and gamma**(-t), which
        # under/overflow on long episodes and turn the result into NaN.
        for i in reversed(range(len(rewards))):
            running = rewards[i] + gamma * running
            returns[i] = running
        return returns

    def get_returns(self, paths):
        """Compute discounted returns for every step of every path.

        Args:
            paths: list of dicts as produced by ``sample_path``; only the
                ``"reward"`` entry is read.

        Returns:
            1-D array with the per-step returns of all paths concatenated.
        """
        all_returns = [
            self._discounted_returns(path["reward"], self.config.gamma)
            for path in paths
        ]
        return np.concatenate(all_returns)

    def normalize_advantage(self, advantages):
        """Shift and scale advantages to zero mean and unit std.

        Returns a NumPy array.  If all advantages are equal the standard
        deviation is zero; the centred (all-zero) values are returned in
        that case instead of dividing by zero.
        """
        advantages = np.asarray(advantages, dtype=np.float64)
        centered = advantages - np.mean(advantages)
        std = np.std(advantages)
        if std == 0:
            return centered
        return centered / std

    def calculate_advantage(self, returns, observations):
        """Turn returns into advantages (baseline and/or normalization)."""
        if self.config.use_baseline:
            advantages = self.baseline_network.calculate_advantage(
                returns, observations)
        else:
            advantages = returns

        if self.config.normalize_advantage:
            advantages = self.normalize_advantage(advantages)
        return advantages

    def add_sumary(self, t):
        """Write the current reward statistics as scalar summaries."""
        tf.summary.scalar('Avg Reward', self.avg_reward, step=t)
        tf.summary.scalar('Max Reward', self.max_reward, step=t)
        tf.summary.scalar('Std Reward', self.std_reward, step=t)
        tf.summary.scalar('Eval Reward', self.eval_reward, step=t)

    def init_averages(self):
        """Reset the reward statistics that are logged as summaries."""
        self.avg_reward = 0
        self.max_reward = 0
        self.std_reward = 0
        self.eval_reward = 0

    def update_averages(self, rewards, scores_eval):
        """Refresh the logged statistics from the latest episode rewards.

        ``std_reward`` is the standard error of the mean
        (``sqrt(var / n)``); ``eval_reward`` is the last entry of
        ``scores_eval`` when there is one.
        """
        self.avg_reward = np.mean(rewards)
        self.max_reward = np.max(rewards)
        self.std_reward = np.sqrt(np.var(rewards) / len(rewards))

        if len(scores_eval) > 0:
            self.eval_reward = scores_eval[-1]

    def sample_path(self, env, num_episodes=None):
        """Roll out the current policy.

        Args:
            env: environment to act in.
            num_episodes: number of episodes to sample; when falsy, sampling
                stops once ``config.batch_size`` steps were collected.

        Returns:
            ``(paths, episode_rewards)`` where every path is a dict with
            ``'observation'``, ``'reward'`` and ``'action'`` arrays and
            ``episode_rewards`` holds the total reward of each finished
            episode.
        """
        episode = 0
        episode_rewards = []
        paths = []
        t = 0

        while (num_episodes or t < self.config.batch_size):
            state = env.reset()
            states, actions, rewards = [], [], []
            episode_reward = 0

            for step in range(self.config.max_ep_len):
                states.append(state)
                # [None] adds the batch dimension the model expects.
                action = self.sample_action(states[-1][None])
                if self.discrete:
                    action = int(action)
                else:
                    action = action[0]
                state, reward, done, info = env.step(action)
                actions.append(action)
                rewards.append(reward)
                episode_reward += reward
                t += 1
                if (done or step == self.config.max_ep_len - 1):
                    episode_rewards.append(episode_reward)
                    break
                # A batch may end in the middle of an episode; its partial
                # reward is then not reported in episode_rewards.
                if (not num_episodes) and t == self.config.batch_size:
                    break

            path = {
                'observation': np.array(states),
                'reward': np.array(rewards),
                'action': np.array(actions),
            }
            paths.append(path)
            episode += 1
            if num_episodes and episode >= num_episodes:
                break

        return paths, episode_rewards

    def train(self):
        """Run ``config.num_batches`` sample / update iterations."""
        scores_eval = []
        self.init_averages()

        for t in range(self.config.num_batches):
            paths, total_rewards = self.sample_path(self.env)
            scores_eval = scores_eval + total_rewards
            observations = np.concatenate(
                [path['observation'] for path in paths])
            actions = np.concatenate([path["action"] for path in paths])

            returns = self.get_returns(paths)
            advantages = self.calculate_advantage(returns, observations)

            if self.config.use_baseline:
                self.baseline_network.update_baseline(returns, observations)

            self.train_step(observations, actions, advantages)

            if (t % self.config.summary_freq == 0):
                self.update_averages(total_rewards, scores_eval)
                self.add_sumary(t)

            avg_reward = np.mean(total_rewards)
            sigma_reward = np.sqrt(
                np.var(total_rewards) / len(total_rewards))
            # Carriage return: the progress line overwrites itself.
            sys.stdout.write('\r')
            sys.stdout.flush()
            msg = "Average reward: {0:04.2f} +/- {1:04.2f} step:{2}/{3}         ".format(avg_reward, sigma_reward, t, self.config.num_batches)
            print(msg, end='')

            if self.config.record and not ((t + 1) % self.config.record_freq):
                sys.stdout.write('\n')
                sys.stdout.flush()
                print('Recording')
                self.record()

        sys.stdout.write('\n')
        sys.stdout.flush()
        print('Training done.')
        # normal_layer only exists for continuous action spaces.
        if not self.discrete:
            print(self.normal_layer.log_std.numpy())
        export_plot(scores_eval, 'Score', self.config.env_name,
                    self.config.plot_output)

    def evaluate(self, env=None, num_episodes=1):
        """Run ``num_episodes`` episodes in ``env`` (default: own env)."""
        if env is None:
            env = self.env
        self.sample_path(env, num_episodes)

    def record(self):
        """Create a fresh monitored env and record one episode."""
        env = gym.make(self.config.env_name)
        env.seed(self.r_seed)
        env = gym.wrappers.Monitor(
            env,
            self.config.record_path,
            video_callable=lambda x: True,
            resume=True,
        )
        self.evaluate(env, 1)

    def run(self):
        """Train the agent, recording an episode before and after if asked."""
        if self.config.record:
            self.record()
        self.train()
        if self.config.record:
            self.record()
class PG(object):
    """
    Policy gradient (REINFORCE with optional baseline) on TensorFlow 1.x
    graph mode: placeholders, a policy network, a loss and a train op are
    built once and then evaluated through a ``tf.Session``.
    """

    def __init__(self, env, config, r_seed, logger=None):
        """
        Initialize Policy Gradient Class

        Args:
                env: an OpenAI Gym environment
                config: class with hyperparameters
                r_seed: seed passed to env.seed
                logger: logger instance from the logging module; when None
                        one is created with get_logger(config.log_path)
        """
        # directory for training outputs
        os.makedirs(config.output_path, exist_ok=True)

        # store hyperparameters
        self.config = config
        self.r_seed = r_seed

        self.logger = logger
        if logger is None:
            self.logger = get_logger(config.log_path)
        self.env = env
        self.env.seed(self.r_seed)

        # discrete vs continuous action space
        self.discrete = isinstance(env.action_space, gym.spaces.Discrete)
        self.observation_dim = self.env.observation_space.shape[0]
        self.action_dim = (
            self.env.action_space.n
            if self.discrete
            else self.env.action_space.shape[0]
        )

        self.lr = self.config.learning_rate

        # build model
        # NOTE(review): the call after this comment was missing from the
        # source text; run() only calls initialize(), so the graph is
        # constructed here.
        self.build()

    def add_placeholders_op(self):
        """
        Add placeholders for observation, action, and advantage:
            self.observation_placeholder, tf.float32, (batch, obs_dim)
            self.action_placeholder, tf.int32 (batch,) if discrete,
                                     tf.float32 (batch, action_dim) otherwise
            self.advantage_placeholder, tf.float32, (batch,)
        """
        obs_dim, action_dim = self.observation_dim, self.action_dim
        self.observation_placeholder = tf.placeholder(
            tf.float32, shape=(None, obs_dim), name="obs")
        if self.discrete:
            # (None,) is a rank-1 batch; a bare (None) would leave the rank
            # unspecified.
            self.action_placeholder = tf.placeholder(
                tf.int32, shape=(None,), name="action")
        else:
            self.action_placeholder = tf.placeholder(
                tf.float32, shape=(None, action_dim), name="action")
        self.advantage_placeholder = tf.placeholder(
            tf.float32, shape=(None,), name="advantage")

    def build_policy_network_op(self, scope="policy_network"):
        """
        Build the policy network, the op that samples actions from it
        (self.sampled_action) and the op giving the log probability of the
        actions fed through self.action_placeholder (self.logprob).

        Discrete case: the network outputs logits; actions are sampled with
        tf.multinomial and the log probability is the negated sparse softmax
        cross entropy.

        Continuous case: the network outputs the mean of each action
        dimension; a trainable log_std vector gives the standard deviations
        and actions are drawn as mean + std * N(0, 1).

        Args:
                scope: the scope of the neural network
        """
        self.scope = scope
        if self.discrete:
            # NOTE(review): config.activation is applied to the logits here
            # but not to the means in the continuous branch - confirm this
            # asymmetry is intended.
            action_logits = build_mlp(
                self.observation_placeholder,
                self.action_dim,
                self.scope,
                self.config.n_layers,
                self.config.layer_size,
                output_activation=self.config.activation)
            self.sampled_action = tf.squeeze(
                tf.multinomial(action_logits, 1), 1)
            self.logprob = -tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=self.action_placeholder, logits=action_logits)
        else:
            action_means = build_mlp(
                self.observation_placeholder,
                self.action_dim,
                self.scope,
                self.config.n_layers,
                self.config.layer_size)
            # Log-std is the trainable quantity so that std = exp(log_std)
            # stays positive; zeros start the policy at unit std.
            log_std = tf.get_variable(
                "log_std",
                shape=[self.action_dim],
                dtype=tf.float32,
                initializer=tf.zeros_initializer())
            std = tf.exp(log_std)
            # Re-parametrization: N(mu, sigma) = mu + sigma * N(0, 1).
            noise = tf.random_normal(tf.shape(action_means))
            self.sampled_action = action_means + std * noise
            self.logprob = tf.contrib.distributions.MultivariateNormalDiag(
                loc=action_means,
                scale_diag=std).log_prob(self.action_placeholder)

    def add_loss_op(self):
        """
        Save the REINFORCE loss as self.loss: the batch mean of
        -advantage * log pi(a|s), so that minimizing it follows the policy
        gradient.
        """
        self.loss = -tf.reduce_mean(
            self.advantage_placeholder * self.logprob)

    def add_optimizer_op(self):
        """
        Set self.train_op: Adam with learning rate self.lr minimizing
        self.loss.
        """
        self.train_op = tf.train.AdamOptimizer(
            learning_rate=self.lr).minimize(self.loss)

    def build(self):
        """
        Build the model by adding all necessary variables and ops to the
        graph.
        """
        # add placeholders
        self.add_placeholders_op()
        # create policy net
        self.build_policy_network_op()
        # add loss
        self.add_loss_op()
        # add optimizer for the main networks
        self.add_optimizer_op()

        # add baseline
        if self.config.use_baseline:
            self.baseline_network = BaselineNetwork(
                self.env, self.config, self.observation_placeholder)
            self.baseline_network.add_baseline_op()

    def initialize(self):
        """
        Assumes the graph has been constructed (build() was called).
        Creates a tf Session and runs the variable initializer.
        """
        # create tf session
        self.sess = tf.Session()
        # tensorboard stuff
        self.add_summary()
        # initialize all variables
        init = tf.global_variables_initializer()
        self.sess.run(init)

        if self.config.use_baseline:
            self.baseline_network.set_session(self.sess)

    def add_summary(self):
        """
        Tensorboard stuff: placeholders fed from python, their scalar
        summaries, the merged summary op and the file writer.
        """
        # extra placeholders to log stuff from python
        self.avg_reward_placeholder = tf.placeholder(
            tf.float32, shape=(), name="avg_reward")
        self.max_reward_placeholder = tf.placeholder(
            tf.float32, shape=(), name="max_reward")
        self.std_reward_placeholder = tf.placeholder(
            tf.float32, shape=(), name="std_reward")
        self.eval_reward_placeholder = tf.placeholder(
            tf.float32, shape=(), name="eval_reward")

        # extra summaries from python -> placeholders
        tf.summary.scalar("Avg Reward", self.avg_reward_placeholder)
        tf.summary.scalar("Max Reward", self.max_reward_placeholder)
        tf.summary.scalar("Std Reward", self.std_reward_placeholder)
        tf.summary.scalar("Eval Reward", self.eval_reward_placeholder)

        # logging
        self.merged = tf.summary.merge_all()
        self.file_writer = tf.summary.FileWriter(
            self.config.output_path, self.sess.graph)

    def init_averages(self):
        """
        Defines the reward statistics logged to tensorboard.
        """
        self.avg_reward = 0.
        self.max_reward = 0.
        self.std_reward = 0.
        self.eval_reward = 0.

    def update_averages(self, rewards, scores_eval):
        """
        Update the averages.

        Args:
                rewards: episode rewards of the latest batch
                scores_eval: list; its last entry becomes eval_reward
        """
        self.avg_reward = np.mean(rewards)
        self.max_reward = np.max(rewards)
        # standard error of the mean
        self.std_reward = np.sqrt(np.var(rewards) / len(rewards))

        if len(scores_eval) > 0:
            self.eval_reward = scores_eval[-1]

    def record_summary(self, t):
        """
        Add summary to tensorboard for step t.
        """
        fd = {
            self.avg_reward_placeholder: self.avg_reward,
            self.max_reward_placeholder: self.max_reward,
            self.std_reward_placeholder: self.std_reward,
            self.eval_reward_placeholder: self.eval_reward,
        }
        summary = self.sess.run(self.merged, feed_dict=fd)
        # tensorboard stuff
        self.file_writer.add_summary(summary, t)

    def sample_path(self, env, num_episodes=None):
        """
        Sample paths (trajectories) from the environment.

        Args:
                num_episodes: the number of episodes to be sampled
                        if none, sample one batch (size indicated by config)
                env: open AI Gym environment

        Returns:
                paths: a list of paths. Each path is a dictionary with
                        path["observation"] array of ordered observations
                        path["action"] array of the corresponding actions
                        path["reward"] array of the corresponding rewards
                total_rewards: list with the summed reward of every episode
                        that finished
        """
        episode = 0
        episode_rewards = []
        paths = []
        t = 0

        while (num_episodes or t < self.config.batch_size):
            state = env.reset()
            states, actions, rewards = [], [], []
            episode_reward = 0

            for step in range(self.config.max_ep_len):
                states.append(state)
                # [None] adds a batch dimension; [0] removes it again.
                action = self.sess.run(
                    self.sampled_action,
                    feed_dict={
                        self.observation_placeholder: states[-1][None]
                    })[0]
                state, reward, done, info = env.step(action)
                actions.append(action)
                rewards.append(reward)
                episode_reward += reward
                t += 1
                if (done or step == self.config.max_ep_len - 1):
                    episode_rewards.append(episode_reward)
                    break
                # batch filled in the middle of an episode
                if (not num_episodes) and t == self.config.batch_size:
                    break

            path = {
                "observation": np.array(states),
                "reward": np.array(rewards),
                "action": np.array(actions),
            }
            paths.append(path)
            episode += 1
            if num_episodes and episode >= num_episodes:
                break

        return paths, episode_rewards

    def get_returns(self, paths):
        """
        Calculate the returns G_t for each timestep

        Args:
                paths: recorded sample paths. See sample_path() for details.

        Return:
                returns: return G_t for each timestep, all paths concatenated

        G_t = r_t + γ r_{t+1} + γ^2 r_{t+2} + ... + γ^{T-t} r_T
        where T is the last timestep of the episode.
        """
        all_returns = []
        for path in paths:
            rewards = np.asarray(path["reward"], dtype=np.float64)
            returns = np.zeros_like(rewards)
            running = 0.0
            # backward recursion G_t = r_t + gamma * G_{t+1}
            for i in reversed(range(len(rewards))):
                running = rewards[i] + self.config.gamma * running
                returns[i] = running
            all_returns.append(returns)
        returns = np.concatenate(all_returns)

        return returns

    def normalize_advantage(self, advantages):
        """
        Normalize the advantages to mean 0 and standard deviation 1.
        Called only if self.config.normalize_advantage is True.

        Computed with numpy so the result can be fed through feed_dict, and
        without modifying the input array. If the standard deviation is
        zero, the centred values are returned unscaled.

        Args:
                advantages: the advantages
        Returns:
                adv: Normalized Advantage
        """
        advantages = np.asarray(advantages, dtype=np.float64)
        centered = advantages - np.mean(advantages)
        std = np.std(advantages)
        if std == 0:
            return centered
        return centered / std

    def calculate_advantage(self, returns, observations):
        """
        Calculates the advantage for each of the observations

        Args:
                returns: the returns
                observations: the observations
        Returns:
                advantage: the advantage
        """
        if self.config.use_baseline:
            # the baseline network computes the advantage from the returns
            advantages = self.baseline_network.calculate_advantage(
                returns, observations)
        else:
            advantages = returns

        if self.config.normalize_advantage:
            advantages = self.normalize_advantage(advantages)

        return advantages

    def train(self):
        """
        Performs training: sample a batch, compute returns and advantages,
        update baseline and policy, log statistics.
        """
        last_record = 0

        self.init_averages()
        scores_eval = []  # list of scores computed at iteration time

        for t in range(self.config.num_batches):

            # collect a minibatch of samples
            paths, total_rewards = self.sample_path(self.env)
            scores_eval = scores_eval + total_rewards
            observations = np.concatenate(
                [path["observation"] for path in paths])
            actions = np.concatenate([path["action"] for path in paths])
            # compute Q-val estimates (discounted future returns)
            returns = self.get_returns(paths)

            # advantage will depend on the baseline implementation
            advantages = self.calculate_advantage(returns, observations)

            # run training operations
            if self.config.use_baseline:
                self.baseline_network.update_baseline(returns, observations)
            self.sess.run(
                self.train_op,
                feed_dict={
                    self.observation_placeholder: observations,
                    self.action_placeholder: actions,
                    self.advantage_placeholder: advantages
                })

            # tf stuff
            if (t % self.config.summary_freq == 0):
                self.update_averages(total_rewards, scores_eval)
                self.record_summary(t)

            # compute reward statistics for this batch and log
            avg_reward = np.mean(total_rewards)
            sigma_reward = np.sqrt(
                np.var(total_rewards) / len(total_rewards))
            msg = "Average reward: {:04.2f} +/- {:04.2f}".format(
                avg_reward, sigma_reward)
            self.logger.info(msg)

            # Count batches since the last recording; without this the
            # threshold below could never be exceeded.
            last_record += 1
            if self.config.record and (last_record >
                                       self.config.record_freq):
                self.logger.info("Recording...")
                last_record = 0
                self.record()

        self.logger.info("- Training done.")
        export_plot(scores_eval, "Score", self.config.env_name,
                    self.config.plot_output)

    def evaluate(self, env=None, num_episodes=1):
        """
        Evaluates the return for num_episodes episodes and returns the
        average episode reward.
        """
        if env is None:
            env = self.env
        paths, rewards = self.sample_path(env, num_episodes)
        avg_reward = np.mean(rewards)
        sigma_reward = np.sqrt(np.var(rewards) / len(rewards))
        msg = "Average reward: {:04.2f} +/- {:04.2f}".format(
            avg_reward, sigma_reward)
        self.logger.info(msg)
        return avg_reward

    def record(self):
        """
        Recreate an env and record a video for one episode
        """
        env = gym.make(self.config.env_name)
        env.seed(self.r_seed)
        env = gym.wrappers.Monitor(env,
                                   self.config.record_path,
                                   video_callable=lambda x: True,
                                   resume=True)
        self.evaluate(env, 1)

    def run(self):
        """
        Apply procedures of training for a PG.
        """
        # initialize
        self.initialize()
        # record one game at the beginning
        if self.config.record:
            self.record()
        # model
        self.train()
        # record one game at the end
        if self.config.record:
            self.record()
class PolicyGradient(object):
    """
    Class for implementing a policy gradient algorithm (PyTorch policy,
    optional baseline network).
    """

    def __init__(self, env, config, seed, logger=None):
        """
        Initialize Policy Gradient Class

        Args:
                env: an OpenAI Gym environment
                config: class with hyperparameters
                seed: seed passed to env.seed
                logger: logger instance from the logging module; when None
                        one is created with get_logger(config.log_path)
        """
        # directory for training outputs
        os.makedirs(config.output_path, exist_ok=True)

        # store hyperparameters
        self.config = config
        self.seed = seed

        self.logger = logger
        if logger is None:
            self.logger = get_logger(config.log_path)
        self.env = env
        self.env.seed(self.seed)

        # discrete vs continuous action space
        self.discrete = isinstance(env.action_space, gym.spaces.Discrete)
        self.observation_dim = self.env.observation_space.shape[0]
        self.action_dim = (
            self.env.action_space.n
            if self.discrete
            else self.env.action_space.shape[0]
        )

        self.lr = self.config.learning_rate

        self.init_policy()

        if config.use_baseline:
            self.baseline_network = BaselineNetwork(env, config)

    def init_policy(self):
        """
        Create the policy network, the policy (self.policy) and its
        optimizer (self.optimizer).

        The network maps vectors of size self.observation_dim to vectors of
        size self.action_dim. A CategoricalPolicy is used for discrete
        action spaces and a GaussianPolicy for continuous ones. The
        optimizer is Adam with learning rate self.lr.
        """
        import torch

        # NOTE(review): the signatures of build_mlp, CategoricalPolicy and
        # GaussianPolicy are not visible in this file; the argument lists
        # below are assumed - confirm against their definitions.
        network = build_mlp(
            self.observation_dim,
            self.action_dim,
            self.config.n_layers,
            self.config.layer_size)
        if self.discrete:
            self.policy = CategoricalPolicy(network)
        else:
            self.policy = GaussianPolicy(network, self.action_dim)
        self.optimizer = torch.optim.Adam(
            self.policy.parameters(), lr=self.lr)

    def init_averages(self):
        """
        Reset the logged reward statistics.
        """
        self.avg_reward = 0.
        self.max_reward = 0.
        self.std_reward = 0.
        self.eval_reward = 0.

    def update_averages(self, rewards, scores_eval):
        """
        Update the averages.

        Args:
                rewards: episode rewards of the latest batch
                scores_eval: list; its last entry becomes eval_reward
        """
        self.avg_reward = np.mean(rewards)
        self.max_reward = np.max(rewards)
        # standard error of the mean
        self.std_reward = np.sqrt(np.var(rewards) / len(rewards))

        if len(scores_eval) > 0:
            self.eval_reward = scores_eval[-1]

    def record_summary(self, t):
        """Hook for summary logging; intentionally does nothing."""
        pass

    def sample_path(self, env, num_episodes=None):
        """
        Sample paths (trajectories) from the environment.

        Args:
                num_episodes: the number of episodes to be sampled
                        if none, sample one batch (size indicated by config)
                env: open AI Gym environment

        Returns:
                paths: a list of paths. Each path is a dictionary with
                        path["observation"] array of ordered observations
                        path["action"] array of the corresponding actions
                        path["reward"] array of the corresponding rewards
                total_rewards: list with the summed reward of every episode
                        that finished
        """
        episode = 0
        episode_rewards = []
        paths = []
        t = 0

        while (num_episodes or t < self.config.batch_size):
            state = env.reset()
            states, actions, rewards = [], [], []
            episode_reward = 0

            for step in range(self.config.max_ep_len):
                states.append(state)
                # [None] adds a batch dimension; [0] removes it again.
                action = self.policy.act(states[-1][None])[0]
                state, reward, done, info = env.step(action)
                actions.append(action)
                rewards.append(reward)
                episode_reward += reward
                t += 1
                if (done or step == self.config.max_ep_len - 1):
                    episode_rewards.append(episode_reward)
                    break
                # batch filled in the middle of an episode
                if (not num_episodes) and t == self.config.batch_size:
                    break

            path = {
                "observation": np.array(states),
                "reward": np.array(rewards),
                "action": np.array(actions)
            }
            paths.append(path)
            episode += 1
            if num_episodes and episode >= num_episodes:
                break

        return paths, episode_rewards

    def get_returns(self, paths):
        """
        Calculate the returns G_t for each timestep

        Args:
                paths: recorded sample paths. See sample_path() for details.

        Return:
                returns: return G_t for each timestep, all paths concatenated

        G_t = r_t + γ r_{t+1} + γ^2 r_{t+2} + ... + γ^{T-t} r_T
        where T is the last timestep of the episode.
        """
        all_returns = []
        for path in paths:
            rewards = np.asarray(path["reward"], dtype=np.float64)
            returns = np.zeros_like(rewards)
            running = 0.0
            # backward recursion G_t = r_t + gamma * G_{t+1}
            for i in reversed(range(len(rewards))):
                running = rewards[i] + self.config.gamma * running
                returns[i] = running
            all_returns.append(returns)
        returns = np.concatenate(all_returns)

        return returns

    def normalize_advantage(self, advantages):
        """
        Normalize the advantages to mean 0 and standard deviation 1.
        Called only if self.config.normalize_advantage is True. If the
        standard deviation is zero, the centred values are returned
        unscaled.

        Args:
                advantages: np.array of shape [batch size]
        Returns:
                normalized_advantages: np.array of shape [batch size]
        """
        advantages = np.asarray(advantages, dtype=np.float64)
        normalized_advantages = advantages - np.mean(advantages)
        std = np.std(advantages)
        if std != 0:
            normalized_advantages = normalized_advantages / std
        return normalized_advantages

    def calculate_advantage(self, returns, observations):
        """
        Calculates the advantage for each of the observations

        Args:
                returns: np.array of shape [batch size]
                observations: np.array of shape
                        [batch size, dim(observation space)]
        Returns:
                advantages: np.array of shape [batch size]
        """
        if self.config.use_baseline:
            # the baseline network computes the advantage from the returns
            advantages = self.baseline_network.calculate_advantage(
                returns, observations)
        else:
            advantages = returns

        if self.config.normalize_advantage:
            advantages = self.normalize_advantage(advantages)

        return advantages

    def update_policy(self, observations, actions, advantages):
        """
        Perform one update on the policy using the provided data.

        Args:
                observations: np.array of shape
                        [batch size, dim(observation space)]
                actions: np.array of shape
                        [batch size, dim(action space)] if continuous
                        [batch size] (and integer type) if discrete
                advantages: np.array of shape [batch size]

        The loss is the batch mean of -log pi(a|s) * advantage: optimizers
        minimize, while the policy's performance is to be maximized.
        """
        observations = np2torch(observations)
        actions = np2torch(actions)
        advantages = np2torch(advantages)

        # action_distribution returns a torch.distributions.Distribution.
        log_probs = self.policy.action_distribution(
            observations).log_prob(actions)
        loss = -(log_probs * advantages).mean()

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def train(self):
        """
        Performs training: sample a batch, compute returns and advantages,
        update baseline and policy, log statistics.
        """
        last_record = 0

        self.init_averages()
        # the returns of all episodes sampled for training purposes
        all_total_rewards = []
        averaged_total_rewards = []  # the returns for each iteration

        for t in range(self.config.num_batches):

            # collect a minibatch of samples
            paths, total_rewards = self.sample_path(self.env)
            all_total_rewards.extend(total_rewards)
            observations = np.concatenate(
                [path["observation"] for path in paths])
            actions = np.concatenate([path["action"] for path in paths])
            # compute Q-val estimates (discounted future returns)
            returns = self.get_returns(paths)

            # advantage will depend on the baseline implementation
            advantages = self.calculate_advantage(returns, observations)

            # run training operations
            if self.config.use_baseline:
                self.baseline_network.update_baseline(returns, observations)
            self.update_policy(observations, actions, advantages)

            # logging
            if (t % self.config.summary_freq == 0):
                self.update_averages(total_rewards, all_total_rewards)
                self.record_summary(t)

            # compute reward statistics for this batch and log
            avg_reward = np.mean(total_rewards)
            sigma_reward = np.sqrt(
                np.var(total_rewards) / len(total_rewards))
            msg = "Average reward: {:04.2f} +/- {:04.2f}".format(
                avg_reward, sigma_reward)
            averaged_total_rewards.append(avg_reward)
            self.logger.info(msg)

            # Count batches since the last recording; without this the
            # threshold below could never be exceeded.
            last_record += 1
            if self.config.record and (last_record >
                                       self.config.record_freq):
                self.logger.info("Recording...")
                last_record = 0
                self.record()

        self.logger.info("- Training done.")
        # NOTE(review): the source showed a truncated two-argument call
        # taking averaged_total_rewards here; assumed to save the scores to
        # a config-provided path. The attribute name is a guess - confirm.
        scores_output = getattr(self.config, "scores_output", None)
        if scores_output is not None:
            np.save(scores_output, averaged_total_rewards)
        export_plot(averaged_total_rewards, "Score", self.config.env_name,
                    self.config.plot_output)

    def evaluate(self, env=None, num_episodes=1):
        """
        Evaluates the return for num_episodes episodes and returns the
        average episode reward.
        """
        if env is None:
            env = self.env
        paths, rewards = self.sample_path(env, num_episodes)
        avg_reward = np.mean(rewards)
        sigma_reward = np.sqrt(np.var(rewards) / len(rewards))
        msg = "Average reward: {:04.2f} +/- {:04.2f}".format(
            avg_reward, sigma_reward)
        self.logger.info(msg)
        return avg_reward

    def record(self):
        """
        Recreate an env and record a video for one episode
        """
        env = gym.make(self.config.env_name)
        env.seed(self.seed)
        env = gym.wrappers.Monitor(env,
                                   self.config.record_path,
                                   video_callable=lambda x: True,
                                   resume=True)
        self.evaluate(env, 1)

    def run(self):
        """
        Apply procedures of training for a PG.
        """
        # record one game at the beginning
        if self.config.record:
            self.record()
        # model
        self.train()
        # record one game at the end
        if self.config.record:
            self.record()