def train(self): """ Performs training You do not have to change or use anything here, but take a look to see how all the code you've written fits together! """ last_eval = 0 last_record = 0 scores_eval = [] self.init_averages() scores_eval = [] # list of scores computed at iteration time for t in range(self.config.num_batches): # collect a minibatch of samples paths, total_rewards = self.sample_path(self.env) scores_eval = scores_eval + total_rewards observations = np.concatenate( [path["observation"] for path in paths]) actions = np.concatenate([path["action"] for path in paths]) rewards = np.concatenate([path["reward"] for path in paths]) # compute Q-val estimates (discounted future returns) for each time step returns = self.get_returns(paths) # advantage will depend on the baseline implementation advantages = self.calculate_advantage(returns, observations) # run training operations if self.config.use_baseline: self.baseline_network.update_baseline(returns, observations) self.sess.run(self.train_op, feed_dict={ self.observation_placeholder: observations, self.action_placeholder: actions, self.advantage_placeholder: advantages }) # tf stuff if (t % self.config.summary_freq == 0): self.update_averages(total_rewards, scores_eval) self.record_summary(t) # compute reward statistics for this batch and log avg_reward = np.mean(total_rewards) sigma_reward = np.sqrt(np.var(total_rewards) / len(total_rewards)) msg = "Average reward: {:04.2f} +/- {:04.2f}".format( avg_reward, sigma_reward) self.logger.info(msg) if self.config.record and (last_record > self.config.record_freq): self.logger.info("Recording...") last_record = 0 self.record() self.logger.info("- Training done.") export_plot(scores_eval, "Score", self.config.env_name, self.config.plot_output)
def train(self): """ Performs training You do not have to change or use anything here, but take a look to see how all the code you've written fits together! """ last_record = 0 self.init_averages() all_total_rewards = [ ] # the returns of all episodes samples for training purposes averaged_total_rewards = [] # the returns for each iteration for t in range(self.config.num_batches): # collect a minibatch of samples paths, total_rewards = self.sample_path(self.env) all_total_rewards.extend(total_rewards) observations = np.concatenate( [path["observation"] for path in paths]) actions = np.concatenate([path["action"] for path in paths]) rewards = np.concatenate([path["reward"] for path in paths]) # compute Q-val estimates (discounted future returns) for each time step returns = self.get_returns(paths) # advantage will depend on the baseline implementation advantages = self.calculate_advantage(returns, observations) # run training operations if self.config.use_baseline: self.baseline_network.update_baseline(returns, observations) self.update_policy(observations, actions, advantages) # logging if (t % self.config.summary_freq == 0): self.update_averages(total_rewards, all_total_rewards) self.record_summary(t) # compute reward statistics for this batch and log avg_reward = np.mean(total_rewards) sigma_reward = np.sqrt(np.var(total_rewards) / len(total_rewards)) msg = "Average reward: {:04.2f} +/- {:04.2f}".format( avg_reward, sigma_reward) averaged_total_rewards.append(avg_reward) self.logger.info(msg) if self.config.record and (last_record > self.config.record_freq): self.logger.info("Recording...") last_record = 0 self.record() self.logger.info("- Training done.") np.save(self.config.scores_output, averaged_total_rewards) export_plot(averaged_total_rewards, "Score", self.config.env_name, self.config.plot_output)
def train(self):
    scores_eval = []
    self.init_averages()

    for t in range(self.config.num_batches):
        # collect a minibatch of samples
        paths, total_rewards = self.sample_path(self.env)
        scores_eval = scores_eval + total_rewards
        observations = np.concatenate([path["observation"] for path in paths])
        actions = np.concatenate([path["action"] for path in paths])
        rewards = np.concatenate([path["reward"] for path in paths])

        # compute Q-val estimates (discounted future returns) for each time step
        returns = self.get_returns(paths)
        advantages = self.calculate_advantage(returns, observations)

        # run training operations
        if self.config.use_baseline:
            self.baseline_network.update_baseline(returns, observations)
        self.train_step(observations, actions, advantages)

        if t % self.config.summary_freq == 0:
            self.update_averages(total_rewards, scores_eval)
            self.add_summary(t)

        # compute reward statistics for this batch and log on one console line
        avg_reward = np.mean(total_rewards)
        sigma_reward = np.sqrt(np.var(total_rewards) / len(total_rewards))
        sys.stdout.write('\r')
        sys.stdout.flush()
        msg = "Average reward: {0:04.2f} +/- {1:04.2f} step: {2}/{3}".format(
            avg_reward, sigma_reward, t, self.config.num_batches)
        print(msg, end='')

        if self.config.record and (t + 1) % self.config.record_freq == 0:
            sys.stdout.write('\n')
            sys.stdout.flush()
            print('Recording')
            self.record()

    sys.stdout.write('\n')
    sys.stdout.flush()
    print('Training done.')
    # inspect the learned log standard deviation of the Gaussian policy
    print(self.normal_layer.log_std.numpy())
    export_plot(scores_eval, 'Score', self.config.env_name, self.config.plot_output)
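# The `train_step` call above hides the actual policy-gradient update. A
# minimal TF2 sketch of the REINFORCE surrogate loss -E[log pi(a|s) * A],
# assuming `import tensorflow as tf` at module level, that self.policy maps a
# batch of observations to a distribution object with a `log_prob` method
# (e.g. built with tensorflow_probability), and that self.optimizer is a
# tf.keras optimizer; all three names are assumptions, not part of the code
# above:
def train_step(self, observations, actions, advantages):
    observations = tf.convert_to_tensor(observations, dtype=tf.float32)
    advantages = tf.convert_to_tensor(advantages, dtype=tf.float32)
    with tf.GradientTape() as tape:
        dist = self.policy(observations)    # action distribution per state
        log_probs = dist.log_prob(actions)  # log pi(a_t | s_t)
        # negative sign because optimizers minimize while the policy
        # gradient ascends the expected return
        loss = -tf.reduce_mean(log_probs * advantages)
    grads = tape.gradient(loss, self.policy.trainable_variables)
    self.optimizer.apply_gradients(zip(grads, self.policy.trainable_variables))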