Example #1
    def train(self):
        """
        Performs training

        You do not have to change or use anything here, but take a look
        to see how all the code you've written fits together!
        """
        last_record = 0

        self.init_averages()
        scores_eval = []  # list of scores computed at iteration time

        for t in range(self.config.num_batches):
            # collect a minibatch of samples
            paths, total_rewards = self.sample_path(self.env)
            scores_eval = scores_eval + total_rewards
            observations = np.concatenate(
                [path["observation"] for path in paths])
            actions = np.concatenate([path["action"] for path in paths])
            rewards = np.concatenate([path["reward"] for path in paths])
            # compute Q-val estimates (discounted future returns) for each time step
            returns = self.get_returns(paths)
            
            # advantage will depend on the baseline implementation
            advantages = self.calculate_advantage(returns, observations)
            
            # run training operations
            if self.config.use_baseline:
                self.baseline_network.update_baseline(returns, observations)

            self.sess.run(self.train_op,
                          feed_dict={
                              self.observation_placeholder: observations,
                              self.action_placeholder: actions,
                              self.advantage_placeholder: advantages
                          })

            # periodically write TensorFlow summaries (TensorBoard logging)
            if (t % self.config.summary_freq == 0):
                self.update_averages(total_rewards, scores_eval)
                self.record_summary(t)

            # compute reward statistics for this batch and log
            avg_reward = np.mean(total_rewards)
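            # sigma_reward below is the standard error of the mean: sqrt(Var / N)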
            sigma_reward = np.sqrt(np.var(total_rewards) / len(total_rewards))
            msg = "Average reward: {:04.2f} +/- {:04.2f}".format(
                avg_reward, sigma_reward)
            self.logger.info(msg)

            # track batches since the last recording so the check below can trigger
            last_record += 1
            if self.config.record and (last_record > self.config.record_freq):
                self.logger.info("Recording...")
                last_record = 0
                self.record()

        self.logger.info("- Training done.")
        export_plot(scores_eval, "Score", self.config.env_name,
                    self.config.plot_output)
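
For reference, a minimal standalone sketch of the discounted-return computation that
get_returns is expected to perform. It assumes each path stores a per-step "reward"
array and that the discount factor is passed in as gamma; the function name and
signature below are illustrative only, not the assignment's actual API.

import numpy as np

def get_returns_sketch(paths, gamma):
    """Compute G_t = r_t + gamma * r_{t+1} + gamma^2 * r_{t+2} + ... per time step."""
    all_returns = []
    for path in paths:
        rewards = path["reward"]
        returns = np.zeros(len(rewards))
        running = 0.0
        # walk backwards so each step accumulates its discounted future rewards
        for i in reversed(range(len(rewards))):
            running = rewards[i] + gamma * running
            returns[i] = running
        all_returns.append(returns)
    # concatenate across paths so the result lines up with the concatenated observations
    return np.concatenate(all_returns)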

Example #2

    def train(self):
        """
        Performs training

        You do not have to change or use anything here, but take a look
        to see how all the code you've written fits together!
        """
        last_record = 0

        self.init_averages()
        all_total_rewards = []  # the returns of all episodes sampled, for training purposes
        averaged_total_rewards = []  # the average return for each iteration

        for t in range(self.config.num_batches):

            # collect a minibatch of samples
            paths, total_rewards = self.sample_path(self.env)
            all_total_rewards.extend(total_rewards)
            observations = np.concatenate(
                [path["observation"] for path in paths])
            actions = np.concatenate([path["action"] for path in paths])
            rewards = np.concatenate([path["reward"] for path in paths])
            # compute Q-val estimates (discounted future returns) for each time step
            returns = self.get_returns(paths)

            # advantage will depend on the baseline implementation
            advantages = self.calculate_advantage(returns, observations)

            # run training operations
            if self.config.use_baseline:
                self.baseline_network.update_baseline(returns, observations)
            self.update_policy(observations, actions, advantages)

            # logging
            if (t % self.config.summary_freq == 0):
                self.update_averages(total_rewards, all_total_rewards)
                self.record_summary(t)

            # compute reward statistics for this batch and log
            avg_reward = np.mean(total_rewards)
            sigma_reward = np.sqrt(np.var(total_rewards) / len(total_rewards))
            msg = "Average reward: {:04.2f} +/- {:04.2f}".format(
                avg_reward, sigma_reward)
            averaged_total_rewards.append(avg_reward)
            self.logger.info(msg)

            # track batches since the last recording so the check below can trigger
            last_record += 1
            if self.config.record and (last_record > self.config.record_freq):
                self.logger.info("Recording...")
                last_record = 0
                self.record()

        self.logger.info("- Training done.")
        np.save(self.config.scores_output, averaged_total_rewards)
        export_plot(averaged_total_rewards, "Score", self.config.env_name,
                    self.config.plot_output)
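
Both examples above delegate the advantage computation to calculate_advantage. A hedged
sketch of the usual baseline variant follows: subtract the baseline network's value
predictions from the returns and, optionally, normalize to zero mean and unit variance.
The baseline_values argument and the normalize flag are assumptions for illustration,
not the assignment's actual signature.

import numpy as np

def calculate_advantage_sketch(returns, baseline_values, normalize=True):
    """Advantage estimate: empirical returns minus a learned state-value baseline."""
    advantages = returns - baseline_values
    if normalize:
        # normalizing advantages is a common variance-reduction/stabilization trick
        advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
    return advantages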

Example #3

    def train(self):
        scores_eval = []

        self.init_averages()

        for t in range(self.config.num_batches):
            paths, total_rewards = self.sample_path(self.env)
            scores_eval = scores_eval + total_rewards
            observations = np.concatenate([path["observation"] for path in paths])
            actions = np.concatenate([path["action"] for path in paths])
            rewards = np.concatenate([path["reward"] for path in paths])
            returns = self.get_returns(paths)

            advantages = self.calculate_advantage(returns, observations)

            if self.config.use_baseline:
                self.baseline_network.update_baseline(returns, observations)

            self.train_step(observations, actions, advantages)

            if (t % self.config.summary_freq == 0):
                self.update_averages(total_rewards, scores_eval)
                self.add_sumary(t)

            avg_reward = np.mean(total_rewards)
            sigma_reward = np.sqrt(np.var(total_rewards) / len(total_rewards))
            # overwrite the current console line with a running progress message
            sys.stdout.write('\r')
            sys.stdout.flush()
            msg = "Average reward: {0:04.2f} +/- {1:04.2f} step:{2}/{3}         ".format(
                avg_reward, sigma_reward, t, self.config.num_batches)
            print(msg, end='')

            if self.config.record and not ((t + 1) % self.config.record_freq):
                sys.stdout.write('\n')
                sys.stdout.flush()
                print('Recording')
                self.record()

        sys.stdout.write('\n')
        sys.stdout.flush()
        print('Training done.')
        # print the learned log_std parameter of the policy's normal (Gaussian) layer
        print(self.normal_layer.log_std.numpy())
        export_plot(scores_eval, 'Score', self.config.env_name,
                    self.config.plot_output)
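
None of the three examples show the gradient step hidden behind self.update_policy /
self.train_step. Below is a minimal sketch of the REINFORCE-style update, assuming
TensorFlow 2 and a policy callable that maps a batch of observations to a distribution
object exposing log_prob; both are assumptions about the surrounding code, not its
confirmed interface.

import tensorflow as tf

def update_policy_sketch(policy, optimizer, observations, actions, advantages):
    """One policy-gradient step: minimize -E[log pi(a|s) * advantage]."""
    observations = tf.convert_to_tensor(observations, dtype=tf.float32)
    advantages = tf.convert_to_tensor(advantages, dtype=tf.float32)
    with tf.GradientTape() as tape:
        action_dist = policy(observations)         # assumed to return a distribution
        log_probs = action_dist.log_prob(actions)  # log pi(a_t | s_t)
        loss = -tf.reduce_mean(log_probs * advantages)
    grads = tape.gradient(loss, policy.trainable_variables)
    optimizer.apply_gradients(zip(grads, policy.trainable_variables))
    return float(loss)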