Example #1
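This excerpt omits the file's header; it relies on itertools, numpy, and the TensorFlow 1.x API, and on a PolicyNetwork class and a discount helper defined elsewhere in the project. The imports below are an assumption about what that omitted header contains:

import itertools

import numpy as np
import tensorflow as tf

# PolicyNetwork and discount are assumed to be imported from other modules in this project.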
class Agent:
    """
  Advantage Actor Suggestor Algorithm

  Note:
    Implementation of the Advantage Actor Suggestor algorithm as proposed in ###
  
  Args: 
    env: OpenAI environment object
    
    sess: TensorFlow Session object
    
    data_collection_params (dict): Parameters for data collection / interacting with the environment
      'min_batch_size' (int): Minimum batch size for interacting with the environment
      'min_episodes' (int): Minimum episodes to interact with the environment per batch 
      'episode_adapt_rate' (int): Amount to increase or decrease min_episodes by
    
    training_params (dict): Parameters for training
      'total_timesteps' (int): Total time steps to train for 
      'adaptive_lr' (bool): Use adaptive learning rate based on the desired kl divergence between current and last policy 
      'desired_kl' (double): Desired kl divergence for adaptive learning rate
    
    network_params (dict): Parameters to define the network used
      'value_network' (list): Defines the value network e.g. ['fully_connected_network','small/medium/large'] 
      'policy_network' (list): Defines the policy network e.g. ['fully_connected_network','small/medium/large'] 
    
    algorithm_params (dict): Parameters specific to the algorithm 
      'gamma' (double): discount rate 
      'learning_rate' (double): Learning rate for gradient updates 
      'std_dev' (list): Different ways to define the standard deviation of the policy e.g. ['fixed'/'linear'/'network', (double)/(double)/-] NOTE: the 'network' option has the policy network output the std dev
      'target_update_rate' (double): Rate to perform the soft updates for the networks
               
    logs_path (string): Path to save training logs

  """
    def __init__(self,
                 env,
                 sess,
                 data_collection_params={
                     'min_batch_size': 1000,
                     'min_episodes': 1,
                     'episode_adapt_rate': 3
                 },
                 training_params={
                     'total_timesteps': 1000000,
                     'adaptive_lr': True,
                     'desired_kl': 2e-3
                 },
                 network_params={
                     'policy_network': ['fully_connected_network', 'large']
                 },
                 algorithm_params={
                     'gamma': 0.99,
                     'learning_rate': 1e-3,
                     'std_dev': ['fixed', 0.2],
                     'target_update_rate': 0.001
                 },
                 logs_path="/home/user/workspace/logs/"):

        # Tensorflow Session
        self.sess = sess

        # OpenAI Environment
        self.env = env

        # Getting the shape of observation space and action space of the environment
        self.observation_shape = self.env.observation_space.shape[0]
        self.action_shape = self.env.action_space.shape[0]

        # Hyper Parameters
        self.data_collection_params = data_collection_params
        self.training_params = training_params
        self.network_params = network_params
        self.algorithm_params = algorithm_params

        # Hyper Parameters for Networks
        policy_network_params = {
            'network_type': self.network_params['policy_network'][0],
            'network_size': self.network_params['policy_network'][1]
        }

        # Path to save training logs
        self.logs_path = logs_path

        ##### Networks #####

        self.policy_network = PolicyNetwork(self.env, self.sess,
                                            policy_network_params,
                                            self.algorithm_params)

        ##### Logging #####

        # Placeholder for average reward for logging
        self.average_reward = tf.placeholder(tf.float32, name="average_reward")

        # Log useful information
        self.reward_summary = tf.summary.scalar("average_reward",
                                                self.average_reward)

        # Setup the tf summary writer and initialize all tf variables
        self.writer = tf.summary.FileWriter(logs_path, sess.graph)
        self.sess.run(tf.global_variables_initializer())

    # Collecting experience (data) and training the agent (networks)
    def train(self, saver=None, save_dir=None):

        # Keeping count of total timesteps and episodes of environment experience for stats
        total_timesteps = 0
        total_episodes = 0

        # KL divergence, used to adjust the learning rate
        kl = 0

        # Keeping track of the best average reward
        best_average_reward = -np.inf

        ##### Training #####

        # Training iterations
        while total_timesteps < self.training_params['total_timesteps']:

            # Collect batch of data
            trajectories, returns, undiscounted_returns, advantages, batch_size, episodes = self.collect_trajs(
                total_timesteps)
            observations_batch, actions_batch, rewards_batch, returns_batch, next_observations_batch, advantages_batch = self.traj_to_batch(
                trajectories, returns, advantages)

            # update total timesteps and total episodes
            total_timesteps += batch_size
            total_episodes += episodes

            # Learning rate adaptation
            learning_rate = self.update_lr(kl)

            # Average undiscounted return for the last data collection
            average_reward = np.mean(undiscounted_returns)

            # Save the best model
            if average_reward > best_average_reward:
                # Save the model
                best_average_reward = average_reward
                saver.save(self.sess, save_dir)

            ##### Optimization #####

            policy_summaries, policy_stats = self.train_policy_network(
                observations_batch, actions_batch, advantages_batch,
                learning_rate)
            policy_network_loss = policy_stats['policy_network_loss']
            kl = policy_stats['kl']
            average_advantage = policy_stats['average_advantage']

            self.print_stats(total_timesteps, total_episodes,
                             best_average_reward, average_reward, kl,
                             policy_network_loss, average_advantage,
                             learning_rate, batch_size)

        self.writer.close()

    ##### Helper Functions #####

    # Collect trajectories
    def collect_trajs(self, total_timesteps):
        # Batch size and episodes experienced in current iteration
        batch_size = 0
        episodes = 0

        # Lists to collect data
        trajectories, returns, undiscounted_returns, advantages = [], [], [], []

        ##### Collect Batch #####

        # Collect experience until both the minimum batch size and the minimum number of episodes are reached
        while episodes < self.data_collection_params[
                'min_episodes'] or batch_size < self.data_collection_params[
                    'min_batch_size']:

            ##### Episode #####

            # Run one episode
            observations, actions, rewards, dones = self.run_one_episode()

            ##### Data Appending #####

            # Get sum of reward for this episode
            undiscounted_returns.append(np.sum(rewards))

            # Update the counters
            batch_size += len(rewards)
            total_timesteps += len(rewards)
            episodes += 1

            self.log_rewards(np.sum(rewards), total_timesteps)

            # Episode trajectory
            trajectory = {
                "observations": np.array(observations),
                "actions": np.array(actions),
                "rewards": np.array(rewards),
                "dones": np.array(dones)
            }
            trajectories.append(trajectory)

            # Computing the discounted return for this episode (NOT A SINGLE NUMBER, FOR EACH OBSERVATION)
            return_ = discount(trajectory["rewards"],
                               self.algorithm_params['gamma'])

            # Computing the advantage estimate
            advantage = return_ - np.mean(return_)
            returns.append(return_)
            advantages.append(advantage)

        return [
            trajectories, returns, undiscounted_returns, advantages,
            batch_size, episodes
        ]

    # Run one episode
    def run_one_episode(self):

        # Restart env
        observation = self.env.reset()

        # Flag that env is in terminal state
        done = False

        observations, actions, rewards, dones = [], [], [], []
        while not done:
            # Collect the observation
            observations.append(observation)

            # Sample action with current policy
            action = self.compute_action(observation)

            # For single dimension actions, wrap it in np array
            if not isinstance(action, (list, tuple, np.ndarray)):
                action = np.array([action])
            action = np.concatenate(action)

            # Take action in environment
            observation, reward, done, _ = self.env.step(action)

            # Collect reward and action
            rewards.append(reward)
            actions.append(action)
            dones.append(done)

        return [observations, actions, rewards, dones]

    # Compute action using policy network
    def compute_action(self, observation):
        action = self.policy_network.compute_action(observation)
        return action

    # Log rewards
    def log_rewards(self, rewards, timestep):
        reward_summary = self.sess.run(
            [self.reward_summary], {self.average_reward: np.sum(rewards)})[0]
        self.writer.add_summary(reward_summary, timestep)

    # Convert trajectories to batches
    def traj_to_batch(self, trajectories, returns, advantages):
        ##### Data Prep #####

        # Observations for this batch
        observations_batch = np.concatenate(
            [trajectory["observations"] for trajectory in trajectories])
        next_observations_batch = np.roll(observations_batch, 1, axis=0)
        next_observations_batch[0, :] = observations_batch[0, :]

        # Actions for this batch, reshaping to handle 1D action spaces
        actions_batch = np.concatenate([
            trajectory["actions"] for trajectory in trajectories
        ]).reshape([-1, self.action_shape])

        # Rewards of the trajectory as a batch
        rewards_batch = np.concatenate([
            trajectory["rewards"] for trajectory in trajectories
        ]).reshape([-1, 1])

        # Binary dones from environment in a batch
        dones_batch = np.concatenate(
            [trajectory["dones"] for trajectory in trajectories])

        # Discounted returns for this batch. itertools is used to flatten the list of per-episode arrays into one long array
        returns_batch = np.array(list(
            itertools.chain.from_iterable(returns))).reshape([-1, 1])

        # Advantages for this batch. itertools is used to flatten the list of per-episode arrays into one long array
        advantages_batch = np.array(
            list(itertools.chain.from_iterable(advantages))).flatten().reshape(
                [-1, 1])

        return [
            observations_batch, actions_batch, rewards_batch, returns_batch,
            next_observations_batch, advantages_batch
        ]

    # Update learning rate
    def update_lr(self, kl):
        if self.training_params['adaptive_lr']:
            if kl > self.training_params['desired_kl'] * 2:
                self.algorithm_params['learning_rate'] /= 1.5
            elif kl < self.training_params['desired_kl'] / 2:
                self.algorithm_params['learning_rate'] *= 1.5
            learning_rate = self.algorithm_params['learning_rate']
        else:
            learning_rate = self.algorithm_params['learning_rate']
        return learning_rate

    # Add summaries to the writer
    def add_summaries(self, summaries, timestep):
        for summary in summaries:
            # Write summary for tensorboard visualization
            self.writer.add_summary(summary, timestep)

    # Train value network
    def train_value_network(self, observations_batch, returns_batch,
                            learning_rate):
        summaries, stats = self.value_network.train(observations_batch,
                                                    returns_batch,
                                                    learning_rate)
        return [summaries, stats]

    # Train policy network
    def train_policy_network(self, observations_batch, actions_batch,
                             advantages_batch, learning_rate):
        summaries, stats = self.policy_network.train(observations_batch,
                                                     advantages_batch,
                                                     actions_batch,
                                                     learning_rate)
        return [summaries, stats]

    # Print stats
    def print_stats(self, total_timesteps, total_episodes, best_average_reward,
                    average_reward, kl, policy_network_loss, average_advantage,
                    learning_rate, batch_size):
        ##### Reporting Performance #####

        # Printing performance progress and other useful information
        print(
            "_______________________________________________________________________________________________________________________________________________________________________________________________________________"
        )
        print("{:>15} {:>15} {:>15} {:>15} {:>20} {:>20} {:>20} {:>10} {:>15}".
              format("total_timesteps", "episodes", "best_reward", "reward",
                     "kl_divergence", "policy_loss", "average_advantage", "lr",
                     "batch_size"))
        print(
            "{:>15} {:>15} {:>15.2f} {:>15.2f} {:>20.5E} {:>20.2f} {:>20.2f} {:>10.2E} {:>15}"
            .format(total_timesteps, total_episodes, best_average_reward,
                    average_reward, kl, policy_network_loss, average_advantage,
                    learning_rate, batch_size))
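A minimal usage sketch for this class, assuming a TensorFlow 1.x session and a continuous-action Gym environment; the environment id, log path, and checkpoint path below are illustrative, not part of the original project:

import gym
import tensorflow as tf

env = gym.make("Pendulum-v0")  # any environment with Box observation and action spaces
with tf.Session() as sess:
    agent = Agent(env, sess, logs_path="/tmp/a2s_logs/")
    saver = tf.train.Saver()  # created after the Agent so the policy network's variables exist
    # In this version save_dir is passed straight to saver.save(), so it is a checkpoint path
    agent.train(saver=saver, save_dir="/tmp/a2s_checkpoints/best_model.ckpt")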
Example #2
File: A2S.py  Project: raejeong/RaeboSchool
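As above, this excerpt omits the file's header; it relies on itertools, numpy, and the TensorFlow 1.x API, and on QNetwork, ValueNetwork, PolicyNetwork, and a discount helper defined elsewhere in the project. The imports below are an assumption about what that omitted header contains:

import itertools

import numpy as np
import tensorflow as tf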
class Agent:
    """
  Advantage Actor Suggestor Algorithm

  Note:
    Implementation of the Advantage Actor Suggestor algorithm as proposed in ###
  
  Args: 
    env: OpenAI environment object
    
    sess: TensorFlow Session object
    
    data_collection_params (dict): Parameters for data collection / interacting with the environment
      'min_batch_size' (int): Minimum batch size for interacting with the environment
      'min_episodes' (int): Minimum episodes to interact with the environment per batch 
      'episode_adapt_rate' (int): Amount to increase or decrease min_episodes by
    
    training_params (dict): Parameters for training
      'total_timesteps' (int): Total time steps to train for 
      'adaptive_lr' (bool): Use adaptive learning rate based on the desired kl divergence between current and last policy 
      'desired_kl' (double): Desired kl divergence for adaptive learning rate
    
    network_params (dict): Parameters to define the network used
      'q_network' (list): Defines the Q network e.g. ['fully_connected_network','small/medium/large'] 
      'value_network' (list): Defines the value network e.g. ['fully_connected_network','small/medium/large'] 
      'policy_network' (list): Defines the policy network e.g. ['fully_connected_network','small/medium/large'] 
    
    algorithm_params (dict): Parameters specific to the algorithm 
      'gamma' (double): discount rate 
      'learning_rate' (double): Learning rate for gradient updates 
      'number_of_suggestions' (int): Number of suggestion given by the policy network to the Q network 
      'q_target_estimate_iteration' (int): Number of iterations to estimate the target Q value
      'std_dev' (list): Different ways to define the standard deviation of the policy e.g. ['fixed'/'linear'/'network', (double)/(double)/-] NOTE: the 'network' option has the policy network output the std dev
      'PER_size' (int): Experience replay buffer size 
      'PER_batch_size' (int): Batch size sampled from the experience replay buffer
      'PER_iterations' (int): Number of iterations to train from the experience replay buffer
      'PER_alpha' (double): Proportional prioritization constant 
      'PER_epsilon' (double): Small positive constant that ensures that no transition has zero priority
      'target_update_rate' (double): Rate to perform the soft updates for the networks
               
    logs_path (string): Path to save training logs

  """
    def __init__(self,
                 env,
                 sess,
                 data_collection_params={
                     'min_batch_size': 1000,
                     'min_episodes': 3,
                     'episode_adapt_rate': 3
                 },
                 training_params={
                     'total_timesteps': 1000000,
                     'adaptive_lr': True,
                     'desired_kl': 6e-3
                 },
                 network_params={
                     'q_network': ['fully_connected_network', 'large'],
                     'value_network': ['fully_connected_network', 'large'],
                     'policy_network': ['fully_connected_network', 'large']
                 },
                 algorithm_params={
                     'gamma': 0.99,
                     'learning_rate': 1e-3,
                     'number_of_suggestions': 5,
                     'q_target_estimate_iteration': 10,
                     'std_dev': ['fixed', 0.2],
                     'PER_size': 50000,
                     'PER_batch_size': 64,
                     'PER_iterations': 100,
                     'PER_alpha': 0.6,
                     'PER_epsilon': 0.01,
                     'target_update_rate': 0.001
                 },
                 logs_path="/home/user/workspace/logs/"):

        # Tensorflow Session
        self.sess = sess

        # OpenAI Environment
        self.env = env

        # Getting the shape of observation space and action space of the environment
        self.observation_shape = self.env.observation_space.shape[0]
        self.action_shape = self.env.action_space.shape[0]

        # Hyper Parameters
        self.data_collection_params = data_collection_params
        self.training_params = training_params
        self.network_params = network_params
        self.algorithm_params = algorithm_params
        self.A2C = False

        # Hyper Parameters for Networks
        q_network_params = {
            'network_type': self.network_params['q_network'][0],
            'network_size': self.network_params['q_network'][1]
        }
        value_network_params = {
            'network_type': self.network_params['value_network'][0],
            'network_size': self.network_params['value_network'][1]
        }
        policy_network_params = {
            'network_type': self.network_params['policy_network'][0],
            'network_size': self.network_params['policy_network'][1]
        }

        # Path to save training logs
        self.logs_path = logs_path

        ##### Networks #####

        self.q_network = QNetwork(self.env, self.sess, q_network_params,
                                  self.algorithm_params)
        self.value_network = ValueNetwork(self.env, self.sess,
                                          value_network_params,
                                          self.algorithm_params)
        self.policy_network = PolicyNetwork(self.env, self.sess,
                                            policy_network_params,
                                            self.algorithm_params)

        ##### Logging #####

        # Placeholder for average reward for logging
        self.average_reward = tf.placeholder(tf.float32, name="average_reward")

        # Log useful information
        self.reward_summary = tf.summary.scalar("average_reward",
                                                self.average_reward)

        # Setup the tf summary writer and initialize all tf variables
        self.writer = tf.summary.FileWriter(logs_path, sess.graph)
        self.sess.run(tf.global_variables_initializer())

    # Collecting experience (data) and training the agent (networks)
    def train(self, saver=None, save_dir=None):

        # Keeping count of total timesteps and episodes of environment experience for stats
        total_timesteps = 0
        total_episodes = 0

        # KL divergence, used to adjust the learning rate
        kl = 0

        # Keeping track of the best max reward
        best_max_reward = -np.inf
        count = 0
        flag1 = True
        flag25 = True
        flag50 = True
        flag75 = True

        ##### Training #####

        # Training iterations
        while total_timesteps < self.training_params['total_timesteps']:

            # Collect batch of data
            trajectories, returns, undiscounted_returns, advantages, batch_size, episodes = self.collect_trajs(
                total_timesteps)
            observations_batch, actions_batch, rewards_batch, returns_batch, next_observations_batch, advantages_batch = self.traj_to_batch(
                trajectories, returns, advantages)

            # update total timesteps and total episodes
            total_timesteps += batch_size
            total_episodes += episodes

            # Learning rate adaptation
            learning_rate = self.update_lr(kl)
            # learning_rate = self.algorithm_params['learning_rate']

            # Average undiscounted return for the last data collection
            average_reward = np.mean(undiscounted_returns) + 100
            max_reward = np.max(undiscounted_returns) + 100

            self.policy_network.update_std_dev()

            if flag1 and total_timesteps > 10000:
                saver.save(self.sess, save_dir + "/" + "A2S-10k.ckpt")
                flag1 = False

            if flag25 and total_timesteps > 250000:
                saver.save(self.sess, save_dir + "/" + "A2S-250k.ckpt")
                flag25 = False

            if flag50 and total_timesteps > 500000:
                saver.save(self.sess, save_dir + "/" + "A2S-500k.ckpt")
                flag50 = False

            if flag75 and total_timesteps > 750000:
                saver.save(self.sess, save_dir + "/" + "A2S-750k.ckpt")
                flag75 = False

            print(self.policy_network.std_dev)

            # Save the best model
            if max_reward > best_max_reward:
                count = 0
                if self.A2C:
                    count += 0
                    if count > 5:
                        count = 0
                        self.A2C = not self.A2C
                # Backup network
                self.q_network.backup()
                self.value_network.backup()
                self.policy_network.backup()
                # Save the model
                best_max_reward = max_reward
                saver.save(self.sess, save_dir + "/" + "A2S-Best.ckpt")

            if (max_reward < best_max_reward) and (
                    1 - (abs(max_reward - best_max_reward) /
                         (abs(best_max_reward) + abs(max_reward))) <
                    np.random.random() - 0.65):
                # if max_reward < best_max_reward:
                # count += 1

                # if count > 12:
                #Restore networks
                print("RESTORED")
                count = 0
                self.algorithm_params['learning_rate'] /= 5.0
                learning_rate = self.algorithm_params['learning_rate']
                self.training_params['desired_kl'] /= 1.1
                self.q_network.restore()
                self.value_network.restore()
                self.policy_network.restore()
                actionswe_batch = self.current_q_sample_actions_batch(
                    observations_batch.shape[0], observations_batch)
                self.policy_network.train_q(observations_batch.shape[0],
                                            observations_batch,
                                            actionswe_batch, learning_rate)

            else:
                ##### Optimization #####

                q_summaries, q_stats = self.train_q_network(
                    batch_size, observations_batch, actions_batch,
                    rewards_batch, next_observations_batch, returns_batch,
                    learning_rate)
                q_network_loss = q_stats['q_network_loss']
                self.add_summaries(q_summaries, total_timesteps)

                value_summaries, value_stats = self.train_value_network(
                    batch_size, observations_batch, returns_batch,
                    learning_rate)
                value_network_loss = value_stats['value_network_loss']
                self.add_summaries(value_summaries, total_timesteps)
                actionswe_batch = self.current_q_sample_actions_batch(
                    observations_batch.shape[0], observations_batch)
                self.policy_network.train_q(observations_batch.shape[0],
                                            observations_batch,
                                            actionswe_batch, learning_rate)

                policy_summaries, policy_stats = self.train_policy_network(
                    observations_batch, actions_batch, advantages_batch,
                    learning_rate)
                policy_network_loss = policy_stats['policy_network_loss']
                kl = policy_stats['kl']
                average_advantage = policy_stats['average_advantage']
                self.print_stats(total_timesteps, total_episodes,
                                 best_max_reward, max_reward, kl,
                                 policy_network_loss, value_network_loss,
                                 q_network_loss, average_advantage,
                                 learning_rate, batch_size)

        self.writer.close()

    ##### Helper Functions #####

    # Collect trajectories
    def collect_trajs(self, total_timesteps):
        # Batch size and episodes experienced in current iteration
        batch_size = 0
        episodes = 0

        # Lists to collect data
        trajectories, returns, undiscounted_returns, advantages = [], [], [], []

        ##### Collect Batch #####

        # Collect experience until both the minimum batch size and the minimum number of episodes are reached
        while episodes < self.data_collection_params[
                'min_episodes'] or batch_size < self.data_collection_params[
                    'min_batch_size']:

            ##### Episode #####

            # Run one episode
            observations, actions, rewards, dones = self.run_one_episode()

            ##### Data Appending #####

            # Get sum of reward for this episode
            undiscounted_returns.append(np.sum(rewards))

            # Update the counters
            batch_size += len(rewards)
            total_timesteps += len(rewards)
            episodes += 1

            self.log_rewards(np.sum(rewards), total_timesteps)

            # Episode trajectory
            trajectory = {
                "observations": np.array(observations),
                "actions": np.array(actions),
                "rewards": np.array(rewards),
                "dones": np.array(dones)
            }
            trajectories.append(trajectory)

            # Computing the discounted return for this episode (NOT A SINGLE NUMBER, FOR EACH OBSERVATION)
            return_ = discount(trajectory["rewards"],
                               self.algorithm_params['gamma'])

            # Compute the value estimates for the observations seen during this episode
            values = self.value_network.compute_value(observations)

            # Compute the q value estimates for the observations seen during this episode
            observations_batch = np.concatenate(observations).reshape(
                [-1, self.observation_shape])
            actions_batch = np.concatenate([actions
                                            ]).reshape([-1, self.action_shape])
            q_values = self.q_network.compute_target_q_batch(
                observations_batch, actions_batch)

            # Computing the advantage estimate
            if True:
                # if np.random.random() > 0.0:
                advantage = return_ - np.concatenate(values[0])
            else:
                advantage = np.concatenate(q_values[0]) - np.concatenate(
                    values[0])
            returns.append(return_)
            advantages.append(advantage)

        return [
            trajectories, returns, undiscounted_returns, advantages,
            batch_size, episodes
        ]

    # Run one episode
    def run_one_episode(self):

        # Restart env
        observation = self.env.reset()

        # Flag that env is in terminal state
        done = False

        observations, actions, rewards, dones = [], [], [], []
        while not done:
            # Collect the observation
            observations.append(observation)

            # Sample action with current policy
            action = self.compute_action(observation)

            # For single dimension actions, wrap it in np array
            if not isinstance(action, (list, tuple, np.ndarray)):
                action = np.array([action])
            action = np.concatenate(action)

            # Take action in environment
            observation, reward, done, _ = self.env.step(action)

            # Collect reward and action
            rewards.append(reward)
            actions.append(action)
            dones.append(done)

        return [observations, actions, rewards, dones]

    # Compute action using Q network and policy network
    def compute_action(self, observation):
        # Note: as written this condition is never true (np.random.random() is in [0, 1)),
        # so the action is always chosen from the policy's suggestions using the Q network
        if np.random.random() > 12:
            best_action = self.policy_network.compute_action(observation)
        else:
            suggested_actions = self.policy_network.compute_suggested_actions(
                observation)
            best_action = None
            best_q = -np.inf
            for action in suggested_actions:
                current_q = self.q_network.compute_target_q(
                    observation, action)
                if current_q > best_q:
                    best_q = current_q
                    best_action = action
        return best_action

    # Log rewards
    def log_rewards(self, rewards, timestep):
        reward_summary = self.sess.run(
            [self.reward_summary], {self.average_reward: np.sum(rewards)})[0]
        self.writer.add_summary(reward_summary, timestep)

    # Convert trajectories to batches
    def traj_to_batch(self, trajectories, returns, advantages):
        ##### Data Prep #####

        # Observations for this batch
        observations_batch = np.concatenate(
            [trajectory["observations"] for trajectory in trajectories])
        next_observations_batch = np.roll(observations_batch, 1, axis=0)
        next_observations_batch[0, :] = observations_batch[0, :]

        # Actions for this batch, reshaping to handle 1D action spaces
        actions_batch = np.concatenate([
            trajectory["actions"] for trajectory in trajectories
        ]).reshape([-1, self.action_shape])

        # Rewards of the trajectory as a batch
        rewards_batch = np.concatenate([
            trajectory["rewards"] for trajectory in trajectories
        ]).reshape([-1, 1])

        # Binary dones from environment in a batch
        dones_batch = np.concatenate(
            [trajectory["dones"] for trajectory in trajectories])

        # Discounted returns for this batch. itertools is used to flatten the list of per-episode arrays into one long array
        returns_batch = np.array(list(
            itertools.chain.from_iterable(returns))).reshape([-1, 1])

        # Advantages for this batch. itertools is used to flatten the list of per-episode arrays into one long array
        advantages_batch = np.array(
            list(itertools.chain.from_iterable(advantages))).flatten().reshape(
                [-1, 1])

        return [
            observations_batch, actions_batch, rewards_batch, returns_batch,
            next_observations_batch, advantages_batch
        ]

    # Update learning rate
    def update_lr(self, kl):
        if self.training_params['adaptive_lr']:
            if kl > self.training_params['desired_kl'] * 2:
                self.algorithm_params['learning_rate'] /= 1.5
            elif kl < self.training_params['desired_kl'] / 2:
                self.algorithm_params['learning_rate'] *= 1.5
            learning_rate = self.algorithm_params['learning_rate']
        else:
            learning_rate = self.algorithm_params['learning_rate']
        return learning_rate

    # Train Q Network
    def train_q_network(self, batch_size, observations_batch, actions_batch,
                        rewards_batch, next_observations_batch, returns_batch,
                        learning_rate):
        # y = self.compute_q_network_y_batch(batch_size, rewards_batch, next_observations_batch)
        y = returns_batch
        [summaries,
         stats] = self.q_network.train_current(batch_size, observations_batch,
                                               actions_batch, rewards_batch, y,
                                               learning_rate)
        # self.q_network.replay_buffer_add_batch(batch_size, observations_batch, actions_batch, rewards_batch, next_observations_batch, y)
        # batches = self.q_network.get_batches()
        # batches = self.update_q_batches(batches)
        # summaries, stats = self.q_network.train(batches, learning_rate)
        # batches = self.update_q_batches(batches)
        # self.q_network.replay_buffer_update_batch(batches)
        return [summaries, stats]

    # Compute the y (target) for Q network with the policy
    def compute_q_network_y_batch(self, batch_size, rewards_batch,
                                  next_observations_batch):
        q_target_estimates = np.zeros([
            self.algorithm_params['q_target_estimate_iteration'], batch_size, 1
        ])
        for i in range(self.algorithm_params['q_target_estimate_iteration']):
            q_target_estimates[i, :, :] = self.sample_target_q(
                batch_size, next_observations_batch)
        q_target_mean = np.mean(q_target_estimates, axis=0)
        y = rewards_batch + self.algorithm_params['gamma'] * q_target_mean
        return y

    # Sample a target q value from the policy
    def sample_target_q(self, batch_size, next_observations_batch):
        actions_batch = self.current_q_sample_actions_batch(
            batch_size, next_observations_batch)
        target_q_estimate_batch = self.q_network.compute_target_q_batch(
            next_observations_batch, actions_batch)
        return target_q_estimate_batch[0]

    # Get best action from current Q network
    def current_q_sample_actions_batch(self, batch_size, observations_batch):
        actions_batch = []
        for i in range(batch_size):
            suggested_actions = self.policy_network.compute_suggested_actions(
                observations_batch[i, :])
            best_action = None
            best_q = -np.inf
            for action in suggested_actions:
                current_q = self.q_network.compute_target_q(
                    observations_batch[i, :], action)
                if current_q > best_q:
                    best_q = current_q
                    best_action = action
            actions_batch.append(best_action)

        actions_batch = np.concatenate(actions_batch).reshape(
            [-1, self.action_shape])
        return actions_batch

    # Update Q batches
    def update_q_batches(self, batches):
        for i in range(len(batches)):
            for j in range(len(batches[i])):
                sample = batches[i][j]
                next_observations_batch = np.array(
                    [sample[1].next_observation])
                rewards_batch = np.array([sample[1].reward])
                y = self.compute_q_network_y_batch(1, rewards_batch,
                                                   next_observations_batch)[0]
                q_value_estimate = self.q_network.compute_q(
                    sample[1].observation, sample[1].action[None])
                error = y[0] - q_value_estimate
                batches[i][j][1].y = y[0]
                batches[i][j][1].error = error
        return batches

    # Add summaries to the writer
    def add_summaries(self, summaries, timestep):
        for summary in summaries:
            # Write summary for tensorboard visualization
            self.writer.add_summary(summary, timestep)

    # Train value network
    def train_value_network(self, batch_size, observations_batch,
                            returns_batch, learning_rate):
        summaries, stats = self.value_network.train(batch_size,
                                                    observations_batch,
                                                    returns_batch,
                                                    learning_rate)
        return [summaries, stats]

    # Train policy network
    def train_policy_network(self, observations_batch, actions_batch,
                             advantages_batch, learning_rate):
        summaries, stats = self.policy_network.train(observations_batch,
                                                     advantages_batch,
                                                     actions_batch,
                                                     learning_rate)
        return [summaries, stats]

    def restore_networks(self):
        self.q_network.restore()
        self.value_network.restore()
        self.policy_network.restore()

    # Print stats
    def print_stats(self, total_timesteps, total_episodes, best_max_reward,
                    max_reward, kl, policy_network_loss, value_network_loss,
                    q_network_loss, average_advantage, learning_rate,
                    batch_size):
        ##### Reporting Performance #####

        # Printing performance progress and other useful information
        print(
            "_______________________________________________________________________________________________________________________________________________________________________________________________________________"
        )
        print(
            "{:>15} {:>15} {:>15} {:>15} {:>20} {:>20} {:>20} {:>20} {:>20} {:>10} {:>15}"
            .format("total_timesteps", "episodes", "best_max_reward",
                    "max_reward", "kl_divergence", "policy_loss", "value_loss",
                    "q_network_loss", "average_advantage", "lr", "batch_size"))
        print(
            "{:>15} {:>15} {:>15.2f} {:>15.2f} {:>20.5E} {:>20.2f} {:>20.2f} {:>20.2f} {:>20.2f} {:>10.2E} {:>15}"
            .format(total_timesteps, total_episodes, best_max_reward,
                    max_reward, kl, policy_network_loss, value_network_loss,
                    q_network_loss, average_advantage, learning_rate,
                    batch_size))
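A minimal usage sketch for this version, under the same assumptions as above (TensorFlow 1.x session, continuous-action Gym environment, illustrative paths). Note that here save_dir is treated as a directory into which the A2S-*.ckpt checkpoints are written:

import gym
import tensorflow as tf

env = gym.make("Pendulum-v0")  # illustrative continuous-control environment
with tf.Session() as sess:
    agent = Agent(env, sess, logs_path="/tmp/a2s_logs/")
    saver = tf.train.Saver()  # created after the Agent so the networks' variables exist
    agent.train(saver=saver, save_dir="/tmp/a2s_checkpoints")

Neither listing shows the discount helper used in collect_trajs. A standard per-timestep discounted-return implementation consistent with how it is called (returning an array the same length as rewards) is sketched below; this is an assumption, not the project's actual code:

import numpy as np

def discount(rewards, gamma):
    # G_t = r_t + gamma * G_{t+1}, accumulated backwards over the episode
    returns = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns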