def play_episode(self, env: Environment) -> Episode:
    env.reset()
    episode_steps = []
    total_reward: Reward = 0.0
    while not env.is_done():
        episode_step, reward = self.step(env)
        episode_steps.append(episode_step)
        total_reward += reward
    episode = Episode(steps=episode_steps, reward=total_reward)
    return episode
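
# The Environment, Episode and Reward names used above are defined
# elsewhere in the project. Below is a minimal sketch of the two
# containers, assuming only what play_episode itself uses; the field
# types here are assumptions, not the original definitions.
from typing import Any, List, NamedTuple

Reward = float


class Episode(NamedTuple):
    steps: List[Any]
    reward: Reward
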
import numpy as np

# A3C, Environment and the gradient applier used below are imported
# from the project's other modules.


class Trainer(object):
    """
    Class for training a local network / ONE agent
    """

    def __init__(self, thread_index, global_network, initial_learning_rate,
                 learning_rate, grad_applier, show_env=False,
                 local_t_max=20, max_global_time_step=10 * 10**7,
                 gamma=0.99, save_interval_step=100 * 1000,
                 env='Breakout-v0', device='/cpu:0'):
        self.thread_index = thread_index
        self.learning_rate = learning_rate
        self.env = env

        # Whether to render the environment or not during training
        # (main.py sets this to True for one of the agents)
        self.show_env = show_env

        # Discount factor for the reward
        self.gamma = gamma

        # Number of "epochs"
        self.max_global_time_step = max_global_time_step

        # Number of steps for the LSTM
        self.local_t_max = local_t_max

        # Number of actions the agent can take
        self.action_size = Environment.get_action_size(env)

        self.local_network = A3C(self.action_size, self.thread_index, device)
        self.global_network = global_network

        # Build computational graph
        self.local_network._create_network()

        # Build computational graph for the losses and gradients
        self.local_network.prepare_a3c_loss()
        self.apply_gradients = grad_applier.minimize_local(
            self.local_network.a3c_loss,
            global_network.get_vars(),
            self.local_network.get_vars())

        # Sync the weights of the local network with those
        # of the main network
        self.sync = self.local_network.sync_from(global_network)

        # Initialize time step, learning rate, etc.
        self.local_t = 0
        self.initial_learning_rate = initial_learning_rate
        self.episode_reward = 0

    def build_environment(self):
        """
        Create the environment
        """
        self.environment = Environment(self.env, show_env=self.show_env)

    def stop(self):
        """
        Terminate the environment
        """
        self.environment.stop()

    def _record_score(self, sess, summary_writer, summary_op, score_input,
                      score, global_t):
        """
        Save the score to TensorBoard
        """
        summary_str = sess.run(summary_op, feed_dict={score_input: score})
        summary_writer.add_summary(summary_str, global_t)
        # Write to disk
        summary_writer.flush()

    def choose_action(self, pi_values):
        """
        Sample an action from the learned policy distribution

        :param pi_values: Probability distribution over the actions
        """
        return np.random.choice(range(len(pi_values)), p=pi_values)

    def concat_action_reward(self, action, action_size, reward):
        """
        Return the one-hot encoded action concatenated with the reward,
        e.g. action=2, action_size=4, reward=1.0 -> [0., 0., 1., 0., 1.]
        """
        action_reward = np.zeros([action_size + 1], dtype='float32')
        action_reward[action] = 1.0
        action_reward[-1] = float(reward)
        return action_reward

    def _decay_learning_rate(self, global_time_step):
        """
        Decay the learning rate linearly
        """
        time_left = self.max_global_time_step - global_time_step
        learning_rate = self.initial_learning_rate * time_left \
            / self.max_global_time_step

        # Clip the learning rate at 0.0
        if learning_rate < 0.0:
            learning_rate = 0.0

        return learning_rate

    def _process_a3c(self, sess, global_t, summary_writer, summary_op,
                     score_input):
        """
        Process local_t_max steps/frames in the A3C network

        :param sess: TensorFlow session object
        :param global_t: Global time step (number of steps processed
                         by the global/shared network)
        """
        # States of the LSTM
        states = []
        last_action_rewards = []
        actions = []
        rewards = []
        values = []

        # Synchronize with the global network
        sess.run(self.sync)

        # Initial local time step
        self.local_t = 0

        # Whether we hit a terminal state or not
        terminal_end = False

        start_lstm_state = self.local_network.lstm_state_out

        # Loop for local_t_max time steps
        for _ in range(self.local_t_max):
            last_action = self.environment.last_action
            last_reward = self.environment.last_reward
            last_action_reward = self.concat_action_reward(
                last_action, self.action_size, last_reward)

            # Compute the policy and value function
            pi_, value_ = self.local_network.run_pi_value(
                sess, self.environment.last_state, last_action_reward)

            # Pick an action given the newly computed policy
            action = self.choose_action(pi_)

            # Append results to the rollout buffers
            states.append(self.environment.last_state)
            last_action_rewards.append(last_action_reward)
            actions.append(action)
            values.append(value_)

            # Process the chosen action
            new_state, reward, terminal = self.environment.process(action)

            rewards.append(reward)
            self.episode_reward += reward
            self.local_t += 1

            if terminal:
                # Environment hit a terminal state
                terminal_end = True

                # ----------------
                # PRINT STATISTICS
                # ----------------
                print('Time step: %5d k - Score: %3d'
                      % (global_t / 1000, self.episode_reward))

                self._record_score(sess, summary_writer, summary_op,
                                   score_input, self.episode_reward,
                                   global_t)

                self.episode_reward = 0
                self.environment.reset()
                self.local_network.reset_state()
                break

        # ---------
        # BACK-PROP
        # ---------
        # We discount the rewards from t - 1 down to t_start. The
        # bootstrap value R is 0 if we hit a terminal state, otherwise
        # it is the value function of the last state.
        R = 0.0
        if not terminal_end:
            R = self.local_network.run_last_value(sess, new_state,
                                                  last_action_reward)

        # Reverse the rollout buffers
        actions.reverse()
        states.reverse()
        rewards.reverse()
        values.reverse()

        # To compute the gradients we build a minibatch of at most
        # local_t_max transitions
        batch_s = []
        batch_a = []
        batch_adv = []
        batch_R = []

        # For printing
        R_non_discounted = R

        # Discounting: working backwards from the bootstrap value R,
        # each iteration below computes the n-step return
        # R_i = r_i + gamma * R_{i+1} and the advantage
        # A_i = R_i - V(s_i) that scales the policy gradient term of
        # the A3C loss.
        for (ai, ri, si, Vi) in zip(actions, rewards, states, values):
            R = ri + self.gamma * R
            adv = R - Vi
            a = np.zeros([self.action_size], dtype='float32')
            a[ai] = 1.0

            batch_s.append(si)
            batch_a.append(a)
            # Convert np.array -> float because the advantage and
            # reward placeholders expect shape [None, ], not [None, 1]
            batch_adv.append(float(adv))
            batch_R.append(float(R))

        # Restore chronological order
        batch_s.reverse()
        batch_a.reverse()
        batch_adv.reverse()
        batch_R.reverse()

        # Decay the learning rate
        cur_learning_rate = self._decay_learning_rate(global_t)

        # Create the feed_dict for the gradient applier
        feed_dict = {
            self.local_network.input: batch_s,
            self.local_network.last_action_reward: last_action_rewards,
            self.local_network.a: batch_a,
            self.local_network.adv: batch_adv,
            self.local_network.R: batch_R,
            self.local_network.lstm_state: start_lstm_state,
            self.learning_rate: cur_learning_rate
        }

        # Compute gradients and update the weights
        sess.run(self.apply_gradients, feed_dict=feed_dict)

        """
        # ----------------
        # PRINT STATISTICS
        # ----------------
        # Compute losses
        total_loss, policy_loss, value_loss = \
            self.local_network.run_losses(sess, feed_dict)
        total_loss = np.mean(total_loss)
        policy_loss = np.mean(policy_loss)
        value_loss = np.mean(value_loss)

        if global_t % 1000 == 0:
            print('Time Step: %6d k Reward: %3d - Total Loss: %.4f - '
                  'Policy Loss: %.4f - Value Loss: %.4f'
                  % (global_t / 1000, float(R_non_discounted),
                     total_loss, policy_loss, value_loss))

        # Save to the log file
        with open(LOG_FILE, 'a') as f:
            f.write('Reward: %3d - Total Loss: %.4f - Policy Loss: %.4f '
                    '- Value Loss: %.4f \n'
                    % (float(R), total_loss, policy_loss, value_loss))
        """

        # Return the number of steps taken so that the caller can
        # update global_time_step
        return self.local_t
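

# A standalone sketch of the n-step return / advantage computation
# performed by the reversed loop in _process_a3c, written with plain
# Python lists so it can be run in isolation. The function name and
# the toy numbers in the example are illustrative and not part of the
# original code.
def n_step_returns_and_advantages(rewards, values, bootstrap_value,
                                  gamma=0.99):
    """Iterate backwards from the bootstrap value, computing
    R_i = r_i + gamma * R_{i+1} and A_i = R_i - V(s_i), then restore
    chronological order."""
    R = bootstrap_value
    returns, advantages = [], []
    for r, v in zip(reversed(rewards), reversed(values)):
        R = r + gamma * R
        returns.append(float(R))
        advantages.append(float(R - v))
    returns.reverse()
    advantages.reverse()
    return returns, advantages


# Example: rewards [0, 0, 1], predicted values [0.4, 0.5, 0.6] and a
# bootstrap value of 0.5 give returns of roughly [1.465, 1.480, 1.495]
# in chronological order:
# n_step_returns_and_advantages([0.0, 0.0, 1.0], [0.4, 0.5, 0.6], 0.5)
#
# How a worker thread might drive the Trainer (assumed; the actual
# loop lives in main.py, which is not shown here): each thread calls
# _process_a3c repeatedly, adding its return value (self.local_t) to
# the shared counter until max_global_time_step is reached; every call
# first syncs the local network and ends with one gradient update of
# the global network.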