if (t > 0 and episode > 200):
    # Obtain TD-error
    _, base_v = ppo.evaluate_state(s_old)
    _, target_v = ppo.evaluate_state(s)
    lambda_mix = lambda_max * (1 - np.exp(-factor * np.abs(
        r + GAMMA * np.squeeze(target_v) - np.squeeze(base_v))))
else:
    lambda_mix = 10.
lambda_actual = 10.
lambda_store[t] = lambda_mix

# Update ppo
if t == BATCH:  # or (terminal and t < BATCH):
    # Normalise rewards
    rewards = np.array(buffer_r)
    rolling_r.update(rewards)
    rewards = np.clip(rewards / rolling_r.std, -10, 10)

    v_final = [v * (1 - terminal)]  # v = 0 if terminal, otherwise use the predicted v
    values = np.array(buffer_v + v_final)
    terminals = np.array(buffer_terminal + [terminal])

    # Generalized Advantage Estimation - https://arxiv.org/abs/1506.02438
    delta = rewards + GAMMA * values[1:] * (1 - terminals[1:]) - values[:-1]
    advantage = discount(delta, GAMMA * LAMBDA, terminals)
    buffer_v = np.squeeze(np.array(buffer_v))[:, np.newaxis]
    returns = np.squeeze(advantage)[:, np.newaxis] + buffer_v
    advantage = (advantage - advantage.mean()) / np.maximum(advantage.std(), 1e-6)
    bs, ba, br, badv = np.reshape(buffer_s, (len(buffer_s),) + ppo.s_dim), np.vstack(buffer_a), \
        np.vstack(returns), np.vstack(advantage)
    experience.append([bs, ba, br, badv])

    buffer_s, buffer_a, buffer_r, buffer_v, buffer_terminal = [], [], [], [], []

# Update ppo
if t >= BATCH:
    # Per batch normalisation of advantages
    advs = np.concatenate(list(zip(*experience))[3])
    for x in experience:
        x[3] = (x[3] - np.mean(advs)) / np.maximum(np.std(advs), 1e-6)

    # Update rolling reward stats
    rolling_r.update(np.array(batch_rewards))

    print("Training using %i episodes and %i steps..." % (len(experience), t))
    graph_summary = ppo.update(experience)
    t, experience, batch_rewards = 0, [], []

buffer_s.append(s)
buffer_a.append(a)
buffer_v.append(v)
buffer_terminal.append(terminal)
ep_a.append(a)

if not ppo.discrete:
    a = np.clip(a, env.action_space.low, env.action_space.high)
s, r, terminal, _ = env.step(a)
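# Note: discount() is called above but not defined in this listing. The sketch
# below is an assumed implementation, not taken from the source: a reversed
# discounted cumulative sum over the TD residuals that zeroes the running total
# at episode boundaries, which is the standard way to compute GAE
# (https://arxiv.org/abs/1506.02438). The terminal-index convention (terminals
# carrying one extra trailing flag, as built above from
# buffer_terminal + [terminal]) is also an assumption.
import numpy as np

def discount(x, gamma, terminals):
    """Reversed discounted cumulative sum of x, reset where an episode ends."""
    out = np.zeros(len(x), dtype=np.float64)
    running = 0.0
    for i in reversed(range(len(x))):
        # Mask the bootstrap term when the *next* state was terminal, matching
        # the (1 - terminals[1:]) masking used in the delta computation above.
        running = x[i] + gamma * running * (1.0 - float(terminals[i + 1]))
        out[i] = running
    return out

# Used above as: advantage = discount(delta, GAMMA * LAMBDA, terminals)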
def work(self):
    hooks = [self.ppo.sync_replicas_hook]
    sess = tf.train.MonitoredTrainingSession(master=self.server.target,
                                             is_chief=(self.wid == 0),
                                             checkpoint_dir=SUMMARY_DIR,
                                             save_summaries_steps=None,
                                             save_summaries_secs=None,
                                             hooks=hooks)
    if self.wid == 0:
        writer = SummaryWriterCache.get(SUMMARY_DIR)

    t, episode, terminal = 0, 0, False
    buffer_s, buffer_a, buffer_r, buffer_v, buffer_terminal = [], [], [], [], []
    rolling_r = RunningStats()

    while not sess.should_stop() and not (episode > EP_MAX and self.wid == 0):
        s = self.env.reset()
        ep_r, ep_t, ep_a = 0, 0, []

        while True:
            a, v = self.ppo.evaluate_state(s, sess)

            # Update ppo
            if t == BATCH:  # or (terminal and t < BATCH):
                # Normalise rewards
                rewards = np.array(buffer_r)
                rolling_r.update(rewards)
                rewards = np.clip(rewards / rolling_r.std, -10, 10)

                v_final = [v * (1 - terminal)]  # v = 0 if terminal, otherwise use the predicted v
                values = np.array(buffer_v + v_final)
                terminals = np.array(buffer_terminal + [terminal])

                # Generalized Advantage Estimation - https://arxiv.org/abs/1506.02438
                delta = rewards + GAMMA * values[1:] * (1 - terminals[1:]) - values[:-1]
                advantage = discount(delta, GAMMA * LAMBDA, terminals)
                returns = advantage + np.array(buffer_v)
                advantage = (advantage - advantage.mean()) / np.maximum(advantage.std(), 1e-6)

                bs, ba, br, badv = np.reshape(buffer_s, (t,) + self.ppo.s_dim), np.vstack(buffer_a), \
                    np.vstack(returns), np.vstack(advantage)

                graph_summary = self.ppo.update(bs, ba, br, badv, sess)
                buffer_s, buffer_a, buffer_r, buffer_v, buffer_terminal = [], [], [], [], []
                t = 0

            buffer_s.append(s)
            buffer_a.append(a)
            buffer_v.append(v)
            buffer_terminal.append(terminal)
            ep_a.append(a)

            if not self.ppo.discrete:
                a = np.clip(a, self.env.action_space.low, self.env.action_space.high)
            s, r, terminal, _ = self.env.step(a)
            buffer_r.append(r)
            ep_r += r
            ep_t += 1
            t += 1

            if terminal:
                # End of episode summary
                print('Worker_%i' % self.wid,
                      '| Episode: %i' % episode,
                      "| Reward: %.2f" % ep_r,
                      '| Steps: %i' % ep_t)

                if self.wid == 0:
                    worker_summary = tf.Summary()
                    worker_summary.value.add(tag="Reward", simple_value=ep_r)

                    # Create Action histograms for each dimension
                    actions = np.array(ep_a)
                    if self.ppo.discrete:
                        add_histogram(writer, "Action", actions, episode, bins=self.ppo.a_dim)
                    else:
                        for a in range(self.ppo.a_dim):
                            add_histogram(writer, "Action/Dim" + str(a), actions[:, a], episode)

                    try:
                        writer.add_summary(graph_summary, episode)
                    except NameError:
                        pass
                    writer.add_summary(worker_summary, episode)
                    writer.flush()

                episode += 1
                break

    self.env.close()
    print("Worker_%i finished" % self.wid)
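# Note: RunningStats is used above for reward normalisation but is not defined
# in this listing. The class below is a minimal sketch under the assumption
# that it tracks a running mean and standard deviation with Welford's online
# algorithm and exposes the .update() method and .std attribute the workers
# rely on; any other attribute names are illustrative only.
import numpy as np

class RunningStats(object):
    def __init__(self):
        self.n = 0        # number of samples folded in so far
        self.mean = 0.0   # running mean
        self._m2 = 0.0    # running sum of squared deviations from the mean
        self.std = 1.0    # running standard deviation (1.0 until enough data)

    def update(self, x):
        # Fold a batch of rewards into the running statistics one value at a time.
        for value in np.asarray(x, dtype=np.float64).flatten():
            self.n += 1
            delta = value - self.mean
            self.mean += delta / self.n
            self._m2 += delta * (value - self.mean)
        if self.n > 1:
            self.std = float(np.sqrt(self._m2 / (self.n - 1)))

# Usage mirrors the training loop above: rolling_r.update(rewards) followed by
# rewards = np.clip(rewards / rolling_r.std, -10, 10).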