Example #1
    def infer(self, rollout, sess, gamma, bootstrap_value):
        rollout = np.array(rollout)
        observations = rollout[:, 0]
        actions = rollout[:, 1]
        rewards = rollout[:, 2]
        next_observations = rollout[:, 3]
        values = rollout[:, 5]

        # Here we take the rewards and values from the rollout, and use them to
        # generate the advantage and discounted returns.
        # The advantage function uses "Generalized Advantage Estimation"
        rewards_plus = np.asarray(rewards.tolist() + [bootstrap_value])
        discounted_rewards = utils.discount(rewards_plus, gamma)[:-1]
        value_plus = np.asarray(values.tolist() + [bootstrap_value])
        advantages = rewards + gamma * value_plus[1:] - value_plus[:-1]
        advantages = utils.discount(advantages, gamma)

        # Update the global network using gradients from loss
        # Generate network statistics to periodically save
        feed_dict = {
            self.local_AC_network.target_v: discounted_rewards,
            self.local_AC_network.inputs: np.stack(observations),
            self.local_AC_network.actions: actions,
            self.local_AC_network.advantages: advantages
        }
        l, v_l, p_l, e_l, g_n, v_n, _ = sess.run([
            self.local_AC_network.loss, self.local_AC_network.value_loss,
            self.local_AC_network.policy_loss, self.local_AC_network.entropy,
            self.local_AC_network.grad_norms, self.local_AC_network.var_norms,
            self.local_AC_network.apply_grads
        ],
                                                 feed_dict=feed_dict)
        n = len(rollout)
        return l / n, v_l / n, p_l / n, e_l / n, g_n, v_n
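
Every example on this page relies on a discount helper that is not reproduced in the snippets. For reference, a minimal sketch of the version most A3C-style repositories use (an assumption, not necessarily the exact utils.discount imported above) computes the reversed discounted cumulative sum with scipy.signal.lfilter:

import numpy as np
import scipy.signal


def discount(x, gamma):
    # Discounted cumulative sum: out[t] = x[t] + gamma * out[t + 1].
    # Sketch of a common implementation; the repositories above may differ.
    return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]


# Quick check against the recursive definition.
x = np.array([1.0, 2.0, 3.0])
out = discount(x, 0.9)
assert np.allclose(out, [1 + 0.9 * (2 + 0.9 * 3), 2 + 0.9 * 3, 3.0])
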
Example #2
    def train(self, rollout, sess, gamma, bootstrap_value):
        rollout = np.array(rollout)
        states = rollout[:, 0]
        actions = rollout[:, 1]
        rewards = rollout[:, 2]

        prev_rewards = [0] + rewards[:-1].tolist()
        prev_actions = [0] + actions[:-1].tolist()
        values = rollout[:, 3]

        self.pr = prev_rewards
        self.pa = prev_actions
        # Here we take the rewards and values from the rollout, and use them to
        # generate the advantage and discounted returns.
        # The advantage function uses "Generalized Advantage Estimation"
        self.rewards_plus = np.asarray(rewards.tolist() + [bootstrap_value])
        discounted_rewards = ut.discount(self.rewards_plus, gamma)[:-1]
        self.value_plus = np.asarray(values.tolist() + [bootstrap_value])
        advantages = rewards +\
            gamma * self.value_plus[1:] -\
            self.value_plus[:-1]
        advantages = ut.discount(advantages, gamma)

        # Update the global network using gradients from loss
        # Generate network statistics to periodically save
        rnn_state = self.local_AC.st_init
        if self.network == 'lstm':
            feed_dict = {
                self.local_AC.target_v: discounted_rewards,
                self.local_AC.state: np.stack(states, axis=0),
                self.local_AC.prev_rewards: np.vstack(prev_rewards),
                self.local_AC.prev_actions: prev_actions,
                self.local_AC.actions: actions,
                self.local_AC.advantages: advantages,
                self.local_AC.state_in[0]: rnn_state[0],
                self.local_AC.state_in[1]: rnn_state[1]
            }
        elif (self.network == 'relu') or\
             (self.network == 'gru') or\
             (self.network == 'ugru'):
            feed_dict = {
                self.local_AC.target_v: discounted_rewards,
                self.local_AC.st: np.stack(states, axis=0),
                self.local_AC.prev_rewards: np.vstack(prev_rewards),
                self.local_AC.prev_actions: prev_actions,
                self.local_AC.actions: actions,
                self.local_AC.advantages: advantages,
                self.local_AC.st_in: rnn_state
            }

        v_l, p_l, e_l, g_n, v_n, _ = sess.run([
            self.local_AC.value_loss, self.local_AC.policy_loss,
            self.local_AC.entropy, self.local_AC.grad_norms,
            self.local_AC.var_norms, self.local_AC.apply_grads
        ],
                                              feed_dict=feed_dict)
        aux = len(rollout)
        return v_l / aux, p_l / aux, e_l / aux, g_n, v_n
Example #3
    def train(self, rollout, sess, bootstrap_value, settings, summaries=False):
        rollout = np.array(rollout)
        actions = rollout[:, 0]
        rewards = rollout[:, 1]
        timesteps = rollout[:, 2]
        if FLAGS.meta:
            prev_rewards = [0] + rewards[:-1].tolist()
        prev_actions = [0] + actions[:-1].tolist()
        values = rollout[:, 4]

        # The advantage function uses "Generalized Advantage Estimation"
        rewards_plus = np.asarray(rewards.tolist() + [bootstrap_value])
        discounted_rewards = discount(rewards_plus, settings["gamma"])[:-1]
        value_plus = np.asarray(values.tolist() + [bootstrap_value])
        policy_target = discounted_rewards - value_plus[:-1]
        if FLAGS.gen_adv:
            td_residuals = rewards + settings["gamma"] * value_plus[
                1:] - value_plus[:-1]
            advantages = discount(td_residuals, settings["gamma"])
            policy_target = advantages

        rnn_state = self.local_AC.state_init
        if FLAGS.meta:
            feed_dict = {
                self.local_AC.target_v: discounted_rewards,
                self.local_AC.prev_rewards: np.vstack(prev_rewards),
                self.local_AC.prev_actions: prev_actions,
                self.local_AC.actions: actions,
                self.local_AC.timestep: np.vstack(timesteps),
                self.local_AC.advantages: policy_target,
                self.local_AC.state_in[0]: rnn_state[0],
                self.local_AC.state_in[1]: rnn_state[1]
            }
        else:
            feed_dict = {
                self.local_AC.target_v: discounted_rewards,
                self.local_AC.prev_actions: prev_actions,
                self.local_AC.actions: actions,
                self.local_AC.timestep: np.vstack(timesteps),
                self.local_AC.advantages: policy_target,
                self.local_AC.state_in[0]: rnn_state[0],
                self.local_AC.state_in[1]: rnn_state[1]
            }

        if summaries:
            l, v_l, p_l, e_l, g_n, v_n, _, ms = sess.run([
                self.local_AC.loss, self.local_AC.value_loss,
                self.local_AC.policy_loss, self.local_AC.entropy,
                self.local_AC.grad_norms, self.local_AC.var_norms,
                self.local_AC.apply_grads, self.local_AC.merged_summary
            ],
                                                         feed_dict=feed_dict)

            n = len(rollout)
            return l / n, v_l / n, p_l / n, e_l / n, g_n, v_n, ms
        else:
            _ = sess.run([self.local_AC.apply_grads], feed_dict=feed_dict)
            return None
Example #4
    def train(self, rollout, sess, bootstrap_value, settings, summaries=False):
        rollout = np.array(rollout)
        actions = rollout[:, 0]
        rewards = rollout[:, 1]
        timesteps = rollout[:, 2]
        if FLAGS.meta:
            prev_rewards = [0] + rewards[:-1].tolist()
        prev_actions = [0] + actions[:-1].tolist()
        values = rollout[:, 4]

        reward_multiplier = [10 for _ in prev_rewards]

        # The advantage function uses "Generalized Advantage Estimation"
        rewards_plus = np.asarray(rewards.tolist() + [bootstrap_value])
        discounted_rewards = discount(rewards_plus, settings["gamma"])[:-1]
        value_plus = np.asarray(values.tolist() + [bootstrap_value])
        policy_target = discounted_rewards - value_plus[:-1]
        if FLAGS.gen_adv:
            td_residuals = rewards + settings["gamma"] * value_plus[1:] - value_plus[:-1]
            advantages = discount(td_residuals, settings["gamma"])
            policy_target = advantages

        rnn_state = self.local_AC.state_init
        if FLAGS.meta:
            feed_dict = {self.local_AC.target_v: discounted_rewards,
                         self.local_AC.prev_rewards: prev_rewards,
                         self.local_AC.reward_multiplier: reward_multiplier,
                         self.local_AC.prev_actions: prev_actions,
                         self.local_AC.actions: actions,
                         self.local_AC.timestep: np.vstack(timesteps),
                         self.local_AC.advantages: policy_target,
                         self.local_AC.state_in[0]: rnn_state[0],
                         self.local_AC.state_in[1]: rnn_state[1]}
        else:
            feed_dict = {self.local_AC.target_v: discounted_rewards,
                         self.local_AC.prev_actions: prev_actions,
                         self.local_AC.actions: actions,
                         self.local_AC.timestep: np.vstack(timesteps),
                         self.local_AC.advantages: policy_target,
                         self.local_AC.state_in[0]: rnn_state[0],
                         self.local_AC.state_in[1]: rnn_state[1]}

        if summaries:
            l, v_l, p_l, e_l, g_n, v_n, _, ms = sess.run([self.local_AC.loss,
                                                          self.local_AC.value_loss,
                                                          self.local_AC.policy_loss,
                                                          self.local_AC.entropy,
                                                          self.local_AC.grad_norms,
                                                          self.local_AC.var_norms,
                                                          self.local_AC.apply_grads,
                                                          self.local_AC.merged_summary],
                                                         feed_dict=feed_dict)

            return l / len(rollout), v_l / len(rollout), p_l / len(rollout), e_l / len(rollout), g_n, v_n, ms
        else:
            _ = sess.run([self.local_AC.apply_grads], feed_dict=feed_dict)
            return None
Example #5
    def run_n_step(self):
        obs_list, action_list, reward_list, next_obs_list, value_list, rnn_states = \
            [], [], [], [], [], []
        # Reset the environment on the first call. Using `is None` avoids the
        # ValueError that `==` raises when self.obs is already a numpy array.
        if self.obs is None:
            self.obs = self.env.reset()
            self.rnn_state = self.zero_rnn_init

        for _ in range(self.n_steps):
            obs_list.append(self.obs)
            rnn_states.append(self.rnn_state)
            action, reward, next_obs, self.done, value, next_rnn_state = self._run_one_step(
            )
            action_list.append(action)
            reward_list.append(reward)
            next_obs_list.append(next_obs)
            value_list.append(value)

            if self.done:
                ## reset env
                self.obs = self.env.reset()
                self.rnn_state = self.zero_rnn_init

                ## write reward and length to server variables
                self.server.last_reward.value = int(self.total_reward)
                self.server.last_length.value = int(self.total_length)
                self.total_length = 0
                self.total_reward = 0
                break
            else:
                self.obs = next_obs
                self.rnn_state = next_rnn_state

        ## calculate reference values and advantages
        if self.done:
            value_p1 = 0
        else:
            self.predict_q.put((self.id, self.obs, (self.rnn_state[0][0],
                                                    self.rnn_state[1][0])))
            _, value_p1, _ = self.return_q.get()

        rewards_plus = np.asarray(reward_list + [value_p1])
        reward_list = discount(rewards_plus, self.gamma)[:-1]
        value_plus = np.asarray(value_list + [value_p1])
        advs = rewards_plus[:-1] + self.gamma * value_plus[1:] - value_plus[:-1]
        lamb = 1
        advs = discount(advs, self.gamma * lamb)
        self.training_q.put((np.array(obs_list), np.array(action_list), advs,
                             reward_list, rnn_states[0]))
Example #6
    def _cal_adv_and_old_v(self, traj: Trajectories):
        """
        Complete adv and old_v in traj
        """
        v = self.critic(traj.s)
        v_s = v[:-1, :]
        v_s_next = v[1:, :]
        td = traj.r - v_s + self.parms.gamma * v_s_next * traj.not_done

        advs = np.zeros_like(td)

        # Find done index
        done_index = np.where(traj.not_done == 0)[0]

        # Calculate the advantage for each interval between done flags
        start_index = 0
        multiplier = self.parms.gamma * self.parms.lamda
        for end_index in done_index:
            advs[start_index:end_index + 1, ...] = discount(
                td[start_index:end_index + 1, ...], multiplier)
            start_index = end_index + 1

        traj.adv = advs
        traj.old_v = v_s.numpy()
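
Example #6, like most of the train methods above, implements Generalized Advantage Estimation by pushing the TD residuals delta_t = r_t + gamma * V(s_{t+1}) - V(s_t) through the same discount helper, here with multiplier gamma * lambda (the earlier examples pass gamma alone, which corresponds to lambda = 1). A sketch of the equivalent explicit backward recursion, using a hypothetical helper written only for illustration:

import numpy as np


def gae_advantages(rewards, values, bootstrap_value, gamma=0.99, lam=0.95):
    # Backward GAE recursion: A_t = delta_t + gamma * lam * A_{t+1}.
    values = np.append(values, bootstrap_value)
    deltas = rewards + gamma * values[1:] - values[:-1]
    advantages = np.zeros_like(deltas)
    running = 0.0
    for t in reversed(range(len(deltas))):
        running = deltas[t] + gamma * lam * running
        advantages[t] = running
    # Identical to discount(deltas, gamma * lam) for a single episode segment.
    return advantages
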
Example #7
    def test_discount(self):
        rewards = np.array([5, 10, 15, 20, 30, 50])
        gamma = 0.9

        discounted_rewards = np.array([5, 9, 12.15, 14.58, 19.683, 29.5245])

        self.assertTrue(
            np.allclose(discounted_rewards, utils.discount(rewards, gamma)))
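
The expected values in this test (5, 10 * 0.9, 15 * 0.9**2, ...) imply that this particular repository's discount scales each reward by gamma**t rather than computing the cumulative discounted return sketched after Example #1. A sketch that satisfies the test above (inferred from the expected numbers, not taken from the repository itself):

import numpy as np


def discount(rewards, gamma):
    # Scale reward t by gamma**t, matching the values asserted in the test.
    return rewards * gamma ** np.arange(len(rewards))
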
Example #8
    def play_episode(self, train=True):
        # TODO
        state = self.env.reset(train)

        e_states = []
        e_actions = []
        e_rewards = []
        e_newstates = []
        e_dones = []

        for step_count in range(MAX_STEPS):
            self.loss = 0
            self.epsilon -= 1.0 / EXPLORE

            actions = self.actor.model.predict(
                state.reshape(1, 1, state.shape[0])).ravel()
            actions = self.noise_actions(actions, self.epsilon)

            new_state, reward, done = self.env.step(actions)

            e_states.append(state)
            e_actions.append(actions)
            e_rewards.append(reward)
            e_newstates.append(new_state)
            e_dones.append(done)

            state = new_state

            if not step_count % 50:
                logging.info("Step: {0} Reward: {1} Actions: {2}".format(
                    step_count, reward, actions))

            if done:
                self.save_weights()
                break

        # for st, act, rew, nst, d in zip(e_states, e_actions, discount(e_rewards).tolist(), e_newstates, e_dones):
        self.buff.add([e_states, e_actions, e_rewards, e_newstates, e_dones])

        if train:
            self.loss = self._train_episode()

        logging.info("Total reward for episode: {0}".format(
            discount(e_rewards).sum()))
        return discount(e_rewards).sum()
Example #9
    def gae(self, rewards, values, next_vals, dones, gamma, lambda_):
        """
        Performs Generalized Advantage Estimation
    
        rewards - torch FloatTensor of actual rewards collected. Size = L
        values - torch FloatTensor of value predictions. Size = L
        next_vals - torch FloatTensor of value predictions. Size = L
        dones - torch FloatTensor of done signals. Size = L
        gamma - float discount factor
        lambda_ - float gae moving average factor
    
        Returns
         advantages - torch FloatTensor of generalized advantage estimations. Size = L
        """

        deltas = rewards + gamma * next_vals * (1 - dones) - values
        return cuda_if(discount(deltas, dones, gamma * lambda_))
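
The discount(deltas, dones, gamma * lambda_) call above takes a done mask in addition to the decay factor, so the running sum can be reset at episode boundaries. A minimal PyTorch sketch of such a masked discount (an assumption about the helper's behaviour, inferred from its signature):

import torch


def discount(deltas, dones, gamma_lambda):
    # Discounted cumulative sum that resets wherever dones[t] == 1.
    out = torch.zeros_like(deltas)
    running = 0.0
    for t in reversed(range(len(deltas))):
        running = deltas[t] + gamma_lambda * (1 - dones[t]) * running
        out[t] = running
    return out
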
Example #10
    def trajectory(batch):
        last_transition = batch[-1]
        is_not_terminal = (1 - last_transition['is_done'])

        target = sess.run(name_to_ops['value'],
                          feed_dict={
                              name_to_ops['state']:
                              last_transition['new_state'][np.newaxis]
                          })
        R = is_not_terminal * target
        batch = utils.chunk_maps(batch)

        clipped = np.clip(batch['reward'], -1, 1)
        rollout = np.append(clipped, R)

        discounted_reward = utils.discount(rollout, hyper_params['gamma'])
        batch['target'] = discounted_reward[:-1]
        return batch
Example #11
    def collect_trajs(self, video):
        """Run episodes and concatenate data."""
        size = 0
        trajs = []
        lengths = []

        for _ in range(self.episodes_per_batch):
            traj = self.do_episode(video)
            trajs.append(traj)
            length = len(traj["rewards"])
            size += length
            lengths.append(length)

        obs = np.concatenate([traj["obs"] for traj in trajs])
        rewards = np.concatenate([traj["rewards"] for traj in trajs])
        actions = np.concatenate([traj["actions"] for traj in trajs])
        returns = np.concatenate(
            [discount(traj["rewards"], self.gamma) for traj in trajs])

        return dict(obs=obs,
                    rewards=rewards,
                    actions=actions,
                    returns=returns,
                    lengths=np.array(lengths))
Example #12
            if terminal:
                # Normalise rewards
                rewards = np.array(buffer_r)
                rewards = np.clip(rewards / rolling_r.std, -10, 10)
                batch_rewards = batch_rewards + buffer_r

                v_final = [
                    v * (1 - terminal)
                ]  # v = 0 if terminal, otherwise use the predicted v
                values = np.array(buffer_v + v_final)
                terminals = np.array(buffer_terminal + [terminal])

                # Generalized Advantage Estimation - https://arxiv.org/abs/1506.02438
                delta = rewards + GAMMA * values[1:] * (
                    1 - terminals[1:]) - values[:-1]
                advantage = discount(delta, GAMMA * LAMBDA, terminals)
                returns = advantage + np.array(buffer_v)
                # Per episode normalisation of advantages
                # advantage = (advantage - advantage.mean()) / np.maximum(advantage.std(), 1e-6)

                bs, ba, br, badv = np.reshape(buffer_s, (len(buffer_s),) + ppo.s_dim), np.vstack(buffer_a), \
                                   np.vstack(returns), np.vstack(advantage)
                experience.append([bs, ba, br, badv])

                buffer_s, buffer_a, buffer_r, buffer_v, buffer_terminal = [], [], [], [], []

                # Update ppo
                if t >= BATCH:
                    # Per batch normalisation of advantages
                    advs = np.concatenate(list(zip(*experience))[3])
                    for x in experience:
Example #13
                    # Track observation for when we return to this environment in the next episode (danananana sup snoop!)
                    obs_bookmarks[i] = observation
                    prev_bookmarks[i] = prev_obs
                    break


    net.train(mode=True)
    print("T="+str(T),"– Episode", episode, "–– Avg Reward:", avg_reward, "–– Avg Action:", np.mean(actions))

    if reward_count > 100 and avg_reward > rew_cutoff:
        rew_cutoff += 0.1
        entropy_const *= .8
        max_norm *= .8
        clip_const *= .8

    advantages = discount(advantages, gamma*lambda_, mask) # Generalized Advantage Estimation
    fit_batch_size = len(advantages)//n_minibatches

    data = [actions, observs, rewards, advantages, old_pis, old_vals, mask]
    fit_obj.fit_policy(net, data, optimizer, epochs=n_epochs, clip_const=clip_const, batch_size=fit_batch_size, entropy_const=entropy_const, val_const=val_const, gamma=gamma, lambda_=lambda_)

    if episode % (ep_batch_size*5) == 0:
        torch.save(net.state_dict(), net_save_file)
        torch.save(optimizer.state_dict(), optim_save_file)

    # Check for memory leaks
    gc.collect()
    max_mem_used = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    print("Memory Used: {:.2f} MB".format(max_mem_used / 1024))

    episode_reward = 0
Example #14
    def train(self, rollout, bootstrap_value, summaries=False):
        rollout = np.array(rollout)
        observations = rollout[:, 0]
        actions = rollout[:, 1]
        rewards = rollout[:, 2]
        next_observations = rollout[:, 3]
        if FLAGS.meta:
            prev_rewards = [0] + rewards[:-1].tolist()
            prev_actions = [0] + actions[:-1].tolist()
        values = rollout[:, 5]

        # The advantage function uses "Generalized Advantage Estimation"
        rewards_plus = np.asarray(rewards.tolist() + [bootstrap_value])
        discounted_rewards = discount(rewards_plus, FLAGS.gamma)[:-1]
        value_plus = np.asarray(values.tolist() + [bootstrap_value])
        policy_target = discounted_rewards - value_plus[:-1]
        if FLAGS.gen_adv:
            td_residuals = rewards + FLAGS.gamma * value_plus[
                1:] - value_plus[:-1]
            advantages = discount(td_residuals, FLAGS.gamma)
            policy_target = advantages

        if FLAGS.lstm:
            if FLAGS.meta:
                rnn_state = self.local_AC.state_init
                feed_dict = {
                    self.local_AC.target_v: discounted_rewards,
                    self.local_AC.prev_rewards: np.vstack(prev_rewards),
                    self.local_AC.prev_actions: prev_actions,
                    self.local_AC.actions: actions,
                    self.local_AC.inputs: np.stack(observations, axis=0),
                    self.local_AC.advantages: policy_target,
                    self.local_AC.state_in[0]: rnn_state[0],
                    self.local_AC.state_in[1]: rnn_state[1]
                }
            else:
                rnn_state = self.local_AC.state_init
                feed_dict = {
                    self.local_AC.target_v: discounted_rewards,
                    self.local_AC.inputs: np.stack(observations, axis=0),
                    self.local_AC.actions: actions,
                    self.local_AC.advantages: policy_target,
                    self.local_AC.state_in[0]: rnn_state[0],
                    self.local_AC.state_in[1]: rnn_state[1]
                }
        else:
            feed_dict = {
                self.local_AC.target_v: discounted_rewards,
                self.local_AC.inputs: np.stack(observations, axis=0),
                self.local_AC.actions: actions,
                self.local_AC.advantages: policy_target
            }
        if summaries:
            l, v_l, p_l, e_l, g_n, v_n, _, ms, img_summ, max_v, min_v, mean_v, max_r, min_r, mean_r = self.sess.run(
                [
                    self.local_AC.loss, self.local_AC.value_loss,
                    self.local_AC.policy_loss, self.local_AC.entropy,
                    self.local_AC.grad_norms, self.local_AC.var_norms,
                    self.local_AC.apply_grads, self.local_AC.merged_summary,
                    self.local_AC.image_summaries, self.local_AC.max_value,
                    self.local_AC.min_value, self.local_AC.mean_value,
                    self.local_AC.max_reward, self.local_AC.min_reward,
                    self.local_AC.mean_reward
                ],
                feed_dict=feed_dict)
            n = len(rollout)
            return (l / n, v_l / n, p_l / n, e_l / n, g_n, v_n, ms, img_summ,
                    max_v, min_v, mean_v, max_r, min_r, mean_r)
        else:
            _ = self.sess.run([self.local_AC.apply_grads], feed_dict=feed_dict)
            return None
Example #15
    def train(self):

        start_time = time.time()

        self.episodes = self.env.generate_episodes(config.NUM_EPISODES, self)

        # Computing returns and estimating advantage function.
        for episode in self.episodes:
            episode["baseline"] = self.value_func.predict(episode)
            episode["returns"] = utils.discount(episode["rewards"],
                                                config.GAMMA)
            episode["advantage"] = episode["returns"] - episode["baseline"]

        # Updating policy.
        actions_dist_n = np.concatenate(
            [episode["actions_dist"] for episode in self.episodes])
        states_n = np.concatenate(
            [episode["states"] for episode in self.episodes])
        actions_n = np.concatenate(
            [episode["actions"] for episode in self.episodes])
        baseline_n = np.concatenate(
            [episode["baseline"] for episode in self.episodes])
        returns_n = np.concatenate(
            [episode["returns"] for episode in self.episodes])

        # Standardize the advantage function to have mean=0 and std=1.
        advantage_n = np.concatenate(
            [episode["advantage"] for episode in self.episodes])
        advantage_n -= advantage_n.mean()
        advantage_n /= (advantage_n.std() + 1e-8)

        # Computing baseline function for next iter.
        print(states_n.shape, actions_n.shape, advantage_n.shape,
              actions_dist_n.shape)
        feed = {
            self.policy.state: states_n,
            self.action: actions_n,
            self.advantage: advantage_n,
            self.policy.pi_theta_old: actions_dist_n
        }

        episoderewards = np.array(
            [episode["rewards"].sum() for episode in self.episodes])

        #print("\n********** Iteration %i ************" % i)

        self.value_func.fit(self.episodes)
        self.theta_old = self.current_theta()

        def fisher_vector_product(p):
            feed[self.flat_tangent] = p
            return self.session.run(self.fisher_vect_prod,
                                    feed) + config.CG_DAMP * p

        self.g = self.session.run(self.surr_loss_grad, feed_dict=feed)

        self.grad_step = utils.conjugate_gradient(fisher_vector_product,
                                                  -self.g)

        self.sAs = .5 * self.grad_step.dot(
            fisher_vector_product(self.grad_step))

        self.beta_inv = np.sqrt(self.sAs / config.MAX_KL)
        self.full_grad_step = self.grad_step / self.beta_inv

        self.negdot_grad_step = -self.g.dot(self.grad_step)

        def loss(th):
            self.set_theta(th)
            return self.session.run(self.surr_loss, feed_dict=feed)

        self.theta = utils.line_search(loss, self.theta_old,
                                       self.full_grad_step,
                                       self.negdot_grad_step / self.beta_inv)
        self.set_theta(self.theta)

        surr_loss_new = -self.session.run(self.surr_loss, feed_dict=feed)
        KL_old_new = self.session.run(self.KL, feed_dict=feed)
        entropy = self.session.run(self.entropy, feed_dict=feed)

        old_new_norm = np.sum((self.theta - self.theta_old)**2)

        if np.abs(KL_old_new) > 2.0 * config.MAX_KL:
            print("Keeping old theta")
            self.set_theta(self.theta_old)

        stats = {}
        stats["L2 of old - new"] = old_new_norm
        stats["Total number of episodes"] = len(self.episodes)
        stats["Average sum of rewards per episode"] = episoderewards.mean()
        stats["Entropy"] = entropy
        exp = utils.explained_variance(np.array(baseline_n),
                                       np.array(returns_n))
        stats["Baseline explained"] = exp
        stats["Time elapsed"] = "%.2f mins" % (
            (time.time() - start_time) / 60.0)
        stats["KL between old and new distribution"] = KL_old_new
        stats["Surrogate loss"] = surr_loss_new
        self.stats.append(stats)
        utils.write_dict(stats)
        save_path = self.saver.save(self.session, "./checkpoints/model.ckpt")
        print('Saved checkpoint to %s' % save_path)
        for k, v in stats.items():
            print(k + ": " + " " * (40 - len(k)) + str(v))
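
utils.conjugate_gradient is not shown in Example #15; it solves F x = g for the natural-gradient step using only Fisher-vector products. A generic conjugate-gradient sketch of the kind TRPO implementations typically use (not necessarily this repository's exact code):

import numpy as np


def conjugate_gradient(f_Ax, b, iters=10, tol=1e-10):
    # Solve A x = b given only the matrix-vector product f_Ax(v) = A @ v.
    b = np.asarray(b, dtype=np.float64)
    x = np.zeros_like(b)
    r = b.copy()  # residual b - A x, with x = 0 initially
    p = b.copy()
    r_dot_r = r.dot(r)
    for _ in range(iters):
        Ap = f_Ax(p)
        alpha = r_dot_r / p.dot(Ap)
        x += alpha * p
        r -= alpha * Ap
        new_r_dot_r = r.dot(r)
        if new_r_dot_r < tol:
            break
        p = r + (new_r_dot_r / r_dot_r) * p
        r_dot_r = new_r_dot_r
    return x
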
Example #16
    def update(self, paths):
        self.time_step += 1

        acts = np.concatenate([path["action"] for path in paths])
        obs_scan = np.concatenate([path["obs_scan"] for path in paths])
        obs_goal = np.concatenate([path["obs_goal"] for path in paths])
        obs_vel = np.concatenate([path["obs_vel"] for path in paths])
        obs_image = np.concatenate([path["obs_image"] for path in paths])

        baseline_value = self.baseline.predict(
            [obs_scan, obs_goal, obs_vel, obs_image])

        last_path_size = 0
        for path in paths:
            path["reward"] = np.array(path["reward"])
            path["return"] = discount(path["reward"], self.args.gamma)
            b = path["baseline"] = baseline_value[
                last_path_size:last_path_size + path["done_id"]]
            b1 = np.append(b, 0 if path["terminated"] else b[-1])
            deltas = path["reward"] + self.args.gamma * b1[1:] - b1[:-1]
            path["advantage"] = discount(deltas,
                                         self.args.gamma * self.args.lamda)
            last_path_size = path["done_id"]

        rets = np.concatenate([path["return"] for path in paths])
        advs = np.concatenate([path["advantage"] for path in paths])
        advs = (advs - advs.mean()) / (advs.std() + 1e-6)

        if self.time_step > 1:  # train the actor only after the critic has been trained
            kl = self.actor_update(obs_scan, obs_image, obs_goal, obs_vel,
                                   acts, advs)
        self.critic_update(obs_scan, obs_image, obs_goal, obs_vel, rets)

        stats = OrderedDict()

        epRewards = np.array([path["reward"].sum() for path in paths])
        epPathLengths = np.array([len(path["reward"]) for path in paths])
        stats["EpRewardsMean"] = epRewards.mean()
        stats["EpRewardsMax"] = epRewards.max()
        stats["EpRewardsMin"] = epRewards.min()
        stats["EpPathLengthsMean"] = epPathLengths.mean()
        stats["EpPathLengthsMax"] = epPathLengths.max()
        stats["EpPathLengthsMin"] = epPathLengths.min()
        stats["RewardPerStep"] = epRewards.sum() / epPathLengths.sum()

        if self.time_step > 1:
            stats["Beta"] = self.beta
            stats["ActorLearningRate"] = self.actor_lr * self.lr_multiplier
            stats["KL-Divergence"] = kl

            feed_dict = {
                self.obs_scan: obs_scan,
                self.obs_goal: obs_goal,
                self.obs_vel: obs_vel,
                self.obs_image: obs_image,
                self.obs_scan_value: obs_scan,
                self.obs_image_value: obs_image,
                self.obs_goal_value: obs_goal,
                self.obs_vel_value: obs_vel,
                self.act_ph: acts,
                self.advantages_ph: advs,
                self.beta_ph: self.beta,
                self.eta_ph: self.eta,
                self.lr_ph: self.actor_lr * self.lr_multiplier,
                self.ret_ph: rets,
                self.visual_kl: kl,
                self.visual_reward: epRewards.mean()
            }

            summary = self.session.run(self.merge_all, feed_dict)
            self.writer.add_summary(summary, self.time_step)

        if epRewards.mean() > self.best_score:
            self.actor.save_network('best')
            self.baseline.save_network('best')
            self.best_score = epRewards.mean()

        self.actor.save_network('last')
        self.baseline.save_network('last')

        return stats
Example #17
    def train(self,
              rollout,
              sess,
              bootstrap_value_w,
              bootstrap_value_m,
              summaries=False):
        rollout = np.array(rollout)
        observations = rollout[:, 0]
        actions = rollout[:, 1]
        rewards = rollout[:, 2]
        timesteps = rollout[:, 3]
        w_values = rollout[:, 5]
        m_values = rollout[:, 6]
        sum_of_prev_goals = rollout[:, 7]
        intr_rewards = rollout[:, 8]
        goals = rollout[:, 9]

        # if FLAGS.meta:
        prev_rewards = [0] + rewards[:-1].tolist()
        prev_actions = [0] + actions[:-1].tolist()
        prev_goals = [np.random.normal(size=(FLAGS.hidden_dim, ))
                      ] + goals[:-1].tolist()

        # The advantage function uses "Generalized Advantage Estimation"
        rewards_plus_w = np.asarray(rewards.tolist() + [bootstrap_value_w])
        rewards_plus_m = np.asarray(rewards.tolist() + [bootstrap_value_m])
        intr_rewards_plus = np.asarray(intr_rewards.tolist() +
                                       [bootstrap_value_w])
        w_discounted_rewards = discount(rewards_plus_w, FLAGS.w_gamma)[:-1]
        m_discounted_rewards = discount(rewards_plus_m, FLAGS.m_gamma)[:-1]
        w_discounted_intr_rewards = discount(intr_rewards_plus,
                                             FLAGS.w_gamma)[:-1]
        # w_value_plus = np.asarray(w_values.tolist() + [bootstrap_value])
        # m_value_plus = np.asarray(m_values.tolist() + [bootstrap_value])

        w_rnn_state = self.local_AC.w_state_init
        m_rnn_state = self.local_AC.m_state_init
        feed_dict = {
            self.local_AC.w_extrinsic_return: w_discounted_rewards,
            self.local_AC.m_extrinsic_return: m_discounted_rewards,
            self.local_AC.inputs: np.stack(observations, axis=0),
            self.local_AC.prev_rewards: prev_rewards,
            self.local_AC.prev_actions: prev_actions,
            self.local_AC.prev_goal: prev_goals,
            self.local_AC.sum_prev_goals: np.stack(sum_of_prev_goals, axis=0),
            self.local_AC.w_intrinsic_return: w_discounted_intr_rewards,
            self.local_AC.actions: actions,
            self.local_AC.w_state_in[0]: w_rnn_state[0],
            self.local_AC.w_state_in[1]: w_rnn_state[1],
            self.local_AC.m_state_in[0]: m_rnn_state[0],
            self.local_AC.m_state_in[1]: m_rnn_state[1]
        }

        if summaries:
            l, w_v_l, m_v_l, p_l, g_l, e_l, g_n, v_n, _, ms, img_summ, cos_sim_state_diff = sess.run(
                [
                    self.local_AC.loss,
                    self.local_AC.w_value_loss,
                    self.local_AC.m_value_loss,
                    self.local_AC.w_policy_loss,
                    self.local_AC.goals_loss,
                    self.local_AC.entropy,
                    self.local_AC.grad_norms,
                    self.local_AC.var_norms,
                    self.local_AC.apply_grads,
                    self.local_AC.merged_summary,
                    self.local_AC.image_summaries,
                    self.local_AC.cos_sim_state_diff,
                ],
                feed_dict=feed_dict)
            n = len(rollout)
            return (l / n, w_v_l / n, m_v_l / n, p_l / n, g_l / n, e_l / n,
                    g_n, v_n, ms, img_summ, m_discounted_rewards,
                    w_discounted_rewards, w_discounted_intr_rewards,
                    cos_sim_state_diff)
        else:
            _ = sess.run([self.local_AC.apply_grads], feed_dict=feed_dict)
            return None
Example #18
    def learn(self, paths):
        # is it possible to replace A(s,a) with Q(s,a)?
        for path in paths:
            path["baseline"] = self.vf.predict(path)
            path["returns"] = utils.discount(path["rewards"], self.args.gamma)
            path["advantage"] = path["returns"] - path["baseline"]
            # path["advantage"] = path["returns"]

        # puts all the experiences in a matrix: total_timesteps x options
        action_dist_mu = np.concatenate(
            [path["action_dists_mu"] for path in paths])
        action_dist_logstd = np.concatenate(
            [path["action_dists_logstd"] for path in paths])
        obs_n = np.concatenate([path["obs"] for path in paths])
        action_n = np.concatenate([path["actions"] for path in paths])

        # standardize to mean 0 stddev 1
        advant_n = np.concatenate([path["advantage"] for path in paths])
        advant_n -= advant_n.mean()
        advant_n /= (advant_n.std() + 1e-8)

        # train value function / baseline on rollout paths
        self.vf.fit(paths)

        feed_dict = {
            self.obs: obs_n,
            self.action: action_n,
            self.advantage: advant_n,
            self.oldaction_dist_mu: action_dist_mu,
            self.oldaction_dist_logstd: action_dist_logstd
        }

        # parameters
        thprev = self.gf()

        # computes fisher vector product: F * [self.pg]
        def fisher_vector_product(p):
            feed_dict[self.flat_tangent] = p
            return self.session.run(self.fvp,
                                    feed_dict) + p * self.args.cg_damping

        g = self.session.run(self.pg, feed_dict)

        # solve Ax = g, where A is the Fisher information matrix and g is the policy gradient
        # stepdir = A_inverse * g = x
        stepdir = utils.conjugate_gradient(fisher_vector_product, -g)

        # let stepdir =  change in theta / direction that theta changes in
        # KL divergence approximated by 0.5 x stepdir_transpose * [Fisher Information Matrix] * stepdir
        # where the [Fisher Information Matrix] acts like a metric
        # ([Fisher Information Matrix] * stepdir) is computed using the function,
        # and then stepdir * [above] is computed manually.
        shs = 0.5 * stepdir.dot(fisher_vector_product(stepdir))

        lm = np.sqrt(shs / self.args.max_kl)
        # if self.args.max_kl > 0.001:
        #     self.args.max_kl *= self.args.kl_anneal

        fullstep = stepdir / lm
        negative_g_dot_steppdir = -g.dot(stepdir)

        def loss(th):
            self.sff(th)
            # surrogate loss: policy gradient loss
            return self.session.run(self.losses[0], feed_dict)

        # finds best parameter by starting with a big step and working backwards
        theta = utils.linesearch(loss, thprev, fullstep,
                                 negative_g_dot_steppdir / lm)
        # i guess we just take a fullstep no matter what
        theta = thprev + fullstep
        self.sff(theta)

        surrogate_after, kl_after, entropy_after = self.session.run(
            self.losses, feed_dict)

        episoderewards = np.array([path["rewards"].sum() for path in paths])
        stats = {}
        stats["Average sum of rewards per episode"] = episoderewards.mean()
        stats["Entropy"] = entropy_after
        stats["max KL"] = self.args.max_kl
        stats["Timesteps"] = sum([len(path["rewards"]) for path in paths])
        # stats["Time elapsed"] = "%.2f mins" % ((time.time() - start_time) / 60.0)
        stats["KL between old and new distribution"] = kl_after
        stats["Surrogate loss"] = surrogate_after
        # print(("\n********** Iteration {} ************".format(i)))
        for k, v in stats.items():
            print(k + ": " + " " * (40 - len(k)) + str(v))

        return stats["Average sum of rewards per episode"]
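
utils.linesearch in Example #18 (whose result is then overridden by the unconditional full step two lines later) is usually a backtracking search that shrinks the step until the surrogate loss improves by a reasonable fraction of the linear prediction. A sketch of that common TRPO-style helper, stated as an assumption about its behaviour:

import numpy as np


def linesearch(f, x, fullstep, expected_improve_rate,
               max_backtracks=10, accept_ratio=0.1):
    # Backtrack over step fractions 1, 1/2, 1/4, ... and accept the first one
    # whose actual improvement is a large enough share of the expected one.
    fval = f(x)
    for stepfrac in 0.5 ** np.arange(max_backtracks):
        xnew = x + stepfrac * fullstep
        actual_improve = fval - f(xnew)
        expected_improve = expected_improve_rate * stepfrac
        if expected_improve > 0 and actual_improve / expected_improve > accept_ratio:
            return xnew
    return x  # no acceptable step found; keep the old parameters
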
Example #19
    def _loop_listening(self):
        """
        Ensures that the program will continue listening until closure
        """
        episode = 1
        collection_length = self.batch_size * self.sequence_length

        while (self.listening):
            # Record the keys and game frames while recording is enabled
            while (self.playing):
                while (episode <= self.episodes and self.playing):
                    state = self.sr_game.reset()
                    terminal = False

                    while (not terminal and self.playing):
                        states = []
                        actions = []
                        rewards = []
                        values = []

                        while (len(states) < collection_length and self.playing
                               and not terminal):
                            start = time()

                            state = self.sr_game.state

                            tens_state = torch.FloatTensor([state]).to(
                                self.model.device)
                            tens_state = (tens_state / 255.0).permute(
                                0, 3, 1, 2)

                            action, policy, value, rnd = self.model.step(
                                tens_state)

                            next_state, reward, terminal = self.sr_game.step(
                                action)

                            reward = reward + rnd

                            states.append(state)
                            actions.append(action)
                            rewards.append(reward)
                            values.append(value)

                            #print("Loop time:", time() - start)

                        if (len(states) == collection_length):
                            states = (np.stack(states) / 255.0).astype(
                                np.float32)
                            actions = np.array(actions, dtype=np.float32)
                            rewards = np.array(rewards, dtype=np.float32)
                            values = np.array(values, dtype=np.float32)

                            returns = discount(rewards, decay)
                            advantages = returns - values
                            advantages = normalize(advantages,
                                                   1e-5).astype(np.float32)

                            loss = self.model.train_reinforce(
                                [states, actions, rewards, advantages])
                            print("Loss:", loss)
                            """ Just training RND for now
                            supervised = self.data_handler.sequenced_sample(
                                                               self.batch_size,
                                                               self.sequence_length,
                                                               str(self.model.device) == "cuda"
                                                               )
                            supervised = [tens.view(-1, *tens.shape[2:])
                                          for tens in supervised]
                            self.model.train_supervised(*supervised)
                            """
                if (episode % self.save_interval == 0):
                    self.model.save(self.save_path)

                if (episode == self.episodes):
                    self.stop()
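
The normalize(advantages, 1e-5) call in Example #19 is not defined in the snippet; per-batch advantage normalisation is usually plain mean/std standardisation. A minimal sketch under that assumption:

import numpy as np


def normalize(x, eps=1e-5):
    # Standardise to zero mean and unit variance.
    return (x - x.mean()) / (x.std() + eps)
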
Example #20
    def train(self, rollout, sess, bootstrap_value_w, bootstrap_value_m, summaries=False):
        rollout = np.array(rollout)
        observations = rollout[:, 0]
        actions = rollout[:, 1]
        rewards = rollout[:, 2]
        timesteps = rollout[:, 3]
        w_values = rollout[:, 5]
        m_values = rollout[:, 6]
        sum_of_prev_goals = rollout[:, 7]
        intr_rewards = rollout[:, 8]
        goals = rollout[:, 9]

        # if FLAGS.meta:
        prev_rewards = [0] + rewards[:-1].tolist()
        prev_actions = [0] + actions[:-1].tolist()
        prev_goals = [np.random.normal(size=(FLAGS.hidden_dim,))] + goals[:-1].tolist()

        # The advantage function uses "Generalized Advantage Estimation"
        rewards_plus_w = np.asarray(rewards.tolist() + [bootstrap_value_w])
        rewards_plus_m = np.asarray(rewards.tolist() + [bootstrap_value_m])
        intr_rewards_plus = np.asarray(intr_rewards.tolist() + [bootstrap_value_w])
        w_discounted_rewards = discount(rewards_plus_w, FLAGS.w_gamma)[:-1]
        m_discounted_rewards = discount(rewards_plus_m, FLAGS.m_gamma)[:-1]
        w_discounted_intr_rewards = discount(intr_rewards_plus, FLAGS.w_gamma)[:-1]
        # w_value_plus = np.asarray(w_values.tolist() + [bootstrap_value])
        # m_value_plus = np.asarray(m_values.tolist() + [bootstrap_value])

        w_rnn_state = self.local_AC.w_state_init
        m_rnn_state = self.local_AC.m_state_init
        feed_dict = {self.local_AC.w_extrinsic_return: w_discounted_rewards,
                     self.local_AC.m_extrinsic_return: m_discounted_rewards,
                     self.local_AC.inputs: np.stack(observations, axis=0),
                     self.local_AC.prev_rewards: prev_rewards,
                     self.local_AC.prev_actions: prev_actions,
                     self.local_AC.prev_goal: prev_goals,
                     self.local_AC.sum_prev_goals: np.stack(sum_of_prev_goals, axis=0),
                     self.local_AC.w_intrinsic_return: w_discounted_intr_rewards,
                     self.local_AC.actions: actions,
                     self.local_AC.w_state_in[0]: w_rnn_state[0],
                     self.local_AC.w_state_in[1]: w_rnn_state[1],
                     self.local_AC.m_state_in[0]: m_rnn_state[0],
                     self.local_AC.m_state_in[1]: m_rnn_state[1]
                     }

        if summaries:
            l, w_v_l, m_v_l, p_l, g_l, e_l, g_n, v_n, _, ms, img_summ, cos_sim_state_diff = sess.run(
                [self.local_AC.loss,
                 self.local_AC.w_value_loss,
                 self.local_AC.m_value_loss,
                 self.local_AC.w_policy_loss,
                 self.local_AC.goals_loss,
                 self.local_AC.entropy,
                 self.local_AC.grad_norms,
                 self.local_AC.var_norms,
                 self.local_AC.apply_grads,
                 self.local_AC.merged_summary,
                 self.local_AC.image_summaries,
                 self.local_AC.cos_sim_state_diff,
                 ],
                feed_dict=feed_dict)
            n = len(rollout)
            return (l / n, w_v_l / n, m_v_l / n, p_l / n, g_l / n, e_l / n,
                    g_n, v_n, ms, img_summ, m_discounted_rewards,
                    w_discounted_rewards, w_discounted_intr_rewards,
                    cos_sim_state_diff)
        else:
            _ = sess.run([self.local_AC.apply_grads], feed_dict=feed_dict)
            return None
Example #21
def run_vanilla_policy_gradient_experiment(args, vf_params, logdir, env, sess, continuous_control):
    """
    General purpose method to run vanilla policy gradients.
    Works for both continuous and discrete environments.

    Roughly inspired by starter code for this homework and
    https://github.com/DanielTakeshi/rl_algorithms/blob/master/vpg/main.py

    Thanks!

    Params
    ------
    args: arguments for vanilla policy gradient.
    vf_params: dict of params for value function
    logdir: where to store outputs or None if you don't want to store anything
    env: openai gym env
    sess: TF session
    continuous_control: boolean, if true then we do gaussian continuous control
    """

    ob_dim = env.observation_space.shape[0]

    if args.vf_type == 'linear':
        value_function = LinearValueFunction(**vf_params)
    elif args.vf_type == 'nn':
        value_function = NnValueFunction(session=sess, ob_dim=ob_dim)
    # value_function = LinearValueFunction()

    if continuous_control:
        ac_dim = env.action_space.shape[0]
        policy_fn = policies.GaussianPolicy(sess, ob_dim, ac_dim)
    else:
        ac_dim = env.action_space.n
        policy_fn = policies.DisceretePolicy(sess, ob_dim, ac_dim)


    sess.__enter__()  # equivalent to with sess, to reduce indentation
    tf.global_variables_initializer().run()
    total_timesteps = 0
    stepsize = args.initial_stepsize

    filterAction = 0.1
    stepMax = 100
    for i in range(args.n_iter):
        print("\n********** Iteration %i ************" % i)

        # Collect paths until we have enough timesteps.
        timesteps_this_batch = 0
        paths = []
        step = 0

        #if(filterAction > 1.0):
        #    filterAction = 1.0
        #else:
        #    filterAction = filterAction*1.1
        while True:
            ob = env.reset()
            terminated = False
            obs, acs, rewards = [], [], []
            animate_this_episode = (len(paths) == 0 and (i % 10 == 0)
                                    and args.render)
            while True:
                if animate_this_episode:
                    env.render()
                obs.append(ob)
                ac = policy_fn.sample_action(ob)
                acs.append(ac)
                ob, rew, done, _ = env.step(ac)
                rewards.append(rew)
                step = step + 1
                if done:
                    step = 0
                    #print "done "
                    break
                #if done or step > stepMax:
                #    print "max steps: {}".format(stepMax)
                #    step = 0
                #    stepMax = stepMax + 2
                #    break
            path = {"observation": np.array(obs), "terminated": terminated,
                    "reward": np.array(rewards), "action": np.array(acs)}
            paths.append(path)
            timesteps_this_batch += utils.pathlength(path)
            if timesteps_this_batch > args.min_timesteps_per_batch:
                break
        total_timesteps += timesteps_this_batch

        # Estimate advantage function using baseline vf (these are lists!).
        # return_t: list of sum of discounted rewards (to end of
        # episode), one per time
        # vpred_t: list of value function's predictions of components of
        # return_t
        vtargs, vpreds, advs = [], [], []
        for path in paths:
            rew_t = path["reward"]
            return_t = utils.discount(rew_t, args.gamma)
            vpred_t = value_function.predict(path["observation"])
            adv_t = return_t - vpred_t
            advs.append(adv_t)
            vtargs.append(return_t)
            vpreds.append(vpred_t)

        # Build arrays for policy update and **re-fit the baseline**.
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_n = np.concatenate([path["action"] for path in paths])
        adv_n = np.concatenate(advs)
        std_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8)
        vtarg_n = np.concatenate(vtargs)
        vpred_n = np.concatenate(vpreds)
        value_function.fit(ob_no, vtarg_n)

        # Policy update, plus diagnostics stuff. Is there a better way to
        #  handle
        # the continuous vs discrete control cases?
        if continuous_control:
            surr_loss, oldmean_na, oldlogstd_a = policy_fn.update_policy(
                ob_no, ac_n, std_adv_n, stepsize)

            kl, ent = policy_fn.kldiv_and_entropy(
                ob_no, oldmean_na, oldlogstd_a
            )
        else:
            surr_loss, oldlogits_na = policy_fn.update_policy(
                ob_no, ac_n, std_adv_n, stepsize)
            kl, ent = policy_fn.kldiv_and_entropy(ob_no, oldlogits_na)

        # Step size heuristic to ensure that we don't take too large steps.
        if args.use_kl_heuristic:
            if kl > args.desired_kl * 2:
                stepsize /= 1.5
                print('PG stepsize -> %s' % stepsize)
            elif kl < args.desired_kl / 2:
                stepsize *= 1.5
                print('PG stepsize -> %s' % stepsize)
            else:
                print('PG stepsize OK')

        # Log diagnostics
        if i % args.log_every_t_iter == 0:
            logz.log_tabular("EpRewMean", np.mean(
                [path["reward"].sum() for path in paths]))
            logz.log_tabular("EpLenMean", np.mean(
                [utils.pathlength(path) for path in paths]))
            logz.log_tabular("KLOldNew", kl)
            logz.log_tabular("Entropy", ent)
            logz.log_tabular("EVBefore",
                             utils.explained_variance_1d(vpred_n, vtarg_n))
            logz.log_tabular("EVAfter",
                             utils.explained_variance_1d(value_function.predict(ob_no),
                                                         vtarg_n))
            logz.log_tabular("SurrogateLoss", surr_loss)
            logz.log_tabular("TimestepsSoFar", total_timesteps)
            # If you're overfitting, EVAfter will be way larger than
            # EVBefore.
            # Note that we fit the value function AFTER using it to
            # compute the
            # advantage function to avoid introducing bias
            logz.dump_tabular()
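
utils.explained_variance_1d, used for the EVBefore/EVAfter diagnostics above, measures how much of the variance in the empirical returns the value function captures (1 is a perfect fit, 0 is no better than a constant). A common implementation sketch, offered as an assumption rather than this repository's exact code:

import numpy as np


def explained_variance_1d(ypred, y):
    # 1 - Var(y - ypred) / Var(y): the fraction of return variance explained.
    assert y.ndim == 1 and ypred.ndim == 1
    var_y = np.var(y)
    return np.nan if var_y == 0 else 1 - np.var(y - ypred) / var_y
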
Example #22
    def train(self, rollout, bootstrap_value, summaries=False):
        rollout = np.array(rollout)
        observations = rollout[:, 0]
        actions = rollout[:, 1]
        rewards = rollout[:, 2]
        next_observations = rollout[:, 3]
        if FLAGS.meta:
            prev_rewards = [0] + rewards[:-1].tolist()
            prev_actions = [0] + actions[:-1].tolist()
        values = rollout[:, 5]

        # The advantage function uses "Generalized Advantage Estimation"
        rewards_plus = np.asarray(rewards.tolist() + [bootstrap_value])
        discounted_rewards = discount(rewards_plus, FLAGS.gamma)[:-1]
        value_plus = np.asarray(values.tolist() + [bootstrap_value])
        policy_target = discounted_rewards - value_plus[:-1]
        if FLAGS.gen_adv:
            td_residuals = rewards + FLAGS.gamma * value_plus[1:] - value_plus[:-1]
            advantages = discount(td_residuals, FLAGS.gamma)
            policy_target = advantages

        if FLAGS.lstm:
            if FLAGS.meta:
                rnn_state = self.local_AC.state_init
                feed_dict = {self.local_AC.target_v: discounted_rewards,
                             self.local_AC.prev_rewards: np.vstack(prev_rewards),
                             self.local_AC.prev_actions: prev_actions,
                             self.local_AC.actions: actions,
                             self.local_AC.inputs: np.stack(observations, axis=0),
                             self.local_AC.advantages: policy_target,
                             self.local_AC.state_in[0]: rnn_state[0],
                             self.local_AC.state_in[1]: rnn_state[1]}
            else:
                rnn_state = self.local_AC.state_init
                feed_dict = {self.local_AC.target_v: discounted_rewards,
                             self.local_AC.inputs: np.stack(observations, axis=0),
                             self.local_AC.actions: actions,
                             self.local_AC.advantages: policy_target,
                             self.local_AC.state_in[0]: rnn_state[0],
                             self.local_AC.state_in[1]: rnn_state[1]}
        else:
            feed_dict = {self.local_AC.target_v: discounted_rewards,
                         self.local_AC.inputs: np.stack(observations, axis=0),
                         self.local_AC.actions: actions,
                         self.local_AC.advantages: policy_target}
        if summaries:
            l, v_l, p_l, e_l, g_n, v_n, _, ms, img_summ, max_v, min_v, mean_v, max_r, min_r, mean_r = self.sess.run(
                [self.local_AC.loss,
                 self.local_AC.value_loss,
                 self.local_AC.policy_loss,
                 self.local_AC.entropy,
                 self.local_AC.grad_norms,
                 self.local_AC.var_norms,
                 self.local_AC.apply_grads,
                 self.local_AC.merged_summary,
                 self.local_AC.image_summaries,
                 self.local_AC.max_value,
                 self.local_AC.min_value,
                 self.local_AC.mean_value,
                 self.local_AC.max_reward,
                 self.local_AC.min_reward,
                 self.local_AC.mean_reward],
                feed_dict=feed_dict)
            n = len(rollout)
            return (l / n, v_l / n, p_l / n, e_l / n, g_n, v_n, ms, img_summ,
                    max_v, min_v, mean_v, max_r, min_r, mean_r)
        else:
            _ = self.sess.run([self.local_AC.apply_grads], feed_dict=feed_dict)
            return None
Example #23
    def work(self):
        hooks = [self.ppo.sync_replicas_hook]
        sess = tf.train.MonitoredTrainingSession(master=self.server.target,
                                                 is_chief=(self.wid == 0),
                                                 checkpoint_dir=SUMMARY_DIR,
                                                 save_summaries_steps=None,
                                                 save_summaries_secs=None,
                                                 hooks=hooks)
        if self.wid == 0:
            writer = SummaryWriterCache.get(SUMMARY_DIR)

        t, episode, terminal = 0, 0, False
        buffer_s, buffer_a, buffer_r, buffer_v, buffer_terminal = [], [], [], [], []
        rolling_r = RunningStats()

        while not sess.should_stop() and not (episode > EP_MAX
                                              and self.wid == 0):

            s = self.env.reset()
            ep_r, ep_t, ep_a = 0, 0, []

            while True:
                a, v = self.ppo.evaluate_state(s, sess)

                # Update ppo
                if t == BATCH:  # or (terminal and t < BATCH):
                    # Normalise rewards
                    rewards = np.array(buffer_r)
                    rolling_r.update(rewards)
                    rewards = np.clip(rewards / rolling_r.std, -10, 10)

                    # Bootstrap value: 0 if the episode ended, otherwise the predicted V(s)
                    v_final = [v * (1 - terminal)]
                    values = np.array(buffer_v + v_final)
                    terminals = np.array(buffer_terminal + [terminal])

                    # Generalized Advantage Estimation - https://arxiv.org/abs/1506.02438
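                    # delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_{t+1}) - V(s_t)
                    # A_t = sum_k (gamma * lambda)^k * delta_{t+k}, with the sum cut at terminals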
                    delta = rewards + GAMMA * values[1:] * (
                        1 - terminals[1:]) - values[:-1]
                    advantage = discount(delta, GAMMA * LAMBDA, terminals)
                    returns = advantage + np.array(buffer_v)
                    advantage = (advantage - advantage.mean()) / np.maximum(
                        advantage.std(), 1e-6)

                    bs = np.reshape(buffer_s, (t,) + self.ppo.s_dim)
                    ba = np.vstack(buffer_a)
                    br = np.vstack(returns)
                    badv = np.vstack(advantage)

                    graph_summary = self.ppo.update(bs, ba, br, badv, sess)
                    buffer_s, buffer_a, buffer_r, buffer_v, buffer_terminal = [], [], [], [], []
                    t = 0

                buffer_s.append(s)
                buffer_a.append(a)
                buffer_v.append(v)
                buffer_terminal.append(terminal)
                ep_a.append(a)

                if not self.ppo.discrete:
                    a = np.clip(a, self.env.action_space.low,
                                self.env.action_space.high)
                s, r, terminal, _ = self.env.step(a)
                buffer_r.append(r)

                ep_r += r
                ep_t += 1
                t += 1

                if terminal:
                    # End of episode summary
                    print('Worker_%i' % self.wid, '| Episode: %i' % episode,
                          "| Reward: %.2f" % ep_r, '| Steps: %i' % ep_t)

                    if self.wid == 0:
                        worker_summary = tf.Summary()
                        worker_summary.value.add(tag="Reward",
                                                 simple_value=ep_r)

                        # Create Action histograms for each dimension
                        actions = np.array(ep_a)
                        if self.ppo.discrete:
                            add_histogram(writer,
                                          "Action",
                                          actions,
                                          episode,
                                          bins=self.ppo.a_dim)
                        else:
                            for dim in range(self.ppo.a_dim):
                                add_histogram(writer, "Action/Dim" + str(dim),
                                              actions[:, dim], episode)

                        try:
                            writer.add_summary(graph_summary, episode)
                        except NameError:
                            pass
                        writer.add_summary(worker_summary, episode)
                        writer.flush()

                    episode += 1
                    break

        self.env.close()
        print("Worker_%i finished" % self.wid)
Example #24
    def update_model(self, shared_data):
        """
        This function accepts the data collected from a rollout and performs
        actor-critic (A2C) update iterations on the neural net.

        shared_data - dict of torch tensors with shared memory to collect data. Each 
                tensor contains indices from idx*n_tsteps to (idx+1)*n_tsteps
                Keys (assume string keys):
                    "states" - MDP states at each timestep t
                            type: FloatTensor
                            shape: (n_states, *state_shape)
                    "deltas" - gae deltas collected at timestep t+1
                            type: FloatTensor
                            shape: (n_states,)
                    "h_states" - Recurrent states at timestep t+1
                            type: FloatTensor
                            shape: (n_states, h_size)
                    "rewards" - Collects float rewards collected at each timestep t
                            type: FloatTensor
                            shape: (n_states,)
                    "dones" - Collects the dones collected at each timestep t
                            type: FloatTensor
                            shape: (n_states,)
                    "actions" - Collects actions performed at each timestep t
                            type: LongTensor
                            shape: (n_states,)
        """
        hyps = self.hyps
        net = self.net
        net.req_grads(True)

        states = shared_data['states']
        rewards = shared_data['rewards']
        dones = shared_data['dones']
        actions = shared_data['actions']
        deltas = shared_data['deltas']
        advs = cuda_if(
            discount(deltas.squeeze(), dones.squeeze(),
                     hyps['gamma'] * hyps['lambda_']))

        # Forward Pass
        if 'h_states' in shared_data:
            h_states = Variable(cuda_if(shared_data['h_states']))
            if hyps['use_bptt']:
                vals, logits = self.bptt(states, h_states, dones)
            else:
                vals, logits, _ = net(Variable(cuda_if(states)), h_states)
        else:
            vals, logits = net(Variable(cuda_if(states)))

        # Log Probabilities
        log_softs = F.log_softmax(logits, dim=-1)
        logprobs = log_softs[torch.arange(len(actions)).long(), actions]

        # Returns
        if hyps['use_nstep_rets']:
            returns = advs + vals.data.squeeze()
        else:
            returns = cuda_if(
                discount(rewards.squeeze(), dones.squeeze(), hyps['gamma']))

        # Advantages
        if hyps['norm_advs']:
            advs = (advs - advs.mean()) / (advs.std() + 1e-6)

        # A2C Losses
        pi_loss = -(logprobs.squeeze() * Variable(advs.squeeze())).mean()
        val_loss = hyps['val_coef'] * F.mse_loss(vals.squeeze(), returns)
        entr_loss = -hyps['entr_coef'] * (
            (log_softs * F.softmax(logits, dim=-1)).sum(-1)).mean()
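        # (log_softs * softmax).sum(-1) is the negative entropy, so entr_loss is
        # +entr_coef * H(pi); subtracting it in the total loss rewards exploration.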

        loss = pi_loss + val_loss - entr_loss
        loss.backward()
        self.norm = nn.utils.clip_grad_norm_(net.parameters(),
                                             hyps['max_norm'])
        self.optim.step()
        self.optim.zero_grad()

        self.info = {
            "Loss": loss.item(),
            "Pi_Loss": pi_loss.item(),
            "ValLoss": val_loss.item(),
            "Entropy": entr_loss.item(),
            "GradNorm": self.norm.item()
        }
        return self.info
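
Unlike the two-argument `discount` sketched earlier, this example calls `discount(deltas, dones, gamma * lambda_)`, passing the done mask so the running sum resets at episode boundaries. A hedged PyTorch sketch under that assumed signature (the real helper may differ in exactly where it applies the mask):

import torch

def discount(x, dones, gamma):
    """Backward discounted sum that is zeroed wherever dones[t] == 1."""
    out = torch.zeros_like(x)
    running = 0.0
    for t in reversed(range(x.shape[0])):
        running = x[t] + gamma * running * (1.0 - dones[t])
        out[t] = running
    return out
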
Example #25
    if not done:
        rewards.append(model.value(obs).item())
    else:
        rewards.append(0.0)

    # RESHAPING
    states = torch.stack(states)
    actions = torch.stack(actions)
    logprobs = torch.stack(logprobs).detach()
    rewards = torch.tensor(rewards, dtype=torch.float, device=opts["device"])
    with torch.no_grad():
        values = model.value(states).reshape(-1)
        if done:
            values[-1] = 0.0
        advantages = rewards[:-1] + opts["gamma"] * values[1:] - values[:-1]
    discounted_adv = utils.discount(advantages, opts["gamma"] * opts["lambda"])
    cumrew = rewards[:-1].sum().item()
    rewards = utils.discount(rewards, gamma=opts["gamma"])[:-1]
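    # `advantages` holds one-step TD residuals; discounting them with
    # gamma * lambda yields the GAE estimates. The rewards are turned into
    # discounted returns (the appended bootstrap entry is dropped) before
    # being recorded to memory below.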
    temp_history.append(cumrew)

    print("Episode: %d, reward: %.3f, std: %.3f, %.3f" %
          (it, cumrew, *torch.exp(model.log_std).detach()))

    # ADD TO MEMORY
    for i in range(states.shape[0] - 1):
        model.record(states[i], actions[i], logprobs[i], rewards[i],
                     discounted_adv[i])

    # UPDATE
    if model.memory.size >= opts["update_iter"]:
        loss = model.update()