Example #1
import gym
import imageio
import tensorflow as tf

# environments, get_policy and PPOAgent come from the surrounding project.


def start(env):
    # env is a Gym environment id string, e.g. "CartPole-v1".
    env = gym.make(env)
    frames = []
    MASTER_NAME = "master-0"
    IMAGE_PATH = "images/%s.gif" % env.spec.id
    tf.reset_default_graph()

    with tf.Session() as session:
        with tf.variable_scope(MASTER_NAME) as scope:
            env_opts = environments.get_env_options(env, False)
            policy = get_policy(env_opts, session)
            master_agent = PPOAgent(policy, session, MASTER_NAME, env_opts)

        saver = tf.train.Saver(max_to_keep=1)
        try:
            # Restore the most recent checkpoint; if none exists yet, fall
            # back to a freshly initialized model.
            saver = tf.train.import_meta_graph(
                tf.train.latest_checkpoint("models/%s/" % env.spec.id) + ".meta")
            saver.restore(session,
                          tf.train.latest_checkpoint("models/%s/" % env.spec.id))
        except Exception:
            print("Failed to restore model, starting from scratch")
            session.run(tf.global_variables_initializer())

        # Run the restored policy for up to 1000 environment steps,
        # recording a frame at every step.
        global_step = 0
        while global_step < 1000:
            terminal = False
            s0 = env.reset()
            cum_rew = 0
            cur_hidden_state = master_agent.get_init_hidden_state()
            episode_count = 0
            while not terminal:
                episode_count += 1
                frames.append(env.render(mode='rgb_array'))
                action, h_out = master_agent.get_strict_sample(
                    s0, cur_hidden_state)
                cur_hidden_state = h_out
                s0, r, terminal, _ = env.step(action)
                cum_rew += r
                global_step += 1
            print(episode_count, cum_rew)
        # Save the recorded frames as a ~60 fps GIF.
        imageio.mimsave(IMAGE_PATH, frames, duration=1.0 / 60.0)
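A minimal way to run this example might look like the sketch below; it assumes a checkpoint has already been trained into models/<env id>/, and the environment id used here is only illustrative.

import os

if __name__ == "__main__":
    os.makedirs("images", exist_ok=True)  # mimsave needs the images/ directory to exist
    start("CartPole-v1")                  # any Gym id with a checkpoint under models/CartPole-v1/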
Example #2
import gym
import tensorflow as tf

# environments, get_policy and PPOAgent come from the surrounding project.


def start(env):
    # env is a Gym environment id string, e.g. "CartPole-v1".
    env = gym.make(env)

    MASTER_NAME = "master-0"

    tf.reset_default_graph()

    with tf.Session() as session:
        with tf.variable_scope(MASTER_NAME) as scope:
            env_opts = environments.get_env_options(env, False)
            policy = get_policy(env_opts, session)
            master_agent = PPOAgent(policy, session, MASTER_NAME, env_opts)

        saver = tf.train.Saver(max_to_keep=1)
        try:
            # Restore the most recent checkpoint; if none exists yet, fall back to a freshly initialized model.
            saver = tf.train.import_meta_graph(tf.train.latest_checkpoint("models/%s/" % env.spec.id) + ".meta")
            saver.restore(session, tf.train.latest_checkpoint("models/%s/" % env.spec.id))
        except Exception:
            print("Failed to restore model, starting from scratch")
            session.run(tf.global_variables_initializer())


        # Render episodes indefinitely until the process is interrupted.
        while True:
            terminal = False
            s0 = env.reset()
            cum_rew = 0
            cur_hidden_state = master_agent.get_init_hidden_state()
            episode_count = 0
            while not terminal:
                episode_count += 1
                env.render()
                action, h_out = master_agent.get_strict_sample(s0, cur_hidden_state)
                cur_hidden_state = h_out
                s0, r, terminal, _ = env.step(action)
                cum_rew += r
            print(episode_count, cum_rew)
Example #3
import numpy as np

# utils, get_policy and PPOAgent come from the surrounding project; TensorFlow
# is imported lazily inside init().


class GatheringWorker:
    def __init__(self, idx, env_producer, env_opts, rollout_size, worker_queue,
                 weights_queue):
        self.session = None
        self.idx = idx
        self.env_producer = env_producer
        self.env = None
        self.s0 = None
        self.trainable_vars = None
        self.agent = None
        self.env_opts = env_opts
        self.cur_hidden_state = None
        self.episode = None
        self.episodes = []
        self.batch_size = env_opts["batch_size"]
        self.terminal = False
        self.recurrent_policy = env_opts["recurrent"]
        self.timestep_size = env_opts["timestep_size"]
        if not self.recurrent_policy:
            self.timestep_size = 1
        self.discount_factor = env_opts["discount_factor"]
        self.gae_factor = env_opts["gae_factor"]
        self.max_episode_steps = env_opts["max_episode_steps"]
        self.rollout_size = rollout_size
        self.discrete_env = env_opts["discrete"]
        self.ep_count = 0
        self.episode_step = 0
        self.cum_rew = 0
        self.global_step = 0
        self.sampled_action = None
        self.sampled_a_prob = None
        self.accum_vars = None
        self.assign_op = None
        self.worker_queue = worker_queue
        self.weights_queue = weights_queue
        self.stats = []
        # Start gathering immediately: get_experience() runs the worker loop
        # and never returns, streaming processed rollouts through worker_queue.
        self.get_experience()

    def get_experience(self):
        self.init()
        # Prime the action diagnostics with one initial sample.
        action, a_prob, h_out, v_out = self.agent.get_sample(
            self.s0, self.cur_hidden_state)
        self.sampled_action = action
        self.sampled_a_prob = a_prob
        while True:
            self.stats = []
            self.apply_weights()
            self.episodes = []
            for i in range(self.rollout_size):
                if self.terminal:
                    # An episode cut short by the step limit is not a true
                    # terminal, so un-mark the last terminal flag and allow
                    # value bootstrapping for it.
                    if self.episode_step == self.max_episode_steps and len(self.episode[1]) > 0:
                        self.episode[4][-1] = False
                    self.episode_step = 0
                    self.s0 = self.env.reset()
                    self.episodes.append(self.episode)
                    self.cur_hidden_state = self.agent.get_init_hidden_state()
                    # Episode layout: states, rewards, actions, action_probs,
                    # terminals, hidden_states, values.
                    self.episode = [self.s0], [], [], [], [], [self.cur_hidden_state], []
                    self.stats.append({
                        "reward": self.cum_rew,
                        "step": self.ep_count,
                        "a_probs": self.sampled_a_prob,
                        "picked_a": self.sampled_action,
                        "a_dim": self.env_opts["action_dim"],
                        "discrete": self.env_opts["discrete"]
                    })
                    self.terminal = False
                    self.ep_count += 1
                    self.cum_rew = 0

                action, a_prob, h_out, v_out = self.agent.get_sample(
                    self.s0, self.cur_hidden_state)
                self.episode_step += 1
                self.global_step += 1
                # Roughly 1% of steps: snapshot the action and its
                # probabilities for the per-episode stats.
                if np.random.random() > 0.99:
                    self.sampled_action = action
                    self.sampled_a_prob = a_prob
                self.cur_hidden_state = h_out
                self.s0, r, self.terminal, _ = self.env.step(action)
                self.cum_rew += r
                self.episode[0].append(self.s0)
                self.episode[1].append(self.agent.transform_reward(r))
                self.episode[2].append(action)
                self.episode[3].append(a_prob)
                self.episode[4].append(self.terminal)
                self.episode[5].append(h_out)
                self.episode[6].append(v_out)
            self.episodes.append(self.episode)
            self.episode = [self.s0], [], [], [], [], [self.cur_hidden_state], []
            result = self.process_episodes(self.episodes)
            self.worker_queue.put(result)

    def apply_weights(self):
        # Block until new weights arrive on weights_queue, then copy them
        # into this worker's local policy variables via the assign op.
        weights = self.weights_queue.get()
        feed_dict = {}
        for i, t in enumerate(self.accum_vars):
            feed_dict[t] = weights[i]
        self.session.run(self.assign_op, feed_dict=feed_dict)

    def init(self):
        import tensorflow as tf
        self.env = self.env_producer.get_new_environment()
        self.s0 = self.env.reset()
        self.session = utils.create_session(self.env_opts, False)
        with tf.device("/cpu:0"):
            with tf.variable_scope("gather-%s" % self.idx):
                pol = get_policy(self.env_opts, self.session)
                self.agent = PPOAgent(pol, self.session,
                                      "gather-%s" % self.idx, self.env_opts)
                self.trainable_vars = tf.get_collection(
                    tf.GraphKeys.TRAINABLE_VARIABLES, "gather-%s" % self.idx)
                # Buffer variables that receive incoming weights before they
                # are assigned onto the trainable variables.
                self.accum_vars = [
                    tf.Variable(tf.zeros_like(tv.initialized_value()),
                                trainable=False) for tv in self.trainable_vars
                ]
                assign_ops = [
                    self.trainable_vars[i].assign(self.accum_vars[i])
                    for i in range(len(self.trainable_vars))
                ]
                self.assign_op = tf.group(assign_ops)
            self.session.run(tf.global_variables_initializer())
            self.cur_hidden_state = self.agent.get_init_hidden_state()
            self.episode = [self.s0], [], [], [], [], [self.cur_hidden_state], []

    def process_episodes(self, episodes):
        all_states = []
        all_advantages = []
        all_returns = []
        all_picked_actions = []
        all_old_actions_probs = []
        all_pred_values = []
        all_hidden_states = []

        for episode in episodes:
            st, rewards, picked_actions, old_action_probs, terminals, hidden_states, values = episode
            if len(rewards) == 0:
                continue
            states = np.asarray(st)
            pred_values = np.zeros(len(values) + 1)
            pred_values[:-1] = np.array(values)
            episode_len = len(rewards)
            advantages = np.zeros((episode_len, ))
            returns = np.zeros((episode_len + 1, ))
            if terminals[-1]:
                # True terminal state: no future value to bootstrap.
                pred_values[-1] = 0
            else:
                # Episode was truncated, so bootstrap from the critic's
                # estimate of the final state's value.
                _, _, _, v_out = self.agent.get_sample(states[-1],
                                                       hidden_states[-1])
                pred_values[-1] = v_out
            returns[-1] = pred_values[-1]
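            # Generalized Advantage Estimation:
            #   delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
            #   A_t     = delta_t + gamma * lambda * A_{t+1}
            # Returns are plain discounted sums bootstrapped from the final
            # value estimate above.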
            for i in reversed(range(episode_len)):
                r = rewards[i]
                next_v = pred_values[i + 1]
                cur_v = pred_values[i]
                diff = r + self.discount_factor * next_v - cur_v
                if i == episode_len - 1:
                    advantages[i] = diff
                else:
                    advantages[i] = (diff + self.discount_factor *
                                     self.gae_factor * advantages[i + 1])
                returns[i] = r + self.discount_factor * returns[i + 1]
            returns = returns[:-1]

            ep_states = states[:-1]
            ep_advantages = advantages
            ep_returns = returns
            ep_picked_actions = np.array(picked_actions)
            ep_old_action_probs = np.array(old_action_probs)
            ep_all_pred_values = pred_values
            ep_hidden_state = np.array(hidden_states[:-1])
            splitted = utils.split_episode(ep_states, ep_advantages,
                                           ep_returns, ep_picked_actions,
                                           ep_old_action_probs,
                                           ep_all_pred_values, ep_hidden_state,
                                           self.timestep_size)
            for b_states, b_hidden_state, b_advantages, b_returns, b_picked_actions, b_old_action_probs, b_all_pred_values in splitted:
                all_states.append(b_states)
                all_advantages.append(b_advantages)
                all_returns.append(b_returns)
                all_picked_actions.append(b_picked_actions)
                all_old_actions_probs.append(b_old_action_probs)
                all_pred_values.append(b_all_pred_values)
                all_hidden_states.append(b_hidden_state)

        all_states = np.array(all_states)
        all_advantages = np.array(all_advantages)
        all_picked_actions = np.array(all_picked_actions)
        all_returns = np.array(all_returns)
        all_old_actions_probs = np.array(all_old_actions_probs)
        all_pred_values = np.array(all_pred_values)
        all_hidden_states = np.array(all_hidden_states)

        return [
            all_states, all_advantages, all_picked_actions, all_returns,
            all_old_actions_probs, all_pred_values, all_hidden_states,
            self.ep_count, self.stats, self.idx
        ]
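
GatheringWorker starts collecting experience as soon as it is constructed (the constructor calls get_experience(), which never returns), so it is meant to run in its own process. The sketch below shows one way it might be wired up; it assumes the project's environments module and GatheringWorker are importable, while SimpleEnvProducer, run_worker, the environment id and the rollout size are illustrative, and some trainer process is expected to publish weights on weights_q.

from multiprocessing import Process, Queue

import gym

# environments and GatheringWorker come from the surrounding project.


class SimpleEnvProducer:
    # Hypothetical stand-in: the worker only calls get_new_environment().
    def __init__(self, env_name):
        self.env_name = env_name

    def get_new_environment(self):
        return gym.make(self.env_name)


def run_worker(idx, producer, env_opts, rollout_size, worker_q, weights_q):
    # Constructing the worker starts its gather loop; it never returns.
    GatheringWorker(idx, producer, env_opts, rollout_size, worker_q, weights_q)


if __name__ == "__main__":
    env_name = "CartPole-v1"
    env_opts = environments.get_env_options(gym.make(env_name), False)
    worker_q, weights_q = Queue(), Queue()
    p = Process(target=run_worker,
                args=(0, SimpleEnvProducer(env_name), env_opts, 128,
                      worker_q, weights_q))
    p.start()
    # The worker blocks in apply_weights() until weights are published on
    # weights_q, then emits one processed rollout per rollout_size steps.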