Example #1
# Note: only the libraries used directly in this snippet are imported here;
# PPOAgent, Memory, and get_functions are assumed to come from the
# surrounding project.
import gym
import torch
import torch.optim as optim


def main(args):

    # Prefix used when saving and loading model snapshots
    model_store_sprefix = "snapshot"

    # NormalizedEnv
    env = gym.make(args.env)

    # Seed the environment and PyTorch for reproducibility
    env.seed(args.seed)
    torch.manual_seed(args.seed)

    env, generator, model, cont = get_functions(env, args)

    optimizer = optim.Adam(model.parameters(), lr=args.rllr)

    memory = Memory(args)

    agent = PPOAgent(args, model, optimizer, env, generator, memory, cont)
    # Optionally resume training from a previously saved snapshot
    if args.resume:
        agent.load_model(model_store_sprefix)

    agent.train(model_store_sprefix, args.save_interval)
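
For reference, here is a minimal sketch of the command-line parsing this entry point appears to expect. The flag names are inferred from the attributes read in main() (args.env, args.seed, args.rllr, args.resume, args.save_interval); the defaults are placeholders and the real project almost certainly defines additional options:

import argparse

def parse_args():
    # Hypothetical parser covering only the attributes used in the snippet above
    parser = argparse.ArgumentParser()
    parser.add_argument("--env", type=str, default="Pendulum-v0")
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--rllr", type=float, default=3e-4)
    parser.add_argument("--resume", action="store_true")
    parser.add_argument("--save-interval", dest="save_interval", type=int, default=10)
    return parser.parse_args()

if __name__ == "__main__":
    main(parse_args())
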
Example #2
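                # Excerpt from the rollout-collection loop: step the environments,
                # record the transition, and periodically evaluate the current policy.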
                next_state, reward, done, _ = envs.step(action.cpu().numpy())

                log_prob = dist.log_prob(action)
                agent.entropy += dist.entropy().mean()

                agent.append_log_prob(log_prob)
                agent.append_value(value)
                agent.append_reward(reward)
                agent.append_done(done)

                agent.append_state(state)
                agent.append_action(action)

                state = next_state
                idx += 1

                if idx % 1000 == 0:
                    # Evaluate the current policy on 100 test episodes
                    score = np.mean([agent.test_env(env) for _ in range(100)])
                    print(idx, score)
                    scores.append(score)
                    # Keep the best-performing model seen so far
                    if score > best_avg_score:
                        best_avg_score = score
                        agent.save_model('../models/ppo/model.pt')
                        print('Saved best model')
                    # Stop training early once the target score is reached
                    if score > args.threshold_score:
                        early_stop = True

            # Run the PPO update; next_state is presumably used to bootstrap
            # the value of the final step
            agent.train(next_state)

        plot_and_save_scores(scores, args.max_frames / 1000, args)
Example #3
# Note: only the standard-library and third-party imports used directly in
# this snippet are listed here (Queue and Process are assumed to be
# multiprocessing's); utils, get_policy, PPOAgent, and make_worker come from
# the surrounding project.
from multiprocessing import Process, Queue

import numpy as np


class Worker:
    """Trainer process for one PPO worker (TensorFlow 1.x).

    Gathers rollouts from several gather-worker processes, runs the PPO
    updates locally, and exchanges weights and statistics with the master
    process through the two queues passed in.
    """
    def __init__(self, env_producer, idx, env_opts, num_gather_workers,
                 master_weights_in_queue, master_weights_out_queue):
        self.env_opts = env_opts
        self.num_gather_workers = num_gather_workers
        self.env_producer = env_producer
        self.batch_size = env_opts["batch_size"]
        self.clip_eps = env_opts["clip_eps"]
        self.grad_step = env_opts["grad_step"]
        self.epochs = env_opts["epochs"]
        self.entropy_coef = env_opts["entropy_coef"]
        self.state_dim = env_opts["state_dim"]
        self.idx = idx
        self.session = None
        self.episode_step = 0
        self.initialized = False
        self.beta = self.env_opts["init_beta"]
        self.eta = self.env_opts["eta"]
        self.kl_target = self.env_opts["kl_target"]
        self.use_kl_loss = self.env_opts["use_kl_loss"]
        self.lr_multiplier = 1.0
        self.prev_batch = None
        self.variables_file_path = "models/%s/variables.txt" % env_opts["env_name"]
        self.worker_queue = Queue()
        self.weights_queues = [Queue() for _ in range(self.num_gather_workers)]
        self.master_weights_in_queue = master_weights_in_queue
        self.master_weights_out_queue = master_weights_out_queue
        self.init_workers()
        self.agent = None
        self.trainable_vars = None
        self.accum_vars = None
        self.assign_op = None
        self.p_opt_vars = None
        self.v_opt_vars = None
        self.init_agent()

    def init_agent(self):
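        """Build this worker's TensorFlow graph and enter the main loop.

        Creates the session, the policy and the PPOAgent under a per-worker
        variable scope, plus accumulator variables and a grouped assign op
        used to copy weights received from the master into the local graph.
        TensorFlow is imported inside the method, presumably to keep all TF
        state local to this process.
        """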
        import tensorflow as tf
        self.session = utils.create_session(self.env_opts, True)
        with tf.variable_scope("worker-%s" % self.idx):
            pol = get_policy(self.env_opts, self.session)
            self.agent = PPOAgent(pol, self.session, "worker-%s" % self.idx,
                                  self.env_opts)
            self.trainable_vars = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, "worker-%s" % self.idx)
            self.accum_vars = [
                tf.Variable(tf.zeros_like(tv.initialized_value()),
                            trainable=False) for tv in self.trainable_vars
            ]
            p_vars = self.agent.p_opt.variables()
            v_vars = self.agent.v_opt.variables()
            self.p_opt_vars = [
                tf.Variable(tf.zeros_like(tv.initialized_value()),
                            trainable=False) for tv in p_vars
            ]
            self.v_opt_vars = [
                tf.Variable(tf.zeros_like(tv.initialized_value()),
                            trainable=False) for tv in v_vars
            ]
            p_assign_ops = [
                p_vars[i].assign(self.p_opt_vars[i])
                for i in range(len(p_vars))
            ]
            v_assign_ops = [
                v_vars[i].assign(self.v_opt_vars[i])
                for i in range(len(v_vars))
            ]

            assign_ops = [
                self.trainable_vars[i].assign(self.accum_vars[i])
                for i in range(len(self.trainable_vars))
            ]
            self.assign_op = tf.group(assign_ops + p_assign_ops + v_assign_ops)

        self.session.run(tf.global_variables_initializer())
        self.run()

    def init_workers(self):
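        """Spawn one gather-worker process per rollout shard."""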
        # Each gather worker collects an equal share of the rollout
        rollout_size = self.env_opts["rollout_size"] // self.num_gather_workers
        for i in range(self.num_gather_workers):
            t = Process(target=make_worker,
                        args=(i, self.env_producer, self.env_opts,
                              self.worker_queue, self.weights_queues[i],
                              rollout_size))
            t.start()

    def run(self):
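        """Main loop: sync weights with the master, fan them out to the
        gather workers, run the PPO updates on the gathered rollouts, and
        report the results back to the master."""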
        while True:
            self.apply_shared_variables()
            self.apply_weights_to_gather_workers()
            stats = self.compute_grads_and_stats()
            self.send_to_master(stats)

    def send_to_master(self, stats):
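        """Send the current network/optimizer weights, beta, the learning-rate
        multiplier, and the latest training stats to the master process."""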
        weights, p_opt_weights, v_opt_weights = self.session.run([
            self.trainable_vars,
            self.agent.p_opt.variables(),
            self.agent.v_opt.variables()
        ])
        arr = [
            self.beta, self.lr_multiplier, p_opt_weights, v_opt_weights,
            weights, stats
        ]
        self.master_weights_out_queue.put(arr)

    def apply_weights_to_gather_workers(self):
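        """Broadcast the current policy weights to every gather-worker queue."""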
        weights = self.session.run(self.trainable_vars)
        for q in self.weights_queues:
            q.put(weights)

    def apply_shared_variables(self):
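        """Receive the shared variables from the master and copy them into the
        local graph through the grouped assign op."""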
        beta, lr_multiplier, p_opt_weights, v_opt_weights, weights = \
            self.master_weights_in_queue.get()
        self.beta = beta
        self.lr_multiplier = lr_multiplier
        fd = {}
        for i, t in enumerate(self.accum_vars):
            fd[t] = weights[i]
        for i, t in enumerate(self.p_opt_vars):
            fd[t] = p_opt_weights[i]
        for i, t in enumerate(self.v_opt_vars):
            fd[t] = v_opt_weights[i]
        self.session.run(self.assign_op, feed_dict=fd)

    def compute_grads_and_stats(self):
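        """Run the PPO updates for one iteration.

        Collects rollouts from all gather workers, optionally concatenates
        them with the previous batch, normalizes the advantages, trains for
        several epochs over shuffled minibatches, and adapts the KL penalty
        coefficient and learning-rate multiplier. Returns a dict of training
        statistics for the master.
        """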
        results = []
        for i in range(self.num_gather_workers):
            results.append(self.worker_queue.get())
        w_idx = list(range(self.num_gather_workers))
        cur_all_states = np.concatenate([results[i][0] for i in w_idx], axis=0)
        cur_all_advantages = np.concatenate([results[i][1] for i in w_idx],
                                            axis=0)
        cur_all_picked_actions = np.concatenate([results[i][2] for i in w_idx],
                                                axis=0)
        cur_all_returns = np.concatenate([results[i][3] for i in w_idx],
                                         axis=0)
        cur_all_old_actions_probs = np.concatenate(
            [results[i][4] for i in w_idx], axis=0)
        cur_all_pred_values = np.concatenate([results[i][5] for i in w_idx],
                                             axis=0)
        cur_all_hidden_states = np.concatenate([results[i][6] for i in w_idx],
                                               axis=0)

        if self.prev_batch is not None:
            prev_all_states, prev_all_advantages, prev_all_picked_actions, prev_all_returns, \
                prev_all_old_actions_probs, prev_all_pred_values, prev_all_hidden_states = self.prev_batch
            all_states = np.concatenate([cur_all_states, prev_all_states],
                                        axis=0)
            all_advantages = np.concatenate(
                [cur_all_advantages, prev_all_advantages], axis=0)
            all_picked_actions = np.concatenate(
                [cur_all_picked_actions, prev_all_picked_actions], axis=0)
            all_returns = np.concatenate([cur_all_returns, prev_all_returns],
                                         axis=0)
            all_old_actions_probs = np.concatenate(
                [cur_all_old_actions_probs, prev_all_old_actions_probs],
                axis=0)
            all_pred_values = np.concatenate(
                [cur_all_pred_values, prev_all_pred_values], axis=0)
            all_hidden_states = np.concatenate(
                [cur_all_hidden_states, prev_all_hidden_states], axis=0)
        else:
            all_states = cur_all_states
            all_advantages = cur_all_advantages
            all_picked_actions = cur_all_picked_actions
            all_returns = cur_all_returns
            all_old_actions_probs = cur_all_old_actions_probs
            all_pred_values = cur_all_pred_values
            all_hidden_states = cur_all_hidden_states

        self.prev_batch = [
            cur_all_states, cur_all_advantages, cur_all_picked_actions,
            cur_all_returns, cur_all_old_actions_probs, cur_all_pred_values,
            cur_all_hidden_states
        ]

        # Normalize advantages, guarding against a near-zero standard deviation
        all_advantages = (all_advantages - all_advantages.mean()) / max(
            all_advantages.std(), 1e-4)

        # The episode counter and environment stats are taken from the first
        # gather worker (the result whose slot 9 is 0, presumably its index)
        first_gather = [x for x in results if x[9] == 0][0]

        self.episode_step = first_gather[7]
        stats = first_gather[8]

        sz = len(all_states)
        # Number of minibatches per epoch (ceiling division)
        n_batches = (sz - 1) // self.batch_size + 1
        steps = 0
        cur_kl = 0
        entropy = 0
        hinge = 0
        src_policy_loss = 0
        vloss = 0
        ploss = 0
        for cur_epoch in range(self.epochs):
            idx = np.arange(len(all_states))
            np.random.shuffle(idx)
            all_states = all_states[idx]
            all_returns = all_returns[idx]
            all_picked_actions = all_picked_actions[idx]
            all_old_actions_probs = all_old_actions_probs[idx]
            all_advantages = all_advantages[idx]
            all_pred_values = all_pred_values[idx]
            all_hidden_states = all_hidden_states[idx]
            for b in range(n_batches):
                start = b * self.batch_size
                end = min(sz, (b + 1) * self.batch_size)
                states_b = all_states[start:end]
                returns_b = all_returns[start:end]
                picked_actions_b = all_picked_actions[start:end]
                old_action_probs_b = all_old_actions_probs[start:end]
                advantages_b = all_advantages[start:end]
                hidden_states_b = all_hidden_states[start:end]
                old_values_b = all_pred_values[start:end]
                cur_kl, entropy, hinge, src_policy_loss, vloss, ploss = \
                    self.agent.train(states_b,
                                     advantages_b,
                                     returns_b,
                                     picked_actions_b,
                                     old_action_probs_b,
                                     hidden_states_b,
                                     old_values_b,
                                     self.clip_eps,
                                     self.beta,
                                     self.eta,
                                     self.grad_step * self.lr_multiplier)
                steps += 1
            # Stop the epoch loop early if the KL divergence has grown too large
            if cur_kl > self.kl_target * 4 and self.use_kl_loss:
                break

        # Adapt the KL penalty coefficient (beta) and the learning-rate
        # multiplier based on the measured KL divergence
        if self.use_kl_loss:
            if cur_kl > self.kl_target * 2:
                self.beta = np.minimum(35, 1.5 * self.beta)
                if self.beta > 30.0:
                    self.lr_multiplier /= 1.5
            elif cur_kl < self.kl_target / 2:
                self.beta = np.maximum(1 / 35, self.beta / 1.5)
                if self.beta <= 1 / 30.0:
                    self.lr_multiplier *= 1.5
            self.lr_multiplier = max(min(self.lr_multiplier, 3.0), 0.1)

        train_stats = {
            "stats": stats,
            "kl": cur_kl,
            "entropy": entropy,
            "hinge": hinge,
            "src_policy_loss": src_policy_loss,
            "vloss": vloss,
            "ploss": ploss,
            "lr_multiplier": self.lr_multiplier,
            "beta": self.beta,
            "step": self.episode_step,
            "idx": self.idx
        }
        return train_stats
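
For context, the master process on the other side of the two queues is expected to consume the six-element list produced by send_to_master() and answer with the five-element list unpacked in apply_shared_variables(). The sketch below shows that exchange for a single worker; the real master presumably aggregates across several workers, and the function name here is illustrative:

def master_relay_step(master_weights_out_queue, master_weights_in_queue):
    # Receive the worker's latest state (see send_to_master above)
    beta, lr_multiplier, p_opt_weights, v_opt_weights, weights, stats = \
        master_weights_out_queue.get()
    # ... aggregate or log the stats here ...
    # Reply with the shared variables the worker applies on its next iteration
    # (see apply_shared_variables above)
    master_weights_in_queue.put(
        [beta, lr_multiplier, p_opt_weights, v_opt_weights, weights])
    return stats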