Example #1
def load_act_model(load_file, model_scope, env, nenvs=1, num_actions=5):
    print('Loading from...', load_file)

    ob_shape = utils.get_shape(env.observation_space)
    ac_space = env.action_space

    sess = tf.get_default_session()

    act = CnnPolicy(sess,
                    ob_shape,
                    ac_space,
                    nenvs,
                    1,
                    model_scope,
                    reuse=False)

    # tf.trainable_variables() already filters by scope name, so no extra
    # variable_scope context is needed here.
    params = tf.trainable_variables(model_scope)

    loaded_params = joblib.load(Config.MODEL_DIR + load_file)
    restores = []
    for p, loaded_p in zip(params, loaded_params):
        restores.append(p.assign(loaded_p))
    sess.run(restores)

    return act
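
A hypothetical driver for load_act_model(), only to show the call pattern: make_env(), the checkpoint name and the scope name below are placeholders, and the unpacking assumes a baselines-style policy whose step() returns at least an action and a value estimate.

import tensorflow as tf

# Sketch only: make_env(), 'ppo_model.pkl' and 'model_0' are placeholders.
with tf.Session() as sess:        # becomes the default session used by the loader
    env = make_env()              # stand-in for the project's environment factory
    act = load_act_model('ppo_model.pkl', 'model_0', env, nenvs=1)

    obs = env.reset()
    # Assuming a baselines-style step(): the first two return values are
    # the sampled action and the value estimate.
    action, value = act.step([obs])[:2]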
Example #2
class Actor():
    def __init__(self, ob_space, ac_space, n_batch, n_steps):
        sess = tf.get_default_session()  # assumes a default session already exists
        self.network = CnnPolicy(sess, ob_space, ac_space, n_batch, n_steps)
        saver = tf.train.Saver()
        saver.restore(sess, "./checkpoints/model.ckpt")

    def act(self, state):
        # step() returns (action, value, ...); only the first two are used here.
        action, value_, *_ = self.network.step(state)
        return action, value_
Example #3
def test(args):
    env = gym.make(args.env)
    obs = env.reset()

    policy = CnnPolicy(env.action_space.n)
    policy.load_state_dict(torch.load(args.ckpt_path)['state_dict'])
    if not args.no_cuda: policy.cuda()

    frames = Variable(torch.zeros((1, 4, 80, 80)))  # used to hold 4 consecutive frames
    if not args.no_cuda: frames = frames.cuda()
    prepro = preprocess_pong if 'Pong' in args.env else preprocess_atari

    while True:
        env.render()

        obs = prepro(obs)
        obs = np.expand_dims(np.expand_dims(obs, 0), 0)
        obs = Variable(torch.from_numpy(obs))
        if not args.no_cuda: obs = obs.cuda()
        
        # add the current observation to the 4-frame stack (drop the oldest frame)
        frames = frames[:, :-1, :, :]
        frames = torch.cat((obs, frames), 1)

        action_probs, _ = policy(frames)
        action_dist = Categorical(action_probs)
        action = action_dist.sample()
        
        obs, reward, done, _ = env.step(action.data[0])
        if done: return
            
        time.sleep(0.01)  # so the game doesn't run too fast
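
The test loop relies on preprocess_pong / preprocess_atari, which are not shown here. Below is a minimal sketch of a Pong preprocessor in the usual crop-downsample-binarize style; the crop bounds and background colours are assumptions, not necessarily the project's exact values.

import numpy as np

# Hypothetical stand-in for preprocess_pong: 210x160x3 uint8 frame -> 80x80 float32.
def preprocess_pong(frame):
    frame = frame[35:195]              # crop out the score bar and bottom border
    frame = frame[::2, ::2, 0].copy()  # downsample by 2, keep one channel -> 80x80
    frame[frame == 144] = 0            # erase background (colour 1)
    frame[frame == 109] = 0            # erase background (colour 2)
    frame[frame != 0] = 1              # paddles and ball -> 1
    return frame.astype(np.float32)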
Example #4
def train(args):
    env = gym.make(args.env)
    obs = env.reset()

    policy = CnnPolicy(env.action_space.n)
    if not args.no_cuda: policy.cuda()
    optimizer = optim.Adam(policy.parameters(), lr=args.eta)

    frames = Variable(torch.zeros((1, 4, 80, 80)))
    if not args.no_cuda: frames = frames.cuda()
    prepro = preprocess_pong if 'Pong' in args.env else preprocess_atari

    rewards, logprobs, aprobs, state_values = [], [], [], []
    reward_sum = 0
    epi = 0
    ep_start = time.time()

    running_reward = args.init_runreward
    running_rewards = []
    saved_reward_epi = epi
    saved_ckpt_epi = epi
    start_ts = 1

    if args.resume_ckpt:
        checkpoint = torch.load(args.resume_ckpt)
        policy.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        start_ts = checkpoint['step']
        epi = checkpoint['episode']
        running_reward = checkpoint['running_reward']

    for ts in range(start_ts, args.nb_steps + 1):
        frames = _build_frames(prepro, obs, frames, args.no_cuda)

        action_probs, state_value = policy(frames)
        action_dist = Categorical(action_probs)
        action = action_dist.sample()

        obs, reward, done, _ = env.step(action.data[0])
        reward_sum += reward

        rewards.append(reward)
        logprobs.append(action_dist.log_prob(action))
        aprobs.append(action_probs)
        state_values.append(state_value)

        if done or not ts % args.update_freq:
            # evaluate the last state to get a value estimate for bootstrapping
            final_state = _build_frames(prepro, obs, frames, args.no_cuda)
            _, final_sval = policy(final_state)

            disc_rewards = np.array(rewards)
            disc_rewards = (disc_rewards - np.mean(disc_rewards)) / (
                np.std(disc_rewards) + 1e-10)
            disc_rewards = _discount_rewards(disc_rewards,
                                             0 if done else final_sval.data)
            disc_rewards = Variable(torch.Tensor(disc_rewards))
            if not args.no_cuda: disc_rewards = disc_rewards.cuda()

            aprobs = torch.cat(aprobs).clamp(min=1e-8)
            state_values = torch.cat(state_values).squeeze()
            entropies = -torch.sum(aprobs * torch.log(aprobs), dim=1)

            actor_loss = -torch.cat(logprobs) * (disc_rewards -
                                                 state_values.detach())
            critic_loss = torch.pow(disc_rewards - state_values, 2)
            loss = (actor_loss.sum() + critic_loss.sum()
                    - args.beta * entropies.sum())

            # param update
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # zero-out buffers
            frames = Variable(torch.zeros((1, 4, 80, 80)))
            if not args.no_cuda: frames = frames.cuda()
            rewards, logprobs, aprobs, state_values = [], [], [], []

        if done:
            total_time = time.time() - ep_start
            running_reward = reward_sum if not running_reward else running_reward * 0.99 + reward_sum * 0.01
            print(
                "Episode {} took {:.2f} s. Steps: {}. Reward: {:.2f}. Running: {:.2f}"
                .format(epi, total_time, ts, reward_sum, running_reward))

            epi += 1
            reward_sum = 0
            ep_start = time.time()
            obs = env.reset()

        if not epi % args.save_ckpt_freq and saved_ckpt_epi < epi:
            model_state = {
                'state_dict': policy.state_dict(),
                'optimizer': optimizer.state_dict(),
                'step': ts,
                'episode': epi,
                'running_reward': running_reward
            }
            torch.save(model_state, args.ckpt_path)
            saved_ckpt_epi = epi

        if not epi % args.save_reward_freq and saved_reward_epi < epi:
            running_rewards.append(running_reward)
            with open(args.rewards_path, 'wb') as f:
                pickle.dump(running_rewards, f)
            saved_reward_epi = epi
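
The training loop calls two helpers that are not part of the snippet, _build_frames and _discount_rewards. The sketch below reconstructs their likely interface from the call sites; the discount factor of 0.99 and the implementation details are assumptions.

import numpy as np
import torch
from torch.autograd import Variable

# Hypothetical helpers matching the call sites above; details are assumptions.
def _build_frames(prepro, obs, frames, no_cuda):
    """Preprocess obs and push it onto the 4-frame stack (newest frame first)."""
    obs = prepro(obs)
    obs = Variable(torch.from_numpy(obs[np.newaxis, np.newaxis]).float())
    if not no_cuda: obs = obs.cuda()
    return torch.cat((obs, frames[:, :-1, :, :]), 1)

def _discount_rewards(rewards, bootstrap=0, gamma=0.99):
    """Discounted returns, optionally bootstrapped with the value of the last state."""
    returns = np.zeros(len(rewards))
    running = float(bootstrap)   # accepts a number or a 1-element tensor
    for i in reversed(range(len(rewards))):
        running = rewards[i] + gamma * running
        returns[i] = running
    return returns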
Example #5
    def __init__(self, ob_space, ac_space, n_batch, n_steps):
        sess = tf.get_default_session()  # assumes a default session already exists
        with tf.variable_scope("global"):
            self.network = CnnPolicy(sess, ob_space, ac_space, n_batch,
                                     n_steps)
            saver = tf.train.Saver()
            saver.restore(sess, "./checkpoints/model.ckpt")
Example #6
class TRPO():
    def __kl_div(self, p, q):
        return torch.mean(torch.sum(p * (torch.log(p) - torch.log(q)), 1))

    def __hessian_vec_prod(self, v):
        # Note: torch.autograd.grad doesn't accumulate gradients
        # into the .grad buffers, but instead returns gradients
        # as Variable tuples. Hence, no zero_grad() is needed.
        kl = self.__kl_div(self.pi_old.detach(), self.pi_old)
        g_kl = torch.autograd.grad(kl,
                                   self.policy.parameters(),
                                   create_graph=True)
        g_kl = torch.cat([p.view(-1) for p in g_kl])
        grad_vec_prod = torch.dot(g_kl, v)
        Hv = torch.autograd.grad(grad_vec_prod,
                                 self.policy.parameters(),
                                 create_graph=True)
        return torch.cat([p.contiguous().view(-1)
                          for p in Hv]) + args.cg_damp * v

    # def __conjugate_gradient(self, b):
    #     """Optimizer conjugate-gradient, Nocedal & Wright algorithm 5.2"""
    #     x = torch.zeros_like(b)
    #     r = b - self.__hessian_vec_prod(x)
    #     p = -r
    #
    #     for cg_iter in range(args.nb_cgsteps):
    #         rr = torch.dot(r, r)
    #         Ap = self.__hessian_vec_prod(p.detach())
    #         alpha = rr / torch.dot(p, Ap)
    #         x += alpha*p
    #         r -= alpha*Ap
    #         beta = torch.dot(r, r) / rr
    #         p = -r + beta*p
    #
    #     return x

    def __conjugate_gradient(self, b):
        """Optimizer conjugate-gradient, Nocedal & Wright algorithm 5.2"""
        """Code adapted from https://github.com/openai/baselines/blob/master/baselines/common/cg.py"""
        p = b.clone()
        r = b.clone()
        x = torch.zeros_like(b)
        rdotr = torch.dot(r, r)

        for cg_iter in range(args.nb_cgsteps):
            z = self.__hessian_vec_prod(p.detach())
            v = rdotr / torch.dot(p, z)
            x += v * p
            r -= v * z
            newrdotr = torch.dot(r, r)
            mu = newrdotr / rdotr
            p = r + mu * p
            rdotr = newrdotr
        return x

    def __line_search(self, s, prev_is_obj):
        theta_old = self.policy.gather_flat_params()

        sAs = torch.dot(s, self.__hessian_vec_prod(s.detach()))
        beta = torch.sqrt((2 * args.stepsize) / sAs)

        nb_iters = 1
        while True:
            theta = (theta_old + beta * s)
            self.policy.replace_params(theta)
            pi = self.policy(self.frames_hist)
            kl_ok = self.__kl_div(self.pi_old, pi) <= args.stepsize
            kl_indicator = 0 if kl_ok else float("inf")

            aprobs = torch.gather(pi, 1, self.action_hist.view(-1, 1))
            entropies = -torch.sum(aprobs * torch.log(aprobs), dim=1)
            is_obj = torch.sum((aprobs / self.aprobs_old) *
                               self.advs) + args.ent_coeff * entropies.sum()

            if is_obj - kl_indicator >= prev_is_obj: break
            beta /= 2
            nb_iters += 1

            if nb_iters > 100:
                print("WARNING! Line search didn't terminate in 100 steps.")
                return

        print("Line search terminated after {} steps.".format(nb_iters))

    def train(self, args):
        env = gym.make(args.env)
        obs = env.reset()

        self.policy = CnnPolicy(env.action_space.n)
        if not args.no_cuda: self.policy.cuda()

        frames = Variable(torch.zeros((1, 4, 80, 80)))
        if not args.no_cuda: frames = frames.cuda()
        prepro = preprocess_pong if 'Pong' in args.env else preprocess_atari

        # arrays for holding history and statistics
        frames_hist, action_hist, rewards = [], [], []

        # stuff for monitoring and logging progress
        reward_sum = 0
        epi = 0
        ep_start = time.time()
        running_reward = args.init_runreward
        running_rewards = []
        saved_reward_epi = epi
        saved_ckpt_epi = epi
        start_ts = 1

        if args.resume_ckpt:
            checkpoint = torch.load(args.resume_ckpt)
            self.policy.load_state_dict(checkpoint['state_dict'])
            start_ts = checkpoint['step']
            epi = checkpoint['episode']
            running_reward = checkpoint['running_reward']

        for ts in range(start_ts, args.nb_steps + 1):
            frames = _build_frames(prepro, obs, frames, args.no_cuda)

            action_probs = self.policy(frames)
            action_dist = Categorical(action_probs)
            action = action_dist.sample()

            obs, reward, done, _ = env.step(action.data[0])
            reward_sum += reward

            frames_hist.append(frames)
            action_hist.append(action)
            rewards.append(reward)

            if not ts % args.update_freq:
                disc_rewards = _discount_rewards(rewards)
                disc_rewards = (disc_rewards - np.mean(disc_rewards)) / (
                    np.std(disc_rewards) + 1e-10)
                self.advs = Variable(torch.Tensor(disc_rewards)).detach()
                if not args.no_cuda: self.advs = self.advs.cuda()

                self.frames_hist = torch.cat(frames_hist)
                self.action_hist = torch.cat(action_hist)
                self.pi_old = self.policy(self.frames_hist)
                self.aprobs_old = torch.gather(self.pi_old, 1,
                                               self.action_hist.view(-1, 1))

                # Gradient of the linearized surrogate objective at the current policy
                entropies = -torch.sum(
                    self.aprobs_old * torch.log(self.aprobs_old), dim=1)
                is_obj = torch.sum(
                    (self.aprobs_old / self.aprobs_old.detach()) *
                    self.advs) + args.ent_coeff * entropies.sum()
                self.policy.zero_grad()  # clear gradients left over from the previous update
                is_obj.backward(retain_graph=True)
                g = self.policy.gather_flat_grad()

                p = self.__conjugate_gradient(g)
                self.__line_search(p, is_obj)

                # zero-out buffers
                frames = Variable(torch.zeros((1, 4, 80, 80)))
                if not args.no_cuda: frames = frames.cuda()
                frames_hist, action_hist, rewards = [], [], []

            if done:
                total_time = time.time() - ep_start
                running_reward = reward_sum if not running_reward else running_reward * 0.99 + reward_sum * 0.01
                print(
                    "Episode {} took {:.2f} s. Steps: {}. Reward: {:.2f}. Running: {:.2f}"
                    .format(epi, total_time, ts, reward_sum, running_reward))

                epi += 1
                reward_sum = 0
                ep_start = time.time()
                obs = env.reset()

            if not epi % args.save_ckpt_freq and saved_ckpt_epi < epi:
                model_state = {
                    'state_dict': self.policy.state_dict(),
                    'step': ts,
                    'episode': epi,
                    'running_reward': running_reward
                }
                torch.save(model_state, args.ckpt_path)
                saved_ckpt_epi = epi

            if not epi % args.save_reward_freq and saved_reward_epi < epi:
                running_rewards.append(running_reward)
                with open(args.rewards_path, 'wb') as f:
                    pickle.dump(running_rewards, f)
                saved_reward_epi = epi
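
The TRPO code assumes the policy exposes gather_flat_params, gather_flat_grad and replace_params. Below is a sketch of how such helpers are commonly written, here as a hypothetical mixin for the policy module; depending on the PyTorch version in use, the .data handling may need adjusting.

import torch

# Hypothetical mixin; the TRPO class above only relies on this interface.
class FlatParamsMixin(object):
    def gather_flat_params(self):
        """All trainable parameters concatenated into one 1-D tensor."""
        return torch.cat([p.data.view(-1) for p in self.parameters()])

    def gather_flat_grad(self):
        """All parameter gradients concatenated into one 1-D tensor."""
        return torch.cat([p.grad.view(-1) for p in self.parameters()])

    def replace_params(self, flat_params):
        """Copy a flat parameter vector back into the individual parameters."""
        if hasattr(flat_params, 'data'):
            flat_params = flat_params.data   # accept a Variable or a plain tensor
        offset = 0
        for p in self.parameters():
            n = p.numel()
            p.data.copy_(flat_params[offset:offset + n].view_as(p))
            offset += n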
Example #7
def train(args):
    env = gym.make(args.env)
    obs = env.reset()

    policy = CnnPolicy(env.action_space.n)
    if not args.no_cuda: policy.cuda()
    optimizer = optim.Adam(policy.parameters(), lr=args.eta)

    frames = Variable(torch.zeros((1, 4, 80, 80)))
    if not args.no_cuda: frames = frames.cuda()
    prepro = preprocess_pong if 'Pong' in args.env else preprocess_atari

    # arrays for holding history and statistics
    frames_hist, rewards, logprobs, action_hist = [], [], [], []

    # stuff for monitoring and logging progress
    reward_sum = 0
    epi = 0
    ep_start = time.time()
    running_reward = args.init_runreward
    running_rewards = []
    saved_reward_epi = epi
    saved_ckpt_epi = epi
    start_ts = 1

    if args.resume_ckpt:
        checkpoint = torch.load(args.resume_ckpt)
        policy.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        start_ts = checkpoint['step']
        epi = checkpoint['episode']
        running_reward = checkpoint['running_reward']

    for ts in range(start_ts, args.nb_steps + 1):
        frames = _build_frames(prepro, obs, frames, args.no_cuda)

        action_probs, _ = policy(frames)
        action_dist = Categorical(action_probs)
        action = action_dist.sample()

        obs, reward, done, _ = env.step(action.data[0])
        reward_sum += reward

        frames_hist.append(frames)
        action_hist.append(action)
        rewards.append(reward)
        logprobs.append(action_dist.log_prob(action))

        if done or not ts % args.update_freq:
            # evaluate the last state to get a value estimate for bootstrapping
            final_state = _build_frames(prepro, obs, frames, args.no_cuda)
            _, final_sval = policy(final_state)

            disc_rewards = np.array(rewards)
            disc_rewards = (disc_rewards - np.mean(disc_rewards)) / (
                np.std(disc_rewards) + 1e-10)
            disc_rewards = _discount_rewards(disc_rewards,
                                             0 if done else final_sval.data)
            disc_rewards = Variable(torch.Tensor(disc_rewards))
            if not args.no_cuda: disc_rewards = disc_rewards.cuda()

            frames_hist = torch.cat(frames_hist)
            action_hist = torch.cat(action_hist)
            pi_old = torch.cat(logprobs).exp().detach()

            for _ in range(args.nb_epochs):
                n = len(rewards)

                # shuffle the minibatch order (keep indices on the same device as the data)
                indices = torch.randperm(n)
                if not args.no_cuda: indices = indices.cuda()
                frames_hist = frames_hist[indices, :, :, :]
                disc_rewards = disc_rewards[indices]
                action_hist = action_hist[indices]
                pi_old = pi_old[indices]

                nb_batches = int(np.ceil(n / args.batch_size))
                for i in range(nb_batches):
                    sidx = i * args.batch_size

                    batch_frames = frames_hist[sidx:sidx + args.batch_size]
                    aprobs, statevals = policy(batch_frames)
                    aprobs = aprobs.clamp(min=1e-8)

                    action_dist = Categorical(aprobs)
                    batch_actions = action_hist[sidx:sidx + args.batch_size]
                    pi = action_dist.log_prob(batch_actions).exp()

                    # clipped actor loss
                    ratio = pi / pi_old[sidx:sidx + args.batch_size]
                    advs = disc_rewards[sidx:sidx +
                                        args.batch_size] - statevals.squeeze()

                    lhs = ratio * advs.detach()
                    rhs = torch.clamp(ratio, 1 - args.eps,
                                      1 + args.eps) * advs.detach()
                    loss_clip = torch.min(lhs, rhs).sum()

                    # critic loss
                    loss_critic = torch.pow(advs, 2).sum()

                    # full loss
                    entropies = -torch.sum(aprobs * torch.log(aprobs), dim=1)
                    loss = (-loss_clip + args.c1 * loss_critic
                            - args.c2 * entropies.sum())

                    # param update
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

            # zero-out buffers
            frames = Variable(torch.zeros((1, 4, 80, 80)))
            if not args.no_cuda: frames = frames.cuda()
            frames_hist, action_hist, rewards, logprobs = [], [], [], []

        if done:
            total_time = time.time() - ep_start
            running_reward = reward_sum if not running_reward else running_reward * 0.99 + reward_sum * 0.01
            print(
                "Episode {} took {:.2f} s. Steps: {}. Reward: {:.2f}. Running: {:.2f}"
                .format(epi, total_time, ts, reward_sum, running_reward))

            epi += 1
            reward_sum = 0
            ep_start = time.time()
            obs = env.reset()

        if not epi % args.save_ckpt_freq and saved_ckpt_epi < epi:
            model_state = {
                'state_dict': policy.state_dict(),
                'optimizer': optimizer.state_dict(),
                'step': ts,
                'episode': epi,
                'running_reward': running_reward
            }
            torch.save(model_state, args.ckpt_path)
            saved_ckpt_epi = epi

        if not epi % args.save_reward_freq and saved_reward_epi < epi:
            running_rewards.append(running_reward)
            with open(args.rewards_path, 'wb') as f:
                pickle.dump(running_rewards, f)
            saved_reward_epi = epi
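
All of the PyTorch snippets construct CnnPolicy(env.action_space.n) and feed it a 1x4x80x80 frame stack. Below is a minimal sketch of such an actor-critic network; the layer sizes are illustrative assumptions, and the TRPO example expects a probabilities-only variant that drops the value head.

import torch.nn as nn
import torch.nn.functional as F

# Hypothetical actor-critic CnnPolicy for 4x80x80 inputs; sizes are illustrative.
class CnnPolicy(nn.Module):
    def __init__(self, num_actions):
        super(CnnPolicy, self).__init__()
        self.conv1 = nn.Conv2d(4, 16, kernel_size=8, stride=4)   # -> 16x19x19
        self.conv2 = nn.Conv2d(16, 32, kernel_size=4, stride=2)  # -> 32x8x8
        self.fc = nn.Linear(32 * 8 * 8, 256)
        self.pi_head = nn.Linear(256, num_actions)   # action probabilities
        self.v_head = nn.Linear(256, 1)              # state value

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.fc(x.view(x.size(0), -1)))
        return F.softmax(self.pi_head(x), dim=1), self.v_head(x)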