def load_act_model(load_file, model_scope, env, nenvs=1, num_actions=5):
    print('Loading from...', load_file)
    ob_shape = utils.get_shape(env.observation_space)
    ac_space = env.action_space
    sess = tf.get_default_session()
    act = CnnPolicy(sess, ob_shape, ac_space, nenvs, 1, model_scope, reuse=False)
    with tf.variable_scope(model_scope):
        params = tf.trainable_variables(model_scope)
    loaded_params = joblib.load(Config.MODEL_DIR + load_file)
    restores = []
    for p, loaded_p in zip(params, loaded_params):
        restores.append(p.assign(loaded_p))
    sess.run(restores)
    return act
class Actor():
    def __init__(self, ob_space, ac_space, n_batch, n_steps):
        # assumes a default TF session has already been created
        sess = tf.get_default_session()
        self.network = CnnPolicy(sess, ob_space, ac_space, n_batch, n_steps)
        saver = tf.train.Saver()
        saver.restore(sess, "./checkpoints/model.ckpt")

    def act(self, state):
        stuff = self.network.step(state)
        action, value_, _ = stuff[0], stuff[1], stuff[2:]
        return action, value_
def test(args):
    env = gym.make(args.env)
    obs = env.reset()

    policy = CnnPolicy(env.action_space.n)
    policy.load_state_dict(torch.load(args.ckpt_path)['state_dict'])
    if not args.no_cuda:
        policy.cuda()

    # buffer that holds the 4 most recent preprocessed frames
    frames = Variable(torch.zeros((1, 4, 80, 80)))
    if not args.no_cuda:
        frames = frames.cuda()

    prepro = preprocess_pong if 'Pong' in args.env else preprocess_atari

    while True:
        env.render()

        obs = prepro(obs)
        obs = np.expand_dims(np.expand_dims(obs, 0), 0)
        obs = Variable(torch.from_numpy(obs))
        if not args.no_cuda:
            obs = obs.cuda()

        # add the current observation to the stack of consecutive frames
        frames = frames[:, :-1, :, :]
        frames = torch.cat((obs, frames), 1)

        action_probs, _ = policy(frames)
        action_dist = Categorical(action_probs)
        action = action_dist.sample()

        obs, reward, done, _ = env.step(action.data[0])
        if done:
            return

        time.sleep(0.01)  # so the game doesn't move too fast
def train(args):
    env = gym.make(args.env)
    obs = env.reset()
    policy = CnnPolicy(env.action_space.n)
    if not args.no_cuda:
        policy.cuda()
    optimizer = optim.Adam(policy.parameters(), lr=args.eta)
    frames = Variable(torch.zeros((1, 4, 80, 80)))
    if not args.no_cuda:
        frames = frames.cuda()
    prepro = preprocess_pong if 'Pong' in args.env else preprocess_atari

    # buffers for the current rollout
    rewards, logprobs, aprobs, state_values = [], [], [], []

    # stuff for monitoring and logging progress
    reward_sum = 0
    epi = 0
    ep_start = time.time()
    running_reward = args.init_runreward
    running_rewards = []
    saved_reward_epi = epi
    saved_ckpt_epi = epi
    start_ts = 1

    if args.resume_ckpt:
        checkpoint = torch.load(args.resume_ckpt)
        policy.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        start_ts = checkpoint['step']
        epi = checkpoint['episode']
        running_reward = checkpoint['running_reward']

    for ts in range(start_ts, args.nb_steps + 1):
        frames = _build_frames(prepro, obs, frames, args.no_cuda)

        action_probs, state_value = policy(frames)
        action_dist = Categorical(action_probs)
        action = action_dist.sample()

        obs, reward, done, _ = env.step(action.data[0])
        reward_sum += reward

        rewards.append(reward)
        logprobs.append(action_dist.log_prob(action))
        aprobs.append(action_probs)
        state_values.append(state_value)

        if done or not ts % args.update_freq:
            # feed the last state through the network to get the value estimate used for bootstrapping
            final_state = _build_frames(prepro, obs, frames, args.no_cuda)
            _, final_sval = policy(final_state)

            disc_rewards = np.array(rewards)
            disc_rewards = (disc_rewards - np.mean(disc_rewards)) / (np.std(disc_rewards) + 1e-10)
            disc_rewards = _discount_rewards(disc_rewards, 0 if done else final_sval.data)
            disc_rewards = Variable(torch.Tensor(disc_rewards)).cuda()

            aprobs = torch.cat(aprobs).clamp(1e-8)
            state_values = torch.cat(state_values).squeeze()
            entropies = -torch.sum(aprobs * torch.log(aprobs), dim=1)

            actor_loss = -torch.cat(logprobs) * (disc_rewards - state_values.detach())
            critic_loss = torch.pow(disc_rewards - state_values, 2)
            loss = actor_loss.sum() + critic_loss.sum() - args.beta * entropies.sum()

            # param update
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # zero-out buffers
            frames = Variable(torch.zeros((1, 4, 80, 80)))
            if not args.no_cuda:
                frames = frames.cuda()
            rewards, logprobs, aprobs, state_values = [], [], [], []

        if done:
            total_time = time.time() - ep_start
            running_reward = reward_sum if not running_reward else running_reward * 0.99 + reward_sum * 0.01
            print("Episode {} took {:.2f} s. Steps: {}. Reward: {:.2f}. Running: {:.2f}"
                  .format(epi, total_time, ts, reward_sum, running_reward))
            epi += 1
            reward_sum = 0
            ep_start = time.time()
            obs = env.reset()

        if not epi % args.save_ckpt_freq and saved_ckpt_epi < epi:
            model_state = {
                'state_dict': policy.state_dict(),
                'optimizer': optimizer.state_dict(),
                'step': ts,
                'episode': epi,
                'running_reward': running_reward
            }
            torch.save(model_state, args.ckpt_path)
            saved_ckpt_epi = epi

        if not epi % args.save_reward_freq and saved_reward_epi < epi:
            running_rewards.append(running_reward)
            with open(args.rewards_path, 'wb') as f:
                pickle.dump(running_rewards, f)
            saved_reward_epi = epi
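# The training loops in this section call two helpers, _build_frames and _discount_rewards,
# that are not shown here. The following is a minimal sketch of what they might look like,
# inferred only from how they are called; the preprocessing details and gamma=0.99 are assumptions.
import numpy as np
import torch
from torch.autograd import Variable

def _build_frames(prepro, obs, frames, no_cuda):
    # preprocess the raw observation and push it onto the 4-frame stack,
    # mirroring the inline frame handling in test()
    obs = prepro(obs)
    obs = np.expand_dims(np.expand_dims(obs, 0), 0)   # -> shape (1, 1, 80, 80)
    obs = Variable(torch.from_numpy(obs))
    if not no_cuda:
        obs = obs.cuda()
    # drop the oldest frame and prepend the newest one
    return torch.cat((obs, frames[:, :-1, :, :]), 1)

def _discount_rewards(rewards, bootstrap=0, gamma=0.99):
    # standard discounted returns, optionally bootstrapped with the critic's
    # value estimate of the final state (used when the rollout ends mid-episode)
    returns = np.zeros(len(rewards))
    running = float(bootstrap)
    for i in reversed(range(len(rewards))):
        running = rewards[i] + gamma * running
        returns[i] = running
    return returns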
def __init__(self, ob_space, ac_space, n_batch, n_steps):
    # assumes a default TF session has already been created
    sess = tf.get_default_session()
    with tf.variable_scope("global"):
        self.network = CnnPolicy(sess, ob_space, ac_space, n_batch, n_steps)
    saver = tf.train.Saver()
    saver.restore(sess, "./checkpoints/model.ckpt")
class TRPO():
    def __kl_div(self, p, q):
        return torch.mean(torch.sum(p * (torch.log(p) - torch.log(q)), 1))

    def __hessian_vec_prod(self, v):
        # Note: torch.autograd.grad doesn't accumulate gradients
        # into the .grad buffers, but instead returns gradients
        # as Variable tuples. Hence, no zero_grad() is needed.
        kl = self.__kl_div(self.pi_old.detach(), self.pi_old)
        g_kl = torch.autograd.grad(kl, self.policy.parameters(), create_graph=True)
        g_kl = torch.cat([p.view(-1) for p in g_kl])
        grad_vec_prod = torch.dot(g_kl, v)
        Hv = torch.autograd.grad(grad_vec_prod, self.policy.parameters(), create_graph=True)
        return torch.cat([p.contiguous().view(-1) for p in Hv]) + args.cg_damp * v

    # def __conjugate_gradient(self, b):
    #     """Optimizer conjugate-gradient, Nocedal & Wright algorithm 5.2"""
    #     x = torch.zeros_like(b)
    #     r = b - self.__hessian_vec_prod(x)
    #     p = -r
    #
    #     for cg_iter in range(args.nb_cgsteps):
    #         rr = torch.dot(r, r)
    #         Ap = self.__hessian_vec_prod(p.detach())
    #         alpha = rr / torch.dot(p, Ap)
    #         x += alpha*p
    #         r -= alpha*Ap
    #         beta = torch.dot(r, r) / rr
    #         p = -r + beta*p
    #
    #     return x

    def __conjugate_gradient(self, b):
        """Conjugate-gradient optimizer, Nocedal & Wright algorithm 5.2.

        Code adapted from
        https://github.com/openai/baselines/blob/master/baselines/common/cg.py
        """
        p = b.clone()
        r = b.clone()
        x = torch.zeros_like(b)
        rdotr = torch.dot(r, r)

        for cg_iter in range(args.nb_cgsteps):
            z = self.__hessian_vec_prod(p.detach())
            v = rdotr / torch.dot(p, z)
            x += v * p
            r -= v * z
            newrdotr = torch.dot(r, r)
            mu = newrdotr / rdotr
            p = r + mu * p
            rdotr = newrdotr

        return x

    def __line_search(self, s, prev_is_obj):
        theta_old = self.policy.gather_flat_params()
        sAs = torch.dot(s, self.__hessian_vec_prod(s.detach()))
        beta = torch.sqrt((2 * args.stepsize) / sAs)
        nb_iters = 1

        while True:
            theta = theta_old + beta * s
            self.policy.replace_params(theta)

            pi = self.policy(self.frames_hist)
            kl_indicator = 0 if self.__kl_div(self.pi_old, pi) <= args.stepsize else float("inf")

            aprobs = torch.gather(pi, 1, self.action_hist.view(-1, 1))
            entropies = -torch.sum(aprobs * torch.log(aprobs), dim=1)
            is_obj = torch.sum((aprobs / self.aprobs_old) * self.advs) + args.ent_coeff * entropies.sum()

            if is_obj - kl_indicator >= prev_is_obj:
                break

            beta /= 2
            nb_iters += 1
            if nb_iters > 100:
                print("WARNING! Line search didn't terminate in 100 steps.")
                return

        print("Line search terminated after {} steps.".format(nb_iters))

    def train(self, args):
        env = gym.make(args.env)
        obs = env.reset()
        self.policy = CnnPolicy(env.action_space.n)
        if not args.no_cuda:
            self.policy.cuda()
        frames = Variable(torch.zeros((1, 4, 80, 80)))
        if not args.no_cuda:
            frames = frames.cuda()
        prepro = preprocess_pong if 'Pong' in args.env else preprocess_atari

        # arrays for holding history and statistics
        frames_hist, action_hist, rewards = [], [], []

        # stuff for monitoring and logging progress
        reward_sum = 0
        epi = 0
        ep_start = time.time()
        running_reward = args.init_runreward
        running_rewards = []
        saved_reward_epi = epi
        saved_ckpt_epi = epi
        start_ts = 1

        if args.resume_ckpt:
            checkpoint = torch.load(args.resume_ckpt)
            self.policy.load_state_dict(checkpoint['state_dict'])
            start_ts = checkpoint['step']
            epi = checkpoint['episode']
            running_reward = checkpoint['running_reward']

        for ts in range(start_ts, args.nb_steps + 1):
            frames = _build_frames(prepro, obs, frames, args.no_cuda)

            action_probs = self.policy(frames)
            action_dist = Categorical(action_probs)
            action = action_dist.sample()

            obs, reward, done, _ = env.step(action.data[0])
            reward_sum += reward

            frames_hist.append(frames)
            action_hist.append(action)
            rewards.append(reward)

            if not ts % args.update_freq:
                disc_rewards = _discount_rewards(rewards)
                disc_rewards = (disc_rewards - np.mean(disc_rewards)) / (np.std(disc_rewards) + 1e-10)
                self.advs = Variable(torch.Tensor(disc_rewards)).cuda().detach()

                self.frames_hist = torch.cat(frames_hist)
                self.action_hist = torch.cat(action_hist)

                self.pi_old = self.policy(self.frames_hist)
                self.aprobs_old = torch.gather(self.pi_old, 1, self.action_hist.view(-1, 1))

                # Gradient of linear term
                entropies = -torch.sum(self.aprobs_old * torch.log(self.aprobs_old), dim=1)
                is_obj = torch.sum((self.aprobs_old / self.aprobs_old.detach()) * self.advs) \
                    + args.ent_coeff * entropies.sum()
                is_obj.backward(retain_graph=True)
                g = self.policy.gather_flat_grad()

                p = self.__conjugate_gradient(g)
                self.__line_search(p, is_obj)

                # zero-out buffers
                frames = Variable(torch.zeros((1, 4, 80, 80)))
                if not args.no_cuda:
                    frames = frames.cuda()
                frames_hist, action_hist, rewards = [], [], []

            if done:
                total_time = time.time() - ep_start
                running_reward = reward_sum if not running_reward else running_reward * 0.99 + reward_sum * 0.01
                print("Episode {} took {:.2f} s. Steps: {}. Reward: {:.2f}. Running: {:.2f}"
                      .format(epi, total_time, ts, reward_sum, running_reward))
                epi += 1
                reward_sum = 0
                ep_start = time.time()
                obs = env.reset()

            if not epi % args.save_ckpt_freq and saved_ckpt_epi < epi:
                model_state = {
                    'state_dict': self.policy.state_dict(),
                    'step': ts,
                    'episode': epi,
                    'running_reward': running_reward
                }
                torch.save(model_state, args.ckpt_path)
                saved_ckpt_epi = epi

            if not epi % args.save_reward_freq and saved_reward_epi < epi:
                running_rewards.append(running_reward)
                with open(args.rewards_path, 'wb') as f:
                    pickle.dump(running_rewards, f)
                saved_reward_epi = epi
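# The __conjugate_gradient routine above approximately solves Hx = g for the TRPO search
# direction using only Hessian-vector products, never the Hessian itself. Below is a small,
# self-contained sanity check of the same iteration on a known SPD system; it is illustrative
# only and not part of the original code (the matrix A and vector b are made up).
import torch

A = torch.tensor([[4.0, 1.0],
                  [1.0, 3.0]])   # stands in for the damped Fisher/Hessian
b = torch.tensor([1.0, 2.0])     # stands in for the policy gradient g

def hvp(v):
    # In TRPO this would be __hessian_vec_prod; here the matrix-vector product is exact.
    return torch.mv(A, v)

x = torch.zeros_like(b)
r = b.clone()
p = b.clone()
rdotr = torch.dot(r, r)
for _ in range(10):
    z = hvp(p)
    alpha = rdotr / torch.dot(p, z)
    x = x + alpha * p
    r = r - alpha * z
    new_rdotr = torch.dot(r, r)
    p = r + (new_rdotr / rdotr) * p
    rdotr = new_rdotr

print(x)                 # ~ [0.0909, 0.6364], the exact solution of Ax = b
print(torch.mv(A, x) - b)  # residual should be ~0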
def train(args):
    env = gym.make(args.env)
    obs = env.reset()
    policy = CnnPolicy(env.action_space.n)
    if not args.no_cuda:
        policy.cuda()
    optimizer = optim.Adam(policy.parameters(), lr=args.eta)
    frames = Variable(torch.zeros((1, 4, 80, 80)))
    if not args.no_cuda:
        frames = frames.cuda()
    prepro = preprocess_pong if 'Pong' in args.env else preprocess_atari

    # arrays for holding history and statistics
    frames_hist, rewards, logprobs, action_hist = [], [], [], []

    # stuff for monitoring and logging progress
    reward_sum = 0
    epi = 0
    ep_start = time.time()
    running_reward = args.init_runreward
    running_rewards = []
    saved_reward_epi = epi
    saved_ckpt_epi = epi
    start_ts = 1

    if args.resume_ckpt:
        checkpoint = torch.load(args.resume_ckpt)
        policy.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        start_ts = checkpoint['step']
        epi = checkpoint['episode']
        running_reward = checkpoint['running_reward']

    for ts in range(start_ts, args.nb_steps + 1):
        frames = _build_frames(prepro, obs, frames, args.no_cuda)

        action_probs, _ = policy(frames)
        action_dist = Categorical(action_probs)
        action = action_dist.sample()

        obs, reward, done, _ = env.step(action.data[0])
        reward_sum += reward

        frames_hist.append(frames)
        action_hist.append(action)
        rewards.append(reward)
        logprobs.append(action_dist.log_prob(action))

        if done or not ts % args.update_freq:
            # feed the last state through the network to get the value estimate used for bootstrapping
            final_state = _build_frames(prepro, obs, frames, args.no_cuda)
            _, final_sval = policy(final_state)

            disc_rewards = np.array(rewards)
            disc_rewards = (disc_rewards - np.mean(disc_rewards)) / (np.std(disc_rewards) + 1e-10)
            disc_rewards = _discount_rewards(disc_rewards, 0 if done else final_sval.data)
            disc_rewards = Variable(torch.Tensor(disc_rewards)).cuda()

            frames_hist = torch.cat(frames_hist)
            action_hist = torch.cat(action_hist)
            pi_old = torch.cat(logprobs).exp().detach()

            for _ in range(args.nb_epochs):
                n = len(rewards)

                # shuffle
                indices = torch.randperm(n).cuda()
                frames_hist = frames_hist[indices, :, :, :]
                disc_rewards = disc_rewards[indices]
                action_hist = action_hist[indices]
                pi_old = pi_old[indices]

                nb_batches = int(np.ceil(n / args.batch_size))
                for i in range(nb_batches):
                    sidx = i * args.batch_size
                    batch_frames = frames_hist[sidx:sidx + args.batch_size]

                    aprobs, statevals = policy(batch_frames)
                    aprobs = aprobs.clamp(1e-8)
                    action_dist = Categorical(aprobs)

                    batch_actions = action_hist[sidx:sidx + args.batch_size]
                    pi = action_dist.log_prob(batch_actions).exp()

                    # clipped actor loss
                    ratio = pi / pi_old[sidx:sidx + args.batch_size]
                    advs = disc_rewards[sidx:sidx + args.batch_size] - statevals.squeeze()
                    lhs = ratio * advs.detach()
                    rhs = torch.clamp(ratio, 1 - args.eps, 1 + args.eps) * advs.detach()
                    loss_clip = torch.min(lhs, rhs).sum()

                    # critic loss
                    loss_critic = torch.pow(advs, 2).sum()

                    # full loss
                    entropies = -torch.sum(aprobs * torch.log(aprobs), dim=1)
                    loss = -loss_clip + args.c1 * loss_critic - args.c2 * entropies.sum()

                    # param update
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

            # zero-out buffers
            frames = Variable(torch.zeros((1, 4, 80, 80)))
            if not args.no_cuda:
                frames = frames.cuda()
            frames_hist, action_hist, rewards, logprobs = [], [], [], []

        if done:
            total_time = time.time() - ep_start
            running_reward = reward_sum if not running_reward else running_reward * 0.99 + reward_sum * 0.01
            print("Episode {} took {:.2f} s. Steps: {}. Reward: {:.2f}. Running: {:.2f}"
                  .format(epi, total_time, ts, reward_sum, running_reward))
            epi += 1
            reward_sum = 0
            ep_start = time.time()
            obs = env.reset()

        if not epi % args.save_ckpt_freq and saved_ckpt_epi < epi:
            model_state = {
                'state_dict': policy.state_dict(),
                'optimizer': optimizer.state_dict(),
                'step': ts,
                'episode': epi,
                'running_reward': running_reward
            }
            torch.save(model_state, args.ckpt_path)
            saved_ckpt_epi = epi

        if not epi % args.save_reward_freq and saved_reward_epi < epi:
            running_rewards.append(running_reward)
            with open(args.rewards_path, 'wb') as f:
                pickle.dump(running_rewards, f)
            saved_reward_epi = epi