def main():
    """Run the training loop: collect rollouts, then update critic and actor.

    All settings are read from the module-level ``args``. Each of the
    ``num_updates`` iterations:

    1. steps the vectorised environments ``args.num_steps`` times, storing
       every transition both in ``rollouts`` and in a replay buffer;
    2. samples a small batch from the replay buffer and performs one critic
       regression step and one actor step (DDPG-style), followed by soft
       updates of the target networks;
    3. periodically saves the actor and prints reward / loss statistics.

    Relies on names defined elsewhere in the module: ``args``,
    ``num_updates``, ``criterion``, ``to_tensor``, ``soft_update`` and the
    network / storage classes.

    Raises:
        NotImplementedError: if the observation space is not 3-D (image),
            because critic and target networks only exist for that case.
    """
    print("#######")
    print(
        "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards"
    )
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'

    if args.vis:
        from visdom import Visdom
        viz = Visdom()
        win = None

    envs = [
        make_env(args.env_name, args.seed, i, args.log_dir)
        for i in range(args.num_processes)
    ]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs)

    # Frame stacking multiplies the leading (channel) dimension.
    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    if len(envs.observation_space.shape) == 3:
        actor_critic = Actor(obs_shape[0], envs.action_space,
                             args.recurrent_policy, envs.action_space.n)
        target_actor = Actor(obs_shape[0], envs.action_space,
                             args.recurrent_policy, envs.action_space.n)
        critic = Critic(in_channels=4, num_actions=envs.action_space.n)
        critic_target = Critic(in_channels=4, num_actions=envs.action_space.n)
    else:
        assert not args.recurrent_policy, \
            "Recurrent policy is not implemented for the MLP controller"
        # No critic or target networks are built for non-image observations,
        # so everything below would fail on an unbound `critic`; fail fast.
        raise NotImplementedError(
            "Only 3-D (image) observations are supported: no critic/target "
            "networks are defined for the MLP controller")

    if args.cuda:
        actor_critic.cuda()
        critic.cuda()
        critic_target.cuda()
        target_actor.cuda()

    # NOTE(review): `optimizer` is only bound for algo == 'a2c', but
    # optimizer.step() below runs for every algo -- confirm other values of
    # args.algo are never used with this script.
    if args.algo == 'a2c':
        optimizer = optim.RMSprop(actor_critic.parameters(), args.lr,
                                  eps=args.eps, alpha=args.alpha)

    critic_optim = optim.Adam(critic.parameters(), lr=1e-4)
    tau = 0.001  # soft-update rate for the target networks
    replay_batch_size = 5

    mem_buffer = ReplayBuffer()

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space, actor_critic.state_size,
                              envs.action_space.n)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        """Shift the frame stack in ``current_obs`` and append ``obs``."""
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            action, action_log_prob, states = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True),
                Variable(rollouts.states[step], volatile=True),
                Variable(rollouts.masks[step], volatile=True))
            value = critic.forward(
                Variable(rollouts.observations[step], volatile=True),
                action_log_prob)
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Observe reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(
                np.expand_dims(np.stack(reward), 1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor(
                [[0.0] if done_ else [1.0] for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            pre_state = rollouts.observations[step].cpu().numpy()
            update_current_obs(obs)
            # NOTE(review): `current_obs` is mutated in place on every step
            # and `pre_state` may share memory with the rollout storage;
            # unless ReplayBuffer copies its inputs the stored transitions
            # alias live buffers -- verify against ReplayBuffer.add.
            mem_buffer.add((pre_state, current_obs,
                            action_log_prob.data.cpu().numpy(), reward, done))
            rollouts.insert(step, current_obs, states.data, action.data,
                            action_log_prob.data, value.data, reward, masks)

        # Bootstrap value for the state that follows the last stored step.
        action, action_log_prob, states = actor_critic.act(
            Variable(rollouts.observations[-1], volatile=True),
            Variable(rollouts.states[-1], volatile=True),
            Variable(rollouts.masks[-1], volatile=True))
        next_value = critic.forward(
            Variable(rollouts.observations[-1], volatile=True),
            action_log_prob).data

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        # ---- Off-policy update from the replay buffer ----
        state, next_state, action, reward, done = mem_buffer.sample(
            replay_batch_size)
        next_state = next_state.reshape([-1, *obs_shape])
        state = state.reshape([-1, *obs_shape])
        # One column per discrete action (was a literal 6).
        action = action.reshape([-1, envs.action_space.n])

        next_q_values = critic_target(
            to_tensor(next_state, volatile=True),
            target_actor(to_tensor(next_state, volatile=True),
                         to_tensor(next_state, volatile=True),
                         to_tensor(next_state, volatile=True))[0])
        next_q_values.volatile = False

        # NOTE(review): the bootstrap term is multiplied by `done`, not
        # `1 - done`; that is only correct if ReplayBuffer.sample returns a
        # "not terminal" flag -- confirm.
        # `float` is what the removed alias `np.float` used to refer to.
        target_q_batch = to_tensor(reward) + args.gamma * to_tensor(
            done.astype(float)) * next_q_values

        # Critic update
        critic.zero_grad()
        q_batch = critic(to_tensor(state), to_tensor(action))
        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        critic_optim.step()

        # Actor update: maximise the critic's value of the actor's output.
        actor_critic.zero_grad()
        policy_loss = -critic(
            to_tensor(state),
            actor_critic(to_tensor(state), to_tensor(state),
                         to_tensor(state))[0])
        policy_loss = policy_loss.mean()
        policy_loss.backward()
        if args.algo == 'a2c':
            nn.utils.clip_grad_norm(actor_critic.parameters(),
                                    args.max_grad_norm)
        optimizer.step()

        soft_update(target_actor, actor_critic, tau)
        soft_update(critic_target, critic, tau)

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            os.makedirs(save_path, exist_ok=True)

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [save_model, getattr(envs, 'ob_rms', None) or None]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, value loss {:.5f}, policy loss {:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        final_rewards.mean(), final_rewards.median(),
                        final_rewards.min(), final_rewards.max(),
                        value_loss.data.cpu().numpy()[0],
                        policy_loss.data.cpu().numpy()[0]))

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo)
            except IOError:
                pass
class DDPG(object):
    """DDPG agent built around an externally supplied replay memory.

    Holds an actor and a critic, a target copy of each, and one Adam
    optimizer per trained network. Target networks are NOT synchronised in
    the constructor; call :meth:`initialize` for that.
    """

    def __init__(self,
                 memory,
                 nb_status,
                 nb_actions,
                 action_noise=None,
                 gamma=0.99,
                 tau=0.001,
                 normalize_observations=True,
                 batch_size=128,
                 observation_range=(-5., 5.),
                 action_range=(-1., 1.),
                 actor_lr=1e-4,
                 critic_lr=1e-3):
        """Build the networks, optimizers and bookkeeping state.

        Args:
            memory: replay buffer; must provide ``append`` and ``sample``.
            nb_status: size passed to ``Actor`` / ``Critic`` as state dim.
            nb_actions: size passed to ``Actor`` / ``Critic`` as action dim.
            action_noise: optional callable returning exploration noise; if
                it has state it must also provide ``reset()``.
            gamma: discount factor used in the TD target.
            tau: soft-update rate for the target networks.
            normalize_observations: if true, keep a ``RunningMeanStd`` that
                is updated with every stored observation.
            batch_size: number of transitions sampled per training step.
            observation_range: stored as-is; not used inside this class.
            action_range: ``(low, high)`` bounds actions are clipped to.
            actor_lr: Adam learning rate for the actor.
            critic_lr: Adam learning rate for the critic.
        """
        self.nb_status = nb_status
        self.nb_actions = nb_actions
        self.action_range = action_range
        self.observation_range = observation_range
        self.normalize_observations = normalize_observations

        self.actor = Actor(self.nb_status, self.nb_actions)
        self.actor_target = Actor(self.nb_status, self.nb_actions)
        self.actor_optim = Adam(self.actor.parameters(), lr=actor_lr)

        self.critic = Critic(self.nb_status, self.nb_actions)
        self.critic_target = Critic(self.nb_status, self.nb_actions)
        self.critic_optim = Adam(self.critic.parameters(), lr=critic_lr)

        # Create replay buffer
        self.memory = memory
        self.action_noise = action_noise

        # Hyper-parameters
        self.batch_size = batch_size
        self.tau = tau
        self.discount = gamma

        if self.normalize_observations:
            self.obs_rms = RunningMeanStd()
        else:
            self.obs_rms = None

    def pi(self, obs, apply_noise=True, compute_Q=True):
        """Return the policy's action for a single observation.

        Args:
            obs: one observation (wrapped into a batch of size 1 here).
            apply_noise: add ``action_noise()`` to the action if noise is
                configured.
            compute_Q: also evaluate the critic on the un-noised action.

        Returns:
            ``(action, q)`` where ``action`` is clipped to ``action_range``
            and ``q`` is element ``[0][0]`` of the critic output, or ``None``
            when ``compute_Q`` is false.
        """
        obs = np.array([obs])
        action = to_numpy(self.actor(to_tensor(obs))).squeeze(0)
        # NOTE(review): `action` has lost its batch axis here while `obs`
        # still has one -- confirm the critic accepts that combination.
        q_value = None
        if compute_Q:
            q = self.critic([to_tensor(obs), to_tensor(action)]).cpu().data
            q_value = q[0][0]

        if self.action_noise is not None and apply_noise:
            noise = self.action_noise()
            assert noise.shape == action.shape
            action += noise

        action = np.clip(action, self.action_range[0], self.action_range[1])
        return action, q_value

    def store_transition(self, obs0, action, reward, obs1, terminal1):
        """Append one transition to memory and update observation stats."""
        self.memory.append(obs0, action, reward, obs1, terminal1)
        if self.normalize_observations:
            self.obs_rms.update(np.array([obs0]))

    def train(self):
        """Run one critic step, one actor step and a soft target update.

        Returns:
            ``(value_loss, policy_loss)`` read out via ``.cpu().data[0]``.
        """
        # Get a batch.
        batch = self.memory.sample(batch_size=self.batch_size)

        next_q_values = self.critic_target([
            to_tensor(batch['obs1'], volatile=True),
            self.actor_target(to_tensor(batch['obs1'], volatile=True)),
        ])
        next_q_values.volatile = False

        # TD target; the bootstrap term is dropped for terminal transitions.
        not_terminal = to_tensor(1 - batch['terminals1'].astype('float32'))
        target_q_batch = to_tensor(batch['rewards']) + \
            self.discount * not_terminal * next_q_values

        # Critic update
        self.critic.zero_grad()
        q_batch = self.critic(
            [to_tensor(batch['obs0']), to_tensor(batch['actions'])])
        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()

        # Actor update: ascend the critic's value of the actor's actions.
        self.actor.zero_grad()
        policy_loss = -self.critic([
            to_tensor(batch['obs0']),
            self.actor(to_tensor(batch['obs0'])),
        ]).mean()
        policy_loss.backward()
        self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

        return value_loss.cpu().data[0], policy_loss.cpu().data[0]

    def initialize(self):
        """Copy the online weights into both target networks."""
        # Make sure target is with the same weight
        hard_update(self.actor_target, self.actor)
        hard_update(self.critic_target, self.critic)

    def update_target_net(self):
        """Soft-update both target networks with rate ``tau``."""
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

    def reset(self):
        """Reset the exploration noise process, if one is configured."""
        if self.action_noise is not None:
            self.action_noise.reset()

    def cuda(self):
        """Move all four networks to the GPU."""
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()
class DDPG(object):
    """DDPG agent with optional SGLD updates, observation normalisation
    and a pool of stored network snapshots.
    """

    def __init__(self, nb_actions, nb_states, layer_norm, obs_norm, actor_lr,
                 critic_lr, SGLD_coef, noise_decay, lr_decay, batch_size,
                 discount, tau, pool_size, parameters_noise, action_noise,
                 SGLD_mode, pool_mode, with_cuda):
        """Build networks, optimizers, replay memory and optional helpers.

        Args:
            nb_actions: action dimensionality.
            nb_states: observation dimensionality.
            layer_norm: forwarded to ``Actor`` / ``Critic``.
            obs_norm: if true, normalise observations with a
                ``Run_Normalizer``.
            actor_lr: Adam learning rate for the actor.
            critic_lr: Adam learning rate for the critic.
            SGLD_coef: coefficient forwarded to ``SGLD_update``.
            noise_decay: decay constant for the action-noise coefficient
                (``<= 0`` disables decay).
            lr_decay: decay constant for the learning-rate coefficient
                (``<= 0`` disables decay).
            batch_size: transitions sampled per update.
            discount: discount factor used in the TD target.
            tau: soft-update rate for the target networks.
            pool_size: capacity of the agent pool; no pool is created when
                it is not positive.
            parameters_noise: stored as-is; not used inside this class.
            action_noise: optional callable returning exploration noise,
                with a ``reset()`` method.
            SGLD_mode: 1 = SGLD on actor, 2 = on critic, 3 = on both.
            pool_mode: 1 = pool actors, 2 = critics, 3 = both.
            with_cuda: move networks and inputs to the GPU.
        """
        self.nb_actions = nb_actions
        self.nb_states = nb_states
        self.layer_norm = layer_norm
        self.parameters_noise = parameters_noise
        self.action_noise = action_noise
        self.batch_size = batch_size
        self.discount = discount
        self.tau = tau
        self.pool_size = pool_size
        self.critic_lr = critic_lr
        self.actor_lr = actor_lr
        self.SGLD_coef = SGLD_coef
        self.noise_coef = 1
        self.noise_decay = noise_decay
        self.lr_coef = 1
        self.lr_decay = lr_decay
        self.SGLD_mode = SGLD_mode
        self.pool_mode = pool_mode
        self.with_cuda = with_cuda

        self.actor = Actor(nb_states=self.nb_states,
                           nb_actions=self.nb_actions,
                           layer_norm=self.layer_norm)
        self.actor_target = Actor(nb_states=self.nb_states,
                                  nb_actions=self.nb_actions,
                                  layer_norm=self.layer_norm)
        self.critic = Critic(nb_states, nb_actions,
                             layer_norm=self.layer_norm)
        self.critic_target = Critic(nb_states, nb_actions,
                                    layer_norm=self.layer_norm)

        if self.with_cuda:
            self.actor.cuda()
            self.actor_target.cuda()
            self.critic.cuda()
            self.critic_target.cuda()

        # Targets start as exact copies of the online networks.
        hard_update(self.actor_target, self.actor)
        hard_update(self.critic_target, self.critic)

        self.actor_optim = Adam(self.actor.parameters(), lr=actor_lr)
        self.critic_optim = Adam(self.critic.parameters(), lr=critic_lr)

        self.memory = Memory(int(1e6), (nb_actions, ), (nb_states, ),
                             with_cuda)

        self.obs_norm = obs_norm
        if self.obs_norm:
            self.run_obs_norm = Run_Normalizer((nb_states, ), self.with_cuda)
        self.is_training = True

        if self.pool_size > 0:
            self.agent_pool = Agent_pool(self.pool_size)

        self.s_t = None  # most recent state
        self.a_t = None  # most recent action

    def store_transition(self, s_t, a_t, r_t, s_t1, done_t):
        """Record a transition (training only) and remember the next state."""
        if self.is_training:
            self.memory.append(s_t, a_t, r_t, s_t1, done_t)
            if self.obs_norm:
                self.run_obs_norm.observe(s_t)
        self.s_t = s_t1

    def update(self):
        """Run one critic step, one actor step and a soft target update.

        Returns:
            ``(value_loss, policy_loss)`` as Python floats.
        """
        # Sample batch
        batch = self.memory.sample(self.batch_size)
        tensor_obs0 = batch['obs0']
        tensor_obs1 = batch['obs1']
        if self.obs_norm:
            tensor_obs0 = self.run_obs_norm.normalize(tensor_obs0)
            tensor_obs1 = self.run_obs_norm.normalize(tensor_obs1)

        # Prepare for the target q batch
        with torch.no_grad():
            next_q_values = self.critic_target([
                tensor_obs1,
                self.actor_target(tensor_obs1),
            ])
            target_q_batch = batch['rewards'] + \
                self.discount * (1 - batch['terminals1']) * next_q_values

        # Critic update
        self.critic.zero_grad()
        q_batch = self.critic([tensor_obs0, batch['actions']])
        value_loss = nn.functional.mse_loss(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()
        if self.SGLD_mode in (2, 3):
            SGLD_update(self.critic, self.critic_lr * self.lr_coef,
                        self.SGLD_coef)

        # Actor update
        self.actor.zero_grad()
        policy_loss = -self.critic([tensor_obs0, self.actor(tensor_obs0)])
        policy_loss = policy_loss.mean()
        policy_loss.backward()
        self.actor_optim.step()
        if self.SGLD_mode in (1, 3):
            SGLD_update(self.actor, self.actor_lr * self.lr_coef,
                        self.SGLD_coef)

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

        return value_loss.item(), policy_loss.item()

    def apply_lr_decay(self):
        """Shrink ``lr_coef`` and apply it to the critic optimizer."""
        if self.lr_decay > 0:
            self.lr_coef = self.lr_decay * self.lr_coef / (self.lr_coef +
                                                           self.lr_decay)
            # NOTE(review): only the critic optimizer's rate is rescaled,
            # although update() scales the actor's SGLD step by lr_coef too
            # -- confirm the actor rate is meant to stay constant.
            self.critic_optim.param_groups[0][
                'lr'] = self.critic_lr * self.lr_coef

    def apply_noise_decay(self):
        """Shrink ``noise_coef`` by the harmonic rule c <- d*c / (c + d)."""
        if self.noise_decay > 0:
            self.noise_coef = self.noise_decay * self.noise_coef / (
                self.noise_coef + self.noise_decay)

    def select_action(self, random=False, s_t=None, if_noise=True):
        """Choose an action, store it in ``self.a_t`` and return it.

        Args:
            random: draw uniformly from ``[-1, 1)`` instead of using the
                actor.
            s_t: observation for the actor; required unless ``random``.
                Presumably carries a leading batch axis of size 1, since the
                actor output is squeezed on axis 0 -- verify against caller.
            if_noise: add scaled ``action_noise()`` to the actor's output.

        Raises:
            RuntimeError: if ``random`` is false and ``s_t`` is ``None``.
        """
        if random:
            action = np.random.uniform(-1., 1., self.nb_actions)
        else:
            if s_t is None:
                raise RuntimeError(
                    "select_action needs s_t when random is False")
            s_t = torch.tensor(s_t, dtype=torch.float32, requires_grad=False)
            if self.with_cuda:
                s_t = s_t.cuda()
            if self.obs_norm:
                s_t = self.run_obs_norm.normalize(s_t)
            with torch.no_grad():
                action = self.actor(s_t).cpu().numpy().squeeze(0)

            # Logical `and`: a bitwise `&` gives the wrong answer for truthy
            # non-bool flags (e.g. 2 & True == 0).
            if if_noise and self.action_noise is not None:
                action += self.is_training * max(self.noise_coef,
                                                 0) * self.action_noise()
            action = np.clip(action, -1., 1.)

        self.a_t = action
        return action

    def load_weights(self, output):
        """Load pickled actor / critic (and normaliser) from ``output``.

        NOTE(review): this rebinds ``self.actor`` / ``self.critic`` to new
        objects; the optimizers and target networks created in ``__init__``
        still refer to the old ones -- confirm this is only used for
        evaluation.
        """
        self.actor = torch.load('{}/actor.pkl'.format(output))
        self.critic = torch.load('{}/critic.pkl'.format(output))
        if self.obs_norm:
            self.run_obs_norm = torch.load('{}/obs_norm.pkl'.format(output))

    def save_model(self, output):
        """Pickle actor, critic (and normaliser) into directory ``output``."""
        torch.save(self.actor, '{}/actor.pkl'.format(output))
        torch.save(self.critic, '{}/critic.pkl'.format(output))
        if self.obs_norm:
            torch.save(self.run_obs_norm, '{}/obs_norm.pkl'.format(output))

    def get_actor_buffer(self):
        """Serialise the actor into a new ``io.BytesIO`` and return it.

        The stream position is left at the end of the written data.
        """
        buffer = io.BytesIO()
        torch.save(self.actor, buffer)
        return buffer

    def get_norm_param(self):
        """Return the normaliser's ``(mean, var)`` on the CPU.

        Requires ``obs_norm`` to have been enabled at construction.
        """
        return self.run_obs_norm.mean.cpu(), self.run_obs_norm.var.cpu()

    # TODO recode agent pool
    def append_actor(self):
        """Push the actor and its target into the agent pool."""
        self.agent_pool.actor_append(self.actor.state_dict(),
                                     self.actor_target.state_dict())

    def pick_actor(self):
        """Load an actor / target pair obtained from the agent pool."""
        actor, actor_target = self.agent_pool.get_actor()
        self.actor.load_state_dict(actor)
        self.actor_target.load_state_dict(actor_target)

    def append_critic(self):
        """Push the critic and its target into the agent pool."""
        self.agent_pool.critic_append(self.critic.state_dict(),
                                      self.critic_target.state_dict())

    def pick_critic(self):
        """Load a critic / target pair obtained from the agent pool."""
        critic, critic_target = self.agent_pool.get_critic()
        self.critic.load_state_dict(critic)
        self.critic_target.load_state_dict(critic_target)

    def append_actor_critic(self):
        """Push actor and critic (with targets) into the agent pool."""
        self.agent_pool.actor_append(self.actor.state_dict(),
                                     self.actor_target.state_dict())
        self.agent_pool.critic_append(self.critic.state_dict(),
                                      self.critic_target.state_dict())

    def pick_actor_critic(self):
        """Load a full agent (both networks and targets) from the pool."""
        actor, actor_target, critic, critic_target = \
            self.agent_pool.get_agent()
        self.actor.load_state_dict(actor)
        self.actor_target.load_state_dict(actor_target)
        self.critic.load_state_dict(critic)
        self.critic_target.load_state_dict(critic_target)

    def append_agent(self):
        """Store the networks selected by ``pool_mode`` in the pool."""
        if self.pool_mode == 1:
            self.append_actor()
        elif self.pool_mode == 2:
            self.append_critic()
        elif self.pool_mode == 3:
            self.append_actor_critic()

    def pick_agent(self):
        """Restore the networks selected by ``pool_mode`` from the pool."""
        if self.pool_mode == 1:
            self.pick_actor()
        elif self.pool_mode == 2:
            self.pick_critic()
        elif self.pool_mode == 3:
            self.pick_actor_critic()

    def reset(self, obs):
        """Start a new episode from ``obs`` and reset exploration noise."""
        self.s_t = obs
        if self.action_noise is not None:
            self.action_noise.reset()
class DDPG(object):
    """DDPG agent skeleton with single- and multi-GPU support.

    Networks are created in double precision. ``update_policy`` is a no-op
    placeholder in this class.
    """

    def __init__(self, args, nb_states, nb_actions):
        """Build networks, optimizers, replay memory and exploration noise.

        Args:
            args: namespace providing ``seed``, ``gpu_nums``, ``hidden1``,
                ``hidden2``, ``init_w``, ``p_lr``, ``c_lr``,
                ``weight_decay``, ``rmsize``, ``window_length``,
                ``ou_theta``, ``ou_mu``, ``ou_sigma``, ``bsize``,
                ``tau_update``, ``gamma`` and ``epsilon``.
            nb_states: observation dimensionality.
            nb_actions: action dimensionality.
        """
        USE_CUDA = torch.cuda.is_available()

        self.nb_states = nb_states
        self.nb_actions = nb_actions
        # [-1] is the sentinel for "no GPU".
        if USE_CUDA and args.gpu_nums > 0:
            self.gpu_ids = list(range(args.gpu_nums))
        else:
            self.gpu_ids = [-1]
        self.gpu_used = self.gpu_ids[0] >= 0

        # Seed only after the GPU attributes exist: seed() reads them.
        if args.seed > 0:
            self.seed(args.seed)

        net_cfg = {
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
            'init_w': args.init_w
        }
        self.actor = Actor(self.nb_states, self.nb_actions,
                           **net_cfg).double()
        self.actor_target = Actor(self.nb_states, self.nb_actions,
                                  **net_cfg).double()
        self.actor_optim = Adam(self.actor.parameters(), lr=args.p_lr,
                                weight_decay=args.weight_decay)

        self.critic = Critic(self.nb_states, self.nb_actions,
                             **net_cfg).double()
        self.critic_target = Critic(self.nb_states, self.nb_actions,
                                    **net_cfg).double()
        self.critic_optim = Adam(self.critic.parameters(), lr=args.c_lr,
                                 weight_decay=args.weight_decay)

        # Make sure target is with the same weight
        hard_update(self.actor_target, self.actor)
        hard_update(self.critic_target, self.critic)

        # Create replay buffer
        self.memory = SequentialMemory(limit=args.rmsize,
                                       window_length=args.window_length)
        self.random_process = OrnsteinUhlenbeckProcess(
            size=self.nb_actions, theta=args.ou_theta, mu=args.ou_mu,
            sigma=args.ou_sigma)

        # Hyper-parameters
        self.batch_size = args.bsize
        self.tau_update = args.tau_update
        self.gamma = args.gamma

        # Linear decay rate of exploration policy
        self.depsilon = 1.0 / args.epsilon
        # initial exploration rate
        self.epsilon = 1.0
        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.is_training = True

        self.continious_action_space = False

    def update_policy(self):
        """Placeholder; performs no update in this class."""
        pass

    def cuda_convert(self):
        """Move the networks to the configured GPU(s), if any."""
        if len(self.gpu_ids) == 1:
            if self.gpu_ids[0] >= 0:
                with torch.cuda.device(self.gpu_ids[0]):
                    print('model cuda converted')
                    self.cuda()
        if len(self.gpu_ids) > 1:
            self.data_parallel()
            self.cuda()
            self.to_device()
            print('model cuda converted and paralleled')

    def eval(self):
        """Put all four networks into evaluation mode."""
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def cuda(self):
        """Move all four networks to the current CUDA device."""
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def data_parallel(self):
        """Wrap all four networks in ``nn.DataParallel`` over ``gpu_ids``."""
        self.actor = nn.DataParallel(self.actor, device_ids=self.gpu_ids)
        self.actor_target = nn.DataParallel(self.actor_target,
                                            device_ids=self.gpu_ids)
        self.critic = nn.DataParallel(self.critic, device_ids=self.gpu_ids)
        self.critic_target = nn.DataParallel(self.critic_target,
                                             device_ids=self.gpu_ids)

    def to_device(self):
        """Move all four networks to the first configured GPU."""
        device = torch.device('cuda:{}'.format(self.gpu_ids[0]))
        self.actor.to(device)
        self.actor_target.to(device)
        self.critic.to(device)
        self.critic_target.to(device)

    def observe(self, r_t, s_t1, done):
        """Store the latest transition (training only) and advance state.

        NOTE(review): neither ``random_action`` nor ``select_action`` sets
        ``self.a_t`` in this class, so the caller presumably assigns it
        before calling ``observe`` -- verify.
        """
        if self.is_training:
            self.memory.append(self.s_t, self.a_t, r_t, done)
            self.s_t = s_t1

    def random_action(self):
        """Return a uniform random action in ``[-1, 1)``."""
        action = np.random.uniform(-1., 1., self.nb_actions)
        return action

    def select_action(self, s_t, decay_epsilon=True):
        """Return the actor's action for ``s_t`` plus scaled OU noise.

        Args:
            s_t: a single observation (batched here before the forward pass).
            decay_epsilon: subtract ``depsilon`` from ``epsilon`` afterwards.
        """
        # proto action
        action = to_numpy(
            self.actor(to_tensor(np.array([s_t]), gpu_used=self.gpu_used,
                                 gpu_0=self.gpu_ids[0])),
            gpu_used=self.gpu_used
        ).squeeze(0)
        action += self.is_training * max(self.epsilon, 0) * \
            self.random_process.sample()
        action = np.clip(action, -1., 1.)

        if decay_epsilon:
            self.epsilon -= self.depsilon

        return action

    def reset(self, s_t):
        """Start a new episode from ``s_t`` and reset the noise process."""
        self.s_t = s_t
        self.random_process.reset_states()

    def load_weights(self, dir):
        """Load actor / critic state dicts from ``output/<dir>/``.

        Does nothing when ``dir`` is ``None``.

        NOTE(review): this reads ``actor.pkl`` / ``critic.pkl`` while
        ``save_model`` writes ``actor.pt`` / ``critic.pt`` into a
        caller-supplied directory -- confirm which naming is intended.
        """
        if dir is None:
            return

        if self.gpu_used:
            # load all tensors to the first configured GPU
            # (storage.cuda expects a single device index, not the id list)
            ml = lambda storage, loc: storage.cuda(self.gpu_ids[0])
        else:
            # load all tensors to CPU
            ml = lambda storage, loc: storage

        self.actor.load_state_dict(
            torch.load('output/{}/actor.pkl'.format(dir), map_location=ml)
        )
        self.critic.load_state_dict(
            torch.load('output/{}/critic.pkl'.format(dir), map_location=ml)
        )
        print('model weights loaded')

    def save_model(self, output):
        """Write actor and critic state dicts into directory ``output``.

        With several GPUs the networks are ``DataParallel`` wrappers, so the
        wrapped ``.module`` is saved to keep parameter names unprefixed.
        """
        actor_path = '{}/actor.pt'.format(output)
        critic_path = '{}/critic.pt'.format(output)

        if len(self.gpu_ids) > 1:
            torch.save(self.actor.module.state_dict(), actor_path)
            # The critic file must contain the critic's weights.
            torch.save(self.critic.module.state_dict(), critic_path)
        elif len(self.gpu_ids) == 1 and self.gpu_ids[0] > 0:
            with torch.cuda.device(self.gpu_ids[0]):
                torch.save(self.actor.state_dict(), actor_path)
                torch.save(self.critic.state_dict(), critic_path)
        else:
            torch.save(self.actor.state_dict(), actor_path)
            torch.save(self.critic.state_dict(), critic_path)

    def seed(self, seed):
        """Seed torch's CPU generator and, when a GPU is used, all GPUs."""
        torch.manual_seed(seed)
        if self.gpu_used:
            torch.cuda.manual_seed_all(seed)
class DDPG(object):
    """DDPG agent that can expose a discrete action via ``argmax``.

    The actor always produces a continuous vector; when ``discrete`` is set,
    the action-returning methods return the index of its largest entry
    instead.
    """

    def __init__(self, nb_states, nb_actions, args, discrete, use_cuda=False):
        """Build networks, optimizers, replay memory and exploration noise.

        Args:
            nb_states: per-frame observation dimensionality; the networks
                take ``nb_states * args.window_length`` inputs.
            nb_actions: action dimensionality.
            args: namespace providing ``seed``, ``hidden1``, ``hidden2``,
                ``init_w``, ``window_length``, ``prate``, ``rate``,
                ``rmsize``, ``bsize``, ``tau``, ``discount``, ``epsilon``.
            discrete: return ``argmax`` indices instead of action vectors.
            use_cuda: move the networks to the GPU.
        """
        # Must be set before seeding: seed() reads self.use_cuda.
        self.use_cuda = use_cuda

        if args.seed > 0:
            self.seed(args.seed)

        self.nb_states = nb_states
        self.nb_actions = nb_actions
        self.discrete = discrete

        # Create Actor and Critic Network
        net_cfg = {
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
            'init_w': args.init_w
        }
        input_size = self.nb_states * args.window_length
        self.actor = Actor(input_size, self.nb_actions, **net_cfg)
        self.actor_target = Actor(input_size, self.nb_actions, **net_cfg)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(input_size, self.nb_actions, **net_cfg)
        self.critic_target = Critic(input_size, self.nb_actions, **net_cfg)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        # Make sure target is with the same weight
        hard_update(self.actor_target, self.actor)
        hard_update(self.critic_target, self.critic)

        # Create replay buffer
        self.memory = rpm(args.rmsize)
        self.random_process = Myrandom(size=nb_actions)

        # Hyper-parameters
        self.batch_size = args.bsize
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        self.epsilon = 1.0
        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action

        if self.use_cuda:
            self.cuda()

    def update_policy(self, train_actor=True):
        """Run one critic step, optionally one actor step, then soft updates.

        Args:
            train_actor: when false the actor gradient is still computed but
                the actor optimizer is not stepped.

        Returns:
            ``(-policy_loss, value_loss)``.
        """
        # Sample batch
        state_batch, action_batch, reward_batch, \
            next_state_batch, terminal_batch = \
            self.memory.sample_batch(self.batch_size)

        # Prepare for the target q batch
        next_q_values = self.critic_target([
            to_tensor(next_state_batch, volatile=True),
            self.actor_target(to_tensor(next_state_batch, volatile=True)),
        ])
        next_q_values.volatile = False

        # `float` is what the removed alias `np.float` used to refer to.
        not_terminal = to_tensor(1 - terminal_batch.astype(float))
        target_q_batch = to_tensor(reward_batch) + \
            self.discount * not_terminal * next_q_values

        # Critic update
        self.critic.zero_grad()

        q_batch = self.critic(
            [to_tensor(state_batch), to_tensor(action_batch)])

        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()

        # Actor update
        self.actor.zero_grad()

        policy_loss = -self.critic([
            to_tensor(state_batch),
            self.actor(to_tensor(state_batch))
        ])

        policy_loss = policy_loss.mean()
        policy_loss.backward()
        if train_actor:
            self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

        return -policy_loss, value_loss

    def eval(self):
        """Put all four networks into evaluation mode."""
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def cuda(self):
        """Move all four networks to the GPU."""
        print("use cuda")
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        """Append ``[s_t, a_t, r_t, s_t1, done]`` to memory; advance state."""
        self.memory.append([self.s_t, self.a_t, r_t, s_t1, done])
        self.s_t = s_t1

    def random_action(self):
        """Draw a uniform action in ``[-1, 1)`` and store it in ``a_t``.

        Returns:
            The argmax index when ``discrete``, else the action vector.
        """
        action = np.random.uniform(-1., 1., self.nb_actions)
        self.a_t = action
        if self.discrete:
            return action.argmax()
        return action

    def select_action(self, s_t, decay_epsilon=True, return_fix=False,
                      noise_level=1):
        """Blend the actor's action with exploration noise and return it.

        The result is ``action * (1 - k) + noise * k`` with
        ``k = noise_level * max(epsilon, 0)``, clipped to ``[-1, 1]`` and
        stored in ``a_t``.

        Args:
            s_t: a single observation (batched here before the forward pass).
            decay_epsilon: subtract ``depsilon`` from ``epsilon`` afterwards.
            return_fix: always return the action vector, even if
                ``discrete``.
            noise_level: extra multiplier on the noise weight.
        """
        action = to_numpy(
            self.actor(to_tensor(np.array([s_t])))
        ).squeeze(0)
        noise_level = noise_level * max(self.epsilon, 0)

        action = action * (1 - noise_level) + \
            (self.random_process.sample() * noise_level)
        action = np.clip(action, -1., 1.)

        if decay_epsilon:
            self.epsilon -= self.depsilon

        self.a_t = action
        if return_fix:
            return action
        if self.discrete:
            return action.argmax()
        return action

    def reset(self, obs):
        """Start a new episode from ``obs`` and reset the noise process."""
        self.s_t = obs
        self.random_process.reset_states()

    def load_weights(self, output):
        """Load actor / critic state dicts from ``output``; no-op on None."""
        if output is None:
            return

        self.actor.load_state_dict(
            torch.load('{}/actor.pkl'.format(output))
        )

        self.critic.load_state_dict(
            torch.load('{}/critic.pkl'.format(output))
        )

    def save_model(self, output):
        """Save actor / critic state dicts to ``output`` as CPU tensors.

        When running on the GPU the two networks are moved to the CPU for
        saving and moved back afterwards.
        """
        if self.use_cuda:
            self.actor.cpu()
            self.critic.cpu()
        torch.save(
            self.actor.state_dict(),
            '{}/actor.pkl'.format(output)
        )
        torch.save(
            self.critic.state_dict(),
            '{}/critic.pkl'.format(output)
        )
        if self.use_cuda:
            self.actor.cuda()
            self.critic.cuda()

    def seed(self, s):
        """Seed torch's CPU generator and, if enabled, the CUDA one."""
        torch.manual_seed(s)
        if self.use_cuda:
            torch.cuda.manual_seed(s)
def main():
    """Run the training loop with an n-step replay buffer and extra
    actor regularisers.

    All settings are read from the module-level ``args``. Each of the
    ``num_updates`` iterations:

    1. steps the vectorised environments ``args.num_steps`` times with a
       sampling temperature that shrinks at every step;
    2. stores one summary transition of the rollout in the replay buffer;
    3. once the buffer holds ``args.batch_size`` entries, updates the critic
       (optionally through a gradient penalty), updates the actor with an
       entropy term and a penalty on its distance to the target actor,
       optionally trains a baseline critic, and soft-updates the targets;
    4. periodically saves the actor and prints statistics.

    Relies on names defined elsewhere in the module: ``args``,
    ``num_updates``, ``criterion``, ``to_tensor``, ``soft_update`` and the
    network / storage classes.

    Raises:
        NotImplementedError: if the observation space is not 3-D (image),
            because networks are only built for that case.
    """
    print("#######")
    print(
        "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards"
    )
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'

    # logger = Logger(environment_name = args.env_name, entropy_coff= 'entropy_coeff_' + str(args.entropy_coef), folder = args.folder)
    # logger.save_args(args)

    if args.vis:
        from visdom import Visdom
        viz = Visdom()
        win = None

    envs = [
        make_env(args.env_name, args.seed, i, args.log_dir)
        for i in range(args.num_processes)
    ]

    # Subprocess workers only when more than one environment is requested.
    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs)

    # Frame stacking multiplies the leading (channel) dimension.
    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    # shape == 3: image observations (e.g. ALE environments)
    if len(envs.observation_space.shape) == 3:
        actor = Actor(obs_shape[0], envs.action_space, args.recurrent_policy,
                      envs.action_space.n)
        target_actor = Actor(obs_shape[0], envs.action_space,
                             args.recurrent_policy, envs.action_space.n)
        critic = Critic(in_channels=4, num_actions=envs.action_space.n)
        critic_target = Critic(in_channels=4, num_actions=envs.action_space.n)
        baseline_target = Baseline_Critic(in_channels=4,
                                          num_actions=envs.action_space.n)
    else:
        # No networks are built for other observation shapes, so everything
        # below would fail on an unbound `actor`; fail fast instead.
        raise NotImplementedError(
            "Only 3-D (image) observations are supported")

    if args.cuda:
        actor.cuda()
        critic.cuda()
        critic_target.cuda()
        target_actor.cuda()
        baseline_target.cuda()

    actor_optim = optim.Adam(actor.parameters(), lr=args.actor_lr)
    critic_optim = optim.Adam(critic.parameters(), lr=args.critic_lr)
    # Optimises the baseline critic whose loss it is stepped on below.
    baseline_optim = optim.Adam(baseline_target.parameters(), lr=1e-4)
    tau_soft_update = 0.001

    mem_buffer = ReplayBuffer()

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space, actor.state_size,
                              envs.action_space.n)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        """Shift the frame stack in ``current_obs`` and append ``obs``."""
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()
    for j in range(num_updates):
        temperature = 1.0

        for step in range(args.num_steps):
            # Divides by 1, 2, 3, ... so the temperature shrinks each step.
            temperature = temperature / (step + 1)
            # Sample actions
            action, action_log_prob, states, dist_entropy = actor.act(
                Variable(rollouts.observations[step], volatile=True),
                Variable(rollouts.states[step], volatile=True),
                Variable(rollouts.masks[step], volatile=True), temperature,
                envs.action_space.n, args.num_processes)
            value = critic.forward(
                Variable(rollouts.observations[step], volatile=True),
                action_log_prob)
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Observe reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(
                np.expand_dims(np.stack(reward), 1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor(
                [[0.0] if done_ else [1.0] for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(step, current_obs, states.data, action.data,
                            action_log_prob.data, dist_entropy.data,
                            value.data, reward, masks)

        # One replay entry per rollout: first state, last state, first action.
        # NOTE(review): rollouts.returns is read here before compute_returns()
        # runs for this rollout -- confirm the stored return is not stale.
        nth_step_return = rollouts.returns[0].cpu().numpy()
        current_state = rollouts.observations[0].cpu().numpy()
        nth_state = rollouts.observations[-1].cpu().numpy()
        current_action = rollouts.action_log_probs[0].cpu().numpy()
        current_action_dist_entropy = rollouts.dist_entropy[0].cpu().numpy()

        mem_buffer.add((current_state, nth_state, current_action,
                        nth_step_return, done, current_action_dist_entropy))

        # Bootstrap value for the state that follows the last stored step.
        action, action_log_prob, states, dist_entropy = actor.act(
            Variable(rollouts.observations[-1], volatile=True),
            Variable(rollouts.states[-1], volatile=True),
            Variable(rollouts.masks[-1], volatile=True), temperature,
            envs.action_space.n, args.num_processes)
        next_value = critic.forward(
            Variable(rollouts.observations[-1], volatile=True),
            action_log_prob).data

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        bs_size = args.batch_size
        if len(mem_buffer.storage) >= bs_size:
            # Sample from the replay buffer.
            state, next_state, action, returns, done, entropy_log_prob = \
                mem_buffer.sample(bs_size)

            next_state = next_state.reshape([-1, *obs_shape])
            state = state.reshape([-1, *obs_shape])
            action = action.reshape([-1, envs.action_space.n])

            # current Q estimate
            q_batch = critic(to_tensor(state), to_tensor(action))
            # target Q estimate
            next_state_action_probs = target_actor(
                to_tensor(next_state, volatile=True),
                to_tensor(next_state, volatile=True),
                to_tensor(next_state, volatile=True))

            next_q_values = critic_target(
                to_tensor(next_state, volatile=True),
                next_state_action_probs[1])
            next_q_values.volatile = False
            # NOTE(review): the bootstrap term is multiplied by `done`, not
            # `1 - done`; correct only if the buffer returns a "not terminal"
            # flag -- confirm.
            # `float` is what the removed alias `np.float` used to refer to.
            target_q_batch = to_tensor(returns) + args.gamma * to_tensor(
                done.astype(float)) * next_q_values

            # ---- Critic update ----
            critic.zero_grad()
            value_loss = criterion(q_batch, target_q_batch)

            if args.gradient_penalty == True:
                # NOTE(review): only the penalty is back-propagated here;
                # value_loss itself never is -- confirm that is intended.
                gradients = torch.autograd.grad(
                    value_loss, critic.parameters(), allow_unused=True,
                    retain_graph=True, create_graph=True,
                    only_inputs=True)[0]
                gradient_penalty = ((gradients.norm(2, dim=1) - 1)**2
                                    ).mean() * args.lambda_grad_penalty
                gradient_penalty.backward()
            else:
                value_loss.backward()

            critic_optim.step()

            # ---- Actor update ----
            actor.zero_grad()
            policy_loss = -critic(
                to_tensor(state),
                actor(to_tensor(state), to_tensor(state),
                      to_tensor(state))[0])

            # Soft trust region constraint for the actor
            current_action_probs = actor(to_tensor(state, volatile=False),
                                         to_tensor(state, volatile=False),
                                         to_tensor(state, volatile=False))[0]
            target_action_probs = target_actor(
                to_tensor(state, volatile=True),
                to_tensor(state, volatile=True),
                to_tensor(state, volatile=True))[0]

            policy_regularizer = criterion(current_action_probs,
                                           target_action_probs)

            # Actor loss with entropy term. The term is moved to the GPU
            # only when CUDA is enabled, so CPU runs work too.
            entropy_term = Variable(
                torch.from_numpy(
                    np.expand_dims(entropy_log_prob.mean(), axis=0)))
            if args.cuda:
                entropy_term = entropy_term.cuda()
            policy_loss = policy_loss.mean() \
                - args.entropy_coef * entropy_term \
                + args.actor_kl_lambda * policy_regularizer

            if args.actor_several_updates == True:
                # Repeated backward passes accumulate the same gradient.
                for _ in range(args.actor_updates):
                    policy_loss.backward(retain_graph=True)
            else:
                policy_loss.backward()

            # clipping of gradient norms
            gradient_norms = nn.utils.clip_grad_norm(actor.parameters(),
                                                     args.max_grad_norm)
            print("gradient_norms", gradient_norms)
            actor_optim.step()

            if args.second_order_grads == True:
                # Training the Baseline critic (f(s, mu(s)))
                baseline_target.zero_grad()
                # f(s, mu(s))
                current_baseline = baseline_target(
                    to_tensor(state),
                    actor(to_tensor(state), to_tensor(state),
                          to_tensor(state))[0])

                # grad f(s, a)
                grad_baseline_params = torch.autograd.grad(
                    current_baseline.mean(), actor.parameters(),
                    retain_graph=True, create_graph=True)

                # MSE : (Q - f)^{2}
                baseline_loss = (q_batch.detach() -
                                 current_baseline).pow(2).mean()
                actor.zero_grad()
                baseline_target.zero_grad()
                grad_norm = 0
                # NOTE(review): `grad_params` is never assigned in this
                # function; unless it is a module-level global this branch
                # raises NameError. The sum below can also go negative before
                # the sqrt -- both need the author's intent to fix.
                for grad_1, grad_2 in zip(grad_params, grad_baseline_params):
                    grad_norm += grad_1.data.pow(2).sum() - \
                        grad_2.pow(2).sum()
                grad_norm = grad_norm.sqrt()

                # Loss for the Baseline approximator (f)
                overall_loss = baseline_loss + \
                    args.lambda_second_order_grads * grad_norm
                overall_loss.backward()
                baseline_optim.step()

            soft_update(target_actor, actor, tau_soft_update)
            soft_update(critic_target, critic, tau_soft_update)

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "" and len(
                mem_buffer.storage) >= bs_size:
            save_path = os.path.join(args.save_dir, args.algo)
            os.makedirs(save_path, exist_ok=True)

            # A really ugly way to save a model to CPU
            save_model = actor
            if args.cuda:
                save_model = copy.deepcopy(actor).cpu()

            save_model = [save_model, getattr(envs, 'ob_rms', None) or None]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0 and len(mem_buffer.storage) >= bs_size:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, value loss {:.5f}, policy loss {:.5f}, Entropy {:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        final_rewards.mean(), final_rewards.median(),
                        final_rewards.min(), final_rewards.max(),
                        value_loss.data.cpu().numpy()[0],
                        policy_loss.data.cpu().numpy()[0],
                        entropy_log_prob.mean()))

            # Single-element lists in the form the (disabled) logger expects.
            final_rewards_mean = [final_rewards.mean()]
            final_rewards_median = [final_rewards.median()]
            final_rewards_min = [final_rewards.min()]
            final_rewards_max = [final_rewards.max()]
            all_value_loss = [value_loss.data.cpu().numpy()[0]]
            all_policy_loss = [policy_loss.data.cpu().numpy()[0]]
            # logger.record_data(final_rewards_mean, final_rewards_median, final_rewards_min, final_rewards_max, all_value_loss, all_policy_loss)
            # logger.save()

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo)
            except IOError:
                pass
class DDPG(object):
    """DDPG agent configured from module-level hyper-parameter constants.

    Holds an actor/critic pair with their target copies, a replay memory and
    an exploration-noise process, and exposes the usual
    observe / select_action / update_policy loop.
    """

    def __init__(self, nb_states, nb_actions):
        """Build networks, optimizers, replay memory and the noise process.

        :param nb_states: value forwarded to ``Actor`` / ``Critic`` as the
            state size.
        :param nb_actions: value forwarded to ``Actor`` / ``Critic`` as the
            action size; also the size of the noise process.
        """
        self.nb_states = nb_states
        self.nb_actions = nb_actions

        # Create Actor and Critic Network
        self.actor = Actor(self.nb_states, self.nb_actions)
        self.actor_target = Actor(self.nb_states, self.nb_actions)
        self.actor_optim = Adam(self.actor.parameters(), lr=ACTOR_LR)

        self.critic = Critic(self.nb_states, self.nb_actions)
        self.critic_target = Critic(self.nb_states, self.nb_actions)
        self.critic_optim = Adam(self.critic.parameters(), lr=CRITIC_LR)

        # Make sure the targets start with the same weights.
        hard_update(self.actor_target, self.actor)
        hard_update(self.critic_target, self.critic)

        # Create replay buffer
        self.memory = SequentialMemory(limit=MEMORY_SIZE,
                                       window_length=HISTORY_LEN)
        self.random_process = OrnsteinUhlenbeckProcess(size=nb_actions,
                                                       theta=OU_THETA,
                                                       mu=OU_MU,
                                                       sigma=OU_SIGMA)

        # Hyper-parameters
        self.batch_size = BATCH_SIZE
        self.tau = TAU
        self.discount = GAMMA
        self.depsilon = 1.0 / DEPSILON  # per-call decrement of epsilon

        self.epsilon = 1.0
        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.is_training = True

        if USE_CUDA:
            self.cuda()

    def update_policy(self):
        """Run one critic update, one actor update and soft target updates."""
        # Sample batch
        state_batch, action_batch, reward_batch, \
            next_state_batch, terminal_batch = \
            self.memory.sample_and_split(self.batch_size)

        # Prepare for the target q batch
        next_q_values = self.critic_target([
            to_tensor(next_state_batch, volatile=True),
            self.actor_target(to_tensor(next_state_batch, volatile=True)),
        ])[:, 0]
        next_q_values.volatile = False

        # NOTE(review): terminal_batch scales the bootstrap term directly, so
        # it has to be 1.0 for non-terminal transitions -- confirm against
        # SequentialMemory.sample_and_split.
        # Builtin float replaces the np.float alias (removed in NumPy 1.24).
        target_q_batch = to_tensor(reward_batch) + \
            self.discount * to_tensor(terminal_batch.astype(float)) * \
            next_q_values

        # Critic update
        self.critic.zero_grad()
        q_batch = self.critic(
            [to_tensor(state_batch), to_tensor(action_batch)])
        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        torch.nn.utils.clip_grad_norm(self.critic.parameters(), 10.0)
        # NOTE(review): the parameters receive a manual gradient step here and
        # then an Adam step from the same gradients -- confirm that the double
        # update is intended.
        for p in self.critic.parameters():
            p.data.add_(-CRITIC_LR, p.grad.data)
        self.critic_optim.step()

        # Actor update
        self.actor.zero_grad()
        policy_loss = -self.critic(
            [to_tensor(state_batch), self.actor(to_tensor(state_batch))])
        policy_loss = policy_loss.mean()
        policy_loss.backward()
        torch.nn.utils.clip_grad_norm(self.actor.parameters(), 10.0)
        # NOTE(review): same manual-step-plus-Adam pattern as the critic.
        for p in self.actor.parameters():
            p.data.add_(-ACTOR_LR, p.grad.data)
        self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

    def eval(self):
        """Put all four networks into evaluation mode."""
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def cuda(self):
        """Move all four networks to the GPU."""
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        """Store the last (state, action) with its outcome and advance state.

        The transition is only appended to memory while ``is_training`` is
        true; ``s_t`` is updated to ``s_t1`` either way.
        """
        if self.is_training:
            self.memory.append(self.s_t, self.a_t, r_t, done)
        self.s_t = s_t1

    def random_action(self):
        """Return a uniform random action in [-1, 1) and remember it."""
        action = np.random.uniform(-1., 1., self.nb_actions)
        self.a_t = action
        return action

    def select_action(self, s_t, decay_epsilon=True):
        """Return the actor's action for ``s_t`` with exploration noise.

        :param s_t: current state.
        :param decay_epsilon: when true, reduce ``epsilon`` by ``depsilon``
            after this call.
        :return: the noisy action clipped to [-1, 1]; also stored in ``a_t``.
        """
        action = to_numpy(self.actor(to_tensor(np.array([s_t]))))[0]
        ou = self.random_process.sample()

        prGreen('eps:{}, act:{}, random:{}'.format(self.epsilon, action, ou))
        # Noise is disabled outside training and once epsilon drops below 0.
        action += self.is_training * max(self.epsilon, 0) * ou
        action = np.clip(action, -1., 1.)

        if decay_epsilon:
            self.epsilon -= self.depsilon

        self.a_t = action
        return action

    def reset(self, obs):
        """Start a new episode from ``obs`` and reset the noise process."""
        self.s_t = obs
        self.random_process.reset_states()

    def load_weights(self, output):
        """Load actor and critic weights from ``output``; no-op if None.

        The target networks are not reloaded here.
        """
        if output is None:
            return

        self.actor.load_state_dict(
            torch.load('{}/actor.pkl'.format(output)))
        self.critic.load_state_dict(
            torch.load('{}/critic.pkl'.format(output)))

    def save_model(self, output):
        """Save actor and critic state dicts into directory ``output``."""
        torch.save(self.actor.state_dict(), '{}/actor.pkl'.format(output))
        torch.save(self.critic.state_dict(), '{}/critic.pkl'.format(output))

    def seed(self, s):
        """Seed torch (and the CUDA generator when ``USE_CUDA`` is set)."""
        torch.manual_seed(s)
        if USE_CUDA:
            torch.cuda.manual_seed(s)
def main(args):
    """Sample and/or interpolate sentences from a trained SentenceVAE.

    Loads the vocabulary and the VAE checkpoint, optionally an actor used to
    condition latent codes, decodes sentences for the requested modes
    (``args.sample``, ``args.interpolate``), pickles the results under
    ``samples/`` and finally drops into an IPython shell.

    :param args: parsed options; the attributes read here include
        ``data_dir``, ``load_vae``, ``load_actor``, ``constraint_mode``,
        ``sample``, ``interpolate``, ``num_samples``, ``latent_size``,
        ``n_tags`` and the SentenceVAE model hyper-parameters.
    :raises FileNotFoundError: if ``args.load_vae`` (or ``args.load_actor``
        in constraint mode) does not exist.
    """
    with open(args.data_dir+'/ptb.vocab.json', 'r') as file:
        vocab = json.load(file)

    # required to map between integer-value sentences and real sentences
    w2i, i2w = vocab['w2i'], vocab['i2w']

    # make sure our models for the VAE and Actor exist
    if not os.path.exists(args.load_vae):
        raise FileNotFoundError(args.load_vae)

    model = SentenceVAE(
        vocab_size=len(w2i),
        sos_idx=w2i['<sos>'],
        eos_idx=w2i['<eos>'],
        pad_idx=w2i['<pad>'],
        unk_idx=w2i['<unk>'],
        max_sequence_length=args.max_sequence_length,
        embedding_size=args.embedding_size,
        rnn_type=args.rnn_type,
        hidden_size=args.hidden_size,
        word_dropout=args.word_dropout,
        embedding_dropout=args.embedding_dropout,
        latent_size=args.latent_size,
        num_layers=args.num_layers,
        bidirectional=args.bidirectional
    )

    # map_location keeps the checkpoint tensors on the CPU while loading
    model.load_state_dict(
        torch.load(args.load_vae, map_location=lambda storage, loc: storage))
    model.eval()
    print("vae model loaded from %s"%(args.load_vae))

    # to run in constraint mode, we need the trained generator
    if args.constraint_mode:
        if not os.path.exists(args.load_actor):
            raise FileNotFoundError(args.load_actor)

        actor = Actor(
            dim_z=args.latent_size, dim_model=2048, num_labels=args.n_tags)
        actor.load_state_dict(
            torch.load(args.load_actor,
                       map_location=lambda storage, loc: storage))
        actor.eval()
        print("actor model loaded from %s"%(args.load_actor))

    # Evaluated once so that the models and the label tensors built below
    # always follow the same placement rule.
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        model = model.cuda()
        if args.constraint_mode:
            actor = actor.cuda()  # TODO: to(self.devices)

    if args.sample:
        print('*** SAMPLE Z: ***')
        # get samples from the prior
        sample_sents, z = model.inference(n=args.num_samples)
        sample_sents, sample_tags = get_sents_and_tags(sample_sents, i2w, w2i)
        pickle_it(z.cpu().numpy(),
                  'samples/z_sample_n{}.pkl'.format(args.num_samples))
        pickle_it(sample_sents,
                  'samples/sents_sample_n{}.pkl'.format(args.num_samples))
        pickle_it(sample_tags,
                  'samples/tags_sample_n{}.pkl'.format(args.num_samples))
        # NOTE(review): sep has no effect with a single positional argument;
        # if one sentence per line is wanted this needs unpacking -- confirm
        # what get_sents_and_tags returns.
        print(sample_sents, sep='\n')

        if args.constraint_mode:
            print('*** SAMPLE Z_PRIME: ***')
            # get samples from the prior, conditioned via the actor
            all_tags_sample_prime = []
            all_sents_sample_prime = {}
            all_z_sample_prime = {}
            for i, condition in enumerate(LABELS):
                # binary vector denoting each of the PHRASE_TAGS
                labels = torch.Tensor(condition).repeat(args.num_samples, 1)
                # Previously moved with an unconditional .cuda(), which
                # fails on CPU-only hosts where the models stay on the CPU.
                if use_cuda:
                    labels = labels.cuda()

                # take z and manipulate using the actor to generate z_prime
                z_prime = actor.forward(z, labels)

                sample_sents_prime, z_prime = model.inference(
                    z=z_prime, n=args.num_samples)
                sample_sents_prime, sample_tags_prime = get_sents_and_tags(
                    sample_sents_prime, i2w, w2i)
                print('conditoned on: {}'.format(condition))
                print(sample_sents_prime, sep='\n')
                all_tags_sample_prime.append(sample_tags_prime)
                all_sents_sample_prime[LABEL_NAMES[i]] = sample_sents_prime
                all_z_sample_prime[LABEL_NAMES[i]] = \
                    z_prime.data.cpu().numpy()
            pickle_it(
                all_tags_sample_prime,
                'samples/tags_sample_prime_n{}.pkl'.format(args.num_samples))
            pickle_it(
                all_sents_sample_prime,
                'samples/sents_sample_prime_n{}.pkl'.format(args.num_samples))
            pickle_it(
                all_z_sample_prime,
                'samples/z_sample_prime_n{}.pkl'.format(args.num_samples))

    if args.interpolate:
        # get random samples from the latent space
        z1 = torch.randn([args.latent_size]).numpy()
        z2 = torch.randn([args.latent_size]).numpy()
        z = to_var(torch.from_numpy(
            interpolate(start=z1, end=z2,
                        steps=args.num_samples-2)).float())

        print('*** INTERP Z: ***')
        interp_sents, _ = model.inference(z=z)
        interp_sents, interp_tags = get_sents_and_tags(interp_sents, i2w, w2i)
        pickle_it(z.cpu().numpy(),
                  'samples/z_interp_n{}.pkl'.format(args.num_samples))
        pickle_it(interp_sents,
                  'samples/sents_interp_n{}.pkl'.format(args.num_samples))
        pickle_it(interp_tags,
                  'samples/tags_interp_n{}.pkl'.format(args.num_samples))
        print(interp_sents, sep='\n')

        if args.constraint_mode:
            print('*** INTERP Z_PRIME: ***')
            all_tags_interp_prime = []
            all_sents_interp_prime = {}
            all_z_interp_prime = {}
            for i, condition in enumerate(LABELS):
                # binary vector denoting each of the PHRASE_TAGS
                labels = torch.Tensor(condition).repeat(args.num_samples, 1)
                if use_cuda:
                    labels = labels.cuda()

                # z prime conditioned on this particular binary variable
                z_prime = actor.forward(z, labels)

                interp_sents_prime, z_prime = model.inference(
                    z=z_prime, n=args.num_samples)
                interp_sents_prime, interp_tags_prime = get_sents_and_tags(
                    interp_sents_prime, i2w, w2i)
                print('conditoned on: {}'.format(condition))
                print(interp_sents_prime, sep='\n')
                all_tags_interp_prime.append(interp_tags_prime)
                all_sents_interp_prime[LABEL_NAMES[i]] = interp_sents_prime
                all_z_interp_prime[LABEL_NAMES[i]] = \
                    z_prime.data.cpu().numpy()
            pickle_it(
                all_tags_interp_prime,
                'samples/tags_interp_prime_n{}.pkl'.format(args.num_samples))
            pickle_it(
                all_sents_interp_prime,
                'samples/sents_interp_prime_n{}.pkl'.format(args.num_samples))
            pickle_it(
                all_z_interp_prime,
                'samples/z_interp_prime_n{}.pkl'.format(args.num_samples))

    # NOTE(review): always opens an interactive shell with the locals above
    # in scope -- confirm this is wanted outside of debugging sessions.
    import IPython; IPython.embed()
class DDPG:
    """DDPG agent for a goal-conditioned environment, with optional HER.

    The environment is expected to return observations as a pair whose first
    element is the network input and whose second element is the achieved
    goal (see how ``train`` and ``rollout`` index ``obs``).
    """

    def __init__(self, env, args):
        """Build the networks and, outside test mode, the training state.

        :param env: environment exposing ``observation_space``, ``goal_dim``
            and the reset/step/reward helpers used by ``train``/``rollout``.
        :param args: parsed options holding the hyper-parameters read below.
        """
        ob_space = env.observation_space
        goal_dim = env.goal_dim
        ob_dim = ob_space.shape[0]
        self.ob_dim = ob_dim
        self.ac_dim = ac_dim = 7  # action size is fixed in this class
        self.goal_dim = goal_dim

        self.num_iters = args.num_iters
        self.random_prob = args.random_prob
        self.tau = args.tau
        self.reward_scale = args.reward_scale
        self.gamma = args.gamma

        self.log_interval = args.log_interval
        self.save_interval = args.save_interval
        self.rollout_steps = args.rollout_steps
        self.env = env
        self.batch_size = args.batch_size
        self.train_steps = args.train_steps
        self.closest_dist = np.inf  # best mean final distance seen so far
        self.warmup_iter = args.warmup_iter
        self.max_grad_norm = args.max_grad_norm
        self.use_her = args.her
        self.k_future = args.k_future
        self.model_dir = os.path.join(args.save_dir, 'model')
        self.pretrain_dir = args.pretrain_dir
        os.makedirs(self.model_dir, exist_ok=True)
        self.global_step = 0

        self.actor = Actor(ob_dim=ob_dim,
                           act_dim=ac_dim,
                           hid1_dim=args.hid1_dim,
                           hid2_dim=args.hid2_dim,
                           hid3_dim=args.hid3_dim,
                           init_method=args.init_method)
        self.critic = Critic(ob_dim=ob_dim,
                             act_dim=ac_dim,
                             hid1_dim=args.hid1_dim,
                             hid2_dim=args.hid2_dim,
                             hid3_dim=args.hid3_dim,
                             init_method=args.init_method)
        if args.resume or args.test or args.pretrain_dir is not None:
            self.load_model(args.resume_step, pretrain_dir=args.pretrain_dir)

        if not args.test:
            # Training-only components: targets, optimizers, exploration
            # noise, replay memory and the critic loss.
            self.actor_target = Actor(ob_dim=ob_dim,
                                      act_dim=ac_dim,
                                      hid1_dim=args.hid1_dim,
                                      hid2_dim=args.hid2_dim,
                                      hid3_dim=args.hid3_dim,
                                      init_method=args.init_method)
            self.critic_target = Critic(ob_dim=ob_dim,
                                        act_dim=ac_dim,
                                        hid1_dim=args.hid1_dim,
                                        hid2_dim=args.hid2_dim,
                                        hid3_dim=args.hid3_dim,
                                        init_method=args.init_method)
            self.actor_optim = self.construct_optim(self.actor,
                                                    lr=args.actor_lr)
            cri_w_decay = args.critic_weight_decay
            self.critic_optim = self.construct_optim(self.critic,
                                                     lr=args.critic_lr,
                                                     weight_decay=cri_w_decay)
            self.hard_update(self.actor_target, self.actor)
            self.hard_update(self.critic_target, self.critic)

            self.actor_target.eval()
            self.critic_target.eval()
            if args.noise_type == 'ou_noise':
                mu = np.zeros(ac_dim)
                sigma = float(args.ou_noise_std) * np.ones(ac_dim)
                self.action_noise = OrnsteinUhlenbeckActionNoise(mu=mu,
                                                                 sigma=sigma)
            elif args.noise_type == 'uniform':
                low_limit = args.uniform_noise_low
                high_limit = args.uniform_noise_high
                dec_step = args.max_noise_dec_step
                self.action_noise = UniformNoise(low_limit=low_limit,
                                                 high_limit=high_limit,
                                                 dec_step=dec_step)
            elif args.noise_type == 'gaussian':
                mu = np.zeros(ac_dim)
                sigma = args.normal_noise_std * np.ones(ac_dim)
                self.action_noise = NormalActionNoise(mu=mu,
                                                      sigma=sigma)

            self.memory = Memory(limit=int(args.memory_limit),
                                 action_shape=(int(ac_dim), ),
                                 observation_shape=(int(ob_dim), ))
            self.critic_loss = nn.MSELoss()

        self.ob_norm = args.ob_norm
        if self.ob_norm:
            self.obs_oms = OnlineMeanStd(shape=(1, ob_dim))
        else:
            self.obs_oms = None

        self.cuda()

    def test(self, render=False, record=True, slow_t=0):
        """Run an evaluation rollout and print the mean final distance."""
        dist, succ_rate = self.rollout(render=render,
                                       record=record,
                                       slow_t=slow_t)
        print('Final step distance: ', dist)

    def train(self):
        """Main loop: one rollout per epoch, then optional network updates.

        Each epoch collects ``rollout_steps`` transitions (random actions
        during warm-up or with probability ``random_prob``), optionally adds
        HER relabelled transitions, trains for ``train_steps`` batches after
        warm-up, and periodically logs, evaluates and checkpoints.
        """
        self.net_mode(train=True)
        tfirststart = time.time()
        # maxlen=1: only the most recent episode is kept for logging.
        epoch_episode_rewards = deque(maxlen=1)
        epoch_episode_steps = deque(maxlen=1)
        total_rollout_steps = 0
        for epoch in range(self.global_step, self.num_iters):
            episode_reward = 0
            episode_step = 0
            self.action_noise.reset()
            obs = self.env.reset()
            obs = obs[0]
            epoch_actor_losses = []
            epoch_critic_losses = []
            if self.use_her:
                ep_experi = {
                    'obs': [],
                    'act': [],
                    'reward': [],
                    'new_obs': [],
                    'ach_goals': [],
                    'done': []
                }
            for t_rollout in range(self.rollout_steps):
                total_rollout_steps += 1
                ran = np.random.random(1)[0]
                if self.pretrain_dir is None and epoch < self.warmup_iter or \
                        ran < self.random_prob:
                    act = self.random_action().flatten()
                else:
                    act = self.policy(obs).flatten()
                new_obs, r, done, info = self.env.step(act)
                ach_goals = new_obs[1].copy()
                new_obs = new_obs[0].copy()
                episode_reward += r
                episode_step += 1
                self.memory.append(obs, act, r * self.reward_scale,
                                   new_obs, ach_goals, done)
                if self.use_her:
                    ep_experi['obs'].append(obs)
                    ep_experi['act'].append(act)
                    ep_experi['reward'].append(r * self.reward_scale)
                    ep_experi['new_obs'].append(new_obs)
                    ep_experi['ach_goals'].append(ach_goals)
                    ep_experi['done'].append(done)
                if self.ob_norm:
                    self.obs_oms.update(new_obs)
                obs = new_obs
            epoch_episode_rewards.append(episode_reward)
            epoch_episode_steps.append(episode_step)
            if self.use_her:
                # Relabel each step with k_future goals: the goal achieved
                # at step t itself plus k_future - 1 later achieved goals.
                for t in range(episode_step - self.k_future):
                    ob = ep_experi['obs'][t]
                    act = ep_experi['act'][t]
                    new_ob = ep_experi['new_obs'][t]
                    ach_goal = ep_experi['ach_goals'][t]
                    k_futures = np.random.choice(
                        np.arange(t + 1, episode_step),
                        self.k_future - 1,
                        replace=False)
                    k_futures = np.concatenate((np.array([t]), k_futures))
                    for future in k_futures:
                        new_goal = ep_experi['ach_goals'][future]
                        # The last goal_dim entries of an observation are
                        # replaced by the substituted goal.
                        her_ob = np.concatenate(
                            (ob[:-self.goal_dim], new_goal), axis=0)
                        her_new_ob = np.concatenate(
                            (new_ob[:-self.goal_dim], new_goal), axis=0)
                        res = self.env.cal_reward(ach_goal.copy(),
                                                  new_goal,
                                                  act)
                        her_reward, _, done = res
                        self.memory.append(her_ob, act,
                                           her_reward * self.reward_scale,
                                           her_new_ob,
                                           ach_goal.copy(), done)
            self.global_step += 1
            if epoch >= self.warmup_iter:
                for t_train in range(self.train_steps):
                    act_loss, cri_loss = self.train_net()
                    epoch_critic_losses.append(cri_loss)
                    epoch_actor_losses.append(act_loss)

            if epoch % self.log_interval == 0:
                tnow = time.time()
                stats = {}
                if self.ob_norm:
                    stats['ob_oms_mean'] = safemean(self.obs_oms.mean.numpy())
                    stats['ob_oms_std'] = safemean(self.obs_oms.std.numpy())
                stats['total_rollout_steps'] = total_rollout_steps
                stats['rollout/return'] = safemean(
                    list(epoch_episode_rewards))
                stats['rollout/ep_steps'] = safemean(
                    list(epoch_episode_steps))
                if epoch >= self.warmup_iter:
                    stats['actor_loss'] = np.mean(epoch_actor_losses)
                    stats['critic_loss'] = np.mean(epoch_critic_losses)
                stats['epoch'] = epoch
                stats['actor_lr'] = self.actor_optim.param_groups[0]['lr']
                stats['critic_lr'] = self.critic_optim.param_groups[0]['lr']
                stats['time_elapsed'] = tnow - tfirststart
                for name, value in stats.items():
                    logger.logkv(name, value)
                logger.dumpkvs()
            if (epoch == 0 or epoch >= self.warmup_iter) and \
                    self.save_interval and \
                    epoch % self.save_interval == 0 and \
                    logger.get_dir():
                mean_final_dist, succ_rate = self.rollout()
                logger.logkv('epoch', epoch)
                logger.logkv('test/total_rollout_steps', total_rollout_steps)
                logger.logkv('test/mean_final_dist', mean_final_dist)
                logger.logkv('test/succ_rate', succ_rate)

                tra_mean_dist, tra_succ_rate = self.rollout(train_test=True)
                logger.logkv('train/mean_final_dist', tra_mean_dist)
                logger.logkv('train/succ_rate', tra_succ_rate)

                # self.log_model_weights()
                logger.dumpkvs()
                if mean_final_dist < self.closest_dist:
                    self.closest_dist = mean_final_dist
                    is_best = True
                else:
                    is_best = False
                self.save_model(is_best=is_best, step=self.global_step)

    def train_net(self):
        """Run one critic and one actor update on a sampled batch.

        :return: ``(actor_loss, critic_loss)`` as numpy values.
        """
        batch_data = self.memory.sample(batch_size=self.batch_size)
        for key, value in batch_data.items():
            batch_data[key] = torch.from_numpy(value)
        obs0_t = batch_data['obs0']
        obs1_t = batch_data['obs1']
        obs0_t = self.normalize(obs0_t, self.obs_oms)
        obs1_t = self.normalize(obs1_t, self.obs_oms)
        obs0 = Variable(obs0_t).float().cuda()
        with torch.no_grad():
            vol_obs1 = Variable(obs1_t).float().cuda()

        rewards = Variable(batch_data['rewards']).float().cuda()
        actions = Variable(batch_data['actions']).float().cuda()
        terminals = Variable(batch_data['terminals1']).float().cuda()

        cri_q_val = self.critic(obs0, actions)
        # The bootstrap target is built without tracking gradients.
        with torch.no_grad():
            target_net_act = self.actor_target(vol_obs1)
            target_net_q_val = self.critic_target(vol_obs1, target_net_act)
            # target_net_q_val.volatile = False
            target_q_label = rewards
            target_q_label += self.gamma * target_net_q_val * (1 - terminals)
            target_q_label = target_q_label.detach()

        self.actor.zero_grad()
        self.critic.zero_grad()
        cri_loss = self.critic_loss(cri_q_val, target_q_label)
        cri_loss.backward()
        if self.max_grad_norm is not None:
            torch.nn.utils.clip_grad_norm(self.critic.parameters(),
                                          self.max_grad_norm)
        self.critic_optim.step()

        self.critic.zero_grad()
        self.actor.zero_grad()
        net_act = self.actor(obs0)
        net_q_val = self.critic(obs0, net_act)
        act_loss = -net_q_val.mean()
        act_loss.backward()

        if self.max_grad_norm is not None:
            torch.nn.utils.clip_grad_norm(self.actor.parameters(),
                                          self.max_grad_norm)
        self.actor_optim.step()

        self.soft_update(self.actor_target, self.actor, self.tau)
        self.soft_update(self.critic_target, self.critic, self.tau)
        return act_loss.cpu().data.numpy(), cri_loss.cpu().data.numpy()

    def normalize(self, x, stats):
        """Return ``(x - stats.mean) / stats.std``, or ``x`` if no stats."""
        if stats is None:
            return x
        return (x - stats.mean) / stats.std

    def denormalize(self, x, stats):
        """Inverse of ``normalize``: ``x * stats.std + stats.mean``."""
        if stats is None:
            return x
        return x * stats.std + stats.mean

    def net_mode(self, train=True):
        """Switch the actor and critic between train and eval mode."""
        if train:
            self.actor.train()
            self.critic.train()
        else:
            self.actor.eval()
            self.critic.eval()

    def load_model(self, step=None, pretrain_dir=None):
        """Load actor/critic weights from a checkpoint file.

        :param step: checkpoint step to load from ``model_dir``; ``None``
            selects ``model_best.pth``. Ignored when ``pretrain_dir`` is set.
        :param pretrain_dir: if given, load ``model_best.pth`` from this
            directory, copy only the entries whose keys exist in the current
            networks, and restart ``global_step`` at 0.
        :raises ValueError: if the selected checkpoint file does not exist.
        """
        model_dir = self.model_dir
        if pretrain_dir is not None:
            # Use the argument itself; the path used to be built from
            # self.pretrain_dir, silently ignoring an explicit value.
            ckpt_file = os.path.join(pretrain_dir, 'model_best.pth')
        else:
            if step is None:
                ckpt_file = os.path.join(model_dir, 'model_best.pth')
            else:
                ckpt_file = os.path.join(model_dir,
                                         'ckpt_{:08d}.pth'.format(step))
        if not os.path.isfile(ckpt_file):
            raise ValueError("No checkpoint found at '{}'".format(ckpt_file))
        mutils.print_yellow('Loading checkpoint {}'.format(ckpt_file))
        checkpoint = torch.load(ckpt_file)
        if pretrain_dir is not None:
            actor_dict = self.actor.state_dict()
            critic_dict = self.critic.state_dict()
            actor_pretrained_dict = {
                k: v
                for k, v in checkpoint['actor_state_dict'].items()
                if k in actor_dict
            }
            critic_pretrained_dict = {
                k: v
                for k, v in checkpoint['critic_state_dict'].items()
                if k in critic_dict
            }
            actor_dict.update(actor_pretrained_dict)
            critic_dict.update(critic_pretrained_dict)
            self.actor.load_state_dict(actor_dict)
            self.critic.load_state_dict(critic_dict)
            self.global_step = 0
        else:
            self.actor.load_state_dict(checkpoint['actor_state_dict'])
            self.critic.load_state_dict(checkpoint['critic_state_dict'])
            self.global_step = checkpoint['global_step']
        if step is None:
            mutils.print_yellow('Checkpoint step: {}'
                                ''.format(checkpoint['ckpt_step']))

        # Warm-up is counted from the restored step, not from zero.
        self.warmup_iter += self.global_step
        mutils.print_yellow('Checkpoint loaded...')

    def save_model(self, is_best, step=None):
        """Write a checkpoint, and also ``model_best.pth`` when ``is_best``.

        :param is_best: when true, the same data is saved a second time as
            ``model_best.pth``.
        :param step: step used in the file name; defaults to ``global_step``.
        """
        if step is None:
            step = self.global_step
        ckpt_file = os.path.join(self.model_dir,
                                 'ckpt_{:08d}.pth'.format(step))
        data_to_save = {
            'ckpt_step': step,
            'global_step': self.global_step,
            'actor_state_dict': self.actor.state_dict(),
            'actor_optimizer': self.actor_optim.state_dict(),
            'critic_state_dict': self.critic.state_dict(),
            'critic_optimizer': self.critic_optim.state_dict()
        }

        mutils.print_yellow('Saving checkpoint: %s' % ckpt_file)
        torch.save(data_to_save, ckpt_file)
        if is_best:
            torch.save(data_to_save,
                       os.path.join(self.model_dir, 'model_best.pth'))

    def rollout(self, train_test=False, render=False, record=False,
                slow_t=0):
        """Evaluate the deterministic policy over the env's test conditions.

        :param train_test: use the env's train-test conditions/reset instead
            of the test ones.
        :param render: render every step; after the evaluation the method
            keeps rendering forever and never returns.
        :param record: print per-episode distances, dump them to
            ``./test_data.json`` and print summary statistics.
        :param slow_t: seconds to sleep after each rendered step.
        :return: ``(mean_final_dist, succ_rate)``.
        """
        test_conditions = self.env.train_test_conditions \
            if train_test else self.env.test_conditions
        done_num = 0
        final_dist = []
        episode_length = []
        for idx in range(test_conditions):
            if train_test:
                obs = self.env.train_test_reset(cond=idx)
            else:
                obs = self.env.test_reset(cond=idx)
            for t_rollout in range(self.rollout_steps):
                obs = obs[0].copy()
                act = self.policy(obs, stochastic=False).flatten()
                obs, r, done, info = self.env.step(act)
                if render:
                    self.env.render()
                    if slow_t > 0:
                        time.sleep(slow_t)
                if done:
                    done_num += 1
                    break
            if record:
                print('dist: ', info['dist'])
            final_dist.append(info['dist'])
            episode_length.append(t_rollout)
        final_dist = np.array(final_dist)
        mean_final_dist = np.mean(final_dist)
        succ_rate = done_num / float(test_conditions)
        if record:
            with open('./test_data.json', 'w') as f:
                json.dump(final_dist.tolist(), f)

            print('\nDist statistics:')
            print("Minimum: {0:9.4f} Maximum: {1:9.4f}"
                  "".format(np.min(final_dist), np.max(final_dist)))
            print("Mean: {0:9.4f}".format(mean_final_dist))
            print("Standard Deviation: {0:9.4f}".format(np.std(final_dist)))
            print("Median: {0:9.4f}".format(np.median(final_dist)))
            print("First quartile: {0:9.4f}"
                  "".format(np.percentile(final_dist, 25)))
            print("Third quartile: {0:9.4f}"
                  "".format(np.percentile(final_dist, 75)))
            print('Success rate:', succ_rate)
        if render:
            # Keeps the viewer alive; this loop never exits.
            while True:
                self.env.render()
        return mean_final_dist, succ_rate

    def log_model_weights(self):
        """Log every parameter of the four networks through ``logger``."""
        for name, param in self.actor.named_parameters():
            logger.logkv('actor/' + name,
                         param.clone().cpu().data.numpy())
        for name, param in self.actor_target.named_parameters():
            logger.logkv('actor_target/' + name,
                         param.clone().cpu().data.numpy())
        for name, param in self.critic.named_parameters():
            logger.logkv('critic/' + name,
                         param.clone().cpu().data.numpy())
        for name, param in self.critic_target.named_parameters():
            logger.logkv('critic_target/' + name,
                         param.clone().cpu().data.numpy())

    def random_action(self):
        """Return a uniform random action of length ``ac_dim``."""
        act = np.random.uniform(-1., 1., self.ac_dim)
        return act

    def policy(self, obs, stochastic=True):
        """Return the actor's action for ``obs`` as a numpy array.

        :param obs: numpy observation; it is flattened to a single row.
        :param stochastic: when true, pass the action through
            ``action_noise``.
        """
        # Inference runs in eval mode; train mode is restored afterwards.
        self.actor.eval()
        ob = Variable(torch.from_numpy(obs)).float().cuda().view(1, -1)
        act = self.actor(ob)
        act = act.cpu().data.numpy()
        if stochastic:
            act = self.action_noise(act)
        self.actor.train()
        return act

    def cuda(self):
        """Move the networks (and training-only modules if present) to GPU."""
        self.critic.cuda()
        self.actor.cuda()
        # Targets and the loss only exist outside test mode.
        if hasattr(self, 'critic_target'):
            self.critic_target.cuda()
            self.actor_target.cuda()
            self.critic_loss.cuda()

    def construct_optim(self, net, lr, weight_decay=None):
        """Create an Adam optimizer for ``net``.

        :param weight_decay: ``None`` is treated as 0.
        """
        if weight_decay is None:
            weight_decay = 0
        params = mutils.add_weight_decay([net],
                                         weight_decay=weight_decay)
        optimizer = optim.Adam(params,
                               lr=lr,
                               weight_decay=weight_decay)
        return optimizer

    def soft_update(self, target, source, tau):
        """Blend ``source`` into ``target``: ``(1 - tau) * t + tau * s``."""
        for target_param, param in zip(target.parameters(),
                                       source.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - tau) +
                                    param.data * tau)

    def hard_update(self, target, source):
        """Copy every parameter of ``source`` into ``target``."""
        for target_param, param in zip(target.parameters(),
                                       source.parameters()):
            target_param.data.copy_(param.data)
class DDPG(object):
    """DDPG agent that can log the actor gradient norm to a writer."""

    def __init__(self, nb_status, nb_actions, args, writer):
        """Build networks, optimizers, replay memory and the noise source.

        :param nb_status: per-frame state size; multiplied by
            ``args.window_length`` to get the network input size.
        :param nb_actions: action size.
        :param args: parsed options holding the hyper-parameters read below.
        :param writer: object with ``add_scalar``; ``None`` disables logging.
        """
        self.clip_actor_grad = args.clip_actor_grad
        self.nb_status = nb_status * args.window_length
        self.nb_actions = nb_actions
        self.writer = writer
        # Used as the step value for add_scalar; never changed in this class.
        self.select_time = 0

        # Create Actor and Critic Network
        net_cfg = {
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
            'init_method': args.init_method
        }

        self.actor = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_target = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_status, self.nb_actions,
                                    **net_cfg)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        # Make sure the targets start with the same weights.
        hard_update(self.actor_target, self.actor)
        hard_update(self.critic_target, self.critic)

        # Create replay buffer
        self.memory = rpm(args.rmsize)
        self.random_process = Myrandom(size=nb_actions)

        # Hyper-parameters
        self.batch_size = args.batch_size
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon  # per-call decrement of epsilon

        self.epsilon = 1.0
        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.use_cuda = args.cuda

        if self.use_cuda:
            self.cuda()

    def update_policy(self, train_actor=True):
        """Run one critic update and, optionally, one actor update.

        :param train_actor: when false the actor gradients are still
            computed but the actor optimizer does not step.
        :return: ``(-policy_loss, value_loss)``.
        """
        # Sample batch
        state_batch, action_batch, reward_batch, \
            next_state_batch, terminal_batch = \
            self.memory.sample_batch(self.batch_size)

        # Prepare for the target q batch
        next_q_values = self.critic_target([
            to_tensor(next_state_batch, volatile=True),
            self.actor_target(to_tensor(next_state_batch, volatile=True)),
        ])
        next_q_values.volatile = False

        # Terminal transitions (terminal == 1) drop the bootstrap term.
        # Builtin float replaces the np.float alias (removed in NumPy 1.24).
        target_q_batch = to_tensor(reward_batch) + \
            self.discount * \
            to_tensor((1 - terminal_batch.astype(float))) * next_q_values

        # Critic update
        self.critic.zero_grad()

        q_batch = self.critic(
            [to_tensor(state_batch), to_tensor(action_batch)])

        value_loss = nn.MSELoss()(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()

        # Actor update
        self.actor.zero_grad()

        policy_loss = -self.critic(
            [to_tensor(state_batch), self.actor(to_tensor(state_batch))])

        policy_loss = policy_loss.mean()
        policy_loss.backward()

        if self.clip_actor_grad is not None:
            torch.nn.utils.clip_grad_norm(self.actor.parameters(),
                                          float(self.clip_actor_grad))

            if self.writer is not None:
                # Mean of the per-parameter gradient L2 norms after clipping.
                mean_policy_grad = np.array(
                    np.mean([
                        np.linalg.norm(p.grad.data.cpu().numpy().ravel())
                        for p in self.actor.parameters()
                    ]))
                self.writer.add_scalar('train/mean_policy_grad',
                                       mean_policy_grad, self.select_time)

        if train_actor:
            self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

        return -policy_loss, value_loss

    def eval(self):
        """Put all four networks into evaluation mode."""
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def train(self):
        """Put all four networks into training mode."""
        self.actor.train()
        self.actor_target.train()
        self.critic.train()
        self.critic_target.train()

    def cuda(self):
        """Move all four networks to the GPU."""
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        """Append ``[s_t, a_t, r_t, s_t1, done]`` to memory; advance state."""
        self.memory.append([self.s_t, self.a_t, r_t, s_t1, done])
        self.s_t = s_t1

    def random_action(self):
        """Return a uniform random action in [-1, 1) and remember it."""
        action = np.random.uniform(-1., 1., self.nb_actions)
        self.a_t = action
        return action

    def select_action(self, s_t, decay_epsilon=True, return_fix=False,
                      noise_level=0):
        """Return the actor's action for ``s_t`` mixed with random noise.

        :param s_t: current state.
        :param decay_epsilon: when true, reduce ``epsilon`` by ``depsilon``.
        :param return_fix: accepted but not used by this implementation.
        :param noise_level: mixing weight of the random sample, scaled by
            ``max(epsilon, 0)``.
        :return: the mixed action clipped to [-1, 1]; also stored in ``a_t``.
        """
        # The forward pass runs in eval mode; train mode is restored after.
        self.eval()
        action = to_numpy(self.actor(to_tensor(np.array([s_t])))).squeeze(0)
        self.train()
        noise_level = noise_level * max(self.epsilon, 0)

        action = action * (1 - noise_level) + \
            (self.random_process.sample() * noise_level)
        action = np.clip(action, -1., 1.)

        if decay_epsilon:
            self.epsilon -= self.depsilon

        self.a_t = action
        return action

    def reset(self, obs):
        """Start a new episode from ``obs`` and reset the noise source."""
        self.s_t = obs
        self.random_process.reset_status()

    def load_weights(self, output, num=1):
        """Load actor/critic weights (and their targets); no-op if None.

        :param output: directory containing ``actor{num}.pkl`` and
            ``critic{num}.pkl``.
        :param num: suffix used in the file names.
        """
        if output is None:
            return
        self.actor.load_state_dict(
            torch.load('{}/actor{}.pkl'.format(output, num)))
        self.actor_target.load_state_dict(
            torch.load('{}/actor{}.pkl'.format(output, num)))
        self.critic.load_state_dict(
            torch.load('{}/critic{}.pkl'.format(output, num)))
        self.critic_target.load_state_dict(
            torch.load('{}/critic{}.pkl'.format(output, num)))

    def save_model(self, output, num):
        """Save actor and critic state dicts as ``*{num}.pkl`` in ``output``.

        With ``use_cuda`` set, the networks are moved to the CPU for saving
        and back to the GPU afterwards.
        """
        if self.use_cuda:
            self.actor.cpu()
            self.critic.cpu()
        torch.save(self.actor.state_dict(),
                   '{}/actor{}.pkl'.format(output, num))
        torch.save(self.critic.state_dict(),
                   '{}/critic{}.pkl'.format(output, num))
        if self.use_cuda:
            self.actor.cuda()
            self.critic.cuda()
class DDPG(object):
    """DDPG agent with optional image input (CNN encoder) and discrete mode.

    With ``args.pic`` set, states are images that are passed through
    ``self.cnn`` before reaching the actor and critic. With ``args.discrete``
    set, the returned action is the argmax index of the action vector.
    """

    def __init__(self, nb_status, nb_actions, args):
        """Build networks, optimizers, replay memory and the noise source.

        :param nb_status: per-frame state size; multiplied by
            ``args.window_length``, or replaced by ``args.pic_status`` when
            ``args.pic`` is set.
        :param nb_actions: action size.
        :param args: parsed options holding the hyper-parameters read below.
        """
        self.nb_status = nb_status * args.window_length
        self.nb_actions = nb_actions
        self.discrete = args.discrete
        self.pic = args.pic
        if self.pic:
            self.nb_status = args.pic_status

        # Create Actor and Critic Network
        net_cfg = {
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
            'use_bn': args.bn
        }
        if args.pic:
            self.cnn = CNN(3, args.pic_status)
            self.cnn_optim = Adam(self.cnn.parameters(), lr=args.crate)
        self.actor = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_target = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_status, self.nb_actions,
                                    **net_cfg)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        # Make sure the targets start with the same weights.
        hard_update(self.actor_target, self.actor)
        hard_update(self.critic_target, self.critic)

        # Create replay buffer
        # SequentialMemory(limit=args.rmsize,
        #                  window_length=args.window_length)
        self.memory = rpm(args.rmsize)
        self.random_process = Myrandom(size=nb_actions)

        # Hyper-parameters
        self.batch_size = args.batch_size
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon  # per-call decrement of epsilon

        self.epsilon = 1.0
        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.use_cuda = args.cuda

        if self.use_cuda:
            self.cuda()

    def normalize(self, pic):
        """Reorder the axes of ``pic`` from (0, 1, 2) to (2, 0, 1).

        For an image stored height x width x channels this yields
        channels x height x width. No value scaling is applied.
        """
        pic = pic.swapaxes(0, 2).swapaxes(1, 2)
        return pic

    def update_policy(self, train_actor=True):
        """Run one critic (and CNN) update and, optionally, an actor update.

        :param train_actor: when false the actor gradients are still
            computed but the actor optimizer does not step.
        :return: ``(-policy_loss, value_loss)``.
        """
        # Sample batch
        state_batch, action_batch, reward_batch, \
            next_state_batch, terminal_batch = \
            self.memory.sample_batch(self.batch_size)

        # Prepare for the target q batch
        if self.pic:
            state_batch = np.array([self.normalize(x) for x in state_batch])
            state_batch = to_tensor(state_batch, volatile=True)
            # NOTE(review): the three prints below look like debugging
            # leftovers; kept because they are part of the runtime output.
            print('label 1')
            print('size = ', state_batch.shape)
            state_batch = self.cnn(state_batch)
            print('label 2')
            next_state_batch = np.array(
                [self.normalize(x) for x in next_state_batch])
            next_state_batch = to_tensor(next_state_batch, volatile=True)
            next_state_batch = self.cnn(next_state_batch)
            next_q_values = self.critic_target(
                [next_state_batch, self.actor_target(next_state_batch)])
        else:
            next_q_values = self.critic_target([
                to_tensor(next_state_batch, volatile=True),
                self.actor_target(to_tensor(next_state_batch,
                                            volatile=True)),
            ])
        next_q_values.volatile = False

        # Terminal transitions (terminal == 1) drop the bootstrap term.
        # Builtin float replaces the np.float alias (removed in NumPy 1.24).
        target_q_batch = to_tensor(reward_batch) + \
            self.discount * \
            to_tensor((1 - terminal_batch.astype(float))) * next_q_values

        # Critic update
        self.critic.zero_grad()
        if self.pic:
            self.cnn.zero_grad()

        if self.pic:
            state_batch.volatile = False
            q_batch = self.critic([state_batch, to_tensor(action_batch)])
        else:
            q_batch = self.critic(
                [to_tensor(state_batch), to_tensor(action_batch)])

        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()
        if self.pic:
            self.cnn_optim.step()

        # Actor update
        self.actor.zero_grad()

        if self.pic:
            state_batch.volatile = False
            policy_loss = -self.critic([state_batch, self.actor(state_batch)])
        else:
            policy_loss = -self.critic(
                [to_tensor(state_batch), self.actor(to_tensor(state_batch))])

        policy_loss = policy_loss.mean()
        policy_loss.backward()
        if train_actor:
            self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

        return -policy_loss, value_loss

    def eval(self):
        """Put the actor/critic networks and targets into evaluation mode."""
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def train(self):
        """Put the actor/critic networks and targets into training mode."""
        self.actor.train()
        self.actor_target.train()
        self.critic.train()
        self.critic_target.train()

    def cuda(self):
        """Move every network, including the CNN encoder, to the GPU."""
        # The encoder feeds the actor and critic, so it has to live on the
        # same device; it used to be left on the CPU.
        if self.pic:
            self.cnn.cuda()
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        """Append ``[s_t, a_t, r_t, s_t1, done]`` to memory; advance state."""
        self.memory.append([self.s_t, self.a_t, r_t, s_t1, done])
        self.s_t = s_t1

    def random_action(self):
        """Draw a uniform random action vector and remember it in ``a_t``.

        :return: the argmax index in discrete mode, otherwise the vector.
        """
        action = np.random.uniform(-1., 1., self.nb_actions)
        self.a_t = action
        if self.discrete:
            return action.argmax()
        else:
            return action

    def select_action(self, s_t, decay_epsilon=True, return_fix=False,
                      noise_level=0):
        """Return the policy action for ``s_t`` mixed with random noise.

        :param s_t: current state (an image when ``pic`` is set).
        :param decay_epsilon: when true, reduce ``epsilon`` by ``depsilon``.
        :param return_fix: when true, always return the action vector, even
            in discrete mode.
        :param noise_level: mixing weight of the random sample, scaled by
            ``max(epsilon, 0)``.
        :return: the clipped action vector, or its argmax index in discrete
            mode (unless ``return_fix``). The vector is stored in ``a_t``.
        """
        if self.pic:
            s_t = self.normalize(s_t)
            s_t = self.cnn(to_tensor(np.array([s_t])))
        self.eval()
        if self.pic:
            # NOTE(review): the image path queries actor_target while the
            # other path queries actor -- confirm this is intended.
            action = to_numpy(self.actor_target(s_t)).squeeze(0)
        else:
            action = to_numpy(
                self.actor(to_tensor(np.array([s_t])))).squeeze(0)
        self.train()
        noise_level = noise_level * max(self.epsilon, 0)

        action = action * (1 - noise_level) + \
            (self.random_process.sample() * noise_level)
        action = np.clip(action, -1., 1.)

        if decay_epsilon:
            self.epsilon -= self.depsilon

        self.a_t = action
        if return_fix:
            return action
        if self.discrete:
            return action.argmax()
        else:
            return action

    def reset(self, obs):
        """Start a new episode from ``obs`` and reset the noise source."""
        self.s_t = obs
        self.random_process.reset_status()

    def load_weights(self, output, num=0):
        """Load actor/critic weights (and their targets); no-op if None.

        The CNN encoder is not loaded here.

        :param output: directory containing ``actor{num}.pkl`` and
            ``critic{num}.pkl``.
        :param num: suffix used in the file names.
        """
        if output is None:
            return
        self.actor.load_state_dict(
            torch.load('{}/actor{}.pkl'.format(output, num)))
        self.actor_target.load_state_dict(
            torch.load('{}/actor{}.pkl'.format(output, num)))
        self.critic.load_state_dict(
            torch.load('{}/critic{}.pkl'.format(output, num)))
        self.critic_target.load_state_dict(
            torch.load('{}/critic{}.pkl'.format(output, num)))

    def save_model(self, output, num):
        """Save actor and critic state dicts as ``*{num}.pkl`` in ``output``.

        With ``use_cuda`` set, the networks are moved to the CPU for saving
        and back to the GPU afterwards. The CNN encoder is not saved here.
        """
        if self.use_cuda:
            self.actor.cpu()
            self.critic.cpu()
        torch.save(self.actor.state_dict(),
                   '{}/actor{}.pkl'.format(output, num))
        torch.save(self.critic.state_dict(),
                   '{}/critic{}.pkl'.format(output, num))
        if self.use_cuda:
            self.actor.cuda()
            self.critic.cuda()
class DDPG(object):
    """DDPG agent holding actor/critic networks, their targets and a replay memory.

    The constructor builds an ``Actor`` and a ``Critic`` (plus target copies
    initialised with ``hard_update``), Adam optimizers for both, a
    ``SequentialMemory`` replay buffer and an Ornstein-Uhlenbeck noise
    process used for exploration in :meth:`select_action`.
    """

    def __init__(self, nb_states, nb_actions, args):
        """Build networks, optimizers, replay memory and exploration noise.

        :param nb_states: state dimensionality handed to ``Actor``/``Critic``.
        :param nb_actions: action dimensionality.
        :param args: namespace providing ``seed``, ``hidden1``, ``hidden2``,
            ``init_w``, ``prate``, ``rate``, ``rmsize``, ``window_length``,
            ``ou_theta``, ``ou_mu``, ``ou_sigma``, ``bsize``, ``tau``,
            ``discount`` and ``epsilon``.
        """
        if args.seed > 0:
            self.seed(args.seed)

        self.nb_states = nb_states
        self.nb_actions = nb_actions

        # Create Actor and Critic Network
        net_cfg = {
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
            'init_w': args.init_w,
        }
        self.actor = Actor(self.nb_states, self.nb_actions, **net_cfg)
        self.actor_target = Actor(self.nb_states, self.nb_actions, **net_cfg)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(self.nb_states, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_states, self.nb_actions, **net_cfg)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        # Make sure the targets start with the same weights as the online nets.
        hard_update(self.actor_target, self.actor)
        hard_update(self.critic_target, self.critic)

        # Create replay buffer
        self.memory = SequentialMemory(limit=args.rmsize,
                                       window_length=args.window_length)
        self.random_process = OrnsteinUhlenbeckProcess(size=nb_actions,
                                                       theta=args.ou_theta,
                                                       mu=args.ou_mu,
                                                       sigma=args.ou_sigma)

        # Hyper-parameters
        self.batch_size = args.bsize
        self.tau = args.tau
        self.discount = args.discount
        # Exploration scale decays linearly from 1.0 by depsilon per action.
        self.depsilon = 1.0 / args.epsilon
        self.epsilon = 1.0

        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.is_training = True

        if USE_CUDA:
            self.cuda()

    def update_policy(self):
        """Run one critic step, one actor step and a soft target update."""
        # Sample batch
        (state_batch, action_batch, reward_batch,
         next_state_batch, terminal_batch) = \
            self.memory.sample_and_split(self.batch_size)

        # Prepare for the target q batch
        next_q_values = self.critic_target([
            to_tensor(next_state_batch, volatile=True),
            self.actor_target(to_tensor(next_state_batch, volatile=True)),
        ])
        next_q_values.volatile = False

        # NOTE(review): terminal_batch multiplies the bootstrap term directly,
        # which presumes the memory returns a continuation mask (0 on terminal
        # transitions) - confirm against SequentialMemory.sample_and_split.
        # `float` replaces the `np.float` alias, which NumPy 1.24 removed.
        target_q_batch = to_tensor(reward_batch) + \
            self.discount * to_tensor(terminal_batch.astype(float)) * next_q_values

        # Critic update
        self.critic.zero_grad()

        q_batch = self.critic([to_tensor(state_batch), to_tensor(action_batch)])

        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()

        # Actor update: ascend the critic's value of the actor's own actions.
        self.actor.zero_grad()

        policy_loss = -self.critic([
            to_tensor(state_batch),
            self.actor(to_tensor(state_batch))
        ])

        policy_loss = policy_loss.mean()
        policy_loss.backward()
        self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

    def eval(self):
        """Switch all four networks to evaluation mode."""
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def cuda(self):
        """Move all four networks to the GPU."""
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        """Store the last transition (training only) and advance the state."""
        if self.is_training:
            self.memory.append(self.s_t, self.a_t, r_t, done)
            self.s_t = s_t1

    def random_action(self, distribution='uniform'):
        '''
        Produce a random action and remember it as the most recent action.

        :param distribution: only ``'uniform'`` (samples in [-1, 1)) is defined.
        :raises ValueError: for any other distribution name.
        '''
        if distribution == 'uniform':
            action = np.random.uniform(-1., 1., self.nb_actions)
            # set the action internally to the agent
            self.a_t = action
            return action
        else:
            raise ValueError('Distribution {} not defined'.format(distribution))

    def select_action(self, s_t, decay_epsilon=True, clip=None):
        '''
        Pick action according to actor network.
        :param s_t: current state s_t
        :param decay_epsilon: bool.
        :param clip: tuple to clip action values between
                     clip[0] and clip[1]. Default (-1, 1)
                     Set to false if not clip.
        :raises ValueError: if clip is neither False nor a length-2 sequence.
        '''
        # Set default for clip if None
        if clip is None:
            clip = (-1., 1.)

        # Validate before running the actor so a bad argument fails fast and
        # does not advance the noise process. The message now actually
        # reports the offending value.
        if clip is not False and len(clip) != 2:
            raise ValueError(
                'Clip parameter malformed, received {}, '
                'expected a size 2 tuple'.format(clip))

        action = to_numpy(
            self.actor(to_tensor(np.array([s_t])))
        ).squeeze(0)

        # Add noise to the action (disabled when is_training is falsy).
        action += self.is_training * max(self.epsilon, 0) * self.random_process.sample()

        if clip is not False:
            action = np.clip(action, clip[0], clip[1])

        if decay_epsilon:
            self.epsilon -= self.depsilon

        self.a_t = action
        return action

    def reset(self, obs):
        """Start a new episode from ``obs`` and reset the noise process."""
        self.s_t = obs
        self.random_process.reset_states()

    def load_weights(self, output):
        """Load actor/critic weights from ``output``; no-op when it is None."""
        if output is None:
            return

        self.actor.load_state_dict(
            torch.load('{}/actor.pkl'.format(output))
        )

        self.critic.load_state_dict(
            torch.load('{}/critic.pkl'.format(output))
        )

        # Keep the target networks in step with the restored weights so that
        # resumed training does not bootstrap from stale targets.
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.critic_target.load_state_dict(self.critic.state_dict())

    def save_model(self, output):
        """Write actor and critic state dicts into directory ``output``."""
        torch.save(
            self.actor.state_dict(),
            '{}/actor.pkl'.format(output)
        )
        torch.save(
            self.critic.state_dict(),
            '{}/critic.pkl'.format(output)
        )

    def seed(self, s):
        """Seed torch (and the CUDA RNG when USE_CUDA is set)."""
        torch.manual_seed(s)
        if USE_CUDA:
            torch.cuda.manual_seed(s)
class UADDPG(object):
    """DDPG variant that uses dropout forward passes to probe uncertainty.

    Besides the usual actor/critic/target networks, the agent repeatedly
    calls ``forward_with_dropout`` on the actor (and, in
    :meth:`select_q_with_dropout`, on the critic) and records the spread of
    the sampled outputs to text files under ``args.output``.
    """

    def __init__(self, nb_states, nb_actions, args):
        """Build networks, optimizers, replay memory and exploration noise.

        :param nb_states: state dimensionality handed to ``Actor``/``Critic``.
        :param nb_actions: action dimensionality.
        :param args: namespace providing ``seed``, ``train_with_dropout``,
            ``dropout_p``, ``dropout_n``, ``output``, ``hidden1``,
            ``hidden2``, ``init_w``, ``prate``, ``rate``, ``rmsize``,
            ``window_length``, ``ou_theta``, ``ou_mu``, ``ou_sigma``,
            ``bsize``, ``tau``, ``discount`` and ``epsilon``.
        """
        if args.seed > 0:
            self.seed(args.seed)

        self.nb_states = nb_states
        self.nb_actions = nb_actions
        self.train_with_dropout = args.train_with_dropout
        self.dropout_p = args.dropout_p
        self.dropout_n = args.dropout_n
        self.print_var_count = 0  # number of select_action_with_dropout calls
        self.action_std = np.array([])  # history of per-call action std
        self.save_dir = args.output
        self.episode = 0  # used only to name the per-episode log directory

        print("train_with_dropout : " + str(self.train_with_dropout))
        print("Dropout p : " + str(self.dropout_p))
        print("Dropout n : " + str(self.dropout_n))

        # Create Actor and Critic Network
        net_cfg_actor = {
            'dropout_n': args.dropout_n,
            'dropout_p': args.dropout_p,
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
            'init_w': args.init_w,
        }
        net_cfg_critic = {
            'dropout_n': args.dropout_n,
            'dropout_p': args.dropout_p,
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
            'init_w': args.init_w,
        }

        self.actor = Actor(self.nb_states, self.nb_actions, **net_cfg_actor)
        self.actor_target = Actor(self.nb_states, self.nb_actions, **net_cfg_actor)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(self.nb_states, self.nb_actions, **net_cfg_critic)
        self.critic_target = Critic(self.nb_states, self.nb_actions, **net_cfg_critic)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        # Targets start as exact copies of the online networks.
        hard_update(self.actor_target, self.actor)
        hard_update(self.critic_target, self.critic)

        # Create replay buffer
        self.memory = SequentialMemory(limit=args.rmsize,
                                       window_length=args.window_length)
        self.random_process = OrnsteinUhlenbeckProcess(size=nb_actions,
                                                       theta=args.ou_theta,
                                                       mu=args.ou_mu,
                                                       sigma=args.ou_sigma)

        # Hyper-parameters
        self.batch_size = args.bsize
        self.tau = args.tau
        self.discount = args.discount
        # Exploration scale decays linearly from 1.0 by depsilon per action.
        self.depsilon = 1.0 / args.epsilon
        self.epsilon = 1.0

        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.is_training = True

        if USE_CUDA:
            self.cuda()

    def update_policy(self):
        """Run one critic step, one actor step and a soft target update."""
        # Sample batch
        (state_batch, action_batch, reward_batch,
         next_state_batch, terminal_batch) = \
            self.memory.sample_and_split(self.batch_size)

        # Prepare for the target q batch
        # TODO : (1) Also apply epistemic and aleatoric uncertainty to both
        #        actor and critic target network
        # TODO : (2) Is it proper to apply epistemic uncertainty to target
        #        network? If then, how to apply? Which network to choose for
        #        target? Let's think more about it after July.
        # NOTE(review): `[:-1]` slices the leading axis of the critic output,
        # i.e. it drops the last row rather than a trailing output column.
        # select_q_with_dropout uses the same slice after squeeze(0) to drop
        # the aleatoric-variance term; confirm which axis is intended here.
        next_q_values = self.critic_target([
            to_tensor(next_state_batch, volatile=True),
            self.actor_target(to_tensor(next_state_batch, volatile=True)),
        ])[:-1]  # x : next_state_batch, a : self.actor_target(next_state_batch)

        # NOTE(review): presumes terminal_batch is a continuation mask (0 on
        # terminal transitions) - confirm against the memory implementation.
        # `float` replaces the `np.float` alias, which NumPy 1.24 removed.
        target_q_batch = to_tensor(reward_batch) + self.discount * to_tensor(
            terminal_batch.astype(float)) * next_q_values

        #########################
        # Critic update
        #########################
        self.critic.zero_grad()

        # TODO : (Completed) Add epistemic uncertainty for critic network
        q_batch = self.critic(
            [to_tensor(state_batch), to_tensor(action_batch)])

        # TODO : (Completed) Add aleatoric uncertainty term from aleatoric
        #        uncertainty output of critic network (Add aleatoric
        #        uncertainty term in criterion)
        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()

        #########################
        # Actor update
        #########################
        self.actor.zero_grad()

        # policy loss
        # TODO : (Completed) Add epistemic certainty term from aleatoric
        #        certainty output of policy network
        policy_loss = -self.critic(
            [to_tensor(state_batch), self.actor(to_tensor(state_batch))])
        policy_loss = policy_loss.mean()
        policy_loss.backward()
        self.actor_optim.step()

        #########################
        # Target soft update
        #########################
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

    def eval(self):
        """Switch all four networks to evaluation mode."""
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def cuda(self):
        """Move all four networks to the GPU."""
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        """Store the last transition (training only) and advance the state."""
        if self.is_training:
            self.memory.append(self.s_t, self.a_t, r_t, done)
            self.s_t = s_t1

    def random_action(self):
        """Sample a uniform action in [-1, 1) and remember it as ``a_t``."""
        action = np.random.uniform(-1., 1., self.nb_actions)
        self.a_t = action
        return action

    @staticmethod
    def _mean_and_var(samples):
        """Return (mean, variance) over every element of a list of arrays."""
        if len(samples) == 0:
            flat = np.array([])
        else:
            flat = np.concatenate([np.ravel(sample) for sample in samples])
        return np.mean(flat), np.var(flat)

    def select_q_with_dropout(self, s_t, a_t):
        """Estimate Q for (s_t, a_t) from ``dropout_n`` dropout forward passes.

        The previous implementation could never run: it called the
        misspelled ``np.arrary`` and then applied ``torch.mean``/``torch.var``
        to a NumPy array. Statistics are now computed with NumPy.

        :returns: ``(q_mean, q_var)`` over all sampled critic outputs.
        """
        dropout_qs = []
        with torch.no_grad():
            for _ in range(self.dropout_n):
                q_batch = to_numpy(
                    self.critic.forward_with_dropout([
                        to_tensor(s_t), to_tensor(a_t)
                    ]).squeeze(0)[:-1])  # ignore aleatoric variance term
                dropout_qs.append(q_batch)

        return self._mean_and_var(dropout_qs)

    def select_action_with_dropout(self, s_t, decay_epsilon=True):
        """Pick an action and log the spread of dropout-sampled actions.

        ``dropout_n`` dropout passes of the actor are collected only for the
        statistics. The returned action comes from one further pass (with
        dropout when ``train_with_dropout`` is set, otherwise the plain
        forward pass) plus exploration noise.

        :param s_t: current state.
        :param decay_epsilon: when true, reduce ``epsilon`` by ``depsilon``.
        :returns: the selected action, also stored in ``self.a_t``.
        """
        dropout_actions = np.array([])

        with torch.no_grad():
            for _ in range(self.dropout_n):
                action = to_numpy(
                    self.actor.forward_with_dropout(to_tensor(np.array(
                        [s_t])))).squeeze(0)
                dropout_actions = np.append(dropout_actions, [action])

        if self.train_with_dropout:
            plt_action = to_numpy(
                self.actor.forward_with_dropout(to_tensor(np.array(
                    [s_t])))).squeeze(0)
        else:
            plt_action = to_numpy(self.actor(to_tensor(np.array(
                [s_t])))).squeeze(0)
        plt_action += self.is_training * max(
            self.epsilon, 0) * self.random_process.sample()

        actions_std = np.std(dropout_actions)
        actions_mean = np.mean(dropout_actions)

        # UNFIXED RESET POINT for Mujoco: periodic snapshot in the run dir.
        if self.print_var_count != 0 and (self.print_var_count + 1) % 999 == 0:
            with open(self.save_dir + "/std.txt", "a") as myfile:
                myfile.write(str(actions_std) + '\n')
            with open(self.save_dir + "/mean.txt", "a") as myfile:
                myfile.write(str(actions_mean) + '\n')

        if self.print_var_count % (1000 * 5) == 0:
            print("dropout actions std", actions_std, " ", "dir : ",
                  str(self.save_dir))

        # Per-episode log: one line per call, appended to the episode's files.
        episode_dir = self.save_dir + "/episode/" + str(self.episode)
        # exist_ok avoids the check-then-create race of isdir + makedirs.
        os.makedirs(episode_dir, exist_ok=True)

        self.action_std = np.append(self.action_std, [actions_std])
        with open(episode_dir + "/std.txt", "a") as myfile:
            myfile.write(str(actions_std) + '\n')
        with open(episode_dir + "/mean.txt", "a") as myfile:
            myfile.write(str(actions_mean) + '\n')

        self.print_var_count = self.print_var_count + 1

        if decay_epsilon:
            self.epsilon -= self.depsilon

        self.a_t = plt_action
        return plt_action

    def reset(self, obs):
        """Start a new episode from ``obs`` and reset the noise process."""
        self.s_t = obs
        self.random_process.reset_states()

    def load_weights(self, output):
        """Load actor/critic weights from ``output``; no-op when it is None."""
        if output is None:
            return

        self.actor.load_state_dict(torch.load('{}/actor.pkl'.format(output)))
        self.critic.load_state_dict(torch.load('{}/critic.pkl'.format(output)))

        # Keep the target networks in step with the restored weights so that
        # resumed training does not bootstrap from stale targets.
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.critic_target.load_state_dict(self.critic.state_dict())

    def save_model(self, output):
        """Write actor and critic state dicts into directory ``output``."""
        torch.save(self.actor.state_dict(), '{}/actor.pkl'.format(output))
        torch.save(self.critic.state_dict(), '{}/critic.pkl'.format(output))

    def seed(self, s):
        """Seed torch (and the CUDA RNG when USE_CUDA is set)."""
        torch.manual_seed(s)
        if USE_CUDA:
            torch.cuda.manual_seed(s)
class DDPG(object):
    """DDPG agent holding actor/critic networks, their targets and a replay memory.

    The constructor builds an ``Actor`` and a ``Critic`` (plus target copies
    initialised with ``hard_update``), Adam optimizers for both, a
    ``SequentialMemory`` replay buffer and an Ornstein-Uhlenbeck noise
    process used for exploration in :meth:`select_action`.
    """

    def __init__(self, nb_states, nb_actions, args):
        """Build networks, optimizers, replay memory and exploration noise.

        ``args`` must provide ``seed``, ``hidden1``, ``hidden2``, ``init_w``,
        ``prate``, ``rate``, ``rmsize``, ``window_length``, ``ou_theta``,
        ``ou_mu``, ``ou_sigma``, ``bsize``, ``tau``, ``discount`` and
        ``epsilon``.
        """
        if args.seed > 0:
            self.seed(args.seed)

        self.nb_states = nb_states
        self.nb_actions = nb_actions

        # Create Actor and Critic Network
        net_cfg = {
            "hidden1": args.hidden1,
            "hidden2": args.hidden2,
            "init_w": args.init_w,
        }
        self.actor = Actor(self.nb_states, self.nb_actions, **net_cfg)
        self.actor_target = Actor(self.nb_states, self.nb_actions, **net_cfg)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(self.nb_states, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_states, self.nb_actions, **net_cfg)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        # Make sure the targets start with the same weights as the online nets.
        hard_update(self.actor_target, self.actor)
        hard_update(self.critic_target, self.critic)

        # Create replay buffer
        self.memory = SequentialMemory(
            limit=args.rmsize, window_length=args.window_length
        )
        self.random_process = OrnsteinUhlenbeckProcess(
            size=nb_actions, theta=args.ou_theta, mu=args.ou_mu, sigma=args.ou_sigma
        )

        # Hyper-parameters
        self.batch_size = args.bsize
        self.tau = args.tau
        self.discount = args.discount
        # Exploration scale decays linearly from 1.0 by depsilon per action.
        self.depsilon = 1.0 / args.epsilon
        self.epsilon = 1.0

        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.is_training = True

        if USE_CUDA:
            self.cuda()

    def update_policy(self):
        """Run one critic step, one actor step and a soft target update."""
        # Sample batch
        (
            state_batch,
            action_batch,
            reward_batch,
            next_state_batch,
            terminal_batch,
        ) = self.memory.sample_and_split(self.batch_size)

        # Prepare for the target q batch
        next_q_values = self.critic_target(
            [
                to_tensor(next_state_batch, volatile=True),
                self.actor_target(to_tensor(next_state_batch, volatile=True)),
            ]
        )
        next_q_values.volatile = False

        # NOTE(review): terminal_batch multiplies the bootstrap term directly,
        # which presumes the memory returns a continuation mask (0 on terminal
        # transitions) - confirm against SequentialMemory.sample_and_split.
        # `float` replaces the `np.float` alias, which NumPy 1.24 removed.
        target_q_batch = (
            to_tensor(reward_batch)
            + self.discount * to_tensor(terminal_batch.astype(float)) * next_q_values
        )

        # Critic update
        self.critic.zero_grad()

        q_batch = self.critic([to_tensor(state_batch), to_tensor(action_batch)])

        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()

        # Actor update: ascend the critic's value of the actor's own actions.
        self.actor.zero_grad()

        policy_loss = -self.critic(
            [to_tensor(state_batch), self.actor(to_tensor(state_batch))]
        )

        policy_loss = policy_loss.mean()
        policy_loss.backward()
        self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

    def eval(self):
        """Switch all four networks to evaluation mode."""
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def cuda(self):
        """Move all four networks to the GPU."""
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        """Store the last transition (training only) and advance the state."""
        if self.is_training:
            self.memory.append(self.s_t, self.a_t, r_t, done)
            self.s_t = s_t1

    def random_action(self):
        """Sample a uniform action in [-1, 1) and remember it as ``a_t``."""
        action = np.random.uniform(-1.0, 1.0, self.nb_actions)
        self.a_t = action
        return action

    def select_action(self, s_t, decay_epsilon=True):
        """Return the actor's action for ``s_t`` plus decaying OU noise.

        The result is clipped to [-1, 1] and stored in ``self.a_t``. Noise is
        scaled by ``max(epsilon, 0)`` and disabled when ``is_training`` is
        falsy; ``epsilon`` is reduced by ``depsilon`` when ``decay_epsilon``.
        """
        action = to_numpy(self.actor(to_tensor(np.array([s_t])))).squeeze(0)
        action += self.is_training * max(self.epsilon, 0) * self.random_process.sample()
        action = np.clip(action, -1.0, 1.0)

        if decay_epsilon:
            self.epsilon -= self.depsilon

        self.a_t = action
        return action

    def reset(self, obs):
        """Start a new episode from ``obs`` and reset the noise process."""
        self.s_t = obs
        self.random_process.reset_states()

    def load_weights(self, output):
        """Load actor/critic weights from ``output``; no-op when it is None."""
        if output is None:
            return

        self.actor.load_state_dict(torch.load("{}/actor.pkl".format(output)))
        self.critic.load_state_dict(torch.load("{}/critic.pkl".format(output)))

        # Keep the target networks in step with the restored weights so that
        # resumed training does not bootstrap from stale targets.
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.critic_target.load_state_dict(self.critic.state_dict())

    def save_model(self, output):
        """Write actor and critic state dicts into directory ``output``."""
        torch.save(self.actor.state_dict(), "{}/actor.pkl".format(output))
        torch.save(self.critic.state_dict(), "{}/critic.pkl".format(output))

    def seed(self, s):
        """Seed torch (and the CUDA RNG when USE_CUDA is set)."""
        torch.manual_seed(s)
        if USE_CUDA:
            torch.cuda.manual_seed(s)
class DDPG(object):
    """DDPG agent with optional actor-gradient clipping and scalar logging.

    Transitions are stored in an ``rpm`` replay buffer as 5-element lists and
    exploration noise comes from a ``Myrandom`` process blended into the
    actor's output by ``noise_level``.
    """

    def __init__(self, nb_status, nb_actions, args, writer):
        """Build networks, optimizers, replay memory and exploration noise.

        :param nb_status: per-frame state size; multiplied by
            ``args.window_length`` to get the network input size.
        :param nb_actions: action dimensionality.
        :param args: namespace providing ``clip_actor_grad``,
            ``window_length``, ``hidden1``, ``hidden2``, ``init_method``,
            ``prate``, ``rate``, ``rmsize``, ``batch_size``, ``tau``,
            ``discount``, ``epsilon`` and ``cuda``.
        :param writer: object with ``add_scalar`` used for logging, or None.
        """
        self.clip_actor_grad = args.clip_actor_grad
        self.nb_status = nb_status * args.window_length
        self.nb_actions = nb_actions
        self.writer = writer
        # Step index passed to writer.add_scalar; this class never changes it.
        self.select_time = 0

        # Create Actor and Critic Network
        net_cfg = {
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
            'init_method': args.init_method,
        }

        self.actor = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_target = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        # Make sure the targets start with the same weights as the online nets.
        hard_update(self.actor_target, self.actor)
        hard_update(self.critic_target, self.critic)

        # Create replay buffer
        self.memory = rpm(args.rmsize)
        self.random_process = Myrandom(size=nb_actions)

        # Hyper-parameters
        self.batch_size = args.batch_size
        self.tau = args.tau
        self.discount = args.discount
        # Exploration scale decays linearly from 1.0 by depsilon per action.
        self.depsilon = 1.0 / args.epsilon
        self.epsilon = 1.0

        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.use_cuda = args.cuda

        if self.use_cuda:
            self.cuda()

    def update_policy(self, train_actor=True):
        """Run one critic step, optionally one actor step, then soft-update.

        :param train_actor: when false the actor gradient is computed (and
            possibly clipped/logged) but the actor optimizer is not stepped.
        :returns: ``(-policy_loss, value_loss)``.
        """
        # Sample batch
        (state_batch, action_batch, reward_batch,
         next_state_batch, terminal_batch) = \
            self.memory.sample_batch(self.batch_size)

        # Prepare for the target q batch
        next_q_values = self.critic_target([
            to_tensor(next_state_batch, volatile=True),
            self.actor_target(to_tensor(next_state_batch, volatile=True)),
        ])
        next_q_values.volatile = False

        # (1 - terminal) zeroes the bootstrap term on terminal transitions.
        # `float` replaces the `np.float` alias, which NumPy 1.24 removed.
        target_q_batch = to_tensor(reward_batch) + \
            self.discount * to_tensor((1 - terminal_batch.astype(float))) * next_q_values

        # Critic update
        self.critic.zero_grad()

        q_batch = self.critic([to_tensor(state_batch), to_tensor(action_batch)])
        value_loss = nn.MSELoss()(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()

        # Actor update: ascend the critic's value of the actor's own actions.
        self.actor.zero_grad()

        policy_loss = -self.critic([
            to_tensor(state_batch),
            self.actor(to_tensor(state_batch))
        ])

        policy_loss = policy_loss.mean()
        policy_loss.backward()

        if self.clip_actor_grad is not None:
            torch.nn.utils.clip_grad_norm(self.actor.parameters(),
                                          float(self.clip_actor_grad))

            # Logged only when clipping is enabled, and after the clip.
            if self.writer is not None:
                # Skip parameters that received no gradient instead of
                # crashing on `None.data`.
                grad_norms = [
                    np.linalg.norm(p.grad.data.cpu().numpy().ravel())
                    for p in self.actor.parameters()
                    if p.grad is not None
                ]
                mean_policy_grad = np.array(np.mean(grad_norms))
                self.writer.add_scalar('train/mean_policy_grad',
                                       mean_policy_grad, self.select_time)

        if train_actor:
            self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

        return -policy_loss, value_loss

    def eval(self):
        """Switch all four networks to evaluation mode."""
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def train(self):
        """Switch all four networks to training mode."""
        self.actor.train()
        self.actor_target.train()
        self.critic.train()
        self.critic_target.train()

    def cuda(self):
        """Move all four networks to the GPU."""
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        """Append ``[s_t, a_t, r_t, s_t1, done]`` to memory and advance."""
        self.memory.append([self.s_t, self.a_t, r_t, s_t1, done])
        self.s_t = s_t1

    def random_action(self):
        """Sample a uniform action in [-1, 1) and remember it as ``a_t``."""
        action = np.random.uniform(-1., 1., self.nb_actions)
        self.a_t = action
        return action

    def select_action(self, s_t, decay_epsilon=True, return_fix=False, noise_level=0):
        """Return the actor's action blended with exploration noise.

        The actor runs in eval mode, then all networks are switched back to
        training mode. The effective noise weight is
        ``noise_level * max(epsilon, 0)``; the blended action is clipped to
        [-1, 1] and stored in ``self.a_t``. ``return_fix`` is accepted but
        not used by this implementation.
        """
        self.eval()
        action = to_numpy(
            self.actor(to_tensor(np.array([s_t])))
        ).squeeze(0)
        self.train()

        noise_level = noise_level * max(self.epsilon, 0)

        action = action * (1 - noise_level) + (self.random_process.sample() * noise_level)
        action = np.clip(action, -1., 1.)

        if decay_epsilon:
            self.epsilon -= self.depsilon

        self.a_t = action
        return action

    def reset(self, obs):
        """Start a new episode from ``obs`` and reset the noise process."""
        self.s_t = obs
        self.random_process.reset_status()

    def load_weights(self, output, num=1):
        """Load checkpoint ``num`` from ``output`` into online and target nets."""
        if output is None:
            return
        self.actor.load_state_dict(
            torch.load('{}/actor{}.pkl'.format(output, num))
        )
        self.actor_target.load_state_dict(
            torch.load('{}/actor{}.pkl'.format(output, num))
        )
        self.critic.load_state_dict(
            torch.load('{}/critic{}.pkl'.format(output, num))
        )
        self.critic_target.load_state_dict(
            torch.load('{}/critic{}.pkl'.format(output, num))
        )

    def save_model(self, output, num):
        """Save actor/critic as checkpoint ``num`` in directory ``output``.

        When running on CUDA the networks are moved to the CPU for saving and
        moved back afterwards.
        """
        if self.use_cuda:
            self.actor.cpu()
            self.critic.cpu()
        torch.save(
            self.actor.state_dict(),
            '{}/actor{}.pkl'.format(output, num)
        )
        torch.save(
            self.critic.state_dict(),
            '{}/critic{}.pkl'.format(output, num)
        )
        if self.use_cuda:
            self.actor.cuda()
            self.critic.cuda()
class DDPG(object):
    """DDPG agent that receives its replay memory and action noise from the caller.

    The constructor does not synchronise the target networks with the online
    networks; :meth:`initialize` performs that hard copy.
    """

    def __init__(self, memory, nb_status, nb_actions, action_noise=None,
                 gamma=0.99, tau=0.001, normalize_observations=True,
                 batch_size=128, observation_range=(-5., 5.),
                 action_range=(-1., 1.), actor_lr=1e-4, critic_lr=1e-3):
        """Build networks and optimizers and store the hyper-parameters.

        :param memory: replay buffer offering ``append`` and ``sample``.
        :param nb_status: state dimensionality handed to ``Actor``/``Critic``.
        :param nb_actions: action dimensionality.
        :param action_noise: callable returning a noise array (with a
            ``reset`` method), or None for no exploration noise.
        :param gamma: discount factor.
        :param tau: soft target-update coefficient.
        :param normalize_observations: when true a ``RunningMeanStd`` is kept
            and updated by :meth:`store_transition`.
        :param batch_size: number of transitions sampled per training step.
        :param observation_range: stored as an attribute; not used by the
            methods of this class.
        :param action_range: ``(low, high)`` bounds used to clip actions.
        :param actor_lr: Adam learning rate for the actor.
        :param critic_lr: Adam learning rate for the critic.
        """
        self.nb_status = nb_status
        self.nb_actions = nb_actions
        self.action_range = action_range
        self.observation_range = observation_range
        self.normalize_observations = normalize_observations

        self.actor = Actor(self.nb_status, self.nb_actions)
        self.actor_target = Actor(self.nb_status, self.nb_actions)
        self.actor_optim = Adam(self.actor.parameters(), lr=actor_lr)

        self.critic = Critic(self.nb_status, self.nb_actions)
        self.critic_target = Critic(self.nb_status, self.nb_actions)
        self.critic_optim = Adam(self.critic.parameters(), lr=critic_lr)

        # Replay buffer and exploration noise are supplied by the caller.
        self.memory = memory
        self.action_noise = action_noise

        # Hyper-parameters
        self.batch_size = batch_size
        self.tau = tau
        self.discount = gamma

        if self.normalize_observations:
            self.obs_rms = RunningMeanStd()
        else:
            self.obs_rms = None

    def pi(self, obs, apply_noise=True, compute_Q=True):
        """Return ``(action, q)`` for a single observation.

        :param obs: one observation (a batch axis is added internally).
        :param apply_noise: add ``action_noise()`` when a noise source is set.
        :param compute_Q: evaluate the critic on the un-noised action.
        :returns: the action clipped to ``action_range`` and the critic's
            scalar estimate, or ``None`` in its place when ``compute_Q`` is
            false (this used to raise ``TypeError`` on ``None[0][0]``).
        """
        obs = np.array([obs])
        action = to_numpy(self.actor(to_tensor(obs))).squeeze(0)
        if compute_Q:
            # NOTE(review): `action` has lost its batch axis here while `obs`
            # still has one - confirm the critic accepts this pairing.
            q = self.critic([to_tensor(obs), to_tensor(action)]).cpu().data
        else:
            q = None

        if self.action_noise is not None and apply_noise:
            noise = self.action_noise()
            assert noise.shape == action.shape
            action += noise

        action = np.clip(action, self.action_range[0], self.action_range[1])
        q_value = q[0][0] if q is not None else None
        return action, q_value

    def store_transition(self, obs0, action, reward, obs1, terminal1):
        """Append a transition to memory and update observation statistics."""
        self.memory.append(obs0, action, reward, obs1, terminal1)
        if self.normalize_observations:
            self.obs_rms.update(np.array([obs0]))

    def train(self):
        """Run one critic step, one actor step and a soft target update.

        :returns: ``(value_loss, policy_loss)`` as scalars read from the
            loss tensors.
        """
        # Get a batch.
        batch = self.memory.sample(batch_size=self.batch_size)

        next_q_values = self.critic_target([
            to_tensor(batch['obs1'], volatile=True),
            self.actor_target(to_tensor(batch['obs1'], volatile=True))])
        next_q_values.volatile = False

        # (1 - terminal) zeroes the bootstrap term on terminal transitions.
        target_q_batch = to_tensor(batch['rewards']) + \
            self.discount * to_tensor(1 - batch['terminals1'].astype('float32')) * next_q_values

        self.critic.zero_grad()
        q_batch = self.critic([to_tensor(batch['obs0']), to_tensor(batch['actions'])])
        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()

        # Actor update: ascend the critic's value of the actor's own actions.
        self.actor.zero_grad()
        policy_loss = -self.critic([to_tensor(batch['obs0']),
                                    self.actor(to_tensor(batch['obs0']))]).mean()
        policy_loss.backward()
        self.actor_optim.step()

        # Target update
        self.update_target_net()

        return value_loss.cpu().data[0], policy_loss.cpu().data[0]

    def initialize(self):
        """Hard-copy the online weights into both target networks."""
        hard_update(self.actor_target, self.actor)
        hard_update(self.critic_target, self.critic)

    def update_target_net(self):
        """Soft-update both target networks towards the online networks."""
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

    def reset(self):
        """Reset the exploration noise source, if there is one."""
        if self.action_noise is not None:
            self.action_noise.reset()

    def cuda(self):
        """Move all four networks to the GPU."""
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()
class Agent(object):
    """Actor/critic container with OU exploration noise for action selection.

    The class only builds the networks and selects actions; it holds no
    optimizers or replay memory. The actor is expected to return a pair whose
    first element is the action (see :meth:`select_action`).
    """

    def __init__(self, nb_states, nb_actions, args):
        """Build the networks and the exploration noise process.

        ``args`` must provide ``seed``, ``init_w``, ``ou_theta``, ``ou_mu``,
        ``ou_sigma``, ``bsize``, ``trajectory_length``, ``tau``,
        ``discount`` and ``epsilon``.
        """
        if args.seed > 0:
            self.seed(args.seed)

        self.nb_states = nb_states
        self.nb_actions = nb_actions

        # Create Actor and Critic Network
        self.actor = Actor(self.nb_states, self.nb_actions, args.init_w)
        self.actor_target = Actor(self.nb_states, self.nb_actions, args.init_w)

        self.critic = Critic(self.nb_states, self.nb_actions, args.init_w)
        self.critic_target = Critic(self.nb_states, self.nb_actions, args.init_w)

        # Make sure the targets start with the same weights as the online nets.
        hard_update(self.actor_target, self.actor)
        hard_update(self.critic_target, self.critic)

        # Exploration noise
        self.random_process = OrnsteinUhlenbeckProcess(size=nb_actions,
                                                       theta=args.ou_theta,
                                                       mu=args.ou_mu,
                                                       sigma=args.ou_sigma)

        # Hyper-parameters
        self.batch_size = args.bsize
        self.trajectory_length = args.trajectory_length
        self.tau = args.tau
        self.discount = args.discount
        # Exploration scale decays linearly from 1.0 by depsilon per action.
        self.depsilon = 1.0 / args.epsilon
        self.epsilon = 1.0

        self.is_training = True

        if USE_CUDA:
            self.cuda()

    def seed(self, s):
        """Seed torch (and the CUDA RNG when USE_CUDA is set).

        ``__init__`` calls this for positive seeds; without it construction
        raised ``AttributeError``.
        """
        torch.manual_seed(s)
        if USE_CUDA:
            torch.cuda.manual_seed(s)

    def eval(self):
        """Switch all four networks to evaluation mode."""
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def random_action(self):
        """Sample a uniform action in [-1, 1)."""
        action = np.random.uniform(-1., 1., self.nb_actions)
        return action

    def select_action(self, state, noise_enable=True, decay_epsilon=True):
        """Return the actor's action for ``state``, optionally with noise.

        :param state: single state (a batch axis is added internally).
        :param noise_enable: add OU noise scaled by ``max(epsilon, 0)``
            (noise is also suppressed when ``is_training`` is falsy).
        :param decay_epsilon: reduce ``epsilon`` by ``depsilon`` afterwards.
        :returns: the action clipped to [-1, 1].
        """
        # The actor returns a pair; only the action part is used here.
        action, _ = self.actor(to_tensor(np.array([state])))
        action = to_numpy(action).squeeze(0)
        if noise_enable:
            action += self.is_training * max(self.epsilon, 0) * self.random_process.sample()

        action = np.clip(action, -1., 1.)
        if decay_epsilon:
            self.epsilon -= self.depsilon

        return action

    def reset_lstm_hidden_state(self, done=True):
        """Forward the hidden-state reset request to the actor."""
        self.actor.reset_lstm_hidden_state(done)

    def reset(self):
        """Reset the exploration noise process."""
        self.random_process.reset_states()

    def cuda(self):
        """Move all four networks to the GPU."""
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def load_weights(self, output):
        """Load actor/critic weights from ``output``.

        :returns: False when ``output`` is None (nothing loaded), else True.
        """
        if output is None:
            return False

        self.actor.load_state_dict(torch.load('{}/actor.pkl'.format(output)))
        self.critic.load_state_dict(torch.load('{}/critic.pkl'.format(output)))

        return True

    def save_model(self, output):
        """Write actor and critic state dicts into directory ``output``.

        The directory, including missing parents, is created on demand.
        """
        # makedirs handles nested paths and an already-existing directory,
        # where exists() + mkdir() failed or raced.
        os.makedirs(output, exist_ok=True)

        torch.save(self.actor.state_dict(), '{}/actor.pkl'.format(output))
        torch.save(self.critic.state_dict(), '{}/critic.pkl'.format(output))
class DDPG(object):
    """DDPG agent with optional image input (CNN encoder) and discrete actions.

    When ``args.pic`` is set, observations are passed through a ``CNN``
    (with its own target copy and optimizer) before reaching the actor and
    critic. When ``args.discrete`` is set, the action vector is reduced to
    its argmax on return.
    """

    def __init__(self, nb_status, nb_actions, args, writer):
        """Build networks, optimizers, replay memory and exploration noise.

        :param nb_status: per-frame state size; multiplied by
            ``args.window_length``, or replaced by ``args.pic_status`` when
            ``args.pic`` is set.
        :param nb_actions: action dimensionality.
        :param args: namespace providing ``clip_actor_grad``,
            ``window_length``, ``discrete``, ``pic``, ``pic_status``,
            ``hidden1``, ``hidden2``, ``bn``, ``init_method``, ``crate``,
            ``prate``, ``rate``, ``rmsize``, ``batch_size``, ``tau``,
            ``discount``, ``epsilon`` and ``cuda``.
        :param writer: object with ``add_scalar`` used for logging, or None.
        """
        self.clip_actor_grad = args.clip_actor_grad
        self.nb_status = nb_status * args.window_length
        self.nb_actions = nb_actions
        self.discrete = args.discrete
        self.pic = args.pic
        self.writer = writer
        # Step index passed to writer.add_scalar; this class never changes it.
        self.select_time = 0
        if self.pic:
            self.nb_status = args.pic_status

        # Create Actor and Critic Network
        net_cfg = {
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
            'use_bn': args.bn,
            'init_method': args.init_method,
        }
        if args.pic:
            # The CNN attributes exist only in picture mode.
            self.cnn = CNN(1, args.pic_status)
            self.cnn_target = CNN(1, args.pic_status)
            self.cnn_optim = Adam(self.cnn.parameters(), lr=args.crate)
        self.actor = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_target = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        # Make sure the targets start with the same weights as the online nets.
        hard_update(self.actor_target, self.actor)
        hard_update(self.critic_target, self.critic)
        if args.pic:
            hard_update(self.cnn_target, self.cnn)

        # Create replay buffer
        self.memory = rpm(args.rmsize)
        self.random_process = Myrandom(size=nb_actions)

        # Hyper-parameters
        self.batch_size = args.batch_size
        self.tau = args.tau
        self.discount = args.discount
        # Exploration probability scale decays linearly from 1.0.
        self.depsilon = 1.0 / args.epsilon
        self.epsilon = 1.0

        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.use_cuda = args.cuda

        if self.use_cuda:
            self.cuda()

    def normalize(self, pic):
        """Reorder an image's axes from (0, 1, 2) to (2, 0, 1)."""
        pic = pic.swapaxes(0, 2).swapaxes(1, 2)
        return pic

    def update_policy(self):
        """Run one critic step, one actor step and a soft target update.

        In picture mode the CNN optimizer is stepped alongside both the
        critic and the actor, and the CNN target is soft-updated too.

        :returns: ``(-policy_loss, value_loss)``.
        """
        # Sample batch
        (state_batch, action_batch, reward_batch,
         next_state_batch, terminal_batch) = \
            self.memory.sample_batch(self.batch_size)

        # Prepare for the target q batch
        if self.pic:
            state_batch = np.array([self.normalize(x) for x in state_batch])
            state_batch = to_tensor(state_batch, volatile=True)
            state_batch = self.cnn(state_batch)
            next_state_batch = np.array([self.normalize(x) for x in next_state_batch])
            next_state_batch = to_tensor(next_state_batch, volatile=True)
            next_state_batch = self.cnn_target(next_state_batch)
            next_q_values = self.critic_target([
                next_state_batch,
                self.actor_target(next_state_batch)
            ])
        else:
            next_q_values = self.critic_target([
                to_tensor(next_state_batch, volatile=True),
                self.actor_target(to_tensor(next_state_batch, volatile=True)),
            ])
        next_q_values.volatile = False

        # (1 - terminal) zeroes the bootstrap term on terminal transitions.
        # `float` replaces the `np.float` alias, which NumPy 1.24 removed.
        target_q_batch = to_tensor(reward_batch) + \
            self.discount * to_tensor((1 - terminal_batch.astype(float))) * next_q_values

        # Critic update
        self.critic.zero_grad()
        if self.pic:
            self.cnn.zero_grad()

        if self.pic:
            state_batch.volatile = False
            q_batch = self.critic([state_batch, to_tensor(action_batch)])
        else:
            q_batch = self.critic([to_tensor(state_batch), to_tensor(action_batch)])

        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()
        if self.pic:
            self.cnn_optim.step()

        # Actor update: ascend the critic's value of the actor's own actions.
        self.actor.zero_grad()
        if self.pic:
            self.cnn.zero_grad()

        if self.pic:
            state_batch.volatile = False
            policy_loss = -self.critic([
                state_batch,
                self.actor(state_batch)
            ])
        else:
            policy_loss = -self.critic([
                to_tensor(state_batch),
                self.actor(to_tensor(state_batch))
            ])

        policy_loss = policy_loss.mean()
        policy_loss.backward()

        if self.clip_actor_grad is not None:
            torch.nn.utils.clip_grad_norm(self.actor.parameters(),
                                          float(self.clip_actor_grad))

            # Logged only when clipping is enabled, and after the clip.
            if self.writer is not None:
                mean_policy_grad = np.array(np.mean([
                    np.linalg.norm(p.grad.data.cpu().numpy().ravel())
                    for p in self.actor.parameters()
                ]))
                self.writer.add_scalar('train/mean_policy_grad',
                                       mean_policy_grad, self.select_time)

        self.actor_optim.step()
        if self.pic:
            self.cnn_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)
        if self.pic:
            soft_update(self.cnn_target, self.cnn, self.tau)

        return -policy_loss, value_loss

    def eval(self):
        """Switch every network (CNNs included in picture mode) to eval mode."""
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()
        if self.pic:
            self.cnn.eval()
            self.cnn_target.eval()

    def train(self):
        """Switch every network (CNNs included in picture mode) to train mode."""
        self.actor.train()
        self.actor_target.train()
        self.critic.train()
        self.critic_target.train()
        if self.pic:
            self.cnn.train()
            self.cnn_target.train()

    def cuda(self):
        """Move every network to the GPU.

        The CNNs are only touched in picture mode; they do not exist
        otherwise and the unconditional access used to raise
        ``AttributeError``.
        """
        if self.pic:
            self.cnn.cuda()
            self.cnn_target.cuda()
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        """Append ``[s_t, a_t, r_t, s_t1, done]`` to memory and advance."""
        self.memory.append([self.s_t, self.a_t, r_t, s_t1, done])
        self.s_t = s_t1

    def random_action(self, fix=False):
        """Sample a uniform action vector in [-1, 1) and store it in ``a_t``.

        :param fix: when true always return the raw vector; otherwise a
            discrete agent returns the argmax index instead.
        """
        action = np.random.uniform(-1., 1., self.nb_actions)
        self.a_t = action
        if self.discrete and not fix:
            action = action.argmax()
        return action

    def select_action(self, s_t, decay_epsilon=True, return_fix=False, noise_level=0):
        """Pick an action epsilon-greedily.

        With probability ``noise_level * max(epsilon, 0)`` the network action
        is replaced by a uniform random vector. The networks run in eval mode
        and are switched back to training mode afterwards.

        :param s_t: current observation (an image in picture mode).
        :param decay_epsilon: reduce ``epsilon`` by ``depsilon`` afterwards.
        :param return_fix: return the raw action vector even when discrete.
        :param noise_level: base probability of taking a random action.
        :returns: the action vector, or its argmax for a discrete agent
            unless ``return_fix`` is set.
        """
        self.eval()
        if self.pic:
            s_t = self.normalize(s_t)
            s_t = self.cnn(to_tensor(np.array([s_t])))
            # NOTE(review): picture mode queries actor_target rather than
            # actor - confirm this is intentional.
            action = to_numpy(
                self.actor_target(s_t)
            ).squeeze(0)
        else:
            action = to_numpy(
                self.actor(to_tensor(np.array([s_t])))
            ).squeeze(0)
        self.train()

        noise_level = noise_level * max(self.epsilon, 0)

        if np.random.uniform(0, 1) < noise_level:
            action = self.random_action(fix=True)  # epsilon greedy

        if decay_epsilon:
            self.epsilon -= self.depsilon

        self.a_t = action
        if return_fix:
            return action
        if self.discrete:
            return action.argmax()
        return action

    def reset(self, obs):
        """Start a new episode from ``obs`` and reset the noise process."""
        self.s_t = obs
        self.random_process.reset_status()

    def load_weights(self, output, num=1):
        """Load checkpoint ``num`` from ``output`` into online and target nets."""
        if output is None:
            return
        self.actor.load_state_dict(
            torch.load('{}/actor{}.pkl'.format(output, num))
        )
        self.actor_target.load_state_dict(
            torch.load('{}/actor{}.pkl'.format(output, num))
        )
        self.critic.load_state_dict(
            torch.load('{}/critic{}.pkl'.format(output, num))
        )
        self.critic_target.load_state_dict(
            torch.load('{}/critic{}.pkl'.format(output, num))
        )

    def save_model(self, output, num):
        """Save actor/critic as checkpoint ``num`` in directory ``output``.

        When running on CUDA the networks are moved to the CPU for saving and
        moved back afterwards. The CNN is only moved in picture mode, since
        it does not exist otherwise.
        """
        # NOTE(review): the CNN weights are moved but never written to disk,
        # and load_weights does not restore them - confirm whether that is
        # intended.
        if self.use_cuda:
            if self.pic:
                self.cnn.cpu()
            self.actor.cpu()
            self.critic.cpu()
        torch.save(
            self.actor.state_dict(),
            '{}/actor{}.pkl'.format(output, num)
        )
        torch.save(
            self.critic.state_dict(),
            '{}/critic{}.pkl'.format(output, num)
        )
        if self.use_cuda:
            if self.pic:
                self.cnn.cuda()
            self.actor.cuda()
            self.critic.cuda()
# Script fragment: reads the action size and agent count, builds the actor and
# critic, and optionally restores them from a checkpoint. `brain`, `env`,
# `default_brain`, `num_inputs`, `args` and `device` are defined outside this
# fragment.
num_actions = brain.vector_action_space_size
num_agent = env._n_agents[default_brain]
print('state size:', num_inputs)
print('action size:', num_actions)
print('agent count:', num_agent)
writer = SummaryWriter(args.logdir)
# running average of state
running_state = ZFilter((num_agent, num_inputs), clip=5)
actor = Actor(num_inputs, num_actions, args).to(device)
critic = Critic(num_inputs, args).to(device)
# NOTE(review): the models were already moved with .to(device) above; this
# second move puts them on CUDA whenever it is available, whatever `device`
# is - confirm that is intended.
if torch.cuda.is_available():
    actor = actor.cuda()
    critic = critic.cuda()
if args.load_model is not None:
    # Checkpoints are looked up under ./save_model relative to the cwd.
    saved_ckpt_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model))
    # Load onto the CPU first; load_state_dict copies into the live models.
    ckpt = torch.load(saved_ckpt_path, map_location='cpu')
    actor.load_state_dict(ckpt['actor'])
    critic.load_state_dict(ckpt['critic'])
    # Restore the state filter's running statistics from the checkpoint.
    running_state.rs.n = ckpt['z_filter_n']
    running_state.rs.mean = ckpt['z_filter_m']
    running_state.rs.sum_square = ckpt['z_filter_s']
    print("Loaded OK ex. Zfilter N {}".format(running_state.rs.n))
class DDPG(object):
    """DDPG agent with an optional CNN front-end for picture observations.

    The agent owns an actor and a critic, a target copy of each, one Adam
    optimizer per trained network, a replay memory and an epsilon schedule
    used to scale exploration noise.  When ``args.pic`` is truthy a CNN (plus
    target copy and optimizer) is created as well and observations are passed
    through it before reaching the actor/critic.
    """

    def __init__(self, nb_status, nb_actions, args, writer):
        """Build networks, optimizers, replay memory and hyper-parameters.

        Args:
            nb_status: Size of a single observation; multiplied by
                ``args.window_length`` (ignored in picture mode, where
                ``args.pic_status`` is used instead).
            nb_actions: Size of the action vector produced by the actor.
            args: Namespace-like object providing the attributes read below
                (``clip_actor_grad``, ``window_length``, ``discrete``, ``pic``,
                ``pic_status``, ``hidden1``, ``hidden2``, ``bn``,
                ``init_method``, ``crate``, ``prate``, ``rate``, ``rmsize``,
                ``batch_size``, ``tau``, ``discount``, ``epsilon``, ``cuda``).
            writer: Scalar logger exposing ``add_scalar`` or ``None`` to
                disable logging.
        """
        self.clip_actor_grad = args.clip_actor_grad
        self.nb_status = nb_status * args.window_length
        self.nb_actions = nb_actions
        self.discrete = args.discrete
        self.pic = args.pic
        self.writer = writer
        # Step index passed to the writer; this class never increments it.
        self.select_time = 0
        if self.pic:
            self.nb_status = args.pic_status

        # Create Actor and Critic Network
        net_cfg = {
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
            'use_bn': args.bn,
            'init_method': args.init_method,
        }
        if args.pic:
            self.cnn = CNN(1, args.pic_status)
            self.cnn_target = CNN(1, args.pic_status)
            self.cnn_optim = Adam(self.cnn.parameters(), lr=args.crate)
        self.actor = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_target = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        # Make sure each target starts with the same weights as its source.
        hard_update(self.actor_target, self.actor)
        hard_update(self.critic_target, self.critic)
        if args.pic:
            hard_update(self.cnn_target, self.cnn)

        # Create replay buffer
        self.memory = rpm(args.rmsize)
        self.random_process = Myrandom(size=nb_actions)

        # Hyper-parameters
        self.batch_size = args.batch_size
        self.tau = args.tau
        self.discount = args.discount
        # Amount subtracted from epsilon on every decaying select_action call.
        self.depsilon = 1.0 / args.epsilon

        self.epsilon = 1.0
        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.use_cuda = args.cuda

        if self.use_cuda:
            self.cuda()

    def normalize(self, pic):
        """Return ``pic`` with its last axis moved to the front (a view)."""
        pic = pic.swapaxes(0, 2).swapaxes(1, 2)
        return pic

    def update_policy(self):
        """Run one critic update, one actor update and the soft target updates.

        Returns:
            Tuple ``(-policy_loss, value_loss)`` where ``policy_loss`` is the
            mean of the negated critic output for the actor's actions and
            ``value_loss`` is ``criterion(q_batch, target_q_batch)``.
        """
        # Sample batch
        state_batch, action_batch, reward_batch, \
            next_state_batch, terminal_batch = self.memory.sample_batch(self.batch_size)

        # Prepare for the target q batch
        if self.pic:
            state_batch = np.array([self.normalize(x) for x in state_batch])
            state_batch = to_tensor(state_batch, volatile=True)
            state_batch = self.cnn(state_batch)
            next_state_batch = np.array(
                [self.normalize(x) for x in next_state_batch])
            next_state_batch = to_tensor(next_state_batch, volatile=True)
            next_state_batch = self.cnn_target(next_state_batch)
            next_q_values = self.critic_target(
                [next_state_batch, self.actor_target(next_state_batch)])
        else:
            next_q_values = self.critic_target([
                to_tensor(next_state_batch, volatile=True),
                self.actor_target(to_tensor(next_state_batch, volatile=True)),
            ])
        next_q_values.volatile = False

        # `np.float` was only an alias of the builtin and no longer exists in
        # current NumPy; the builtin `float` gives the same float64 cast.
        not_terminal = to_tensor(1 - terminal_batch.astype(float))
        target_q_batch = to_tensor(reward_batch) + \
            self.discount * not_terminal * next_q_values

        # Critic update
        self.critic.zero_grad()
        if self.pic:
            self.cnn.zero_grad()

        if self.pic:
            state_batch.volatile = False
            q_batch = self.critic([state_batch, to_tensor(action_batch)])
        else:
            q_batch = self.critic(
                [to_tensor(state_batch), to_tensor(action_batch)])

        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()
        if self.pic:
            self.cnn_optim.step()

        # Actor update: minimise the negated critic value of the actor's action.
        self.actor.zero_grad()
        if self.pic:
            self.cnn.zero_grad()

        if self.pic:
            state_batch.volatile = False
            policy_loss = -self.critic([state_batch, self.actor(state_batch)])
        else:
            policy_loss = -self.critic(
                [to_tensor(state_batch), self.actor(to_tensor(state_batch))])

        policy_loss = policy_loss.mean()
        policy_loss.backward()

        if self.clip_actor_grad is not None:
            torch.nn.utils.clip_grad_norm(self.actor.parameters(),
                                          float(self.clip_actor_grad))

        # Log the mean per-parameter gradient norm of the actor.
        if self.writer is not None:
            mean_policy_grad = np.array(
                np.mean([
                    np.linalg.norm(p.grad.data.cpu().numpy().ravel())
                    for p in self.actor.parameters()
                ]))
            self.writer.add_scalar('train/mean_policy_grad', mean_policy_grad,
                                   self.select_time)

        self.actor_optim.step()
        if self.pic:
            self.cnn_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)
        if self.pic:
            soft_update(self.cnn_target, self.cnn, self.tau)

        return -policy_loss, value_loss

    def eval(self):
        """Switch every owned network to evaluation mode."""
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()
        if self.pic:
            self.cnn.eval()
            self.cnn_target.eval()

    def train(self):
        """Switch every owned network to training mode."""
        self.actor.train()
        self.actor_target.train()
        self.critic.train()
        self.critic_target.train()
        if self.pic:
            self.cnn.train()
            self.cnn_target.train()

    def cuda(self):
        """Move every owned network to the GPU."""
        # The CNN pair only exists in picture mode (see __init__).
        if self.pic:
            self.cnn.cuda()
            self.cnn_target.cuda()
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        """Store the transition ending in ``s_t1`` and advance the state.

        Args:
            r_t: Reward received for the most recent action.
            s_t1: Observation that followed the most recent action.
            done: Whether the episode terminated on this step.
        """
        self.memory.append([self.s_t, self.a_t, r_t, s_t1, done])
        self.s_t = s_t1

    def random_action(self, fix=False):
        """Draw a uniform random action in ``[-1, 1)`` and remember it.

        Args:
            fix: When true, the raw action vector is kept even for discrete
                agents instead of being reduced with ``argmax``.

        Returns:
            The sampled action (vector, or index for discrete agents when
            ``fix`` is false); in picture mode the two slices ``[:16]`` and
            ``[16:]`` are each passed through ``softmax`` first.
        """
        action = np.random.uniform(-1., 1., self.nb_actions)
        self.a_t = action
        if self.discrete and not fix:
            action = action.argmax()
        if self.pic:
            # NOTE(review): slicing presumes `action` is still a vector here,
            # i.e. picture mode is not combined with discrete + fix=False.
            action = np.concatenate(
                (softmax(action[:16]), softmax(action[16:])))
        return action

    def select_action(self, s_t, decay_epsilon=True, return_fix=False,
                      noise_level=0):
        """Compute the policy action for ``s_t`` with optional random mixing.

        Args:
            s_t: Current observation.
            decay_epsilon: Subtract ``self.depsilon`` from ``self.epsilon``.
            return_fix: Return the raw action vector even for discrete agents.
            noise_level: Base probability (scaled by ``max(epsilon, 0)``) of
                averaging the policy action with a random action.

        Returns:
            The action vector, or its ``argmax`` for discrete agents when
            ``return_fix`` is false.
        """
        self.eval()
        if self.pic:
            s_t = self.normalize(s_t)
            s_t = self.cnn(to_tensor(np.array([s_t])))
        if self.pic:
            # NOTE(review): picture mode queries the *target* actor while the
            # other branch queries the online actor — confirm this is intended.
            action = to_numpy(self.actor_target(s_t)).squeeze(0)
        else:
            action = to_numpy(
                self.actor(to_tensor(np.array([s_t])))).squeeze(0)
        self.train()
        noise_level = noise_level * max(self.epsilon, 0)

        if np.random.uniform(0, 1) < noise_level:
            action = (action + self.random_action(fix=True)) / 2.

        # epsilon greedy
        if decay_epsilon:
            self.epsilon -= self.depsilon

        # random_action() overwrote a_t with the raw sample; store the action
        # that is actually returned.
        self.a_t = action
        if return_fix:
            return action
        if self.discrete:
            return action.argmax()
        return action

    def reset(self, obs):
        """Start a new episode from ``obs`` and reset the random process."""
        self.s_t = obs
        self.random_process.reset_status()

    def load_weights(self, output, num=1):
        """Load actor/critic weights (and their targets) from ``output``.

        Reads ``{output}/actor{num}.pkl`` and ``{output}/critic{num}.pkl``;
        does nothing when ``output`` is ``None``.  CNN weights are not loaded.
        """
        if output is None:
            return
        actor_path = '{}/actor{}.pkl'.format(output, num)
        critic_path = '{}/critic{}.pkl'.format(output, num)
        self.actor.load_state_dict(torch.load(actor_path))
        self.actor_target.load_state_dict(torch.load(actor_path))
        self.critic.load_state_dict(torch.load(critic_path))
        self.critic_target.load_state_dict(torch.load(critic_path))

    def save_model(self, output, num):
        """Write actor and critic weights to ``{output}/actor|critic{num}.pkl``.

        In CUDA mode the networks are moved to the CPU for saving and back to
        the GPU afterwards.  CNN weights are not written.
        """
        if self.use_cuda:
            # The CNN only exists in picture mode (see __init__).
            if self.pic:
                self.cnn.cpu()
            self.actor.cpu()
            self.critic.cpu()
        torch.save(self.actor.state_dict(),
                   '{}/actor{}.pkl'.format(output, num))
        torch.save(self.critic.state_dict(),
                   '{}/critic{}.pkl'.format(output, num))
        if self.use_cuda:
            if self.pic:
                self.cnn.cuda()
            self.actor.cuda()
            self.critic.cuda()
class DDPG(object):
    """DDPG agent for a gym-style environment with a continuous action space.

    The actor emits actions in a normalised range that ``map_to_action``
    rescales to the environment's ``[low, high]`` bounds.  Target networks are
    refreshed by a hard copy every ``target_update_frequency`` updates.
    """

    def __init__(self, env, mem_size=7 * int(1e3), lr_critic=1e-3,
                 lr_actor=1e-4, epsilon=1., max_epi=1500,
                 epsilon_decay=1. / (1e5), gamma=.99,
                 target_update_frequency=200, batch_size=64,
                 random_process=True, max_step=None):
        """Create networks, optimizers, replay memory and bookkeeping.

        Args:
            env: Environment exposing ``observation_space``, ``action_space``,
                ``reset`` and ``step``.
            mem_size: Capacity passed to ``Experience``.
            lr_critic: Learning rate of the critic optimizer.
            lr_actor: Learning rate (positive) of the actor optimizer.
            epsilon: Initial scale of the exploration noise.
            max_epi: Number of training episodes.
            epsilon_decay: Amount subtracted from epsilon per noisy step.
            gamma: Discount factor.
            target_update_frequency: Number of updates between target copies.
            batch_size: Minibatch size sampled from the replay memory.
            random_process: Whether exploration noise is added during training.
            max_step: If given, overrides ``env._max_episode_steps``.
        """
        self.CUDA = torch.cuda.is_available()

        self.orig_env = env  # for recording
        if max_step is not None:
            self.orig_env._max_episode_steps = max_step
        self.env = self.orig_env
        self.N_S = self.env.observation_space.shape[0]
        self.N_A = self.env.action_space.shape[0]
        self.MAX_EPI = max_epi
        self.LOW = self.env.action_space.low
        self.HIGH = self.env.action_space.high

        self.actor = Actor(self.N_S, self.N_A)
        self.critic = Critic(self.N_S, self.N_A)
        self.target_actor = Actor(self.N_S, self.N_A)
        self.target_critic = Critic(self.N_S, self.N_A)
        self.target_actor.eval()
        self.target_critic.eval()
        self.target_actor.load_state_dict(self.actor.state_dict())
        self.target_critic.load_state_dict(self.critic.state_dict())
        if self.CUDA:
            self.actor.cuda()
            self.critic.cuda()
            self.target_actor.cuda()
            self.target_critic.cuda()

        self.exp = Experience(mem_size)
        self.optim_critic = optim.Adam(self.critic.parameters(), lr=lr_critic)
        # Positive learning rate: the actor update minimises -Q instead of
        # relying on a negative rate, which current optimizers reject.
        self.optim_actor = optim.Adam(self.actor.parameters(), lr=lr_actor)
        self.random_process = OrnsteinUhlenbeckProcess(
            size=self.N_A, theta=.15, mu=0, sigma=.2)
        self.EPSILON = epsilon
        self.EPSILON_DECAY = epsilon_decay
        self.GAMMA = gamma
        self.TARGET_UPDATE_FREQUENCY = target_update_frequency
        self.BATCH_SIZE = batch_size

        title = {common.S_EPI: [], common.S_TOTAL_R: []}
        self.data = pd.DataFrame(title)
        self.RAND_PROC = random_process

    @staticmethod
    def _append_row(frame, epi, acc_r):
        """Return ``frame`` with one (episode, total reward) row appended."""
        row = pd.Series([epi, acc_r], index=[common.S_EPI, common.S_TOTAL_R])
        # pd.concat replaces DataFrame.append, which newer pandas removed.
        return pd.concat([frame, row.to_frame().T], ignore_index=True)

    def train(self, dir=None, interval=1000):
        """Train for ``MAX_EPI`` episodes and record per-episode returns.

        Args:
            dir: Optional output directory.  When given, the environment is
                wrapped in a recording monitor and model snapshots are written
                to ``<dir>/models`` every ``interval`` episodes.
            interval: Number of episodes between model snapshots.
        """
        if dir is not None:
            self.env = wrappers.Monitor(self.orig_env,
                                        '{}/train_record'.format(dir),
                                        force=True)
            models_dir = os.path.join(dir, 'models')
            # Re-running into the same directory must not crash on mkdir.
            if not os.path.isdir(models_dir):
                os.mkdir(models_dir)

        update_counter = 0
        epsilon = self.EPSILON
        for epi in trange(self.MAX_EPI, desc='train epi', leave=True):
            self.random_process.reset_states()
            o = self.env.reset()

            acc_r = 0
            while True:
                a = self.choose_action(o)

                if self.RAND_PROC:
                    a += max(epsilon, 0) * self.random_process.sample()
                    a = np.clip(a, -1., 1.)
                    epsilon -= self.EPSILON_DECAY

                o_, r, done, info = self.env.step(self.map_to_action(a))
                self.exp.push(o, a, r, o_, done)

                # The first episode only fills the replay memory.
                if epi > 0:
                    self.update_actor_critic()
                    update_counter += 1
                    if update_counter % self.TARGET_UPDATE_FREQUENCY == 0:
                        self.update_target()

                acc_r += r
                o = o_
                if done:
                    break

            if dir is not None and (epi + 1) % interval == 0:
                self.save(os.path.join(dir, 'models'), str(epi + 1),
                          save_data=False)

            self.data = self._append_row(self.data, epi, acc_r)

    def choose_action(self, state):
        """Return the actor's action for ``state`` as a float64 NumPy array."""
        self.actor.eval()
        s = Variable(torch.Tensor(state)).unsqueeze(0)
        if self.CUDA:
            s = s.cuda()
        a = self.actor(s).data.cpu().numpy()[0].astype('float64')
        self.actor.train()
        return a

    def map_to_action(self, a):
        """Affinely map ``a`` from ``[-1, 1]`` onto ``[LOW, HIGH]``."""
        return (self.LOW + self.HIGH) / 2 + a * (self.HIGH - self.LOW) / 2

    def update_target(self):
        """Hard-copy the online actor/critic weights into the targets."""
        self.target_actor.load_state_dict(self.actor.state_dict())
        self.target_critic.load_state_dict(self.critic.state_dict())

    def update_actor_critic(self):
        """Run one critic and one actor gradient step on a sampled minibatch."""
        # sample minibatch
        minibatch = common.Transition(*zip(*self.exp.sample(self.BATCH_SIZE)))
        bat_o = Variable(torch.Tensor(minibatch.state))
        bat_a = Variable(torch.Tensor(minibatch.action))
        bat_r = Variable(torch.Tensor(minibatch.reward)).unsqueeze(1)
        bat_o_ = Variable(torch.Tensor(minibatch.next_state))
        # 1.0 for non-terminal transitions, 0.0 for terminal ones.
        bat_not_done = Variable(torch.Tensor(
            [0. if done else 1. for done in minibatch.done])).unsqueeze(1)
        if self.CUDA:
            bat_o = bat_o.cuda()
            bat_a = bat_a.cuda()
            bat_r = bat_r.cuda()
            bat_o_ = bat_o_.cuda()
            bat_not_done = bat_not_done.cuda()

        # update critic
        bat_a_o_ = self.target_actor(bat_o_)

        # Bootstrapped target; the float mask zeroes the bootstrap term for
        # terminal transitions without byte-mask indexing or in-place writes.
        Gt = bat_r + self.GAMMA * bat_not_done * self.target_critic(
            bat_o_, bat_a_o_)
        Gt = Gt.detach()
        eval_o = self.critic(bat_o, bat_a)
        criterion = nn.MSELoss()
        if self.CUDA:
            criterion.cuda()
        loss = criterion(eval_o, Gt)
        self.optim_critic.zero_grad()
        loss.backward()
        self.optim_critic.step()

        # update actor
        self.critic.eval()
        bat_a_o = self.actor(bat_o)

        # Gradient ascent on Q expressed as descent on -Q.
        actor_loss = -torch.mean(self.critic(bat_o, bat_a_o))
        self.optim_actor.zero_grad()
        actor_loss.backward()
        self.optim_actor.step()
        self.critic.train()

    def test(self, dir=None, n=1):
        """Run ``n`` noise-free episodes and report their total rewards.

        Args:
            dir: Optional output directory.  When given, episodes are recorded
                and results are written to ``<dir>/test_data.csv``; otherwise
                the result table is printed.
            n: Number of evaluation episodes.
        """
        if dir is not None:
            self.env = wrappers.Monitor(self.orig_env,
                                        '{}/test_record'.format(dir),
                                        force=True,
                                        video_callable=lambda episode_id: True)

        title = {common.S_EPI: [], common.S_TOTAL_R: []}
        df = pd.DataFrame(title)

        for epi in trange(n, desc='test epi', leave=True):
            o = self.env.reset()
            acc_r = 0
            while True:
                a = self.choose_action(o)
                o_, r, done, info = self.env.step(self.map_to_action(a))
                acc_r += r
                o = o_
                if done:
                    break
            df = self._append_row(df, epi, acc_r)

        if dir is not None:
            df.to_csv('{}/test_data.csv'.format(dir))
        else:
            print(df)

    def save(self, dir, suffix='', save_data=True):
        """Save actor/critic weights (and optionally training data) in ``dir``."""
        torch.save(self.actor.state_dict(),
                   '{}/actor{}.pt'.format(dir, suffix))
        torch.save(self.critic.state_dict(),
                   '{}/critic{}.pt'.format(dir, suffix))
        if save_data:
            self.data.to_csv('{}/train_data{}.csv'.format(dir, suffix))

    def load_actor(self, dir):
        """Load actor weights from the file path ``dir``."""
        self.actor.load_state_dict(torch.load(dir))

    def load_critic(self, dir):
        """Load critic weights from the file path ``dir``."""
        self.critic.load_state_dict(torch.load(dir))

    def get_data(self):
        """Return the DataFrame of per-episode training results."""
        return self.data