def main():
    print("#######")
    print("WARNING: All rewards are clipped or normalized so you need to use "
          "a monitor (see envs.py) or visdom plot to get true rewards")
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'

    if args.vis:
        from visdom import Visdom
        viz = Visdom()
        win = None

    envs = [make_env(args.env_name, args.seed, i, args.log_dir)
            for i in range(args.num_processes)]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if len(envs.observation_space.shape) == 3:
        actor_critic = Actor(obs_shape[0], envs.action_space,
                             args.recurrent_policy, envs.action_space.n)
        target_actor = Actor(obs_shape[0], envs.action_space,
                             args.recurrent_policy, envs.action_space.n)
        critic = Critic(in_channels=4, num_actions=envs.action_space.n)
        critic_target = Critic(in_channels=4, num_actions=envs.action_space.n)
    else:
        assert not args.recurrent_policy, \
            "Recurrent policy is not implemented for the MLP controller"
        actor_critic = MLPPolicy(obs_shape[0], envs.action_space)

    if args.cuda:
        actor_critic.cuda()
        critic.cuda()
        critic_target.cuda()
        target_actor.cuda()

    if args.algo == 'a2c':
        optimizer = optim.RMSprop(actor_critic.parameters(), args.lr,
                                  eps=args.eps, alpha=args.alpha)
        critic_optim = optim.Adam(critic.parameters(), lr=1e-4)

    gamma = 0.99
    tau = 0.001
    # memory = SequentialMemory(limit=args.rmsize, window_length=args.window_length)
    mem_buffer = ReplayBuffer()

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space, actor_critic.state_size,
                              envs.action_space.n)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)
    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            action, action_log_prob, states = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True),
                Variable(rollouts.states[step], volatile=True),
                Variable(rollouts.masks[step], volatile=True))
            value = critic.forward(
                Variable(rollouts.observations[step], volatile=True),
                action_log_prob)
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Observe reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(
                np.expand_dims(np.stack(reward), 1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor(
                [[0.0] if done_ else [1.0] for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            pre_state = rollouts.observations[step].cpu().numpy()
            update_current_obs(obs)
            mem_buffer.add((pre_state, current_obs,
                            action_log_prob.data.cpu().numpy(), reward, done))
            rollouts.insert(step, current_obs, states.data, action.data,
                            action_log_prob.data, value.data, reward, masks)

        action, action_log_prob, states = actor_critic.act(
            Variable(rollouts.observations[-1], volatile=True),
            Variable(rollouts.states[-1], volatile=True),
            Variable(rollouts.masks[-1], volatile=True))  # [0].data
        next_value = critic.forward(
            Variable(rollouts.observations[-1], volatile=True),
            action_log_prob).data

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        # DDPG-style off-policy update from the replay buffer.
        if True:
            state, next_state, action, reward, done = mem_buffer.sample(5)
            next_state = next_state.reshape([-1, *obs_shape])
            state = state.reshape([-1, *obs_shape])
            action = action.reshape([-1, 6])
            next_q_values = critic_target(
                to_tensor(next_state, volatile=True),
                target_actor(to_tensor(next_state, volatile=True),
                             to_tensor(next_state, volatile=True),
                             to_tensor(next_state, volatile=True))[0])
            next_q_values.volatile = False
            # Bootstrap only from non-terminal next states.
            target_q_batch = to_tensor(reward) + args.gamma * to_tensor(
                1 - done.astype(np.float)) * next_q_values

            critic.zero_grad()
            q_batch = critic(to_tensor(state), to_tensor(action))
            value_loss = criterion(q_batch, target_q_batch)
            value_loss.backward()
            critic_optim.step()

            actor_critic.zero_grad()
            policy_loss = -critic(
                to_tensor(state),
                actor_critic(to_tensor(state), to_tensor(state),
                             to_tensor(state))[0])
            policy_loss = policy_loss.mean()
            policy_loss.backward()
            if args.algo == 'a2c':
                nn.utils.clip_grad_norm(actor_critic.parameters(),
                                        args.max_grad_norm)
            optimizer.step()

            soft_update(target_actor, actor_critic, tau)
            soft_update(critic_target, critic, tau)

        '''
        if args.algo in ['a2c', 'acktr']:
            action_log_probs, probs, dist_entropy, states = \
                actor_critic.evaluate_actions(
                    Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
                    Variable(rollouts.states[0].view(-1, actor_critic.state_size)),
                    Variable(rollouts.masks[:-1].view(-1, 1)),
                    Variable(rollouts.actions.view(-1, action_shape)))
            values = critic.forward(
                Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
                probs).data
            values = values.view(args.num_steps, args.num_processes, 1)
            action_log_probs = action_log_probs.view(
                args.num_steps, args.num_processes, 1)

            # advantages = Variable(rollouts.returns[:-1]) - values
            advantages = rollouts.returns[:-1] - values
            value_loss = advantages.pow(2).mean()
            action_loss = -(Variable(advantages) * action_log_probs).mean()
            # action_loss = -(Variable(advantages.data) * action_log_probs).mean()

            optimizer.zero_grad()
            critic_optim.zero_grad()
            (value_loss * args.value_loss_coef + action_loss -
             dist_entropy * args.entropy_coef).backward()

            if args.algo == 'a2c':
                nn.utils.clip_grad_norm(actor_critic.parameters(),
                                        args.max_grad_norm)

            optimizer.step()
            critic_optim.step()
        '''

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
            save_model = [save_model,
                          hasattr(envs, 'ob_rms') and envs.ob_rms or None]
            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print("Updates {}, num timesteps {}, FPS {}, "
                  "mean/median reward {:.1f}/{:.1f}, "
                  "min/max reward {:.1f}/{:.1f}, "
                  "value loss {:.5f}, policy loss {:.5f}".format(
                      j, total_num_steps,
                      int(total_num_steps / (end - start)),
                      final_rewards.mean(), final_rewards.median(),
                      final_rewards.min(), final_rewards.max(),
                      value_loss.data.cpu().numpy()[0],
                      policy_loss.data.cpu().numpy()[0]))

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo)
            except IOError:
                pass
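

# The loop above relies on soft_update, and the agents below on hard_update
# as well. Neither helper is defined in this excerpt; a minimal sketch
# consistent with how they are called everywhere here (Polyak averaging with
# factor tau, matching the method versions defined near the end of the HER
# agent below) might look like this:
def soft_update(target, source, tau):
    """Polyak-average source into target: t = (1 - tau) * t + tau * s."""
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(target_param.data * (1.0 - tau) +
                                param.data * tau)


def hard_update(target, source):
    """Copy source parameters into target verbatim."""
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(param.data)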
class DDPG:
    def __init__(self, env, state_dim, action_dim):
        self.name = 'DDPG'
        self.env = env
        self.state_dim = state_dim
        self.action_dim = action_dim

        self.AE = Actor(state_dim, action_dim).cuda()   # actor (evaluation) net
        self.CE = Critic(state_dim, action_dim).cuda()  # critic (evaluation) net
        self.AT = Actor(state_dim, action_dim).cuda()   # actor target net
        self.CT = Critic(state_dim, action_dim).cuda()  # critic target net
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)
        self.time_step = 0

        self.AE.load_state_dict(torch.load(MODEL_DIR + '/obs/actor_340000.pkl'))
        # self.AT.load_state_dict(torch.load(MODEL_DIR+'/actor_280000.pkl'))
        # self.CE.load_state_dict(torch.load(MODEL_DIR+'/critic_280000.pkl'))
        # self.CT.load_state_dict(torch.load(MODEL_DIR+'/critic_280000.pkl'))

        self.optimizer_a = torch.optim.Adam(self.AE.parameters(), lr=1e-4)
        self.optimizer_c = torch.optim.Adam(self.CE.parameters(), lr=1e-4)

    def train(self):
        self.AE.train()
        data = self.replay_buffer.get_batch(BATCH_SIZE)
        bs = np.array([da[0] for da in data])
        ba = np.array([da[1] for da in data])
        br = np.array([da[2] for da in data])
        bs_ = np.array([da[3] for da in data])
        bd = np.array([da[4] for da in data])

        bs = torch.FloatTensor(bs).cuda()
        ba = torch.FloatTensor(ba).cuda()
        br = torch.FloatTensor(br).cuda()
        bs_ = torch.FloatTensor(bs_).cuda()

        a_ = self.AT(bs_)
        ####################### NOTICE ########################################
        # q1 = self.CE(bs, a)  # uses the *computed* actions -> policy loss
        q2 = self.CE(bs, ba)   # uses the *stored* action batch -> value loss
        ########################################################################
        q_ = self.CT(bs_, a_).detach()
        q_tar = torch.FloatTensor(BATCH_SIZE)
        for i in range(len(data)):
            if bd[i]:
                q_tar[i] = br[i]
            else:
                q_tar[i] = br[i] + GAMMA * q_[i]
        q_tar = q_tar.view(BATCH_SIZE, 1).cuda()

        # Minimize the MSE between q2 and q_tar, i.e. the TD error.
        td_error = F.mse_loss(q2, q_tar.detach())
        self.CE.zero_grad()
        td_error.backward(retain_graph=True)
        self.optimizer_c.step()

        a = self.AE(bs)
        q1 = self.CE(bs, a)
        a_loss = -torch.mean(q1)  # maximize Q
        self.AE.zero_grad()
        a_loss.backward(retain_graph=True)
        self.optimizer_a.step()

        self.soft_replace()

    def soft_replace(self):
        for t, e in zip(self.AT.parameters(), self.AE.parameters()):
            t.data = (1 - TAU) * t.data + TAU * e.data
        for t, e in zip(self.CT.parameters(), self.CE.parameters()):
            t.data = (1 - TAU) * t.data + TAU * e.data

    def action(self, state):
        self.AE.eval()
        state_tensor = torch.FloatTensor(state).unsqueeze(0).cuda()  # add batch_sz=1
        ac_tensor = self.AE(state_tensor)
        ac = ac_tensor.squeeze(0).cpu().detach().numpy()
        return ac

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t, a_t, r_t, s_{t+1}) in the replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)
        if self.replay_buffer.count() == REPLAY_START_SIZE:
            print('\n---------------Start training---------------')
        # Accumulate transitions up to the replay start size, then train
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.time_step += 1
            self.train()
        if self.time_step % 10000 == 0 and self.time_step > 0:
            torch.save(self.AE.state_dict(),
                       MODEL_DIR + '/obs/actor_{}.pkl'.format(self.time_step))
            torch.save(self.CE.state_dict(),
                       MODEL_DIR + '/obs/critic_{}.pkl'.format(self.time_step))
            print('Saved model state_dicts to the obs dir...')
        return self.time_step
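

# ReplayBuffer is referenced above but not defined in this excerpt. A minimal
# sketch matching the add/get_batch/count calls used by DDPG.perceive and
# DDPG.train (uniform sampling from a bounded deque) could be:
import random
from collections import deque


class ReplayBuffer(object):
    def __init__(self, buffer_size):
        self.buffer = deque(maxlen=buffer_size)

    def add(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def get_batch(self, batch_size):
        return random.sample(self.buffer, batch_size)

    def count(self):
        return len(self.buffer)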
class DDPG(object):
    def __init__(self, nb_actions, nb_states, layer_norm, obs_norm, actor_lr,
                 critic_lr, SGLD_coef, noise_decay, lr_decay, batch_size,
                 discount, tau, pool_size, parameters_noise, action_noise,
                 SGLD_mode, pool_mode, with_cuda):
        self.nb_actions = nb_actions
        self.nb_states = nb_states
        self.layer_norm = layer_norm
        self.parameters_noise = parameters_noise
        self.action_noise = action_noise
        self.batch_size = batch_size
        self.discount = discount
        self.tau = tau
        self.pool_size = pool_size
        self.critic_lr = critic_lr
        self.actor_lr = actor_lr
        self.SGLD_coef = SGLD_coef
        self.noise_coef = 1
        self.noise_decay = noise_decay
        self.lr_coef = 1
        self.lr_decay = lr_decay
        self.SGLD_mode = SGLD_mode
        self.pool_mode = pool_mode
        self.with_cuda = with_cuda

        self.actor = Actor(nb_states=self.nb_states,
                           nb_actions=self.nb_actions,
                           layer_norm=self.layer_norm)
        self.actor_target = Actor(nb_states=self.nb_states,
                                  nb_actions=self.nb_actions,
                                  layer_norm=self.layer_norm)
        self.critic = Critic(nb_states, nb_actions, layer_norm=self.layer_norm)
        self.critic_target = Critic(nb_states, nb_actions,
                                    layer_norm=self.layer_norm)
        if self.with_cuda:
            self.actor.cuda()
            self.actor_target.cuda()
            self.critic.cuda()
            self.critic_target.cuda()

        hard_update(self.actor_target, self.actor)
        hard_update(self.critic_target, self.critic)

        # self.actor_optim = SGD(self.actor.parameters(), lr=actor_lr, momentum=0.9, weight_decay=0.01)
        self.actor_optim = Adam(self.actor.parameters(), lr=actor_lr)
        # self.critic_optim = SGD(self.critic.parameters(), lr=critic_lr, momentum=0.9, weight_decay=0.01)
        self.critic_optim = Adam(self.critic.parameters(), lr=critic_lr)

        self.memory = Memory(int(1e6), (nb_actions, ), (nb_states, ),
                             with_cuda)
        self.obs_norm = obs_norm
        if self.obs_norm:
            self.run_obs_norm = Run_Normalizer((nb_states, ), self.with_cuda)

        self.is_training = True
        if self.pool_size > 0:
            self.agent_pool = Agent_pool(self.pool_size)

        self.s_t = None
        self.a_t = None

    def store_transition(self, s_t, a_t, r_t, s_t1, done_t):
        if self.is_training:
            self.memory.append(s_t, a_t, r_t, s_t1, done_t)
            if self.obs_norm:
                self.run_obs_norm.observe(s_t)
        self.s_t = s_t1

    def update(self):
        # Sample batch
        batch = self.memory.sample(self.batch_size)
        tensor_obs0 = batch['obs0']
        tensor_obs1 = batch['obs1']
        if self.obs_norm:
            tensor_obs0 = self.run_obs_norm.normalize(tensor_obs0)
            tensor_obs1 = self.run_obs_norm.normalize(tensor_obs1)

        # Prepare for the target q batch
        with torch.no_grad():
            next_q_values = self.critic_target([
                tensor_obs1,
                self.actor_target(tensor_obs1),
            ])
            target_q_batch = batch['rewards'] + \
                self.discount * (1 - batch['terminals1']) * next_q_values

        # Critic update
        self.critic.zero_grad()
        q_batch = self.critic([tensor_obs0, batch['actions']])
        value_loss = nn.functional.mse_loss(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()
        if (self.SGLD_mode == 2) or (self.SGLD_mode == 3):
            SGLD_update(self.critic, self.critic_lr * self.lr_coef,
                        self.SGLD_coef)

        # Actor update
        self.actor.zero_grad()
        policy_loss = -self.critic([tensor_obs0, self.actor(tensor_obs0)])
        policy_loss = policy_loss.mean()
        policy_loss.backward()
        self.actor_optim.step()
        if (self.SGLD_mode == 1) or (self.SGLD_mode == 3):
            SGLD_update(self.actor, self.actor_lr * self.lr_coef,
                        self.SGLD_coef)

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)
        return value_loss.item(), policy_loss.item()

    def apply_lr_decay(self):
        if self.lr_decay > 0:
            self.lr_coef = self.lr_decay * self.lr_coef / (self.lr_coef +
                                                           self.lr_decay)
            self.critic_optim.param_groups[0]['lr'] = \
                self.critic_lr * self.lr_coef

    def apply_noise_decay(self):
        if self.noise_decay > 0:
            self.noise_coef = self.noise_decay * self.noise_coef / (
                self.noise_coef + self.noise_decay)

    def select_action(self, random=False, s_t=None, if_noise=True):
        if random:
            action = np.random.uniform(-1., 1., self.nb_actions)
        else:
            if s_t is None:
                raise RuntimeError()
            s_t = torch.tensor(s_t, dtype=torch.float32, requires_grad=False)
            if self.with_cuda:
                s_t = s_t.cuda()
            if self.obs_norm:
                s_t = self.run_obs_norm.normalize(s_t)
            with torch.no_grad():
                action = self.actor(s_t).cpu().numpy().squeeze(0)
            if if_noise and (self.action_noise is not None):
                action += self.is_training * max(self.noise_coef, 0) * \
                    self.action_noise()
            action = np.clip(action, -1., 1.)
        self.a_t = action
        return action

    def load_weights(self, output):
        self.actor = torch.load('{}/actor.pkl'.format(output))
        self.critic = torch.load('{}/critic.pkl'.format(output))
        if self.obs_norm:
            self.run_obs_norm = torch.load('{}/obs_norm.pkl'.format(output))

    def save_model(self, output):
        torch.save(self.actor, '{}/actor.pkl'.format(output))
        torch.save(self.critic, '{}/critic.pkl'.format(output))
        if self.obs_norm:
            torch.save(self.run_obs_norm, '{}/obs_norm.pkl'.format(output))

    def get_actor_buffer(self):
        buffer = io.BytesIO()
        torch.save(self.actor, buffer)
        return buffer

    def get_norm_param(self):
        return self.run_obs_norm.mean.cpu(), self.run_obs_norm.var.cpu()

    # TODO: recode agent pool
    def append_actor(self):
        self.agent_pool.actor_append(self.actor.state_dict(),
                                     self.actor_target.state_dict())

    def pick_actor(self):
        actor, actor_target = self.agent_pool.get_actor()
        self.actor.load_state_dict(actor)
        self.actor_target.load_state_dict(actor_target)

    def append_critic(self):
        self.agent_pool.critic_append(self.critic.state_dict(),
                                      self.critic_target.state_dict())

    def pick_critic(self):
        critic, critic_target = self.agent_pool.get_critic()
        self.critic.load_state_dict(critic)
        self.critic_target.load_state_dict(critic_target)

    def append_actor_critic(self):
        self.agent_pool.actor_append(self.actor.state_dict(),
                                     self.actor_target.state_dict())
        self.agent_pool.critic_append(self.critic.state_dict(),
                                      self.critic_target.state_dict())

    def pick_actor_critic(self):
        actor, actor_target, critic, critic_target = \
            self.agent_pool.get_agent()
        self.actor.load_state_dict(actor)
        self.actor_target.load_state_dict(actor_target)
        self.critic.load_state_dict(critic)
        self.critic_target.load_state_dict(critic_target)

    def append_agent(self):
        if self.pool_mode == 1:
            self.append_actor()
        elif self.pool_mode == 2:
            self.append_critic()
        elif self.pool_mode == 3:
            self.append_actor_critic()

    def pick_agent(self):
        if self.pool_mode == 1:
            self.pick_actor()
        elif self.pool_mode == 2:
            self.pick_critic()
        elif self.pool_mode == 3:
            self.pick_actor_critic()

    def reset(self, obs):
        self.s_t = obs
        if self.action_noise is not None:
            self.action_noise.reset()
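

# SGLD_update is called above but not defined in this excerpt. In stochastic
# gradient Langevin dynamics, each parameter receives Gaussian noise whose
# scale is tied to the step size. A plausible sketch, assuming the noise
# standard deviation is sqrt(2 * lr) scaled by the SGLD coefficient:
import math

import torch


def SGLD_update(net, lr, coef):
    """Inject Langevin noise into every parameter of `net` in-place."""
    with torch.no_grad():
        for param in net.parameters():
            noise = torch.randn_like(param) * math.sqrt(2.0 * lr) * coef
            param.add_(noise)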
class DDPG(object):
    def __init__(self, nb_status, nb_actions, args, writer):
        self.clip_actor_grad = args.clip_actor_grad
        self.nb_status = nb_status * args.window_length
        self.nb_actions = nb_actions
        self.discrete = args.discrete
        self.pic = args.pic
        self.writer = writer
        self.select_time = 0
        if self.pic:
            self.nb_status = args.pic_status

        # Create Actor and Critic Network
        net_cfg = {
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
            'use_bn': args.bn,
            'init_method': args.init_method
        }
        if args.pic:
            self.cnn = CNN(1, args.pic_status)
            self.cnn_target = CNN(1, args.pic_status)
            self.cnn_optim = Adam(self.cnn.parameters(), lr=args.crate)
        self.actor = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_target = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        hard_update(self.actor_target, self.actor)  # Make sure target has the same weights
        hard_update(self.critic_target, self.critic)
        if args.pic:
            hard_update(self.cnn_target, self.cnn)

        # Create replay buffer
        self.memory = rpm(args.rmsize)
        # SequentialMemory(limit=args.rmsize, window_length=args.window_length)
        self.random_process = Myrandom(size=nb_actions)

        # Hyper-parameters
        self.batch_size = args.batch_size
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        self.epsilon = 1.0
        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.use_cuda = args.cuda
        if self.use_cuda:
            self.cuda()

    def normalize(self, pic):
        pic = pic.swapaxes(0, 2).swapaxes(1, 2)
        return pic

    def update_policy(self):
        # Sample batch
        state_batch, action_batch, reward_batch, \
            next_state_batch, terminal_batch = self.memory.sample_batch(self.batch_size)

        # Prepare for the target q batch
        if self.pic:
            state_batch = np.array([self.normalize(x) for x in state_batch])
            state_batch = to_tensor(state_batch, volatile=True)
            state_batch = self.cnn(state_batch)
            next_state_batch = np.array([self.normalize(x)
                                         for x in next_state_batch])
            next_state_batch = to_tensor(next_state_batch, volatile=True)
            next_state_batch = self.cnn_target(next_state_batch)
            next_q_values = self.critic_target([
                next_state_batch,
                self.actor_target(next_state_batch)
            ])
        else:
            next_q_values = self.critic_target([
                to_tensor(next_state_batch, volatile=True),
                self.actor_target(to_tensor(next_state_batch, volatile=True)),
            ])
        next_q_values.volatile = False

        target_q_batch = to_tensor(reward_batch) + \
            self.discount * to_tensor((1 - terminal_batch.astype(np.float))) * next_q_values

        # Critic update
        self.critic.zero_grad()
        if self.pic:
            self.cnn.zero_grad()
            state_batch.volatile = False
            q_batch = self.critic([state_batch, to_tensor(action_batch)])
        else:
            q_batch = self.critic([to_tensor(state_batch),
                                   to_tensor(action_batch)])

        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()
        if self.pic:
            self.cnn_optim.step()

        # Actor update
        self.actor.zero_grad()
        if self.pic:
            self.cnn.zero_grad()
            state_batch.volatile = False
            policy_loss = -self.critic([
                state_batch,
                self.actor(state_batch)
            ])
        else:
            policy_loss = -self.critic([
                to_tensor(state_batch),
                self.actor(to_tensor(state_batch))
            ])
        policy_loss = policy_loss.mean()
        policy_loss.backward()

        if self.clip_actor_grad is not None:
            torch.nn.utils.clip_grad_norm(self.actor.parameters(),
                                          float(self.clip_actor_grad))
            if self.writer is not None:
                mean_policy_grad = np.array(np.mean([
                    np.linalg.norm(p.grad.data.cpu().numpy().ravel())
                    for p in self.actor.parameters()
                ]))
                self.writer.add_scalar('train/mean_policy_grad',
                                       mean_policy_grad, self.select_time)

        self.actor_optim.step()
        if self.pic:
            self.cnn_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)
        if self.pic:
            soft_update(self.cnn_target, self.cnn, self.tau)

        return -policy_loss, value_loss

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()
        if self.pic:
            self.cnn.eval()
            self.cnn_target.eval()

    def train(self):
        self.actor.train()
        self.actor_target.train()
        self.critic.train()
        self.critic_target.train()
        if self.pic:
            self.cnn.train()
            self.cnn_target.train()

    def cuda(self):
        # The CNN submodules only exist when args.pic is set.
        if self.pic:
            self.cnn.cuda()
            self.cnn_target.cuda()
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        self.memory.append([self.s_t, self.a_t, r_t, s_t1, done])
        self.s_t = s_t1

    def random_action(self, fix=False):
        action = np.random.uniform(-1., 1., self.nb_actions)
        self.a_t = action
        if self.discrete and not fix:
            action = action.argmax()
        # if self.pic:
        #     action = np.concatenate((softmax(action[:16]), softmax(action[16:])))
        return action

    def select_action(self, s_t, decay_epsilon=True, return_fix=False,
                      noise_level=0):
        self.eval()
        if self.pic:
            s_t = self.normalize(s_t)
            s_t = self.cnn(to_tensor(np.array([s_t])))
            action = to_numpy(self.actor_target(s_t)).squeeze(0)
        else:
            action = to_numpy(self.actor(
                to_tensor(np.array([s_t])))).squeeze(0)
        self.train()

        noise_level = noise_level * max(self.epsilon, 0)
        if np.random.uniform(0, 1) < noise_level:
            action = self.random_action(fix=True)  # epsilon-greedy

        if decay_epsilon:
            self.epsilon -= self.depsilon

        self.a_t = action
        if return_fix:
            return action
        if self.discrete:
            return action.argmax()
        else:
            return action

    def reset(self, obs):
        self.s_t = obs
        self.random_process.reset_status()

    def load_weights(self, output, num=1):
        if output is None:
            return
        self.actor.load_state_dict(
            torch.load('{}/actor{}.pkl'.format(output, num)))
        self.actor_target.load_state_dict(
            torch.load('{}/actor{}.pkl'.format(output, num)))
        self.critic.load_state_dict(
            torch.load('{}/critic{}.pkl'.format(output, num)))
        self.critic_target.load_state_dict(
            torch.load('{}/critic{}.pkl'.format(output, num)))

    def save_model(self, output, num):
        if self.use_cuda:
            if self.pic:
                self.cnn.cpu()
            self.actor.cpu()
            self.critic.cpu()
        torch.save(self.actor.state_dict(),
                   '{}/actor{}.pkl'.format(output, num))
        torch.save(self.critic.state_dict(),
                   '{}/critic{}.pkl'.format(output, num))
        if self.use_cuda:
            if self.pic:
                self.cnn.cuda()
            self.actor.cuda()
            self.critic.cuda()
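

# Most of these agents call to_tensor/to_numpy without defining them. They
# date from the pre-0.4 PyTorch era of Variable/volatile; a sketch compatible
# with the call sites above (on a modern PyTorch one would use torch.no_grad
# instead of volatile) would be:
import numpy as np
import torch
from torch.autograd import Variable


def to_tensor(ndarray, volatile=False, requires_grad=False):
    tensor = torch.from_numpy(np.asarray(ndarray, dtype=np.float32))
    return Variable(tensor, volatile=volatile, requires_grad=requires_grad)


def to_numpy(var):
    return var.cpu().data.numpy()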
class DDPG(object):
    def __init__(self, nb_states, nb_actions, args, discrete, use_cuda=False):
        # Set use_cuda before seeding, since seed() checks it.
        self.use_cuda = use_cuda
        if args.seed > 0:
            self.seed(args.seed)

        self.nb_states = nb_states
        self.nb_actions = nb_actions
        self.discrete = discrete

        # Create Actor and Critic Network
        net_cfg = {
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
            'init_w': args.init_w
        }
        self.actor = Actor(self.nb_states * args.window_length,
                           self.nb_actions, **net_cfg)
        self.actor_target = Actor(self.nb_states * args.window_length,
                                  self.nb_actions, **net_cfg)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(self.nb_states * args.window_length,
                             self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_states * args.window_length,
                                    self.nb_actions, **net_cfg)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        hard_update(self.actor_target, self.actor)  # Make sure target has the same weights
        hard_update(self.critic_target, self.critic)

        # Create replay buffer
        self.memory = rpm(args.rmsize)
        # SequentialMemory(limit=args.rmsize, window_length=args.window_length)
        self.random_process = Myrandom(size=nb_actions)

        # Hyper-parameters
        self.batch_size = args.bsize
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        self.epsilon = 1.0
        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        if self.use_cuda:
            self.cuda()

    def update_policy(self, train_actor=True):
        # Sample batch
        state_batch, action_batch, reward_batch, \
            next_state_batch, terminal_batch = self.memory.sample_batch(self.batch_size)
        # state_batch, action_batch, reward_batch, \
        #     next_state_batch, terminal_batch = self.memory.sample_and_split(self.batch_size)

        # Prepare for the target q batch
        next_q_values = self.critic_target([
            to_tensor(next_state_batch, volatile=True),
            self.actor_target(to_tensor(next_state_batch, volatile=True)),
        ])
        next_q_values.volatile = False

        target_q_batch = to_tensor(reward_batch) + \
            self.discount * to_tensor((1 - terminal_batch.astype(np.float))) * next_q_values

        # Critic update
        self.critic.zero_grad()
        q_batch = self.critic([to_tensor(state_batch),
                               to_tensor(action_batch)])
        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()

        # Actor update
        self.actor.zero_grad()
        policy_loss = -self.critic([
            to_tensor(state_batch),
            self.actor(to_tensor(state_batch))
        ])
        policy_loss = policy_loss.mean()
        policy_loss.backward()
        if train_actor:
            self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

        return -policy_loss, value_loss

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def cuda(self):
        print("use cuda")
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        self.memory.append([self.s_t, self.a_t, r_t, s_t1, done])
        self.s_t = s_t1

    def random_action(self):
        action = np.random.uniform(-1., 1., self.nb_actions)
        self.a_t = action
        if self.discrete:
            return action.argmax()
        else:
            return action

    def select_action(self, s_t, decay_epsilon=True, return_fix=False,
                      noise_level=1):
        action = to_numpy(self.actor(to_tensor(np.array([s_t])))).squeeze(0)

        noise_level = noise_level * max(self.epsilon, 0)
        # Blend the deterministic action with exploration noise.
        action = action * (1 - noise_level) + \
            self.random_process.sample() * noise_level
        action = np.clip(action, -1., 1.)

        if decay_epsilon:
            self.epsilon -= self.depsilon

        self.a_t = action
        if return_fix:
            return action
        if self.discrete:
            return action.argmax()
        else:
            return action

    def reset(self, obs):
        self.s_t = obs
        self.random_process.reset_states()

    def load_weights(self, output):
        if output is None:
            return
        self.actor.load_state_dict(torch.load('{}/actor.pkl'.format(output)))
        self.critic.load_state_dict(torch.load('{}/critic.pkl'.format(output)))

    def save_model(self, output):
        if self.use_cuda:
            self.actor.cpu()
            self.critic.cpu()
        torch.save(self.actor.state_dict(), '{}/actor.pkl'.format(output))
        torch.save(self.critic.state_dict(), '{}/critic.pkl'.format(output))
        if self.use_cuda:
            self.actor.cuda()
            self.critic.cuda()

    def seed(self, s):
        torch.manual_seed(s)
        if self.use_cuda:
            torch.cuda.manual_seed(s)
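

# The select_action above blends the deterministic policy output with
# exploration noise, act = a * (1 - level) + sample * level, so exploration
# fades as epsilon decays toward zero. A standalone numpy illustration (the
# function name and Gaussian noise source are illustrative, not from the
# original code):
import numpy as np


def blended_action(policy_action, noise_sample, epsilon, noise_level=1.0):
    level = noise_level * max(epsilon, 0)
    return np.clip(policy_action * (1 - level) + noise_sample * level,
                   -1., 1.)

# e.g. with epsilon=0.5 the action is an even mix of policy and noise:
# blended_action(np.array([0.8]), np.array([-0.2]), 0.5)  ->  array([0.3])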
class DDPG(object):
    def __init__(self, nb_status, nb_actions, args):
        self.num_actor = 3

        self.nb_status = nb_status * args.window_length
        self.nb_actions = nb_actions
        self.discrete = args.discrete

        # Create Actor and Critic Network
        net_cfg = {
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
            'use_bn': args.bn
        }
        self.actors = [Actor(self.nb_status, self.nb_actions)
                       for _ in range(self.num_actor)]
        self.actor_targets = [Actor(self.nb_status, self.nb_actions)
                              for _ in range(self.num_actor)]
        self.actor_optims = [Adam(self.actors[i].parameters(), lr=args.prate)
                             for i in range(self.num_actor)]

        self.critic = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        for i in range(self.num_actor):
            hard_update(self.actor_targets[i], self.actors[i])  # same initial weights
        hard_update(self.critic_target, self.critic)

        # Create replay buffer
        self.memory = rpm(args.rmsize)
        # SequentialMemory(limit=args.rmsize, window_length=args.window_length)
        self.random_process = Myrandom(size=nb_actions)

        # Hyper-parameters
        self.batch_size = args.batch_size
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        self.epsilon = 1.0
        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.use_cuda = args.cuda
        if self.use_cuda:
            self.cuda()

    def update_policy(self, train_actor=True):
        # Sample batch
        state_batch, action_batch, reward_batch, \
            next_state_batch, terminal_batch = self.memory.sample_batch(self.batch_size)

        # Prepare for the target q batch: average over the actor ensemble.
        next_q_values = 0
        for i in range(self.num_actor):
            next_q_values = next_q_values + self.critic_target([
                to_tensor(next_state_batch, volatile=True),
                self.actor_targets[i](to_tensor(next_state_batch,
                                                volatile=True)),
            ])
        next_q_values = next_q_values / self.num_actor
        next_q_values.volatile = False

        target_q_batch = to_tensor(reward_batch) + \
            self.discount * to_tensor((1 - terminal_batch.astype(np.float))) * next_q_values

        # Critic update
        self.critic.zero_grad()
        q_batch = self.critic([to_tensor(state_batch),
                               to_tensor(action_batch)])
        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()

        # Actor updates
        sum_policy_loss = 0
        for i in range(self.num_actor):
            self.actors[i].zero_grad()
            policy_loss = -self.critic([
                to_tensor(state_batch),
                self.actors[i](to_tensor(state_batch))
            ])
            policy_loss = policy_loss.mean()
            policy_loss.backward()
            if train_actor:
                self.actor_optims[i].step()
            sum_policy_loss += policy_loss

            # Target update
            soft_update(self.actor_targets[i], self.actors[i], self.tau)

        soft_update(self.critic_target, self.critic, self.tau)

        return -sum_policy_loss / self.num_actor, value_loss

    def cuda(self):
        for i in range(self.num_actor):
            self.actors[i].cuda()
            self.actor_targets[i].cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        self.memory.append([self.s_t, self.a_t, r_t, s_t1, done])
        self.s_t = s_t1

    def random_action(self):
        action = np.random.uniform(-1., 1., self.nb_actions)
        self.a_t = action
        if self.discrete:
            return action.argmax()
        else:
            return action

    def select_action(self, s_t, decay_epsilon=True, return_fix=False,
                      noise_level=0):
        actions = []
        status = []
        tot_score = []
        # Scale the exploration noise once by the decayed epsilon.
        noise_level = noise_level * max(self.epsilon, 0)
        for i in range(self.num_actor):
            action = to_numpy(self.actors[i](
                to_tensor(np.array([s_t]), volatile=True))).squeeze(0)
            action = action + self.random_process.sample() * noise_level
            status.append(s_t)
            actions.append(action)
            tot_score.append(0.)

        # Let the critic score each actor's proposal and pick the best one.
        scores = self.critic([to_tensor(np.array(status), volatile=True),
                              to_tensor(np.array(actions), volatile=True)])
        for j in range(self.num_actor):
            tot_score[j] += scores.data[j][0]
        best = np.array(tot_score).argmax()

        if decay_epsilon:
            self.epsilon -= self.depsilon

        self.a_t = actions[best]
        return actions[best]

    def reset(self, obs):
        self.s_t = obs
        self.random_process.reset_status()

    def load_weights(self, output, num=0):
        if output is None:
            return
        for i in range(self.num_actor):
            actor = self.actors[i]
            actor_target = self.actor_targets[i]
            actor.load_state_dict(
                torch.load('{}/actor{}_{}.pkl'.format(output, num, i)))
            actor_target.load_state_dict(
                torch.load('{}/actor{}_{}.pkl'.format(output, num, i)))
        self.critic.load_state_dict(
            torch.load('{}/critic{}.pkl'.format(output, num)))
        self.critic_target.load_state_dict(
            torch.load('{}/critic{}.pkl'.format(output, num)))

    def save_model(self, output, num):
        if self.use_cuda:
            for i in range(self.num_actor):
                self.actors[i].cpu()
            self.critic.cpu()
        for i in range(self.num_actor):
            torch.save(self.actors[i].state_dict(),
                       '{}/actor{}_{}.pkl'.format(output, num, i))
        torch.save(self.critic.state_dict(),
                   '{}/critic{}.pkl'.format(output, num))
        if self.use_cuda:
            for i in range(self.num_actor):
                self.actors[i].cuda()
            self.critic.cuda()
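

# rpm(args.rmsize) is the replay memory used by several agents here but is
# not shown. A minimal ring-buffer sketch matching append/sample_batch as
# called above (append takes one 5-element transition; sample_batch returns
# the five columns as numpy arrays) might be:
import random

import numpy as np


class rpm(object):
    def __init__(self, capacity):
        self.capacity = capacity
        self.data = []
        self.pos = 0

    def append(self, transition):
        # transition = [s_t, a_t, r_t, s_t1, done]
        if len(self.data) < self.capacity:
            self.data.append(transition)
        else:
            self.data[self.pos] = transition
        self.pos = (self.pos + 1) % self.capacity

    def sample_batch(self, batch_size):
        batch = random.sample(self.data, batch_size)
        s, a, r, s1, d = zip(*batch)
        return (np.asarray(s), np.asarray(a),
                np.asarray(r, dtype=np.float32).reshape(-1, 1),
                np.asarray(s1), np.asarray(d).reshape(-1, 1))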
class DDPG(object):
    def __init__(self, memory, nb_status, nb_actions, action_noise=None,
                 gamma=0.99, tau=0.001, normalize_observations=True,
                 batch_size=128, observation_range=(-5., 5.),
                 action_range=(-1., 1.), actor_lr=1e-4, critic_lr=1e-3):
        self.nb_status = nb_status
        self.nb_actions = nb_actions
        self.action_range = action_range
        self.observation_range = observation_range
        self.normalize_observations = normalize_observations

        self.actor = Actor(self.nb_status, self.nb_actions)
        self.actor_target = Actor(self.nb_status, self.nb_actions)
        self.actor_optim = Adam(self.actor.parameters(), lr=actor_lr)

        self.critic = Critic(self.nb_status, self.nb_actions)
        self.critic_target = Critic(self.nb_status, self.nb_actions)
        self.critic_optim = Adam(self.critic.parameters(), lr=critic_lr)

        # Create replay buffer
        self.memory = memory
        # SequentialMemory(limit=args.rmsize, window_length=args.window_length)
        self.action_noise = action_noise

        # Hyper-parameters
        self.batch_size = batch_size
        self.tau = tau
        self.discount = gamma

        if self.normalize_observations:
            self.obs_rms = RunningMeanStd()
        else:
            self.obs_rms = None

    def pi(self, obs, apply_noise=True, compute_Q=True):
        obs = np.array([obs])
        action = to_numpy(self.actor(to_tensor(obs))).squeeze(0)
        if compute_Q:
            q = self.critic([to_tensor(obs), to_tensor(action)]).cpu().data
        else:
            q = None
        if self.action_noise is not None and apply_noise:
            noise = self.action_noise()
            assert noise.shape == action.shape
            action += noise
        action = np.clip(action, self.action_range[0], self.action_range[1])
        return action, q[0][0] if q is not None else None

    def store_transition(self, obs0, action, reward, obs1, terminal1):
        self.memory.append(obs0, action, reward, obs1, terminal1)
        if self.normalize_observations:
            self.obs_rms.update(np.array([obs0]))

    def train(self):
        # Get a batch.
        batch = self.memory.sample(batch_size=self.batch_size)

        next_q_values = self.critic_target([
            to_tensor(batch['obs1'], volatile=True),
            self.actor_target(to_tensor(batch['obs1'], volatile=True))])
        next_q_values.volatile = False

        target_q_batch = to_tensor(batch['rewards']) + \
            self.discount * to_tensor(1 - batch['terminals1'].astype('float32')) * next_q_values

        self.critic.zero_grad()
        q_batch = self.critic([to_tensor(batch['obs0']),
                               to_tensor(batch['actions'])])
        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()

        self.actor.zero_grad()
        policy_loss = -self.critic([
            to_tensor(batch['obs0']),
            self.actor(to_tensor(batch['obs0']))]).mean()
        policy_loss.backward()
        self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

        return value_loss.cpu().data[0], policy_loss.cpu().data[0]

    def initialize(self):
        hard_update(self.actor_target, self.actor)  # Make sure target has the same weights
        hard_update(self.critic_target, self.critic)

    def update_target_net(self):
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

    def reset(self):
        if self.action_noise is not None:
            self.action_noise.reset()

    def cuda(self):
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()
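

# RunningMeanStd above tracks observation statistics online but is not
# defined in this excerpt. A compact sketch supporting the update() call in
# store_transition, using the standard parallel mean/variance merge:
import numpy as np


class RunningMeanStd(object):
    def __init__(self, shape=()):
        self.mean = np.zeros(shape, dtype=np.float64)
        self.var = np.ones(shape, dtype=np.float64)
        self.count = 1e-4  # avoids division by zero before the first update

    def update(self, x):
        batch_mean = np.mean(x, axis=0)
        batch_var = np.var(x, axis=0)
        batch_count = x.shape[0]
        delta = batch_mean - self.mean
        tot_count = self.count + batch_count
        new_mean = self.mean + delta * batch_count / tot_count
        m_a = self.var * self.count
        m_b = batch_var * batch_count
        m2 = m_a + m_b + delta ** 2 * self.count * batch_count / tot_count
        self.mean, self.var, self.count = new_mean, m2 / tot_count, tot_count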
class DDPG(object):
    def __init__(self, nb_status, nb_actions, args, writer):
        self.clip_actor_grad = args.clip_actor_grad
        self.nb_status = nb_status * args.window_length
        self.nb_actions = nb_actions
        self.writer = writer
        self.select_time = 0

        # Create Actor and Critic Network
        net_cfg = {
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
            'init_method': args.init_method
        }
        self.actor = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_target = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        hard_update(self.actor_target, self.actor)  # Make sure target has the same weights
        hard_update(self.critic_target, self.critic)

        # Create replay buffer
        self.memory = rpm(args.rmsize)
        self.random_process = Myrandom(size=nb_actions)

        # Hyper-parameters
        self.batch_size = args.batch_size
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        self.epsilon = 1.0
        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.use_cuda = args.cuda
        if self.use_cuda:
            self.cuda()

    def update_policy(self, train_actor=True):
        # Sample batch
        state_batch, action_batch, reward_batch, \
            next_state_batch, terminal_batch = self.memory.sample_batch(self.batch_size)

        # Prepare for the target q batch
        next_q_values = self.critic_target([
            to_tensor(next_state_batch, volatile=True),
            self.actor_target(to_tensor(next_state_batch, volatile=True)),
        ])
        next_q_values.volatile = False

        target_q_batch = to_tensor(reward_batch) + \
            self.discount * to_tensor((1 - terminal_batch.astype(np.float))) * next_q_values

        # Critic update
        self.critic.zero_grad()
        q_batch = self.critic([to_tensor(state_batch),
                               to_tensor(action_batch)])
        value_loss = nn.MSELoss()(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()

        # Actor update
        self.actor.zero_grad()
        policy_loss = -self.critic([to_tensor(state_batch),
                                    self.actor(to_tensor(state_batch))])
        policy_loss = policy_loss.mean()
        policy_loss.backward()

        if self.clip_actor_grad is not None:
            torch.nn.utils.clip_grad_norm(self.actor.parameters(),
                                          float(self.clip_actor_grad))
            if self.writer is not None:
                mean_policy_grad = np.array(np.mean([
                    np.linalg.norm(p.grad.data.cpu().numpy().ravel())
                    for p in self.actor.parameters()
                ]))
                self.writer.add_scalar('train/mean_policy_grad',
                                       mean_policy_grad, self.select_time)

        if train_actor:
            self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

        return -policy_loss, value_loss

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def train(self):
        self.actor.train()
        self.actor_target.train()
        self.critic.train()
        self.critic_target.train()

    def cuda(self):
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        self.memory.append([self.s_t, self.a_t, r_t, s_t1, done])
        self.s_t = s_t1

    def random_action(self):
        action = np.random.uniform(-1., 1., self.nb_actions)
        self.a_t = action
        return action

    def select_action(self, s_t, decay_epsilon=True, return_fix=False,
                      noise_level=0):
        self.eval()
        action = to_numpy(self.actor(to_tensor(np.array([s_t])))).squeeze(0)
        self.train()

        noise_level = noise_level * max(self.epsilon, 0)
        action = action * (1 - noise_level) + \
            self.random_process.sample() * noise_level
        action = np.clip(action, -1., 1.)

        if decay_epsilon:
            self.epsilon -= self.depsilon

        self.a_t = action
        return action

    def reset(self, obs):
        self.s_t = obs
        self.random_process.reset_status()

    def load_weights(self, output, num=1):
        if output is None:
            return
        self.actor.load_state_dict(
            torch.load('{}/actor{}.pkl'.format(output, num)))
        self.actor_target.load_state_dict(
            torch.load('{}/actor{}.pkl'.format(output, num)))
        self.critic.load_state_dict(
            torch.load('{}/critic{}.pkl'.format(output, num)))
        self.critic_target.load_state_dict(
            torch.load('{}/critic{}.pkl'.format(output, num)))

    def save_model(self, output, num):
        if self.use_cuda:
            self.actor.cpu()
            self.critic.cpu()
        torch.save(self.actor.state_dict(),
                   '{}/actor{}.pkl'.format(output, num))
        torch.save(self.critic.state_dict(),
                   '{}/critic{}.pkl'.format(output, num))
        if self.use_cuda:
            self.actor.cuda()
            self.critic.cuda()
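

# The writer passed into the agent above only needs an
# add_scalar(tag, value, step) method; a tensorboardX SummaryWriter satisfies
# that interface. Construction happens outside this excerpt, so the usage
# below (including the log directory) is an assumption:
from tensorboardX import SummaryWriter

writer = SummaryWriter(log_dir='runs/ddpg')  # hypothetical log directory
# agent = DDPG(nb_status, nb_actions, args, writer)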
class DDPG(object):
    def __init__(self, nb_states, nb_actions):
        self.nb_states = nb_states
        self.nb_actions = nb_actions

        # Create Actor and Critic Network
        self.actor = Actor(self.nb_states, self.nb_actions)
        self.actor_target = Actor(self.nb_states, self.nb_actions)
        self.actor_optim = Adam(self.actor.parameters(), lr=ACTOR_LR)

        self.critic = Critic(self.nb_states, self.nb_actions)
        self.critic_target = Critic(self.nb_states, self.nb_actions)
        self.critic_optim = Adam(self.critic.parameters(), lr=CRITIC_LR)

        hard_update(self.actor_target, self.actor)  # Make sure target has the same weights
        hard_update(self.critic_target, self.critic)

        # Create replay buffer
        self.memory = SequentialMemory(limit=MEMORY_SIZE,
                                       window_length=HISTORY_LEN)
        self.random_process = OrnsteinUhlenbeckProcess(size=nb_actions,
                                                       theta=OU_THETA,
                                                       mu=OU_MU,
                                                       sigma=OU_SIGMA)

        # Hyper-parameters
        self.batch_size = BATCH_SIZE
        self.tau = TAU
        self.discount = GAMMA
        self.depsilon = 1.0 / DEPSILON

        self.epsilon = 1.0
        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.is_training = True

        if USE_CUDA:
            self.cuda()

    def update_policy(self):
        # Sample batch
        state_batch, action_batch, reward_batch, \
            next_state_batch, terminal_batch = self.memory.sample_and_split(self.batch_size)

        # Prepare for the target q batch. Note: terminal_batch from
        # sample_and_split is assumed to already be the continuation mask
        # (0. if done else 1.), so it multiplies next_q_values directly.
        next_q_values = self.critic_target([
            to_tensor(next_state_batch, volatile=True),
            self.actor_target(to_tensor(next_state_batch, volatile=True)),
        ])[:, 0]
        next_q_values.volatile = False

        target_q_batch = to_tensor(reward_batch) + \
            self.discount * to_tensor(terminal_batch.astype(np.float)) * next_q_values

        # Critic update
        self.critic.zero_grad()
        q_batch = self.critic([to_tensor(state_batch),
                               to_tensor(action_batch)])
        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        torch.nn.utils.clip_grad_norm(self.critic.parameters(), 10.0)
        # Manual gradient step applied in addition to the Adam step below.
        for p in self.critic.parameters():
            p.data.add_(-CRITIC_LR, p.grad.data)
        self.critic_optim.step()

        # Actor update
        self.actor.zero_grad()
        policy_loss = -self.critic([to_tensor(state_batch),
                                    self.actor(to_tensor(state_batch))])
        policy_loss = policy_loss.mean()
        policy_loss.backward()
        torch.nn.utils.clip_grad_norm(self.actor.parameters(), 10.0)
        # Manual gradient step applied in addition to the Adam step below.
        for p in self.actor.parameters():
            p.data.add_(-ACTOR_LR, p.grad.data)
        self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def cuda(self):
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        if self.is_training:
            self.memory.append(self.s_t, self.a_t, r_t, done)
        self.s_t = s_t1

    def random_action(self):
        action = np.random.uniform(-1., 1., self.nb_actions)
        self.a_t = action
        return action

    def select_action(self, s_t, decay_epsilon=True):
        action = to_numpy(self.actor(to_tensor(np.array([s_t]))))[0]
        ou = self.random_process.sample()
        prGreen('eps:{}, act:{}, random:{}'.format(self.epsilon, action, ou))
        action += self.is_training * max(self.epsilon, 0) * ou
        action = np.clip(action, -1., 1.)

        if decay_epsilon:
            self.epsilon -= self.depsilon

        self.a_t = action
        return action

    def reset(self, obs):
        self.s_t = obs
        self.random_process.reset_states()

    def load_weights(self, output):
        if output is None:
            return
        self.actor.load_state_dict(torch.load('{}/actor.pkl'.format(output)))
        self.critic.load_state_dict(torch.load('{}/critic.pkl'.format(output)))

    def save_model(self, output):
        torch.save(self.actor.state_dict(), '{}/actor.pkl'.format(output))
        torch.save(self.critic.state_dict(), '{}/critic.pkl'.format(output))

    def seed(self, s):
        torch.manual_seed(s)
        if USE_CUDA:
            torch.cuda.manual_seed(s)
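

# OrnsteinUhlenbeckProcess supplies the temporally correlated exploration
# noise used above; it is imported from elsewhere in these repos. A minimal
# sketch of the standard discretized OU process, matching the constructor
# arguments and the sample()/reset_states() calls (the dt default is an
# assumption):
import numpy as np


class OrnsteinUhlenbeckProcess(object):
    def __init__(self, size, theta=0.15, mu=0.0, sigma=0.2, dt=1e-2):
        self.size = size
        self.theta = theta
        self.mu = mu
        self.sigma = sigma
        self.dt = dt
        self.reset_states()

    def sample(self):
        # dx = theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, 1)
        dx = self.theta * (self.mu - self.x_prev) * self.dt + \
            self.sigma * np.sqrt(self.dt) * np.random.randn(self.size)
        self.x_prev = self.x_prev + dx
        return self.x_prev

    def reset_states(self):
        self.x_prev = np.zeros(self.size)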
class DDPG(object):
    def __init__(self, nb_states, nb_actions, args):
        if args.seed > 0:
            self.seed(args.seed)

        self.nb_states = nb_states
        self.nb_actions = nb_actions

        # Create Actor and Critic Network
        net_cfg = {
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
            'init_w': args.init_w
        }
        self.actor = Actor(self.nb_states, self.nb_actions, **net_cfg)
        self.actor_target = Actor(self.nb_states, self.nb_actions, **net_cfg)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(self.nb_states, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_states, self.nb_actions, **net_cfg)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        hard_update(self.actor_target, self.actor)  # Make sure target has the same weights
        hard_update(self.critic_target, self.critic)

        # Create replay buffer
        self.memory = SequentialMemory(limit=args.rmsize,
                                       window_length=args.window_length)
        self.random_process = OrnsteinUhlenbeckProcess(size=nb_actions,
                                                       theta=args.ou_theta,
                                                       mu=args.ou_mu,
                                                       sigma=args.ou_sigma)

        # Hyper-parameters
        self.batch_size = args.bsize
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        self.epsilon = 1.0
        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.is_training = True

        if USE_CUDA:
            self.cuda()

    def update_policy(self):
        # Sample batch
        state_batch, action_batch, reward_batch, \
            next_state_batch, terminal_batch = self.memory.sample_and_split(self.batch_size)

        # Prepare for the target q batch. terminal_batch is assumed to be the
        # continuation mask (0. if done else 1.), so it multiplies directly.
        next_q_values = self.critic_target([
            to_tensor(next_state_batch, volatile=True),
            self.actor_target(to_tensor(next_state_batch, volatile=True)),
        ])
        next_q_values.volatile = False

        target_q_batch = to_tensor(reward_batch) + \
            self.discount * to_tensor(terminal_batch.astype(np.float)) * next_q_values

        # Critic update
        self.critic.zero_grad()
        q_batch = self.critic([to_tensor(state_batch),
                               to_tensor(action_batch)])
        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()

        # Actor update
        self.actor.zero_grad()
        policy_loss = -self.critic([
            to_tensor(state_batch),
            self.actor(to_tensor(state_batch))
        ])
        policy_loss = policy_loss.mean()
        policy_loss.backward()
        self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def cuda(self):
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        if self.is_training:
            self.memory.append(self.s_t, self.a_t, r_t, done)
        self.s_t = s_t1

    def random_action(self, distribution='uniform'):
        '''Produce a random action.'''
        if distribution == 'uniform':
            action = np.random.uniform(-1., 1., self.nb_actions)
            # set the action internally to the agent
            self.a_t = action
            return action
        else:
            raise ValueError('Distribution {} not defined'.format(distribution))

    def select_action(self, s_t, decay_epsilon=True, clip=None):
        '''
        Pick an action according to the actor network.

        :param s_t: current state s_t
        :param decay_epsilon: bool, decay the exploration epsilon after acting
        :param clip: tuple to clip action values between clip[0] and clip[1];
                     defaults to (-1., 1.). Set to False to disable clipping.
        '''
        # Set default for clip if None
        if clip is None:
            clip = (-1., 1.)

        action = to_numpy(self.actor(to_tensor(np.array([s_t])))).squeeze(0)

        # Add noise to the action.
        action += self.is_training * max(self.epsilon, 0) * \
            self.random_process.sample()

        if clip is not False:
            if len(clip) != 2:
                raise ValueError('Clip parameter malformed, received {}, '
                                 'expected a size 2 tuple'.format(clip))
            action = np.clip(action, clip[0], clip[1])

        if decay_epsilon:
            self.epsilon -= self.depsilon

        self.a_t = action
        return action

    def reset(self, obs):
        self.s_t = obs
        self.random_process.reset_states()

    def load_weights(self, output):
        if output is None:
            return
        self.actor.load_state_dict(torch.load('{}/actor.pkl'.format(output)))
        self.critic.load_state_dict(torch.load('{}/critic.pkl'.format(output)))

    def save_model(self, output):
        torch.save(self.actor.state_dict(), '{}/actor.pkl'.format(output))
        torch.save(self.critic.state_dict(), '{}/critic.pkl'.format(output))

    def seed(self, s):
        torch.manual_seed(s)
        if USE_CUDA:
            torch.cuda.manual_seed(s)
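

# Pieced together from the methods above, one training episode follows the
# reset / select_action / observe / update_policy cycle. A sketch assuming a
# gym-style env; the function name and the 100-step warm-up threshold are
# placeholders, not taken from the original code:
def run_episode(env, agent, warmup=100):
    """Run one interaction episode against the agent API defined above."""
    obs = env.reset()
    agent.reset(obs)
    done, step = False, 0
    while not done:
        # Act randomly until the replay memory has some warm-up data.
        if step < warmup:
            action = agent.random_action()
        else:
            action = agent.select_action(obs)
        obs, reward, done, _ = env.step(action)
        agent.observe(reward, obs, done)
        if step >= warmup:
            agent.update_policy()
        step += 1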
class DDPG: def __init__(self, env, args): ob_space = env.observation_space goal_dim = env.goal_dim ob_dim = ob_space.shape[0] self.ob_dim = ob_dim self.ac_dim = ac_dim = 7 self.goal_dim = goal_dim self.num_iters = args.num_iters self.random_prob = args.random_prob self.tau = args.tau self.reward_scale = args.reward_scale self.gamma = args.gamma self.log_interval = args.log_interval self.save_interval = args.save_interval self.rollout_steps = args.rollout_steps self.env = env self.batch_size = args.batch_size self.train_steps = args.train_steps self.closest_dist = np.inf self.warmup_iter = args.warmup_iter self.max_grad_norm = args.max_grad_norm self.use_her = args.her self.k_future = args.k_future self.model_dir = os.path.join(args.save_dir, 'model') self.pretrain_dir = args.pretrain_dir os.makedirs(self.model_dir, exist_ok=True) self.global_step = 0 self.actor = Actor(ob_dim=ob_dim, act_dim=ac_dim, hid1_dim=args.hid1_dim, hid2_dim=args.hid2_dim, hid3_dim=args.hid3_dim, init_method=args.init_method) self.critic = Critic(ob_dim=ob_dim, act_dim=ac_dim, hid1_dim=args.hid1_dim, hid2_dim=args.hid2_dim, hid3_dim=args.hid3_dim, init_method=args.init_method) if args.resume or args.test or args.pretrain_dir is not None: self.load_model(args.resume_step, pretrain_dir=args.pretrain_dir) if not args.test: self.actor_target = Actor(ob_dim=ob_dim, act_dim=ac_dim, hid1_dim=args.hid1_dim, hid2_dim=args.hid2_dim, hid3_dim=args.hid3_dim, init_method=args.init_method) self.critic_target = Critic(ob_dim=ob_dim, act_dim=ac_dim, hid1_dim=args.hid1_dim, hid2_dim=args.hid2_dim, hid3_dim=args.hid3_dim, init_method=args.init_method) self.actor_optim = self.construct_optim(self.actor, lr=args.actor_lr) cri_w_decay = args.critic_weight_decay self.critic_optim = self.construct_optim(self.critic, lr=args.critic_lr, weight_decay=cri_w_decay) self.hard_update(self.actor_target, self.actor) self.hard_update(self.critic_target, self.critic) self.actor_target.eval() self.critic_target.eval() if args.noise_type == 'ou_noise': mu = np.zeros(ac_dim) sigma = float(args.ou_noise_std) * np.ones(ac_dim) self.action_noise = OrnsteinUhlenbeckActionNoise(mu=mu, sigma=sigma) elif args.noise_type == 'uniform': low_limit = args.uniform_noise_low high_limit = args.uniform_noise_high dec_step = args.max_noise_dec_step self.action_noise = UniformNoise(low_limit=low_limit, high_limit=high_limit, dec_step=dec_step) elif args.noise_type == 'gaussian': mu = np.zeros(ac_dim) sigma = args.normal_noise_std * np.ones(ac_dim) self.action_noise = NormalActionNoise(mu=mu, sigma=sigma) self.memory = Memory(limit=int(args.memory_limit), action_shape=(int(ac_dim), ), observation_shape=(int(ob_dim), )) self.critic_loss = nn.MSELoss() self.ob_norm = args.ob_norm if self.ob_norm: self.obs_oms = OnlineMeanStd(shape=(1, ob_dim)) else: self.obs_oms = None self.cuda() def test(self, render=False, record=True, slow_t=0): dist, succ_rate = self.rollout(render=render, record=record, slow_t=slow_t) print('Final step distance: ', dist) def train(self): self.net_mode(train=True) tfirststart = time.time() epoch_episode_rewards = deque(maxlen=1) epoch_episode_steps = deque(maxlen=1) total_rollout_steps = 0 for epoch in range(self.global_step, self.num_iters): episode_reward = 0 episode_step = 0 self.action_noise.reset() obs = self.env.reset() obs = obs[0] epoch_actor_losses = [] epoch_critic_losses = [] if self.use_her: ep_experi = { 'obs': [], 'act': [], 'reward': [], 'new_obs': [], 'ach_goals': [], 'done': [] } for t_rollout in range(self.rollout_steps): 
total_rollout_steps += 1 ran = np.random.random(1)[0] if self.pretrain_dir is None and epoch < self.warmup_iter or \ ran < self.random_prob: act = self.random_action().flatten() else: act = self.policy(obs).flatten() new_obs, r, done, info = self.env.step(act) ach_goals = new_obs[1].copy() new_obs = new_obs[0].copy() episode_reward += r episode_step += 1 self.memory.append(obs, act, r * self.reward_scale, new_obs, ach_goals, done) if self.use_her: ep_experi['obs'].append(obs) ep_experi['act'].append(act) ep_experi['reward'].append(r * self.reward_scale) ep_experi['new_obs'].append(new_obs) ep_experi['ach_goals'].append(ach_goals) ep_experi['done'].append(done) if self.ob_norm: self.obs_oms.update(new_obs) obs = new_obs epoch_episode_rewards.append(episode_reward) epoch_episode_steps.append(episode_step) if self.use_her: for t in range(episode_step - self.k_future): ob = ep_experi['obs'][t] act = ep_experi['act'][t] new_ob = ep_experi['new_obs'][t] ach_goal = ep_experi['ach_goals'][t] k_futures = np.random.choice(np.arange( t + 1, episode_step), self.k_future - 1, replace=False) k_futures = np.concatenate((np.array([t]), k_futures)) for future in k_futures: new_goal = ep_experi['ach_goals'][future] her_ob = np.concatenate( (ob[:-self.goal_dim], new_goal), axis=0) her_new_ob = np.concatenate( (new_ob[:-self.goal_dim], new_goal), axis=0) res = self.env.cal_reward(ach_goal.copy(), new_goal, act) her_reward, _, done = res self.memory.append(her_ob, act, her_reward * self.reward_scale, her_new_ob, ach_goal.copy(), done) self.global_step += 1 if epoch >= self.warmup_iter: for t_train in range(self.train_steps): act_loss, cri_loss = self.train_net() epoch_critic_losses.append(cri_loss) epoch_actor_losses.append(act_loss) if epoch % self.log_interval == 0: tnow = time.time() stats = {} if self.ob_norm: stats['ob_oms_mean'] = safemean(self.obs_oms.mean.numpy()) stats['ob_oms_std'] = safemean(self.obs_oms.std.numpy()) stats['total_rollout_steps'] = total_rollout_steps stats['rollout/return'] = safemean( [rew for rew in epoch_episode_rewards]) stats['rollout/ep_steps'] = safemean( [l for l in epoch_episode_steps]) if epoch >= self.warmup_iter: stats['actor_loss'] = np.mean(epoch_actor_losses) stats['critic_loss'] = np.mean(epoch_critic_losses) stats['epoch'] = epoch stats['actor_lr'] = self.actor_optim.param_groups[0]['lr'] stats['critic_lr'] = self.critic_optim.param_groups[0]['lr'] stats['time_elapsed'] = tnow - tfirststart for name, value in stats.items(): logger.logkv(name, value) logger.dumpkvs() if (epoch == 0 or epoch >= self.warmup_iter) and \ self.save_interval and\ epoch % self.save_interval == 0 and \ logger.get_dir(): mean_final_dist, succ_rate = self.rollout() logger.logkv('epoch', epoch) logger.logkv('test/total_rollout_steps', total_rollout_steps) logger.logkv('test/mean_final_dist', mean_final_dist) logger.logkv('test/succ_rate', succ_rate) tra_mean_dist, tra_succ_rate = self.rollout(train_test=True) logger.logkv('train/mean_final_dist', tra_mean_dist) logger.logkv('train/succ_rate', tra_succ_rate) # self.log_model_weights() logger.dumpkvs() if mean_final_dist < self.closest_dist: self.closest_dist = mean_final_dist is_best = True else: is_best = False self.save_model(is_best=is_best, step=self.global_step) def train_net(self): batch_data = self.memory.sample(batch_size=self.batch_size) for key, value in batch_data.items(): batch_data[key] = torch.from_numpy(value) obs0_t = batch_data['obs0'] obs1_t = batch_data['obs1'] obs0_t = self.normalize(obs0_t, self.obs_oms) obs1_t = 
self.normalize(obs1_t, self.obs_oms) obs0 = Variable(obs0_t).float().cuda() with torch.no_grad(): vol_obs1 = Variable(obs1_t).float().cuda() rewards = Variable(batch_data['rewards']).float().cuda() actions = Variable(batch_data['actions']).float().cuda() terminals = Variable(batch_data['terminals1']).float().cuda() cri_q_val = self.critic(obs0, actions) with torch.no_grad(): target_net_act = self.actor_target(vol_obs1) target_net_q_val = self.critic_target(vol_obs1, target_net_act) # target_net_q_val.volatile = False target_q_label = rewards target_q_label += self.gamma * target_net_q_val * (1 - terminals) target_q_label = target_q_label.detach() self.actor.zero_grad() self.critic.zero_grad() cri_loss = self.critic_loss(cri_q_val, target_q_label) cri_loss.backward() if self.max_grad_norm is not None: torch.nn.utils.clip_grad_norm(self.critic.parameters(), self.max_grad_norm) self.critic_optim.step() self.critic.zero_grad() self.actor.zero_grad() net_act = self.actor(obs0) net_q_val = self.critic(obs0, net_act) act_loss = -net_q_val.mean() act_loss.backward() if self.max_grad_norm is not None: torch.nn.utils.clip_grad_norm(self.actor.parameters(), self.max_grad_norm) self.actor_optim.step() self.soft_update(self.actor_target, self.actor, self.tau) self.soft_update(self.critic_target, self.critic, self.tau) return act_loss.cpu().data.numpy(), cri_loss.cpu().data.numpy() def normalize(self, x, stats): if stats is None: return x return (x - stats.mean) / stats.std def denormalize(self, x, stats): if stats is None: return x return x * stats.std + stats.mean def net_mode(self, train=True): if train: self.actor.train() self.critic.train() else: self.actor.eval() self.critic.eval() def load_model(self, step=None, pretrain_dir=None): model_dir = self.model_dir if pretrain_dir is not None: ckpt_file = os.path.join(self.pretrain_dir, 'model_best.pth') else: if step is None: ckpt_file = os.path.join(model_dir, 'model_best.pth') else: ckpt_file = os.path.join(model_dir, 'ckpt_{:08d}.pth'.format(step)) if not os.path.isfile(ckpt_file): raise ValueError("No checkpoint found at '{}'".format(ckpt_file)) mutils.print_yellow('Loading checkpoint {}'.format(ckpt_file)) checkpoint = torch.load(ckpt_file) if pretrain_dir is not None: actor_dict = self.actor.state_dict() critic_dict = self.critic.state_dict() actor_pretrained_dict = { k: v for k, v in checkpoint['actor_state_dict'].items() if k in actor_dict } critic_pretrained_dict = { k: v for k, v in checkpoint['critic_state_dict'].items() if k in critic_dict } actor_dict.update(actor_pretrained_dict) critic_dict.update(critic_pretrained_dict) self.actor.load_state_dict(actor_dict) self.critic.load_state_dict(critic_dict) self.global_step = 0 else: self.actor.load_state_dict(checkpoint['actor_state_dict']) self.critic.load_state_dict(checkpoint['critic_state_dict']) self.global_step = checkpoint['global_step'] if step is None: mutils.print_yellow('Checkpoint step: {}' ''.format(checkpoint['ckpt_step'])) self.warmup_iter += self.global_step mutils.print_yellow('Checkpoint loaded...') def save_model(self, is_best, step=None): if step is None: step = self.global_step ckpt_file = os.path.join(self.model_dir, 'ckpt_{:08d}.pth'.format(step)) data_to_save = { 'ckpt_step': step, 'global_step': self.global_step, 'actor_state_dict': self.actor.state_dict(), 'actor_optimizer': self.actor_optim.state_dict(), 'critic_state_dict': self.critic.state_dict(), 'critic_optimizer': self.critic_optim.state_dict() } mutils.print_yellow('Saving checkpoint: %s' % ckpt_file) 
torch.save(data_to_save, ckpt_file) if is_best: torch.save(data_to_save, os.path.join(self.model_dir, 'model_best.pth')) def rollout(self, train_test=False, render=False, record=False, slow_t=0): test_conditions = self.env.train_test_conditions \ if train_test else self.env.test_conditions done_num = 0 final_dist = [] episode_length = [] for idx in range(test_conditions): if train_test: obs = self.env.train_test_reset(cond=idx) else: obs = self.env.test_reset(cond=idx) for t_rollout in range(self.rollout_steps): obs = obs[0].copy() act = self.policy(obs, stochastic=False).flatten() obs, r, done, info = self.env.step(act) if render: self.env.render() if slow_t > 0: time.sleep(slow_t) if done: done_num += 1 break if record: print('dist: ', info['dist']) final_dist.append(info['dist']) episode_length.append(t_rollout) final_dist = np.array(final_dist) mean_final_dist = np.mean(final_dist) succ_rate = done_num / float(test_conditions) if record: with open('./test_data.json', 'w') as f: json.dump(final_dist.tolist(), f) print('\nDist statistics:') print("Minimum: {0:9.4f} Maximum: {1:9.4f}" "".format(np.min(final_dist), np.max(final_dist))) print("Mean: {0:9.4f}".format(mean_final_dist)) print("Standard Deviation: {0:9.4f}".format(np.std(final_dist))) print("Median: {0:9.4f}".format(np.median(final_dist))) print("First quartile: {0:9.4f}" "".format(np.percentile(final_dist, 25))) print("Third quartile: {0:9.4f}" "".format(np.percentile(final_dist, 75))) print('Success rate:', succ_rate) if render: while True: self.env.render() return mean_final_dist, succ_rate def log_model_weights(self): for name, param in self.actor.named_parameters(): logger.logkv('actor/' + name, param.clone().cpu().data.numpy()) for name, param in self.actor_target.named_parameters(): logger.logkv('actor_target/' + name, param.clone().cpu().data.numpy()) for name, param in self.critic.named_parameters(): logger.logkv('critic/' + name, param.clone().cpu().data.numpy()) for name, param in self.critic_target.named_parameters(): logger.logkv('critic_target/' + name, param.clone().cpu().data.numpy()) def random_action(self): act = np.random.uniform(-1., 1., self.ac_dim) return act def policy(self, obs, stochastic=True): self.actor.eval() ob = Variable(torch.from_numpy(obs)).float().cuda().view(1, -1) act = self.actor(ob) act = act.cpu().data.numpy() if stochastic: act = self.action_noise(act) self.actor.train() return act def cuda(self): self.critic.cuda() self.actor.cuda() if hasattr(self, 'critic_target'): self.critic_target.cuda() self.actor_target.cuda() self.critic_loss.cuda() def construct_optim(self, net, lr, weight_decay=None): if weight_decay is None: weight_decay = 0 params = mutils.add_weight_decay([net], weight_decay=weight_decay) optimizer = optim.Adam(params, lr=lr, weight_decay=weight_decay) return optimizer def soft_update(self, target, source, tau): for target_param, param in zip(target.parameters(), source.parameters()): target_param.data.copy_(target_param.data * (1.0 - tau) + param.data * tau) def hard_update(self, target, source): for target_param, param in zip(target.parameters(), source.parameters()): target_param.data.copy_(param.data)
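# A minimal, self-contained sketch of the hindsight relabeling loop used in the
# trainer above: for selected timesteps, a future achieved goal is swapped into
# the goal slice of the observation and the reward is recomputed against it.
# `goal_dim` and `compute_reward` are hypothetical stand-ins for this repo's
# self.goal_dim / env.cal_reward, shown only to isolate the idea (the trainer
# additionally mixes in the current index t; this version is simplified).
import numpy as np

def her_relabel(obs, new_obs, ach_goals, goal_dim, compute_reward, k_future=4):
    """Yield (obs, new_obs, reward) tuples relabeled with future achieved goals."""
    T = len(obs)
    for t in range(T - k_future):
        # sample k future indices whose achieved goals become substitute goals
        futures = np.random.choice(np.arange(t + 1, T), k_future, replace=False)
        for f in futures:
            g = ach_goals[f]
            her_ob = np.concatenate((obs[t][:-goal_dim], g))
            her_new_ob = np.concatenate((new_obs[t][:-goal_dim], g))
            # reward is recomputed against the substituted goal
            yield her_ob, her_new_ob, compute_reward(ach_goals[t], g)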
class UADDPG(object):
    def __init__(self, nb_states, nb_actions, args):
        if args.seed > 0:
            self.seed(args.seed)

        self.nb_states = nb_states
        self.nb_actions = nb_actions
        self.train_with_dropout = args.train_with_dropout
        self.dropout_p = args.dropout_p
        self.dropout_n = args.dropout_n
        self.print_var_count = 0
        self.action_std = np.array([])
        self.save_dir = args.output
        self.episode = 0
        # self.save_file = open(self.save_dir + '/std.txt', "a")

        print("train_with_dropout : " + str(self.train_with_dropout))
        print("Dropout p : " + str(self.dropout_p))
        print("Dropout n : " + str(self.dropout_n))

        # Create Actor and Critic Network
        net_cfg_actor = {
            'dropout_n': args.dropout_n,
            'dropout_p': args.dropout_p,
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
            'init_w': args.init_w
        }
        net_cfg_critic = {
            'dropout_n': args.dropout_n,
            'dropout_p': args.dropout_p,
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
            'init_w': args.init_w
        }
        self.actor = Actor(self.nb_states, self.nb_actions, **net_cfg_actor)
        self.actor_target = Actor(self.nb_states, self.nb_actions, **net_cfg_actor)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(self.nb_states, self.nb_actions, **net_cfg_critic)
        self.critic_target = Critic(self.nb_states, self.nb_actions, **net_cfg_critic)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        hard_update(self.actor_target, self.actor)
        hard_update(self.critic_target, self.critic)

        # Create replay buffer
        self.memory = SequentialMemory(limit=args.rmsize,
                                       window_length=args.window_length)
        self.random_process = OrnsteinUhlenbeckProcess(size=nb_actions,
                                                       theta=args.ou_theta,
                                                       mu=args.ou_mu,
                                                       sigma=args.ou_sigma)

        # Hyper-parameters
        self.batch_size = args.bsize
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon
        self.epsilon = 1.0  # must be live: select_action_with_dropout decays it below

        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.is_training = True

        if USE_CUDA:
            self.cuda()

    def update_policy(self):
        # Sample batch
        state_batch, action_batch, reward_batch, next_state_batch, terminal_batch = \
            self.memory.sample_and_split(self.batch_size)

        # Prepare for the target q batch
        # TODO: (1) Also apply epistemic and aleatoric uncertainty to both actor and critic target networks
        # TODO: (2) Is it proper to apply epistemic uncertainty to the target network? If so, how, and
        #           which network should provide the target? Revisit after July.
        next_q_values = self.critic_target([
            to_tensor(next_state_batch, volatile=True),
            self.actor_target(to_tensor(next_state_batch, volatile=True))
        ])[:-1]  # x : next_state_batch, a : self.actor_target(next_state_batch)

        target_q_batch = to_tensor(reward_batch) + self.discount * to_tensor(
            terminal_batch.astype(np.float)) * next_q_values

        #########################
        # Critic update
        #########################
        self.critic.zero_grad()

        # TODO: (Completed) Add epistemic uncertainty for critic network
        q_batch = self.critic([to_tensor(state_batch), to_tensor(action_batch)])
        # q_batch_mean, q_batch_var = select_q_with_dropout(state_batch, action_batch)
        # q_batch = self.critic.forward_with_dropout([to_tensor(state_batch), to_tensor(action_batch)])

        # TODO: (Completed) Add aleatoric uncertainty term from the aleatoric output of
        # the critic network (add the aleatoric term to the criterion)
        value_loss = criterion(q_batch, target_q_batch)
        # value_loss = AULoss(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()

        #########################
        # Actor update
        #########################
        self.actor.zero_grad()

        # policy loss
        # TODO: (Completed) Add epistemic uncertainty term from the uncertainty output of the policy network
        policy_loss = -self.critic([to_tensor(state_batch),
                                    self.actor(to_tensor(state_batch))])
        policy_loss = policy_loss.mean()
        # policy_loss = policy_loss.mean() + 1 / self.actor(to_tensor(state_batch)[-1])
        policy_loss.backward()
        self.actor_optim.step()

        #########################
        # Target soft update
        #########################
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def cuda(self):
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        if self.is_training:
            self.memory.append(self.s_t, self.a_t, r_t, done)
            self.s_t = s_t1

    def random_action(self):
        action = np.random.uniform(-1., 1., self.nb_actions)
        self.a_t = action
        return action

    # def select_action(self, s_t, decay_epsilon=True):
    #     action = to_numpy(self.actor(to_tensor(np.array([s_t])))).squeeze(0)
    #     action += self.is_training*max(self.epsilon, 0)*self.random_process.sample()
    #
    #     if decay_epsilon:
    #         self.epsilon -= self.depsilon
    #
    #     self.a_t = action
    #     return action

    def select_q_with_dropout(self, s_t, a_t):
        dropout_qs = np.array([])  # fixed typo: was np.arrary
        with torch.no_grad():
            for i in range(self.dropout_n):
                q_batch = to_numpy(
                    self.critic.forward_with_dropout(
                        [to_tensor(s_t), to_tensor(a_t)]
                    ).squeeze(0)[:-1])  # ignore aleatoric variance term
                dropout_qs = np.append(dropout_qs, [q_batch])
        q_mean = np.mean(dropout_qs)  # numpy stats: torch.mean does not accept ndarrays
        q_var = np.var(dropout_qs)
        return q_mean, q_var

    def select_action_with_dropout(self, s_t, decay_epsilon=True):
        dropout_actions = np.array([])
        with torch.no_grad():
            for i in range(self.dropout_n):
                action = to_numpy(
                    self.actor.forward_with_dropout(
                        to_tensor(np.array([s_t])))).squeeze(0)
                dropout_actions = np.append(dropout_actions, [action])

        if self.train_with_dropout:
            plt_action = to_numpy(
                self.actor.forward_with_dropout(
                    to_tensor(np.array([s_t])))).squeeze(0)
            plt_action += self.is_training * max(self.epsilon, 0) * self.random_process.sample()
        else:
            plt_action = to_numpy(self.actor(to_tensor(np.array([s_t])))).squeeze(0)
            plt_action += self.is_training * max(self.epsilon, 0) * self.random_process.sample()

        """ UNFIXED RESET POINT for Mujoco """
        if self.print_var_count != 0 and (self.print_var_count + 1) % 999 == 0:
            # self.action_std = np.append(self.action_std, [np.std(dropout_actions)])
            with open(self.save_dir + "/std.txt", "a") as myfile:
                myfile.write(str(np.std(dropout_actions)) + '\n')
            with open(self.save_dir + "/mean.txt", "a") as myfile:
                myfile.write(str(np.mean(dropout_actions)) + '\n')

        if self.print_var_count % (1000 * 5) == 0:
            print("dropout actions std", np.std(dropout_actions), " ",
                  "dir : ", str(self.save_dir))

        """ FIXED RESET POINT for MCC """
        # if s_t[0] == -0.5 and s_t[1] == 0:
        #     print("fixed dropout actions std", np.std(dropout_actions), " ", "dir : ", str(self.save_dir))
        #     self.action_std = np.append(self.action_std, [np.std(dropout_actions)])
        #     np.savetxt(self.save_dir + '/std.txt', self.action_std, fmt='%4.10f', delimiter=' ')
        #     with open(self.save_dir + "/std.txt", "a") as myfile:
        #         myfile.write(str(np.std(dropout_actions))+'\n')
        #     with open(self.save_dir + "/mean.txt", "a") as myfile:
        #         myfile.write(str(np.mean(dropout_actions))+'\n')

        if not os.path.isdir(self.save_dir + "/episode/" + str(self.episode)):
            os.makedirs(os.path.join(self.save_dir + "/episode/" + str(self.episode)))

        self.action_std = np.append(self.action_std, [np.std(dropout_actions)])
        with open(self.save_dir + "/episode/" + str(self.episode) + "/std.txt", "a") as myfile:
            myfile.write(str(np.std(dropout_actions)) + '\n')
        with open(self.save_dir + "/episode/" + str(self.episode) + "/mean.txt", "a") as myfile:
            myfile.write(str(np.mean(dropout_actions)) + '\n')

        self.print_var_count = self.print_var_count + 1

        if decay_epsilon:
            self.epsilon -= self.depsilon

        # dropout_action = np.array([np.mean(dropout_actions)])
        self.a_t = plt_action
        return plt_action

    def reset(self, obs):
        self.s_t = obs
        self.random_process.reset_states()

    def load_weights(self, output):
        if output is None:
            return
        self.actor.load_state_dict(torch.load('{}/actor.pkl'.format(output)))
        self.critic.load_state_dict(torch.load('{}/critic.pkl'.format(output)))

    def save_model(self, output):
        torch.save(self.actor.state_dict(), '{}/actor.pkl'.format(output))
        torch.save(self.critic.state_dict(), '{}/critic.pkl'.format(output))

    def seed(self, s):
        torch.manual_seed(s)
        if USE_CUDA:
            torch.cuda.manual_seed(s)
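# Sketch of the Monte-Carlo-dropout idea behind select_action_with_dropout /
# select_q_with_dropout above: keep dropout stochastic at inference, run N
# forward passes, and use the spread of the outputs as an epistemic-uncertainty
# estimate. `model` is any torch module with dropout layers; this illustrates
# the technique and is not the class API (which routes through forward_with_dropout).
import torch

def mc_dropout_stats(model, x, n_passes=20):
    model.train()  # keep dropout active on purpose
    with torch.no_grad():
        samples = torch.stack([model(x) for _ in range(n_passes)])
    return samples.mean(dim=0), samples.std(dim=0)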
def main(): print("#######") print( "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards" ) print("#######") os.environ['OMP_NUM_THREADS'] = '1' # logger = Logger(environment_name = args.env_name, entropy_coff= 'entropy_coeff_' + str(args.entropy_coef), folder = args.folder) # logger.save_args(args) # print ("---------------------------------------") # print ('Saving to', logger.save_folder) # print ("---------------------------------------") if args.vis: from visdom import Visdom viz = Visdom() win = None envs = [ make_env(args.env_name, args.seed, i, args.log_dir) for i in range(args.num_processes) ] ### for the number of processes to use if args.num_processes > 1: envs = SubprocVecEnv(envs) else: envs = DummyVecEnv(envs) if len(envs.observation_space.shape) == 1: envs = VecNormalize(envs) obs_shape = envs.observation_space.shape obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]) ## ALE Environments : mostly has Discrete action_space type if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs.action_space.shape[0] ### shape==3 for ALE Environments : States are 3D (Image Pi) if len(envs.observation_space.shape) == 3: actor = Actor(obs_shape[0], envs.action_space, args.recurrent_policy, envs.action_space.n) target_actor = Actor(obs_shape[0], envs.action_space, args.recurrent_policy, envs.action_space.n) critic = Critic(in_channels=4, num_actions=envs.action_space.n) critic_target = Critic(in_channels=4, num_actions=envs.action_space.n) baseline_target = Baseline_Critic(in_channels=4, num_actions=envs.action_space.n) if args.cuda: actor.cuda() critic.cuda() critic_target.cuda() target_actor.cuda() baseline_target.cuda() actor_optim = optim.Adam(actor.parameters(), lr=args.actor_lr) critic_optim = optim.Adam(critic.parameters(), lr=args.critic_lr) baseline_optim = optim.Adam(actor.parameters(), lr=1e-4) tau_soft_update = 0.001 mem_buffer = ReplayBuffer() rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor.state_size, envs.action_space.n) current_obs = torch.zeros(args.num_processes, *obs_shape) def update_current_obs(obs): shape_dim0 = envs.observation_space.shape[0] obs = torch.from_numpy(obs).float() if args.num_stack > 1: current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:] current_obs[:, -shape_dim0:] = obs obs = envs.reset() update_current_obs(obs) rollouts.observations[0].copy_(current_obs) # These variables are used to compute average rewards for all processes. 
episode_rewards = torch.zeros([args.num_processes, 1]) final_rewards = torch.zeros([args.num_processes, 1]) if args.cuda: current_obs = current_obs.cuda() rollouts.cuda() start = time.time() for j in range(num_updates): temperature = 1.0 ## num_steps = 5 as in A2C for step in range(args.num_steps): temperature = temperature / (step + 1) # Sample actions action, action_log_prob, states, dist_entropy = actor.act( Variable(rollouts.observations[step], volatile=True), Variable(rollouts.states[step], volatile=True), Variable(rollouts.masks[step], volatile=True), temperature, envs.action_space.n, args.num_processes) value = critic.forward( Variable(rollouts.observations[step], volatile=True), action_log_prob) cpu_actions = action.data.squeeze(1).cpu().numpy() # Obser reward and next obs obs, reward, done, info = envs.step(cpu_actions) reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() episode_rewards += reward # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) final_rewards *= masks final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks if args.cuda: masks = masks.cuda() if current_obs.dim() == 4: current_obs *= masks.unsqueeze(2).unsqueeze(2) else: current_obs *= masks pre_state = rollouts.observations[step].cpu().numpy() update_current_obs(obs) rollouts.insert(step, current_obs, states.data, action.data, action_log_prob.data, dist_entropy.data, value.data, reward, masks) nth_step_return = rollouts.returns[0].cpu().numpy() current_state = rollouts.observations[0].cpu().numpy() nth_state = rollouts.observations[-1].cpu().numpy() current_action = rollouts.action_log_probs[0].cpu().numpy() current_action_dist_entropy = rollouts.dist_entropy[0].cpu().numpy() mem_buffer.add((current_state, nth_state, current_action, nth_step_return, done, current_action_dist_entropy)) action, action_log_prob, states, dist_entropy = actor.act( Variable(rollouts.observations[-1], volatile=True), Variable(rollouts.states[-1], volatile=True), Variable(rollouts.masks[-1], volatile=True), temperature, envs.action_space.n, args.num_processes) #[0].data next_value = critic.forward( Variable(rollouts.observations[-1], volatile=True), action_log_prob).data rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) bs_size = args.batch_size if len(mem_buffer.storage) >= bs_size: ##samples from the replay buffer state, next_state, action, returns, done, entropy_log_prob = mem_buffer.sample( bs_size) next_state = next_state.reshape([-1, *obs_shape]) state = state.reshape([-1, *obs_shape]) action = action.reshape([-1, envs.action_space.n]) #current Q estimate q_batch = critic(to_tensor(state), to_tensor(action)) # target Q estimate next_state_action_probs = target_actor( to_tensor(next_state, volatile=True), to_tensor(next_state, volatile=True), to_tensor(next_state, volatile=True)) next_q_values = critic_target(to_tensor(next_state, volatile=True), next_state_action_probs[1]) next_q_values.volatile = False target_q_batch = to_tensor(returns) + args.gamma * to_tensor( done.astype(np.float)) * next_q_values critic.zero_grad() value_loss = criterion(q_batch, target_q_batch) if args.gradient_penalty == True: gradients = torch.autograd.grad(value_loss, critic.parameters(), allow_unused=True, retain_graph=True, create_graph=True, only_inputs=True)[0] gradient_penalty = ((gradients.norm(2, dim=1) - 1)** 2).mean() * args.lambda_grad_penalty gradient_penalty.backward() else: value_loss = criterion(q_batch, 
target_q_batch) value_loss.backward() critic_optim.step() actor.zero_grad() policy_loss = -critic( to_tensor(state), actor(to_tensor(state), to_tensor(state), to_tensor(state))[0]) ### Soft trust region constraint for the actor current_action_probs = actor(to_tensor(state, volatile=False), to_tensor(state, volatile=False), to_tensor(state, volatile=False))[0] target_action_probs = target_actor(to_tensor(state, volatile=True), to_tensor(state, volatile=True), to_tensor(state, volatile=True))[0] policy_regularizer = criterion(current_action_probs, target_action_probs) ## Actor update with entropy penalty policy_loss = policy_loss.mean() - args.entropy_coef * Variable(torch.from_numpy(np.expand_dims(entropy_log_prob.mean(), axis=0))).cuda() \ + args.actor_kl_lambda * policy_regularizer if args.actor_several_updates == True: for p in range(args.actor_updates): policy_loss.backward(retain_variables=True) else: policy_loss.backward() ##clipping of gradient norms gradient_norms = nn.utils.clip_grad_norm(actor.parameters(), args.max_grad_norm) print("gradient_norms", gradient_norms) actor_optim.step() if args.second_order_grads == True: """ Training the Baseline critic (f(s, \mu(s))) """ baseline_target.zero_grad() ## f(s, \mu(s)) current_baseline = baseline_target( to_tensor(state), actor(to_tensor(state), to_tensor(state), to_tensor(state))[0]) ## \grad f(s,a) grad_baseline_params = torch.autograd.grad( current_baseline.mean(), actor.parameters(), retain_graph=True, create_graph=True) ## MSE : (Q - f)^{2} baseline_loss = (q_batch.detach() - current_baseline).pow(2).mean() # baseline_loss.volatile=True actor.zero_grad() baseline_target.zero_grad() grad_norm = 0 for grad_1, grad_2 in zip(grad_params, grad_baseline_params): grad_norm += grad_1.data.pow(2).sum() - grad_2.pow(2).sum() grad_norm = grad_norm.sqrt() ##Loss for the Baseline approximator (f) overall_loss = baseline_loss + args.lambda_second_order_grads * grad_norm overall_loss.backward() baseline_optim.step() soft_update(target_actor, actor, tau_soft_update) soft_update(critic_target, critic, tau_soft_update) rollouts.after_update() if j % args.save_interval == 0 and args.save_dir != "" and len( mem_buffer.storage) >= bs_size: save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor if args.cuda: save_model = copy.deepcopy(actor).cpu() save_model = [ save_model, hasattr(envs, 'ob_rms') and envs.ob_rms or None ] torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) if j % args.log_interval == 0 and len(mem_buffer.storage) >= bs_size: end = time.time() total_num_steps = (j + 1) * args.num_processes * args.num_steps print( "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, value loss {:.5f}, policy loss {:.5f}, Entropy {:.5f}" .format(j, total_num_steps, int(total_num_steps / (end - start)), final_rewards.mean(), final_rewards.median(), final_rewards.min(), final_rewards.max(), value_loss.data.cpu().numpy()[0], policy_loss.data.cpu().numpy()[0], entropy_log_prob.mean())) final_rewards_mean = [final_rewards.mean()] final_rewards_median = [final_rewards.median()] final_rewards_min = [final_rewards.min()] final_rewards_max = [final_rewards.max()] all_value_loss = [value_loss.data.cpu().numpy()[0]] all_policy_loss = [policy_loss.data.cpu().numpy()[0]] # logger.record_data(final_rewards_mean, final_rewards_median, final_rewards_min, final_rewards_max, all_value_loss, 
all_policy_loss)
        # logger.save()

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo)
            except IOError:
                pass
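# The main() above pushes n-step rollout returns into the replay buffer and
# bootstraps the off-policy critic from the n-th state. A minimal sketch of
# that target (names are illustrative, not the RolloutStorage API; it assumes
# no termination inside the segment, with `mask` zeroing the bootstrap if the
# segment did end):
def n_step_target(rewards, bootstrap_value, gamma, mask):
    """R = r_0 + g*r_1 + ... + g^{n-1}*r_{n-1} + g^n * mask * V(s_n)."""
    ret = bootstrap_value * mask
    for r in reversed(rewards):
        ret = r + gamma * ret
    return ret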
class DDPG(object):
    def __init__(self, states_num, actions_num, args):
        self.states_num = states_num
        self.actions_num = actions_num
        self.epsilon = 1.0

        self.actor = Actor(self.states_num, self.actions_num, args.hidden1, args.hidden2)
        self.actor_target = Actor(self.states_num, self.actions_num, args.hidden1, args.hidden2)
        self.critic = Critic(self.states_num, self.actions_num, args.hidden1, args.hidden2)
        self.critic_target = Critic(self.states_num, self.actions_num, args.hidden1, args.hidden2)

        initialize(self.actor_target, self.actor)  # Make sure target starts with the same weights
        initialize(self.critic_target, self.critic)

        # The optimizers were used in update_policy() but never created here;
        # assuming the usual Adam setup and that args carries the two learning
        # rates (args.actor_lr / args.critic_lr are assumed names).
        self.actor_optim = Adam(self.actor.parameters(), lr=args.actor_lr)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.critic_lr)

        # Create replay buffer
        '''
        self.memory
        '''

        self.batch_size = args.bsize
        self.tau = args.tau
        self.discount = args.discount
        self.epsilon_decrease_factor = 1.0 / args.epsilon

        self.last_state = None
        self.last_action = None

    def update_policy(self):
        # Sample batch
        state_batch, action_batch, reward_batch, next_state_batch, terminal_batch = \
            self.memory.sample_and_split(self.batch_size)

        # Target q batch
        next_q_values = self.critic_target([
            to_tensor(next_state_batch, volatile=True),
            self.actor_target(to_tensor(next_state_batch, volatile=True))
        ])
        next_q_values.volatile = False

        target_q_batch = to_tensor(reward_batch) + self.discount * to_tensor(
            terminal_batch.astype(np.float)) * next_q_values

        # Critic update
        self.critic.zero_grad()
        q_batch = self.critic([to_tensor(state_batch), to_tensor(action_batch)])
        value_loss = nn.MSELoss()(q_batch, target_q_batch)  # nn.MSELoss is a module: instantiate, then call
        value_loss.backward()
        self.critic_optim.step()

        # Actor update
        self.actor.zero_grad()
        policy_loss = -self.critic([to_tensor(state_batch),
                                    self.actor(to_tensor(state_batch))])
        policy_loss = policy_loss.mean()
        policy_loss.backward()
        self.actor_optim.step()  # was missing: without this step the actor never moved

        # Target update
        update(self.actor_target, self.actor, self.tau)
        update(self.critic_target, self.critic, self.tau)

    def observe(self, reward_now, state_next, done):
        self.memory.append(self.last_state, self.last_action, reward_now, done)
        self.last_state = state_next

    def random_action(self):
        action = np.random.uniform(-100., 100., self.actions_num)
        self.last_action = action
        return action
class DDPG(object): def __init__(self, memory, nb_status, nb_actions, action_noise=None, gamma=0.99, tau=0.001, normalize_observations=True, batch_size=128, observation_range=(-5., 5.), action_range=(-1., 1.), actor_lr=1e-4, critic_lr=1e-3): self.nb_status = nb_status self.nb_actions = nb_actions self.action_range = action_range self.observation_range = observation_range self.normalize_observations = normalize_observations self.actor = Actor(self.nb_status, self.nb_actions) self.actor_target = Actor(self.nb_status, self.nb_actions) self.actor_optim = Adam(self.actor.parameters(), lr=actor_lr) self.critic = Critic(self.nb_status, self.nb_actions) self.critic_target = Critic(self.nb_status, self.nb_actions) self.critic_optim = Adam(self.critic.parameters(), lr=critic_lr) # Create replay buffer self.memory = memory # SequentialMemory(limit=args.rmsize, window_length=args.window_length) self.action_noise = action_noise # Hyper-parameters self.batch_size = batch_size self.tau = tau self.discount = gamma if self.normalize_observations: self.obs_rms = RunningMeanStd() else: self.obs_rms = None def pi(self, obs, apply_noise=True, compute_Q=True): obs = np.array([obs]) action = to_numpy(self.actor(to_tensor(obs))).squeeze(0) if compute_Q: q = self.critic([to_tensor(obs), to_tensor(action)]).cpu().data else: q = None if self.action_noise is not None and apply_noise: noise = self.action_noise() assert noise.shape == action.shape action += noise action = np.clip(action, self.action_range[0], self.action_range[1]) return action, q[0][0] def store_transition(self, obs0, action, reward, obs1, terminal1): self.memory.append(obs0, action, reward, obs1, terminal1) if self.normalize_observations: self.obs_rms.update(np.array([obs0])) def train(self): # Get a batch. batch = self.memory.sample(batch_size=self.batch_size) next_q_values = self.critic_target([ to_tensor(batch['obs1'], volatile=True), self.actor_target(to_tensor(batch['obs1'], volatile=True)) ]) next_q_values.volatile = False target_q_batch = to_tensor(batch['rewards']) + \ self.discount * to_tensor(1 - batch['terminals1'].astype('float32')) * next_q_values self.critic.zero_grad() q_batch = self.critic( [to_tensor(batch['obs0']), to_tensor(batch['actions'])]) value_loss = criterion(q_batch, target_q_batch) value_loss.backward() self.critic_optim.step() self.actor.zero_grad() policy_loss = -self.critic( [to_tensor(batch['obs0']), self.actor(to_tensor(batch['obs0']))]).mean() policy_loss.backward() self.actor_optim.step() # Target update soft_update(self.actor_target, self.actor, self.tau) soft_update(self.critic_target, self.critic, self.tau) return value_loss.cpu().data[0], policy_loss.cpu().data[0] def initialize(self): hard_update(self.actor_target, self.actor) # Make sure target is with the same weight hard_update(self.critic_target, self.critic) def update_target_net(self): soft_update(self.actor_target, self.actor, self.tau) soft_update(self.critic_target, self.critic, self.tau) def reset(self): if self.action_noise is not None: self.action_noise.reset() def cuda(self): self.actor.cuda() self.actor_target.cuda() self.critic.cuda() self.critic_target.cuda()
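# RunningMeanStd above normalizes observations online but is not defined in
# this file. A common minimal version, sketched under the assumption that it
# only needs update() plus .mean / .var attributes, uses the parallel (Chan)
# update of count, mean and variance:
import numpy as np

class RunningMeanStdSketch:
    def __init__(self, shape=()):
        self.mean = np.zeros(shape, dtype=np.float64)
        self.var = np.ones(shape, dtype=np.float64)
        self.count = 1e-4  # avoids division by zero before the first update

    @property
    def std(self):
        return np.sqrt(self.var)

    def update(self, x):
        batch_mean, batch_var, n = x.mean(axis=0), x.var(axis=0), x.shape[0]
        delta = batch_mean - self.mean
        tot = self.count + n
        new_mean = self.mean + delta * n / tot
        m_a = self.var * self.count
        m_b = batch_var * n
        m2 = m_a + m_b + delta ** 2 * self.count * n / tot
        self.mean, self.var, self.count = new_mean, m2 / tot, tot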
class DDPG():
    def __init__(self, env, action_dim, state_dim, device, critic_lr=3e-4, actor_lr=3e-4,
                 gamma=0.99, batch_size=100, validate_steps=100, max_episode_length=150):
        """
        param: env: A gym environment
        param: action_dim: Size of action space
        param: state_dim: Size of state space
        param: critic_lr: Learning rate of the critic
        param: actor_lr: Learning rate of the actor
        param: gamma: The discount factor
        param: batch_size: The batch size for training
        param: device: The device used for training
        param: validate_steps: Number of iterations after which we evaluate trained policy
        """
        self.gamma = gamma
        self.batch_size = batch_size
        self.env = env
        self.device = device
        self.eval_env = deepcopy(env)
        self.validate_steps = validate_steps
        self.max_episode_length = max_episode_length

        # actor and actor_target where both networks have the same initial weights
        self.actor = Actor(state_dim=state_dim, action_dim=action_dim).to(self.device)
        self.actor_target = deepcopy(self.actor)

        # critic and critic_target where both networks have the same initial weights
        self.critic = Critic(state_dim=state_dim, action_dim=action_dim).to(self.device)
        self.critic_target = deepcopy(self.critic)

        # Optimizer for the actor and critic
        self.optimizer_actor = optim.Adam(self.actor.parameters(), lr=actor_lr)
        self.optimizer_critic = optim.Adam(self.critic.parameters(), lr=critic_lr)

        # Replay buffer
        self.ReplayBuffer = ReplayBuffer(buffer_size=10000, init_length=1000,
                                         state_dim=state_dim, action_dim=action_dim,
                                         env=env, device=device)

    def update_target_networks(self):
        """A function to update the target networks"""
        weighSync(self.actor_target, self.actor)
        weighSync(self.critic_target, self.critic)

    def update_network(self, batch):
        """A function to update the networks from a single sampled batch"""
        # Sample and parse batch
        state, action, reward, state_next, done = self.ReplayBuffer.batch_sample(batch)

        # Predicting the next action and q_value
        action_next = self.actor_target(state_next)
        q_next = self.critic_target(state_next, action_next)
        # Bootstrap only from non-terminal next states; the original multiplied
        # by `done` instead of `(1 - done)`, which inverted the terminal mask.
        target_q = reward + (self.gamma * (1 - done) * q_next)
        q = self.critic(state, action)

        # Critic update
        self.critic.zero_grad()
        value_loss = F.mse_loss(q, target_q)
        value_loss.backward()
        self.optimizer_critic.step()

        # Actor update
        self.actor.zero_grad()
        policy_loss = -self.critic(state, self.actor(state)).mean()
        policy_loss.backward()
        self.optimizer_actor.step()

        # Target update
        self.update_target_networks()
        return value_loss.item(), policy_loss.item()

    def select_action(self, state, isEval):
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
        action = self.actor(state).squeeze(0).detach()
        if isEval:
            return action.cpu().numpy()
        action += torch.normal(0, 0.1, size=action.shape).to(self.device)
        action = torch.clamp(action, -1., 1.).cpu().numpy()
        return action

    def train(self, num_steps):
        """
        Train the policy for the given number of iterations
        :param num_steps: The number of steps to train the policy for
        """
        value_losses, policy_losses, validation_reward, validation_steps = [], [], [], []
        step, episode, episode_steps, episode_reward, state = 0, 0, 0, 0., None
        while step < num_steps:
            # reset if it is the start of episode
            if state is None:
                state = deepcopy(self.env.reset())

            action = self.select_action(state, False)

            # env response with next_state, reward, terminate_info
            state_next, reward, done, _ = self.env.step(action)
            state_next = deepcopy(state_next)
            if self.max_episode_length and episode_steps >= self.max_episode_length - 1:
                done = True

            # observe and store in replay buffer
            self.ReplayBuffer.buffer_add(
                Exp(state=state, action=action, reward=reward,
                    state_next=state_next, done=done))

            # update policy based on sampled batch
            batch = self.ReplayBuffer.buffer_sample(self.batch_size)
            value_loss, policy_loss = self.update_network(batch)
            value_losses.append(value_loss)
            policy_losses.append(policy_loss)

            # evaluate
            if step % self.validate_steps == 0:
                validate_reward, steps = self.evaluate()
                validation_reward.append(validate_reward)
                validation_steps.append(steps)
                print("[Eval {:06d}/{:06d}] Steps: {:06d}, Avg Reward/Step:{:04f}".format(
                    step, int(num_steps), steps, validate_reward))

            # update
            step += 1
            episode_steps += 1
            episode_reward += reward
            state = deepcopy(state_next)

            if done:  # reset at the end of episode
                # print("[Train {:06d}/{:06d}] - Episode Reward:{:04f}".format(step, num_steps, episode_reward))
                episode_steps, episode_reward, state = 0, 0., None
                episode += 1

        return value_losses, policy_losses, validation_reward, validation_steps

    def evaluate(self):
        """
        Evaluate the policy trained so far in an evaluation environment;
        returns the per-step average reward and the episode length.
        """
        state, done, total_reward, steps = self.eval_env.reset(), False, 0., 0
        while not done:
            action = self.select_action(state, True)
            state_next, reward, done, _ = self.eval_env.step(action)
            total_reward += reward
            steps += 1
            state = state_next
        return total_reward / steps, steps
class DDPG(object): def __init__(self, nb_states, nb_actions, args): if args.seed > 0: self.seed(args.seed) self.nb_states = nb_states self.nb_actions = nb_actions # Create Actor and Critic Network net_cfg = { "hidden1": args.hidden1, "hidden2": args.hidden2, "init_w": args.init_w, } self.actor = Actor(self.nb_states, self.nb_actions, **net_cfg) self.actor_target = Actor(self.nb_states, self.nb_actions, **net_cfg) self.actor_optim = Adam(self.actor.parameters(), lr=args.prate) self.critic = Critic(self.nb_states, self.nb_actions, **net_cfg) self.critic_target = Critic(self.nb_states, self.nb_actions, **net_cfg) self.critic_optim = Adam(self.critic.parameters(), lr=args.rate) hard_update( self.actor_target, self.actor ) # Make sure target is with the same weight hard_update(self.critic_target, self.critic) # Create replay buffer self.memory = SequentialMemory( limit=args.rmsize, window_length=args.window_length ) self.random_process = OrnsteinUhlenbeckProcess( size=nb_actions, theta=args.ou_theta, mu=args.ou_mu, sigma=args.ou_sigma ) # Hyper-parameters self.batch_size = args.bsize self.tau = args.tau self.discount = args.discount self.depsilon = 1.0 / args.epsilon # self.epsilon = 1.0 self.s_t = None # Most recent state self.a_t = None # Most recent action self.is_training = True # if USE_CUDA: self.cuda() def update_policy(self): # Sample batch ( state_batch, action_batch, reward_batch, next_state_batch, terminal_batch, ) = self.memory.sample_and_split(self.batch_size) # Prepare for the target q batch next_q_values = self.critic_target( [ to_tensor(next_state_batch, volatile=True), self.actor_target(to_tensor(next_state_batch, volatile=True)), ] ) # next_q_values.volatile = False target_q_batch = ( to_tensor(reward_batch) + self.discount * to_tensor(terminal_batch.astype(np.float)) * next_q_values ) # Critic update self.critic.zero_grad() q_batch = self.critic([to_tensor(state_batch), to_tensor(action_batch)]) value_loss = criterion(q_batch, target_q_batch) value_loss.backward() self.critic_optim.step() # Actor update self.actor.zero_grad() policy_loss = -self.critic( [to_tensor(state_batch), self.actor(to_tensor(state_batch))] ) policy_loss = policy_loss.mean() policy_loss.backward() self.actor_optim.step() # Target update soft_update(self.actor_target, self.actor, self.tau) soft_update(self.critic_target, self.critic, self.tau) def eval(self): self.actor.eval() self.actor_target.eval() self.critic.eval() self.critic_target.eval() def cuda(self): self.actor.cuda() self.actor_target.cuda() self.critic.cuda() self.critic_target.cuda() def observe(self, r_t, s_t1, done): if self.is_training: self.memory.append(self.s_t, self.a_t, r_t, done) self.s_t = s_t1 def random_action(self): action = np.random.uniform(-1.0, 1.0, self.nb_actions) self.a_t = action return action def select_action(self, s_t, decay_epsilon=True): action = to_numpy(self.actor(to_tensor(np.array([s_t])))).squeeze(0) action += self.is_training * max(self.epsilon, 0) * self.random_process.sample() action = np.clip(action, -1.0, 1.0) if decay_epsilon: self.epsilon -= self.depsilon self.a_t = action return action def reset(self, obs): self.s_t = obs self.random_process.reset_states() def load_weights(self, output): if output is None: return self.actor.load_state_dict(torch.load("{}/actor.pkl".format(output))) self.critic.load_state_dict(torch.load("{}/critic.pkl".format(output))) def save_model(self, output): torch.save(self.actor.state_dict(), "{}/actor.pkl".format(output)) torch.save(self.critic.state_dict(), 
"{}/critic.pkl".format(output)) def seed(self, s): torch.manual_seed(s) if USE_CUDA: torch.cuda.manual_seed(s)
def training(opt): # ~~~~~~~~~~~~~~~~~~~ hyper parameters ~~~~~~~~~~~~~~~~~~~ # EPOCHS = opt.epochs CHANNELS = 1 H, W = 64, 64 work_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') FEATURE_D = 128 Z_DIM = 100 BATCH_SIZE = opt.batch_size # ~~~~~~~~~~~~~~~~~~~ as per WGAN paper ~~~~~~~~~~~~~~~~~~~ # lr = opt.lr CRITIC_TRAIN_STEPS = 5 WEIGHT_CLIP = 0.01 print(f"Epochs: {EPOCHS}| lr: {lr}| batch size {BATCH_SIZE}|" + f" device: {work_device}") # ~~~~~~~~~~~ creating directories for weights ~~~~~~~~~~~ # if opt.logs: log_dir = Path(f'{opt.logs}').resolve() if log_dir.exists(): shutil.rmtree(str(log_dir)) if opt.weights: Weight_dir = Path(f'{opt.weights}').resolve() if not Weight_dir.exists(): Weight_dir.mkdir() # ~~~~~~~~~~~~~~~~~~~ loading the dataset ~~~~~~~~~~~~~~~~~~~ # trans = transforms.Compose([ transforms.Resize((H, W)), transforms.ToTensor(), transforms.Normalize((0.5, ), (0.5, )) ]) MNIST_data = MNIST(str(opt.data_dir), True, transform=trans, download=True) loader = DataLoader( MNIST_data, BATCH_SIZE, True, num_workers=2, pin_memory=True, ) # ~~~~~~~~~~~~~~~~~~~ creating tensorboard variables ~~~~~~~~~~~~~~~~~~~ # writer_fake = SummaryWriter(f"{str(log_dir)}/fake") writer_real = SummaryWriter(f"{str(log_dir)}/real") loss_writer = SummaryWriter(f"{str(log_dir)}/loss") # ~~~~~~~~~~~~~~~~~~~ loading the model ~~~~~~~~~~~~~~~~~~~ # critic = Critic(img_channels=CHANNELS, feature_d=FEATURE_D).to(work_device) gen = Faker(Z_DIM, CHANNELS, FEATURE_D).to(work_device) if opt.resume: if Path(Weight_dir / 'critic.pth').exists(): critic.load_state_dict( torch.load(str(Weight_dir / 'critic.pth'), map_location=work_device)) if Path(Weight_dir / 'generator.pth').exists(): gen.load_state_dict( torch.load(str(Weight_dir / 'generator.pth'), map_location=work_device)) # ~~~~~~~~~~~~~~~~~~~ create optimizers ~~~~~~~~~~~~~~~~~~~ # critic_optim = optim.RMSprop(critic.parameters(), lr) gen_optim = optim.RMSprop(gen.parameters(), lr) # ~~~~~~~~~~~~~~~~~~~ training loop ~~~~~~~~~~~~~~~~~~~ # # loss variables C_loss_prev = math.inf G_loss_prev = math.inf C_loss = 0 G_loss = 0 C_loss_avg = 0 G_loss_avg = 0 print_gpu_details() # setting the models to train mode critic.train() gen.train() for epoch in range(EPOCHS): # reset the average loss to zero C_loss_avg = 0 G_loss_avg = 0 print_memory_utilization() for batch_idx, (real, _) in enumerate(tqdm(loader)): real = real.to(work_device) fixed_noise = torch.rand(real.shape[0], Z_DIM, 1, 1).to(work_device) # ~~~~~~~~~~~~~~~~~~~ critic loop ~~~~~~~~~~~~~~~~~~~ # with torch.no_grad(): fake = gen(fixed_noise) # dim of (N,1,W,H) for _ in range(CRITIC_TRAIN_STEPS): critic.zero_grad() # ~~~~~~~~~~~ weight cliping as per WGAN paper ~~~~~~~~~~ # for p in critic.parameters(): p.data.clamp_(-WEIGHT_CLIP, WEIGHT_CLIP) # ~~~~~~~~~~~~~~~~~~~ forward ~~~~~~~~~~~~~~~~~~~ # # make it one dimensional array real_predict = critic(real).view(-1) # make it one dimensional array fake_predict = critic(fake.detach()).view(-1) # ~~~~~~~~~~~~~~~~~~~ loss ~~~~~~~~~~~~~~~~~~~ # C_loss = -(torch.mean(fake_predict) - torch.mean(real_predict)) C_loss_avg += C_loss # ~~~~~~~~~~~~~~~~~~~ backward ~~~~~~~~~~~~~~~~~~~ # C_loss.backward() critic_optim.step() # ~~~~~~~~~~~~~~~~~~~ generator loop ~~~~~~~~~~~~~~~~~~~ # gen.zero_grad() # ~~~~~~~~~~~~~~~~~~~ forward ~~~~~~~~~~~~~~~~~~~ # # make it one dimensional array fake_predict = critic(fake).view(-1) # ~~~~~~~~~~~~~~~~~~~ loss ~~~~~~~~~~~~~~~~~~~ # G_loss = -(torch.mean(fake_predict)) G_loss_avg += G_loss # ~~~~~~~~~~~~~~~~~~~ 
backward ~~~~~~~~~~~~~~~~~~~ # G_loss.backward() gen_optim.step() # ~~~~~~~~~~~~~~~~~~~ loading the tensorboard ~~~~~~~~~~~~~~~~~~~ # # will execute at every 50 steps if (batch_idx + 1) % 50 == 0: # ~~~~~~~~~~~~ calculate average loss ~~~~~~~~~~~~~ # C_loss_avg_ = C_loss_avg / (CRITIC_TRAIN_STEPS * batch_idx) G_loss_avg_ = G_loss_avg / (batch_idx) print(f"Epoch [{epoch}/{EPOCHS}] | batch size {batch_idx}" + f"Loss C: {C_loss_avg_:.4f}, loss G: {G_loss_avg_:.4f}") # ~~~~~~~~~~~~ send data to tensorboard ~~~~~~~~~~~~~ # with torch.no_grad(): critic.eval() gen.eval() if BATCH_SIZE > 32: fake = gen(fixed_noise[:32]).reshape( -1, CHANNELS, H, W) data = real[:32].reshape(-1, CHANNELS, H, W) else: fake = gen(fixed_noise).reshape(-1, CHANNELS, H, W) data = real.reshape(-1, CHANNELS, H, W) img_grid_fake = torchvision.utils.make_grid(fake, normalize=True) img_grid_real = torchvision.utils.make_grid(data, normalize=True) step = (epoch + 1) * (batch_idx + 1) writer_fake.add_image("Mnist Fake Images", img_grid_fake, global_step=step) writer_real.add_image("Mnist Real Images", img_grid_real, global_step=step) loss_writer.add_scalar('Critic', C_loss, global_step=step) loss_writer.add_scalar('generator', G_loss, global_step=step) # changing back the model to train mode critic.train() gen.train() # ~~~~~~~~~~~~~~~~~~~ saving the weights ~~~~~~~~~~~~~~~~~~~ # if opt.weights: if C_loss_prev > C_loss_avg: C_loss_prev = C_loss_avg weight_path = str(Weight_dir / 'critic.pth') torch.save(critic.state_dict(), weight_path) if G_loss_prev > G_loss_avg: G_loss_prev = G_loss_avg weight_path = str(Weight_dir / 'generator.pth') torch.save(gen.state_dict(), weight_path)
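# The WEIGHT_CLIP loop above is the plain-WGAN way of (crudely) enforcing the
# 1-Lipschitz constraint required by the Kantorovich-Rubinstein dual form
#   W(P_r, P_g) = sup_{||f||_L <= 1} E_{x~P_r}[f(x)] - E_{x~P_g}[f(x)],
# which is why C_loss is the negated difference of critic means. As a small,
# purely illustrative usage sketch, the generator checkpoint saved to
# Weight_dir above can be reloaded for sampling:
import torch

def sample_fakes(gen, z_dim=100, n=16, device='cpu'):
    gen.eval()
    with torch.no_grad():
        noise = torch.rand(n, z_dim, 1, 1, device=device)  # uniform noise, as in training
        return gen(noise)  # (n, CHANNELS, H, W)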
class DDPG(object): def __init__(self, nb_status, nb_actions, args): self.num_actor = 3 self.nb_status = nb_status * args.window_length self.nb_actions = nb_actions self.discrete = args.discrete self.pic = args.pic if self.pic: self.nb_status = args.pic_status # Create Actor and Critic Network net_cfg = { 'hidden1': args.hidden1, 'hidden2': args.hidden2, 'use_bn': args.bn } if args.pic: self.cnn = CNN(3, args.pic_status) self.cnn_optim = Adam(self.cnn.parameters(), lr=args.crate) self.actors = [ Actor(self.nb_status, self.nb_actions) for _ in range(self.num_actor) ] self.actor_targets = [ Actor(self.nb_status, self.nb_actions) for _ in range(self.num_actor) ] self.actor_optims = [ Adam(self.actors[i].parameters(), lr=args.prate) for i in range(self.num_actor) ] self.critic = Critic(self.nb_status, self.nb_actions, **net_cfg) self.critic_target = Critic(self.nb_status, self.nb_actions, **net_cfg) self.critic_optim = Adam(self.critic.parameters(), lr=args.rate) for i in range(self.num_actor): hard_update( self.actor_targets[i], self.actors[i]) # Make sure target is with the same weight hard_update(self.critic_target, self.critic) #Create replay buffer self.memory = rpm( args.rmsize ) # SequentialMemory(limit=args.rmsize, window_length=args.window_length) self.random_process = Myrandom(size=nb_actions) # Hyper-parameters self.batch_size = args.batch_size self.tau = args.tau self.discount = args.discount self.depsilon = 1.0 / args.epsilon # self.epsilon = 1.0 self.s_t = None # Most recent state self.a_t = None # Most recent action self.use_cuda = args.cuda # if self.use_cuda: self.cuda() def normalize(self, pic): pic = pic.swapaxes(0, 2).swapaxes(1, 2) return pic def update_policy(self, train_actor=True): # Sample batch state_batch, action_batch, reward_batch, \ next_state_batch, terminal_batch = self.memory.sample_batch(self.batch_size) # Prepare for the target q batch if self.pic: state_batch = np.array([self.normalize(x) for x in state_batch]) state_batch = to_tensor(state_batch, volatile=True) print('label 1') print('size = ', state_batch.shape) state_batch = self.cnn(state_batch) print('label 2') next_state_batch = np.array( [self.normalize(x) for x in next_state_batch]) next_state_batch = to_tensor(next_state_batch, volatile=True) next_state_batch = self.cnn(next_state_batch) next_q_values = self.critic_target( [next_state_batch, self.actor_target(next_state_batch)]) else: index = np.random.randint(low=0, high=self.num_actor) next_q_values = self.critic_target([ to_tensor(next_state_batch, volatile=True), self.actor_targets[index](to_tensor(next_state_batch, volatile=True)), ]) # print('batch of picture is ok') next_q_values.volatile = False target_q_batch = to_tensor(reward_batch) + \ self.discount * to_tensor((1 - terminal_batch.astype(np.float))) * next_q_values # Critic update self.critic.zero_grad() if self.pic: self.cnn.zero_grad() if self.pic: state_batch.volatile = False q_batch = self.critic([state_batch, to_tensor(action_batch)]) else: q_batch = self.critic( [to_tensor(state_batch), to_tensor(action_batch)]) # print(reward_batch, next_q_values*self.discount, target_q_batch, terminal_batch.astype(np.float)) value_loss = criterion(q_batch, target_q_batch) value_loss.backward() self.critic_optim.step() if self.pic: self.cnn_optim.step() sum_policy_loss = 0 for i in range(self.num_actor): self.actors[i].zero_grad() policy_loss = -self.critic([ to_tensor(state_batch), self.actors[i](to_tensor(state_batch)) ]) policy_loss = policy_loss.mean() policy_loss.backward() if train_actor: 
self.actor_optims[i].step() sum_policy_loss += policy_loss # Target update soft_update(self.actor_targets[i], self.actors[i], self.tau) soft_update(self.critic_target, self.critic, self.tau) return -sum_policy_loss / self.num_actor, value_loss def eval(self): self.actor.eval() self.actor_target.eval() self.critic.eval() self.critic_target.eval() def train(self): self.actor.train() self.actor_target.train() self.critic.train() self.critic_target.train() def cuda(self): for i in range(self.num_actor): self.actors[i].cuda() self.actor_targets[i].cuda() self.critic.cuda() self.critic_target.cuda() def observe(self, r_t, s_t1, done): self.memory.append([self.s_t, self.a_t, r_t, s_t1, done]) self.s_t = s_t1 def random_action(self): action = np.random.uniform(-1., 1., self.nb_actions) self.a_t = action if self.discrete: return action.argmax() else: return action def select_action(self, s_t, decay_epsilon=True, return_fix=False, noise_level=0): actions = [] status = [] tot_score = [] for i in range(self.num_actor): action = to_numpy(self.actors[i](to_tensor( np.array([s_t]), volatile=True))).squeeze(0) noise_level = noise_level * max(self.epsilon, 0) action = action + self.random_process.sample() * noise_level status.append(s_t) actions.append(action) tot_score.append(0.) scores = self.critic([ to_tensor(np.array(status), volatile=True), to_tensor(np.array(actions), volatile=True) ]) for j in range(self.num_actor): tot_score[j] += scores.data[j][0] best = np.array(tot_score).argmax() if decay_epsilon: self.epsilon -= self.depsilon self.a_t = actions[best] return actions[best] def reset(self, obs): self.s_t = obs self.random_process.reset_status() def load_weights(self, output, num=0): if output is None: return self.actor.load_state_dict( torch.load('{}/actor{}.pkl'.format(output, num))) self.actor_target.load_state_dict( torch.load('{}/actor{}.pkl'.format(output, num))) self.critic.load_state_dict( torch.load('{}/critic{}.pkl'.format(output, num))) self.critic_target.load_state_dict( torch.load('{}/critic{}.pkl'.format(output, num))) def save_model(self, output, num): if self.use_cuda: self.actor.cpu() self.critic.cpu() torch.save(self.actor.state_dict(), '{}/actor{}.pkl'.format(output, num)) torch.save(self.critic.state_dict(), '{}/critic{}.pkl'.format(output, num)) if self.use_cuda: self.actor.cuda() self.critic.cuda()
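# select_action above picks among the ensemble of actors by scoring each
# proposed action with the critic and taking the argmax. Stripped of the
# noise/epsilon plumbing, the idea reduces to this sketch, where `actors` are
# callables mapping a state to a numpy action and `critic_fn(states, actions)`
# returns Q values (both are illustrative stand-ins):
import numpy as np

def best_ensemble_action(actors, critic_fn, s_t):
    candidates = [actor(s_t) for actor in actors]
    scores = critic_fn(np.repeat([s_t], len(candidates), axis=0), np.array(candidates))
    return candidates[int(np.argmax(scores))]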
for epoch in range(num_epochs):
    for batch_idx, (real, labels) in enumerate(loader):
        real = real.to(device)
        labels = labels.to(device)

        # For Critic
        for _ in range(critic_n):
            noise = torch.randn((batch_size, z_dim, 1, 1)).to(device)
            fake = gen(noise, labels)
            critic_real = critic(real, labels)
            critic_fake = critic(fake, labels)
            gp = gradient_penalty(critic, real, labels, fake, device)
            # The penalty must be *added* to the negated Wasserstein estimate;
            # folding it inside the negation (as before) rewards constraint violations.
            loss_critic = -(torch.mean(critic_real) - torch.mean(critic_fake)) + lambda_gp * gp
            critic.zero_grad()
            loss_critic.backward(retain_graph=True)
            opt_critic.step()

        # For Generator
        output = critic(fake, labels).view(-1)
        loss_gen = -torch.mean(output)
        gen.zero_grad()
        loss_gen.backward()
        opt_gen.step()

        if batch_idx == 0:
            print(f"Epoch [{epoch}/{num_epochs}] Batch {batch_idx}/{len(loader)} "
                  f"Loss D: {loss_critic:.4f}, loss G: {loss_gen:.4f}")
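# gradient_penalty(...) is called above but not defined in this file. A common
# WGAN-GP implementation matching the (critic, real, labels, fake, device)
# call signature interpolates real/fake images and penalizes the critic's
# gradient norm for deviating from 1. This is a sketch of the standard recipe,
# not necessarily the author's exact helper:
import torch

def gradient_penalty(critic, real, labels, fake, device):
    bs = real.size(0)
    eps = torch.rand(bs, 1, 1, 1, device=device).expand_as(real)
    interp = (eps * real + (1 - eps) * fake).requires_grad_(True)
    scores = critic(interp, labels)
    grads = torch.autograd.grad(outputs=scores, inputs=interp,
                                grad_outputs=torch.ones_like(scores),
                                create_graph=True, retain_graph=True)[0]
    grad_norm = grads.view(bs, -1).norm(2, dim=1)
    return ((grad_norm - 1) ** 2).mean()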
class Agent:
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, buffer_size=int(1e5), batch_size=256,
                 learn_every=1, update_every=1, gamma=0.99, tau=0.02,
                 lr_actor=2e-4, lr_critic=2e-3, random_seed=None,
                 use_asn=True, asn_kwargs={}, use_psn=False, psn_kwargs={},
                 use_per=False, restore=None):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.update_every = update_every
        self.learn_every = learn_every
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        # Keep track of how many times we've updated weights
        self.i_updates = 0
        self.i_step = 0
        self.use_asn = use_asn
        self.use_psn = use_psn
        self.use_per = use_per

        if random_seed is not None:
            random.seed(random_seed)

        self.actor_local = Actor(state_size, action_size).to(device)
        self.actor_target = Actor(state_size, action_size).to(device)
        if self.use_psn:
            self.actor_perturbed = Actor(state_size, action_size).to(device)

        self.critic_local = Critic(state_size, action_size).to(device)
        self.critic_target = Critic(state_size, action_size).to(device)

        # restore networks if needed
        if restore is not None:
            checkpoint = torch.load(restore, map_location=device)
            self.actor_local.load_state_dict(checkpoint[0]['actor'])
            self.actor_target.load_state_dict(checkpoint[0]['actor'])
            if self.use_psn:
                self.actor_perturbed.load_state_dict(checkpoint[0]['actor'])
            self.critic_local.load_state_dict(checkpoint[0]['critic'])
            self.critic_target.load_state_dict(checkpoint[0]['critic'])

        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=lr_critic)

        # Hard copy weights from local to target networks
        policy_update(self.actor_local, self.actor_target, 1.0)
        policy_update(self.critic_local, self.critic_target, 1.0)

        # Noise process
        if self.use_asn:
            self.action_noise = OUNoise(action_size, **asn_kwargs)
        if self.use_psn:
            self.param_noise = ParameterSpaceNoise(**psn_kwargs)

        if self.use_per:
            self.buffer = PrioritizedExperienceReplay(buffer_size, batch_size, random_seed)
        else:
            self.buffer = ExperienceReplay(buffer_size, batch_size, random_seed)

    def act(self, states, perturb_mode=True, train_mode=True):
        """Returns actions for given state as per current policy."""
        if not train_mode:
            self.actor_local.eval()
            if self.use_psn:
                self.actor_perturbed.eval()
        with torch.no_grad():
            states = torch.from_numpy(states).float().to(device)
            actor = self.actor_perturbed if (self.use_psn and perturb_mode) else self.actor_local
            actions = actor(states).cpu().numpy()[0]
        if train_mode:
            actions += self.action_noise.sample()
        self.actor_local.train()
        if self.use_psn:
            self.actor_perturbed.train()
        return np.clip(actions, -1, 1)

    def perturb_actor_parameters(self):
        """Apply parameter space noise to actor model, for exploration"""
        policy_update(self.actor_local, self.actor_perturbed, 1.0)
        params = self.actor_perturbed.state_dict()
        for name in params:
            if 'ln' in name:
                continue  # skip layer-norm parameters; the original `pass` silently perturbed them too
            param = params[name]
            random = torch.randn(param.shape)
            if use_cuda:
                random = random.cuda()
            param += random * self.param_noise.current_stddev

    def reset(self):
        self.action_noise.reset()
        if self.use_psn:
            self.perturb_actor_parameters()

    def step(self, experience, priority=0.0):
        self.buffer.push(experience)
        self.i_step += 1
        if len(self.buffer) > self.batch_size:
            if self.i_step % self.learn_every == 0:
                self.learn(priority)
            if self.i_step % self.update_every == 0:
                self.update()  # soft update the target network towards the actual networks

    def learn(self, priority=0.0):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        if self.use_per:
            (states, actions, rewards, states_next, dones), batch_idx = self.buffer.sample(priority)
        else:
            states, actions, rewards, states_next, dones = self.buffer.sample()

        # Get predicted next-state actions and Q values from target models
        with torch.no_grad():
            actions_next = self.actor_target(states_next)
            Q_targets_next = self.critic_target(states_next, actions_next)
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))

        # ---------------------------- update critic ---------------------------- #
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.smooth_l1_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_local.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_local.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        if self.use_per:
            Q_error = Q_expected - Q_targets
            new_deltas = torch.abs(Q_error.detach().squeeze(1)).cpu().numpy()  # .cpu() so this also works on GPU
            self.buffer.update_deltas(batch_idx, new_deltas)

    def update(self):
        """soft update targets"""
        self.i_updates += 1
        policy_update(self.actor_local, self.actor_target, self.tau)
        policy_update(self.critic_local, self.critic_target, self.tau)

    def save_model(self, model_dir, session_name, i_episode, best):
        filename = os.path.join(
            model_dir, f'ddpg_{session_name}-EP_{i_episode}-score_{best:.3f}.pt')
        filename_best = os.path.join(model_dir, f'ddpg_{session_name}-best.pt')
        save_dict_list = []
        save_dict = {
            'actor': self.actor_local.state_dict(),
            'actor_optim_params': self.actor_optimizer.state_dict(),
            'critic': self.critic_local.state_dict(),
            'critic_optim_params': self.critic_optimizer.state_dict()
        }
        save_dict_list.append(save_dict)
        torch.save(save_dict_list, filename)
        copyfile(filename, filename_best)

    def postprocess(self, t_step):
        if self.use_psn and t_step > 0:
            perturbed_states, perturbed_actions, _, _, _ = self.buffer.tail(t_step)
            unperturbed_actions = self.act(np.array(perturbed_states), False, False)
            diff = np.array(perturbed_actions) - unperturbed_actions
            mean_diff = np.mean(np.square(diff), axis=0)
            dist = sqrt(np.mean(mean_diff))
            self.param_noise.adapt(dist)
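# ParameterSpaceNoise.adapt(dist) above follows the adaptive scheme of
# Plappert et al. (2018): the perturbation stddev grows while the perturbed
# policy's actions stay closer to the unperturbed ones than a desired
# threshold, and shrinks otherwise. A minimal sketch consistent with the
# .current_stddev / .adapt(...) usage (constructor names are assumptions):
class ParameterSpaceNoiseSketch:
    def __init__(self, initial_stddev=0.1, desired_action_stddev=0.2,
                 adaptation_coefficient=1.01):
        self.current_stddev = initial_stddev
        self.desired_action_stddev = desired_action_stddev
        self.adaptation_coefficient = adaptation_coefficient

    def adapt(self, distance):
        if distance > self.desired_action_stddev:
            self.current_stddev /= self.adaptation_coefficient  # too noisy: back off
        else:
            self.current_stddev *= self.adaptation_coefficient  # too tame: push harder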
class DDPG(object): def __init__(self, nb_states, nb_actions, args): self.nb_states = nb_states self.nb_actions = nb_actions self.discrete = args.discrete net_config = { 'hidden1' : args.hidden1, 'hidden2' : args.hidden2 } # Actor and Critic initialization self.actor = Actor(self.nb_states, self.nb_actions, **net_config) self.actor_target = Actor(self.nb_states, self.nb_actions, **net_config) self.actor_optim = Adam(self.actor.parameters(), lr=args.actor_lr) self.critic = Critic(self.nb_states, self.nb_actions, **net_config) self.critic_target = Critic(self.nb_states, self.nb_actions, **net_config) self.critic_optim = Adam(self.critic.parameters(), lr=args.critic_lr) hard_update(self.critic_target, self.critic) hard_update(self.actor_target, self.actor) # Replay Buffer and noise self.memory = ReplayBuffer(args.memory_size) self.noise = OrnsteinUhlenbeckProcess(mu=np.zeros(nb_actions), sigma=float(0.2) * np.ones(nb_actions)) self.last_state = None self.last_action = None # Hyper parameters self.batch_size = args.batch_size self.tau = args.tau self.discount = args.discount # CUDA self.use_cuda = args.cuda if self.use_cuda: self.cuda() def cuda(self): self.actor.to(device) self.actor_target.to(device) self.critic.to(device) self.critic_target.to(device) def eval(self): self.actor.eval() self.actor_target.eval() self.critic.eval() self.critic_target.eval() def train(self): self.actor.train() self.actor_target.train() self.critic.train() self.critic_target.train() def reset(self, obs): self.last_state = obs self.noise.reset() def observe(self, reward, state, done): self.memory.append([self.last_state, self.last_action, reward, state, done]) self.last_state = state def random_action(self): action = np.random.uniform(-1., 1., self.nb_actions) self.last_action = action return action.argmax() if self.discrete else action def select_action(self, state, apply_noise=False): self.eval() action = to_numpy(self.actor(to_tensor(np.array([state]), device=device))).squeeze(0) self.train() if apply_noise: action = action + self.noise.sample() action = np.clip(action, -1., 1.) 
self.last_action = action #print('action:', action, 'output:', action.argmax()) return action.argmax() if self.discrete else action def update_policy(self): state_batch, action_batch, reward_batch, next_state_batch, terminal_batch = self.memory.sample_batch(self.batch_size) state = to_tensor(np.array(state_batch), device=device) action = to_tensor(np.array(action_batch), device=device) next_state = to_tensor(np.array(next_state_batch), device=device) # compute target Q value next_q_value = self.critic_target([next_state, self.actor_target(next_state)]) target_q_value = to_tensor(reward_batch, device=device) \ + self.discount * to_tensor((1 - terminal_batch.astype(np.float)), device=device) * next_q_value # Critic and Actor update self.critic.zero_grad() with torch.set_grad_enabled(True): q_values = self.critic([state, action]) critic_loss = criterion(q_values, target_q_value.detach()) critic_loss.backward() self.critic_optim.step() self.actor.zero_grad() with torch.set_grad_enabled(True): policy_loss = -self.critic([state.detach(), self.actor(state)]).mean() policy_loss.backward() self.actor_optim.step() # Target update soft_update(self.actor_target, self.actor, self.tau) soft_update(self.critic_target, self.critic, self.tau) return to_numpy(-policy_loss), to_numpy(critic_loss), to_numpy(q_values.mean()) def save_model(self, output, num=1): if self.use_cuda: self.actor.to(torch.device("cpu")) self.critic.to(torch.device("cpu")) torch.save(self.actor.state_dict(), '{}/actor{}.pkl'.format(output, num)) torch.save(self.critic.state_dict(), '{}/critic{}.pkl'.format(output, num)) if self.use_cuda: self.actor.to(device) self.critic.to(device) def load_model(self, output, num=1): self.actor.load_state_dict(torch.load('{}/actor{}.pkl'.format(output, num))) self.actor_target.load_state_dict(torch.load('{}/actor{}.pkl'.format(output, num))) self.critic.load_state_dict(torch.load('{}/critic{}.pkl'.format(output, num))) self.critic_target.load_state_dict(torch.load('{}/critic{}.pkl'.format(output, num))) if self.use_cuda: self.cuda()
class DDPGAgent:
    def __init__(self, plot=True, seed=1, env: gym.Env = None, batch_size=128,
                 learning_rate_actor=0.001, learning_rate_critic=0.001,
                 weight_decay=0.01, gamma=0.999):
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)

        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]
        self.batch_size = batch_size
        self.learning_rate_actor = learning_rate_actor
        self.learning_rate_critic = learning_rate_critic
        self.weight_decay = weight_decay
        self.gamma = gamma
        self.tau = 0.001
        self._to_tensor = util.to_tensor
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        self.actor = Actor(self.state_dim, self.action_dim).to(self.device)
        self.target_actor = Actor(self.state_dim, self.action_dim).to(self.device)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                self.learning_rate_actor,
                                                weight_decay=self.weight_decay)

        self.critic = Critic(self.state_dim, self.action_dim).to(self.device)
        self.target_critic = Critic(self.state_dim, self.action_dim).to(self.device)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 self.learning_rate_critic,
                                                 weight_decay=self.weight_decay)

        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)
        self.t = 0

    def _learn_from_memory(self, memory):
        '''Learn from replay memory and update both networks' parameters.'''
        # Sample a random batch of transitions from memory.
        trans_pieces = memory.sample(self.batch_size)
        s0 = np.vstack([x.state for x in trans_pieces])
        a0 = np.vstack([x.action for x in trans_pieces])
        r1 = np.vstack([x.reward for x in trans_pieces])
        s1 = np.vstack([x.next_state for x in trans_pieces])
        terminal_batch = np.vstack([x.is_done for x in trans_pieces])

        # Critic update
        s1 = self._to_tensor(s1, device=self.device)
        s0 = self._to_tensor(s0, device=self.device)
        next_q_values = self.target_critic.forward(
            state=s1, action=self.target_actor.forward(s1)).detach()
        # (1 - is_done) masks out the bootstrap term for terminal transitions.
        target_q_batch = self._to_tensor(r1, device=self.device) + \
            self.gamma * self._to_tensor(1 - terminal_batch.astype(np.float32),
                                         device=self.device) * next_q_values
        q_batch = self.critic.forward(s0, self._to_tensor(a0, device=self.device))

        # Critic loss; update the critic's parameters.
        loss_critic = F.mse_loss(q_batch, target_q_batch)
        self.critic_optimizer.zero_grad()
        loss_critic.backward()
        self.critic_optimizer.step()

        # Actor update: gradient ascent on the critic's value estimate of the
        # actor's action in the sampled states.
        loss_actor = -self.critic.forward(s0, self.actor.forward(s0)).mean()
        self.actor_optimizer.zero_grad()
        loss_actor.backward()
        self.actor_optimizer.step()

        # Soft-update the target networks.
        soft_update(self.target_actor, self.actor, self.tau)
        soft_update(self.target_critic, self.critic, self.tau)
        return loss_critic.item(), loss_actor.item()

    def learning(self, memory):
        self.actor.train()
        return self._learn_from_memory(memory)

    def save_models(self, episode_count):
        torch.save(self.target_actor.state_dict(),
                   './Models/' + str(episode_count) + '_actor.pt')
        torch.save(self.target_critic.state_dict(),
                   './Models/' + str(episode_count) + '_critic.pt')

    def load_models(self, episode):
        self.actor.load_state_dict(
            torch.load('./Models/' + str(episode) + '_actor.pt'))
        self.critic.load_state_dict(
            torch.load('./Models/' + str(episode) + '_critic.pt'))
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)
        print('Models loaded successfully')
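# A minimal sketch of how DDPGAgent might be driven, since the class itself
# carries no environment loop or action-selection method. Transition and
# SimpleMemory are hypothetical stand-ins matching the fields and the
# .sample() call that _learn_from_memory expects; the env name, the Gaussian
# exploration noise, and the 4-tuple step API (older gym) are all assumptions.
import random
from collections import deque, namedtuple
import numpy as np
import torch
import gym

Transition = namedtuple('Transition',
                        ['state', 'action', 'reward', 'next_state', 'is_done'])

class SimpleMemory:
    def __init__(self, capacity=100000):
        self.buffer = deque(maxlen=capacity)
    def push(self, *args):
        self.buffer.append(Transition(*args))
    def sample(self, batch_size):
        return random.sample(self.buffer, batch_size)

env = gym.make('Pendulum-v1')
agent = DDPGAgent(env=env)
memory = SimpleMemory()

state = env.reset()
for step in range(10000):
    # Deterministic action from the online actor plus exploration noise.
    with torch.no_grad():
        out = agent.actor(util.to_tensor(np.array([state]), device=agent.device))
    action = np.clip(out.cpu().numpy().squeeze(0)
                     + np.random.normal(0., 0.1, agent.action_dim), -1., 1.)
    next_state, reward, done, _ = env.step(action)
    memory.push(state, action, reward, next_state, done)
    state = env.reset() if done else next_state
    if len(memory.buffer) >= agent.batch_size:
        loss_critic, loss_actor = agent.learning(memory)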
class DDPG(object):
    def __init__(self, nb_status, nb_actions, args, writer):
        self.clip_actor_grad = args.clip_actor_grad
        self.nb_status = nb_status * args.window_length
        self.nb_actions = nb_actions
        self.writer = writer
        self.select_time = 0

        # Create the Actor and Critic networks
        net_cfg = {
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
            'init_method': args.init_method
        }
        self.actor = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_target = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        hard_update(self.actor_target, self.actor)  # start targets with the same weights
        hard_update(self.critic_target, self.critic)

        # Create the replay buffer and exploration noise
        self.memory = rpm(args.rmsize)
        self.random_process = Myrandom(size=nb_actions)

        # Hyper-parameters
        self.batch_size = args.batch_size
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        self.epsilon = 1.0
        self.s_t = None  # most recent state
        self.a_t = None  # most recent action
        self.use_cuda = args.cuda
        if self.use_cuda:
            self.cuda()

    def update_policy(self, train_actor=True):
        # Sample batch
        state_batch, action_batch, reward_batch, \
            next_state_batch, terminal_batch = self.memory.sample_batch(self.batch_size)

        # Prepare the target Q batch without tracking gradients; (1 - terminal)
        # zeroes out the bootstrap term for terminal transitions.
        with torch.no_grad():
            next_q_values = self.critic_target([
                to_tensor(next_state_batch),
                self.actor_target(to_tensor(next_state_batch)),
            ])
        target_q_batch = to_tensor(reward_batch) + \
            self.discount * to_tensor(1 - terminal_batch.astype(np.float32)) * next_q_values

        # Critic update
        self.critic.zero_grad()
        q_batch = self.critic([to_tensor(state_batch), to_tensor(action_batch)])
        value_loss = nn.MSELoss()(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()

        # Actor update
        self.actor.zero_grad()
        policy_loss = -self.critic([
            to_tensor(state_batch),
            self.actor(to_tensor(state_batch))
        ]).mean()
        policy_loss.backward()

        if self.clip_actor_grad is not None:
            torch.nn.utils.clip_grad_norm_(self.actor.parameters(),
                                           float(self.clip_actor_grad))

        if self.writer is not None:
            mean_policy_grad = np.mean([
                np.linalg.norm(p.grad.data.cpu().numpy().ravel())
                for p in self.actor.parameters()
            ])
            self.writer.add_scalar('train/mean_policy_grad',
                                   mean_policy_grad, self.select_time)

        if train_actor:
            self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)
        return -policy_loss, value_loss

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def train(self):
        self.actor.train()
        self.actor_target.train()
        self.critic.train()
        self.critic_target.train()

    def cuda(self):
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        self.memory.append([self.s_t, self.a_t, r_t, s_t1, done])
        self.s_t = s_t1

    def random_action(self):
        action = np.random.uniform(-1., 1., self.nb_actions)
        self.a_t = action
        return action

    def select_action(self, s_t, decay_epsilon=True, return_fix=False, noise_level=0):
        self.eval()
        action = to_numpy(self.actor(to_tensor(np.array([s_t])))).squeeze(0)
        self.train()

        # Blend the deterministic action with exploration noise; the blend
        # weight decays with epsilon.
        noise_level = noise_level * max(self.epsilon, 0)
        action = action * (1 - noise_level) + self.random_process.sample() * noise_level
        action = np.clip(action, -1., 1.)

        if decay_epsilon:
            self.epsilon -= self.depsilon
        self.a_t = action
        return action

    def reset(self, obs):
        self.s_t = obs
        self.random_process.reset_status()

    def load_weights(self, output, num=1):
        if output is None:
            return
        self.actor.load_state_dict(torch.load('{}/actor{}.pkl'.format(output, num)))
        self.actor_target.load_state_dict(torch.load('{}/actor{}.pkl'.format(output, num)))
        self.critic.load_state_dict(torch.load('{}/critic{}.pkl'.format(output, num)))
        self.critic_target.load_state_dict(torch.load('{}/critic{}.pkl'.format(output, num)))

    def save_model(self, output, num):
        if self.use_cuda:
            self.actor.cpu()
            self.critic.cpu()
        torch.save(self.actor.state_dict(), '{}/actor{}.pkl'.format(output, num))
        torch.save(self.critic.state_dict(), '{}/critic{}.pkl'.format(output, num))
        if self.use_cuda:
            self.actor.cuda()
            self.critic.cuda()
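# to_tensor / to_numpy are used throughout but never defined in this section.
# A minimal sketch for PyTorch >= 0.4; the update routines above wrap target
# computations in torch.no_grad(), so no volatile-style flag is needed here:
import numpy as np
import torch

USE_CUDA = torch.cuda.is_available()
device = torch.device('cuda' if USE_CUDA else 'cpu')

def to_tensor(ndarray, device=device, requires_grad=False):
    # Convert a numpy array to a float32 tensor on the chosen device.
    t = torch.as_tensor(np.asarray(ndarray), dtype=torch.float32, device=device)
    return t.requires_grad_(requires_grad)

def to_numpy(tensor):
    # Detach from the graph and move back to host memory.
    return tensor.detach().cpu().numpy()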
class DDPG(object):
    def __init__(self, nb_status, nb_actions, args, writer):
        self.clip_actor_grad = args.clip_actor_grad
        self.nb_status = nb_status * args.window_length
        self.nb_actions = nb_actions
        self.discrete = args.discrete
        self.pic = args.pic
        self.writer = writer
        self.select_time = 0
        if self.pic:
            self.nb_status = args.pic_status

        # Create the Actor and Critic networks
        net_cfg = {
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
            'use_bn': args.bn,
            'init_method': args.init_method
        }
        if args.pic:
            self.cnn = CNN(1, args.pic_status)
            self.cnn_target = CNN(1, args.pic_status)
            self.cnn_optim = Adam(self.cnn.parameters(), lr=args.crate)
        self.actor = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_target = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        hard_update(self.actor_target, self.actor)  # start targets with the same weights
        hard_update(self.critic_target, self.critic)
        if args.pic:
            hard_update(self.cnn_target, self.cnn)

        # Create the replay buffer and exploration noise
        self.memory = rpm(args.rmsize)
        self.random_process = Myrandom(size=nb_actions)

        # Hyper-parameters
        self.batch_size = args.batch_size
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        self.epsilon = 1.0
        self.s_t = None  # most recent state
        self.a_t = None  # most recent action
        self.use_cuda = args.cuda
        if self.use_cuda:
            self.cuda()

    def normalize(self, pic):
        # HWC -> CHW for the CNN.
        return pic.swapaxes(0, 2).swapaxes(1, 2)

    def update_policy(self):
        # Sample batch
        state_batch, action_batch, reward_batch, \
            next_state_batch, terminal_batch = self.memory.sample_batch(self.batch_size)

        # Prepare the target Q batch without tracking gradients.
        if self.pic:
            state_pics = to_tensor(np.array([self.normalize(x) for x in state_batch]))
            with torch.no_grad():
                next_state_pics = to_tensor(
                    np.array([self.normalize(x) for x in next_state_batch]))
                next_features = self.cnn_target(next_state_pics)
                next_q_values = self.critic_target(
                    [next_features, self.actor_target(next_features)])
        else:
            with torch.no_grad():
                next_q_values = self.critic_target([
                    to_tensor(next_state_batch),
                    self.actor_target(to_tensor(next_state_batch)),
                ])
        target_q_batch = to_tensor(reward_batch) + \
            self.discount * to_tensor(1 - terminal_batch.astype(np.float32)) * next_q_values

        # Critic update (the CNN is trained jointly with the critic)
        self.critic.zero_grad()
        if self.pic:
            self.cnn.zero_grad()
            q_batch = self.critic([self.cnn(state_pics), to_tensor(action_batch)])
        else:
            q_batch = self.critic([to_tensor(state_batch), to_tensor(action_batch)])
        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()
        if self.pic:
            self.cnn_optim.step()

        # Actor update
        self.actor.zero_grad()
        if self.pic:
            self.cnn.zero_grad()
            # Fresh CNN forward pass so the actor loss has its own graph.
            features = self.cnn(state_pics)
            policy_loss = -self.critic([features, self.actor(features)])
        else:
            policy_loss = -self.critic([
                to_tensor(state_batch),
                self.actor(to_tensor(state_batch))
            ])
        policy_loss = policy_loss.mean()
        policy_loss.backward()

        if self.clip_actor_grad is not None:
            torch.nn.utils.clip_grad_norm_(self.actor.parameters(),
                                           float(self.clip_actor_grad))

        if self.writer is not None:
            mean_policy_grad = np.mean([
                np.linalg.norm(p.grad.data.cpu().numpy().ravel())
                for p in self.actor.parameters()
            ])
            self.writer.add_scalar('train/mean_policy_grad',
                                   mean_policy_grad, self.select_time)

        self.actor_optim.step()
        if self.pic:
            self.cnn_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)
        if self.pic:
            soft_update(self.cnn_target, self.cnn, self.tau)
        return -policy_loss, value_loss

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()
        if self.pic:
            self.cnn.eval()
            self.cnn_target.eval()

    def train(self):
        self.actor.train()
        self.actor_target.train()
        self.critic.train()
        self.critic_target.train()
        if self.pic:
            self.cnn.train()
            self.cnn_target.train()

    def cuda(self):
        if self.pic:
            self.cnn.cuda()
            self.cnn_target.cuda()
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        self.memory.append([self.s_t, self.a_t, r_t, s_t1, done])
        self.s_t = s_t1

    def random_action(self, fix=False):
        action = np.random.uniform(-1., 1., self.nb_actions)
        self.a_t = action
        if self.discrete and not fix:
            action = action.argmax()
        if self.pic:
            action = np.concatenate((softmax(action[:16]), softmax(action[16:])))
        return action

    def select_action(self, s_t, decay_epsilon=True, return_fix=False, noise_level=0):
        self.eval()
        if self.pic:
            s_t = self.normalize(s_t)
            features = self.cnn(to_tensor(np.array([s_t])))
            action = to_numpy(self.actor_target(features)).squeeze(0)
        else:
            action = to_numpy(self.actor(to_tensor(np.array([s_t])))).squeeze(0)
        self.train()

        # Epsilon-greedy blending with a random action.
        noise_level = noise_level * max(self.epsilon, 0)
        if np.random.uniform(0, 1) < noise_level:
            action = (action + self.random_action(fix=True)) / 2.

        if decay_epsilon:
            self.epsilon -= self.depsilon
        self.a_t = action
        if return_fix:
            return action
        return action.argmax() if self.discrete else action

    def reset(self, obs):
        self.s_t = obs
        self.random_process.reset_status()

    def load_weights(self, output, num=1):
        if output is None:
            return
        self.actor.load_state_dict(torch.load('{}/actor{}.pkl'.format(output, num)))
        self.actor_target.load_state_dict(torch.load('{}/actor{}.pkl'.format(output, num)))
        self.critic.load_state_dict(torch.load('{}/critic{}.pkl'.format(output, num)))
        self.critic_target.load_state_dict(torch.load('{}/critic{}.pkl'.format(output, num)))

    def save_model(self, output, num):
        if self.use_cuda:
            if self.pic:
                self.cnn.cpu()
            self.actor.cpu()
            self.critic.cpu()
        torch.save(self.actor.state_dict(), '{}/actor{}.pkl'.format(output, num))
        torch.save(self.critic.state_dict(), '{}/critic{}.pkl'.format(output, num))
        if self.use_cuda:
            if self.pic:
                self.cnn.cuda()
            self.actor.cuda()
            self.critic.cuda()
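# Myrandom / OrnsteinUhlenbeckProcess supply the exploration noise above but
# are not defined in this section. A minimal Ornstein-Uhlenbeck sketch that
# exposes the same sample() / reset_status() interface; the theta, sigma, and
# dt defaults are assumptions, not the original settings:
import numpy as np

class OUNoise:
    def __init__(self, size, theta=0.15, mu=0.0, sigma=0.2, dt=1e-2):
        self.size, self.theta, self.mu = size, theta, mu
        self.sigma, self.dt = sigma, dt
        self.reset_status()

    def reset_status(self):
        self.x = np.ones(self.size) * self.mu

    def sample(self):
        # Mean-reverting random walk: yields temporally correlated noise,
        # which suits control tasks with momentum.
        dx = self.theta * (self.mu - self.x) * self.dt \
            + self.sigma * np.sqrt(self.dt) * np.random.randn(self.size)
        self.x = self.x + dx
        return self.x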
class DDPG(object):
    def __init__(self):
        # Random seed for torch
        __seed = config.get(MODEL_SEED)
        self.policy_loss = []
        self.critic_loss = []
        if __seed > 0:
            self.seed(__seed)

        self.nb_states = config.get(MODEL_STATE_COUNT)
        self.nb_actions = config.get(MODEL_ACTION_COUNT)

        # Create the Actor and Critic networks
        actor_net_cfg = {
            'hidden1': config.get(MODEL_ACTOR_HIDDEN1),
            'hidden2': config.get(MODEL_ACTOR_HIDDEN2),
            'init_w': config.get(MODEL_INIT_WEIGHT)
        }
        critic_net_cfg = {
            'hidden1': config.get(MODEL_CRITIC_HIDDEN1),
            'hidden2': config.get(MODEL_CRITIC_HIDDEN2),
            'init_w': config.get(MODEL_INIT_WEIGHT)
        }

        self.actor = Actor(self.nb_states, self.nb_actions, **actor_net_cfg)
        self.actor_target = Actor(self.nb_states, self.nb_actions, **actor_net_cfg)
        self.actor_optim = Adam(self.actor.parameters(),
                                lr=config.get(MODEL_ACTOR_LR),
                                weight_decay=config.get(MODEL_ACTOR_WEIGHT_DECAY))

        self.critic = Critic(self.nb_states, self.nb_actions, **critic_net_cfg)
        self.critic_target = Critic(self.nb_states, self.nb_actions, **critic_net_cfg)
        self.critic_optim = Adam(self.critic.parameters(),
                                 lr=config.get(MODEL_CRITIC_LR),
                                 weight_decay=config.get(MODEL_CRITIC_WEIGHT_DECAY))

        hard_update(self.actor_target, self.actor)
        hard_update(self.critic_target, self.critic)

        # Create the replay buffer and exploration noise
        self.memory = Memory()
        self.random_process = OrnsteinUhlenbeckProcess(
            size=self.nb_actions,
            theta=config.get(RANDOM_THETA),
            mu=config.get(RANDOM_MU),
            sigma=config.get(RANDOM_SIGMA))

        # Hyper-parameters
        self.batch_size = config.get(MODEL_BATCH_SIZE)
        self.tau = config.get(MODEL_TARGET_TAU)
        self.discount = config.get(MODEL_DISCOUNT)
        self.depsilon = 1.0 / config.get(MODEL_EPSILON)
        self.model_path = config.get(MODEL_SAVE_PATH)

        self.epsilon = 1.0

        # Init device
        self.device_init()

    def update_policy(self, memory):
        # Sample batch
        state_batch, action_batch, reward_batch, \
            next_state_batch, terminal_batch = memory.sample_and_split(self.batch_size)

        # Prepare the target Q batch. This memory's terminal_batch is assumed
        # to hold the continuation mask (0 when the episode ended), so it
        # multiplies the bootstrap term directly.
        with torch.no_grad():
            next_q_values = self.critic_target([
                to_tensor(next_state_batch),
                self.actor_target(to_tensor(next_state_batch))
            ])
        target_q_batch = to_tensor(reward_batch) + \
            self.discount * to_tensor(terminal_batch.astype(np.float32)) * next_q_values

        # Critic update
        self.critic.zero_grad()
        q_batch = self.critic([to_tensor(state_batch), to_tensor(action_batch)])
        value_loss = F.mse_loss(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()
        self.critic_loss.append(value_loss.item())

        # Actor update
        self.actor.zero_grad()
        policy_loss = -self.critic([
            to_tensor(state_batch),
            self.actor(to_tensor(state_batch))
        ]).mean()
        policy_loss.backward()
        self.actor_optim.step()
        self.policy_loss.append(policy_loss.item())

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

    def get_loss(self):
        return self.policy_loss, self.critic_loss

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def device_init(self):
        self.actor.to(device)
        self.actor_target.to(device)
        self.critic.to(device)
        self.critic_target.to(device)

    def random_action(self):
        return np.random.uniform(-1., 1., self.nb_actions)

    def select_action(self, s_t):
        action = to_numpy(self.actor(to_tensor(np.array([s_t])))).squeeze(0)
        action += max(self.epsilon, 0) * self.random_process.sample()
        action = np.clip(action, -1., 1.)
        return action

    def clean(self, decay_epsilon):
        if decay_epsilon:
            self.epsilon -= self.depsilon

    def reset(self):
        self.random_process.reset_states()

    def load_weights(self):
        if not os.path.exists(self.model_path):
            return
        actor_path = os.path.join(self.model_path, 'actor.pkl')
        if os.path.exists(actor_path):
            self.actor.load_state_dict(torch.load(actor_path))
        critic_path = os.path.join(self.model_path, 'critic.pkl')
        if os.path.exists(critic_path):
            self.critic.load_state_dict(torch.load(critic_path))

    def save_model(self):
        if not os.path.exists(self.model_path):
            os.makedirs(self.model_path)
        actor_path = os.path.join(self.model_path, 'actor.pkl')
        torch.save(self.actor.state_dict(), actor_path)
        critic_path = os.path.join(self.model_path, 'critic.pkl')
        torch.save(self.critic.state_dict(), critic_path)

    def get_model(self):
        return self.actor.state_dict(), self.critic.state_dict()

    def load_state_dict(self, actor_state, critic_state):
        self.actor.load_state_dict(actor_state)
        self.critic.load_state_dict(critic_state)

    def seed(self, s):
        torch.manual_seed(s)
        if USE_CUDA:
            torch.cuda.manual_seed(s)
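# Memory above is assumed to store the continuation mask (0. when done) so
# that update_policy can multiply terminal_batch straight into the bootstrap
# term. A minimal ring-buffer sketch with the sample_and_split() interface
# used above; the field layout is an assumption:
import random
from collections import deque
import numpy as np

class Memory:
    def __init__(self, capacity=1000000):
        self.buffer = deque(maxlen=capacity)

    def append(self, state, action, reward, next_state, done):
        # Store 0. for terminal transitions, 1. otherwise.
        self.buffer.append((state, action, reward, next_state,
                            0. if done else 1.))

    def sample_and_split(self, batch_size):
        # Return separate column arrays: states, actions, rewards,
        # next states, continuation masks.
        batch = random.sample(self.buffer, batch_size)
        s, a, r, s1, t = map(np.array, zip(*batch))
        return s, a, r.reshape(-1, 1), s1, t.reshape(-1, 1)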