class DDPG(object):
    def __init__(self, nb_states, nb_actions, args):
        self.cuda = args.cuda
        self.nb_states = nb_states
        self.nb_actions = nb_actions

        # Init actor/critic networks and their target copies
        net_cfg = {
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
            'init_w': args.init_w
        }
        self.actor = Actor(self.nb_states, self.nb_actions, **net_cfg)
        self.actor_target = Actor(self.nb_states, self.nb_actions, **net_cfg)
        self.critic = Critic(self.nb_states, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_states, self.nb_actions, **net_cfg)
        self.criterion = nn.MSELoss()

        if self.cuda:
            # TODO: torch.nn.DataParallel not working yet
            self.actor = self.actor.cuda()
            self.critic = self.critic.cuda()
            self.actor_target = self.actor_target.cuda()
            self.critic_target = self.critic_target.cuda()
            self.criterion = self.criterion.cuda()

        # Set optimizers
        self.actor_optim = torch.optim.Adam(self.actor.parameters(), lr=args.prate)
        self.critic_optim = torch.optim.Adam(self.critic.parameters(), lr=args.rate)

        # Loss function
        self.loss_fn = torch.nn.MSELoss(reduction='sum')

        # Make sure the targets start with the same weights
        hard_update(self.actor_target, self.actor)
        hard_update(self.critic_target, self.critic)

        self.memory = SequentialMemory(limit=args.rmsize,
                                       window_length=args.window_length)
        self.random_process = OrnsteinUhlenbeckProcess(size=nb_actions,
                                                       theta=args.ou_theta,
                                                       mu=args.ou_mu,
                                                       sigma=args.ou_sigma)

        # Hyper-parameters
        self.batch_size = args.bsize
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon
        self.epsilon = 1.0

        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.is_training = True

    def update_policy(self):
        # Sample batch
        state_batch, action_batch, reward_batch, next_state_batch, terminal_batch = \
            self.memory.sample_and_split(self.batch_size)

        # Prepare the target Q batch; no gradients flow through the target networks
        with torch.no_grad():
            next_q_values = self.critic_target([
                to_tensor(next_state_batch),
                self.actor_target(to_tensor(next_state_batch)),
            ])
            target_q_batch = to_tensor(reward_batch) + \
                self.discount * to_tensor(terminal_batch.astype(np.float32)) * next_q_values

        # Critic update
        self.critic.zero_grad()
        q_batch = self.critic([to_tensor(state_batch), to_tensor(action_batch)])
        value_loss = self.criterion(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()

        # Actor update
        self.actor.zero_grad()
        policy_loss = -self.critic([to_tensor(state_batch),
                                    self.actor(to_tensor(state_batch))])
        policy_loss = policy_loss.mean()
        policy_loss.backward()
        self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def random_action(self):
        action = np.random.uniform(-1., 1., self.nb_actions)
        self.a_t = action
        return action

    def observe(self, r_t, s_t1, done):
        if self.is_training:
            self.memory.append(self.s_t, self.a_t, r_t, done)
            self.s_t = s_t1

    def select_action(self, s_t, decay_epsilon=True):
        action = to_numpy(self.actor(to_tensor(np.array([s_t])))).squeeze(0)
        action += self.is_training * max(self.epsilon, 0) * self.random_process.sample()
        action = np.clip(action, -1., 1.)

        if decay_epsilon:
            self.epsilon -= self.depsilon

        self.a_t = action
        return action

    def reset(self, obs):
        self.s_t = obs
        self.random_process.reset_states()

    def load_wts(self, modelfile):
        if os.path.isfile(modelfile + 'model.pth.tar'):
            checkpoint = torch.load(modelfile + 'model.pth.tar')
            self.actor.load_state_dict(checkpoint['actor_state_dict'])
            self.critic.load_state_dict(checkpoint['critic_state_dict'])
            self.actor_optim.load_state_dict(checkpoint['actor_optim'])
            self.critic_optim.load_state_dict(checkpoint['critic_optim'])
            return checkpoint['step']
        else:
            return 0

    def save_wts(self, savefile, step):
        saveme = {  # TODO: save other training state too, e.g. epoch
            'actor_state_dict': self.actor.state_dict(),
            'critic_state_dict': self.critic.state_dict(),
            'actor_optim': self.actor_optim.state_dict(),
            'critic_optim': self.critic_optim.state_dict(),
            'step': step
        }
        torch.save(saveme, savefile + 'model.pth.tar')
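# The DDPG class above calls hard_update and soft_update, which are not defined
# in this section. A minimal sketch, assuming the call signatures used above:
# hard_update(target, source) copies weights outright, and
# soft_update(target, source, tau) performs the Polyak update
# theta_target <- tau * theta_source + (1 - tau) * theta_target.
import torch


def hard_update(target, source):
    """Copy every parameter of `source` into `target`."""
    with torch.no_grad():
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.copy_(param)


def soft_update(target, source, tau):
    """Polyak-average `source` into `target` with interpolation factor `tau`."""
    with torch.no_grad():
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.copy_(tau * param + (1.0 - tau) * target_param)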
class DDPGAgent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, memory, device='cpu', params=None):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            memory (obj): Memory buffer to sample
            device (str): device string between cuda:0 and cpu
            params (dict): hyper-parameters
        """
        self.state_size = state_size
        self.action_size = action_size
        self.device = device
        self.step_t = 0
        self.update_every = params['update_every']

        # Set parameters
        self.gamma = params['gamma']
        self.tau = params['tau']
        self.seed = random.seed(params['seed'])

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, params['seed'],
                                 params['actor_units'][0],
                                 params['actor_units'][1]).to(device)
        self.actor_target = Actor(state_size, action_size, params['seed'],
                                  params['actor_units'][0],
                                  params['actor_units'][1]).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=params['lr_actor'])

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, params['seed'],
                                   params['critic_units'][0],
                                   params['critic_units'][1]).to(device)
        self.critic_target = Critic(state_size, action_size, params['seed'],
                                    params['critic_units'][0],
                                    params['critic_units'][1]).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=params['lr_critic'],
                                           weight_decay=params['weight_decay'])

        # Noise process
        self.noise = OUNoise(action_size, params['seed'],
                             theta=params['noise_theta'],
                             sigma=params['noise_sigma'])

        # Replay memory
        self.memory = memory

    def store_weights(self, filenames):
        """Store weights of Actor/Critic

        Params
        ======
            filenames (list): string of filename to store weights of actor and critic
                filenames[0] = actor weights
                filenames[1] = critic weights
        """
        torch.save(self.actor_local.state_dict(), filenames[0])
        torch.save(self.critic_local.state_dict(), filenames[1])

    def load_weights(self, filenames):
        """Load weights of Actor/Critic

        Params
        ======
            filenames (list): string of filename to load weights of actor and critic
                filenames[0] = actor weights
                filenames[1] = critic weights
        """
        self.actor_local.load_state_dict(torch.load(filenames[0]))
        self.critic_local.load_state_dict(torch.load(filenames[1]))

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)
        self.step_t = (self.step_t + 1) % self.update_every

        # Learn, if enough samples are available in memory
        if self.step_t == 0 and len(self.memory) > self.memory.get_batch_size():
            experiences = self.memory.sample()
            self.learn(experiences, self.gamma)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(self.device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))

        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

    @staticmethod
    def soft_update(local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
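# DDPGAgent above draws exploration noise from OUNoise(action_size, seed,
# theta=..., sigma=...), which is not defined in this section. A minimal
# Ornstein-Uhlenbeck process sketch under that assumed constructor signature
# and the sample()/reset() calls used above; mu=0 and the simple Euler
# discretisation are assumptions.
import copy
import random

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process for temporally correlated exploration noise."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.seed = random.seed(seed)
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Advance the process one step and return the new state as the noise sample."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(len(x))
        self.state = x + dx
        return self.state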
class DDPGPolicy(object):
    def __init__(self, env_name, policy_config, device='cpu'):
        self.device = device
        self.env = gym.make(env_name)  # only used to read the observation/action spaces
        self.obs_dim = self.env.observation_space.shape[0]
        if isinstance(self.env.action_space, gym.spaces.Box):
            self.action_dim = self.env.action_space.shape[0]
        elif isinstance(self.env.action_space, gym.spaces.Discrete):
            raise TypeError('Unsupported action type')
        else:
            raise ValueError('Unsupported action space: {}'.format(type(self.env.action_space)))
        self.action_limit = self.env.action_space.high[0]

        self.lr = policy_config['lr']
        self.actor = Actor(self.obs_dim, self.action_dim).to(device)
        self.critic = Critic(self.obs_dim, self.action_dim).to(device)
        self.actor_target = deepcopy(self.actor)
        self.critic_target = deepcopy(self.critic)
        hard_update(self.actor_target, self.actor)  # Make sure targets start with the same weights
        hard_update(self.critic_target, self.critic)
        self.actor_optim = torch.optim.Adam(params=self.actor.parameters(), lr=self.lr)
        self.critic_optim = torch.optim.Adam(params=self.critic.parameters(), lr=self.lr)
        self.discount_factor = policy_config['discount_factor']
        self.tau = 0.005

    def train_on_batch(self, rollouts_batch):
        # critic loss: MSE between q(s, a) and r + gamma * q_target(s', pi_target(s'));
        # actor loss: -q(s, pi(s)); both are minimised below
        obs, acs, next_obs, dones, r, un_r, summed_r = convert_listofrollouts(paths=rollouts_batch)
        acs = torch.tensor(acs).float().to(self.device)
        obs = torch.FloatTensor(obs).to(self.device)
        next_obs = torch.FloatTensor(next_obs).to(self.device)
        # acs_one_hot = torch.eye(2).to(self.device).index_select(0, acs)  # one-hot for a discrete action space
        dones = torch.FloatTensor(dones).to(self.device)
        r = torch.FloatTensor(r).to(self.device)

        # update critic
        self.critic_optim.zero_grad()
        with torch.no_grad():  # no gradients flow through the target networks
            act_target = self.actor_target(next_obs)
            q_target = r + self.discount_factor * self.critic_target(next_obs, act_target) * (1 - dones)
        q_pred = self.critic(obs, acs)
        critic_loss = torch.nn.functional.mse_loss(q_pred, q_target)
        critic_loss.backward()
        self.critic_optim.step()

        # update actor
        self.actor_optim.zero_grad()
        actor_loss = -torch.mean(self.critic(obs, self.actor(obs)))
        actor_loss.backward()
        self.actor_optim.step()

        info = {
            'loss': actor_loss.cpu().detach().numpy(),  # scalar
            'model_out': q_target,  # torch.Tensor of target Q-values for the batch
        }
        return info

    def update_target_network(self):
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

    def get_weights(self):
        # TODO: return the critic parameters as well as the actor parameters
        return {k: v for k, v in self.actor.state_dict().items()}

    def set_weights(self, weights):
        self.actor.load_state_dict(weights)

    def compute_actions(self, obs, noise_scale):
        # noise_scale controls whether exploration noise is added; during eval it is 0
        obs = obs.to(self.device)
        actions = self.actor(obs).cpu().detach().numpy()
        actions += noise_scale * np.random.rand(self.action_dim)
        actions = np.clip(actions, -self.action_limit, self.action_limit)
        return actions

    def reset(self):
        # The noise process (if any) should be reset whenever env.reset() is called.
        if hasattr(self, 'random_process'):
            self.random_process.reset_states()
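# DDPGPolicy above assumes Actor(obs_dim, action_dim) returning an action in
# [-1, 1] and Critic(obs_dim, action_dim) called as critic(obs, acs); neither
# is defined in this section. A minimal MLP sketch under those assumptions;
# the hidden width of 256 is an assumption, and the critic squeezes its output
# to shape [batch] so it broadcasts with the reward and done vectors above.
import torch
import torch.nn as nn


class Actor(nn.Module):
    def __init__(self, obs_dim, action_dim, hidden=256):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(obs_dim, hidden), nn.ReLU(),
            nn.Linear(hidden, hidden), nn.ReLU(),
            nn.Linear(hidden, action_dim), nn.Tanh(),  # squash actions to [-1, 1]
        )

    def forward(self, obs):
        return self.net(obs)


class Critic(nn.Module):
    def __init__(self, obs_dim, action_dim, hidden=256):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(obs_dim + action_dim, hidden), nn.ReLU(),
            nn.Linear(hidden, hidden), nn.ReLU(),
            nn.Linear(hidden, 1),  # scalar Q-value per (state, action) pair
        )

    def forward(self, obs, acs):
        return self.net(torch.cat([obs, acs], dim=-1)).squeeze(-1)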
class Agent:
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, add_noise=False):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))

        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def train(self, env, n_episodes=2000, max_t=1000):
        scores_deque = deque(maxlen=100)
        scores = []
        for i_episode in range(1, n_episodes + 1):
            state = env.reset()
            self.reset()
            score = 0
            for t in range(max_t):
                action = self.act(state, add_noise=True)
                # The environment expects a discrete action, so take the index
                # of the largest entry of the continuous action vector.
                choice = np.argmax(action)
                next_state, reward, done, _ = env.step(choice)
                self.step(state, action, reward, next_state, done)
                state = next_state
                score += reward
                if done:
                    break
            scores_deque.append(score)
            scores.append(score)
            print('\rEpisode {}\tAverage Score: {:.2f}\tScore: {:.2f}'.format(
                i_episode, np.mean(scores_deque), score), end="")
            if i_episode % 1 == 0:  # checkpoint after every episode
                torch.save(self.actor_local.state_dict(), 'checkpoint_actor.pth')
                torch.save(self.critic_local.state_dict(), 'checkpoint_critic.pth')
                print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                    i_episode, np.mean(scores_deque)))
        return scores
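# The Agent class above constructs ReplayBuffer(action_size, BUFFER_SIZE,
# BATCH_SIZE, seed) and calls add(), sample() and len() on it, but the buffer
# is not defined in this section. A minimal uniform-sampling sketch under those
# assumptions; sample() returns float tensors in the (states, actions, rewards,
# next_states, dones) order that learn() unpacks.
import random
from collections import deque, namedtuple

import numpy as np
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Experience = namedtuple("Experience",
                        ["state", "action", "reward", "next_state", "done"])


class ReplayBuffer:
    """Fixed-size buffer of experience tuples, sampled uniformly at random."""

    def __init__(self, action_size, buffer_size, batch_size, seed):
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.seed = random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        self.memory.append(Experience(state, action, reward, next_state, done))

    def sample(self):
        """Draw a random batch and stack it into tensors on `device`."""
        experiences = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(np.vstack([e.state for e in experiences])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences])).float().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences])).float().to(device)
        dones = torch.from_numpy(np.vstack([e.done for e in experiences]).astype(np.uint8)).float().to(device)
        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.memory)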