import numpy as np
import torch
import torch as T
import torch.nn.functional as F

# ReplayBuffer, the actor/critic/value network classes, the LinearVAE and the
# Constants/Hyper settings used below are assumed to be defined in the project's
# other modules (not shown here).


class Agent():
    def __init__(self, alpha=0.0003, beta=0.0003, input_dims=[8], env=None,
                 gamma=0.99, n_actions=2, max_size=1000000, tau=0.005,
                 layer1_size=256, layer2_size=256, batch_size=256,
                 reward_scale=2):
        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.batch_size = batch_size
        self.n_actions = n_actions

        self.actor = ActorNetwork(alpha, input_dims, n_actions=n_actions,
                                  name='actor', max_action=env.action_space.high)
        self.critic_1 = CriticNetwork(beta, input_dims, n_actions=n_actions,
                                      name='critic_1')
        self.critic_2 = CriticNetwork(beta, input_dims, n_actions=n_actions,
                                      name='critic_2')
        self.value = ValueNetwork(beta, input_dims, name='value')
        self.target_value = ValueNetwork(beta, input_dims, name='target_value')

        self.scale = reward_scale
        self.update_network_parameters(tau=1)

    def choose_action(self, observation):
        state = T.Tensor([observation]).to(self.actor.device)
        actions, _ = self.actor.sample_normal(state, reparameterize=False)

        return actions.cpu().detach().numpy()[0]

    def remember(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)

    def update_network_parameters(self, tau=None):
        if tau is None:
            tau = self.tau

        target_value_params = self.target_value.named_parameters()
        value_params = self.value.named_parameters()

        target_value_state_dict = dict(target_value_params)
        value_state_dict = dict(value_params)

        for name in value_state_dict:
            value_state_dict[name] = tau*value_state_dict[name].clone() + \
                (1-tau)*target_value_state_dict[name].clone()

        self.target_value.load_state_dict(value_state_dict)

    def save_models(self):
        print('.... saving models ....')
        self.actor.save_checkpoint()
        self.value.save_checkpoint()
        self.target_value.save_checkpoint()
        self.critic_1.save_checkpoint()
        self.critic_2.save_checkpoint()

    def load_models(self):
        print('.... loading models ....')
        self.actor.load_checkpoint()
        self.value.load_checkpoint()
        self.target_value.load_checkpoint()
        self.critic_1.load_checkpoint()
        self.critic_2.load_checkpoint()

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return

        state, action, reward, new_state, done = \
            self.memory.sample_buffer(self.batch_size)

        reward = T.tensor(reward, dtype=T.float).to(self.actor.device)
        done = T.tensor(done).to(self.actor.device)
        state_ = T.tensor(new_state, dtype=T.float).to(self.actor.device)
        state = T.tensor(state, dtype=T.float).to(self.actor.device)
        action = T.tensor(action, dtype=T.float).to(self.actor.device)

        value = self.value(state).view(-1)
        value_ = self.target_value(state_).view(-1)
        value_[done] = 0.0

        actions, log_probs = self.actor.sample_normal(state, reparameterize=False)
        log_probs = log_probs.view(-1)
        q1_new_policy = self.critic_1.forward(state, actions)
        q2_new_policy = self.critic_2.forward(state, actions)
        critic_value = T.min(q1_new_policy, q2_new_policy)
        critic_value = critic_value.view(-1)

        self.value.optimizer.zero_grad()
        value_target = critic_value - log_probs
        value_loss = 0.5 * F.mse_loss(value, value_target)
        value_loss.backward(retain_graph=True)
        self.value.optimizer.step()

        actions, log_probs = self.actor.sample_normal(state, reparameterize=True)
        log_probs = log_probs.view(-1)
        q1_new_policy = self.critic_1.forward(state, actions)
        q2_new_policy = self.critic_2.forward(state, actions)
        critic_value = T.min(q1_new_policy, q2_new_policy)
        critic_value = critic_value.view(-1)

        actor_loss = log_probs - critic_value
        actor_loss = T.mean(actor_loss)
        self.actor.optimizer.zero_grad()
        actor_loss.backward(retain_graph=True)
        self.actor.optimizer.step()

        self.critic_1.optimizer.zero_grad()
        self.critic_2.optimizer.zero_grad()
        q_hat = self.scale * reward + self.gamma * value_
        q1_old_policy = self.critic_1.forward(state, action).view(-1)
        q2_old_policy = self.critic_2.forward(state, action).view(-1)
        critic_1_loss = 0.5 * F.mse_loss(q1_old_policy, q_hat)
        critic_2_loss = 0.5 * F.mse_loss(q2_old_policy, q_hat)
        critic_loss = critic_1_loss + critic_2_loss
        critic_loss.backward()
        self.critic_1.optimizer.step()
        self.critic_2.optimizer.step()

        self.update_network_parameters()
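
# Usage sketch for the Agent above: a minimal training loop, assuming a Gym-style
# continuous-control environment with the classic 4-tuple step API. The env id
# 'LunarLanderContinuous-v2' and the episode count are illustrative only.
import gym

def train_sac(n_games=250):
    env = gym.make('LunarLanderContinuous-v2')
    agent = Agent(input_dims=env.observation_space.shape, env=env,
                  n_actions=env.action_space.shape[0])
    for i in range(n_games):
        observation = env.reset()
        done = False
        score = 0
        while not done:
            action = agent.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            agent.remember(observation, action, reward, observation_, done)
            agent.learn()
            score += reward
            observation = observation_
        print(f'episode {i} score {score:.1f}')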
class Agent_sm():
    def __init__(self, alpha=0.0003, beta=0.0003, input_dims=8, env=None,
                 gamma=0.99, n_actions=2, max_size=1000000, tau=0.005,
                 layer1_size=256, layer2_size=256, batch_size=256,
                 reward_scale=2):
        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.batch_size = batch_size
        self.n_actions = n_actions

        self.actor = ActorNetwork(alpha, input_dims, env.action_space.high,
                                  n_actions=n_actions)
        self.critic_1 = CriticNetwork(beta, input_dims, n_actions, name='critic_1')
        self.critic_2 = CriticNetwork(beta, input_dims, n_actions, name='critic_2')
        self.value = ValueNetwork(beta, input_dims, name='value')
        self.target_value = ValueNetwork(beta, input_dims, name='target_value')

        self.scale = reward_scale
        self.update_network_parameters(tau=1)

    def choose_action(self, observation):
        state = torch.Tensor([observation]).to(self.actor.device)
        actions, _ = self.actor.sample_normal(state, reparameterize=False)

        return actions.cpu().detach().numpy()[0]

    def remember(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)

    def update_network_parameters(self, tau=None):
        if tau is None:
            tau = self.tau

        target_value_params = self.target_value.named_parameters()
        value_params = self.value.named_parameters()

        target_value_dict = dict(target_value_params)
        value_dict = dict(value_params)

        for name in target_value_dict:
            target_value_dict[name] = tau*value_dict[name].clone() + \
                (1-tau)*target_value_dict[name].clone()

        self.target_value.load_state_dict(target_value_dict)

    def save_models(self):
        print('... saving models ...')
        self.actor.save_checkpoint()
        self.critic_1.save_checkpoint()
        self.critic_2.save_checkpoint()
        self.value.save_checkpoint()
        self.target_value.save_checkpoint()

    def load_models(self):
        print('... loading models ...')
        self.actor.load_checkpoint()
        self.critic_1.load_checkpoint()
        self.critic_2.load_checkpoint()
        self.value.load_checkpoint()
        self.target_value.load_checkpoint()

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return

        states, new_states, actions, rewards, dones = self.memory.sample_buffer(
            self.batch_size)
        states = torch.tensor(states, dtype=torch.float).to(self.actor.device)
        new_states = torch.tensor(new_states, dtype=torch.float).to(self.actor.device)
        actions = torch.tensor(actions, dtype=torch.float).to(self.actor.device)
        rewards = torch.tensor(rewards, dtype=torch.float).to(self.actor.device)
        dones = torch.tensor(dones).to(self.actor.device)

        states_value = self.value(states).view(-1)
        new_states_value = self.target_value(new_states).view(-1)
        new_states_value[dones] = 0.0

        action, log_probs = self.actor.sample_normal(states, reparameterize=False)
        log_probs = log_probs.view(-1)
        q1_new_policy = self.critic_1(states, action)
        q2_new_policy = self.critic_2(states, action)
        critic_value = torch.min(q1_new_policy, q2_new_policy)
        critic_value = critic_value.view(-1)

        self.value.optimizer.zero_grad()
        value_target = critic_value - log_probs
        value_loss = 0.5 * F.mse_loss(states_value, value_target)
        value_loss.backward(retain_graph=True)
        self.value.optimizer.step()

        action, log_probs = self.actor.sample_normal(states, reparameterize=True)
        log_probs = log_probs.view(-1)
        q1_new_policy = self.critic_1(states, action)
        q2_new_policy = self.critic_2(states, action)
        critic_value = torch.min(q1_new_policy, q2_new_policy)
        critic_value = critic_value.view(-1)

        actor_loss = log_probs - critic_value
        actor_loss = torch.mean(actor_loss)
        self.actor.optimizer.zero_grad()
        actor_loss.backward(retain_graph=True)
        self.actor.optimizer.step()

        self.critic_1.optimizer.zero_grad()
        self.critic_2.optimizer.zero_grad()
        q_hat = self.scale * rewards + self.gamma * new_states_value
        q1_old_policy = self.critic_1(states, actions).view(-1)
        q2_old_policy = self.critic_2(states, actions).view(-1)
        critic1_loss = 0.5 * F.mse_loss(q1_old_policy, q_hat)
        critic2_loss = 0.5 * F.mse_loss(q2_old_policy, q_hat)
        critic_loss = critic1_loss + critic2_loss
        critic_loss.backward()
        self.critic_1.optimizer.step()
        self.critic_2.optimizer.step()

        self.update_network_parameters()

        # value_loss = value_loss.cpu().detach().numpy()[0]
        # actor_loss = actor_loss.cpu().detach().numpy()[0]
        # critic_loss = critic_loss.cpu().detach().numpy()[0]
        return 0, value_loss, actor_loss, critic_loss

    def learn_sm(self, sm_reg=1):
        if self.memory.mem_cntr < self.batch_size:
            return

        states, new_states, actions, rewards, dones = self.memory.sample_buffer(
            self.batch_size)
        states = torch.tensor(states, dtype=torch.float).to(self.actor.device)
        new_states = torch.tensor(new_states, dtype=torch.float).to(self.actor.device)
        actions = torch.tensor(actions, dtype=torch.float).to(self.actor.device)
        rewards = torch.tensor(rewards, dtype=torch.float).to(self.actor.device)
        dones = torch.tensor(dones).to(self.actor.device)

        states_value = self.value(states).view(-1)
        new_states_value = self.target_value(new_states).view(-1)
        new_states_value[dones] = 0.0

        # action, log_probs = self.actor.sample_normal(states, reparameterize=False)
        # log_probs = log_probs.view(-1)
        # q1_new_policy = self.critic_1(states, action)
        # q2_new_policy = self.critic_2(states, action)
        # critic_value = torch.min(q1_new_policy, q2_new_policy)
        # critic_value = critic_value.view(-1)

        # self.value.optimizer.zero_grad()
        # value_target = critic_value - log_probs
        # value_loss = 0.5 * F.mse_loss(states_value, value_target)
        # value_loss.backward(retain_graph=True)
        # self.value.optimizer.step()

        # action, log_probs = self.actor.sample_normal(states, reparameterize=True)
        action, _ = self.actor.sample_normal(states, reparameterize=True)
        # log_probs = log_probs.view(-1)
        q1_new_policy = self.critic_1(states, action)
        q2_new_policy = self.critic_2(states, action)
        critic_value = torch.min(q1_new_policy, q2_new_policy)
        critic_value = critic_value.view(-1)

        # sample actions for the next states in the batch
        action_next, _ = self.actor.sample_normal(new_states, reparameterize=True)
        q1_new_policy = self.critic_1(new_states, action_next)
        q2_new_policy = self.critic_2(new_states, action_next)
        critic_value_next = torch.min(q1_new_policy, q2_new_policy)
        critic_value_next = critic_value_next.view(-1)

        # actor_loss = log_probs - critic_value
        actor_loss = -(critic_value + critic_value_next) + sm_reg * F.mse_loss(
            action, action_next)
        actor_loss = torch.mean(actor_loss)
        self.actor.optimizer.zero_grad()
        actor_loss.backward(retain_graph=True)
        self.actor.optimizer.step()

        # self.critic_1.optimizer.zero_grad()
        # self.critic_2.optimizer.zero_grad()
        # q_hat = self.scale*rewards + self.gamma*new_states_value
        # q1_old_policy = self.critic_1(states, actions).view(-1)
        # q2_old_policy = self.critic_2(states, actions).view(-1)
        # critic1_loss = 0.5 * F.mse_loss(q1_old_policy, q_hat)
        # critic2_loss = 0.5 * F.mse_loss(q2_old_policy, q_hat)
        # critic_loss = critic1_loss + critic2_loss
        # critic_loss.backward()
        # self.critic_1.optimizer.step()
        # self.critic_2.optimizer.step()
        # self.update_network_parameters()

        return 0, 0, actor_loss, 0
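
# Usage sketch for Agent_sm: the standard SAC update (learn) can be interleaved with
# the extra smoothness update (learn_sm), which pushes consecutive actions towards
# each other via the sm_reg * MSE(action, action_next) penalty. Assumptions: a
# Gym-style env as above and an integer input_dims; running both updates every step
# is only one possible schedule.
def train_sac_sm(env, n_games=250, sm_reg=1):
    agent = Agent_sm(input_dims=env.observation_space.shape[0], env=env,
                     n_actions=env.action_space.shape[0])
    for i in range(n_games):
        observation = env.reset()
        done = False
        while not done:
            action = agent.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            agent.remember(observation, action, reward, observation_, done)
            agent.learn()            # full SAC update (value, actor, critics)
            agent.learn_sm(sm_reg)   # actor-only update with the smoothness penalty
            observation = observation_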
class Agent_2():
    def __init__(self, alpha=0.00005, beta=0.00005, input_dims=5, env=None,
                 gamma=0.99, n_actions=2, max_size=1000000, tau=0.005,
                 layer1_size=256, layer2_size=256, batch_size=256,
                 reward_scale=2):
        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.batch_size = batch_size
        self.n_actions = n_actions
        latent_dims = 10

        # actor, critics and value networks operate on the VAE latent space
        self.actor = ActorNetwork_2(alpha, latent_dims, env.action_space.high,
                                    n_actions=n_actions)
        self.critic_1 = CriticNetwork(beta, latent_dims, n_actions,
                                      name='critic_det_1')
        self.critic_2 = CriticNetwork(beta, latent_dims, n_actions,
                                      name='critic_det_2')
        self.value = ValueNetwork(beta, latent_dims, name='value_det')
        self.target_value = ValueNetwork(beta, latent_dims, name='target_value_det')
        self.VAE = LinearVAE()

        self.scale = reward_scale
        self.update_network_parameters(tau=1)

    def choose_action(self, observation):
        state = torch.Tensor([observation]).to(self.actor.device)
        state_latent = self.VAE.sample_normal(state)
        actions = self.actor(state_latent)

        return actions.cpu().detach().numpy()[0]

    def remember(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)

    def update_network_parameters(self, tau=None):
        if tau is None:
            tau = self.tau

        target_value_params = self.target_value.named_parameters()
        value_params = self.value.named_parameters()

        target_value_dict = dict(target_value_params)
        value_dict = dict(value_params)

        for name in target_value_dict:
            target_value_dict[name] = tau*value_dict[name].clone() + \
                (1-tau)*target_value_dict[name].clone()

        self.target_value.load_state_dict(target_value_dict)

    def save_models(self):
        print('... saving models ...')
        self.actor.save_checkpoint()
        self.critic_1.save_checkpoint()
        self.critic_2.save_checkpoint()
        self.value.save_checkpoint()
        self.target_value.save_checkpoint()

    def load_models(self):
        print('... loading models ...')
        self.actor.load_checkpoint()
        self.critic_1.load_checkpoint()
        self.critic_2.load_checkpoint()
        self.value.load_checkpoint()
        self.target_value.load_checkpoint()

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return

        states, new_states, actions, rewards, dones = self.memory.sample_buffer(
            self.batch_size)
        states = torch.tensor(states, dtype=torch.float).to(self.actor.device)
        new_states = torch.tensor(new_states, dtype=torch.float).to(self.actor.device)
        actions = torch.tensor(actions, dtype=torch.float).to(self.actor.device)
        rewards = torch.tensor(rewards, dtype=torch.float).to(self.actor.device)
        dones = torch.tensor(dones).to(self.actor.device)

        # Train the VAE with KL divergence + reconstruction loss
        reconstruction, mu, logvar, log_probs = self.VAE(states)
        KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
        reconstruction_loss = F.mse_loss(reconstruction, states)
        final_loss = KLD + reconstruction_loss
        self.VAE.optimizer.zero_grad()
        final_loss.backward(retain_graph=True)
        self.VAE.optimizer.step()

        latent_states = self.VAE.sample_normal(states)
        states_value = self.value(latent_states).view(-1)
        new_latent_states = self.VAE.sample_normal(new_states)
        new_states_value = self.target_value(new_latent_states).view(-1)
        new_states_value[dones] = 0.0

        action = self.actor(latent_states)
        q1_new_policy = self.critic_1(latent_states, action)
        q2_new_policy = self.critic_2(latent_states, action)
        critic_value = torch.min(q1_new_policy, q2_new_policy)
        critic_value = critic_value.view(-1)

        self.value.optimizer.zero_grad()
        value_target = critic_value
        value_loss = 0.5 * F.mse_loss(states_value, value_target)
        value_loss.backward(retain_graph=True)
        self.value.optimizer.step()

        actor_loss = -critic_value
        actor_loss = torch.mean(actor_loss)
        self.actor.optimizer.zero_grad()
        actor_loss.backward(retain_graph=True)
        self.actor.optimizer.step()

        self.critic_1.optimizer.zero_grad()
        self.critic_2.optimizer.zero_grad()
        q_hat = self.scale * rewards + self.gamma * new_states_value
        q1_old_policy = self.critic_1(latent_states, actions).view(-1)
        q2_old_policy = self.critic_2(latent_states, actions).view(-1)
        critic1_loss = 0.5 * F.mse_loss(q1_old_policy, q_hat)
        critic2_loss = 0.5 * F.mse_loss(q2_old_policy, q_hat)
        critic_loss = critic1_loss + critic2_loss
        critic_loss.backward()
        self.critic_1.optimizer.step()
        self.critic_2.optimizer.step()

        self.update_network_parameters()

        return final_loss, value_loss, actor_loss, critic_loss
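
# Usage sketch for Agent_2: the VAE is trained alongside the RL networks inside
# learn(), and both action selection and the critic/value updates operate on the
# latent produced by LinearVAE.sample_normal. Assumptions: a Gym-style env with a
# low-dimensional observation; learn() returns None until the replay buffer holds
# at least batch_size transitions.
def train_latent_sac(env, n_steps=10000):
    agent = Agent_2(input_dims=env.observation_space.shape[0], env=env,
                    n_actions=env.action_space.shape[0])
    observation = env.reset()
    for step in range(n_steps):
        action = agent.choose_action(observation)
        observation_, reward, done, info = env.step(action)
        agent.remember(observation, action, reward, observation_, done)
        losses = agent.learn()   # (vae_loss, value_loss, actor_loss, critic_loss) or None
        observation = env.reset() if done else observation_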
class Agent():
    def __init__(self, alpha=0.0003, beta=0.0003, input_dims=[8], env=None,
                 gamma=0.99, n_actions=2, max_size=1000000, tau=0.005,
                 layer1_size=256, layer2_size=256, batch_size=256,
                 reward_scale=2):
        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.batch_size = batch_size
        self.n_actions = n_actions

        self.actor = ActorNetwork(alpha, input_dims, n_actions=n_actions,
                                  name='actor', max_action=env.action_space.high)
        self.critic_1 = CriticNetwork(beta, input_dims, n_actions=n_actions,
                                      name='critic_1')
        self.critic_2 = CriticNetwork(beta, input_dims, n_actions=n_actions,
                                      name='critic_2')
        self.value = ValueNetwork(beta, input_dims, name='value')
        self.target_value = ValueNetwork(beta, input_dims, name='target_value')

        self.scale = reward_scale
        # initialise the target network as an exact copy of the value network
        self.update_network_parameters(tau=1)

    def choose_action(self, observation):
        state = T.Tensor([observation]).to(self.actor.device)
        actions, _ = self.actor.sample_normal(state, reparameterize=False)

        return actions.cpu().detach().numpy()[0]

    def remember(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)

    def update_network_parameters(self, tau=None):
        if tau is None:  # exact copy (tau=1) vs. soft update (tau << 1)
            tau = self.tau

        target_value_params = self.target_value.named_parameters()
        value_params = self.value.named_parameters()

        target_value_state_dict = dict(target_value_params)
        value_state_dict = dict(value_params)

        for name in value_state_dict:
            value_state_dict[name] = tau * value_state_dict[name].clone() + \
                (1 - tau) * target_value_state_dict[name].clone()

        self.target_value.load_state_dict(value_state_dict)

    def save_models(self):
        print('.... saving models ....')
        self.actor.save_checkpoint()
        self.value.save_checkpoint()
        self.target_value.save_checkpoint()
        self.critic_1.save_checkpoint()
        self.critic_2.save_checkpoint()

    def load_models(self):
        print('.... loading models ....')
        self.actor.load_checkpoint()
        self.value.load_checkpoint()
        self.target_value.load_checkpoint()
        self.critic_1.load_checkpoint()
        self.critic_2.load_checkpoint()

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return

        # sample a batch of transitions
        state, action, reward, new_state, done = \
            self.memory.sample_buffer(self.batch_size)

        # convert to tensors on the actor's device
        reward = T.tensor(reward, dtype=T.float).to(self.actor.device)
        done = T.tensor(done).to(self.actor.device)
        state_ = T.tensor(new_state, dtype=T.float).to(self.actor.device)
        state = T.tensor(state, dtype=T.float).to(self.actor.device)
        action = T.tensor(action, dtype=T.float).to(self.actor.device)

        value = self.value(state).view(-1)
        value_ = self.target_value(state_).view(-1)
        value_[done] = 0.0  # is it using 0 to mean True here? @15, 17

        # take the lower of the two critics' Q-values
        actions, log_probs = self.actor.sample_normal(state, reparameterize=False)
        log_probs = log_probs.view(-1)
        q1_new_policy = self.critic_1.forward(state, actions)
        q2_new_policy = self.critic_2.forward(state, actions)
        critic_value = T.min(q1_new_policy, q2_new_policy)
        critic_value = critic_value.view(-1)

        self.value.optimizer.zero_grad()
        value_target = critic_value - log_probs  # why not simply take critic_value?
        value_loss = 0.5 * F.mse_loss(value, value_target)
        value_loss.backward(retain_graph=True)
        self.value.optimizer.step()

        actions, log_probs = self.actor.sample_normal(state, reparameterize=True)
        log_probs = log_probs.view(-1)
        q1_new_policy = self.critic_1.forward(state, actions)
        q2_new_policy = self.critic_2.forward(state, actions)
        critic_value = T.min(q1_new_policy, q2_new_policy)
        critic_value = critic_value.view(-1)

        actor_loss = log_probs - critic_value
        actor_loss = T.mean(actor_loss)
        self.actor.optimizer.zero_grad()
        actor_loss.backward(retain_graph=True)
        self.actor.optimizer.step()

        self.critic_1.optimizer.zero_grad()
        self.critic_2.optimizer.zero_grad()
        # the reward scale stands in for the entropy temperature and encourages exploration
        q_hat = self.scale * reward + self.gamma * value_
        q1_old_policy = self.critic_1.forward(state, action).view(-1)
        q2_old_policy = self.critic_2.forward(state, action).view(-1)
        critic_1_loss = 0.5 * F.mse_loss(q1_old_policy, q_hat)
        critic_2_loss = 0.5 * F.mse_loss(q2_old_policy, q_hat)
        critic_loss = critic_1_loss + critic_2_loss
        critic_loss.backward()
        self.critic_1.optimizer.step()
        self.critic_2.optimizer.step()

        self.update_network_parameters()
class Agent():
    def __init__(self, alpha=0.0003, beta=0.0003, input_dims=8, env=None,
                 gamma=0.99, n_actions=2, max_size=1000000, tau=0.005,
                 layer1_size=256, layer2_size=256, batch_size=256,
                 reward_scale=2):
        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.batch_size = batch_size
        self.n_actions = n_actions

        self.actor = ActorNetwork(alpha, input_dims, env.action_space.high,
                                  n_actions=n_actions)
        self.critic_1 = CriticNetwork(beta, input_dims, n_actions, name='critic_1')
        self.critic_2 = CriticNetwork(beta, input_dims, n_actions, name='critic_2')
        self.value = ValueNetwork(beta, input_dims, name='value')
        self.target_value = ValueNetwork(beta, input_dims, name='target_value')

        self.scale = reward_scale
        self.update_network_parameters(tau=1)

    def choose_action(self, observation):
        state = torch.Tensor([observation]).to(self.actor.device)
        actions, _ = self.actor.sample_normal(state, reparameterize=False)

        return actions.cpu().detach().numpy()[0]

    def remember(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)

    def update_network_parameters(self, tau=None):
        if tau is None:
            tau = self.tau

        target_value_params = self.target_value.named_parameters()
        value_params = self.value.named_parameters()

        target_value_dict = dict(target_value_params)
        value_dict = dict(value_params)

        for name in target_value_dict:
            target_value_dict[name] = tau*value_dict[name].clone() + \
                (1-tau)*target_value_dict[name].clone()

        self.target_value.load_state_dict(target_value_dict)

    def save_models(self):
        print('... saving models ...')
        self.actor.save_checkpoint()
        self.critic_1.save_checkpoint()
        self.critic_2.save_checkpoint()
        self.value.save_checkpoint()
        self.target_value.save_checkpoint()

    def load_models(self):
        print('... loading models ...')
        self.actor.load_checkpoint()
        self.critic_1.load_checkpoint()
        self.critic_2.load_checkpoint()
        self.value.load_checkpoint()
        self.target_value.load_checkpoint()

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return

        states, new_states, actions, rewards, dones = self.memory.sample_buffer(
            self.batch_size)
        states = torch.tensor(states, dtype=torch.float).to(self.actor.device)
        new_states = torch.tensor(new_states, dtype=torch.float).to(self.actor.device)
        actions = torch.tensor(actions, dtype=torch.float).to(self.actor.device)
        rewards = torch.tensor(rewards, dtype=torch.float).to(self.actor.device)
        dones = torch.tensor(dones).to(self.actor.device)

        states_value = self.value(states).view(-1)
        new_states_value = self.target_value(new_states).view(-1)
        new_states_value[dones] = 0.0

        action, log_probs = self.actor.sample_normal(states, reparameterize=False)
        log_probs = log_probs.view(-1)
        q1_new_policy = self.critic_1(states, action)
        q2_new_policy = self.critic_2(states, action)
        critic_value = torch.min(q1_new_policy, q2_new_policy)
        critic_value = critic_value.view(-1)

        self.value.optimizer.zero_grad()
        value_target = critic_value - log_probs
        value_loss = 0.5 * F.mse_loss(states_value, value_target)
        value_loss.backward(retain_graph=True)
        self.value.optimizer.step()

        action, log_probs = self.actor.sample_normal(states, reparameterize=True)
        log_probs = log_probs.view(-1)
        q1_new_policy = self.critic_1(states, action)
        q2_new_policy = self.critic_2(states, action)
        critic_value = torch.min(q1_new_policy, q2_new_policy)
        critic_value = critic_value.view(-1)

        actor_loss = log_probs - critic_value
        actor_loss = torch.mean(actor_loss)
        self.actor.optimizer.zero_grad()
        actor_loss.backward(retain_graph=True)
        self.actor.optimizer.step()

        self.critic_1.optimizer.zero_grad()
        self.critic_2.optimizer.zero_grad()
        q_hat = self.scale * rewards + self.gamma * new_states_value
        q1_old_policy = self.critic_1(states, actions).view(-1)
        q2_old_policy = self.critic_2(states, actions).view(-1)
        critic1_loss = 0.5 * F.mse_loss(q1_old_policy, q_hat)
        critic2_loss = 0.5 * F.mse_loss(q2_old_policy, q_hat)
        critic_loss = critic1_loss + critic2_loss
        critic_loss.backward()
        self.critic_1.optimizer.step()
        self.critic_2.optimizer.step()

        self.update_network_parameters()

        return value_loss, actor_loss, critic_loss

    def train_on_env(self, env):
        rewards = []
        done = False
        observation = env.reset()
        while not done:
            action = self.choose_action(observation)
            observation_, reward, done, _ = env.step(action)
            self.remember(observation, action, reward, observation_, done)
            #if not load_checkpoints:
            self.learn()
            observation = observation_
            rewards.append(reward)

        return np.sum(rewards)

    def generate_session(self, env, t_max=1000):
        states, traj_probs, actions, rewards = [], [], [], []
        s = env.reset()
        q_t = 0
        for t in range(t_max):
            state = torch.Tensor([s]).to(self.actor.device)
            action, log_probs = self.actor.sample_normal(state, reparameterize=False)
            action = action.cpu().detach().numpy()[0]
            new_s, r, done, info = env.step(action)
            log_probs = log_probs.cpu().detach().numpy()[0]
            #q_t *= probs
            q_t += log_probs[0]

            states.append(s.tolist())
            traj_probs.append(q_t)
            actions.append(action[0])
            rewards.append(r)

            s = new_s
            if done:
                break

        return np.array(states), np.array(traj_probs), np.array(actions), np.array(rewards)
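
# Usage sketch for the variant above: train_on_env runs one episode with a learning
# step after every transition and returns the episode return, while generate_session
# rolls out the current policy without learning and returns the visited states, the
# cumulative log-probability of the trajectory, the actions and the rewards.
# Assumptions: a Gym-style env; the episode count is illustrative.
def run_training(env, n_episodes=200):
    agent = Agent(input_dims=env.observation_space.shape[0], env=env,
                  n_actions=env.action_space.shape[0])
    returns = [agent.train_on_env(env) for _ in range(n_episodes)]
    states, traj_probs, actions, rewards = agent.generate_session(env, t_max=1000)
    return returns, (states, traj_probs, actions, rewards)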
class Agent():
    def __init__(self, input_dims, env, n_actions):
        self.memory = ReplayBuffer(input_dims)
        self.n_actions = n_actions
        self.actor_nn = ActorNetwork(input_dims, n_actions=n_actions,
                                     name=Constants.env_id + '_actor',
                                     max_action=env.action_space.n)
        self.critic_local_1_nn = CriticNetwork(input_dims, n_actions=n_actions,
                                               name=Constants.env_id + '_critic_local_1')
        self.critic_local_2_nn = CriticNetwork(input_dims, n_actions=n_actions,
                                               name=Constants.env_id + '_critic_local_2')
        self.critic_target_1_nn = CriticNetwork(input_dims, n_actions=n_actions,
                                                name=Constants.env_id + '_critic_target_1')
        self.critic_target_2_nn = CriticNetwork(input_dims, n_actions=n_actions,
                                                name=Constants.env_id + '_critic_target_2')
        self.value_nn = ValueNetwork(input_dims, name=Constants.env_id + '_value')
        self.target_value_nn = ValueNetwork(input_dims,
                                            name=Constants.env_id + '_target_value')
        self.update_network_parameters(tau=1)

    def choose_action(self, observation):
        state = T.Tensor([observation]).to(Constants.device)
        _, max_probability_action = self.actor_nn.sample_action(state)
        return max_probability_action

    def remember(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)

    def learn(self):
        if self.memory.mem_cntr < Hyper.batch_size:
            return

        state, action, reward, next_state, done = self.memory.sample_buffer()
        reward = T.tensor(reward, dtype=T.float).to(Constants.device)
        done = T.tensor(done).to(Constants.device)
        next_state = T.tensor(next_state, dtype=T.float).to(Constants.device)
        state = T.tensor(state, dtype=T.float).to(Constants.device)
        action = T.tensor(action, dtype=T.float).to(Constants.device)

        value_from_nn = self.value_nn(state).view(-1)
        new_value_from_nn = self.target_value_nn(next_state).view(-1)
        new_value_from_nn[done] = 0.0

        (action_probabilities, log_action_probabilities), _ = \
            self.actor_nn.sample_action(next_state)
        with T.no_grad():
            q1_new_policy = self.critic_target_1_nn(next_state)
            q2_new_policy = self.critic_target_2_nn(next_state)
            critic_value = T.min(q1_new_policy, q2_new_policy)

        self.value_nn.optimizer.zero_grad()
        # CHANGE0003 Soft state-value target where actions are discrete:
        # expectation over the policy's action probabilities of Q - alpha * log pi
        value_target = (action_probabilities *
                        (critic_value - Hyper.alpha * log_action_probabilities)).sum(dim=1)
        value_loss = 0.5 * F.mse_loss(value_from_nn, value_target)
        value_loss.backward(retain_graph=True)
        self.value_nn.optimizer.step()

        (action_probabilities, log_action_probabilities), _ = \
            self.actor_nn.sample_action(state)
        with T.no_grad():
            q1_new_policy = self.critic_local_1_nn(state)
            q2_new_policy = self.critic_local_2_nn(state)
            critic_value = T.min(q1_new_policy, q2_new_policy)

        # CHANGE0005 Objective for policy
        actor_loss = action_probabilities * (
            Hyper.alpha * log_action_probabilities - critic_value)
        actor_loss = T.mean(actor_loss)
        self.actor_nn.optimizer.zero_grad()
        actor_loss.backward(retain_graph=True)
        self.actor_nn.optimizer.step()

        self.critic_local_1_nn.optimizer.zero_grad()
        self.critic_local_2_nn.optimizer.zero_grad()
        q_hat = Hyper.reward_scale * reward + Hyper.gamma * new_value_from_nn
        # Q-values of the actions actually stored in the buffer
        action_index = action.long().unsqueeze(-1)
        q1_old_policy = self.critic_local_1_nn(state).gather(1, action_index).view(-1)
        q2_old_policy = self.critic_local_2_nn(state).gather(1, action_index).view(-1)
        critic_1_loss = 0.5 * F.mse_loss(q1_old_policy, q_hat)
        critic_2_loss = 0.5 * F.mse_loss(q2_old_policy, q_hat)
        critic_loss = critic_1_loss + critic_2_loss
        critic_loss.backward()
        self.critic_local_1_nn.optimizer.step()
        self.critic_local_2_nn.optimizer.step()
        self.update_network_parameters()

    def update_network_parameters(self, tau=None):
        if tau is None:
            tau = Hyper.tau

        target_value_params = self.target_value_nn.named_parameters()
        value_params = self.value_nn.named_parameters()

        target_value_state_dict = dict(target_value_params)
        value_state_dict = dict(value_params)

        for name in value_state_dict:
            value_state_dict[name] = tau*value_state_dict[name].clone() + \
                (1-tau)*target_value_state_dict[name].clone()

        self.target_value_nn.load_state_dict(value_state_dict)

        self.update_network_parameters_line(self.critic_target_1_nn.named_parameters(),
                                            self.critic_local_1_nn.named_parameters(), tau)
        self.update_network_parameters_line(self.critic_target_2_nn.named_parameters(),
                                            self.critic_local_2_nn.named_parameters(), tau)

    def update_network_parameters_line(self, target_params, local_params, tau):
        for target_param, local_param in zip(target_params, local_params):
            target_param[1].data.copy_(tau * local_param[1].data +
                                       (1.0 - tau) * target_param[1].data)

    def save_models(self):
        print('.... saving models ....')
        self.actor_nn.save_checkpoint()
        self.value_nn.save_checkpoint()
        self.target_value_nn.save_checkpoint()
        self.critic_local_1_nn.save_checkpoint()
        self.critic_local_2_nn.save_checkpoint()
        self.critic_target_1_nn.save_checkpoint()
        self.critic_target_2_nn.save_checkpoint()

    def load_models(self):
        print('.... loading models ....')
        self.actor_nn.load_checkpoint()
        self.value_nn.load_checkpoint()
        self.target_value_nn.load_checkpoint()
        self.critic_local_1_nn.load_checkpoint()
        self.critic_local_2_nn.load_checkpoint()
        self.critic_target_1_nn.load_checkpoint()
        self.critic_target_2_nn.load_checkpoint()
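
# Usage sketch for the discrete-action agent above. Assumptions: Constants.env_id
# names a Gym env with a Discrete action space, Constants.device and the Hyper
# hyperparameters are defined elsewhere, and choose_action returns an integer action.
import gym

def train_discrete_sac(n_games=500):
    env = gym.make(Constants.env_id)
    agent = Agent(env.observation_space.shape, env, env.action_space.n)
    for i in range(n_games):
        observation = env.reset()
        done = False
        while not done:
            action = agent.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            agent.remember(observation, action, reward, observation_, done)
            agent.learn()
            observation = observation_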